diff --git a/.github/workflows/reproducibility-check.yml b/.github/workflows/reproducibility-check.yml new file mode 100644 index 0000000..896d8c9 --- /dev/null +++ b/.github/workflows/reproducibility-check.yml @@ -0,0 +1,42 @@ +name: Reproducibility Check + +on: + pull_request: + paths: + - 'reproducibility/**' + - 'examples/querygym_pyserini/pipeline.py' + - 'dataset_registry.yaml' + push: + branches: [main] + paths: + - 'reproducibility/**' + - 'examples/querygym_pyserini/pipeline.py' + - 'dataset_registry.yaml' + +concurrency: + group: repro-${{ github.ref }} + cancel-in-progress: true + +jobs: + check: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[repro,dev]" + pip install pytest-cov + + - name: Run repro tests + run: pytest reproducibility/tests -v --no-cov + + - name: Aggregator --check + run: python -m reproducibility.scripts.aggregate_runs --check diff --git a/MANIFEST.in b/MANIFEST.in index 42f3f29..539e427 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -14,6 +14,11 @@ recursive-include examples *.py *.tsv *.ipynb # Include docs recursive-include docs *.md +# Exclude reproducibility umbrella and (preemptive) future web/ from sdist. +# These are repo-only artifacts; nothing in them belongs in the PyPI source dist. 
+prune reproducibility +prune web + # Exclude development and build artifacts global-exclude __pycache__ global-exclude *.py[co] diff --git a/Makefile b/Makefile index 8b4b20c..db89795 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help build build-cpu build-all test clean +.PHONY: help build build-cpu build-all test clean repro-aggregate repro-check repro-test # Makefile for QueryGym Docker Development # This is for developers/contributors who need to build images locally @@ -55,6 +55,16 @@ test: @echo "" @echo "✓ All tests passed!" +# Reproducibility data pipeline +repro-aggregate: + python -m reproducibility.scripts.aggregate_runs + +repro-check: + python -m reproducibility.scripts.aggregate_runs --check + +repro-test: + pytest reproducibility/tests -q --no-cov + # Clean up locally built images clean: @echo "Removing locally built images..." diff --git a/docs/user-guide/reproducibility.md b/docs/user-guide/reproducibility.md new file mode 100644 index 0000000..c825d22 --- /dev/null +++ b/docs/user-guide/reproducibility.md @@ -0,0 +1,82 @@ +# Reproducibility & Leaderboard Submissions + +QueryGym ships with a reproducibility pipeline that powers `leaderboard.querygym.com` and the SIGIR 2026 reproducibility paper. This page explains how to submit a result. + +The full schema lives at `reproducibility/schema.md` (human-readable) and `reproducibility/schema.json` (machine-readable). All submitted JSONs are validated against it three times: at emit time, at submit time, and at aggregate time in CI. + +## Trusted contributor flow + +If you have commit access: + +```bash +# 1. Run the example pipeline. +python examples/querygym_pyserini/pipeline.py \ + --dataset msmarco-v1-passage.trecdl2019 \ + --method query2e \ + --model gpt-4.1-mini \ + --output-dir outputs/dl19_query2e_zs + +# 2. Copy the output into the canonical layout. +python -m reproducibility.scripts.submit_run --from-dir outputs/dl19_query2e_zs + +# 3. Regenerate the aggregate CSV + manifest. 
+make repro-aggregate + +# 4. Commit and open a PR. +git add reproducibility/data/ +git commit -m "add query2e/gpt-4.1-mini result on dl19-passage" +git push +gh pr create +``` + +CI runs the schema/validator tests and `aggregate_runs.py --check`. If everything is green, the leaderboard rebuilds on merge. + +### Common failure modes + +| Symptom | Cause | Fix | +|---|---|---| +| `aggregator --check failed: results.csv is out of date` | You forgot step 3. | Run `make repro-aggregate`, commit the diff. | +| `dataset_id 'foo' not in dataset_registry.yaml` | Typo or new dataset not registered. | Add the dataset to `dataset_registry.yaml` first, then re-submit. | +| `method_id 'foo' not in registered methods` | Method not registered or name typo'd. | Register via `@register_method("foo")` in `querygym/methods/`. | +| `params_hash mismatch` | The JSON was hand-edited. | Don't hand-edit run JSONs — re-run the emitter or use `submit_run` instead. | +| `metric(s) ['bleu'] not in eval_metrics for dataset 'X'` | Unsupported metric for that dataset. | Either drop the metric or add it to the dataset's `output.eval_metrics` in the registry. | + +## External (fork) contributor flow + +If you don't have commit access: + +1. Fork `ls3-lab/QueryGym` on GitHub and clone your fork. +2. Run steps 1–3 from the trusted flow above. +3. Push to your fork and open a PR against `ls3-lab/QueryGym:main`. + +CI runs the same schema/validator/aggregator checks against your PR — no LLM keys or Pyserini are needed for these checks, so fork PRs get fast feedback. + +A maintainer will additionally **re-verify your numbers locally** before merging: + +- **Cheap pre-check (~30s):** the maintainer runs `pytrec_eval` against your submitted `run.txt` using the dataset's qrels and confirms the reported metrics match. 
+- **Full re-run (only if needed):** if the cheap check is suspicious, the maintainer runs the example pipeline with your `config` block as inputs and compares reformulated queries + run file. + +This is why every submission must include `run.txt` and `reformulated_queries.tsv` alongside the JSON — they make verification cheap. + +## Verifying a published number (paper readers) + +Each leaderboard row links to the canonical files at a paper-release tag. To verify independently: + +```bash +git clone --depth=1 --branch=paper-sigir2026 https://github.com/ls3-lab/QueryGym.git +cd QueryGym + +# Pick a run. +RUN_DIR=reproducibility/data/runs/msmarco-v1-passage.trecdl2019/query2e/gpt-4.1-mini + +# Re-run trec_eval against the public qrels (Pyserini ships them). +python -m pyserini.eval.trec_eval -m ndcg_cut.10 dl19-passage "${RUN_DIR}"/*.run.txt +``` + +The number from `pyserini.eval.trec_eval` should match `metrics.ndcg_cut_10` in the corresponding JSON. + +## External tools (dashboard, third parties) + +The contract is `reproducibility/schema.json` — a Draft 2020-12 JSON Schema document. Any tool that emits a conformant JSON can submit (subject to the trusted vs. fork flows above). You don't need to import any Python from QueryGym; just read the schema file and validate locally with whatever JSON Schema library your stack provides (`Ajv` for JS, `jsonschema` for Python, `everit-org/json-schema` for Java). + +`schema_version` is `"const": 1` today. Bumping it to 2 will be a breaking change announced ahead of time. 
diff --git a/examples/querygym_pyserini/pipeline.py b/examples/querygym_pyserini/pipeline.py index 25660dd..3b66f4c 100755 --- a/examples/querygym_pyserini/pipeline.py +++ b/examples/querygym_pyserini/pipeline.py @@ -38,6 +38,102 @@ from examples.querygym_pyserini import reformulate_queries from examples.querygym_pyserini import retrieve from examples.querygym_pyserini import evaluate +from reproducibility.lib import build_run_summary, validate, ValidationError + + +def _load_dataset_config_from_registry(dataset_id: str, registry_path: str) -> 'dict | None': + """Return dataset_config fields from dataset_registry.yaml, or None if the registry is unreadable or lacks dataset_id.""" + try: + import yaml + with open(registry_path, 'r', encoding='utf-8') as f: + registry = yaml.safe_load(f) or {} + except Exception: + return None + entry = (registry.get('datasets') or {}).get(dataset_id) if isinstance(registry, dict) else None + if not entry: + return None + return { + 'topics': (entry.get('topics') or {}).get('name', ''), + 'index': (entry.get('index') or {}).get('name', ''), + 'num_queries': 0, # filled by reformulation metadata when available + 'bm25_weights': entry.get('bm25_weights') or {'k1': 0.0, 'b': 0.0}, + } + + +def _build_v1_summary( + *, results, dataset_name, method, model, method_params, llm_config, + steps, pipeline_time, registry_path, queries_file, index_name, +) -> dict: + """Pull fields from per-step metadata and call reproducibility.lib.build_run_summary.""" + reform = results.get('reformulation', {}) + retrieval = results.get('retrieval', {}) + evaluation = results.get('evaluation', {}) + + reform_inner = reform.get('reformulation', {}) if isinstance(reform, dict) else {} + dataset_inner = reform.get('dataset', {}) if isinstance(reform, dict) else {} + + # Resolve dataset_config: prefer reformulation metadata (richest), fall back + # to the registry, then to file-based info. 
+ if dataset_inner.get('topics') or dataset_inner.get('index'): + dataset_config = { + 'topics': dataset_inner.get('topics') or '', + 'index': dataset_inner.get('index') or (index_name or ''), + 'num_queries': int(dataset_inner.get('num_queries') or 0), + 'bm25_weights': dataset_inner.get('bm25_weights') or {'k1': 0.0, 'b': 0.0}, + } + else: + dataset_config = _load_dataset_config_from_registry(dataset_name, registry_path) or { + 'topics': '', + 'index': index_name or '', + 'num_queries': 0, + 'bm25_weights': {'k1': 0.0, 'b': 0.0}, + } + # If reformulation produced num_queries but no other config, splice it in. + if dataset_inner.get('num_queries'): + dataset_config['num_queries'] = int(dataset_inner['num_queries']) + + # Searcher: use reformulation's searcher info if present, otherwise unknown. + searcher_info = reform_inner.get('searcher') or {} + searcher = { + 'name': searcher_info.get('name') or 'unknown', + 'type': searcher_info.get('type') or 'unknown', + } + + # Effective method_params / llm_config: prefer the per-step metadata's view + # (already resolved with method-specific defaults), fall back to caller args. + eff_method_params = reform_inner.get('method_params') or dict(method_params or {}) + # method_params may include a non-serializable searcher; strip it. + eff_method_params = { + k: v for k, v in eff_method_params.items() + if (not callable(v) and not hasattr(v, '__dict__')) or isinstance(v, (dict, list, str, int, float, bool, type(None))) + } + eff_llm_config = dict(reform_inner.get('llm_config') or llm_config or {}) + # Work on a copy so `results` is not mutated; fill in the schema-required keys if + # missing. Extra keys pass through: the schema's llm_config allows additional properties. 
+ if 'temperature' not in eff_llm_config: + eff_llm_config['temperature'] = (llm_config or {}).get('temperature', 0.0) + if 'max_tokens' not in eff_llm_config: + eff_llm_config['max_tokens'] = (llm_config or {}).get('max_tokens', 1) + + timing = { + 'reformulation_seconds': float(reform.get('timing', {}).get('total_time_seconds', 0.0)), + 'retrieval_seconds': float(retrieval.get('timing', {}).get('total_time_seconds', 0.0)), + 'evaluation_seconds': float(evaluation.get('timing', {}).get('eval_time_seconds', 0.0)), + } + + return build_run_summary( + dataset_id=dataset_name or (queries_file.stem if queries_file else 'unknown'), + method_id=method, + model=model, + method_params=eff_method_params, + llm_config=eff_llm_config, + searcher=searcher, + dataset_config=dataset_config, + metrics=evaluation.get('results', {}), + timing=timing, + steps_completed=steps, + total_time_seconds=pipeline_time, + ) def run_pipeline( @@ -189,22 +285,56 @@ def run_pipeline( # Pipeline complete pipeline_time = time.time() - pipeline_start - - # Save pipeline summary - summary = { - 'pipeline': { - 'dataset': dataset_name, - 'method': method, - 'model': model, - 'steps_completed': steps, - 'total_time_seconds': pipeline_time, - 'formatted_time': format_time(pipeline_time) - }, - 'results': results - } - + + # Build the canonical v1 run summary if the full pipeline ran with metrics. + # Partial runs (no evaluate step / no metrics) fall back to a debug summary + # that is NOT leaderboard-eligible. 
+ eval_block = results.get('evaluation', {}) + metrics = eval_block.get('results') if isinstance(eval_block, dict) else None + summary_file = output_dir / 'pipeline_summary.json' - save_config(summary, summary_file) + + if metrics: + payload = _build_v1_summary( + results=results, + dataset_name=dataset_name, + method=method, + model=model, + method_params=method_params, + llm_config=llm_config, + steps=steps, + pipeline_time=pipeline_time, + registry_path=registry_path, + queries_file=queries_file, + index_name=index_name, + ) + try: + validate(payload) + except ValidationError as e: + logging.error(f"v1 schema validation failed: {e}") + raise + with open(summary_file, 'w') as f: + json.dump(payload, f, indent=2, sort_keys=False) + f.write('\n') + logging.info(f"v1 run summary written: {summary_file}") + else: + # Partial pipeline → legacy debug shape, different filename so submit_run + # doesn't accidentally pick it up. + partial_file = output_dir / 'pipeline_partial.json' + partial = { + 'pipeline': { + 'dataset': dataset_name, + 'method': method, + 'model': model, + 'steps_completed': steps, + 'total_time_seconds': pipeline_time, + 'formatted_time': format_time(pipeline_time), + }, + 'results': results, + 'note': 'partial pipeline; v1 summary not emitted (no metrics)', + } + save_config(partial, partial_file) + logging.info(f"partial summary written: {partial_file}") # Create human-readable summary summary_txt = output_dir / 'pipeline_summary.txt' diff --git a/mkdocs.yml b/mkdocs.yml index f3132ef..a3eea54 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -82,6 +82,7 @@ nav: - Docker Guide: user-guide/docker.md - Searcher Interface: user-guide/searcher.md - Prompt Bank: user-guide/prompts.md + - Reproducibility: user-guide/reproducibility.md - API Reference: - Core: api/core.md - Methods: api/methods.md diff --git a/pyproject.toml b/pyproject.toml index 9a8b356..6553242 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,11 +59,19 @@ dev = [ 
"mkdocstrings[python]>=0.23.0", ] +# Reproducibility tooling extras (aggregator, validator, leaderboard build helpers) +repro = [ + "pandas>=2.0.0", + "jsonschema>=4.20.0", +] + # All extras combined all = [ "datasets>=2.20.0", "beir>=2.0.0", "pyserini>=0.22.0", + "pandas>=2.0.0", + "jsonschema>=4.20.0", ] [project.scripts] @@ -97,7 +105,7 @@ ignore = ["E501"] # Line too long (handled by black) # Pytest configuration [tool.pytest.ini_options] -testpaths = ["tests"] +testpaths = ["tests", "reproducibility/tests"] python_files = ["test_*.py"] python_classes = ["Test*"] python_functions = ["test_*"] diff --git a/reproducibility/README.md b/reproducibility/README.md new file mode 100644 index 0000000..ce163e2 --- /dev/null +++ b/reproducibility/README.md @@ -0,0 +1,73 @@ +# reproducibility/ + +Everything that backs the QueryGym leaderboard and the SIGIR 2026 reproducibility paper lives here. The goal is one self-contained directory holding the schema, the data, the tooling, the tests, and (later) the leaderboard site. + +## What's in here + +| Path | What it is | +|---|---| +| `schema.json` | Canonical JSON Schema (Draft 2020-12) — the contract every run JSON must match. Read this from any language; it's not Python-specific. | +| `schema.md` | Human-readable mirror of `schema.json` with field descriptions and a worked example. | +| `lib/` | Private Python helpers used by this repo's tooling (the example pipeline, the aggregator, `submit_run`, the tests). Not a public API; external consumers should read `schema.json`. | +| `data/runs/` | Per-run JSONs (canonical, citable). One JSON + sibling `.run.txt` + `.queries.tsv` per `(dataset, method, model, params)` cell. | +| `data/results.csv` | Long-format aggregate, one row per `(run, metric)`. Derived; regenerated by `aggregate_runs.py`. | +| `data/manifest.json` | Run count, row count, content hash, schema version. Derived. 
| +| `scripts/aggregate_runs.py` | Walks `data/runs/`, validates each JSON, emits `results.csv` + `manifest.json`. Has a `--check` mode for CI. | +| `scripts/submit_run.py` | Validates a fresh pipeline output dir and copies its files into the canonical `data/runs/` layout. | +| `tests/` | Tests for `lib/`. Run via `pytest reproducibility/tests` or `make repro-test`. | +| `site/` *(future)* | The leaderboard Astro app deployed to `leaderboard.querygym.com`. Not in this PR. | + +## Layout + +Runs are organized by **dataset → method → LLM**: + +``` +data/runs/{dataset_id}/{method_id}/{model}/{params_hash}.{json,run.txt,queries.tsv} +``` + +- `dataset_id` matches a key in `dataset_registry.yaml` at the repo root. +- `method_id` matches a `@register_method(...)` name in `querygym/methods/`. +- `model` is the LLM identifier (e.g. `gpt-4.1-mini`). +- `params_hash` is the first 8 hex characters of the SHA-256 digest of `(method_id, model, method_params, llm_config)`. Two runs on the same dataset with the same config share a path and intentionally overwrite each other; changing any hashed value (for example the temperature) yields a different hash. + +## Submitting a run + +1. Run the example pipeline: + ```bash + python examples/querygym_pyserini/pipeline.py \ + --dataset msmarco-v1-passage.trecdl2019 \ + --method query2e \ + --model gpt-4.1-mini \ + --output-dir outputs/dl19_query2e + ``` + The pipeline writes `pipeline_summary.json` (v1 schema) plus `runs/run.txt` and `queries/reformulated_queries.tsv` into the output dir. + +2. Copy into the canonical layout: + ```bash + python -m reproducibility.scripts.submit_run --from-dir outputs/dl19_query2e + ``` + +3. Regenerate `results.csv` and `manifest.json`: + ```bash + make repro-aggregate + ``` + +4. Commit both the new `data/runs/...` files and the updated `data/results.csv` + `data/manifest.json`. + +5. Open a PR. CI will: + - Run the schema/validator tests. + - Run `aggregate_runs.py --check` to confirm `results.csv` is in sync with `runs/`. 
+ +For external (fork) submissions, a maintainer will additionally verify by re-running locally before merging — see `docs/user-guide/reproducibility.md` for details. + +## For external consumers (dashboard, third parties) + +The contract is `schema.json`. Read it from any tool that supports JSON Schema (Python `jsonschema`, JS `Ajv`, Java `everit-org/json-schema`, etc.) and emit conformant payloads from your own code. You do not need to import any Python from this repo. + +Bumping `schema_version` to a new integer is a breaking change; the field is `"const": 1` today and will be replaced (not extended) when v2 lands. + +## Why a separate top-level directory? + +The `querygym/` package on PyPI is the toolkit. This umbrella is the **reproducibility artifact** — schema, data, tooling, eventually the site — and it shouldn't bloat the wheel. `MANIFEST.in` prunes it from the sdist; `pyproject.toml`'s `tool.setuptools.packages.find` only includes `querygym*`. + +The example pipeline in `examples/querygym_pyserini/` imports `reproducibility.lib` because it runs from a clone of this repo; `pip install querygym` users don't need it. 
diff --git a/reproducibility/__init__.py b/reproducibility/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reproducibility/data/manifest.json b/reproducibility/data/manifest.json new file mode 100644 index 0000000..19f9959 --- /dev/null +++ b/reproducibility/data/manifest.json @@ -0,0 +1,8 @@ +{ + "content_hash": "8f32fb55fdd189d27cfaa021fc81e55652fe80e151083c24fde1cce7ef99341f", + "generated_at": "2026-04-29T19:52:22Z", + "querygym_version": "unknown", + "row_count": 0, + "run_count": 0, + "schema_version": 1 +} diff --git a/reproducibility/data/results.csv b/reproducibility/data/results.csv new file mode 100644 index 0000000..1a89e7a --- /dev/null +++ b/reproducibility/data/results.csv @@ -0,0 +1 @@ +schema_version,run_id,dataset_id,method_id,model,params_hash,method_params_json,llm_temperature,llm_max_tokens,metric,value,num_queries,total_time_seconds,querygym_version,run_file_path diff --git a/reproducibility/data/runs/.gitkeep b/reproducibility/data/runs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/reproducibility/schema.json b/reproducibility/schema.json new file mode 100644 index 0000000..a272997 --- /dev/null +++ b/reproducibility/schema.json @@ -0,0 +1,159 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://querygym.com/schemas/run-summary/v1.json", + "title": "QueryGym Run Summary v1", + "description": "Canonical per-run metrics document emitted by QueryGym pipelines and consumed by the leaderboard.", + "type": "object", + "additionalProperties": false, + "required": [ + "schema_version", + "run_id", + "params_hash", + "submitted_at", + "querygym_version", + "environment", + "pipeline", + "config", + "metrics", + "timing", + "artifacts" + ], + "properties": { + "schema_version": { + "const": 1, + "description": "Schema version. Bumping is a breaking change." 
+ }, + "run_id": { + "type": "string", + "pattern": "^[0-9a-f]{16}$", + "description": "Hash over the full payload minus volatile fields. Identifies a specific execution." + }, + "params_hash": { + "type": "string", + "pattern": "^[0-9a-f]{8}$", + "description": "Hash over the tuning surface (method_id, model, method_params, llm_config). Doubles as filename." + }, + "submitted_at": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 UTC timestamp of JSON generation. Excluded from run_id hash." + }, + "querygym_version": { + "type": "string", + "description": "querygym package __version__ at emit time." + }, + "environment": { + "type": "object", + "additionalProperties": false, + "required": ["python_version", "platform"], + "properties": { + "python_version": {"type": "string"}, + "platform": {"type": "string"}, + "git_commit": {"type": ["string", "null"]} + } + }, + "pipeline": { + "type": "object", + "additionalProperties": false, + "required": ["dataset_id", "method_id", "model", "steps_completed", "total_time_seconds"], + "properties": { + "dataset_id": { + "type": "string", + "description": "Must match a key in dataset_registry.yaml; enforced at validate time." + }, + "method_id": { + "type": "string", + "description": "Must match a name registered via @register_method; enforced at validate time." + }, + "model": { + "type": "string", + "description": "LLM model identifier (e.g. gpt-4.1-mini, qwen2.5:7b)." + }, + "steps_completed": { + "type": "array", + "items": {"enum": ["reformulate", "retrieve", "evaluate"]}, + "uniqueItems": true, + "minItems": 1 + }, + "total_time_seconds": {"type": "number", "minimum": 0} + } + }, + "config": { + "type": "object", + "additionalProperties": false, + "required": ["method_params", "llm_config", "searcher", "dataset_config"], + "properties": { + "method_params": { + "type": "object", + "description": "Method-specific parameters. Free-form per method, included in params_hash." 
+ }, + "llm_config": { + "type": "object", + "additionalProperties": true, + "required": ["temperature", "max_tokens"], + "properties": { + "temperature": {"type": "number"}, + "max_tokens": {"type": "integer", "minimum": 1}, + "top_p": {"type": "number"} + } + }, + "searcher": { + "type": "object", + "additionalProperties": false, + "required": ["name", "type"], + "properties": { + "name": {"type": "string"}, + "type": {"type": "string"} + } + }, + "dataset_config": { + "type": "object", + "additionalProperties": false, + "required": ["topics", "index", "num_queries", "bm25_weights"], + "properties": { + "topics": {"type": "string"}, + "index": {"type": "string"}, + "num_queries": {"type": "integer", "minimum": 0}, + "bm25_weights": { + "type": "object", + "additionalProperties": false, + "required": ["k1", "b"], + "properties": { + "k1": {"type": "number"}, + "b": {"type": "number"} + } + } + } + } + } + }, + "metrics": { + "type": "object", + "minProperties": 1, + "additionalProperties": {"type": "number"}, + "description": "Flat metric_name -> value. Keys must be a subset of the dataset's eval_metrics (with dots normalized to underscores); enforced at validate time." + }, + "timing": { + "type": "object", + "additionalProperties": {"type": "number", "minimum": 0}, + "description": "Per-step seconds. Common keys: reformulation_seconds, retrieval_seconds, evaluation_seconds." + }, + "artifacts": { + "type": "object", + "additionalProperties": false, + "required": ["run_file", "reformulated_queries"], + "properties": { + "run_file": { + "type": "string", + "pattern": "^[0-9a-f]{8}\\.run\\.txt$", + "description": "Sibling TREC run file. Filename must equal {params_hash}.run.txt." + }, + "reformulated_queries": { + "type": "string", + "pattern": "^[0-9a-f]{8}\\.queries\\.tsv$", + "description": "Sibling reformulated queries TSV. Filename must equal {params_hash}.queries.tsv." 
+ } + } + } + } +} diff --git a/reproducibility/schema.md b/reproducibility/schema.md new file mode 100644 index 0000000..04d48ee --- /dev/null +++ b/reproducibility/schema.md @@ -0,0 +1,117 @@ +# Run Summary Schema (v1) + +This document mirrors `reproducibility/schema.json` in human-readable form. Both files are kept in sync via `reproducibility/tests/test_repro_schema.py`, which embeds the same canonical fixture used here. + +## Top-level fields + +| Field | Type | Required | Description | +|---|---|---|---| +| `schema_version` | `int` | yes | Always `1`. Bumping is a breaking change. | +| `run_id` | `string` (16-char hex) | yes | SHA-256 prefix over the payload minus volatile fields. Identifies a specific execution. | +| `params_hash` | `string` (8-char hex) | yes | SHA-256 prefix over `(method_id, model, method_params, llm_config)`. Doubles as the on-disk filename. | +| `submitted_at` | ISO 8601 UTC | yes | Wall-clock time the JSON was generated. Excluded from `run_id`. | +| `querygym_version` | `string` | yes | `querygym.__version__` at emit time. | +| `environment` | object | yes | Python version, platform, optional git commit. | +| `pipeline` | object | yes | dataset_id, method_id, model, steps_completed, total_time_seconds. | +| `config` | object | yes | method_params, llm_config, searcher, dataset_config. | +| `metrics` | object | yes | Flat `metric_name -> float`. Must have ≥1 entry. | +| `timing` | object | yes | Per-step seconds. | +| `artifacts` | object | yes | Sibling `run_file` and `reformulated_queries` filenames. | + +## Validation rules (beyond the static schema) + +These are enforced by `reproducibility.lib.validate(...)` at runtime: + +1. `pipeline.dataset_id` must be a key in `dataset_registry.yaml`. +2. `pipeline.method_id` must be registered via `@register_method(...)` in `querygym/methods/`. +3. Each `metrics` key must be in the dataset's `output.eval_metrics` (after normalizing dots to underscores: `ndcg_cut.10` → `ndcg_cut_10`). +4. 
`params_hash` is recomputed from `(method_id, model, method_params, llm_config)` and must equal the stored value. +5. `run_id` is recomputed from the payload (minus `run_id`, `submitted_at`, `environment`) and must equal the stored value. +6. `artifacts.run_file` must equal `{params_hash}.run.txt`; `artifacts.reformulated_queries` must equal `{params_hash}.queries.tsv`. + +Hand-editing a metric value without recomputing `run_id` will fail validation (rule 5). This catches accidental or careless edits; it is not tamper-proof, because the hash is unkeyed and anyone who edits the JSON can also recompute it. Deliberate tampering is caught by the maintainer re-evaluating the sibling `.run.txt`, as described in `docs/user-guide/reproducibility.md`. + +## Hashing details + +```python +def compute_params_hash(method_id, model, method_params, llm_config) -> str: + payload = {"method_id": ..., "model": ..., "method_params": ..., "llm_config": ...} + return sha256(json.dumps(payload, sort_keys=True, separators=(",",":"))).hexdigest()[:8] + +def compute_run_id(payload) -> str: + stripped = {k: v for k, v in payload.items() if k not in ("run_id", "submitted_at", "environment")} + return sha256(json.dumps(stripped, sort_keys=True, separators=(",",":"))).hexdigest()[:16] +``` + +`json.dumps(..., sort_keys=True)` makes hashes invariant to key ordering. Hashes change when any field they cover changes. 
+ +## Canonical example + +This is `reproducibility/tests/fixtures/sample_run.json` — used by tests, embedded here, and produced from the inputs in `test_repro_schema._build_kwargs()`: + +```json +{ + "schema_version": 1, + "run_id": "cabe83ca1236a3bb", + "params_hash": "ddb15ccf", + "submitted_at": "2026-04-29T10:14:22Z", + "querygym_version": "0.3.0", + "environment": { + "python_version": "3.10.13", + "platform": "Linux-5.15.0-x86_64", + "git_commit": "5c46a51" + }, + "pipeline": { + "dataset_id": "msmarco-v1-passage.trecdl2019", + "method_id": "query2e", + "model": "gpt-4.1-mini", + "steps_completed": ["reformulate", "retrieve", "evaluate"], + "total_time_seconds": 89.37 + }, + "config": { + "method_params": {"mode": "zs"}, + "llm_config": {"temperature": 1.0, "max_tokens": 128, "top_p": 1.0}, + "searcher": {"name": "UserPyseriniWrapper", "type": "user_pyserini"}, + "dataset_config": { + "topics": "dl19-passage", + "index": "msmarco-v1-passage", + "num_queries": 43, + "bm25_weights": {"k1": 0.9, "b": 0.4} + } + }, + "metrics": { + "map": 0.3709, + "ndcg_cut_10": 0.5679, + "recall_1000": 0.8384 + }, + "timing": { + "reformulation_seconds": 65.24, + "retrieval_seconds": 3.01, + "evaluation_seconds": 10.53 + }, + "artifacts": { + "run_file": "ddb15ccf.run.txt", + "reformulated_queries": "ddb15ccf.queries.tsv" + } +} +``` + +## On-disk layout + +A run lives under: + +``` +reproducibility/data/runs/{dataset_id}/{method_id}/{model}/{params_hash}.{json,run.txt,queries.tsv} +``` + +The three sibling files together describe one run completely. The `.run.txt` is a TREC-format retrieval run that allows independent re-evaluation with `pytrec_eval`; the `.queries.tsv` lets reviewers spot-check reformulations. + +## Bumping the schema + +Future schema changes require: +1. Bumping `SCHEMA_VERSION` in `reproducibility/lib/emit.py` to 2. +2. Updating `schema.json`'s `schema_version.const` to 2. +3. Re-emitting all existing JSONs under v2 (one bulk PR). +4. 
Updating the dashboard product to consume v2. + +The schema is intentionally hard to change so that the leaderboard's history stays comparable. diff --git a/reproducibility/scripts/__init__.py b/reproducibility/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reproducibility/scripts/aggregate_runs.py b/reproducibility/scripts/aggregate_runs.py new file mode 100644 index 0000000..0c1016a --- /dev/null +++ b/reproducibility/scripts/aggregate_runs.py @@ -0,0 +1,268 @@ +"""Walk reproducibility/data/runs/, validate each JSON, emit results.csv + manifest.json. + +Deterministic by design: sorted rows, fixed column order, LF line endings, sorted JSON. +The committed CSV must equal the output of this script for any given runs/ tree, which +the CI workflow enforces via --check. +""" + +from __future__ import annotations + +import argparse +import csv +import hashlib +import io +import json +import sys +from datetime import datetime, timezone +from pathlib import Path +from typing import Iterator + +# Make `reproducibility.lib` importable when invoked as a script from the repo root. 
def _repo_relative_posix(path: Path) -> str:
    """Return *path* as a POSIX string, relative to the repo root when possible."""
    # Try the path exactly as given first so in-repo paths render as they always
    # have; then retry with the resolved form, which copes with a relative
    # --runs-dir (relative_to() against an absolute root raises ValueError).
    for candidate in (path, path.resolve()):
        try:
            return candidate.relative_to(_REPO_ROOT).as_posix()
        except ValueError:
            continue
    # The run lives outside the repository: fall back to its absolute path
    # instead of crashing the whole aggregation.
    return path.resolve().as_posix()


def _payload_to_rows(payload: dict, run_path: Path) -> list[list]:
    """Flatten one run payload into CSV rows, one row per metric.

    Args:
        payload: Parsed run JSON. It is indexed directly, so it is expected to
            have been validated by the caller already.
        run_path: Location of the run JSON on disk. May be relative to the
            current directory or live outside the repository.

    Returns:
        One list per metric, metrics in sorted name order, each laid out in
        ``CSV_COLUMNS`` order.

    Raises:
        KeyError: If the payload lacks an expected field, or ``CSV_COLUMNS``
            names a column this function does not populate.
    """
    pipeline = payload["pipeline"]
    config = payload["config"]
    llm_config = config["llm_config"]
    metrics = payload["metrics"]

    # Fields shared by every row of this run, keyed by column name so the row
    # layout is derived from CSV_COLUMNS rather than from positional indices.
    shared = {
        "schema_version": payload["schema_version"],
        "run_id": payload["run_id"],
        "dataset_id": pipeline["dataset_id"],
        "method_id": pipeline["method_id"],
        "model": pipeline["model"],
        "params_hash": payload["params_hash"],
        # Compact, key-sorted JSON keeps this cell byte-stable across runs.
        "method_params_json": json.dumps(
            config["method_params"], sort_keys=True, separators=(",", ":")
        ),
        "llm_temperature": llm_config["temperature"],
        "llm_max_tokens": llm_config["max_tokens"],
        "num_queries": config["dataset_config"]["num_queries"],
        "total_time_seconds": pipeline["total_time_seconds"],
        "querygym_version": payload["querygym_version"],
        "run_file_path": _repo_relative_posix(run_path),
    }

    rows = []
    for metric in sorted(metrics):
        record = dict(shared, metric=metric, value=metrics[metric])
        rows.append([record[column] for column in CSV_COLUMNS])
    return rows
def cmd_write(runs_dir: Path) -> int:
    """Regenerate ``results.csv`` and ``manifest.json`` from *runs_dir*.

    Args:
        runs_dir: Directory tree to walk for run JSON files.

    Returns:
        0 on success (used as the process exit code).
    """
    csv_text, manifest = aggregate(runs_dir)
    # generated_at is added only here, after aggregate() has built the
    # deterministic part of the manifest.
    manifest["generated_at"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    manifest_text = json.dumps(manifest, indent=2, sort_keys=True) + "\n"

    DATA_DIR.mkdir(parents=True, exist_ok=True)
    # Write bytes rather than text: text mode rewrites "\n" as os.linesep on
    # Windows, which would break the LF guarantee and leave the on-disk CSV
    # different from the exact bytes content_hash was computed over.
    RESULTS_CSV.write_bytes(csv_text.encode("utf-8"))
    MANIFEST_JSON.write_bytes(manifest_text.encode("utf-8"))
    print(
        f"wrote {RESULTS_CSV.relative_to(_REPO_ROOT)} "
        f"({manifest['run_count']} runs, {manifest['row_count']} rows, "
        f"content_hash={manifest['content_hash'][:12]}...)"
    )
    return 0
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the aggregator.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, so existing ``main()`` callers are unaffected.

    Returns:
        The exit code produced by ``cmd_check`` (with ``--check``) or
        ``cmd_write`` (otherwise).
    """
    parser = argparse.ArgumentParser(
        description="Aggregate run JSONs into results.csv + manifest.json."
    )
    parser.add_argument(
        "--check",
        action="store_true",
        help="Verify committed files match what the aggregator would produce. "
        "Exits non-zero on mismatch. Used by CI.",
    )
    parser.add_argument(
        "--runs-dir",
        type=Path,
        default=RUNS_DIR,
        help=f"Directory to walk for run JSONs (default: {RUNS_DIR.relative_to(_REPO_ROOT)}).",
    )
    args = parser.parse_args(argv)

    if args.check:
        # --check is a read-only verification, so it must not create the runs
        # directory. A missing directory simply yields zero runs.
        return cmd_check(args.runs_dir)

    # Write mode: make sure the tree exists so a fresh checkout can aggregate.
    if not args.runs_dir.exists():
        args.runs_dir.mkdir(parents=True, exist_ok=True)
    return cmd_write(args.runs_dir)
def _display_path(path: Path) -> Path:
    """Return *path* relative to the repo root when possible, else unchanged."""
    try:
        return path.relative_to(_REPO_ROOT)
    except ValueError:
        # A relative or out-of-repo --runs-dir: report the path as given rather
        # than crashing while formatting a message.
        return path


def main(argv: list[str] | None = None) -> int:
    """Validate a pipeline output directory and install it into the runs layout.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, so existing ``main()`` callers are unaffected.

    Returns:
        0 on success.

    Raises:
        SystemExit: If ``--from-dir`` is not a directory, validation fails, or
            any destination file already exists and ``--force`` was not given.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Validate a run output directory and copy its files into the canonical "
            "reproducibility/data/runs/ layout."
        )
    )
    parser.add_argument(
        "--from-dir",
        type=Path,
        required=True,
        help="Directory produced by examples/querygym_pyserini/pipeline.py.",
    )
    parser.add_argument(
        "--runs-dir",
        type=Path,
        default=DEFAULT_RUNS_DIR,
        help=f"Target runs directory (default: {DEFAULT_RUNS_DIR.relative_to(_REPO_ROOT)}).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Overwrite an existing run with the same params_hash.",
    )
    parser.add_argument(
        "--skip-registry-checks",
        action="store_true",
        help="Skip dataset/method registry validation (use only for synthetic test runs).",
    )
    args = parser.parse_args(argv)

    if not args.from_dir.is_dir():
        raise SystemExit(f"--from-dir does not exist: {args.from_dir}")

    summary, run_file, queries = _resolve_inputs(args.from_dir)

    with summary.open("r", encoding="utf-8") as f:
        payload = json.load(f)

    try:
        validate(payload, skip_registry_checks=args.skip_registry_checks)
    except ValidationError as e:
        raise SystemExit(f"validation failed for {summary}: {e}") from e

    target_dir = _canonical_dir(args.runs_dir, payload)
    h = payload["params_hash"]

    json_dst = target_dir / f"{h}.json"
    run_dst = target_dir / f"{h}.run.txt"
    queries_dst = target_dir / f"{h}.queries.tsv"

    # Check every destination before touching the filesystem, so a collision on
    # a sibling artifact cannot leave a half-submitted run (JSON written, run
    # file refused).
    if not args.force:
        for dst in (json_dst, run_dst, queries_dst):
            if dst.exists():
                raise SystemExit(
                    f"refusing to overwrite {_display_path(dst)} (use --force)."
                )

    target_dir.mkdir(parents=True, exist_ok=True)

    # Serialize the in-memory payload that was just validated instead of copying
    # the summary file, so what lands on disk is exactly the object that passed.
    with json_dst.open("w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, sort_keys=False)
        f.write("\n")

    _copy(run_file, run_dst, force=args.force)
    _copy(queries, queries_dst, force=args.force)

    rel = _display_path(json_dst).as_posix()
    print(f"wrote {rel}")
    print("Now run:\n make repro-aggregate\nand commit the diff.")
    return 0
a/reproducibility/tests/test_repro_schema.py b/reproducibility/tests/test_repro_schema.py new file mode 100644 index 0000000..c7d2c99 --- /dev/null +++ b/reproducibility/tests/test_repro_schema.py @@ -0,0 +1,216 @@ +"""Tests for reproducibility.lib (emit + validate).""" + +from __future__ import annotations + +import copy +import json +from pathlib import Path + +import pytest + +from reproducibility.lib import ( + SCHEMA_VERSION, + ValidationError, + build_run_summary, + compute_params_hash, + compute_run_id, + validate, +) + +FIXTURE = Path(__file__).parent / "fixtures" / "sample_run.json" + + +def _load_fixture() -> dict: + with FIXTURE.open("r", encoding="utf-8") as f: + return json.load(f) + + +def _build_kwargs() -> dict: + """Inputs that produce the canonical fixture, modulo volatile fields.""" + return dict( + dataset_id="msmarco-v1-passage.trecdl2019", + method_id="query2e", + model="gpt-4.1-mini", + method_params={"mode": "zs"}, + llm_config={"temperature": 1.0, "max_tokens": 128, "top_p": 1.0}, + searcher={"name": "UserPyseriniWrapper", "type": "user_pyserini"}, + dataset_config={ + "topics": "dl19-passage", + "index": "msmarco-v1-passage", + "num_queries": 43, + "bm25_weights": {"k1": 0.9, "b": 0.4}, + }, + metrics={"map": 0.3709, "ndcg_cut_10": 0.5679, "recall_1000": 0.8384}, + timing={ + "reformulation_seconds": 65.24, + "retrieval_seconds": 3.01, + "evaluation_seconds": 10.53, + }, + steps_completed=["reformulate", "retrieve", "evaluate"], + total_time_seconds=89.37, + # Pin volatile fields for determinism in tests. 
+ submitted_at="2026-04-29T10:14:22Z", + environment={ + "python_version": "3.10.13", + "platform": "Linux-5.15.0-x86_64", + "git_commit": "5c46a51", + }, + querygym_version="0.3.0", + ) + + +# ---------- Hash properties -------------------------------------------------- + + +def test_params_hash_is_8_hex(): + h = compute_params_hash("query2e", "gpt-4.1-mini", {"mode": "zs"}, {"t": 1.0}) + assert len(h) == 8 + assert all(c in "0123456789abcdef" for c in h) + + +def test_params_hash_is_stable(): + a = compute_params_hash("query2e", "gpt-4.1-mini", {"mode": "zs"}, {"temperature": 1.0}) + b = compute_params_hash("query2e", "gpt-4.1-mini", {"mode": "zs"}, {"temperature": 1.0}) + assert a == b + + +def test_params_hash_changes_on_temperature(): + a = compute_params_hash("query2e", "gpt-4.1-mini", {"mode": "zs"}, {"temperature": 1.0}) + b = compute_params_hash("query2e", "gpt-4.1-mini", {"mode": "zs"}, {"temperature": 0.5}) + assert a != b + + +def test_params_hash_invariant_to_key_order(): + a = compute_params_hash("query2e", "gpt-4.1-mini", {"a": 1, "b": 2}, {"x": 1, "y": 2}) + b = compute_params_hash("query2e", "gpt-4.1-mini", {"b": 2, "a": 1}, {"y": 2, "x": 1}) + assert a == b + + +def test_run_id_excludes_volatile_fields(): + payload = _load_fixture() + rid_a = compute_run_id(payload) + payload2 = copy.deepcopy(payload) + payload2["submitted_at"] = "2099-12-31T23:59:59Z" + payload2["environment"] = {"python_version": "9.9", "platform": "any", "git_commit": None} + rid_b = compute_run_id(payload2) + assert rid_a == rid_b + + +def test_run_id_changes_on_metric_change(): + payload = _load_fixture() + rid_a = compute_run_id(payload) + payload2 = copy.deepcopy(payload) + payload2["metrics"]["map"] = 0.9999 + rid_b = compute_run_id(payload2) + assert rid_a != rid_b + + +# ---------- build_run_summary ------------------------------------------------ + + +def test_build_run_summary_matches_fixture(): + """The fixture should be exactly what build_run_summary produces from 
def test_validator_rejects_wrong_schema_version():
    """A payload declaring a version other than the current one is rejected."""
    payload = _load_fixture()
    # Derive the bad value from SCHEMA_VERSION instead of hard-coding 2, so the
    # test keeps exercising the rejection path after the schema is bumped.
    # NOTE(review): assumes SCHEMA_VERSION is an int, as the fixture's
    # "schema_version": 1 suggests.
    payload["schema_version"] = SCHEMA_VERSION + 1
    with pytest.raises(ValidationError):
        validate(payload)
def test_validator_skip_registry_checks_allows_unknown_ids():
    """An unknown dataset id fails by default but passes with registry checks off."""
    kwargs = _build_kwargs()
    kwargs["dataset_id"] = "synthetic-test-dataset"
    payload = build_run_summary(**kwargs)
    # Negative control: with registry checks on, the synthetic id must be
    # rejected. Without this the test would pass vacuously if the id were ever
    # accepted by the registry.
    with pytest.raises(ValidationError, match="dataset_id"):
        validate(payload)
    # Schema/hash checks still pass; only the registry check is skipped.
    validate(payload, skip_registry_checks=True)