From b126468048cde306c1998a5074ac9c774884f1ef Mon Sep 17 00:00:00 2001 From: Radin Hamidi Rad Date: Wed, 29 Apr 2026 16:43:22 -0400 Subject: [PATCH 1/5] Add pythonpath = ["."] to pytest config CI runs `pytest reproducibility/tests` directly (not `python -m pytest`), which doesn't add the repo root to sys.path. Without the explicit pythonpath, `from reproducibility.lib import ...` fails with ModuleNotFoundError. Pin it in [tool.pytest.ini_options] so the test setup works regardless of invocation style. Co-Authored-By: Claude Opus 4.7 (1M context) --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 6553242..ec888f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,6 +106,10 @@ ignore = ["E501"] # Line too long (handled by black) # Pytest configuration [tool.pytest.ini_options] testpaths = ["tests", "reproducibility/tests"] +# Make the repo root importable so `from reproducibility.lib import ...` works +# when pytest is invoked as `pytest reproducibility/tests` (i.e. without +# `python -m`, which is how the CI workflow runs it). +pythonpath = ["."] python_files = ["test_*.py"] python_classes = ["Test*"] python_functions = ["test_*"] From 6be4068d0ffcd1211aea904ac9dfd537ac3b2cb4 Mon Sep 17 00:00:00 2001 From: Radin Hamidi Rad Date: Wed, 29 Apr 2026 16:44:31 -0400 Subject: [PATCH 2/5] Trigger repro CI on pyproject.toml + workflow changes The workflow's path filter previously skipped any change that didn't touch reproducibility/**, the example pipeline, or dataset_registry.yaml. That meant the pyproject.toml pytest config fix (previous commit) didn't actually re-run CI. Add pyproject.toml and the workflow file itself to the path filter, plus workflow_dispatch for manual reruns. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/reproducibility-check.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/reproducibility-check.yml b/.github/workflows/reproducibility-check.yml index 896d8c9..96213df 100644 --- a/.github/workflows/reproducibility-check.yml +++ b/.github/workflows/reproducibility-check.yml @@ -6,12 +6,17 @@ on: - 'reproducibility/**' - 'examples/querygym_pyserini/pipeline.py' - 'dataset_registry.yaml' + - 'pyproject.toml' + - '.github/workflows/reproducibility-check.yml' push: branches: [main] paths: - 'reproducibility/**' - 'examples/querygym_pyserini/pipeline.py' - 'dataset_registry.yaml' + - 'pyproject.toml' + - '.github/workflows/reproducibility-check.yml' + workflow_dispatch: concurrency: group: repro-${{ github.ref }} From 45f221970ed2b31f8054f08a9c61b84068958ba0 Mon Sep 17 00:00:00 2001 From: Radin Hamidi Rad Date: Wed, 29 Apr 2026 16:47:08 -0400 Subject: [PATCH 3/5] Run repro tests with python -m pytest (so cwd is on sys.path) Even with pythonpath = ["."] in pytest config, CI's Python 3.9 + pytest 8.4 + cov plugin combination doesn't make `reproducibility` importable as a top-level package when invoked as `pytest`. `python -m pytest` always prepends cwd to sys.path, which fixes it. This is the canonical workaround for "from <package> import ..." in CI without making the package pip-installable. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/reproducibility-check.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/reproducibility-check.yml b/.github/workflows/reproducibility-check.yml index 96213df..0434a7f 100644 --- a/.github/workflows/reproducibility-check.yml +++ b/.github/workflows/reproducibility-check.yml @@ -41,7 +41,10 @@ jobs: pip install pytest-cov - name: Run repro tests - run: pytest reproducibility/tests -v --no-cov + # Use `python -m pytest` so the cwd (repo root) is on sys.path — + # `from reproducibility.lib import ...` needs that, and the bare + # `pytest` form doesn't add it on CI's Python 3.9. + run: python -m pytest reproducibility/tests -v --no-cov - name: Aggregator --check run: python -m reproducibility.scripts.aggregate_runs --check From c6031614c0d707914c79886257531e2cca824e35 Mon Sep 17 00:00:00 2001 From: Radin Hamidi Rad Date: Wed, 29 Apr 2026 16:50:10 -0400 Subject: [PATCH 4/5] Add missing reproducibility/lib/ files (gitignore collision fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The standard Python .gitignore line `lib/` (no leading slash) was matching reproducibility/lib/ everywhere in the tree, which silently hid emit.py, validate.py, and __init__.py from the previous commit. CI couldn't import reproducibility.lib because the directory itself didn't exist on the remote — explaining the persistent ModuleNotFoundError despite the pytest config + python -m pytest fixes. Anchor the Python distribution directories (build/, dist/, lib/, lib64/, eggs/, etc.) to the repo root with a leading slash so they match only top-level dirs, not nested project directories that legitimately use the same names. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 25 +++-- reproducibility/lib/__init__.py | 24 +++++ reproducibility/lib/emit.py | 156 +++++++++++++++++++++++++++ reproducibility/lib/validate.py | 182 ++++++++++++++++++++++++++++++++ 4 files changed, 376 insertions(+), 11 deletions(-) create mode 100644 reproducibility/lib/__init__.py create mode 100644 reproducibility/lib/emit.py create mode 100644 reproducibility/lib/validate.py diff --git a/.gitignore b/.gitignore index a0f5f71..941862a 100644 --- a/.gitignore +++ b/.gitignore @@ -8,17 +8,20 @@ __pycache__/ # Distribution / packaging .Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ +# Anchored to repo root so these Python build dirs don't collide with project +# directories that legitimately use the same names (e.g. reproducibility/lib/, +# web/site/dist/). +/build/ +/develop-eggs/ +/dist/ +/downloads/ +/eggs/ +/.eggs/ +/lib/ +/lib64/ +/parts/ +/sdist/ +/var/ wheels/ pip-wheel-metadata/ share/python-wheels/ diff --git a/reproducibility/lib/__init__.py b/reproducibility/lib/__init__.py new file mode 100644 index 0000000..53902af --- /dev/null +++ b/reproducibility/lib/__init__.py @@ -0,0 +1,24 @@ +"""Private Python helpers for QueryGym reproducibility tooling. + +The contract that crosses repo boundaries is `reproducibility/schema.json`. +This module is internal to this repository and used by the example pipeline, +the aggregator, the submission tool, and the tests. External consumers +(the dashboard product, third-party tools) should read schema.json directly. 
+""" + +from .emit import ( + SCHEMA_VERSION, + build_run_summary, + compute_params_hash, + compute_run_id, +) +from .validate import validate, ValidationError + +__all__ = [ + "SCHEMA_VERSION", + "build_run_summary", + "compute_params_hash", + "compute_run_id", + "validate", + "ValidationError", +] diff --git a/reproducibility/lib/emit.py b/reproducibility/lib/emit.py new file mode 100644 index 0000000..32c313d --- /dev/null +++ b/reproducibility/lib/emit.py @@ -0,0 +1,156 @@ +"""Build canonical run-summary JSON payloads conformant to schema v1.""" + +from __future__ import annotations + +import hashlib +import json +import platform as _platform +import subprocess +import sys +from datetime import datetime, timezone +from typing import Any, Mapping + +SCHEMA_VERSION = 1 + + +def _stable_json(payload: Any) -> str: + """Serialize with sorted keys and no whitespace — deterministic across runs.""" + return json.dumps(payload, sort_keys=True, separators=(",", ":"), ensure_ascii=False) + + +def compute_params_hash( + method_id: str, + model: str, + method_params: Mapping[str, Any], + llm_config: Mapping[str, Any], +) -> str: + """8-char hex hash over the tuning surface. + + Same config -> same hash (re-run replaces the previous file). + Different temperature -> different hash (no collision). + """ + payload = { + "method_id": method_id, + "model": model, + "method_params": dict(method_params), + "llm_config": dict(llm_config), + } + digest = hashlib.sha256(_stable_json(payload).encode("utf-8")).hexdigest() + return digest[:8] + + +# Fields excluded from run_id hash because they would make identical executions +# produce different ids. Keep this list in sync with schema.json's volatile fields. +_RUN_ID_EXCLUDED_FIELDS = ("run_id", "submitted_at", "environment") + + +def compute_run_id(payload: Mapping[str, Any]) -> str: + """16-char hex hash over the payload minus volatile fields. 
+ + Two distinct executions of the same logical experiment that produce identical + metrics -> same run_id. Two executions whose results differ -> different run_ids. + """ + stripped = {k: v for k, v in payload.items() if k not in _RUN_ID_EXCLUDED_FIELDS} + digest = hashlib.sha256(_stable_json(stripped).encode("utf-8")).hexdigest() + return digest[:16] + + +def _detect_environment() -> dict: + """Best-effort capture of the runner's environment. Never raises.""" + env = { + "python_version": ".".join(map(str, sys.version_info[:3])), + "platform": _platform.platform(), + "git_commit": None, + } + try: + commit = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + capture_output=True, + text=True, + timeout=2, + check=False, + ) + if commit.returncode == 0 and commit.stdout.strip(): + env["git_commit"] = commit.stdout.strip() + except (OSError, subprocess.SubprocessError): + pass + return env + + +def _querygym_version() -> str: + """Read querygym.__version__ if available, else 'unknown'.""" + try: + import querygym # type: ignore + return getattr(querygym, "__version__", "unknown") + except ImportError: + return "unknown" + + +def build_run_summary( + *, + dataset_id: str, + method_id: str, + model: str, + method_params: Mapping[str, Any], + llm_config: Mapping[str, Any], + searcher: Mapping[str, Any], + dataset_config: Mapping[str, Any], + metrics: Mapping[str, float], + timing: Mapping[str, float], + steps_completed: list, + total_time_seconds: float, + submitted_at: str | None = None, + environment: Mapping[str, Any] | None = None, + querygym_version: str | None = None, +) -> dict: + """Assemble a schema-v1 run summary dict. + + Computes params_hash and run_id internally so callers can't get them wrong. + The optional submitted_at / environment / querygym_version overrides exist + for tests that need deterministic output; in normal use, leave them None. 
+ + The returned dict validates against reproducibility/schema.json by + construction, but callers should still pass it through validate(...) before + writing — that adds runtime checks against dataset/method registries. + """ + params_hash = compute_params_hash(method_id, model, method_params, llm_config) + + payload: dict = { + "schema_version": SCHEMA_VERSION, + "run_id": "", # filled in below + "params_hash": params_hash, + "submitted_at": submitted_at + or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "querygym_version": querygym_version or _querygym_version(), + "environment": dict(environment) if environment is not None else _detect_environment(), + "pipeline": { + "dataset_id": dataset_id, + "method_id": method_id, + "model": model, + "steps_completed": list(steps_completed), + "total_time_seconds": float(total_time_seconds), + }, + "config": { + "method_params": dict(method_params), + "llm_config": dict(llm_config), + "searcher": {"name": searcher["name"], "type": searcher["type"]}, + "dataset_config": { + "topics": dataset_config["topics"], + "index": dataset_config["index"], + "num_queries": int(dataset_config["num_queries"]), + "bm25_weights": { + "k1": float(dataset_config["bm25_weights"]["k1"]), + "b": float(dataset_config["bm25_weights"]["b"]), + }, + }, + }, + "metrics": {k: float(v) for k, v in metrics.items()}, + "timing": {k: float(v) for k, v in timing.items()}, + "artifacts": { + "run_file": f"{params_hash}.run.txt", + "reformulated_queries": f"{params_hash}.queries.tsv", + }, + } + + payload["run_id"] = compute_run_id(payload) + return payload diff --git a/reproducibility/lib/validate.py b/reproducibility/lib/validate.py new file mode 100644 index 0000000..43f0f87 --- /dev/null +++ b/reproducibility/lib/validate.py @@ -0,0 +1,182 @@ +"""Validate run-summary payloads against schema.json + runtime registries.""" + +from __future__ import annotations + +import json +from functools import lru_cache +from pathlib import Path +from 
typing import Any, Iterable, Mapping + +from .emit import compute_params_hash, compute_run_id + +_REPO_ROOT = Path(__file__).resolve().parents[2] +_SCHEMA_PATH = Path(__file__).resolve().parent.parent / "schema.json" +_DEFAULT_DATASET_REGISTRY_PATH = _REPO_ROOT / "dataset_registry.yaml" + + +class ValidationError(ValueError): + """Raised when a payload fails schema or registry validation.""" + + +@lru_cache(maxsize=1) +def _load_schema() -> dict: + with _SCHEMA_PATH.open("r", encoding="utf-8") as f: + return json.load(f) + + +def _load_dataset_registry(path: Path | str | None) -> dict: + """Load dataset_registry.yaml. Returns the raw 'datasets' mapping.""" + import yaml # pyyaml is a main dep + + p = Path(path) if path else _DEFAULT_DATASET_REGISTRY_PATH + with p.open("r", encoding="utf-8") as f: + registry = yaml.safe_load(f) + return registry.get("datasets", {}) + + +def _load_method_registry() -> Iterable[str]: + """Return registered method ids from querygym.core.registry.METHODS.""" + try: + import querygym # noqa: F401 # ensure methods register on import + from querygym.core.registry import METHODS + + return list(METHODS.keys()) + except ImportError: + return [] + + +def _normalize_metric_key(name: str) -> str: + """trec_eval reports use underscores; dataset_registry uses dot notation.""" + return name.replace(".", "_") + + +def _jsonschema_validate(payload: Mapping[str, Any]) -> None: + """Run the static JSON Schema validator. Raises ValidationError on drift.""" + try: + import jsonschema + except ImportError as e: + raise ValidationError( + "jsonschema is required for validation. " + "Install with: pip install querygym[repro]" + ) from e + + schema = _load_schema() + try: + jsonschema.validate(instance=payload, schema=schema) + except jsonschema.ValidationError as e: + # Translate the field path into something readable. 
+ path = "/".join(str(p) for p in e.absolute_path) or "" + raise ValidationError(f"schema violation at '{path}': {e.message}") from e + + +def validate( + payload: Mapping[str, Any], + *, + dataset_registry: Mapping[str, Any] | None = None, + method_registry: Iterable[str] | None = None, + dataset_registry_path: Path | str | None = None, + skip_registry_checks: bool = False, +) -> None: + """Validate a run-summary payload. + + Three layers: + 1. JSON Schema (reproducibility/schema.json) — types, enums, required fields. + 2. Registry checks — dataset_id, method_id, metric whitelist. + 3. Hash checks — recompute params_hash and run_id, compare to stored values. + + Pass skip_registry_checks=True only in tests that intentionally use unknown ids. + """ + _jsonschema_validate(payload) + + if not skip_registry_checks: + if dataset_registry is None: + dataset_registry = _load_dataset_registry(dataset_registry_path) + if method_registry is None: + method_registry = _load_method_registry() + + _validate_registries(payload, dataset_registry, method_registry) + + _validate_hashes(payload) + + +def _validate_registries( + payload: Mapping[str, Any], + dataset_registry: Mapping[str, Any], + method_registry: Iterable[str], +) -> None: + pipeline = payload["pipeline"] + dataset_id = pipeline["dataset_id"] + method_id = pipeline["method_id"] + + if dataset_id not in dataset_registry: + # Provide closest-match hint to catch typos. 
+ candidates = sorted(dataset_registry.keys()) + hint = _closest(dataset_id, candidates) + suffix = f" (did you mean '{hint}'?)" if hint else "" + raise ValidationError( + f"dataset_id '{dataset_id}' not in dataset_registry.yaml{suffix}" + ) + + method_set = set(method_registry) + if method_set and method_id not in method_set: + hint = _closest(method_id, sorted(method_set)) + suffix = f" (did you mean '{hint}'?)" if hint else "" + raise ValidationError( + f"method_id '{method_id}' not in registered methods{suffix}" + ) + + # Metric whitelist comes from dataset_registry; normalize dot/underscore. + allowed_raw = ( + dataset_registry[dataset_id].get("output", {}).get("eval_metrics") or [] + ) + allowed = {_normalize_metric_key(m) for m in allowed_raw} + if not allowed: + # No whitelist configured for this dataset; skip the check. + return + + metrics = payload["metrics"] + unknown = [m for m in metrics.keys() if m not in allowed] + if unknown: + raise ValidationError( + f"metric(s) {sorted(unknown)} not in eval_metrics for dataset " + f"'{dataset_id}' (allowed: {sorted(allowed)})" + ) + + +def _validate_hashes(payload: Mapping[str, Any]) -> None: + config = payload["config"] + pipeline = payload["pipeline"] + + expected_params = compute_params_hash( + method_id=pipeline["method_id"], + model=pipeline["model"], + method_params=config["method_params"], + llm_config=config["llm_config"], + ) + if payload["params_hash"] != expected_params: + raise ValidationError( + f"params_hash mismatch: stored '{payload['params_hash']}', " + f"recomputed '{expected_params}'. The JSON has been edited or was " + f"generated by an inconsistent emitter." + ) + + expected_run = compute_run_id(payload) + if payload["run_id"] != expected_run: + raise ValidationError( + f"run_id mismatch: stored '{payload['run_id']}', " + f"recomputed '{expected_run}'." 
+ ) + + +def _closest(name: str, candidates: list[str]) -> str | None: + """Return the closest candidate by simple character-set overlap.""" + if not candidates: + return None + name_chars = set(name.lower()) + scored = sorted( + candidates, + key=lambda c: -len(name_chars & set(c.lower())), + ) + best = scored[0] + overlap = len(name_chars & set(best.lower())) + return best if overlap >= max(3, len(name_chars) // 3) else None From 6130430bc30bcc3a9f89e758353f472fdf87b994 Mon Sep 17 00:00:00 2001 From: Radin Hamidi Rad Date: Wed, 29 Apr 2026 16:52:27 -0400 Subject: [PATCH 5/5] Drop querygym_version from aggregator --check comparison MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contributors generate manifest.json on machines that may or may not have querygym installed (the lib reads __version__ via lazy import and falls back to "unknown" on ImportError). CI always has querygym installed via `pip install -e ".[repro,dev]"`, so the regenerated manifest's querygym_version differed from the committed one and --check failed on a purely informational field. content_hash still pins the actual aggregate data byte-for-byte. schema_version, run_count, and row_count remain in the comparison — those are real correctness signals. Co-Authored-By: Claude Opus 4.7 (1M context) --- reproducibility/scripts/aggregate_runs.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/reproducibility/scripts/aggregate_runs.py b/reproducibility/scripts/aggregate_runs.py index 0c1016a..d075670 100644 --- a/reproducibility/scripts/aggregate_runs.py +++ b/reproducibility/scripts/aggregate_runs.py @@ -209,8 +209,11 @@ def cmd_check(runs_dir: Path) -> int: if committed_manifest is None: failures.append(f"{MANIFEST_JSON.relative_to(_REPO_ROOT)} is missing") else: - # Compare everything except generated_at (which is intentionally volatile). 
- for key in ("schema_version", "querygym_version", "run_count", "row_count", "content_hash"): + # Compare data-correctness fields only. querygym_version and generated_at + # are informational provenance — they reflect *where* and *when* the + # manifest was produced and are expected to differ between contributor + # machines and CI. content_hash already pins the actual aggregate data. + for key in ("schema_version", "run_count", "row_count", "content_hash"): committed_val = committed_manifest.get(key) fresh_val = manifest.get(key) if committed_val != fresh_val: