Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .github/workflows/reproducibility-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,17 @@ on:
- 'reproducibility/**'
- 'examples/querygym_pyserini/pipeline.py'
- 'dataset_registry.yaml'
- 'pyproject.toml'
- '.github/workflows/reproducibility-check.yml'
push:
branches: [main]
paths:
- 'reproducibility/**'
- 'examples/querygym_pyserini/pipeline.py'
- 'dataset_registry.yaml'
- 'pyproject.toml'
- '.github/workflows/reproducibility-check.yml'
workflow_dispatch:

concurrency:
group: repro-${{ github.ref }}
Expand All @@ -36,7 +41,10 @@ jobs:
pip install pytest-cov

- name: Run repro tests
run: pytest reproducibility/tests -v --no-cov
# Use `python -m pytest` so the cwd (repo root) is on sys.path —
# `from reproducibility.lib import ...` needs that, and the bare
# `pytest` form doesn't add it on CI's Python 3.9.
run: python -m pytest reproducibility/tests -v --no-cov

- name: Aggregator --check
run: python -m reproducibility.scripts.aggregate_runs --check
25 changes: 14 additions & 11 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,20 @@ __pycache__/

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
# Anchored to repo root so these Python build dirs don't collide with project
# directories that legitimately use the same names (e.g. reproducibility/lib/,
# web/site/dist/).
/build/
/develop-eggs/
/dist/
/downloads/
/eggs/
/.eggs/
/lib/
/lib64/
/parts/
/sdist/
/var/
wheels/
pip-wheel-metadata/
share/python-wheels/
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ ignore = ["E501"] # Line too long (handled by black)
# Pytest configuration
[tool.pytest.ini_options]
testpaths = ["tests", "reproducibility/tests"]
# Make the repo root importable so `from reproducibility.lib import ...` works
# when pytest is invoked as `pytest reproducibility/tests` (i.e. without
# `python -m`, which is how the CI workflow runs it).
pythonpath = ["."]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
Expand Down
24 changes: 24 additions & 0 deletions reproducibility/lib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Private Python helpers for QueryGym reproducibility tooling.

The contract that crosses repo boundaries is `reproducibility/schema.json`.
This module is internal to this repository and used by the example pipeline,
the aggregator, the submission tool, and the tests. External consumers
(the dashboard product, third-party tools) should read schema.json directly.
"""

from .emit import (
SCHEMA_VERSION,
build_run_summary,
compute_params_hash,
compute_run_id,
)
from .validate import validate, ValidationError

__all__ = [
"SCHEMA_VERSION",
"build_run_summary",
"compute_params_hash",
"compute_run_id",
"validate",
"ValidationError",
]
156 changes: 156 additions & 0 deletions reproducibility/lib/emit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
"""Build canonical run-summary JSON payloads conformant to schema v1."""

from __future__ import annotations

import hashlib
import json
import platform as _platform
import subprocess
import sys
from datetime import datetime, timezone
from typing import Any, Mapping

SCHEMA_VERSION = 1


def _stable_json(payload: Any) -> str:
"""Serialize with sorted keys and no whitespace — deterministic across runs."""
return json.dumps(payload, sort_keys=True, separators=(",", ":"), ensure_ascii=False)


def compute_params_hash(
    method_id: str,
    model: str,
    method_params: Mapping[str, Any],
    llm_config: Mapping[str, Any],
) -> str:
    """Return an 8-hex-char digest of the experiment's tuning surface.

    Identical configurations map to the same hash, so a re-run replaces the
    earlier artifacts; changing any knob (e.g. temperature) yields a fresh
    hash and therefore a distinct set of artifact filenames.
    """
    surface = {
        "method_id": method_id,
        "model": model,
        "method_params": dict(method_params),
        "llm_config": dict(llm_config),
    }
    return hashlib.sha256(_stable_json(surface).encode("utf-8")).hexdigest()[:8]


# Volatile fields stripped before hashing: including them would give two
# byte-identical experiment executions different ids. Keep this list in sync
# with schema.json's volatile fields.
_RUN_ID_EXCLUDED_FIELDS = ("run_id", "submitted_at", "environment")


def compute_run_id(payload: Mapping[str, Any]) -> str:
    """Return a 16-hex-char content hash of *payload* minus volatile fields.

    Re-running the same logical experiment and getting identical metrics
    reproduces the same run_id; any difference in results produces a new one.
    """
    hashable = {
        key: value
        for key, value in payload.items()
        if key not in _RUN_ID_EXCLUDED_FIELDS
    }
    return hashlib.sha256(_stable_json(hashable).encode("utf-8")).hexdigest()[:16]


def _detect_environment() -> dict:
"""Best-effort capture of the runner's environment. Never raises."""
env = {
"python_version": ".".join(map(str, sys.version_info[:3])),
"platform": _platform.platform(),
"git_commit": None,
}
try:
commit = subprocess.run(
["git", "rev-parse", "--short", "HEAD"],
capture_output=True,
text=True,
timeout=2,
check=False,
)
if commit.returncode == 0 and commit.stdout.strip():
env["git_commit"] = commit.stdout.strip()
except (OSError, subprocess.SubprocessError):
pass
return env


def _querygym_version() -> str:
"""Read querygym.__version__ if available, else 'unknown'."""
try:
import querygym # type: ignore
return getattr(querygym, "__version__", "unknown")
except ImportError:
return "unknown"


def build_run_summary(
    *,
    dataset_id: str,
    method_id: str,
    model: str,
    method_params: Mapping[str, Any],
    llm_config: Mapping[str, Any],
    searcher: Mapping[str, Any],
    dataset_config: Mapping[str, Any],
    metrics: Mapping[str, float],
    timing: Mapping[str, float],
    steps_completed: list,
    total_time_seconds: float,
    submitted_at: str | None = None,
    environment: Mapping[str, Any] | None = None,
    querygym_version: str | None = None,
) -> dict:
    """Assemble a schema-v1 run summary dict.

    params_hash and run_id are derived here rather than accepted from the
    caller, so they cannot drift out of sync with the payload. The optional
    submitted_at / environment / querygym_version overrides let tests pin
    the nondeterministic fields; production callers leave them None.

    The result conforms to reproducibility/schema.json by construction, but
    callers should still run it through validate(...) before writing — that
    adds runtime checks against the dataset/method registries which this
    function cannot perform.
    """
    params_hash = compute_params_hash(method_id, model, method_params, llm_config)

    pipeline_section = {
        "dataset_id": dataset_id,
        "method_id": method_id,
        "model": model,
        "steps_completed": list(steps_completed),
        "total_time_seconds": float(total_time_seconds),
    }

    weights = dataset_config["bm25_weights"]
    config_section = {
        "method_params": dict(method_params),
        "llm_config": dict(llm_config),
        "searcher": {"name": searcher["name"], "type": searcher["type"]},
        "dataset_config": {
            "topics": dataset_config["topics"],
            "index": dataset_config["index"],
            "num_queries": int(dataset_config["num_queries"]),
            "bm25_weights": {
                "k1": float(weights["k1"]),
                "b": float(weights["b"]),
            },
        },
    }

    summary: dict = {
        "schema_version": SCHEMA_VERSION,
        # Placeholder: run_id must hash the finished payload, so it is
        # patched in as the final step.
        "run_id": "",
        "params_hash": params_hash,
        "submitted_at": submitted_at
        or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "querygym_version": querygym_version or _querygym_version(),
        "environment": dict(environment) if environment is not None else _detect_environment(),
        "pipeline": pipeline_section,
        "config": config_section,
        "metrics": {name: float(value) for name, value in metrics.items()},
        "timing": {phase: float(seconds) for phase, seconds in timing.items()},
        # Artifact filenames are keyed by params_hash so a re-run of the same
        # configuration overwrites its previous outputs.
        "artifacts": {
            "run_file": f"{params_hash}.run.txt",
            "reformulated_queries": f"{params_hash}.queries.tsv",
        },
    }

    summary["run_id"] = compute_run_id(summary)
    return summary
Loading
Loading