Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 36 additions & 8 deletions reproducibility/site/scripts/build-data.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ interface RunDetail {
params_hash: string;
dataset_id: string;
method_id: string;
method_display: string;
model: string;
retriever_id: string;
retriever_display: string;
Expand Down Expand Up @@ -195,11 +196,16 @@ function readRunDetails(retrievers: Record<string, { display_name: string; parad
const retr = payload.config?.retrieval ?? {};
const retrId = retr.retriever_id ?? "";
const artifacts = payload.artifacts ?? {};
const lm = logicalMethod(
payload.pipeline.method_id,
JSON.stringify(payload.config?.method_params ?? {}),
);
out[payload.run_id] = {
run_id: payload.run_id,
params_hash: hash,
dataset_id: payload.pipeline.dataset_id,
method_id: payload.pipeline.method_id,
method_display: lm.display,
model: payload.pipeline.model,
model_display: displayModel(payload.pipeline.model),
retriever_id: retrId,
Expand Down Expand Up @@ -235,10 +241,9 @@ function buildPerDatasetViews(
}

for (const [datasetId, dsRows] of byDataset) {
const allowed = datasets[datasetId]?.eval_metrics ?? [];

// Pivot to one row per (logical_method, model, retriever). Variants are
// folded by max value per metric — matches the home matrix.
// folded by max value per metric — matches the home matrix. Track run_id
// per metric so the "best" cell links to the run that achieved it.
const map = new Map<string, any>();
for (const r of dsRows) {
const lm = logicalMethod(r.method_id, r.method_params_json);
Expand All @@ -251,21 +256,42 @@ function buildPerDatasetViews(
model_display: displayModel(r.model),
retriever_id: r.retriever_id,
retriever_display: r.retriever,
run_id: r.run_id, // populated/overwritten by the best cell
run_ids: {} as Record<string, string>, // metric → run_id of the winning value
metrics: {} as Record<string, number>,
best_for: {} as Record<string, boolean>,
});
}
const row = map.get(key);
if (row.metrics[r.metric] === undefined || r.value > row.metrics[r.metric]) {
row.metrics[r.metric] = r.value;
row.run_id = r.run_id;
row.run_ids[r.metric] = r.run_id;
}
}

// Discover which metrics actually exist in the data — the registry's
// eval_metrics is aspirational and may over-specify (e.g. MAP on DL,
// recall_1000 on BEIR). Render only what we have.
const present = new Set<string>();
for (const row of map.values()) {
for (const m of Object.keys(row.metrics)) present.add(m);
}
const allMetrics = Array.from(present);
const primary = present.has("ndcg_cut_10") ? "ndcg_cut_10" : allMetrics[0] ?? null;
const secondary = present.has("recall_1000")
? "recall_1000"
: present.has("recall_100")
? "recall_100"
: allMetrics.find((m) => m !== primary) ?? null;
// Order: primary first, secondary second, then anything else.
const orderedMetrics = [
...(primary ? [primary] : []),
...(secondary && secondary !== primary ? [secondary] : []),
...allMetrics.filter((m) => m !== primary && m !== secondary),
];

// best_for flags relative to the rows above.
const list = Array.from(map.values());
for (const m of allowed) {
for (const m of orderedMetrics) {
let best = -Infinity;
let bestRow: any = null;
for (const row of list) {
Expand All @@ -277,8 +303,10 @@ function buildPerDatasetViews(

writeJSON(path.join(VIEWS_DIR, `dataset-${datasetId}.json`), {
dataset_id: datasetId,
dataset: datasets[datasetId] ?? { id: datasetId, name: datasetId, eval_metrics: allowed },
metric_columns: allowed,
dataset: datasets[datasetId] ?? { id: datasetId, name: datasetId, eval_metrics: orderedMetrics },
metric_columns: orderedMetrics,
primary_metric: primary,
secondary_metric: secondary,
runs: list,
});
}
Expand Down
131 changes: 131 additions & 0 deletions reproducibility/site/src/components/FilterChips.astro
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
---
/**
 * Chip-style filter bar for any leaderboard table.
 *
 * Each group corresponds to a column on the table's <tr data-*> attributes
 * (e.g. data-method, data-model). Clicking a chip hides rows whose attribute
 * doesn't match, by toggling the .qg-chip-hidden class and dispatching
 * "qg-itable-reapply" on the nearest .qg-itable wrapper so InteractiveTable
 * re-syncs its row-visibility + shown-count.
 *
 * The optional `metric` group is special: it is never used as a row filter.
 * Instead it swaps .qg-cell-primary / .qg-cell-secondary visibility and the
 * matching .qg-col-label-* spans *inside the target table only* (so several
 * tables on one page stay independent), then re-keys each cell's
 * data-sort-value to the now-visible metric so sort follows what's on screen.
 */
interface ChipValue {
/** Compared against the row's data-{key} attribute; an empty string disables the group's filter. */
value: string;
/** Text shown on the chip button. */
label: string;
}
interface ChipGroup {
/** "method" | "model" | "retriever" | "metric"; matches <tr data-{key}> */
key: string;
/** Visible header text. */
label: string;
/** First item is rendered as the active default. For `metric`, use
 * [{value:"primary", label:"nDCG@10"}, {value:"secondary", label:"Recall"}];
 * any value other than "secondary" shows the primary metric. */
values: ChipValue[];
}
interface Props {
/** id of the table to filter (used to scope row queries to this table). */
tableId: string;
/** Chip groups to render, in display order. */
groups: ChipGroup[];
}
const { tableId, groups } = Astro.props;
---

<section class="mb-4 flex flex-wrap gap-x-6 gap-y-3 text-sm" data-qg-filters data-table={tableId}>
{groups.map((g) => (
<div class="flex flex-wrap items-center gap-2">
<span class="text-qg-fg-muted">{g.label}:</span>
<div data-group={g.key} class="flex flex-wrap gap-1.5">
{g.values.map((v, i) => (
<button
type="button"
data-value={v.value}
class:list={["qg-chip", i === 0 && "qg-chip-active"]}
>
{v.label}
</button>
))}
</div>
</div>
))}
</section>

<style>
.qg-chip {
@apply rounded-full border border-qg-border bg-qg-bg-soft px-3 py-1 text-xs font-medium text-qg-fg-muted hover:border-qg-accent;
}
.qg-chip-active {
@apply border-qg-accent bg-qg-accent text-white hover:border-qg-accent;
}
</style>

<script>
// Wire every chip bar on the page to the table named by its data-table attribute.
document.querySelectorAll<HTMLElement>("[data-qg-filters]").forEach((bar) => {
  // Wired flag: presumably guards against binding listeners twice if this
  // script executes more than once for the same bar.
  if (bar.dataset.qgWired === "1") return;
  bar.dataset.qgWired = "1";

  // Bail out quietly when the bar has nothing to control. Guarding here
  // (instead of asserting with `!`) lets the narrowed types flow into the
  // arrow functions below.
  const tableId = bar.dataset.table;
  if (!tableId) return;
  const table = document.getElementById(tableId);
  if (!table) return;
  const tbody = table.querySelector("tbody");
  if (!tbody) return;
  const itableRoot = table.closest<HTMLElement>(".qg-itable");

  const groups = Array.from(bar.querySelectorAll<HTMLElement>("[data-group]"));
  // The selector above guarantees the attribute exists; `?? ""` only satisfies the type.
  const groupKey = (group: HTMLElement): string => group.dataset.group ?? "";

  // Active value per group key, seeded from whichever chip is rendered active.
  const state: Record<string, string> = {};
  for (const group of groups) {
    const active = group.querySelector<HTMLButtonElement>(".qg-chip-active");
    state[groupKey(group)] = active?.dataset.value ?? "";
  }

  /** Hide every body row whose data-{key} differs from any active, non-empty filter. */
  const applyRowFilters = (): void => {
    // "metric" is a display mode, not a row filter; an empty value disables a group.
    const activeFilters = Object.entries(state).filter(
      ([key, val]) => key !== "metric" && val !== "",
    );
    tbody.querySelectorAll<HTMLTableRowElement>("tr").forEach((tr) => {
      const hide = activeFilters.some(([key, val]) => tr.dataset[key] !== val);
      tr.classList.toggle("qg-chip-hidden", hide);
    });
    // Ask the enclosing .qg-itable wrapper (if any) to re-sync after the change.
    itableRoot?.dispatchEvent(new CustomEvent("qg-itable-reapply"));
  };

  /** Toggle the `hidden` class on every match of `selector` inside this table. */
  const setHidden = (selector: string, hidden: boolean): void => {
    table.querySelectorAll(selector).forEach((el) => el.classList.toggle("hidden", hidden));
  };

  /** Show either the primary or the secondary metric and re-key sort values to match. */
  const applyMetricMode = (): void => {
    // Anything other than an explicit "secondary" (including no metric group) shows primary.
    const primaryShown = state.metric !== "secondary";
    // Scoped to *this* table only — if multiple tables coexist on a page,
    // each bar controls only its own.
    setHidden(".qg-col-label-primary", !primaryShown);
    setHidden(".qg-col-label-secondary", primaryShown);
    setHidden(".qg-cell-primary", !primaryShown);
    setHidden(".qg-cell-secondary", primaryShown);
    // Re-key sort value so a subsequent header click sorts by the now-visible metric.
    table.querySelectorAll<HTMLTableCellElement>("td[data-primary-value]").forEach((td) => {
      const visible = primaryShown ? td.dataset.primaryValue : td.dataset.secondaryValue;
      td.dataset.sortValue = visible ?? "";
    });
  };

  for (const group of groups) {
    const key = groupKey(group);
    group.querySelectorAll<HTMLButtonElement>("button").forEach((btn) => {
      btn.addEventListener("click", () => {
        // Single-select within a group: clear siblings, then mark the clicked chip.
        group.querySelectorAll("button").forEach((b) => b.classList.remove("qg-chip-active"));
        btn.classList.add("qg-chip-active");
        state[key] = btn.dataset.value ?? "";
        if (key === "metric") applyMetricMode();
        else applyRowFilters();
      });
    });
  }

  // Initial apply so any non-default starting chips take effect.
  applyMetricMode();
  applyRowFilters();
});
</script>
62 changes: 62 additions & 0 deletions reproducibility/site/src/components/MatrixCell.astro
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
---
/**
 * One cell in any of the leaderboard tables.
 *
 * A cell can carry two metric values (primary + optional secondary). Both are
 * rendered as sibling spans; the secondary span starts out with the `hidden`
 * class, and FilterChips' metric toggle flips visibility via the
 * .qg-cell-primary / .qg-cell-secondary classes. The <td> exposes
 * data-primary-value and data-secondary-value (data-sort-value starts as the
 * primary value) so the toggle can re-key sorting to whichever metric is
 * currently visible.
 *
 * Props:
 *  - primary / secondary: metric value plus a `best` flag that bolds the cell.
 *    null and undefined are both treated as "no value" for either metric.
 *  - runId: when set, the values link to /runs/{runId}.
 *  - digits: fixed decimals for display (default 3).
 *
 * NOTE(review): a cell that has only a secondary value renders nothing visible
 * until the secondary metric is toggled on — confirm that is intended.
 */
interface Cell {
  value: number;
  best: boolean;
}
interface Props {
  primary?: Cell;
  secondary?: Cell | null;
  runId?: string | null;
  digits?: number;
}
const { primary, secondary, runId, digits = 3 } = Astro.props;

const BEST_CLASSES = "font-bold text-qg-accent";

// Raw numbers for the data-* attributes; "" when the metric is absent.
const primaryValue = primary?.value ?? "";
const secondaryValue = secondary?.value ?? "";

// Display strings; null means "do not render this span". Using `!= null` for
// both metrics keeps a null primary from reaching `.best` / `.toFixed`.
const primaryText = primary != null ? primary.value.toFixed(digits) : null;
const secondaryText = secondary != null ? secondary.value.toFixed(digits) : null;
const empty = primaryText === null && secondaryText === null;

const primaryClasses = ["qg-cell-primary", primary?.best && BEST_CLASSES];
const secondaryClasses = ["qg-cell-secondary", "hidden", secondary?.best && BEST_CLASSES];
---

<td
  class="qg-mono px-3 py-2 text-right tabular-nums"
  data-sort-value={primaryValue}
  data-primary-value={primaryValue}
  data-secondary-value={secondaryValue}
>
  {empty ? (
    <span class="text-qg-fg-muted">—</span>
  ) : runId ? (
    <a class="hover:text-qg-accent hover:underline" href={`/runs/${runId}`} title="View run + reproduce">
      {primaryText !== null && <span class:list={primaryClasses}>{primaryText}</span>}
      {secondaryText !== null && <span class:list={secondaryClasses}>{secondaryText}</span>}
    </a>
  ) : (
    <>
      {primaryText !== null && <span class:list={primaryClasses}>{primaryText}</span>}
      {secondaryText !== null && <span class:list={secondaryClasses}>{secondaryText}</span>}
    </>
  )}
</td>
37 changes: 20 additions & 17 deletions reproducibility/site/src/pages/about.astro
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
---
import Default from "../layouts/Default.astro";
import CodeBlock from "../components/CodeBlock.astro";

// Shell walkthrough passed to the CodeBlock component on this page. Inside a
// template literal each `\\` produces one literal backslash, so the rendered
// snippet keeps its shell line continuations.
const submitCmd = `python examples/querygym_pyserini/pipeline.py \\
--dataset msmarco-v1-passage.trecdl2019 \\
--method query2doc --model gpt-4.1 \\
--output-dir outputs/dl19_query2doc

python -m reproducibility.scripts.submit_run --from-dir outputs/dl19_query2doc
make repro-aggregate
git add reproducibility/data/ && git commit -m "..." && git push
gh pr create`;
---

<Default title="About" description="What this leaderboard is and how to submit.">
Expand All @@ -8,16 +19,14 @@ import Default from "../layouts/Default.astro";
<section class="prose prose-slate mt-4 max-w-3xl text-qg-fg-muted">
<p>
The QueryGym Leaderboard tracks reproducible query-reformulation results
across IR benchmarks (BEIR, MS MARCO, TREC DL). Every row is backed by:
across IR benchmarks (BEIR, MS MARCO, TREC DL). Every row is backed by a
JSON file conforming to <code class="qg-mono">reproducibility/schema.json</code> v1.
Submissions may also include the reformulated-queries TSV and a
TREC-format <code class="qg-mono">.run.txt</code> for full re-evaluation; both are optional.
</p>
<ul class="ml-5 list-disc">
<li>a JSON file conforming to <code class="qg-mono">reproducibility/schema.json</code> v1,</li>
<li>a TREC-format <code class="qg-mono">.run.txt</code> for re-evaluation, and</li>
<li>the reformulated queries TSV used to produce the run file.</li>
</ul>
<p>
All three live in <a href="https://github.com/ls3-lab/QueryGym/tree/main/reproducibility/data/runs" target="_blank" rel="noopener noreferrer">the repository</a>
under <code class="qg-mono">reproducibility/data/runs/&#123;dataset&#125;/&#123;method&#125;/&#123;model&#125;/</code>.
All artifacts live in <a href="https://github.com/ls3-lab/QueryGym/tree/main/reproducibility/data/runs" target="_blank" rel="noopener noreferrer">the repository</a>
under <code class="qg-mono">reproducibility/data/runs/&#123;dataset&#125;/&#123;method&#125;/&#123;model&#125;/&#123;retriever&#125;/</code>.
Citing a number is as simple as linking the commit + the run JSON.
</p>
</section>
Expand All @@ -27,15 +36,9 @@ import Default from "../layouts/Default.astro";
<p class="text-qg-fg-muted">
Run the example pipeline, then use <code class="qg-mono">submit_run.py</code> and open a PR.
</p>
<pre class="mt-3 overflow-x-auto rounded bg-qg-bg-soft p-4 text-xs"><code>python examples/querygym_pyserini/pipeline.py \
--dataset msmarco-v1-passage.trecdl2019 \
--method query2e --model gpt-4.1-mini \
--output-dir outputs/dl19_query2e

python -m reproducibility.scripts.submit_run --from-dir outputs/dl19_query2e
make repro-aggregate
git add reproducibility/data/ &amp;&amp; git commit -m "..." &amp;&amp; git push
gh pr create</code></pre>
<div class="mt-3">
<CodeBlock filename="submit.sh" language="bash">{submitCmd}</CodeBlock>
</div>
<p class="mt-3 text-qg-fg-muted text-sm">
Full guide: <a class="text-qg-accent hover:underline" href="https://querygym.readthedocs.io/en/latest/user-guide/reproducibility/" target="_blank" rel="noopener noreferrer">Reproducibility User Guide ↗</a>
</p>
Expand Down
Loading
Loading