Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions cli/eval/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,20 @@ def execute_test_eval(
return test_result


def _extract_first_json(text: str) -> dict | None:
"""Return the first JSON object containing a ``"score"`` key, or ``None``."""
decoder = json.JSONDecoder()
for i, ch in enumerate(text):
if ch == "{":
try:
obj, _ = decoder.raw_decode(text, i)
if "score" in obj:
return obj
except json.JSONDecodeError:
continue
return None


def parse_judge_output(judge_output: str) -> tuple[int | None, str]:
"""Parse score and justification from a judge model's output string.

Expand All @@ -377,16 +391,13 @@ def parse_judge_output(judge_output: str) -> tuple[int | None, str]:
``None`` if parsing failed) and ``justification`` is an explanatory
string.
"""
try:
json_match = re.search(r'\{[^}]*"score"[^}]*\}', judge_output, re.DOTALL)
if json_match:
json_str = json_match.group(0)
data = json.loads(json_str)
score = data.get("score")
justification = data.get("justification")
return score, justification
except (json.JSONDecodeError, AttributeError):
pass
data = _extract_first_json(judge_output)
if data is not None:
score = data.get("score")
justification = data.get("justification")
return score, (
justification if isinstance(justification, str) else judge_output
)

# if the above fails, search the text for the score
score_match = re.search(r'score["\s:]+(\d+)', judge_output, re.IGNORECASE)
Expand Down
60 changes: 58 additions & 2 deletions test/cli/test_eval_unit.py
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May be worth adding some additional test cases:

  • JSON with score but no justification key
  • "justification": null (explicit null)
  • Multiple JSON objects where the first lacks score
  • Direct unit tests for _extract_first_json

Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
"""Unit tests for eval runner pure-logic helpers — no backend, no model required.

Covers InputEvalResult, TestEvalResult, parse_judge_output.
Covers InputEvalResult, TestEvalResult, parse_judge_output, _extract_first_json.
"""

import pytest

from cli.eval.runner import InputEvalResult, TestEvalResult, parse_judge_output
from cli.eval.runner import (
InputEvalResult,
TestEvalResult,
_extract_first_json,
parse_judge_output,
)
from mellea.stdlib.components.unit_test_eval import TestBasedEval

# --- InputEvalResult ---
Expand Down Expand Up @@ -142,5 +147,56 @@ def test_parse_zero_score():
assert reason == "Failed"


def test_parse_nested_json_preserves_justification():
    """Score and justification are read from an object that also holds a nested object."""
    judge_text = '{"score": 1, "justification": "Correct", "reasoning": {"detail": "step-by-step"}}'
    parsed_score, parsed_reason = parse_judge_output(judge_text)
    assert (parsed_score, parsed_reason) == (1, "Correct")


def test_parse_json_score_no_justification_key():
    """With no ``justification`` key, the raw output is returned as the reason."""
    judge_text = '{"score": 1}'
    parsed_score, parsed_reason = parse_judge_output(judge_text)
    assert (parsed_score, parsed_reason) == (1, judge_text)


def test_parse_json_justification_null():
    """An explicit ``null`` justification yields the raw output as the reason."""
    judge_text = '{"score": 0, "justification": null}'
    parsed_score, parsed_reason = parse_judge_output(judge_text)
    assert (parsed_score, parsed_reason) == (0, judge_text)


def test_parse_second_json_when_first_lacks_score():
    """A leading object without ``score`` is passed over for the next one."""
    judge_text = '{"context": "intro"} {"score": 1, "justification": "Looks good"}'
    parsed_score, parsed_reason = parse_judge_output(judge_text)
    assert (parsed_score, parsed_reason) == (1, "Looks good")


# --- _extract_first_json ---


def test_extract_first_json_finds_score_object():
    """A lone object carrying ``score`` is returned as the decoded dict."""
    expected = {"score": 1, "justification": "ok"}
    found = _extract_first_json('{"score": 1, "justification": "ok"}')
    assert found == expected


def test_extract_first_json_skips_object_without_score():
    """The first object lacks ``score``, so the second one is returned."""
    found = _extract_first_json('{"foo": "bar"} {"score": 0}')
    assert found == {"score": 0}


def test_extract_first_json_no_json_returns_none():
    """Text without any JSON object yields ``None``."""
    found = _extract_first_json("plain text, no JSON here")
    assert found is None


def test_extract_first_json_no_score_key_returns_none():
    """A valid object that has no ``score`` key yields ``None``."""
    found = _extract_first_json('{"justification": "no score anywhere"}')
    assert found is None


# Allow running this test module directly as a script; "-v" asks pytest
# for verbose output.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
Loading