diff --git a/cli/eval/runner.py b/cli/eval/runner.py index a456df0d6..58c1ea4df 100644 --- a/cli/eval/runner.py +++ b/cli/eval/runner.py @@ -366,6 +366,20 @@ def execute_test_eval( return test_result +def _extract_first_json(text: str) -> dict | None: + """Return the first JSON object containing a ``"score"`` key, or ``None``.""" + decoder = json.JSONDecoder() + for i, ch in enumerate(text): + if ch == "{": + try: + obj, _ = decoder.raw_decode(text, i) + if "score" in obj: + return obj + except json.JSONDecodeError: + continue + return None + + def parse_judge_output(judge_output: str) -> tuple[int | None, str]: """Parse score and justification from a judge model's output string. @@ -377,16 +391,13 @@ def parse_judge_output(judge_output: str) -> tuple[int | None, str]: ``None`` if parsing failed) and ``justification`` is an explanatory string. """ - try: - json_match = re.search(r'\{[^}]*"score"[^}]*\}', judge_output, re.DOTALL) - if json_match: - json_str = json_match.group(0) - data = json.loads(json_str) - score = data.get("score") - justification = data.get("justification") - return score, justification - except (json.JSONDecodeError, AttributeError): - pass + data = _extract_first_json(judge_output) + if data is not None: + score = data.get("score") + justification = data.get("justification") + return score, ( + justification if isinstance(justification, str) else judge_output + ) # if the above fails, search the text for the score score_match = re.search(r'score["\s:]+(\d+)', judge_output, re.IGNORECASE) diff --git a/test/cli/test_eval_unit.py b/test/cli/test_eval_unit.py index ad9d4c452..b7fce2aba 100644 --- a/test/cli/test_eval_unit.py +++ b/test/cli/test_eval_unit.py @@ -1,11 +1,16 @@ """Unit tests for eval runner pure-logic helpers — no backend, no model required. -Covers InputEvalResult, TestEvalResult, parse_judge_output. +Covers InputEvalResult, TestEvalResult, parse_judge_output, _extract_first_json. 
def test_parse_nested_json_preserves_justification():
    """A nested ``reasoning`` object must not hide the top-level justification."""
    raw = '{"score": 1, "justification": "Correct", "reasoning": {"detail": "step-by-step"}}'
    parsed_score, parsed_reason = parse_judge_output(raw)
    assert (parsed_score, parsed_reason) == (1, "Correct")


def test_parse_json_score_no_justification_key():
    """Without a ``justification`` key the raw output is used as the reason."""
    raw = '{"score": 1}'
    parsed_score, parsed_reason = parse_judge_output(raw)
    assert (parsed_score, parsed_reason) == (1, raw)


def test_parse_json_justification_null():
    """A ``null`` justification falls back to the raw output."""
    raw = '{"score": 0, "justification": null}'
    parsed_score, parsed_reason = parse_judge_output(raw)
    assert (parsed_score, parsed_reason) == (0, raw)


def test_parse_second_json_when_first_lacks_score():
    """An object without ``score`` is skipped in favour of a later one that has it."""
    raw = '{"context": "intro"} {"score": 1, "justification": "Looks good"}'
    parsed_score, parsed_reason = parse_judge_output(raw)
    assert (parsed_score, parsed_reason) == (1, "Looks good")


# --- _extract_first_json ---


def test_extract_first_json_finds_score_object():
    """A lone object carrying ``score`` is returned as decoded."""
    expected = {"score": 1, "justification": "ok"}
    found = _extract_first_json('{"score": 1, "justification": "ok"}')
    assert found == expected


def test_extract_first_json_skips_object_without_score():
    """The first object lacks ``score``, so the second one is returned."""
    found = _extract_first_json('{"foo": "bar"} {"score": 0}')
    assert found == {"score": 0}


def test_extract_first_json_no_json_returns_none():
    """Text with no JSON object at all yields ``None``."""
    found = _extract_first_json("plain text, no JSON here")
    assert found is None


def test_extract_first_json_no_score_key_returns_none():
    """A JSON object with no ``score`` key yields ``None``."""
    found = _extract_first_json('{"justification": "no score anywhere"}')
    assert found is None