generative-computing · sjoerdvink99 · Apr 16, 2026 · psschwei · Apr 16, 2026
@@ -366,6 +366,20 @@ def execute_test_eval(
     return test_result
 
 
+def _extract_first_json(text: str) -> dict | None:
+    """Return the first JSON object in ``text`` with a ``"score"`` key, or ``None``."""
+    decoder = json.JSONDecoder()
+    brace_offsets = (pos for pos, char in enumerate(text) if char == "{")
+    for offset in brace_offsets:
+        try:
+            candidate, _end = decoder.raw_decode(text, offset)
+        except json.JSONDecodeError:
+            continue  # nothing decodable starts at this brace; try the next one
+        if "score" in candidate:
+            return candidate
+    return None
+
+
 def parse_judge_output(judge_output: str) -> tuple[int | None, str]:
     """Parse score and justification from a judge model's output string.
 
@@ -377,16 +391,13 @@ def parse_judge_output(judge_output: str) -> tuple[int | None, str]:
         ``None`` if parsing failed) and ``justification`` is an explanatory
         string.
     """
-    try:
-        json_match = re.search(r'\{[^}]*"score"[^}]*\}', judge_output, re.DOTALL)
-        if json_match:
-            json_str = json_match.group(0)
-            data = json.loads(json_str)
-            score = data.get("score")
-            justification = data.get("justification")
-            return score, justification
-    except (json.JSONDecodeError, AttributeError):
-        pass
+    data = _extract_first_json(judge_output)
+    if data is not None:
+        score = data.get("score")
+        justification = data.get("justification")
+        return score, (
+            justification if isinstance(justification, str) else judge_output
+        )
 
     # if the above fails, search the text for the score
     score_match = re.search(r'score["\s:]+(\d+)', judge_output, re.IGNORECASE)

@@ -1,11 +1,16 @@
 """Unit tests for eval runner pure-logic helpers — no backend, no model required.
 
-Covers InputEvalResult, TestEvalResult, parse_judge_output.
+Covers InputEvalResult, TestEvalResult, parse_judge_output, _extract_first_json.
 """
 
 import pytest
 
-from cli.eval.runner import InputEvalResult, TestEvalResult, parse_judge_output
+from cli.eval.runner import (
+    InputEvalResult,
+    TestEvalResult,
+    _extract_first_json,
+    parse_judge_output,
+)
 from mellea.stdlib.components.unit_test_eval import TestBasedEval
 
 # --- InputEvalResult ---
@@ -142,5 +147,56 @@ def test_parse_zero_score():
     assert reason == "Failed"
 
 
+def test_parse_nested_json_preserves_justification():
+    # The nested "reasoning" object must not prevent reading "justification".
+    raw = '{"score": 1, "justification": "Correct", "reasoning": {"detail": "step-by-step"}}'
+    got_score, got_reason = parse_judge_output(raw)
+    assert (got_score, got_reason) == (1, "Correct")
+
+
+def test_parse_json_score_no_justification_key():
+    # No "justification" key: the raw judge output is used as the reason.
+    raw = '{"score": 1}'
+    got_score, got_reason = parse_judge_output(raw)
+    assert (got_score, got_reason) == (1, raw)
+
+
+def test_parse_json_justification_null():
+    # A null justification is not a string, so the raw output is returned instead.
+    raw = '{"score": 0, "justification": null}'
+    got_score, got_reason = parse_judge_output(raw)
+    assert (got_score, got_reason) == (0, raw)
+
+
+def test_parse_second_json_when_first_lacks_score():
+    # The leading object has no "score", so the second object supplies the verdict.
+    raw = '{"context": "intro"} {"score": 1, "justification": "Looks good"}'
+    got_score, got_reason = parse_judge_output(raw)
+    assert (got_score, got_reason) == (1, "Looks good")
+
+
+# --- _extract_first_json ---
+
+
+def test_extract_first_json_finds_score_object():
+    # A lone object that carries "score" should come back fully decoded.
+    raw = '{"score": 1, "justification": "ok"}'
+    expected = {"score": 1, "justification": "ok"}
+    assert _extract_first_json(raw) == expected
+
+
+def test_extract_first_json_skips_object_without_score():
+    found = _extract_first_json('{"foo": "bar"} {"score": 0}')
+    assert found == {"score": 0}  # the leading score-less object is passed over
+
+
+def test_extract_first_json_no_json_returns_none():  # input contains no "{" at all
+    assert _extract_first_json("plain text, no JSON here") is None
+
+
+def test_extract_first_json_no_score_key_returns_none():  # valid JSON, no "score"
+    assert _extract_first_json('{"justification": "no score anywhere"}') is None
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])