Skip to content

Commit 040b479

Browse files
caohy1988claude
and committed
fix: separate byte and character limits in BQ plugin GCS text offload
The GCS text offload decision mixed byte-based and character-based limits in a single min() comparison. inline_text_limit (32KB) is a byte-based storage guard, while max_content_length is a character- based truncation limit. Computing min(bytes, chars) produced wrong offload decisions for multi-byte text (CJK, emoji). The fix evaluates each limit in its own unit: - inline_text_limit: compared against UTF-8 byte length - max_content_length: compared against character count Text is offloaded if either limit is exceeded. Includes regression test for the specific #5561 case: 3K emoji chars (12K bytes) with max_length=10000 — under both real limits but falsely offloaded by the old mixed-unit min(). Fixes #5561 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7b20c4d commit 040b479

2 files changed

Lines changed: 135 additions & 8 deletions

File tree

src/google/adk/plugins/bigquery_agent_analytics_plugin.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1430,14 +1430,20 @@ async def _parse_content_object(
14301430

14311431
# CASE C: Text
14321432
elif hasattr(part, "text") and part.text:
1433-
text_len = len(part.text.encode("utf-8"))
1434-
# If max_length is set and smaller than inline limit, use it as threshold
1435-
# to prefer offloading over truncation.
1436-
offload_threshold = self.inline_text_limit
1437-
if self.max_length != -1 and self.max_length < offload_threshold:
1438-
offload_threshold = self.max_length
1439-
1440-
if self.offloader and text_len > offload_threshold:
1433+
char_len = len(part.text)
1434+
byte_len = len(part.text.encode("utf-8"))
1435+
1436+
# Decide whether to offload using each limit in its own
1437+
# unit. inline_text_limit is a byte-based storage guard;
1438+
# max_length is a character-based truncation limit.
1439+
# Comparing them in a single min() mixes units and
1440+
# produces wrong decisions for multi-byte text.
1441+
exceeds_inline_byte_limit = byte_len > self.inline_text_limit
1442+
exceeds_char_limit = (
1443+
self.max_length != -1 and char_len > self.max_length
1444+
)
1445+
1446+
if self.offloader and (exceeds_inline_byte_limit or exceeds_char_limit):
14411447
# Text is too big, treat as file
14421448
path = f"{datetime.now().date()}/{self.trace_id}/{self.span_id}_p{idx}.txt"
14431449
try:

tests/unittests/plugins/test_bigquery_agent_analytics_plugin.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7483,3 +7483,124 @@ async def test_reset_on_real_fork(
74837483
# Should have called _reset_runtime_state because
74847484
# _init_pid is a real PID different from os.getpid()
74857485
mock_reset.assert_called_once()
7486+
7487+
7488+
# ================================================================
# TEST CLASS: GCS offload unit mismatch fix (Issue #5561)
# ================================================================
class TestOffloadUnitSeparation:
  """Tests that byte-based inline limit and character-based truncation
  limit are evaluated independently for the GCS offload decision."""

  @pytest.mark.asyncio
  async def test_multibyte_text_offloaded_by_byte_limit(self):
    """Multi-byte text exceeding inline_text_limit bytes is offloaded."""
    offloader_stub = mock.AsyncMock()
    offloader_stub.upload_content.return_value = "gs://bucket/offloaded.txt"

    parser = bigquery_agent_analytics_plugin.HybridContentParser(
        offloader=offloader_stub,
        trace_id="t",
        span_id="s",
        max_length=-1,  # no character truncation
    )
    # 10K emoji chars → ~40KB UTF-8, exceeds inline_text_limit (32KB).
    payload = "\U0001f600" * 10000
    assert len(payload) == 10000  # characters
    assert len(payload.encode("utf-8")) > 32 * 1024  # bytes

    content = types.Content(parts=[types.Part(text=payload)])
    _, parsed_parts, _ = await parser._parse_content_object(content)

    offloader_stub.upload_content.assert_called_once()
    assert parsed_parts[0]["storage_mode"] == "GCS_REFERENCE"

  @pytest.mark.asyncio
  async def test_ascii_under_both_limits_stays_inline(self):
    """ASCII text under both byte and character limits stays inline."""
    offloader_stub = mock.AsyncMock()

    parser = bigquery_agent_analytics_plugin.HybridContentParser(
        offloader=offloader_stub,
        trace_id="t",
        span_id="s",
        max_length=50000,
    )
    # 1K ASCII chars = 1K bytes — comfortably under both limits.
    payload = "A" * 1000
    content = types.Content(parts=[types.Part(text=payload)])
    _, parsed_parts, _ = await parser._parse_content_object(content)

    offloader_stub.upload_content.assert_not_called()
    assert parsed_parts[0]["storage_mode"] == "INLINE"
    assert parsed_parts[0]["text"] == payload

  @pytest.mark.asyncio
  async def test_text_exceeding_char_limit_offloaded(self):
    """ASCII text exceeding max_length characters is offloaded."""
    offloader_stub = mock.AsyncMock()
    offloader_stub.upload_content.return_value = "gs://bucket/big.txt"

    parser = bigquery_agent_analytics_plugin.HybridContentParser(
        offloader=offloader_stub,
        trace_id="t",
        span_id="s",
        max_length=100,  # small char limit
    )
    # 200 ASCII chars — under the 32KB byte limit but over the char limit.
    payload = "X" * 200
    assert len(payload.encode("utf-8")) < 32 * 1024
    assert len(payload) > 100

    content = types.Content(parts=[types.Part(text=payload)])
    _, parsed_parts, _ = await parser._parse_content_object(content)

    offloader_stub.upload_content.assert_called_once()
    assert parsed_parts[0]["storage_mode"] == "GCS_REFERENCE"

  @pytest.mark.asyncio
  async def test_no_offloader_falls_back_to_truncate(self):
    """Without offloader, text exceeding char limit is truncated inline."""
    parser = bigquery_agent_analytics_plugin.HybridContentParser(
        offloader=None,
        trace_id="t",
        span_id="s",
        max_length=50,
    )
    payload = "Z" * 200
    content = types.Content(parts=[types.Part(text=payload)])
    _, parsed_parts, is_truncated = await parser._parse_content_object(content)

    assert is_truncated
    assert parsed_parts[0]["storage_mode"] == "INLINE"
    assert "TRUNCATED" in parsed_parts[0]["text"]

  @pytest.mark.asyncio
  async def test_multibyte_under_char_and_byte_limits_stays_inline(self):
    """Multi-byte text under both char limit and byte limit stays inline.

    This is the specific regression case from #5561: with the old
    mixed-unit min(), max_length=10000 became the offload_threshold,
    and byte_len (12K) > 10000 triggered a false offload even though
    char_len (3K) < max_length and byte_len (12K) < inline_text_limit
    (32KB).
    """
    offloader_stub = mock.AsyncMock()
    parser = bigquery_agent_analytics_plugin.HybridContentParser(
        offloader=offloader_stub,
        trace_id="t",
        span_id="s",
        max_length=10000,
    )

    # 3K emoji chars → ~12K bytes.
    payload = "\U0001f600" * 3000
    assert len(payload) < 10000  # under char limit
    assert len(payload.encode("utf-8")) > 10000  # bytes > max_length
    assert len(payload.encode("utf-8")) < 32 * 1024  # under byte limit

    content = types.Content(parts=[types.Part(text=payload)])
    _, parsed_parts, _ = await parser._parse_content_object(content)

    # Should NOT offload: under both real limits.
    offloader_stub.upload_content.assert_not_called()
    assert parsed_parts[0]["storage_mode"] == "INLINE"

0 commit comments

Comments
 (0)