@@ -7483,3 +7483,124 @@ async def test_reset_on_real_fork(
74837483 # Should have called _reset_runtime_state because
74847484 # _init_pid is a real PID different from os.getpid()
74857485 mock_reset .assert_called_once ()
7486+
7487+
# ================================================================
# TEST CLASS: GCS offload unit mismatch fix (Issue #5561)
# ================================================================
class TestOffloadUnitSeparation:
  """Tests that byte-based inline limit and character-based truncation
  limit are evaluated independently for the GCS offload decision.

  Regression tests for Issue #5561: the byte-denominated
  ``inline_text_limit`` (32KB) and the character-denominated
  ``max_length`` must never be compared against each other's units.
  """

  @pytest.mark.asyncio
  async def test_multibyte_text_offloaded_by_byte_limit(self):
    """Multi-byte text exceeding inline_text_limit bytes is offloaded."""
    mock_offloader = mock.AsyncMock()
    mock_offloader.upload_content.return_value = "gs://bucket/offloaded.txt"

    parser = bigquery_agent_analytics_plugin.HybridContentParser(
        offloader=mock_offloader,
        trace_id="t",
        span_id="s",
        max_length=-1,  # no character truncation
    )
    # 10K emoji chars -> ~40KB UTF-8 (4 bytes each), exceeds
    # inline_text_limit (32KB) even though the char count is small.
    text = "\U0001f600" * 10000
    assert len(text) == 10000  # characters
    assert len(text.encode("utf-8")) > 32 * 1024  # bytes

    content = types.Content(parts=[types.Part(text=text)])
    _, parts, _ = await parser._parse_content_object(content)

    mock_offloader.upload_content.assert_called_once()
    assert parts[0]["storage_mode"] == "GCS_REFERENCE"

  @pytest.mark.asyncio
  async def test_ascii_under_both_limits_stays_inline(self):
    """ASCII text under both byte and character limits stays inline."""
    mock_offloader = mock.AsyncMock()

    parser = bigquery_agent_analytics_plugin.HybridContentParser(
        offloader=mock_offloader,
        trace_id="t",
        span_id="s",
        max_length=50000,
    )
    text = "A" * 1000  # 1K chars = 1K bytes, under both limits
    content = types.Content(parts=[types.Part(text=text)])
    _, parts, _ = await parser._parse_content_object(content)

    mock_offloader.upload_content.assert_not_called()
    assert parts[0]["storage_mode"] == "INLINE"
    assert parts[0]["text"] == text

  @pytest.mark.asyncio
  async def test_text_exceeding_char_limit_offloaded(self):
    """ASCII text exceeding max_length characters is offloaded."""
    mock_offloader = mock.AsyncMock()
    mock_offloader.upload_content.return_value = "gs://bucket/big.txt"

    parser = bigquery_agent_analytics_plugin.HybridContentParser(
        offloader=mock_offloader,
        trace_id="t",
        span_id="s",
        max_length=100,  # small char limit
    )
    # 200 ASCII chars -- under byte limit (32KB) but over char limit
    text = "X" * 200
    assert len(text.encode("utf-8")) < 32 * 1024
    assert len(text) > 100

    content = types.Content(parts=[types.Part(text=text)])
    _, parts, _ = await parser._parse_content_object(content)

    mock_offloader.upload_content.assert_called_once()
    assert parts[0]["storage_mode"] == "GCS_REFERENCE"

  @pytest.mark.asyncio
  async def test_no_offloader_falls_back_to_truncate(self):
    """Without offloader, text exceeding char limit is truncated inline."""
    parser = bigquery_agent_analytics_plugin.HybridContentParser(
        offloader=None,
        trace_id="t",
        span_id="s",
        max_length=50,
    )
    text = "Z" * 200
    content = types.Content(parts=[types.Part(text=text)])
    _, parts, is_truncated = await parser._parse_content_object(content)

    assert is_truncated
    assert parts[0]["storage_mode"] == "INLINE"
    assert "TRUNCATED" in parts[0]["text"]

  @pytest.mark.asyncio
  async def test_multibyte_under_char_and_byte_limits_stays_inline(self):
    """Multi-byte text under both char limit and byte limit stays inline.

    This is the specific regression case from #5561: with the old
    mixed-unit min(), max_length=10000 became the offload_threshold,
    and byte_len (12K) > 10000 triggered a false offload even though
    char_len (3K) < max_length and byte_len (12K) < inline_text_limit
    (32KB).
    """
    mock_offloader = mock.AsyncMock()
    parser = bigquery_agent_analytics_plugin.HybridContentParser(
        offloader=mock_offloader,
        trace_id="t",
        span_id="s",
        max_length=10000,
    )

    # 3K emoji chars -> ~12K bytes (4 bytes per emoji)
    text = "\U0001f600" * 3000
    assert len(text) < 10000  # under char limit
    assert len(text.encode("utf-8")) > 10000  # bytes > max_length
    assert len(text.encode("utf-8")) < 32 * 1024  # under byte limit

    content = types.Content(parts=[types.Part(text=text)])
    _, parts, _ = await parser._parse_content_object(content)

    # Should NOT offload: under both real limits
    mock_offloader.upload_content.assert_not_called()
    assert parts[0]["storage_mode"] == "INLINE"
0 commit comments