diff --git a/testing/e2e/global-setup.ts b/testing/e2e/global-setup.ts index 42011f43f..c3c7dd808 100644 --- a/testing/e2e/global-setup.ts +++ b/testing/e2e/global-setup.ts @@ -43,6 +43,18 @@ export default async function globalSetup() { mock.mount('/v1/text-to-speech', elevenLabsTTSMount()) mock.mount('/v1/speech-to-text', elevenLabsSTTMount()) + // Gemini TTS hits the standard Gemini generateContent endpoint + // (POST /v1beta/models/{model}:generateContent) with + // responseModalities: ['AUDIO']. aimock's native Gemini audio helper derives + // the mime type from the fixture's `format`/`contentType`, so it can't emit + // the raw `audio/L16;codec=pcm;rate=24000` PCM that real Gemini TTS returns. + // Mount the TTS model's generateContent path directly so we can hand back + // PCM and exercise the adapter's PCM→WAV normalization. The path is specific + // to the TTS model, so it doesn't intercept Gemini chat/summarize requests. + mock.mount( + '/v1beta/models/gemini-3.1-flash-tts-preview:generateContent', + geminiTTSMount(), + ) // Gemini Veo video generation. aimock 1.29 mocks Gemini's `:predict` // (Imagen) endpoint but not the long-running `:predictLongRunning` + // operations-polling pair Veo uses, so mount both here. Non-Veo paths @@ -85,7 +97,7 @@ export default async function globalSetup() { await mock.start() console.log(`[aimock] started on port 4010`) - ;(globalThis as any).__aimock = mock + ; (globalThis as any).__aimock = mock } function registerMediaFixtures(mock: LLMock) { @@ -137,6 +149,14 @@ const FAKE_MP3_BYTES = Buffer.from([ 0xff, 0xfb, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ]) +/** + * Raw 16-bit little-endian PCM bytes. Gemini TTS returns audio as + * `audio/L16;codec=pcm;rate=24000` inlineData, which the adapter wraps in a + * RIFF/WAV header before handing it to the browser. The samples are arbitrary + * silence — the spec only asserts the `