bernardladenthin · vaiju1981 · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026
@@ -26,6 +26,8 @@ OPTIONAL_MODELS=(
   "models/nomic-embed-text-v1.5.f16.gguf"
   "models/SmolVLM-500M-Instruct-Q8_0.gguf"
   "models/mmproj-SmolVLM-500M-Instruct-Q8_0.gguf"
+  "models/OuteTTS-0.2-500M-Q4_K_M.gguf"
+  "models/WavTokenizer-Large-75-F16.gguf"
 )
 
 validate_gguf() {

@@ -41,6 +41,11 @@ env:
   VISION_MODEL_NAME: "SmolVLM-500M-Instruct-Q8_0.gguf"
   VISION_MMPROJ_URL: "https://huggingface.co/ggml-org/SmolVLM-500M-Instruct-GGUF/resolve/main/mmproj-SmolVLM-500M-Instruct-Q8_0.gguf"
   VISION_MMPROJ_NAME: "mmproj-SmolVLM-500M-Instruct-Q8_0.gguf"
+  # Text-to-speech models for AudioInputIntegrationTest's sibling TtsIntegrationTest (OuteTTS pipeline).
+  TTS_MODEL_URL: "https://huggingface.co/second-state/OuteTTS-0.2-500M-GGUF/resolve/main/OuteTTS-0.2-500M-Q4_K_M.gguf"
+  TTS_MODEL_NAME: "OuteTTS-0.2-500M-Q4_K_M.gguf"
+  TTS_VOCODER_URL: "https://huggingface.co/ggml-org/WavTokenizer/resolve/main/WavTokenizer-Large-75-F16.gguf"
+  TTS_VOCODER_NAME: "WavTokenizer-Large-75-F16.gguf"
   # Test image used by MultimodalIntegrationTest is committed to the repo
   # at src/test/resources/images/test-image.jpg (see the README in that
   # directory for licensing). No download step is needed; CI just points
@@ -797,14 +802,20 @@ jobs:
         run: |
           ulimit -c unlimited
           echo "${{ github.workspace }}/core.%e.%p" | sudo tee /proc/sys/kernel/core_pattern
+      - name: Download TTS model (OuteTTS)
+        run: test -f models/${TTS_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_MODEL_URL} --create-dirs -o models/${TTS_MODEL_NAME}
+      - name: Download TTS vocoder (WavTokenizer)
+        run: test -f models/${TTS_VOCODER_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_VOCODER_URL} --create-dirs -o models/${TTS_VOCODER_NAME}
       - name: Run tests
         run: |
           mvn -e --no-transfer-progress -P jcstress test \
             -Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \
             -Dnet.ladenthin.llama.nomic.path=models/${NOMIC_EMBED_MODEL_NAME} \
             -Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \
             -Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \
-            -Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH}
+            -Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH} \
+            -Dnet.ladenthin.llama.tts.ttc.model=models/${TTS_MODEL_NAME} \
+            -Dnet.ladenthin.llama.tts.vocoder.model=models/${TTS_VOCODER_NAME}
       - uses: actions/upload-artifact@v7
         if: success()
         with:

@@ -384,6 +384,38 @@ Current patches:
 |-------|-------|
 | `0001-win32-arg-parse-embed-guard.patch` | Windows JNI regression from llama.cpp **#24779** (b9739): `common_params_parse` unconditionally replaced the caller's argv with the process command line (`GetCommandLineW`), so an embedded/JNI caller (`java.exe`) lost its `--model …` args → "Failed to parse model parameters". The patch **drops the override for our build** (keeps the `make_utf8_argv()` call referenced so there's no `-Wunused-function`, but never adopts its result), so the caller's already-UTF-8 argv is always used. This is **deterministic** — an earlier count-guard variant (only override when the re-derived arg count equals `argc`) collided on the server-integration tests whose argv length happened to equal `java.exe`'s and kept them failing. The upstream PR can instead expose an opt-out / `common_params_parse_argv` that preserves the standalone tools' UTF-8 fix. |
 
+## OuteTTS build-time extraction (`cmake/generate-tts-upstream.cmake`)
+
+The `TextToSpeech` native pipeline reuses llama.cpp's OuteTTS helpers (`tools/tts/tts.cpp`)
+**without hand-copying them**. A verbatim copy would be a DRY/maintenance hazard that silently
+diverges on every upgrade, and `tts.cpp` cannot simply be added to `target_sources` — it defines its
+own `main()`, which would clash at link time (the same reason `tools/server/server.cpp` is excluded
+while `server-*.cpp` are compiled in), and all its helpers are `static` (internal linkage), so they
+are unreachable from another TU even if it were linked.
+
+Instead the helpers are **DERIVED mechanically at configure time** from the pinned upstream source:
+
+- **`cmake/generate-tts-upstream.cmake`** — reads `${llama.cpp_SOURCE_DIR}/tools/tts/tts.cpp`, keeps
+  the pre-`main()` span (the DSP `fill_hann_window`/`irfft`/`fold`/`embd_to_audio`, the prompt/text
+  helpers incl. `process_text`'s number-to-words, the `outetts_version` enum), strips `static` from
+  the handful the JNI engine calls (giving them external linkage), and extracts the two hard-coded
+  default-speaker literals out of `main()` into `extern const` strings. Writes
+  `build/tts_generated/tts_upstream_gen.cpp`.
+- **`CMakeLists.txt`** — runs the generator via `execute_process` right after
+  `FetchContent_MakeAvailable(llama.cpp)`, then compiles the generated TU into `jllama`. The file is
+  **never committed** (build artifact, like the native libs / WebUI assets); it is regenerated from
+  whatever `tts.cpp` the pinned `GIT_TAG` resolves to, so a version bump is picked up automatically.
+- **`src/main/cpp/tts_upstream.h`** — committed, hand-written declarations of the extracted symbols
+  (interface facts, not the implementation). `tts_engine.cpp` includes it and links against the
+  generated definitions. The in-memory WAV writer (`tts_wav.hpp`) is ours, not extracted.
+
+**Fail-loud on drift (same contract as `patches/`):** the generator asserts every anchor — the
+`int main(` split point, each `static <signature>` it de-statics, and both speaker literals. If an
+upgrade renames a helper or moves a literal, the **configure step aborts** with a pointer to the
+generator; if upstream changes a *type*, `tts_upstream.h` stops matching and the **link fails**.
+Either way a silent divergence is impossible. On a llama.cpp bump, re-verify the generator the same
+way you re-verify `patches/`.
+
 ## Upgrading/Downgrading llama.cpp Version
 
 To change the llama.cpp version, update the following **three** files (and re-verify `patches/`):
@@ -588,6 +620,8 @@ the README. The summary below covers only the optional-model bindings:
 | `net.ladenthin.llama.audio.model` | `AudioInputIntegrationTest` (llama.cpp discussion #13759) | audio-input model GGUF, e.g. `ultravox-v0_5-llama-3_2-1b.gguf` |
 | `net.ladenthin.llama.audio.mmproj` | `AudioInputIntegrationTest` | matching audio mmproj/encoder, e.g. `mmproj-ultravox-v0_5-llama-3_2-1b-f16.gguf` |
 | `net.ladenthin.llama.audio.input` | `AudioInputIntegrationTest` | a `.wav`/`.mp3` clip on disk (no committed default — audio is not committed) |
+| `net.ladenthin.llama.tts.ttc.model` | `TtsIntegrationTest` | OuteTTS text-to-codes model, e.g. `OuteTTS-0.2-500M-Q4_K_M.gguf` |
+| `net.ladenthin.llama.tts.vocoder.model` | `TtsIntegrationTest` | matching codes-to-speech vocoder, e.g. `WavTokenizer-Large-75-F16.gguf` |
 
 Run those tests by setting the property:
 ```bash
@@ -605,6 +639,9 @@ mvn test -Dtest=AudioInputIntegrationTest \
          -Dnet.ladenthin.llama.audio.model=models/ultravox-v0_5-llama-3_2-1b.gguf \
          -Dnet.ladenthin.llama.audio.mmproj=models/mmproj-ultravox-v0_5-llama-3_2-1b-f16.gguf \
          -Dnet.ladenthin.llama.audio.input=/path/to/speech.wav
+mvn test -Dtest=TtsIntegrationTest \
+         -Dnet.ladenthin.llama.tts.ttc.model=models/OuteTTS-0.2-500M-Q4_K_M.gguf \
+         -Dnet.ladenthin.llama.tts.vocoder.model=models/WavTokenizer-Large-75-F16.gguf
 ```
 
 `MultimodalIntegrationTest` self-skips when any of the three vision properties
@@ -739,6 +776,7 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in
 
 **Java layer** (`src/main/java/net/ladenthin/llama/`):
 - `LlamaModel` — Main API class (AutoCloseable). Wraps native context for inference, embeddings, re-ranking, and tokenization.
+- `TextToSpeech` — Separate AutoCloseable native type for speech synthesis over the two-model OuteTTS (text-to-codes) + WavTokenizer (codes-to-speech vocoder) pipeline; `synthesize(text)` returns a 24 kHz mono 16-bit WAV byte stream. Native orchestration in `tts_engine.{h,cpp}`; the OuteTTS DSP / prompt / text helpers + default speaker are **derived at build time from upstream `tts.cpp`** (see "OuteTTS build-time extraction" below), not hand-copied; the in-memory WAV writer is `tts_wav.hpp`.
 - `ModelParameters` / `InferenceParameters` — Builder-pattern parameter classes that serialize to JSON (extend `JsonParameters`) for passing to native code.
 - `LlamaIterator` / `LlamaIterable` — Streaming generation via Java `Iterator`/`Iterable`.
 - `LlamaLoader` — Extracts the platform-specific native library from the JAR to a temp directory, or finds it on `java.library.path`.
@@ -750,7 +788,7 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in
   - The `server` package is a dedicated top layer in the ArchUnit `layeredArchitecture` rule (the only layer allowed to access the root `Api`); `noInternalJdkImports` carries an explicit exception for the supported `com.sun.net.httpserver` (the exported `jdk.httpserver` module, which `module-info.java` `requires`). See README "OpenAI-compatible HTTP server".
 
 **Native layer** (`src/main/cpp/`):
-- `jllama.cpp` — JNI implementation bridging Java calls to llama.cpp. ~1,215 lines; 17 native methods.
+- `jllama.cpp` — JNI implementation bridging Java calls to llama.cpp. ~1,516 lines; 30 native methods (27 `LlamaModel` + 3 `TextToSpeech`).
 - `utils.hpp` — Helper utilities (format helpers, argv stripping, token-piece serialisation).
 - `json_helpers.hpp` — Pure JSON transformation helpers (no JNI, no llama state). Independently unit-testable.
 - `jni_helpers.hpp` — JNI bridge helpers (handle management + server orchestration). Includes `json_helpers.hpp`.
@@ -905,12 +943,13 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson"
 | File | Tests | Scope |
 |------|-------|-------|
 | `src/test/cpp/test_utils.cpp` | 156 | Upstream helpers: `server_tokens`, `server_grammar_trigger`, `gen_tool_call_id`, `json_value`, `json_get_nested_values`, UTF-8 helpers, `format_response_rerank`, `format_embeddings_response_oaicompat`, `oaicompat_completion_params_parse`, `oaicompat_chat_params_parse`, `are_lora_equal`, `strip_flag_from_argv`, `token_piece_value`, `json_is_array_and_contains_numbers`, `format_oai_sse`, `format_oai_resp_sse`, `format_anthropic_sse` |
-| `src/test/cpp/test_server.cpp` | 188 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_schema::eval_llama_cmpl_schema()` (parsing pipeline + grammar routing + error paths), `response_fields` projection |
+| `src/test/cpp/test_server.cpp` | 189 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_schema::eval_llama_cmpl_schema()` (parsing pipeline + grammar routing + error paths), `response_fields` projection |
 | `src/test/cpp/test_json_helpers.cpp` | 47 | All functions in `json_helpers.hpp`: `get_result_error_message`, `results_to_json`, `rerank_results_to_json`, `parse_encoding_format`, `extract_embedding_prompt`, `is_infill_request`, `parse_slot_prompt_similarity`, `parse_positive_int_config`, `wrap_stream_chunk` |
 | `src/test/cpp/test_log_helpers.cpp` | 13 | All functions in `log_helpers.hpp`: `log_level_name`, `format_log_as_json` |
-| `src/test/cpp/test_jni_helpers.cpp` | 41 | All functions in `jni_helpers.hpp` using a zero-filled `JNINativeInterface_` mock |
+| `src/test/cpp/test_jni_helpers.cpp` | 47 | All functions in `jni_helpers.hpp` using a zero-filled `JNINativeInterface_` mock |
+| `src/test/cpp/test_tts_wav.cpp` | 2 | The in-memory WAV writer `pcm_to_wav16_bytes` in `tts_wav.hpp` (WAV header/payload + little-endian clamping). The OuteTTS DSP it pairs with is derived from upstream `tts.cpp` and covered end-to-end by the Java `TtsIntegrationTest`, not unit-tested here. |
 
-**Current total: 445 tests (all passing).**
+**Current total: 454 tests (all passing).**
 
 #### Upstream source location (in CMake build tree)
 

@@ -151,6 +151,29 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(llama.cpp)
 
+# OuteTTS native pipeline: DERIVE the upstream tts.cpp helpers (DSP + prompt + text + the default
+# speaker profile) into a compilable translation unit at configure time, rather than hand-copying
+# them — a hand copy is a DRY/maintenance hazard that silently diverges on every llama.cpp upgrade.
+# tts.cpp cannot simply be added to target_sources because it defines its own main(); the generator
+# drops main() and gives the helpers external linkage. See cmake/generate-tts-upstream.cmake. The
+# generated file is never committed; it is regenerated from whatever tts.cpp the pinned GIT_TAG
+# resolves to, so a version bump is picked up automatically. The tag below is cosmetic provenance in
+# the generated banner — keep it in sync with the llama.cpp GIT_TAG above.
+set(JLLAMA_TTS_GEN_DIR ${CMAKE_BINARY_DIR}/tts_generated)
+set(JLLAMA_TTS_GEN_CPP ${JLLAMA_TTS_GEN_DIR}/tts_upstream_gen.cpp)
+file(MAKE_DIRECTORY ${JLLAMA_TTS_GEN_DIR})
+execute_process(
+    COMMAND ${CMAKE_COMMAND}
+        -DTTS_SRC=${llama.cpp_SOURCE_DIR}/tools/tts/tts.cpp
+        -DOUT_CPP=${JLLAMA_TTS_GEN_CPP}
+        -DLLAMA_TAG=b9739
+        -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/generate-tts-upstream.cmake
+    RESULT_VARIABLE JLLAMA_TTS_GEN_RESULT
+)
+if(NOT JLLAMA_TTS_GEN_RESULT EQUAL 0)
+    message(FATAL_ERROR "OuteTTS extraction failed; see cmake/generate-tts-upstream.cmake")
+endif()
+
 # b8831 added ggml_graph_next_uid() which calls _InterlockedIncrement64 via
 # <intrin.h> on x86. The intrinsic only exists on x64; provide the
 # implementation in a compat TU so the linker resolves __InterlockedIncrement64.
@@ -263,10 +286,19 @@ endif()
 
 add_library(jllama SHARED
     src/main/cpp/jllama.cpp
+    src/main/cpp/tts_engine.cpp
+    ${JLLAMA_TTS_GEN_CPP}
     src/main/cpp/utils.hpp
     ${llama.cpp_SOURCE_DIR}/tools/server/server-common.cpp
     ${llama.cpp_SOURCE_DIR}/tools/server/server-chat.cpp)
 
+# The generated TU keeps the whole pre-main() span of tts.cpp, so a few upstream CLI-only
+# helpers (print_usage, save_wav16, xterm colour) come along unused. Silence the resulting
+# unused-function warning on that one file (non-MSVC; MSVC's C4505 is off by default).
+if(NOT MSVC)
+    set_source_files_properties(${JLLAMA_TTS_GEN_CPP} PROPERTIES COMPILE_FLAGS "-Wno-unused-function")
+endif()
+
 # Phase 1 refactoring: compile upstream server library units directly into jllama
 # server.hpp has been replaced by direct upstream includes in jllama.cpp.
 # server-context.cpp, server-queue.cpp, server-task.cpp compile on all platforms
@@ -411,6 +443,7 @@ if(BUILD_TESTING)
         src/test/cpp/test_jni_helpers.cpp
         src/test/cpp/test_json_helpers.cpp
         src/test/cpp/test_log_helpers.cpp
+        src/test/cpp/test_tts_wav.cpp
         ${llama.cpp_SOURCE_DIR}/tools/server/server-common.cpp
         ${llama.cpp_SOURCE_DIR}/tools/server/server-chat.cpp
         ${llama.cpp_SOURCE_DIR}/tools/server/server-context.cpp