diff --git a/CLAUDE.md b/CLAUDE.md index ad0289d8..f6aeab79 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -585,6 +585,9 @@ the README. The summary below covers only the optional-model bindings: | `net.ladenthin.llama.vision.model` | `MultimodalIntegrationTest` (upstream kherud/java-llama.cpp#103 / #34) | `SmolVLM-500M-Instruct-Q8_0.gguf` (any vision-capable GGUF works) | | `net.ladenthin.llama.vision.mmproj` | `MultimodalIntegrationTest` | matching mmproj for the vision model, e.g. `mmproj-SmolVLM-500M-Instruct-Q8_0.gguf` | | `net.ladenthin.llama.vision.image` | `MultimodalIntegrationTest` | committed default `src/test/resources/images/test-image.jpg`; override to any png/jpeg/webp/gif on disk | +| `net.ladenthin.llama.audio.model` | `AudioInputIntegrationTest` (llama.cpp discussion #13759) | audio-input model GGUF, e.g. `ultravox-v0_5-llama-3_2-1b.gguf` | +| `net.ladenthin.llama.audio.mmproj` | `AudioInputIntegrationTest` | matching audio mmproj/encoder, e.g. `mmproj-ultravox-v0_5-llama-3_2-1b-f16.gguf` | +| `net.ladenthin.llama.audio.input` | `AudioInputIntegrationTest` | a `.wav`/`.mp3` clip on disk (no committed default — audio is not committed) | Run those tests by setting the property: ```bash @@ -596,6 +599,12 @@ mvn test -Dtest=MultimodalIntegrationTest \ # The vision.image property defaults to src/test/resources/images/test-image.jpg # (a CC-BY-4.0 / MIT-granted photo of flowers and bees by the project author); # override only if you want to test a different image. + +# Audio input (Ultravox / Qwen2.5-Omni; the audio clip has no committed default): +mvn test -Dtest=AudioInputIntegrationTest \ + -Dnet.ladenthin.llama.audio.model=models/ultravox-v0_5-llama-3_2-1b.gguf \ + -Dnet.ladenthin.llama.audio.mmproj=models/mmproj-ultravox-v0_5-llama-3_2-1b-f16.gguf \ + -Dnet.ladenthin.llama.audio.input=/path/to/speech.wav ``` `MultimodalIntegrationTest` self-skips when any of the three vision properties diff --git a/README.md b/README.md index df756d66..2bba0dc2 100644 --- a/README.md +++ b/README.md @@ -278,8 +278,11 @@ Every `net.ladenthin.llama.*` system property recognised by the library, deep-sc | `net.ladenthin.llama.vision.model` | unset (test self-skips) | test | `MultimodalIntegrationTest` (upstream kherud/java-llama.cpp#103 / #34) | Path to a vision-capable model GGUF. Any vision-capable GGUF works; CI default is `SmolVLM-500M-Instruct-Q8_0.gguf`. | | `net.ladenthin.llama.vision.mmproj` | unset (test self-skips) | test | `MultimodalIntegrationTest` | Matching mmproj GGUF for the vision model. | | `net.ladenthin.llama.vision.image` | `src/test/resources/images/test-image.jpg` (a CC-BY-4.0 / MIT-granted photo committed to the repo) | test | `MultimodalIntegrationTest` | Visual prompt image. Any png/jpeg/webp/gif works; the extension drives MIME detection. | +| `net.ladenthin.llama.audio.model` | unset (test self-skips) | test | `AudioInputIntegrationTest` (llama.cpp discussion #13759) | Path to an audio-input model GGUF (e.g. Ultravox, Qwen2.5-Omni). | +| `net.ladenthin.llama.audio.mmproj` | unset (test self-skips) | test | `AudioInputIntegrationTest` | Matching audio mmproj (encoder) GGUF. | +| `net.ladenthin.llama.audio.input` | unset (test self-skips) | test | `AudioInputIntegrationTest` | `.wav`/`.mp3` audio prompt clip; the extension drives format detection. | -`MultimodalIntegrationTest` self-skips when any of the three `vision.*` properties points at a missing path, so a partial setup (just the vision model + the committed image, no mmproj) lets the test class load without erroring. +`MultimodalIntegrationTest` self-skips when any of the three `vision.*` properties points at a missing path, so a partial setup (just the vision model + the committed image, no mmproj) lets the test class load without erroring. `AudioInputIntegrationTest` self-skips the same way over the three `audio.*` properties. ## Documentation @@ -409,6 +412,30 @@ OpenAI-compatible `/v1/chat/completions` server. For a strictly CPU-only run, us `setDevices("none").setMmprojOffload(false)` in addition to `setGpuLayers(0)`; projector offload has its own upstream default. +**Audio input** works identically — load an audio-capable model (Ultravox, Qwen2.5-Omni, …) with its +audio `--mmproj` and add a `ContentPart.audioFile(...)` (or `inputAudio(bytes, "wav"|"mp3")`) part. It +serializes to the OpenAI `input_audio` content part and routes through the same `mtmd` pipeline: + +```java +ModelParameters modelParams = new ModelParameters() + .setModel("models/ultravox-v0_5-llama-3_2-1b.gguf") + .setMmproj("models/mmproj-ultravox-v0_5-llama-3_2-1b-f16.gguf"); + +ChatMessage message = ChatMessage.userMultimodal( + ContentPart.text("Transcribe the audio."), + ContentPart.audioFile(Paths.get("speech.wav"))); + +try (LlamaModel model = new LlamaModel(modelParams)) { + System.out.println(model.supportsAudio()); // true + String answer = model.chatCompleteText(InferenceParameters.empty() + .withMessages(Collections.singletonList(message)) + .withNPredict(64)); + System.out.println(answer); +} +``` + +`LlamaModel.supportsVision()` / `supportsAudio()` report which modalities the loaded projector enables. + ### Tool Calling Use a tool-aware instruct model and enable Jinja when loading it. A typed request can either return diff --git a/src/main/java/net/ladenthin/llama/LlamaModel.java b/src/main/java/net/ladenthin/llama/LlamaModel.java index a176a986..bc6793e5 100644 --- a/src/main/java/net/ladenthin/llama/LlamaModel.java +++ b/src/main/java/net/ladenthin/llama/LlamaModel.java @@ -851,6 +851,17 @@ public boolean supportsVision() { return getModelMeta().supportsVision(); } + /** + * Reports whether the loaded model accepts audio input (an audio-capable {@code --mmproj}, + * e.g. Ultravox / Qwen2.5-Omni). Audio clips are supplied as + * {@link net.ladenthin.llama.value.ContentPart#inputAudio(byte[], String)} parts. + * + * @return {@code true} when audio input is available + */ + public boolean supportsAudio() { + return getModelMeta().supportsAudio(); + } + native String getModelMetaJson(); /** diff --git a/src/main/java/net/ladenthin/llama/parameters/ParameterJsonSerializer.java b/src/main/java/net/ladenthin/llama/parameters/ParameterJsonSerializer.java index b07ddc59..7824b22a 100644 --- a/src/main/java/net/ladenthin/llama/parameters/ParameterJsonSerializer.java +++ b/src/main/java/net/ladenthin/llama/parameters/ParameterJsonSerializer.java @@ -126,6 +126,14 @@ public ArrayNode buildMessages(List messages) { part.put("type", "text"); final String text = p.getText(); part.put("text", text != null ? text : ""); + } else if (p.getType() == ContentPart.Type.INPUT_AUDIO) { + part.put("type", "input_audio"); + ObjectNode inputAudio = OBJECT_MAPPER.createObjectNode(); + final String data = p.getAudioData(); + final String format = p.getAudioFormat(); + inputAudio.put("data", data != null ? data : ""); + inputAudio.put("format", format != null ? format : "wav"); + part.set("input_audio", inputAudio); } else { part.put("type", "image_url"); ObjectNode imageUrl = OBJECT_MAPPER.createObjectNode(); diff --git a/src/main/java/net/ladenthin/llama/value/ContentPart.java b/src/main/java/net/ladenthin/llama/value/ContentPart.java index 89ec20a6..e167fa7d 100644 --- a/src/main/java/net/ladenthin/llama/value/ContentPart.java +++ b/src/main/java/net/ladenthin/llama/value/ContentPart.java @@ -44,17 +44,28 @@ public enum Type { /** A plain-text fragment. */ TEXT, /** An image reference (data URI or remote URL). */ - IMAGE_URL + IMAGE_URL, + /** An audio clip (base64 {@code data} + {@code format}), for audio-input models. */ + INPUT_AUDIO } private final Type type; private final @Nullable String text; private final @Nullable String imageUrl; + private final @Nullable String audioData; + private final @Nullable String audioFormat; - private ContentPart(Type type, @Nullable String text, @Nullable String imageUrl) { + private ContentPart( + Type type, + @Nullable String text, + @Nullable String imageUrl, + @Nullable String audioData, + @Nullable String audioFormat) { this.type = type; this.text = text; this.imageUrl = imageUrl; + this.audioData = audioData; + this.audioFormat = audioFormat; } /** @@ -65,7 +76,7 @@ private ContentPart(Type type, @Nullable String text, @Nullable String imageUrl) */ public static ContentPart text(String text) { Objects.requireNonNull(text, "text"); - return new ContentPart(Type.TEXT, text, null); + return new ContentPart(Type.TEXT, text, null, null, null); } /** @@ -78,7 +89,7 @@ public static ContentPart text(String text) { */ public static ContentPart imageUrl(String url) { Objects.requireNonNull(url, "url"); - return new ContentPart(Type.IMAGE_URL, null, url); + return new ContentPart(Type.IMAGE_URL, null, url, null, null); } /** @@ -96,7 +107,7 @@ public static ContentPart imageBytes(byte[] bytes, String mimeType) { throw new IllegalArgumentException("mimeType must not be empty (bytes.length=" + bytes.length + ")"); } String encoded = Base64.getEncoder().encodeToString(bytes); - return new ContentPart(Type.IMAGE_URL, null, "data:" + mimeType + ";base64," + encoded); + return new ContentPart(Type.IMAGE_URL, null, "data:" + mimeType + ";base64," + encoded, null, null); } /** @@ -133,6 +144,56 @@ public static ContentPart imageFile(Path imagePath) throws IOException { return imageBytes(Files.readAllBytes(imagePath), mimeType); } + /** + * Build an audio part from raw bytes plus an explicit container format. Mirrors the OpenAI + * {@code input_audio} content part the upstream {@code llama.cpp} server understands, routed + * through the {@code mtmd} audio pipeline (requires an audio-capable {@code --mmproj}). The bytes + * are base64-encoded. + * + * @param audioBytes raw audio bytes (must not be {@code null}) + * @param format container format, {@code "wav"} or {@code "mp3"} (case-insensitive) + * @return an INPUT_AUDIO part carrying the base64 data and normalised format + * @throws IllegalArgumentException if {@code format} is not {@code "wav"} or {@code "mp3"} + */ + public static ContentPart inputAudio(byte[] audioBytes, String format) { + Objects.requireNonNull(audioBytes, "audioBytes"); + Objects.requireNonNull(format, "format"); + String normalized = format.toLowerCase(Locale.ROOT); + if (!normalized.equals("wav") && !normalized.equals("mp3")) { + throw new IllegalArgumentException("audio format must be 'wav' or 'mp3', was: " + format); + } + String encoded = Base64.getEncoder().encodeToString(audioBytes); + return new ContentPart(Type.INPUT_AUDIO, null, null, encoded, normalized); + } + + /** + * Build an audio part by reading a file from disk and detecting its format from the file + * extension. Recognised extensions: {@code .wav}, {@code .mp3}. Anything else throws + * {@link IllegalArgumentException}; use {@link #inputAudio(byte[], String)} to force a format. + * + * @param audioPath path to the audio file (must not be {@code null}) + * @return an INPUT_AUDIO part carrying the data + * @throws IOException if the file cannot be read + */ + public static ContentPart audioFile(Path audioPath) throws IOException { + Objects.requireNonNull(audioPath, "audioPath"); + Path fileNamePath = audioPath.getFileName(); + if (fileNamePath == null) { + throw new IllegalArgumentException("audioPath has no file name component: " + audioPath); + } + String name = fileNamePath.toString().toLowerCase(Locale.ROOT); + String format; + if (name.endsWith(".wav")) { + format = "wav"; + } else if (name.endsWith(".mp3")) { + format = "mp3"; + } else { + throw new IllegalArgumentException("Cannot infer audio format from extension: " + audioPath + + " — use ContentPart.inputAudio(bytes, format) instead"); + } + return inputAudio(Files.readAllBytes(audioPath), format); + } + /** * Part-kind accessor. * @return the discriminator selecting {@link #getText()} or {@link #getImageUrl()} @@ -156,4 +217,20 @@ public Type getType() { public @Nullable String getImageUrl() { return imageUrl; } + + /** + * Base64 audio-data accessor (only set for {@link Type#INPUT_AUDIO}). + * @return the base64-encoded audio bytes, or {@code null} for non-audio parts + */ + public @Nullable String getAudioData() { + return audioData; + } + + /** + * Audio container-format accessor (only set for {@link Type#INPUT_AUDIO}). + * @return {@code "wav"} or {@code "mp3"}, or {@code null} for non-audio parts + */ + public @Nullable String getAudioFormat() { + return audioFormat; + } } diff --git a/src/test/java/net/ladenthin/llama/AudioInputIntegrationTest.java b/src/test/java/net/ladenthin/llama/AudioInputIntegrationTest.java new file mode 100644 index 00000000..5176c5c1 --- /dev/null +++ b/src/test/java/net/ladenthin/llama/AudioInputIntegrationTest.java @@ -0,0 +1,100 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Paths; +import java.util.Collections; +import java.util.concurrent.TimeUnit; +import net.ladenthin.llama.parameters.InferenceParameters; +import net.ladenthin.llama.parameters.ModelParameters; +import net.ladenthin.llama.value.ChatMessage; +import net.ladenthin.llama.value.ContentPart; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; + +/** + * Real-model coverage for audio input (llama.cpp discussion #13759). Loads an audio-capable + * model (Ultravox / Qwen2.5-Omni) with its audio {@code --mmproj} and sends a multipart message + * carrying a {@link ContentPart#audioFile(java.nio.file.Path)} clip, exercising: + * + * + *

Self-skips when any of the three system properties + * ({@link TestConstants#PROP_AUDIO_MODEL_PATH} / {@link TestConstants#PROP_AUDIO_MMPROJ_PATH} / + * {@link TestConstants#PROP_AUDIO_PATH}) is unset or its file is missing, so it runs only in CI or on a + * dev machine where the (large) audio model and a clip have been staged. + */ +public class AudioInputIntegrationTest { + + private static LlamaModel model; + private static String audioPath; + + @BeforeAll + public static void setup() { + String modelPath = System.getProperty(TestConstants.PROP_AUDIO_MODEL_PATH); + String mmprojPath = System.getProperty(TestConstants.PROP_AUDIO_MMPROJ_PATH); + audioPath = System.getProperty(TestConstants.PROP_AUDIO_PATH); + + Assumptions.assumeTrue( + modelPath != null && !modelPath.isEmpty(), + "Audio model path not set (-D" + TestConstants.PROP_AUDIO_MODEL_PATH + "=...)"); + Assumptions.assumeTrue( + mmprojPath != null && !mmprojPath.isEmpty(), + "Audio mmproj path not set (-D" + TestConstants.PROP_AUDIO_MMPROJ_PATH + "=...)"); + Assumptions.assumeTrue( + audioPath != null && !audioPath.isEmpty(), + "Audio clip path not set (-D" + TestConstants.PROP_AUDIO_PATH + "=...)"); + Assumptions.assumeTrue(new File(modelPath).exists(), "Audio model file missing: " + modelPath); + Assumptions.assumeTrue(new File(mmprojPath).exists(), "Audio mmproj file missing: " + mmprojPath); + Assumptions.assumeTrue(new File(audioPath).exists(), "Audio clip missing: " + audioPath); + + int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL); + ModelParameters parameters = new ModelParameters() + .setCtxSize(4096) + .setModel(modelPath) + .setMmproj(mmprojPath) + .setGpuLayers(gpuLayers) + .setFit(false); + if (gpuLayers == 0) { + parameters.setDevices("none").setMmprojOffload(false); + } + model = new LlamaModel(parameters); + assertTrue(model.supportsAudio(), "loaded model + mmproj must advertise audio input"); + } + + @AfterAll + public static void tearDown() { + if (model != null) { + model.close(); + } + } + + @Test + @DisplayName("an input_audio content part reaches the model and yields a non-empty reply") + @Timeout(value = 240_000, unit = TimeUnit.MILLISECONDS) + public void audioInputProducesNonEmptyReply() throws IOException { + ChatMessage message = ChatMessage.userMultimodal( + ContentPart.text("Transcribe the audio."), ContentPart.audioFile(Paths.get(audioPath))); + + String reply = model.chatCompleteText(InferenceParameters.empty() + .withMessages(Collections.singletonList(message)) + .withNPredict(64)); + + assertFalse(reply.trim().isEmpty(), "reply must be non-empty for an audio prompt; got: \"" + reply + "\""); + } +} diff --git a/src/test/java/net/ladenthin/llama/TestConstants.java b/src/test/java/net/ladenthin/llama/TestConstants.java index d15e36b3..3a3db3c4 100644 --- a/src/test/java/net/ladenthin/llama/TestConstants.java +++ b/src/test/java/net/ladenthin/llama/TestConstants.java @@ -71,4 +71,21 @@ public class TestConstants { * resource so the test needs no network access for the visual prompt. */ public static final String DEFAULT_VISION_IMAGE_PATH = "src/test/resources/images/test-image.jpg"; + + /** + * System property holding a path to an audio-input model GGUF (e.g. Ultravox / Qwen2.5-Omni). + * Consumed by {@code AudioInputIntegrationTest} (llama.cpp discussion #13759). The test self-skips + * when this, the mmproj, or the audio clip is unset/missing. + */ + public static final String PROP_AUDIO_MODEL_PATH = LlamaSystemProperties.PREFIX + ".audio.model"; + + /** System property holding a path to the matching audio mmproj (encoder) GGUF. */ + public static final String PROP_AUDIO_MMPROJ_PATH = LlamaSystemProperties.PREFIX + ".audio.mmproj"; + + /** + * System property holding a path to a {@code .wav} or {@code .mp3} clip used as the audio prompt in + * {@code AudioInputIntegrationTest}. The matching extension drives format detection in + * {@code ContentPart.audioFile(Path)}. + */ + public static final String PROP_AUDIO_PATH = LlamaSystemProperties.PREFIX + ".audio.input"; } diff --git a/src/test/java/net/ladenthin/llama/parameters/ChatRequestTest.java b/src/test/java/net/ladenthin/llama/parameters/ChatRequestTest.java index f9ccd10e..2b1ae2dc 100644 --- a/src/test/java/net/ladenthin/llama/parameters/ChatRequestTest.java +++ b/src/test/java/net/ladenthin/llama/parameters/ChatRequestTest.java @@ -210,6 +210,19 @@ void buildMessagesJsonPreservesMultimodalParts() { assertThat(json, containsString("data:image/png;base64,AAAA")); } + @Test + void buildMessagesJsonEmitsInputAudioParts() { + ChatRequest req = ChatRequest.empty() + .appendMessage(ChatMessage.userMultimodal( + ContentPart.text("transcribe"), ContentPart.inputAudio(new byte[] {1, 2, 3}, "wav"))); + String json = req.buildMessagesJson(); + assertThat(json, containsString("\"type\":\"input_audio\"")); + assertThat(json, containsString("\"format\":\"wav\"")); + assertThat( + json, + containsString("\"data\":\"" + java.util.Base64.getEncoder().encodeToString(new byte[] {1, 2, 3}))); + } + @Test void buildToolsJsonEmptyWhenNoTools() { assertThat(ChatRequest.empty().buildToolsJson().isPresent(), is(false)); diff --git a/src/test/java/net/ladenthin/llama/value/ContentPartTest.java b/src/test/java/net/ladenthin/llama/value/ContentPartTest.java index 0c0bcd84..fb24c75b 100644 --- a/src/test/java/net/ladenthin/llama/value/ContentPartTest.java +++ b/src/test/java/net/ladenthin/llama/value/ContentPartTest.java @@ -50,6 +50,27 @@ public void imageBytesProducesDataUri() { assertThat(p.getImageUrl(), is(expected)); } + @Test + public void inputAudioBase64EncodesAndNormalisesFormat() { + byte[] bytes = {1, 2, 3, 4, 5}; + ContentPart p = ContentPart.inputAudio(bytes, "WAV"); + assertThat(p.getType(), is(ContentPart.Type.INPUT_AUDIO)); + assertThat(p.getAudioData(), is(Base64.getEncoder().encodeToString(bytes))); + assertThat(p.getAudioFormat(), is("wav")); + assertThat(p.getImageUrl(), is(nullValue())); + assertThat(p.getText(), is(nullValue())); + } + + @Test + public void inputAudioRejectsUnsupportedFormat() { + assertThrows(IllegalArgumentException.class, () -> ContentPart.inputAudio(new byte[] {1}, "ogg")); + } + + @Test + public void inputAudioRejectsNullBytes() { + assertThrows(NullPointerException.class, () -> ContentPart.inputAudio(null, "wav")); + } + @Test public void textRejectsNull() { assertThrows(NullPointerException.class, () -> ContentPart.text(null));