jobrunr · soufianebouaddis · Mar 26, 2026 · Apr 15, 2026
diff --git a/.gitignore b/.gitignore
@@ -44,3 +44,4 @@ workspace/skills/*
 workspace/tasks/*
 workspace/app.*
 *.private*
+workspace/conversations/*
diff --git a/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java
@@ -0,0 +1,16 @@
+package ai.javaclaw.speech;
+
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
+import org.springframework.stereotype.Service;
+
+import java.io.InputStream;
+
+@Service
+@ConditionalOnProperty(name = "speech.provider", havingValue = "mock", matchIfMissing = true)
+public class MockSpeechToTextService implements SpeechToTextService {
+
+    @Override
+    public String transcribe(InputStream audioStream) {
+        return "[voice message]";
+    }
+}
diff --git a/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java
@@ -0,0 +1,61 @@
+package ai.javaclaw.speech;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
+import org.springframework.core.io.InputStreamResource;
+import org.springframework.http.MediaType;
+import org.springframework.http.client.MultipartBodyBuilder;
+import org.springframework.stereotype.Service;
+import org.springframework.web.client.RestClient;
+
+import java.io.InputStream;
+
+@Service
+@ConditionalOnProperty(name = "speech.provider", havingValue = "openai")
+public class OpenAiSpeechToTextService implements SpeechToTextService {
+
+    private static final Logger log = LoggerFactory.getLogger(OpenAiSpeechToTextService.class);
+
+    private final RestClient restClient;
+    private final String model;
+
+    public OpenAiSpeechToTextService(
+            @Value("${spring.ai.openai.api-key}") String apiKey,
+            @Value("${speech.openai.model:whisper-1}") String model,
+            @Value("${speech.openai.base-url:https://api.openai.com/v1}") String baseUrl) {
+        this.model = model;
+        this.restClient = RestClient.builder()
+                .baseUrl(baseUrl)
+                .defaultHeader("Authorization", "Bearer " + apiKey)
+                .build();
+    }
+
+    @Override
+    public String transcribe(InputStream audioStream) {
+        log.info("Transcribing audio via OpenAI Whisper (model: {})", model);
+
+        MultipartBodyBuilder builder = new MultipartBodyBuilder();
+        builder.part("file", new InputStreamResource(audioStream))
+                .filename("voice.ogg")
+                .contentType(MediaType.parseMediaType("audio/ogg"));
+        builder.part("model", model);
+
+        TranscriptionResponse response = restClient.post()
+                .uri("/audio/transcriptions")
+                .contentType(MediaType.MULTIPART_FORM_DATA)
+                .body(builder.build())
+                .retrieve()
+                .body(TranscriptionResponse.class);
+
+        if (response == null || response.text() == null || response.text().isBlank()) {
+            throw new SpeechToTextException("OpenAI returned empty transcription");
+        }
+
+        log.info("OpenAI transcription completed successfully");
+        return response.text().trim();
+    }
+
+    private record TranscriptionResponse(String text) {}
+}
diff --git a/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java b/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java
@@ -0,0 +1,12 @@
+package ai.javaclaw.speech;
+
+public class SpeechToTextException extends RuntimeException {
+
+    public SpeechToTextException(String message) {
+        super(message);
+    }
+
+    public SpeechToTextException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
diff --git a/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java
@@ -0,0 +1,8 @@
+package ai.javaclaw.speech;
+
+import java.io.InputStream;
+
+public interface SpeechToTextService {
+
+    String transcribe(InputStream audioStream);
+}
diff --git a/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java
@@ -0,0 +1,139 @@
+package ai.javaclaw.speech;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
+import org.springframework.stereotype.Service;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.util.concurrent.TimeUnit;
+
+@Service
+@ConditionalOnProperty(name = "speech.provider", havingValue = "whisper-cpp")
+public class WhisperCppSpeechToTextService implements SpeechToTextService {
+
+    private static final Logger log = LoggerFactory.getLogger(WhisperCppSpeechToTextService.class);
+    private static final long TIMEOUT_SECONDS = 60;
+
+    private final String modelPath;
+
+    public WhisperCppSpeechToTextService(
+            @Value("${speech.whisper-cpp.model-path}") String modelPath) {
+        this.modelPath = modelPath;
+    }
+
+    @Override
+    public String transcribe(InputStream audioStream) {
+        Path tempOgg = null;
+        Path tempWav = null;
+        Path tempOutput = null;
+        try {
+            // Save the OGG stream to a temp file
+            tempOgg = Files.createTempFile("javaclaw-voice-", ".ogg");
+            Files.copy(audioStream, tempOgg, StandardCopyOption.REPLACE_EXISTING);
+
+            // Convert OGG/Opus to WAV (16kHz mono) — whisper-cli only accepts WAV
+            tempWav = Files.createTempFile("javaclaw-voice-", ".wav");
+            convertToWav(tempOgg, tempWav);
+
+            // Prepare output base path for whisper-cli
+            tempOutput = Files.createTempFile("javaclaw-whisper-", "");
+            String outputBase = tempOutput.toString();
+            Files.deleteIfExists(tempOutput);
+
+            log.info("Transcribing audio via whisper-cpp (model: {})", modelPath);
+
+            ProcessBuilder pb = new ProcessBuilder(
+                    "whisper-cli",
+                    "-m", modelPath,
+                    "-otxt",
+                    "-of", outputBase,
+                    "-np",
+                    "-nt",
+                    "-f", tempWav.toString()
+            );
+            pb.redirectErrorStream(true);
+
+            Process process = pb.start();
+            String processOutput = new String(process.getInputStream().readAllBytes());
+            boolean finished = process.waitFor(TIMEOUT_SECONDS, TimeUnit.SECONDS);
+            if (!finished) {
+                process.destroyForcibly();
+                throw new SpeechToTextException("whisper-cpp timed out after " + TIMEOUT_SECONDS + " seconds");
+            }
+
+            if (process.exitValue() != 0) {
+                log.error("whisper-cpp failed with output: {}", processOutput);
+                throw new SpeechToTextException("whisper-cpp exited with code " + process.exitValue() + ": " + processOutput);
+            }
+
+            // whisper-cli appends .txt to the output base path
+            Path resultFile = Path.of(outputBase + ".txt");
+            if (!Files.exists(resultFile)) {
+                log.error("whisper-cli output: {}", processOutput);
+                throw new SpeechToTextException("whisper-cpp did not produce output file: " + resultFile);
+            }
+
+            String transcription = Files.readString(resultFile).trim();
+            Files.deleteIfExists(resultFile);
+
+            if (transcription.isEmpty()) {
+                throw new SpeechToTextException("whisper-cpp returned empty transcription");
+            }
+
+            log.info("whisper-cpp transcription completed successfully");
+            return transcription;
+
+        } catch (IOException e) {
+            throw new SpeechToTextException("Failed to process audio file for whisper-cpp", e);
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new SpeechToTextException("whisper-cpp transcription interrupted", e);
+        } finally {
+            deleteSilently(tempOgg);
+            deleteSilently(tempWav);
+            deleteSilently(tempOutput);
+        }
+    }
+
+    private void convertToWav(Path input, Path output) throws IOException {
+        try {
+            Process process = new ProcessBuilder(
+                    "ffmpeg", "-y",
+                    "-i", input.toString(),
+                    "-ar", "16000",
+                    "-ac", "1",
+                    "-c:a", "pcm_s16le",
+                    output.toString()
+            ).redirectErrorStream(true).start();
+
+            process.getInputStream().readAllBytes();
+            boolean finished = process.waitFor(30, TimeUnit.SECONDS);
+            if (!finished) {
+                process.destroyForcibly();
+                throw new SpeechToTextException("ffmpeg conversion timed out");
+            }
+            if (process.exitValue() != 0) {
+                throw new SpeechToTextException("ffmpeg conversion failed with exit code " + process.exitValue());
+            }
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new SpeechToTextException("ffmpeg conversion interrupted", e);
+        }
+    }
+
+    private static void deleteSilently(Path path) {
+        if (path != null) {
+            try {
+                Files.deleteIfExists(path);
+            } catch (IOException e) {
+                log.warn("Failed to delete temp file: {}", path, e);
+            }
+        }
+    }
+}
diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java
@@ -4,6 +4,7 @@
 import ai.javaclaw.channels.Channel;
 import ai.javaclaw.channels.ChannelMessageReceivedEvent;
 import ai.javaclaw.channels.ChannelRegistry;
+import ai.javaclaw.speech.SpeechToTextService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.telegram.telegrambots.client.okhttp.OkHttpTelegramClient;
@@ -16,6 +17,10 @@
 import org.telegram.telegrambots.meta.exceptions.TelegramApiException;
 import org.telegram.telegrambots.meta.generics.TelegramClient;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Optional;
+
 import static java.util.Optional.ofNullable;
 
 public class TelegramChannel implements Channel, SpringLongPollingBot, LongPollingSingleThreadUpdateConsumer {
@@ -26,19 +31,27 @@ public class TelegramChannel implements Channel, SpringLongPollingBot, LongPolli
     private final TelegramClient telegramClient;
     private final Agent agent;
     private final ChannelRegistry channelRegistry;
+    private final SpeechToTextService speechToTextService;
+    private final TelegramVoiceDownloader voiceDownloader;
     private Long chatId;
     private Integer messageThreadId;
 
-    public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry) {
-        this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry);
+    public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) {
+        this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry, speechToTextService);
+    }
+
+    TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) {
+        this(botToken, allowedUsername, telegramClient, agent, channelRegistry, speechToTextService, new TelegramVoiceDownloader(telegramClient, botToken));
     }
 
-    TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry) {
+    TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService, TelegramVoiceDownloader voiceDownloader) {
         this.botToken = botToken;
         this.allowedUsername = normalizeUsername(allowedUsername);
         this.telegramClient = telegramClient;
         this.agent = agent;
         this.channelRegistry = channelRegistry;
+        this.speechToTextService = speechToTextService;
+        this.voiceDownloader = voiceDownloader;
         channelRegistry.registerChannel(this);
         log.info("Started Telegram integration");
     }
@@ -55,7 +68,7 @@ public LongPollingUpdateConsumer getUpdatesConsumer() {
 
     @Override
     public void consume(Update update) {
-        if (!(update.hasMessage() && update.getMessage().hasText())) return;
+        if (!update.hasMessage()) return;
 
         Message requestMessage = update.getMessage();
         String userName = requestMessage.getFrom() == null ? null : requestMessage.getFrom().getUserName();
@@ -65,11 +78,13 @@ public void consume(Update update) {
             return;
         }
 
-        String messageText = requestMessage.getText();
+        Optional<String> messageText = resolveMessageText(requestMessage);
+        if (messageText.isEmpty()) return;
+
         this.chatId = requestMessage.getChatId();
         this.messageThreadId = requestMessage.getMessageThreadId();
-        channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText, chatId, messageThreadId));
-        String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText);
+        channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText.get(), chatId, messageThreadId));
+        String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText.get());
         sendMessage(chatId, messageThreadId, response);
     }
 
@@ -95,6 +110,28 @@ public void sendMessage(long chatId, Integer messageThreadId, String message) {
         }
     }
 
+    private Optional<String> resolveMessageText(Message message) {
+        if (message.hasText()) {
+            return Optional.of(message.getText());
+        }
+        if (message.hasVoice()) {
+            return transcribeVoice(message);
+        }
+        return Optional.empty();
+    }
+
+    private Optional<String> transcribeVoice(Message message) {
+        log.info("Voice message received, downloading audio");
+        try (InputStream voiceStream = voiceDownloader.download(message)) {
+            String transcribed = speechToTextService.transcribe(voiceStream);
+            log.info("Voice message transcribed successfully");
+            return Optional.of(transcribed);
+        } catch (IOException | TelegramApiException e) {
+            log.error("Failed to process voice message", e);
+            return Optional.empty();
+        }
+    }
+
     private boolean isAllowedUser(String userName) {
         String normalizedUserName = normalizeUsername(userName);
         return normalizedUserName != null && normalizedUserName.equalsIgnoreCase(allowedUsername);

diff --git a/...elegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java b/...elegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java
@@ -3,6 +3,7 @@
 
 import ai.javaclaw.agent.Agent;
 import ai.javaclaw.channels.ChannelRegistry;
+import ai.javaclaw.speech.SpeechToTextService;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.boot.autoconfigure.AutoConfiguration;
 import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
@@ -19,7 +20,8 @@ public class TelegramChannelAutoConfiguration {
     public TelegramChannel telegramChannel(@Value("${agent.channels.telegram.token:null}") String botToken,
                                            @Value("${agent.channels.telegram.username:null}") String allowedUsername,
                                            Agent agent,
-                                           ChannelRegistry channelRegistry) {
-        return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry);
+                                           ChannelRegistry channelRegistry,
+                                           SpeechToTextService speechToTextService) {
+        return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry, speechToTextService);
     }
 }