diff --git a/.gitignore b/.gitignore index 0485df7..3c6c38a 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ workspace/skills/* workspace/tasks/* workspace/app.* *.private* +workspace/conversations/* \ No newline at end of file diff --git a/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java new file mode 100644 index 0000000..8d94106 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java @@ -0,0 +1,16 @@ +package ai.javaclaw.speech; + +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Service; + +import java.io.InputStream; + +@Service +@ConditionalOnProperty(name = "speech.provider", havingValue = "mock", matchIfMissing = true) +public class MockSpeechToTextService implements SpeechToTextService { + + @Override + public String transcribe(InputStream audioStream) { + return "[voice message]"; + } +} diff --git a/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java new file mode 100644 index 0000000..3530a12 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java @@ -0,0 +1,61 @@ +package ai.javaclaw.speech; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.core.io.InputStreamResource; +import org.springframework.http.MediaType; +import org.springframework.http.client.MultipartBodyBuilder; +import org.springframework.stereotype.Service; +import org.springframework.web.client.RestClient; + +import java.io.InputStream; + +@Service +@ConditionalOnProperty(name = "speech.provider", havingValue = "openai") +public class OpenAiSpeechToTextService implements SpeechToTextService { + + private static final Logger log = LoggerFactory.getLogger(OpenAiSpeechToTextService.class); + + private final RestClient restClient; + private final String model; + + public OpenAiSpeechToTextService( + @Value("${spring.ai.openai.api-key}") String apiKey, + @Value("${speech.openai.model:whisper-1}") String model, + @Value("${speech.openai.base-url:https://api.openai.com/v1}") String baseUrl) { + this.model = model; + this.restClient = RestClient.builder() + .baseUrl(baseUrl) + .defaultHeader("Authorization", "Bearer " + apiKey) + .build(); + } + + @Override + public String transcribe(InputStream audioStream) { + log.info("Transcribing audio via OpenAI Whisper (model: {})", model); + + MultipartBodyBuilder builder = new MultipartBodyBuilder(); + builder.part("file", new InputStreamResource(audioStream)) + .filename("voice.ogg") + .contentType(MediaType.parseMediaType("audio/ogg")); + builder.part("model", model); + + TranscriptionResponse response = restClient.post() + .uri("/audio/transcriptions") + .contentType(MediaType.MULTIPART_FORM_DATA) + .body(builder.build()) + .retrieve() + .body(TranscriptionResponse.class); + + if (response == null || response.text() == null || response.text().isBlank()) { + throw new SpeechToTextException("OpenAI returned empty transcription"); + } + + log.info("OpenAI transcription completed successfully"); + return response.text().trim(); + } + + private record TranscriptionResponse(String text) {} +} diff --git a/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java b/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java new file mode 100644 index 0000000..d5483a4 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java @@ -0,0 +1,12 @@ +package ai.javaclaw.speech; + +public class SpeechToTextException extends RuntimeException { + + public SpeechToTextException(String message) { + super(message); + } + + public SpeechToTextException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java new file mode 100644 index 0000000..ac9f860 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java @@ -0,0 +1,8 @@ +package ai.javaclaw.speech; + +import java.io.InputStream; + +public interface SpeechToTextService { + + String transcribe(InputStream audioStream); +} diff --git a/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java new file mode 100644 index 0000000..d656f71 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java @@ -0,0 +1,139 @@ +package ai.javaclaw.speech; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Service; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.concurrent.TimeUnit; + +@Service +@ConditionalOnProperty(name = "speech.provider", havingValue = "whisper-cpp") +public class WhisperCppSpeechToTextService implements SpeechToTextService { + + private static final Logger log = LoggerFactory.getLogger(WhisperCppSpeechToTextService.class); + private static final long TIMEOUT_SECONDS = 60; + + private final String modelPath; + + public WhisperCppSpeechToTextService( + @Value("${speech.whisper-cpp.model-path}") String modelPath) { + this.modelPath = modelPath; + } + + @Override + public String transcribe(InputStream audioStream) { + Path tempOgg = null; + Path tempWav = null; + Path tempOutput = null; + try { + // Save the OGG stream to a temp file + tempOgg = Files.createTempFile("javaclaw-voice-", ".ogg"); + Files.copy(audioStream, tempOgg, StandardCopyOption.REPLACE_EXISTING); + + // Convert OGG/Opus to WAV (16kHz mono) — whisper-cli only accepts WAV + tempWav = Files.createTempFile("javaclaw-voice-", ".wav"); + convertToWav(tempOgg, tempWav); + + // Prepare output base path for whisper-cli + tempOutput = Files.createTempFile("javaclaw-whisper-", ""); + String outputBase = tempOutput.toString(); + Files.deleteIfExists(tempOutput); + + log.info("Transcribing audio via whisper-cpp (model: {})", modelPath); + + ProcessBuilder pb = new ProcessBuilder( + "whisper-cli", + "-m", modelPath, + "-otxt", + "-of", outputBase, + "-np", + "-nt", + "-f", tempWav.toString() + ); + pb.redirectErrorStream(true); + + Process process = pb.start(); + String processOutput = new String(process.getInputStream().readAllBytes()); + boolean finished = process.waitFor(TIMEOUT_SECONDS, TimeUnit.SECONDS); + if (!finished) { + process.destroyForcibly(); + throw new SpeechToTextException("whisper-cpp timed out after " + TIMEOUT_SECONDS + " seconds"); + } + + if (process.exitValue() != 0) { + log.error("whisper-cpp failed with output: {}", processOutput); + throw new SpeechToTextException("whisper-cpp exited with code " + process.exitValue() + ": " + processOutput); + } + + // whisper-cli appends .txt to the output base path + Path resultFile = Path.of(outputBase + ".txt"); + if (!Files.exists(resultFile)) { + log.error("whisper-cli output: {}", processOutput); + throw new SpeechToTextException("whisper-cpp did not produce output file: " + resultFile); + } + + String transcription = Files.readString(resultFile).trim(); + Files.deleteIfExists(resultFile); + + if (transcription.isEmpty()) { + throw new SpeechToTextException("whisper-cpp returned empty transcription"); + } + + log.info("whisper-cpp transcription completed successfully"); + return transcription; + + } catch (IOException e) { + throw new SpeechToTextException("Failed to process audio file for whisper-cpp", e); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new SpeechToTextException("whisper-cpp transcription interrupted", e); + } finally { + deleteSilently(tempOgg); + deleteSilently(tempWav); + deleteSilently(tempOutput); + } + } + + private void convertToWav(Path input, Path output) throws IOException { + try { + Process process = new ProcessBuilder( + "ffmpeg", "-y", + "-i", input.toString(), + "-ar", "16000", + "-ac", "1", + "-c:a", "pcm_s16le", + output.toString() + ).redirectErrorStream(true).start(); + + process.getInputStream().readAllBytes(); + boolean finished = process.waitFor(30, TimeUnit.SECONDS); + if (!finished) { + process.destroyForcibly(); + throw new SpeechToTextException("ffmpeg conversion timed out"); + } + if (process.exitValue() != 0) { + throw new SpeechToTextException("ffmpeg conversion failed with exit code " + process.exitValue()); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new SpeechToTextException("ffmpeg conversion interrupted", e); + } + } + + private static void deleteSilently(Path path) { + if (path != null) { + try { + Files.deleteIfExists(path); + } catch (IOException e) { + log.warn("Failed to delete temp file: {}", path, e); + } + } + } +} diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java index fbc2af3..e321182 100644 --- a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java +++ b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java @@ -4,6 +4,7 @@ import ai.javaclaw.channels.Channel; import ai.javaclaw.channels.ChannelMessageReceivedEvent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.telegram.telegrambots.client.okhttp.OkHttpTelegramClient; @@ -16,6 +17,10 @@ import org.telegram.telegrambots.meta.exceptions.TelegramApiException; import org.telegram.telegrambots.meta.generics.TelegramClient; +import java.io.IOException; +import java.io.InputStream; +import java.util.Optional; + import static java.util.Optional.ofNullable; public class TelegramChannel implements Channel, SpringLongPollingBot, LongPollingSingleThreadUpdateConsumer { @@ -26,19 +31,27 @@ public class TelegramChannel implements Channel, SpringLongPollingBot, LongPolli private final TelegramClient telegramClient; private final Agent agent; private final ChannelRegistry channelRegistry; + private final SpeechToTextService speechToTextService; + private final TelegramVoiceDownloader voiceDownloader; private Long chatId; private Integer messageThreadId; - public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry) { - this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry); + public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) { + this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry, speechToTextService); + } + + TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) { + this(botToken, allowedUsername, telegramClient, agent, channelRegistry, speechToTextService, new TelegramVoiceDownloader(telegramClient, botToken)); } - TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry) { + TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService, TelegramVoiceDownloader voiceDownloader) { this.botToken = botToken; this.allowedUsername = normalizeUsername(allowedUsername); this.telegramClient = telegramClient; this.agent = agent; this.channelRegistry = channelRegistry; + this.speechToTextService = speechToTextService; + this.voiceDownloader = voiceDownloader; channelRegistry.registerChannel(this); log.info("Started Telegram integration"); } @@ -55,7 +68,7 @@ public LongPollingUpdateConsumer getUpdatesConsumer() { @Override public void consume(Update update) { - if (!(update.hasMessage() && update.getMessage().hasText())) return; + if (!update.hasMessage()) return; Message requestMessage = update.getMessage(); String userName = requestMessage.getFrom() == null ? null : requestMessage.getFrom().getUserName(); @@ -65,11 +78,13 @@ public void consume(Update update) { return; } - String messageText = requestMessage.getText(); + Optional messageText = resolveMessageText(requestMessage); + if (messageText.isEmpty()) return; + this.chatId = requestMessage.getChatId(); this.messageThreadId = requestMessage.getMessageThreadId(); - channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText, chatId, messageThreadId)); - String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText); + channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText.get(), chatId, messageThreadId)); + String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText.get()); sendMessage(chatId, messageThreadId, response); } @@ -95,6 +110,28 @@ public void sendMessage(long chatId, Integer messageThreadId, String message) { } } + private Optional resolveMessageText(Message message) { + if (message.hasText()) { + return Optional.of(message.getText()); + } + if (message.hasVoice()) { + return transcribeVoice(message); + } + return Optional.empty(); + } + + private Optional transcribeVoice(Message message) { + log.info("Voice message received, downloading audio"); + try (InputStream voiceStream = voiceDownloader.download(message)) { + String transcribed = speechToTextService.transcribe(voiceStream); + log.info("Voice message transcribed successfully"); + return Optional.of(transcribed); + } catch (IOException | TelegramApiException e) { + log.error("Failed to process voice message", e); + return Optional.empty(); + } + } + private boolean isAllowedUser(String userName) { String normalizedUserName = normalizeUsername(userName); return normalizedUserName != null && normalizedUserName.equalsIgnoreCase(allowedUsername); diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java index 76bb8c4..a007c61 100644 --- a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java +++ b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java @@ -3,6 +3,7 @@ import ai.javaclaw.agent.Agent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.autoconfigure.AutoConfiguration; import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; @@ -19,7 +20,8 @@ public class TelegramChannelAutoConfiguration { public TelegramChannel telegramChannel(@Value("${agent.channels.telegram.token:null}") String botToken, @Value("${agent.channels.telegram.username:null}") String allowedUsername, Agent agent, - ChannelRegistry channelRegistry) { - return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry); + ChannelRegistry channelRegistry, + SpeechToTextService speechToTextService) { + return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry, speechToTextService); } } diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramVoiceDownloader.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramVoiceDownloader.java new file mode 100644 index 0000000..5ab8a04 --- /dev/null +++ b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramVoiceDownloader.java @@ -0,0 +1,29 @@ +package ai.javaclaw.channels.telegram; + +import org.telegram.telegrambots.meta.api.methods.GetFile; +import org.telegram.telegrambots.meta.api.objects.message.Message; +import org.telegram.telegrambots.meta.exceptions.TelegramApiException; +import org.telegram.telegrambots.meta.generics.TelegramClient; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; + +class TelegramVoiceDownloader { + + private final TelegramClient telegramClient; + private final String botToken; + + TelegramVoiceDownloader(TelegramClient telegramClient, String botToken) { + this.telegramClient = telegramClient; + this.botToken = botToken; + } + + InputStream download(Message message) throws TelegramApiException, IOException { + String fileId = message.getVoice().getFileId(); + GetFile getFile = new GetFile(fileId); + String filePath = telegramClient.execute(getFile).getFilePath(); + String fileUrl = "https://api.telegram.org/file/bot" + botToken + "/" + filePath; + return URI.create(fileUrl).toURL().openStream(); + } +} diff --git a/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java b/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java index adbcc52..2240d5c 100644 --- a/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java +++ b/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java @@ -2,6 +2,7 @@ import ai.javaclaw.agent.Agent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; @@ -13,6 +14,10 @@ import org.telegram.telegrambots.meta.exceptions.TelegramApiException; import org.telegram.telegrambots.meta.generics.TelegramClient; +import java.io.ByteArrayInputStream; +import java.io.InputStream; + +import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.ArgumentMatchers.argThat; import static org.mockito.ArgumentMatchers.eq; @@ -31,6 +36,10 @@ class TelegramChannelTest { @Mock private Agent agent; + @Mock + private SpeechToTextService speechToTextService; + + // ----------------------------------------------------------------------- // Ignored updates // ----------------------------------------------------------------------- @@ -47,13 +56,17 @@ void ignoresUpdatesWithoutMessage() { } @Test - void ignoresUpdatesWithoutText() { + void ignoresUpdatesWithoutTextOrVoice() { TelegramChannel channel = channel("allowed_user"); Update update = mock(Update.class); Message message = mock(Message.class); + User user = mock(User.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); + when(message.getFrom()).thenReturn(user); + when(user.getUserName()).thenReturn("allowed_user"); when(message.hasText()).thenReturn(false); + when(message.hasVoice()).thenReturn(false); channel.consume(update); @@ -67,7 +80,6 @@ void ignoresMessagesFromNullUsername() { Message message = mock(Message.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); - when(message.hasText()).thenReturn(true); when(message.getFrom()).thenReturn(null); channel.consume(update); @@ -176,6 +188,39 @@ void doesNotSetMessageThreadIdWhenAbsent() throws TelegramApiException { msg.getMessageThreadId() == null)); } + // ----------------------------------------------------------------------- + // Voice message handling + // ----------------------------------------------------------------------- + + @Test + void voiceMessageIsTranscribedAndRoutedToAgent() throws Exception { + TelegramVoiceDownloader downloader = mock(TelegramVoiceDownloader.class); + TelegramChannel channel = channelWithVoice("allowed_user", downloader); + when(downloader.download(any(Message.class))).thenReturn(new ByteArrayInputStream(new byte[0])); + when(speechToTextService.transcribe(any(InputStream.class))).thenReturn("hello from voice"); + when(agent.respondTo(anyString(), anyString())).thenReturn("voice reply"); + + channel.consume(voiceUpdateFrom("allowed_user", "file-id-123", 42L, null)); + + verify(speechToTextService).transcribe(any(InputStream.class)); + verify(agent).respondTo(eq("telegram-42"), eq("hello from voice")); + verify(telegramClient).execute(argThat((SendMessage msg) -> + "42".equals(msg.getChatId()) && "voice reply".equals(msg.getText()))); + } + + @Test + void voiceMessageUsesConversationIdWithThreadId() throws Exception { + TelegramVoiceDownloader downloader = mock(TelegramVoiceDownloader.class); + TelegramChannel channel = channelWithVoice("allowed_user", downloader); + when(downloader.download(any(Message.class))).thenReturn(new ByteArrayInputStream(new byte[0])); + when(speechToTextService.transcribe(any(InputStream.class))).thenReturn("voice text"); + when(agent.respondTo(anyString(), anyString())).thenReturn("reply"); + + channel.consume(voiceUpdateFrom("allowed_user", "file-id-123", 42L, 567)); + + verify(agent).respondTo(eq("telegram-42-567"), eq("voice text")); + } + // ----------------------------------------------------------------------- // sendMessage fallback // ----------------------------------------------------------------------- @@ -195,7 +240,11 @@ void sendMessageDoesNothingWhenNoChatIdKnown() { // ----------------------------------------------------------------------- private TelegramChannel channel(String allowedUsername) { - return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry()); + return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry(), speechToTextService); + } + + private TelegramChannel channelWithVoice(String allowedUsername, TelegramVoiceDownloader voiceDownloader) { + return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry(), speechToTextService, voiceDownloader); } private Update updateFromUnknownUser(String username) { @@ -204,7 +253,6 @@ private Update updateFromUnknownUser(String username) { User user = mock(User.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); - when(message.hasText()).thenReturn(true); when(message.getFrom()).thenReturn(user); when(user.getUserName()).thenReturn(username); return update; @@ -226,4 +274,21 @@ private Update updateFrom(String username, String text, long chatId, Integer mes return update; } + + private Update voiceUpdateFrom(String username, String fileId, long chatId, Integer messageThreadId) { + Update update = mock(Update.class); + Message message = mock(Message.class); + User user = mock(User.class); + + when(update.hasMessage()).thenReturn(true); + when(update.getMessage()).thenReturn(message); + when(message.hasText()).thenReturn(false); + when(message.hasVoice()).thenReturn(true); + when(message.getChatId()).thenReturn(chatId); + when(message.getMessageThreadId()).thenReturn(messageThreadId); + when(message.getFrom()).thenReturn(user); + when(user.getUserName()).thenReturn(username); + + return update; + } }