Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,4 @@ workspace/skills/*
workspace/tasks/*
workspace/app.*
*.private*
workspace/conversations/*
16 changes: 16 additions & 0 deletions base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package ai.javaclaw.speech;

import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;

import java.io.InputStream;

@Service
@ConditionalOnProperty(name = "speech.provider", havingValue = "mock", matchIfMissing = true)
public class MockSpeechToTextService implements SpeechToTextService {

@Override
public String transcribe(InputStream audioStream) {
return "[voice message]";
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package ai.javaclaw.speech;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.core.io.InputStreamResource;
import org.springframework.http.MediaType;
import org.springframework.http.client.MultipartBodyBuilder;
import org.springframework.stereotype.Service;
import org.springframework.web.client.RestClient;

import java.io.InputStream;

@Service
@ConditionalOnProperty(name = "speech.provider", havingValue = "openai")
public class OpenAiSpeechToTextService implements SpeechToTextService {

private static final Logger log = LoggerFactory.getLogger(OpenAiSpeechToTextService.class);

private final RestClient restClient;
private final String model;

public OpenAiSpeechToTextService(
@Value("${spring.ai.openai.api-key}") String apiKey,
@Value("${speech.openai.model:whisper-1}") String model,
@Value("${speech.openai.base-url:https://api.openai.com/v1}") String baseUrl) {
this.model = model;
this.restClient = RestClient.builder()
.baseUrl(baseUrl)
.defaultHeader("Authorization", "Bearer " + apiKey)
.build();
}

@Override
public String transcribe(InputStream audioStream) {
log.info("Transcribing audio via OpenAI Whisper (model: {})", model);

MultipartBodyBuilder builder = new MultipartBodyBuilder();
builder.part("file", new InputStreamResource(audioStream))
.filename("voice.ogg")
.contentType(MediaType.parseMediaType("audio/ogg"));
builder.part("model", model);

TranscriptionResponse response = restClient.post()
.uri("/audio/transcriptions")
.contentType(MediaType.MULTIPART_FORM_DATA)
.body(builder.build())
.retrieve()
.body(TranscriptionResponse.class);

if (response == null || response.text() == null || response.text().isBlank()) {
throw new SpeechToTextException("OpenAI returned empty transcription");
}

log.info("OpenAI transcription completed successfully");
return response.text().trim();
}

private record TranscriptionResponse(String text) {}
}
12 changes: 12 additions & 0 deletions base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package ai.javaclaw.speech;

public class SpeechToTextException extends RuntimeException {

public SpeechToTextException(String message) {
super(message);
}

public SpeechToTextException(String message, Throwable cause) {
super(message, cause);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package ai.javaclaw.speech;

import java.io.InputStream;

public interface SpeechToTextService {

String transcribe(InputStream audioStream);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package ai.javaclaw.speech;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.concurrent.TimeUnit;

@Service
@ConditionalOnProperty(name = "speech.provider", havingValue = "whisper-cpp")
public class WhisperCppSpeechToTextService implements SpeechToTextService {

private static final Logger log = LoggerFactory.getLogger(WhisperCppSpeechToTextService.class);
private static final long TIMEOUT_SECONDS = 60;

private final String modelPath;

public WhisperCppSpeechToTextService(
@Value("${speech.whisper-cpp.model-path}") String modelPath) {
this.modelPath = modelPath;
}

@Override
public String transcribe(InputStream audioStream) {
Path tempOgg = null;
Path tempWav = null;
Path tempOutput = null;
try {
// Save the OGG stream to a temp file
tempOgg = Files.createTempFile("javaclaw-voice-", ".ogg");
Files.copy(audioStream, tempOgg, StandardCopyOption.REPLACE_EXISTING);

// Convert OGG/Opus to WAV (16kHz mono) — whisper-cli only accepts WAV
tempWav = Files.createTempFile("javaclaw-voice-", ".wav");
convertToWav(tempOgg, tempWav);

// Prepare output base path for whisper-cli
tempOutput = Files.createTempFile("javaclaw-whisper-", "");
String outputBase = tempOutput.toString();
Files.deleteIfExists(tempOutput);

log.info("Transcribing audio via whisper-cpp (model: {})", modelPath);

ProcessBuilder pb = new ProcessBuilder(
"whisper-cli",
"-m", modelPath,
"-otxt",
"-of", outputBase,
"-np",
"-nt",
"-f", tempWav.toString()
);
pb.redirectErrorStream(true);

Process process = pb.start();
String processOutput = new String(process.getInputStream().readAllBytes());
boolean finished = process.waitFor(TIMEOUT_SECONDS, TimeUnit.SECONDS);
if (!finished) {
process.destroyForcibly();
throw new SpeechToTextException("whisper-cpp timed out after " + TIMEOUT_SECONDS + " seconds");
}

if (process.exitValue() != 0) {
log.error("whisper-cpp failed with output: {}", processOutput);
throw new SpeechToTextException("whisper-cpp exited with code " + process.exitValue() + ": " + processOutput);
}

// whisper-cli appends .txt to the output base path
Path resultFile = Path.of(outputBase + ".txt");
if (!Files.exists(resultFile)) {
log.error("whisper-cli output: {}", processOutput);
throw new SpeechToTextException("whisper-cpp did not produce output file: " + resultFile);
}

String transcription = Files.readString(resultFile).trim();
Files.deleteIfExists(resultFile);

if (transcription.isEmpty()) {
throw new SpeechToTextException("whisper-cpp returned empty transcription");
}

log.info("whisper-cpp transcription completed successfully");
return transcription;

} catch (IOException e) {
throw new SpeechToTextException("Failed to process audio file for whisper-cpp", e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new SpeechToTextException("whisper-cpp transcription interrupted", e);
} finally {
deleteSilently(tempOgg);
deleteSilently(tempWav);
deleteSilently(tempOutput);
}
}

private void convertToWav(Path input, Path output) throws IOException {
try {
Process process = new ProcessBuilder(
"ffmpeg", "-y",
"-i", input.toString(),
"-ar", "16000",
"-ac", "1",
"-c:a", "pcm_s16le",
output.toString()
).redirectErrorStream(true).start();

process.getInputStream().readAllBytes();
boolean finished = process.waitFor(30, TimeUnit.SECONDS);
if (!finished) {
process.destroyForcibly();
throw new SpeechToTextException("ffmpeg conversion timed out");
}
if (process.exitValue() != 0) {
throw new SpeechToTextException("ffmpeg conversion failed with exit code " + process.exitValue());
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new SpeechToTextException("ffmpeg conversion interrupted", e);
}
}

private static void deleteSilently(Path path) {
if (path != null) {
try {
Files.deleteIfExists(path);
} catch (IOException e) {
log.warn("Failed to delete temp file: {}", path, e);
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import ai.javaclaw.channels.Channel;
import ai.javaclaw.channels.ChannelMessageReceivedEvent;
import ai.javaclaw.channels.ChannelRegistry;
import ai.javaclaw.speech.SpeechToTextService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.telegram.telegrambots.client.okhttp.OkHttpTelegramClient;
Expand All @@ -16,6 +17,10 @@
import org.telegram.telegrambots.meta.exceptions.TelegramApiException;
import org.telegram.telegrambots.meta.generics.TelegramClient;

import java.io.IOException;
import java.io.InputStream;
import java.util.Optional;

import static java.util.Optional.ofNullable;

public class TelegramChannel implements Channel, SpringLongPollingBot, LongPollingSingleThreadUpdateConsumer {
Expand All @@ -26,19 +31,27 @@ public class TelegramChannel implements Channel, SpringLongPollingBot, LongPolli
private final TelegramClient telegramClient;
private final Agent agent;
private final ChannelRegistry channelRegistry;
private final SpeechToTextService speechToTextService;
private final TelegramVoiceDownloader voiceDownloader;
private Long chatId;
private Integer messageThreadId;

public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry) {
this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry);
public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) {
this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry, speechToTextService);
}

TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) {
this(botToken, allowedUsername, telegramClient, agent, channelRegistry, speechToTextService, new TelegramVoiceDownloader(telegramClient, botToken));
}

TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry) {
TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService, TelegramVoiceDownloader voiceDownloader) {
this.botToken = botToken;
this.allowedUsername = normalizeUsername(allowedUsername);
this.telegramClient = telegramClient;
this.agent = agent;
this.channelRegistry = channelRegistry;
this.speechToTextService = speechToTextService;
this.voiceDownloader = voiceDownloader;
channelRegistry.registerChannel(this);
log.info("Started Telegram integration");
}
Expand All @@ -55,7 +68,7 @@ public LongPollingUpdateConsumer getUpdatesConsumer() {

@Override
public void consume(Update update) {
if (!(update.hasMessage() && update.getMessage().hasText())) return;
if (!update.hasMessage()) return;

Message requestMessage = update.getMessage();
String userName = requestMessage.getFrom() == null ? null : requestMessage.getFrom().getUserName();
Expand All @@ -65,11 +78,13 @@ public void consume(Update update) {
return;
}

String messageText = requestMessage.getText();
Optional<String> messageText = resolveMessageText(requestMessage);
if (messageText.isEmpty()) return;

this.chatId = requestMessage.getChatId();
this.messageThreadId = requestMessage.getMessageThreadId();
channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText, chatId, messageThreadId));
String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText);
channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText.get(), chatId, messageThreadId));
String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText.get());
sendMessage(chatId, messageThreadId, response);
}

Expand All @@ -95,6 +110,28 @@ public void sendMessage(long chatId, Integer messageThreadId, String message) {
}
}

private Optional<String> resolveMessageText(Message message) {
if (message.hasText()) {
return Optional.of(message.getText());
}
if (message.hasVoice()) {
return transcribeVoice(message);
}
return Optional.empty();
}

private Optional<String> transcribeVoice(Message message) {
log.info("Voice message received, downloading audio");
try (InputStream voiceStream = voiceDownloader.download(message)) {
String transcribed = speechToTextService.transcribe(voiceStream);
log.info("Voice message transcribed successfully");
return Optional.of(transcribed);
} catch (IOException | TelegramApiException e) {
log.error("Failed to process voice message", e);
return Optional.empty();
}
}

private boolean isAllowedUser(String userName) {
String normalizedUserName = normalizeUsername(userName);
return normalizedUserName != null && normalizedUserName.equalsIgnoreCase(allowedUsername);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import ai.javaclaw.agent.Agent;
import ai.javaclaw.channels.ChannelRegistry;
import ai.javaclaw.speech.SpeechToTextService;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.AutoConfiguration;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
Expand All @@ -19,7 +20,8 @@ public class TelegramChannelAutoConfiguration {
public TelegramChannel telegramChannel(@Value("${agent.channels.telegram.token:null}") String botToken,
@Value("${agent.channels.telegram.username:null}") String allowedUsername,
Agent agent,
ChannelRegistry channelRegistry) {
return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry);
ChannelRegistry channelRegistry,
SpeechToTextService speechToTextService) {
return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry, speechToTextService);
}
}
Loading