feat: audio speech streaming (#56)

iceAndFireisFailed · yuhongxiao · web-flow · commit 3120753df39b · 2025-09-29T17:08:18.000+08:00
Co-authored-by: yuhongxiao <hongxiao.yu@aminer.cn>
diff --git a/core/src/main/java/ai/z/openapi/api/audio/AudioApi.java b/core/src/main/java/ai/z/openapi/api/audio/AudioApi.java
@@ -39,6 +39,20 @@ public interface AudioApi {
 	@POST("audio/speech")
 	Single<ResponseBody> audioSpeech(@Body AudioSpeechRequest request);
 
+	/**
+	 * Streaming variant of the text-to-speech call. Posts the request to the same
+	 * {@code audio/speech} path as {@code audioSpeech}, but returns a Retrofit
+	 * {@code Call} instead of an RxJava {@code Single}. The {@code @Streaming} marker
+	 * asks Retrofit to hand over the response body as a stream rather than buffer it.
+	 * @param request TTS parameters including text, voice selection, emotion, speed,
+	 * tone, and output format
+	 * @return a call whose response body carries the generated audio as it is
+	 * streamed back
+	 */
+	@Streaming
+	@POST("audio/speech")
+	Call<ResponseBody> audioSpeechStreaming(@Body AudioSpeechRequest request);
+
 	/**
 	 * Voice cloning and customization using advanced neural models Creates custom voice
 	 * models from provided audio samples with high fidelity Enables personalized speech
diff --git a/core/src/main/java/ai/z/openapi/service/audio/AudioService.java b/core/src/main/java/ai/z/openapi/service/audio/AudioService.java
@@ -12,6 +12,13 @@ public interface AudioService {
 	 */
 	AudioSpeechResponse createSpeech(AudioSpeechRequest request);
 
+	/**
+	 * Creates speech from text using text-to-speech, delivered as a stream.
+	 * @param request the speech generation request
+	 * @return AudioSpeechStreamingResponse whose flowable emits the streamed chunks
+	 */
+	AudioSpeechStreamingResponse createStreamingSpeechStreaming(AudioSpeechRequest request);
+
 	/**
 	 * Creates customized speech with specific voice characteristics.
 	 * @param request the speech customization request
diff --git a/core/src/main/java/ai/z/openapi/service/audio/AudioServiceImpl.java b/core/src/main/java/ai/z/openapi/service/audio/AudioServiceImpl.java
@@ -6,6 +6,7 @@
 import ai.z.openapi.utils.FlowableRequestSupplier;
 import ai.z.openapi.utils.RequestSupplier;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
 import io.reactivex.rxjava3.core.Single;
 import lombok.extern.slf4j.Slf4j;
 import okhttp3.MediaType;
@@ -57,6 +58,13 @@ public AudioSpeechResponse createSpeech(AudioSpeechRequest request) {
 		return this.zAiClient.executeRequest(request, supplier, AudioSpeechResponse.class);
 	}
 
+	@Override
+	public AudioSpeechStreamingResponse createStreamingSpeechStreaming(AudioSpeechRequest request) {
+		validateSpeechParams(request);
+		FlowableRequestSupplier<AudioSpeechRequest, retrofit2.Call<ResponseBody>> streamingCall = audioApi::audioSpeechStreaming;
+		return zAiClient.streamRequest(request, streamingCall, AudioSpeechStreamingResponse.class, ObjectNode.class);
+	}
+
 	@Override
 	public AudioCustomizationResponse createCustomSpeech(AudioCustomizationRequest request) {
 		validateCustomSpeechParams(request);
diff --git a/core/src/main/java/ai/z/openapi/service/audio/AudioSpeechStreamingResponse.java b/core/src/main/java/ai/z/openapi/service/audio/AudioSpeechStreamingResponse.java
@@ -0,0 +1,31 @@
+package ai.z.openapi.service.audio;
+
+import ai.z.openapi.core.model.FlowableClientResponse;
+import ai.z.openapi.service.model.ChatError;
+
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import io.reactivex.rxjava3.core.Flowable;
+import lombok.Data;
+
+/**
+ * Response returned by the streaming text-to-speech call. Streamed chunks are exposed
+ * as {@code ObjectNode} items through the {@code flowable} property. The other fields
+ * ({@code code}, {@code msg}, {@code success}, {@code data}, {@code error}) carry the
+ * response envelope; their exact semantics are defined by the client that fills them.
+ */
+@Data
+public class AudioSpeechStreamingResponse implements FlowableClientResponse<ObjectNode> {
+
+	private int code;
+
+	private String msg;
+
+	private boolean success;
+
+	private ObjectNode data;
+
+	private ChatError error;
+
+	private Flowable<ObjectNode> flowable;
+
+}
diff --git a/core/src/test/java/ai/z/openapi/service/audio/AudioServiceTest.java b/core/src/test/java/ai/z/openapi/service/audio/AudioServiceTest.java
@@ -82,6 +82,29 @@ void shouldGenerateSpeechFromTextSuccessfully() throws JsonProcessingException {
 		logger.info("Text-to-speech response: {}", mapper.writeValueAsString(response));
 	}
 
+	@Test
+	@DisplayName("Should generate speech streaming from text successfully")
+	@EnabledIfEnvironmentVariable(named = "ZAI_API_KEY", matches = "^[^.]+\\.[^.]+$")
+	void testAudioSpeechStreaming() {
+		// Request asks for base64 encoding, wav output and has the stream flag set
+		AudioSpeechRequest ttsRequest = AudioSpeechRequest.builder()
+			.model(Constants.ModelTTS)
+			.encodeFormat("base64")
+			.input("Hello, this is a test for text-to-speech functionality.")
+			.voice("female")
+			.speed(1.0f)
+			.volume(1.0f)
+			.stream(Boolean.TRUE)
+			.responseFormat("wav")
+			.requestId(String.format(REQUEST_ID_TEMPLATE, System.currentTimeMillis()))
+			.build();
+		AudioSpeechStreamingResponse streamed = audioService.createStreamingSpeechStreaming(ttsRequest);
+		// Log each emitted chunk and block until the stream reaches a terminal event
+		streamed.getFlowable()
+			.doOnNext(chunk -> logger.info("speechPro: {}", chunk.toString()))
+			.blockingSubscribe();
+	}
+
 	@Test
 	@DisplayName("Should generate custom speech with voice cloning successfully")
 	@EnabledIfEnvironmentVariable(named = "ZAI_API_KEY", matches = "^[^.]+\\.[^.]+$")