diff --git a/core/src/main/java/ai/z/openapi/core/Constants.java b/core/src/main/java/ai/z/openapi/core/Constants.java
index fa559ba..fb8dcf9 100644
--- a/core/src/main/java/ai/z/openapi/core/Constants.java
+++ b/core/src/main/java/ai/z/openapi/core/Constants.java
@@ -197,7 +197,7 @@ private Constants() {
 	 * fluent and readable text. Supports Chinese, English, and various Chinese dialects.
 	 * Improved performance in noisy environments.
 	 */
-	public static final String ModelGLMASR = "glm-asr";
+	public static final String ModelGLMASR = "glm-asr-2512";
 
 	// =============================================================================
 	// Real-time Interaction Models
@@ -356,9 +356,9 @@ private Constants() {
 	public static final String ModelCharGLM3 = "charglm-3";
 
 	/**
-	 * CogTTS model - Text-to-Speech synthesis model.
+	 * GLM-TTS model - Text-to-Speech synthesis model.
 	 */
-	public static final String ModelTTS = "cogtts";
+	public static final String ModelTTS = "glm-tts";
 
 	/**
 	 * Rerank model - Text reordering and relevance scoring.
diff --git a/core/src/main/java/ai/z/openapi/service/audio/AudioService.java b/core/src/main/java/ai/z/openapi/service/audio/AudioService.java
index e44b11f..c401d2c 100644
--- a/core/src/main/java/ai/z/openapi/service/audio/AudioService.java
+++ b/core/src/main/java/ai/z/openapi/service/audio/AudioService.java
@@ -16,9 +16,19 @@ public interface AudioService {
 	 * Creates speech from text using text-to-speech.
 	 * @param request the speech generation request
 	 * @return AudioSpeechStreamingResponse containing the generated speech streaming
+	 * @deprecated This method will be removed in a future release.
+	 * Please use {@link #createStreamingSpeech(AudioSpeechRequest)} instead.
 	 */
+	@Deprecated
 	AudioSpeechStreamingResponse createStreamingSpeechStreaming(AudioSpeechRequest request);
 
+	/**
+	 * Creates speech from text using text-to-speech. Accepts a speech request and returns a streaming response.
+	 * @param request the speech generation request
+	 * @return AudioSpeechStreamingResponse containing the generated speech stream
+	 */
+	AudioSpeechStreamingResponse createStreamingSpeech(AudioSpeechRequest request);
+
 	/**
 	 * Creates customized speech with specific voice characteristics.
 	 * @param request the speech customization request
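For existing callers, the deprecation is a one-line migration; a minimal sketch (assuming a configured ZaiClient and a built AudioSpeechRequest, as in the samples further down):

    // Before (deprecated):
    // AudioSpeechStreamingResponse response = client.audio().createStreamingSpeechStreaming(request);
    // After:
    AudioSpeechStreamingResponse response = client.audio().createStreamingSpeech(request);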
diff --git a/core/src/main/java/ai/z/openapi/service/audio/AudioServiceImpl.java b/core/src/main/java/ai/z/openapi/service/audio/AudioServiceImpl.java
index f0c0d1e..d974e4b 100644
--- a/core/src/main/java/ai/z/openapi/service/audio/AudioServiceImpl.java
+++ b/core/src/main/java/ai/z/openapi/service/audio/AudioServiceImpl.java
@@ -3,10 +3,10 @@
 import ai.z.openapi.AbstractAiClient;
 import ai.z.openapi.api.audio.AudioApi;
 import ai.z.openapi.service.deserialize.MessageDeserializeFactory;
+import ai.z.openapi.service.model.ModelData;
 import ai.z.openapi.utils.FlowableRequestSupplier;
 import ai.z.openapi.utils.RequestSupplier;
 import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.node.ObjectNode;
 import io.reactivex.rxjava3.core.Single;
 import okhttp3.MediaType;
 import okhttp3.MultipartBody;
@@ -48,7 +48,8 @@ public AudioSpeechResponse createSpeech(AudioSpeechRequest request) {
     RequestSupplier supplier = (params) -> {
         try {
             Single<ResponseBody> responseBody = audioApi.audioSpeech(params);
-            Path tempDirectory = Files.createTempFile("audio_speech" + UUID.randomUUID(), ".wav");
+            Path tempDirectory = Files.createTempFile("audio_speech" + UUID.randomUUID(),
+                    "." + request.getResponseFormat());
             java.io.File file = tempDirectory.toFile();
             writeResponseBodyToFile(responseBody.blockingGet(), file);
             return Single.just(file);
@@ -61,10 +62,18 @@ public AudioSpeechResponse createSpeech(AudioSpeechRequest request) {
 }
 
 @Override
+@Deprecated
 public AudioSpeechStreamingResponse createStreamingSpeechStreaming(AudioSpeechRequest request) {
     validateSpeechParams(request);
     FlowableRequestSupplier supplier = audioApi::audioSpeechStreaming;
-    return this.zAiClient.streamRequest(request, supplier, AudioSpeechStreamingResponse.class, ObjectNode.class);
+    return this.zAiClient.streamRequest(request, supplier, AudioSpeechStreamingResponse.class, ModelData.class);
+}
+
+@Override
+public AudioSpeechStreamingResponse createStreamingSpeech(AudioSpeechRequest request) {
+    validateSpeechParams(request);
+    FlowableRequestSupplier supplier = audioApi::audioSpeechStreaming;
+    return this.zAiClient.streamRequest(request, supplier, AudioSpeechStreamingResponse.class, ModelData.class);
 }
 
 @Override
@@ -200,6 +209,9 @@ private void validateSpeechParams(AudioSpeechRequest request) {
     if (request.getInput() == null || request.getInput().trim().isEmpty()) {
         throw new IllegalArgumentException("request input cannot be null or empty");
     }
+    if (request.getVoice() == null || request.getVoice().trim().isEmpty()) {
+        throw new IllegalArgumentException("request voice cannot be null or empty");
+    }
 }
 
 private void validateCustomSpeechParams(AudioCustomizationRequest request) {
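With the added check, voice is now validated client-side alongside input, so a malformed request fails fast before any network call; a quick sketch of the behavior:

    AudioSpeechRequest missingVoice = AudioSpeechRequest.builder()
        .model(Constants.ModelTTS)
        .input("Hello")
        .build();
    // Throws IllegalArgumentException("request voice cannot be null or empty")
    // from validateSpeechParams, before any HTTP request is made:
    client.audio().createStreamingSpeech(missingVoice);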
diff --git a/core/src/main/java/ai/z/openapi/service/audio/AudioSpeechStreamingResponse.java b/core/src/main/java/ai/z/openapi/service/audio/AudioSpeechStreamingResponse.java
index 7f5770b..ee3b04d 100644
--- a/core/src/main/java/ai/z/openapi/service/audio/AudioSpeechStreamingResponse.java
+++ b/core/src/main/java/ai/z/openapi/service/audio/AudioSpeechStreamingResponse.java
@@ -5,12 +5,19 @@
 import ai.z.openapi.service.model.ChatError;
 
 import java.io.File;
+import ai.z.openapi.service.model.ModelData;
 import com.fasterxml.jackson.databind.node.ObjectNode;
 import io.reactivex.rxjava3.core.Flowable;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
 import lombok.Data;
+import lombok.NoArgsConstructor;
 
 @Data
-public class AudioSpeechStreamingResponse implements FlowableClientResponse<ObjectNode> {
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+public class AudioSpeechStreamingResponse implements FlowableClientResponse<ModelData> {
 
     private int code;
@@ -18,10 +25,10 @@ public class AudioSpeechStreamingResponse implements FlowableClientResponse
 
-    private Flowable<ObjectNode> flowable;
+    private Flowable<ModelData> flowable;
 
 }
diff --git a/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionChunk.java b/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionChunk.java
index 1d2f8d8..e20d34b 100644
--- a/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionChunk.java
+++ b/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionChunk.java
@@ -1,12 +1,8 @@
 package ai.z.openapi.service.audio;
 
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import ai.z.openapi.service.model.Choice;
 import lombok.AllArgsConstructor;
 import lombok.Builder;
 import lombok.Data;
-import java.util.List;
 import lombok.NoArgsConstructor;
 
 @Data
@@ -15,9 +11,6 @@
 @Builder
 public final class AudioTranscriptionChunk {
 
-    @JsonProperty("choices")
-    private List<Choice> choices;
-
     private Long created;
 
     private String model;
diff --git a/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionRequest.java b/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionRequest.java
index 350f603..5b7c103 100644
--- a/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionRequest.java
+++ b/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionRequest.java
@@ -10,6 +10,7 @@
 import lombok.experimental.SuperBuilder;
 
 import java.io.File;
+import java.util.List;
 
 @EqualsAndHashCode(callSuper = true)
 @SuperBuilder
@@ -34,18 +35,28 @@ public class AudioTranscriptionRequest extends CommonRequest implements ClientRe
     /**
      * Audio file to be transcribed (Required) Supported audio file formats: .wav / .mp3
-     * Specification limits: file size ≤ 25 MB, audio duration ≤ 60 seconds
+     * Specification limits: file size ≤ 25 MB, audio duration ≤ 30 seconds
      */
     private File file;
 
     /**
-     * Sampling temperature, controls output randomness, must be positive (Optional)
-     * Range: [0.0, 1.0], default value is 0.95 Higher values make output more random and
-     * creative; lower values make output more stable or deterministic It's recommended to
-     * adjust either top_p or temperature parameter based on your use case, but not both
-     * simultaneously
+     * Base64-encoded audio file. Only one of file_base64 and file needs to be passed;
+     * if both are passed, file takes precedence.
      */
-    private Float temperature;
+    @JsonProperty("file_base64")
+    private String fileBase64;
+
+    /**
+     * In long-text scenarios, previous transcription results can be provided as context.
+     * Recommended to be less than 8,000 characters.
+     */
+    private String prompt;
+
+    /**
+     * Hot word list to improve recognition of domain-specific vocabulary. Format
+     * example: ["Person name", "Place name"]; recommended not to exceed 100 items.
+     */
+    private List<String> hotwords;
 
     /**
      * Unique identifier for each request (Optional) Passed by the client, must be unique.
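The new request fields combine naturally; a minimal sketch of a transcription request using file_base64, prompt, and hotwords (hypothetical values; the builder methods follow Lombok's naming for the fields above):

    package ai.z.openapi.samples;

    import ai.z.openapi.core.Constants;
    import ai.z.openapi.service.audio.AudioTranscriptionRequest;

    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.Arrays;
    import java.util.Base64;

    public class AudioTranscriptionBase64Sketch {
        public static void main(String[] args) throws Exception {
            // file_base64 is an alternative to file; if both are set, file wins.
            byte[] audioBytes = Files.readAllBytes(Paths.get("samples/src/main/resources/asr.wav"));
            AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
                .model(Constants.ModelGLMASR)
                .fileBase64(Base64.getEncoder().encodeToString(audioBytes))
                // Optional: earlier transcript text as context (< 8,000 characters recommended)
                .prompt("Previously transcribed text...")
                // Optional: domain vocabulary to boost recognition (<= 100 items recommended)
                .hotwords(Arrays.asList("GLM", "ZaiClient"))
                .build();
            // Submit via client.audio().createTranscription(request), as in the examples below.
        }
    }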
diff --git a/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionResult.java b/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionResult.java
index 6457ba9..e47822e 100644
--- a/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionResult.java
+++ b/core/src/main/java/ai/z/openapi/service/audio/AudioTranscriptionResult.java
@@ -2,11 +2,9 @@
 
 import com.fasterxml.jackson.annotation.JsonProperty;
 
-import ai.z.openapi.service.model.Segment;
 import lombok.AllArgsConstructor;
 import lombok.Builder;
 import lombok.Data;
-import java.util.List;
 import lombok.NoArgsConstructor;
 
 @Data
@@ -26,6 +24,4 @@ public final class AudioTranscriptionResult {
 
     private String text;
 
-    private List<Segment> segments;
-
 }
diff --git a/core/src/main/java/ai/z/openapi/service/model/ChatFunction.java b/core/src/main/java/ai/z/openapi/service/model/ChatFunction.java
index 1a3581f..8fcf2f9 100644
--- a/core/src/main/java/ai/z/openapi/service/model/ChatFunction.java
+++ b/core/src/main/java/ai/z/openapi/service/model/ChatFunction.java
@@ -1,6 +1,5 @@
 package ai.z.openapi.service.model;
 
-import java.util.List;
 import lombok.AllArgsConstructor;
 import lombok.Builder;
 import lombok.Data;
@@ -18,6 +17,4 @@ public class ChatFunction {
 
     private ChatFunctionParameters parameters;
 
-    private List<String> required;
-
 }
diff --git a/core/src/test/java/ai/z/openapi/service/audio/AudioServiceTest.java b/core/src/test/java/ai/z/openapi/service/audio/AudioServiceTest.java
index e7675ac..6764428 100644
--- a/core/src/test/java/ai/z/openapi/service/audio/AudioServiceTest.java
+++ b/core/src/test/java/ai/z/openapi/service/audio/AudioServiceTest.java
@@ -105,36 +105,6 @@ void testAudioSpeechStreaming() {
         .blockingSubscribe();
 }
 
-@Test
-@DisplayName("Should generate custom speech with voice cloning successfully")
-@EnabledIfEnvironmentVariable(named = "ZAI_API_KEY", matches = "^[^.]+\\.[^.]+$")
-void shouldGenerateCustomSpeechWithVoiceCloningSuccessfully() throws JsonProcessingException {
-    // Prepare test data
-    String requestId = String.format(REQUEST_ID_TEMPLATE, System.currentTimeMillis());
-    File voiceFile = new File("src/test/resources/asr.wav");
-
-    AudioCustomizationRequest request = AudioCustomizationRequest.builder()
-        .model(Constants.ModelTTS)
-        .input("This is a test for custom voice generation.")
-        .voiceData(voiceFile)
-        .voiceText("Sample voice text for cloning")
-        .responseFormat("wav")
-        .requestId(requestId)
-        .build();
-
-    // Execute test
-    AudioCustomizationResponse response = audioService.createCustomSpeech(request);
-
-    // Verify results
-    assertNotNull(response, "Custom speech response should not be null");
-    assertTrue(response.isSuccess(), "Custom speech response should be successful");
-    assertNotNull(response.getData(), "Custom speech response data should not be null");
-    assertTrue(response.getData().exists(), "Generated custom audio file should exist");
-    assertTrue(response.getData().length() > 0, "Generated custom audio file should not be empty");
-    assertNull(response.getError(), "Response error should be null");
-    logger.info("Custom speech response: {}", mapper.writeValueAsString(response));
-}
-
 @Test
 @DisplayName("Should transcribe audio with blocking")
 @EnabledIfEnvironmentVariable(named = "ZAI_API_KEY", matches = "^[^.]+\\.[^.]+$")
@@ -192,12 +162,9 @@ void shouldTranscribeAudioWithStreaming() throws JsonProcessingException {
         if (isFirst.getAndSet(false)) {
             logger.info("Starting to receive stream transcription response:");
         }
-        if (modelData.getChoices() != null && !modelData.getChoices().isEmpty()) {
-            Choice choice = modelData.getChoices().get(0);
-            if (choice.getDelta() != null && choice.getDelta().getContent() != null) {
-                logger.info("Received transcription content: {}", choice.getDelta().getContent());
-                messageCount.incrementAndGet();
-            }
+        if (modelData.getDelta() != null) {
+            logger.info("Received transcription content: {}", modelData.getDelta());
+            messageCount.incrementAndGet();
         }
     })
     .doOnComplete(() -> logger.info("Stream transcription completed, received {} messages in total",
diff --git a/samples/src/main/ai.z.openapi.samples/AudioSpeechExample.java b/samples/src/main/ai.z.openapi.samples/AudioSpeechExample.java
index 4710b52..f50ddb5 100644
--- a/samples/src/main/ai.z.openapi.samples/AudioSpeechExample.java
+++ b/samples/src/main/ai.z.openapi.samples/AudioSpeechExample.java
@@ -18,12 +18,19 @@ public static void main(String[] args) {
     // for Z.ai use the `ZaiClient`, for Zhipu AI use the ZhipuAiClient.builder().ofZHIPU().build()
     ZaiClient client = ZaiClient.builder().ofZAI().build();
 
+    // Or set the API Key via code
+    // ZaiClient client = ZaiClient.builder()
+    //     .apiKey("your.api_key")
+    //     .build();
+
     // Create request
     AudioSpeechRequest request = AudioSpeechRequest.builder()
-        .model(Constants.ModelTTS)
-        .input("Hello, this is a test for text-to-speech functionality.")
-        .voice("tongtong")
-        .build();
+        .model(Constants.ModelTTS)
+        .input("Hello, this is a test for text-to-speech functionality.")
+        .voice("tongtong")
+        .stream(false)
+        .responseFormat("pcm")
+        .build();
 
     try {
         // Execute request
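The streaming TTS sample below only prints each chunk as it arrives; to keep the audio, the chunks can be decoded and concatenated. A small hypothetical helper (produces raw PCM bytes with no WAV header; assumes the delta content is base64-encoded audio, as the sample's comment indicates):

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.Base64;

    // Collects base64-encoded PCM chunks and writes the decoded audio to a file.
    public class PcmChunkCollector {
        private final ByteArrayOutputStream buffer = new ByteArrayOutputStream();

        public void onChunk(String base64Audio) throws IOException {
            buffer.write(Base64.getDecoder().decode(base64Audio));
        }

        public void save(String path) throws IOException {
            Files.write(Paths.get(path), buffer.toByteArray());
        }
    }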
diff --git a/samples/src/main/ai.z.openapi.samples/AudioSpeechStreamExample.java b/samples/src/main/ai.z.openapi.samples/AudioSpeechStreamExample.java
new file mode 100644
index 0000000..dc85314
--- /dev/null
+++ b/samples/src/main/ai.z.openapi.samples/AudioSpeechStreamExample.java
@@ -0,0 +1,68 @@
+package ai.z.openapi.samples;
+
+import ai.z.openapi.ZaiClient;
+import ai.z.openapi.core.Constants;
+import ai.z.openapi.service.audio.AudioSpeechRequest;
+import ai.z.openapi.service.audio.AudioSpeechStreamingResponse;
+import ai.z.openapi.service.model.Delta;
+import ai.z.openapi.service.model.ModelData;
+
+/**
+ * Streaming Audio Speech Example
+ * Demonstrates how to use ZaiClient for streaming text-to-speech conversion
+ */
+public class AudioSpeechStreamExample {
+
+    public static void main(String[] args) {
+        // Create client, recommended to set API Key via environment variable
+        // export ZAI_API_KEY=your.api_key
+        // for Z.ai use the `ZaiClient`, for Zhipu AI use the ZhipuAiClient.builder().ofZHIPU().build()
+        ZaiClient client = ZaiClient.builder().ofZAI().build();
+
+        // Create speech request with streaming enabled
+        AudioSpeechRequest request = AudioSpeechRequest.builder()
+            .model(Constants.ModelTTS)
+            .input("Hello, how's the weather today")
+            .voice("tongtong")
+            .responseFormat("pcm")
+            .stream(true) // Enable streaming response
+            .build();
+
+        try {
+            // Execute streaming request
+            AudioSpeechStreamingResponse response = client.audio().createStreamingSpeech(request);
+
+            if (response.isSuccess() && response.getFlowable() != null) {
+                System.out.println("Starting streaming TTS response...");
+
+                response.getFlowable().subscribe(
+                    data -> {
+                        // Process each streaming response chunk
+                        if (data.getChoices() != null && !data.getChoices().isEmpty()) {
+                            // Get content of current chunk
+                            Delta delta = data.getChoices().get(0).getDelta();
+
+                            // Print current audio content (base64 encoded)
+                            if (delta != null && delta.getContent() != null) {
+                                System.out.println("Received audio chunk: " + delta.getContent());
+                            }
+                        }
+                    },
+                    error -> System.err.println("\nStream error: " + error.getMessage()),
+                    // Process streaming response completion event
+                    () -> System.out.println("\nStreaming TTS completed")
+                );
+
+                // Wait for streaming to complete
+                Thread.sleep(10000);
+            } else {
+                System.err.println("Error: " + response.getMsg());
+            }
+        } catch (Exception e) {
+            System.err.println("Exception occurred: " + e.getMessage());
+            e.printStackTrace();
+        } finally {
+            client.close();
+        }
+    }
+}
\ No newline at end of file
diff --git a/samples/src/main/ai.z.openapi.samples/AudioTranscriptionsExample.java b/samples/src/main/ai.z.openapi.samples/AudioTranscriptionsExample.java
new file mode 100644
index 0000000..d9b4519
--- /dev/null
+++ b/samples/src/main/ai.z.openapi.samples/AudioTranscriptionsExample.java
@@ -0,0 +1,57 @@
+package ai.z.openapi.samples;
+
+import ai.z.openapi.ZaiClient;
+import ai.z.openapi.core.Constants;
+import ai.z.openapi.service.audio.AudioTranscriptionRequest;
+import ai.z.openapi.service.audio.AudioTranscriptionResponse;
+
+import java.io.File;
+
+/**
+ * Audio Transcriptions Example
+ * Demonstrates how to use ZaiClient for audio transcription (speech-to-text)
+ */
+public class AudioTranscriptionsExample {
+
+    public static void main(String[] args) {
+        // Create client, recommended to set API Key via environment variable
+        // export ZAI_API_KEY=your.api_key
+        // for Z.ai use the `ZaiClient`, for Zhipu AI use the ZhipuAiClient.builder().ofZHIPU().build()
+        ZaiClient client = ZaiClient.builder().ofZAI().build();
+
+        // Prepare audio file
+        // Supported formats: .wav, .mp3
+        // Limitations: file size ≤ 25 MB, duration ≤ 30 seconds
+        // The sample audio file is located at: samples/src/main/resources/asr.wav
+        File audioFile = new File("samples/src/main/resources/asr.wav");
+
+        // Create transcription request
+        AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
+            .model(Constants.ModelGLMASR)
+            .file(audioFile)
+            .stream(false)
+            .build();
+
+        try {
+            // Execute request
+            AudioTranscriptionResponse response = client.audio().createTranscription(request);
+
+            if (response.isSuccess()) {
+                System.out.println("Transcription Result:");
+                String text = response.getData().getText();
+                // Remove leading newline if present
+                if (text != null && text.startsWith("\n")) {
+                    text = text.substring(1);
+                }
+                System.out.println(text);
+            } else {
+                System.err.println("Error: " + response.getMsg());
+            }
+        } catch (Exception e) {
+            System.err.println("Exception occurred: " + e.getMessage());
+            e.printStackTrace();
+        } finally {
+            client.close();
+        }
+    }
+}
\ No newline at end of file
diff --git a/samples/src/main/ai.z.openapi.samples/AudioTranscriptionsStreamExample.java b/samples/src/main/ai.z.openapi.samples/AudioTranscriptionsStreamExample.java
new file mode 100644
index 0000000..2bdf118
--- /dev/null
+++ b/samples/src/main/ai.z.openapi.samples/AudioTranscriptionsStreamExample.java
@@ -0,0 +1,69 @@
+package ai.z.openapi.samples;
+
+import ai.z.openapi.ZaiClient;
+import ai.z.openapi.core.Constants;
+import ai.z.openapi.service.audio.AudioTranscriptionChunk;
+import ai.z.openapi.service.audio.AudioTranscriptionRequest;
+import ai.z.openapi.service.audio.AudioTranscriptionResponse;
+
+import java.io.File;
+
+/**
+ * Streaming Audio Transcription Example
+ * Demonstrates how to use ZaiClient for streaming audio transcription (speech-to-text)
+ */
+public class AudioTranscriptionsStreamExample {
+
+    public static void main(String[] args) {
+        // Create client, recommended to set API Key via environment variable
+        // export ZAI_API_KEY=your.api_key
+        // for Z.ai use the `ZaiClient`, for Zhipu AI use the ZhipuAiClient.builder().ofZHIPU().build()
+        ZaiClient client = ZaiClient.builder().ofZAI().build();
+
+        // Prepare audio file
+        // Supported formats: .wav, .mp3
+        // Limitations: file size ≤ 25 MB, duration ≤ 30 seconds
+        // The sample audio file is located at: samples/src/main/resources/asr.wav
+        File audioFile = new File("samples/src/main/resources/asr.wav");
+
+        // Create transcription request with streaming enabled
+        AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
+            .model(Constants.ModelGLMASR)
+            .file(audioFile)
+            .stream(true) // Enable streaming response
+            .build();
+
+        try {
+            // Execute streaming request
+            AudioTranscriptionResponse response = client.audio().createTranscription(request);
+
+            if (response.isSuccess() && response.getFlowable() != null) {
+                System.out.println("Starting streaming transcription...");
+
+                response.getFlowable().subscribe(
+                    chunk -> {
+                        // Process each streaming response chunk
+                        // delta is already a String containing the text content
+                        if (chunk.getDelta() != null) {
+                            String delta = chunk.getDelta();
+                            // Print each chunk on a new line to show streaming effect
+                            System.out.println(delta);
+                        }
+                    },
+                    error -> System.err.println("Stream error: " + error.getMessage()),
+                    () -> System.out.println("Streaming transcription completed")
+                );
+
+                // Wait for streaming to complete
+                Thread.sleep(10000);
+            } else {
+                System.err.println("Error: " + response.getMsg());
+            }
+        } catch (Exception e) {
+            System.err.println("Exception occurred: " + e.getMessage());
+            e.printStackTrace();
+        } finally {
+            client.close();
+        }
+    }
+}
\ No newline at end of file
diff --git a/samples/src/main/resources/asr.wav b/samples/src/main/resources/asr.wav
new file mode 100644
index 0000000..e29ac13
Binary files /dev/null and b/samples/src/main/resources/asr.wav differ
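Rather than sleeping for a fixed ten seconds, a caller that wants the complete transcript can block until the stream ends, as the test suite does with blockingSubscribe; a sketch reusing the response from the streaming transcription example above:

    StringBuilder transcript = new StringBuilder();
    response.getFlowable()
        .doOnNext(chunk -> {
            if (chunk.getDelta() != null) {
                transcript.append(chunk.getDelta());
            }
        })
        .doOnComplete(() -> System.out.println("Final transcript: " + transcript))
        .blockingSubscribe(); // returns only after the stream completes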