Skip to content
6 changes: 3 additions & 3 deletions core/src/main/java/ai/z/openapi/core/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ private Constants() {
* fluent and readable text. Supports Chinese, English, and various Chinese dialects.
* Improved performance in noisy environments.
*/
public static final String ModelGLMASR = "glm-asr";
public static final String ModelGLMASR = "glm-asr-2512";

// =============================================================================
// Real-time Interaction Models
Expand Down Expand Up @@ -356,9 +356,9 @@ private Constants() {
public static final String ModelCharGLM3 = "charglm-3";

/**
* CogTTS model - Text-to-Speech synthesis model.
* GLM-TTS model - Text-to-Speech synthesis model.
*/
public static final String ModelTTS = "cogtts";
public static final String ModelTTS = "glm-tts";

/**
* Rerank model - Text reordering and relevance scoring.
Expand Down
10 changes: 10 additions & 0 deletions core/src/main/java/ai/z/openapi/service/audio/AudioService.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,19 @@ public interface AudioService {
* Creates speech from text using text-to-speech.
* @param request the speech generation request
* @return AudioSpeechStreamingResponse containing the generated speech streaming
* @deprecated This method will be removed in a future release; use
* {@link #createStreamingSpeech(AudioSpeechRequest)} instead.
*/
@Deprecated
AudioSpeechStreamingResponse createStreamingSpeechStreaming(AudioSpeechRequest request);

/**
* Creates speech from text using text-to-speech. This method accepts an audio speech
* request object and returns a streaming response object.
* @param request the speech generation request
* @return AudioSpeechStreamingResponse containing the generated speech streaming
*/
AudioSpeechStreamingResponse createStreamingSpeech(AudioSpeechRequest request);

/**
* Creates customized speech with specific voice characteristics.
* @param request the speech customization request
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
import ai.z.openapi.AbstractAiClient;
import ai.z.openapi.api.audio.AudioApi;
import ai.z.openapi.service.deserialize.MessageDeserializeFactory;
import ai.z.openapi.service.model.ModelData;
import ai.z.openapi.utils.FlowableRequestSupplier;
import ai.z.openapi.utils.RequestSupplier;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import io.reactivex.rxjava3.core.Single;
import okhttp3.MediaType;
import okhttp3.MultipartBody;
Expand Down Expand Up @@ -48,7 +48,8 @@ public AudioSpeechResponse createSpeech(AudioSpeechRequest request) {
RequestSupplier<AudioSpeechRequest, java.io.File> supplier = (params) -> {
try {
Single<ResponseBody> responseBody = audioApi.audioSpeech(params);
Path tempDirectory = Files.createTempFile("audio_speech" + UUID.randomUUID(), ".wav");
Path tempDirectory = Files.createTempFile("audio_speech" + UUID.randomUUID(),
"." + request.getResponseFormat());
java.io.File file = tempDirectory.toFile();
writeResponseBodyToFile(responseBody.blockingGet(), file);
return Single.just(file);
Expand All @@ -61,10 +62,18 @@ public AudioSpeechResponse createSpeech(AudioSpeechRequest request) {
}

/**
 * Creates speech from text using text-to-speech, returning a streaming response.
 *
 * @param request the speech generation request
 * @return AudioSpeechStreamingResponse containing the generated speech streaming
 * @deprecated Use {@link #createStreamingSpeech(AudioSpeechRequest)} instead; this method
 * only forwards to it.
 */
@Override
@Deprecated
public AudioSpeechStreamingResponse createStreamingSpeechStreaming(AudioSpeechRequest request) {
    // Forward to the replacement so validation and the streaming call live in one place
    // and the deprecated entry point cannot drift from it.
    return createStreamingSpeech(request);
}

/**
 * Validates the request and issues the streaming text-to-speech call.
 *
 * @param request the speech generation request
 * @return the streaming response returned by the client for this request
 */
@Override
public AudioSpeechStreamingResponse createStreamingSpeech(AudioSpeechRequest request) {
    validateSpeechParams(request);
    FlowableRequestSupplier<AudioSpeechRequest, retrofit2.Call<ResponseBody>> streamingCall =
            audioApi::audioSpeechStreaming;
    return this.zAiClient.streamRequest(
            request, streamingCall, AudioSpeechStreamingResponse.class, ModelData.class);
}

@Override
Expand Down Expand Up @@ -200,6 +209,9 @@ private void validateSpeechParams(AudioSpeechRequest request) {
if (request.getInput() == null || request.getInput().trim().isEmpty()) {
throw new IllegalArgumentException("request input cannot be null or empty");
}
if (request.getVoice() == null || request.getVoice().trim().isEmpty()) {
throw new IllegalArgumentException("request voice cannot be null or empty");
}
}

private void validateCustomSpeechParams(AudioCustomizationRequest request) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,30 @@
import ai.z.openapi.service.model.ChatError;
import java.io.File;

import ai.z.openapi.service.model.ModelData;
import com.fasterxml.jackson.databind.node.ObjectNode;
import io.reactivex.rxjava3.core.Flowable;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
public class AudioSpeechStreamingResponse implements FlowableClientResponse<ObjectNode> {
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class AudioSpeechStreamingResponse implements FlowableClientResponse<ModelData> {

private int code;

private String msg;

private boolean success;

private ObjectNode data;
private ModelData data;

private ChatError error;

private Flowable<ObjectNode> flowable;
private Flowable<ModelData> flowable;

}
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
package ai.z.openapi.service.audio;

import com.fasterxml.jackson.annotation.JsonProperty;

import ai.z.openapi.service.model.Choice;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import java.util.List;
import lombok.NoArgsConstructor;

@Data
Expand All @@ -15,9 +11,6 @@
@Builder
public final class AudioTranscriptionChunk {

@JsonProperty("choices")
private List<Choice> choices;

private Long created;

private String model;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import lombok.experimental.SuperBuilder;

import java.io.File;
import java.util.List;

@EqualsAndHashCode(callSuper = true)
@SuperBuilder
Expand All @@ -34,18 +35,28 @@ public class AudioTranscriptionRequest extends CommonRequest implements ClientRe

/**
* Audio file to be transcribed (Required) Supported audio file formats: .wav / .mp3
* Specification limits: file size ≤ 25 MB, audio duration ≤ 60 seconds
* Specification limits: file size ≤ 25 MB, audio duration ≤ 30 seconds
*/
private File file;

/**
* Sampling temperature, controls output randomness, must be positive (Optional)
* Range: [0.0, 1.0], default value is 0.95 Higher values make output more random and
* creative; lower values make output more stable or deterministic It's recommended to
* adjust either top_p or temperature parameter based on your use case, but not both
* simultaneously
* Base64 encoded audio file. Only one of file_base64 and file needs to be passed (if
* both are passed, file takes precedence)
*/
private Float temperature;
@JsonProperty("file_base64")
private String fileBase64;

/**
* In long text scenarios, previous transcription results can be provided as context.
* Recommended to be less than 8000 characters.
*/
private String prompt;

/**
* Hot word list to improve recognition rate of specific domain vocabulary. Format
* example: ["Person name", "Place name"], recommended not to exceed 100 items.
*/
private List<String> hotwords;

/**
* Unique identifier for each request (Optional) Passed by the client, must be unique.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,9 @@

import com.fasterxml.jackson.annotation.JsonProperty;

import ai.z.openapi.service.model.Segment;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import java.util.List;
import lombok.NoArgsConstructor;

@Data
Expand All @@ -26,6 +24,4 @@ public final class AudioTranscriptionResult {

private String text;

private List<Segment> segments;

}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package ai.z.openapi.service.model;

import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
Expand All @@ -18,6 +17,4 @@ public class ChatFunction {

private ChatFunctionParameters parameters;

private List<String> required;

}
Original file line number Diff line number Diff line change
Expand Up @@ -105,36 +105,6 @@ void testAudioSpeechStreaming() {
.blockingSubscribe();
}

/**
 * Sends a custom-speech request built from a reference voice sample and checks that the
 * response is successful and carries a non-empty audio file.
 */
@Test
@DisplayName("Should generate custom speech with voice cloning successfully")
@EnabledIfEnvironmentVariable(named = "ZAI_API_KEY", matches = "^[^.]+\\.[^.]+$")
void shouldGenerateCustomSpeechWithVoiceCloningSuccessfully() throws JsonProcessingException {
    // Arrange: reference voice sample and a timestamp-based request id
    File referenceVoice = new File("src/test/resources/asr.wav");
    String uniqueRequestId = String.format(REQUEST_ID_TEMPLATE, System.currentTimeMillis());

    AudioCustomizationRequest customizationRequest = AudioCustomizationRequest.builder()
        .requestId(uniqueRequestId)
        .model(Constants.ModelTTS)
        .responseFormat("wav")
        .voiceData(referenceVoice)
        .voiceText("Sample voice text for cloning")
        .input("This is a test for custom voice generation.")
        .build();

    // Act
    AudioCustomizationResponse response = audioService.createCustomSpeech(customizationRequest);

    // Assert: successful response carrying a non-empty file and no error
    assertNotNull(response, "Custom speech response should not be null");
    assertTrue(response.isSuccess(), "Custom speech response should be successful");
    assertNotNull(response.getData(), "Custom speech response data should not be null");
    assertTrue(response.getData().exists(), "Generated custom audio file should exist");
    assertTrue(response.getData().length() > 0, "Generated custom audio file should not be empty");
    assertNull(response.getError(), "Response error should be null");

    logger.info("Custom speech response: {}", mapper.writeValueAsString(response));
}

@Test
@DisplayName("Should transcribe audio with blocking")
@EnabledIfEnvironmentVariable(named = "ZAI_API_KEY", matches = "^[^.]+\\.[^.]+$")
Expand Down Expand Up @@ -192,12 +162,9 @@ void shouldTranscribeAudioWithStreaming() throws JsonProcessingException {
if (isFirst.getAndSet(false)) {
logger.info("Starting to receive stream transcription response:");
}
if (modelData.getChoices() != null && !modelData.getChoices().isEmpty()) {
Choice choice = modelData.getChoices().get(0);
if (choice.getDelta() != null && choice.getDelta().getContent() != null) {
logger.info("Received transcription content: {}", choice.getDelta().getContent());
messageCount.incrementAndGet();
}
if (modelData.getDelta() != null) {
logger.info("Received transcription content: {}", modelData.getDelta());
messageCount.incrementAndGet();
}
})
.doOnComplete(() -> logger.info("Stream transcription completed, received {} messages in total",
Expand Down
15 changes: 11 additions & 4 deletions samples/src/main/ai.z.openapi.samples/AudioSpeechExample.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,19 @@ public static void main(String[] args) {
// for Z.ai use the `ZaiClient`, for Zhipu AI use the ZhipuAiClient.builder().ofZHIPU().build()
ZaiClient client = ZaiClient.builder().ofZAI().build();

// Or set API Key via code
// ZaiClient client = ZaiClient.builder()
// .apiKey("your.api_key")
// .build();

// Create request
AudioSpeechRequest request = AudioSpeechRequest.builder()
.model(Constants.ModelTTS)
.input("Hello, this is a test for text-to-speech functionality.")
.voice("tongtong")
.build();
.model(Constants.ModelTTS)
.input("Hello, this is a test for text-to-speech functionality.")
.voice("tongtong")
.stream(false)
.responseFormat("pcm")
.build();

try {
// Execute request
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package ai.z.openapi.samples;

import ai.z.openapi.ZaiClient;
import ai.z.openapi.core.Constants;
import ai.z.openapi.service.audio.AudioSpeechRequest;
import ai.z.openapi.service.audio.AudioSpeechStreamingResponse;
import ai.z.openapi.service.model.Delta;
import ai.z.openapi.service.model.ModelData;

/**
 * Streaming Audio Speech Example
 * Demonstrates how to use ZaiClient for streaming text-to-speech conversion
 */
public class AudioSpeechStreamExample {

    /**
     * Builds a streaming text-to-speech request, sends it through {@code ZaiClient}, and
     * prints every received audio chunk until the stream terminates.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create client, recommended to set API Key via environment variable
        // export ZAI_API_KEY=your.api_key
        // for Z.ai use the `ZaiClient`, for Zhipu AI use the ZhipuAiClient.builder().ofZHIPU().build()
        ZaiClient client = ZaiClient.builder().ofZAI().build();

        // Create speech request with streaming enabled
        AudioSpeechRequest request = AudioSpeechRequest.builder()
            .model(Constants.ModelTTS)
            .input("Hello, how's the weather today")
            .voice("tongtong")
            .responseFormat("pcm")
            .stream(true) // Enable streaming response
            .build();

        try {
            // Execute streaming request
            AudioSpeechStreamingResponse response = client.audio().createStreamingSpeech(request);

            if (response.isSuccess() && response.getFlowable() != null) {
                System.out.println("Starting streaming TTS response...");

                // Block the calling thread until the stream completes or fails. A fixed
                // sleep would either cut a long stream short (the client is closed in the
                // finally block) or idle needlessly after a short one.
                response.getFlowable().blockingSubscribe(
                    data -> {
                        // Process each streaming response chunk
                        if (data.getChoices() != null && !data.getChoices().isEmpty()) {
                            // Get content of current chunk
                            Delta delta = data.getChoices().get(0).getDelta();

                            // Print current audio content (base64 encoded)
                            if (delta != null && delta.getContent() != null) {
                                System.out.println("Received audio chunk: " + delta.getContent());
                            }
                        }
                    },
                    error -> System.err.println("\nStream error: " + error.getMessage()),
                    // Process streaming response completion event
                    () -> System.out.println("\nStreaming TTS completed")
                );
            } else {
                System.err.println("Error: " + response.getMsg());
            }
        } catch (Exception e) {
            System.err.println("Exception occurred: " + e.getMessage());
            e.printStackTrace();
        } finally {
            // Release client resources only after the stream has finished
            client.close();
        }
    }
}
Loading