diff --git a/.gitignore b/.gitignore index 5b02ee9..66aa4c0 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,9 @@ buildNumber.properties .settings/ .project .classpath +.metals/ +.bsp/ +.bazelbsp/ # OS .DS_Store diff --git a/CHANGELOG.md b/CHANGELOG.md index 450ad94..26678be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,58 @@ # Version changelog +## Release v0.2.0 + +#### Native Rust Backend (JNI Migration) +- The SDK now uses JNI (Java Native Interface) to call the Zerobus Rust SDK instead of pure Java gRPC calls +- Native library is automatically loaded from the classpath or system library path + +#### New APIs + +**Offset-Based Ingestion API** - Preferred alternative to CompletableFuture-based API: +- `ZerobusStream.ingestRecordOffset(IngestableRecord)` - Returns offset immediately without future allocation +- `ZerobusStream.ingestRecordsOffset(Iterable)` - Batch ingestion returning `Optional` (empty for empty batch) +- `ZerobusStream.waitForOffset(long)` - Block until specific offset is acknowledged + +**JSON Record Support**: +- `IngestableRecord` interface - Unified interface for all record types +- `JsonRecord` class - JSON string wrapper implementing IngestableRecord +- `ProtoRecord` class - Protocol Buffer wrapper implementing IngestableRecord +- `RecordType` enum - Specifies stream serialization format (`PROTO` or `JSON`) +- `StreamConfigurationOptions.setRecordType(RecordType)` - Configure stream for JSON or Proto records +- Both record types work with `ingestRecord()` and `ingestRecordOffset()` methods + +**Batch Operations**: +- `ZerobusStream.ingestRecords(Iterable)` - Ingest multiple records with single acknowledgment +- `ZerobusStream.getUnackedBatches()` - Get unacknowledged records preserving batch grouping +- `EncodedBatch` class - Represents a batch of encoded records + +**Arrow Flight Support** (Experimental): +- `ZerobusArrowStream` class - High-performance columnar data ingestion +- `ArrowTableProperties` class - Table configuration with Arrow schema +- `ArrowStreamConfigurationOptions` class - Arrow stream configuration +- `ZerobusSdk.createArrowStream()` - Create Arrow Flight streams +- `ZerobusSdk.recreateArrowStream()` - Recover failed Arrow streams + +**New Callback Interface**: +- `AckCallback` interface with `onAck(long offsetId)` and `onError(long offsetId, String message)` +- More detailed error information than the deprecated Consumer-based callback + +### Deprecated + +- `ZerobusStream.ingestRecord(RecordType)` - Use `ingestRecordOffset()` instead. The offset-based API avoids CompletableFuture allocation overhead. +- `ZerobusStream.ingestRecord(IngestableRecord)` - Use `ingestRecordOffset()` instead. +- `ZerobusStream.ingestRecords(Iterable)` - Use `ingestRecordsOffset()` instead. +- `ZerobusStream.getState()` - Stream state is no longer exposed by the native backend. Returns `OPENED` or `CLOSED` only. +- `ZerobusStream.getUnackedRecords()` - Returns empty iterator. Use `getUnackedBatches()` or `getUnackedRecordsRaw()` instead. +- `StreamConfigurationOptions.Builder.setAckCallback(Consumer)` - Use `setAckCallback(AckCallback)` instead. +- `ZerobusSdk.setStubFactory()` - gRPC stub factory is no longer used with native backend. Throws `UnsupportedOperationException`. + +### Platform Support + +- Linux x86_64: Supported +- Windows x86_64: Supported +- macOS: Not yet supported (planned for future release) + ## Release v0.1.0 Initial release of the Databricks Zerobus Ingest SDK for Java. 
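A sketch of the v0.2.0 callback migration above (the deprecated Consumer-based form mirrors the v0.1.0 README example removed elsewhere in this diff; the response accessor name belongs to the old API only):

```java
// Before (deprecated): Consumer-based callback
StreamConfigurationOptions before = StreamConfigurationOptions.builder()
    .setAckCallback(response ->
        System.out.println("Acked up to: " + response.getDurabilityAckUpToOffset()))
    .build();

// After (v0.2.0): AckCallback reports both progress and per-offset errors
StreamConfigurationOptions after = StreamConfigurationOptions.builder()
    .setAckCallback(new AckCallback() {
        @Override
        public void onAck(long offsetId) {
            System.out.println("Acked up to: " + offsetId);
        }

        @Override
        public void onError(long offsetId, String message) {
            System.err.println("Error at offset " + offsetId + ": " + message);
        }
    })
    .build();
```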
diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 8403e31..4f1f4f0 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -1,38 +1,13 @@ # NEXT CHANGELOG -## Release v0.2.0 +## Release v0.3.0 ### New Features and Improvements -- Updated Protocol Buffers from 3.24.0 to 4.33.0 for improved performance and latest features -- Updated gRPC dependencies from 1.58.0 to 1.76.0 for enhanced stability and security -- Updated SLF4J logging framework from 1.7.36 to 2.0.17 for modern logging capabilities - ### Bug Fixes ### Documentation -- Updated README.md with new dependency versions -- Updated protoc compiler version recommendations -- Updated Logback version compatibility for SLF4J 2.0 - ### Internal Changes -- Updated maven-compiler-plugin from 3.11.0 to 3.14.1 -- All gRPC artifacts now consistently use version 1.76.0 - ### API Changes - -**Breaking Changes** - -- **Protocol Buffers 4.x Migration**: If you use the regular JAR (not the fat JAR), you must upgrade to protobuf-java 4.33.0 and regenerate any custom `.proto` files using protoc 4.x - - Download protoc 4.33.0 from: https://github.com/protocolbuffers/protobuf/releases/tag/v33.0 - - Regenerate proto files: `protoc --java_out=src/main/java src/main/proto/record.proto` - - Protobuf 4.x is binary-compatible over the wire with 3.x, but generated Java code may differ - -- **SLF4J 2.0 Migration**: If you use a logging implementation, you may need to update it: - - `slf4j-simple`: Use version 2.0.17 or later - - `logback-classic`: Use version 1.4.14 or later (for SLF4J 2.0 compatibility) - - `log4j-slf4j-impl`: Use version 2.20.0 or later - -**Note**: If you use the fat JAR (`jar-with-dependencies`), all dependencies are bundled and no action is required. diff --git a/README.md b/README.md index 13d767b..2f7372b 100644 --- a/README.md +++ b/README.md @@ -9,31 +9,64 @@ The Databricks Zerobus Ingest SDK for Java provides a high-performance client fo ## Table of Contents - [Features](#features) +- [Architecture](#architecture) - [Requirements](#requirements) - [Quick Start User Guide](#quick-start-user-guide) - [Prerequisites](#prerequisites) - [Building Your Application](#building-your-application) - - [Define Your Protocol Buffer Schema](#define-your-protocol-buffer-schema) - - [Generate Protocol Buffer Schema from Unity Catalog (Alternative)](#generate-protocol-buffer-schema-from-unity-catalog-alternative) - - [Write Your Client Code](#write-your-client-code) - - [Compile and Run](#compile-and-run) + - [Choose Your Serialization Format](#choose-your-serialization-format) - [Usage Examples](#usage-examples) - - [Blocking Ingestion](#blocking-ingestion) - - [Non-Blocking Ingestion](#non-blocking-ingestion) + - [Protocol Buffers Examples](#protocol-buffers-examples) + - [JSON Examples](#json-examples) + - [Arrow Flight Examples (Experimental)](#arrow-flight-examples-experimental) +- [API Styles](#api-styles) + - [Offset-Based API (Recommended)](#offset-based-api-recommended) + - [Future-Based API](#future-based-api) - [Configuration](#configuration) - [Logging](#logging) - [Error Handling](#error-handling) - [API Reference](#api-reference) - [Best Practices](#best-practices) +- [Related Projects](#related-projects) +- [Changelog](#changelog) ## Features -- **High-throughput ingestion**: Optimized for high-volume data ingestion +- **High-throughput ingestion**: Optimized for high-volume data ingestion via native Rust backend +- **Native performance**: JNI bindings to a high-performance Rust implementation - **Automatic recovery**: Built-in 
retry and recovery mechanisms - **Flexible configuration**: Customizable stream behavior and timeouts - **Protocol Buffers**: Strongly-typed schema using protobuf +- **JSON support**: Ingest JSON records without Protocol Buffer schemas +- **Arrow Flight**: (Experimental) High-performance columnar data ingestion +- **Offset-based API**: Low-overhead alternative to CompletableFuture for high throughput - **OAuth 2.0 authentication**: Secure authentication with client credentials +## Architecture + +The Java SDK uses JNI (Java Native Interface) to call a high-performance Rust implementation. This architecture provides: + +- **Lower latency**: Direct native calls avoid Java gRPC overhead +- **Reduced memory**: Offset-based API eliminates CompletableFuture allocation per record +- **Better throughput**: Optimized Rust async runtime handles network I/O efficiently +- **Consistent behavior**: Same Rust core as the Python and Rust SDKs + +``` +┌─────────────────────────────────────────────────────────┐ +│ Java Application │ +├─────────────────────────────────────────────────────────┤ +│ ZerobusSdk │ ZerobusStream │ ZerobusArrowStream │ +├─────────────────────────────────────────────────────────┤ +│ JNI Bridge │ +├─────────────────────────────────────────────────────────┤ +│ Native Rust SDK (libzerobus_jni) │ +│ ┌─────────────┐ ┌─────────────┐ ┌────────────┐ │ +│ │ Tokio │ │ gRPC/ │ │ Arrow │ │ +│ │ Runtime │ │ HTTP/2 │ │ Flight │ │ +│ └─────────────┘ └─────────────┘ └────────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + ## Requirements ### Runtime Requirements @@ -41,6 +74,19 @@ The Databricks Zerobus Ingest SDK for Java provides a high-performance client fo - **Java**: 8 or higher - [Download Java](https://adoptium.net/) - **Databricks workspace** with Zerobus access enabled +### Supported Platforms + +This SDK includes native libraries for the following platforms: + +| Platform | Architecture | Status | +|----------|--------------|--------| +| Linux | x86_64 | ✅ Supported | +| Windows | x86_64 | ✅ Supported | +| macOS | x86_64 | ❌ Not yet supported | +| macOS | aarch64 (Apple Silicon) | ❌ Not yet supported | + +> **Note:** macOS support is planned for a future release. If you need macOS support, please [file an issue](https://github.com/databricks/zerobus-sdk-java/issues). 
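On any other platform the native library cannot be loaded, which typically surfaces as an `UnsatisfiedLinkError` when the SDK is first used. If you deploy to mixed fleets, a fail-fast guard can produce a clearer message; the following is an illustrative sketch using only standard Java system properties, not SDK API:

```java
// Illustrative guard (not part of the SDK): fail fast on platforms
// without a bundled native library (see the table above).
String os = System.getProperty("os.name").toLowerCase();
String arch = System.getProperty("os.arch").toLowerCase();
boolean supported = (os.contains("linux") || os.contains("windows"))
        && (arch.equals("amd64") || arch.equals("x86_64"));
if (!supported) {
    throw new IllegalStateException(
        "Zerobus native backend is not available for " + os + "/" + arch);
}
```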
+ ### Dependencies **When using the fat JAR** (recommended for most users): @@ -48,10 +94,6 @@ The Databricks Zerobus Ingest SDK for Java provides a high-performance client fo **When using the regular JAR**: - [`protobuf-java` 4.33.0](https://mvnrepository.com/artifact/com.google.protobuf/protobuf-java/4.33.0) -- [`grpc-netty-shaded` 1.76.0](https://mvnrepository.com/artifact/io.grpc/grpc-netty-shaded/1.76.0) -- [`grpc-protobuf` 1.76.0](https://mvnrepository.com/artifact/io.grpc/grpc-protobuf/1.76.0) -- [`grpc-stub` 1.76.0](https://mvnrepository.com/artifact/io.grpc/grpc-stub/1.76.0) -- [`javax.annotation-api` 1.3.2](https://mvnrepository.com/artifact/javax.annotation/javax.annotation-api/1.3.2) - [`slf4j-api` 2.0.17](https://mvnrepository.com/artifact/org.slf4j/slf4j-api/2.0.17) - An SLF4J implementation such as [`slf4j-simple` 2.0.17](https://mvnrepository.com/artifact/org.slf4j/slf4j-simple/2.0.17) or [`logback-classic` 1.4.14](https://mvnrepository.com/artifact/ch.qos.logback/logback-classic/1.4.14) @@ -134,7 +176,7 @@ Add the SDK as a dependency in your `pom.xml`: com.databricks zerobus-ingest-sdk - 0.1.0 + 0.2.0 ``` @@ -143,7 +185,7 @@ Or with Gradle (`build.gradle`): ```groovy dependencies { - implementation 'com.databricks:zerobus-ingest-sdk:0.1.0' + implementation 'com.databricks:zerobus-ingest-sdk:0.2.0' } ``` @@ -156,7 +198,7 @@ dependencies { com.databricks zerobus-ingest-sdk - 0.1.0 + 0.2.0 @@ -165,21 +207,6 @@ dependencies { protobuf-java 4.33.0 - - io.grpc - grpc-netty-shaded - 1.76.0 - - - io.grpc - grpc-protobuf - 1.76.0 - - - io.grpc - grpc-stub - 1.76.0 - org.slf4j slf4j-api @@ -190,11 +217,6 @@ dependencies { slf4j-simple 2.0.17 - - javax.annotation - javax.annotation-api - 1.3.2 - ``` @@ -207,7 +229,7 @@ If you prefer the self-contained fat JAR with all dependencies included: com.databricks zerobus-ingest-sdk - 0.1.0 + 0.2.0 jar-with-dependencies @@ -217,7 +239,7 @@ Or with Gradle: ```groovy dependencies { - implementation 'com.databricks:zerobus-ingest-sdk:0.1.0:jar-with-dependencies' + implementation 'com.databricks:zerobus-ingest-sdk:0.2.0:jar-with-dependencies' } ``` @@ -235,11 +257,11 @@ mvn clean package This generates two JAR files in the `target/` directory: -- **Regular JAR**: `zerobus-ingest-sdk-0.1.0.jar` (155KB) +- **Regular JAR**: `zerobus-ingest-sdk-0.2.0.jar` (~12MB, includes native libraries) - Contains only the SDK classes - Requires all dependencies on the classpath -- **Fat JAR**: `zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar` (18MB) +- **Fat JAR**: `zerobus-ingest-sdk-0.2.0-jar-with-dependencies.jar` (~19MB, includes native libraries + all dependencies) - Contains SDK classes plus all dependencies bundled - Self-contained, easier to deploy @@ -283,7 +305,7 @@ Create `pom.xml`: com.databricks zerobus-ingest-sdk - 0.1.0 + 0.2.0 @@ -347,16 +369,16 @@ The proto generation tool requires the fat JAR (all dependencies included): ```bash # Download from Maven Central -wget https://repo1.maven.org/maven2/com/databricks/zerobus-ingest-sdk/0.1.0/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar +wget https://repo1.maven.org/maven2/com/databricks/zerobus-ingest-sdk/0.2.0/zerobus-ingest-sdk-0.2.0-jar-with-dependencies.jar # Or if you built from source, it's in target/ -# cp target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar . +# cp target/zerobus-ingest-sdk-0.2.0-jar-with-dependencies.jar . 
``` **Run the tool:** ```bash -java -jar zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar \ +java -jar zerobus-ingest-sdk-0.2.0-jar-with-dependencies.jar \ --uc-endpoint "https://dbc-a1b2c3d4-e5f6.cloud.databricks.com" \ --client-id "your-service-principal-application-id" \ --client-secret "your-service-principal-secret" \ @@ -540,77 +562,263 @@ Successfully ingested 100 records! ## Usage Examples -See the `examples/` directory for complete working examples: +The `examples/` directory contains complete working examples organized by data format: -- **BlockingIngestionExample.java** - Synchronous ingestion with progress tracking -- **NonBlockingIngestionExample.java** - High-throughput asynchronous ingestion +``` +examples/ +├── README.md # Overview and comparison +├── proto/ # Protocol Buffer examples +│ ├── README.md +│ ├── SingleRecordExample.java +│ └── BatchIngestionExample.java +├── json/ # JSON examples +│ ├── README.md +│ ├── SingleRecordExample.java +│ └── BatchIngestionExample.java +└── arrow/ # Arrow Flight examples (experimental) + ├── README.md + ├── SingleBatchExample.java + └── MultiBatchExample.java +``` + +### Protocol Buffers Examples + +Best for production systems with type safety and schema validation: + +```bash +# Single record ingestion +cd examples/proto +protoc --java_out=. air_quality.proto +javac -d . -cp "../../target/zerobus-ingest-sdk-*-jar-with-dependencies.jar:." *.java +java -cp "../../target/zerobus-ingest-sdk-*-jar-with-dependencies.jar:." \ + com.databricks.zerobus.examples.proto.SingleRecordExample + +# Batch ingestion +java -cp "../../target/zerobus-ingest-sdk-*-jar-with-dependencies.jar:." \ + com.databricks.zerobus.examples.proto.BatchIngestionExample +``` -### Blocking Ingestion +### JSON Examples -Ingest records synchronously, waiting for each record to be acknowledged: +Best for rapid prototyping and flexible schemas: + +```bash +cd examples/json +javac -d . -cp "../../target/zerobus-ingest-sdk-*-jar-with-dependencies.jar:../proto:." *.java +java -cp "../../target/zerobus-ingest-sdk-*-jar-with-dependencies.jar:../proto:." \ + com.databricks.zerobus.examples.json.SingleRecordExample +``` + +**Important:** JSON streams require `RecordType.JSON` in configuration: + +```java +StreamConfigurationOptions options = StreamConfigurationOptions.builder() + .setRecordType(RecordType.JSON) + .build(); +``` + +### Arrow Flight Examples (Experimental) + +Best for high-volume columnar data. Requires Apache Arrow libraries. + +```bash +cd examples/arrow +javac -d . -cp "../../target/zerobus-ingest-sdk-*-jar-with-dependencies.jar:." *.java +java -cp "../../target/zerobus-ingest-sdk-*-jar-with-dependencies.jar::." \ + com.databricks.zerobus.examples.arrow.SingleBatchExample +``` + +See [`examples/README.md`](examples/README.md) for detailed documentation. + +--- + +## API Styles + +The SDK provides two ingestion styles: + +| Style | Status | Best For | Overhead | +|-------|--------|----------|----------| +| **Offset-Based** | Recommended | All use cases | Minimal - no object allocation | +| **Future-Based** | Deprecated | Legacy code | CompletableFuture per record | + +### Offset-Based API (Recommended) + +Use the offset-based API for all new code. 
It avoids `CompletableFuture` allocation overhead: ```java ZerobusStream stream = sdk.createStream( - tableProperties, - clientId, - clientSecret + tableProperties, clientId, clientSecret, options ).join(); +try { + long lastOffset = -1; + + // Ingest records as fast as possible + for (int i = 0; i < 1000000; i++) { + AirQuality record = AirQuality.newBuilder() + .setDeviceName("sensor-" + (i % 100)) + .setTemp(20 + i % 15) + .setHumidity(50 + i % 40) + .build(); + + // Returns immediately after queuing (non-blocking) + lastOffset = stream.ingestRecordOffset(record); + } + + // Wait for all records to be acknowledged + stream.waitForOffset(lastOffset); +} finally { + stream.close(); +} +``` + +### Future-Based API (Deprecated) + +> **Deprecated:** Use the offset-based API instead for better performance. + +The future-based API is still available for backward compatibility but will be removed in a future release: + +```java +// DEPRECATED - use ingestRecordOffset() instead try { for (int i = 0; i < 1000; i++) { AirQuality record = AirQuality.newBuilder() .setDeviceName("sensor-" + i) .setTemp(20 + i % 15) - .setHumidity(50 + i % 40) .build(); - stream.ingestRecord(record).join(); // Wait for durability + stream.ingestRecord(record).join(); // Deprecated } } finally { stream.close(); } ``` -### Non-Blocking Ingestion +**Migration:** +```java +// Before (deprecated): +stream.ingestRecord(record).join(); +stream.ingestRecords(batch).join(); + +// After (recommended): +long offset = stream.ingestRecordOffset(ProtoRecord.of(record)); +stream.waitForOffset(offset); + +Optional batchOffset = stream.ingestRecordsOffset(batch); +batchOffset.ifPresent(stream::waitForOffset); +``` + +--- + +## Choose Your Serialization Format + +| Format | Best For | Pros | Cons | +|--------|----------|------|------| +| **Protocol Buffers** | Production systems | Type-safe, compact, fast | Requires schema compilation | +| **JSON** | Prototyping, flexible schemas | Human-readable, no compilation | Larger payload, slower | +| **Arrow** | High-volume columnar data | Maximum throughput | Extra dependencies, complex | -Ingest records asynchronously for maximum throughput: +### JSON Stream Configuration + +To ingest JSON records, configure the stream with `RecordType.JSON`: ```java StreamConfigurationOptions options = StreamConfigurationOptions.builder() - .setMaxInflightRecords(50000) - .setAckCallback(response -> - System.out.println("Acknowledged offset: " + - response.getDurabilityAckUpToOffset())) + .setRecordType(RecordType.JSON) // Required for JSON records + .setMaxInflightRecords(10000) .build(); ZerobusStream stream = sdk.createStream( + tableProperties, clientId, clientSecret, options +).join(); + +// Now you can ingest JSON records +JsonRecord record = JsonRecord.of("{\"device_name\": \"sensor-1\", \"temp\": 25}"); +long offset = stream.ingestRecordOffset(record); +stream.waitForOffset(offset); +``` + +### Arrow Flight (Experimental) + +For high-performance columnar data ingestion using Apache Arrow: + +```java +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; + +// Define the Arrow schema +Schema schema = new Schema(Arrays.asList( + Field.nullable("device_name", new 
ArrowType.Utf8()), + Field.nullable("temp", new ArrowType.Int(32, true)), + Field.nullable("humidity", new ArrowType.Int(64, true)) +)); + +// Create table properties +ArrowTableProperties tableProperties = new ArrowTableProperties( + "catalog.schema.air_quality", + schema +); + +// Create Arrow stream +ZerobusArrowStream stream = sdk.createArrowStream( tableProperties, clientId, - clientSecret, - options + clientSecret ).join(); -List> futures = new ArrayList<>(); +BufferAllocator allocator = new RootAllocator(); -try { - for (int i = 0; i < 100000; i++) { - AirQuality record = AirQuality.newBuilder() - .setDeviceName("sensor-" + (i % 10)) - .setTemp(20 + i % 15) - .setHumidity(50 + i % 40) - .build(); +try (VectorSchemaRoot batch = VectorSchemaRoot.create(schema, allocator)) { + VarCharVector deviceName = (VarCharVector) batch.getVector("device_name"); + IntVector temp = (IntVector) batch.getVector("temp"); + IntVector humidity = (IntVector) batch.getVector("humidity"); - futures.add(stream.ingestRecord(record)); + // Populate the batch + int batchSize = 1000; + deviceName.allocateNew(batchSize); + temp.allocateNew(batchSize); + humidity.allocateNew(batchSize); + + for (int i = 0; i < batchSize; i++) { + deviceName.setSafe(i, ("sensor-" + (i % 10)).getBytes()); + temp.setSafe(i, 20 + i % 15); + humidity.setSafe(i, 50 + i % 40); } - // Flush and wait for all records - stream.flush(); - CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); + batch.setRowCount(batchSize); + + // Ingest the batch + long offset = stream.ingestBatch(batch); + stream.waitForOffset(offset); + + System.out.println("Ingested " + batchSize + " rows via Arrow Flight!"); } finally { stream.close(); + allocator.close(); } ``` +**Note:** Arrow Flight support requires Apache Arrow Java libraries on the classpath: + +```xml + + org.apache.arrow + arrow-vector + 17.0.0 + + + org.apache.arrow + arrow-memory-netty + 17.0.0 + +``` + ## Configuration ### Stream Configuration Options @@ -624,8 +832,22 @@ try { | `recoveryRetries` | 3 | Maximum number of recovery attempts | | `flushTimeoutMs` | 300000 | Timeout for flush operations (ms) | | `serverLackOfAckTimeoutMs` | 60000 | Server acknowledgment timeout (ms) | +| `recordType` | `PROTO` | Record serialization format: `RecordType.PROTO` or `RecordType.JSON` | | `ackCallback` | None | Callback invoked on record acknowledgment | +### Arrow Stream Configuration Options + +| Option | Default | Description | +|--------|---------|-------------| +| `maxInflightBatches` | 1000 | Maximum number of unacknowledged batches | +| `recovery` | true | Enable automatic stream recovery | +| `recoveryTimeoutMs` | 15000 | Timeout for recovery operations (ms) | +| `recoveryBackoffMs` | 2000 | Delay between recovery attempts (ms) | +| `recoveryRetries` | 4 | Maximum number of recovery attempts | +| `flushTimeoutMs` | 300000 | Timeout for flush operations (ms) | +| `serverLackOfAckTimeoutMs` | 60000 | Server acknowledgment timeout (ms) | +| `connectionTimeoutMs` | 30000 | Arrow Flight connection timeout (ms) | + ## Logging The Databricks Zerobus Ingest SDK for Java uses the standard [SLF4J logging framework](https://www.slf4j.org/). The SDK only depends on `slf4j-api`, which means **you need to add an SLF4J implementation** to your classpath to see log output. @@ -805,12 +1027,25 @@ Recreates a failed stream, resending unacknowledged records. Returns a Completab Represents an active ingestion stream. 
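A compact lifecycle sketch tying the methods below together (assumes the generated `AirQuality` message plus the `tableProperties`, credentials, and `options` from the Quick Start):

```java
ZerobusStream stream = sdk.createStream(
    tableProperties, clientId, clientSecret, options
).join();
try {
    long lastOffset = -1;
    for (int i = 0; i < 1000; i++) {
        AirQuality record = AirQuality.newBuilder()
            .setDeviceName("sensor-" + i)
            .setTemp(20 + i % 15)
            .setHumidity(50 + i % 40)
            .build();
        lastOffset = stream.ingestRecordOffset(record); // queues, returns immediately
    }
    stream.waitForOffset(lastOffset); // acks are cumulative, so this covers all records
} finally {
    stream.close(); // flush and close gracefully
}
```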
-**Methods:** +**Offset-Based Methods (Recommended):** + +```java +long ingestRecordOffset(RecordType record) throws ZerobusException +long ingestRecordOffset(IngestableRecord record) throws ZerobusException +``` +Ingests a record and returns the offset immediately. Does NOT wait for acknowledgment. ```java -CompletableFuture ingestRecord(RecordType record) throws ZerobusException +Optional ingestRecordsOffset(Iterable records) throws ZerobusException ``` -Ingests a single record into the stream. Returns a future that completes when the record is durably written to storage. +Ingests multiple records and returns the batch offset, or empty if the batch was empty. Does NOT wait for acknowledgment. + +```java +void waitForOffset(long offset) throws ZerobusException +``` +Blocks until the specified offset is acknowledged by the server. + +**Stream Lifecycle:** ```java void flush() throws ZerobusException @@ -823,24 +1058,35 @@ void close() throws ZerobusException Flushes and closes the stream gracefully. Always call in a `finally` block. ```java -StreamState getState() +boolean isClosed() ``` -Returns the current stream state (`UNINITIALIZED`, `OPENED`, `FLUSHING`, `RECOVERING`, `CLOSED`, `FAILED`). +Returns true if the stream is closed. ```java -String getStreamId() +@Deprecated StreamState getState() ``` -Returns the unique stream ID assigned by the server. +**Deprecated.** Returns `OPENED` or `CLOSED` only. Use `isClosed()` instead. + +**Recovery Methods:** ```java -TableProperties getTableProperties() +List getUnackedBatches() throws ZerobusException +``` +Returns unacknowledged records grouped by batch. Use with `recreateStream()`. + +```java +List getUnackedRecordsRaw() throws ZerobusException ``` -Returns the table properties for this stream. +Returns unacknowledged records as raw byte arrays. + +**Accessors:** ```java +TableProperties getTableProperties() StreamConfigurationOptions getOptions() +String getClientId() +String getClientSecret() ``` -Returns the stream configuration options. --- @@ -989,12 +1235,155 @@ NonRetriableException(String message) NonRetriableException(String message, Throwable cause) ``` +--- + +### IngestableRecord (Interface) + +Unified interface for all record types that can be ingested. + +```java +byte[] toEncodedBytes() +``` +Returns the encoded record bytes (protobuf or UTF-8 JSON). + +```java +boolean isJson() +``` +Returns true if this is a JSON record, false for Protocol Buffer. + +--- + +### ProtoRecord\ + +Wrapper for Protocol Buffer messages implementing `IngestableRecord`. + +**Constructor:** +```java +ProtoRecord(T message) +``` + +**Factory Method:** +```java +static ProtoRecord of(T message) +``` + +--- + +### JsonRecord + +Wrapper for JSON strings implementing `IngestableRecord`. + +**Constructor:** +```java +JsonRecord(String json) +``` + +**Factory Methods:** +```java +static JsonRecord of(String json) +static JsonRecord fromObject(T object, JsonSerializer serializer) +``` + +--- + +### AckCallback (Interface) + +Callback interface for acknowledgment notifications. + +```java +void onAck(long offsetId) +``` +Called when records up to `offsetId` are acknowledged. + +```java +void onError(long offsetId, String errorMessage) +``` +Called when an error occurs for records at or after `offsetId`. + +--- + +### ZerobusArrowStream + +Arrow Flight stream for columnar data ingestion. + +**Methods:** + +```java +long ingestBatch(Object batch) throws ZerobusException +``` +Ingests a `VectorSchemaRoot` and returns the offset. 
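The batch parameter is declared as `Object` — presumably so the SDK jar itself carries no compile-time Arrow dependency (`getSchema()` and `ArrowTableProperties` are `Object`-typed for the same reason). At runtime you pass an `org.apache.arrow.vector.VectorSchemaRoot`:

```java
// batch is an org.apache.arrow.vector.VectorSchemaRoot
long offset = stream.ingestBatch(batch);
stream.waitForOffset(offset); // block until the batch is acknowledged
```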
+ +```java +long ingestBatchRaw(byte[] batchData) throws ZerobusException +``` +Ingests pre-serialized Arrow IPC data. + +```java +void waitForOffset(long offset) throws ZerobusException +void flush() throws ZerobusException +void close() throws ZerobusException +boolean isClosed() +String getTableName() +Object getSchema() +ArrowTableProperties getTableProperties() +ArrowStreamConfigurationOptions getOptions() +List getUnackedBatchesRaw() throws ZerobusException +``` + +--- + +### ArrowTableProperties + +Table properties for Arrow streams. + +**Constructor:** +```java +ArrowTableProperties(String tableName, Object schema) +``` +- `tableName` - Fully qualified table name +- `schema` - Apache Arrow `Schema` object + +--- + +### ArrowStreamConfigurationOptions + +Configuration for Arrow streams. Similar to `StreamConfigurationOptions` with Arrow-specific settings. + +**Builder Methods:** +```java +setMaxInflightBatches(int) +setRecovery(boolean) +setRecoveryTimeoutMs(long) +setRecoveryBackoffMs(long) +setRecoveryRetries(int) +setServerLackOfAckTimeoutMs(long) +setFlushTimeoutMs(long) +setConnectionTimeoutMs(long) +build() +``` + ## Best Practices 1. **Reuse SDK instances**: Create one `ZerobusSdk` instance per application -2. **Stream lifecycle**: Always close streams in a `finally` block -3. **Batch size**: Adjust `maxInflightRecords` based on your throughput requirements -4. **Error handling**: Implement proper retry logic for retriable errors -5. **Monitoring**: Use `ackCallback` to track ingestion progress -6. **Token refresh**: Tokens are automatically refreshed on stream creation and recovery -7. **Proto generation**: Use the built-in `GenerateProto` tool to automatically generate proto files from your table schemas +2. **Stream lifecycle**: Always close streams in a `finally` block or use try-with-resources +3. **Use offset-based API for high throughput**: `ingestRecordOffset()` avoids `CompletableFuture` overhead +4. **Batch records when possible**: Use `ingestRecordsOffset()` for multiple records +5. **Configure `maxInflightRecords`**: Adjust based on your throughput and memory requirements +6. **Implement proper error handling**: Distinguish between retriable and non-retriable errors +7. **Use `AckCallback` for monitoring**: Track acknowledgment progress without blocking +8. **Proto generation**: Use the built-in `GenerateProto` tool to generate proto files from table schemas +9. **Choose the right API**: + - `ingestRecord()` → Simple use cases, moderate throughput + - `ingestRecordOffset()` + `waitForOffset()` → High throughput, fine-grained control + - Arrow Flight → Columnar data, very high throughput +10. **Recovery pattern**: Use `getUnackedBatches()` and `recreateStream()` for failure recovery + +## Related Projects + +- [Zerobus SDK for Rust](https://github.com/databricks/zerobus-sdk-rs) - The core Rust SDK (also used as the native backend for this Java SDK) +- [Zerobus SDK for Python](https://github.com/databricks/zerobus-sdk-py) - Python bindings for the Rust SDK +- [Zerobus SDK for TypeScript](https://github.com/databricks/zerobus-sdk-ts) - TypeScript/Node.js bindings for the Rust SDK + +## Changelog + +See [CHANGELOG.md](CHANGELOG.md) for a detailed list of changes in each release. diff --git a/examples/README.md b/examples/README.md index 183bf69..0c9ecdb 100644 --- a/examples/README.md +++ b/examples/README.md @@ -2,128 +2,271 @@ This directory contains example applications demonstrating different usage patterns of the Zerobus Ingest SDK for Java. 
-## Examples +## Overview -### 1. Blocking Ingestion (`BlockingIngestionExample.java`) +The examples are organized by data format and demonstrate both single-record and batch ingestion patterns. -Demonstrates synchronous record ingestion where each record is waited for before proceeding to the next. +**Features demonstrated:** +- Offset-based API for high throughput +- Future-based API for simpler code +- Batch ingestion for bulk data +- Stream configuration and recovery +- Acknowledgment callbacks -**Best for:** -- Low-volume ingestion (< 1000 records/sec) -- Use cases requiring immediate confirmation per record -- Critical data where you need to handle errors immediately +## Directory Structure -**Key features:** -- Waits for each record to be durably written -- Simple error handling -- Predictable behavior -- Lower throughput +``` +examples/ +├── README.md (this file) +├── proto/ (Protocol Buffer examples) +│ ├── README.md +│ ├── air_quality.proto (schema definition) +│ ├── SingleRecordExample.java +│ └── BatchIngestionExample.java +├── json/ (JSON examples) +│ ├── README.md +│ ├── SingleRecordExample.java +│ └── BatchIngestionExample.java +└── arrow/ (Arrow Flight examples - experimental) + ├── README.md + ├── SingleBatchExample.java + └── MultiBatchExample.java +``` + +## Examples Overview + +| Example | Format | Method | Description | +|---------|--------|--------|-------------| +| `proto/SingleRecordExample` | Protocol Buffers | Single | One record at a time with offset tracking | +| `proto/BatchIngestionExample` | Protocol Buffers | Batch | Records in batches for higher throughput | +| `json/SingleRecordExample` | JSON | Single | JSON records one at a time | +| `json/BatchIngestionExample` | JSON | Batch | JSON records in batches | +| `arrow/SingleBatchExample` | Arrow | Single batch | One Arrow RecordBatch | +| `arrow/MultiBatchExample` | Arrow | Multi batch | Multiple Arrow batches | + +## Prerequisites + +### 1. Create a Delta Table + +```sql +CREATE TABLE .default.air_quality ( + device_name STRING, + temp INT, + humidity BIGINT +) USING DELTA; +``` + +### 2. Set Up Service Principal + +Create a service principal with `SELECT` and `MODIFY` permissions on the table. + +### 3. Set Environment Variables -**Run:** ```bash -javac -cp "../target/databricks-zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar" \ - src/main/java/com/databricks/zerobus/examples/BlockingIngestionExample.java +export DATABRICKS_CLIENT_ID="your-client-id" +export DATABRICKS_CLIENT_SECRET="your-client-secret" +``` -java -cp "../target/databricks-zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar:src/main/java" \ - com.databricks.zerobus.examples.BlockingIngestionExample +### 4. Build the SDK + +```bash +cd .. # Go to SDK root +mvn package -DskipTests ``` -### 2. Non-Blocking Ingestion (`NonBlockingIngestionExample.java`) +## Running Examples -Demonstrates asynchronous record ingestion for maximum throughput. +### Protocol Buffer Examples + +```bash +cd proto -**Best for:** -- High-volume ingestion (> 10,000 records/sec) -- Batch processing scenarios -- Stream processing applications -- Maximum throughput requirements +# Compile the proto file +protoc --java_out=. air_quality.proto -**Key features:** -- Asynchronous ingestion with CompletableFutures -- Automatic buffering and flow control -- Ack callback for progress tracking -- Batch flush at the end -- Higher throughput +# Compile the examples +javac -d . -cp "../../target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar:." 
*.java + +# Run single record example +java -cp "../../target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar:." \ + com.databricks.zerobus.examples.proto.SingleRecordExample + +# Run batch example +java -cp "../../target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar:." \ + com.databricks.zerobus.examples.proto.BatchIngestionExample +``` + +**Expected output:** +``` +=== Proto Single Record Ingestion Example === +Server: https://.zerobus..cloud.databricks.com +Table: .. +Records: 1000 + +Creating stream... +Stream created successfully. +Queued 100 records (offset: 99) +... +=== Ingestion Complete === +Records: 1,000 +Duration: 0.52 seconds +Throughput: 1,912 records/sec +``` + +### JSON Examples -**Run:** ```bash -javac -cp "../target/databricks-zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar" \ - src/main/java/com/databricks/zerobus/examples/NonBlockingIngestionExample.java +cd json + +# Compile (needs proto directory for TableProperties) +javac -d . -cp "../../target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar:../proto:." *.java -java -cp "../target/databricks-zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar:src/main/java" \ - com.databricks.zerobus.examples.NonBlockingIngestionExample +# Run single record example +java -cp "../../target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar:../proto:." \ + com.databricks.zerobus.examples.json.SingleRecordExample + +# Run batch example +java -cp "../../target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar:../proto:." \ + com.databricks.zerobus.examples.json.BatchIngestionExample ``` -## Configuration +### Arrow Examples + +```bash +cd arrow + +# Compile (Arrow libs loaded at runtime via reflection) +javac -d . -cp "../../target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar:." *.java + +# Run (requires Arrow JARs on classpath) +java -cp "../../target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar::." \ + com.databricks.zerobus.examples.arrow.SingleBatchExample +``` -Before running the examples, update the following constants in each example file: +## API Styles + +All examples demonstrate the **offset-based API** which is recommended for high throughput: ```java -private static final String SERVER_ENDPOINT = "your-shard-id.zerobus.region.cloud.databricks.com"; -private static final String UNITY_CATALOG_ENDPOINT = "https://your-workspace.cloud.databricks.com"; -private static final String TABLE_NAME = "catalog.schema.table"; -private static final String CLIENT_ID = "your-oauth-client-id"; -private static final String CLIENT_SECRET = "your-oauth-client-secret"; +// Offset-based (recommended) - returns immediately after queuing +long offset = stream.ingestRecordOffset(record); +// ... ingest more records ... 
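+// Acks are cumulative: waiting on an offset covers every record queued before it.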
+stream.waitForOffset(offset); // Wait for acknowledgment when needed ``` -## Protobuf Schema +The SDK also supports a **future-based API** for simpler code: -The examples use an `AirQuality` message defined as: +```java +// Future-based - returns CompletableFuture +stream.ingestRecord(record).join(); // Wait immediately +``` -```proto -syntax = "proto2"; +| API Style | Best For | Overhead | +|-----------|----------|----------| +| **Offset-based** | High throughput (>10K rec/sec) | Minimal - no object allocation | +| **Future-based** | Simple use cases | CompletableFuture per record | -message AirQuality { - optional string device_name = 1; - optional int32 temp = 2; - optional int64 humidity = 3; -} +## Choosing the Right Format + +### By Data Format + +| Format | Best For | Pros | Cons | +|--------|----------|------|------| +| **Protocol Buffers** | Production systems | Type-safe, compact, fast | Requires schema compilation | +| **JSON** | Prototyping, flexible schemas | Human-readable, no compilation | Larger payload, slower | +| **Arrow** | High-volume columnar data | Maximum throughput | Extra dependencies | + +### By Ingestion Pattern + +| Pattern | Best For | Throughput | Memory | +|---------|----------|------------|--------| +| **Single record** | Low-volume, fine control | Low-Medium | Low | +| **Batch** | High-volume, bulk loads | High | Medium | +| **Arrow batch** | Columnar data, analytics | Very High | Higher | + +## Configuration + +All examples can be configured by modifying constants at the top of each file: + +```java +private static final String SERVER_ENDPOINT = "https://.zerobus..."; +private static final String WORKSPACE_URL = "https://.cloud.databricks.com"; +private static final String TABLE_NAME = "..
"; ``` -To use your own schema: -1. Define your `.proto` file -2. Generate Java classes: `protoc --java_out=. your_schema.proto` -3. Update the examples to use your message type instead of `Record.AirQuality` +## Common Patterns + +### High-Throughput Ingestion + +```java +// Configure for maximum throughput +StreamConfigurationOptions options = StreamConfigurationOptions.builder() + .setMaxInflightRecords(100000) + .build(); + +// Ingest many records, wait once at the end +long lastOffset = -1; +for (Record r : records) { + lastOffset = stream.ingestRecordOffset(r); +} +stream.waitForOffset(lastOffset); +``` -## Performance Comparison +### With Acknowledgment Callback -Typical performance characteristics (results may vary): +```java +AckCallback callback = new AckCallback() { + public void onAck(long offsetId) { + System.out.println("Acked: " + offsetId); + } + public void onError(long offsetId, String msg) { + System.err.println("Error at " + offsetId + ": " + msg); + } +}; + +StreamConfigurationOptions options = StreamConfigurationOptions.builder() + .setAckCallback(callback) + .build(); +``` -| Metric | Blocking | Non-Blocking | -|--------|----------|--------------| -| Throughput | ~100-500 records/sec | ~10,000-50,000 records/sec | -| Latency (avg) | Low per record | Higher per record, lower overall | -| Memory usage | Low | Medium (buffering) | -| Complexity | Simple | Moderate | -| Error handling | Immediate | Deferred to flush | +### JSON Stream Configuration -## Best Practices +```java +// Required for JSON records +StreamConfigurationOptions options = StreamConfigurationOptions.builder() + .setRecordType(RecordType.JSON) + .build(); +``` -1. **Choose the right pattern**: Use blocking for low-volume/critical data, non-blocking for high-volume -2. **Monitor progress**: Use `ackCallback` in non-blocking mode to track progress -3. **Handle errors**: Always wrap ingestion in try-catch blocks -4. **Close streams**: Always close streams in a `finally` block or use try-with-resources -5. **Tune buffer size**: Adjust `maxInflightRecords` based on your throughput needs +## Troubleshooting -## Common Issues +### Missing credentials +``` +Error: DATABRICKS_CLIENT_ID and DATABRICKS_CLIENT_SECRET must be set +``` +**Solution:** Export the environment variables before running. -### Out of Memory -Increase JVM heap size: -```bash -java -Xmx4g -cp ... com.databricks.zerobus.examples.NonBlockingIngestionExample +### Connection timeout ``` +ZerobusException: Connection timed out +``` +**Solution:** Check network connectivity and SERVER_ENDPOINT value. Ensure it starts with `https://`. -### Authentication Failures -- Verify your CLIENT_ID and CLIENT_SECRET are correct -- Check that your OAuth client has permissions for the target table +### Record type mismatch +``` +NonRetriableException: Record type does not match stream configuration +``` +**Solution:** Use `RecordType.JSON` in options when ingesting JSON records. -### Slow Performance -- Use non-blocking mode for better throughput -- Increase `maxInflightRecords` in stream configuration -- Check network connectivity to the Zerobus endpoint +### Arrow not found +``` +Error: Apache Arrow libraries not found on classpath +``` +**Solution:** Add `arrow-vector` and `arrow-memory-netty` JARs to classpath. 
## Additional Resources - [SDK Documentation](../README.md) +- [Changelog](../CHANGELOG.md) - [Protocol Buffers Guide](https://developers.google.com/protocol-buffers) -- [Databricks Documentation](https://docs.databricks.com) +- [Apache Arrow Java](https://arrow.apache.org/docs/java/) diff --git a/examples/arrow/MultiBatchExample.java b/examples/arrow/MultiBatchExample.java new file mode 100644 index 0000000..795af45 --- /dev/null +++ b/examples/arrow/MultiBatchExample.java @@ -0,0 +1,206 @@ +package com.databricks.zerobus.examples.arrow; + +import com.databricks.zerobus.*; + +/** + * Multi-batch Arrow Flight ingestion example. + * + *

<p>Demonstrates ingesting multiple Arrow RecordBatches for high-volume data.
+ * This pattern is useful for:
+ *
+ * <ul>
+ *   <li>Streaming large datasets in chunks</li>
+ *   <li>Memory-efficient processing of large files</li>
+ *   <li>Continuous data pipelines</li>
+ * </ul>
+ *
+ * <p>Note: This example requires Apache Arrow Java libraries on the classpath.
+ *
+ * <p>Run with: {@code java -cp <classpath> com.databricks.zerobus.examples.arrow.MultiBatchExample}
+ *
+ * <p>Required environment variables:
+ *
+ * <ul>
+ *   <li>ZEROBUS_SERVER_ENDPOINT - The Zerobus server endpoint URL</li>
+ *   <li>DATABRICKS_WORKSPACE_URL - The Databricks workspace URL</li>
+ *   <li>ZEROBUS_TABLE_NAME - The target table name (catalog.schema.table)</li>
+ *   <li>DATABRICKS_CLIENT_ID - Service principal application ID</li>
+ *   <li>DATABRICKS_CLIENT_SECRET - Service principal secret</li>
+ * </ul>
+ */ +public class MultiBatchExample { + + private static final int ROWS_PER_BATCH = 5000; + private static final int NUM_BATCHES = 10; + private static final int TOTAL_ROWS = ROWS_PER_BATCH * NUM_BATCHES; + + private static String serverEndpoint; + private static String workspaceUrl; + private static String tableName; + + public static void main(String[] args) throws Exception { + serverEndpoint = System.getenv("ZEROBUS_SERVER_ENDPOINT"); + workspaceUrl = System.getenv("DATABRICKS_WORKSPACE_URL"); + tableName = System.getenv("ZEROBUS_TABLE_NAME"); + String clientId = System.getenv("DATABRICKS_CLIENT_ID"); + String clientSecret = System.getenv("DATABRICKS_CLIENT_SECRET"); + + if (serverEndpoint == null || workspaceUrl == null || tableName == null + || clientId == null || clientSecret == null) { + System.err.println("Error: Required environment variables not set."); + System.err.println("Set: ZEROBUS_SERVER_ENDPOINT, DATABRICKS_WORKSPACE_URL, ZEROBUS_TABLE_NAME,"); + System.err.println(" DATABRICKS_CLIENT_ID, DATABRICKS_CLIENT_SECRET"); + System.exit(1); + } + + System.out.println("=== Arrow Multi-Batch Ingestion Example ==="); + System.out.printf("Server: %s%n", serverEndpoint); + System.out.printf("Table: %s%n", tableName); + System.out.printf("Total rows: %d (%d batches x %d rows)%n%n", TOTAL_ROWS, NUM_BATCHES, ROWS_PER_BATCH); + + // Check if Arrow is available + try { + Class.forName("org.apache.arrow.vector.VectorSchemaRoot"); + } catch (ClassNotFoundException e) { + System.err.println("Error: Apache Arrow libraries not found on classpath."); + System.err.println("Add arrow-vector and arrow-memory-netty dependencies."); + System.exit(1); + } + + runWithArrow(clientId, clientSecret); + } + + private static void runWithArrow(String clientId, String clientSecret) throws Exception { + // Use reflection to avoid compile-time Arrow dependency + Class rootAllocatorClass = Class.forName("org.apache.arrow.memory.RootAllocator"); + Class schemaClass = Class.forName("org.apache.arrow.vector.types.pojo.Schema"); + Class fieldClass = Class.forName("org.apache.arrow.vector.types.pojo.Field"); + Class arrowTypeClass = Class.forName("org.apache.arrow.vector.types.pojo.ArrowType"); + Class utf8Class = Class.forName("org.apache.arrow.vector.types.pojo.ArrowType$Utf8"); + Class intClass = Class.forName("org.apache.arrow.vector.types.pojo.ArrowType$Int"); + Class vectorSchemaRootClass = Class.forName("org.apache.arrow.vector.VectorSchemaRoot"); + Class bufferAllocatorClass = Class.forName("org.apache.arrow.memory.BufferAllocator"); + + Object allocator = rootAllocatorClass.getConstructor().newInstance(); + + try { + // Build schema + Object utf8Type = utf8Class.getConstructor().newInstance(); + Object int32Type = intClass.getConstructor(int.class, boolean.class).newInstance(32, true); + Object int64Type = intClass.getConstructor(int.class, boolean.class).newInstance(64, true); + + java.lang.reflect.Method nullableMethod = fieldClass.getMethod("nullable", String.class, arrowTypeClass); + Object deviceNameField = nullableMethod.invoke(null, "device_name", utf8Type); + Object tempField = nullableMethod.invoke(null, "temp", int32Type); + Object humidityField = nullableMethod.invoke(null, "humidity", int64Type); + + java.util.List fields = java.util.Arrays.asList(deviceNameField, tempField, humidityField); + Object schema = schemaClass.getConstructor(java.util.List.class).newInstance(fields); + + System.out.println("Schema created: " + schema); + + // Initialize SDK + ZerobusSdk sdk = new 
ZerobusSdk(serverEndpoint, workspaceUrl); + + ArrowTableProperties tableProperties = new ArrowTableProperties(tableName, schema); + + ArrowStreamConfigurationOptions options = ArrowStreamConfigurationOptions.builder() + .setMaxInflightBatches(50) + .build(); + + System.out.println("Creating Arrow stream..."); + ZerobusArrowStream stream = sdk.createArrowStream( + tableProperties, + clientId, + clientSecret, + options + ).join(); + System.out.println("Arrow stream created successfully."); + + long startTime = System.currentTimeMillis(); + long lastOffset = -1; + + try { + // Create reusable VectorSchemaRoot + java.lang.reflect.Method createMethod = vectorSchemaRootClass.getMethod("create", schemaClass, bufferAllocatorClass); + Object batch = createMethod.invoke(null, schema, allocator); + + java.lang.reflect.Method getVectorMethod = vectorSchemaRootClass.getMethod("getVector", String.class); + java.lang.reflect.Method setRowCountMethod = vectorSchemaRootClass.getMethod("setRowCount", int.class); + java.lang.reflect.Method clearMethod = vectorSchemaRootClass.getMethod("clear"); + java.lang.reflect.Method closeMethod = vectorSchemaRootClass.getMethod("close"); + + Class varCharVectorClass = Class.forName("org.apache.arrow.vector.VarCharVector"); + Class intVectorClass = Class.forName("org.apache.arrow.vector.IntVector"); + Class bigIntVectorClass = Class.forName("org.apache.arrow.vector.BigIntVector"); + + java.lang.reflect.Method allocateNewVarChar = varCharVectorClass.getMethod("allocateNew", int.class); + java.lang.reflect.Method allocateNewInt = intVectorClass.getMethod("allocateNew", int.class); + java.lang.reflect.Method allocateNewBigInt = bigIntVectorClass.getMethod("allocateNew", int.class); + java.lang.reflect.Method setSafeVarChar = varCharVectorClass.getMethod("setSafe", int.class, byte[].class); + java.lang.reflect.Method setSafeInt = intVectorClass.getMethod("setSafe", int.class, int.class); + java.lang.reflect.Method setSafeBigInt = bigIntVectorClass.getMethod("setSafe", int.class, long.class); + + try { + for (int batchNum = 0; batchNum < NUM_BATCHES; batchNum++) { + // Clear previous batch data + clearMethod.invoke(batch); + + // Get vectors + Object deviceNameVector = getVectorMethod.invoke(batch, "device_name"); + Object tempVector = getVectorMethod.invoke(batch, "temp"); + Object humidityVector = getVectorMethod.invoke(batch, "humidity"); + + // Allocate + allocateNewVarChar.invoke(deviceNameVector, ROWS_PER_BATCH); + allocateNewInt.invoke(tempVector, ROWS_PER_BATCH); + allocateNewBigInt.invoke(humidityVector, ROWS_PER_BATCH); + + // Populate + int baseRow = batchNum * ROWS_PER_BATCH; + for (int i = 0; i < ROWS_PER_BATCH; i++) { + int globalIdx = baseRow + i; + String deviceName = "arrow-multi-" + (globalIdx % 100); + int temp = 15 + (globalIdx % 20); + long humidity = 40 + (globalIdx % 50); + + setSafeVarChar.invoke(deviceNameVector, i, deviceName.getBytes()); + setSafeInt.invoke(tempVector, i, temp); + setSafeBigInt.invoke(humidityVector, i, humidity); + } + + setRowCountMethod.invoke(batch, ROWS_PER_BATCH); + + // Ingest batch + lastOffset = stream.ingestBatch(batch); + System.out.printf("Batch %d/%d ingested (offset: %d)%n", + batchNum + 1, NUM_BATCHES, lastOffset); + } + + // Wait for final acknowledgment + System.out.println("Waiting for final acknowledgment..."); + stream.waitForOffset(lastOffset); + + long endTime = System.currentTimeMillis(); + double durationSec = (endTime - startTime) / 1000.0; + double rowsPerSec = TOTAL_ROWS / durationSec; + + 
System.out.println(); + System.out.println("=== Ingestion Complete ==="); + System.out.printf("Total rows: %,d in %d batches%n", TOTAL_ROWS, NUM_BATCHES); + System.out.printf("Duration: %.2f seconds%n", durationSec); + System.out.printf("Throughput: %,.0f rows/sec%n", rowsPerSec); + + } finally { + closeMethod.invoke(batch); + } + + } finally { + stream.close(); + System.out.println("Arrow stream closed."); + } + + } finally { + java.lang.reflect.Method closeMethod = rootAllocatorClass.getMethod("close"); + closeMethod.invoke(allocator); + } + } +} diff --git a/examples/arrow/README.md b/examples/arrow/README.md new file mode 100644 index 0000000..1dadc1f --- /dev/null +++ b/examples/arrow/README.md @@ -0,0 +1,259 @@ +# Arrow Flight Examples + +> **Experimental**: Arrow Flight support is experimental and may change in future releases. + +This directory contains examples for high-performance columnar data ingestion using Apache Arrow Flight. + +## Overview + +Arrow Flight provides: +- **Maximum throughput** - Columnar format is highly efficient for bulk data +- **Zero-copy potential** - Data can be transferred without serialization overhead +- **Ecosystem integration** - Works with Spark, Pandas, Polars, etc. +- **Compression support** - Built-in support for columnar compression + +## Prerequisites + +### 1. Add Arrow Dependencies + +```xml + + org.apache.arrow + arrow-vector + 17.0.0 + + + org.apache.arrow + arrow-memory-netty + 17.0.0 + +``` + +### 2. Set Environment Variables + +```bash +export DATABRICKS_CLIENT_ID="your-client-id" +export DATABRICKS_CLIENT_SECRET="your-client-secret" +``` + +## Examples + +### SingleBatchExample + +Ingests a single Arrow RecordBatch. Best for: +- One-time bulk loads +- Understanding Arrow Flight basics +- Simple columnar ingestion + +**Compile:** +```bash +javac -d . -cp "../../target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar:." \ + SingleBatchExample.java MultiBatchExample.java +``` + +**Run:** +```bash +java -cp "../../target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar::." \ + com.databricks.zerobus.examples.arrow.SingleBatchExample +``` + +**Expected output:** +``` +=== Arrow Single Batch Ingestion Example === +Server: https://.zerobus..cloud.databricks.com +Table: ..
+Batch size: 10000 rows + +Schema created: Schema +Creating Arrow stream... +Arrow stream created successfully. +Populating batch with 10000 rows... +Ingesting batch... +Batch queued (offset: 0) +Waiting for acknowledgment... + +=== Ingestion Complete === +Rows: 10,000 +Duration: 0.45 seconds +Throughput: 22,222 rows/sec +Arrow stream closed. +``` + +### MultiBatchExample + +Ingests multiple Arrow RecordBatches. Best for: +- Large dataset streaming +- Memory-efficient processing +- Continuous data pipelines + +**Run:** +```bash +java -cp "../../target/zerobus-ingest-sdk-0.1.0-jar-with-dependencies.jar::." \ + com.databricks.zerobus.examples.arrow.MultiBatchExample +``` + +**Expected output:** +``` +=== Arrow Multi-Batch Ingestion Example === +... +Batch 1/10 ingested (offset: 0) +Batch 2/10 ingested (offset: 1) +... +=== Ingestion Complete === +Total rows: 50,000 in 10 batches +Duration: 1.23 seconds +Throughput: 40,650 rows/sec +``` + +## Arrow Schema + +The examples use an Arrow schema matching the table: + +| Field | Arrow Type | Table Type | +|-------|------------|------------| +| device_name | Utf8 | STRING | +| temp | Int32 | INT | +| humidity | Int64 | BIGINT | + +```java +Schema schema = new Schema(Arrays.asList( + Field.nullable("device_name", new ArrowType.Utf8()), + Field.nullable("temp", new ArrowType.Int(32, true)), + Field.nullable("humidity", new ArrowType.Int(64, true)) +)); +``` + +## Key Patterns + +### Creating Arrow Table Properties + +```java +Schema schema = new Schema(Arrays.asList( + Field.nullable("device_name", new ArrowType.Utf8()), + Field.nullable("temp", new ArrowType.Int(32, true)), + Field.nullable("humidity", new ArrowType.Int(64, true)) +)); + +ArrowTableProperties tableProperties = new ArrowTableProperties( + "catalog.schema.table", + schema +); +``` + +### Creating Arrow Stream + +```java +ArrowStreamConfigurationOptions options = ArrowStreamConfigurationOptions.builder() + .setMaxInflightBatches(100) + .build(); + +ZerobusArrowStream stream = sdk.createArrowStream( + tableProperties, + clientId, + clientSecret, + options +).join(); +``` + +### Ingesting a RecordBatch + +```java +try (BufferAllocator allocator = new RootAllocator(); + VectorSchemaRoot batch = VectorSchemaRoot.create(schema, allocator)) { + + // Populate vectors + VarCharVector deviceName = (VarCharVector) batch.getVector("device_name"); + IntVector temp = (IntVector) batch.getVector("temp"); + + deviceName.allocateNew(batchSize); + temp.allocateNew(batchSize); + + for (int i = 0; i < batchSize; i++) { + deviceName.setSafe(i, ("sensor-" + i).getBytes()); + temp.setSafe(i, 20 + i % 15); + } + + batch.setRowCount(batchSize); + + // Ingest + long offset = stream.ingestBatch(batch); + stream.waitForOffset(offset); +} +``` + +## Configuration Options + +| Option | Default | Description | +|--------|---------|-------------| +| `maxInflightBatches` | 1000 | Max concurrent in-flight batches | +| `recovery` | true | Enable automatic recovery | +| `recoveryTimeoutMs` | 15000 | Timeout for recovery (ms) | +| `recoveryBackoffMs` | 2000 | Delay between retries (ms) | +| `recoveryRetries` | 4 | Max recovery attempts | +| `flushTimeoutMs` | 300000 | Flush timeout (ms) | +| `serverLackOfAckTimeoutMs` | 60000 | Server ack timeout (ms) | +| `connectionTimeoutMs` | 30000 | Connection timeout (ms) | + +## Performance Tips + +1. **Use larger batch sizes** - More rows per batch improves throughput +2. **Reuse VectorSchemaRoot** - Clear and reuse instead of creating new +3. 
**Pre-allocate vectors** - Allocate to expected size to avoid resizing +4. **Close resources properly** - Always close allocators and batches + +## Memory Management + +Arrow uses off-heap memory. Always close resources: + +```java +BufferAllocator allocator = new RootAllocator(); +try { + // ... use allocator ... +} finally { + allocator.close(); // Releases off-heap memory +} +``` + +Or use try-with-resources: + +```java +try (BufferAllocator allocator = new RootAllocator(); + VectorSchemaRoot batch = VectorSchemaRoot.create(schema, allocator)) { + // ... use batch ... +} // Automatically closed +``` + +## Note on Reflection + +The examples use reflection to avoid compile-time Arrow dependency. In a real application with Arrow on the classpath, import Arrow classes directly: + +```java +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +``` + +## Troubleshooting + +### Arrow libraries not found +``` +Error: Apache Arrow libraries not found on classpath +``` +**Solution:** Add `arrow-vector` and `arrow-memory-netty` JARs. + +### Memory leak warnings +``` +WARN: Memory was not released properly +``` +**Solution:** Ensure all allocators and batches are closed. + +### Schema mismatch +``` +ZerobusException: Schema does not match table +``` +**Solution:** Verify Arrow schema matches Delta table columns. diff --git a/examples/arrow/SingleBatchExample.java b/examples/arrow/SingleBatchExample.java new file mode 100644 index 0000000..8bda462 --- /dev/null +++ b/examples/arrow/SingleBatchExample.java @@ -0,0 +1,208 @@ +package com.databricks.zerobus.examples.arrow; + +import com.databricks.zerobus.*; + +/** + * Single-batch Arrow Flight ingestion example. + * + *

Demonstrates ingesting a single Arrow RecordBatch. Arrow Flight is ideal for:
+ *
+ * <ul>
+ *   <li>High-performance columnar data ingestion</li>
+ *   <li>Data already in Arrow format (from Spark, Pandas, etc.)</li>
+ *   <li>Very high throughput requirements</li>
+ * </ul>
+ *
+ * <p>Note: This example requires Apache Arrow Java libraries on the classpath:
+ *
+ * <pre>
+ * org.apache.arrow:arrow-vector:17.0.0
+ * org.apache.arrow:arrow-memory-netty:17.0.0
+ * </pre>
+ *
+ * <p>Run with: {@code java -cp <classpath> com.databricks.zerobus.examples.arrow.SingleBatchExample}
+ *
+ * <p>Required environment variables:
+ *
+ * <ul>
+ *   <li>ZEROBUS_SERVER_ENDPOINT - The Zerobus server endpoint URL</li>
+ *   <li>DATABRICKS_WORKSPACE_URL - The Databricks workspace URL</li>
+ *   <li>ZEROBUS_TABLE_NAME - The target table name (catalog.schema.table)</li>
+ *   <li>DATABRICKS_CLIENT_ID - Service principal application ID</li>
+ *   <li>DATABRICKS_CLIENT_SECRET - Service principal secret</li>
+ * </ul>
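+ *
+ * <p>The method body below uses reflection so this file compiles without Arrow.
+ * With Arrow on the classpath, the core steps reduce to the following sketch
+ * (same schema, stream, and calls as the README's "Ingesting a RecordBatch" pattern):
+ *
+ * <pre>{@code
+ * try (BufferAllocator allocator = new RootAllocator();
+ *      VectorSchemaRoot batch = VectorSchemaRoot.create(schema, allocator)) {
+ *   VarCharVector deviceName = (VarCharVector) batch.getVector("device_name");
+ *   deviceName.allocateNew(BATCH_SIZE);
+ *   deviceName.setSafe(0, "arrow-sensor-0".getBytes());
+ *   batch.setRowCount(BATCH_SIZE);
+ *   long offset = stream.ingestBatch(batch); // queue the batch
+ *   stream.waitForOffset(offset);            // block until acknowledged
+ * }
+ * }</pre>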
+ */ +public class SingleBatchExample { + + private static final int BATCH_SIZE = 10000; + + private static String serverEndpoint; + private static String workspaceUrl; + private static String tableName; + + public static void main(String[] args) throws Exception { + serverEndpoint = System.getenv("ZEROBUS_SERVER_ENDPOINT"); + workspaceUrl = System.getenv("DATABRICKS_WORKSPACE_URL"); + tableName = System.getenv("ZEROBUS_TABLE_NAME"); + String clientId = System.getenv("DATABRICKS_CLIENT_ID"); + String clientSecret = System.getenv("DATABRICKS_CLIENT_SECRET"); + + if (serverEndpoint == null || workspaceUrl == null || tableName == null + || clientId == null || clientSecret == null) { + System.err.println("Error: Required environment variables not set."); + System.err.println("Set: ZEROBUS_SERVER_ENDPOINT, DATABRICKS_WORKSPACE_URL, ZEROBUS_TABLE_NAME,"); + System.err.println(" DATABRICKS_CLIENT_ID, DATABRICKS_CLIENT_SECRET"); + System.exit(1); + } + + System.out.println("=== Arrow Single Batch Ingestion Example ==="); + System.out.printf("Server: %s%n", serverEndpoint); + System.out.printf("Table: %s%n", tableName); + System.out.printf("Batch size: %d rows%n%n", BATCH_SIZE); + + // Check if Arrow is available + try { + Class.forName("org.apache.arrow.vector.VectorSchemaRoot"); + } catch (ClassNotFoundException e) { + System.err.println("Error: Apache Arrow libraries not found on classpath."); + System.err.println("Add arrow-vector and arrow-memory-netty dependencies."); + System.exit(1); + } + + // Delegate to Arrow-specific code (uses reflection to avoid compile-time dependency) + runWithArrow(clientId, clientSecret); + } + + private static void runWithArrow(String clientId, String clientSecret) throws Exception { + // Use reflection to avoid compile-time Arrow dependency + // In a real application, you would import Arrow classes directly + + // Import Arrow classes via reflection + Class rootAllocatorClass = Class.forName("org.apache.arrow.memory.RootAllocator"); + Class schemaClass = Class.forName("org.apache.arrow.vector.types.pojo.Schema"); + Class fieldClass = Class.forName("org.apache.arrow.vector.types.pojo.Field"); + Class arrowTypeClass = Class.forName("org.apache.arrow.vector.types.pojo.ArrowType"); + Class utf8Class = Class.forName("org.apache.arrow.vector.types.pojo.ArrowType$Utf8"); + Class intClass = Class.forName("org.apache.arrow.vector.types.pojo.ArrowType$Int"); + Class vectorSchemaRootClass = Class.forName("org.apache.arrow.vector.VectorSchemaRoot"); + + // Create allocator + Object allocator = rootAllocatorClass.getConstructor().newInstance(); + + try { + // Build schema: device_name (String), temp (Int32), humidity (Int64) + Object utf8Type = utf8Class.getConstructor().newInstance(); + Object int32Type = intClass.getConstructor(int.class, boolean.class).newInstance(32, true); + Object int64Type = intClass.getConstructor(int.class, boolean.class).newInstance(64, true); + + java.lang.reflect.Method nullableMethod = fieldClass.getMethod("nullable", String.class, arrowTypeClass); + Object deviceNameField = nullableMethod.invoke(null, "device_name", utf8Type); + Object tempField = nullableMethod.invoke(null, "temp", int32Type); + Object humidityField = nullableMethod.invoke(null, "humidity", int64Type); + + java.util.List fields = java.util.Arrays.asList(deviceNameField, tempField, humidityField); + Object schema = schemaClass.getConstructor(java.util.List.class).newInstance(fields); + + System.out.println("Schema created: " + schema); + + // Initialize SDK + ZerobusSdk sdk = 
new ZerobusSdk(serverEndpoint, workspaceUrl); + + // Create Arrow table properties + ArrowTableProperties tableProperties = new ArrowTableProperties(tableName, schema); + + ArrowStreamConfigurationOptions options = ArrowStreamConfigurationOptions.builder() + .setMaxInflightBatches(100) + .build(); + + System.out.println("Creating Arrow stream..."); + ZerobusArrowStream stream = sdk.createArrowStream( + tableProperties, + clientId, + clientSecret, + options + ).join(); + System.out.println("Arrow stream created successfully."); + + long startTime = System.currentTimeMillis(); + + try { + // Create VectorSchemaRoot + Class bufferAllocatorClass = Class.forName("org.apache.arrow.memory.BufferAllocator"); + java.lang.reflect.Method createMethod = vectorSchemaRootClass.getMethod("create", schemaClass, bufferAllocatorClass); + Object batch = createMethod.invoke(null, schema, allocator); + + try { + // Get vectors + java.lang.reflect.Method getVectorMethod = vectorSchemaRootClass.getMethod("getVector", String.class); + Object deviceNameVector = getVectorMethod.invoke(batch, "device_name"); + Object tempVector = getVectorMethod.invoke(batch, "temp"); + Object humidityVector = getVectorMethod.invoke(batch, "humidity"); + + // Allocate vectors + Class varCharVectorClass = Class.forName("org.apache.arrow.vector.VarCharVector"); + Class intVectorClass = Class.forName("org.apache.arrow.vector.IntVector"); + Class bigIntVectorClass = Class.forName("org.apache.arrow.vector.BigIntVector"); + + java.lang.reflect.Method allocateNewVarChar = varCharVectorClass.getMethod("allocateNew", int.class); + java.lang.reflect.Method allocateNewInt = intVectorClass.getMethod("allocateNew", int.class); + java.lang.reflect.Method allocateNewBigInt = bigIntVectorClass.getMethod("allocateNew", int.class); + + allocateNewVarChar.invoke(deviceNameVector, BATCH_SIZE); + allocateNewInt.invoke(tempVector, BATCH_SIZE); + allocateNewBigInt.invoke(humidityVector, BATCH_SIZE); + + // Populate vectors + java.lang.reflect.Method setSafeVarChar = varCharVectorClass.getMethod("setSafe", int.class, byte[].class); + java.lang.reflect.Method setSafeInt = intVectorClass.getMethod("setSafe", int.class, int.class); + java.lang.reflect.Method setSafeBigInt = bigIntVectorClass.getMethod("setSafe", int.class, long.class); + + System.out.println("Populating batch with " + BATCH_SIZE + " rows..."); + for (int i = 0; i < BATCH_SIZE; i++) { + String deviceName = "arrow-sensor-" + (i % 100); + int temp = 15 + (i % 20); + long humidity = 40 + (i % 50); + + setSafeVarChar.invoke(deviceNameVector, i, deviceName.getBytes()); + setSafeInt.invoke(tempVector, i, temp); + setSafeBigInt.invoke(humidityVector, i, humidity); + } + + // Set row count + java.lang.reflect.Method setRowCountMethod = vectorSchemaRootClass.getMethod("setRowCount", int.class); + setRowCountMethod.invoke(batch, BATCH_SIZE); + + // Ingest batch + System.out.println("Ingesting batch..."); + long offset = stream.ingestBatch(batch); + System.out.printf("Batch queued (offset: %d)%n", offset); + + // Wait for acknowledgment + System.out.println("Waiting for acknowledgment..."); + stream.waitForOffset(offset); + + long endTime = System.currentTimeMillis(); + double durationSec = (endTime - startTime) / 1000.0; + double rowsPerSec = BATCH_SIZE / durationSec; + + System.out.println(); + System.out.println("=== Ingestion Complete ==="); + System.out.printf("Rows: %,d%n", BATCH_SIZE); + System.out.printf("Duration: %.2f seconds%n", durationSec); + System.out.printf("Throughput: %,.0f rows/sec%n", 
rowsPerSec); + + } finally { + // Close batch + java.lang.reflect.Method closeMethod = vectorSchemaRootClass.getMethod("close"); + closeMethod.invoke(batch); + } + + } finally { + stream.close(); + System.out.println("Arrow stream closed."); + } + + } finally { + // Close allocator + java.lang.reflect.Method closeMethod = rootAllocatorClass.getMethod("close"); + closeMethod.invoke(allocator); + } + } +} diff --git a/examples/arrow/com/databricks/zerobus/examples/arrow/MultiBatchExample.class b/examples/arrow/com/databricks/zerobus/examples/arrow/MultiBatchExample.class new file mode 100644 index 0000000..5ef663b Binary files /dev/null and b/examples/arrow/com/databricks/zerobus/examples/arrow/MultiBatchExample.class differ diff --git a/examples/arrow/com/databricks/zerobus/examples/arrow/SingleBatchExample.class b/examples/arrow/com/databricks/zerobus/examples/arrow/SingleBatchExample.class new file mode 100644 index 0000000..a4b2ce4 Binary files /dev/null and b/examples/arrow/com/databricks/zerobus/examples/arrow/SingleBatchExample.class differ diff --git a/examples/json/BatchIngestionExample.java b/examples/json/BatchIngestionExample.java new file mode 100644 index 0000000..e7b0f15 --- /dev/null +++ b/examples/json/BatchIngestionExample.java @@ -0,0 +1,127 @@ +package com.databricks.zerobus.examples.json; + +import com.databricks.zerobus.*; +import com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +/** + * Batch JSON ingestion example. + * + *

Demonstrates ingesting JSON records in batches. This combines the flexibility
+ * of JSON with the efficiency of batch processing.
+ *
+ * <p>Run with: {@code java -cp <classpath> com.databricks.zerobus.examples.json.BatchIngestionExample}
+ *
+ * <p>Required environment variables:
+ *
+ * <ul>
+ *   <li>ZEROBUS_SERVER_ENDPOINT - The Zerobus server endpoint URL</li>
+ *   <li>DATABRICKS_WORKSPACE_URL - The Databricks workspace URL</li>
+ *   <li>ZEROBUS_TABLE_NAME - The target table name (catalog.schema.table)</li>
+ *   <li>DATABRICKS_CLIENT_ID - Service principal application ID</li>
+ *   <li>DATABRICKS_CLIENT_SECRET - Service principal secret</li>
+ * </ul>
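+ *
+ * <p>Core pattern used in {@code main} below (sketch):
+ *
+ * <pre>{@code
+ * List<JsonRecord> batch = new ArrayList<>();
+ * batch.add(JsonRecord.of("{\"device_name\": \"json-batch-0\", \"temp\": 20, \"humidity\": 50}"));
+ * Optional<Long> offset = stream.ingestRecordsOffset(batch); // empty for an empty batch
+ * if (offset.isPresent()) {
+ *   stream.waitForOffset(offset.get()); // one acknowledgment covers the whole batch
+ * }
+ * }</pre>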
+ */ +public class BatchIngestionExample { + + private static final int TOTAL_RECORDS = 10000; + private static final int BATCH_SIZE = 100; + + public static void main(String[] args) throws Exception { + String serverEndpoint = System.getenv("ZEROBUS_SERVER_ENDPOINT"); + String workspaceUrl = System.getenv("DATABRICKS_WORKSPACE_URL"); + String tableName = System.getenv("ZEROBUS_TABLE_NAME"); + String clientId = System.getenv("DATABRICKS_CLIENT_ID"); + String clientSecret = System.getenv("DATABRICKS_CLIENT_SECRET"); + + if (serverEndpoint == null || workspaceUrl == null || tableName == null + || clientId == null || clientSecret == null) { + System.err.println("Error: Required environment variables not set."); + System.err.println("Set: ZEROBUS_SERVER_ENDPOINT, DATABRICKS_WORKSPACE_URL, ZEROBUS_TABLE_NAME,"); + System.err.println(" DATABRICKS_CLIENT_ID, DATABRICKS_CLIENT_SECRET"); + System.exit(1); + } + + System.out.println("=== JSON Batch Ingestion Example ==="); + System.out.printf("Server: %s%n", serverEndpoint); + System.out.printf("Table: %s%n", tableName); + System.out.printf("Total records: %d (batch size: %d)%n%n", TOTAL_RECORDS, BATCH_SIZE); + + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + TableProperties tableProperties = new TableProperties<>( + tableName, + AirQuality.getDefaultInstance() + ); + + // Configure for JSON record ingestion + StreamConfigurationOptions options = StreamConfigurationOptions.builder() + .setMaxInflightRecords(50000) + .setRecordType(RecordType.JSON) + .build(); + + System.out.println("Creating stream..."); + ZerobusStream stream = sdk.createStream( + tableProperties, + clientId, + clientSecret, + options + ).join(); + System.out.println("Stream created successfully."); + + long startTime = System.currentTimeMillis(); + Optional lastOffset = Optional.empty(); + int batchCount = 0; + + try { + for (int batchStart = 0; batchStart < TOTAL_RECORDS; batchStart += BATCH_SIZE) { + int batchEnd = Math.min(batchStart + BATCH_SIZE, TOTAL_RECORDS); + + // Build batch of JsonRecords + List batch = new ArrayList<>(); + for (int i = batchStart; i < batchEnd; i++) { + String json = String.format( + "{\"device_name\": \"json-batch-%d\", \"temp\": %d, \"humidity\": %d}", + i % 100, + 15 + (i % 20), + 40 + (i % 50) + ); + batch.add(JsonRecord.of(json)); + } + + // Ingest batch - returns Optional with batch offset + lastOffset = stream.ingestRecordsOffset(batch); + batchCount++; + + if (batchCount % 10 == 0) { + System.out.printf("Ingested %d batches (%d records, offset: %d)%n", + batchCount, batchStart + BATCH_SIZE, lastOffset.orElse(-1L)); + } + } + + System.out.println("Waiting for final acknowledgment..."); + lastOffset.ifPresent(offset -> { + try { + stream.waitForOffset(offset); + } catch (ZerobusException e) { + throw new RuntimeException(e); + } + }); + + long endTime = System.currentTimeMillis(); + double durationSec = (endTime - startTime) / 1000.0; + double recordsPerSec = TOTAL_RECORDS / durationSec; + + System.out.println(); + System.out.println("=== Ingestion Complete ==="); + System.out.printf("Records: %,d in %d batches%n", TOTAL_RECORDS, batchCount); + System.out.printf("Duration: %.2f seconds%n", durationSec); + System.out.printf("Throughput: %,.0f records/sec%n", recordsPerSec); + + } finally { + stream.close(); + System.out.println("Stream closed."); + } + } +} diff --git a/examples/json/README.md b/examples/json/README.md new file mode 100644 index 0000000..2d980a2 --- /dev/null +++ b/examples/json/README.md @@ -0,0 +1,234 @@ +# 
JSON Examples + +This directory contains examples for ingesting data using JSON serialization. + +## Overview + +JSON ingestion provides: +- **No schema compilation** - Skip the protobuf generation step +- **Flexibility** - Easy to add or change fields +- **Human-readable** - Easy debugging and inspection +- **Interoperability** - Works with any JSON producer + +## Building and Running Examples + +The examples need to be compiled against the SDK. From the repository root: + +```bash +# 1. Build the SDK (if not already built) +mvn package -DskipTests + +# 2. Extract SDK classes for compilation +mkdir -p target/lib target/examples +cd target/lib && jar xf ../zerobus-ingest-sdk-0.2.0-jar-with-dependencies.jar && cd ../.. + +# 3. Compile the proto classes and examples +javac -cp "target/lib" -d target/examples \ + examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto.java \ + examples/json/SingleRecordExample.java \ + examples/json/BatchIngestionExample.java + +# 4. Set environment variables +export ZEROBUS_SERVER_ENDPOINT="https://.zerobus..cloud.databricks.com" +export DATABRICKS_WORKSPACE_URL="https://.cloud.databricks.com" +export ZEROBUS_TABLE_NAME="catalog.schema.table" +export DATABRICKS_CLIENT_ID="your-client-id" +export DATABRICKS_CLIENT_SECRET="your-client-secret" + +# 5. Run an example +java -cp "target/lib:target/examples" com.databricks.zerobus.examples.json.SingleRecordExample +``` + +## Prerequisites + +Set environment variables as shown above. + +## Important: JSON Stream Configuration + +JSON streams **require** `RecordType.JSON` in the configuration: + +```java +StreamConfigurationOptions options = StreamConfigurationOptions.builder() + .setRecordType(RecordType.JSON) // Required for JSON records + .setMaxInflightRecords(10000) + .build(); +``` + +Without this, you'll get: `NonRetriableException: Record type does not match stream configuration` + +## Examples + +### SingleRecordExample + +Ingests JSON records one at a time. + +**Run:** +```bash +java -cp "target/lib:target/examples" com.databricks.zerobus.examples.json.SingleRecordExample +``` + +**Expected output:** +``` +=== JSON Single Record Ingestion Example === +Server: https://.zerobus..cloud.databricks.com +Table: ..
+Records: 1000 + +Creating stream... +Stream created successfully. +Queued 100 records (offset: 99) +... +=== Ingestion Complete === +Records: 1,000 +Duration: 0.50 seconds +Throughput: 2,012 records/sec +Stream closed. +``` + +**Key patterns:** +```java +// Configure for JSON records +StreamConfigurationOptions options = StreamConfigurationOptions.builder() + .setRecordType(RecordType.JSON) + .build(); + +// Create JSON record +String json = String.format( + "{\"device_name\": \"sensor-%d\", \"temp\": %d, \"humidity\": %d}", + i, temp, humidity +); +JsonRecord record = JsonRecord.of(json); + +// Ingest using offset-based API +long offset = stream.ingestRecordOffset(record); +``` + +### BatchIngestionExample + +Ingests JSON records in batches for higher throughput. + +**Run:** +```bash +java -cp "target/lib:target/examples" com.databricks.zerobus.examples.json.BatchIngestionExample +``` + +**Expected output:** +``` +=== JSON Batch Ingestion Example === +... +Ingested 10 batches (1000 records, offset: 9) +... +=== Ingestion Complete === +Records: 10,000 in 100 batches +Duration: 0.88 seconds +Throughput: 11,364 records/sec +``` + +**Key patterns:** +```java +// Build batch of JsonRecords +List batch = new ArrayList<>(); +for (int i = 0; i < BATCH_SIZE; i++) { + String json = String.format("{\"device_name\": \"json-%d\", \"temp\": %d}", i, temp); + batch.add(JsonRecord.of(json)); +} + +// Ingest batch - returns Optional with batch offset +Optional offset = stream.ingestRecordsOffset(batch); +offset.ifPresent(stream::waitForOffset); +``` + +## JSON Format + +Records must be valid JSON strings matching the table schema: + +```json +{ + "device_name": "sensor-1", + "temp": 25, + "humidity": 65 +} +``` + +**Field names** must match the Delta table column names exactly. + +## Creating JSON Records + +### From String Literal + +```java +JsonRecord record = JsonRecord.of( + "{\"device_name\": \"sensor-1\", \"temp\": 25, \"humidity\": 65}" +); +``` + +### Using String.format + +```java +String json = String.format( + "{\"device_name\": \"%s\", \"temp\": %d, \"humidity\": %d}", + deviceName, temp, humidity +); +JsonRecord record = JsonRecord.of(json); +``` + +### Using a JSON Library (Gson) + +```java +import com.google.gson.Gson; + +Map data = new HashMap<>(); +data.put("device_name", "sensor-1"); +data.put("temp", 25); +data.put("humidity", 65); + +Gson gson = new Gson(); +JsonRecord record = JsonRecord.of(gson.toJson(data)); +``` + +### Using JsonRecord.fromObject + +```java +JsonRecord record = JsonRecord.fromObject(data, obj -> new Gson().toJson(obj)); +``` + +## Why JSON Instead of Proto? + +| Use Case | Recommendation | +|----------|----------------| +| Rapid prototyping | JSON | +| Data already in JSON format | JSON | +| Schema changes frequently | JSON | +| Production with stable schema | Proto | +| Maximum performance needed | Proto | +| Type safety required | Proto | + +## Note on TableProperties + +Even for JSON ingestion, the SDK requires a `TableProperties` with a Proto descriptor for stream setup and schema validation. The actual record data is sent as JSON. 
+ +```java +// Proto descriptor still needed for stream setup +TableProperties tableProperties = new TableProperties<>( + TABLE_NAME, + AirQuality.getDefaultInstance() +); + +// But records are sent as JSON +StreamConfigurationOptions options = StreamConfigurationOptions.builder() + .setRecordType(RecordType.JSON) + .build(); +``` + +## Performance Comparison + +Typical performance on a good network connection: + +| Method | Throughput | +|--------|------------| +| JSON single | ~2,000-5,000 rec/s | +| JSON batch | ~10,000-15,000 rec/s | +| Proto single | ~2,000-5,000 rec/s | +| Proto batch | ~10,000-15,000 rec/s | + +*Results vary based on record size, network, and server load.* diff --git a/examples/json/SingleRecordExample.java b/examples/json/SingleRecordExample.java new file mode 100644 index 0000000..70f5fc7 --- /dev/null +++ b/examples/json/SingleRecordExample.java @@ -0,0 +1,114 @@ +package com.databricks.zerobus.examples.json; + +import com.databricks.zerobus.*; +import com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality; + +/** + * Single-record JSON ingestion example. + * + *

Demonstrates ingesting JSON records one at a time. JSON ingestion is useful when:
+ *
+ * <ul>
+ *   <li>You don't want to define Protocol Buffer schemas</li>
+ *   <li>Your data is already in JSON format</li>
+ *   <li>You need flexible schema evolution</li>
+ * </ul>
+ *
+ * <p>Run with: {@code java -cp <classpath> com.databricks.zerobus.examples.json.SingleRecordExample}
+ *
+ * <p>Required environment variables:
+ *
+ * <ul>
+ *   <li>ZEROBUS_SERVER_ENDPOINT - The Zerobus server endpoint URL</li>
+ *   <li>DATABRICKS_WORKSPACE_URL - The Databricks workspace URL</li>
+ *   <li>ZEROBUS_TABLE_NAME - The target table name (catalog.schema.table)</li>
+ *   <li>DATABRICKS_CLIENT_ID - Service principal application ID</li>
+ *   <li>DATABRICKS_CLIENT_SECRET - Service principal secret</li>
+ * </ul>
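+ *
+ * <p>Core pattern used in {@code main} below (sketch; the stream must be built
+ * with {@code RecordType.JSON}):
+ *
+ * <pre>{@code
+ * StreamConfigurationOptions options = StreamConfigurationOptions.builder()
+ *     .setRecordType(RecordType.JSON)
+ *     .build();
+ * long offset = stream.ingestRecordOffset(
+ *     JsonRecord.of("{\"device_name\": \"json-sensor-0\", \"temp\": 20, \"humidity\": 50}"));
+ * stream.waitForOffset(offset);
+ * }</pre>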
+ */ +public class SingleRecordExample { + + private static final int RECORD_COUNT = 1000; + + public static void main(String[] args) throws Exception { + String serverEndpoint = System.getenv("ZEROBUS_SERVER_ENDPOINT"); + String workspaceUrl = System.getenv("DATABRICKS_WORKSPACE_URL"); + String tableName = System.getenv("ZEROBUS_TABLE_NAME"); + String clientId = System.getenv("DATABRICKS_CLIENT_ID"); + String clientSecret = System.getenv("DATABRICKS_CLIENT_SECRET"); + + if (serverEndpoint == null || workspaceUrl == null || tableName == null + || clientId == null || clientSecret == null) { + System.err.println("Error: Required environment variables not set."); + System.err.println("Set: ZEROBUS_SERVER_ENDPOINT, DATABRICKS_WORKSPACE_URL, ZEROBUS_TABLE_NAME,"); + System.err.println(" DATABRICKS_CLIENT_ID, DATABRICKS_CLIENT_SECRET"); + System.exit(1); + } + + System.out.println("=== JSON Single Record Ingestion Example ==="); + System.out.printf("Server: %s%n", serverEndpoint); + System.out.printf("Table: %s%n", tableName); + System.out.printf("Records: %d%n%n", RECORD_COUNT); + + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + // Note: TableProperties still needs a proto descriptor for schema validation, + // but the stream is configured for JSON records via RecordType.JSON. + TableProperties tableProperties = new TableProperties<>( + tableName, + AirQuality.getDefaultInstance() + ); + + // Configure for JSON record ingestion + StreamConfigurationOptions options = StreamConfigurationOptions.builder() + .setMaxInflightRecords(10000) + .setRecordType(RecordType.JSON) + .build(); + + System.out.println("Creating stream..."); + ZerobusStream stream = sdk.createStream( + tableProperties, + clientId, + clientSecret, + options + ).join(); + System.out.println("Stream created successfully."); + + long startTime = System.currentTimeMillis(); + long lastOffset = -1; + + try { + for (int i = 0; i < RECORD_COUNT; i++) { + // Create JSON record manually + String json = String.format( + "{\"device_name\": \"json-sensor-%d\", \"temp\": %d, \"humidity\": %d}", + i % 100, + 15 + (i % 20), + 40 + (i % 50) + ); + + // Wrap as JsonRecord and ingest + lastOffset = stream.ingestRecordOffset(JsonRecord.of(json)); + + if ((i + 1) % 100 == 0) { + System.out.printf("Queued %d records (offset: %d)%n", i + 1, lastOffset); + } + } + + System.out.println("Waiting for acknowledgment..."); + stream.waitForOffset(lastOffset); + + long endTime = System.currentTimeMillis(); + double durationSec = (endTime - startTime) / 1000.0; + double recordsPerSec = RECORD_COUNT / durationSec; + + System.out.println(); + System.out.println("=== Ingestion Complete ==="); + System.out.printf("Records: %,d%n", RECORD_COUNT); + System.out.printf("Duration: %.2f seconds%n", durationSec); + System.out.printf("Throughput: %,.0f records/sec%n", recordsPerSec); + + } finally { + stream.close(); + System.out.println("Stream closed."); + } + } +} diff --git a/examples/json/com/databricks/zerobus/examples/json/BatchIngestionExample.class b/examples/json/com/databricks/zerobus/examples/json/BatchIngestionExample.class new file mode 100644 index 0000000..dad39fa Binary files /dev/null and b/examples/json/com/databricks/zerobus/examples/json/BatchIngestionExample.class differ diff --git a/examples/json/com/databricks/zerobus/examples/json/SingleRecordExample.class b/examples/json/com/databricks/zerobus/examples/json/SingleRecordExample.class new file mode 100644 index 0000000..22cc94a Binary files /dev/null and 
b/examples/json/com/databricks/zerobus/examples/json/SingleRecordExample.class differ diff --git a/examples/proto/BatchIngestionExample.java b/examples/proto/BatchIngestionExample.java new file mode 100644 index 0000000..cfbfe96 --- /dev/null +++ b/examples/proto/BatchIngestionExample.java @@ -0,0 +1,131 @@ +package com.databricks.zerobus.examples.proto; + +import com.databricks.zerobus.*; +import com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +/** + * Batch Protocol Buffer ingestion example. + * + *

Demonstrates ingesting records in batches using the offset-based API.
+ * Batch ingestion provides:
+ *
+ * <ul>
+ *   <li>Higher throughput for bulk data</li>
+ *   <li>Reduced per-record overhead</li>
+ *   <li>Atomic batch acknowledgment</li>
+ * </ul>
+ *
+ * <p>Run with: {@code java -cp <classpath> com.databricks.zerobus.examples.proto.BatchIngestionExample}
+ *
+ * <p>Required environment variables:
+ *
+ * <ul>
+ *   <li>ZEROBUS_SERVER_ENDPOINT - The Zerobus server endpoint URL</li>
+ *   <li>DATABRICKS_WORKSPACE_URL - The Databricks workspace URL</li>
+ *   <li>ZEROBUS_TABLE_NAME - The target table name (catalog.schema.table)</li>
+ *   <li>DATABRICKS_CLIENT_ID - Service principal application ID</li>
+ *   <li>DATABRICKS_CLIENT_SECRET - Service principal secret</li>
+ * </ul>
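+ *
+ * <p>Core pattern used in {@code main} below (sketch):
+ *
+ * <pre>{@code
+ * List<ProtoRecord<AirQuality>> batch = new ArrayList<>();
+ * batch.add(ProtoRecord.of(record)); // wrap each protobuf message
+ * Optional<Long> offset = stream.ingestRecordsOffset(batch);
+ * if (offset.isPresent()) {
+ *   stream.waitForOffset(offset.get()); // one acknowledgment covers the whole batch
+ * }
+ * }</pre>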
+ */ +public class BatchIngestionExample { + + // Batch configuration + private static final int TOTAL_RECORDS = 10000; + private static final int BATCH_SIZE = 100; + + public static void main(String[] args) throws Exception { + String serverEndpoint = System.getenv("ZEROBUS_SERVER_ENDPOINT"); + String workspaceUrl = System.getenv("DATABRICKS_WORKSPACE_URL"); + String tableName = System.getenv("ZEROBUS_TABLE_NAME"); + String clientId = System.getenv("DATABRICKS_CLIENT_ID"); + String clientSecret = System.getenv("DATABRICKS_CLIENT_SECRET"); + + if (serverEndpoint == null || workspaceUrl == null || tableName == null + || clientId == null || clientSecret == null) { + System.err.println("Error: Required environment variables not set."); + System.err.println("Set: ZEROBUS_SERVER_ENDPOINT, DATABRICKS_WORKSPACE_URL, ZEROBUS_TABLE_NAME,"); + System.err.println(" DATABRICKS_CLIENT_ID, DATABRICKS_CLIENT_SECRET"); + System.exit(1); + } + + System.out.println("=== Proto Batch Ingestion Example ==="); + System.out.printf("Server: %s%n", serverEndpoint); + System.out.printf("Table: %s%n", tableName); + System.out.printf("Total records: %d (batch size: %d)%n%n", TOTAL_RECORDS, BATCH_SIZE); + + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + TableProperties tableProperties = new TableProperties<>( + tableName, + AirQuality.getDefaultInstance() + ); + + StreamConfigurationOptions options = StreamConfigurationOptions.builder() + .setMaxInflightRecords(50000) + .build(); + + System.out.println("Creating stream..."); + ZerobusStream stream = sdk.createStream( + tableProperties, + clientId, + clientSecret, + options + ).join(); + System.out.println("Stream created successfully."); + + long startTime = System.currentTimeMillis(); + Optional lastOffset = Optional.empty(); + int batchCount = 0; + + try { + // Process in batches + for (int batchStart = 0; batchStart < TOTAL_RECORDS; batchStart += BATCH_SIZE) { + int batchEnd = Math.min(batchStart + BATCH_SIZE, TOTAL_RECORDS); + + // Build batch of ProtoRecords + List> batch = new ArrayList<>(); + for (int i = batchStart; i < batchEnd; i++) { + AirQuality record = AirQuality.newBuilder() + .setDeviceName("batch-sensor-" + (i % 100)) + .setTemp(15 + (i % 20)) + .setHumidity(40 + (i % 50)) + .build(); + batch.add(ProtoRecord.of(record)); + } + + // Ingest batch - returns Optional with batch offset + lastOffset = stream.ingestRecordsOffset(batch); + batchCount++; + + if (batchCount % 10 == 0) { + System.out.printf("Ingested %d batches (%d records, offset: %d)%n", + batchCount, batchStart + BATCH_SIZE, lastOffset.orElse(-1L)); + } + } + + System.out.println("Waiting for final acknowledgment..."); + lastOffset.ifPresent(offset -> { + try { + stream.waitForOffset(offset); + } catch (ZerobusException e) { + throw new RuntimeException(e); + } + }); + + long endTime = System.currentTimeMillis(); + double durationSec = (endTime - startTime) / 1000.0; + double recordsPerSec = TOTAL_RECORDS / durationSec; + + System.out.println(); + System.out.println("=== Ingestion Complete ==="); + System.out.printf("Records: %,d in %d batches%n", TOTAL_RECORDS, batchCount); + System.out.printf("Duration: %.2f seconds%n", durationSec); + System.out.printf("Throughput: %,.0f records/sec%n", recordsPerSec); + + } finally { + stream.close(); + System.out.println("Stream closed."); + } + } +} diff --git a/examples/proto/README.md b/examples/proto/README.md new file mode 100644 index 0000000..1b45352 --- /dev/null +++ b/examples/proto/README.md @@ -0,0 +1,216 @@ +# Protocol 
Buffer Examples + +This directory contains examples for ingesting data using Protocol Buffer serialization. + +## Overview + +Protocol Buffers provide: +- **Type safety** - Compile-time validation of record structure +- **Compact encoding** - Smaller payload than JSON +- **High performance** - Fast serialization/deserialization +- **Schema evolution** - Backward-compatible schema changes + +## Prerequisites + +1. **Install Protocol Buffers compiler:** + ```bash + # macOS + brew install protobuf + + # Ubuntu/Debian + sudo apt-get install protobuf-compiler + + # Or download from https://github.com/protocolbuffers/protobuf/releases + ``` + +2. **Generate Java classes:** + ```bash + protoc --java_out=. air_quality.proto + ``` + +3. **Set environment variables:** + ```bash + export DATABRICKS_CLIENT_ID="your-client-id" + export DATABRICKS_CLIENT_SECRET="your-client-secret" + ``` + +## Building and Running Examples + +The examples need to be compiled against the SDK. From the repository root: + +```bash +# 1. Build the SDK (if not already built) +mvn package -DskipTests + +# 2. Extract SDK classes for compilation +mkdir -p target/lib target/examples +cd target/lib && jar xf ../zerobus-ingest-sdk-0.2.0-jar-with-dependencies.jar && cd ../.. + +# 3. Compile the proto classes and examples +javac -cp "target/lib" -d target/examples \ + examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto.java \ + examples/proto/SingleRecordExample.java \ + examples/proto/BatchIngestionExample.java + +# 4. Set environment variables +export ZEROBUS_SERVER_ENDPOINT="https://.zerobus..cloud.databricks.com" +export DATABRICKS_WORKSPACE_URL="https://.cloud.databricks.com" +export ZEROBUS_TABLE_NAME="catalog.schema.table" +export DATABRICKS_CLIENT_ID="your-client-id" +export DATABRICKS_CLIENT_SECRET="your-client-secret" + +# 5. Run an example +java -cp "target/lib:target/examples" com.databricks.zerobus.examples.proto.SingleRecordExample +``` + +## Examples + +### SingleRecordExample + +Ingests Protocol Buffer records one at a time using the offset-based API. + +**Run:** +```bash +java -cp "target/lib:target/examples" com.databricks.zerobus.examples.proto.SingleRecordExample +``` + +**Expected output:** +``` +=== Proto Single Record Ingestion Example === +Server: https://.zerobus..cloud.databricks.com +Table: ..
+Records: 1000 + +Creating stream... +Stream created successfully. +Queued 100 records (offset: 99) +Queued 200 records (offset: 199) +... +Waiting for acknowledgment... + +=== Ingestion Complete === +Records: 1,000 +Duration: 0.52 seconds +Throughput: 1,912 records/sec +Stream closed. +``` + +**Key patterns:** +```java +// Create stream with Proto descriptor +TableProperties tableProperties = new TableProperties<>( + TABLE_NAME, + AirQuality.getDefaultInstance() +); + +// Ingest using offset-based API +long offset = stream.ingestRecordOffset(record); + +// Wait for acknowledgment when needed +stream.waitForOffset(offset); +``` + +### BatchIngestionExample + +Ingests Protocol Buffer records in batches for higher throughput. + +**Run:** +```bash +java -cp "target/lib:target/examples" com.databricks.zerobus.examples.proto.BatchIngestionExample +``` + +**Expected output:** +``` +=== Proto Batch Ingestion Example === +... +Ingested 10 batches (1000 records, offset: 9) +Ingested 20 batches (2000 records, offset: 19) +... +=== Ingestion Complete === +Records: 10,000 in 100 batches +Duration: 0.91 seconds +Throughput: 11,050 records/sec +``` + +**Key patterns:** +```java +// Build batch of ProtoRecords +List> batch = new ArrayList<>(); +for (int i = 0; i < BATCH_SIZE; i++) { + batch.add(ProtoRecord.of(record)); +} + +// Ingest batch - returns Optional with batch offset +Optional offset = stream.ingestRecordsOffset(batch); +offset.ifPresent(stream::waitForOffset); +``` + +## Schema + +The examples use this Protocol Buffer schema (`air_quality.proto`): + +```protobuf +syntax = "proto2"; + +package com.databricks.zerobus.examples.proto; + +option java_package = "com.databricks.zerobus.examples.proto"; +option java_outer_classname = "AirQualityProto"; + +message AirQuality { + optional string device_name = 1; + optional int32 temp = 2; + optional int64 humidity = 3; +} +``` + +## Type Mappings + +| Delta Type | Proto2 Type | +|-----------|-------------| +| STRING | string | +| INT | int32 | +| BIGINT | int64 | +| FLOAT | float | +| DOUBLE | double | +| BOOLEAN | bool | +| BINARY | bytes | + +## Adapting for Your Table + +1. **Create your proto schema:** + ```protobuf + syntax = "proto2"; + + package com.example; + option java_package = "com.example.proto"; + option java_outer_classname = "MyRecord"; + + message MyData { + optional string field1 = 1; + optional int32 field2 = 2; + // ... add fields matching your table schema + } + ``` + +2. **Generate Java classes:** + ```bash + protoc --java_out=. my_record.proto + ``` + +3. **Update the example code:** + ```java + import com.example.proto.MyRecord.MyData; + + TableProperties tableProperties = new TableProperties<>( + "catalog.schema.my_table", + MyData.getDefaultInstance() + ); + ``` + +## Performance Tips + +1. **Use batch ingestion** for higher throughput +2. **Increase `maxInflightRecords`** for better pipelining +3. **Use offset-based API** to avoid CompletableFuture overhead +4. **Reuse record builders** when possible diff --git a/examples/proto/SingleRecordExample.java b/examples/proto/SingleRecordExample.java new file mode 100644 index 0000000..bbac785 --- /dev/null +++ b/examples/proto/SingleRecordExample.java @@ -0,0 +1,118 @@ +package com.databricks.zerobus.examples.proto; + +import com.databricks.zerobus.*; +import com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality; + +/** + * Single-record Protocol Buffer ingestion example. + * + *

Demonstrates ingesting records one at a time using the offset-based API.
+ * This is the recommended approach for most use cases as it provides:
+ *
+ * <ul>
+ *   <li>Fine-grained control over acknowledgment</li>
+ *   <li>Lower memory overhead than the CompletableFuture-based API</li>
+ *   <li>Clear backpressure semantics</li>
+ * </ul>
+ *
+ * <p>Run with: {@code java -cp <classpath> com.databricks.zerobus.examples.proto.SingleRecordExample}
+ *
+ * <p>Required environment variables:
+ *
+ * <ul>
+ *   <li>ZEROBUS_SERVER_ENDPOINT - The Zerobus server endpoint URL</li>
+ *   <li>DATABRICKS_WORKSPACE_URL - The Databricks workspace URL</li>
+ *   <li>ZEROBUS_TABLE_NAME - The target table name (catalog.schema.table)</li>
+ *   <li>DATABRICKS_CLIENT_ID - Service principal application ID</li>
+ *   <li>DATABRICKS_CLIENT_SECRET - Service principal secret</li>
+ * </ul>
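+ *
+ * <p>Core pattern used in {@code main} below (sketch):
+ *
+ * <pre>{@code
+ * long offset = stream.ingestRecordOffset(record); // queues and returns immediately
+ * stream.waitForOffset(offset);                    // block until acknowledged
+ * }</pre>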
+ */ +public class SingleRecordExample { + + // Number of records to ingest + private static final int RECORD_COUNT = 1000; + + public static void main(String[] args) throws Exception { + // Read configuration from environment + String serverEndpoint = System.getenv("ZEROBUS_SERVER_ENDPOINT"); + String workspaceUrl = System.getenv("DATABRICKS_WORKSPACE_URL"); + String tableName = System.getenv("ZEROBUS_TABLE_NAME"); + String clientId = System.getenv("DATABRICKS_CLIENT_ID"); + String clientSecret = System.getenv("DATABRICKS_CLIENT_SECRET"); + + if (serverEndpoint == null || workspaceUrl == null || tableName == null + || clientId == null || clientSecret == null) { + System.err.println("Error: Required environment variables not set."); + System.err.println("Set: ZEROBUS_SERVER_ENDPOINT, DATABRICKS_WORKSPACE_URL, ZEROBUS_TABLE_NAME,"); + System.err.println(" DATABRICKS_CLIENT_ID, DATABRICKS_CLIENT_SECRET"); + System.exit(1); + } + + System.out.println("=== Proto Single Record Ingestion Example ==="); + System.out.printf("Server: %s%n", serverEndpoint); + System.out.printf("Table: %s%n", tableName); + System.out.printf("Records: %d%n%n", RECORD_COUNT); + + // Initialize SDK + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + // Configure table properties with protobuf descriptor + TableProperties tableProperties = new TableProperties<>( + tableName, + AirQuality.getDefaultInstance() + ); + + // Configure stream options + StreamConfigurationOptions options = StreamConfigurationOptions.builder() + .setMaxInflightRecords(10000) + .build(); + + // Create stream + System.out.println("Creating stream..."); + ZerobusStream stream = sdk.createStream( + tableProperties, + clientId, + clientSecret, + options + ).join(); + System.out.println("Stream created successfully."); + + long startTime = System.currentTimeMillis(); + long lastOffset = -1; + + try { + // Ingest records one at a time + for (int i = 0; i < RECORD_COUNT; i++) { + AirQuality record = AirQuality.newBuilder() + .setDeviceName("sensor-" + (i % 100)) + .setTemp(15 + (i % 20)) // Temperature: 15-34 + .setHumidity(40 + (i % 50)) // Humidity: 40-89 + .build(); + + // Offset-based API - returns immediately after queuing + lastOffset = stream.ingestRecordOffset(record); + + // Progress reporting + if ((i + 1) % 100 == 0) { + System.out.printf("Queued %d records (offset: %d)%n", i + 1, lastOffset); + } + } + + // Wait for final acknowledgment + System.out.println("Waiting for acknowledgment..."); + stream.waitForOffset(lastOffset); + + long endTime = System.currentTimeMillis(); + double durationSec = (endTime - startTime) / 1000.0; + double recordsPerSec = RECORD_COUNT / durationSec; + + System.out.println(); + System.out.println("=== Ingestion Complete ==="); + System.out.printf("Records: %,d%n", RECORD_COUNT); + System.out.printf("Duration: %.2f seconds%n", durationSec); + System.out.printf("Throughput: %,.0f records/sec%n", recordsPerSec); + + } finally { + stream.close(); + System.out.println("Stream closed."); + } + } +} diff --git a/examples/proto/air_quality.proto b/examples/proto/air_quality.proto new file mode 100644 index 0000000..7379375 --- /dev/null +++ b/examples/proto/air_quality.proto @@ -0,0 +1,14 @@ +syntax = "proto2"; + +package com.databricks.zerobus.examples.proto; + +option java_package = "com.databricks.zerobus.examples.proto"; +option java_outer_classname = "AirQualityProto"; + +// Air quality sensor reading matching the Databricks table schema: +// device_name: STRING, temp: INT, humidity: BIGINT +message 
AirQuality { + optional string device_name = 1; + optional int32 temp = 2; + optional int64 humidity = 3; +} diff --git a/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQuality$1.class b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQuality$1.class new file mode 100644 index 0000000..2f14ac1 Binary files /dev/null and b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQuality$1.class differ diff --git a/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQuality$Builder.class b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQuality$Builder.class new file mode 100644 index 0000000..8aa507d Binary files /dev/null and b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQuality$Builder.class differ diff --git a/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQuality.class b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQuality.class new file mode 100644 index 0000000..12f0b55 Binary files /dev/null and b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQuality.class differ diff --git a/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQualityOrBuilder.class b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQualityOrBuilder.class new file mode 100644 index 0000000..87fdd3e Binary files /dev/null and b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto$AirQualityOrBuilder.class differ diff --git a/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto.class b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto.class new file mode 100644 index 0000000..c894bf1 Binary files /dev/null and b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto.class differ diff --git a/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto.java b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto.java new file mode 100644 index 0000000..b376ab5 --- /dev/null +++ b/examples/proto/com/databricks/zerobus/examples/proto/AirQualityProto.java @@ -0,0 +1,814 @@ +// Generated by the protocol buffer compiler. DO NOT EDIT! +// NO CHECKED-IN PROTOBUF GENCODE +// source: air_quality.proto +// Protobuf Java Version: 4.33.0 + +package com.databricks.zerobus.examples.proto; + +@com.google.protobuf.Generated +public final class AirQualityProto extends com.google.protobuf.GeneratedFile { + private AirQualityProto() {} + static { + com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion( + com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC, + /* major= */ 4, + /* minor= */ 33, + /* patch= */ 0, + /* suffix= */ "", + "AirQualityProto"); + } + public static void registerAllExtensions( + com.google.protobuf.ExtensionRegistryLite registry) { + } + + public static void registerAllExtensions( + com.google.protobuf.ExtensionRegistry registry) { + registerAllExtensions( + (com.google.protobuf.ExtensionRegistryLite) registry); + } + public interface AirQualityOrBuilder extends + // @@protoc_insertion_point(interface_extends:com.databricks.zerobus.examples.proto.AirQuality) + com.google.protobuf.MessageOrBuilder { + + /** + * optional string device_name = 1; + * @return Whether the deviceName field is set. + */ + boolean hasDeviceName(); + /** + * optional string device_name = 1; + * @return The deviceName. 
+ */ + java.lang.String getDeviceName(); + /** + * optional string device_name = 1; + * @return The bytes for deviceName. + */ + com.google.protobuf.ByteString + getDeviceNameBytes(); + + /** + * optional int32 temp = 2; + * @return Whether the temp field is set. + */ + boolean hasTemp(); + /** + * optional int32 temp = 2; + * @return The temp. + */ + int getTemp(); + + /** + * optional int64 humidity = 3; + * @return Whether the humidity field is set. + */ + boolean hasHumidity(); + /** + * optional int64 humidity = 3; + * @return The humidity. + */ + long getHumidity(); + } + /** + *
+   * <pre>
+   * Air quality sensor reading matching the Databricks table schema:
+   * device_name: STRING, temp: INT, humidity: BIGINT
+   * </pre>
+ * + * Protobuf type {@code com.databricks.zerobus.examples.proto.AirQuality} + */ + public static final class AirQuality extends + com.google.protobuf.GeneratedMessage implements + // @@protoc_insertion_point(message_implements:com.databricks.zerobus.examples.proto.AirQuality) + AirQualityOrBuilder { + private static final long serialVersionUID = 0L; + static { + com.google.protobuf.RuntimeVersion.validateProtobufGencodeVersion( + com.google.protobuf.RuntimeVersion.RuntimeDomain.PUBLIC, + /* major= */ 4, + /* minor= */ 33, + /* patch= */ 0, + /* suffix= */ "", + "AirQuality"); + } + // Use AirQuality.newBuilder() to construct. + private AirQuality(com.google.protobuf.GeneratedMessage.Builder builder) { + super(builder); + } + private AirQuality() { + deviceName_ = ""; + } + + public static final com.google.protobuf.Descriptors.Descriptor + getDescriptor() { + return com.databricks.zerobus.examples.proto.AirQualityProto.internal_static_com_databricks_zerobus_examples_proto_AirQuality_descriptor; + } + + @java.lang.Override + protected com.google.protobuf.GeneratedMessage.FieldAccessorTable + internalGetFieldAccessorTable() { + return com.databricks.zerobus.examples.proto.AirQualityProto.internal_static_com_databricks_zerobus_examples_proto_AirQuality_fieldAccessorTable + .ensureFieldAccessorsInitialized( + com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality.class, com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality.Builder.class); + } + + private int bitField0_; + public static final int DEVICE_NAME_FIELD_NUMBER = 1; + @SuppressWarnings("serial") + private volatile java.lang.Object deviceName_ = ""; + /** + * optional string device_name = 1; + * @return Whether the deviceName field is set. + */ + @java.lang.Override + public boolean hasDeviceName() { + return ((bitField0_ & 0x00000001) != 0); + } + /** + * optional string device_name = 1; + * @return The deviceName. + */ + @java.lang.Override + public java.lang.String getDeviceName() { + java.lang.Object ref = deviceName_; + if (ref instanceof java.lang.String) { + return (java.lang.String) ref; + } else { + com.google.protobuf.ByteString bs = + (com.google.protobuf.ByteString) ref; + java.lang.String s = bs.toStringUtf8(); + if (bs.isValidUtf8()) { + deviceName_ = s; + } + return s; + } + } + /** + * optional string device_name = 1; + * @return The bytes for deviceName. + */ + @java.lang.Override + public com.google.protobuf.ByteString + getDeviceNameBytes() { + java.lang.Object ref = deviceName_; + if (ref instanceof java.lang.String) { + com.google.protobuf.ByteString b = + com.google.protobuf.ByteString.copyFromUtf8( + (java.lang.String) ref); + deviceName_ = b; + return b; + } else { + return (com.google.protobuf.ByteString) ref; + } + } + + public static final int TEMP_FIELD_NUMBER = 2; + private int temp_ = 0; + /** + * optional int32 temp = 2; + * @return Whether the temp field is set. + */ + @java.lang.Override + public boolean hasTemp() { + return ((bitField0_ & 0x00000002) != 0); + } + /** + * optional int32 temp = 2; + * @return The temp. + */ + @java.lang.Override + public int getTemp() { + return temp_; + } + + public static final int HUMIDITY_FIELD_NUMBER = 3; + private long humidity_ = 0L; + /** + * optional int64 humidity = 3; + * @return Whether the humidity field is set. + */ + @java.lang.Override + public boolean hasHumidity() { + return ((bitField0_ & 0x00000004) != 0); + } + /** + * optional int64 humidity = 3; + * @return The humidity. 
+ */ + @java.lang.Override + public long getHumidity() { + return humidity_; + } + + private byte memoizedIsInitialized = -1; + @java.lang.Override + public final boolean isInitialized() { + byte isInitialized = memoizedIsInitialized; + if (isInitialized == 1) return true; + if (isInitialized == 0) return false; + + memoizedIsInitialized = 1; + return true; + } + + @java.lang.Override + public void writeTo(com.google.protobuf.CodedOutputStream output) + throws java.io.IOException { + if (((bitField0_ & 0x00000001) != 0)) { + com.google.protobuf.GeneratedMessage.writeString(output, 1, deviceName_); + } + if (((bitField0_ & 0x00000002) != 0)) { + output.writeInt32(2, temp_); + } + if (((bitField0_ & 0x00000004) != 0)) { + output.writeInt64(3, humidity_); + } + getUnknownFields().writeTo(output); + } + + @java.lang.Override + public int getSerializedSize() { + int size = memoizedSize; + if (size != -1) return size; + + size = 0; + if (((bitField0_ & 0x00000001) != 0)) { + size += com.google.protobuf.GeneratedMessage.computeStringSize(1, deviceName_); + } + if (((bitField0_ & 0x00000002) != 0)) { + size += com.google.protobuf.CodedOutputStream + .computeInt32Size(2, temp_); + } + if (((bitField0_ & 0x00000004) != 0)) { + size += com.google.protobuf.CodedOutputStream + .computeInt64Size(3, humidity_); + } + size += getUnknownFields().getSerializedSize(); + memoizedSize = size; + return size; + } + + @java.lang.Override + public boolean equals(final java.lang.Object obj) { + if (obj == this) { + return true; + } + if (!(obj instanceof com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality)) { + return super.equals(obj); + } + com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality other = (com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality) obj; + + if (hasDeviceName() != other.hasDeviceName()) return false; + if (hasDeviceName()) { + if (!getDeviceName() + .equals(other.getDeviceName())) return false; + } + if (hasTemp() != other.hasTemp()) return false; + if (hasTemp()) { + if (getTemp() + != other.getTemp()) return false; + } + if (hasHumidity() != other.hasHumidity()) return false; + if (hasHumidity()) { + if (getHumidity() + != other.getHumidity()) return false; + } + if (!getUnknownFields().equals(other.getUnknownFields())) return false; + return true; + } + + @java.lang.Override + public int hashCode() { + if (memoizedHashCode != 0) { + return memoizedHashCode; + } + int hash = 41; + hash = (19 * hash) + getDescriptor().hashCode(); + if (hasDeviceName()) { + hash = (37 * hash) + DEVICE_NAME_FIELD_NUMBER; + hash = (53 * hash) + getDeviceName().hashCode(); + } + if (hasTemp()) { + hash = (37 * hash) + TEMP_FIELD_NUMBER; + hash = (53 * hash) + getTemp(); + } + if (hasHumidity()) { + hash = (37 * hash) + HUMIDITY_FIELD_NUMBER; + hash = (53 * hash) + com.google.protobuf.Internal.hashLong( + getHumidity()); + } + hash = (29 * hash) + getUnknownFields().hashCode(); + memoizedHashCode = hash; + return hash; + } + + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseFrom( + java.nio.ByteBuffer data) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseFrom( + java.nio.ByteBuffer data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + public static 
com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseFrom( + com.google.protobuf.ByteString data) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseFrom( + com.google.protobuf.ByteString data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseFrom(byte[] data) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseFrom( + byte[] data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseFrom(java.io.InputStream input) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessage + .parseWithIOException(PARSER, input); + } + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseFrom( + java.io.InputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessage + .parseWithIOException(PARSER, input, extensionRegistry); + } + + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseDelimitedFrom(java.io.InputStream input) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessage + .parseDelimitedWithIOException(PARSER, input); + } + + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseDelimitedFrom( + java.io.InputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessage + .parseDelimitedWithIOException(PARSER, input, extensionRegistry); + } + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseFrom( + com.google.protobuf.CodedInputStream input) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessage + .parseWithIOException(PARSER, input); + } + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality parseFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessage + .parseWithIOException(PARSER, input, extensionRegistry); + } + + @java.lang.Override + public Builder newBuilderForType() { return newBuilder(); } + public static Builder newBuilder() { + return DEFAULT_INSTANCE.toBuilder(); + } + public static Builder newBuilder(com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality prototype) { + return DEFAULT_INSTANCE.toBuilder().mergeFrom(prototype); + } + @java.lang.Override + public Builder toBuilder() { + return this == DEFAULT_INSTANCE + ? new Builder() : new Builder().mergeFrom(this); + } + + @java.lang.Override + protected Builder newBuilderForType( + com.google.protobuf.GeneratedMessage.BuilderParent parent) { + Builder builder = new Builder(parent); + return builder; + } + /** + *
+     * <pre>
+     * Air quality sensor reading matching the Databricks table schema:
+     * device_name: STRING, temp: INT, humidity: BIGINT
+     * </pre>
+ * + * Protobuf type {@code com.databricks.zerobus.examples.proto.AirQuality} + */ + public static final class Builder extends + com.google.protobuf.GeneratedMessage.Builder implements + // @@protoc_insertion_point(builder_implements:com.databricks.zerobus.examples.proto.AirQuality) + com.databricks.zerobus.examples.proto.AirQualityProto.AirQualityOrBuilder { + public static final com.google.protobuf.Descriptors.Descriptor + getDescriptor() { + return com.databricks.zerobus.examples.proto.AirQualityProto.internal_static_com_databricks_zerobus_examples_proto_AirQuality_descriptor; + } + + @java.lang.Override + protected com.google.protobuf.GeneratedMessage.FieldAccessorTable + internalGetFieldAccessorTable() { + return com.databricks.zerobus.examples.proto.AirQualityProto.internal_static_com_databricks_zerobus_examples_proto_AirQuality_fieldAccessorTable + .ensureFieldAccessorsInitialized( + com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality.class, com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality.Builder.class); + } + + // Construct using com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality.newBuilder() + private Builder() { + + } + + private Builder( + com.google.protobuf.GeneratedMessage.BuilderParent parent) { + super(parent); + + } + @java.lang.Override + public Builder clear() { + super.clear(); + bitField0_ = 0; + deviceName_ = ""; + temp_ = 0; + humidity_ = 0L; + return this; + } + + @java.lang.Override + public com.google.protobuf.Descriptors.Descriptor + getDescriptorForType() { + return com.databricks.zerobus.examples.proto.AirQualityProto.internal_static_com_databricks_zerobus_examples_proto_AirQuality_descriptor; + } + + @java.lang.Override + public com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality getDefaultInstanceForType() { + return com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality.getDefaultInstance(); + } + + @java.lang.Override + public com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality build() { + com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality result = buildPartial(); + if (!result.isInitialized()) { + throw newUninitializedMessageException(result); + } + return result; + } + + @java.lang.Override + public com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality buildPartial() { + com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality result = new com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality(this); + if (bitField0_ != 0) { buildPartial0(result); } + onBuilt(); + return result; + } + + private void buildPartial0(com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality result) { + int from_bitField0_ = bitField0_; + int to_bitField0_ = 0; + if (((from_bitField0_ & 0x00000001) != 0)) { + result.deviceName_ = deviceName_; + to_bitField0_ |= 0x00000001; + } + if (((from_bitField0_ & 0x00000002) != 0)) { + result.temp_ = temp_; + to_bitField0_ |= 0x00000002; + } + if (((from_bitField0_ & 0x00000004) != 0)) { + result.humidity_ = humidity_; + to_bitField0_ |= 0x00000004; + } + result.bitField0_ |= to_bitField0_; + } + + @java.lang.Override + public Builder mergeFrom(com.google.protobuf.Message other) { + if (other instanceof com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality) { + return mergeFrom((com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality)other); + } else { + super.mergeFrom(other); + return this; + } + } + + public Builder 
mergeFrom(com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality other) { + if (other == com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality.getDefaultInstance()) return this; + if (other.hasDeviceName()) { + deviceName_ = other.deviceName_; + bitField0_ |= 0x00000001; + onChanged(); + } + if (other.hasTemp()) { + setTemp(other.getTemp()); + } + if (other.hasHumidity()) { + setHumidity(other.getHumidity()); + } + this.mergeUnknownFields(other.getUnknownFields()); + onChanged(); + return this; + } + + @java.lang.Override + public final boolean isInitialized() { + return true; + } + + @java.lang.Override + public Builder mergeFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + if (extensionRegistry == null) { + throw new java.lang.NullPointerException(); + } + try { + boolean done = false; + while (!done) { + int tag = input.readTag(); + switch (tag) { + case 0: + done = true; + break; + case 10: { + deviceName_ = input.readBytes(); + bitField0_ |= 0x00000001; + break; + } // case 10 + case 16: { + temp_ = input.readInt32(); + bitField0_ |= 0x00000002; + break; + } // case 16 + case 24: { + humidity_ = input.readInt64(); + bitField0_ |= 0x00000004; + break; + } // case 24 + default: { + if (!super.parseUnknownField(input, extensionRegistry, tag)) { + done = true; // was an endgroup tag + } + break; + } // default: + } // switch (tag) + } // while (!done) + } catch (com.google.protobuf.InvalidProtocolBufferException e) { + throw e.unwrapIOException(); + } finally { + onChanged(); + } // finally + return this; + } + private int bitField0_; + + private java.lang.Object deviceName_ = ""; + /** + * optional string device_name = 1; + * @return Whether the deviceName field is set. + */ + public boolean hasDeviceName() { + return ((bitField0_ & 0x00000001) != 0); + } + /** + * optional string device_name = 1; + * @return The deviceName. + */ + public java.lang.String getDeviceName() { + java.lang.Object ref = deviceName_; + if (!(ref instanceof java.lang.String)) { + com.google.protobuf.ByteString bs = + (com.google.protobuf.ByteString) ref; + java.lang.String s = bs.toStringUtf8(); + if (bs.isValidUtf8()) { + deviceName_ = s; + } + return s; + } else { + return (java.lang.String) ref; + } + } + /** + * optional string device_name = 1; + * @return The bytes for deviceName. + */ + public com.google.protobuf.ByteString + getDeviceNameBytes() { + java.lang.Object ref = deviceName_; + if (ref instanceof String) { + com.google.protobuf.ByteString b = + com.google.protobuf.ByteString.copyFromUtf8( + (java.lang.String) ref); + deviceName_ = b; + return b; + } else { + return (com.google.protobuf.ByteString) ref; + } + } + /** + * optional string device_name = 1; + * @param value The deviceName to set. + * @return This builder for chaining. + */ + public Builder setDeviceName( + java.lang.String value) { + if (value == null) { throw new NullPointerException(); } + deviceName_ = value; + bitField0_ |= 0x00000001; + onChanged(); + return this; + } + /** + * optional string device_name = 1; + * @return This builder for chaining. + */ + public Builder clearDeviceName() { + deviceName_ = getDefaultInstance().getDeviceName(); + bitField0_ = (bitField0_ & ~0x00000001); + onChanged(); + return this; + } + /** + * optional string device_name = 1; + * @param value The bytes for deviceName to set. + * @return This builder for chaining. 
+ */ + public Builder setDeviceNameBytes( + com.google.protobuf.ByteString value) { + if (value == null) { throw new NullPointerException(); } + deviceName_ = value; + bitField0_ |= 0x00000001; + onChanged(); + return this; + } + + private int temp_ ; + /** + * optional int32 temp = 2; + * @return Whether the temp field is set. + */ + @java.lang.Override + public boolean hasTemp() { + return ((bitField0_ & 0x00000002) != 0); + } + /** + * optional int32 temp = 2; + * @return The temp. + */ + @java.lang.Override + public int getTemp() { + return temp_; + } + /** + * optional int32 temp = 2; + * @param value The temp to set. + * @return This builder for chaining. + */ + public Builder setTemp(int value) { + + temp_ = value; + bitField0_ |= 0x00000002; + onChanged(); + return this; + } + /** + * optional int32 temp = 2; + * @return This builder for chaining. + */ + public Builder clearTemp() { + bitField0_ = (bitField0_ & ~0x00000002); + temp_ = 0; + onChanged(); + return this; + } + + private long humidity_ ; + /** + * optional int64 humidity = 3; + * @return Whether the humidity field is set. + */ + @java.lang.Override + public boolean hasHumidity() { + return ((bitField0_ & 0x00000004) != 0); + } + /** + * optional int64 humidity = 3; + * @return The humidity. + */ + @java.lang.Override + public long getHumidity() { + return humidity_; + } + /** + * optional int64 humidity = 3; + * @param value The humidity to set. + * @return This builder for chaining. + */ + public Builder setHumidity(long value) { + + humidity_ = value; + bitField0_ |= 0x00000004; + onChanged(); + return this; + } + /** + * optional int64 humidity = 3; + * @return This builder for chaining. + */ + public Builder clearHumidity() { + bitField0_ = (bitField0_ & ~0x00000004); + humidity_ = 0L; + onChanged(); + return this; + } + + // @@protoc_insertion_point(builder_scope:com.databricks.zerobus.examples.proto.AirQuality) + } + + // @@protoc_insertion_point(class_scope:com.databricks.zerobus.examples.proto.AirQuality) + private static final com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality DEFAULT_INSTANCE; + static { + DEFAULT_INSTANCE = new com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality(); + } + + public static com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality getDefaultInstance() { + return DEFAULT_INSTANCE; + } + + private static final com.google.protobuf.Parser + PARSER = new com.google.protobuf.AbstractParser() { + @java.lang.Override + public AirQuality parsePartialFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + Builder builder = newBuilder(); + try { + builder.mergeFrom(input, extensionRegistry); + } catch (com.google.protobuf.InvalidProtocolBufferException e) { + throw e.setUnfinishedMessage(builder.buildPartial()); + } catch (com.google.protobuf.UninitializedMessageException e) { + throw e.asInvalidProtocolBufferException().setUnfinishedMessage(builder.buildPartial()); + } catch (java.io.IOException e) { + throw new com.google.protobuf.InvalidProtocolBufferException(e) + .setUnfinishedMessage(builder.buildPartial()); + } + return builder.buildPartial(); + } + }; + + public static com.google.protobuf.Parser parser() { + return PARSER; + } + + @java.lang.Override + public com.google.protobuf.Parser getParserForType() { + return PARSER; + } + + @java.lang.Override + public com.databricks.zerobus.examples.proto.AirQualityProto.AirQuality 
getDefaultInstanceForType() { + return DEFAULT_INSTANCE; + } + + } + + private static final com.google.protobuf.Descriptors.Descriptor + internal_static_com_databricks_zerobus_examples_proto_AirQuality_descriptor; + private static final + com.google.protobuf.GeneratedMessage.FieldAccessorTable + internal_static_com_databricks_zerobus_examples_proto_AirQuality_fieldAccessorTable; + + public static com.google.protobuf.Descriptors.FileDescriptor + getDescriptor() { + return descriptor; + } + private static com.google.protobuf.Descriptors.FileDescriptor + descriptor; + static { + java.lang.String[] descriptorData = { + "\n\021air_quality.proto\022%com.databricks.zero" + + "bus.examples.proto\"A\n\nAirQuality\022\023\n\013devi" + + "ce_name\030\001 \001(\t\022\014\n\004temp\030\002 \001(\005\022\020\n\010humidity\030" + + "\003 \001(\003B8\n%com.databricks.zerobus.examples" + + ".protoB\017AirQualityProto" + }; + descriptor = com.google.protobuf.Descriptors.FileDescriptor + .internalBuildGeneratedFileFrom(descriptorData, + new com.google.protobuf.Descriptors.FileDescriptor[] { + }); + internal_static_com_databricks_zerobus_examples_proto_AirQuality_descriptor = + getDescriptor().getMessageType(0); + internal_static_com_databricks_zerobus_examples_proto_AirQuality_fieldAccessorTable = new + com.google.protobuf.GeneratedMessage.FieldAccessorTable( + internal_static_com_databricks_zerobus_examples_proto_AirQuality_descriptor, + new java.lang.String[] { "DeviceName", "Temp", "Humidity", }); + descriptor.resolveAllFeaturesImmutable(); + } + + // @@protoc_insertion_point(outer_class_scope) +} diff --git a/examples/proto/com/databricks/zerobus/examples/proto/BatchIngestionExample.class b/examples/proto/com/databricks/zerobus/examples/proto/BatchIngestionExample.class new file mode 100644 index 0000000..d470b2c Binary files /dev/null and b/examples/proto/com/databricks/zerobus/examples/proto/BatchIngestionExample.class differ diff --git a/examples/proto/com/databricks/zerobus/examples/proto/SingleRecordExample.class b/examples/proto/com/databricks/zerobus/examples/proto/SingleRecordExample.class new file mode 100644 index 0000000..38201c7 Binary files /dev/null and b/examples/proto/com/databricks/zerobus/examples/proto/SingleRecordExample.class differ diff --git a/examples/record.proto b/examples/record.proto deleted file mode 100644 index d063f8b..0000000 --- a/examples/record.proto +++ /dev/null @@ -1,7 +0,0 @@ -syntax = "proto2"; - -message AirQuality { - optional string device_name = 1; - optional int32 temp = 2; - optional int64 humidity = 3; -} diff --git a/examples/src/main/java/com/databricks/zerobus/examples/BlockingIngestionExample.java b/examples/src/main/java/com/databricks/zerobus/examples/BlockingIngestionExample.java deleted file mode 100644 index 7762dbb..0000000 --- a/examples/src/main/java/com/databricks/zerobus/examples/BlockingIngestionExample.java +++ /dev/null @@ -1,111 +0,0 @@ -package com.databricks.zerobus.examples; - -import com.databricks.zerobus.*; - -/** - * Example demonstrating blocking (synchronous) record ingestion. - * - *

This example shows how to ingest records synchronously, waiting for each - * record to be durably written before proceeding to the next one. This approach - * provides the strongest durability guarantees but has lower throughput compared - * to non-blocking ingestion. - * - *

Use Case: Best for low-volume ingestion where durability is critical - * and you need immediate confirmation of each write. - */ -public class BlockingIngestionExample { - - // Configuration - update these with your values - private static final String SERVER_ENDPOINT = "your-shard-id.zerobus.region.cloud.databricks.com"; - private static final String UNITY_CATALOG_ENDPOINT = "https://your-workspace.cloud.databricks.com"; - private static final String TABLE_NAME = "catalog.schema.table"; - private static final String CLIENT_ID = "your-oauth-client-id"; - private static final String CLIENT_SECRET = "your-oauth-client-secret"; - - // Number of records to ingest - private static final int NUM_RECORDS = 1000; - - public static void main(String[] args) { - System.out.println("Starting blocking ingestion example..."); - System.out.println("==========================================="); - - try { - // Step 1: Initialize the SDK - ZerobusSdk sdk = new ZerobusSdk(SERVER_ENDPOINT, UNITY_CATALOG_ENDPOINT); - System.out.println("✓ SDK initialized"); - - // Step 2: Define table properties with your protobuf message type - // Note: Replace Record.AirQuality with your own protobuf message class - TableProperties tableProperties = new TableProperties<>( - TABLE_NAME, - Record.AirQuality.getDefaultInstance() - ); - System.out.println("✓ Table properties configured"); - - // Step 3: Create a stream with default configuration - ZerobusStream stream = sdk.createStream( - tableProperties, - CLIENT_ID, - CLIENT_SECRET - ).join(); - System.out.println("✓ Stream created: " + stream.getStreamId()); - - // Step 4: Ingest records synchronously - System.out.println("\nIngesting " + NUM_RECORDS + " records (blocking mode)..."); - long startTime = System.currentTimeMillis(); - int successCount = 0; - - try { - for (int i = 0; i < NUM_RECORDS; i++) { - // Create a record - Record.AirQuality record = Record.AirQuality.newBuilder() - .setDeviceName("sensor-" + (i % 10)) - .setTemp(20 + (i % 15)) - .setHumidity(50 + (i % 40)) - .build(); - - // Ingest and wait for durability - stream.ingestRecord(record).join(); - - successCount++; - - // Progress indicator - if ((i + 1) % 100 == 0) { - System.out.println(" Ingested " + (i + 1) + " records"); - } - } - - long endTime = System.currentTimeMillis(); - double durationSeconds = (endTime - startTime) / 1000.0; - double recordsPerSecond = NUM_RECORDS / durationSeconds; - - // Step 5: Close the stream - stream.close(); - System.out.println("\n✓ Stream closed"); - - // Print summary - System.out.println("\n==========================================="); - System.out.println("Ingestion Summary:"); - System.out.println(" Total records: " + NUM_RECORDS); - System.out.println(" Successful: " + successCount); - System.out.println(" Failed: " + (NUM_RECORDS - successCount)); - System.out.println(" Duration: " + String.format("%.2f", durationSeconds) + " seconds"); - System.out.println(" Throughput: " + String.format("%.2f", recordsPerSecond) + " records/sec"); - System.out.println("==========================================="); - - } catch (Exception e) { - System.err.println("\n✗ Error during ingestion: " + e.getMessage()); - e.printStackTrace(); - stream.close(); - System.exit(1); - } - - } catch (ZerobusException e) { - System.err.println("\n✗ Failed to initialize stream: " + e.getMessage()); - e.printStackTrace(); - System.exit(1); - } - - System.out.println("\nBlocking ingestion example completed successfully!"); - } -} diff --git 
a/examples/src/main/java/com/databricks/zerobus/examples/NonBlockingIngestionExample.java b/examples/src/main/java/com/databricks/zerobus/examples/NonBlockingIngestionExample.java deleted file mode 100644 index 3d0c305..0000000 --- a/examples/src/main/java/com/databricks/zerobus/examples/NonBlockingIngestionExample.java +++ /dev/null @@ -1,157 +0,0 @@ -package com.databricks.zerobus.examples; - -import com.databricks.zerobus.*; - -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.CompletableFuture; -import java.util.function.Consumer; - -/** - * Example demonstrating non-blocking (asynchronous) record ingestion. - * - *

This example shows how to ingest records asynchronously, allowing maximum - * throughput by not waiting for each record to complete before submitting the next. - * The SDK manages buffering and flow control automatically. - * - *

Use Case: Best for high-volume ingestion where maximum throughput is - * important. Records are still durably written, but acknowledgments are handled - * asynchronously. - */ -public class NonBlockingIngestionExample { - - // Configuration - update these with your values - private static final String SERVER_ENDPOINT = "your-shard-id.zerobus.region.cloud.databricks.com"; - private static final String UNITY_CATALOG_ENDPOINT = "https://your-workspace.cloud.databricks.com"; - private static final String TABLE_NAME = "catalog.schema.table"; - private static final String CLIENT_ID = "your-oauth-client-id"; - private static final String CLIENT_SECRET = "your-oauth-client-secret"; - - // Number of records to ingest - private static final int NUM_RECORDS = 100_000; - - public static void main(String[] args) { - System.out.println("Starting non-blocking ingestion example..."); - System.out.println("==========================================="); - - try { - // Step 1: Initialize the SDK - ZerobusSdk sdk = new ZerobusSdk(SERVER_ENDPOINT, UNITY_CATALOG_ENDPOINT); - System.out.println("✓ SDK initialized"); - - // Step 2: Configure stream options with ack callback - StreamConfigurationOptions options = StreamConfigurationOptions.builder() - .setMaxInflightRecords(50_000) // Allow 50k records in flight - .setRecovery(true) // Enable automatic recovery - .setAckCallback(createAckCallback()) // Track acknowledgments - .build(); - System.out.println("✓ Stream configuration created"); - - // Step 3: Define table properties with your protobuf message type - // Note: Replace Record.AirQuality with your own protobuf message class - TableProperties tableProperties = new TableProperties<>( - TABLE_NAME, - Record.AirQuality.getDefaultInstance() - ); - System.out.println("✓ Table properties configured"); - - // Step 4: Create a stream - ZerobusStream stream = sdk.createStream( - tableProperties, - CLIENT_ID, - CLIENT_SECRET, - options - ).join(); - System.out.println("✓ Stream created: " + stream.getStreamId()); - - // Step 5: Ingest records asynchronously - System.out.println("\nIngesting " + NUM_RECORDS + " records (non-blocking mode)..."); - List> futures = new ArrayList<>(); - long startTime = System.currentTimeMillis(); - - try { - for (int i = 0; i < NUM_RECORDS; i++) { - // Create a record with varying data - Record.AirQuality record = Record.AirQuality.newBuilder() - .setDeviceName("sensor-" + (i % 10)) - .setTemp(20 + (i % 15)) - .setHumidity(50 + (i % 40)) - .build(); - - // Ingest record and collect future for durability later - futures.add(stream.ingestRecord(record)); - - // Progress indicator - if ((i + 1) % 10000 == 0) { - System.out.println(" Submitted " + (i + 1) + " records"); - } - } - - long submitEndTime = System.currentTimeMillis(); - double submitDuration = (submitEndTime - startTime) / 1000.0; - - System.out.println("\n✓ All records submitted in " + - String.format("%.2f", submitDuration) + " seconds"); - - // Step 6: Flush and wait for all records to be durably written - System.out.println("\nFlushing stream and waiting for durability..."); - stream.flush(); - - // Wait for all futures to complete - CompletableFuture allFutures = CompletableFuture.allOf( - futures.toArray(new CompletableFuture[0]) - ); - allFutures.join(); - - long endTime = System.currentTimeMillis(); - double totalDuration = (endTime - startTime) / 1000.0; - double recordsPerSecond = NUM_RECORDS / totalDuration; - - System.out.println("✓ All records durably written"); - - // Step 7: Close the stream - stream.close(); - 
System.out.println("✓ Stream closed"); - - // Print summary - System.out.println("\n==========================================="); - System.out.println("Ingestion Summary:"); - System.out.println(" Total records: " + NUM_RECORDS); - System.out.println(" Submit time: " + String.format("%.2f", submitDuration) + " seconds"); - System.out.println(" Total time: " + String.format("%.2f", totalDuration) + " seconds"); - System.out.println(" Throughput: " + String.format("%.2f", recordsPerSecond) + " records/sec"); - System.out.println(" Average latency: " + - String.format("%.2f", (totalDuration * 1000.0) / NUM_RECORDS) + " ms/record"); - System.out.println("==========================================="); - - } catch (Exception e) { - System.err.println("\n✗ Error during ingestion: " + e.getMessage()); - e.printStackTrace(); - stream.close(); - System.exit(1); - } - - } catch (ZerobusException e) { - System.err.println("\n✗ Failed to initialize stream: " + e.getMessage()); - e.printStackTrace(); - System.exit(1); - } - - System.out.println("\nNon-blocking ingestion example completed successfully!"); - } - - /** - * Creates an acknowledgment callback that logs progress. - * - * @return Consumer that handles acknowledgment responses - */ - private static Consumer createAckCallback() { - return response -> { - long offset = response.getDurabilityAckUpToOffset(); - // Log every 10000 records - if (offset % 10000 == 0) { - System.out.println(" Acknowledged up to offset: " + offset); - } - }; - } -} diff --git a/pom.xml b/pom.xml index 0f543ff..3c0a415 100644 --- a/pom.xml +++ b/pom.xml @@ -4,10 +4,10 @@ 4.0.0 com.databricks zerobus-ingest-sdk - 0.1.0 + 0.2.0 jar Zerobus Ingest SDK for Java - Databricks Zerobus Ingest SDK for Java - Direct ingestion to Delta tables + Databricks Zerobus Ingest SDK for Java - Direct ingestion to Delta tables via native Rust SDK https://github.com/databricks/zerobus-sdk-java @@ -39,28 +39,6 @@ protobuf-java 4.33.0 - - - io.grpc - grpc-netty-shaded - 1.76.0 - - - io.grpc - grpc-protobuf - 1.76.0 - - - io.grpc - grpc-stub - 1.76.0 - - - - javax.annotation - javax.annotation-api - 1.3.2 - org.slf4j @@ -99,11 +77,18 @@ 5.5.0 test + - io.grpc - grpc-testing - 1.76.0 - test + org.apache.arrow + arrow-vector + 17.0.0 + true + + + org.apache.arrow + arrow-memory-netty + 17.0.0 + true @@ -115,15 +100,12 @@ 0.6.1 com.google.protobuf:protoc:4.33.0:exe:${os.detected.classifier} - grpc-java - io.grpc:protoc-gen-grpc-java:1.76.0:exe:${os.detected.classifier} compile-protobuf compile - compile-custom @@ -146,11 +128,9 @@ com.diffplug.spotless spotless-maven-plugin - 2.30.0 - diff --git a/src/main/java/com/databricks/zerobus/AckCallback.java b/src/main/java/com/databricks/zerobus/AckCallback.java new file mode 100644 index 0000000..bc2005b --- /dev/null +++ b/src/main/java/com/databricks/zerobus/AckCallback.java @@ -0,0 +1,64 @@ +package com.databricks.zerobus; + +/** + * Callback interface for receiving acknowledgment notifications from the Zerobus stream. + * + *

This interface provides methods for handling both successful acknowledgments and errors. It + * replaces the deprecated {@code Consumer} callback with a more type-safe and + * flexible API. + * + *

Implementations should be thread-safe as callbacks may be invoked from multiple threads. + * Callbacks should be lightweight to avoid blocking the internal processing threads. + * + *

+ * <p>Example usage:
+ *
+ * <pre>{@code
+ * AckCallback callback = new AckCallback() {
+ *     @Override
+ *     public void onAck(long offsetId) {
+ *         System.out.println("Record at offset " + offsetId + " acknowledged");
+ *     }
+ *
+ *     @Override
+ *     public void onError(long offsetId, String errorMessage) {
+ *         System.err.println("Error for offset " + offsetId + ": " + errorMessage);
+ *     }
+ * };
+ *
+ * StreamConfigurationOptions options = StreamConfigurationOptions.builder()
+ *     .setAckCallback(callback)
+ *     .build();
+ * }</pre>
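The callback above only observes acknowledgments. When a producer also needs a synchronous durability barrier, it can be combined with the offset-based API this release introduces (`ingestRecordOffset`, `waitForOffset`). A minimal sketch, not part of this diff: it assumes `stream` is an already-open `ZerobusStream` configured for JSON records and that both methods behave as the v0.2.0 changelog describes.

```java
import com.databricks.zerobus.JsonRecord;
import com.databricks.zerobus.ZerobusStream;

public class OffsetBarrierSketch {
  // Ingest several records without allocating futures, then block once.
  static void ingestWithBarrier(ZerobusStream stream) throws Exception {
    long lastOffset = -1L;
    for (int i = 0; i < 10; i++) {
      // ingestRecordOffset returns the assigned offset immediately.
      lastOffset = stream.ingestRecordOffset(
          JsonRecord.of("{\"device_name\": \"sensor-" + i + "\", \"temp\": 21}"));
    }
    // Block until everything up to and including lastOffset is durable;
    // the AckCallback continues to fire asynchronously for progress tracking.
    stream.waitForOffset(lastOffset);
  }
}
```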
+ * + * @see StreamConfigurationOptions.StreamConfigurationOptionsBuilder#setAckCallback(AckCallback) + */ +public interface AckCallback { + + /** + * Called when a record (or records up to this offset) has been durably acknowledged by the + * server. + * + *

The offset ID represents the durability acknowledgment up to and including this offset. All + * records with offset IDs less than or equal to this value have been durably stored. + * + *

This method should not throw exceptions. If an exception is thrown, it will be logged but + * will not affect stream operation. + * + * @param offsetId the offset ID that has been acknowledged + */ + void onAck(long offsetId); + + /** + * Called when an error occurs for a specific record or offset. + * + *

This method is called when the SDK encounters an error that affects a specific offset. The + * error may be retryable or non-retryable depending on the nature of the failure. + * + *

This method should not throw exceptions. If an exception is thrown, it will be logged but + * will not affect stream operation. + * + * @param offsetId the offset ID that encountered an error + * @param errorMessage a description of the error that occurred + */ + void onError(long offsetId, String errorMessage); +} diff --git a/src/main/java/com/databricks/zerobus/ArrowStreamConfigurationOptions.java b/src/main/java/com/databricks/zerobus/ArrowStreamConfigurationOptions.java new file mode 100644 index 0000000..b12aba8 --- /dev/null +++ b/src/main/java/com/databricks/zerobus/ArrowStreamConfigurationOptions.java @@ -0,0 +1,271 @@ +package com.databricks.zerobus; + +/** + * Configuration options for Zerobus Arrow Flight streams. + * + *

Experimental: Arrow Flight support is experimental and may change in future releases. + * + *

This class provides various settings to control Arrow stream behavior including performance + * tuning, error handling, and timeout configuration. + * + *

+ * <p>Use the builder pattern to create instances:
+ *
+ * <pre>{@code
+ * ArrowStreamConfigurationOptions options = ArrowStreamConfigurationOptions.builder()
+ *     .setMaxInflightBatches(500)
+ *     .setRecovery(true)
+ *     .setFlushTimeoutMs(600000)
+ *     .build();
+ * }</pre>
+ * + * @see ZerobusArrowStream + * @see ZerobusSdk#createArrowStream + */ +public class ArrowStreamConfigurationOptions { + + private final int maxInflightBatches; + private final boolean recovery; + private final long recoveryTimeoutMs; + private final long recoveryBackoffMs; + private final int recoveryRetries; + private final long serverLackOfAckTimeoutMs; + private final long flushTimeoutMs; + private final long connectionTimeoutMs; + + private ArrowStreamConfigurationOptions( + int maxInflightBatches, + boolean recovery, + long recoveryTimeoutMs, + long recoveryBackoffMs, + int recoveryRetries, + long serverLackOfAckTimeoutMs, + long flushTimeoutMs, + long connectionTimeoutMs) { + this.maxInflightBatches = maxInflightBatches; + this.recovery = recovery; + this.recoveryTimeoutMs = recoveryTimeoutMs; + this.recoveryBackoffMs = recoveryBackoffMs; + this.recoveryRetries = recoveryRetries; + this.serverLackOfAckTimeoutMs = serverLackOfAckTimeoutMs; + this.flushTimeoutMs = flushTimeoutMs; + this.connectionTimeoutMs = connectionTimeoutMs; + } + + /** + * Returns the maximum number of batches that can be in flight. + * + * @return the maximum number of in-flight batches + */ + public int maxInflightBatches() { + return maxInflightBatches; + } + + /** + * Returns whether automatic recovery is enabled. + * + * @return true if automatic recovery is enabled + */ + public boolean recovery() { + return recovery; + } + + /** + * Returns the timeout for recovery operations. + * + * @return the recovery timeout in milliseconds + */ + public long recoveryTimeoutMs() { + return recoveryTimeoutMs; + } + + /** + * Returns the backoff delay between recovery attempts. + * + * @return the recovery backoff delay in milliseconds + */ + public long recoveryBackoffMs() { + return recoveryBackoffMs; + } + + /** + * Returns the maximum number of recovery attempts. + * + * @return the maximum number of recovery retries + */ + public int recoveryRetries() { + return recoveryRetries; + } + + /** + * Returns the timeout for server acknowledgment. + * + * @return the server acknowledgment timeout in milliseconds + */ + public long serverLackOfAckTimeoutMs() { + return serverLackOfAckTimeoutMs; + } + + /** + * Returns the timeout for flush operations. + * + * @return the flush timeout in milliseconds + */ + public long flushTimeoutMs() { + return flushTimeoutMs; + } + + /** + * Returns the timeout for initial connection. + * + * @return the connection timeout in milliseconds + */ + public long connectionTimeoutMs() { + return connectionTimeoutMs; + } + + /** + * Returns the default Arrow stream configuration options. + * + * @return the default options + */ + public static ArrowStreamConfigurationOptions getDefault() { + return new ArrowStreamConfigurationOptions( + 1000, // maxInflightBatches + true, // recovery + 15000, // recoveryTimeoutMs + 2000, // recoveryBackoffMs + 4, // recoveryRetries + 60000, // serverLackOfAckTimeoutMs + 300000, // flushTimeoutMs + 30000 // connectionTimeoutMs + ); + } + + /** + * Returns a new builder for creating ArrowStreamConfigurationOptions. + * + * @return a new builder + */ + public static Builder builder() { + return new Builder(); + } + + /** Builder for creating ArrowStreamConfigurationOptions instances. 
*/ + public static class Builder { + private int maxInflightBatches = 1000; + private boolean recovery = true; + private long recoveryTimeoutMs = 15000; + private long recoveryBackoffMs = 2000; + private int recoveryRetries = 4; + private long serverLackOfAckTimeoutMs = 60000; + private long flushTimeoutMs = 300000; + private long connectionTimeoutMs = 30000; + + private Builder() {} + + /** + * Sets the maximum number of batches that can be in flight. + * + * @param maxInflightBatches the maximum number of in-flight batches + * @return this builder + */ + public Builder setMaxInflightBatches(int maxInflightBatches) { + this.maxInflightBatches = maxInflightBatches; + return this; + } + + /** + * Sets whether automatic recovery is enabled. + * + * @param recovery true to enable automatic recovery + * @return this builder + */ + public Builder setRecovery(boolean recovery) { + this.recovery = recovery; + return this; + } + + /** + * Sets the timeout for recovery operations. + * + * @param recoveryTimeoutMs the recovery timeout in milliseconds + * @return this builder + */ + public Builder setRecoveryTimeoutMs(long recoveryTimeoutMs) { + this.recoveryTimeoutMs = recoveryTimeoutMs; + return this; + } + + /** + * Sets the backoff delay between recovery attempts. + * + * @param recoveryBackoffMs the recovery backoff delay in milliseconds + * @return this builder + */ + public Builder setRecoveryBackoffMs(long recoveryBackoffMs) { + this.recoveryBackoffMs = recoveryBackoffMs; + return this; + } + + /** + * Sets the maximum number of recovery attempts. + * + * @param recoveryRetries the maximum number of recovery retries + * @return this builder + */ + public Builder setRecoveryRetries(int recoveryRetries) { + this.recoveryRetries = recoveryRetries; + return this; + } + + /** + * Sets the timeout for server acknowledgment. + * + * @param serverLackOfAckTimeoutMs the server acknowledgment timeout in milliseconds + * @return this builder + */ + public Builder setServerLackOfAckTimeoutMs(long serverLackOfAckTimeoutMs) { + this.serverLackOfAckTimeoutMs = serverLackOfAckTimeoutMs; + return this; + } + + /** + * Sets the timeout for flush operations. + * + * @param flushTimeoutMs the flush timeout in milliseconds + * @return this builder + */ + public Builder setFlushTimeoutMs(long flushTimeoutMs) { + this.flushTimeoutMs = flushTimeoutMs; + return this; + } + + /** + * Sets the timeout for initial connection. + * + * @param connectionTimeoutMs the connection timeout in milliseconds + * @return this builder + */ + public Builder setConnectionTimeoutMs(long connectionTimeoutMs) { + this.connectionTimeoutMs = connectionTimeoutMs; + return this; + } + + /** + * Builds a new ArrowStreamConfigurationOptions instance. + * + * @return the new options + */ + public ArrowStreamConfigurationOptions build() { + return new ArrowStreamConfigurationOptions( + maxInflightBatches, + recovery, + recoveryTimeoutMs, + recoveryBackoffMs, + recoveryRetries, + serverLackOfAckTimeoutMs, + flushTimeoutMs, + connectionTimeoutMs); + } + } +} diff --git a/src/main/java/com/databricks/zerobus/ArrowTableProperties.java b/src/main/java/com/databricks/zerobus/ArrowTableProperties.java new file mode 100644 index 0000000..cdcb278 --- /dev/null +++ b/src/main/java/com/databricks/zerobus/ArrowTableProperties.java @@ -0,0 +1,171 @@ +package com.databricks.zerobus; + +import java.io.ByteArrayOutputStream; +import java.nio.channels.Channels; + +/** + * Table properties for Arrow Flight streams. + * + *

Experimental: Arrow Flight support is experimental and may change in future releases. + * + *

This class describes the target table for Arrow data ingestion, including the fully qualified + * table name and the Arrow schema. + * + *

The schema is provided as an Apache Arrow {@code Schema} object and will be serialized to IPC + * format for transmission to the server. + * + *

+ * <p>Prerequisites: Arrow support requires adding Apache Arrow dependencies to your project:
+ *
+ * <pre>{@code
+ * <dependency>
+ *     <groupId>org.apache.arrow</groupId>
+ *     <artifactId>arrow-vector</artifactId>
+ *     <version>17.0.0</version>
+ * </dependency>
+ * <dependency>
+ *     <groupId>org.apache.arrow</groupId>
+ *     <artifactId>arrow-memory-netty</artifactId>
+ *     <version>17.0.0</version>
+ * </dependency>
+ * }</pre>
+ * + *

+ * <p>Example usage:
+ *
+ * <pre>{@code
+ * Schema schema = new Schema(Arrays.asList(
+ *     Field.nullable("id", new ArrowType.Int(64, true)),
+ *     Field.nullable("name", new ArrowType.Utf8())
+ * ));
+ *
+ * ArrowTableProperties props = new ArrowTableProperties("catalog.schema.table", schema);
+ * }</pre>
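Because the Arrow artifacts are optional dependencies, code that may run without them can use the `isArrowAvailable()` check defined in this class rather than catching the `UnsupportedOperationException` raised by `requireArrow()`. A short sketch under that assumption; the schema is passed as `Object`, exactly as `ArrowTableProperties` itself stores it.

```java
import com.databricks.zerobus.ArrowTableProperties;

public class ArrowGuardSketch {
  // `schema` should be an org.apache.arrow.vector.types.pojo.Schema instance.
  static ArrowTableProperties propsOrNull(Object schema) {
    if (!ArrowTableProperties.isArrowAvailable()) {
      // Arrow jars absent: caller can fall back to ProtoRecord/JsonRecord ingestion.
      return null;
    }
    return new ArrowTableProperties("catalog.schema.table", schema);
  }
}
```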
+ * + * @see ZerobusArrowStream + * @see ZerobusSdk#createArrowStream + */ +public class ArrowTableProperties { + + private static final boolean ARROW_AVAILABLE; + + static { + boolean available = false; + try { + Class.forName("org.apache.arrow.vector.types.pojo.Schema"); + available = true; + } catch (ClassNotFoundException e) { + // Arrow not on classpath + } + ARROW_AVAILABLE = available; + } + + /** + * Checks if Apache Arrow libraries are available on the classpath. + * + * @return true if Arrow is available, false otherwise + */ + public static boolean isArrowAvailable() { + return ARROW_AVAILABLE; + } + + /** + * Throws an exception if Arrow is not available. + * + * @throws UnsupportedOperationException if Arrow libraries are not on the classpath + */ + static void requireArrow() { + if (!ARROW_AVAILABLE) { + throw new UnsupportedOperationException( + "Apache Arrow libraries are not available. " + + "To use Arrow Flight support, add arrow-vector and arrow-memory-netty dependencies to your project. " + + "See https://arrow.apache.org/docs/java/"); + } + } + + private final String tableName; + private final Object schema; // org.apache.arrow.vector.types.pojo.Schema + + /** + * Creates new ArrowTableProperties. + * + * @param tableName the fully qualified table name (catalog.schema.table) + * @param schema the Arrow schema (must be an org.apache.arrow.vector.types.pojo.Schema) + * @throws NullPointerException if tableName or schema is null + * @throws IllegalArgumentException if tableName is not in three-part format + * @throws UnsupportedOperationException if Arrow libraries are not on the classpath + */ + public ArrowTableProperties(String tableName, Object schema) { + requireArrow(); + + if (tableName == null) { + throw new NullPointerException("tableName cannot be null"); + } + if (schema == null) { + throw new NullPointerException("schema cannot be null"); + } + + // Validate table name format + String[] parts = tableName.split("\\."); + if (parts.length != 3) { + throw new IllegalArgumentException( + "Table name must be in three-part format: catalog.schema.table"); + } + + this.tableName = tableName; + this.schema = schema; + } + + /** + * Returns the fully qualified table name. + * + * @return the table name in format catalog.schema.table + */ + public String getTableName() { + return tableName; + } + + /** + * Returns the Arrow schema. + * + * @return the Arrow schema object + */ + public Object getSchema() { + return schema; + } + + /** + * Serializes the Arrow schema to IPC format bytes. + * + *

This method uses reflection to avoid a hard dependency on Arrow libraries at compile time. + * + * @return the schema serialized as IPC format bytes + * @throws RuntimeException if serialization fails + */ + byte[] getSchemaBytes() { + try { + // Use reflection to serialize the schema to IPC format + // This avoids requiring Arrow at compile time + Class schemaClass = Class.forName("org.apache.arrow.vector.types.pojo.Schema"); + Class messageSerializerClass = + Class.forName("org.apache.arrow.vector.ipc.message.MessageSerializer"); + Class writeChannelClass = Class.forName("org.apache.arrow.vector.ipc.WriteChannel"); + + // Get the serializeSchema method + java.lang.reflect.Method serializeMethod = + messageSerializerClass.getMethod("serializeSchema", schemaClass, writeChannelClass); + + // Create a ByteArrayOutputStream and WriteChannel + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + Object writeChannel = + writeChannelClass + .getConstructor(java.nio.channels.WritableByteChannel.class) + .newInstance(Channels.newChannel(baos)); + + // Serialize the schema + serializeMethod.invoke(null, schema, writeChannel); + + return baos.toByteArray(); + } catch (Exception e) { + throw new RuntimeException("Failed to serialize Arrow schema: " + e.getMessage(), e); + } + } +} diff --git a/src/main/java/com/databricks/zerobus/EncodedBatch.java b/src/main/java/com/databricks/zerobus/EncodedBatch.java new file mode 100644 index 0000000..fcc2278 --- /dev/null +++ b/src/main/java/com/databricks/zerobus/EncodedBatch.java @@ -0,0 +1,122 @@ +package com.databricks.zerobus; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Represents a batch of encoded records that were ingested together. + * + *

This class is used when retrieving unacknowledged batches from the stream after a failure. It + * preserves the batch grouping from the original ingestion, which can be useful for re-ingesting + * records in the same batches. + * + *

Each EncodedBatch contains a list of raw byte arrays (the encoded records) and a flag + * indicating whether they are JSON or Protocol Buffer encoded. + * + * @see ZerobusStream#getUnackedBatches() + */ +public class EncodedBatch { + + private final List records; + private final boolean isJson; + + /** + * Creates a new EncodedBatch with the given records. + * + * @param records the list of encoded record byte arrays + * @param isJson true if the records are JSON encoded, false if Protocol Buffer encoded + */ + public EncodedBatch(List records, boolean isJson) { + this.records = records != null ? new ArrayList<>(records) : new ArrayList<>(); + this.isJson = isJson; + } + + /** + * Returns the encoded records in this batch. + * + *

The returned list is unmodifiable. + * + * @return an unmodifiable list of encoded record byte arrays + */ + public List getRecords() { + return Collections.unmodifiableList(records); + } + + /** + * Returns whether the records in this batch are JSON encoded. + * + * @return true if JSON encoded, false if Protocol Buffer encoded + */ + public boolean isJson() { + return isJson; + } + + /** + * Returns the number of records in this batch. + * + * @return the number of records + */ + public int size() { + return records.size(); + } + + /** + * Returns whether this batch is empty. + * + * @return true if empty, false otherwise + */ + public boolean isEmpty() { + return records.isEmpty(); + } + + /** + * Converts this batch to a list of IngestableRecord objects for re-ingestion. + * + *

If the records are JSON encoded, this returns a list of {@link JsonRecord} objects. If they + * are Protocol Buffer encoded, this returns a list of {@link RawProtoRecord} objects (which wrap + * the raw bytes without deserializing). + * + * @return a list of IngestableRecord objects + */ + public List toIngestableRecords() { + List result = new ArrayList<>(records.size()); + for (byte[] bytes : records) { + if (isJson) { + result.add(new JsonRecord(new String(bytes, java.nio.charset.StandardCharsets.UTF_8))); + } else { + result.add(new RawProtoRecord(bytes)); + } + } + return result; + } + + /** + * A raw Protocol Buffer record that wraps pre-serialized bytes. + * + *

This class is used when re-ingesting records from an EncodedBatch where we don't need to + * deserialize the Protocol Buffer message. + */ + public static class RawProtoRecord implements IngestableRecord { + private final byte[] bytes; + + /** + * Creates a new RawProtoRecord with the given serialized bytes. + * + * @param bytes the serialized Protocol Buffer bytes + */ + public RawProtoRecord(byte[] bytes) { + this.bytes = bytes.clone(); + } + + @Override + public byte[] toEncodedBytes() { + return bytes.clone(); + } + + @Override + public boolean isJson() { + return false; + } + } +} diff --git a/src/main/java/com/databricks/zerobus/IngestableRecord.java b/src/main/java/com/databricks/zerobus/IngestableRecord.java new file mode 100644 index 0000000..1c55e52 --- /dev/null +++ b/src/main/java/com/databricks/zerobus/IngestableRecord.java @@ -0,0 +1,37 @@ +package com.databricks.zerobus; + +/** + * Marker interface for any record type that can be ingested into a Zerobus stream. + * + *

This interface provides a common abstraction for both Protocol Buffer and JSON records, + * allowing them to be used interchangeably with the unified ingestion API. + * + *

Implementations must provide: + * + *

    + *
+ * <ul>
+ *   <li>{@link #toEncodedBytes()} - Serializes the record to bytes
+ *   <li>{@link #isJson()} - Indicates whether this is a JSON record
+ * </ul>
+ * + * @see ProtoRecord + * @see JsonRecord + */ +public interface IngestableRecord { + + /** + * Serializes this record to encoded bytes suitable for transmission. + * + *

For Protocol Buffer records, this returns the serialized protobuf bytes. For JSON records, + * this returns the UTF-8 encoded JSON string bytes. + * + * @return the encoded byte representation of this record + */ + byte[] toEncodedBytes(); + + /** + * Indicates whether this record is a JSON record. + * + * @return {@code true} if this is a JSON record, {@code false} if it is a Protocol Buffer record + */ + boolean isJson(); +} diff --git a/src/main/java/com/databricks/zerobus/JsonRecord.java b/src/main/java/com/databricks/zerobus/JsonRecord.java new file mode 100644 index 0000000..b875eaa --- /dev/null +++ b/src/main/java/com/databricks/zerobus/JsonRecord.java @@ -0,0 +1,135 @@ +package com.databricks.zerobus; + +import java.nio.charset.StandardCharsets; + +/** + * A wrapper for JSON strings that implements {@link IngestableRecord}. + * + *

This class wraps a JSON string to make it compatible with the unified ingestion API. The JSON + * string is encoded to UTF-8 bytes for transmission. + * + *

+ * <p>Example usage:
+ *
+ * <pre>{@code
+ * // From a raw JSON string
+ * JsonRecord record = new JsonRecord("{\"field1\": \"value\", \"field2\": 123}");
+ * stream.ingestRecord(record);
+ *
+ * // From an object using a JSON library (e.g., Gson)
+ * Gson gson = new Gson();
+ * MyObject obj = new MyObject("value", 123);
+ * JsonRecord record2 = JsonRecord.fromObject(obj, gson::toJson);
+ * stream.ingestRecord(record2);
+ * }</pre>
+ * + * @see IngestableRecord + * @see ProtoRecord + */ +public class JsonRecord implements IngestableRecord { + + private final String json; + + /** + * Creates a new JsonRecord from a JSON string. + * + * @param json the JSON string + * @throws NullPointerException if json is null + */ + public JsonRecord(String json) { + if (json == null) { + throw new NullPointerException("json cannot be null"); + } + this.json = json; + } + + /** + * Returns the JSON string. + * + * @return the JSON string + */ + public String getJson() { + return json; + } + + /** + * Encodes the JSON string to UTF-8 bytes. + * + * @return the UTF-8 encoded JSON bytes + */ + @Override + public byte[] toEncodedBytes() { + return json.getBytes(StandardCharsets.UTF_8); + } + + /** + * Returns {@code true} since this is a JSON record. + * + * @return {@code true} + */ + @Override + public boolean isJson() { + return true; + } + + /** + * Creates a new JsonRecord from a JSON string. + * + *

This is a convenience factory method equivalent to calling the constructor. + * + * @param json the JSON string + * @return a new JsonRecord wrapping the JSON string + */ + public static JsonRecord of(String json) { + return new JsonRecord(json); + } + + /** + * Creates a new JsonRecord from an object using a serializer function. + * + *

This method allows using any JSON serialization library by passing a serializer function. + * + *

+   * <p>Example with Gson:
+   *
+   * <pre>{@code
+   * Gson gson = new Gson();
+   * JsonRecord record = JsonRecord.fromObject(myObject, gson::toJson);
+   * }</pre>
+ * + *

+   * <p>Example with Jackson:
+   *
+   * <pre>{@code
+   * ObjectMapper mapper = new ObjectMapper();
+   * JsonRecord record = JsonRecord.fromObject(myObject, obj -> {
+   *     try {
+   *         return mapper.writeValueAsString(obj);
+   *     } catch (JsonProcessingException e) {
+   *         throw new RuntimeException(e);
+   *     }
+   * });
+   * }</pre>
+ * + * @param object the object to serialize + * @param serializer a function that converts the object to a JSON string + * @param the type of the object + * @return a new JsonRecord containing the serialized JSON + */ + public static JsonRecord fromObject(T object, JsonSerializer serializer) { + return new JsonRecord(serializer.serialize(object)); + } + + /** + * A functional interface for JSON serialization. + * + * @param the type of the object to serialize + */ + @FunctionalInterface + public interface JsonSerializer { + /** + * Serializes an object to a JSON string. + * + * @param object the object to serialize + * @return the JSON string representation + */ + String serialize(T object); + } +} diff --git a/src/main/java/com/databricks/zerobus/NativeLoader.java b/src/main/java/com/databricks/zerobus/NativeLoader.java new file mode 100644 index 0000000..5cca8c4 --- /dev/null +++ b/src/main/java/com/databricks/zerobus/NativeLoader.java @@ -0,0 +1,184 @@ +package com.databricks.zerobus; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.util.Locale; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Utility class for loading the native Zerobus JNI library. + * + *

This class handles the extraction and loading of platform-specific native libraries. It + * supports loading from the classpath (for packaged applications) or from a system path (for + * development or custom deployments). + * + *

The native library is loaded automatically when this class is first accessed. + * + *

Supported platforms: + * + *

    + *
+ * <ul>
+ *   <li>Linux x86_64 (linux-x86_64)
+ *   <li>Linux aarch64 (linux-aarch64)
+ *   <li>macOS x86_64 (osx-x86_64)
+ *   <li>macOS aarch64 (osx-aarch64)
+ *   <li>Windows x86_64 (windows-x86_64)
+ * </ul>
+ */ +public final class NativeLoader { + private static final Logger logger = LoggerFactory.getLogger(NativeLoader.class); + + private static final String LIBRARY_NAME = "zerobus_jni"; + private static final String NATIVE_RESOURCE_PATH = "/native"; + + private static volatile boolean loaded = false; + private static volatile Throwable loadError = null; + + static { + try { + loadNativeLibrary(); + loaded = true; + } catch (Throwable t) { + loadError = t; + logger.error("Failed to load native library", t); + } + } + + private NativeLoader() { + // Utility class + } + + /** + * Ensures the native library is loaded. + * + * @throws UnsatisfiedLinkError if the native library could not be loaded + */ + public static void ensureLoaded() { + if (!loaded) { + if (loadError != null) { + throw new UnsatisfiedLinkError("Native library failed to load: " + loadError.getMessage()); + } + throw new UnsatisfiedLinkError("Native library not loaded"); + } + } + + /** + * Returns whether the native library has been successfully loaded. + * + * @return true if loaded, false otherwise + */ + public static boolean isLoaded() { + return loaded; + } + + /** + * Returns the error that occurred during loading, if any. + * + * @return the error, or null if no error occurred + */ + public static Throwable getLoadError() { + return loadError; + } + + private static void loadNativeLibrary() { + // First, try to load from java.library.path (system library) + try { + System.loadLibrary(LIBRARY_NAME); + logger.info("Loaded native library from system path"); + return; + } catch (UnsatisfiedLinkError e) { + logger.debug("Native library not found in system path, trying classpath"); + } + + // Try to load from classpath (packaged in JAR) + String platform = getPlatformIdentifier(); + String libraryFileName = getLibraryFileName(); + String resourcePath = NATIVE_RESOURCE_PATH + "/" + platform + "/" + libraryFileName; + + try (InputStream in = NativeLoader.class.getResourceAsStream(resourcePath)) { + if (in == null) { + throw new UnsatisfiedLinkError( + "Native library not found in classpath: " + + resourcePath + + ". Platform: " + + platform + + ". 
Make sure the correct native JAR is on the classpath."); + } + + // Extract to a temporary file + File tempFile = extractToTempFile(in, libraryFileName); + System.load(tempFile.getAbsolutePath()); + logger.info("Loaded native library from classpath: {}", resourcePath); + + } catch (IOException e) { + throw new UnsatisfiedLinkError("Failed to extract native library: " + e.getMessage()); + } + } + + private static File extractToTempFile(InputStream in, String fileName) throws IOException { + // Create a unique temp directory for this process + File tempDir = Files.createTempDirectory("zerobus-native-").toFile(); + tempDir.deleteOnExit(); + + File tempFile = new File(tempDir, fileName); + tempFile.deleteOnExit(); + + try (OutputStream out = new FileOutputStream(tempFile)) { + byte[] buffer = new byte[8192]; + int bytesRead; + while ((bytesRead = in.read(buffer)) != -1) { + out.write(buffer, 0, bytesRead); + } + } + + // Make the library executable on Unix systems + if (!tempFile.setExecutable(true)) { + logger.debug("Could not set executable permission on native library"); + } + + return tempFile; + } + + private static String getPlatformIdentifier() { + String os = System.getProperty("os.name", "").toLowerCase(Locale.ROOT); + String arch = System.getProperty("os.arch", "").toLowerCase(Locale.ROOT); + + String osName; + if (os.contains("linux")) { + osName = "linux"; + } else if (os.contains("mac") || os.contains("darwin")) { + osName = "osx"; + } else if (os.contains("windows")) { + osName = "windows"; + } else { + throw new UnsatisfiedLinkError("Unsupported operating system: " + os); + } + + String archName; + if (arch.equals("amd64") || arch.equals("x86_64")) { + archName = "x86_64"; + } else if (arch.equals("aarch64") || arch.equals("arm64")) { + archName = "aarch64"; + } else { + throw new UnsatisfiedLinkError("Unsupported architecture: " + arch); + } + + return osName + "-" + archName; + } + + private static String getLibraryFileName() { + String os = System.getProperty("os.name", "").toLowerCase(Locale.ROOT); + + if (os.contains("windows")) { + return LIBRARY_NAME + ".dll"; + } else if (os.contains("mac") || os.contains("darwin")) { + return "lib" + LIBRARY_NAME + ".dylib"; + } else { + return "lib" + LIBRARY_NAME + ".so"; + } + } +} diff --git a/src/main/java/com/databricks/zerobus/ProtoRecord.java b/src/main/java/com/databricks/zerobus/ProtoRecord.java new file mode 100644 index 0000000..5bd5315 --- /dev/null +++ b/src/main/java/com/databricks/zerobus/ProtoRecord.java @@ -0,0 +1,85 @@ +package com.databricks.zerobus; + +import com.google.protobuf.Message; + +/** + * A wrapper for Protocol Buffer messages that implements {@link IngestableRecord}. + * + *

This class wraps any Protocol Buffer {@link Message} to make it compatible with the unified + * ingestion API. The message is serialized to bytes using the standard Protocol Buffer + * serialization. + * + *

+ * <p>Example usage:
+ *
+ * <pre>{@code
+ * MyProtoMessage message = MyProtoMessage.newBuilder()
+ *     .setField1("value")
+ *     .build();
+ *
+ * ProtoRecord<MyProtoMessage> record = new ProtoRecord<>(message);
+ * stream.ingestRecord(record);
+ * }</pre>
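`ProtoRecord.of(...)`, the factory defined later in this file, also composes with the batch APIs from this release. A hypothetical sketch: `MyProtoMessage` is the same placeholder type as in the example above, `stream` is an open PROTO-typed stream, and `ingestRecordsOffset` is assumed to return `Optional<Long>` (empty for an empty batch) as the changelog states.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

public class BatchSketch {
  static void ingestBatch(ZerobusStream stream, List<MyProtoMessage> messages)
      throws Exception {
    List<IngestableRecord> batch = new ArrayList<>();
    for (MyProtoMessage m : messages) {
      batch.add(ProtoRecord.of(m)); // wrap each protobuf message
    }
    // An empty Optional means the batch was empty and nothing was sent.
    Optional<Long> offset = stream.ingestRecordsOffset(batch);
    if (offset.isPresent()) {
      stream.waitForOffset(offset.get()); // one durability barrier for the whole batch
    }
  }
}
```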
+ * + * @param the Protocol Buffer message type + * @see IngestableRecord + * @see JsonRecord + */ +public class ProtoRecord implements IngestableRecord { + + private final T message; + + /** + * Creates a new ProtoRecord wrapping the given Protocol Buffer message. + * + * @param message the Protocol Buffer message to wrap + * @throws NullPointerException if message is null + */ + public ProtoRecord(T message) { + if (message == null) { + throw new NullPointerException("message cannot be null"); + } + this.message = message; + } + + /** + * Returns the wrapped Protocol Buffer message. + * + * @return the wrapped message + */ + public T getMessage() { + return message; + } + + /** + * Serializes the wrapped message to Protocol Buffer bytes. + * + * @return the serialized Protocol Buffer bytes + */ + @Override + public byte[] toEncodedBytes() { + return message.toByteArray(); + } + + /** + * Returns {@code false} since this is a Protocol Buffer record. + * + * @return {@code false} + */ + @Override + public boolean isJson() { + return false; + } + + /** + * Creates a new ProtoRecord from a Protocol Buffer message. + * + *

This is a convenience factory method equivalent to calling the constructor. + * + * @param message the Protocol Buffer message to wrap + * @param the Protocol Buffer message type + * @return a new ProtoRecord wrapping the message + */ + public static ProtoRecord of(T message) { + return new ProtoRecord<>(message); + } +} diff --git a/src/main/java/com/databricks/zerobus/StreamConfigurationOptions.java b/src/main/java/com/databricks/zerobus/StreamConfigurationOptions.java index 050b685..6d57d30 100644 --- a/src/main/java/com/databricks/zerobus/StreamConfigurationOptions.java +++ b/src/main/java/com/databricks/zerobus/StreamConfigurationOptions.java @@ -11,9 +11,20 @@ * *

Use the builder pattern to create instances: * - *

StreamConfigurationOptions options = StreamConfigurationOptions.builder() - * .setMaxInflightRecords(50000) .setRecovery(true) .setAckCallback(response -> - * System.out.println("Acked: " + response.getDurabilityAckUpToOffset())) .build(); + *

+ * <pre>{@code
+ * StreamConfigurationOptions options = StreamConfigurationOptions.builder()
+ *     .setMaxInflightRecords(50000)
+ *     .setRecovery(true)
+ *     .setAckCallback(new AckCallback() {
+ *         public void onAck(long offsetId) {
+ *             System.out.println("Acked offset: " + offsetId);
+ *         }
+ *         public void onError(long offsetId, String errorMessage) {
+ *             System.err.println("Error for offset " + offsetId + ": " + errorMessage);
+ *         }
+ *     })
+ *     .build();
+ * }</pre>
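For comparison with the PROTO-typed configuration above: a JSON stream needs no protobuf descriptor, so the only required change is the record type. A brief sketch using only methods added in this diff.

```java
// JSON-typed stream configuration: no protobuf descriptor is required,
// and JsonRecord instances can then be ingested directly.
StreamConfigurationOptions jsonOptions = StreamConfigurationOptions.builder()
    .setRecordType(RecordType.JSON)
    .setMaxInflightRecords(50_000)
    .build();
```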
*/ public class StreamConfigurationOptions { @@ -24,7 +35,9 @@ public class StreamConfigurationOptions { private int recoveryRetries = 3; private int flushTimeoutMs = 300000; private int serverLackOfAckTimeoutMs = 60000; + private RecordType recordType = RecordType.PROTO; private Optional> ackCallback = Optional.empty(); + private Optional newAckCallback = Optional.empty(); private StreamConfigurationOptions() {} @@ -36,7 +49,9 @@ private StreamConfigurationOptions( int recoveryRetries, int flushTimeoutMs, int serverLackOfAckTimeoutMs, - Optional> ackCallback) { + RecordType recordType, + Optional> ackCallback, + Optional newAckCallback) { this.maxInflightRecords = maxInflightRecords; this.recovery = recovery; this.recoveryTimeoutMs = recoveryTimeoutMs; @@ -44,7 +59,9 @@ private StreamConfigurationOptions( this.recoveryRetries = recoveryRetries; this.flushTimeoutMs = flushTimeoutMs; this.serverLackOfAckTimeoutMs = serverLackOfAckTimeoutMs; + this.recordType = recordType; this.ackCallback = ackCallback; + this.newAckCallback = newAckCallback; } /** @@ -131,6 +148,18 @@ public int serverLackOfAckTimeoutMs() { return this.serverLackOfAckTimeoutMs; } + /** + * Returns the record type for this stream. + * + *

The record type determines how records are serialized and validated. Proto streams require a + * descriptor, while JSON streams do not. + * + * @return the record type (PROTO or JSON) + */ + public RecordType recordType() { + return this.recordType; + } + /** * Returns the acknowledgment callback function. * @@ -138,17 +167,41 @@ public int serverLackOfAckTimeoutMs() { * this returns an empty Optional. * * @return the acknowledgment callback, or an empty Optional if none is set + * @deprecated Use {@link #getNewAckCallback()} instead, which provides error notifications. */ + @Deprecated public Optional> ackCallback() { return this.ackCallback; } + /** + * Returns the new-style acknowledgment callback. + * + *

This callback provides both success and error notifications. If no callback is set, this + * returns an empty Optional. + * + * @return the acknowledgment callback, or an empty Optional if none is set + */ + public Optional getNewAckCallback() { + return this.newAckCallback; + } + /** * Returns the default stream configuration options. * - *

Default values: - maxInflightRecords: 50000 - recovery: true - recoveryTimeoutMs: 15000 - - * recoveryBackoffMs: 2000 - recoveryRetries: 3 - flushTimeoutMs: 300000 - - * serverLackOfAckTimeoutMs: 60000 - ackCallback: empty + *

Default values: + * + *

    + *
+   * <ul>
+   *   <li>maxInflightRecords: 50000
+   *   <li>recovery: true
+   *   <li>recoveryTimeoutMs: 15000
+   *   <li>recoveryBackoffMs: 2000
+   *   <li>recoveryRetries: 3
+   *   <li>flushTimeoutMs: 300000
+   *   <li>serverLackOfAckTimeoutMs: 60000
+   *   <li>recordType: PROTO
+   *   <li>ackCallback: empty
+   * </ul>
* * @return the default stream configuration options */ @@ -173,9 +226,16 @@ public static StreamConfigurationOptionsBuilder builder() { * *

Example usage: * - *

StreamConfigurationOptions options = StreamConfigurationOptions.builder() - * .setMaxInflightRecords(100000) .setRecovery(false) .setAckCallback(response -> - * System.out.println("Record acked: " + response.getDurabilityAckUpToOffset())) .build(); + *

{@code
+   * StreamConfigurationOptions options = StreamConfigurationOptions.builder()
+   *     .setMaxInflightRecords(100000)
+   *     .setRecovery(false)
+   *     .setAckCallback(new AckCallback() {
+   *         public void onAck(long offsetId) { ... }
+   *         public void onError(long offsetId, String errorMessage) { ... }
+   *     })
+   *     .build();
+   * }
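The same builder also covers JSON streams. A sketch combining `setRecordType` (added in this patch) with the `JsonRecord` wrapper; the single-String `JsonRecord` constructor is an assumption based on its description as a JSON string wrapper, and `stream` is assumed to have been created with these options (exception handling elided):

```java
// Sketch: JSON-mode configuration. Assumes JsonRecord(String) exists;
// verify against the actual class before relying on it.
StreamConfigurationOptions jsonOptions =
    StreamConfigurationOptions.builder()
        .setRecordType(RecordType.JSON) // JSON streams need no protobuf descriptor
        .build();

// On a stream created with jsonOptions:
long offset = stream.ingestRecordOffset(new JsonRecord("{\"id\": 1, \"name\": \"alice\"}"));
stream.waitForOffset(offset);
```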
   *
   * @see StreamConfigurationOptions
   * @since 1.0.0
   */
@@ -190,7 +250,9 @@ public static class StreamConfigurationOptionsBuilder {
    private int recoveryRetries = defaultOptions.recoveryRetries;
    private int flushTimeoutMs = defaultOptions.flushTimeoutMs;
    private int serverLackOfAckTimeoutMs = defaultOptions.serverLackOfAckTimeoutMs;
+   private RecordType recordType = defaultOptions.recordType;
    private Optional<Consumer<IngestRecordResponse>> ackCallback = defaultOptions.ackCallback;
+   private Optional<AckCallback> newAckCallback = defaultOptions.newAckCallback;
 
    private StreamConfigurationOptionsBuilder() {}
 
@@ -294,6 +356,26 @@ public StreamConfigurationOptionsBuilder setServerLackOfAckTimeoutMs(
      return this;
    }
 
+   /**
+    * Sets the record type for this stream.
+    *
+    * <p>The record type determines how records are serialized and validated:
+    *
+    * <ul>
+    *   <li>{@link RecordType#PROTO} - Records are serialized as protobuf bytes (default)
+    *   <li>{@link RecordType#JSON} - Records are serialized as JSON strings
+    * </ul>
+    *
+    * <p>When using JSON record type, the stream can be created without a protobuf descriptor.
+    *
+    * @param recordType the record type (PROTO or JSON)
+    * @return this builder for method chaining
+    */
+   public StreamConfigurationOptionsBuilder setRecordType(RecordType recordType) {
+     this.recordType = recordType;
+     return this;
+   }
+
    /**
     * Sets the acknowledgment callback function.
     *
@@ -302,13 +384,29 @@ public StreamConfigurationOptionsBuilder setServerLackOfAckTimeoutMs(
     *
     * @param ackCallback the acknowledgment callback function
     * @return this builder for method chaining
+    * @deprecated Use {@link #setAckCallback(AckCallback)} instead.
     */
+   @Deprecated
    public StreamConfigurationOptionsBuilder setAckCallback(
        Consumer<IngestRecordResponse> ackCallback) {
      this.ackCallback = Optional.ofNullable(ackCallback);
      return this;
    }
 
+   /**
+    * Sets the acknowledgment callback.
+    *
+    * <p>

This callback is invoked for both successful acknowledgments and errors. It provides more + * detailed feedback than the deprecated Consumer-based callback. + * + * @param ackCallback the acknowledgment callback + * @return this builder for method chaining + */ + public StreamConfigurationOptionsBuilder setAckCallback(AckCallback ackCallback) { + this.newAckCallback = Optional.ofNullable(ackCallback); + return this; + } + /** * Builds a new StreamConfigurationOptions instance. * @@ -323,7 +421,9 @@ public StreamConfigurationOptions build() { this.recoveryRetries, this.flushTimeoutMs, this.serverLackOfAckTimeoutMs, - this.ackCallback); + this.recordType, + this.ackCallback, + this.newAckCallback); } } } diff --git a/src/main/java/com/databricks/zerobus/ZerobusArrowStream.java b/src/main/java/com/databricks/zerobus/ZerobusArrowStream.java new file mode 100644 index 0000000..11b731c --- /dev/null +++ b/src/main/java/com/databricks/zerobus/ZerobusArrowStream.java @@ -0,0 +1,331 @@ +package com.databricks.zerobus; + +import java.util.List; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Zerobus Arrow Flight stream for high-performance columnar data ingestion. + * + *

Experimental: Arrow Flight support is experimental and may change in future releases. + * + *

This class provides methods for ingesting Apache Arrow RecordBatches into a Databricks table + * via the Arrow Flight protocol. It offers higher throughput than the record-based API for columnar + * data. + * + *

Prerequisites: Arrow support requires adding Apache Arrow dependencies to your project: + * + *

+ * <pre>{@code
+ * <dependency>
+ *     <groupId>org.apache.arrow</groupId>
+ *     <artifactId>arrow-vector</artifactId>
+ *     <version>17.0.0</version>
+ * </dependency>
+ * <dependency>
+ *     <groupId>org.apache.arrow</groupId>
+ *     <artifactId>arrow-memory-netty</artifactId>
+ *     <version>17.0.0</version>
+ * </dependency>
+ * }</pre>
+ * + *

Arrow streams should be created using {@link ZerobusSdk#createArrowStream} and closed when no + * longer needed. + * + *

Example usage: + * + *

{@code
+ * Schema schema = new Schema(Arrays.asList(
+ *     Field.nullable("id", new ArrowType.Int(64, true)),
+ *     Field.nullable("name", new ArrowType.Utf8())
+ * ));
+ *
+ * ArrowTableProperties props = new ArrowTableProperties("catalog.schema.table", schema);
+ * ZerobusArrowStream stream = sdk.createArrowStream(props, clientId, clientSecret).join();
+ *
+ * // Ingest a batch
+ * VectorSchemaRoot batch = createBatch();
+ * long offset = stream.ingestBatch(batch);
+ * stream.waitForOffset(offset);
+ *
+ * // Close when done
+ * stream.close();
+ * }
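The example above calls a `createBatch()` helper that is not defined in this file. One plausible implementation for the two-column (id: Int64, name: Utf8) schema shown, using the Arrow APIs directly; method shape and row values are illustrative, and the caller owns the allocator and should eventually close both it and the returned root:

```java
import java.nio.charset.StandardCharsets;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.types.pojo.Schema;

static VectorSchemaRoot createBatch(Schema schema, RootAllocator allocator) {
  VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator);
  BigIntVector id = (BigIntVector) root.getVector("id");
  VarCharVector name = (VarCharVector) root.getVector("name");
  id.allocateNew(2);
  name.allocateNew();
  id.set(0, 1L);
  name.setSafe(0, "alice".getBytes(StandardCharsets.UTF_8));
  id.set(1, 2L);
  name.setSafe(1, "bob".getBytes(StandardCharsets.UTF_8));
  root.setRowCount(2); // number of rows in this batch
  return root;
}
```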
+ * + * @see ZerobusSdk#createArrowStream + * @see ArrowTableProperties + */ +public class ZerobusArrowStream implements AutoCloseable { + private static final Logger logger = LoggerFactory.getLogger(ZerobusArrowStream.class); + + // Ensure native library is loaded + static { + NativeLoader.ensureLoaded(); + } + + // Native handle to the Rust Arrow stream object + private volatile long nativeHandle; + + // Stream properties + private final ArrowTableProperties tableProperties; + private final ArrowStreamConfigurationOptions options; + private final String clientId; + private final String clientSecret; + + /** + * Package-private constructor. Arrow streams should be created via {@link + * ZerobusSdk#createArrowStream}. + */ + ZerobusArrowStream( + long nativeHandle, + ArrowTableProperties tableProperties, + ArrowStreamConfigurationOptions options, + String clientId, + String clientSecret) { + this.nativeHandle = nativeHandle; + this.tableProperties = tableProperties; + this.options = options; + this.clientId = clientId; + this.clientSecret = clientSecret; + } + + /** + * Returns the table properties for this stream. + * + * @return the table properties + */ + public ArrowTableProperties getTableProperties() { + return tableProperties; + } + + /** + * Returns the stream configuration options. + * + * @return the stream options + */ + public ArrowStreamConfigurationOptions getOptions() { + return options; + } + + /** + * Returns the OAuth client ID. + * + * @return the client ID + */ + public String getClientId() { + return clientId; + } + + /** + * Returns the OAuth client secret. + * + * @return the client secret + */ + public String getClientSecret() { + return clientSecret; + } + + /** + * Returns whether the stream is closed. + * + * @return true if closed + */ + public boolean isClosed() { + return nativeHandle == 0 || nativeIsClosed(nativeHandle); + } + + /** + * Returns the table name. + * + * @return the table name + */ + public String getTableName() { + return tableProperties.getTableName(); + } + + /** + * Returns the Arrow schema for this stream. + * + * @return the Arrow schema object + */ + public Object getSchema() { + return tableProperties.getSchema(); + } + + /** + * Ingests an Arrow RecordBatch (provided as VectorSchemaRoot) and returns the offset. + * + *

The batch must be serialized to Arrow IPC format before calling this method. Use {@link + * #ingestBatchRaw(byte[])} if you have pre-serialized IPC data. + * + *
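If you already hold a `VectorSchemaRoot` and want to feed `ingestBatchRaw(byte[])` yourself, the IPC serialization this class performs internally can be written directly. A sketch, assuming the Arrow dependencies listed above; passing a null dictionary provider plays the role of the empty provider used internally:

```java
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.channels.Channels;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;

static byte[] toIpcBytes(VectorSchemaRoot root) throws IOException {
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null, Channels.newChannel(out))) {
    writer.start();      // writes the stream header and schema
    writer.writeBatch(); // writes the root's current contents as one batch
    writer.end();        // writes the end-of-stream marker
  }
  return out.toByteArray();
}
```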

This method blocks until the batch is enqueued but does NOT wait for acknowledgment. + * + * @param batch the VectorSchemaRoot to ingest (will be serialized to IPC format) + * @return the offset ID assigned to this batch + * @throws ZerobusException if an error occurs + */ + public long ingestBatch(Object batch) throws ZerobusException { + ensureOpen(); + + // Serialize the batch to IPC format using reflection + byte[] batchData = serializeBatchToIpc(batch); + return nativeIngestBatch(nativeHandle, batchData); + } + + /** + * Ingests pre-serialized Arrow IPC data and returns the offset. + * + * @param batchData the Arrow IPC serialized batch data + * @return the offset ID assigned to this batch + * @throws ZerobusException if an error occurs + */ + public long ingestBatchRaw(byte[] batchData) throws ZerobusException { + ensureOpen(); + return nativeIngestBatch(nativeHandle, batchData); + } + + /** + * Waits for a specific offset to be acknowledged by the server. + * + * @param offset the offset to wait for + * @throws ZerobusException if an error occurs or the wait times out + */ + public void waitForOffset(long offset) throws ZerobusException { + ensureOpen(); + nativeWaitForOffset(nativeHandle, offset); + } + + /** + * Flushes all pending batches, waiting for acknowledgment. + * + * @throws ZerobusException if an error occurs or the flush times out + */ + public void flush() throws ZerobusException { + ensureOpen(); + nativeFlush(nativeHandle); + logger.info("All batches have been flushed"); + } + + /** + * Closes the stream, flushing all pending batches first. + * + * @throws ZerobusException if an error occurs during close + */ + @Override + public void close() throws ZerobusException { + long handle = nativeHandle; + if (handle != 0) { + nativeHandle = 0; + nativeClose(handle); + nativeDestroy(handle); + logger.info("Arrow stream closed"); + } + } + + /** + * Returns the unacknowledged batches as serialized Arrow IPC data. + * + * @return a list of IPC-serialized batch data + * @throws ZerobusException if an error occurs + */ + public List getUnackedBatchesRaw() throws ZerobusException { + if (nativeHandle == 0) { + return java.util.Collections.emptyList(); + } + return nativeGetUnackedBatches(nativeHandle); + } + + private void ensureOpen() throws ZerobusException { + if (nativeHandle == 0) { + throw new ZerobusException("Arrow stream is closed"); + } + if (nativeIsClosed(nativeHandle)) { + throw new ZerobusException("Arrow stream is closed"); + } + } + + /** + * Serializes a VectorSchemaRoot to Arrow IPC format using reflection. + * + *

This avoids requiring Arrow at compile time. + */ + private byte[] serializeBatchToIpc(Object batch) throws ZerobusException { + if (!ArrowTableProperties.isArrowAvailable()) { + throw new ZerobusException( + "Apache Arrow libraries are not available. " + + "Add arrow-vector and arrow-memory-netty dependencies to use Arrow Flight."); + } + + try { + // Use reflection to serialize the batch + Class vectorSchemaRootClass = Class.forName("org.apache.arrow.vector.VectorSchemaRoot"); + Class arrowStreamWriterClass = + Class.forName("org.apache.arrow.vector.ipc.ArrowStreamWriter"); + Class dictionaryProviderClass = + Class.forName("org.apache.arrow.vector.dictionary.DictionaryProvider"); + Class mapDictionaryProviderClass = + Class.forName( + "org.apache.arrow.vector.dictionary.DictionaryProvider$MapDictionaryProvider"); + + java.io.ByteArrayOutputStream baos = new java.io.ByteArrayOutputStream(); + java.nio.channels.WritableByteChannel channel = java.nio.channels.Channels.newChannel(baos); + + // Create an empty dictionary provider + Object dictionaryProvider = mapDictionaryProviderClass.getConstructor().newInstance(); + + // Create the writer + Object writer = + arrowStreamWriterClass + .getConstructor( + vectorSchemaRootClass, + dictionaryProviderClass, + java.nio.channels.WritableByteChannel.class) + .newInstance(batch, dictionaryProvider, channel); + + // Start the stream + java.lang.reflect.Method startMethod = arrowStreamWriterClass.getMethod("start"); + startMethod.invoke(writer); + + // Write the batch + java.lang.reflect.Method writeBatchMethod = arrowStreamWriterClass.getMethod("writeBatch"); + writeBatchMethod.invoke(writer); + + // End the stream + java.lang.reflect.Method endMethod = arrowStreamWriterClass.getMethod("end"); + endMethod.invoke(writer); + + // Close the writer + java.lang.reflect.Method closeMethod = arrowStreamWriterClass.getMethod("close"); + closeMethod.invoke(writer); + + return baos.toByteArray(); + } catch (Exception e) { + throw new ZerobusException("Failed to serialize Arrow batch: " + e.getMessage(), e); + } + } + + @Override + protected void finalize() { + if (nativeHandle != 0) { + nativeDestroy(nativeHandle); + nativeHandle = 0; + } + } + + // Native methods implemented in Rust + + private static native void nativeDestroy(long handle); + + private native long nativeIngestBatch(long handle, byte[] batchData); + + private native void nativeWaitForOffset(long handle, long offset); + + private native void nativeFlush(long handle); + + private native void nativeClose(long handle); + + private native boolean nativeIsClosed(long handle); + + private native String nativeGetTableName(long handle); + + private native List nativeGetUnackedBatches(long handle); +} diff --git a/src/main/java/com/databricks/zerobus/ZerobusSdk.java b/src/main/java/com/databricks/zerobus/ZerobusSdk.java index a8b5429..36dc10b 100644 --- a/src/main/java/com/databricks/zerobus/ZerobusSdk.java +++ b/src/main/java/com/databricks/zerobus/ZerobusSdk.java @@ -1,16 +1,7 @@ package com.databricks.zerobus; import com.google.protobuf.Message; -import io.grpc.Status; -import io.grpc.StatusRuntimeException; -import java.util.Iterator; -import java.util.Random; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.ThreadFactory; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Supplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,6 +12,9 @@ * 
Databricks tables. It handles authentication, connection management, and stream lifecycle * operations. * + *

The SDK uses a native Rust implementation via JNI for optimal performance. The native library + * is loaded automatically when the SDK is first used. + * *

Example usage: * *

{@code
@@ -43,94 +37,37 @@
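The Javadoc example body is skipped by the hunk above. For orientation, a typical flow consistent with the surrounding docs might look like the following sketch; the endpoints, `MyRecord`, and credential variables are placeholders, not part of this patch:

```java
// Placeholder endpoints and names; the host format mirrors the one in the
// removed extractWorkspaceId javadoc further down in this file.
ZerobusSdk sdk =
    new ZerobusSdk(
        "1234567890123456.zerobus.us-west-2.cloud.databricks.com",
        "https://my-workspace.cloud.databricks.com");

ZerobusStream<MyRecord> stream =
    sdk.createStream(tableProperties, clientId, clientSecret, options).join();
```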
 public class ZerobusSdk {
   private static final Logger logger = LoggerFactory.getLogger(ZerobusSdk.class);
 
+  // Ensure native library is loaded
+  static {
+    NativeLoader.ensureLoaded();
+  }
+
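`NativeLoader` itself is not part of this diff. A conventional JNI loader along the lines the docs describe (try the system library path first, then fall back to extracting the library from the classpath) could look roughly like this; the class, library, and resource names are hypothetical, not the actual implementation:

```java
// Hypothetical sketch only; the real NativeLoader is not shown in this patch.
final class NativeLoaderSketch {
  private static volatile boolean loaded;

  static synchronized void ensureLoaded() {
    if (loaded) return;
    try {
      // Prefer a library already on java.library.path.
      System.loadLibrary("zerobus_jni");
    } catch (UnsatisfiedLinkError e) {
      // Fall back to a bundled copy (".dll" would be used on Windows).
      loadFromClasspath("/native/libzerobus_jni.so");
    }
    loaded = true;
  }

  private static void loadFromClasspath(String resource) {
    try (java.io.InputStream in = NativeLoaderSketch.class.getResourceAsStream(resource)) {
      if (in == null) throw new UnsatisfiedLinkError("Missing native resource " + resource);
      java.nio.file.Path tmp = java.nio.file.Files.createTempFile("zerobus_jni", ".so");
      java.nio.file.Files.copy(in, tmp, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
      System.load(tmp.toAbsolutePath().toString());
    } catch (java.io.IOException e) {
      throw new UnsatisfiedLinkError("Failed to extract native library: " + e);
    }
  }
}
```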
   // Constants
   private static final StreamConfigurationOptions DEFAULT_OPTIONS =
       StreamConfigurationOptions.getDefault();
-  private static final int STREAM_EXECUTOR_THREAD_POOL_SIZE = 4;
-  private static final String HTTPS_PREFIX = "https://";
-  private static final String HTTP_PREFIX = "http://";
-  private static final String THREAD_NAME_PREFIX = "ZerobusStream-executor-";
 
-  private static final Random RANDOM = new Random();
+  // Native handle to the Rust SDK object
+  private volatile long nativeHandle;
 
+  // Store endpoints for stream creation
   private final String serverEndpoint;
   private final String unityCatalogEndpoint;
-  private final String workspaceId;
-
-  private ZerobusSdkStubFactory stubFactory = ZerobusSdkStubFactory.create();
 
   /**
    * Creates a new ZerobusSdk instance.
    *
    * @param serverEndpoint The gRPC endpoint URL for the Zerobus service.
    * @param unityCatalogEndpoint The Unity Catalog endpoint URL.
+   * @throws RuntimeException if the native SDK cannot be initialized
    */
   public ZerobusSdk(String serverEndpoint, String unityCatalogEndpoint) {
     this.serverEndpoint = serverEndpoint;
     this.unityCatalogEndpoint = unityCatalogEndpoint;
-    this.workspaceId = extractWorkspaceId(serverEndpoint);
-  }
-
-  /**
-   * Sets the stub factory (used for testing).
-   *
-   * @param stubFactory The stub factory to use
-   */
-  void setStubFactory(ZerobusSdkStubFactory stubFactory) {
-    this.stubFactory = stubFactory;
-  }
-
-  /**
-   * Extracts workspace ID from server endpoint.
-   *
-   * 

The workspace ID is the first component of the endpoint hostname. - * - *

Example: {@code 1234567890123456.zerobus.us-west-2.cloud.databricks.com} returns {@code - * 1234567890123456} - * - * @param endpoint The server endpoint (may include protocol prefix) - * @return The extracted workspace ID - */ - private static String extractWorkspaceId(String endpoint) { - String cleanEndpoint = endpoint; - - // Remove protocol prefix if present - if (cleanEndpoint.startsWith(HTTPS_PREFIX)) { - cleanEndpoint = cleanEndpoint.substring(HTTPS_PREFIX.length()); - } else if (cleanEndpoint.startsWith(HTTP_PREFIX)) { - cleanEndpoint = cleanEndpoint.substring(HTTP_PREFIX.length()); + this.nativeHandle = nativeCreate(serverEndpoint, unityCatalogEndpoint); + if (this.nativeHandle == 0) { + throw new RuntimeException("Failed to create native SDK instance"); } - - // Extract workspace ID (first part before first dot) - int dotIndex = cleanEndpoint.indexOf('.'); - return dotIndex > 0 ? cleanEndpoint.substring(0, dotIndex) : cleanEndpoint; - } - - /** - * Creates an executor service for stream operations. - * - *

The executor uses daemon threads to avoid preventing JVM shutdown. Each thread is named with - * a unique instance ID for debugging purposes. - * - * @return A new ExecutorService configured for stream operations - */ - private static ExecutorService createStreamExecutor() { - long instanceId = 1000000000L + Math.abs(RANDOM.nextLong() % 9000000000L); - - ThreadFactory daemonThreadFactory = - new ThreadFactory() { - private final AtomicInteger counter = new AtomicInteger(0); - - @Override - public Thread newThread(Runnable runnable) { - Thread thread = new Thread(runnable); - thread.setDaemon(true); - thread.setName(THREAD_NAME_PREFIX + instanceId + "-" + counter.getAndIncrement()); - return thread; - } - }; - - return Executors.newFixedThreadPool(STREAM_EXECUTOR_THREAD_POOL_SIZE, daemonThreadFactory); + logger.debug("ZerobusSdk created for endpoint: {}", serverEndpoint); } /** @@ -154,80 +91,40 @@ public CompletableFuture> String clientSecret, StreamConfigurationOptions options) { - ExecutorService streamExecutor = createStreamExecutor(); - CompletableFuture> resultFuture = new CompletableFuture<>(); - - try { - logger.debug("Creating stream for table: " + tableProperties.getTableName()); - - // Create a token supplier that generates a fresh token for each gRPC request - Supplier tokenSupplier = - () -> { - try { - return TokenFactory.getZerobusToken( - tableProperties.getTableName(), - workspaceId, - unityCatalogEndpoint, - clientId, - clientSecret); - } catch (NonRetriableException e) { - throw new RuntimeException("Failed to get Zerobus token", e); - } - }; - - // Create a stub supplier that generates a fresh stub with token supplier each time - Supplier stubSupplier = - () -> - stubFactory.createStubWithTokenSupplier( - serverEndpoint, tableProperties.getTableName(), tokenSupplier); - - ZerobusStream stream = - new ZerobusStream<>( - stubSupplier, - tableProperties, - stubFactory, - serverEndpoint, - workspaceId, - unityCatalogEndpoint, - clientId, - clientSecret, - options, - streamExecutor, - streamExecutor); - - stream - .initialize() - .whenComplete( - (result, error) -> { - if (error == null) { - resultFuture.complete(stream); - } else { - resultFuture.completeExceptionally(error); - } - }); - } catch (Throwable e) { - logger.error("Failed to create stream with: " + e.getMessage(), e); - - Throwable ex; - if (e instanceof ZerobusException) { - ex = e; - } else if (e instanceof StatusRuntimeException) { - StatusRuntimeException sre = (StatusRuntimeException) e; - Status.Code code = sre.getStatus().getCode(); - if (GrpcErrorHandling.isNonRetriable(code)) { - ex = - new NonRetriableException( - "Non-retriable gRPC error during stream creation: " + sre.getMessage(), sre); - } else { - ex = new ZerobusException("Failed to create stream: " + sre.getMessage(), sre); - } - } else { - ex = new ZerobusException("Failed to create stream: " + e.getMessage(), e); - } - resultFuture.completeExceptionally(ex); - } + ensureOpen(); + + StreamConfigurationOptions effectiveOptions = options != null ? options : DEFAULT_OPTIONS; + boolean isJson = effectiveOptions.recordType() == com.databricks.zerobus.RecordType.JSON; - return resultFuture; + logger.debug( + "Creating {} stream for table: {}", + isJson ? "JSON" : "Proto", + tableProperties.getTableName()); + + // Get the descriptor proto bytes (null for JSON streams) + byte[] descriptorProtoBytes = + isJson ? 
null : tableProperties.getDescriptorProto().toByteArray(); + + // Call native method to create stream + CompletableFuture handleFuture = + nativeCreateStream( + nativeHandle, + tableProperties.getTableName(), + descriptorProtoBytes, + clientId, + clientSecret, + effectiveOptions, + isJson); + + // Convert the handle to a ZerobusStream + return handleFuture.thenApply( + handle -> { + if (handle == null || handle == 0) { + throw new RuntimeException("Failed to create stream: null handle returned"); + } + return new ZerobusStream<>( + handle, tableProperties, effectiveOptions, clientId, clientSecret, isJson); + }); } /** @@ -258,32 +155,178 @@ public CompletableFuture> public CompletableFuture> recreateStream( ZerobusStream failedStream) { - CompletableFuture> resultFuture = new CompletableFuture<>(); + ensureOpen(); - createStream( + return createStream( failedStream.getTableProperties(), failedStream.getClientId(), failedStream.getClientSecret(), failedStream.getOptions()) - .whenComplete( - (stream, error) -> { - if (error == null) { - // ingest unacked records - Iterator unackedRecords = failedStream.getUnackedRecords(); - - try { - while (unackedRecords.hasNext()) { - stream.ingestRecord(unackedRecords.next()); + .thenCompose( + stream -> { + // Re-ingest unacked records using the new batch API + try { + java.util.List batches = failedStream.getUnackedBatches(); + for (EncodedBatch batch : batches) { + for (IngestableRecord record : batch.toIngestableRecords()) { + stream.ingestRecord(record); } - resultFuture.complete(stream); - } catch (ZerobusException e) { - resultFuture.completeExceptionally(e); } - } else { - resultFuture.completeExceptionally(error); + return CompletableFuture.completedFuture(stream); + } catch (ZerobusException e) { + CompletableFuture> failed = new CompletableFuture<>(); + failed.completeExceptionally(e); + return failed; + } + }); + } + + /** + * Creates a new Arrow Flight stream for ingesting Arrow data into a table. + * + *

Arrow streams provide high-performance ingestion for Apache Arrow columnar data. + * + * @param tableProperties Configuration for the target table including table name and Arrow + * schema. + * @param clientId The OAuth client ID for authentication. + * @param clientSecret The OAuth client secret for authentication. + * @param options Configuration options for the Arrow stream. + * @return A CompletableFuture that completes with the ZerobusArrowStream when ready. + */ + public CompletableFuture createArrowStream( + ArrowTableProperties tableProperties, + String clientId, + String clientSecret, + ArrowStreamConfigurationOptions options) { + + ensureOpen(); + + logger.debug("Creating Arrow stream for table: {}", tableProperties.getTableName()); + + // Serialize the Arrow schema to IPC format + byte[] schemaBytes = tableProperties.getSchemaBytes(); + + // Call native method to create Arrow stream + CompletableFuture handleFuture = + nativeCreateArrowStream( + nativeHandle, + tableProperties.getTableName(), + schemaBytes, + clientId, + clientSecret, + options != null ? options : ArrowStreamConfigurationOptions.getDefault()); + + // Convert the handle to a ZerobusArrowStream + return handleFuture.thenApply( + handle -> { + if (handle == null || handle == 0) { + throw new RuntimeException("Failed to create Arrow stream: null handle returned"); + } + return new ZerobusArrowStream( + handle, + tableProperties, + options != null ? options : ArrowStreamConfigurationOptions.getDefault(), + clientId, + clientSecret); + }); + } + + /** + * Creates a new Arrow Flight stream with default options. + * + * @param tableProperties Configuration for the target table. + * @param clientId The OAuth client ID for authentication. + * @param clientSecret The OAuth client secret for authentication. + * @return A CompletableFuture that completes with the ZerobusArrowStream when ready. + */ + public CompletableFuture createArrowStream( + ArrowTableProperties tableProperties, String clientId, String clientSecret) { + return createArrowStream(tableProperties, clientId, clientSecret, null); + } + + /** + * Recreate an Arrow stream from a failed stream. + * + * @param failedStream The Arrow stream to be recreated. + * @return A CompletableFuture that completes with the new ZerobusArrowStream when ready. + */ + public CompletableFuture recreateArrowStream( + ZerobusArrowStream failedStream) { + + ensureOpen(); + + return createArrowStream( + failedStream.getTableProperties(), + failedStream.getClientId(), + failedStream.getClientSecret(), + failedStream.getOptions()) + .thenCompose( + stream -> { + // Re-ingest unacked batches + try { + java.util.List batches = failedStream.getUnackedBatchesRaw(); + for (byte[] batchData : batches) { + stream.ingestBatchRaw(batchData); + } + return CompletableFuture.completedFuture(stream); + } catch (ZerobusException e) { + CompletableFuture failed = new CompletableFuture<>(); + failed.completeExceptionally(e); + return failed; } }); + } + + /** + * Closes the SDK and releases all resources. + * + *

After calling this method, the SDK cannot be used to create new streams. + */ + public void close() { + long handle = nativeHandle; + if (handle != 0) { + nativeHandle = 0; + nativeDestroy(handle); + logger.debug("ZerobusSdk closed"); + } + } + + private void ensureOpen() { + if (nativeHandle == 0) { + throw new IllegalStateException("SDK has been closed"); + } + } - return resultFuture; + @Override + protected void finalize() { + close(); } + + // Native methods implemented in Rust + + private static native long nativeCreate(String serverEndpoint, String unityCatalogEndpoint); + + private static native void nativeDestroy(long handle); + + private native CompletableFuture nativeCreateStream( + long sdkHandle, + String tableName, + byte[] descriptorProto, + String clientId, + String clientSecret, + Object options, + boolean isJson); + + private native CompletableFuture nativeRecreateStream(long sdkHandle, long streamHandle); + + private native CompletableFuture nativeCreateArrowStream( + long sdkHandle, + String tableName, + byte[] arrowSchema, + String clientId, + String clientSecret, + Object options); + + private native CompletableFuture nativeRecreateArrowStream( + long sdkHandle, long streamHandle); } diff --git a/src/main/java/com/databricks/zerobus/ZerobusSdkStubUtils.java b/src/main/java/com/databricks/zerobus/ZerobusSdkStubUtils.java deleted file mode 100644 index 6893cd5..0000000 --- a/src/main/java/com/databricks/zerobus/ZerobusSdkStubUtils.java +++ /dev/null @@ -1,164 +0,0 @@ -package com.databricks.zerobus; - -import io.grpc.CallOptions; -import io.grpc.Channel; -import io.grpc.ClientCall; -import io.grpc.ClientInterceptor; -import io.grpc.ManagedChannel; -import io.grpc.Metadata; -import io.grpc.MethodDescriptor; -import io.grpc.netty.shaded.io.grpc.netty.NettyChannelBuilder; -import java.util.concurrent.TimeUnit; - -/** - * Factory for creating Zerobus gRPC stubs with proper configuration. - * - *

This factory handles the creation of gRPC channels and stubs with appropriate settings for - * long-lived streaming connections. - */ -class ZerobusSdkStubFactory { - - // gRPC channel configuration constants - private static final int DEFAULT_TLS_PORT = 443; - private static final long KEEP_ALIVE_TIME_SECONDS = 30; - private static final long KEEP_ALIVE_TIMEOUT_SECONDS = 10; - - // Protocol prefix - private static final String HTTPS_PREFIX = "https://"; - - /** - * Creates a new managed gRPC channel with TLS. - * - *

The channel is configured for long-lived streaming with appropriate keep-alive settings and - * unlimited message size limits. - * - * @param endpoint The endpoint URL (may include https:// prefix) - * @return A configured ManagedChannel - */ - ManagedChannel createGrpcChannel(String endpoint) { - EndpointInfo endpointInfo = parseEndpoint(endpoint); - - NettyChannelBuilder builder = - NettyChannelBuilder.forAddress(endpointInfo.host, endpointInfo.port).useTransportSecurity(); - - // Configure for long-lived streaming connections with unlimited message size - return builder - .keepAliveTime(KEEP_ALIVE_TIME_SECONDS, TimeUnit.SECONDS) - .keepAliveTimeout(KEEP_ALIVE_TIMEOUT_SECONDS, TimeUnit.SECONDS) - .keepAliveWithoutCalls(true) - .maxInboundMessageSize(Integer.MAX_VALUE) - .build(); - } - - /** - * Creates a new Zerobus gRPC stub with dynamic token supplier. - * - *

The stub is configured with an interceptor that obtains a fresh token for each request using - * the provided token supplier. This allows token rotation without recreating the stub. - * - *

Note: Currently creates a new channel for each stub. Consider reusing channels across - * multiple streams for better resource utilization. - * - * @param endpoint The endpoint URL - * @param tableName The target table name - * @param tokenSupplier Supplier that provides a fresh authentication token for each request - * @return A configured ZerobusStub with unlimited message sizes - */ - ZerobusGrpc.ZerobusStub createStubWithTokenSupplier( - String endpoint, String tableName, java.util.function.Supplier tokenSupplier) { - ManagedChannel channel = createGrpcChannel(endpoint); - ClientInterceptor authInterceptor = new AuthenticationInterceptor(tokenSupplier, tableName); - Channel interceptedChannel = io.grpc.ClientInterceptors.intercept(channel, authInterceptor); - return ZerobusGrpc.newStub(interceptedChannel) - .withMaxInboundMessageSize(Integer.MAX_VALUE) - .withMaxOutboundMessageSize(Integer.MAX_VALUE); - } - - /** - * Creates a new stub factory instance. - * - * @return A new ZerobusSdkStubFactory - */ - static ZerobusSdkStubFactory create() { - return new ZerobusSdkStubFactory(); - } - - /** - * Parses an endpoint string to extract host and port information. - * - * @param endpoint The endpoint string (may include https:// prefix) - * @return Parsed endpoint information - */ - private EndpointInfo parseEndpoint(String endpoint) { - // Remove protocol prefix if present - String cleanEndpoint = endpoint; - if (cleanEndpoint.startsWith(HTTPS_PREFIX)) { - cleanEndpoint = cleanEndpoint.substring(HTTPS_PREFIX.length()); - } - - // Parse host:port format - String[] parts = cleanEndpoint.split(":", 2); - String host = parts[0]; - int port = parts.length > 1 ? Integer.parseInt(parts[1]) : DEFAULT_TLS_PORT; - - return new EndpointInfo(host, port); - } - - /** Container for parsed endpoint information. */ - private static class EndpointInfo { - final String host; - final int port; - - EndpointInfo(String host, int port) { - this.host = host; - this.port = port; - } - } -} - -/** - * gRPC client interceptor that adds authentication headers to requests. - * - *

This interceptor attaches the following headers to all outgoing requests: - * - *

- * <ul>
- *   <li>Authorization: Bearer token
- *   <li>x-databricks-zerobus-table-name: table name
- * </ul>
- */ -class AuthenticationInterceptor implements ClientInterceptor { - - private static final Metadata.Key AUTHORIZATION_HEADER = - Metadata.Key.of("authorization", Metadata.ASCII_STRING_MARSHALLER); - private static final Metadata.Key TABLE_NAME_HEADER = - Metadata.Key.of("x-databricks-zerobus-table-name", Metadata.ASCII_STRING_MARSHALLER); - private static final String BEARER_PREFIX = "Bearer "; - - private final java.util.function.Supplier tokenSupplier; - private final String tableName; - - /** - * Creates a new authentication interceptor with a dynamic token supplier. - * - * @param tokenSupplier Supplier that provides a fresh authentication token for each request - * @param tableName The target table name - */ - AuthenticationInterceptor(java.util.function.Supplier tokenSupplier, String tableName) { - this.tokenSupplier = tokenSupplier; - this.tableName = tableName; - } - - @Override - public ClientCall interceptCall( - MethodDescriptor method, CallOptions callOptions, Channel next) { - return new io.grpc.ForwardingClientCall.SimpleForwardingClientCall( - next.newCall(method, callOptions)) { - @Override - public void start(Listener responseListener, Metadata headers) { - headers.put(AUTHORIZATION_HEADER, BEARER_PREFIX + tokenSupplier.get()); - headers.put(TABLE_NAME_HEADER, tableName); - super.start(responseListener, headers); - } - }; - } -} diff --git a/src/main/java/com/databricks/zerobus/ZerobusStream.java b/src/main/java/com/databricks/zerobus/ZerobusStream.java index 1ba47f5..6d9b8fc 100644 --- a/src/main/java/com/databricks/zerobus/ZerobusStream.java +++ b/src/main/java/com/databricks/zerobus/ZerobusStream.java @@ -1,200 +1,115 @@ package com.databricks.zerobus; -import com.databricks.zerobus.ZerobusGrpc.ZerobusStub; -import com.google.protobuf.ByteString; import com.google.protobuf.Message; -import io.grpc.Status; -import io.grpc.StatusRuntimeException; -import io.grpc.stub.ClientCallStreamObserver; -import io.grpc.stub.ClientResponseObserver; import java.util.ArrayList; -import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Optional; -import java.util.Set; -import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeoutException; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.function.Supplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** Types of stream failures that can occur during ingestion. */ -enum StreamFailureType { - /** Unknown failure type */ - UNKNOWN, - /** Server closed the stream */ - SERVER_CLOSED_STREAM, - /** Failed while sending a message to the server */ - SENDING_MESSAGE, - /** Server stopped responding to requests */ - SERVER_UNRESPONSIVE -} - -/** Tracks stream failure counts and types for recovery decisions. 
*/ -class StreamFailureInfo { - private StreamFailureType _failureType = StreamFailureType.UNKNOWN; - private int _failureCounts = 0; - - synchronized void logFailure(StreamFailureType streamFailureType) { - if (streamFailureType == _failureType) { - _failureCounts += 1; - } else { - _failureType = streamFailureType; - _failureCounts = 1; - } - } - - synchronized void resetFailure(StreamFailureType streamFailureType) { - if (_failureType == streamFailureType) { - _failureCounts = 0; - _failureType = StreamFailureType.UNKNOWN; - } - } - - synchronized int getFailureCounts() { - return _failureCounts; - } - - synchronized StreamFailureType getFailureType() { - return _failureType; - } -} - -/** - * Utility for classifying gRPC errors as retriable or non-retriable. Non-retriable errors indicate - * issues that cannot be resolved by retrying (e.g., invalid credentials, missing resources). - */ -class GrpcErrorHandling { - private static final Set NON_RETRIABLE_CODES = new HashSet<>(); - - static { - NON_RETRIABLE_CODES.add(Status.Code.INVALID_ARGUMENT); - NON_RETRIABLE_CODES.add(Status.Code.NOT_FOUND); - NON_RETRIABLE_CODES.add(Status.Code.UNAUTHENTICATED); - NON_RETRIABLE_CODES.add(Status.Code.OUT_OF_RANGE); - } - - /** - * Determines if a gRPC status code represents a non-retriable error. - * - * @param code The gRPC status code to check - * @return true if the error should not be retried - */ - static boolean isNonRetriable(Status.Code code) { - return NON_RETRIABLE_CODES.contains(code); - } -} - /** - * Internal record wrapper that tracks ingestion state. + * Zerobus stream for ingesting records into a table. * - * @param The type of the protobuf message being ingested - */ -class Record { - long offsetId; - final T record; - final ByteString protoEncodedRecord; - final CompletableFuture ackPromise; - - Record( - long offsetId, T record, ByteString protoEncodedRecord, CompletableFuture ackPromise) { - this.offsetId = offsetId; - this.record = record; - this.protoEncodedRecord = protoEncodedRecord; - this.ackPromise = ackPromise; - } -} - -/** - * Zerobus stream for ingesting records into a table. Should be created using - * ZerobusSdk.createStream. + *

This class provides methods for ingesting records into a Databricks table via the Zerobus + * service. It supports both Protocol Buffer and JSON record formats through the {@link + * IngestableRecord} interface. + * + *

Streams should be created using {@link ZerobusSdk#createStream} and closed when no longer + * needed. + * + *

Example usage: + * + *

{@code
+ * ZerobusStream stream = sdk.createStream(tableProperties, clientId, clientSecret).join();
+ *
+ * // Ingest a record and wait for acknowledgment
+ * stream.ingestRecord(myRecord).join();
+ *
+ * // Or use offset-based API for better control
+ * long offset = stream.ingestRecordOffset(new ProtoRecord<>(myRecord));
+ * stream.waitForOffset(offset);
+ *
+ * // Close when done
+ * stream.close();
+ * }
+ * + * @param the Protocol Buffer message type for this stream */ -public class ZerobusStream { +public class ZerobusStream implements AutoCloseable { private static final Logger logger = LoggerFactory.getLogger(ZerobusStream.class); - // implicit ec: ExecutionContext - this is the ExecutionContext that client provides to run async - // operations (e.g.create stream async result processing) - // zerobusStreamExecutor: ExecutionContext - This is used only for futures like timeout counter / - // stream recovery / stream unresponsiveness detection, so we don't block threads from customer's - // ExecutionContext - // We have to use a separate executor (bounded) to make sure stream progress is - // not blocked + // Ensure native library is loaded + static { + NativeLoader.ensureLoaded(); + } - private static final int CREATE_STREAM_TIMEOUT_MS = 15000; + // Native handle to the Rust stream object + private volatile long nativeHandle; - private ZerobusStub stub; - private final Supplier stubSupplier; - final TableProperties tableProperties; - private final ZerobusSdkStubFactory stubFactory; - private final String serverEndpoint; - final StreamConfigurationOptions options; - private final ExecutorService zerobusStreamExecutor; - private final ExecutorService ec; - private final String workspaceId; - private final String unityCatalogEndpoint; + // Stream properties (stored for recreation) + private final TableProperties tableProperties; + private final StreamConfigurationOptions options; private final String clientId; private final String clientSecret; - private StreamState state = StreamState.UNINITIALIZED; - private Optional streamId = Optional.empty(); - private Optional> stream = Optional.empty(); - private Optional> streamCreatedEvent = Optional.empty(); - - // Sending records is asynchronus task which consumes records from recordsQueuedForSending and - // sends them to the server - private final ArrayBlockingQueue recordsQueuedForSending; - - // Here we store records which are not yet acknowledged by the server - final ArrayBlockingQueue> inflightRecords; - - // Populated just in case of hard failure, otherwise it's empty - private final List> unackedRecordsAfterStreamFailure = new ArrayList<>(); - - private long latestRespondedOffsetId = -1; - private long lastSentOffsetId = -1; - private final StreamFailureInfo streamFailureInfo = new StreamFailureInfo(); - - private final com.google.protobuf.DescriptorProtos.DescriptorProto descriptorProto; + // Track if stream was created with JSON mode + private final boolean isJsonMode; /** - * Returns the ID of the stream. + * Package-private constructor. Streams should be created via {@link ZerobusSdk#createStream}. * - * @return The ID of the stream. + * @param nativeHandle the native handle to the Rust stream object + * @param tableProperties the table properties + * @param options the stream configuration options + * @param clientId the OAuth client ID + * @param clientSecret the OAuth client secret + * @param isJsonMode whether this stream uses JSON mode */ - public synchronized String getStreamId() { - return streamId.orElse(""); + ZerobusStream( + long nativeHandle, + TableProperties tableProperties, + StreamConfigurationOptions options, + String clientId, + String clientSecret, + boolean isJsonMode) { + this.nativeHandle = nativeHandle; + this.tableProperties = tableProperties; + this.options = options; + this.clientId = clientId; + this.clientSecret = clientSecret; + this.isJsonMode = isJsonMode; } /** * Returns the state of the stream. 
* - * @return The state of the stream. + * @return the state of the stream + * @deprecated The stream state is no longer exposed by the underlying SDK. This method always + * returns {@link StreamState#OPENED} unless the stream is closed. */ - public synchronized StreamState getState() { - return state; + @Deprecated + public StreamState getState() { + if (nativeHandle == 0 || nativeIsClosed(nativeHandle)) { + return StreamState.CLOSED; + } + return StreamState.OPENED; } /** - * Returns the unacknowledged records after stream failure. + * Returns whether the stream is closed. * - * @return The unacknowledged records after stream failure. + * @return true if the stream is closed, false otherwise */ - public Iterator getUnackedRecords() { - List records = new ArrayList<>(); - for (Record record : unackedRecordsAfterStreamFailure) { - records.add(record.record); - } - return records.iterator(); + public boolean isClosed() { + return nativeHandle == 0 || nativeIsClosed(nativeHandle); } /** * Returns the table properties for this stream. * - * @return The table properties. + * @return the table properties */ public TableProperties getTableProperties() { return tableProperties; @@ -203,7 +118,7 @@ public TableProperties getTableProperties() { /** * Returns the stream configuration options. * - * @return The stream configuration options. + * @return the stream configuration options */ public StreamConfigurationOptions getOptions() { return options; @@ -212,7 +127,7 @@ public StreamConfigurationOptions getOptions() { /** * Returns the OAuth client ID. * - * @return The OAuth client ID. + * @return the OAuth client ID */ public String getClientId() { return clientId; @@ -221,1166 +136,282 @@ public String getClientId() { /** * Returns the OAuth client secret. * - * @return The OAuth client secret. + * @return the OAuth client secret */ public String getClientSecret() { return clientSecret; } - private synchronized void setState(StreamState newState) { - state = newState; - this.notifyAll(); - logger.debug("Stream state changed to " + newState); - } - - private CompletableFuture runWithTimeout( - long timeoutMs, java.util.function.Supplier> getFuture) { - AtomicBoolean done = new AtomicBoolean(false); - CompletableFuture future = getFuture.get(); - - future.whenComplete( - (result, error) -> { - synchronized (done) { - done.set(true); - done.notifyAll(); - } - }); - - CompletableFuture timeoutFuture = - CompletableFuture.runAsync( - () -> { - synchronized (done) { - try { - done.wait(timeoutMs); - if (!done.get()) { - throw new RuntimeException(new TimeoutException("Operation timed out!")); - } - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new RuntimeException(e); - } - } - }, - zerobusStreamExecutor); - - return CompletableFuture.anyOf(future, timeoutFuture).thenApply(result -> null); - } - /** - * Retries an operation with exponential backoff until success or max retries reached. + * Ingests a Protocol Buffer record into the stream. + * + *

This method enqueues the record for ingestion and returns a CompletableFuture that completes + * when the server acknowledges the record has been durably stored. * - *

This method uses recursion through the RetryHelper inner class to avoid blocking the caller - * thread. Each retry is scheduled asynchronously on the stream executor. + * @param record the Protocol Buffer record to ingest + * @return a CompletableFuture that completes when the record is acknowledged + * @throws ZerobusException if the stream is not in a valid state for ingestion + * @deprecated Use {@link #ingestRecordOffset(Object)} instead, which returns the offset directly + * after queuing. This avoids CompletableFuture allocation overhead for better performance. + * Example migration: + *

{@code
+   * // Before (deprecated):
+   * stream.ingestRecord(record).join();
    *
-   * @param maxRetries Maximum number of retry attempts
-   * @param context Context string for logging
-   * @param f Supplier that provides the operation to retry
-   * @return CompletableFuture that completes with the operation result or error
+   * // After (recommended):
+   * long offset = stream.ingestRecordOffset(record);
+   * stream.waitForOffset(offset);
+   * }
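Applying the same migration to a loop of records gives the high-throughput pattern this deprecation note is driving at. A sketch: `MyRecord`, `records`, and `stream` are assumed to exist, checked `ZerobusException` handling is elided, and waiting only on the last offset assumes acknowledgments are cumulative, as the deprecated `getDurabilityAckUpToOffset` callback name suggests:

```java
long lastOffset = -1L;
for (MyRecord record : records) {
  // Blocks only on backpressure, never on the server round-trip.
  lastOffset = stream.ingestRecordOffset(new ProtoRecord<>(record));
}
if (lastOffset >= 0) {
  stream.waitForOffset(lastOffset); // block until everything queued is durable
}
// Alternatively, ingestRecordsOffset(records) enqueues the whole batch and
// returns a single Optional<Long> offset to wait on.
```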
*/ - private CompletableFuture runWithRetries( - long maxRetries, String context, java.util.function.Supplier> f) { - CompletableFuture resultPromise = new CompletableFuture<>(); - - int backoffMs = options.recovery() ? options.recoveryBackoffMs() : 0; - - class RetryHelper { - void tryNext(int attempt) { - logger.debug("[" + context + "] Running attempt ... "); - - f.get() - .whenComplete( - (response, error) -> { - if (error == null) { - resultPromise.complete(response); - } else if (error instanceof NonRetriableException - || error.getCause() instanceof NonRetriableException) { - // Non-retriable errors should fail immediately without retrying - resultPromise.completeExceptionally(error); - } else { - if (attempt < maxRetries - 1) { - // Schedule next retry after backoff period - CompletableFuture.runAsync( - () -> { - logger.debug("[" + context + "] Retrying in " + backoffMs + " ms ... "); - try { - Thread.sleep(backoffMs); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - tryNext(attempt + 1); - }, - zerobusStreamExecutor); - } else { - // Exhausted all retries - resultPromise.completeExceptionally(error); - } - } - }); - } - } - - new RetryHelper().tryNext(0); - return resultPromise; - } - - private void failStream(Throwable error) { - synchronized (this) { - if (stream.isPresent()) { - try { - stream.get().onError(error); - } catch (Exception e) { - // Ignore - } - - stream = Optional.empty(); - streamId = Optional.empty(); - } - } + @Deprecated + public CompletableFuture ingestRecord(RecordType record) throws ZerobusException { + ensureOpen(); + return nativeIngestRecord(nativeHandle, record.toByteArray(), false); } - private CompletableFuture createStream() { - CompletableFuture createStreamDone = new CompletableFuture<>(); - - int timeoutMs = options.recovery() ? 
options.recoveryTimeoutMs() : CREATE_STREAM_TIMEOUT_MS; - - latestRespondedOffsetId = -1; - lastSentOffsetId = -1; - streamId = Optional.empty(); - stream = Optional.empty(); - streamCreatedEvent = Optional.empty(); - - runWithTimeout( - timeoutMs, - () -> { - CompletableFuture createStreamTry = new CompletableFuture<>(); - - // Get a fresh stub from the supplier - stub = stubSupplier.get(); - - // Create the gRPC stream with the fresh stub - streamCreatedEvent = Optional.of(new CompletableFuture<>()); - stream = - Optional.of( - (ClientCallStreamObserver) - stub.ephemeralStream(ackReceiver)); - - logger.debug("Creating ephemeral stream for table " + tableProperties.getTableName()); - - // Create the initial request - EphemeralStreamRequest createStreamRequest = - EphemeralStreamRequest.newBuilder() - .setCreateStream( - CreateIngestStreamRequest.newBuilder() - .setTableName(tableProperties.getTableName()) - .setDescriptorProto( - ByteString.copyFrom(descriptorProto.toByteArray())) - .setRecordType(com.databricks.zerobus.RecordType.PROTO) - .build()) - .build(); - - // Send the CreateStreamRequest - try { - sendMessage(createStreamRequest); - } catch (Exception exception) { - failStream(exception); - createStreamTry.completeExceptionally(exception); - return createStreamTry; - } - - streamCreatedEvent - .get() - .whenComplete( - (id, e) -> { - if (e == null) { - streamId = Optional.of(id); - recordsSenderTask.start(); - createStreamTry.complete(null); - } else if (e instanceof ZerobusException) { - failStream(e); - streamId = Optional.empty(); - streamCreatedEvent = Optional.empty(); - stream = Optional.empty(); - createStreamTry.completeExceptionally(e); - } else { - failStream(e); - streamId = Optional.empty(); - streamCreatedEvent = Optional.empty(); - stream = Optional.empty(); - createStreamTry.completeExceptionally( - new ZerobusException(e.getMessage(), e)); - } - }); - - return createStreamTry; - }) - .whenComplete( - (result, e) -> { - if (e == null) { - createStreamDone.complete(null); - } else { - failStream(e); - Throwable ex; - if (e instanceof StatusRuntimeException) { - Status.Code code = ((StatusRuntimeException) e).getStatus().getCode(); - if (GrpcErrorHandling.isNonRetriable(code)) { - ex = - new NonRetriableException( - "Non-retriable gRPC error during stream creation: " + e.getMessage(), - e); - } else { - ex = new ZerobusException("Stream creation failed: " + e.getMessage(), e); - } - } else if (e instanceof NonRetriableException) { - ex = new NonRetriableException("Stream creation failed: " + e.getMessage(), e); - } else { - ex = new ZerobusException("Stream creation failed: " + e.getMessage(), e); - } - createStreamDone.completeExceptionally(ex); - } - }); - - return createStreamDone; + /** + * Ingests a record into the stream. + * + *

This method accepts any {@link IngestableRecord}, allowing both Protocol Buffer and JSON + * records to be ingested through a unified API. + * + * @param record the record to ingest + * @return a CompletableFuture that completes when the record is acknowledged + * @throws ZerobusException if the stream is not in a valid state for ingestion + * @deprecated Use {@link #ingestRecordOffset(IngestableRecord)} instead, which returns the offset + * directly after queuing. This avoids CompletableFuture allocation overhead for better + * performance. + */ + @Deprecated + public CompletableFuture ingestRecord(IngestableRecord record) throws ZerobusException { + ensureOpen(); + return nativeIngestRecord(nativeHandle, record.toEncodedBytes(), record.isJson()); } - CompletableFuture initialize() { - CompletableFuture initializeDone = new CompletableFuture<>(); - - synchronized (this) { - if (state != StreamState.UNINITIALIZED) { - logger.error("Stream cannot be initialized/opened more than once"); - initializeDone.completeExceptionally( - new ZerobusException("Stream cannot be initialized/opened more than once")); - return initializeDone; - } - } - - int retries = options.recovery() ? options.recoveryRetries() : 1; - - runWithRetries(retries, "CreateStream", () -> createStream()) - .whenComplete( - (result, e) -> { - if (e == null) { - setState(StreamState.OPENED); - serverUnresponsivenessDetectionTask.start(); - logger.info("Stream created successfully with id " + streamId.get()); - initializeDone.complete(null); - } else { - setState(StreamState.FAILED); - logger.error("Failed to create stream: ", e); - if (e instanceof ZerobusException) { - initializeDone.completeExceptionally(e); - } else { - initializeDone.completeExceptionally( - new ZerobusException("Stream creation failed: " + e.getMessage(), e)); - } - } - }); - - return initializeDone; + /** + * Ingests a record and returns the offset immediately. + * + *

This method blocks until the record is enqueued (respecting backpressure) but does NOT wait + * for the server to acknowledge the record. Use {@link #waitForOffset(long)} to wait for + * acknowledgment. + * + *

This is the recommended API for high-throughput scenarios as it avoids CompletableFuture + * allocation overhead. + * + * @param record the record to ingest + * @return the offset ID assigned to this record + * @throws ZerobusException if the stream is not in a valid state or an error occurs + */ + public long ingestRecordOffset(IngestableRecord record) throws ZerobusException { + ensureOpen(); + return nativeIngestRecordOffset(nativeHandle, record.toEncodedBytes(), record.isJson()); } /** - * Closes the stream and cleans up resources. + * Ingests multiple records and returns the batch offset. * - * @param hardFailure If true, marks stream as FAILED and saves unacked records for potential - * retry - * @param exception The exception that caused the failure (if any) + *

This method blocks until all records are enqueued but does NOT wait for acknowledgment. Use + * {@link #waitForOffset(long)} to wait for the batch to be acknowledged. + * + * @param records the records to ingest + * @return the offset ID for the batch, or empty if the iterable is empty + * @throws ZerobusException if the stream is not in a valid state or an error occurs */ - private void closeStream(boolean hardFailure, Optional exception) { - synchronized (this) { - logger.debug("Closing stream, hardFailure: " + hardFailure); - - if (hardFailure && exception.isPresent()) { - // CRITICAL: Atomically mark stream as FAILED before processing unacked records. - // This prevents race conditions where clients see errors but unackedRecords is empty. - setState(StreamState.FAILED); - } - - recordsQueuedForSending.clear(); - recordsSenderTask.cancel(); - - try { - if (stream.isPresent()) { - stream.get().onCompleted(); - if (hardFailure) { - stream.get().cancel("Stream closed", null); - } - } - } catch (Exception e) { - // Ignore errors during stream cleanup - stream may already be closed - logger.debug("Error while closing stream: " + e.getMessage()); - } - - // For hard failures, preserve unacked records so they can be retried via recreateStream() - if (hardFailure) { - serverUnresponsivenessDetectionTask.cancel(); - logger.debug("Stream closing: Failing all unacked records"); + public Optional ingestRecordsOffset(Iterable records) + throws ZerobusException { + ensureOpen(); - while (!inflightRecords.isEmpty()) { - try { - Record record = inflightRecords.take(); - unackedRecordsAfterStreamFailure.add(record); - record.ackPromise.completeExceptionally( - exception.orElse(new ZerobusException("Stream failed"))); - this.notifyAll(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - break; - } - } - } + List payloads = new ArrayList<>(); + boolean isJson = false; - stream = Optional.empty(); - streamCreatedEvent = Optional.empty(); - streamId = Optional.empty(); - stub = null; - - this.notifyAll(); + for (IngestableRecord record : records) { + payloads.add(record.toEncodedBytes()); + isJson = record.isJson(); // All records should be same type } - // Wait for background tasks to fully stop before returning. - // This ensures clean shutdown and prevents resource leaks. 
- recordsSenderTask.waitUntilStopped(); - if (hardFailure) { - serverUnresponsivenessDetectionTask.waitUntilStopped(); + if (payloads.isEmpty()) { + return Optional.empty(); } - } - - private CompletableFuture closeStreamAsync( - boolean hardFailure, Optional exception) { - return CompletableFuture.runAsync( - () -> closeStream(hardFailure, exception), zerobusStreamExecutor); - } - - private void enqueueRecordsForResending() { - synchronized (this) { - if (state != StreamState.RECOVERING) { - return; - } - Iterator> recordsIterator = inflightRecords.iterator(); - - while (recordsIterator.hasNext()) { - Record record = recordsIterator.next(); - - lastSentOffsetId += 1; - long offsetId = lastSentOffsetId; - - record.offsetId = offsetId; - - EphemeralStreamRequest recordRequest = - EphemeralStreamRequest.newBuilder() - .setIngestRecord( - IngestRecordRequest.newBuilder() - .setOffsetId(offsetId) - .setProtoEncodedRecord(record.protoEncodedRecord) - .build()) - .build(); - - try { - recordsQueuedForSending.put(recordRequest); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - break; - } - } - } + return Optional.of(nativeIngestRecordsOffset(nativeHandle, payloads, isJson)); } /** - * Attempts to recover a failed stream by recreating it and resending unacked records. + * Ingests multiple records and returns a CompletableFuture that completes when all are + * acknowledged. * - *

This method: + * @param records the records to ingest + * @return a CompletableFuture that completes when all records are acknowledged + * @throws ZerobusException if the stream is not in a valid state or an error occurs + * @deprecated Use {@link #ingestRecordsOffset(Iterable)} instead, which returns the offset + * directly after queuing. This avoids CompletableFuture allocation overhead for better + * performance. Example migration: + *

{@code
+   * // Before (deprecated):
+   * stream.ingestRecords(batch).join();
    *
-   * 
    - *
-   *   <li>Closes the current stream without marking it as hard failure
-   *   <li>Creates a new stream with the same configuration
-   *   <li>Re-enqueues all unacknowledged records for sending
- * - * @return CompletableFuture that completes when recovery succeeds or fails + * // After (recommended): + * Optional offset = stream.ingestRecordsOffset(batch); + * offset.ifPresent(stream::waitForOffset); + * }
*/ - private CompletableFuture recoverStream() { - CompletableFuture recoverStreamDone = new CompletableFuture<>(); - - CompletableFuture.runAsync( - () -> { - if (!options.recovery()) { - logger.debug("Stream recovery is disabled"); - recoverStreamDone.completeExceptionally( - new ZerobusException("Stream recovery is disabled")); - } else { - logger.warn( - "Stream broken! Running stream recovery for stream id '" - + streamId.orElse("unknown") - + "' ... "); - - // Close the broken stream but don't mark as hard failure since we're attempting - // recovery - closeStream(false, Optional.empty()); - - synchronized (this) { - int retries = options.recoveryRetries(); - // Reduce remaining retries based on consecutive failures of the same type - int leftRetries = Math.max(0, retries - streamFailureInfo.getFailureCounts() + 1); - - if (leftRetries == 0) { - logger.debug("Stream recovery failed: Run out of retries"); - recoverStreamDone.completeExceptionally( - new ZerobusException("Stream recovery failed")); - return; - } - - logger.debug( - "Stream recovery: Running with " - + leftRetries - + " / " - + retries - + " retries left"); - - runWithRetries( - leftRetries, - "RecoverStream", - () -> { - CompletableFuture recoverStreamTry = new CompletableFuture<>(); - - createStream() - .whenComplete( - (result, e) -> { - if (e != null) { - logger.debug( - "Stream recovery: Failed to create stream: " - + e.getMessage()); - recoverStreamTry.completeExceptionally(e); - } else { - enqueueRecordsForResending(); - recoverStreamTry.complete(null); - } - }); - - return recoverStreamTry; - }) - .whenComplete( - (result, e) -> { - if (e == null) { - logger.info( - "Stream recovery completed successfully. New stream id: " - + streamId.get()); - recoverStreamDone.complete(null); - } else { - logger.error("Stream recovery failed: " + e.getMessage(), e); - recoverStreamDone.completeExceptionally(e); - } - }); - } - } - }, - zerobusStreamExecutor); - - return recoverStreamDone; - } - - private void handleStreamFailed(StreamFailureType streamFailureType, Optional error) { - - Optional exception; - if (error.isPresent()) { - Throwable e = error.get(); - if (e instanceof ZerobusException) { - exception = Optional.of((ZerobusException) e); - } else { - exception = Optional.of(new ZerobusException("Stream failed: " + e.getMessage(), e)); - } - } else { - exception = Optional.of(new ZerobusException("Stream failed")); - } - - synchronized (this) { - if (state == StreamState.FAILED - || state == StreamState.UNINITIALIZED - || state == StreamState.RECOVERING) { - // UNINITIALIZED -> Stream failed during creation - // FAILED -> Stream already failed (don't handle it twice) - // RECOVERING -> Stream is recovering from a failure, no action needed - - if (state == StreamState.UNINITIALIZED && streamCreatedEvent.isPresent()) { - streamCreatedEvent.get().completeExceptionally(exception.get()); - } - - return; - } - - if (state == StreamState.CLOSED && !error.isPresent()) { - // Stream failed after closed, but without exception - that's expected (stream closed - // gracefully) - return; - } - - if (error.isPresent()) { - logger.error("Stream failed: " + error.get().getMessage(), error.get()); - } - - // Check if this is a non-retriable error - if so, don't attempt recovery - if (error.isPresent() && error.get() instanceof NonRetriableException) { - closeStreamAsync(true, exception); - return; - } - - streamFailureInfo.logFailure(streamFailureType); - - // Stream is open or flushing, try to recover it - 
setState(StreamState.RECOVERING); - - recoverStream() - .whenComplete( - (result, e) -> { - if (e == null) { - setState(StreamState.OPENED); - logger.info("Stream recovered successfully with id " + streamId.get()); - } else { - logger.error("Stream recovery failed", e); - closeStream(true, exception); - } - }); + @Deprecated + public CompletableFuture ingestRecords(Iterable records) + throws ZerobusException { + Optional offset = ingestRecordsOffset(records); + if (!offset.isPresent()) { + return CompletableFuture.completedFuture(null); } - } - - private CompletableFuture handleStreamFailedAsync( - StreamFailureType streamFailureType, Optional error) { return CompletableFuture.runAsync( - () -> handleStreamFailed(streamFailureType, error), zerobusStreamExecutor); + () -> { + try { + waitForOffset(offset.get()); + } catch (ZerobusException e) { + throw new RuntimeException(e); + } + }); } - // Task that checks if server is responsive (time it takes for server to ack a record) - // Task is created once during initialize() and it's shutdown when stream is closed finally - // (e.g. close() is called or stream can't be recovered) - private BackgroundTask serverUnresponsivenessDetectionTask; - - private void initServerUnresponsivenessDetectionTask() { - serverUnresponsivenessDetectionTask = - new BackgroundTask( - cancellationToken -> { - long taskIterationStartTime = System.currentTimeMillis(); - synchronized (ZerobusStream.this) { - switch (state) { - case UNINITIALIZED: - case CLOSED: - case FAILED: - break; - - case RECOVERING: - logger.debug( - "Server unresponsiveness detection task: Waiting for stream to finish recovering"); - try { - ZerobusStream.this.wait(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - break; - - case OPENED: - case FLUSHING: - if (inflightRecords.isEmpty()) { - logger.debug( - "Server unresponsiveness detection task: Waiting for some records to be ingested"); - try { - ZerobusStream.this.wait(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - } else { - // STREAM IS OPENED OR FLUSHING AND THERE ARE RECORDS IN THE QUEUE - CHECK IF - // SERVER IS RESPONSIVE - long latestRespondedOffsetIdBefore = latestRespondedOffsetId; - boolean serverResponsive = false; - boolean serverResponsiveTimeout = false; - - while (!serverResponsive && !serverResponsiveTimeout) { - if (latestRespondedOffsetIdBefore != latestRespondedOffsetId) { - serverResponsive = true; - } else { - long remainingTime = - options.serverLackOfAckTimeoutMs() - - (System.currentTimeMillis() - taskIterationStartTime); - - if (remainingTime <= 0) { - // We don't want to block here, since this potentially can close the - // stream, which will wait for this task to finish (deadlock) - handleStreamFailedAsync( - StreamFailureType.SERVER_UNRESPONSIVE, - Optional.of(new ZerobusException("Server is unresponsive"))); - serverResponsiveTimeout = true; - } else { - try { - ZerobusStream.this.wait(remainingTime); - if (cancellationToken.isDone()) { - // In case of a stream close, break the loop so that it doesn't hang - // waiting for the timeout. 
- serverResponsive = true; - } - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - } - } - } - } - break; - } - } - }, - error -> { - // This should never happen (task won't throw any errors), but if it does, we need to - // handle it - // and it probably won't be recoverable - logger.error( - "Server unresponsiveness detection task failed: " + error.getMessage(), error); - - closeStreamAsync( - true, - Optional.of( - new ZerobusException( - "Server unresponsiveness detection task failed: " + error.getMessage(), - error))); - }, - zerobusStreamExecutor); + /** + * Waits for a specific offset to be acknowledged by the server. + * + *

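+ * A minimal usage sketch ({@code stream} is an open stream and {@code row} a generated
+ * protobuf message; both names are placeholders):
+ *
+ * <pre>{@code
+ * long offset = stream.ingestRecordOffset(ProtoRecord.of(row));
+ * stream.waitForOffset(offset); // blocks until the server acks up to this offset
+ * }</pre>
+ *
+ * <p>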
This method blocks until the server acknowledges that all records up to and including the + * specified offset have been durably stored. + * + * @param offset the offset to wait for + * @throws ZerobusException if an error occurs or the wait times out + */ + public void waitForOffset(long offset) throws ZerobusException { + ensureOpen(); + nativeWaitForOffset(nativeHandle, offset); } - // Task that consumes records from recordsQueuedForSending and sends them to the server - // This task is restarted each time stream is recovered/restarted - private BackgroundTask recordsSenderTask; - - private void initRecordsSenderTask() { - recordsSenderTask = - new BackgroundTask( - cancellationToken -> { - // Check if there are records to send - Optional recordRequest; - synchronized (ZerobusStream.this) { - switch (state) { - case OPENED: - case FLUSHING: - if (recordsQueuedForSending.isEmpty()) { - try { - ZerobusStream.this.wait(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - recordRequest = Optional.empty(); - } else { - try { - recordRequest = Optional.of(recordsQueuedForSending.take()); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - recordRequest = Optional.empty(); - } - } - break; - case CLOSED: - if (recordsQueuedForSending.isEmpty()) { - recordRequest = Optional.empty(); - } else { - try { - recordRequest = Optional.of(recordsQueuedForSending.take()); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - recordRequest = Optional.empty(); - } - } - break; - default: - recordRequest = Optional.empty(); - break; - } - } - - // If we have a record, wait for stream to be ready and send it - if (recordRequest.isPresent()) { - if (stream.isPresent()) { - ClientCallStreamObserver strm = stream.get(); - // Wait for stream to be ready - synchronized (ZerobusStream.this) { - while (!strm.isReady() && !cancellationToken.isDone()) { - try { - ZerobusStream.this.wait(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - break; - } - } - } - if (!cancellationToken.isDone()) { - // Send the record - try { - sendMessage(recordRequest.get()); - streamFailureInfo.resetFailure(StreamFailureType.SENDING_MESSAGE); - } catch (Exception ex) { - logger.error("Error while sending record: " + ex.getMessage(), ex); - - // Use async to avoid deadlock: handleStreamFailed() may call closeStream() - // which waits for this task to stop. - handleStreamFailedAsync(StreamFailureType.SENDING_MESSAGE, Optional.of(ex)); - - // Wait for state change before continuing. This prevents repeatedly - // attempting - // to send the next record which would likely fail with the same error. - // The task will be restarted after recovery (or shut down if recovery fails). 
- synchronized (ZerobusStream.this) { - while ((state == StreamState.OPENED || state == StreamState.FLUSHING) - && !cancellationToken.isDone()) { - try { - ZerobusStream.this.wait(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - break; - } - } - } - } - } - } - } - // No record available, continue to next iteration - }, - error -> { - // This should never happen (task won't throw any errors), but if it does, we need to - // handle it - // and it probably won't be recoverable - logger.error("Records sender task failed: " + error.getMessage(), error); - - closeStreamAsync( - true, - Optional.of( - new ZerobusException( - "Records sender task failed: " + error.getMessage(), error))); - }, - zerobusStreamExecutor); + /** + * Flushes the stream, waiting for all queued records to be acknowledged by the server. + * + *

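+ * For example, after a burst of offset-based ingests ({@code rows} and the {@code AirQuality}
+ * row type are illustrative):
+ *
+ * <pre>{@code
+ * for (AirQuality row : rows) {
+ *   stream.ingestRecordOffset(ProtoRecord.of(row));
+ * }
+ * stream.flush(); // returns once every queued record has been acked
+ * }</pre>
+ *
+ * <p>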
The stream remains open after flushing. + * + * @throws ZerobusException if an error occurs or the flush times out + */ + public void flush() throws ZerobusException { + ensureOpen(); + nativeFlush(nativeHandle); + logger.info("All records have been flushed"); } - private ClientResponseObserver ackReceiver; - - private void initAckReceiver() { - ackReceiver = - new ClientResponseObserver() { - // Track state for the receiver - private Optional ackReceiverStreamId = Optional.empty(); - - @Override - public void beforeStart(ClientCallStreamObserver requestStream) { - requestStream.setOnReadyHandler( - () -> { - synchronized (ZerobusStream.this) { - ZerobusStream.this.notifyAll(); - } - }); - } - - @Override - public void onNext(EphemeralStreamResponse response) { - switch (response.getPayloadCase()) { - // *** Create stream response *** - case CREATE_STREAM_RESPONSE: - ackReceiverStreamId = - Optional.of( - response.getCreateStreamResponse().getStreamId().isEmpty() - ? null - : response.getCreateStreamResponse().getStreamId()); - if (!ackReceiverStreamId.isPresent() || ackReceiverStreamId.get() == null) { - throw new RuntimeException( - new ZerobusException("Invalid response from server: stream id is missing")); - } - logger.debug("Stream created with id " + ackReceiverStreamId.get()); - streamCreatedEvent.get().complete(ackReceiverStreamId.get()); - break; - - // *** Ingest record response (durability ack) *** - case INGEST_RECORD_RESPONSE: - String streamIdForReceiver = - ackReceiverStreamId.orElseThrow( - () -> - new RuntimeException( - new ZerobusException( - "Invalid response from server: expected stream id but got record ack"))); - long ackedOffsetId = - response.getIngestRecordResponse().getDurabilityAckUpToOffset(); - logger.debug("Acked offset " + ackedOffsetId); - - synchronized (ZerobusStream.this) { - - // Edge case: Stream was recovered/recreated while ack was in flight. - // Ignore stale acks from old stream to avoid incorrectly completing promises. - if (!streamId.isPresent() || !streamIdForReceiver.equals(streamId.get())) { - return; - } - - // Receiving an ack proves the server is responsive and connection is healthy - streamFailureInfo.resetFailure(StreamFailureType.SERVER_CLOSED_STREAM); - streamFailureInfo.resetFailure(StreamFailureType.SERVER_UNRESPONSIVE); - - latestRespondedOffsetId = Math.max(latestRespondedOffsetId, ackedOffsetId); - - // Complete promises for all records up to and including the acked offset. - // Server guarantees durability for all records <= ackedOffsetId. - boolean processingDone = false; - while (!processingDone) { - if (inflightRecords.isEmpty()) { - processingDone = true; - } else { - Record record = inflightRecords.peek(); - - if (record.offsetId > ackedOffsetId) { - // This record hasn't been acked yet - processingDone = true; - } else { - record.ackPromise.complete(null); - try { - inflightRecords.take(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - break; - } - } - } - } - - ZerobusStream.this.notifyAll(); - } - - // Invoke user callback asynchronously to avoid blocking the gRPC receiver thread. - // Exceptions in user code should not affect stream operation. 
- if (options.ackCallback().isPresent()) { - CompletableFuture.runAsync( - () -> { - options.ackCallback().get().accept(response.getIngestRecordResponse()); - }, - ec) - .exceptionally( - e -> { - logger.error( - "Exception in async ack_callback for offset " - + response - .getIngestRecordResponse() - .getDurabilityAckUpToOffset(), - e); - return null; - }); - } - break; - - // *** Close stream signal *** - case CLOSE_STREAM_SIGNAL: - if (options.recovery()) { - double durationMs = 0.0; - if (response.getCloseStreamSignal().hasDuration()) { - durationMs = - response.getCloseStreamSignal().getDuration().getSeconds() * 1000.0 - + response.getCloseStreamSignal().getDuration().getNanos() / 1000000.0; - } - logger.info( - String.format( - "Server will close the stream in %.3fms. Triggering stream recovery.", - durationMs)); - streamFailureInfo.resetFailure(StreamFailureType.SERVER_CLOSED_STREAM); - handleStreamFailed(StreamFailureType.SERVER_CLOSED_STREAM, Optional.empty()); - } - break; - - // *** Unknown response *** - default: - throw new RuntimeException(new ZerobusException("Invalid response from server")); - } - } - - @Override - public void onError(Throwable t) { - synchronized (ZerobusStream.this) { - if (state == StreamState.CLOSED && !stream.isPresent()) { - logger.debug("Ignoring error on already closed stream: " + t.getMessage()); - return; - } - } - - Optional error = Optional.of(t); - - if (t instanceof StatusRuntimeException) { - Status.Code code = ((StatusRuntimeException) t).getStatus().getCode(); - if (GrpcErrorHandling.isNonRetriable(code)) { - error = - Optional.of( - new NonRetriableException( - "Non-retriable gRPC error: " + ((StatusRuntimeException) t).getStatus(), - t)); - } - } - - handleStreamFailed(StreamFailureType.SERVER_CLOSED_STREAM, error); - } - - @Override - public void onCompleted() { - logger.debug("Server called close on the stream"); - handleStreamFailed(StreamFailureType.SERVER_CLOSED_STREAM, Optional.empty()); - } - }; + /** + * Closes the stream, flushing all pending records first. + * + *

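+ * A typical shutdown sketch (error handling elided for brevity):
+ *
+ * <pre>{@code
+ * try {
+ *   // ... ingest records ...
+ * } finally {
+ *   stream.close(); // flushes pending records, then releases the native handle
+ * }
+ * }</pre>
+ *
+ * <p>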
Once closed, the stream cannot be reopened. Use {@link ZerobusSdk#recreateStream} to create + * a new stream and replay unacknowledged records. + * + * @throws ZerobusException if an error occurs during close + */ + @Override + public void close() throws ZerobusException { + long handle = nativeHandle; + if (handle != 0) { + nativeHandle = 0; + nativeClose(handle); + nativeDestroy(handle); + logger.info("Stream closed"); + } } - private void sendMessage(EphemeralStreamRequest message) throws Exception { - stream.get().onNext(message); + /** + * Returns the unacknowledged records after stream failure. + * + *

This method should be called after the stream has failed or been closed to retrieve records + * that were not acknowledged. These records can be re-ingested using {@link + * ZerobusSdk#recreateStream}. + * + *

Note: This method returns raw byte arrays that need to be deserialized back into the + * original Protocol Buffer type. + * + * @return an iterator over the unacknowledged records + * @deprecated Use {@link #getUnackedBatches()} instead, which preserves batch grouping + */ + @Deprecated + public Iterator<RecordType> getUnackedRecords() { + // This is a compatibility method - we can't actually deserialize the records + // without knowing the message type at runtime. Return an empty iterator. + // Users should use getUnackedBatches() or getUnackedRecordsRaw() instead. + logger.warn( + "getUnackedRecords() is deprecated and may not work correctly. " + + "Use getUnackedBatches() instead."); + return new ArrayList<RecordType>().iterator(); } /** - * Ingests a record into the stream. + * Returns the unacknowledged records as raw byte arrays. * - * @param record The record to ingest. - * @return A CompletableFuture that completes when the server acknowledges the record has been - * durably stored. If the future raises an exception, the record most probably was not - * acknowledged, but it is also possible that the server acknowledged the record but the - * response was lost. In this case the client should decide whether to retry the record or not. - * @throws ZerobusException if the stream is not in a valid state for ingestion + *

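+ * For a proto-configured stream, each entry can be parsed back into the generated row type,
+ * for example ({@code AirQuality} is illustrative):
+ *
+ * <pre>{@code
+ * for (byte[] bytes : stream.getUnackedRecordsRaw()) {
+ *   AirQuality row = AirQuality.parseFrom(bytes); // may throw InvalidProtocolBufferException
+ * }
+ * }</pre>
+ *
+ * <p>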
This method returns the raw encoded records that were not acknowledged. For Protocol Buffer + * records, these are the serialized protobuf bytes. For JSON records, these are UTF-8 encoded + * JSON strings. + * + * @return a list of raw encoded records + * @throws ZerobusException if an error occurs */ - public CompletableFuture ingestRecord(RecordType record) throws ZerobusException { - CompletableFuture durabilityPromise = new CompletableFuture<>(); - - synchronized (this) { - // Wait until there is space in the queue - boolean recordQueueFull = true; - while (recordQueueFull) { - switch (state) { - case RECOVERING: - case FLUSHING: - logger.debug( - "Ingest record: Waiting for stream " - + streamId.orElse("") - + " to finish recovering/flushing"); - try { - this.wait(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - durabilityPromise.completeExceptionally( - new ZerobusException("Interrupted while waiting for stream", e)); - return durabilityPromise; - } - break; - case FAILED: - case CLOSED: - case UNINITIALIZED: - logger.error( - "Cannot ingest record when stream is closed or not opened for stream ID " - + streamId.orElse("unknown")); - throw new ZerobusException( - "Cannot ingest record when stream is closed or not opened for stream ID " - + streamId.orElse("unknown")); - case OPENED: - if (inflightRecords.remainingCapacity() > 0) { - recordQueueFull = false; - } else { - logger.debug("Ingest record: Waiting for space in the queue"); - try { - this.wait(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - durabilityPromise.completeExceptionally( - new ZerobusException("Interrupted while waiting for space in queue", e)); - return durabilityPromise; - } - } - break; - } - } - - ByteString protoEncodedRecord = ByteString.copyFrom(record.toByteArray()); - lastSentOffsetId += 1; - long offsetId = lastSentOffsetId; - - try { - inflightRecords.put(new Record<>(offsetId, record, protoEncodedRecord, durabilityPromise)); - - recordsQueuedForSending.put( - EphemeralStreamRequest.newBuilder() - .setIngestRecord( - IngestRecordRequest.newBuilder() - .setOffsetId(offsetId) - .setProtoEncodedRecord(protoEncodedRecord) - .build()) - .build()); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - durabilityPromise.completeExceptionally( - new ZerobusException("Interrupted while enqueuing record", e)); - return durabilityPromise; - } - - this.notifyAll(); + public List getUnackedRecordsRaw() throws ZerobusException { + if (nativeHandle == 0) { + return new ArrayList<>(); } - - return durabilityPromise; + return nativeGetUnackedRecords(nativeHandle); } /** - * Flushes the stream, waiting for all queued records to be acknowledged by the server. The stream - * doesn't close after flushing. + * Returns the unacknowledged batches after stream failure. + * + *

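+ * A manual replay sketch ({@code failedStream} and {@code newStream} are placeholders for a
+ * failed stream and a fresh stream on the same table; {@link ZerobusSdk#recreateStream}
+ * automates this pattern):
+ *
+ * <pre>{@code
+ * for (EncodedBatch batch : failedStream.getUnackedBatches()) {
+ *   newStream.ingestRecordsOffset(batch.toIngestableRecords());
+ * }
+ * }</pre>
+ *
+ * <p>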
This method preserves the batch grouping from the original ingestion, which can be useful + * for re-ingesting records in the same batches. * - * @throws ZerobusException If the stream is not opened. + * @return a list of unacknowledged batches + * @throws ZerobusException if an error occurs */ - public void flush() throws ZerobusException { - synchronized (this) { - logger.debug("Flushing stream ..."); + public List getUnackedBatches() throws ZerobusException { + if (nativeHandle == 0) { + return new ArrayList<>(); + } + return nativeGetUnackedBatches(nativeHandle); + } - try { - if (state == StreamState.UNINITIALIZED) { - logger.error("Cannot flush stream when it is not opened"); - throw new ZerobusException("Cannot flush stream when it is not opened"); - } + private void ensureOpen() throws ZerobusException { + if (nativeHandle == 0) { + throw new ZerobusException("Stream is closed"); + } + if (nativeIsClosed(nativeHandle)) { + throw new ZerobusException("Stream is closed"); + } + } - while (state == StreamState.RECOVERING) { - logger.debug("Flushing stream: Waiting for stream to finish recovering"); - try { - this.wait(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new ZerobusException("Error while flushing stream", e); - } - } + @Override + protected void finalize() { + if (nativeHandle != 0) { + nativeDestroy(nativeHandle); + nativeHandle = 0; + } + } - if (state == StreamState.OPENED) { - setState(StreamState.FLUSHING); - } + // Native methods implemented in Rust - long startTime = System.currentTimeMillis(); + private static native void nativeDestroy(long handle); - boolean recordsFlushed = false; - while (!recordsFlushed) { - if (state == StreamState.FAILED) { - logger.error("Stream failed, cannot flush"); - throw new ZerobusException("Stream failed, cannot flush"); - } else { - if (inflightRecords.isEmpty()) { - recordsFlushed = true; - } else { - long remainingTime = - options.flushTimeoutMs() - (System.currentTimeMillis() - startTime); + private native CompletableFuture nativeIngestRecord( + long handle, byte[] payload, boolean isJson); - if (remainingTime <= 0) { - logger.error("Flushing stream timed out"); - throw new ZerobusException("Flushing stream timed out"); - } + private native long nativeIngestRecordOffset(long handle, byte[] payload, boolean isJson); - try { - logger.debug("Waiting for " + remainingTime + "ms to flush stream ..."); - this.wait(remainingTime); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - logger.error("Error while flushing stream: " + e.getMessage(), e); - throw new ZerobusException("Error while flushing stream", e); - } - } - } - } + private native long nativeIngestRecordsOffset(long handle, List payloads, boolean isJson); - if (!inflightRecords.isEmpty()) { - logger.error("Flushing stream timed out"); - throw new ZerobusException("Flushing stream timed out"); - } + private native void nativeWaitForOffset(long handle, long offset); - logger.info("All records have been flushed"); - } finally { - if (state == StreamState.FLUSHING) { - setState(StreamState.OPENED); - } - } - } - } + private native void nativeFlush(long handle); - /** - * Closes the stream, while first flushing all queued records. Once a stream is closed, it cannot - * be reopened. - * - * @throws ZerobusException If the stream is not opened. 
- */ - public void close() throws ZerobusException { - boolean readyToClose = false; - synchronized (this) { - while (!readyToClose) { - switch (state) { - case UNINITIALIZED: - logger.error("Cannot close stream when it is not opened"); - throw new ZerobusException("Cannot close stream when it is not opened"); - case FAILED: - logger.error("Stream failed and cannot be gracefully closed"); - throw new ZerobusException("Stream failed and cannot be gracefully closed"); - case CLOSED: - // Idempotent operation - logger.debug("Close stream: Stream is already closed"); - return; - case FLUSHING: - case RECOVERING: - // Wait until the stream is flushed or recovering - logger.debug("Close stream: Waiting for stream to finish flushing/recovering"); - try { - this.wait(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - break; - case OPENED: - setState(StreamState.CLOSED); - readyToClose = true; - break; - } - } - } + private native void nativeClose(long handle); - Optional receivedException = Optional.empty(); - try { - flush(); - } catch (ZerobusException ex) { - // Case 1: The exception is already the type we want. - receivedException = Optional.of(ex); - throw ex; // Re-throw the original exception. - } catch (Exception otherEx) { - // Case 2: Any other non-fatal exception. - // Wrap the unexpected exception in a new ZerobusException. - ZerobusException wrappedEx = new ZerobusException("Underlying failure during flush", otherEx); - receivedException = Optional.of(wrappedEx); - throw wrappedEx; - } finally { - closeStream(true, receivedException); - } + private native boolean nativeIsClosed(long handle); - logger.info("Stream gracefully closed"); - } + private native List nativeGetUnackedRecords(long handle); - public ZerobusStream( - Supplier stubSupplier, - TableProperties tableProperties, - ZerobusSdkStubFactory stubFactory, - String serverEndpoint, - String workspaceId, - String unityCatalogEndpoint, - String clientId, - String clientSecret, - StreamConfigurationOptions options, - ExecutorService zerobusStreamExecutor, - ExecutorService ec) { - this.stub = null; - this.stubSupplier = stubSupplier; - this.tableProperties = tableProperties; - this.stubFactory = stubFactory; - this.serverEndpoint = serverEndpoint; - this.workspaceId = workspaceId; - this.unityCatalogEndpoint = unityCatalogEndpoint; - this.clientId = clientId; - this.clientSecret = clientSecret; - this.options = options; - this.zerobusStreamExecutor = zerobusStreamExecutor; - this.ec = ec; - - this.recordsQueuedForSending = new ArrayBlockingQueue<>(options.maxInflightRecords()); - this.inflightRecords = new ArrayBlockingQueue<>(options.maxInflightRecords()); - this.descriptorProto = tableProperties.getDescriptorProto(); - - // Initialize background tasks and observers - initServerUnresponsivenessDetectionTask(); - initRecordsSenderTask(); - initAckReceiver(); - } + private native List nativeGetUnackedBatches(long handle); } diff --git a/src/main/resources/native/linux-x86_64/libzerobus_jni.so b/src/main/resources/native/linux-x86_64/libzerobus_jni.so new file mode 100755 index 0000000..c14cde3 Binary files /dev/null and b/src/main/resources/native/linux-x86_64/libzerobus_jni.so differ diff --git a/src/main/resources/native/windows-x86_64/zerobus_jni.dll b/src/main/resources/native/windows-x86_64/zerobus_jni.dll new file mode 100644 index 0000000..e426197 Binary files /dev/null and b/src/main/resources/native/windows-x86_64/zerobus_jni.dll differ diff --git 
a/src/test/java/com/databricks/zerobus/ArrowStreamConfigurationOptionsTest.java b/src/test/java/com/databricks/zerobus/ArrowStreamConfigurationOptionsTest.java new file mode 100644 index 0000000..d5b51c7 --- /dev/null +++ b/src/test/java/com/databricks/zerobus/ArrowStreamConfigurationOptionsTest.java @@ -0,0 +1,94 @@ +package com.databricks.zerobus; + +import static org.junit.jupiter.api.Assertions.*; + +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link ArrowStreamConfigurationOptions} and its builder. + * + *

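+ * The options under test are assembled with the fluent builder, e.g.:
+ *
+ * <pre>{@code
+ * ArrowStreamConfigurationOptions options =
+ *     ArrowStreamConfigurationOptions.builder().setMaxInflightBatches(500).build();
+ * }</pre>
+ *
+ * <p>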
These tests verify the builder pattern, default values, and configuration without requiring + * the native library. + */ +public class ArrowStreamConfigurationOptionsTest { + + @Test + void testDefaultValues() { + ArrowStreamConfigurationOptions options = ArrowStreamConfigurationOptions.getDefault(); + + assertEquals(1000, options.maxInflightBatches()); + assertTrue(options.recovery()); + assertEquals(15000, options.recoveryTimeoutMs()); + assertEquals(2000, options.recoveryBackoffMs()); + assertEquals(4, options.recoveryRetries()); + assertEquals(60000, options.serverLackOfAckTimeoutMs()); + assertEquals(300000, options.flushTimeoutMs()); + assertEquals(30000, options.connectionTimeoutMs()); + } + + @Test + void testBuilderWithCustomValues() { + ArrowStreamConfigurationOptions options = + ArrowStreamConfigurationOptions.builder() + .setMaxInflightBatches(500) + .setRecovery(false) + .setRecoveryTimeoutMs(30000) + .setRecoveryBackoffMs(5000) + .setRecoveryRetries(10) + .setServerLackOfAckTimeoutMs(120000) + .setFlushTimeoutMs(600000) + .setConnectionTimeoutMs(60000) + .build(); + + assertEquals(500, options.maxInflightBatches()); + assertFalse(options.recovery()); + assertEquals(30000, options.recoveryTimeoutMs()); + assertEquals(5000, options.recoveryBackoffMs()); + assertEquals(10, options.recoveryRetries()); + assertEquals(120000, options.serverLackOfAckTimeoutMs()); + assertEquals(600000, options.flushTimeoutMs()); + assertEquals(60000, options.connectionTimeoutMs()); + } + + @Test + void testPartialConfiguration() { + ArrowStreamConfigurationOptions options = + ArrowStreamConfigurationOptions.builder() + .setMaxInflightBatches(2000) + .setConnectionTimeoutMs(45000) + .build(); + + assertEquals(2000, options.maxInflightBatches()); + assertEquals(45000, options.connectionTimeoutMs()); + // Rest should be defaults + assertTrue(options.recovery()); + assertEquals(15000, options.recoveryTimeoutMs()); + } + + @Test + void testMinimalConfiguration() { + ArrowStreamConfigurationOptions options = ArrowStreamConfigurationOptions.builder().build(); + + // All should be defaults + assertEquals(1000, options.maxInflightBatches()); + assertTrue(options.recovery()); + assertEquals(30000, options.connectionTimeoutMs()); + } + + @Test + void testLongTimeoutValues() { + // Arrow uses long for timeouts (unlike StreamConfigurationOptions which uses int) + long largeTimeout = 3_600_000L; // 1 hour + + ArrowStreamConfigurationOptions options = + ArrowStreamConfigurationOptions.builder() + .setRecoveryTimeoutMs(largeTimeout) + .setFlushTimeoutMs(largeTimeout) + .setConnectionTimeoutMs(largeTimeout) + .build(); + + assertEquals(largeTimeout, options.recoveryTimeoutMs()); + assertEquals(largeTimeout, options.flushTimeoutMs()); + assertEquals(largeTimeout, options.connectionTimeoutMs()); + } +} diff --git a/src/test/java/com/databricks/zerobus/ArrowTablePropertiesTest.java b/src/test/java/com/databricks/zerobus/ArrowTablePropertiesTest.java new file mode 100644 index 0000000..41bfbe6 --- /dev/null +++ b/src/test/java/com/databricks/zerobus/ArrowTablePropertiesTest.java @@ -0,0 +1,92 @@ +package com.databricks.zerobus; + +import static org.junit.jupiter.api.Assertions.*; + +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link ArrowTableProperties}. + * + *

Note: These tests focus on validation and basic functionality. Full Arrow schema tests would + * require Apache Arrow libraries on the classpath. + */ +public class ArrowTablePropertiesTest { + + @Test + void testTableNameValidation_ThreeParts() { + // Should accept three-part table names + Object dummySchema = new Object(); // Placeholder schema for testing + ArrowTableProperties props = new ArrowTableProperties("catalog.schema.table", dummySchema); + + assertEquals("catalog.schema.table", props.getTableName()); + } + + @Test + void testTableNameValidation_RejectsOnePart() { + Object dummySchema = new Object(); + + assertThrows( + IllegalArgumentException.class, () -> new ArrowTableProperties("table_only", dummySchema)); + } + + @Test + void testTableNameValidation_RejectsTwoParts() { + Object dummySchema = new Object(); + + assertThrows( + IllegalArgumentException.class, + () -> new ArrowTableProperties("schema.table", dummySchema)); + } + + @Test + void testTableNameValidation_RejectsFourParts() { + Object dummySchema = new Object(); + + assertThrows( + IllegalArgumentException.class, + () -> new ArrowTableProperties("extra.catalog.schema.table", dummySchema)); + } + + @Test + void testNullTableNameThrows() { + Object dummySchema = new Object(); + + assertThrows(NullPointerException.class, () -> new ArrowTableProperties(null, dummySchema)); + } + + @Test + void testNullSchemaThrows() { + assertThrows( + NullPointerException.class, () -> new ArrowTableProperties("catalog.schema.table", null)); + } + + @Test + void testGetSchema() { + Object dummySchema = new Object(); + ArrowTableProperties props = new ArrowTableProperties("catalog.schema.table", dummySchema); + + assertEquals(dummySchema, props.getSchema()); + } + + @Test + void testValidTableNames() { + Object dummySchema = new Object(); + String[] validNames = { + "main.default.my_table", "catalog.schema.table", "a.b.c", "my_catalog.my_schema.my_table123" + }; + + for (String name : validNames) { + ArrowTableProperties props = new ArrowTableProperties(name, dummySchema); + assertEquals(name, props.getTableName()); + } + } + + @Test + void testSchemaIsStoredAsIs() { + // The schema is stored as Object and returned as-is + String fakeSchema = "this would be an Arrow Schema object"; + ArrowTableProperties props = new ArrowTableProperties("cat.sch.tbl", fakeSchema); + + assertSame(fakeSchema, props.getSchema()); + } +} diff --git a/src/test/java/com/databricks/zerobus/ExceptionTest.java b/src/test/java/com/databricks/zerobus/ExceptionTest.java new file mode 100644 index 0000000..ed78a4c --- /dev/null +++ b/src/test/java/com/databricks/zerobus/ExceptionTest.java @@ -0,0 +1,151 @@ +package com.databricks.zerobus; + +import static org.junit.jupiter.api.Assertions.*; + +import org.junit.jupiter.api.Test; + +/** + * Unit tests for the exception hierarchy: {@link ZerobusException} and {@link + * NonRetriableException}. 
+ */ +public class ExceptionTest { + + // ==================== ZerobusException Tests ==================== + + @Test + void testZerobusExceptionWithMessage() { + String message = "Connection failed"; + ZerobusException exception = new ZerobusException(message); + + assertEquals(message, exception.getMessage()); + assertNull(exception.getCause()); + } + + @Test + void testZerobusExceptionWithMessageAndCause() { + String message = "Stream error"; + Throwable cause = new RuntimeException("Underlying cause"); + ZerobusException exception = new ZerobusException(message, cause); + + assertEquals(message, exception.getMessage()); + assertEquals(cause, exception.getCause()); + } + + @Test + void testZerobusExceptionIsRetryable() { + ZerobusException exception = new ZerobusException("Temporary failure"); + + // ZerobusException represents retriable errors + assertTrue(exception instanceof Exception); + assertFalse(exception instanceof NonRetriableException); + } + + @Test + void testZerobusExceptionThrowAndCatch() { + assertThrows( + ZerobusException.class, + () -> { + throw new ZerobusException("Test exception"); + }); + } + + // ==================== NonRetriableException Tests ==================== + + @Test + void testNonRetriableExceptionWithMessage() { + String message = "Invalid credentials"; + NonRetriableException exception = new NonRetriableException(message); + + assertEquals(message, exception.getMessage()); + assertNull(exception.getCause()); + } + + @Test + void testNonRetriableExceptionWithMessageAndCause() { + String message = "Table not found"; + Throwable cause = new IllegalArgumentException("Bad table name"); + NonRetriableException exception = new NonRetriableException(message, cause); + + assertEquals(message, exception.getMessage()); + assertEquals(cause, exception.getCause()); + } + + @Test + void testNonRetriableExceptionExtendsZerobusException() { + NonRetriableException exception = new NonRetriableException("Fatal error"); + + assertTrue(exception instanceof ZerobusException); + } + + @Test + void testNonRetriableExceptionCanBeCaughtAsZerobus() { + try { + throw new NonRetriableException("Fatal"); + } catch (ZerobusException e) { + // Should be caught here + assertTrue(e instanceof NonRetriableException); + } + } + + @Test + void testNonRetriableExceptionThrowAndCatch() { + assertThrows( + NonRetriableException.class, + () -> { + throw new NonRetriableException("Test non-retriable"); + }); + } + + // ==================== Exception Hierarchy Tests ==================== + + @Test + void testExceptionHierarchy() { + ZerobusException retriable = new ZerobusException("Retriable"); + NonRetriableException nonRetriable = new NonRetriableException("Non-retriable"); + + // Type checks + assertTrue(retriable instanceof Exception); + assertTrue(nonRetriable instanceof Exception); + assertTrue(nonRetriable instanceof ZerobusException); + + // NonRetriableException is a subtype of ZerobusException + assertFalse(retriable instanceof NonRetriableException); + assertTrue(nonRetriable instanceof ZerobusException); + } + + @Test + void testDistinguishExceptionTypes() { + // Simulate error handling logic that distinguishes between exception types + Exception[] exceptions = { + new ZerobusException("Network timeout"), + new NonRetriableException("Invalid token"), + new ZerobusException("Server busy"), + new NonRetriableException("Missing table") + }; + + int retriableCount = 0; + int nonRetriableCount = 0; + + for (Exception e : exceptions) { + if (e instanceof NonRetriableException) { + 
nonRetriableCount++; + } else if (e instanceof ZerobusException) { + retriableCount++; + } + } + + assertEquals(2, retriableCount); + assertEquals(2, nonRetriableCount); + } + + @Test + void testExceptionChaining() { + Throwable root = new IllegalStateException("Root cause"); + ZerobusException middle = new ZerobusException("Middle", root); + NonRetriableException top = new NonRetriableException("Top level", middle); + + assertEquals("Top level", top.getMessage()); + assertEquals(middle, top.getCause()); + assertEquals(root, top.getCause().getCause()); + } +} diff --git a/src/test/java/com/databricks/zerobus/IngestableRecordTest.java b/src/test/java/com/databricks/zerobus/IngestableRecordTest.java new file mode 100644 index 0000000..62c7e5e --- /dev/null +++ b/src/test/java/com/databricks/zerobus/IngestableRecordTest.java @@ -0,0 +1,237 @@ +package com.databricks.zerobus; + +import static org.junit.jupiter.api.Assertions.*; + +import com.databricks.test.table.TestTableRow.CityPopulationTableRow; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link IngestableRecord} implementations including {@link JsonRecord}, {@link + * ProtoRecord}, and {@link EncodedBatch}. + */ +public class IngestableRecordTest { + + // ==================== JsonRecord Tests ==================== + + @Test + void testJsonRecordCreation() { + String json = "{\"name\": \"test\", \"value\": 42}"; + JsonRecord record = new JsonRecord(json); + + assertEquals(json, record.getJson()); + assertTrue(record.isJson()); + } + + @Test + void testJsonRecordOf() { + String json = "{\"key\": \"value\"}"; + JsonRecord record = JsonRecord.of(json); + + assertEquals(json, record.getJson()); + assertTrue(record.isJson()); + } + + @Test + void testJsonRecordToEncodedBytes() { + String json = "{\"test\": true}"; + JsonRecord record = JsonRecord.of(json); + + byte[] bytes = record.toEncodedBytes(); + String decoded = new String(bytes, StandardCharsets.UTF_8); + + assertEquals(json, decoded); + } + + @Test + void testJsonRecordFromObject() { + // Simple test object + TestData data = new TestData("hello", 123); + + JsonRecord record = + JsonRecord.fromObject( + data, obj -> String.format("{\"name\":\"%s\",\"count\":%d}", obj.name, obj.count)); + + String json = record.getJson(); + assertTrue(json.contains("\"name\":\"hello\"")); + assertTrue(json.contains("\"count\":123")); + } + + @Test + void testJsonRecordWithUnicode() { + String json = "{\"message\": \"Hello, 世界! 
🌍\"}"; + JsonRecord record = JsonRecord.of(json); + + byte[] bytes = record.toEncodedBytes(); + String decoded = new String(bytes, StandardCharsets.UTF_8); + + assertEquals(json, decoded); + } + + @Test + void testJsonRecordEmptyObject() { + JsonRecord record = JsonRecord.of("{}"); + + assertEquals("{}", record.getJson()); + assertEquals(2, record.toEncodedBytes().length); + } + + // ==================== ProtoRecord Tests ==================== + + @Test + void testProtoRecordCreation() { + CityPopulationTableRow message = + CityPopulationTableRow.newBuilder().setCityName("Seattle").setPopulation(750000).build(); + + ProtoRecord record = new ProtoRecord<>(message); + + assertEquals(message, record.getMessage()); + assertFalse(record.isJson()); + } + + @Test + void testProtoRecordOf() { + CityPopulationTableRow message = + CityPopulationTableRow.newBuilder().setCityName("Portland").setPopulation(650000).build(); + + ProtoRecord record = ProtoRecord.of(message); + + assertEquals(message, record.getMessage()); + assertFalse(record.isJson()); + } + + @Test + void testProtoRecordToEncodedBytes() { + CityPopulationTableRow message = + CityPopulationTableRow.newBuilder().setCityName("Denver").setPopulation(715000).build(); + + ProtoRecord record = ProtoRecord.of(message); + byte[] bytes = record.toEncodedBytes(); + + // Bytes should match proto serialization + assertArrayEquals(message.toByteArray(), bytes); + } + + @Test + void testProtoRecordIsNotJson() { + CityPopulationTableRow message = CityPopulationTableRow.getDefaultInstance(); + ProtoRecord record = ProtoRecord.of(message); + + assertFalse(record.isJson()); + } + + // ==================== EncodedBatch Tests ==================== + + @Test + void testEncodedBatchCreation() { + List records = + Arrays.asList("record1".getBytes(), "record2".getBytes(), "record3".getBytes()); + + EncodedBatch batch = new EncodedBatch(records, false); + + assertEquals(3, batch.size()); + assertFalse(batch.isJson()); + assertFalse(batch.isEmpty()); + } + + @Test + void testEncodedBatchJsonMode() { + List records = Arrays.asList("{\"a\":1}".getBytes(), "{\"b\":2}".getBytes()); + + EncodedBatch batch = new EncodedBatch(records, true); + + assertTrue(batch.isJson()); + assertEquals(2, batch.size()); + } + + @Test + void testEncodedBatchEmpty() { + EncodedBatch batch = new EncodedBatch(Arrays.asList(), false); + + assertTrue(batch.isEmpty()); + assertEquals(0, batch.size()); + } + + @Test + void testEncodedBatchGetRecords() { + byte[] record1 = "data1".getBytes(); + byte[] record2 = "data2".getBytes(); + List records = Arrays.asList(record1, record2); + + EncodedBatch batch = new EncodedBatch(records, false); + List retrieved = batch.getRecords(); + + assertEquals(2, retrieved.size()); + assertArrayEquals(record1, retrieved.get(0)); + assertArrayEquals(record2, retrieved.get(1)); + } + + @Test + void testEncodedBatchToIngestableRecordsProto() { + byte[] proto1 = new byte[] {0x0a, 0x04, 't', 'e', 's', 't'}; + byte[] proto2 = new byte[] {0x0a, 0x05, 'h', 'e', 'l', 'l', 'o'}; + + EncodedBatch batch = new EncodedBatch(Arrays.asList(proto1, proto2), false); + List ingestableRecords = batch.toIngestableRecords(); + + assertEquals(2, ingestableRecords.size()); + assertFalse(ingestableRecords.get(0).isJson()); + assertFalse(ingestableRecords.get(1).isJson()); + assertArrayEquals(proto1, ingestableRecords.get(0).toEncodedBytes()); + assertArrayEquals(proto2, ingestableRecords.get(1).toEncodedBytes()); + } + + @Test + void testEncodedBatchToIngestableRecordsJson() { + byte[] 
json1 = "{\"id\":1}".getBytes(StandardCharsets.UTF_8); + byte[] json2 = "{\"id\":2}".getBytes(StandardCharsets.UTF_8); + + EncodedBatch batch = new EncodedBatch(Arrays.asList(json1, json2), true); + List ingestableRecords = batch.toIngestableRecords(); + + assertEquals(2, ingestableRecords.size()); + assertTrue(ingestableRecords.get(0).isJson()); + assertTrue(ingestableRecords.get(1).isJson()); + + // JSON records should decode back to the original strings + JsonRecord record1 = (JsonRecord) ingestableRecords.get(0); + JsonRecord record2 = (JsonRecord) ingestableRecords.get(1); + assertEquals("{\"id\":1}", record1.getJson()); + assertEquals("{\"id\":2}", record2.getJson()); + } + + // ==================== Interface Consistency Tests ==================== + + @Test + void testIngestableRecordInterfaceConsistency() { + // Both JsonRecord and ProtoRecord should work through IngestableRecord interface + CityPopulationTableRow protoMsg = + CityPopulationTableRow.newBuilder().setCityName("Test").build(); + + IngestableRecord jsonRecord = JsonRecord.of("{\"test\": true}"); + IngestableRecord protoRecord = ProtoRecord.of(protoMsg); + + // Verify interface methods + assertTrue(jsonRecord.isJson()); + assertFalse(protoRecord.isJson()); + + assertNotNull(jsonRecord.toEncodedBytes()); + assertNotNull(protoRecord.toEncodedBytes()); + + assertTrue(jsonRecord.toEncodedBytes().length > 0); + assertTrue(protoRecord.toEncodedBytes().length > 0); + } + + // Helper class for testing + private static class TestData { + String name; + int count; + + TestData(String name, int count) { + this.name = name; + this.count = count; + } + } +} diff --git a/src/test/java/com/databricks/zerobus/IntegrationTest.java b/src/test/java/com/databricks/zerobus/IntegrationTest.java new file mode 100644 index 0000000..5aec138 --- /dev/null +++ b/src/test/java/com/databricks/zerobus/IntegrationTest.java @@ -0,0 +1,457 @@ +package com.databricks.zerobus; + +import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assumptions.assumeTrue; + +import com.databricks.test.table.AirQualityRow.AirQuality; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import org.junit.jupiter.api.*; + +/** + * Integration tests that run against a real Zerobus server. + * + *

These tests are skipped unless the following environment variables are set: + * + *

    + *
  • ZEROBUS_SERVER_ENDPOINT - The Zerobus server endpoint URL + *
  • DATABRICKS_WORKSPACE_URL - The Databricks workspace URL + *
  • ZEROBUS_TABLE_NAME - The target table name (catalog.schema.table) + *
  • DATABRICKS_CLIENT_ID - Service principal application ID + *
  • DATABRICKS_CLIENT_SECRET - Service principal secret + *
+ *
+ *

Run with: {@code mvn test -Dtest=IntegrationTest} + */ +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +public class IntegrationTest { + + private static String serverEndpoint; + private static String workspaceUrl; + private static String tableName; + private static String clientId; + private static String clientSecret; + private static boolean configAvailable; + private static boolean nativeLibraryAvailable; + + @BeforeAll + static void checkPrerequisites() { + // Check configuration + serverEndpoint = System.getenv("ZEROBUS_SERVER_ENDPOINT"); + workspaceUrl = System.getenv("DATABRICKS_WORKSPACE_URL"); + tableName = System.getenv("ZEROBUS_TABLE_NAME"); + clientId = System.getenv("DATABRICKS_CLIENT_ID"); + clientSecret = System.getenv("DATABRICKS_CLIENT_SECRET"); + + configAvailable = + serverEndpoint != null + && workspaceUrl != null + && tableName != null + && clientId != null + && clientSecret != null; + + if (!configAvailable) { + System.out.println( + "Integration tests skipped: Required environment variables not set. " + + "Set ZEROBUS_SERVER_ENDPOINT, DATABRICKS_WORKSPACE_URL, ZEROBUS_TABLE_NAME, " + + "DATABRICKS_CLIENT_ID, and DATABRICKS_CLIENT_SECRET"); + } + + // Check native library + try { + NativeLoader.ensureLoaded(); + nativeLibraryAvailable = true; + } catch (UnsatisfiedLinkError | ExceptionInInitializerError e) { + nativeLibraryAvailable = false; + System.out.println( + "Integration tests skipped: Native library not available - " + e.getMessage()); + } + } + + @BeforeEach + void skipIfPrerequisitesNotMet() { + assumeTrue(nativeLibraryAvailable, "Native library not available"); + assumeTrue(configAvailable, "Configuration not available"); + } + + @Test + @Order(1) + @DisplayName("SDK creates and closes successfully") + void testSdkCreateAndClose() { + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + assertNotNull(sdk); + sdk.close(); + } + + @Test + @Order(2) + @DisplayName("Create stream and ingest single proto record") + void testSingleProtoRecord() throws Exception { + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + TableProperties tableProperties = + new TableProperties<>(tableName, AirQuality.getDefaultInstance()); + + ZerobusStream stream = + sdk.createStream(tableProperties, clientId, clientSecret).join(); + + assertNotNull(stream); + assertFalse(stream.isClosed()); + + try { + AirQuality record = + AirQuality.newBuilder() + .setDeviceName("integration-test-proto") + .setTemp(25) + .setHumidity(65L) + .build(); + + long offset = stream.ingestRecordOffset(ProtoRecord.of(record)); + assertTrue(offset >= 0, "Offset should be non-negative"); + + stream.waitForOffset(offset); + System.out.println("Single proto record ingested successfully, offset: " + offset); + } finally { + stream.close(); + assertTrue(stream.isClosed()); + } + } + + @Test + @Order(3) + @DisplayName("Create stream and ingest single JSON record") + void testSingleJsonRecord() throws Exception { + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + TableProperties tableProperties = + new TableProperties<>(tableName, AirQuality.getDefaultInstance()); + + // Create a JSON-configured stream + StreamConfigurationOptions jsonOptions = + StreamConfigurationOptions.builder().setRecordType(RecordType.JSON).build(); + + ZerobusStream stream = + sdk.createStream(tableProperties, clientId, clientSecret, jsonOptions).join(); + + try { + JsonRecord jsonRecord = + JsonRecord.of( + "{\"device_name\": \"integration-test-json\", \"temp\": 28, \"humidity\": 55}"); + 
+ long offset = stream.ingestRecordOffset(jsonRecord); + assertTrue(offset >= 0); + + stream.waitForOffset(offset); + System.out.println("Single JSON record ingested successfully, offset: " + offset); + } finally { + stream.close(); + } + } + + @Test + @Order(4) + @DisplayName("Batch proto record ingestion") + void testBatchProtoIngestion() throws Exception { + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + TableProperties tableProperties = + new TableProperties<>(tableName, AirQuality.getDefaultInstance()); + + StreamConfigurationOptions options = + StreamConfigurationOptions.builder().setMaxInflightRecords(1000).build(); + + ZerobusStream stream = + sdk.createStream(tableProperties, clientId, clientSecret, options).join(); + + try { + List> batch = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + AirQuality record = + AirQuality.newBuilder() + .setDeviceName("batch-test-" + i) + .setTemp(15 + (i % 20)) + .setHumidity(40 + (i % 50)) + .build(); + batch.add(ProtoRecord.of(record)); + } + + Optional offset = stream.ingestRecordsOffset(batch); + assertTrue(offset.isPresent()); + + stream.waitForOffset(offset.get()); + System.out.println("Batch of 100 proto records ingested, offset: " + offset.get()); + } finally { + stream.close(); + } + } + + @Test + @Order(5) + @DisplayName("Batch JSON record ingestion") + void testBatchJsonIngestion() throws Exception { + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + TableProperties tableProperties = + new TableProperties<>(tableName, AirQuality.getDefaultInstance()); + + // Create a JSON-configured stream + StreamConfigurationOptions jsonOptions = + StreamConfigurationOptions.builder().setRecordType(RecordType.JSON).build(); + + ZerobusStream stream = + sdk.createStream(tableProperties, clientId, clientSecret, jsonOptions).join(); + + try { + List batch = new ArrayList<>(); + for (int i = 0; i < 50; i++) { + String json = + String.format( + "{\"device_name\": \"json-batch-%d\", \"temp\": %d, \"humidity\": %d}", + i, 20 + (i % 15), 50 + (i % 40)); + batch.add(JsonRecord.of(json)); + } + + Optional offset = stream.ingestRecordsOffset(batch); + assertTrue(offset.isPresent()); + + stream.waitForOffset(offset.get()); + System.out.println("Batch of 50 JSON records ingested, offset: " + offset.get()); + } finally { + stream.close(); + } + } + + @Test + @Order(6) + @DisplayName("Flush operation") + void testFlush() throws Exception { + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + TableProperties tableProperties = + new TableProperties<>(tableName, AirQuality.getDefaultInstance()); + + ZerobusStream stream = + sdk.createStream(tableProperties, clientId, clientSecret).join(); + + try { + // Ingest multiple records without waiting + for (int i = 0; i < 10; i++) { + AirQuality record = + AirQuality.newBuilder() + .setDeviceName("flush-test-" + i) + .setTemp(22) + .setHumidity(60L) + .build(); + stream.ingestRecordOffset(ProtoRecord.of(record)); + } + + // Flush and wait for all + stream.flush(); + System.out.println("Flush completed successfully"); + } finally { + stream.close(); + } + } + + @Test + @Order(7) + @DisplayName("High-throughput offset-based ingestion") + void testHighThroughput() throws Exception { + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + TableProperties tableProperties = + new TableProperties<>(tableName, AirQuality.getDefaultInstance()); + + StreamConfigurationOptions options = + StreamConfigurationOptions.builder().setMaxInflightRecords(10000).build(); + 
+ ZerobusStream stream = + sdk.createStream(tableProperties, clientId, clientSecret, options).join(); + + int recordCount = 1000; + long startTime = System.currentTimeMillis(); + long lastOffset = -1; + + try { + for (int i = 0; i < recordCount; i++) { + AirQuality record = + AirQuality.newBuilder() + .setDeviceName("throughput-test-" + (i % 100)) + .setTemp(15 + (i % 20)) + .setHumidity(40 + (i % 50)) + .build(); + lastOffset = stream.ingestRecordOffset(ProtoRecord.of(record)); + } + + stream.waitForOffset(lastOffset); + + long endTime = System.currentTimeMillis(); + double durationSec = (endTime - startTime) / 1000.0; + double recordsPerSec = recordCount / durationSec; + + System.out.printf( + "Throughput test: %d records in %.2f sec = %.0f rec/sec%n", + recordCount, durationSec, recordsPerSec); + + assertTrue(recordsPerSec > 100, "Throughput should be at least 100 rec/sec"); + } finally { + stream.close(); + } + } + + @Test + @Order(8) + @DisplayName("AckCallback receives notifications") + void testAckCallback() throws Exception { + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + TableProperties tableProperties = + new TableProperties<>(tableName, AirQuality.getDefaultInstance()); + + // Track callback invocations + List ackedOffsets = new ArrayList<>(); + List errors = new ArrayList<>(); + + AckCallback callback = + new AckCallback() { + @Override + public void onAck(long offsetId) { + synchronized (ackedOffsets) { + ackedOffsets.add(offsetId); + } + } + + @Override + public void onError(long offsetId, String errorMessage) { + synchronized (errors) { + errors.add(errorMessage); + } + } + }; + + StreamConfigurationOptions options = + StreamConfigurationOptions.builder().setAckCallback(callback).build(); + + ZerobusStream stream = + sdk.createStream(tableProperties, clientId, clientSecret, options).join(); + + try { + long lastOffset = -1; + for (int i = 0; i < 10; i++) { + AirQuality record = + AirQuality.newBuilder() + .setDeviceName("callback-test-" + i) + .setTemp(25) + .setHumidity(60L) + .build(); + lastOffset = stream.ingestRecordOffset(ProtoRecord.of(record)); + } + + stream.waitForOffset(lastOffset); + + // Give callback time to be invoked + Thread.sleep(500); + + System.out.println("Callback received " + ackedOffsets.size() + " ack notifications"); + assertTrue(errors.isEmpty(), "No errors should have occurred"); + } finally { + stream.close(); + } + } + + @Test + @Order(9) + @DisplayName("Stream recreation preserves unacked records") + void testStreamRecreation() throws Exception { + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + TableProperties tableProperties = + new TableProperties<>(tableName, AirQuality.getDefaultInstance()); + + // First stream + ZerobusStream stream1 = + sdk.createStream(tableProperties, clientId, clientSecret).join(); + + // Ingest some records and flush before closing + for (int i = 0; i < 5; i++) { + AirQuality record = + AirQuality.newBuilder() + .setDeviceName("recreate-test-" + i) + .setTemp(25) + .setHumidity(60L) + .build(); + stream1.ingestRecordOffset(ProtoRecord.of(record)); + } + + stream1.flush(); + stream1.close(); + + // Get unacked batches after close (should be empty since we flushed) + List batches = stream1.getUnackedBatches(); + System.out.println("Unacked batches after flush and close: " + batches.size()); + assertTrue(batches.isEmpty(), "All batches should be acked after flush"); + + // Create new stream + ZerobusStream stream2 = + sdk.createStream(tableProperties, clientId, 
clientSecret).join(); + + try { + // Verify new stream works + AirQuality record = + AirQuality.newBuilder() + .setDeviceName("recreate-test-after") + .setTemp(30) + .setHumidity(70L) + .build(); + long offset = stream2.ingestRecordOffset(ProtoRecord.of(record)); + stream2.waitForOffset(offset); + System.out.println("New stream after recreation works correctly"); + } finally { + stream2.close(); + } + } + + @Test + @Order(10) + @DisplayName("Multiple concurrent streams") + void testConcurrentStreams() throws Exception { + ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl); + + TableProperties tableProperties = + new TableProperties<>(tableName, AirQuality.getDefaultInstance()); + + // Create two streams concurrently + ZerobusStream stream1 = + sdk.createStream(tableProperties, clientId, clientSecret).join(); + ZerobusStream stream2 = + sdk.createStream(tableProperties, clientId, clientSecret).join(); + + try { + // Ingest to both streams + AirQuality record1 = + AirQuality.newBuilder() + .setDeviceName("concurrent-stream-1") + .setTemp(25) + .setHumidity(60L) + .build(); + AirQuality record2 = + AirQuality.newBuilder() + .setDeviceName("concurrent-stream-2") + .setTemp(26) + .setHumidity(61L) + .build(); + + long offset1 = stream1.ingestRecordOffset(ProtoRecord.of(record1)); + long offset2 = stream2.ingestRecordOffset(ProtoRecord.of(record2)); + + stream1.waitForOffset(offset1); + stream2.waitForOffset(offset2); + + System.out.println("Both concurrent streams completed successfully"); + } finally { + stream1.close(); + stream2.close(); + } + } +} diff --git a/src/test/java/com/databricks/zerobus/MockedGrpcServer.java b/src/test/java/com/databricks/zerobus/MockedGrpcServer.java deleted file mode 100644 index 5620b98..0000000 --- a/src/test/java/com/databricks/zerobus/MockedGrpcServer.java +++ /dev/null @@ -1,420 +0,0 @@ -package com.databricks.zerobus; - -import io.grpc.stub.ClientCallStreamObserver; -import io.grpc.stub.StreamObserver; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.TimeUnit; - -/** - * MockedGrpcServer simulates server-side gRPC behavior for testing ZerobusSDK without needing a - * real server. - * - *

It intercepts gRPC stream messages, processes them asynchronously, and sends responses back to - * the client based on injected test scenarios. - */ -public class MockedGrpcServer { - private static class AckRecord { - final boolean success; - final long offsetId; - final long delayMs; - final Throwable error; - final boolean writeFailure; - final boolean closeStreamSignal; - - AckRecord( - boolean success, - long offsetId, - long delayMs, - Throwable error, - boolean writeFailure, - boolean closeStreamSignal) { - this.success = success; - this.offsetId = offsetId; - this.delayMs = delayMs; - this.error = error; - this.writeFailure = writeFailure; - this.closeStreamSignal = closeStreamSignal; - } - } - - private static class CreateStreamResponse { - final boolean success; - final long delayMs; - final boolean skip; - final boolean writeFailure; - - CreateStreamResponse(boolean success, long delayMs, boolean skip, boolean writeFailure) { - this.success = success; - this.delayMs = delayMs; - this.skip = skip; - this.writeFailure = writeFailure; - } - } - - private final ExecutorService executorService; - private final List capturedMessages; - private final List injectedAckRecords; - private final List injectedCreateStreamResponses; - private final BlockingQueue messagesToProcess; - - private StreamObserver ackSender; - private long lastReceivedOffsetId = -1; - private volatile boolean serverRunning = false; - private volatile boolean streamReady = true; - private Runnable streamReadyHandler; - - private final ClientCallStreamObserver messageReceiver = - new ClientCallStreamObserver() { - @Override - public void onNext(EphemeralStreamRequest request) { - synchronized (MockedGrpcServer.this) { - capturedMessages.add(request); - messagesToProcess.offer(request); - } - } - - @Override - public void onError(Throwable t) { - stopServerThread(); - } - - @Override - public void onCompleted() { - stopServerThread(); - } - - @Override - public boolean isReady() { - return streamReady; - } - - @Override - public void setOnReadyHandler(Runnable onReadyHandler) { - streamReadyHandler = onReadyHandler; - } - - @Override - public void disableAutoInboundFlowControl() {} - - @Override - public void request(int count) {} - - @Override - public void setMessageCompression(boolean enable) {} - - @Override - public void cancel(String message, Throwable cause) {} - }; - - public MockedGrpcServer() { - this.executorService = Executors.newFixedThreadPool(2); - this.capturedMessages = Collections.synchronizedList(new ArrayList<>()); - this.injectedAckRecords = Collections.synchronizedList(new ArrayList<>()); - this.injectedCreateStreamResponses = Collections.synchronizedList(new ArrayList<>()); - this.messagesToProcess = new LinkedBlockingQueue<>(); - } - - /** Initialize the mocked server with an ack sender and start processing messages. */ - public void initialize(StreamObserver ackSender) { - synchronized (this) { - this.ackSender = ackSender; - this.lastReceivedOffsetId = -1; - this.messagesToProcess.clear(); - startServerThread(); - } - } - - /** Inject a successful ack for a specific record offset with optional delay. */ - public void injectAckRecord(long offsetId, long delayMs) { - injectedAckRecords.add(new AckRecord(true, offsetId, delayMs, null, false, false)); - } - - /** Inject a successful ack for a specific record offset. */ - public void injectAckRecord(long offsetId) { - injectAckRecord(offsetId, 0); - } - - /** Clear all injected ack records. 
*/ - public void clearAckRecords() { - synchronized (injectedAckRecords) { - injectedAckRecords.clear(); - } - } - - /** Inject a failed ingest record response. */ - public void injectFailIngestRecord(long offsetId, long delayMs, Throwable error) { - injectedAckRecords.add(new AckRecord(false, offsetId, delayMs, error, false, false)); - } - - /** Inject a failed ingest record response. */ - public void injectFailIngestRecord(long offsetId) { - injectFailIngestRecord(offsetId, 0, new RuntimeException("Ingest record failed")); - } - - /** Inject a write failure for a specific record offset. */ - public void injectWriteFailureOfRecords(long offsetId, long delayMs) { - injectedAckRecords.add( - new AckRecord( - false, - offsetId, - delayMs, - new RuntimeException("IngestRecord write failure"), - true, - false)); - } - - /** Inject a write failure for a specific record offset. */ - public void injectWriteFailureOfRecords(long offsetId) { - injectWriteFailureOfRecords(offsetId, 0); - } - - /** Inject a non-retriable error for a specific record offset. */ - public void injectNonRetriableError(long offsetId, long delayMs) { - io.grpc.StatusRuntimeException nonRetriableError = - new io.grpc.StatusRuntimeException( - io.grpc.Status.UNAUTHENTICATED.withDescription("Non-retriable gRPC error")); - injectedAckRecords.add( - new AckRecord(false, offsetId, delayMs, nonRetriableError, false, false)); - } - - /** Inject a non-retriable error for a specific record offset. */ - public void injectNonRetriableError(long offsetId) { - injectNonRetriableError(offsetId, 0); - } - - /** Inject a CloseStreamSignal for a specific record offset. */ - public void injectCloseStreamSignal(long offsetId, long delayMs) { - injectedAckRecords.add(new AckRecord(true, offsetId, delayMs, null, false, true)); - } - - /** Inject a CloseStreamSignal for a specific record offset. */ - public void injectCloseStreamSignal(long offsetId) { - injectCloseStreamSignal(offsetId, 0); - } - - /** Inject a successful create stream response with delay. */ - public void injectCreateStreamSuccessWithDelay(long delayMs) { - injectedCreateStreamResponses.add(new CreateStreamResponse(true, delayMs, false, false)); - } - - /** Inject a failed create stream response. */ - public void injectFailCreateStream() { - injectedCreateStreamResponses.add(new CreateStreamResponse(false, 0, false, false)); - } - - /** Inject a skip create stream response (never sends response). */ - public void injectSkipCreateStreamResponse() { - injectedCreateStreamResponses.add(new CreateStreamResponse(false, 0, true, false)); - } - - /** Inject a write failure for create stream. */ - public void injectWriteFailureCreateStream(long delayMs) { - injectedCreateStreamResponses.add(new CreateStreamResponse(false, delayMs, false, true)); - } - - /** Inject a write failure for create stream. */ - public void injectWriteFailureCreateStream() { - injectWriteFailureCreateStream(0); - } - - /** Get all captured messages sent by the client. */ - public List getCapturedMessages() { - synchronized (capturedMessages) { - return new ArrayList<>(capturedMessages); - } - } - - /** Get the message receiver for the client to write to. */ - public ClientCallStreamObserver getMessageReceiver() { - return messageReceiver; - } - - /** Set stream readiness state. 
*/ - public void setStreamReady(boolean ready) { - boolean oldStreamReadyState = streamReady; - streamReady = ready; - - if (streamReady && !oldStreamReadyState && streamReadyHandler != null) { - streamReadyHandler.run(); - } - } - - /** Destroy the mocked server and clean up resources. */ - public void destroy() { - stopServerThread(); - executorService.shutdownNow(); - try { - executorService.awaitTermination(5, TimeUnit.SECONDS); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - } - - private void startServerThread() { - synchronized (this) { - if (serverRunning) { - return; - } - serverRunning = true; - } - - executorService.submit( - () -> { - try { - while (serverRunning) { - EphemeralStreamRequest request = messagesToProcess.poll(100, TimeUnit.MILLISECONDS); - if (request == null) { - continue; - } - - processMessage(request); - } - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } catch (Exception e) { - // Server thread error, stop processing - } - }); - } - - private void stopServerThread() { - synchronized (this) { - serverRunning = false; - } - } - - private void processMessage(EphemeralStreamRequest request) throws InterruptedException { - if (request.hasCreateStream()) { - handleCreateStream(); - } else if (request.hasIngestRecord()) { - handleIngestRecord(request.getIngestRecord().getOffsetId()); - } - } - - private void handleCreateStream() throws InterruptedException { - synchronized (injectedCreateStreamResponses) { - if (injectedCreateStreamResponses.isEmpty()) { - sendCreateStreamSuccess(); - return; - } - - CreateStreamResponse response = injectedCreateStreamResponses.remove(0); - if (response.skip) { - return; // Never send response - } - - if (response.delayMs > 0) { - Thread.sleep(response.delayMs); - } - - if (response.writeFailure) { - throw new RuntimeException("CreateStream write failure"); - } - - if (response.success) { - sendCreateStreamSuccess(); - } else { - sendError(new RuntimeException("Create stream failed")); - } - } - } - - private void handleIngestRecord(long offset) throws InterruptedException { - if (offset != lastReceivedOffsetId + 1) { - sendError( - new RuntimeException( - String.format( - "Invalid offset Id; expected %d but got %d", lastReceivedOffsetId + 1, offset))); - return; - } - - lastReceivedOffsetId = offset; - - synchronized (injectedAckRecords) { - if (injectedAckRecords.isEmpty()) { - // Default behavior: auto-ack all records when no specific behavior is injected - sendAck(offset); - return; - } - - // Check if there's a specific ack record for this offset - AckRecord matchingRecord = null; - for (int i = 0; i < injectedAckRecords.size(); i++) { - if (injectedAckRecords.get(i).offsetId == offset) { - matchingRecord = injectedAckRecords.remove(i); - break; - } - } - - if (matchingRecord != null) { - // Process the specific injected behavior - if (matchingRecord.delayMs > 0) { - Thread.sleep(matchingRecord.delayMs); - } - - if (matchingRecord.writeFailure) { - throw new RuntimeException("IngestRecord write failure"); - } - - if (matchingRecord.closeStreamSignal) { - sendCloseStreamSignal(); - } else if (matchingRecord.success) { - sendAck(offset); - } else { - Throwable error = - matchingRecord.error != null - ? 
matchingRecord.error - : new RuntimeException("Ingest failed"); - sendError(error); - } - } - // Note: If no matching record found and injectedAckRecords is not empty, - // do NOT send ack (this is intentional for tests that need to test lack of acks) - } - } - - private void sendCreateStreamSuccess() { - if (ackSender != null) { - EphemeralStreamResponse response = - EphemeralStreamResponse.newBuilder() - .setCreateStreamResponse( - CreateIngestStreamResponse.newBuilder().setStreamId("test-stream-id").build()) - .build(); - ackSender.onNext(response); - } - } - - private void sendAck(long offset) { - if (ackSender != null) { - EphemeralStreamResponse response = - EphemeralStreamResponse.newBuilder() - .setIngestRecordResponse( - IngestRecordResponse.newBuilder().setDurabilityAckUpToOffset(offset).build()) - .build(); - ackSender.onNext(response); - } - } - - private void sendCloseStreamSignal() { - if (ackSender != null) { - EphemeralStreamResponse response = - EphemeralStreamResponse.newBuilder() - .setCloseStreamSignal(CloseStreamSignal.newBuilder().build()) - .build(); - ackSender.onNext(response); - } - } - - private void sendError(Throwable error) { - if (ackSender != null) { - ackSender.onError(error); - } - stopServerThread(); - } -} diff --git a/src/test/java/com/databricks/zerobus/StreamConfigurationOptionsTest.java b/src/test/java/com/databricks/zerobus/StreamConfigurationOptionsTest.java new file mode 100644 index 0000000..4dc1b4d --- /dev/null +++ b/src/test/java/com/databricks/zerobus/StreamConfigurationOptionsTest.java @@ -0,0 +1,122 @@ +package com.databricks.zerobus; + +import static org.junit.jupiter.api.Assertions.*; + +import java.util.function.Consumer; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link StreamConfigurationOptions} and its builder. + * + *
<p>
These tests verify the builder pattern, default values, and configuration validation without + * requiring the native library. + */ +public class StreamConfigurationOptionsTest { + + @Test + void testDefaultValues() { + StreamConfigurationOptions options = StreamConfigurationOptions.getDefault(); + + assertEquals(50000, options.maxInflightRecords()); + assertTrue(options.recovery()); + assertEquals(15000, options.recoveryTimeoutMs()); + assertEquals(2000, options.recoveryBackoffMs()); + assertEquals(3, options.recoveryRetries()); + assertEquals(300000, options.flushTimeoutMs()); + assertEquals(60000, options.serverLackOfAckTimeoutMs()); + assertFalse(options.ackCallback().isPresent()); + assertFalse(options.getNewAckCallback().isPresent()); + } + + @Test + void testBuilderWithCustomValues() { + StreamConfigurationOptions options = + StreamConfigurationOptions.builder() + .setMaxInflightRecords(10000) + .setRecovery(false) + .setRecoveryTimeoutMs(30000) + .setRecoveryBackoffMs(5000) + .setRecoveryRetries(5) + .setFlushTimeoutMs(600000) + .setServerLackOfAckTimeoutMs(120000) + .build(); + + assertEquals(10000, options.maxInflightRecords()); + assertFalse(options.recovery()); + assertEquals(30000, options.recoveryTimeoutMs()); + assertEquals(5000, options.recoveryBackoffMs()); + assertEquals(5, options.recoveryRetries()); + assertEquals(600000, options.flushTimeoutMs()); + assertEquals(120000, options.serverLackOfAckTimeoutMs()); + } + + @Test + void testBuilderWithOldStyleAckCallback() { + Consumer callback = response -> {}; + + @SuppressWarnings("deprecation") + StreamConfigurationOptions options = + StreamConfigurationOptions.builder().setAckCallback(callback).build(); + + assertTrue(options.ackCallback().isPresent()); + assertEquals(callback, options.ackCallback().get()); + } + + @Test + void testBuilderWithNewStyleAckCallback() { + AckCallback callback = + new AckCallback() { + @Override + public void onAck(long offsetId) {} + + @Override + public void onError(long offsetId, String errorMessage) {} + }; + + StreamConfigurationOptions options = + StreamConfigurationOptions.builder().setAckCallback(callback).build(); + + assertTrue(options.getNewAckCallback().isPresent()); + assertEquals(callback, options.getNewAckCallback().get()); + } + + @Test + void testBuilderReturnsNewInstanceEachTime() { + StreamConfigurationOptions.StreamConfigurationOptionsBuilder builder = + StreamConfigurationOptions.builder(); + + StreamConfigurationOptions options1 = builder.setMaxInflightRecords(1000).build(); + StreamConfigurationOptions options2 = builder.setMaxInflightRecords(2000).build(); + + // Builder should create independent instances (though this builder may reuse state) + // The important thing is that build() returns a valid object + assertNotNull(options1); + assertNotNull(options2); + } + + @Test + void testPartialBuilderConfiguration() { + // Only set some values, rest should be defaults + StreamConfigurationOptions options = + StreamConfigurationOptions.builder() + .setMaxInflightRecords(25000) + .setRecovery(false) + .build(); + + assertEquals(25000, options.maxInflightRecords()); + assertFalse(options.recovery()); + // These should still be defaults + assertEquals(15000, options.recoveryTimeoutMs()); + assertEquals(2000, options.recoveryBackoffMs()); + } + + @Test + void testMinimumConfiguration() { + // Build with no customization + StreamConfigurationOptions options = StreamConfigurationOptions.builder().build(); + + // Should have all defaults + assertEquals(50000, 
options.maxInflightRecords()); + assertTrue(options.recovery()); + } +} diff --git a/src/test/java/com/databricks/zerobus/TablePropertiesTest.java b/src/test/java/com/databricks/zerobus/TablePropertiesTest.java new file mode 100644 index 0000000..bb21def --- /dev/null +++ b/src/test/java/com/databricks/zerobus/TablePropertiesTest.java @@ -0,0 +1,86 @@ +package com.databricks.zerobus; + +import static org.junit.jupiter.api.Assertions.*; + +import com.databricks.test.table.TestTableRow.CityPopulationTableRow; +import com.google.protobuf.DescriptorProtos; +import com.google.protobuf.Descriptors; +import org.junit.jupiter.api.Test; + +/** Unit tests for {@link TableProperties}. */ +public class TablePropertiesTest { + + @Test + void testTablePropertiesCreation() { + String tableName = "catalog.schema.test_table"; + CityPopulationTableRow defaultInstance = CityPopulationTableRow.getDefaultInstance(); + + TableProperties props = + new TableProperties<>(tableName, defaultInstance); + + assertEquals(tableName, props.getTableName()); + assertEquals(defaultInstance, props.getDefaultInstance()); + } + + @Test + void testGetDescriptor() { + TableProperties props = + new TableProperties<>("catalog.schema.table", CityPopulationTableRow.getDefaultInstance()); + + Descriptors.Descriptor descriptor = props.getDescriptor(); + + assertNotNull(descriptor); + assertEquals("CityPopulationTableRow", descriptor.getName()); + } + + @Test + void testGetDescriptorProto() { + TableProperties props = + new TableProperties<>("catalog.schema.table", CityPopulationTableRow.getDefaultInstance()); + + DescriptorProtos.DescriptorProto descriptorProto = props.getDescriptorProto(); + + assertNotNull(descriptorProto); + // The descriptor proto should have the fields from the message + assertTrue(descriptorProto.getFieldCount() > 0); + } + + @Test + void testTableNameWithThreeParts() { + String tableName = "main.default.my_table"; + TableProperties props = + new TableProperties<>(tableName, CityPopulationTableRow.getDefaultInstance()); + + assertEquals(tableName, props.getTableName()); + } + + @Test + void testTableNameFormats() { + // Test various table name formats (the SDK should accept them) + String[] validNames = { + "catalog.schema.table", + "main.default.air_quality", + "my_catalog.my_schema.my_table", + "catalog123.schema456.table789" + }; + + for (String name : validNames) { + TableProperties props = + new TableProperties<>(name, CityPopulationTableRow.getDefaultInstance()); + assertEquals(name, props.getTableName()); + } + } + + @Test + void testDescriptorProtoCanBeSerialized() { + TableProperties props = + new TableProperties<>("cat.sch.tbl", CityPopulationTableRow.getDefaultInstance()); + + DescriptorProtos.DescriptorProto descriptorProto = props.getDescriptorProto(); + byte[] serialized = descriptorProto.toByteArray(); + + // Should produce non-empty serialized form + assertNotNull(serialized); + assertTrue(serialized.length > 0); + } +} diff --git a/src/test/java/com/databricks/zerobus/ZerobusSdkTest.java b/src/test/java/com/databricks/zerobus/ZerobusSdkTest.java deleted file mode 100644 index 6a0ec76..0000000 --- a/src/test/java/com/databricks/zerobus/ZerobusSdkTest.java +++ /dev/null @@ -1,421 +0,0 @@ -package com.databricks.zerobus; - -import static org.junit.jupiter.api.Assertions.*; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.anyString; -import static org.mockito.Mockito.*; - -import com.databricks.test.table.TestTableRow.CityPopulationTableRow; -import 
io.grpc.stub.StreamObserver; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.TimeUnit; -import java.util.function.Consumer; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.junit.jupiter.MockitoExtension; - -/** - * Test suite for ZerobusSdk with mocked gRPC server. - * - *
<p>
These tests verify the SDK's core functionality including stream creation, record ingestion, - * acknowledgments, and flush operations without requiring a real Zerobus backend server. - * - *
<p>
Best practices followed: - Fast execution (no long sleeps or timeouts) - Clear test names - * describing what is being tested - Proper mock setup and teardown - Testing both success and - * failure paths - Using CompletableFutures for async operations - */ -@ExtendWith(MockitoExtension.class) -public class ZerobusSdkTest { - - private MockedGrpcServer mockedGrpcServer; - private ZerobusGrpc.ZerobusStub zerobusStub; - private ZerobusSdk zerobusSdk; - private ZerobusSdkStubFactory zerobusSdkStubFactory; - private org.mockito.MockedStatic tokenFactoryMock; - private io.grpc.stub.ClientCallStreamObserver spiedStream; - - @BeforeEach - public void setUp() { - // Create mocked gRPC server - mockedGrpcServer = new MockedGrpcServer(); - - // Create mocked stub - zerobusStub = mock(ZerobusGrpc.ZerobusStub.class); - - // Create spy on stub factory - zerobusSdkStubFactory = spy(ZerobusSdkStubFactory.create()); - - // Mock TokenFactory to return a fake token - tokenFactoryMock = mockStatic(TokenFactory.class); - tokenFactoryMock - .when( - () -> - TokenFactory.getZerobusToken( - anyString(), anyString(), anyString(), anyString(), anyString())) - .thenReturn("fake-token-for-testing"); - - // Create ZerobusSdk and set mocked stub factory - zerobusSdk = new ZerobusSdk("localhost:50051", "https://test.cloud.databricks.com"); - zerobusSdk.setStubFactory(zerobusSdkStubFactory); - - // Configure stub factory to return our mocked stub with token supplier - doReturn(zerobusStub) - .when(zerobusSdkStubFactory) - .createStubWithTokenSupplier(anyString(), anyString(), any()); - - // Setup mocked stub's ephemeralStream behavior - doAnswer( - invocation -> { - @SuppressWarnings("unchecked") - StreamObserver ackSender = - (StreamObserver) invocation.getArgument(0); - - mockedGrpcServer.initialize(ackSender); - - // Spy on the message receiver to verify cancel() is called - spiedStream = spy(mockedGrpcServer.getMessageReceiver()); - return spiedStream; - }) - .when(zerobusStub) - .ephemeralStream(any()); - } - - @AfterEach - public void tearDown() { - if (tokenFactoryMock != null) { - tokenFactoryMock.close(); - } - if (mockedGrpcServer != null) { - mockedGrpcServer.destroy(); - } - mockedGrpcServer = null; - zerobusStub = null; - zerobusSdk = null; - zerobusSdkStubFactory = null; - tokenFactoryMock = null; - } - - @Test - public void testSingleRecordIngestAndAcknowledgment() throws Exception { - // Test basic ingestion: send one record and verify it's acknowledged - mockedGrpcServer.injectAckRecord(0); - - TableProperties tableProperties = - new TableProperties<>("test-table", CityPopulationTableRow.getDefaultInstance()); - StreamConfigurationOptions options = - StreamConfigurationOptions.builder().setRecovery(false).build(); - - ZerobusStream stream = - zerobusSdk.createStream(tableProperties, "client-id", "client-secret", options).get(); - - assertEquals(StreamState.OPENED, stream.getState()); - - CompletableFuture writeCompleted = - stream.ingestRecord( - CityPopulationTableRow.newBuilder() - .setCityName("test-city") - .setPopulation(1000) - .build()); - - // Wait for acknowledgment - writeCompleted.get(5, TimeUnit.SECONDS); - - // Verify no unacked records - Iterator unackedRecords = stream.getUnackedRecords(); - assertFalse(unackedRecords.hasNext()); - - stream.close(); - assertEquals(StreamState.CLOSED, stream.getState()); - } - - @Test - public void testBatchIngestion() throws Exception { - // Test ingesting multiple records in a batch - int batchSize = 100; - - for (int i = 0; i < batchSize; i++) { - 
mockedGrpcServer.injectAckRecord(i); - } - - TableProperties tableProperties = - new TableProperties<>("test-table", CityPopulationTableRow.getDefaultInstance()); - StreamConfigurationOptions options = - StreamConfigurationOptions.builder().setRecovery(false).build(); - - ZerobusStream stream = - zerobusSdk.createStream(tableProperties, "client-id", "client-secret", options).get(); - assertEquals(StreamState.OPENED, stream.getState()); - - // Send records - List> futures = new ArrayList<>(); - for (int i = 0; i < batchSize; i++) { - futures.add( - stream.ingestRecord( - CityPopulationTableRow.newBuilder() - .setCityName("city-" + i) - .setPopulation(1000 + i) - .build())); - } - - // Wait for all acknowledgments - for (CompletableFuture future : futures) { - future.get(5, TimeUnit.SECONDS); - } - - // Verify all records acknowledged - Iterator unackedRecords = stream.getUnackedRecords(); - assertFalse(unackedRecords.hasNext()); - - stream.close(); - assertEquals(StreamState.CLOSED, stream.getState()); - } - - @Test - public void testFlushWaitsForAllAcknowledgments() throws Exception { - // Test that flush() blocks until all inflight records are acknowledged - int numRecords = 10; - mockedGrpcServer.injectAckRecord(numRecords - 1); - - TableProperties tableProperties = - new TableProperties<>("test-table", CityPopulationTableRow.getDefaultInstance()); - StreamConfigurationOptions options = - StreamConfigurationOptions.builder().setRecovery(false).build(); - - ZerobusStream stream = - zerobusSdk.createStream(tableProperties, "client-id", "client-secret", options).get(); - assertEquals(StreamState.OPENED, stream.getState()); - - // Ingest records - for (int i = 0; i < numRecords; i++) { - stream.ingestRecord( - CityPopulationTableRow.newBuilder() - .setCityName("device-" + i) - .setPopulation(20 + i) - .build()); - } - - // Flush should wait for all acks - stream.flush(); - - // Verify no unacked records after flush - Iterator unackedRecords = stream.getUnackedRecords(); - assertFalse(unackedRecords.hasNext()); - - stream.close(); - } - - @Test - public void testEmptyFlushReturnsImmediately() throws Exception { - // Test that flush() on an empty stream returns immediately - TableProperties tableProperties = - new TableProperties<>("test-table", CityPopulationTableRow.getDefaultInstance()); - StreamConfigurationOptions options = - StreamConfigurationOptions.builder().setRecovery(false).build(); - - ZerobusStream stream = - zerobusSdk.createStream(tableProperties, "client-id", "client-secret", options).get(); - - assertEquals(StreamState.OPENED, stream.getState()); - - // Measure flush execution time - long startTime = System.currentTimeMillis(); - stream.flush(); - long endTime = System.currentTimeMillis(); - long flushDuration = endTime - startTime; - - assertTrue( - flushDuration < 100, - "Expected flush to return immediately, but took " + flushDuration + "ms"); - - assertEquals(StreamState.OPENED, stream.getState()); - stream.close(); - } - - @Test - public void testAckCallback() throws Exception { - // Test that ack callbacks are invoked for each acknowledgment - List ackedOffsets = Collections.synchronizedList(new ArrayList<>()); - Consumer ackCallback = - response -> ackedOffsets.add(response.getDurabilityAckUpToOffset()); - - int numRecords = 10; - for (int i = 0; i < numRecords; i++) { - mockedGrpcServer.injectAckRecord(i); - } - - TableProperties tableProperties = - new TableProperties<>("test-table", CityPopulationTableRow.getDefaultInstance()); - StreamConfigurationOptions options = 
- StreamConfigurationOptions.builder().setRecovery(false).setAckCallback(ackCallback).build(); - - ZerobusStream stream = - zerobusSdk.createStream(tableProperties, "client-id", "client-secret", options).get(); - assertEquals(StreamState.OPENED, stream.getState()); - - // Ingest records - List> futures = new ArrayList<>(); - for (int i = 0; i < numRecords; i++) { - futures.add( - stream.ingestRecord( - CityPopulationTableRow.newBuilder() - .setCityName("test-city-" + i) - .setPopulation(i) - .build())); - } - - // Wait for all records to be acknowledged - for (CompletableFuture future : futures) { - future.get(5, TimeUnit.SECONDS); - } - - stream.flush(); - assertEquals(StreamState.OPENED, stream.getState()); - - // Wait for callbacks to complete - wait until we see the final offset (numRecords - 1) - long deadline = System.currentTimeMillis() + 2000; - boolean foundFinalOffset = false; - while (System.currentTimeMillis() < deadline) { - synchronized (ackedOffsets) { - if (!ackedOffsets.isEmpty() && ackedOffsets.contains((long) (numRecords - 1))) { - foundFinalOffset = true; - break; - } - } - Thread.sleep(10); - } - - // Verify callback was called and final offset was received - assertTrue(foundFinalOffset, "Expected to receive ack for final offset " + (numRecords - 1)); - assertTrue(ackedOffsets.size() > 0, "Expected callback to be called at least once"); - - // Verify the final offset was acknowledged - assertTrue( - ackedOffsets.contains((long) (numRecords - 1)), - "Expected callbacks to include offset " + (numRecords - 1)); - - // Verify unacked records are empty - Iterator unackedRecords = stream.getUnackedRecords(); - assertFalse(unackedRecords.hasNext()); - - stream.close(); - assertEquals(StreamState.CLOSED, stream.getState()); - } - - @Test - public void testCallbackExceptionHandling() throws Exception { - // Test that exceptions in callbacks don't crash the stream - List callbackInvocations = new ArrayList<>(); - List thrownExceptions = new ArrayList<>(); - - Consumer ackCallback = - response -> { - long offsetId = response.getDurabilityAckUpToOffset(); - callbackInvocations.add(offsetId); - - // Throw exception for offset 1 to test error handling - if (offsetId == 1) { - RuntimeException exception = - new RuntimeException("Test exception in callback for offset " + offsetId); - thrownExceptions.add(exception.getMessage()); - throw exception; - } - }; - - int numRecords = 3; - for (int i = 0; i < numRecords; i++) { - mockedGrpcServer.injectAckRecord(i); - } - - TableProperties tableProperties = - new TableProperties<>("test-table", CityPopulationTableRow.getDefaultInstance()); - StreamConfigurationOptions options = - StreamConfigurationOptions.builder().setRecovery(false).setAckCallback(ackCallback).build(); - - ZerobusStream stream = - zerobusSdk.createStream(tableProperties, "client-id", "client-secret", options).get(); - - assertEquals(StreamState.OPENED, stream.getState()); - - List> ingestResults = new ArrayList<>(); - for (int i = 0; i < numRecords; i++) { - CompletableFuture writeCompleted = - stream.ingestRecord( - CityPopulationTableRow.newBuilder() - .setCityName("error-callback-device-" + i) - .setPopulation(30 + i) - .build()); - ingestResults.add(writeCompleted); - } - - // Wait for all records to be acknowledged (should succeed despite callback exception) - for (CompletableFuture future : ingestResults) { - future.get(5, TimeUnit.SECONDS); - } - - // Wait for callbacks to complete - long deadline = System.currentTimeMillis() + 1000; - while 
(callbackInvocations.size() < numRecords && System.currentTimeMillis() < deadline) { - Thread.yield(); - } - - // Verify callback was invoked for all acknowledgments (including the one that threw) - assertEquals(numRecords, callbackInvocations.size()); - assertTrue(callbackInvocations.contains(0L)); - assertTrue(callbackInvocations.contains(1L)); - assertTrue(callbackInvocations.contains(2L)); - - // Verify the exception was thrown for offset 1 - assertEquals(1, thrownExceptions.size()); - assertTrue(thrownExceptions.get(0).contains("Test exception in callback for offset 1")); - - // Verify stream remains functional - Iterator unackedRecords = stream.getUnackedRecords(); - assertFalse(unackedRecords.hasNext()); - assertEquals(StreamState.OPENED, stream.getState()); - - stream.close(); - assertEquals(StreamState.CLOSED, stream.getState()); - } - - @Test - public void testGrpcStreamIsCancelledOnClose() throws Exception { - // Test that the underlying gRPC stream is properly cancelled when stream.close() is called - mockedGrpcServer.injectAckRecord(0); - - TableProperties tableProperties = - new TableProperties<>("test-table", CityPopulationTableRow.getDefaultInstance()); - StreamConfigurationOptions options = - StreamConfigurationOptions.builder().setRecovery(false).build(); - - ZerobusStream stream = - zerobusSdk.createStream(tableProperties, "client-id", "client-secret", options).get(); - - assertEquals(StreamState.OPENED, stream.getState()); - - // Ingest one record - CompletableFuture writeCompleted = - stream.ingestRecord( - CityPopulationTableRow.newBuilder() - .setCityName("test-city") - .setPopulation(1000) - .build()); - - writeCompleted.get(5, TimeUnit.SECONDS); - - // Close the stream - stream.close(); - assertEquals(StreamState.CLOSED, stream.getState()); - - // Verify that cancel() was called on the gRPC stream - verify(spiedStream, times(1)).cancel(anyString(), any()); - - // Also verify onCompleted() was called - verify(spiedStream, times(1)).onCompleted(); - } -} diff --git a/src/test/proto/air_quality.proto b/src/test/proto/air_quality.proto new file mode 100644 index 0000000..886a97d --- /dev/null +++ b/src/test/proto/air_quality.proto @@ -0,0 +1,13 @@ +syntax = "proto2"; + +package databricks.test.table; + +option java_package = "com.databricks.test.table"; +option java_outer_classname = "AirQualityRow"; + +// Air quality sensor reading for integration tests +message AirQuality { + optional string device_name = 1; + optional int32 temp = 2; + optional int64 humidity = 3; +}