
Commit 801f6ec

s3 upload thread
1 parent 7fddacd commit 801f6ec

3 files changed: 87 additions & 62 deletions

s3/src/main/java/com/kesque/pulsar/sink/s3/AWSS3Sink.java

Lines changed: 5 additions & 16 deletions
@@ -68,6 +68,11 @@ public class AWSS3Sink implements Sink<byte[]> {
 
     private String filename;
 
+    // debug
+    private int groupCounter = 0;
+    private int fileSuffix = 0;
+    // end of debug
+
     /**
      * Write a message to Sink
      * @param inputRecordContext Context of input record from the source
@@ -84,23 +89,7 @@ public void write(Record<byte[]> record) throws Exception {
         // Optional<Message<byte[]>> msgOption = record.getMessage(); //.get();
         // LOG.error("message option isPresent {}", msgOption.isPresent());
 
-        try {
-            byte[] data = record.getValue();
-            String convJson = new String(data); // StandardCharsets.UTF_8);
-            LOG.info(convJson);
-            LOG.info("data payload length is {} string-value {}", data.length);
-            this.avroSchema = JsonUtil.inferSchema(JsonUtil.parse(convJson), "schemafromjson");
-
-            LOG.info(avroSchema.toString());
-
-        } catch (Exception e) {
-            e.printStackTrace();
-            LOG.error("msgOption is ", e);
-        }
-
         this.filename = getFilename(this.filePrefix, ledgerId);
-        LOG.info("filename is {}", this.filename);
-
         this.recordWriter.write(record, this.filename);
     }

s3/src/main/java/com/kesque/pulsar/sink/s3/format/parquet/ParquetRecordWriter.java

Lines changed: 82 additions & 34 deletions
@@ -3,6 +3,10 @@
 import static java.nio.charset.StandardCharsets.UTF_8;
 
 import java.io.IOException;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
 
 import com.fasterxml.jackson.databind.JsonNode;
 import com.kesque.pulsar.sink.s3.AWSS3Config;
@@ -18,6 +22,7 @@
 import org.apache.avro.io.DatumReader;
 import org.apache.avro.io.DecoderFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.logging.log4j.util.Strings;
 import org.apache.parquet.avro.AvroParquetWriter;
 import org.apache.parquet.hadoop.ParquetFileWriter;
 import org.apache.parquet.hadoop.ParquetWriter;
@@ -36,12 +41,22 @@ public class ParquetRecordWriter implements RecordWriter {
     private AWSS3Config config;
     private S3Storage s3Storage;
     private Configuration parquetWriterConfig;
-    private String currentFile = "";
     private Schema avroSchema;
-    private Record<byte[]> record; // kept for batch ack
+    private volatile String currentFile = "";
+    private volatile Record<byte[]> lastRecord; // kept for batch ack
 
-    S3ParquetOutputFile s3ParquetOutputFile = null;
-    private ParquetWriter<GenericData.Record> writer = null;
+    // parallel writer size
+    int WRITER_LIMIT = 4;
+
+    // key is the file name in S3
+    private ConcurrentHashMap<String, ParquetWriter<GenericData.Record>> writerMap = new ConcurrentHashMap<String, ParquetWriter<GenericData.Record>>(WRITER_LIMIT);
+    private ConcurrentHashMap<String, S3ParquetOutputFile> s3ParquetOutputFileMap = new ConcurrentHashMap<String, S3ParquetOutputFile>(WRITER_LIMIT);
+
+    // a thread pool of hard coded 4 threads for final commit and upload s3
+    ExecutorService uploaderExecutor = Executors.newFixedThreadPool(WRITER_LIMIT);
+
+    // S3ParquetOutputFile s3ParquetOutputFile = null;
+    // private ParquetWriter<GenericData.Record> writer = null;
 
     public ParquetRecordWriter(AWSS3Config confg, S3Storage storage) {
         this.config = confg;
@@ -50,57 +65,90 @@ public ParquetRecordWriter(AWSS3Config confg, S3Storage storage) {
         parquetWriterConfig = new Configuration();
         parquetWriterConfig.set("fs.s3.awsAccessKeyId", config.getAccessKeyId());
         parquetWriterConfig.set("fs.s3.awsSecretAccessKey", config.getSecretAccessKey());
-
     }
 
     @Override
     public void write(Record<byte[]> record, String file) {
         byte[] data = record.getValue();
         String convJson = new String(data); // StandardCharsets.UTF_8);
-        log.info("data payload length is {} string-value {}", data.length);
         JsonNode datum = JsonUtil.parse(convJson);
         this.avroSchema = JsonUtil.inferSchema(JsonUtil.parse(convJson), "schemafromjson");
         log.info(avroSchema.toString());
 
         GenericData.Record convertedRecord = (org.apache.avro.generic.GenericData.Record) JsonUtil.convertToAvro(GenericData.get(), datum, avroSchema);
+        writeParquet(convertedRecord, file);
+        this.lastRecord = record;
+    }
 
-        try {
-            if (file.equals(currentFile)) {
-                log.info("write to existing parquet writer");
-                writer.write(convertedRecord);
-
-            } else {
-                this.currentFile = file;
-                if (this.writer != null) {
-                    writer.write(convertedRecord);
-                    log.info("cumulative ack all pulsar messages and write to existing parquet writer");
-                    record.ack(); // depends on cumulative ack
-                    s3ParquetOutputFile.s3out.setCommit();
-                    this.writer.close();
-                    this.writer = null;
-                } else {
-                    s3ParquetOutputFile = new S3ParquetOutputFile(this.s3Storage, file);
-
-                    log.info("write to a new parquet writer");
-
-                    this.writer = AvroParquetWriter.<GenericData.Record>builder(s3ParquetOutputFile).withSchema(avroSchema)
-                        .withCompressionCodec(CompressionCodecName.SNAPPY) // GZIP
-                        .withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withConf(parquetWriterConfig)
-                        .withPageSize(4 * 1024 * 1024) // For compression
-                        .withRowGroupSize(16 * 1024 * 1024) // For write buffering (Page size)
-                        .build();
-
-                    writer.write(convertedRecord);
+    private synchronized void writeParquet(GenericData.Record record, String file) {
+        log.info("currentFile is {} file name is {}", this.currentFile, file);
+        String lastFile = this.currentFile; // save a copy because currentFile can be replace in the main thread
+        if (Strings.isNotBlank(lastFile) && !file.equals(lastFile)) {
+            uploaderExecutor.execute(() -> {
+                ParquetWriter<GenericData.Record> writer = writerMap.get(lastFile);
+                if (writer == null) {
+                    log.error("fatal error - failed to find parquet writer to match file {}", lastFile);
+                    return;
+                }
+                S3ParquetOutputFile s3ParquetOutputFile = s3ParquetOutputFileMap.get(lastFile);
+                if (s3ParquetOutputFile == null) {
+                    log.error("fatal error - failed to find s3ParquetOutputFile to match file {}", lastFile);
+                    return;
+                }
+
+                // when a new file and parquet writer is required
+                s3ParquetOutputFile.s3out.setCommit();
+                try {
+                    writer.close();
+                } catch (IOException e) {
+                    log.error("close parquet writer exception {}", e.getMessage());
+                    e.printStackTrace();
                 }
+                writerMap.remove(lastFile);
+                s3ParquetOutputFileMap.remove(lastFile);
+                log.info("cumulative ack all pulsar messages and write to existing parquet writer, map size {}", writerMap.size());
+                lastRecord.ack(); // depends on cumulative ack
+
+            });
+
+        }
+        this.currentFile = file; // for the next write
+
+        ParquetWriter<GenericData.Record> writer = this.writerMap.get(file);
+        if (writer==null) {
+            log.info("write to a new parquet writer with file {} currentFile {}", file, this.currentFile);
+            S3ParquetOutputFile s3ParquetOutputFile = new S3ParquetOutputFile(this.s3Storage, file);
+
+            try {
+                writer = AvroParquetWriter.<GenericData.Record>builder(s3ParquetOutputFile).withSchema(avroSchema)
+                    .withCompressionCodec(CompressionCodecName.SNAPPY) // GZIP
+                    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withConf(parquetWriterConfig)
+                    .withPageSize(4 * 1024 * 1024) // For compression
+                    .withRowGroupSize(16 * 1024 * 1024) // For write buffering (Page size)
+                    .build();
+            } catch (IOException e) {
+                log.error("create parquet s3 writer exception {}", e.getMessage());
+                e.printStackTrace();
             }
+
+            s3ParquetOutputFileMap.put(file, s3ParquetOutputFile);
+            writerMap.put(file, writer);
+            log.info("put writer and parquet output file to {}", file);
+        }
+
+        try {
+            writer.write(record);
         } catch (IOException e) {
-            e.printStackTrace();
             log.error("write to parquet s3 exception {}", e.getMessage());
+            e.printStackTrace();
         }
     }
 
     @Override
     public void close() {
+        if (!uploaderExecutor.isShutdown()) {
+            uploaderExecutor.shutdown();
+        }
         log.info("ParquetRecordWriter close()");
     }

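The core of this commit is the rotation logic in writeParquet(): one Parquet writer per S3 file name is kept in a ConcurrentHashMap, and when the incoming file name changes, the commit, close, and upload of the previous file are handed to a fixed thread pool so the write path is not blocked. Below is a minimal, self-contained sketch of that pattern; RotatingWriterSketch, FakeWriter, and the file names are illustrative stand-ins for the sink's ParquetWriter/S3ParquetOutputFile pair, not code from this repository.

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class RotatingWriterSketch {

    // Stand-in for the Parquet writer plus its S3 output file.
    static class FakeWriter {
        final String file;
        FakeWriter(String file) { this.file = file; }
        void write(String row) { System.out.println("buffer " + row + " -> " + file); }
        void commitAndClose() { System.out.println("commit + upload " + file); }
    }

    private static final int WRITER_LIMIT = 4;
    private final ConcurrentHashMap<String, FakeWriter> writerMap = new ConcurrentHashMap<>(WRITER_LIMIT);
    private final ExecutorService uploaderExecutor = Executors.newFixedThreadPool(WRITER_LIMIT);
    private volatile String currentFile = "";

    // Mirrors writeParquet(): rotate when the target file changes, then write
    // through the writer registered for the current file.
    synchronized void write(String row, String file) {
        String lastFile = currentFile;           // copy, since the field is replaced below
        if (!lastFile.isEmpty() && !file.equals(lastFile)) {
            uploaderExecutor.execute(() -> {     // close/upload off the write path
                FakeWriter finished = writerMap.remove(lastFile);
                if (finished != null) {
                    finished.commitAndClose();   // the real sink also acks Pulsar messages here
                }
            });
        }
        currentFile = file;
        writerMap.computeIfAbsent(file, FakeWriter::new).write(row);
    }

    void close() throws InterruptedException {
        uploaderExecutor.shutdown();
        uploaderExecutor.awaitTermination(30, TimeUnit.SECONDS);
    }

    public static void main(String[] args) throws InterruptedException {
        RotatingWriterSketch sink = new RotatingWriterSketch();
        sink.write("{\"a\":1}", "prefix-ledger-1");
        sink.write("{\"a\":2}", "prefix-ledger-1");
        sink.write("{\"a\":3}", "prefix-ledger-2"); // triggers async upload of ledger-1
        sink.close();
    }
}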
s3/src/main/java/com/kesque/pulsar/sink/s3/storage/S3OutputStream.java

Lines changed: 0 additions & 12 deletions
@@ -81,12 +81,10 @@ public S3OutputStream(String key, AWSS3Config conf, AmazonS3 s3) {
     this.compressionLevel = conf.compressionLevel;
     this.position = 0L;
     log.info("Create S3OutputStream for bucket '{}' key '{}'", bucket, key);
-    System.out.println("created S3OutputStream ... buffer partSize " + conf.partSize+ " bucket " + bucket + " keyname: " + key);
   }
 
   @Override
   public void write(int b) throws IOException {
-    System.out.println("write 1 called size b " + b);
     buffer.put((byte) b);
     if (!buffer.hasRemaining()) {
       uploadPart();
@@ -102,7 +100,6 @@ public void write(byte[] b) throws IOException {
 
   @Override
   public void write(byte[] b, int off, int len) throws IOException {
-    System.out.println("write 3 called off:" + off + " length:" + len + " remaining:" + buffer.remaining());
     if (b == null) {
       throw new NullPointerException();
     } else if (outOfRange(off, b.length) || len < 0 || outOfRange(off + len, b.length)) {
@@ -113,13 +110,11 @@ public void write(byte[] b, int off, int len) throws IOException {
 
     if (buffer.remaining() <= len) {
       int firstPart = buffer.remaining();
-      //System.out.println("first part " + firstPart + " position " + position);
       buffer.put(b, off, firstPart);
       position += firstPart;
       uploadPart();
       write(b, off + firstPart, len - firstPart);
     } else {
-      //System.out.println("position " + position + " off " + off + " length " + len);
       buffer.put(b, off, len);
       position += len;
     }
@@ -130,20 +125,16 @@ private static boolean outOfRange(int off, int len) {
   }
 
   private void uploadPart() throws IOException {
-    System.out.println("upload part partsize is " + partSize);
     uploadPart(partSize);
     buffer.clear();
   }
 
   private void uploadPart(final int size) throws IOException {
     if (multiPartUpload == null) {
       log.info("New multi-part upload for bucket '{}' key '{}'", bucket, key);
-      System.out.println("bucket " + bucket + " key "+ key);
       multiPartUpload = newMultipartUpload();
     }
 
-    System.out.println("Object upload started");
-
     try {
       multiPartUpload.uploadPart(new ByteArrayInputStream(buffer.array()), size);
     } catch (Exception e) {
@@ -172,7 +163,6 @@ public void commit() throws IOException {
       }
       multiPartUpload.complete();
       log.debug("Upload complete for bucket '{}' key '{}'", bucket, key);
-      System.out.println("Upload complete for bucket " + bucket + " key " + key);
     } catch (Exception e) {
       log.error("Multipart upload failed to complete for bucket '{}' key '{}'", bucket, key);
       throw e;
@@ -241,7 +231,6 @@ private class MultipartUpload {
     public MultipartUpload(String uploadId) {
       this.uploadId = uploadId;
       this.partETags = new ArrayList<>();
-      System.out.println("create MultipartUpload " + key + " uploadID " + uploadId);
       log.debug(
          "Initiated multi-part upload for bucket key '{}' with id '{}'",
          key,
@@ -261,7 +250,6 @@ public void uploadPart(ByteArrayInputStream inputStream, int partSize) {
         .withPartSize(partSize)
         .withGeneralProgressListener(progressListener);
       log.debug("Uploading part {} for id '{}'", currentPartNumber, uploadId);
-      System.out.println("Uploading part "+ currentPartNumber+" upload id is " +uploadId);
      partETags.add(s3.uploadPart(request).getPartETag());
    }

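For context on what S3OutputStream wraps: the code around these removed debug lines follows the AWS SDK for Java v1 multipart-upload flow (initiate, upload numbered parts while collecting PartETags, complete). A stand-alone sketch of that flow is below; the bucket, key, and the nominal 5 MB part size are placeholder values, not the sink's configuration.

import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest;
import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest;
import com.amazonaws.services.s3.model.PartETag;
import com.amazonaws.services.s3.model.UploadPartRequest;

import java.io.ByteArrayInputStream;
import java.util.ArrayList;
import java.util.List;

public class MultipartUploadSketch {
    public static void main(String[] args) {
        AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient();
        String bucket = "example-bucket";   // placeholder
        String key = "example/key.parquet"; // placeholder

        // Initiate the multipart upload and remember its id.
        String uploadId = s3.initiateMultipartUpload(
                new InitiateMultipartUploadRequest(bucket, key)).getUploadId();

        List<PartETag> partETags = new ArrayList<>();
        byte[] part = new byte[5 * 1024 * 1024]; // parts other than the last must be at least 5 MB

        // Upload one part; S3OutputStream does this each time its buffer fills.
        partETags.add(s3.uploadPart(new UploadPartRequest()
                .withBucketName(bucket)
                .withKey(key)
                .withUploadId(uploadId)
                .withPartNumber(1)
                .withInputStream(new ByteArrayInputStream(part))
                .withPartSize(part.length)).getPartETag());

        // commit(): complete the multipart upload with the collected part ETags.
        s3.completeMultipartUpload(
                new CompleteMultipartUploadRequest(bucket, key, uploadId, partETags));
    }
}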