
Commit f232316
bottomless: add xz compression option
Empirical testing shows that gzip achieves a mere ~2x compression ratio even on very simple, repetitive data patterns. Since compression is very important for optimizing our egress traffic and throughput in general, the xz algorithm is hereby implemented as well. Run against the same data set, it achieved a ~50x compression ratio, over an order of magnitude better than gzip, at the cost of elevated CPU usage.

Note: with more algorithms implemented, we should also consider adding code that detects which compression method was used when restoring a snapshot, so that a restore can read a gzip file while new snapshots continue with xz. Currently, setting the compression method via the env var assumes that restore and backup use the same algorithm.
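The detection suggested in the note above could sniff each format's magic bytes instead of trusting the configured method. A minimal sketch, not part of this commit (the detect_compression helper is hypothetical):

use crate::replicator::CompressionKind;

// Hypothetical helper: guess the compression method from the first
// bytes of a snapshot object instead of assuming the configured one.
fn detect_compression(header: &[u8]) -> CompressionKind {
    match header {
        // gzip streams begin with the two-byte magic 0x1f 0x8b.
        [0x1f, 0x8b, ..] => CompressionKind::Gzip,
        // xz streams begin with the six-byte magic fd 37 7a 58 5a 00 ("\xfd7zXZ\0").
        [0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00, ..] => CompressionKind::Xz,
        // Anything else is treated as a raw, uncompressed snapshot.
        _ => CompressionKind::None,
    }
}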
1 parent: eb92268

File tree

4 files changed: 57 additions & 11 deletions

bottomless/Cargo.toml
bottomless/src/backup.rs
bottomless/src/read.rs
bottomless/src/replicator.rs

bottomless/Cargo.toml

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ description = "Bottomless replication for libSQL"

 [dependencies]
 anyhow = "1.0.66"
-async-compression = { version = "0.3.15", features = ["tokio", "gzip"] }
+async-compression = { version = "0.3.15", features = ["tokio", "gzip", "xz"] }
 aws-config = { version = "0.55" }
 aws-sdk-s3 = { version = "0.28" }
 bytes = "1"
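The ~2x vs ~50x figures in the commit message can be sanity-checked with the crate and features added above. A minimal sketch, assuming tokio with the rt and macros features; the exact numbers depend entirely on the data:

use async_compression::tokio::write::{GzipEncoder, XzEncoder};
use tokio::io::AsyncWriteExt;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // A simple, repetitive pattern, in the spirit of the data set
    // described in the commit message.
    let input = b"0123456789abcdef".repeat(1 << 16); // 1 MiB

    // Compress the same input with both encoders at maximum quality.
    let mut gzip = GzipEncoder::with_quality(Vec::new(), async_compression::Level::Best);
    gzip.write_all(&input).await?;
    gzip.shutdown().await?;

    let mut xz = XzEncoder::with_quality(Vec::new(), async_compression::Level::Best);
    xz.write_all(&input).await?;
    xz.shutdown().await?;

    println!(
        "raw: {} B, gzip: {} B, xz: {} B",
        input.len(),
        gzip.into_inner().len(),
        xz.into_inner().len()
    );
    Ok(())
}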

bottomless/src/backup.rs

Lines changed: 8 additions & 0 deletions

@@ -116,6 +116,14 @@ impl WalCopier {
                     wal.copy_frames(&mut gzip, len).await?;
                     gzip.shutdown().await?;
                 }
+                CompressionKind::Xz => {
+                    let mut xz = async_compression::tokio::write::XzEncoder::with_quality(
+                        &mut out,
+                        async_compression::Level::Best,
+                    );
+                    wal.copy_frames(&mut xz, len).await?;
+                    xz.shutdown().await?;
+                }
             }
             if tracing::enabled!(tracing::Level::DEBUG) {
                 let elapsed = Instant::now() - period_start;
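The new arm mirrors the gzip one, including the final shutdown() call: for async-compression's write-side encoders, shutdown() is what flushes buffered data and writes the stream footer, so skipping it yields a truncated archive. A small illustration (not from this commit):

use async_compression::tokio::write::XzEncoder;
use tokio::io::AsyncWriteExt;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let mut finished = XzEncoder::new(Vec::new());
    finished.write_all(b"frame data").await?;
    finished.shutdown().await?; // flushes buffered data and the xz footer

    let mut truncated = XzEncoder::new(Vec::new());
    truncated.write_all(b"frame data").await?;
    // No shutdown(): most of the stream is likely still buffered.

    println!(
        "with shutdown: {} B, without: {} B",
        finished.into_inner().len(),
        truncated.into_inner().len()
    );
    Ok(())
}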

bottomless/src/read.rs

Lines changed: 5 additions & 1 deletion

@@ -1,7 +1,7 @@
 use crate::replicator::CompressionKind;
 use crate::wal::WalFrameHeader;
 use anyhow::Result;
-use async_compression::tokio::bufread::GzipDecoder;
+use async_compression::tokio::bufread::{GzipDecoder, XzDecoder};
 use aws_sdk_s3::primitives::ByteStream;
 use std::io::ErrorKind;
 use std::pin::Pin;
@@ -32,6 +32,10 @@ impl BatchReader {
                 let gzip = GzipDecoder::new(reader);
                 Box::pin(gzip)
             }
+            CompressionKind::Xz => {
+                let xz = XzDecoder::new(reader);
+                Box::pin(xz)
+            }
         },
     }
 }
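Because BatchReader sits on the read path, its decoder must pair with the write-side encoder used in backup.rs. A round-trip test along these lines (a sketch, not from this commit; assumes tokio's macros feature) would catch an encoder/decoder mix-up:

use async_compression::tokio::bufread::XzDecoder;
use async_compression::tokio::write::XzEncoder;
use tokio::io::{AsyncReadExt, AsyncWriteExt};

#[tokio::test]
async fn xz_round_trip() -> std::io::Result<()> {
    let input = b"WAL frame bytes".repeat(1024);

    // Compress with the write-side encoder, as WalCopier does.
    let mut encoder = XzEncoder::new(Vec::new());
    encoder.write_all(&input).await?;
    encoder.shutdown().await?;
    let compressed = encoder.into_inner();

    // Decompress with the bufread-side decoder, as BatchReader does.
    let mut decoder = XzDecoder::new(compressed.as_slice());
    let mut output = Vec::new();
    decoder.read_to_end(&mut output).await?;

    assert_eq!(input, output);
    Ok(())
}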

bottomless/src/replicator.rs

Lines changed: 43 additions & 9 deletions

@@ -5,7 +5,7 @@ use crate::uuid_utils::decode_unix_timestamp;
 use crate::wal::WalFileReader;
 use anyhow::{anyhow, bail};
 use arc_swap::ArcSwapOption;
-use async_compression::tokio::write::GzipEncoder;
+use async_compression::tokio::write::{GzipEncoder, XzEncoder};
 use aws_sdk_s3::config::{Credentials, Region};
 use aws_sdk_s3::error::SdkError;
 use aws_sdk_s3::operation::get_object::builders::GetObjectFluentBuilder;
@@ -653,7 +653,7 @@ impl Replicator {
             CompressionKind::None => Ok(ByteStream::from_path(db_path).await?),
             CompressionKind::Gzip => {
                 let mut reader = File::open(db_path).await?;
-                let gzip_path = Self::db_gzip_path(db_path);
+                let gzip_path = Self::db_compressed_path(db_path, "gz");
                 let compressed_file = OpenOptions::new()
                     .create(true)
                     .write(true)
@@ -671,13 +671,34 @@ impl Replicator {
                 );
                 Ok(ByteStream::from_path(gzip_path).await?)
             }
+            CompressionKind::Xz => {
+                let mut reader = File::open(db_path).await?;
+                let xz_path = Self::db_compressed_path(db_path, "xz");
+                let compressed_file = OpenOptions::new()
+                    .create(true)
+                    .write(true)
+                    .read(true)
+                    .truncate(true)
+                    .open(&xz_path)
+                    .await?;
+                let mut writer =
+                    XzEncoder::with_quality(compressed_file, async_compression::Level::Best);
+                let size = tokio::io::copy(&mut reader, &mut writer).await?;
+                writer.shutdown().await?;
+                tracing::debug!(
+                    "Compressed database file ({} bytes) into `{}`",
+                    size,
+                    xz_path.display()
+                );
+                Ok(ByteStream::from_path(xz_path).await?)
+            }
         }
     }

-    fn db_gzip_path(db_path: &Path) -> PathBuf {
-        let mut gzip_path = db_path.to_path_buf();
-        gzip_path.pop();
-        gzip_path.join("db.gz")
+    fn db_compressed_path(db_path: &Path, suffix: &'static str) -> PathBuf {
+        let mut compressed_path: PathBuf = db_path.to_path_buf();
+        compressed_path.pop();
+        compressed_path.join(format!("db.{suffix}"))
     }

     fn restore_db_path(&self) -> PathBuf {
@@ -816,9 +837,10 @@ impl Replicator {
             let _ = snapshot_notifier.send(Ok(Some(generation)));
             let elapsed = Instant::now() - start;
             tracing::debug!("Snapshot upload finished (took {:?})", elapsed);
-            // cleanup gzip database snapshot if exists
-            let gzip_path = Self::db_gzip_path(&db_path);
-            let _ = tokio::fs::remove_file(gzip_path).await;
+            // clean up the gzip/xz database snapshot if it exists
+            for suffix in &["gz", "xz"] {
+                let _ = tokio::fs::remove_file(Self::db_compressed_path(&db_path, suffix)).await;
+            }
         });
         let elapsed = Instant::now() - start_ts;
         tracing::debug!("Scheduled DB snapshot {} (took {:?})", generation, elapsed);
@@ -1163,6 +1185,7 @@ impl Replicator {
         let main_db_path = match self.use_compression {
             CompressionKind::None => format!("{}-{}/db.db", self.db_name, generation),
             CompressionKind::Gzip => format!("{}-{}/db.gz", self.db_name, generation),
+            CompressionKind::Xz => format!("{}-{}/db.xz", self.db_name, generation),
         };

         if let Ok(db_file) = self.get_object(main_db_path).send().await {
@@ -1175,6 +1198,12 @@ impl Replicator {
                 );
                 tokio::io::copy(&mut decompress_reader, db).await?
             }
+            CompressionKind::Xz => {
+                let mut decompress_reader = async_compression::tokio::bufread::XzDecoder::new(
+                    tokio::io::BufReader::new(body_reader),
+                );
+                tokio::io::copy(&mut decompress_reader, db).await?
+            }
         };
         db.flush().await?;
@@ -1235,6 +1264,7 @@ impl Replicator {
             Some(result) => result,
             None => {
                 if !key.ends_with(".gz")
+                    && !key.ends_with(".xz")
                     && !key.ends_with(".db")
                     && !key.ends_with(".meta")
                     && !key.ends_with(".dep")
@@ -1423,6 +1453,7 @@ impl Replicator {
         let str = fpath.to_str()?;
         if str.ends_with(".db")
             | str.ends_with(".gz")
+            | str.ends_with(".xz")
             | str.ends_with(".raw")
             | str.ends_with(".meta")
             | str.ends_with(".dep")
@@ -1670,13 +1701,15 @@ pub enum CompressionKind {
     #[default]
     None,
     Gzip,
+    Xz,
 }

 impl CompressionKind {
     pub fn parse(kind: &str) -> std::result::Result<Self, &str> {
         match kind {
             "gz" | "gzip" => Ok(CompressionKind::Gzip),
             "raw" | "" => Ok(CompressionKind::None),
+            "xz" => Ok(CompressionKind::Xz),
             other => Err(other),
         }
     }
@@ -1687,6 +1720,7 @@ impl std::fmt::Display for CompressionKind {
         match self {
             CompressionKind::None => write!(f, "raw"),
             CompressionKind::Gzip => write!(f, "gz"),
+            CompressionKind::Xz => write!(f, "xz"),
         }
     }
 }
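With the enum extended, choosing xz end to end comes down to the env var the commit message mentions. A usage sketch; the variable name LIBSQL_BOTTOMLESS_COMPRESSION is an assumption, since the diff does not show where parse() is wired up:

use crate::replicator::CompressionKind;

// Sketch: derive the compression method from the environment,
// falling back to raw for unknown values. The env var name is assumed.
fn compression_from_env() -> CompressionKind {
    let raw = std::env::var("LIBSQL_BOTTOMLESS_COMPRESSION").unwrap_or_default();
    match CompressionKind::parse(&raw) {
        Ok(kind) => kind,
        Err(other) => {
            tracing::warn!("unknown compression kind `{}`, falling back to raw", other);
            CompressionKind::None
        }
    }
}

Per the commit message, whatever method this returns must currently be used for both backup and restore until method detection is added.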
