Skip to content

Commit a0dd2fb

Browse files
committed
change blob storage conditions and conditional ext compression
1 parent b00246d commit a0dd2fb

File tree

3 files changed

+64
-16
lines changed

3 files changed

+64
-16
lines changed

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,9 @@ the config parameter is optional and do have default values
9797
* `rootdir` : defaults to the current working directory. The path in which to find the `content` directory.
9898
* `outdir` : defaults to `.structure`. The output directory, relative to the root directory, where all generated data is written.
9999
* `folder_single_doc` : defaults to `false`. When `true`, each folder is treated as a single document and the first YAML/YML file contributes overrides plus `meta_data` fields.
100-
* `blob_external_threshold_bytes` : defaults to `1_048_576` (1 MB). Blobs larger than this size are written to disk under `blobs/<YYYY>/<MM>/<prefix>/<hash>`.
101-
* `blob_inline_compression_min_bytes` : defaults to `4_096`. Inline blobs bigger than or equal to this size are gzip-compressed before being stored inside the `blob_store` table.
100+
* `external_storage_kb` : defaults to `512`. Blobs larger than this size (in KB) are written to disk under `blobs/<YYYY>/<MM>/<prefix>/<hash>`.
101+
* `inline_compression_kb` : defaults to `32`. Inline blobs bigger than or equal to this size (in KB) are eligible for gzip compression before being stored inside the `blob_store` table.
102+
* `file_compress_ext` : defaults to `["txt","md","json","csv","tsv","yaml","yml"]`. Inline blobs are compressed only if their source extension (when known) appears in this list.
102103
103104
## Generated output
104105
* `gen/document_list.json`

src/blob_manager.js

Lines changed: 58 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
import { createHash } from 'crypto';
22
import { readFile, writeFile } from 'fs/promises';
3-
import { join } from 'path';
3+
import { extname, join } from 'path';
44
import { gzip } from 'zlib';
55
import { promisify } from 'util';
66
import { check_dir_create, exists_abs } from './utils.js';
77
import { get_config } from './collect.js';
88

99
const gzipAsync = promisify(gzip);
10-
const DEFAULT_EXTERNAL_THRESHOLD_BYTES = 1024 * 1024; // 1MB
11-
const DEFAULT_INLINE_COMPRESSION_MIN_BYTES = 4 * 1024; // 4KB
10+
const DEFAULT_EXTERNAL_THRESHOLD_BYTES = 512 * 1024;
11+
const DEFAULT_INLINE_COMPRESSION_MIN_BYTES = 32 * 1024;
12+
const DEFAULT_COMPRESSIBLE_EXTENSIONS = new Set(['txt', 'md', 'json', 'csv', 'tsv', 'yaml', 'yml']);
1213

1314
function formatMonth(month) {
1415
return String(month).padStart(2, '0');
@@ -41,25 +42,32 @@ class BlobManager {
4142
constructor(timestamp) {
4243
this.timestamp = timestamp ?? new Date().toISOString();
4344
const config = get_config();
45+
const legacyExternalThreshold = config?.blob_external_threshold_bytes;
46+
const legacyInlineCompression = config?.blob_inline_compression_min_bytes;
4447
this.externalThreshold = normalizeThreshold(
45-
config?.blob_external_threshold_bytes,
48+
Number.isFinite(Number(config?.external_storage_kb))
49+
? Number(config.external_storage_kb) * 1024
50+
: legacyExternalThreshold,
4651
DEFAULT_EXTERNAL_THRESHOLD_BYTES
4752
);
4853
this.inlineCompressionMin = normalizeThreshold(
49-
config?.blob_inline_compression_min_bytes,
54+
Number.isFinite(Number(config?.inline_compression_kb))
55+
? Number(config.inline_compression_kb) * 1024
56+
: legacyInlineCompression,
5057
DEFAULT_INLINE_COMPRESSION_MIN_BYTES
5158
);
59+
this.compressibleExtensions = buildCompressibleExtensionSet(config?.file_compress_ext);
5260
this.hashIndex = new Map(); // hash -> {relativePath, storageDir, size, payload, compression}
5361
}
5462

55-
async ensureFromBuffer(buffer) {
63+
async ensureFromBuffer(buffer, options = {}) {
5664
if (!buffer) {
5765
return null;
5866
}
5967
const hash = createHash('sha512').update(buffer).digest('hex');
6068
const existing = this.hashIndex.get(hash);
6169
if (!existing) {
62-
const entry = await this.persistBuffer(buffer, hash);
70+
const entry = await this.persistBuffer(buffer, hash, options);
6371
this.hashIndex.set(hash, entry);
6472
return buildResult(hash, entry);
6573
}
@@ -71,7 +79,8 @@ class BlobManager {
7179
return null;
7280
}
7381
const buffer = await readFile(absPath);
74-
return this.ensureFromBuffer(buffer);
82+
const compressionHint = inferCompressionHintFromPath(absPath, this.compressibleExtensions);
83+
return this.ensureFromBuffer(buffer, {compressionHint});
7584
}
7685

7786
async writeBlob(relativePath, buffer) {
@@ -84,7 +93,7 @@ class BlobManager {
8493
await writeFile(absPath, buffer);
8594
}
8695

87-
async persistBuffer(buffer, hash) {
96+
async persistBuffer(buffer, hash, options = {}) {
8897
const size = buffer.length;
8998
if (size > this.externalThreshold) {
9099
const {year, month} = getDateParts(this.timestamp);
@@ -102,7 +111,7 @@ class BlobManager {
102111
compression: null
103112
};
104113
}
105-
const inlinePayload = await this.prepareInlinePayload(buffer);
114+
const inlinePayload = await this.prepareInlinePayload(buffer, options);
106115
return {
107116
relativePath: null,
108117
storageDir: null,
@@ -112,8 +121,8 @@ class BlobManager {
112121
};
113122
}
114123

115-
async prepareInlinePayload(buffer) {
116-
const shouldCompress = buffer.length >= this.inlineCompressionMin;
124+
async prepareInlinePayload(buffer, options = {}) {
125+
const shouldCompress = this.shouldCompressInlinePayload(buffer.length, options?.compressionHint);
117126
if (!shouldCompress) {
118127
return {
119128
payload: Buffer.from(buffer),
@@ -126,6 +135,16 @@ class BlobManager {
126135
compression: true
127136
};
128137
}
138+
139+
shouldCompressInlinePayload(byteLength, compressionHint) {
140+
if (byteLength < this.inlineCompressionMin) {
141+
return false;
142+
}
143+
if (!compressionHint) {
144+
return true;
145+
}
146+
return compressionHint.shouldCompress === true;
147+
}
129148
}
130149

131150
function normalizeThreshold(value, fallback) {
@@ -150,6 +169,33 @@ function buildResult(hash, entry) {
150169
};
151170
}
152171

172+
/**
 * Builds the set of file extensions whose inline blobs may be compressed.
 *
 * Accepts a comma-separated string, a number (stringified), or any iterable
 * of entries (array, Set, ...). Each entry is trimmed, stripped of one
 * leading `.` or `*.`, and lower-cased; empty entries are dropped.
 *
 * A fresh copy of DEFAULT_COMPRESSIBLE_EXTENSIONS is returned when the value
 * is missing, is of an unusable type (boolean, plain object, ...), or yields
 * no entries after normalization.
 *
 * @param {string|number|Iterable<*>|null|undefined} value - Raw
 *   `file_compress_ext` configuration value.
 * @returns {Set<string>} A new set of normalized extensions without dots.
 */
function buildCompressibleExtensionSet(value) {
    let rawEntries;
    if (typeof value === 'string' || typeof value === 'number') {
        rawEntries = String(value).split(',');
    } else if (value != null && typeof value[Symbol.iterator] === 'function') {
        rawEntries = [...value];
    } else {
        // Booleans and plain objects would otherwise stringify into a bogus
        // single extension ("true", "[object Object]") and replace the defaults.
        return new Set(DEFAULT_COMPRESSIBLE_EXTENSIONS);
    }

    const normalized = rawEntries
        .map((entry) => String(entry ?? '').trim().replace(/^\*?\./, '').toLowerCase())
        .filter(Boolean);

    return normalized.length > 0
        ? new Set(normalized)
        : new Set(DEFAULT_COMPRESSIBLE_EXTENSIONS);
}
185+
186+
/**
 * Derives a compression hint from a file path's extension.
 *
 * @param {string} absPath - Path of the source file.
 * @param {{has: function(string): boolean}} compressibleExtensions - Set of
 *   lower-case extensions (without the dot) that may be compressed.
 * @returns {{shouldCompress: boolean}|null} A hint saying whether the
 *   extension is in the set, or `null` when no hint can be derived: the path
 *   is empty or not a string, the path has no extension, or no usable
 *   extension set was supplied.
 */
function inferCompressionHintFromPath(absPath, compressibleExtensions) {
    // extname() throws on non-string input (URL, Buffer), so treat those as
    // "extension unknown" rather than failing the whole blob import.
    if (!absPath || typeof absPath !== 'string') {
        return null;
    }
    if (typeof compressibleExtensions?.has !== 'function') {
        return null;
    }
    // extname() yields '' or '.ext' (or a bare '.'); drop the dot.
    const extension = extname(absPath).slice(1).toLowerCase();
    if (!extension) {
        return null;
    }
    return {
        shouldCompress: compressibleExtensions.has(extension)
    };
}
198+
153199
/**
 * Factory that builds a new BlobManager.
 *
 * @param {string} [timestamp] - Forwarded unchanged to the BlobManager constructor.
 * @returns {BlobManager} The newly constructed manager.
 */
function createBlobManager(timestamp) {
    const manager = new BlobManager(timestamp);
    return manager;
}

src/collect.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@ let config = {
1313
outdir: ".structure",
1414
debug:false,
1515
folder_single_doc:false,
16-
blob_external_threshold_bytes: 1024 * 1024,
17-
blob_inline_compression_min_bytes: 4 * 1024
16+
external_storage_kb: 512,
17+
inline_compression_kb: 32,
18+
file_compress_ext: ['txt','md','json','csv','tsv','yaml','yml']
1819
}
1920

2021
const DOCUMENTS_TABLE_NAME = 'documents'

0 commit comments

Comments
 (0)