From d2eb78f37c48b938696b979405439269fedcb635 Mon Sep 17 00:00:00 2001 From: KeKs0r Date: Thu, 2 Apr 2026 00:13:01 +0200 Subject: [PATCH 1/5] replace backfill chunking with smart planner --- packages/cli/src/plugin.test.ts | 2 +- packages/plugin-backfill/package.json | 5 + .../plugin-backfill/src/chunking/analyze.ts | 954 +++++++++++++++++- .../src/chunking/introspect.ts | 67 +- .../smart-chunking.integration.test.ts | 420 ++++++++ packages/plugin-backfill/src/chunking/sql.ts | 76 +- .../plugin-backfill/src/chunking/types.ts | 51 + packages/plugin-backfill/src/index.ts | 10 - packages/plugin-backfill/src/planner.ts | 20 +- packages/plugin-backfill/src/plugin.test.ts | 11 + packages/plugin-backfill/src/sdk.ts | 29 + packages/plugin-backfill/src/types.ts | 20 +- 12 files changed, 1573 insertions(+), 92 deletions(-) create mode 100644 packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts create mode 100644 packages/plugin-backfill/src/sdk.ts diff --git a/packages/cli/src/plugin.test.ts b/packages/cli/src/plugin.test.ts index ddda0ed..8ff0a2b 100644 --- a/packages/cli/src/plugin.test.ts +++ b/packages/cli/src/plugin.test.ts @@ -40,7 +40,7 @@ async function waitForParts( database: string, table: string, expectedPartitions: number, - timeoutMs = 15_000, + timeoutMs = 60_000, ): Promise { const start = Date.now() while (Date.now() - start < timeoutMs) { diff --git a/packages/plugin-backfill/package.json b/packages/plugin-backfill/package.json index c4cbe63..9a035ca 100644 --- a/packages/plugin-backfill/package.json +++ b/packages/plugin-backfill/package.json @@ -27,6 +27,11 @@ "source": "./src/index.ts", "types": "./dist/index.d.ts", "default": "./dist/index.js" + }, + "./sdk": { + "source": "./src/sdk.ts", + "types": "./dist/sdk.d.ts", + "default": "./dist/sdk.js" } }, "files": [ diff --git a/packages/plugin-backfill/src/chunking/analyze.ts b/packages/plugin-backfill/src/chunking/analyze.ts index 7e051e1..278d9df 100644 --- a/packages/plugin-backfill/src/chunking/analyze.ts +++ b/packages/plugin-backfill/src/chunking/analyze.ts @@ -1,8 +1,44 @@ import { hashId, randomPlanId } from '../state.js' -import { buildChunkBoundaries } from './build.js' -import { introspectTable, querySortKeyRanges } from './introspect.js' -import type { ChunkBoundary, PartitionInfo, PlannedChunk, SortKeyInfo } from './types.js' +import { introspectTable } from './introspect.js' +import type { + ChunkBoundary, + EstimateConfidence, + EstimateReason, + PartitionDiagnostics, + PartitionInfo, + PlannedChunk, + SliceLineageStep, + SliceRange, + SortKeyInfo, +} from './types.js' + +const MAX_SPLIT_DEPTH_MULTIPLIER = 3 +const TARGET_BYTES_FUZZ_FACTOR = 1.15 +const STOP_SPLIT_FUZZ_FACTOR = 1.5 +const STRING_PREFIX_START_DEPTH = 1 +const STRING_PREFIX_MAX_DEPTH = 4 +const BINARY_SEARCH_STEPS = 24 + +interface PartitionSlice { + partitionId: string + ranges: SliceRange[] + estimatedRows: number + estimatedBytes: number + isHotKey: boolean + hotDimensionIndex?: number + hotKeyValue?: string + estimateConfidence: EstimateConfidence + estimateReason: EstimateReason + lineage: SliceLineageStep[] +} + +interface QueryContext { + database: string + table: string + sortKeys: SortKeyInfo[] + query: (sql: string) => Promise +} export interface AnalyzeAndChunkInput { database: string @@ -18,11 +54,13 @@ export interface AnalyzeAndChunkResult { planId: string partitions: PartitionInfo[] sortKey?: SortKeyInfo + sortKeys: SortKeyInfo[] chunks: PlannedChunk[] + partitionDiagnostics: PartitionDiagnostics[] } export async function analyzeAndChunk(input: AnalyzeAndChunkInput): Promise { - const { partitions, sortKey, boundaries } = await analyzeTable({ + const { partitions, sortKey, sortKeys, boundaries, partitionDiagnostics } = await analyzeTable({ database: input.database, table: input.table, from: input.from, @@ -36,11 +74,12 @@ export async function analyzeAndChunk(input: AnalyzeAndChunkInput): Promise { - const { partitions, sortKey } = await introspectTable({ + const { partitions, sortKey, sortKeys } = await introspectTable({ database: input.database, table: input.table, from: input.from, @@ -67,34 +108,64 @@ export async function analyzeTable(input: AnalyzeTableInput): Promise p.bytesOnDisk > input.maxChunkBytes) - .map(p => p.partitionId) - - let sortKeyRanges: Map | undefined - if (sortKey && oversizedPartitionIds.length > 0) { - sortKeyRanges = await querySortKeyRanges({ - database: input.database, - table: input.table, - sortKeyColumn: sortKey.column, - partitionIds: oversizedPartitionIds, - query: input.query, - }) + const context: QueryContext = { + database: input.database, + table: input.table, + sortKeys, + query: input.query, } - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: input.maxChunkBytes, - sortKey, - sortKeyRanges, - }) + const boundaries: ChunkBoundary[] = [] + const partitionDiagnostics: PartitionDiagnostics[] = [] + + for (const partition of partitions) { + const slices = await planPartition(context, partition, input.maxChunkBytes) + const merged = mergeAdjacentSlices(slices, input.maxChunkBytes) - return { partitions, sortKey, boundaries } + for (const slice of merged) { + const primaryRange = getSliceRange(slice, 0) + boundaries.push({ + partitionId: slice.partitionId, + ranges: slice.ranges, + sortKeyFrom: primaryRange.from, + sortKeyTo: primaryRange.to, + estimatedBytes: slice.estimatedBytes, + estimatedRows: slice.estimatedRows, + isHotKey: slice.isHotKey, + hotDimensionIndex: slice.hotDimensionIndex, + hotKeyValue: slice.hotKeyValue, + estimateConfidence: slice.estimateConfidence, + estimateReason: slice.estimateReason, + lineage: slice.lineage, + }) + } + + const estimatedRowSum = merged.reduce((sum, slice) => sum + slice.estimatedRows, 0) + const estimateToExactRatio = partition.rows > 0 ? estimatedRowSum / partition.rows : 1 + partitionDiagnostics.push({ + partitionId: partition.partitionId, + estimatedRowSum, + exactPartitionRows: partition.rows, + estimateToExactRatio, + suspiciousEstimate: estimateToExactRatio < 0.7 || estimateToExactRatio > 1.3, + lowConfidenceChunkCount: merged.filter((slice) => slice.estimateConfidence === 'low').length, + usedDistributionFallback: merged.some((slice) => + slice.estimateReason === 'string-prefix-distribution' || + slice.estimateReason === 'temporal-distribution' || + slice.estimateReason === 'equal-width-distribution' + ), + usedLowConfidenceChunkRefinement: false, + usedExactCountFallback: false, + }) + } + + return { partitions, sortKey, sortKeys, boundaries, partitionDiagnostics } } export function buildPlannedChunks(input: { planId: string partitions: PartitionInfo[] + sortKeys: SortKeyInfo[] boundaries: ChunkBoundary[] requireIdempotencyToken: boolean }): PlannedChunk[] { @@ -109,21 +180,846 @@ export function buildPlannedChunks(input: { const chunkId = hashId(`chunk:${idSeed}`).slice(0, 16) const token = input.requireIdempotencyToken ? hashId(`token:${idSeed}`) : '' - const partition = input.partitions.find(p => p.partitionId === boundary.partitionId) - const from = boundary.sortKeyFrom ?? partition?.minTime ?? '' - const to = boundary.sortKeyTo ?? partition?.maxTime ?? '' + const partition = input.partitions.find((candidate) => candidate.partitionId === boundary.partitionId) + const { from, to } = deriveChunkWindow(boundary.ranges ?? [], input.sortKeys, partition) chunks.push({ id: chunkId, partitionId: boundary.partitionId, + ranges: boundary.ranges, sortKeyFrom: boundary.sortKeyFrom, sortKeyTo: boundary.sortKeyTo, estimatedBytes: boundary.estimatedBytes, + estimatedRows: boundary.estimatedRows, idempotencyToken: token, from, to, + isHotKey: boundary.isHotKey, + hotDimensionIndex: boundary.hotDimensionIndex, + hotKeyValue: boundary.hotKeyValue, + estimateConfidence: boundary.estimateConfidence, + estimateReason: boundary.estimateReason, + lineage: boundary.lineage, }) } return chunks } + +async function planPartition( + context: QueryContext, + partition: PartitionInfo, + maxChunkBytes: number, +): Promise { + if (partition.bytesOnDisk <= maxChunkBytes || context.sortKeys.length === 0) { + return [buildRootSlice(partition)] + } + + const rootSlice = buildRootSlice(partition) + return splitSliceRecursively(context, partition, rootSlice, maxChunkBytes, 0) +} + +async function splitSliceRecursively( + context: QueryContext, + partition: PartitionInfo, + slice: PartitionSlice, + maxChunkBytes: number, + depth: number, +): Promise { + if (slice.estimatedBytes <= maxChunkBytes * STOP_SPLIT_FUZZ_FACTOR) { + return [slice] + } + + if (depth >= context.sortKeys.length * MAX_SPLIT_DEPTH_MULTIPLIER) { + return [slice] + } + + const children = await splitOversizedSlice(context, partition, slice, maxChunkBytes, depth) + if (children.length <= 1) { + return [slice] + } + + const finalChildren: PartitionSlice[] = [] + for (const child of children) { + finalChildren.push(...await splitSliceRecursively(context, partition, child, maxChunkBytes, depth + 1)) + } + return finalChildren +} + +async function splitOversizedSlice( + context: QueryContext, + partition: PartitionInfo, + slice: PartitionSlice, + maxChunkBytes: number, + depth: number, +): Promise { + for (const dimensionIndex of getCandidateDimensions(context.sortKeys, slice)) { + const preparedSlice = await hydrateSliceRange(context, slice, dimensionIndex) + if (!preparedSlice) continue + + const sortKey = context.sortKeys[dimensionIndex] + if (!sortKey) continue + + const rootLike = depth === 0 + const hotIdentity = findHotIdentity(preparedSlice, context.sortKeys) + + if (sortKey.category === 'string') { + const stringSlices = await splitSliceWithStringPrefixes( + context, + partition, + preparedSlice, + dimensionIndex, + maxChunkBytes, + STRING_PREFIX_START_DEPTH, + ) + if (isEffectiveSplit(preparedSlice, stringSlices)) { + return applyHotIdentity(stringSlices, hotIdentity) + } + } + + if (sortKey.category === 'datetime' && (!rootLike || hotIdentity !== undefined)) { + const temporalSlices = await splitSliceWithTemporalBuckets( + context, + partition, + markHotSlice(preparedSlice, hotIdentity), + dimensionIndex, + maxChunkBytes, + ) + if (isEffectiveSplit(preparedSlice, temporalSlices)) { + return applyHotIdentity(temporalSlices, hotIdentity) + } + } + + const quantileSlices = await splitWithRanges( + context, + partition, + preparedSlice, + dimensionIndex, + maxChunkBytes, + ) + if (isEffectiveSplit(preparedSlice, quantileSlices)) { + return applyHotIdentity(quantileSlices, hotIdentity) + } + } + + return [slice] +} + +async function splitWithRanges( + context: QueryContext, + partition: PartitionInfo, + slice: PartitionSlice, + dimensionIndex: number, + maxChunkBytes: number, +): Promise { + const sortKey = context.sortKeys[dimensionIndex] + const range = getSliceRange(slice, dimensionIndex) + if (!sortKey || range.from === undefined || range.to === undefined) return [slice] + if (sortKey.category === 'string' && isExactSliceRange(range)) return [slice] + + const subCount = Math.ceil(slice.estimatedBytes / maxChunkBytes) + if (subCount <= 1) return [slice] + + const boundaries = await buildQuantileBoundaries(context, slice, dimensionIndex, subCount) + if (boundaries) { + return splitSliceWithBoundaries( + context, + partition, + slice, + dimensionIndex, + boundaries, + 'quantile-range-split', + 'split slice into quantile-aligned ranges', + 'quantile-estimate', + 'high', + ) + } + + const equalWidthBoundaries = buildEvenlySpacedBoundaries(range.from, range.to, subCount, sortKey) + return splitSliceWithBoundaries( + context, + partition, + slice, + dimensionIndex, + equalWidthBoundaries, + 'equal-width-split', + 'fallback to equal-width ranges', + 'equal-width-distribution', + 'low', + ) +} + +async function splitSliceWithBoundaries( + context: QueryContext, + partition: PartitionInfo, + slice: PartitionSlice, + dimensionIndex: number, + boundaries: string[], + strategyId: string, + reason: string, + estimateReason: EstimateReason, + estimateConfidence: EstimateConfidence, +): Promise { + const slices: PartitionSlice[] = [] + + for (let index = 0; index < boundaries.length - 1; index++) { + const ranges = replaceSliceRange(slice, dimensionIndex, boundaries[index], boundaries[index + 1]) + const estimatedRows = await countRows(context, partition.partitionId, ranges) + slices.push(buildSliceFromRows(partition, { + ranges, + estimatedRows, + isHotKey: false, + hotDimensionIndex: undefined, + hotKeyValue: undefined, + estimateConfidence, + estimateReason, + lineage: slice.lineage.concat([{ strategyId, dimensionIndex, reason }]), + })) + } + + return slices +} + +async function splitSliceWithStringPrefixes( + context: QueryContext, + partition: PartitionInfo, + slice: PartitionSlice, + dimensionIndex: number, + maxChunkBytes: number, + depth: number, +): Promise { + const sortKey = context.sortKeys[dimensionIndex] + const range = getSliceRange(slice, dimensionIndex) + if (!sortKey || sortKey.category !== 'string' || range.from === undefined || range.to === undefined) { + return [] + } + + const rows = await context.query<{ prefix: string; cnt: string }>(` +SELECT + substring(${sortKey.column}, 1, ${depth}) AS prefix, + count() AS cnt +FROM ${context.database}.${context.table} +WHERE ${buildWhereClause(partition.partitionId, replaceSliceRange(slice, dimensionIndex, range.from, range.to), context.sortKeys)} +GROUP BY prefix +ORDER BY prefix`) + + const slices: PartitionSlice[] = [] + + for (const row of rows) { + const bucket = { + value: row.prefix, + rowCount: Number(row.cnt), + isExactValue: Buffer.from(row.prefix, 'latin1').length < depth, + } + if (bucket.rowCount <= 0) continue + + const bucketFrom = maxBinaryString(range.from, bucket.value) + const bucketUpper = bucket.isExactValue ? `${bucket.value}\0` : nextPrefixValue(bucket.value) + if (!bucketUpper) continue + + const bucketTo = minBinaryString(range.to, bucketUpper) + const bucketSlice = buildSliceFromRows(partition, { + ranges: replaceSliceRange(slice, dimensionIndex, bucketFrom, bucketTo), + estimatedRows: bucket.rowCount, + isHotKey: false, + hotDimensionIndex: undefined, + hotKeyValue: undefined, + estimateConfidence: 'high', + estimateReason: 'string-prefix-distribution', + lineage: slice.lineage.concat([{ + strategyId: 'string-prefix-split', + dimensionIndex, + reason: 'split slice using string prefix distribution', + }]), + }) + + if (bucketSlice.estimatedBytes <= maxChunkBytes * TARGET_BYTES_FUZZ_FACTOR) { + slices.push(bucketSlice) + continue + } + + if (!bucket.isExactValue && depth < STRING_PREFIX_MAX_DEPTH) { + slices.push(...await splitSliceWithStringPrefixes( + context, + partition, + bucketSlice, + dimensionIndex, + maxChunkBytes, + depth + 1, + )) + continue + } + + slices.push(bucketSlice) + } + + return slices +} + +async function splitSliceWithTemporalBuckets( + context: QueryContext, + partition: PartitionInfo, + slice: PartitionSlice, + dimensionIndex: number, + maxChunkBytes: number, +): Promise { + const dayBuckets = await probeTemporalBuckets(context, partition.partitionId, slice.ranges, dimensionIndex, 'day') + if (dayBuckets.length === 0) return [slice] + + const daySlices = buildTemporalSlices(partition, slice, dimensionIndex, dayBuckets, maxChunkBytes) + if (daySlices.every((candidate) => candidate.estimatedBytes <= maxChunkBytes * TARGET_BYTES_FUZZ_FACTOR)) { + return daySlices + } + + const hourBuckets = await probeTemporalBuckets(context, partition.partitionId, slice.ranges, dimensionIndex, 'hour') + if (hourBuckets.length === 0) return daySlices + return buildTemporalSlices(partition, slice, dimensionIndex, hourBuckets, maxChunkBytes) +} + +async function probeTemporalBuckets( + context: QueryContext, + partitionId: string, + ranges: SliceRange[], + dimensionIndex: number, + grain: 'day' | 'hour', +): Promise> { + const sortKey = context.sortKeys[dimensionIndex] + if (!sortKey || sortKey.category !== 'datetime') return [] + + const bucketExpression = grain === 'day' + ? `toStartOfDay(${sortKey.column})` + : `toStartOfHour(${sortKey.column})` + + const rows = await context.query<{ bucket: string; cnt: string }>(` +SELECT + formatDateTime(${bucketExpression}, '%Y-%m-%dT%H:%i:%sZ') AS bucket, + count() AS cnt +FROM ${context.database}.${context.table} +WHERE ${buildWhereClause(partitionId, ranges, context.sortKeys)} +GROUP BY bucket +ORDER BY bucket`) + + return rows.map((row) => ({ + start: row.bucket, + rowCount: Number(row.cnt), + })) +} + +function buildTemporalSlices( + partition: PartitionInfo, + parentSlice: PartitionSlice, + dimensionIndex: number, + buckets: Array<{ start: string; rowCount: number }>, + maxChunkBytes: number, +): PartitionSlice[] { + const targetChunkRows = getTargetChunkRows(partition, maxChunkBytes) + const slices: PartitionSlice[] = [] + let currentStart: string | undefined + let currentRows = 0 + + for (let index = 0; index < buckets.length; index++) { + const bucket = buckets[index] + if (!bucket) continue + + if (currentStart === undefined) currentStart = bucket.start + + const wouldExceed = currentRows > 0 && currentRows + bucket.rowCount > targetChunkRows * TARGET_BYTES_FUZZ_FACTOR + if (wouldExceed && currentStart !== undefined) { + slices.push(buildSliceFromRows(partition, { + ranges: replaceSliceRange(parentSlice, dimensionIndex, currentStart, bucket.start), + estimatedRows: currentRows, + isHotKey: parentSlice.isHotKey, + hotDimensionIndex: parentSlice.hotDimensionIndex, + hotKeyValue: parentSlice.hotKeyValue, + estimateConfidence: 'low', + estimateReason: 'temporal-distribution', + lineage: parentSlice.lineage.concat([{ + strategyId: 'temporal-bucket-split', + dimensionIndex, + reason: 'split slice using temporal distribution buckets', + }]), + })) + currentStart = bucket.start + currentRows = 0 + } + + currentRows += bucket.rowCount + + if (index === buckets.length - 1 && currentStart !== undefined) { + slices.push(buildSliceFromRows(partition, { + ranges: replaceSliceRange(parentSlice, dimensionIndex, currentStart, getPartitionEndExclusive(partition)), + estimatedRows: currentRows, + isHotKey: parentSlice.isHotKey, + hotDimensionIndex: parentSlice.hotDimensionIndex, + hotKeyValue: parentSlice.hotKeyValue, + estimateConfidence: 'low', + estimateReason: 'temporal-distribution', + lineage: parentSlice.lineage.concat([{ + strategyId: 'temporal-bucket-split', + dimensionIndex, + reason: 'split slice using temporal distribution buckets', + }]), + })) + } + } + + return slices +} + +async function buildQuantileBoundaries( + context: QueryContext, + slice: PartitionSlice, + dimensionIndex: number, + subCount: number, +): Promise { + const range = getSliceRange(slice, dimensionIndex) + if (range.from === undefined || range.to === undefined) return undefined + + const boundaries = [range.from] + for (let step = 1; step < subCount; step++) { + const targetCumRows = Math.round((slice.estimatedRows * step) / subCount) + boundaries.push(await findQuantileBoundaryOnDimension(context, slice, dimensionIndex, targetCumRows)) + } + + const uniqueBoundaryCount = new Set(boundaries).size + if (uniqueBoundaryCount <= Math.max(2, Math.ceil(subCount / 3))) { + return undefined + } + + return boundaries.concat([range.to]) +} + +async function findQuantileBoundaryOnDimension( + context: QueryContext, + slice: PartitionSlice, + dimensionIndex: number, + targetCumRows: number, +): Promise { + const sortKey = context.sortKeys[dimensionIndex] + const range = getSliceRange(slice, dimensionIndex) + if (!sortKey || range.from === undefined || range.to === undefined) { + throw new Error(`Missing range for quantile split on dimension ${dimensionIndex}`) + } + + if (sortKey.category === 'string') { + let low = strToBigInt(range.from, 8) + let high = strToBigInt(range.to, 8) + + for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { + const midpoint = (low + high) / 2n + if (midpoint === low || midpoint === high) break + + const mid = bigIntToStr(midpoint, 8) + const rows = await countRows(context, slice.partitionId, replaceSliceRange(slice, dimensionIndex, range.from, mid)) + if (rows < targetCumRows) low = midpoint + else high = midpoint + } + + return bigIntToStr((low + high) / 2n, 8) + } + + if (sortKey.category === 'datetime') { + let low = parsePlannerDateTime(range.from) + let high = parsePlannerDateTime(range.to) + + for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { + const midpoint = Math.floor((low + high) / 2) + if (midpoint === low || midpoint === high) break + + const mid = new Date(midpoint).toISOString() + const rows = await countRows(context, slice.partitionId, replaceSliceRange(slice, dimensionIndex, range.from, mid)) + if (rows < targetCumRows) low = midpoint + else high = midpoint + } + + return new Date(Math.floor((low + high) / 2)).toISOString() + } + + let low = Number(range.from) + let high = Number(range.to) + for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { + const midpoint = Math.floor((low + high) / 2) + if (midpoint === low || midpoint === high) break + + const rows = await countRows(context, slice.partitionId, replaceSliceRange(slice, dimensionIndex, range.from, String(midpoint))) + if (rows < targetCumRows) low = midpoint + else high = midpoint + } + + return String(Math.floor((low + high) / 2)) +} + +async function hydrateSliceRange( + context: QueryContext, + slice: PartitionSlice, + dimensionIndex: number, +): Promise { + const currentRange = getSliceRange(slice, dimensionIndex) + if (currentRange.from !== undefined && currentRange.to !== undefined) return slice + + const sortKey = context.sortKeys[dimensionIndex] + if (!sortKey) return undefined + + const rows = await context.query<{ minVal: string; maxVal: string }>(` +SELECT + toString(min(${sortKey.column})) AS minVal, + toString(max(${sortKey.column})) AS maxVal +FROM ${context.database}.${context.table} +WHERE ${buildWhereClause(slice.partitionId, slice.ranges, context.sortKeys)}`) + + const observed = rows[0] + if (!observed) return undefined + + return { + ...slice, + ranges: replaceSliceRange(slice, dimensionIndex, observed.minVal, toExclusiveUpperBound(observed.maxVal, sortKey)), + } +} + +function buildRootSlice(partition: PartitionInfo): PartitionSlice { + return { + partitionId: partition.partitionId, + ranges: [], + estimatedRows: partition.rows, + estimatedBytes: partition.bytesOnDisk, + isHotKey: false, + estimateConfidence: 'high', + estimateReason: 'partition-metadata', + lineage: [], + } +} + +function buildSliceFromRows( + partition: PartitionInfo, + input: { + ranges: SliceRange[] + estimatedRows: number + isHotKey: boolean + hotDimensionIndex?: number + hotKeyValue?: string + estimateConfidence: EstimateConfidence + estimateReason: EstimateReason + lineage: SliceLineageStep[] + }, +): PartitionSlice { + return { + partitionId: partition.partitionId, + ranges: input.ranges, + estimatedRows: input.estimatedRows, + estimatedBytes: partition.rows > 0 + ? Math.round((input.estimatedRows / partition.rows) * partition.bytesOnDisk) + : 0, + isHotKey: input.isHotKey, + hotDimensionIndex: input.hotDimensionIndex, + hotKeyValue: input.hotKeyValue, + estimateConfidence: input.estimateConfidence, + estimateReason: input.estimateReason, + lineage: input.lineage, + } +} + +function getTargetChunkRows(partition: PartitionInfo, maxChunkBytes: number): number { + if (partition.bytesOnDisk <= 0) return partition.rows + return (maxChunkBytes * partition.rows) / partition.bytesOnDisk +} + +function mergeAdjacentSlices(slices: PartitionSlice[], maxChunkBytes: number): PartitionSlice[] { + if (slices.length <= 1) return slices + + const merged: PartitionSlice[] = [] + let current: PartitionSlice | undefined + + for (const slice of slices) { + if (!current) { + current = slice + continue + } + + const canMerge = + !current.isHotKey && + !slice.isHotKey && + haveSameTrailingRanges(current.ranges, slice.ranges) && + current.estimatedBytes + slice.estimatedBytes <= maxChunkBytes * 1.1 + + if (!canMerge) { + merged.push(current) + current = slice + continue + } + + current = { + ...current, + ranges: mergeRanges(current.ranges, slice.ranges), + estimatedRows: current.estimatedRows + slice.estimatedRows, + estimatedBytes: current.estimatedBytes + slice.estimatedBytes, + } + } + + if (current) merged.push(current) + return merged +} + +function mergeRanges(left: SliceRange[], right: SliceRange[]): SliceRange[] { + return left.map((leftRange) => { + const rightRange = right.find((candidate) => candidate.dimensionIndex === leftRange.dimensionIndex) + return rightRange === undefined + ? leftRange + : { + dimensionIndex: leftRange.dimensionIndex, + from: leftRange.from, + to: rightRange.to, + } + }) +} + +function haveSameTrailingRanges(left: SliceRange[], right: SliceRange[]): boolean { + if (left.length !== right.length) return false + + let differingDimensions = 0 + for (const leftRange of left) { + const rightRange = right.find((candidate) => candidate.dimensionIndex === leftRange.dimensionIndex) + if (!rightRange) return false + + const same = leftRange.from === rightRange.from && leftRange.to === rightRange.to + if (!same) { + differingDimensions += 1 + if (leftRange.to !== rightRange.from) return false + } + } + + return differingDimensions <= 1 +} + +function getCandidateDimensions(sortKeys: SortKeyInfo[], slice: PartitionSlice): number[] { + return sortKeys + .map((sortKey, index) => ({ + index, + priority: getDimensionPriority(sortKey.category, slice.isHotKey, slice.hotDimensionIndex, index), + })) + .sort((left, right) => left.priority - right.priority) + .map((candidate) => candidate.index) +} + +function getDimensionPriority( + category: SortKeyInfo['category'], + isHotKey: boolean, + hotDimensionIndex: number | undefined, + dimensionIndex: number, +): number { + if (isHotKey && hotDimensionIndex === dimensionIndex) return 100 + if (category === 'string') return 0 + if (category === 'datetime') return 1 + return 2 +} + +function getSliceRange(slice: Pick, dimensionIndex: number): SliceRange { + return slice.ranges.find((range) => range.dimensionIndex === dimensionIndex) + ?? { dimensionIndex, from: undefined, to: undefined } +} + +function replaceSliceRange( + slice: Pick, + dimensionIndex: number, + from: string | undefined, + to: string | undefined, +): SliceRange[] { + return slice.ranges + .filter((range) => range.dimensionIndex !== dimensionIndex) + .concat([{ dimensionIndex, from, to }]) + .sort((left, right) => left.dimensionIndex - right.dimensionIndex) +} + +function isExactSliceRange(range: Pick): boolean { + if (range.from === undefined || range.to === undefined) return false + return range.to === `${range.from}\0` +} + +function findHotIdentity( + slice: PartitionSlice, + sortKeys: SortKeyInfo[], +): { dimensionIndex: number; value: string } | undefined { + for (const range of slice.ranges) { + const sortKey = sortKeys[range.dimensionIndex] + if (sortKey?.category !== 'string') continue + if (isExactSliceRange(range) && range.from !== undefined) { + return { dimensionIndex: range.dimensionIndex, value: range.from } + } + } +} + +function applyHotIdentity( + slices: PartitionSlice[], + hotIdentity: { dimensionIndex: number; value: string } | undefined, +): PartitionSlice[] { + if (!hotIdentity) return slices + return slices.map((slice) => markHotSlice(slice, hotIdentity)) +} + +function markHotSlice( + slice: PartitionSlice, + hotIdentity: { dimensionIndex: number; value: string } | undefined, +): PartitionSlice { + if (!hotIdentity) return slice + return { + ...slice, + isHotKey: true, + hotDimensionIndex: hotIdentity.dimensionIndex, + hotKeyValue: hotIdentity.value, + } +} + +function isEffectiveSplit(parent: PartitionSlice, children: PartitionSlice[]): boolean { + if (children.length <= 1) return false + return children.some((child) => + child.estimatedRows !== parent.estimatedRows || + JSON.stringify(child.ranges) !== JSON.stringify(parent.ranges) + ) +} + +function toExclusiveUpperBound(value: string, sortKey: SortKeyInfo): string { + if (sortKey.category === 'string') return `${value}\0` + if (sortKey.category === 'datetime') return new Date(parsePlannerDateTime(value) + 1000).toISOString() + return String(Number(value) + 1) +} + +function getPartitionEndExclusive(partition: PartitionInfo): string { + return new Date(parsePlannerDateTime(partition.maxTime) + 1000).toISOString() +} + +function deriveChunkWindow( + ranges: SliceRange[], + sortKeys: SortKeyInfo[], + partition: PartitionInfo | undefined, +): { from: string; to: string } { + for (const range of ranges) { + const sortKey = sortKeys[range.dimensionIndex] + if (sortKey?.category !== 'datetime') continue + return { + from: range.from ?? partition?.minTime ?? '', + to: range.to ?? partition?.maxTime ?? '', + } + } + + return { + from: partition?.minTime ?? '', + to: partition?.maxTime ?? '', + } +} + +async function countRows(context: QueryContext, partitionId: string, ranges: SliceRange[]): Promise { + const rows = await context.query<{ cnt: string }>(` +SELECT count() AS cnt +FROM ${context.database}.${context.table} +WHERE ${buildWhereClause(partitionId, ranges, context.sortKeys)}`) + return Number(rows[0]?.cnt ?? 0) +} + +function buildWhereClause(partitionId: string, ranges: SliceRange[], sortKeys: SortKeyInfo[]): string { + const conditions = [`_partition_id = ${quoteSqlString(partitionId)}`] + + for (const range of ranges) { + const sortKey = sortKeys[range.dimensionIndex] + if (!sortKey) continue + if (range.from !== undefined) conditions.push(`${sortKey.column} >= ${formatBound(range.from, sortKey)}`) + if (range.to !== undefined) conditions.push(`${sortKey.column} < ${formatBound(range.to, sortKey)}`) + } + + return conditions.join('\n AND ') +} + +function quoteSqlString(value: string): string { + return `'${value.replaceAll('\\', '\\\\').replaceAll('\'', '\\\'')}'` +} + +function formatBound(value: string, sortKey: SortKeyInfo): string { + if (sortKey.category === 'datetime') { + return `parseDateTimeBestEffort(${quoteSqlString(value)})` + } + if (sortKey.category === 'string') { + return `unhex('${Buffer.from(value, 'latin1').toString('hex')}')` + } + return value +} + +function buildEvenlySpacedBoundaries( + from: string, + to: string, + subCount: number, + sortKey: SortKeyInfo, +): string[] { + if (sortKey.category === 'datetime') { + const start = parsePlannerDateTime(from) + const end = parsePlannerDateTime(to) + return Array.from({ length: subCount + 1 }, (_, index) => + new Date(start + Math.floor(((end - start) * index) / subCount)).toISOString() + ) + } + + if (sortKey.category === 'numeric') { + const start = Number(from) + const end = Number(to) + return Array.from({ length: subCount + 1 }, (_, index) => + String(start + Math.floor(((end - start) * index) / subCount)) + ) + } + + const start = strToBigInt(from, 8) + const end = strToBigInt(to, 8) + return Array.from({ length: subCount + 1 }, (_, index) => + bigIntToStr(start + ((end - start) * BigInt(index)) / BigInt(subCount), 8) + ) +} + +function parsePlannerDateTime(value: string): number { + const normalized = value.includes('T') ? value : value.replace(' ', 'T') + return Date.parse(normalized.endsWith('Z') ? normalized : `${normalized}Z`) +} + +function strToBigInt(value: string, padTo: number): bigint { + const buffer = Buffer.from(value, 'latin1') + let result = 0n + for (let index = 0; index < padTo; index++) { + const byte = index < buffer.length ? (buffer[index] ?? 0) : 0 + result = (result << 8n) | BigInt(byte) + } + return result +} + +function bigIntToStr(value: bigint, length: number): string { + const buffer = Buffer.alloc(length) + let remaining = value + for (let index = length - 1; index >= 0; index--) { + buffer[index] = Number(remaining & 0xffn) + remaining >>= 8n + } + return buffer.toString('latin1') +} + +function compareBinaryStrings(left: string, right: string): number { + return Buffer.from(left, 'latin1').compare(Buffer.from(right, 'latin1')) +} + +function minBinaryString(left: string, right: string): string { + return compareBinaryStrings(left, right) <= 0 ? left : right +} + +function maxBinaryString(left: string, right: string): string { + return compareBinaryStrings(left, right) >= 0 ? left : right +} + +function nextPrefixValue(prefix: string): string | undefined { + if (prefix === '') return undefined + + const buffer = Buffer.from(prefix, 'latin1') + for (let index = buffer.length - 1; index >= 0; index--) { + const byte = buffer[index] + if (byte === undefined) continue + if (byte === 0xff) continue + + const next = Buffer.from(buffer.subarray(0, index + 1)) + next[index] = (next[index] ?? 0) + 1 + return next.toString('latin1') + } + + return undefined +} diff --git a/packages/plugin-backfill/src/chunking/introspect.ts b/packages/plugin-backfill/src/chunking/introspect.ts index e383f6f..a9f8e0e 100644 --- a/packages/plugin-backfill/src/chunking/introspect.ts +++ b/packages/plugin-backfill/src/chunking/introspect.ts @@ -35,6 +35,7 @@ export async function queryPartitionInfo(input: { partition_id: string total_rows: string total_bytes: string + total_uncompressed_bytes?: string min_time: string max_time: string }>( @@ -42,6 +43,7 @@ export async function queryPartitionInfo(input: { partition_id, toString(sum(rows)) AS total_rows, toString(sum(bytes_on_disk)) AS total_bytes, + toString(sum(data_uncompressed_bytes)) AS total_uncompressed_bytes, toString(min(min_time)) AS min_time, toString(max(max_time)) AS max_time FROM system.parts @@ -57,6 +59,7 @@ SETTINGS select_sequential_consistency = 1` partitionId: row.partition_id, rows: Number(row.total_rows), bytesOnDisk: Number(row.total_bytes), + bytesUncompressed: Number(row.total_uncompressed_bytes ?? row.total_bytes), minTime: new Date(row.min_time).toISOString(), maxTime: new Date(row.max_time).toISOString(), })) @@ -68,39 +71,57 @@ SETTINGS select_sequential_consistency = 1` }) } -export async function querySortKeyInfo(input: { +function extractSortKeyColumns(sortingKey: string): string[] { + return sortingKey + .split(',') + .map((part) => part.trim()) + .map((part) => { + if (!part) return undefined + const match = part.match(/^\w+\((\w+)\)$/) + return match ? match[1] : part + }) + .filter((part): part is string => Boolean(part && part.length > 0)) +} + +export async function querySortKeys(input: { database: string table: string query: (sql: string) => Promise -}): Promise { +}): Promise { const tableRows = await input.query<{ sorting_key: string }>( `SELECT sorting_key FROM system.tables WHERE database = '${input.database}' AND name = '${input.table}'` ) const sortingKey = tableRows[0]?.sorting_key - if (!sortingKey) return undefined + if (!sortingKey) return [] - // Parse first column from sorting key (may have expressions like "toDate(event_time)") - const firstColumn = sortingKey.split(',')[0]?.trim() - if (!firstColumn) return undefined + const columnNames = extractSortKeyColumns(sortingKey) + if (columnNames.length === 0) return [] - // If it's a function call like toDate(col), extract the column name - const match = firstColumn.match(/^\w+\((\w+)\)$/) - const columnName = match ? match[1] : firstColumn - if (!columnName) return undefined - - const columnRows = await input.query<{ type: string }>( - `SELECT type FROM system.columns WHERE database = '${input.database}' AND table = '${input.table}' AND name = '${columnName}'` + const inList = columnNames.map((name) => `'${name}'`).join(', ') + const columnRows = await input.query<{ name?: string; type: string }>( + `SELECT name, type FROM system.columns WHERE database = '${input.database}' AND table = '${input.table}' AND name IN (${inList})` + ) + const typeByName = new Map( + columnRows.map((row, index) => [row.name ?? columnNames[index] ?? columnNames[0], row.type]) ) - const type = columnRows[0]?.type - if (!type) return undefined + return columnNames.map((column) => { + const type = typeByName.get(column) ?? 'String' + return { + column, + type, + category: classifySortKeyType(type), + } + }) +} - return { - column: columnName, - type, - category: classifySortKeyType(type), - } +export async function querySortKeyInfo(input: { + database: string + table: string + query: (sql: string) => Promise +}): Promise { + return (await querySortKeys(input))[0] } export async function querySortKeyRanges(input: { @@ -134,13 +155,13 @@ export async function introspectTable(input: { from?: string to?: string query: (sql: string) => Promise -}): Promise<{ partitions: PartitionInfo[]; sortKey?: SortKeyInfo }> { +}): Promise<{ partitions: PartitionInfo[]; sortKey?: SortKeyInfo; sortKeys: SortKeyInfo[] }> { const partitions = await queryPartitionInfo(input) - const sortKey = await querySortKeyInfo({ + const sortKeys = await querySortKeys({ database: input.database, table: input.table, query: input.query, }) - return { partitions, sortKey } + return { partitions, sortKey: sortKeys[0], sortKeys } } diff --git a/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts b/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts new file mode 100644 index 0000000..7176994 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts @@ -0,0 +1,420 @@ +import { describe, expect, test } from 'bun:test' + +import { analyzeAndChunk } from './analyze.js' +import { buildChunkSql } from './sql.js' +import type { SortKeyInfo } from './types.js' + +const MiB = 1024 ** 2 + +type RowValue = string | number + +interface FixtureRow { + _partition_id: string + event_time: string + [key: string]: RowValue +} + +function isoAt(day: number, hour: number, minute = 0): string { + return new Date(Date.UTC(2026, 0, day, hour, minute, 0)).toISOString() +} + +function createFixtureQuery(input: { + database: string + table: string + rows: FixtureRow[] + sortKeys: Array<{ column: string; type: string }> + bytesPerRow?: number + uncompressedBytesPerRow?: number +}) { + const bytesPerRow = input.bytesPerRow ?? 1024 + const uncompressedBytesPerRow = input.uncompressedBytesPerRow ?? bytesPerRow * 2 + + return async function query(sql: string): Promise { + if (sql.includes(`SELECT 1 FROM ${input.database}.${input.table} LIMIT 1`)) { + return [{ ok: 1 }] as T[] + } + + if (sql.includes('FROM system.parts')) { + const partitions = summarizePartitions(input.rows, bytesPerRow, uncompressedBytesPerRow) + return partitions as T[] + } + + if (sql.includes('FROM system.tables')) { + return [{ sorting_key: input.sortKeys.map((key) => key.column).join(', ') }] as T[] + } + + if (sql.includes('FROM system.columns')) { + return input.sortKeys.map((key) => ({ name: key.column, type: key.type })) as T[] + } + + const filteredRows = filterRows(sql, input.rows) + + if (sql.includes('substring(')) { + const match = sql.match(/substring\((\w+), 1, (\d+)\) AS prefix/) + const column = match?.[1] + const depth = Number(match?.[2] ?? 0) + if (!column || depth <= 0) return [] as T[] + + const grouped = new Map() + for (const row of filteredRows) { + const value = String(row[column] ?? '') + const prefix = Buffer.from(value, 'latin1').subarray(0, depth).toString('latin1') + grouped.set(prefix, (grouped.get(prefix) ?? 0) + 1) + } + + return Array.from(grouped.entries()) + .sort(([left], [right]) => compareLatin1(left, right)) + .map(([prefix, cnt]) => ({ prefix, cnt: String(cnt) })) as T[] + } + + if (sql.includes('formatDateTime(toStartOfDay(') || sql.includes('formatDateTime(toStartOfHour(')) { + const grain = sql.includes('toStartOfDay(') ? 'day' : 'hour' + const columnMatch = sql.match(/toStartOf(?:Day|Hour)\((\w+)\)/) + const column = columnMatch?.[1] + if (!column) return [] as T[] + + const grouped = new Map() + for (const row of filteredRows) { + const bucket = grain === 'day' + ? toStartOfDay(String(row[column])) + : toStartOfHour(String(row[column])) + grouped.set(bucket, (grouped.get(bucket) ?? 0) + 1) + } + + return Array.from(grouped.entries()) + .sort(([left], [right]) => left.localeCompare(right)) + .map(([bucket, cnt]) => ({ bucket, cnt: String(cnt) })) as T[] + } + + if (sql.includes('toString(min(') && sql.includes('toString(max(')) { + const match = sql.match(/toString\(min\((\w+)\)\) AS minVal,\s+toString\(max\(\1\)\) AS maxVal/s) + const column = match?.[1] + if (!column || filteredRows.length === 0) return [] as T[] + + const values = filteredRows.map((row) => row[column]).filter((value) => value !== undefined) + if (values.length === 0) return [] as T[] + + return [{ + minVal: formatValueForMinMax(values.reduce((current, candidate) => compareValues(candidate, current) < 0 ? candidate : current)), + maxVal: formatValueForMinMax(values.reduce((current, candidate) => compareValues(candidate, current) > 0 ? candidate : current)), + }] as T[] + } + + if (sql.includes('SELECT count() AS cnt')) { + return [{ cnt: String(filteredRows.length) }] as T[] + } + + return [] as T[] + } +} + +function summarizePartitions(rows: FixtureRow[], bytesPerRow: number, uncompressedBytesPerRow: number) { + const byPartition = new Map() + for (const row of rows) { + const list = byPartition.get(row._partition_id) + if (list) list.push(row) + else byPartition.set(row._partition_id, [row]) + } + + return Array.from(byPartition.entries()) + .sort(([left], [right]) => left.localeCompare(right)) + .map(([partitionId, partitionRows]) => ({ + partition_id: partitionId, + total_rows: String(partitionRows.length), + total_bytes: String(partitionRows.length * bytesPerRow), + total_uncompressed_bytes: String(partitionRows.length * uncompressedBytesPerRow), + min_time: String(partitionRows.reduce((min, row) => row.event_time < min ? row.event_time : min, partitionRows[0]?.event_time ?? '')), + max_time: String(partitionRows.reduce((max, row) => row.event_time > max ? row.event_time : max, partitionRows[0]?.event_time ?? '')), + })) +} + +function filterRows(sql: string, rows: FixtureRow[]): FixtureRow[] { + const whereMatch = sql.match(/WHERE\s+([\s\S]*?)(?:GROUP BY|ORDER BY|SETTINGS|$)/i) + if (!whereMatch?.[1]) return rows + + const clauses = whereMatch[1] + .split(/\s+AND\s+/) + .map((clause) => clause.replace(/\s+/g, ' ').trim()) + .filter(Boolean) + + return rows.filter((row) => clauses.every((clause) => evaluateClause(clause, row))) +} + +function evaluateClause(clause: string, row: FixtureRow): boolean { + let match = clause.match(/^_partition_id = '([^']+)'$/) + if (match) return row._partition_id === match[1] + + match = clause.match(/^(\w+) >= parseDateTimeBestEffort\('([^']+)'\)$/) + if (match) return Date.parse(String(row[match[1]])) >= Date.parse(match[2]) + + match = clause.match(/^(\w+) < parseDateTimeBestEffort\('([^']+)'\)$/) + if (match) return Date.parse(String(row[match[1]])) < Date.parse(match[2]) + + match = clause.match(/^(\w+) >= unhex\('([0-9a-f]+)'\)$/i) + if (match) return compareLatin1(String(row[match[1]] ?? ''), Buffer.from(match[2], 'hex').toString('latin1')) >= 0 + + match = clause.match(/^(\w+) < unhex\('([0-9a-f]+)'\)$/i) + if (match) return compareLatin1(String(row[match[1]] ?? ''), Buffer.from(match[2], 'hex').toString('latin1')) < 0 + + match = clause.match(/^(\w+) >= '([^']+)'$/) + if (match) return comparePrimitive(row[match[1]], match[2]) >= 0 + + match = clause.match(/^(\w+) < '([^']+)'$/) + if (match) return comparePrimitive(row[match[1]], match[2]) < 0 + + match = clause.match(/^(\w+) >= (-?\d+(?:\.\d+)?)$/) + if (match) return Number(row[match[1]]) >= Number(match[2]) + + match = clause.match(/^(\w+) < (-?\d+(?:\.\d+)?)$/) + if (match) return Number(row[match[1]]) < Number(match[2]) + + throw new Error(`Unsupported test clause: ${clause}`) +} + +function comparePrimitive(left: RowValue | undefined, right: string): number { + if (typeof left === 'number') return left - Number(right) + return String(left ?? '').localeCompare(right) +} + +function compareValues(left: RowValue, right: RowValue): number { + if (typeof left === 'number' && typeof right === 'number') return left - right + return compareLatin1(String(left), String(right)) +} + +function formatValueForMinMax(value: RowValue): string { + return typeof value === 'number' ? String(value) : String(value) +} + +function compareLatin1(left: string, right: string): number { + return Buffer.from(left, 'latin1').compare(Buffer.from(right, 'latin1')) +} + +function toStartOfDay(value: string): string { + const date = new Date(value) + return new Date(Date.UTC(date.getUTCFullYear(), date.getUTCMonth(), date.getUTCDate(), 0, 0, 0)).toISOString() +} + +function toStartOfHour(value: string): string { + const date = new Date(value) + return new Date(Date.UTC( + date.getUTCFullYear(), + date.getUTCMonth(), + date.getUTCDate(), + date.getUTCHours(), + 0, + 0, + )).toISOString() +} + +async function planFixture(input: { + rows: FixtureRow[] + sortKeys: Array<{ column: string; type: string }> + maxChunkBytes: number +}) { + const query = createFixtureQuery({ + database: 'app', + table: 'events', + rows: input.rows, + sortKeys: input.sortKeys, + }) + + return analyzeAndChunk({ + database: 'app', + table: 'events', + maxChunkBytes: input.maxChunkBytes, + requireIdempotencyToken: true, + query, + }) +} + +function strategyIds(chunk: { lineage?: Array<{ strategyId: string }> }): string[] { + return chunk.lineage?.map((step) => step.strategyId) ?? [] +} + +function buildSqlForChunk(chunk: Awaited>['chunks'][number], sortKeys: SortKeyInfo[]) { + return buildChunkSql({ + planId: 'fixture-plan', + chunk, + target: 'app.events', + sortKey: sortKeys[0], + sortKeys, + }) +} + +function requireChunk(value: T | undefined, label: string): T { + if (value === undefined) { + throw new Error(`Missing expected chunk: ${label}`) + } + return value +} + +describe('smart chunking integration', () => { + test('keeps small partitions as a single metadata chunk', async () => { + const rows = Array.from({ length: 12 }, (_, index) => ({ + _partition_id: 'p_small', + event_time: isoAt(1, index), + id: index, + })) + + const result = await planFixture({ + rows, + sortKeys: [{ column: 'id', type: 'UInt64' }], + maxChunkBytes: 64 * MiB, + }) + + expect(result.chunks).toHaveLength(1) + expect(result.chunks[0]?.estimateReason).toBe('partition-metadata') + expect(strategyIds(result.chunks[0] ?? {})).toHaveLength(0) + }) + + test('uses quantile range splitting for wide numeric distributions', async () => { + const rows = Array.from({ length: 120 }, (_, index) => ({ + _partition_id: 'p_quantile', + event_time: isoAt(2, index % 24), + id: index, + })) + + const result = await planFixture({ + rows, + sortKeys: [{ column: 'id', type: 'UInt64' }], + maxChunkBytes: 30 * 1024, + }) + + expect(result.chunks.length).toBeGreaterThanOrEqual(3) + expect(result.chunks.every((chunk) => strategyIds(chunk).includes('quantile-range-split'))).toBe(true) + const estimatedRows = result.chunks.map((chunk) => chunk.estimatedRows ?? 0) + expect(Math.max(...estimatedRows) - Math.min(...estimatedRows)).toBeLessThanOrEqual(4) + }) + + test('falls back to equal-width splitting when quantile boundaries collapse', async () => { + const rows = Array.from({ length: 80 }, (_, index) => ({ + _partition_id: 'p_equal', + event_time: isoAt(3, index % 24), + id: 100 + (index % 2), + })) + + const result = await planFixture({ + rows, + sortKeys: [{ column: 'id', type: 'UInt64' }], + maxChunkBytes: 20 * 1024, + }) + + expect(result.chunks.length).toBeGreaterThan(1) + expect(result.chunks.some((chunk) => strategyIds(chunk).includes('equal-width-split'))).toBe(true) + }) + + test('uses string-prefix splitting for string-distributed partitions', async () => { + const rows: FixtureRow[] = [] + for (const prefix of ['apple', 'apricot', 'banana', 'berry', 'citrus']) { + for (let index = 0; index < 24; index++) { + rows.push({ + _partition_id: 'p_string', + event_time: isoAt(4, index % 24), + slug: `${prefix}-${index.toString().padStart(2, '0')}`, + }) + } + } + + const result = await planFixture({ + rows, + sortKeys: [{ column: 'slug', type: 'String' }], + maxChunkBytes: 24 * 1024, + }) + + expect(result.chunks.length).toBeGreaterThan(2) + expect(result.chunks.some((chunk) => strategyIds(chunk).includes('string-prefix-split'))).toBe(true) + + const sql = buildSqlForChunk(requireChunk(result.chunks[0], 'string-prefix first chunk'), result.sortKeys) + expect(sql).toContain("unhex('") + }) + + test('combines string-prefix and temporal splitting for hot-key time windows', async () => { + const rows: FixtureRow[] = [] + + for (let day = 1; day <= 3; day++) { + for (let hour = 0; hour < 24; hour++) { + rows.push({ + _partition_id: 'p_combo_temporal', + event_time: isoAt(10 + day, hour), + user_id: 'hot', + score: 1000 + day * 24 + hour, + }) + } + } + + for (let index = 0; index < 18; index++) { + rows.push({ + _partition_id: 'p_combo_temporal', + event_time: isoAt(10, index), + user_id: `cold-${index}`, + score: index, + }) + } + + const result = await planFixture({ + rows, + sortKeys: [ + { column: 'user_id', type: 'String' }, + { column: 'event_time', type: 'DateTime' }, + ], + maxChunkBytes: 18 * 1024, + }) + + const hotChunks = result.chunks.filter((chunk) => + strategyIds(chunk).includes('temporal-bucket-split') && + (chunk.ranges?.some((range) => range.dimensionIndex === 0) ?? false) && + (chunk.ranges?.some((range) => range.dimensionIndex === 1) ?? false) + ) + + expect(hotChunks.length).toBeGreaterThan(0) + expect(hotChunks.every((chunk) => chunk.isHotKey || (chunk.hotKeyValue !== undefined))).toBe(true) + + const sql = buildSqlForChunk(requireChunk(hotChunks[0], 'temporal combo chunk'), result.sortKeys) + expect(sql).toContain('user_id >=') + expect(sql).toContain('event_time >=') + expect(sql).toContain('parseDateTimeBestEffort') + }) + + test('combines string-prefix and quantile splitting on secondary numeric dimensions', async () => { + const rows: FixtureRow[] = [] + + for (let index = 0; index < 96; index++) { + rows.push({ + _partition_id: 'p_combo_numeric', + event_time: isoAt(20, index % 24), + account: 'vip', + seq: index, + }) + } + + for (let index = 0; index < 24; index++) { + rows.push({ + _partition_id: 'p_combo_numeric', + event_time: isoAt(20, index % 24), + account: `free-${index}`, + seq: index, + }) + } + + const result = await planFixture({ + rows, + sortKeys: [ + { column: 'account', type: 'String' }, + { column: 'seq', type: 'UInt64' }, + ], + maxChunkBytes: 24 * 1024, + }) + + const comboChunks = result.chunks.filter((chunk) => + strategyIds(chunk).includes('quantile-range-split') && + (chunk.ranges?.some((range) => range.dimensionIndex === 0) ?? false) && + (chunk.ranges?.some((range) => range.dimensionIndex === 1) ?? false) + ) + + expect(comboChunks.length).toBeGreaterThan(0) + + const sql = buildSqlForChunk(requireChunk(comboChunks[0], 'numeric combo chunk'), result.sortKeys) + expect(sql).toContain('account >=') + expect(sql).toContain("seq >= '") + }) +}) diff --git a/packages/plugin-backfill/src/chunking/sql.ts b/packages/plugin-backfill/src/chunking/sql.ts index e6b7458..0475815 100644 --- a/packages/plugin-backfill/src/chunking/sql.ts +++ b/packages/plugin-backfill/src/chunking/sql.ts @@ -7,17 +7,45 @@ function buildSettingsClause(token: string): string { return `SETTINGS async_insert=0` } -function buildSortKeyCondition( - sortKeyColumn: string, - category: SortKeyInfo['category'], - from: string, - to: string, -): string { - if (category === 'datetime') { - return ` AND ${sortKeyColumn} >= parseDateTimeBestEffort('${from}')\n AND ${sortKeyColumn} < parseDateTimeBestEffort('${to}')` +function quoteSqlString(value: string): string { + return `'${value.replaceAll('\\', '\\\\').replaceAll('\'', '\\\'')}'` +} + +function formatBound(value: string, sortKey: SortKeyInfo): string { + if (sortKey.category === 'datetime') { + return `parseDateTimeBestEffort(${quoteSqlString(value)})` + } + if (sortKey.category === 'string') { + return `unhex('${Buffer.from(value, 'latin1').toString('hex')}')` } - // numeric and string use direct comparison - return ` AND ${sortKeyColumn} >= '${from}'\n AND ${sortKeyColumn} < '${to}'` + return quoteSqlString(value) +} + +function buildChunkConditions(chunk: PlannedChunk, sortKeys: SortKeyInfo[]): string[] { + if (chunk.ranges?.length) { + return chunk.ranges.flatMap((range) => { + const sortKey = sortKeys[range.dimensionIndex] + if (!sortKey) return [] + + const conditions: string[] = [] + if (range.from !== undefined) { + conditions.push(`${sortKey.column} >= ${formatBound(range.from, sortKey)}`) + } + if (range.to !== undefined) { + conditions.push(`${sortKey.column} < ${formatBound(range.to, sortKey)}`) + } + return conditions + }) + } + + if (chunk.sortKeyFrom !== undefined && chunk.sortKeyTo !== undefined && sortKeys[0]) { + return [ + `${sortKeys[0].column} >= ${formatBound(chunk.sortKeyFrom, sortKeys[0])}`, + `${sortKeys[0].column} < ${formatBound(chunk.sortKeyTo, sortKeys[0])}`, + ] + } + + return [] } export function buildChunkSql(input: { @@ -25,24 +53,21 @@ export function buildChunkSql(input: { chunk: PlannedChunk target: string sortKey?: SortKeyInfo + sortKeys?: SortKeyInfo[] mvAsQuery?: string targetColumns?: string[] }): string { const header = `/* chkit backfill plan=${input.planId} chunk=${input.chunk.id} token=${input.chunk.idempotencyToken} */` const settings = buildSettingsClause(input.chunk.idempotencyToken) const { chunk } = input + const sortKeys = input.sortKeys ?? (input.sortKey ? [input.sortKey] : []) + const chunkConditions = buildChunkConditions(chunk, sortKeys) if (input.mvAsQuery) { // MV replay: inject partition + sort key filters into the MV's AS query let filtered = injectPartitionFilter(input.mvAsQuery, chunk.partitionId) - if (chunk.sortKeyFrom !== undefined && chunk.sortKeyTo !== undefined && input.sortKey) { - filtered = injectSortKeyFilter( - filtered, - input.sortKey.column, - input.sortKey.category, - chunk.sortKeyFrom, - chunk.sortKeyTo, - ) + for (const condition of chunkConditions) { + filtered = injectWhereCondition(filtered, condition) } if (input.targetColumns?.length) { filtered = rewriteSelectColumns(filtered, input.targetColumns) @@ -59,13 +84,8 @@ export function buildChunkSql(input: { `WHERE _partition_id = '${chunk.partitionId}'`, ] - if (chunk.sortKeyFrom !== undefined && chunk.sortKeyTo !== undefined && input.sortKey) { - lines.push(buildSortKeyCondition( - input.sortKey.column, - input.sortKey.category, - chunk.sortKeyFrom, - chunk.sortKeyTo, - )) + for (const condition of chunkConditions) { + lines.push(` AND ${condition}`) } lines.push(settings) @@ -88,9 +108,11 @@ export function injectSortKeyFilter( ): string { let condition: string if (category === 'datetime') { - condition = `${sortKeyColumn} >= parseDateTimeBestEffort('${from}')\n AND ${sortKeyColumn} < parseDateTimeBestEffort('${to}')` + condition = `${sortKeyColumn} >= parseDateTimeBestEffort(${quoteSqlString(from)})\n AND ${sortKeyColumn} < parseDateTimeBestEffort(${quoteSqlString(to)})` + } else if (category === 'string') { + condition = `${sortKeyColumn} >= unhex('${Buffer.from(from, 'latin1').toString('hex')}')\n AND ${sortKeyColumn} < unhex('${Buffer.from(to, 'latin1').toString('hex')}')` } else { - condition = `${sortKeyColumn} >= '${from}'\n AND ${sortKeyColumn} < '${to}'` + condition = `${sortKeyColumn} >= ${quoteSqlString(from)}\n AND ${sortKeyColumn} < ${quoteSqlString(to)}` } return injectWhereCondition(query, condition) } diff --git a/packages/plugin-backfill/src/chunking/types.ts b/packages/plugin-backfill/src/chunking/types.ts index 0c76952..220da04 100644 --- a/packages/plugin-backfill/src/chunking/types.ts +++ b/packages/plugin-backfill/src/chunking/types.ts @@ -2,6 +2,7 @@ export interface PartitionInfo { partitionId: string rows: number bytesOnDisk: number + bytesUncompressed?: number minTime: string maxTime: string } @@ -12,20 +13,70 @@ export interface SortKeyInfo { category: 'numeric' | 'datetime' | 'string' } +export interface SliceRange { + dimensionIndex: number + from?: string + to?: string +} + +export interface SliceLineageStep { + strategyId: string + dimensionIndex?: number + reason: string +} + +export type EstimateConfidence = 'high' | 'low' | 'exact' + +export type EstimateReason = + | 'partition-metadata' + | 'quantile-estimate' + | 'string-prefix-distribution' + | 'temporal-distribution' + | 'equal-width-distribution' + | 'exact-count' + export interface ChunkBoundary { partitionId: string + ranges?: SliceRange[] sortKeyFrom?: string sortKeyTo?: string estimatedBytes: number + estimatedRows?: number + isHotKey?: boolean + hotDimensionIndex?: number + hotKeyValue?: string + estimateConfidence?: EstimateConfidence + estimateReason?: EstimateReason + lineage?: SliceLineageStep[] } export interface PlannedChunk { id: string partitionId: string + ranges?: SliceRange[] sortKeyFrom?: string sortKeyTo?: string estimatedBytes: number + estimatedRows?: number idempotencyToken: string from: string to: string + isHotKey?: boolean + hotDimensionIndex?: number + hotKeyValue?: string + estimateConfidence?: EstimateConfidence + estimateReason?: EstimateReason + lineage?: SliceLineageStep[] +} + +export interface PartitionDiagnostics { + partitionId: string + estimatedRowSum: number + exactPartitionRows: number + estimateToExactRatio: number + suspiciousEstimate: boolean + lowConfidenceChunkCount: number + usedDistributionFallback: boolean + usedLowConfidenceChunkRefinement: boolean + usedExactCountFallback: boolean } diff --git a/packages/plugin-backfill/src/index.ts b/packages/plugin-backfill/src/index.ts index 3420da7..c2c8446 100644 --- a/packages/plugin-backfill/src/index.ts +++ b/packages/plugin-backfill/src/index.ts @@ -1,16 +1,6 @@ import './table-config.js' export { backfill, createBackfillPlugin } from './plugin.js' -export { executeBackfill, syncProgress } from './async-backfill.js' -export { analyzeAndChunk } from './chunking/analyze.js' -export type { - BackfillOptions, - BackfillChunkState, - BackfillProgress, - BackfillResult, -} from './async-backfill.js' export type { BackfillPlugin, BackfillPluginOptions, BackfillPluginRegistration } from './types.js' export type { PluginConfig } from './options.js' export type { BackfillTableConfig } from './table-config.js' -export type { AnalyzeAndChunkInput, AnalyzeAndChunkResult } from './chunking/analyze.js' -export type { PlannedChunk, PartitionInfo, SortKeyInfo } from './chunking/types.js' diff --git a/packages/plugin-backfill/src/planner.ts b/packages/plugin-backfill/src/planner.ts index 6f24e02..9f586c6 100644 --- a/packages/plugin-backfill/src/planner.ts +++ b/packages/plugin-backfill/src/planner.ts @@ -36,7 +36,14 @@ export async function buildBackfillPlan(input: { const env = computeEnvironmentFingerprint(input.clickhouse) // 1. Analyze table and build planned chunks - const { planId, partitions, sortKey, chunks: plannedChunks } = await analyzeAndChunk({ + const { + planId, + partitions, + sortKey, + sortKeys, + chunks: plannedChunks, + partitionDiagnostics, + } = await analyzeAndChunk({ database, table, from: opts.from, @@ -88,6 +95,7 @@ export async function buildBackfillPlan(input: { chunk: planned, target: opts.target, sortKey, + sortKeys, mvAsQuery, targetColumns, }) @@ -102,8 +110,16 @@ export async function buildBackfillPlan(input: { sqlTemplate, partitionId: planned.partitionId, estimatedBytes: planned.estimatedBytes, + ...(planned.estimatedRows !== undefined ? { estimatedRows: planned.estimatedRows } : {}), + ...(planned.ranges ? { ranges: planned.ranges } : {}), ...(planned.sortKeyFrom !== undefined ? { sortKeyFrom: planned.sortKeyFrom } : {}), ...(planned.sortKeyTo !== undefined ? { sortKeyTo: planned.sortKeyTo } : {}), + ...(planned.isHotKey !== undefined ? { isHotKey: planned.isHotKey } : {}), + ...(planned.hotDimensionIndex !== undefined ? { hotDimensionIndex: planned.hotDimensionIndex } : {}), + ...(planned.hotKeyValue !== undefined ? { hotKeyValue: planned.hotKeyValue } : {}), + ...(planned.estimateConfidence !== undefined ? { estimateConfidence: planned.estimateConfidence } : {}), + ...(planned.estimateReason !== undefined ? { estimateReason: planned.estimateReason } : {}), + ...(planned.lineage ? { lineage: planned.lineage } : {}), } }) @@ -121,6 +137,8 @@ export async function buildBackfillPlan(input: { chunks, partitions, sortKey, + sortKeys, + partitionDiagnostics, options: { maxChunkBytes: opts.maxChunkBytes, maxParallelChunks: opts.maxParallelChunks, diff --git a/packages/plugin-backfill/src/plugin.test.ts b/packages/plugin-backfill/src/plugin.test.ts index 275fe78..01a85b5 100644 --- a/packages/plugin-backfill/src/plugin.test.ts +++ b/packages/plugin-backfill/src/plugin.test.ts @@ -1,5 +1,7 @@ import { describe, expect, test } from 'bun:test' +import * as sdk from './sdk.js' +import * as root from './index.js' import { backfill, createBackfillPlugin } from './plugin.js' describe('@chkit/plugin-backfill plugin surface', () => { @@ -21,4 +23,13 @@ describe('@chkit/plugin-backfill plugin surface', () => { expect(registration.enabled).toBe(true) expect(registration.options?.maxParallelChunks).toBe(4) }) + + test('keeps internals off the package root and exposes them via sdk', () => { + expect(root).not.toHaveProperty('analyzeAndChunk') + expect(root).not.toHaveProperty('executeBackfill') + + expect(sdk).toHaveProperty('analyzeAndChunk') + expect(sdk).toHaveProperty('executeBackfill') + expect(sdk).toHaveProperty('buildChunkSql') + }) }) diff --git a/packages/plugin-backfill/src/sdk.ts b/packages/plugin-backfill/src/sdk.ts new file mode 100644 index 0000000..0570001 --- /dev/null +++ b/packages/plugin-backfill/src/sdk.ts @@ -0,0 +1,29 @@ +export { executeBackfill, syncProgress } from './async-backfill.js' +export { analyzeAndChunk, analyzeTable, buildPlannedChunks } from './chunking/analyze.js' +export { buildChunkSql, injectSortKeyFilter, rewriteSelectColumns } from './chunking/sql.js' + +export type { + BackfillOptions, + BackfillChunkState, + BackfillProgress, + BackfillResult, +} from './async-backfill.js' + +export type { + AnalyzeAndChunkInput, + AnalyzeAndChunkResult, + AnalyzeTableInput, + AnalyzeTableResult, +} from './chunking/analyze.js' + +export type { + ChunkBoundary, + EstimateConfidence, + EstimateReason, + PartitionDiagnostics, + PartitionInfo, + PlannedChunk, + SliceLineageStep, + SliceRange, + SortKeyInfo, +} from './chunking/types.js' diff --git a/packages/plugin-backfill/src/types.ts b/packages/plugin-backfill/src/types.ts index f3b50da..cf812ea 100644 --- a/packages/plugin-backfill/src/types.ts +++ b/packages/plugin-backfill/src/types.ts @@ -1,7 +1,15 @@ import type { ChxInlinePluginRegistration, ResolvedChxConfig } from '@chkit/core' import type { BackfillProgress } from './async-backfill.js' -import type { PartitionInfo, SortKeyInfo } from './chunking/types.js' +import type { + PartitionDiagnostics, + PartitionInfo, + SliceLineageStep, + SliceRange, + SortKeyInfo, + EstimateConfidence, + EstimateReason, +} from './chunking/types.js' import type { PluginConfig } from './options.js' /** @deprecated Use {@link PluginConfig} instead. */ @@ -29,8 +37,16 @@ export interface BackfillChunk { lastError?: string partitionId: string estimatedBytes: number + estimatedRows?: number + ranges?: SliceRange[] sortKeyFrom?: string sortKeyTo?: string + isHotKey?: boolean + hotDimensionIndex?: number + hotKeyValue?: string + estimateConfidence?: EstimateConfidence + estimateReason?: EstimateReason + lineage?: SliceLineageStep[] } export interface BackfillPlanState { @@ -45,6 +61,8 @@ export interface BackfillPlanState { chunks: BackfillChunk[] partitions?: PartitionInfo[] sortKey?: SortKeyInfo + sortKeys?: SortKeyInfo[] + partitionDiagnostics?: PartitionDiagnostics[] options: { chunkHours?: number maxChunkBytes?: number From 6d224b7e731842106665913485c166ed138f1f1b Mon Sep 17 00:00:00 2001 From: KeKs0r Date: Thu, 2 Apr 2026 00:23:30 +0200 Subject: [PATCH 2/5] fix smart chunking review issues --- packages/plugin-backfill/README.md | 10 ++ .../plugin-backfill/src/chunking/analyze.ts | 38 ++++-- .../src/chunking/introspect.test.ts | 44 +++++- .../src/chunking/introspect.ts | 127 +++++++++++++++--- .../smart-chunking.integration.test.ts | 13 ++ packages/plugin-backfill/src/plugin.test.ts | 10 ++ 6 files changed, 216 insertions(+), 26 deletions(-) diff --git a/packages/plugin-backfill/README.md b/packages/plugin-backfill/README.md index 6feaa72..4b3f42f 100644 --- a/packages/plugin-backfill/README.md +++ b/packages/plugin-backfill/README.md @@ -35,6 +35,16 @@ export default defineConfig({ See the [chkit documentation](https://chkit.obsessiondb.com). +## SDK Internals + +The package root is limited to the plugin registration API. + +Chunk-planning and async execution internals are exposed from the SDK subpath: + +```ts +import { analyzeAndChunk, executeBackfill } from '@chkit/plugin-backfill/sdk' +``` + ## License [MIT](../../LICENSE) diff --git a/packages/plugin-backfill/src/chunking/analyze.ts b/packages/plugin-backfill/src/chunking/analyze.ts index 278d9df..a79e030 100644 --- a/packages/plugin-backfill/src/chunking/analyze.ts +++ b/packages/plugin-backfill/src/chunking/analyze.ts @@ -363,8 +363,18 @@ async function splitSliceWithBoundaries( const slices: PartitionSlice[] = [] for (let index = 0; index < boundaries.length - 1; index++) { - const ranges = replaceSliceRange(slice, dimensionIndex, boundaries[index], boundaries[index + 1]) + const from = boundaries[index] + const to = boundaries[index + 1] + if (from === undefined || to === undefined || from === to) { + continue + } + + const ranges = replaceSliceRange(slice, dimensionIndex, from, to) const estimatedRows = await countRows(context, partition.partitionId, ranges) + if (estimatedRows <= 0) { + continue + } + slices.push(buildSliceFromRows(partition, { ranges, estimatedRows, @@ -516,6 +526,8 @@ function buildTemporalSlices( const slices: PartitionSlice[] = [] let currentStart: string | undefined let currentRows = 0 + const parentRange = getSliceRange(parentSlice, dimensionIndex) + const sliceEnd = parentRange.to ?? getPartitionEndExclusive(partition) for (let index = 0; index < buckets.length; index++) { const bucket = buckets[index] @@ -547,7 +559,7 @@ function buildTemporalSlices( if (index === buckets.length - 1 && currentStart !== undefined) { slices.push(buildSliceFromRows(partition, { - ranges: replaceSliceRange(parentSlice, dimensionIndex, currentStart, getPartitionEndExclusive(partition)), + ranges: replaceSliceRange(parentSlice, dimensionIndex, currentStart, sliceEnd), estimatedRows: currentRows, isHotKey: parentSlice.isHotKey, hotDimensionIndex: parentSlice.hotDimensionIndex, @@ -950,24 +962,34 @@ function buildEvenlySpacedBoundaries( if (sortKey.category === 'datetime') { const start = parsePlannerDateTime(from) const end = parsePlannerDateTime(to) - return Array.from({ length: subCount + 1 }, (_, index) => + return uniqueBoundaries(Array.from({ length: subCount + 1 }, (_, index) => new Date(start + Math.floor(((end - start) * index) / subCount)).toISOString() - ) + )) } if (sortKey.category === 'numeric') { const start = Number(from) const end = Number(to) - return Array.from({ length: subCount + 1 }, (_, index) => + return uniqueBoundaries(Array.from({ length: subCount + 1 }, (_, index) => String(start + Math.floor(((end - start) * index) / subCount)) - ) + )) } const start = strToBigInt(from, 8) const end = strToBigInt(to, 8) - return Array.from({ length: subCount + 1 }, (_, index) => + return uniqueBoundaries(Array.from({ length: subCount + 1 }, (_, index) => bigIntToStr(start + ((end - start) * BigInt(index)) / BigInt(subCount), 8) - ) + )) +} + +function uniqueBoundaries(boundaries: string[]): string[] { + const unique: string[] = [] + for (const boundary of boundaries) { + if (unique[unique.length - 1] !== boundary) { + unique.push(boundary) + } + } + return unique } function parsePlannerDateTime(value: string): number { diff --git a/packages/plugin-backfill/src/chunking/introspect.test.ts b/packages/plugin-backfill/src/chunking/introspect.test.ts index 431872c..b40822b 100644 --- a/packages/plugin-backfill/src/chunking/introspect.test.ts +++ b/packages/plugin-backfill/src/chunking/introspect.test.ts @@ -1,6 +1,6 @@ import { describe, expect, test } from 'bun:test' -import { introspectTable, queryPartitionInfo, querySortKeyInfo, querySortKeyRanges } from './introspect.js' +import { introspectTable, queryPartitionInfo, querySortKeyInfo, querySortKeyRanges, querySortKeys } from './introspect.js' describe('queryPartitionInfo', () => { test('maps system.parts rows to PartitionInfo array', async () => { @@ -132,7 +132,7 @@ describe('querySortKeyInfo', () => { test('returns first column from multi-column sorting key', async () => { const query = async (sql: string) => { if (sql.includes('system.tables')) return [{ sorting_key: 'event_time, id' }] as T[] - if (sql.includes('system.columns')) return [{ type: 'DateTime' }] as T[] + if (sql.includes('system.columns')) return [{ name: 'event_time', type: 'DateTime' }, { name: 'id', type: 'UInt64' }] as T[] return [] as T[] } @@ -140,6 +140,46 @@ describe('querySortKeyInfo', () => { expect(result?.column).toBe('event_time') }) + + test('extracts a single referenced column from function expressions with commas', async () => { + const query = async (sql: string) => { + if (sql.includes('system.tables')) { + return [{ sorting_key: 'toStartOfInterval(ts, INTERVAL 5 MINUTE), user_id' }] as T[] + } + if (sql.includes('system.columns')) { + return [ + { name: 'ts', type: 'DateTime' }, + { name: 'user_id', type: 'String' }, + ] as T[] + } + return [] as T[] + } + + const result = await querySortKeys({ database: 'default', table: 'events', query }) + + expect(result.map((key) => key.column)).toEqual(['ts', 'user_id']) + expect(result.map((key) => key.category)).toEqual(['datetime', 'string']) + }) + + test('skips ambiguous tuple expressions that do not map to one physical column', async () => { + const query = async (sql: string) => { + if (sql.includes('system.tables')) { + return [{ sorting_key: 'tuple(user_id, session_id), event_time' }] as T[] + } + if (sql.includes('system.columns')) { + return [ + { name: 'user_id', type: 'String' }, + { name: 'session_id', type: 'String' }, + { name: 'event_time', type: 'DateTime' }, + ] as T[] + } + return [] as T[] + } + + const result = await querySortKeys({ database: 'default', table: 'events', query }) + + expect(result.map((key) => key.column)).toEqual(['event_time']) + }) }) describe('querySortKeyRanges', () => { diff --git a/packages/plugin-backfill/src/chunking/introspect.ts b/packages/plugin-backfill/src/chunking/introspect.ts index a9f8e0e..16f0c79 100644 --- a/packages/plugin-backfill/src/chunking/introspect.ts +++ b/packages/plugin-backfill/src/chunking/introspect.ts @@ -72,15 +72,92 @@ SETTINGS select_sequential_consistency = 1` } function extractSortKeyColumns(sortingKey: string): string[] { - return sortingKey - .split(',') + return splitTopLevelCsv(sortingKey) .map((part) => part.trim()) - .map((part) => { - if (!part) return undefined - const match = part.match(/^\w+\((\w+)\)$/) - return match ? match[1] : part - }) - .filter((part): part is string => Boolean(part && part.length > 0)) + .filter((part): part is string => part.length > 0) +} + +function splitTopLevelCsv(input: string): string[] { + const parts: string[] = [] + let current = '' + let depth = 0 + let quote: "'" | '"' | undefined + + for (let index = 0; index < input.length; index++) { + const char = input[index] + if (char === undefined) continue + + if (quote) { + current += char + if (char === quote && input[index - 1] !== '\\') { + quote = undefined + } + continue + } + + if (char === '\'' || char === '"') { + quote = char + current += char + continue + } + + if (char === '(') { + depth += 1 + current += char + continue + } + + if (char === ')') { + depth = Math.max(0, depth - 1) + current += char + continue + } + + if (char === ',' && depth === 0) { + parts.push(current.trim()) + current = '' + continue + } + + current += char + } + + if (current.trim().length > 0) { + parts.push(current.trim()) + } + + return parts +} + +function resolveSortKeyColumn(expression: string, knownColumns: Set): string | undefined { + const trimmed = expression.trim() + if (knownColumns.has(trimmed)) { + return trimmed + } + + const identifiers = Array.from(trimmed.matchAll(/\b[A-Za-z_][A-Za-z0-9_]*\b/g)) + .map((match) => match[0]) + .filter((identifier): identifier is string => Boolean(identifier)) + + const matches = Array.from(new Set(identifiers.filter((identifier) => knownColumns.has(identifier)))) + if (matches.length === 1) { + return matches[0] + } + + return undefined +} + +function resolveSortKeyColumnWithoutSchema(expression: string): string | undefined { + const trimmed = expression.trim() + if (/^[A-Za-z_][A-Za-z0-9_]*$/.test(trimmed)) { + return trimmed + } + + const identifiers = Array.from(trimmed.matchAll(/\b[A-Za-z_][A-Za-z0-9_]*\b/g)) + .map((match) => match[0]) + .filter((identifier): identifier is string => Boolean(identifier)) + + return identifiers.length > 0 ? identifiers[identifiers.length - 1] : undefined } export async function querySortKeys(input: { @@ -95,24 +172,42 @@ export async function querySortKeys(input: { const sortingKey = tableRows[0]?.sorting_key if (!sortingKey) return [] - const columnNames = extractSortKeyColumns(sortingKey) - if (columnNames.length === 0) return [] + const expressions = extractSortKeyColumns(sortingKey) + if (expressions.length === 0) return [] - const inList = columnNames.map((name) => `'${name}'`).join(', ') const columnRows = await input.query<{ name?: string; type: string }>( - `SELECT name, type FROM system.columns WHERE database = '${input.database}' AND table = '${input.table}' AND name IN (${inList})` + `SELECT name, type FROM system.columns WHERE database = '${input.database}' AND table = '${input.table}'` ) const typeByName = new Map( - columnRows.map((row, index) => [row.name ?? columnNames[index] ?? columnNames[0], row.type]) + columnRows + .filter((row): row is { name: string; type: string } => Boolean(row.name)) + .map((row) => [row.name, row.type]) ) + const knownColumns = new Set(typeByName.keys()) + + if (knownColumns.size === 0) { + return expressions.flatMap((expression, index) => { + const column = resolveSortKeyColumnWithoutSchema(expression) + const type = columnRows[index]?.type ?? columnRows[0]?.type + if (!column || !type) return [] - return columnNames.map((column) => { + return [{ + column, + type, + category: classifySortKeyType(type), + }] + }) + } + + return expressions.flatMap((expression) => { + const column = resolveSortKeyColumn(expression, knownColumns) + if (!column) return [] const type = typeByName.get(column) ?? 'String' - return { + return [{ column, type, category: classifySortKeyType(type), - } + }] }) } diff --git a/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts b/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts index 7176994..e362f50 100644 --- a/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts +++ b/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts @@ -301,6 +301,10 @@ describe('smart chunking integration', () => { expect(result.chunks.length).toBeGreaterThan(1) expect(result.chunks.some((chunk) => strategyIds(chunk).includes('equal-width-split'))).toBe(true) + expect(result.chunks.every((chunk) => (chunk.estimatedRows ?? 0) > 0)).toBe(true) + expect(result.chunks.every((chunk) => + chunk.ranges?.every((range) => range.from !== range.to) ?? true + )).toBe(true) }) test('uses string-prefix splitting for string-distributed partitions', async () => { @@ -373,6 +377,15 @@ describe('smart chunking integration', () => { expect(sql).toContain('user_id >=') expect(sql).toContain('event_time >=') expect(sql).toContain('parseDateTimeBestEffort') + + const temporalRanges = hotChunks + .map((chunk) => chunk.ranges?.find((range) => range.dimensionIndex === 1)) + .filter((range): range is NonNullable => Boolean(range)) + .sort((left, right) => String(left.from).localeCompare(String(right.from))) + + for (let index = 1; index < temporalRanges.length; index++) { + expect(temporalRanges[index - 1]?.to).toBe(temporalRanges[index]?.from) + } }) test('combines string-prefix and quantile splitting on secondary numeric dimensions', async () => { diff --git a/packages/plugin-backfill/src/plugin.test.ts b/packages/plugin-backfill/src/plugin.test.ts index 01a85b5..3e25054 100644 --- a/packages/plugin-backfill/src/plugin.test.ts +++ b/packages/plugin-backfill/src/plugin.test.ts @@ -32,4 +32,14 @@ describe('@chkit/plugin-backfill plugin surface', () => { expect(sdk).toHaveProperty('executeBackfill') expect(sdk).toHaveProperty('buildChunkSql') }) + + test('package exports resolve root and sdk subpath separately', async () => { + const packageRoot = await import('@chkit/plugin-backfill') + const packageSdk = await import('@chkit/plugin-backfill/sdk') + + expect(packageRoot).toHaveProperty('backfill') + expect(packageRoot).not.toHaveProperty('analyzeAndChunk') + expect(packageSdk).toHaveProperty('analyzeAndChunk') + expect(packageSdk).toHaveProperty('executeBackfill') + }) }) From ab1239e94f276e8ea24bf8d112ad883eb3971d62 Mon Sep 17 00:00:00 2001 From: KeKs0r Date: Thu, 2 Apr 2026 14:53:26 +0200 Subject: [PATCH 3/5] Update Algo --- .../plugin-backfill/src/async-backfill.ts | 4 +- .../plugin-backfill/src/chunking/analyze.ts | 1048 +---------------- .../src/chunking/boundary-codec.ts | 109 ++ .../src/chunking/build.test.ts | 135 --- .../plugin-backfill/src/chunking/build.ts | 60 - .../src/chunking/introspect.test.ts | 274 ----- .../src/chunking/introspect.ts | 262 ----- .../src/chunking/partition-slices.ts | 153 +++ .../plugin-backfill/src/chunking/planner.ts | 358 ++++++ .../chunking/services/distribution-source.ts | 66 ++ .../src/chunking/services/metadata-source.ts | 163 +++ .../src/chunking/services/row-probe.ts | 102 ++ .../smart-chunking.integration.test.ts | 106 +- .../src/chunking/splitter.test.ts | 64 - .../plugin-backfill/src/chunking/splitter.ts | 86 -- packages/plugin-backfill/src/chunking/sql.ts | 385 +++--- .../chunking/strategies/equal-width-split.ts | 67 ++ .../strategies/metadata-single-chunk.ts | 6 + .../strategies/quantile-range-split.ts | 208 ++++ .../src/chunking/strategies/refinement.ts | 128 ++ .../strategies/string-prefix-split.ts | 144 +++ .../strategies/temporal-bucket-split.ts | 117 ++ .../src/chunking/strategy-policy.test.ts | 13 + .../src/chunking/strategy-policy.ts | 8 + .../plugin-backfill/src/chunking/types.ts | 189 ++- .../src/chunking/utils/binary-string.ts | 55 + .../plugin-backfill/src/chunking/utils/ids.ts | 17 + .../src/chunking/utils/ranges.ts | 31 + .../src/partition-planner.test.ts | 185 --- packages/plugin-backfill/src/payload.ts | 10 +- packages/plugin-backfill/src/planner.test.ts | 591 +++------- packages/plugin-backfill/src/planner.ts | 117 +- packages/plugin-backfill/src/plugin.test.ts | 29 +- packages/plugin-backfill/src/plugin.ts | 30 +- packages/plugin-backfill/src/queries.ts | 6 +- packages/plugin-backfill/src/sdk.ts | 18 +- packages/plugin-backfill/src/state.ts | 27 +- packages/plugin-backfill/src/types.ts | 59 +- 38 files changed, 2496 insertions(+), 2934 deletions(-) create mode 100644 packages/plugin-backfill/src/chunking/boundary-codec.ts delete mode 100644 packages/plugin-backfill/src/chunking/build.test.ts delete mode 100644 packages/plugin-backfill/src/chunking/build.ts delete mode 100644 packages/plugin-backfill/src/chunking/introspect.test.ts delete mode 100644 packages/plugin-backfill/src/chunking/introspect.ts create mode 100644 packages/plugin-backfill/src/chunking/partition-slices.ts create mode 100644 packages/plugin-backfill/src/chunking/planner.ts create mode 100644 packages/plugin-backfill/src/chunking/services/distribution-source.ts create mode 100644 packages/plugin-backfill/src/chunking/services/metadata-source.ts create mode 100644 packages/plugin-backfill/src/chunking/services/row-probe.ts delete mode 100644 packages/plugin-backfill/src/chunking/splitter.test.ts delete mode 100644 packages/plugin-backfill/src/chunking/splitter.ts create mode 100644 packages/plugin-backfill/src/chunking/strategies/equal-width-split.ts create mode 100644 packages/plugin-backfill/src/chunking/strategies/metadata-single-chunk.ts create mode 100644 packages/plugin-backfill/src/chunking/strategies/quantile-range-split.ts create mode 100644 packages/plugin-backfill/src/chunking/strategies/refinement.ts create mode 100644 packages/plugin-backfill/src/chunking/strategies/string-prefix-split.ts create mode 100644 packages/plugin-backfill/src/chunking/strategies/temporal-bucket-split.ts create mode 100644 packages/plugin-backfill/src/chunking/strategy-policy.test.ts create mode 100644 packages/plugin-backfill/src/chunking/strategy-policy.ts create mode 100644 packages/plugin-backfill/src/chunking/utils/binary-string.ts create mode 100644 packages/plugin-backfill/src/chunking/utils/ids.ts create mode 100644 packages/plugin-backfill/src/chunking/utils/ranges.ts delete mode 100644 packages/plugin-backfill/src/partition-planner.test.ts diff --git a/packages/plugin-backfill/src/async-backfill.ts b/packages/plugin-backfill/src/async-backfill.ts index f393499..e5acd1e 100644 --- a/packages/plugin-backfill/src/async-backfill.ts +++ b/packages/plugin-backfill/src/async-backfill.ts @@ -7,9 +7,9 @@ export interface BackfillOptions { /** Plan ID used as a namespace in deterministic query IDs */ planId: string /** The chunks to process (from buildChunks) */ - chunks: Array<{ id: string; from: string; to: string; [key: string]: unknown }> + chunks: Array<{ id: string; from?: string; to?: string; [key: string]: unknown }> /** Build the SQL for a given chunk. Called once per chunk at submit time. */ - buildQuery: (chunk: { id: string; from: string; to: string }) => string + buildQuery: (chunk: { id: string; from?: string; to?: string }) => string /** Max concurrent queries running on the server. Default: 3 */ concurrency?: number /** Polling interval in ms. Default: 5000 */ diff --git a/packages/plugin-backfill/src/chunking/analyze.ts b/packages/plugin-backfill/src/chunking/analyze.ts index a79e030..3903b72 100644 --- a/packages/plugin-backfill/src/chunking/analyze.ts +++ b/packages/plugin-backfill/src/chunking/analyze.ts @@ -1,1047 +1,15 @@ -import { hashId, randomPlanId } from '../state.js' +import { generateChunkPlan } from './planner.js' +import type { ChunkPlan, GenerateChunkPlanInput } from './types.js' -import { introspectTable } from './introspect.js' -import type { - ChunkBoundary, - EstimateConfidence, - EstimateReason, - PartitionDiagnostics, - PartitionInfo, - PlannedChunk, - SliceLineageStep, - SliceRange, - SortKeyInfo, -} from './types.js' - -const MAX_SPLIT_DEPTH_MULTIPLIER = 3 -const TARGET_BYTES_FUZZ_FACTOR = 1.15 -const STOP_SPLIT_FUZZ_FACTOR = 1.5 -const STRING_PREFIX_START_DEPTH = 1 -const STRING_PREFIX_MAX_DEPTH = 4 -const BINARY_SEARCH_STEPS = 24 - -interface PartitionSlice { - partitionId: string - ranges: SliceRange[] - estimatedRows: number - estimatedBytes: number - isHotKey: boolean - hotDimensionIndex?: number - hotKeyValue?: string - estimateConfidence: EstimateConfidence - estimateReason: EstimateReason - lineage: SliceLineageStep[] -} - -interface QueryContext { - database: string - table: string - sortKeys: SortKeyInfo[] - query: (sql: string) => Promise -} - -export interface AnalyzeAndChunkInput { - database: string - table: string - from?: string - to?: string - maxChunkBytes: number - requireIdempotencyToken: boolean - query: (sql: string) => Promise -} - -export interface AnalyzeAndChunkResult { - planId: string - partitions: PartitionInfo[] - sortKey?: SortKeyInfo - sortKeys: SortKeyInfo[] - chunks: PlannedChunk[] - partitionDiagnostics: PartitionDiagnostics[] -} +export type AnalyzeAndChunkInput = GenerateChunkPlanInput +export type AnalyzeAndChunkResult = ChunkPlan +export type AnalyzeTableInput = GenerateChunkPlanInput +export type AnalyzeTableResult = ChunkPlan export async function analyzeAndChunk(input: AnalyzeAndChunkInput): Promise { - const { partitions, sortKey, sortKeys, boundaries, partitionDiagnostics } = await analyzeTable({ - database: input.database, - table: input.table, - from: input.from, - to: input.to, - maxChunkBytes: input.maxChunkBytes, - query: input.query, - }) - - const planId = randomPlanId() - - const chunks = buildPlannedChunks({ - planId, - partitions, - sortKeys, - boundaries, - requireIdempotencyToken: input.requireIdempotencyToken, - }) - - return { planId, partitions, sortKey, sortKeys, chunks, partitionDiagnostics } -} - -export interface AnalyzeTableInput { - database: string - table: string - from?: string - to?: string - maxChunkBytes: number - query: (sql: string) => Promise -} - -export interface AnalyzeTableResult { - partitions: PartitionInfo[] - sortKey?: SortKeyInfo - sortKeys: SortKeyInfo[] - boundaries: ChunkBoundary[] - partitionDiagnostics: PartitionDiagnostics[] + return generateChunkPlan(input) } export async function analyzeTable(input: AnalyzeTableInput): Promise { - const { partitions, sortKey, sortKeys } = await introspectTable({ - database: input.database, - table: input.table, - from: input.from, - to: input.to, - query: input.query, - }) - - const context: QueryContext = { - database: input.database, - table: input.table, - sortKeys, - query: input.query, - } - - const boundaries: ChunkBoundary[] = [] - const partitionDiagnostics: PartitionDiagnostics[] = [] - - for (const partition of partitions) { - const slices = await planPartition(context, partition, input.maxChunkBytes) - const merged = mergeAdjacentSlices(slices, input.maxChunkBytes) - - for (const slice of merged) { - const primaryRange = getSliceRange(slice, 0) - boundaries.push({ - partitionId: slice.partitionId, - ranges: slice.ranges, - sortKeyFrom: primaryRange.from, - sortKeyTo: primaryRange.to, - estimatedBytes: slice.estimatedBytes, - estimatedRows: slice.estimatedRows, - isHotKey: slice.isHotKey, - hotDimensionIndex: slice.hotDimensionIndex, - hotKeyValue: slice.hotKeyValue, - estimateConfidence: slice.estimateConfidence, - estimateReason: slice.estimateReason, - lineage: slice.lineage, - }) - } - - const estimatedRowSum = merged.reduce((sum, slice) => sum + slice.estimatedRows, 0) - const estimateToExactRatio = partition.rows > 0 ? estimatedRowSum / partition.rows : 1 - partitionDiagnostics.push({ - partitionId: partition.partitionId, - estimatedRowSum, - exactPartitionRows: partition.rows, - estimateToExactRatio, - suspiciousEstimate: estimateToExactRatio < 0.7 || estimateToExactRatio > 1.3, - lowConfidenceChunkCount: merged.filter((slice) => slice.estimateConfidence === 'low').length, - usedDistributionFallback: merged.some((slice) => - slice.estimateReason === 'string-prefix-distribution' || - slice.estimateReason === 'temporal-distribution' || - slice.estimateReason === 'equal-width-distribution' - ), - usedLowConfidenceChunkRefinement: false, - usedExactCountFallback: false, - }) - } - - return { partitions, sortKey, sortKeys, boundaries, partitionDiagnostics } -} - -export function buildPlannedChunks(input: { - planId: string - partitions: PartitionInfo[] - sortKeys: SortKeyInfo[] - boundaries: ChunkBoundary[] - requireIdempotencyToken: boolean -}): PlannedChunk[] { - const chunks: PlannedChunk[] = [] - const partitionIndex = new Map() - - for (const boundary of input.boundaries) { - const idx = partitionIndex.get(boundary.partitionId) ?? 0 - partitionIndex.set(boundary.partitionId, idx + 1) - - const idSeed = `${input.planId}:${boundary.partitionId}:${idx}` - const chunkId = hashId(`chunk:${idSeed}`).slice(0, 16) - const token = input.requireIdempotencyToken ? hashId(`token:${idSeed}`) : '' - - const partition = input.partitions.find((candidate) => candidate.partitionId === boundary.partitionId) - const { from, to } = deriveChunkWindow(boundary.ranges ?? [], input.sortKeys, partition) - - chunks.push({ - id: chunkId, - partitionId: boundary.partitionId, - ranges: boundary.ranges, - sortKeyFrom: boundary.sortKeyFrom, - sortKeyTo: boundary.sortKeyTo, - estimatedBytes: boundary.estimatedBytes, - estimatedRows: boundary.estimatedRows, - idempotencyToken: token, - from, - to, - isHotKey: boundary.isHotKey, - hotDimensionIndex: boundary.hotDimensionIndex, - hotKeyValue: boundary.hotKeyValue, - estimateConfidence: boundary.estimateConfidence, - estimateReason: boundary.estimateReason, - lineage: boundary.lineage, - }) - } - - return chunks -} - -async function planPartition( - context: QueryContext, - partition: PartitionInfo, - maxChunkBytes: number, -): Promise { - if (partition.bytesOnDisk <= maxChunkBytes || context.sortKeys.length === 0) { - return [buildRootSlice(partition)] - } - - const rootSlice = buildRootSlice(partition) - return splitSliceRecursively(context, partition, rootSlice, maxChunkBytes, 0) -} - -async function splitSliceRecursively( - context: QueryContext, - partition: PartitionInfo, - slice: PartitionSlice, - maxChunkBytes: number, - depth: number, -): Promise { - if (slice.estimatedBytes <= maxChunkBytes * STOP_SPLIT_FUZZ_FACTOR) { - return [slice] - } - - if (depth >= context.sortKeys.length * MAX_SPLIT_DEPTH_MULTIPLIER) { - return [slice] - } - - const children = await splitOversizedSlice(context, partition, slice, maxChunkBytes, depth) - if (children.length <= 1) { - return [slice] - } - - const finalChildren: PartitionSlice[] = [] - for (const child of children) { - finalChildren.push(...await splitSliceRecursively(context, partition, child, maxChunkBytes, depth + 1)) - } - return finalChildren -} - -async function splitOversizedSlice( - context: QueryContext, - partition: PartitionInfo, - slice: PartitionSlice, - maxChunkBytes: number, - depth: number, -): Promise { - for (const dimensionIndex of getCandidateDimensions(context.sortKeys, slice)) { - const preparedSlice = await hydrateSliceRange(context, slice, dimensionIndex) - if (!preparedSlice) continue - - const sortKey = context.sortKeys[dimensionIndex] - if (!sortKey) continue - - const rootLike = depth === 0 - const hotIdentity = findHotIdentity(preparedSlice, context.sortKeys) - - if (sortKey.category === 'string') { - const stringSlices = await splitSliceWithStringPrefixes( - context, - partition, - preparedSlice, - dimensionIndex, - maxChunkBytes, - STRING_PREFIX_START_DEPTH, - ) - if (isEffectiveSplit(preparedSlice, stringSlices)) { - return applyHotIdentity(stringSlices, hotIdentity) - } - } - - if (sortKey.category === 'datetime' && (!rootLike || hotIdentity !== undefined)) { - const temporalSlices = await splitSliceWithTemporalBuckets( - context, - partition, - markHotSlice(preparedSlice, hotIdentity), - dimensionIndex, - maxChunkBytes, - ) - if (isEffectiveSplit(preparedSlice, temporalSlices)) { - return applyHotIdentity(temporalSlices, hotIdentity) - } - } - - const quantileSlices = await splitWithRanges( - context, - partition, - preparedSlice, - dimensionIndex, - maxChunkBytes, - ) - if (isEffectiveSplit(preparedSlice, quantileSlices)) { - return applyHotIdentity(quantileSlices, hotIdentity) - } - } - - return [slice] -} - -async function splitWithRanges( - context: QueryContext, - partition: PartitionInfo, - slice: PartitionSlice, - dimensionIndex: number, - maxChunkBytes: number, -): Promise { - const sortKey = context.sortKeys[dimensionIndex] - const range = getSliceRange(slice, dimensionIndex) - if (!sortKey || range.from === undefined || range.to === undefined) return [slice] - if (sortKey.category === 'string' && isExactSliceRange(range)) return [slice] - - const subCount = Math.ceil(slice.estimatedBytes / maxChunkBytes) - if (subCount <= 1) return [slice] - - const boundaries = await buildQuantileBoundaries(context, slice, dimensionIndex, subCount) - if (boundaries) { - return splitSliceWithBoundaries( - context, - partition, - slice, - dimensionIndex, - boundaries, - 'quantile-range-split', - 'split slice into quantile-aligned ranges', - 'quantile-estimate', - 'high', - ) - } - - const equalWidthBoundaries = buildEvenlySpacedBoundaries(range.from, range.to, subCount, sortKey) - return splitSliceWithBoundaries( - context, - partition, - slice, - dimensionIndex, - equalWidthBoundaries, - 'equal-width-split', - 'fallback to equal-width ranges', - 'equal-width-distribution', - 'low', - ) -} - -async function splitSliceWithBoundaries( - context: QueryContext, - partition: PartitionInfo, - slice: PartitionSlice, - dimensionIndex: number, - boundaries: string[], - strategyId: string, - reason: string, - estimateReason: EstimateReason, - estimateConfidence: EstimateConfidence, -): Promise { - const slices: PartitionSlice[] = [] - - for (let index = 0; index < boundaries.length - 1; index++) { - const from = boundaries[index] - const to = boundaries[index + 1] - if (from === undefined || to === undefined || from === to) { - continue - } - - const ranges = replaceSliceRange(slice, dimensionIndex, from, to) - const estimatedRows = await countRows(context, partition.partitionId, ranges) - if (estimatedRows <= 0) { - continue - } - - slices.push(buildSliceFromRows(partition, { - ranges, - estimatedRows, - isHotKey: false, - hotDimensionIndex: undefined, - hotKeyValue: undefined, - estimateConfidence, - estimateReason, - lineage: slice.lineage.concat([{ strategyId, dimensionIndex, reason }]), - })) - } - - return slices -} - -async function splitSliceWithStringPrefixes( - context: QueryContext, - partition: PartitionInfo, - slice: PartitionSlice, - dimensionIndex: number, - maxChunkBytes: number, - depth: number, -): Promise { - const sortKey = context.sortKeys[dimensionIndex] - const range = getSliceRange(slice, dimensionIndex) - if (!sortKey || sortKey.category !== 'string' || range.from === undefined || range.to === undefined) { - return [] - } - - const rows = await context.query<{ prefix: string; cnt: string }>(` -SELECT - substring(${sortKey.column}, 1, ${depth}) AS prefix, - count() AS cnt -FROM ${context.database}.${context.table} -WHERE ${buildWhereClause(partition.partitionId, replaceSliceRange(slice, dimensionIndex, range.from, range.to), context.sortKeys)} -GROUP BY prefix -ORDER BY prefix`) - - const slices: PartitionSlice[] = [] - - for (const row of rows) { - const bucket = { - value: row.prefix, - rowCount: Number(row.cnt), - isExactValue: Buffer.from(row.prefix, 'latin1').length < depth, - } - if (bucket.rowCount <= 0) continue - - const bucketFrom = maxBinaryString(range.from, bucket.value) - const bucketUpper = bucket.isExactValue ? `${bucket.value}\0` : nextPrefixValue(bucket.value) - if (!bucketUpper) continue - - const bucketTo = minBinaryString(range.to, bucketUpper) - const bucketSlice = buildSliceFromRows(partition, { - ranges: replaceSliceRange(slice, dimensionIndex, bucketFrom, bucketTo), - estimatedRows: bucket.rowCount, - isHotKey: false, - hotDimensionIndex: undefined, - hotKeyValue: undefined, - estimateConfidence: 'high', - estimateReason: 'string-prefix-distribution', - lineage: slice.lineage.concat([{ - strategyId: 'string-prefix-split', - dimensionIndex, - reason: 'split slice using string prefix distribution', - }]), - }) - - if (bucketSlice.estimatedBytes <= maxChunkBytes * TARGET_BYTES_FUZZ_FACTOR) { - slices.push(bucketSlice) - continue - } - - if (!bucket.isExactValue && depth < STRING_PREFIX_MAX_DEPTH) { - slices.push(...await splitSliceWithStringPrefixes( - context, - partition, - bucketSlice, - dimensionIndex, - maxChunkBytes, - depth + 1, - )) - continue - } - - slices.push(bucketSlice) - } - - return slices -} - -async function splitSliceWithTemporalBuckets( - context: QueryContext, - partition: PartitionInfo, - slice: PartitionSlice, - dimensionIndex: number, - maxChunkBytes: number, -): Promise { - const dayBuckets = await probeTemporalBuckets(context, partition.partitionId, slice.ranges, dimensionIndex, 'day') - if (dayBuckets.length === 0) return [slice] - - const daySlices = buildTemporalSlices(partition, slice, dimensionIndex, dayBuckets, maxChunkBytes) - if (daySlices.every((candidate) => candidate.estimatedBytes <= maxChunkBytes * TARGET_BYTES_FUZZ_FACTOR)) { - return daySlices - } - - const hourBuckets = await probeTemporalBuckets(context, partition.partitionId, slice.ranges, dimensionIndex, 'hour') - if (hourBuckets.length === 0) return daySlices - return buildTemporalSlices(partition, slice, dimensionIndex, hourBuckets, maxChunkBytes) -} - -async function probeTemporalBuckets( - context: QueryContext, - partitionId: string, - ranges: SliceRange[], - dimensionIndex: number, - grain: 'day' | 'hour', -): Promise> { - const sortKey = context.sortKeys[dimensionIndex] - if (!sortKey || sortKey.category !== 'datetime') return [] - - const bucketExpression = grain === 'day' - ? `toStartOfDay(${sortKey.column})` - : `toStartOfHour(${sortKey.column})` - - const rows = await context.query<{ bucket: string; cnt: string }>(` -SELECT - formatDateTime(${bucketExpression}, '%Y-%m-%dT%H:%i:%sZ') AS bucket, - count() AS cnt -FROM ${context.database}.${context.table} -WHERE ${buildWhereClause(partitionId, ranges, context.sortKeys)} -GROUP BY bucket -ORDER BY bucket`) - - return rows.map((row) => ({ - start: row.bucket, - rowCount: Number(row.cnt), - })) -} - -function buildTemporalSlices( - partition: PartitionInfo, - parentSlice: PartitionSlice, - dimensionIndex: number, - buckets: Array<{ start: string; rowCount: number }>, - maxChunkBytes: number, -): PartitionSlice[] { - const targetChunkRows = getTargetChunkRows(partition, maxChunkBytes) - const slices: PartitionSlice[] = [] - let currentStart: string | undefined - let currentRows = 0 - const parentRange = getSliceRange(parentSlice, dimensionIndex) - const sliceEnd = parentRange.to ?? getPartitionEndExclusive(partition) - - for (let index = 0; index < buckets.length; index++) { - const bucket = buckets[index] - if (!bucket) continue - - if (currentStart === undefined) currentStart = bucket.start - - const wouldExceed = currentRows > 0 && currentRows + bucket.rowCount > targetChunkRows * TARGET_BYTES_FUZZ_FACTOR - if (wouldExceed && currentStart !== undefined) { - slices.push(buildSliceFromRows(partition, { - ranges: replaceSliceRange(parentSlice, dimensionIndex, currentStart, bucket.start), - estimatedRows: currentRows, - isHotKey: parentSlice.isHotKey, - hotDimensionIndex: parentSlice.hotDimensionIndex, - hotKeyValue: parentSlice.hotKeyValue, - estimateConfidence: 'low', - estimateReason: 'temporal-distribution', - lineage: parentSlice.lineage.concat([{ - strategyId: 'temporal-bucket-split', - dimensionIndex, - reason: 'split slice using temporal distribution buckets', - }]), - })) - currentStart = bucket.start - currentRows = 0 - } - - currentRows += bucket.rowCount - - if (index === buckets.length - 1 && currentStart !== undefined) { - slices.push(buildSliceFromRows(partition, { - ranges: replaceSliceRange(parentSlice, dimensionIndex, currentStart, sliceEnd), - estimatedRows: currentRows, - isHotKey: parentSlice.isHotKey, - hotDimensionIndex: parentSlice.hotDimensionIndex, - hotKeyValue: parentSlice.hotKeyValue, - estimateConfidence: 'low', - estimateReason: 'temporal-distribution', - lineage: parentSlice.lineage.concat([{ - strategyId: 'temporal-bucket-split', - dimensionIndex, - reason: 'split slice using temporal distribution buckets', - }]), - })) - } - } - - return slices -} - -async function buildQuantileBoundaries( - context: QueryContext, - slice: PartitionSlice, - dimensionIndex: number, - subCount: number, -): Promise { - const range = getSliceRange(slice, dimensionIndex) - if (range.from === undefined || range.to === undefined) return undefined - - const boundaries = [range.from] - for (let step = 1; step < subCount; step++) { - const targetCumRows = Math.round((slice.estimatedRows * step) / subCount) - boundaries.push(await findQuantileBoundaryOnDimension(context, slice, dimensionIndex, targetCumRows)) - } - - const uniqueBoundaryCount = new Set(boundaries).size - if (uniqueBoundaryCount <= Math.max(2, Math.ceil(subCount / 3))) { - return undefined - } - - return boundaries.concat([range.to]) -} - -async function findQuantileBoundaryOnDimension( - context: QueryContext, - slice: PartitionSlice, - dimensionIndex: number, - targetCumRows: number, -): Promise { - const sortKey = context.sortKeys[dimensionIndex] - const range = getSliceRange(slice, dimensionIndex) - if (!sortKey || range.from === undefined || range.to === undefined) { - throw new Error(`Missing range for quantile split on dimension ${dimensionIndex}`) - } - - if (sortKey.category === 'string') { - let low = strToBigInt(range.from, 8) - let high = strToBigInt(range.to, 8) - - for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { - const midpoint = (low + high) / 2n - if (midpoint === low || midpoint === high) break - - const mid = bigIntToStr(midpoint, 8) - const rows = await countRows(context, slice.partitionId, replaceSliceRange(slice, dimensionIndex, range.from, mid)) - if (rows < targetCumRows) low = midpoint - else high = midpoint - } - - return bigIntToStr((low + high) / 2n, 8) - } - - if (sortKey.category === 'datetime') { - let low = parsePlannerDateTime(range.from) - let high = parsePlannerDateTime(range.to) - - for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { - const midpoint = Math.floor((low + high) / 2) - if (midpoint === low || midpoint === high) break - - const mid = new Date(midpoint).toISOString() - const rows = await countRows(context, slice.partitionId, replaceSliceRange(slice, dimensionIndex, range.from, mid)) - if (rows < targetCumRows) low = midpoint - else high = midpoint - } - - return new Date(Math.floor((low + high) / 2)).toISOString() - } - - let low = Number(range.from) - let high = Number(range.to) - for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { - const midpoint = Math.floor((low + high) / 2) - if (midpoint === low || midpoint === high) break - - const rows = await countRows(context, slice.partitionId, replaceSliceRange(slice, dimensionIndex, range.from, String(midpoint))) - if (rows < targetCumRows) low = midpoint - else high = midpoint - } - - return String(Math.floor((low + high) / 2)) -} - -async function hydrateSliceRange( - context: QueryContext, - slice: PartitionSlice, - dimensionIndex: number, -): Promise { - const currentRange = getSliceRange(slice, dimensionIndex) - if (currentRange.from !== undefined && currentRange.to !== undefined) return slice - - const sortKey = context.sortKeys[dimensionIndex] - if (!sortKey) return undefined - - const rows = await context.query<{ minVal: string; maxVal: string }>(` -SELECT - toString(min(${sortKey.column})) AS minVal, - toString(max(${sortKey.column})) AS maxVal -FROM ${context.database}.${context.table} -WHERE ${buildWhereClause(slice.partitionId, slice.ranges, context.sortKeys)}`) - - const observed = rows[0] - if (!observed) return undefined - - return { - ...slice, - ranges: replaceSliceRange(slice, dimensionIndex, observed.minVal, toExclusiveUpperBound(observed.maxVal, sortKey)), - } -} - -function buildRootSlice(partition: PartitionInfo): PartitionSlice { - return { - partitionId: partition.partitionId, - ranges: [], - estimatedRows: partition.rows, - estimatedBytes: partition.bytesOnDisk, - isHotKey: false, - estimateConfidence: 'high', - estimateReason: 'partition-metadata', - lineage: [], - } -} - -function buildSliceFromRows( - partition: PartitionInfo, - input: { - ranges: SliceRange[] - estimatedRows: number - isHotKey: boolean - hotDimensionIndex?: number - hotKeyValue?: string - estimateConfidence: EstimateConfidence - estimateReason: EstimateReason - lineage: SliceLineageStep[] - }, -): PartitionSlice { - return { - partitionId: partition.partitionId, - ranges: input.ranges, - estimatedRows: input.estimatedRows, - estimatedBytes: partition.rows > 0 - ? Math.round((input.estimatedRows / partition.rows) * partition.bytesOnDisk) - : 0, - isHotKey: input.isHotKey, - hotDimensionIndex: input.hotDimensionIndex, - hotKeyValue: input.hotKeyValue, - estimateConfidence: input.estimateConfidence, - estimateReason: input.estimateReason, - lineage: input.lineage, - } -} - -function getTargetChunkRows(partition: PartitionInfo, maxChunkBytes: number): number { - if (partition.bytesOnDisk <= 0) return partition.rows - return (maxChunkBytes * partition.rows) / partition.bytesOnDisk -} - -function mergeAdjacentSlices(slices: PartitionSlice[], maxChunkBytes: number): PartitionSlice[] { - if (slices.length <= 1) return slices - - const merged: PartitionSlice[] = [] - let current: PartitionSlice | undefined - - for (const slice of slices) { - if (!current) { - current = slice - continue - } - - const canMerge = - !current.isHotKey && - !slice.isHotKey && - haveSameTrailingRanges(current.ranges, slice.ranges) && - current.estimatedBytes + slice.estimatedBytes <= maxChunkBytes * 1.1 - - if (!canMerge) { - merged.push(current) - current = slice - continue - } - - current = { - ...current, - ranges: mergeRanges(current.ranges, slice.ranges), - estimatedRows: current.estimatedRows + slice.estimatedRows, - estimatedBytes: current.estimatedBytes + slice.estimatedBytes, - } - } - - if (current) merged.push(current) - return merged -} - -function mergeRanges(left: SliceRange[], right: SliceRange[]): SliceRange[] { - return left.map((leftRange) => { - const rightRange = right.find((candidate) => candidate.dimensionIndex === leftRange.dimensionIndex) - return rightRange === undefined - ? leftRange - : { - dimensionIndex: leftRange.dimensionIndex, - from: leftRange.from, - to: rightRange.to, - } - }) -} - -function haveSameTrailingRanges(left: SliceRange[], right: SliceRange[]): boolean { - if (left.length !== right.length) return false - - let differingDimensions = 0 - for (const leftRange of left) { - const rightRange = right.find((candidate) => candidate.dimensionIndex === leftRange.dimensionIndex) - if (!rightRange) return false - - const same = leftRange.from === rightRange.from && leftRange.to === rightRange.to - if (!same) { - differingDimensions += 1 - if (leftRange.to !== rightRange.from) return false - } - } - - return differingDimensions <= 1 -} - -function getCandidateDimensions(sortKeys: SortKeyInfo[], slice: PartitionSlice): number[] { - return sortKeys - .map((sortKey, index) => ({ - index, - priority: getDimensionPriority(sortKey.category, slice.isHotKey, slice.hotDimensionIndex, index), - })) - .sort((left, right) => left.priority - right.priority) - .map((candidate) => candidate.index) -} - -function getDimensionPriority( - category: SortKeyInfo['category'], - isHotKey: boolean, - hotDimensionIndex: number | undefined, - dimensionIndex: number, -): number { - if (isHotKey && hotDimensionIndex === dimensionIndex) return 100 - if (category === 'string') return 0 - if (category === 'datetime') return 1 - return 2 -} - -function getSliceRange(slice: Pick, dimensionIndex: number): SliceRange { - return slice.ranges.find((range) => range.dimensionIndex === dimensionIndex) - ?? { dimensionIndex, from: undefined, to: undefined } -} - -function replaceSliceRange( - slice: Pick, - dimensionIndex: number, - from: string | undefined, - to: string | undefined, -): SliceRange[] { - return slice.ranges - .filter((range) => range.dimensionIndex !== dimensionIndex) - .concat([{ dimensionIndex, from, to }]) - .sort((left, right) => left.dimensionIndex - right.dimensionIndex) -} - -function isExactSliceRange(range: Pick): boolean { - if (range.from === undefined || range.to === undefined) return false - return range.to === `${range.from}\0` -} - -function findHotIdentity( - slice: PartitionSlice, - sortKeys: SortKeyInfo[], -): { dimensionIndex: number; value: string } | undefined { - for (const range of slice.ranges) { - const sortKey = sortKeys[range.dimensionIndex] - if (sortKey?.category !== 'string') continue - if (isExactSliceRange(range) && range.from !== undefined) { - return { dimensionIndex: range.dimensionIndex, value: range.from } - } - } -} - -function applyHotIdentity( - slices: PartitionSlice[], - hotIdentity: { dimensionIndex: number; value: string } | undefined, -): PartitionSlice[] { - if (!hotIdentity) return slices - return slices.map((slice) => markHotSlice(slice, hotIdentity)) -} - -function markHotSlice( - slice: PartitionSlice, - hotIdentity: { dimensionIndex: number; value: string } | undefined, -): PartitionSlice { - if (!hotIdentity) return slice - return { - ...slice, - isHotKey: true, - hotDimensionIndex: hotIdentity.dimensionIndex, - hotKeyValue: hotIdentity.value, - } -} - -function isEffectiveSplit(parent: PartitionSlice, children: PartitionSlice[]): boolean { - if (children.length <= 1) return false - return children.some((child) => - child.estimatedRows !== parent.estimatedRows || - JSON.stringify(child.ranges) !== JSON.stringify(parent.ranges) - ) -} - -function toExclusiveUpperBound(value: string, sortKey: SortKeyInfo): string { - if (sortKey.category === 'string') return `${value}\0` - if (sortKey.category === 'datetime') return new Date(parsePlannerDateTime(value) + 1000).toISOString() - return String(Number(value) + 1) -} - -function getPartitionEndExclusive(partition: PartitionInfo): string { - return new Date(parsePlannerDateTime(partition.maxTime) + 1000).toISOString() -} - -function deriveChunkWindow( - ranges: SliceRange[], - sortKeys: SortKeyInfo[], - partition: PartitionInfo | undefined, -): { from: string; to: string } { - for (const range of ranges) { - const sortKey = sortKeys[range.dimensionIndex] - if (sortKey?.category !== 'datetime') continue - return { - from: range.from ?? partition?.minTime ?? '', - to: range.to ?? partition?.maxTime ?? '', - } - } - - return { - from: partition?.minTime ?? '', - to: partition?.maxTime ?? '', - } -} - -async function countRows(context: QueryContext, partitionId: string, ranges: SliceRange[]): Promise { - const rows = await context.query<{ cnt: string }>(` -SELECT count() AS cnt -FROM ${context.database}.${context.table} -WHERE ${buildWhereClause(partitionId, ranges, context.sortKeys)}`) - return Number(rows[0]?.cnt ?? 0) -} - -function buildWhereClause(partitionId: string, ranges: SliceRange[], sortKeys: SortKeyInfo[]): string { - const conditions = [`_partition_id = ${quoteSqlString(partitionId)}`] - - for (const range of ranges) { - const sortKey = sortKeys[range.dimensionIndex] - if (!sortKey) continue - if (range.from !== undefined) conditions.push(`${sortKey.column} >= ${formatBound(range.from, sortKey)}`) - if (range.to !== undefined) conditions.push(`${sortKey.column} < ${formatBound(range.to, sortKey)}`) - } - - return conditions.join('\n AND ') -} - -function quoteSqlString(value: string): string { - return `'${value.replaceAll('\\', '\\\\').replaceAll('\'', '\\\'')}'` -} - -function formatBound(value: string, sortKey: SortKeyInfo): string { - if (sortKey.category === 'datetime') { - return `parseDateTimeBestEffort(${quoteSqlString(value)})` - } - if (sortKey.category === 'string') { - return `unhex('${Buffer.from(value, 'latin1').toString('hex')}')` - } - return value -} - -function buildEvenlySpacedBoundaries( - from: string, - to: string, - subCount: number, - sortKey: SortKeyInfo, -): string[] { - if (sortKey.category === 'datetime') { - const start = parsePlannerDateTime(from) - const end = parsePlannerDateTime(to) - return uniqueBoundaries(Array.from({ length: subCount + 1 }, (_, index) => - new Date(start + Math.floor(((end - start) * index) / subCount)).toISOString() - )) - } - - if (sortKey.category === 'numeric') { - const start = Number(from) - const end = Number(to) - return uniqueBoundaries(Array.from({ length: subCount + 1 }, (_, index) => - String(start + Math.floor(((end - start) * index) / subCount)) - )) - } - - const start = strToBigInt(from, 8) - const end = strToBigInt(to, 8) - return uniqueBoundaries(Array.from({ length: subCount + 1 }, (_, index) => - bigIntToStr(start + ((end - start) * BigInt(index)) / BigInt(subCount), 8) - )) -} - -function uniqueBoundaries(boundaries: string[]): string[] { - const unique: string[] = [] - for (const boundary of boundaries) { - if (unique[unique.length - 1] !== boundary) { - unique.push(boundary) - } - } - return unique -} - -function parsePlannerDateTime(value: string): number { - const normalized = value.includes('T') ? value : value.replace(' ', 'T') - return Date.parse(normalized.endsWith('Z') ? normalized : `${normalized}Z`) -} - -function strToBigInt(value: string, padTo: number): bigint { - const buffer = Buffer.from(value, 'latin1') - let result = 0n - for (let index = 0; index < padTo; index++) { - const byte = index < buffer.length ? (buffer[index] ?? 0) : 0 - result = (result << 8n) | BigInt(byte) - } - return result -} - -function bigIntToStr(value: bigint, length: number): string { - const buffer = Buffer.alloc(length) - let remaining = value - for (let index = length - 1; index >= 0; index--) { - buffer[index] = Number(remaining & 0xffn) - remaining >>= 8n - } - return buffer.toString('latin1') -} - -function compareBinaryStrings(left: string, right: string): number { - return Buffer.from(left, 'latin1').compare(Buffer.from(right, 'latin1')) -} - -function minBinaryString(left: string, right: string): string { - return compareBinaryStrings(left, right) <= 0 ? left : right -} - -function maxBinaryString(left: string, right: string): string { - return compareBinaryStrings(left, right) >= 0 ? left : right -} - -function nextPrefixValue(prefix: string): string | undefined { - if (prefix === '') return undefined - - const buffer = Buffer.from(prefix, 'latin1') - for (let index = buffer.length - 1; index >= 0; index--) { - const byte = buffer[index] - if (byte === undefined) continue - if (byte === 0xff) continue - - const next = Buffer.from(buffer.subarray(0, index + 1)) - next[index] = (next[index] ?? 0) + 1 - return next.toString('latin1') - } - - return undefined + return analyzeAndChunk(input) } diff --git a/packages/plugin-backfill/src/chunking/boundary-codec.ts b/packages/plugin-backfill/src/chunking/boundary-codec.ts new file mode 100644 index 0000000..73b8984 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/boundary-codec.ts @@ -0,0 +1,109 @@ +import type { + Chunk, + ChunkPlan, + ChunkRange, + FocusedValue, + SortKey, +} from './types.js' + +export function encodeBoundary( + value: string | undefined, + sortKey: SortKey | undefined, +): string | undefined { + if (value === undefined || sortKey === undefined) return value + if (sortKey.boundaryEncoding === 'hex-latin1') { + return Buffer.from(value, 'latin1').toString('hex') + } + return value +} + +export function decodeBoundary( + value: string | undefined, + sortKey: SortKey | undefined, +): string | undefined { + if (value === undefined || sortKey === undefined) return value + if (sortKey.boundaryEncoding === 'hex-latin1') { + return Buffer.from(value, 'hex').toString('latin1') + } + return value +} + +export function encodeRangesForPlan( + ranges: ChunkRange[], + sortKeys: SortKey[], +): ChunkRange[] { + return ranges.map((range) => ({ + dimensionIndex: range.dimensionIndex, + from: encodeBoundary(range.from, sortKeys[range.dimensionIndex]), + to: encodeBoundary(range.to, sortKeys[range.dimensionIndex]), + })) +} + +export function decodeRangesFromPlan( + ranges: ChunkRange[], + sortKeys: SortKey[], +): ChunkRange[] { + return ranges.map((range) => ({ + dimensionIndex: range.dimensionIndex, + from: decodeBoundary(range.from, sortKeys[range.dimensionIndex]), + to: decodeBoundary(range.to, sortKeys[range.dimensionIndex]), + })) +} + +function encodeFocusedValue( + focusedValue: FocusedValue | undefined, + sortKeys: SortKey[], +): FocusedValue | undefined { + if (!focusedValue) return undefined + return { + dimensionIndex: focusedValue.dimensionIndex, + value: encodeBoundary(focusedValue.value, sortKeys[focusedValue.dimensionIndex]) ?? focusedValue.value, + } +} + +function decodeFocusedValue( + focusedValue: FocusedValue | undefined, + sortKeys: SortKey[], +): FocusedValue | undefined { + if (!focusedValue) return undefined + return { + dimensionIndex: focusedValue.dimensionIndex, + value: decodeBoundary(focusedValue.value, sortKeys[focusedValue.dimensionIndex]) ?? focusedValue.value, + } +} + +export function encodeChunkForPlan(chunk: Chunk, sortKeys: SortKey[]): Chunk { + return { + ...chunk, + ranges: encodeRangesForPlan(chunk.ranges, sortKeys), + analysis: { + ...chunk.analysis, + focusedValue: encodeFocusedValue(chunk.analysis.focusedValue, sortKeys), + }, + } +} + +export function decodeChunkFromPlan(chunk: Chunk, sortKeys: SortKey[]): Chunk { + return { + ...chunk, + ranges: decodeRangesFromPlan(chunk.ranges, sortKeys), + analysis: { + ...chunk.analysis, + focusedValue: decodeFocusedValue(chunk.analysis.focusedValue, sortKeys), + }, + } +} + +export function encodeChunkPlanForPersistence(plan: ChunkPlan): ChunkPlan { + return { + ...plan, + chunks: plan.chunks.map((chunk) => encodeChunkForPlan(chunk, plan.table.sortKeys)), + } +} + +export function decodeChunkPlanFromPersistence(plan: ChunkPlan): ChunkPlan { + return { + ...plan, + chunks: plan.chunks.map((chunk) => decodeChunkFromPlan(chunk, plan.table.sortKeys)), + } +} diff --git a/packages/plugin-backfill/src/chunking/build.test.ts b/packages/plugin-backfill/src/chunking/build.test.ts deleted file mode 100644 index 71aa47e..0000000 --- a/packages/plugin-backfill/src/chunking/build.test.ts +++ /dev/null @@ -1,135 +0,0 @@ -import { describe, expect, test } from 'bun:test' - -import { buildChunkBoundaries } from './build.js' -import type { PartitionInfo, SortKeyInfo } from './types.js' - -const GiB = 1024 ** 3 - -describe('buildChunkBoundaries', () => { - test('small partition produces one chunk boundary', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 1000, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T23:59:59.000Z' }, - ] - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: 10 * GiB, - }) - - expect(boundaries).toHaveLength(1) - expect(boundaries[0]?.partitionId).toBe('202501') - expect(boundaries[0]?.sortKeyFrom).toBeUndefined() - expect(boundaries[0]?.sortKeyTo).toBeUndefined() - expect(boundaries[0]?.estimatedBytes).toBe(5 * GiB) - }) - - test('large partition produces multiple sub-chunks with sort key ranges', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 30 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'event_time', type: 'DateTime', category: 'datetime' } - const sortKeyRanges = new Map([ - ['202501', { min: '2025-01-01 00:00:00', max: '2025-01-31 00:00:00' }], - ]) - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - }) - - expect(boundaries).toHaveLength(3) - for (const b of boundaries) { - expect(b.partitionId).toBe('202501') - expect(b.sortKeyFrom).toBeDefined() - expect(b.sortKeyTo).toBeDefined() - } - }) - - test('large partition without sort key produces single chunk', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 30 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: 10 * GiB, - }) - - expect(boundaries).toHaveLength(1) - expect(boundaries[0]?.estimatedBytes).toBe(30 * GiB) - }) - - test('mixed sizes produce correct boundary counts', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 500, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - { partitionId: '202502', rows: 5000, bytesOnDisk: 25 * GiB, minTime: '2025-02-01T00:00:00.000Z', maxTime: '2025-02-28T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'event_time', type: 'DateTime', category: 'datetime' } - const sortKeyRanges = new Map([ - ['202502', { min: '2025-02-01 00:00:00', max: '2025-02-28 00:00:00' }], - ]) - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - }) - - // First partition: 5 GiB < 10 GiB -> 1 boundary - // Second partition: 25 GiB / 10 GiB = 3 sub-boundaries - expect(boundaries).toHaveLength(4) - - const p1 = boundaries.filter((b) => b.partitionId === '202501') - const p2 = boundaries.filter((b) => b.partitionId === '202502') - expect(p1).toHaveLength(1) - expect(p2).toHaveLength(3) - }) - - test('large partition with min === max sort key produces single chunk', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 30 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'event_type', type: 'String', category: 'string' } - const sortKeyRanges = new Map([ - ['202501', { min: 'click', max: 'click' }], - ]) - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - }) - - expect(boundaries).toHaveLength(1) - expect(boundaries[0]?.partitionId).toBe('202501') - expect(boundaries[0]?.sortKeyFrom).toBeUndefined() - expect(boundaries[0]?.sortKeyTo).toBeUndefined() - }) - - test('numeric sort key produces numeric range sub-chunks', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 20 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'id', type: 'UInt64', category: 'numeric' } - const sortKeyRanges = new Map([ - ['202501', { min: '100', max: '200' }], - ]) - - const boundaries = buildChunkBoundaries({ - partitions, - maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - }) - - expect(boundaries).toHaveLength(2) - expect(boundaries[0]?.sortKeyFrom).toBe('100') - expect(boundaries[0]?.sortKeyTo).toBe('150') - expect(boundaries[1]?.sortKeyFrom).toBe('150') - expect(boundaries[1]?.sortKeyTo).toBe('201') - }) -}) diff --git a/packages/plugin-backfill/src/chunking/build.ts b/packages/plugin-backfill/src/chunking/build.ts deleted file mode 100644 index cc6693b..0000000 --- a/packages/plugin-backfill/src/chunking/build.ts +++ /dev/null @@ -1,60 +0,0 @@ -import { splitSortKeyRange } from './splitter.js' -import type { ChunkBoundary, PartitionInfo, SortKeyInfo } from './types.js' - -export function buildChunkBoundaries(input: { - partitions: PartitionInfo[] - maxChunkBytes: number - sortKey?: SortKeyInfo - sortKeyRanges?: Map -}): ChunkBoundary[] { - const boundaries: ChunkBoundary[] = [] - - for (const partition of input.partitions) { - if (partition.bytesOnDisk <= input.maxChunkBytes) { - boundaries.push({ - partitionId: partition.partitionId, - estimatedBytes: partition.bytesOnDisk, - }) - } else if (input.sortKey && input.sortKeyRanges) { - const range = input.sortKeyRanges.get(partition.partitionId) - if (!range) { - // No range data — emit as single chunk - boundaries.push({ - partitionId: partition.partitionId, - estimatedBytes: partition.bytesOnDisk, - }) - continue - } - - // If min === max, splitting would produce empty sub-ranges; emit as single chunk - if (range.min === range.max) { - boundaries.push({ - partitionId: partition.partitionId, - estimatedBytes: partition.bytesOnDisk, - }) - continue - } - - const subCount = Math.ceil(partition.bytesOnDisk / input.maxChunkBytes) - const subRanges = splitSortKeyRange(input.sortKey.category, range.min, range.max, subCount) - const estimatedBytesPerSub = Math.ceil(partition.bytesOnDisk / subCount) - - for (const sub of subRanges) { - boundaries.push({ - partitionId: partition.partitionId, - sortKeyFrom: sub.from, - sortKeyTo: sub.to, - estimatedBytes: estimatedBytesPerSub, - }) - } - } else { - // No sort key info — emit as single chunk despite being oversized - boundaries.push({ - partitionId: partition.partitionId, - estimatedBytes: partition.bytesOnDisk, - }) - } - } - - return boundaries -} diff --git a/packages/plugin-backfill/src/chunking/introspect.test.ts b/packages/plugin-backfill/src/chunking/introspect.test.ts deleted file mode 100644 index b40822b..0000000 --- a/packages/plugin-backfill/src/chunking/introspect.test.ts +++ /dev/null @@ -1,274 +0,0 @@ -import { describe, expect, test } from 'bun:test' - -import { introspectTable, queryPartitionInfo, querySortKeyInfo, querySortKeyRanges, querySortKeys } from './introspect.js' - -describe('queryPartitionInfo', () => { - test('maps system.parts rows to PartitionInfo array', async () => { - const mockRows = [ - { partition_id: '202501', total_rows: '1000', total_bytes: '5000000', min_time: '2025-01-01 00:00:00', max_time: '2025-01-31 23:59:59' }, - { partition_id: '202502', total_rows: '2000', total_bytes: '8000000', min_time: '2025-02-01 00:00:00', max_time: '2025-02-28 23:59:59' }, - ] - - const result = await queryPartitionInfo({ - database: 'default', - table: 'events', - query: async () => mockRows as never, - }) - - expect(result).toHaveLength(2) - expect(result[0]?.partitionId).toBe('202501') - expect(result[0]?.rows).toBe(1000) - expect(result[0]?.bytesOnDisk).toBe(5000000) - expect(result[1]?.partitionId).toBe('202502') - expect(result[1]?.rows).toBe(2000) - }) - - test('filters out partitions before --from', async () => { - const mockRows = [ - { partition_id: '202501', total_rows: '1000', total_bytes: '5000000', min_time: '2025-01-01 00:00:00', max_time: '2025-01-31 23:59:59' }, - { partition_id: '202503', total_rows: '3000', total_bytes: '9000000', min_time: '2025-03-01 00:00:00', max_time: '2025-03-31 23:59:59' }, - ] - - const result = await queryPartitionInfo({ - database: 'default', - table: 'events', - from: '2025-02-01T00:00:00.000Z', - query: async () => mockRows as never, - }) - - expect(result).toHaveLength(1) - expect(result[0]?.partitionId).toBe('202503') - }) - - test('filters out partitions at or after --to', async () => { - const mockRows = [ - { partition_id: '202501', total_rows: '1000', total_bytes: '5000000', min_time: '2025-01-01 00:00:00', max_time: '2025-01-31 23:59:59' }, - { partition_id: '202503', total_rows: '3000', total_bytes: '9000000', min_time: '2025-03-01 00:00:00', max_time: '2025-03-31 23:59:59' }, - ] - - const result = await queryPartitionInfo({ - database: 'default', - table: 'events', - to: '2025-03-01T00:00:00.000Z', - query: async () => mockRows as never, - }) - - expect(result).toHaveLength(1) - expect(result[0]?.partitionId).toBe('202501') - }) -}) - -describe('querySortKeyInfo', () => { - test('returns sort key info for table with DateTime sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) { - return [{ sorting_key: 'event_time' }] as T[] - } - if (sql.includes('system.columns')) { - return [{ type: 'DateTime' }] as T[] - } - return [] as T[] - } - - const result = await querySortKeyInfo({ - database: 'default', - table: 'events', - query, - }) - - expect(result).toBeDefined() - expect(result?.column).toBe('event_time') - expect(result?.type).toBe('DateTime') - expect(result?.category).toBe('datetime') - }) - - test('returns numeric category for Int64 sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) return [{ sorting_key: 'id' }] as T[] - if (sql.includes('system.columns')) return [{ type: 'Int64' }] as T[] - return [] as T[] - } - - const result = await querySortKeyInfo({ database: 'default', table: 'events', query }) - - expect(result?.category).toBe('numeric') - }) - - test('returns string category for String sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) return [{ sorting_key: 'name' }] as T[] - if (sql.includes('system.columns')) return [{ type: 'String' }] as T[] - return [] as T[] - } - - const result = await querySortKeyInfo({ database: 'default', table: 'events', query }) - - expect(result?.category).toBe('string') - }) - - test('extracts column name from function expression', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) return [{ sorting_key: 'toDate(event_time)' }] as T[] - if (sql.includes('system.columns')) return [{ type: 'DateTime' }] as T[] - return [] as T[] - } - - const result = await querySortKeyInfo({ database: 'default', table: 'events', query }) - - expect(result?.column).toBe('event_time') - }) - - test('returns undefined when table has no sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) return [{ sorting_key: '' }] as T[] - return [] as T[] - } - - const result = await querySortKeyInfo({ database: 'default', table: 'events', query }) - - expect(result).toBeUndefined() - }) - - test('returns first column from multi-column sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) return [{ sorting_key: 'event_time, id' }] as T[] - if (sql.includes('system.columns')) return [{ name: 'event_time', type: 'DateTime' }, { name: 'id', type: 'UInt64' }] as T[] - return [] as T[] - } - - const result = await querySortKeyInfo({ database: 'default', table: 'events', query }) - - expect(result?.column).toBe('event_time') - }) - - test('extracts a single referenced column from function expressions with commas', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) { - return [{ sorting_key: 'toStartOfInterval(ts, INTERVAL 5 MINUTE), user_id' }] as T[] - } - if (sql.includes('system.columns')) { - return [ - { name: 'ts', type: 'DateTime' }, - { name: 'user_id', type: 'String' }, - ] as T[] - } - return [] as T[] - } - - const result = await querySortKeys({ database: 'default', table: 'events', query }) - - expect(result.map((key) => key.column)).toEqual(['ts', 'user_id']) - expect(result.map((key) => key.category)).toEqual(['datetime', 'string']) - }) - - test('skips ambiguous tuple expressions that do not map to one physical column', async () => { - const query = async (sql: string) => { - if (sql.includes('system.tables')) { - return [{ sorting_key: 'tuple(user_id, session_id), event_time' }] as T[] - } - if (sql.includes('system.columns')) { - return [ - { name: 'user_id', type: 'String' }, - { name: 'session_id', type: 'String' }, - { name: 'event_time', type: 'DateTime' }, - ] as T[] - } - return [] as T[] - } - - const result = await querySortKeys({ database: 'default', table: 'events', query }) - - expect(result.map((key) => key.column)).toEqual(['event_time']) - }) -}) - -describe('querySortKeyRanges', () => { - test('returns min/max per partition', async () => { - const query = async () => { - return [ - { partition_id: '202501', min_val: '2025-01-01 00:00:00', max_val: '2025-01-31 23:59:59' }, - { partition_id: '202502', min_val: '2025-02-01 00:00:00', max_val: '2025-02-28 23:59:59' }, - ] as T[] - } - - const result = await querySortKeyRanges({ - database: 'default', - table: 'events', - sortKeyColumn: 'event_time', - partitionIds: ['202501', '202502'], - query, - }) - - expect(result.size).toBe(2) - expect(result.get('202501')?.min).toBe('2025-01-01 00:00:00') - expect(result.get('202502')?.max).toBe('2025-02-28 23:59:59') - }) - - test('returns empty map for empty partition list', async () => { - const query = async () => [] as T[] - - const result = await querySortKeyRanges({ - database: 'default', - table: 'events', - sortKeyColumn: 'event_time', - partitionIds: [], - query, - }) - - expect(result.size).toBe(0) - }) -}) - -describe('introspectTable', () => { - test('returns partitions and sort key in a single call', async () => { - const query = async (sql: string) => { - if (sql.includes('system.parts')) { - return [ - { partition_id: '202501', total_rows: '1000', total_bytes: '5000000', min_time: '2025-01-01 00:00:00', max_time: '2025-01-31 23:59:59' }, - ] as T[] - } - if (sql.includes('system.tables')) { - return [{ sorting_key: 'event_time' }] as T[] - } - if (sql.includes('system.columns')) { - return [{ type: 'DateTime' }] as T[] - } - return [] as T[] - } - - const result = await introspectTable({ - database: 'default', - table: 'events', - query, - }) - - expect(result.partitions).toHaveLength(1) - expect(result.partitions[0]?.partitionId).toBe('202501') - expect(result.sortKey).toBeDefined() - expect(result.sortKey?.column).toBe('event_time') - expect(result.sortKey?.category).toBe('datetime') - }) - - test('returns undefined sortKey when table has no sorting key', async () => { - const query = async (sql: string) => { - if (sql.includes('system.parts')) { - return [ - { partition_id: '202501', total_rows: '1000', total_bytes: '5000000', min_time: '2025-01-01 00:00:00', max_time: '2025-01-31 23:59:59' }, - ] as T[] - } - if (sql.includes('system.tables')) { - return [{ sorting_key: '' }] as T[] - } - return [] as T[] - } - - const result = await introspectTable({ - database: 'default', - table: 'events', - query, - }) - - expect(result.partitions).toHaveLength(1) - expect(result.sortKey).toBeUndefined() - }) -}) diff --git a/packages/plugin-backfill/src/chunking/introspect.ts b/packages/plugin-backfill/src/chunking/introspect.ts deleted file mode 100644 index 16f0c79..0000000 --- a/packages/plugin-backfill/src/chunking/introspect.ts +++ /dev/null @@ -1,262 +0,0 @@ -import type { PartitionInfo, SortKeyInfo } from './types.js' - -const NUMERIC_TYPES = new Set([ - 'Int8', 'Int16', 'Int32', 'Int64', 'Int128', 'Int256', - 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'UInt128', 'UInt256', - 'Float32', 'Float64', -]) - -const DATETIME_TYPES = new Set(['Date', 'Date32', 'DateTime', 'DateTime64']) - -function classifySortKeyType(type: string): SortKeyInfo['category'] { - if (NUMERIC_TYPES.has(type)) return 'numeric' - if (DATETIME_TYPES.has(type)) return 'datetime' - if (type.startsWith('DateTime64(')) return 'datetime' - if (type.startsWith("DateTime('")) return 'datetime' - return 'string' -} - -export async function queryPartitionInfo(input: { - database: string - table: string - from?: string - to?: string - query: (sql: string) => Promise -}): Promise { - // Force replica sync on the target table before reading system.parts. - // select_sequential_consistency is only effective on user tables, not system - // tables, so this preliminary query ensures the replica has caught up with - // all pending writes before we inspect part metadata. - await input.query( - `SELECT 1 FROM ${input.database}.${input.table} LIMIT 1 SETTINGS select_sequential_consistency = 1` - ) - - const rows = await input.query<{ - partition_id: string - total_rows: string - total_bytes: string - total_uncompressed_bytes?: string - min_time: string - max_time: string - }>( - `SELECT - partition_id, - toString(sum(rows)) AS total_rows, - toString(sum(bytes_on_disk)) AS total_bytes, - toString(sum(data_uncompressed_bytes)) AS total_uncompressed_bytes, - toString(min(min_time)) AS min_time, - toString(max(max_time)) AS max_time -FROM system.parts -WHERE database = '${input.database}' - AND table = '${input.table}' - AND active = 1 -GROUP BY partition_id -ORDER BY partition_id -SETTINGS select_sequential_consistency = 1` - ) - - const partitions: PartitionInfo[] = rows.map((row) => ({ - partitionId: row.partition_id, - rows: Number(row.total_rows), - bytesOnDisk: Number(row.total_bytes), - bytesUncompressed: Number(row.total_uncompressed_bytes ?? row.total_bytes), - minTime: new Date(row.min_time).toISOString(), - maxTime: new Date(row.max_time).toISOString(), - })) - - return partitions.filter((p) => { - if (input.from && p.maxTime < input.from) return false - if (input.to && p.minTime >= input.to) return false - return true - }) -} - -function extractSortKeyColumns(sortingKey: string): string[] { - return splitTopLevelCsv(sortingKey) - .map((part) => part.trim()) - .filter((part): part is string => part.length > 0) -} - -function splitTopLevelCsv(input: string): string[] { - const parts: string[] = [] - let current = '' - let depth = 0 - let quote: "'" | '"' | undefined - - for (let index = 0; index < input.length; index++) { - const char = input[index] - if (char === undefined) continue - - if (quote) { - current += char - if (char === quote && input[index - 1] !== '\\') { - quote = undefined - } - continue - } - - if (char === '\'' || char === '"') { - quote = char - current += char - continue - } - - if (char === '(') { - depth += 1 - current += char - continue - } - - if (char === ')') { - depth = Math.max(0, depth - 1) - current += char - continue - } - - if (char === ',' && depth === 0) { - parts.push(current.trim()) - current = '' - continue - } - - current += char - } - - if (current.trim().length > 0) { - parts.push(current.trim()) - } - - return parts -} - -function resolveSortKeyColumn(expression: string, knownColumns: Set): string | undefined { - const trimmed = expression.trim() - if (knownColumns.has(trimmed)) { - return trimmed - } - - const identifiers = Array.from(trimmed.matchAll(/\b[A-Za-z_][A-Za-z0-9_]*\b/g)) - .map((match) => match[0]) - .filter((identifier): identifier is string => Boolean(identifier)) - - const matches = Array.from(new Set(identifiers.filter((identifier) => knownColumns.has(identifier)))) - if (matches.length === 1) { - return matches[0] - } - - return undefined -} - -function resolveSortKeyColumnWithoutSchema(expression: string): string | undefined { - const trimmed = expression.trim() - if (/^[A-Za-z_][A-Za-z0-9_]*$/.test(trimmed)) { - return trimmed - } - - const identifiers = Array.from(trimmed.matchAll(/\b[A-Za-z_][A-Za-z0-9_]*\b/g)) - .map((match) => match[0]) - .filter((identifier): identifier is string => Boolean(identifier)) - - return identifiers.length > 0 ? identifiers[identifiers.length - 1] : undefined -} - -export async function querySortKeys(input: { - database: string - table: string - query: (sql: string) => Promise -}): Promise { - const tableRows = await input.query<{ sorting_key: string }>( - `SELECT sorting_key FROM system.tables WHERE database = '${input.database}' AND name = '${input.table}'` - ) - - const sortingKey = tableRows[0]?.sorting_key - if (!sortingKey) return [] - - const expressions = extractSortKeyColumns(sortingKey) - if (expressions.length === 0) return [] - - const columnRows = await input.query<{ name?: string; type: string }>( - `SELECT name, type FROM system.columns WHERE database = '${input.database}' AND table = '${input.table}'` - ) - const typeByName = new Map( - columnRows - .filter((row): row is { name: string; type: string } => Boolean(row.name)) - .map((row) => [row.name, row.type]) - ) - const knownColumns = new Set(typeByName.keys()) - - if (knownColumns.size === 0) { - return expressions.flatMap((expression, index) => { - const column = resolveSortKeyColumnWithoutSchema(expression) - const type = columnRows[index]?.type ?? columnRows[0]?.type - if (!column || !type) return [] - - return [{ - column, - type, - category: classifySortKeyType(type), - }] - }) - } - - return expressions.flatMap((expression) => { - const column = resolveSortKeyColumn(expression, knownColumns) - if (!column) return [] - const type = typeByName.get(column) ?? 'String' - return [{ - column, - type, - category: classifySortKeyType(type), - }] - }) -} - -export async function querySortKeyInfo(input: { - database: string - table: string - query: (sql: string) => Promise -}): Promise { - return (await querySortKeys(input))[0] -} - -export async function querySortKeyRanges(input: { - database: string - table: string - sortKeyColumn: string - partitionIds: string[] - query: (sql: string) => Promise -}): Promise> { - if (input.partitionIds.length === 0) return new Map() - - const inList = input.partitionIds.map((id) => `'${id}'`).join(', ') - const rows = await input.query<{ - partition_id: string - min_val: string - max_val: string - }>( - `SELECT _partition_id AS partition_id, toString(min(${input.sortKeyColumn})) AS min_val, toString(max(${input.sortKeyColumn})) AS max_val FROM ${input.database}.${input.table} WHERE _partition_id IN (${inList}) GROUP BY _partition_id SETTINGS select_sequential_consistency = 1` - ) - - const result = new Map() - for (const row of rows) { - result.set(row.partition_id, { min: row.min_val, max: row.max_val }) - } - return result -} - -export async function introspectTable(input: { - database: string - table: string - from?: string - to?: string - query: (sql: string) => Promise -}): Promise<{ partitions: PartitionInfo[]; sortKey?: SortKeyInfo; sortKeys: SortKeyInfo[] }> { - const partitions = await queryPartitionInfo(input) - const sortKeys = await querySortKeys({ - database: input.database, - table: input.table, - query: input.query, - }) - - return { partitions, sortKey: sortKeys[0], sortKeys } -} diff --git a/packages/plugin-backfill/src/chunking/partition-slices.ts b/packages/plugin-backfill/src/chunking/partition-slices.ts new file mode 100644 index 0000000..18099b0 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/partition-slices.ts @@ -0,0 +1,153 @@ +import type { + ChunkEstimate, + EstimateConfidence, + EstimateReason, + Partition, + PartitionSlice, + ChunkDerivationStep, + ChunkRange, +} from './types.js' + +export function buildRootSlice(partition: Partition): PartitionSlice { + return { + partitionId: partition.partitionId, + ranges: [], + estimate: { + rows: partition.rows, + bytesCompressed: partition.bytesCompressed, + bytesUncompressed: partition.bytesUncompressed, + confidence: 'high', + reason: 'partition-metadata', + }, + analysis: { + lineage: [], + }, + } +} + +export function buildSliceEstimate( + partition: Partition, + rows: number, + confidence: EstimateConfidence, + reason: EstimateReason, +): ChunkEstimate { + const bytesCompressed = partition.rows > 0 + ? Math.round((rows / partition.rows) * partition.bytesCompressed) + : 0 + const bytesUncompressed = partition.rows > 0 + ? Math.round((rows / partition.rows) * partition.bytesUncompressed) + : 0 + + return { + rows, + bytesCompressed, + bytesUncompressed, + confidence, + reason, + } +} + +export function buildSliceFromRows( + partition: Partition, + input: { + ranges: ChunkRange[] + rows: number + focusedValue?: PartitionSlice['analysis']['focusedValue'] + confidence: EstimateConfidence + reason: EstimateReason + lineage: ChunkDerivationStep[] + }, +): PartitionSlice { + return { + partitionId: partition.partitionId, + ranges: input.ranges, + estimate: buildSliceEstimate(partition, input.rows, input.confidence, input.reason), + analysis: { + focusedValue: input.focusedValue, + lineage: input.lineage, + }, + } +} + +export function getTargetChunkRows( + partition: Partition, + targetChunkBytes: number, +): number { + if (partition.bytesCompressed <= 0) return partition.rows + return (targetChunkBytes * partition.rows) / partition.bytesCompressed +} + +export function mergeAdjacentSlices( + slices: PartitionSlice[], + targetChunkBytes: number, +): PartitionSlice[] { + if (slices.length <= 1) return slices + + const merged: PartitionSlice[] = [] + let current: PartitionSlice | undefined + + for (const slice of slices) { + if (!current) { + current = slice + continue + } + + const canMerge = + !current.analysis.focusedValue && + !slice.analysis.focusedValue && + haveSameTrailingRanges(current.ranges, slice.ranges) && + current.estimate.bytesCompressed + slice.estimate.bytesCompressed <= targetChunkBytes * 1.1 + + if (!canMerge) { + merged.push(current) + current = slice + continue + } + + current = { + ...current, + ranges: mergeRanges(current.ranges, slice.ranges), + estimate: { + ...current.estimate, + rows: current.estimate.rows + slice.estimate.rows, + bytesCompressed: current.estimate.bytesCompressed + slice.estimate.bytesCompressed, + bytesUncompressed: current.estimate.bytesUncompressed + slice.estimate.bytesUncompressed, + }, + } + } + + if (current) merged.push(current) + return merged +} + +function mergeRanges(left: ChunkRange[], right: ChunkRange[]): ChunkRange[] { + return left.map((leftRange) => { + const rightRange = right.find((candidate) => candidate.dimensionIndex === leftRange.dimensionIndex) + return rightRange === undefined + ? leftRange + : { + dimensionIndex: leftRange.dimensionIndex, + from: leftRange.from, + to: rightRange.to, + } + }) +} + +function haveSameTrailingRanges(left: ChunkRange[], right: ChunkRange[]): boolean { + if (left.length !== right.length) return false + + let differingDimensions = 0 + + for (const leftRange of left) { + const rightRange = right.find((candidate) => candidate.dimensionIndex === leftRange.dimensionIndex) + if (!rightRange) return false + + const same = leftRange.from === rightRange.from && leftRange.to === rightRange.to + if (!same) { + differingDimensions += 1 + if (leftRange.to !== rightRange.from) return false + } + } + + return differingDimensions <= 1 +} diff --git a/packages/plugin-backfill/src/chunking/planner.ts b/packages/plugin-backfill/src/chunking/planner.ts new file mode 100644 index 0000000..116d3a9 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/planner.ts @@ -0,0 +1,358 @@ +import { buildRootSlice, mergeAdjacentSlices } from './partition-slices.js' +import { introspectPartitions, introspectSortKeys } from './services/metadata-source.js' +import { getRowProbeStrategy, getSortKeyRange, parsePlannerDateTime } from './services/row-probe.js' +import { splitSliceWithEqualWidthRanges } from './strategies/equal-width-split.js' +import { buildSingleChunkPartition } from './strategies/metadata-single-chunk.js' +import { + findQuantileBoundaryOnDimension, + splitSliceWithQuantiles, +} from './strategies/quantile-range-split.js' +import { refinePartitionSlices } from './strategies/refinement.js' +import { buildRootStringUpperBound, splitSliceWithStringPrefixes } from './strategies/string-prefix-split.js' +import { splitSliceWithTemporalBuckets } from './strategies/temporal-bucket-split.js' +import { getCandidateDimensions } from './strategy-policy.js' +import type { + Chunk, + ChunkPlan, + GenerateChunkPlanInput, + Partition, + PartitionBuildResult, + PartitionSlice, + PlannerContext, + SortKey, + TableProfile, +} from './types.js' +import { generateChunkId, generatePlanId } from './utils/ids.js' +import { getChunkRange, isExactChunkRange, replaceChunkRange } from './utils/ranges.js' + +const MAX_SPLIT_DEPTH_MULTIPLIER = 3 +const STOP_SPLIT_FUZZ_FACTOR = 1.5 + +export async function generateChunkPlan(input: GenerateChunkPlanInput): Promise { + const context: PlannerContext = { + database: input.database, + table: input.table, + from: input.from, + to: input.to, + targetChunkBytes: input.targetChunkBytes, + query: input.query, + rowProbeStrategy: input.rowProbeStrategy ?? 'count', + } + + const partitions = await introspectPartitions(context) + const sortKeys = await introspectSortKeys(context) + const table: TableProfile = { + database: input.database, + table: input.table, + sortKeys, + } + const planId = generatePlanId() + + const slices: PartitionSlice[] = [] + const plannedPartitions: Partition[] = [] + for (const partition of partitions) { + const result = await planPartition(context, partition, table) + slices.push(...result.slices) + plannedPartitions.push({ + ...partition, + diagnostics: result.diagnostics, + }) + } + + const chunks = assignChunkIds(planId, slices) + const chunkBytes = chunks.map((chunk) => chunk.estimate.bytesCompressed) + + return { + planId, + generatedAt: new Date().toISOString(), + rowProbeStrategy: getRowProbeStrategy(context), + targetChunkBytes: context.targetChunkBytes, + table, + partitions: plannedPartitions, + chunks, + totalRows: partitions.reduce((sum, partition) => sum + partition.rows, 0), + totalBytesCompressed: partitions.reduce((sum, partition) => sum + partition.bytesCompressed, 0), + totalBytesUncompressed: partitions.reduce((sum, partition) => sum + partition.bytesUncompressed, 0), + stats: { + totalPartitions: partitions.length, + oversizedPartitions: partitions.filter((partition) => partition.bytesCompressed > context.targetChunkBytes).length, + focusedChunks: chunks.filter((chunk) => chunk.analysis.focusedValue !== undefined).length, + totalChunks: chunks.length, + avgChunkBytes: chunkBytes.length > 0 + ? Math.round(chunkBytes.reduce((sum, value) => sum + value, 0) / chunkBytes.length) + : 0, + maxChunkBytes: chunkBytes.length > 0 ? Math.max(...chunkBytes) : 0, + minChunkBytes: chunkBytes.length > 0 ? Math.min(...chunkBytes) : 0, + }, + } +} + +async function planPartition( + context: PlannerContext, + partition: Partition, + table: TableProfile, +): Promise { + if (partition.bytesCompressed <= context.targetChunkBytes || table.sortKeys.length === 0) { + return refinePartitionSlices( + context, + partition, + buildSingleChunkPartition(partition), + table.sortKeys, + false + ) + } + + const rootSlice = buildRootSlice(partition) + const splitSlices = await splitSliceRecursively(context, partition, rootSlice, table.sortKeys, 0) + const mergedSlices = mergeAdjacentSlices(splitSlices, context.targetChunkBytes) + const usedDistributionFallback = mergedSlices.some((slice) => + slice.estimate.reason === 'string-prefix-distribution' || + slice.estimate.reason === 'temporal-distribution' || + slice.estimate.reason === 'equal-width-distribution' + ) + + return refinePartitionSlices( + context, + partition, + mergedSlices, + table.sortKeys, + usedDistributionFallback + ) +} + +async function splitSliceRecursively( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + depth: number, +): Promise { + if (slice.estimate.bytesCompressed <= context.targetChunkBytes * STOP_SPLIT_FUZZ_FACTOR) { + return [slice] + } + + if (depth >= sortKeys.length * MAX_SPLIT_DEPTH_MULTIPLIER) { + return [slice] + } + + const children = await splitOversizedSlice(context, partition, slice, sortKeys, depth) + if (children.length <= 1) { + return [slice] + } + + const finalized: PartitionSlice[] = [] + for (const child of children) { + finalized.push(...(await splitSliceRecursively(context, partition, child, sortKeys, depth + 1))) + } + + return finalized +} + +async function splitOversizedSlice( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + depth: number, +): Promise { + const candidateDimensions = getCandidateDimensions(sortKeys, slice) + + for (const dimensionIndex of candidateDimensions) { + const preparedSlice = await hydrateSliceRange(context, slice, sortKeys, dimensionIndex) + if (!preparedSlice) continue + + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) continue + + const rootLike = depth === 0 + const focusedValue = findFocusedValue(preparedSlice, sortKeys) + + if (sortKey.category === 'string') { + const stringSlices = await splitSliceWithStringPrefixes(context, partition, preparedSlice, sortKeys, dimensionIndex) + if (isEffectiveSplit(preparedSlice, stringSlices)) { + return applyFocusedValue(stringSlices, focusedValue) + } + } + + if (sortKey.category === 'datetime' && (!rootLike || focusedValue !== undefined)) { + const temporalSlices = await splitSliceWithTemporalBuckets( + context, + partition, + markFocusedSlice(preparedSlice, focusedValue), + sortKeys, + dimensionIndex + ) + if (isEffectiveSplit(preparedSlice, temporalSlices)) { + return applyFocusedValue(temporalSlices, focusedValue) + } + } + + const rangedSlices = await splitWithRanges(context, partition, preparedSlice, sortKeys, dimensionIndex) + if (isEffectiveSplit(preparedSlice, rangedSlices)) { + return applyFocusedValue(rangedSlices, focusedValue) + } + } + + return [slice] +} + +async function splitWithRanges( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) return [slice] + + const range = getChunkRange(slice, dimensionIndex) + if (range.from === undefined || range.to === undefined) return [slice] + if (sortKey.category === 'string' && isExactChunkRange(range)) return [slice] + + const subCount = Math.ceil(slice.estimate.bytesCompressed / context.targetChunkBytes) + if (subCount <= 1) return [slice] + + const quantileBoundaries = await buildQuantileBoundaries(context, slice, sortKeys, dimensionIndex, subCount) + if (quantileBoundaries) { + return splitSliceWithQuantiles(context, partition, slice, sortKeys, dimensionIndex, quantileBoundaries) + } + + return splitSliceWithEqualWidthRanges( + context, + partition, + slice, + sortKeys, + dimensionIndex, + range.from, + range.to, + subCount + ) +} + +async function buildQuantileBoundaries( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + subCount: number, +): Promise { + const range = getChunkRange(slice, dimensionIndex) + if (range.from === undefined || range.to === undefined) return undefined + + const boundaries: string[] = [range.from] + for (let step = 1; step < subCount; step++) { + const targetCumRows = Math.round((slice.estimate.rows * step) / subCount) + const boundary = await findQuantileBoundaryOnDimension( + context, + slice, + sortKeys, + dimensionIndex, + targetCumRows + ) + boundaries.push(boundary) + } + + const uniqueBoundaryCount = new Set(boundaries).size + if (uniqueBoundaryCount <= Math.max(2, Math.ceil(subCount / 3))) { + return undefined + } + + return boundaries.concat([range.to]) +} + +async function hydrateSliceRange( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, +): Promise { + const existingRange = getChunkRange(slice, dimensionIndex) + if (existingRange.from !== undefined && existingRange.to !== undefined) { + return slice + } + + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) return undefined + + const observedRange = await getSortKeyRange(context, slice.partitionId, slice.ranges, sortKeys, sortKey) + if (!observedRange) return undefined + + return { + ...slice, + ranges: replaceChunkRange( + slice, + dimensionIndex, + observedRange.min, + toExclusiveUpperBound(observedRange.max, sortKey) + ), + } +} + +function toExclusiveUpperBound(value: string, sortKey: SortKey): string { + if (sortKey.category === 'string') { + return buildRootStringUpperBound(value) + } + if (sortKey.category === 'datetime') { + return new Date(parsePlannerDateTime(value) + 1000).toISOString() + } + return String(Number(value) + 1) +} + +function isEffectiveSplit(parentSlice: PartitionSlice, childSlices: PartitionSlice[]): boolean { + if (childSlices.length <= 1) return false + + return childSlices.some((childSlice) => + childSlice.estimate.rows !== parentSlice.estimate.rows || + JSON.stringify(childSlice.ranges) !== JSON.stringify(parentSlice.ranges) + ) +} + +function findFocusedValue( + slice: PartitionSlice, + sortKeys: SortKey[], +): { dimensionIndex: number; value: string } | undefined { + for (const range of slice.ranges) { + const sortKey = sortKeys[range.dimensionIndex] + if (sortKey?.category !== 'string') continue + if (isExactChunkRange(range) && range.from !== undefined) { + return { dimensionIndex: range.dimensionIndex, value: range.from } + } + } + return undefined +} + +function applyFocusedValue( + slices: PartitionSlice[], + focusedValue: { dimensionIndex: number; value: string } | undefined, +): PartitionSlice[] { + if (!focusedValue) return slices + return slices.map((slice) => markFocusedSlice(slice, focusedValue)) +} + +function markFocusedSlice( + slice: PartitionSlice, + focusedValue: { dimensionIndex: number; value: string } | undefined, +): PartitionSlice { + if (!focusedValue) return slice + return { + ...slice, + analysis: { + ...slice.analysis, + focusedValue, + }, + } +} + +function assignChunkIds(planId: string, slices: PartitionSlice[]): Chunk[] { + const chunkIndexes = new Map() + + return slices.map((slice) => { + const currentIndex = chunkIndexes.get(slice.partitionId) ?? 0 + chunkIndexes.set(slice.partitionId, currentIndex + 1) + return { + ...slice, + id: generateChunkId(planId, slice.partitionId, currentIndex), + } + }) +} diff --git a/packages/plugin-backfill/src/chunking/services/distribution-source.ts b/packages/plugin-backfill/src/chunking/services/distribution-source.ts new file mode 100644 index 0000000..98c56ad --- /dev/null +++ b/packages/plugin-backfill/src/chunking/services/distribution-source.ts @@ -0,0 +1,66 @@ +import { buildWhereClauseFromRanges } from '../sql.js' +import type { + ChunkRange, + PlannerContext, + SortKey, + StringPrefixBucket, + TemporalBucket, +} from '../types.js' + +export async function probeStringPrefixDistribution( + context: Pick, + partitionId: string, + ranges: ChunkRange[], + sortKey: SortKey, + dimensionIndex: number, + depth: number, + sortKeys: SortKey[], +): Promise { + const range = ranges.find((candidate) => candidate.dimensionIndex === dimensionIndex) + if (!range?.from || !range.to) return [] + + const rows = await context.query<{ prefix: string; cnt: string }>(` +SELECT + substring(${sortKey.name}, 1, ${depth}) AS prefix, + count() AS cnt +FROM ${context.database}.${context.table} +WHERE ${buildWhereClauseFromRanges(partitionId, ranges, sortKeys)} +GROUP BY prefix +ORDER BY prefix`) + + return rows.map((row) => ({ + value: row.prefix, + rowCount: Number(row.cnt), + isExactValue: Buffer.from(row.prefix, 'latin1').length < depth, + })) +} + +export async function probeTemporalDistribution( + context: Pick, + partitionId: string, + ranges: ChunkRange[], + sortKeys: SortKey[], + dimensionIndex: number, + grain: 'day' | 'hour', +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey || sortKey.category !== 'datetime') return [] + + const bucketExpression = grain === 'day' + ? `toStartOfDay(${sortKey.name})` + : `toStartOfHour(${sortKey.name})` + + const rows = await context.query<{ bucket: string; cnt: string }>(` +SELECT + formatDateTime(${bucketExpression}, '%Y-%m-%dT%H:%i:%sZ') AS bucket, + count() AS cnt +FROM ${context.database}.${context.table} +WHERE ${buildWhereClauseFromRanges(partitionId, ranges, sortKeys)} +GROUP BY bucket +ORDER BY bucket`) + + return rows.map((row) => ({ + start: row.bucket, + rowCount: Number(row.cnt), + })) +} diff --git a/packages/plugin-backfill/src/chunking/services/metadata-source.ts b/packages/plugin-backfill/src/chunking/services/metadata-source.ts new file mode 100644 index 0000000..cea3a4d --- /dev/null +++ b/packages/plugin-backfill/src/chunking/services/metadata-source.ts @@ -0,0 +1,163 @@ +import type { Partition, PlannerContext, SortKey, SortKeyCategory } from '../types.js' + +const NUMERIC_TYPES = /^(U?Int|Float|Decimal)/ +const DATETIME_TYPES = /^(Date|DateTime)/ + +function classifySortKeyType(type: string): SortKeyCategory { + if (NUMERIC_TYPES.test(type)) return 'numeric' + if (DATETIME_TYPES.test(type)) return 'datetime' + return 'string' +} + +function boundaryEncodingForCategory(category: SortKeyCategory): SortKey['boundaryEncoding'] { + return category === 'string' ? 'hex-latin1' : 'literal' +} + +function splitTopLevelCsv(input: string): string[] { + const parts: string[] = [] + let current = '' + let depth = 0 + let quote: '\'' | '"' | undefined + + for (let index = 0; index < input.length; index++) { + const char = input[index] + if (char === undefined) continue + + if (quote) { + current += char + if (char === quote && input[index - 1] !== '\\') quote = undefined + continue + } + + if (char === '\'' || char === '"') { + quote = char + current += char + continue + } + + if (char === '(') { + depth += 1 + current += char + continue + } + + if (char === ')') { + depth = Math.max(0, depth - 1) + current += char + continue + } + + if (char === ',' && depth === 0) { + parts.push(current.trim()) + current = '' + continue + } + + current += char + } + + if (current.trim().length > 0) { + parts.push(current.trim()) + } + + return parts +} + +function resolveSortKeyColumn(expression: string, knownColumns: Set): string | undefined { + const trimmed = expression.trim() + if (knownColumns.has(trimmed)) return trimmed + + const identifiers = Array.from(trimmed.matchAll(/\b[A-Za-z_][A-Za-z0-9_]*\b/g)) + .map((match) => match[0]) + .filter((identifier): identifier is string => Boolean(identifier)) + + const matches = Array.from(new Set(identifiers.filter((identifier) => knownColumns.has(identifier)))) + if (matches.length === 1) return matches[0] + if (knownColumns.size === 0 && identifiers.length > 0) { + return identifiers[identifiers.length - 1] + } + return undefined +} + +export async function introspectPartitions(context: PlannerContext): Promise { + await context.query( + `SELECT 1 FROM ${context.database}.${context.table} LIMIT 1 SETTINGS select_sequential_consistency = 1` + ) + + const rows = await context.query<{ + partition_id: string + total_rows: string + total_bytes: string + total_uncompressed_bytes?: string + min_time: string + max_time: string + }>(`SELECT + partition_id, + toString(sum(rows)) AS total_rows, + toString(sum(bytes_on_disk)) AS total_bytes, + toString(sum(data_uncompressed_bytes)) AS total_uncompressed_bytes, + toString(min(min_time)) AS min_time, + toString(max(max_time)) AS max_time +FROM system.parts +WHERE database = '${context.database}' + AND table = '${context.table}' + AND active = 1 +GROUP BY partition_id +ORDER BY partition_id +SETTINGS select_sequential_consistency = 1`) + + return rows + .map((row) => ({ + partitionId: row.partition_id, + rows: Number(row.total_rows), + bytesCompressed: Number(row.total_bytes), + bytesUncompressed: Number(row.total_uncompressed_bytes ?? row.total_bytes), + minTime: new Date(row.min_time).toISOString(), + maxTime: new Date(row.max_time).toISOString(), + })) + .filter((partition) => { + if (context.from && partition.maxTime < context.from) return false + if (context.to && partition.minTime >= context.to) return false + return true + }) +} + +export async function introspectSortKeys(context: PlannerContext): Promise { + const tableRows = await context.query<{ sorting_key: string }>( + `SELECT sorting_key FROM system.tables WHERE database = '${context.database}' AND name = '${context.table}'` + ) + + const sortingKey = tableRows[0]?.sorting_key + if (!sortingKey) return [] + + const expressions = splitTopLevelCsv(sortingKey) + if (expressions.length === 0) return [] + + const columnRows = await context.query<{ name?: string; type: string }>( + `SELECT name, type FROM system.columns WHERE database = '${context.database}' AND table = '${context.table}'` + ) + + const typeByName = new Map( + columnRows + .filter((row): row is { name: string; type: string } => Boolean(row.name)) + .map((row) => [row.name, row.type]) + ) + + const knownColumns = new Set(typeByName.keys()) + + return expressions.flatMap((expression, index) => { + const column = resolveSortKeyColumn(expression, knownColumns) + const type = column + ? typeByName.get(column) ?? columnRows[index]?.type ?? columnRows[0]?.type + : undefined + if (!column || !type) return [] + + const category = classifySortKeyType(type) + return [{ + name: column, + type, + category, + boundaryEncoding: boundaryEncodingForCategory(category), + }] + }) +} diff --git a/packages/plugin-backfill/src/chunking/services/row-probe.ts b/packages/plugin-backfill/src/chunking/services/row-probe.ts new file mode 100644 index 0000000..989ce32 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/services/row-probe.ts @@ -0,0 +1,102 @@ +import { buildCountSql, buildEstimateSql, buildWhereClauseFromRanges } from '../sql.js' +import type { + ChunkRange, + EstimateFilter, + PlannerContext, + RowProbeStrategy, + SortKey, +} from '../types.js' + +export function getRowProbeStrategy(context: Pick): RowProbeStrategy { + return context.rowProbeStrategy +} + +export async function estimateRows( + context: PlannerContext, + filter: EstimateFilter, + sortKeys: SortKey[], +): Promise { + if (getRowProbeStrategy(context) === 'count') { + return countRowsExact(context, filter, sortKeys) + } + + const rows = await context.query>( + buildEstimateSql(filter, sortKeys, context, getRowProbeStrategy(context)) + ) + + const firstRow = rows[0] + if (!firstRow) return 0 + + for (const [key, value] of Object.entries(firstRow)) { + if (!key.toLowerCase().includes('row')) continue + const parsed = Number(value ?? 0) + if (Number.isFinite(parsed)) return parsed + } + + for (const value of Object.values(firstRow)) { + const parsed = Number(value ?? 0) + if (Number.isFinite(parsed)) return parsed + } + + return 0 +} + +export async function countRowsExact( + context: Pick, + filter: EstimateFilter, + sortKeys: SortKey[], +): Promise { + const rows = await context.query<{ cnt: string }>(buildCountSql(filter, sortKeys, context)) + return Number(rows[0]?.cnt ?? 0) +} + +export async function countRows( + context: Pick, + partitionId: string, + ranges: ChunkRange[], + sortKeys: SortKey[], +): Promise { + const filter: EstimateFilter = { + partitionId, + ranges, + exactDimensionIndex: undefined, + exactValue: undefined, + } + return countRowsExact(context, filter, sortKeys) +} + +export async function countPartitionRows( + context: Pick, + partitionId: string, +): Promise { + const rows = await context.query<{ cnt: string }>( + `SELECT count() AS cnt FROM ${context.database}.${context.table} WHERE _partition_id = '${partitionId}'` + ) + return Number(rows[0]?.cnt ?? 0) +} + +export async function getSortKeyRange( + context: Pick, + partitionId: string, + ranges: ChunkRange[], + sortKeys: SortKey[], + sortKey: SortKey, +): Promise<{ min: string; max: string } | undefined> { + const rows = await context.query<{ minVal: string; maxVal: string }>(` +SELECT + toString(min(${sortKey.name})) AS minVal, + toString(max(${sortKey.name})) AS maxVal +FROM ${context.database}.${context.table} +WHERE ${buildWhereClauseFromRanges(partitionId, ranges, sortKeys)}`) + + if (rows.length === 0) return undefined + return { + min: rows[0]?.minVal ?? '', + max: rows[0]?.maxVal ?? '', + } +} + +export function parsePlannerDateTime(value: string): number { + const normalized = value.includes('T') ? value : value.replace(' ', 'T') + return Date.parse(normalized.endsWith('Z') ? normalized : `${normalized}Z`) +} diff --git a/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts b/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts index e362f50..a1398ec 100644 --- a/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts +++ b/packages/plugin-backfill/src/chunking/smart-chunking.integration.test.ts @@ -1,8 +1,8 @@ import { describe, expect, test } from 'bun:test' import { analyzeAndChunk } from './analyze.js' -import { buildChunkSql } from './sql.js' -import type { SortKeyInfo } from './types.js' +import { buildChunkExecutionSql } from './sql.js' +import type { Chunk, ChunkPlan } from './types.js' const MiB = 1024 ** 2 @@ -35,8 +35,7 @@ function createFixtureQuery(input: { } if (sql.includes('FROM system.parts')) { - const partitions = summarizePartitions(input.rows, bytesPerRow, uncompressedBytesPerRow) - return partitions as T[] + return summarizePartitions(input.rows, bytesPerRow, uncompressedBytesPerRow) as T[] } if (sql.includes('FROM system.tables')) { @@ -75,9 +74,7 @@ function createFixtureQuery(input: { const grouped = new Map() for (const row of filteredRows) { - const bucket = grain === 'day' - ? toStartOfDay(String(row[column])) - : toStartOfHour(String(row[column])) + const bucket = grain === 'day' ? toStartOfDay(String(row[column])) : toStartOfHour(String(row[column])) grouped.set(bucket, (grouped.get(bucket) ?? 0) + 1) } @@ -150,10 +147,10 @@ function evaluateClause(clause: string, row: FixtureRow): boolean { match = clause.match(/^(\w+) < parseDateTimeBestEffort\('([^']+)'\)$/) if (match) return Date.parse(String(row[match[1]])) < Date.parse(match[2]) - match = clause.match(/^(\w+) >= unhex\('([0-9a-f]+)'\)$/i) + match = clause.match(/^(\w+) >= unhex\('([0-9a-f]*)'\)$/i) if (match) return compareLatin1(String(row[match[1]] ?? ''), Buffer.from(match[2], 'hex').toString('latin1')) >= 0 - match = clause.match(/^(\w+) < unhex\('([0-9a-f]+)'\)$/i) + match = clause.match(/^(\w+) < unhex\('([0-9a-f]*)'\)$/i) if (match) return compareLatin1(String(row[match[1]] ?? ''), Buffer.from(match[2], 'hex').toString('latin1')) < 0 match = clause.match(/^(\w+) >= '([^']+)'$/) @@ -182,7 +179,7 @@ function compareValues(left: RowValue, right: RowValue): number { } function formatValueForMinMax(value: RowValue): string { - return typeof value === 'number' ? String(value) : String(value) + return String(value) } function compareLatin1(left: string, right: string): number { @@ -196,21 +193,14 @@ function toStartOfDay(value: string): string { function toStartOfHour(value: string): string { const date = new Date(value) - return new Date(Date.UTC( - date.getUTCFullYear(), - date.getUTCMonth(), - date.getUTCDate(), - date.getUTCHours(), - 0, - 0, - )).toISOString() + return new Date(Date.UTC(date.getUTCFullYear(), date.getUTCMonth(), date.getUTCDate(), date.getUTCHours(), 0, 0)).toISOString() } async function planFixture(input: { rows: FixtureRow[] sortKeys: Array<{ column: string; type: string }> maxChunkBytes: number -}) { +}): Promise { const query = createFixtureQuery({ database: 'app', table: 'events', @@ -221,23 +211,22 @@ async function planFixture(input: { return analyzeAndChunk({ database: 'app', table: 'events', - maxChunkBytes: input.maxChunkBytes, - requireIdempotencyToken: true, + targetChunkBytes: input.maxChunkBytes, query, }) } -function strategyIds(chunk: { lineage?: Array<{ strategyId: string }> }): string[] { - return chunk.lineage?.map((step) => step.strategyId) ?? [] +function strategyIds(chunk: Pick): string[] { + return chunk.analysis.lineage.map((step) => step.strategyId) } -function buildSqlForChunk(chunk: Awaited>['chunks'][number], sortKeys: SortKeyInfo[]) { - return buildChunkSql({ +function buildSqlForChunk(plan: ChunkPlan, chunk: Chunk): string { + return buildChunkExecutionSql({ planId: 'fixture-plan', chunk, target: 'app.events', - sortKey: sortKeys[0], - sortKeys, + sourceTarget: 'app.events', + table: plan.table, }) } @@ -256,15 +245,15 @@ describe('smart chunking integration', () => { id: index, })) - const result = await planFixture({ + const plan = await planFixture({ rows, sortKeys: [{ column: 'id', type: 'UInt64' }], maxChunkBytes: 64 * MiB, }) - expect(result.chunks).toHaveLength(1) - expect(result.chunks[0]?.estimateReason).toBe('partition-metadata') - expect(strategyIds(result.chunks[0] ?? {})).toHaveLength(0) + expect(plan.chunks).toHaveLength(1) + expect(plan.chunks[0]?.estimate.reason).toBe('partition-metadata') + expect(strategyIds(requireChunk(plan.chunks[0], 'metadata chunk'))).toHaveLength(0) }) test('uses quantile range splitting for wide numeric distributions', async () => { @@ -274,15 +263,16 @@ describe('smart chunking integration', () => { id: index, })) - const result = await planFixture({ + const plan = await planFixture({ rows, sortKeys: [{ column: 'id', type: 'UInt64' }], maxChunkBytes: 30 * 1024, }) - expect(result.chunks.length).toBeGreaterThanOrEqual(3) - expect(result.chunks.every((chunk) => strategyIds(chunk).includes('quantile-range-split'))).toBe(true) - const estimatedRows = result.chunks.map((chunk) => chunk.estimatedRows ?? 0) + expect(plan.chunks.length).toBeGreaterThanOrEqual(3) + expect(plan.chunks.every((chunk) => strategyIds(chunk).includes('quantile-range-split'))).toBe(true) + + const estimatedRows = plan.chunks.map((chunk) => chunk.estimate.rows) expect(Math.max(...estimatedRows) - Math.min(...estimatedRows)).toBeLessThanOrEqual(4) }) @@ -293,17 +283,17 @@ describe('smart chunking integration', () => { id: 100 + (index % 2), })) - const result = await planFixture({ + const plan = await planFixture({ rows, sortKeys: [{ column: 'id', type: 'UInt64' }], maxChunkBytes: 20 * 1024, }) - expect(result.chunks.length).toBeGreaterThan(1) - expect(result.chunks.some((chunk) => strategyIds(chunk).includes('equal-width-split'))).toBe(true) - expect(result.chunks.every((chunk) => (chunk.estimatedRows ?? 0) > 0)).toBe(true) - expect(result.chunks.every((chunk) => - chunk.ranges?.every((range) => range.from !== range.to) ?? true + expect(plan.chunks.length).toBeGreaterThan(1) + expect(plan.chunks.some((chunk) => strategyIds(chunk).includes('equal-width-split'))).toBe(true) + expect(plan.chunks.every((chunk) => chunk.estimate.rows > 0)).toBe(true) + expect(plan.chunks.every((chunk) => + chunk.ranges.every((range) => range.from !== range.to) )).toBe(true) }) @@ -319,20 +309,20 @@ describe('smart chunking integration', () => { } } - const result = await planFixture({ + const plan = await planFixture({ rows, sortKeys: [{ column: 'slug', type: 'String' }], maxChunkBytes: 24 * 1024, }) - expect(result.chunks.length).toBeGreaterThan(2) - expect(result.chunks.some((chunk) => strategyIds(chunk).includes('string-prefix-split'))).toBe(true) + expect(plan.chunks.length).toBeGreaterThan(2) + expect(plan.chunks.some((chunk) => strategyIds(chunk).includes('string-prefix-split'))).toBe(true) - const sql = buildSqlForChunk(requireChunk(result.chunks[0], 'string-prefix first chunk'), result.sortKeys) + const sql = buildSqlForChunk(plan, requireChunk(plan.chunks[0], 'string-prefix first chunk')) expect(sql).toContain("unhex('") }) - test('combines string-prefix and temporal splitting for hot-key time windows', async () => { + test('combines string-prefix and temporal splitting for focused time windows', async () => { const rows: FixtureRow[] = [] for (let day = 1; day <= 3; day++) { @@ -355,7 +345,7 @@ describe('smart chunking integration', () => { }) } - const result = await planFixture({ + const plan = await planFixture({ rows, sortKeys: [ { column: 'user_id', type: 'String' }, @@ -364,22 +354,22 @@ describe('smart chunking integration', () => { maxChunkBytes: 18 * 1024, }) - const hotChunks = result.chunks.filter((chunk) => + const hotChunks = plan.chunks.filter((chunk) => strategyIds(chunk).includes('temporal-bucket-split') && - (chunk.ranges?.some((range) => range.dimensionIndex === 0) ?? false) && - (chunk.ranges?.some((range) => range.dimensionIndex === 1) ?? false) + chunk.ranges.some((range) => range.dimensionIndex === 0) && + chunk.ranges.some((range) => range.dimensionIndex === 1) ) expect(hotChunks.length).toBeGreaterThan(0) - expect(hotChunks.every((chunk) => chunk.isHotKey || (chunk.hotKeyValue !== undefined))).toBe(true) + expect(hotChunks.every((chunk) => chunk.analysis.focusedValue?.value === 'hot')).toBe(true) - const sql = buildSqlForChunk(requireChunk(hotChunks[0], 'temporal combo chunk'), result.sortKeys) + const sql = buildSqlForChunk(plan, requireChunk(hotChunks[0], 'temporal combo chunk')) expect(sql).toContain('user_id >=') expect(sql).toContain('event_time >=') expect(sql).toContain('parseDateTimeBestEffort') const temporalRanges = hotChunks - .map((chunk) => chunk.ranges?.find((range) => range.dimensionIndex === 1)) + .map((chunk) => chunk.ranges.find((range) => range.dimensionIndex === 1)) .filter((range): range is NonNullable => Boolean(range)) .sort((left, right) => String(left.from).localeCompare(String(right.from))) @@ -409,7 +399,7 @@ describe('smart chunking integration', () => { }) } - const result = await planFixture({ + const plan = await planFixture({ rows, sortKeys: [ { column: 'account', type: 'String' }, @@ -418,15 +408,15 @@ describe('smart chunking integration', () => { maxChunkBytes: 24 * 1024, }) - const comboChunks = result.chunks.filter((chunk) => + const comboChunks = plan.chunks.filter((chunk) => strategyIds(chunk).includes('quantile-range-split') && - (chunk.ranges?.some((range) => range.dimensionIndex === 0) ?? false) && - (chunk.ranges?.some((range) => range.dimensionIndex === 1) ?? false) + chunk.ranges.some((range) => range.dimensionIndex === 0) && + chunk.ranges.some((range) => range.dimensionIndex === 1) ) expect(comboChunks.length).toBeGreaterThan(0) - const sql = buildSqlForChunk(requireChunk(comboChunks[0], 'numeric combo chunk'), result.sortKeys) + const sql = buildSqlForChunk(plan, requireChunk(comboChunks[0], 'numeric combo chunk')) expect(sql).toContain('account >=') expect(sql).toContain("seq >= '") }) diff --git a/packages/plugin-backfill/src/chunking/splitter.test.ts b/packages/plugin-backfill/src/chunking/splitter.test.ts deleted file mode 100644 index 16f4f3f..0000000 --- a/packages/plugin-backfill/src/chunking/splitter.test.ts +++ /dev/null @@ -1,64 +0,0 @@ -import { describe, expect, test } from 'bun:test' - -import { splitSortKeyRange, stringToUint64, uint64ToString } from './splitter.js' - -describe('splitSortKeyRange', () => { - test('numeric: splits into equal-width ranges', () => { - const ranges = splitSortKeyRange('numeric', '100', '200', 2) - - expect(ranges).toHaveLength(2) - expect(ranges[0]?.from).toBe('100') - expect(ranges[0]?.to).toBe('150') - expect(ranges[1]?.from).toBe('150') - expect(ranges[1]?.to).toBe('201') - }) - - test('datetime: splits into equal-width time ranges', () => { - const ranges = splitSortKeyRange('datetime', '2025-01-01 00:00:00', '2025-01-31 00:00:00', 3) - - expect(ranges).toHaveLength(3) - for (const r of ranges) { - expect(r.from).toBeDefined() - expect(r.to).toBeDefined() - } - }) - - test('string: round-trips through uint64 conversion', () => { - const ranges = splitSortKeyRange('string', 'aaa', 'zzz', 2) - - expect(ranges).toHaveLength(2) - expect(ranges[0]?.from).toBeDefined() - expect(ranges[1]?.to).toBeDefined() - }) -}) - -describe('stringToUint64 / uint64ToString', () => { - test('round-trips short strings', () => { - const original = 'abc' - const n = stringToUint64(original) - const back = uint64ToString(n) - expect(back).toBe(original) - }) - - test('round-trips 8-byte strings', () => { - const original = 'abcdefgh' - const n = stringToUint64(original) - const back = uint64ToString(n) - expect(back).toBe(original) - }) - - test('truncates strings longer than 8 bytes', () => { - const n = stringToUint64('abcdefghijklmnop') - const back = uint64ToString(n) - expect(back).toBe('abcdefgh') - }) - - test('handles embedded zero bytes from arithmetic', () => { - // Simulates a computed intermediate where a middle byte is 0x00 - // e.g. 0x6200000000000001 has zero bytes between 'b' and the trailing 0x01 - const n = 0x6200000000000001n - const result = uint64ToString(n) - expect(result).toBe('b\0\0\0\0\0\0\x01') - expect(result.length).toBe(8) - }) -}) diff --git a/packages/plugin-backfill/src/chunking/splitter.ts b/packages/plugin-backfill/src/chunking/splitter.ts deleted file mode 100644 index c3a3e5c..0000000 --- a/packages/plugin-backfill/src/chunking/splitter.ts +++ /dev/null @@ -1,86 +0,0 @@ -import type { SortKeyInfo } from './types.js' - -export function splitNumericRange(min: number, max: number, count: number): Array<{ from: string; to: string }> { - const span = max - min - const step = span / count - const ranges: Array<{ from: string; to: string }> = [] - for (let i = 0; i < count; i++) { - const from = min + i * step - const to = i === count - 1 ? max + 1 : min + (i + 1) * step - ranges.push({ from: String(from), to: String(to) }) - } - return ranges -} - -export function splitDateTimeRange(min: string, max: string, count: number): Array<{ from: string; to: string }> { - const minMs = new Date(min).getTime() - const maxMs = new Date(max).getTime() - const span = maxMs - minMs - const step = span / count - const ranges: Array<{ from: string; to: string }> = [] - for (let i = 0; i < count; i++) { - const from = new Date(minMs + i * step).toISOString() - const to = i === count - 1 - ? new Date(maxMs + 1).toISOString() - : new Date(minMs + (i + 1) * step).toISOString() - ranges.push({ from, to }) - } - return ranges -} - -export function stringToUint64(s: string): bigint { - let result = 0n - const bytes = Math.min(s.length, 8) - for (let i = 0; i < bytes; i++) { - result = (result << 8n) | BigInt(s.charCodeAt(i)) - } - // Pad remaining bytes with zeros - for (let i = bytes; i < 8; i++) { - result = result << 8n - } - return result -} - -export function uint64ToString(n: bigint): string { - const chars: string[] = [] - for (let i = 7; i >= 0; i--) { - const byte = Number((n >> BigInt(i * 8)) & 0xffn) - chars.push(String.fromCharCode(byte)) - } - // Trim trailing NUL bytes (padding from stringToUint64 for short strings) - let end = chars.length - while (end > 0 && chars[end - 1] === '\0') end-- - return chars.slice(0, end).join('') -} - -export function splitStringRange(min: string, max: string, count: number): Array<{ from: string; to: string }> { - const minVal = stringToUint64(min) - const maxVal = stringToUint64(max) - const span = maxVal - minVal - const step = span / BigInt(count) - const ranges: Array<{ from: string; to: string }> = [] - for (let i = 0; i < count; i++) { - const from = uint64ToString(minVal + BigInt(i) * step) - const to = i === count - 1 - ? uint64ToString(maxVal + 1n) - : uint64ToString(minVal + BigInt(i + 1) * step) - ranges.push({ from, to }) - } - return ranges -} - -export function splitSortKeyRange( - category: SortKeyInfo['category'], - min: string, - max: string, - count: number, -): Array<{ from: string; to: string }> { - switch (category) { - case 'numeric': - return splitNumericRange(Number(min), Number(max), count) - case 'datetime': - return splitDateTimeRange(min, max, count) - case 'string': - return splitStringRange(min, max, count) - } -} diff --git a/packages/plugin-backfill/src/chunking/sql.ts b/packages/plugin-backfill/src/chunking/sql.ts index 0475815..01817f0 100644 --- a/packages/plugin-backfill/src/chunking/sql.ts +++ b/packages/plugin-backfill/src/chunking/sql.ts @@ -1,71 +1,98 @@ -import type { PlannedChunk, SortKeyInfo } from './types.js' - -function buildSettingsClause(token: string): string { - if (token) { - return `SETTINGS async_insert=0, insert_deduplication_token='${token}'` - } - return `SETTINGS async_insert=0` -} - -function quoteSqlString(value: string): string { +import type { + Chunk, + ChunkRange, + EstimateFilter, + PlannerContext, + RowProbeStrategy, + SortKey, + TableProfile, +} from './types.js' + +export function quoteSqlString(value: string): string { return `'${value.replaceAll('\\', '\\\\').replaceAll('\'', '\\\'')}'` } -function formatBound(value: string, sortKey: SortKeyInfo): string { +export function formatBound(value: string, sortKey: SortKey): string { if (sortKey.category === 'datetime') { return `parseDateTimeBestEffort(${quoteSqlString(value)})` } + if (sortKey.category === 'string') { return `unhex('${Buffer.from(value, 'latin1').toString('hex')}')` } + return quoteSqlString(value) } -function buildChunkConditions(chunk: PlannedChunk, sortKeys: SortKeyInfo[]): string[] { - if (chunk.ranges?.length) { - return chunk.ranges.flatMap((range) => { - const sortKey = sortKeys[range.dimensionIndex] - if (!sortKey) return [] +export function buildWhereClauseFromRanges( + partitionId: string, + ranges: ChunkRange[], + sortKeys: SortKey[], +): string { + const conditions = [`_partition_id = ${quoteSqlString(partitionId)}`] - const conditions: string[] = [] - if (range.from !== undefined) { - conditions.push(`${sortKey.column} >= ${formatBound(range.from, sortKey)}`) - } - if (range.to !== undefined) { - conditions.push(`${sortKey.column} < ${formatBound(range.to, sortKey)}`) - } - return conditions - }) + for (const range of ranges) { + const sortKey = sortKeys[range.dimensionIndex] + if (!sortKey) continue + + if (range.from !== undefined) { + conditions.push(`${sortKey.name} >= ${formatBound(range.from, sortKey)}`) + } + if (range.to !== undefined) { + conditions.push(`${sortKey.name} < ${formatBound(range.to, sortKey)}`) + } } - if (chunk.sortKeyFrom !== undefined && chunk.sortKeyTo !== undefined && sortKeys[0]) { - return [ - `${sortKeys[0].column} >= ${formatBound(chunk.sortKeyFrom, sortKeys[0])}`, - `${sortKeys[0].column} < ${formatBound(chunk.sortKeyTo, sortKeys[0])}`, - ] + return conditions.join('\n AND ') +} + +export function buildWhereClauseFromChunk( + chunk: Pick, + table: Pick, +): string { + return buildWhereClauseFromRanges(chunk.partitionId, chunk.ranges, table.sortKeys) +} + +function buildSettingsClause(token: string): string { + if (token) { + return `SETTINGS async_insert=0, insert_deduplication_token='${token}'` } + return 'SETTINGS async_insert=0' +} - return [] +function buildChunkConditions(chunk: Pick, sortKeys: SortKey[]): string[] { + return chunk.ranges.flatMap((range) => { + const sortKey = sortKeys[range.dimensionIndex] + if (!sortKey) return [] + + const conditions: string[] = [] + if (range.from !== undefined) { + conditions.push(`${sortKey.name} >= ${formatBound(range.from, sortKey)}`) + } + if (range.to !== undefined) { + conditions.push(`${sortKey.name} < ${formatBound(range.to, sortKey)}`) + } + return conditions + }) } -export function buildChunkSql(input: { +export function buildChunkExecutionSql(input: { planId: string - chunk: PlannedChunk + chunk: Chunk target: string - sortKey?: SortKeyInfo - sortKeys?: SortKeyInfo[] + table: Pick + sourceTarget?: string mvAsQuery?: string targetColumns?: string[] + idempotencyToken?: string }): string { - const header = `/* chkit backfill plan=${input.planId} chunk=${input.chunk.id} token=${input.chunk.idempotencyToken} */` - const settings = buildSettingsClause(input.chunk.idempotencyToken) - const { chunk } = input - const sortKeys = input.sortKeys ?? (input.sortKey ? [input.sortKey] : []) - const chunkConditions = buildChunkConditions(chunk, sortKeys) + const sourceTarget = input.sourceTarget ?? input.target + const header = `/* chkit backfill plan=${input.planId} chunk=${input.chunk.id} token=${input.idempotencyToken ?? ''} */` + const settings = buildSettingsClause(input.idempotencyToken ?? '') + const chunkConditions = buildChunkConditions(input.chunk, input.table.sortKeys) if (input.mvAsQuery) { - // MV replay: inject partition + sort key filters into the MV's AS query - let filtered = injectPartitionFilter(input.mvAsQuery, chunk.partitionId) + let filtered = injectPartitionFilter(input.mvAsQuery, input.chunk.partitionId) for (const condition of chunkConditions) { filtered = injectWhereCondition(filtered, condition) } @@ -75,13 +102,12 @@ export function buildChunkSql(input: { return [header, `INSERT INTO ${input.target}`, filtered, settings].join('\n') } - // Direct table copy const lines = [ header, `INSERT INTO ${input.target}`, - `SELECT *`, - `FROM ${input.target}`, - `WHERE _partition_id = '${chunk.partitionId}'`, + 'SELECT *', + `FROM ${sourceTarget}`, + `WHERE _partition_id = ${quoteSqlString(input.chunk.partitionId)}`, ] for (const condition of chunkConditions) { @@ -92,28 +118,80 @@ export function buildChunkSql(input: { return lines.join('\n') } -// --- SQL helpers --- +export function buildEstimateSql( + filter: EstimateFilter, + sortKeys: SortKey[], + context: PlannerContext, + rowProbeStrategy: RowProbeStrategy, +): string { + const whereClause = buildWhereClauseFromFilter(filter, sortKeys) + if (rowProbeStrategy === 'count') { + return `SELECT count() AS cnt FROM ${context.database}.${context.table} WHERE ${whereClause}` + } + return `EXPLAIN ESTIMATE SELECT count() FROM ${context.database}.${context.table} WHERE ${whereClause}` +} + +export function buildCountSql( + filter: EstimateFilter, + sortKeys: SortKey[], + context: Pick, +): string { + return `SELECT count() AS cnt FROM ${context.database}.${context.table} WHERE ${buildWhereClauseFromFilter(filter, sortKeys)}` +} + +function buildWhereClauseFromFilter( + filter: EstimateFilter, + sortKeys: SortKey[], +): string { + const conditions = [`_partition_id = ${quoteSqlString(filter.partitionId)}`] + + for (const range of filter.ranges) { + const sortKey = sortKeys[range.dimensionIndex] + if (!sortKey) continue + + if (filter.exactDimensionIndex === range.dimensionIndex && filter.exactValue !== undefined) { + conditions.push(`${sortKey.name} = ${formatBound(filter.exactValue, sortKey)}`) + continue + } + + if (range.from !== undefined) { + conditions.push(`${sortKey.name} >= ${formatBound(range.from, sortKey)}`) + } + if (range.to !== undefined) { + conditions.push(`${sortKey.name} < ${formatBound(range.to, sortKey)}`) + } + } + + return conditions.join(' AND ') +} function injectPartitionFilter(query: string, partitionId: string): string { - const condition = `_partition_id = '${partitionId}'` - return injectWhereCondition(query, condition) + return injectWhereCondition(query, `_partition_id = ${quoteSqlString(partitionId)}`) } export function injectSortKeyFilter( query: string, sortKeyColumn: string, - category: SortKeyInfo['category'], + category: SortKey['category'], from: string, to: string, ): string { let condition: string + if (category === 'datetime') { - condition = `${sortKeyColumn} >= parseDateTimeBestEffort(${quoteSqlString(from)})\n AND ${sortKeyColumn} < parseDateTimeBestEffort(${quoteSqlString(to)})` + condition = + `${sortKeyColumn} >= parseDateTimeBestEffort(${quoteSqlString(from)})\n` + + ` AND ${sortKeyColumn} < parseDateTimeBestEffort(${quoteSqlString(to)})` } else if (category === 'string') { - condition = `${sortKeyColumn} >= unhex('${Buffer.from(from, 'latin1').toString('hex')}')\n AND ${sortKeyColumn} < unhex('${Buffer.from(to, 'latin1').toString('hex')}')` + condition = + `${sortKeyColumn} >= unhex('${Buffer.from(from, 'latin1').toString('hex')}')\n` + + ` AND ${sortKeyColumn} < unhex('${Buffer.from(to, 'latin1').toString('hex')}')` } else { - condition = `${sortKeyColumn} >= ${quoteSqlString(from)}\n AND ${sortKeyColumn} < ${quoteSqlString(to)}` + condition = + `${sortKeyColumn} >= ${quoteSqlString(from)}\n` + + ` AND ${sortKeyColumn} < ${quoteSqlString(to)}` } + return injectWhereCondition(query, condition) } @@ -121,40 +199,51 @@ function injectWhereCondition(query: string, condition: string): string { const trimmed = query.trimEnd() const upper = trimmed.toUpperCase() - interface KWHit { keyword: string; position: number } - const hits: KWHit[] = [] + interface KeywordHit { + keyword: string + position: number + } + + const hits: KeywordHit[] = [] let depth = 0 - for (let i = 0; i < trimmed.length; i++) { - const ch = trimmed[i] - if (ch === '(') { depth++; continue } - if (ch === ')') { depth--; continue } - if (ch === "'") { - i++ - while (i < trimmed.length && trimmed[i] !== "'") { - if (trimmed[i] === '\\') i++ - i++ + for (let index = 0; index < trimmed.length; index++) { + const char = trimmed[index] + if (char === '(') { + depth += 1 + continue + } + if (char === ')') { + depth -= 1 + continue + } + if (char === '\'') { + index += 1 + while (index < trimmed.length && trimmed[index] !== '\'') { + if (trimmed[index] === '\\') index += 1 + index += 1 } continue } if (depth !== 0) continue - - if (i > 0 && /\S/.test(trimmed[i - 1] ?? '')) continue - - const rest = upper.slice(i) - for (const kw of ['WHERE', 'GROUP BY', 'HAVING', 'ORDER BY', 'QUALIFY', 'LIMIT', 'SETTINGS']) { - if (rest.startsWith(kw) && (i + kw.length >= trimmed.length || /\s/.test(trimmed[i + kw.length] ?? ''))) { - hits.push({ keyword: kw, position: i }) + if (index > 0 && /\S/.test(trimmed[index - 1] ?? '')) continue + + const rest = upper.slice(index) + for (const keyword of ['WHERE', 'GROUP BY', 'HAVING', 'ORDER BY', 'QUALIFY', 'LIMIT', 'SETTINGS']) { + if ( + rest.startsWith(keyword) && + (index + keyword.length >= trimmed.length || /\s/.test(trimmed[index + keyword.length] ?? '')) + ) { + hits.push({ keyword, position: index }) break } } } - const whereHit = hits.find(h => h.keyword === 'WHERE') - const trailingKeywords = ['GROUP BY', 'HAVING', 'ORDER BY', 'QUALIFY', 'LIMIT', 'SETTINGS'] + const whereHit = hits.find((hit) => hit.keyword === 'WHERE') const firstTrailing = hits - .filter(h => trailingKeywords.includes(h.keyword)) - .filter(h => !whereHit || h.position > whereHit.position)[0] + .filter((hit) => hit.keyword !== 'WHERE') + .filter((hit) => !whereHit || hit.position > whereHit.position)[0] const insertAt = firstTrailing ? firstTrailing.position : trimmed.length const before = trimmed.slice(0, insertAt).trimEnd() @@ -163,6 +252,7 @@ function injectWhereCondition(query: string, condition: string): string { if (whereHit) { return `${before}\n AND ${condition}${after ? `\n${after}` : ''}` } + return `${before}\nWHERE ${condition}${after ? `\n${after}` : ''}` } @@ -174,57 +264,85 @@ export function rewriteSelectColumns(query: string, targetColumns: string[]): st let fromPos = -1 let depth = 0 - for (let i = 0; i < trimmed.length; i++) { - const ch = trimmed[i] - if (ch === '(') { depth++; continue } - if (ch === ')') { depth--; continue } - if (ch === "'") { - i++ - while (i < trimmed.length && trimmed[i] !== "'") { - if (trimmed[i] === '\\') i++ - i++ + for (let index = 0; index < trimmed.length; index++) { + const char = trimmed[index] + if (char === '(') { + depth += 1 + continue + } + if (char === ')') { + depth -= 1 + continue + } + if (char === '\'') { + index += 1 + while (index < trimmed.length && trimmed[index] !== '\'') { + if (trimmed[index] === '\\') index += 1 + index += 1 } continue } if (depth !== 0) continue - - if (i > 0 && /\S/.test(trimmed[i - 1] ?? '')) continue - - const rest = upper.slice(i) - if (selectPos === -1 && rest.startsWith('SELECT') && (i + 6 >= trimmed.length || /\s/.test(trimmed[i + 6] ?? ''))) { - selectPos = i - } else if (selectPos !== -1 && fromPos === -1 && rest.startsWith('FROM') && (i + 4 >= trimmed.length || /\s/.test(trimmed[i + 4] ?? ''))) { - fromPos = i + if (index > 0 && /\S/.test(trimmed[index - 1] ?? '')) continue + + const rest = upper.slice(index) + if ( + selectPos === -1 && + rest.startsWith('SELECT') && + (index + 6 >= trimmed.length || /\s/.test(trimmed[index + 6] ?? '')) + ) { + selectPos = index + } else if ( + selectPos !== -1 && + fromPos === -1 && + rest.startsWith('FROM') && + (index + 4 >= trimmed.length || /\s/.test(trimmed[index + 4] ?? '')) + ) { + fromPos = index } } if (selectPos === -1 || fromPos === -1) return query - const projStart = selectPos + 6 - const projText = trimmed.slice(projStart, fromPos).trim() + const projectionStart = selectPos + 6 + const rawProjection = trimmed.slice(projectionStart, fromPos).trim() + let projectionPrefix = '' + let projection = rawProjection + + const distinctMatch = rawProjection.match(/^DISTINCT\b\s*/i) + if (distinctMatch) { + projectionPrefix = distinctMatch[0] ?? '' + projection = rawProjection.slice(projectionPrefix.length).trim() + } const items: string[] = [] let itemStart = 0 depth = 0 - for (let i = 0; i < projText.length; i++) { - const ch = projText[i] - if (ch === '(') { depth++; continue } - if (ch === ')') { depth--; continue } - if (ch === "'") { - i++ - while (i < projText.length && projText[i] !== "'") { - if (projText[i] === '\\') i++ - i++ + for (let index = 0; index < projection.length; index++) { + const char = projection[index] + if (char === '(') { + depth += 1 + continue + } + if (char === ')') { + depth -= 1 + continue + } + if (char === '\'') { + index += 1 + while (index < projection.length && projection[index] !== '\'') { + if (projection[index] === '\\') index += 1 + index += 1 } continue } - if (depth === 0 && ch === ',') { - items.push(projText.slice(itemStart, i).trim()) - itemStart = i + 1 + if (depth === 0 && char === ',') { + items.push(projection.slice(itemStart, index).trim()) + itemStart = index + 1 } } - items.push(projText.slice(itemStart).trim()) + items.push(projection.slice(itemStart).trim()) const aliasMap = new Map() for (const item of items) { @@ -232,38 +350,43 @@ export function rewriteSelectColumns(query: string, targetColumns: string[]): st const itemUpper = item.toUpperCase() let asPos = -1 - let d = 0 - - for (let i = 0; i < item.length; i++) { - const ch = item[i] - if (ch === '(') { d++; continue } - if (ch === ')') { d--; continue } - if (ch === "'") { - i++ - while (i < item.length && item[i] !== "'") { - if (item[i] === '\\') i++ - i++ + let itemDepth = 0 + + for (let index = 0; index < item.length; index++) { + const char = item[index] + if (char === '(') { + itemDepth += 1 + continue + } + if (char === ')') { + itemDepth -= 1 + continue + } + if (char === '\'') { + index += 1 + while (index < item.length && item[index] !== '\'') { + if (item[index] === '\\') index += 1 + index += 1 } continue } - if (d !== 0) continue - if (i > 0 && /\S/.test(item[i - 1] ?? '')) continue - - const rest = itemUpper.slice(i) - if (rest.startsWith('AS') && (i + 2 >= item.length || /\s/.test(item[i + 2] ?? ''))) { - asPos = i + if (itemDepth !== 0) continue + if (index > 0 && /\S/.test(item[index - 1] ?? '')) continue + + const rest = itemUpper.slice(index) + if ( + rest.startsWith('AS') && + (index + 2 >= item.length || /\s/.test(item[index + 2] ?? '')) + ) { + asPos = index } } if (asPos !== -1) { - const alias = item.slice(asPos + 2).trim() - aliasMap.set(alias, item) + aliasMap.set(item.slice(asPos + 2).trim(), item) } } - const rewrittenCols = targetColumns.map(col => aliasMap.get(col) ?? col) - - const before = trimmed.slice(0, projStart) - const after = trimmed.slice(fromPos) - return `${before} ${rewrittenCols.join(', ')}\n${after}` + const rewrittenProjection = targetColumns.map((column) => aliasMap.get(column) ?? column) + return `${trimmed.slice(0, projectionStart)} ${projectionPrefix}${rewrittenProjection.join(', ')}\n${trimmed.slice(fromPos)}` } diff --git a/packages/plugin-backfill/src/chunking/strategies/equal-width-split.ts b/packages/plugin-backfill/src/chunking/strategies/equal-width-split.ts new file mode 100644 index 0000000..f98778f --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/equal-width-split.ts @@ -0,0 +1,67 @@ +import { buildSliceFromRows } from '../partition-slices.js' +import { estimateRows } from '../services/row-probe.js' +import type { + Partition, + PartitionSlice, + PlannerContext, + SortKey, +} from '../types.js' +import { replaceChunkRange } from '../utils/ranges.js' +import { buildEvenlySpacedBoundaries } from './quantile-range-split.js' + +export async function splitSliceWithEqualWidthRanges( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + subCount: number, +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) return [slice] + + const boundaries = Array.from( + new Set(buildEvenlySpacedBoundaries(rangeFrom, rangeTo, subCount, sortKey)) + ) + if (boundaries.length <= 2) return [slice] + + const slices: PartitionSlice[] = [] + + for (let index = 0; index < boundaries.length - 1; index++) { + const from = boundaries[index] + const to = boundaries[index + 1] + if (from === undefined || to === undefined || from === to) continue + + const ranges = replaceChunkRange(slice, dimensionIndex, from, to) + const rows = await estimateRows( + context, + { + partitionId: partition.partitionId, + ranges, + }, + sortKeys + ) + if (rows <= 0) continue + + slices.push( + buildSliceFromRows(partition, { + ranges, + rows, + focusedValue: slice.analysis.focusedValue, + confidence: context.rowProbeStrategy === 'count' ? 'exact' : 'low', + reason: context.rowProbeStrategy === 'count' ? 'exact-count' : 'equal-width-distribution', + lineage: slice.analysis.lineage.concat([ + { + strategyId: 'equal-width-split', + dimensionIndex, + reason: 'fallback to equal-width ranges', + }, + ]), + }) + ) + } + + return slices.length > 0 ? slices : [slice] +} diff --git a/packages/plugin-backfill/src/chunking/strategies/metadata-single-chunk.ts b/packages/plugin-backfill/src/chunking/strategies/metadata-single-chunk.ts new file mode 100644 index 0000000..86dca54 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/metadata-single-chunk.ts @@ -0,0 +1,6 @@ +import { buildRootSlice } from '../partition-slices.js' +import type { Partition, PartitionSlice } from '../types.js' + +export function buildSingleChunkPartition(partition: Partition): PartitionSlice[] { + return [buildRootSlice(partition)] +} diff --git a/packages/plugin-backfill/src/chunking/strategies/quantile-range-split.ts b/packages/plugin-backfill/src/chunking/strategies/quantile-range-split.ts new file mode 100644 index 0000000..4c586a1 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/quantile-range-split.ts @@ -0,0 +1,208 @@ +import { buildSliceFromRows } from '../partition-slices.js' +import { estimateRows, parsePlannerDateTime } from '../services/row-probe.js' +import type { + Partition, + PartitionSlice, + PlannerContext, + SortKey, +} from '../types.js' +import { bigIntToStr, strToBigInt } from '../utils/binary-string.js' +import { getChunkRange, replaceChunkRange } from '../utils/ranges.js' + +const BINARY_SEARCH_STEPS = 24 + +export async function splitSliceWithQuantiles( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + boundaries: string[], +): Promise { + const slices: PartitionSlice[] = [] + + for (let index = 0; index < boundaries.length - 1; index++) { + const from = boundaries[index] + const to = boundaries[index + 1] + if (from === undefined || to === undefined || from === to) continue + + const ranges = replaceChunkRange(slice, dimensionIndex, from, to) + const rows = await estimateRows( + context, + { + partitionId: partition.partitionId, + ranges, + }, + sortKeys + ) + if (rows <= 0) continue + + slices.push( + buildSliceFromRows(partition, { + ranges, + rows, + focusedValue: slice.analysis.focusedValue, + confidence: context.rowProbeStrategy === 'count' ? 'exact' : 'high', + reason: context.rowProbeStrategy === 'count' ? 'exact-count' : 'quantile-estimate', + lineage: slice.analysis.lineage.concat([ + { + strategyId: 'quantile-range-split', + dimensionIndex, + reason: 'split slice into quantile-aligned ranges', + }, + ]), + }) + ) + } + + return slices +} + +export async function findQuantileBoundaryOnDimension( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + targetCumRows: number, +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) { + throw new Error(`Missing sort key at dimension ${dimensionIndex}`) + } + + const range = getChunkRange(slice, dimensionIndex) + if (range.from === undefined || range.to === undefined) { + throw new Error(`Missing range for quantile split on dimension ${dimensionIndex}`) + } + + if (sortKey.category === 'string') { + return findStringBoundary(context, slice, sortKeys, dimensionIndex, range.from, range.to, targetCumRows) + } + if (sortKey.category === 'datetime') { + return findDateTimeBoundary(context, slice, sortKeys, dimensionIndex, range.from, range.to, targetCumRows) + } + return findNumericBoundary(context, slice, sortKeys, dimensionIndex, range.from, range.to, targetCumRows) +} + +export function buildEvenlySpacedBoundaries( + rangeFrom: string, + rangeTo: string, + subCount: number, + sortKey: SortKey, +): string[] { + if (subCount <= 1) return [rangeFrom, rangeTo] + + if (sortKey.category === 'datetime') { + const start = parsePlannerDateTime(rangeFrom) + const end = parsePlannerDateTime(rangeTo) + return Array.from({ length: subCount + 1 }, (_, index) => + new Date(start + Math.floor(((end - start) * index) / subCount)).toISOString() + ) + } + + if (sortKey.category === 'numeric') { + const start = Number(rangeFrom) + const end = Number(rangeTo) + return Array.from({ length: subCount + 1 }, (_, index) => + String(start + Math.floor(((end - start) * index) / subCount)) + ) + } + + const start = strToBigInt(rangeFrom, 8) + const end = strToBigInt(rangeTo, 8) + return Array.from({ length: subCount + 1 }, (_, index) => + bigIntToStr(start + ((end - start) * BigInt(index)) / BigInt(subCount), 8) + ) +} + +async function findStringBoundary( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + targetCumRows: number, +): Promise { + let low = strToBigInt(rangeFrom, 8) + let high = strToBigInt(rangeTo, 8) + + for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { + const midpoint = (low + high) / 2n + if (midpoint === low || midpoint === high) break + + const mid = bigIntToStr(midpoint, 8) + const rows = await estimateRowsUntil(context, slice, sortKeys, dimensionIndex, rangeFrom, mid) + if (rows < targetCumRows) low = midpoint + else high = midpoint + } + + return bigIntToStr((low + high) / 2n, 8) +} + +async function findDateTimeBoundary( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + targetCumRows: number, +): Promise { + let low = parsePlannerDateTime(rangeFrom) + let high = parsePlannerDateTime(rangeTo) + + for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { + const midpoint = Math.floor((low + high) / 2) + if (midpoint === low || midpoint === high) break + + const mid = new Date(midpoint).toISOString() + const rows = await estimateRowsUntil(context, slice, sortKeys, dimensionIndex, rangeFrom, mid) + if (rows < targetCumRows) low = midpoint + else high = midpoint + } + + return new Date(Math.floor((low + high) / 2)).toISOString() +} + +async function findNumericBoundary( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + targetCumRows: number, +): Promise { + let low = Number(rangeFrom) + let high = Number(rangeTo) + + for (let step = 0; step < BINARY_SEARCH_STEPS; step++) { + const midpoint = Math.floor((low + high) / 2) + if (midpoint === low || midpoint === high) break + + const rows = await estimateRowsUntil(context, slice, sortKeys, dimensionIndex, rangeFrom, String(midpoint)) + if (rows < targetCumRows) low = midpoint + else high = midpoint + } + + return String(Math.floor((low + high) / 2)) +} + +async function estimateRowsUntil( + context: PlannerContext, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, +): Promise { + return estimateRows( + context, + { + partitionId: slice.partitionId, + ranges: replaceChunkRange(slice, dimensionIndex, rangeFrom, rangeTo), + }, + sortKeys + ) +} diff --git a/packages/plugin-backfill/src/chunking/strategies/refinement.ts b/packages/plugin-backfill/src/chunking/strategies/refinement.ts new file mode 100644 index 0000000..4d27c2a --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/refinement.ts @@ -0,0 +1,128 @@ +import { buildSliceEstimate } from '../partition-slices.js' +import { countRowsExact, getRowProbeStrategy } from '../services/row-probe.js' +import type { + Partition, + PartitionBuildResult, + PartitionDiagnostics, + PartitionSlice, + PlannerContext, + SortKey, +} from '../types.js' + +const ESTIMATE_RATIO_MIN = 0.7 +const ESTIMATE_RATIO_MAX = 1.3 + +export async function refinePartitionSlices( + context: PlannerContext, + partition: Partition, + slices: PartitionSlice[], + sortKeys: SortKey[], + usedDistributionFallback: boolean, +): Promise { + let workingSlices = slices + let usedLowConfidenceChunkRefinement = false + + if (slices.some((slice) => slice.estimate.confidence === 'low')) { + workingSlices = await refineLowConfidenceSlices(context, partition, slices, sortKeys) + usedLowConfidenceChunkRefinement = true + } + + const diagnostics = buildPartitionDiagnostics( + partition, + workingSlices, + usedDistributionFallback, + usedLowConfidenceChunkRefinement, + false + ) + + if ( + getRowProbeStrategy(context) !== 'explain-estimate' || + !diagnostics.suspiciousEstimate + ) { + return { slices: workingSlices, diagnostics } + } + + const refinedSlices = await refineAllSlices(context, partition, workingSlices, sortKeys) + return { + slices: refinedSlices, + diagnostics: buildPartitionDiagnostics( + partition, + refinedSlices, + usedDistributionFallback, + usedLowConfidenceChunkRefinement, + true + ), + } +} + +export function buildPartitionDiagnostics( + partition: Partition, + slices: PartitionSlice[], + usedDistributionFallback: boolean, + usedLowConfidenceChunkRefinement: boolean, + usedExactCountFallback: boolean, +): PartitionDiagnostics { + const estimatedRowSum = slices.reduce((sum, slice) => sum + slice.estimate.rows, 0) + const estimateToExactRatio = partition.rows > 0 ? estimatedRowSum / partition.rows : 1 + + return { + estimatedRowSum, + exactPartitionRows: partition.rows, + estimateToExactRatio, + suspiciousEstimate: + estimateToExactRatio < ESTIMATE_RATIO_MIN || estimateToExactRatio > ESTIMATE_RATIO_MAX, + lowConfidenceChunkCount: slices.filter((slice) => slice.estimate.confidence === 'low').length, + usedDistributionFallback, + usedLowConfidenceChunkRefinement, + usedExactCountFallback, + } +} + +async function refineLowConfidenceSlices( + context: PlannerContext, + partition: Partition, + slices: PartitionSlice[], + sortKeys: SortKey[], +): Promise { + const refined: PartitionSlice[] = [] + + for (const slice of slices) { + if (slice.estimate.confidence !== 'low') { + refined.push(slice) + continue + } + refined.push(await refineSlice(context, partition, slice, sortKeys)) + } + + return refined +} + +async function refineAllSlices( + context: PlannerContext, + partition: Partition, + slices: PartitionSlice[], + sortKeys: SortKey[], +): Promise { + return Promise.all(slices.map((slice) => refineSlice(context, partition, slice, sortKeys))) +} + +async function refineSlice( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], +): Promise { + const rows = await countRowsExact( + context, + { + partitionId: partition.partitionId, + ranges: slice.ranges, + }, + sortKeys + ) + + return { + ...slice, + estimate: buildSliceEstimate(partition, rows, 'exact', 'exact-count'), + } +} diff --git a/packages/plugin-backfill/src/chunking/strategies/string-prefix-split.ts b/packages/plugin-backfill/src/chunking/strategies/string-prefix-split.ts new file mode 100644 index 0000000..bed8d57 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/string-prefix-split.ts @@ -0,0 +1,144 @@ +import { buildSliceFromRows } from '../partition-slices.js' +import { probeStringPrefixDistribution } from '../services/distribution-source.js' +import type { + Partition, + PartitionSlice, + PlannerContext, + SortKey, + StringPrefixBucket, +} from '../types.js' +import { + buildObservedStringUpperBound, + maxBinaryString, + minBinaryString, + nextPrefixValue, +} from '../utils/binary-string.js' +import { getChunkRange, replaceChunkRange } from '../utils/ranges.js' + +const TARGET_BYTES_FUZZ_FACTOR = 1.15 +const PREFIX_START_DEPTH = 1 +const PREFIX_MAX_DEPTH = 4 + +export async function splitSliceWithStringPrefixes( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey || sortKey.category !== 'string') return [] + + const range = getChunkRange(slice, dimensionIndex) + if (range.from === undefined || range.to === undefined) return [] + + return buildPrefixSlices( + context, + partition, + slice, + sortKeys, + dimensionIndex, + range.from, + range.to, + PREFIX_START_DEPTH + ) +} + +export function buildRootStringUpperBound(maxValue: string): string { + return buildObservedStringUpperBound(maxValue) +} + +async function buildPrefixSlices( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + depth: number, +): Promise { + const sortKey = sortKeys[dimensionIndex] + if (!sortKey) return [] + + const buckets = await probeStringPrefixDistribution( + context, + partition.partitionId, + replaceChunkRange(slice, dimensionIndex, rangeFrom, rangeTo), + sortKey, + dimensionIndex, + depth, + sortKeys + ) + + const slices: PartitionSlice[] = [] + for (const bucket of buckets) { + if (bucket.rowCount <= 0) continue + + const bucketSlice = buildBucketSlice(partition, slice, dimensionIndex, rangeFrom, rangeTo, bucket) + if (!bucketSlice) continue + + if (bucketSlice.estimate.bytesCompressed <= context.targetChunkBytes * TARGET_BYTES_FUZZ_FACTOR) { + slices.push(bucketSlice) + continue + } + + if (!bucket.isExactValue && depth < PREFIX_MAX_DEPTH) { + const bucketRange = getChunkRange(bucketSlice, dimensionIndex) + if (bucketRange.from !== undefined && bucketRange.to !== undefined) { + slices.push( + ...(await buildPrefixSlices( + context, + partition, + slice, + sortKeys, + dimensionIndex, + bucketRange.from, + bucketRange.to, + depth + 1 + )) + ) + continue + } + } + + slices.push(bucketSlice) + } + + return slices +} + +function buildBucketSlice( + partition: Partition, + parentSlice: PartitionSlice, + dimensionIndex: number, + rangeFrom: string, + rangeTo: string, + bucket: StringPrefixBucket, +): PartitionSlice | undefined { + const bucketFrom = maxBinaryString(rangeFrom, bucket.value) + const bucketUpper = bucket.isExactValue ? `${bucket.value}\0` : nextPrefixValue(bucket.value) + if (bucketUpper === undefined) return undefined + + const bucketTo = minBinaryString(rangeTo, bucketUpper) + if (bucketFrom === bucketTo) return undefined + + const focusedValue = bucket.isExactValue + ? { dimensionIndex, value: bucket.value } + : parentSlice.analysis.focusedValue + + return buildSliceFromRows(partition, { + ranges: replaceChunkRange(parentSlice, dimensionIndex, bucketFrom, bucketTo), + rows: bucket.rowCount, + focusedValue, + confidence: 'high', + reason: 'string-prefix-distribution', + lineage: parentSlice.analysis.lineage.concat([ + { + strategyId: 'string-prefix-split', + dimensionIndex, + reason: 'split slice using string prefix distribution', + }, + ]), + }) +} diff --git a/packages/plugin-backfill/src/chunking/strategies/temporal-bucket-split.ts b/packages/plugin-backfill/src/chunking/strategies/temporal-bucket-split.ts new file mode 100644 index 0000000..b8c16db --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategies/temporal-bucket-split.ts @@ -0,0 +1,117 @@ +import { buildSliceFromRows, getTargetChunkRows } from '../partition-slices.js' +import { probeTemporalDistribution } from '../services/distribution-source.js' +import { parsePlannerDateTime } from '../services/row-probe.js' +import type { + Partition, + PartitionSlice, + PlannerContext, + SortKey, + TemporalBucket, +} from '../types.js' +import { getChunkRange, replaceChunkRange } from '../utils/ranges.js' + +const TARGET_BYTES_FUZZ_FACTOR = 1.15 + +export async function splitSliceWithTemporalBuckets( + context: PlannerContext, + partition: Partition, + slice: PartitionSlice, + sortKeys: SortKey[], + dimensionIndex: number, +): Promise { + const dayBuckets = await probeTemporalDistribution( + context, + partition.partitionId, + slice.ranges, + sortKeys, + dimensionIndex, + 'day' + ) + if (dayBuckets.length === 0) return [slice] + + const daySlices = buildTemporalSlices(partition, slice, dimensionIndex, dayBuckets, context.targetChunkBytes) + if (daySlices.every((candidate) => candidate.estimate.bytesCompressed <= context.targetChunkBytes * TARGET_BYTES_FUZZ_FACTOR)) { + return daySlices + } + + const hourBuckets = await probeTemporalDistribution( + context, + partition.partitionId, + slice.ranges, + sortKeys, + dimensionIndex, + 'hour' + ) + if (hourBuckets.length === 0) return daySlices + + return buildTemporalSlices(partition, slice, dimensionIndex, hourBuckets, context.targetChunkBytes) +} + +export function getPartitionEndExclusive(partition: Partition): string { + return new Date(parsePlannerDateTime(partition.maxTime) + 1000).toISOString() +} + +function buildTemporalSlices( + partition: Partition, + parentSlice: PartitionSlice, + dimensionIndex: number, + buckets: TemporalBucket[], + targetChunkBytes: number, +): PartitionSlice[] { + const targetChunkRows = getTargetChunkRows(partition, targetChunkBytes) + const slices: PartitionSlice[] = [] + let currentStart: string | undefined + let currentRows = 0 + const parentRange = getChunkRange(parentSlice, dimensionIndex) + const sliceStart = parentRange.from + const sliceEnd = parentRange.to ?? getPartitionEndExclusive(partition) + + for (let index = 0; index < buckets.length; index++) { + const bucket = buckets[index] + if (!bucket) continue + + const bucketStart = sliceStart && bucket.start < sliceStart ? sliceStart : bucket.start + if (currentStart === undefined) { + currentStart = bucketStart + } + + const wouldExceed = currentRows > 0 && currentRows + bucket.rowCount > targetChunkRows * TARGET_BYTES_FUZZ_FACTOR + if (wouldExceed && currentStart !== undefined && currentStart < bucketStart) { + slices.push(buildSlice(parentSlice, partition, dimensionIndex, currentStart, bucketStart, currentRows)) + currentStart = bucketStart + currentRows = 0 + } + + currentRows += bucket.rowCount + + if (index === buckets.length - 1 && currentStart !== undefined && currentStart < sliceEnd) { + slices.push(buildSlice(parentSlice, partition, dimensionIndex, currentStart, sliceEnd, currentRows)) + } + } + + return slices.length > 0 ? slices : [parentSlice] +} + +function buildSlice( + parentSlice: PartitionSlice, + partition: Partition, + dimensionIndex: number, + from: string, + to: string, + rows: number, +): PartitionSlice { + return buildSliceFromRows(partition, { + ranges: replaceChunkRange(parentSlice, dimensionIndex, from, to), + rows, + focusedValue: parentSlice.analysis.focusedValue, + confidence: 'low', + reason: 'temporal-distribution', + lineage: parentSlice.analysis.lineage.concat([ + { + strategyId: 'temporal-bucket-split', + dimensionIndex, + reason: 'split slice using temporal distribution buckets', + }, + ]), + }) +} diff --git a/packages/plugin-backfill/src/chunking/strategy-policy.test.ts b/packages/plugin-backfill/src/chunking/strategy-policy.test.ts new file mode 100644 index 0000000..04880b7 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategy-policy.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, test } from 'bun:test' + +import { getCandidateDimensions } from './strategy-policy.js' + +describe('getCandidateDimensions', () => { + test('preserves declared sort-key order regardless of type', () => { + expect(getCandidateDimensions([ + { name: 'event_time', type: 'DateTime', category: 'datetime', boundaryEncoding: 'literal' }, + { name: 'account_id', type: 'String', category: 'string', boundaryEncoding: 'hex-latin1' }, + { name: 'seq', type: 'UInt64', category: 'numeric', boundaryEncoding: 'literal' }, + ])).toEqual([0, 1, 2]) + }) +}) diff --git a/packages/plugin-backfill/src/chunking/strategy-policy.ts b/packages/plugin-backfill/src/chunking/strategy-policy.ts new file mode 100644 index 0000000..0b1a4d1 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/strategy-policy.ts @@ -0,0 +1,8 @@ +import type { PartitionSlice, SortKey } from './types.js' + +export function getCandidateDimensions( + sortKeys: SortKey[], + _slice?: PartitionSlice, +): number[] { + return sortKeys.map((_, index) => index) +} diff --git a/packages/plugin-backfill/src/chunking/types.ts b/packages/plugin-backfill/src/chunking/types.ts index 220da04..f45b7fe 100644 --- a/packages/plugin-backfill/src/chunking/types.ts +++ b/packages/plugin-backfill/src/chunking/types.ts @@ -1,76 +1,65 @@ -export interface PartitionInfo { - partitionId: string - rows: number - bytesOnDisk: number - bytesUncompressed?: number - minTime: string - maxTime: string -} +export type RowProbeStrategy = 'explain-estimate' | 'count' + +export type SortKeyCategory = 'numeric' | 'datetime' | 'string' + +export type SortKeyBoundaryEncoding = 'literal' | 'hex-latin1' + +export type EstimateConfidence = 'high' | 'low' | 'exact' + +export type EstimateReason = + | 'partition-metadata' + | 'quantile-estimate' + | 'string-prefix-distribution' + | 'temporal-distribution' + | 'equal-width-distribution' + | 'exact-count' -export interface SortKeyInfo { - column: string +export interface SortKey { + name: string type: string - category: 'numeric' | 'datetime' | 'string' + category: SortKeyCategory + boundaryEncoding: SortKeyBoundaryEncoding } -export interface SliceRange { +export interface ChunkRange { dimensionIndex: number from?: string to?: string } -export interface SliceLineageStep { +export interface ChunkDerivationStep { strategyId: string dimensionIndex?: number reason: string } -export type EstimateConfidence = 'high' | 'low' | 'exact' +export interface ChunkEstimate { + rows: number + bytesCompressed: number + bytesUncompressed: number + confidence: EstimateConfidence + reason: EstimateReason +} -export type EstimateReason = - | 'partition-metadata' - | 'quantile-estimate' - | 'string-prefix-distribution' - | 'temporal-distribution' - | 'equal-width-distribution' - | 'exact-count' +export interface FocusedValue { + dimensionIndex: number + value: string +} -export interface ChunkBoundary { - partitionId: string - ranges?: SliceRange[] - sortKeyFrom?: string - sortKeyTo?: string - estimatedBytes: number - estimatedRows?: number - isHotKey?: boolean - hotDimensionIndex?: number - hotKeyValue?: string - estimateConfidence?: EstimateConfidence - estimateReason?: EstimateReason - lineage?: SliceLineageStep[] -} - -export interface PlannedChunk { +export interface ChunkAnalysis { + focusedValue?: FocusedValue + lineage: ChunkDerivationStep[] +} + +export interface Chunk { id: string partitionId: string - ranges?: SliceRange[] - sortKeyFrom?: string - sortKeyTo?: string - estimatedBytes: number - estimatedRows?: number - idempotencyToken: string - from: string - to: string - isHotKey?: boolean - hotDimensionIndex?: number - hotKeyValue?: string - estimateConfidence?: EstimateConfidence - estimateReason?: EstimateReason - lineage?: SliceLineageStep[] + ranges: ChunkRange[] + estimate: ChunkEstimate + analysis: ChunkAnalysis } export interface PartitionDiagnostics { - partitionId: string estimatedRowSum: number exactPartitionRows: number estimateToExactRatio: number @@ -80,3 +69,99 @@ export interface PartitionDiagnostics { usedLowConfidenceChunkRefinement: boolean usedExactCountFallback: boolean } + +export interface Partition { + partitionId: string + rows: number + bytesCompressed: number + bytesUncompressed: number + minTime: string + maxTime: string + diagnostics?: PartitionDiagnostics +} + +export interface TableProfile { + database: string + table: string + sortKeys: SortKey[] +} + +export interface ChunkPlanStats { + totalPartitions: number + oversizedPartitions: number + focusedChunks: number + totalChunks: number + avgChunkBytes: number + maxChunkBytes: number + minChunkBytes: number +} + +export interface ChunkPlan { + planId: string + generatedAt: string + rowProbeStrategy: RowProbeStrategy + targetChunkBytes: number + table: TableProfile + partitions: Partition[] + chunks: Chunk[] + totalRows: number + totalBytesCompressed: number + totalBytesUncompressed: number + stats: ChunkPlanStats +} + +export type PlannerQuery = (sql: string) => Promise + +export interface PlannerContext { + database: string + table: string + from?: string + to?: string + targetChunkBytes: number + query: PlannerQuery + rowProbeStrategy: RowProbeStrategy +} + +export interface EstimateFilter { + partitionId: string + ranges: ChunkRange[] + exactDimensionIndex?: number + exactValue?: string +} + +export interface StringPrefixBucket { + value: string + rowCount: number + isExactValue: boolean +} + +export interface TemporalBucket { + start: string + rowCount: number +} + +export interface PartitionSlice { + partitionId: string + ranges: ChunkRange[] + estimate: ChunkEstimate + analysis: ChunkAnalysis +} + +export interface PartitionBuildResult { + slices: PartitionSlice[] + diagnostics: PartitionDiagnostics +} + +export interface PlanChunkOptions { + requireIdempotencyToken: boolean +} + +export interface GenerateChunkPlanInput { + database: string + table: string + from?: string + to?: string + targetChunkBytes: number + query: PlannerQuery + rowProbeStrategy?: RowProbeStrategy +} diff --git a/packages/plugin-backfill/src/chunking/utils/binary-string.ts b/packages/plugin-backfill/src/chunking/utils/binary-string.ts new file mode 100644 index 0000000..51fb8d6 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/utils/binary-string.ts @@ -0,0 +1,55 @@ +export function compareBinaryStrings(left: string, right: string): number { + return Buffer.from(left, 'latin1').compare(Buffer.from(right, 'latin1')) +} + +export function minBinaryString(left: string, right: string): string { + return compareBinaryStrings(left, right) <= 0 ? left : right +} + +export function maxBinaryString(left: string, right: string): string { + return compareBinaryStrings(left, right) >= 0 ? left : right +} + +export function nextPrefixValue(prefix: string): string | undefined { + if (prefix.length === 0) return undefined + + const buffer = Buffer.from(prefix, 'latin1') + for (let index = buffer.length - 1; index >= 0; index--) { + const byte = buffer[index] + if (byte === undefined || byte === 0xff) continue + + const next = Buffer.from(buffer.subarray(0, index + 1)) + next[index] = byte + 1 + return next.toString('latin1') + } + + return undefined +} + +export function buildObservedStringUpperBound(maxValue: string): string { + return `${maxValue}\0` +} + +export function strToBigInt(value: string, padTo: number): bigint { + const buffer = Buffer.from(value, 'latin1') + let result = 0n + + for (let index = 0; index < padTo; index++) { + const byte = index < buffer.length ? (buffer[index] ?? 0) : 0 + result = (result << 8n) | BigInt(byte) + } + + return result +} + +export function bigIntToStr(value: bigint, length: number): string { + const buffer = Buffer.alloc(length) + let remaining = value + + for (let index = length - 1; index >= 0; index--) { + buffer[index] = Number(remaining & 0xffn) + remaining >>= 8n + } + + return buffer.toString('latin1') +} diff --git a/packages/plugin-backfill/src/chunking/utils/ids.ts b/packages/plugin-backfill/src/chunking/utils/ids.ts new file mode 100644 index 0000000..32c0ff3 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/utils/ids.ts @@ -0,0 +1,17 @@ +import { hashId, randomPlanId } from '../../state.js' + +export function generatePlanId(): string { + return randomPlanId() +} + +export function generateChunkId( + planId: string, + partitionId: string, + index: number, +): string { + return hashId(`chunk:${planId}:${partitionId}:${index}`).slice(0, 16) +} + +export function generateIdempotencyToken(planId: string, chunkId: string): string { + return hashId(`token:${planId}:${chunkId}`) +} diff --git a/packages/plugin-backfill/src/chunking/utils/ranges.ts b/packages/plugin-backfill/src/chunking/utils/ranges.ts new file mode 100644 index 0000000..3af1571 --- /dev/null +++ b/packages/plugin-backfill/src/chunking/utils/ranges.ts @@ -0,0 +1,31 @@ +import type { ChunkRange, PartitionSlice } from '../types.js' + +export function getChunkRange( + slice: Pick, + dimensionIndex: number, +): ChunkRange { + return ( + slice.ranges.find((range) => range.dimensionIndex === dimensionIndex) ?? { + dimensionIndex, + from: undefined, + to: undefined, + } + ) +} + +export function replaceChunkRange( + slice: Pick, + dimensionIndex: number, + from: string | undefined, + to: string | undefined, +): ChunkRange[] { + return slice.ranges + .filter((range) => range.dimensionIndex !== dimensionIndex) + .concat([{ dimensionIndex, from, to }]) + .sort((left, right) => left.dimensionIndex - right.dimensionIndex) +} + +export function isExactChunkRange(range: Pick): boolean { + if (range.from === undefined || range.to === undefined) return false + return range.to === `${range.from}\0` +} diff --git a/packages/plugin-backfill/src/partition-planner.test.ts b/packages/plugin-backfill/src/partition-planner.test.ts deleted file mode 100644 index b10c8da..0000000 --- a/packages/plugin-backfill/src/partition-planner.test.ts +++ /dev/null @@ -1,185 +0,0 @@ -import { describe, expect, test } from 'bun:test' - -import { buildChunkBoundaries } from './chunking/build.js' -import { buildChunkSql } from './chunking/sql.js' -import { buildPlannedChunks } from './chunking/analyze.js' -import type { PartitionInfo, SortKeyInfo } from './types.js' - -const GiB = 1024 ** 3 - -function buildChunksWithSql(input: { - planId: string - target: string - partitions: PartitionInfo[] - maxChunkBytes: number - sortKey?: SortKeyInfo - sortKeyRanges?: Map - requireIdempotencyToken: boolean - mvAsQuery?: string - targetColumns?: string[] -}) { - const boundaries = buildChunkBoundaries({ - partitions: input.partitions, - maxChunkBytes: input.maxChunkBytes, - sortKey: input.sortKey, - sortKeyRanges: input.sortKeyRanges, - }) - - const planned = buildPlannedChunks({ - planId: input.planId, - partitions: input.partitions, - boundaries, - requireIdempotencyToken: input.requireIdempotencyToken, - }) - - return planned.map(chunk => ({ - ...chunk, - sqlTemplate: buildChunkSql({ - planId: input.planId, - chunk, - target: input.target, - sortKey: input.sortKey, - mvAsQuery: input.mvAsQuery, - targetColumns: input.targetColumns, - }), - })) -} - -describe('buildChunksWithSql', () => { - const basePlanId = 'abc1234567890123' - - test('small partition produces one chunk with _partition_id filter only', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 1000, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T23:59:59.000Z' }, - ] - - const chunks = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - requireIdempotencyToken: true, - }) - - expect(chunks).toHaveLength(1) - expect(chunks[0]?.sqlTemplate).toContain("WHERE _partition_id = '202501'") - expect(chunks[0]?.partitionId).toBe('202501') - expect(chunks[0]?.estimatedBytes).toBe(5 * GiB) - }) - - test('large partition with datetime sort key produces sub-chunks with parseDateTimeBestEffort', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 30 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'event_time', type: 'DateTime', category: 'datetime' } - const sortKeyRanges = new Map([ - ['202501', { min: '2025-01-01 00:00:00', max: '2025-01-31 00:00:00' }], - ]) - - const chunks = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - requireIdempotencyToken: true, - }) - - expect(chunks).toHaveLength(3) - for (const chunk of chunks) { - expect(chunk.sqlTemplate).toContain("WHERE _partition_id = '202501'") - expect(chunk.sqlTemplate).toContain('event_time >= parseDateTimeBestEffort(') - expect(chunk.sqlTemplate).toContain('event_time < parseDateTimeBestEffort(') - expect(chunk.partitionId).toBe('202501') - } - }) - - test('chunk IDs are deterministic for same input', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 1000, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - - const first = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - requireIdempotencyToken: true, - }) - - const second = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - requireIdempotencyToken: true, - }) - - expect(first[0]?.id).toBe(second[0]?.id) - expect(first[0]?.idempotencyToken).toBe(second[0]?.idempotencyToken) - }) - - test('idempotency tokens are empty when not required', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 1000, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - - const chunks = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - requireIdempotencyToken: false, - }) - - expect(chunks[0]?.idempotencyToken).toBe('') - expect(chunks[0]?.sqlTemplate).not.toContain('insert_deduplication_token') - }) - - test('SQL templates include correct INSERT and SELECT structure', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 1000, bytesOnDisk: 5 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - - const chunks = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - requireIdempotencyToken: true, - }) - - const sql = chunks[0]?.sqlTemplate ?? '' - expect(sql).toContain(`/* chkit backfill plan=${basePlanId}`) - expect(sql).toContain('INSERT INTO default.events') - expect(sql).toContain('SELECT *') - expect(sql).toContain('FROM default.events') - expect(sql).toContain('SETTINGS async_insert=0') - }) - - test('numeric sort key sub-chunks use direct comparison', () => { - const partitions: PartitionInfo[] = [ - { partitionId: '202501', rows: 10000, bytesOnDisk: 20 * GiB, minTime: '2025-01-01T00:00:00.000Z', maxTime: '2025-01-31T00:00:00.000Z' }, - ] - const sortKey: SortKeyInfo = { column: 'id', type: 'UInt64', category: 'numeric' } - const sortKeyRanges = new Map([ - ['202501', { min: '100', max: '200' }], - ]) - - const chunks = buildChunksWithSql({ - planId: basePlanId, - target: 'default.events', - partitions, - maxChunkBytes: 10 * GiB, - sortKey, - sortKeyRanges, - requireIdempotencyToken: false, - }) - - expect(chunks).toHaveLength(2) - expect(chunks[0]?.sqlTemplate).toContain("id >= '100'") - expect(chunks[0]?.sqlTemplate).toContain("id < '150'") - expect(chunks[0]?.sqlTemplate).not.toContain('parseDateTimeBestEffort') - }) -}) diff --git a/packages/plugin-backfill/src/payload.ts b/packages/plugin-backfill/src/payload.ts index f17e096..f79d2aa 100644 --- a/packages/plugin-backfill/src/payload.ts +++ b/packages/plugin-backfill/src/payload.ts @@ -27,15 +27,13 @@ export function planPayload(output: BuildBackfillPlanOutput): { target: output.plan.target, from: output.plan.from, to: output.plan.to, - chunkCount: output.plan.chunks.length, + chunkCount: output.plan.chunkPlan.chunks.length, maxChunkBytes: output.plan.options.maxChunkBytes, sortKeyColumn: output.plan.options.sortKeyColumn, planPath: output.planPath, - strategy: output.plan.strategy, - partitionCount: output.plan.partitions?.length, - totalBytes: output.plan.partitions - ? output.plan.partitions.reduce((sum, p) => sum + p.bytesOnDisk, 0) - : undefined, + strategy: output.plan.execution.mode, + partitionCount: output.plan.chunkPlan.partitions.length, + totalBytes: output.plan.chunkPlan.totalBytesCompressed, } } diff --git a/packages/plugin-backfill/src/planner.test.ts b/packages/plugin-backfill/src/planner.test.ts index be8f8cb..4d83103 100644 --- a/packages/plugin-backfill/src/planner.test.ts +++ b/packages/plugin-backfill/src/planner.test.ts @@ -1,39 +1,52 @@ import { describe, expect, test } from 'bun:test' -import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises' -import { join, resolve } from 'node:path' +import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises' +import { dirname, join, resolve } from 'node:path' import { tmpdir } from 'node:os' import { resolveConfig } from '@chkit/core' +import { buildChunkExecutionSql, rewriteSelectColumns } from './chunking/sql.js' +import { generateIdempotencyToken } from './chunking/utils/ids.js' import { PlanSchema } from './options.js' import { buildBackfillPlan } from './planner.js' -import { injectSortKeyFilter, rewriteSelectColumns } from './chunking/sql.js' -import { computeBackfillStateDir, computeEnvironmentFingerprint } from './state.js' +import { backfillPaths, computeBackfillStateDir, readPlan } from './state.js' function createMockQuery(opts: { - partitions?: Array<{ partition_id: string; total_rows: string; total_bytes: string; min_time: string; max_time: string }> + partitions?: Array<{ + partition_id: string + total_rows: string + total_bytes: string + total_uncompressed_bytes?: string + min_time: string + max_time: string + }> sortingKey?: string - sortKeyType?: string - sortKeyRanges?: Array<{ partition_id: string; min_val: string; max_val: string }> + columnRows?: Array<{ name: string; type: string }> } = {}): (sql: string) => Promise { const partitions = opts.partitions ?? [ - { partition_id: '202601', total_rows: '1000', total_bytes: '500000', min_time: '2026-01-01 00:00:00', max_time: '2026-01-01 18:00:00' }, + { + partition_id: '202601', + total_rows: '1000', + total_bytes: '500000', + total_uncompressed_bytes: '1000000', + min_time: '2026-01-01 00:00:00', + max_time: '2026-01-01 18:00:00', + }, ] const sortingKey = opts.sortingKey ?? 'event_time' - const sortKeyType = opts.sortKeyType ?? 'DateTime' - const sortKeyRanges = opts.sortKeyRanges ?? [] + const columnRows = opts.columnRows ?? [{ name: 'event_time', type: 'DateTime' }] return async (sql: string) => { - if (sql.includes('system.parts')) return partitions as T[] - if (sql.includes('system.tables')) return [{ sorting_key: sortingKey }] as T[] - if (sql.includes('system.columns')) return [{ type: sortKeyType }] as T[] - if (sql.includes('min(') && sql.includes('max(')) return sortKeyRanges as T[] + if (sql.includes('SELECT 1 FROM')) return [{ ok: 1 }] as T[] + if (sql.includes('FROM system.parts')) return partitions as T[] + if (sql.includes('FROM system.tables')) return [{ sorting_key: sortingKey }] as T[] + if (sql.includes('FROM system.columns')) return columnRows as T[] return [] as T[] } } describe('@chkit/plugin-backfill planning', () => { - test('each plan gets a unique random id', async () => { + test('each plan gets a unique random id and canonical chunk plan', async () => { const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) const configPath = join(dir, 'clickhouse.config.ts') @@ -42,12 +55,37 @@ describe('@chkit/plugin-backfill planning', () => { schema: './schema.ts', metaDir: './chkit/meta', }) - const opts = PlanSchema.parse({ target: 'app.events', from: '2026-01-01T00:00:00.000Z', to: '2026-01-01T18:00:00.000Z' }) + const opts = PlanSchema.parse({ + target: 'app.events', + from: '2026-01-01T00:00:00.000Z', + to: '2026-01-01T18:00:00.000Z', + }) const mockQuery = createMockQuery({ partitions: [ - { partition_id: '202601a', total_rows: '500', total_bytes: '250000', min_time: '2026-01-01 00:00:00', max_time: '2026-01-01 06:00:00' }, - { partition_id: '202601b', total_rows: '500', total_bytes: '250000', min_time: '2026-01-01 06:00:00', max_time: '2026-01-01 12:00:00' }, - { partition_id: '202601c', total_rows: '500', total_bytes: '250000', min_time: '2026-01-01 12:00:00', max_time: '2026-01-01 18:00:00' }, + { + partition_id: '202601a', + total_rows: '500', + total_bytes: '250000', + total_uncompressed_bytes: '500000', + min_time: '2026-01-01 00:00:00', + max_time: '2026-01-01 06:00:00', + }, + { + partition_id: '202601b', + total_rows: '500', + total_bytes: '250000', + total_uncompressed_bytes: '500000', + min_time: '2026-01-01 06:00:00', + max_time: '2026-01-01 12:00:00', + }, + { + partition_id: '202601c', + total_rows: '500', + total_bytes: '250000', + total_uncompressed_bytes: '500000', + min_time: '2026-01-01 12:00:00', + max_time: '2026-01-01 18:00:00', + }, ], }) @@ -56,12 +94,24 @@ describe('@chkit/plugin-backfill planning', () => { expect(first.plan.planId).not.toBe(second.plan.planId) expect(first.plan.planId).toMatch(/^[a-f0-9]{16}$/) - expect(first.plan.chunks).toHaveLength(3) - - const chunk = first.plan.chunks[0] - expect(chunk?.idempotencyToken.length).toBe(64) - expect(chunk?.sqlTemplate).toContain('INSERT INTO app.events') - expect(chunk?.sqlTemplate).toContain(`insert_deduplication_token='${chunk?.idempotencyToken}'`) + expect(first.plan.chunkPlan.chunks).toHaveLength(3) + + const chunk = first.plan.chunkPlan.chunks[0] + const token = chunk ? generateIdempotencyToken(first.plan.planId, chunk.id) : '' + const sql = chunk + ? buildChunkExecutionSql({ + planId: first.plan.planId, + chunk, + target: first.plan.target, + sourceTarget: first.plan.execution.sourceTarget, + table: first.plan.chunkPlan.table, + idempotencyToken: token, + }) + : '' + + expect(token).toHaveLength(64) + expect(sql).toContain('INSERT INTO app.events') + expect(sql).toContain(`insert_deduplication_token='${token}'`) } finally { await rm(dir, { recursive: true, force: true }) } @@ -76,22 +126,13 @@ describe('@chkit/plugin-backfill planning', () => { schema: './schema.ts', metaDir: './chkit/meta', }) - const opts = PlanSchema.parse({ target: 'app.events', from: '2026-01-01T00:00:00.000Z', to: '2026-01-01T07:00:00.000Z' }) - const mockQuery = createMockQuery({ - partitions: [ - { partition_id: '202601a', total_rows: '250', total_bytes: '125000', min_time: '2026-01-01 00:00:00', max_time: '2026-01-01 02:00:00' }, - { partition_id: '202601b', total_rows: '250', total_bytes: '125000', min_time: '2026-01-01 02:00:00', max_time: '2026-01-01 04:00:00' }, - { partition_id: '202601c', total_rows: '250', total_bytes: '125000', min_time: '2026-01-01 04:00:00', max_time: '2026-01-01 06:00:00' }, - { partition_id: '202601d', total_rows: '250', total_bytes: '125000', min_time: '2026-01-01 06:00:00', max_time: '2026-01-01 07:00:00' }, - ], - }) - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) + const opts = PlanSchema.parse({ target: 'app.events' }) + const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: createMockQuery() }) const raw = await readFile(output.planPath, 'utf8') - const persisted = JSON.parse(raw) as { planId: string; chunks: Array<{ id: string }> } + const persisted = JSON.parse(raw) as { planId: string; chunkPlan: { chunks: Array<{ id: string }> } } expect(persisted.planId).toBe(output.plan.planId) - expect(persisted.chunks.length).toBe(4) + expect(persisted.chunkPlan.chunks.length).toBe(1) expect(output.planPath).toContain('/plans/') } finally { await rm(dir, { recursive: true, force: true }) @@ -108,45 +149,24 @@ describe('@chkit/plugin-backfill planning', () => { metaDir: './chkit/meta', }) const opts = PlanSchema.parse({ target: 'app.events' }) - const mockQuery = createMockQuery({ - sortingKey: 'session_date', - sortKeyType: 'Date', + const output = await buildBackfillPlan({ + opts, + configPath, + config, + clickhouseQuery: createMockQuery({ + sortingKey: 'session_date', + columnRows: [{ name: 'session_date', type: 'Date' }], + }), }) - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - - expect(output.plan.sortKey?.column).toBe('session_date') - expect(output.plan.sortKey?.category).toBe('datetime') + expect(output.plan.chunkPlan.table.sortKeys[0]?.name).toBe('session_date') + expect(output.plan.chunkPlan.table.sortKeys[0]?.category).toBe('datetime') expect(output.plan.options.sortKeyColumn).toBe('session_date') } finally { await rm(dir, { recursive: true, force: true }) } }) - test('chunk IDs are deterministic within a plan (derived from planId)', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - - try { - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - const opts = PlanSchema.parse({ target: 'app.events' }) - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: createMockQuery() }) - - const chunkIds = output.plan.chunks.map(c => c.id) - const uniqueIds = new Set(chunkIds) - expect(uniqueIds.size).toBe(chunkIds.length) - for (const id of chunkIds) { - expect(id).toMatch(/^[a-f0-9]{16}$/) - } - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - test('computes state dir from config by default and plugin override', () => { const config = resolveConfig({ schema: './schema.ts', @@ -161,7 +181,7 @@ describe('@chkit/plugin-backfill planning', () => { expect(overriddenDir).toBe(resolve('/tmp/project/custom-state')) }) - test('generates MV replay SQL when schema contains materialized view', async () => { + test('generates MV replay execution metadata and SQL when schema contains materialized view', async () => { const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) const configPath = join(dir, 'clickhouse.config.ts') const schemaPath = join(dir, 'schema.ts') @@ -196,348 +216,56 @@ export const events_mv = { metaDir: './chkit/meta', }) const opts = PlanSchema.parse({ target: 'app.events_agg' }) - const mockQuery = createMockQuery() - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - - expect(output.plan.strategy).toBe('mv_replay') - - const chunk = output.plan.chunks[0] - expect(chunk?.sqlTemplate).toContain('INSERT INTO app.events_agg') - expect(chunk?.sqlTemplate).not.toContain('WITH _backfill_source AS (') - expect(chunk?.sqlTemplate).toContain('SELECT toStartOfHour(event_time)') - expect(chunk?.sqlTemplate).toContain('FROM app.events') - expect(chunk?.sqlTemplate).toContain('GROUP BY event_time') - expect(chunk?.sqlTemplate).toContain('SETTINGS async_insert=0') - expect(chunk?.sqlTemplate).toContain(`insert_deduplication_token='${chunk?.idempotencyToken}'`) - expect(chunk?.sqlTemplate).not.toContain('FROM app.events_agg') - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - - test('MV replay rewrites SELECT columns to match target table order', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - const schemaPath = join(dir, 'schema.ts') - - try { - await writeFile( - schemaPath, - `export const sessions = { - kind: 'table', - database: 'app', - name: 'session_analytics', - columns: [ - { name: 'session_date', type: 'Date' }, - { name: 'session_id', type: 'String' }, - { name: 'skills', type: 'Array(String)' }, - { name: 'slash_commands', type: 'Array(String)' }, - { name: 'ingested_at', type: 'DateTime' }, - ], - engine: 'MergeTree', - primaryKey: ['session_date'], - orderBy: ['session_date', 'session_id'], -} -export const sessions_mv = { - kind: 'materialized_view', - database: 'app', - name: 'sessions_mv', - to: { database: 'app', name: 'session_analytics' }, - as: "SELECT *, extractAll(content, 'skill') AS skills, extractAll(content, 'cmd') AS slash_commands FROM app.raw_sessions", -} -` - ) - - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - const opts = PlanSchema.parse({ target: 'app.session_analytics' }) - const mockQuery = createMockQuery() - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - - expect(output.plan.strategy).toBe('mv_replay') - - const chunk = output.plan.chunks[0] - expect(chunk?.sqlTemplate).toContain('INSERT INTO app.session_analytics') - expect(chunk?.sqlTemplate).not.toContain('INSERT INTO app.session_analytics (') - expect(chunk?.sqlTemplate).toContain( - "SELECT session_date, session_id, extractAll(content, 'skill') AS skills, extractAll(content, 'cmd') AS slash_commands, ingested_at" - ) - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - - test('omits insert_deduplication_token when requireIdempotencyToken is false', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - - try { - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - const opts = PlanSchema.parse({ target: 'app.events', requireIdempotencyToken: false }) - const mockQuery = createMockQuery() - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - - const chunk = output.plan.chunks[0] - expect(chunk?.idempotencyToken).toBe('') - expect(chunk?.sqlTemplate).toContain('SETTINGS async_insert=0') - expect(chunk?.sqlTemplate).not.toContain('insert_deduplication_token') - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - - test('uses partition strategy when no MV is found', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - - try { - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - const opts = PlanSchema.parse({ target: 'app.events' }) - const mockQuery = createMockQuery() - - const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - - expect(output.plan.strategy).toBe('partition') - - const chunk = output.plan.chunks[0] - expect(chunk?.sqlTemplate).toContain('INSERT INTO app.events') - expect(chunk?.sqlTemplate).toContain('FROM app.events') - expect(chunk?.sqlTemplate).toContain('_partition_id') - expect(chunk?.sqlTemplate).toContain('SETTINGS async_insert=0') - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - - test('throws when no partitions found', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - - try { - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - const opts = PlanSchema.parse({ target: 'app.events' }) - const mockQuery = createMockQuery({ partitions: [] }) + const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: createMockQuery() }) - await expect( - buildBackfillPlan({ opts, configPath, config, clickhouseQuery: mockQuery }) - ).rejects.toThrow('No partitions found') + expect(output.plan.execution.mode).toBe('mv_replay') + + const chunk = output.plan.chunkPlan.chunks[0] + const sql = chunk + ? buildChunkExecutionSql({ + planId: output.plan.planId, + chunk, + target: output.plan.target, + sourceTarget: output.plan.execution.sourceTarget, + table: output.plan.chunkPlan.table, + mvAsQuery: output.plan.execution.mvAsQuery, + targetColumns: output.plan.execution.targetColumns, + idempotencyToken: generateIdempotencyToken(output.plan.planId, chunk.id), + }) + : '' + + expect(sql).toContain('INSERT INTO app.events_agg') + expect(sql).toContain('SELECT toStartOfHour(event_time)') + expect(sql).toContain('FROM app.events') + expect(sql).toContain('GROUP BY event_time') + expect(sql).toContain('SETTINGS async_insert=0') + expect(sql).not.toContain('FROM app.events_agg') } finally { await rm(dir, { recursive: true, force: true }) } }) -}) - -describe('rewriteSelectColumns', () => { - test('reorders SELECT columns to match target table order', () => { - const query = 'SELECT *, _foo as bar, _baz as qux FROM source WHERE status = 1' - const result = rewriteSelectColumns(query, ['col_a', 'bar', 'col_b', 'qux']) - - expect(result).toContain('SELECT col_a, _foo as bar, col_b, _baz as qux') - expect(result).toContain('FROM source') - expect(result).toContain('WHERE status = 1') - }) - - test('preserves WITH clause when rewriting SELECT', () => { - const query = [ - 'WITH', - " arrayDistinct(extractAll(content, '\\w+')) AS _skills,", - " toUInt64(JSONExtractFloat(meta, 'input')) AS _input_tokens", - 'SELECT *, _skills as skills, _input_tokens as input_tokens', - 'FROM app.sessions', - 'WHERE length(content) > 0', - ].join('\n') - - const result = rewriteSelectColumns(query, ['session_id', 'skills', 'content', 'input_tokens']) - - expect(result).toContain('arrayDistinct') - expect(result).toContain('_input_tokens') - expect(result).toContain('SELECT session_id, _skills as skills, content, _input_tokens as input_tokens') - expect(result).toContain('FROM app.sessions') - expect(result).toContain('WHERE length(content) > 0') - }) - test('handles SELECT without star expansion', () => { - const query = 'SELECT toStartOfHour(event_time) AS event_time, count() AS cnt FROM events GROUP BY event_time' - const result = rewriteSelectColumns(query, ['cnt', 'event_time']) + test('MV replay rewrites SELECT columns to match target table order', () => { + const rewritten = rewriteSelectColumns( + "SELECT *, extractAll(content, 'skill') AS skills, extractAll(content, 'cmd') AS slash_commands FROM app.raw_sessions", + ['session_date', 'session_id', 'skills', 'slash_commands', 'ingested_at'] + ) - expect(result).toContain('SELECT count() AS cnt, toStartOfHour(event_time) AS event_time') - expect(result).toContain('FROM events') - expect(result).toContain('GROUP BY event_time') + expect(rewritten).toContain('SELECT session_date, session_id, extractAll(content, \'skill\') AS skills, extractAll(content, \'cmd\') AS slash_commands, ingested_at') + expect(rewritten).toContain('FROM app.raw_sessions') }) - test('returns query unchanged when SELECT/FROM cannot be found', () => { - const query = 'INSERT INTO t VALUES (1, 2)' - const result = rewriteSelectColumns(query, ['a', 'b']) + test('MV replay preserves DISTINCT when rewriting projection columns', () => { + const rewritten = rewriteSelectColumns( + 'SELECT DISTINCT event_time AS ts, user_id AS uid FROM app.events', + ['uid', 'ts'] + ) - expect(result).toBe(query) + expect(rewritten).toContain('SELECT DISTINCT user_id AS uid, event_time AS ts') + expect(rewritten).toContain('FROM app.events') }) -}) - -describe('injectSortKeyFilter', () => { - const from = '2025-01-01T00:00:00.000Z' - const to = '2025-01-01T06:00:00.000Z' - - test('injects WHERE before GROUP BY for datetime filter', () => { - const query = 'SELECT toStartOfHour(event_time) AS event_time, count() AS count FROM app.events GROUP BY event_time' - const result = injectSortKeyFilter(query, 'event_time', 'datetime', from, to) - - expect(result).toContain("WHERE event_time >= parseDateTimeBestEffort('2025-01-01T00:00:00.000Z')") - expect(result).toContain("AND event_time < parseDateTimeBestEffort('2025-01-01T06:00:00.000Z')") - expect(result).toContain('GROUP BY event_time') - expect(result.indexOf('WHERE')).toBeLessThan(result.indexOf('GROUP BY')) - }) - - test('appends AND to existing WHERE clause', () => { - const query = 'SELECT * FROM app.events WHERE status = 1' - const result = injectSortKeyFilter(query, 'event_time', 'datetime', from, to) - - expect(result).toContain('WHERE status = 1') - expect(result).toContain("AND event_time >= parseDateTimeBestEffort('") - expect(result).toContain("AND event_time < parseDateTimeBestEffort('") - expect(result.match(/WHERE/g)?.length).toBe(1) - }) - - test('numeric sort key uses direct comparison', () => { - const query = 'SELECT * FROM app.events WHERE status = 1' - const result = injectSortKeyFilter(query, 'id', 'numeric', '100', '200') - - expect(result).toContain("AND id >= '100'") - expect(result).toContain("AND id < '200'") - expect(result).not.toContain('parseDateTimeBestEffort') - }) - - test('handles query with WHERE and QUALIFY', () => { - const query = [ - 'SELECT *, skills', - 'FROM app.sessions AS s', - 'WHERE length(timestamps) > 0', - "QUALIFY ROW_NUMBER() OVER (PARTITION BY s.id ORDER BY s.ts DESC) = 1", - ].join('\n') - const result = injectSortKeyFilter(query, 'session_date', 'datetime', from, to) - - expect(result).toContain('WHERE length(timestamps) > 0') - expect(result).toContain("AND session_date >= parseDateTimeBestEffort('") - expect(result.indexOf('AND session_date')).toBeLessThan(result.indexOf('QUALIFY')) - }) - - test('handles MV query with WITH column expressions', () => { - const query = [ - 'WITH', - " arrayDistinct(arrayFilter(x -> x != '', extractAll(content, '\\\\w+'))) AS _skills", - 'SELECT', - ' id,', - ' _skills as skills,', - ' ts', - 'FROM app.sessions', - 'WHERE length(content) > 0', - ].join('\n') - const result = injectSortKeyFilter(query, 'ts', 'datetime', from, to) - - expect(result.match(/WHERE/g)?.length).toBe(1) - expect(result).toContain("AND ts >= parseDateTimeBestEffort('") - expect(result).toContain('arrayDistinct') - }) - - test('injects WHERE at end when query has no WHERE and no trailing clauses', () => { - const query = 'SELECT * FROM app.events' - const result = injectSortKeyFilter(query, 'event_time', 'datetime', from, to) - - expect(result).toContain("WHERE event_time >= parseDateTimeBestEffort('") - expect(result).toContain("AND event_time < parseDateTimeBestEffort('") - }) - - test('ignores WHERE inside parenthesized subquery', () => { - const query = 'SELECT * FROM (SELECT * FROM app.events WHERE inner = 1) AS sub GROUP BY id' - const result = injectSortKeyFilter(query, 'ts', 'datetime', from, to) - - expect(result).toContain("WHERE ts >= parseDateTimeBestEffort('") - expect(result.indexOf("WHERE ts")).toBeLessThan(result.indexOf('GROUP BY')) - expect(result).toContain('WHERE inner = 1') - }) -}) - -describe('computeEnvironmentFingerprint', () => { - test('returns undefined when clickhouse is undefined', () => { - expect(computeEnvironmentFingerprint(undefined)).toBeUndefined() - }) - - test('returns correct structure with fingerprint, url origin, and database', () => { - const env = computeEnvironmentFingerprint({ - url: 'https://my-cluster.clickhouse.cloud:8443/some/path', - database: 'analytics', - }) - expect(env).toBeDefined() - expect(env?.fingerprint).toMatch(/^[a-f0-9]{16}$/) - expect(env?.url).toBe('https://my-cluster.clickhouse.cloud:8443') - expect(env?.database).toBe('analytics') - }) - - test('same URL+database produces same fingerprint', () => { - const a = computeEnvironmentFingerprint({ url: 'https://host:8443/path', database: 'db1' }) - const b = computeEnvironmentFingerprint({ url: 'https://host:8443/other', database: 'db1' }) - - expect(a?.fingerprint).toBe(b?.fingerprint) - }) - - test('different database produces different fingerprint', () => { - const a = computeEnvironmentFingerprint({ url: 'https://host:8443', database: 'staging' }) - const b = computeEnvironmentFingerprint({ url: 'https://host:8443', database: 'production' }) - - expect(a?.fingerprint).not.toBe(b?.fingerprint) - }) - - test('different host produces different fingerprint', () => { - const a = computeEnvironmentFingerprint({ url: 'https://staging.ch.cloud:8443', database: 'db' }) - const b = computeEnvironmentFingerprint({ url: 'https://prod.ch.cloud:8443', database: 'db' }) - - expect(a?.fingerprint).not.toBe(b?.fingerprint) - }) -}) - -describe('environment binding in plan', () => { - test('plan includes environment when clickhouse is provided', async () => { - const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) - const configPath = join(dir, 'clickhouse.config.ts') - - try { - const config = resolveConfig({ - schema: './schema.ts', - metaDir: './chkit/meta', - }) - - const output = await buildBackfillPlan({ - opts: PlanSchema.parse({ target: 'app.events' }), - configPath, - config, - clickhouse: { url: 'https://my-cluster.ch.cloud:8443', database: 'analytics' }, - clickhouseQuery: createMockQuery(), - }) - - expect(output.plan.environment).toBeDefined() - expect(output.plan.environment?.fingerprint).toMatch(/^[a-f0-9]{16}$/) - expect(output.plan.environment?.url).toBe('https://my-cluster.ch.cloud:8443') - expect(output.plan.environment?.database).toBe('analytics') - } finally { - await rm(dir, { recursive: true, force: true }) - } - }) - - test('plan omits environment when clickhouse connection info is not provided', async () => { + test('omits idempotency token when disabled', async () => { const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) const configPath = join(dir, 'clickhouse.config.ts') @@ -546,50 +274,57 @@ describe('environment binding in plan', () => { schema: './schema.ts', metaDir: './chkit/meta', }) + const opts = PlanSchema.parse({ target: 'app.events', requireIdempotencyToken: false }) + const output = await buildBackfillPlan({ opts, configPath, config, clickhouseQuery: createMockQuery() }) - const output = await buildBackfillPlan({ - opts: PlanSchema.parse({ target: 'app.events' }), - configPath, - config, - clickhouseQuery: createMockQuery(), - }) - - expect(output.plan.environment).toBeUndefined() + const chunk = output.plan.chunkPlan.chunks[0] + const sql = chunk + ? buildChunkExecutionSql({ + planId: output.plan.planId, + chunk, + target: output.plan.target, + sourceTarget: output.plan.execution.sourceTarget, + table: output.plan.chunkPlan.table, + idempotencyToken: '', + }) + : '' + + expect(output.plan.execution.requireIdempotencyToken).toBe(false) + expect(sql).toContain('SETTINGS async_insert=0') + expect(sql).not.toContain('insert_deduplication_token') } finally { await rm(dir, { recursive: true, force: true }) } }) - test('plan includes environment from different clickhouse configs', async () => { + test('rejects persisted legacy plans with an actionable error', async () => { const dir = await mkdtemp(join(tmpdir(), 'chkit-backfill-plugin-')) const configPath = join(dir, 'clickhouse.config.ts') + const planId = 'deadbeefdeadbeef' try { const config = resolveConfig({ schema: './schema.ts', metaDir: './chkit/meta', }) - const opts = PlanSchema.parse({ target: 'app.events' }) - - const staging = await buildBackfillPlan({ - opts, - configPath, - config, - clickhouse: { url: 'https://staging.ch.cloud:8443', database: 'analytics' }, - clickhouseQuery: createMockQuery(), - }) - - const production = await buildBackfillPlan({ - opts, + const stateDir = computeBackfillStateDir(config, configPath) + const { planPath } = backfillPaths(stateDir, planId) + await mkdir(dirname(planPath), { recursive: true }) + + await writeFile(planPath, JSON.stringify({ + planId, + target: 'app.events', + createdAt: '2026-01-01T00:00:00.000Z', + from: '2026-01-01T00:00:00.000Z', + to: '2026-01-01T01:00:00.000Z', + chunks: [], + })) + + await expect(readPlan({ + planId, configPath, config, - clickhouse: { url: 'https://prod.ch.cloud:8443', database: 'analytics' }, - clickhouseQuery: createMockQuery(), - }) - - expect(staging.plan.environment?.url).toBe('https://staging.ch.cloud:8443') - expect(production.plan.environment?.url).toBe('https://prod.ch.cloud:8443') - expect(staging.plan.environment?.fingerprint).not.toBe(production.plan.environment?.fingerprint) + })).rejects.toThrow('uses a previous chunking format') } finally { await rm(dir, { recursive: true, force: true }) } diff --git a/packages/plugin-backfill/src/planner.ts b/packages/plugin-backfill/src/planner.ts index 9f586c6..eb5029c 100644 --- a/packages/plugin-backfill/src/planner.ts +++ b/packages/plugin-backfill/src/planner.ts @@ -1,10 +1,10 @@ import { dirname } from 'node:path' -import { loadSchemaDefinitions } from '@chkit/core/schema-loader' import type { ResolvedChxConfig } from '@chkit/core' +import { loadSchemaDefinitions } from '@chkit/core/schema-loader' -import { analyzeAndChunk } from './chunking/analyze.js' -import { buildChunkSql } from './chunking/sql.js' +import { encodeChunkPlanForPersistence } from './chunking/boundary-codec.js' +import { generateChunkPlan } from './chunking/planner.js' import { findMvForTarget } from './detect.js' import { BackfillConfigError } from './errors.js' import type { PlanOptions } from './options.js' @@ -12,13 +12,10 @@ import { backfillPaths, computeBackfillStateDir, computeEnvironmentFingerprint, + nowIso, writeJson, } from './state.js' -import type { - BackfillChunk, - BuildBackfillPlanOutput, - PartitionInfo, -} from './types.js' +import type { BuildBackfillPlanOutput } from './types.js' export async function buildBackfillPlan(input: { opts: PlanOptions @@ -33,40 +30,35 @@ export async function buildBackfillPlan(input: { throw new BackfillConfigError('Invalid target format. Expected .') } - const env = computeEnvironmentFingerprint(input.clickhouse) - - // 1. Analyze table and build planned chunks - const { - planId, - partitions, - sortKey, - sortKeys, - chunks: plannedChunks, - partitionDiagnostics, - } = await analyzeAndChunk({ + const chunkPlan = await generateChunkPlan({ database, table, from: opts.from, to: opts.to, - maxChunkBytes: opts.maxChunkBytes, - requireIdempotencyToken: opts.requireIdempotencyToken, + targetChunkBytes: opts.maxChunkBytes, query: input.clickhouseQuery, }) - if (partitions.length === 0) { + const firstPartition = chunkPlan.partitions[0] + if (!firstPartition) { throw new BackfillConfigError( `No partitions found for ${opts.target}${opts.from || opts.to ? ' within the specified time range' : ''}. The table may be empty.` ) } - const firstPartition = partitions[0] as PartitionInfo - const derivedFrom = opts.from ?? partitions.reduce((min, p) => (p.minTime < min ? p.minTime : min), firstPartition.minTime) - const derivedTo = opts.to ?? partitions.reduce((max, p) => (p.maxTime > max ? p.maxTime : max), firstPartition.maxTime) + const env = computeEnvironmentFingerprint(input.clickhouse) + const derivedFrom = opts.from ?? chunkPlan.partitions.reduce( + (min, partition) => (partition.minTime < min ? partition.minTime : min), + firstPartition.minTime + ) + const derivedTo = opts.to ?? chunkPlan.partitions.reduce( + (max, partition) => (partition.maxTime > max ? partition.maxTime : max), + firstPartition.maxTime + ) const stateDir = computeBackfillStateDir(input.config, input.configPath, opts.stateDir) - const paths = backfillPaths(stateDir, planId) + const paths = backfillPaths(stateDir, chunkPlan.planId) - // 2. Detect MV for replay strategy let mvAsQuery: string | undefined let targetColumns: string[] | undefined @@ -78,73 +70,37 @@ export async function buildBackfillPlan(input: { if (mv) { mvAsQuery = mv.as const tableDef = definitions.find( - (d) => d.kind === 'table' && d.database === database && d.name === table + (definition) => definition.kind === 'table' && definition.database === database && definition.name === table ) - if (tableDef && tableDef.kind === 'table') { - targetColumns = tableDef.columns.map((c) => c.name) + if (tableDef?.kind === 'table') { + targetColumns = tableDef.columns.map((column) => column.name) } } } catch { - // Schema load failed — fall back to direct copy + // Schema load failed, fall back to direct copy. } - // 3. Stamp SQL on each planned chunk to produce BackfillChunk[] - const chunks: BackfillChunk[] = plannedChunks.map(planned => { - const sqlTemplate = buildChunkSql({ - planId, - chunk: planned, - target: opts.target, - sortKey, - sortKeys, - mvAsQuery, - targetColumns, - }) - - return { - id: planned.id, - from: planned.from, - to: planned.to, - status: 'pending' as const, - attempts: 0, - idempotencyToken: planned.idempotencyToken, - sqlTemplate, - partitionId: planned.partitionId, - estimatedBytes: planned.estimatedBytes, - ...(planned.estimatedRows !== undefined ? { estimatedRows: planned.estimatedRows } : {}), - ...(planned.ranges ? { ranges: planned.ranges } : {}), - ...(planned.sortKeyFrom !== undefined ? { sortKeyFrom: planned.sortKeyFrom } : {}), - ...(planned.sortKeyTo !== undefined ? { sortKeyTo: planned.sortKeyTo } : {}), - ...(planned.isHotKey !== undefined ? { isHotKey: planned.isHotKey } : {}), - ...(planned.hotDimensionIndex !== undefined ? { hotDimensionIndex: planned.hotDimensionIndex } : {}), - ...(planned.hotKeyValue !== undefined ? { hotKeyValue: planned.hotKeyValue } : {}), - ...(planned.estimateConfidence !== undefined ? { estimateConfidence: planned.estimateConfidence } : {}), - ...(planned.estimateReason !== undefined ? { estimateReason: planned.estimateReason } : {}), - ...(planned.lineage ? { lineage: planned.lineage } : {}), - } - }) - - const strategy = mvAsQuery ? 'mv_replay' : 'partition' - const plan = { - planId, + planId: chunkPlan.planId, target: opts.target, - createdAt: '1970-01-01T00:00:00.000Z', - status: 'planned' as const, - strategy: strategy as 'partition' | 'mv_replay', + createdAt: nowIso(), ...(env ? { environment: env } : {}), from: derivedFrom, to: derivedTo, - chunks, - partitions, - sortKey, - sortKeys, - partitionDiagnostics, + chunkPlan, + execution: { + mode: mvAsQuery ? 'mv_replay' as const : 'copy' as const, + sourceTarget: opts.target, + ...(mvAsQuery ? { mvAsQuery } : {}), + ...(targetColumns ? { targetColumns } : {}), + requireIdempotencyToken: opts.requireIdempotencyToken, + }, options: { maxChunkBytes: opts.maxChunkBytes, maxParallelChunks: opts.maxParallelChunks, maxRetriesPerChunk: opts.maxRetriesPerChunk, requireIdempotencyToken: opts.requireIdempotencyToken, - sortKeyColumn: sortKey?.column, + sortKeyColumn: chunkPlan.table.sortKeys[0]?.name, }, policy: { requireDryRunBeforeRun: opts.requireDryRunBeforeRun, @@ -158,7 +114,10 @@ export async function buildBackfillPlan(input: { }, } - await writeJson(paths.planPath, plan) + await writeJson(paths.planPath, { + ...plan, + chunkPlan: encodeChunkPlanForPersistence(plan.chunkPlan), + }) return { plan, diff --git a/packages/plugin-backfill/src/plugin.test.ts b/packages/plugin-backfill/src/plugin.test.ts index 3e25054..a3b36e5 100644 --- a/packages/plugin-backfill/src/plugin.test.ts +++ b/packages/plugin-backfill/src/plugin.test.ts @@ -1,9 +1,16 @@ import { describe, expect, test } from 'bun:test' +import { readFileSync } from 'node:fs' import * as sdk from './sdk.js' import * as root from './index.js' import { backfill, createBackfillPlugin } from './plugin.js' +const pluginBackfillPackage = JSON.parse( + readFileSync(new URL('../package.json', import.meta.url), 'utf8') +) as { + exports: Record +} + describe('@chkit/plugin-backfill plugin surface', () => { test('exposes commands and typed registration helper', () => { const plugin = createBackfillPlugin() @@ -29,17 +36,21 @@ describe('@chkit/plugin-backfill plugin surface', () => { expect(root).not.toHaveProperty('executeBackfill') expect(sdk).toHaveProperty('analyzeAndChunk') + expect(sdk).toHaveProperty('generateChunkPlan') expect(sdk).toHaveProperty('executeBackfill') - expect(sdk).toHaveProperty('buildChunkSql') + expect(sdk).toHaveProperty('buildChunkExecutionSql') }) - test('package exports resolve root and sdk subpath separately', async () => { - const packageRoot = await import('@chkit/plugin-backfill') - const packageSdk = await import('@chkit/plugin-backfill/sdk') - - expect(packageRoot).toHaveProperty('backfill') - expect(packageRoot).not.toHaveProperty('analyzeAndChunk') - expect(packageSdk).toHaveProperty('analyzeAndChunk') - expect(packageSdk).toHaveProperty('executeBackfill') + test('package exports declare root and sdk subpath separately', () => { + expect(pluginBackfillPackage.exports['.']).toEqual({ + source: './src/index.ts', + types: './dist/index.d.ts', + default: './dist/index.js', + }) + expect(pluginBackfillPackage.exports['./sdk']).toEqual({ + source: './src/sdk.ts', + types: './dist/sdk.d.ts', + default: './dist/sdk.js', + }) }) }) diff --git a/packages/plugin-backfill/src/plugin.ts b/packages/plugin-backfill/src/plugin.ts index 53079d8..af5b64b 100644 --- a/packages/plugin-backfill/src/plugin.ts +++ b/packages/plugin-backfill/src/plugin.ts @@ -2,6 +2,8 @@ import { createClickHouseExecutor } from '@chkit/clickhouse' import { wrapPluginRun } from '@chkit/core' import { executeBackfill, type BackfillProgress } from './async-backfill.js' +import { buildChunkExecutionSql } from './chunking/sql.js' +import { generateIdempotencyToken } from './chunking/utils/ids.js' import { BackfillConfigError } from './errors.js' import { PLAN_FLAGS, @@ -112,11 +114,22 @@ async function runBackfill(input: { const result = await executeBackfill({ executor: db, planId: plan.planId, - chunks: plan.chunks.map((c) => ({ id: c.id, from: c.from, to: c.to })), + chunks: plan.chunkPlan.chunks.map((chunk) => ({ id: chunk.id })), buildQuery: (chunk) => { - const planChunk = plan.chunks.find((c) => c.id === chunk.id) + const planChunk = plan.chunkPlan.chunks.find((candidate) => candidate.id === chunk.id) if (!planChunk) throw new Error(`Chunk ${chunk.id} not found in plan`) - return planChunk.sqlTemplate + return buildChunkExecutionSql({ + planId: plan.planId, + chunk: planChunk, + target: plan.target, + sourceTarget: plan.execution.sourceTarget, + table: plan.chunkPlan.table, + mvAsQuery: plan.execution.mvAsQuery, + targetColumns: plan.execution.targetColumns, + idempotencyToken: plan.execution.requireIdempotencyToken + ? generateIdempotencyToken(plan.planId, planChunk.id) + : '', + }) }, concurrency: input.concurrency, pollIntervalMs: input.pollIntervalMs, @@ -215,12 +228,11 @@ export function createBackfillPlugin(options: PluginConfig = {}): BackfillPlugin if (context.jsonMode) { context.print(payload) } else { - const partitionCount = output.plan.partitions?.length ?? 0 - const totalBytes = output.plan.partitions - ? formatBytes(output.plan.partitions.reduce((sum, p) => sum + p.bytesOnDisk, 0)) - : 'unknown' - const sortKeyLabel = output.plan.sortKey - ? `, sort key: ${output.plan.sortKey.column} (${output.plan.sortKey.category})` + const partitionCount = output.plan.chunkPlan.partitions.length + const totalBytes = formatBytes(output.plan.chunkPlan.totalBytesCompressed) + const primarySortKey = output.plan.chunkPlan.table.sortKeys[0] + const sortKeyLabel = primarySortKey + ? `, sort key: ${primarySortKey.name} (${primarySortKey.category})` : '' context.print( `Backfill plan ${payload.planId} for ${payload.target} (${payload.chunkCount} chunks across ${partitionCount} partitions, ~${totalBytes}${sortKeyLabel}) -> ${payload.planPath}` diff --git a/packages/plugin-backfill/src/queries.ts b/packages/plugin-backfill/src/queries.ts index 66780ef..ffdef75 100644 --- a/packages/plugin-backfill/src/queries.ts +++ b/packages/plugin-backfill/src/queries.ts @@ -35,8 +35,8 @@ export async function getBackfillStatus(input: { target: plan.target, status: 'planned', totals: { - total: plan.chunks.length, - pending: plan.chunks.length, + total: plan.chunkPlan.chunks.length, + pending: plan.chunkPlan.chunks.length, submitted: 0, running: 0, done: 0, @@ -108,7 +108,7 @@ export async function getBackfillDoctorReport(input: { planId: plan.planId, target: plan.target, status: 'planned' as const, - totals: { total: plan.chunks.length, pending: plan.chunks.length, submitted: 0, running: 0, done: 0, failed: 0 }, + totals: { total: plan.chunkPlan.chunks.length, pending: plan.chunkPlan.chunks.length, submitted: 0, running: 0, done: 0, failed: 0 }, rowsWritten: 0, updatedAt: plan.createdAt, runPath: paths.runPath, diff --git a/packages/plugin-backfill/src/sdk.ts b/packages/plugin-backfill/src/sdk.ts index 0570001..9edf9df 100644 --- a/packages/plugin-backfill/src/sdk.ts +++ b/packages/plugin-backfill/src/sdk.ts @@ -1,6 +1,7 @@ export { executeBackfill, syncProgress } from './async-backfill.js' -export { analyzeAndChunk, analyzeTable, buildPlannedChunks } from './chunking/analyze.js' -export { buildChunkSql, injectSortKeyFilter, rewriteSelectColumns } from './chunking/sql.js' +export { analyzeAndChunk, analyzeTable } from './chunking/analyze.js' +export { generateChunkPlan } from './chunking/planner.js' +export { buildChunkExecutionSql, injectSortKeyFilter, rewriteSelectColumns } from './chunking/sql.js' export type { BackfillOptions, @@ -17,13 +18,14 @@ export type { } from './chunking/analyze.js' export type { - ChunkBoundary, + Chunk, + ChunkDerivationStep, + ChunkPlan, + ChunkRange, EstimateConfidence, EstimateReason, + FocusedValue, + Partition, PartitionDiagnostics, - PartitionInfo, - PlannedChunk, - SliceLineageStep, - SliceRange, - SortKeyInfo, + SortKey, } from './chunking/types.js' diff --git a/packages/plugin-backfill/src/state.ts b/packages/plugin-backfill/src/state.ts index 45dd900..2b47d52 100644 --- a/packages/plugin-backfill/src/state.ts +++ b/packages/plugin-backfill/src/state.ts @@ -5,6 +5,7 @@ import { dirname, join, resolve } from 'node:path' import type { ResolvedChxConfig } from '@chkit/core' +import { decodeChunkPlanFromPersistence } from './chunking/boundary-codec.js' import { BackfillConfigError } from './errors.js' import type { BackfillEnvironment, @@ -89,6 +90,13 @@ async function readJsonMaybe(filePath: string): Promise { return JSON.parse(await readFile(filePath, 'utf8')) as T } +function decodePlan(plan: BackfillPlanState): BackfillPlanState { + return { + ...plan, + chunkPlan: decodeChunkPlanFromPersistence(plan.chunkPlan), + } +} + export async function writeJson(filePath: string, value: unknown): Promise { await mkdir(dirname(filePath), { recursive: true }) await writeFile(filePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8') @@ -102,12 +110,21 @@ export async function readPlan(input: { }): Promise { const stateDir = computeBackfillStateDir(input.config, input.configPath, input.stateDir) const paths = backfillPaths(stateDir, input.planId) - const plan = await readJsonMaybe(paths.planPath) - if (!plan) { + const rawPlan = await readJsonMaybe>(paths.planPath) + if (!rawPlan) { throw new BackfillConfigError(`Backfill plan not found: ${paths.planPath}`) } + + if (!('chunkPlan' in rawPlan)) { + throw new BackfillConfigError( + `Backfill plan ${input.planId} uses a previous chunking format and can no longer be loaded. Recreate the plan.` + ) + } + + const plan = rawPlan as unknown as BackfillPlanState + return { - plan, + plan: decodePlan(plan), planPath: paths.planPath, stateDir, } @@ -132,7 +149,7 @@ export function summarizeRunStatus( plan: BackfillPlanState, ): BackfillStatusSummary { const totals = { - total: plan.chunks.length, + total: plan.chunkPlan.chunks.length, pending: 0, submitted: 0, running: 0, @@ -141,7 +158,7 @@ export function summarizeRunStatus( } let rowsWritten = 0 - for (const chunk of plan.chunks) { + for (const chunk of plan.chunkPlan.chunks) { const state = run.progress[chunk.id] if (!state) { totals.pending += 1 diff --git a/packages/plugin-backfill/src/types.ts b/packages/plugin-backfill/src/types.ts index cf812ea..4f02865 100644 --- a/packages/plugin-backfill/src/types.ts +++ b/packages/plugin-backfill/src/types.ts @@ -2,13 +2,7 @@ import type { ChxInlinePluginRegistration, ResolvedChxConfig } from '@chkit/core import type { BackfillProgress } from './async-backfill.js' import type { - PartitionDiagnostics, - PartitionInfo, - SliceLineageStep, - SliceRange, - SortKeyInfo, - EstimateConfidence, - EstimateReason, + ChunkPlan, } from './chunking/types.js' import type { PluginConfig } from './options.js' @@ -24,52 +18,41 @@ export interface BackfillEnvironment { export type BackfillPlanStatus = 'planned' | 'running' | 'paused' | 'completed' | 'failed' | 'cancelled' -export type { ChunkBoundary, PartitionInfo, PlannedChunk, SortKeyInfo } from './chunking/types.js' +export type { + Chunk, + ChunkDerivationStep, + ChunkPlan, + ChunkRange, + EstimateConfidence, + EstimateReason, + FocusedValue, + Partition, + PartitionDiagnostics, + SortKey, +} from './chunking/types.js' -export interface BackfillChunk { - id: string - from: string - to: string - status: 'pending' | 'running' | 'done' | 'failed' | 'skipped' - attempts: number - idempotencyToken: string - sqlTemplate: string - lastError?: string - partitionId: string - estimatedBytes: number - estimatedRows?: number - ranges?: SliceRange[] - sortKeyFrom?: string - sortKeyTo?: string - isHotKey?: boolean - hotDimensionIndex?: number - hotKeyValue?: string - estimateConfidence?: EstimateConfidence - estimateReason?: EstimateReason - lineage?: SliceLineageStep[] +export interface BackfillExecutionPlan { + mode: 'copy' | 'mv_replay' + sourceTarget: string + mvAsQuery?: string + targetColumns?: string[] + requireIdempotencyToken: boolean } export interface BackfillPlanState { planId: string target: string createdAt: string - status: BackfillPlanStatus - strategy?: 'table' | 'mv_replay' | 'partition' environment?: BackfillEnvironment from: string to: string - chunks: BackfillChunk[] - partitions?: PartitionInfo[] - sortKey?: SortKeyInfo - sortKeys?: SortKeyInfo[] - partitionDiagnostics?: PartitionDiagnostics[] + chunkPlan: ChunkPlan + execution: BackfillExecutionPlan options: { - chunkHours?: number maxChunkBytes?: number maxParallelChunks: number maxRetriesPerChunk: number requireIdempotencyToken: boolean - timeColumn?: string sortKeyColumn?: string } policy: { From 3f9982011b813b60d11ac0c663b0044b552ce530 Mon Sep 17 00:00:00 2001 From: KeKs0r Date: Thu, 2 Apr 2026 15:50:44 +0200 Subject: [PATCH 4/5] Fix CI --- bun.lock | 1 + packages/plugin-obsessiondb/package.json | 1 + 2 files changed, 2 insertions(+) diff --git a/bun.lock b/bun.lock index 1191f8c..0ddc569 100644 --- a/bun.lock +++ b/bun.lock @@ -93,6 +93,7 @@ "name": "@chkit/plugin-obsessiondb", "version": "0.1.0-beta.19", "dependencies": { + "@chkit/clickhouse": "workspace:*", "@chkit/core": "workspace:*", "@orpc/client": "1.13.4", "@orpc/contract": "1.13.4", diff --git a/packages/plugin-obsessiondb/package.json b/packages/plugin-obsessiondb/package.json index 3e4eee5..01b459a 100644 --- a/packages/plugin-obsessiondb/package.json +++ b/packages/plugin-obsessiondb/package.json @@ -41,6 +41,7 @@ "clean": "rm -rf dist" }, "dependencies": { + "@chkit/clickhouse": "workspace:*", "@chkit/core": "workspace:*", "@orpc/client": "1.13.4", "@orpc/contract": "1.13.4", From fbd3a1986865674ef2156778e92d38e44bedaa1d Mon Sep 17 00:00:00 2001 From: KeKs0r Date: Thu, 2 Apr 2026 17:47:39 +0200 Subject: [PATCH 5/5] Export backfill SDK helpers and obsessiondb service types --- packages/plugin-backfill/README.md | 9 ++++++++- packages/plugin-backfill/src/plugin.test.ts | 4 ++++ packages/plugin-backfill/src/sdk.ts | 12 +++++++++++- packages/plugin-obsessiondb/src/index.ts | 4 ++++ 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/packages/plugin-backfill/README.md b/packages/plugin-backfill/README.md index 4b3f42f..112a391 100644 --- a/packages/plugin-backfill/README.md +++ b/packages/plugin-backfill/README.md @@ -42,7 +42,14 @@ The package root is limited to the plugin registration API. Chunk-planning and async execution internals are exposed from the SDK subpath: ```ts -import { analyzeAndChunk, executeBackfill } from '@chkit/plugin-backfill/sdk' +import { + analyzeAndChunk, + buildWhereClauseFromChunk, + decodeChunkPlanFromPersistence, + encodeChunkPlanForPersistence, + executeBackfill, + generateIdempotencyToken, +} from '@chkit/plugin-backfill/sdk' ``` ## License diff --git a/packages/plugin-backfill/src/plugin.test.ts b/packages/plugin-backfill/src/plugin.test.ts index a3b36e5..cf0f841 100644 --- a/packages/plugin-backfill/src/plugin.test.ts +++ b/packages/plugin-backfill/src/plugin.test.ts @@ -39,6 +39,10 @@ describe('@chkit/plugin-backfill plugin surface', () => { expect(sdk).toHaveProperty('generateChunkPlan') expect(sdk).toHaveProperty('executeBackfill') expect(sdk).toHaveProperty('buildChunkExecutionSql') + expect(sdk).toHaveProperty('buildWhereClauseFromChunk') + expect(sdk).toHaveProperty('encodeChunkPlanForPersistence') + expect(sdk).toHaveProperty('decodeChunkPlanFromPersistence') + expect(sdk).toHaveProperty('generateIdempotencyToken') }) test('package exports declare root and sdk subpath separately', () => { diff --git a/packages/plugin-backfill/src/sdk.ts b/packages/plugin-backfill/src/sdk.ts index 9edf9df..233ff5a 100644 --- a/packages/plugin-backfill/src/sdk.ts +++ b/packages/plugin-backfill/src/sdk.ts @@ -1,7 +1,17 @@ export { executeBackfill, syncProgress } from './async-backfill.js' export { analyzeAndChunk, analyzeTable } from './chunking/analyze.js' +export { + decodeChunkPlanFromPersistence, + encodeChunkPlanForPersistence, +} from './chunking/boundary-codec.js' export { generateChunkPlan } from './chunking/planner.js' -export { buildChunkExecutionSql, injectSortKeyFilter, rewriteSelectColumns } from './chunking/sql.js' +export { + buildChunkExecutionSql, + buildWhereClauseFromChunk, + injectSortKeyFilter, + rewriteSelectColumns, +} from './chunking/sql.js' +export { generateIdempotencyToken } from './chunking/utils/ids.js' export type { BackfillOptions, diff --git a/packages/plugin-obsessiondb/src/index.ts b/packages/plugin-obsessiondb/src/index.ts index 77efd28..d17c6bd 100644 --- a/packages/plugin-obsessiondb/src/index.ts +++ b/packages/plugin-obsessiondb/src/index.ts @@ -12,6 +12,10 @@ import { loadSelectedService } from './service/storage.js' export { loadCredentials, resolveBaseUrl, type Credentials } from './auth/index.js' export { createJobsClient, type JobsClient } from './backfill/index.js' +export { + loadSelectedService, +} from './service/storage.js' +export type { SelectedService } from './service/types.js' export type ObsessionDBPluginOptions = Record