Skip to content

Lift the restriction of minimum 8-byte keys for the compact index #746

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 3, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions bench/micro/Bench/Database/LSMTree/Internal/Index.hs
Original file line number Diff line number Diff line change
@@ -11,15 +11,13 @@ import Criterion.Main (Benchmark, Benchmarkable, bench, bgroup, env,
import Data.List (foldl')
-- foldl' is included in the Prelude from base 4.20 onwards
#endif
import Database.LSMTree.Extras.Generators (getKeyForIndexCompact,
mkPages, toAppends)
import Database.LSMTree.Extras.Generators (mkPages, toAppends)
-- also for @Arbitrary@ instantiation of @SerialisedKey@
import Database.LSMTree.Extras.Index (Append, append)
import Database.LSMTree.Internal.Index (Index,
IndexType (Compact, Ordinary), newWithDefaults, search,
unsafeEnd)
import Database.LSMTree.Internal.Serialise
(SerialisedKey (SerialisedKey))
import Database.LSMTree.Internal.Serialise (SerialisedKey)
import Test.QuickCheck (choose, vector)
import Test.QuickCheck.Gen (Gen (MkGen))
import Test.QuickCheck.Random (mkQCGen)
@@ -61,8 +59,7 @@ generated (MkGen exec) = exec (mkQCGen 411) 30
keysForIndexCompact :: Int -- ^ Number of keys
-> [SerialisedKey] -- ^ Constructed keys
keysForIndexCompact = vector >>>
generated >>>
map (getKeyForIndexCompact >>> SerialisedKey)
generated

{-|
Constructs append operations whose serialised keys conform to the key size
5 changes: 3 additions & 2 deletions doc/format-run.md
Original file line number Diff line number Diff line change
@@ -198,8 +198,9 @@ big-endian.
The compact index type is designed to work with keys that are large
cryptographic hashes, e.g. 32 bytes. In particular it requires:

* keys must be uniformly distributed
* keys must be at least 8 bytes (64bits), but can otherwise be variable length
* keys must be uniformly distributed;
* keys can be of variable length;
* keys less than 8 bytes (64bits) are padded with zeros (in LSB position).

For this important special case, we can do significantly better than storing a
whole key per page: we can typically store just 8 bytes (64bits) per page. This
24 changes: 0 additions & 24 deletions src-extras/Database/LSMTree/Extras/Generators.hs
Original file line number Diff line number Diff line change
@@ -33,8 +33,6 @@ module Database.LSMTree.Extras.Generators (
, genRawBytesSized
, packRawBytesPinnedOrUnpinned
, LargeRawBytes (..)
, isKeyForIndexCompact
, KeyForIndexCompact (..)
, BiasedKey (..)
-- * helpers
, shrinkVec
@@ -510,28 +508,6 @@ instance Arbitrary LargeRawBytes where

deriving newtype instance SerialiseValue LargeRawBytes

-- Serialised keys for the compact index must be at least 8 bytes long.

genKeyForIndexCompact :: Gen RawBytes
genKeyForIndexCompact =
genRawBytesN =<< QC.sized (\s -> QC.chooseInt (8, s + 8))

isKeyForIndexCompact :: RawBytes -> Bool
isKeyForIndexCompact rb = RB.size rb >= 8

newtype KeyForIndexCompact =
KeyForIndexCompact { getKeyForIndexCompact :: RawBytes }
deriving stock (Eq, Ord, Show)

instance Arbitrary KeyForIndexCompact where
arbitrary =
KeyForIndexCompact <$> genKeyForIndexCompact
shrink (KeyForIndexCompact rawBytes) =
[KeyForIndexCompact rawBytes' | rawBytes' <- shrink rawBytes,
isKeyForIndexCompact rawBytes']

deriving newtype instance SerialiseKey KeyForIndexCompact

-- we try to make collisions and close keys more likely (very crudely)
arbitraryBiasedKey :: (RawBytes -> k) -> Gen RawBytes -> Gen k
arbitraryBiasedKey fromRB genUnbiased = fromRB <$> frequency
4 changes: 1 addition & 3 deletions src/Database/LSMTree.hs
Original file line number Diff line number Diff line change
@@ -149,7 +149,6 @@ module Database.LSMTree (
serialiseKeyIdentity,
serialiseKeyIdentityUpToSlicing,
serialiseKeyPreservesOrdering,
serialiseKeyMinimalSize,
serialiseValueIdentity,
serialiseValueIdentityUpToSlicing,
packSlice,
@@ -227,8 +226,7 @@ import Database.LSMTree.Internal.Config
DiskCachePolicy (..), FencePointerIndexType (..),
LevelNo (..), MergeBatchSize (..), MergePolicy (..),
MergeSchedule (..), SizeRatio (..), TableConfig (..),
WriteBufferAlloc (..), defaultTableConfig,
serialiseKeyMinimalSize)
WriteBufferAlloc (..), defaultTableConfig)
import Database.LSMTree.Internal.Config.Override
(TableConfigOverride (..), noTableConfigOverride)
import Database.LSMTree.Internal.Entry (NumEntries (..))
16 changes: 4 additions & 12 deletions src/Database/LSMTree/Internal/Config.hs
Original file line number Diff line number Diff line change
@@ -20,7 +20,6 @@ module Database.LSMTree.Internal.Config (
-- * Fence pointer index
, FencePointerIndexType (..)
, indexTypeForRun
, serialiseKeyMinimalSize
-- * Disk cache policy
, DiskCachePolicy (..)
, diskCachePolicyForLevel
@@ -36,11 +35,9 @@ import Database.LSMTree.Internal.Index (IndexType)
import qualified Database.LSMTree.Internal.Index as Index
(IndexType (Compact, Ordinary))
import qualified Database.LSMTree.Internal.MergingRun as MR
import qualified Database.LSMTree.Internal.RawBytes as RB
import Database.LSMTree.Internal.Run (RunDataCaching (..))
import Database.LSMTree.Internal.RunAcc (RunBloomFilterAlloc (..))
import Database.LSMTree.Internal.RunBuilder (RunParams (..))
import Database.LSMTree.Internal.Serialise.Class (SerialiseKey (..))

newtype LevelNo = LevelNo Int
deriving stock (Show, Eq, Ord)
@@ -321,12 +318,11 @@ data FencePointerIndexType =
| {- |
Compact indexes are designed for the case where the keys in the database are uniformly distributed, e.g., when the keys are hashes.

When using a compact index, the 'Database.LSMTree.Internal.Serialise.Class.serialiseKey' function must satisfy the following additional law:
When using a compact index, some requirements apply to serialised keys:

[Minimal size]
@'Database.LSMTree.Internal.RawBytes.size' ('Database.LSMTree.Internal.Serialise.Class.serialiseKey' x) >= 8@

Use 'serialiseKeyMinimalSize' to test this law.
* keys must be uniformly distributed;
* keys can be of variable length;
* keys less than 8 bytes (64bits) are padded with zeros (in LSB position).
-}
CompactIndex
deriving stock (Eq, Show)
@@ -339,10 +335,6 @@ indexTypeForRun :: FencePointerIndexType -> IndexType
indexTypeForRun CompactIndex = Index.Compact
indexTypeForRun OrdinaryIndex = Index.Ordinary

-- | Test the __Minimal size__ law for the 'CompactIndex' option.
serialiseKeyMinimalSize :: SerialiseKey k => k -> Bool
serialiseKeyMinimalSize x = RB.size (serialiseKey x) >= 8

{-------------------------------------------------------------------------------
Disk cache policy
-------------------------------------------------------------------------------}
4 changes: 2 additions & 2 deletions src/Database/LSMTree/Internal/MergeSchedule.hs
Original file line number Diff line number Diff line change
@@ -673,7 +673,7 @@ addRunToLevels tr conf@TableConfig{..} resolve hfs hbio root salt uc r0 reg leve
traceWith tr $ AtLevel ln TraceAddLevel
-- Make a new level
let policyForLevel = mergePolicyForLevel confMergePolicy ln V.empty ul
ir <- newMerge policyForLevel MR.MergeLastLevel ln rs
ir <- newMerge policyForLevel (mergeTypeForLevel V.empty ul) ln rs
pure $! V.singleton $ Level ir V.empty
go !ln rs' (V.uncons -> Just (Level ir rs, ls)) = do
r <- expectCompletedMerge ln ir
@@ -714,7 +714,7 @@ addRunToLevels tr conf@TableConfig{..} resolve hfs hbio root salt uc r0 reg leve
-- Otherwise we start merging the incoming runs into the run.
LevelLevelling -> do
assert (V.null rs && V.null ls) $ pure ()
ir' <- newMerge LevelLevelling MR.MergeLastLevel ln (rs' `V.snoc` r)
ir' <- newMerge LevelLevelling (mergeTypeForLevel ls ul) ln (rs' `V.snoc` r)
pure $! Level ir' V.empty `V.cons` V.empty

-- Releases the incoming run.
26 changes: 22 additions & 4 deletions src/Database/LSMTree/Internal/RawBytes.hs
Original file line number Diff line number Diff line change
@@ -51,7 +51,7 @@ module Database.LSMTree.Internal.RawBytes (
) where

import Control.DeepSeq (NFData)
import Control.Exception (assert)
import Data.Bits (Bits (..))
import Data.BloomFilter.Hash (Hashable (..), hashByteArray)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Builder as BB
@@ -71,6 +71,9 @@ import GHC.Stack
import GHC.Word
import Text.Printf (printf)

-- $setup
-- >>> import Numeric

{- Note: [Export structure]
~~~~~~~~~~~~~~~~~~~~~~~
Since RawBytes are very similar to Primitive Vectors, the code is sectioned
@@ -172,14 +175,29 @@ drop = coerce VP.drop
--
-- The /top/ corresponds to the most significant bit (big-endian).
--
-- PRECONDITION: The byte-size of the raw bytes should be at least 8 bytes.
-- If the number of bits is smaller than @64@, then any missing bits default to
-- @0@s.
--
-- >>> showHex (topBits64 (pack [1,0,0,0,0,0,0,0])) ""
-- "100000000000000"
--
-- >>> showHex (topBits64 (pack [1,0,0])) ""
-- "100000000000000"
--
-- TODO: optimisation ideas: use unsafe shift/byteswap primops, look at GHC
-- core, find other opportunities for using primops.
--
topBits64 :: RawBytes -> Word64
topBits64 rb@(RawBytes (VP.Vector (I# off#) _size (ByteArray k#))) =
assert (size rb >= 8) $ toWord64 (indexWord8ArrayAsWord64# k# off#)
topBits64 rb@(RawBytes v@(VP.Vector (I# off#) _size (ByteArray k#)))
| n >= 8
= toWord64 (indexWord8ArrayAsWord64# k# off#)
| otherwise
= VP.foldl' f 0 v `unsafeShiftL` ((8 - n) * 8)
where
!n = size rb

f :: Word64 -> Word8 -> Word64
f acc w = acc `unsafeShiftL` 8 + fromIntegral w

#if (MIN_VERSION_GLASGOW_HASKELL(9, 4, 0, 0))
toWord64 :: Word64# -> Word64
6 changes: 2 additions & 4 deletions src/Database/LSMTree/Simple.hs
Original file line number Diff line number Diff line change
@@ -134,7 +134,6 @@ module Database.LSMTree.Simple (
serialiseKeyIdentity,
serialiseKeyIdentityUpToSlicing,
serialiseKeyPreservesOrdering,
serialiseKeyMinimalSize,
serialiseValueIdentity,
serialiseValueIdentityUpToSlicing,
packSlice,
@@ -182,9 +181,8 @@ import Database.LSMTree (BloomFilterAlloc, CursorClosedError (..),
UnionCredits (..), UnionDebt (..), WriteBufferAlloc,
isValidSnapshotName, noTableConfigOverride, packSlice,
serialiseKeyIdentity, serialiseKeyIdentityUpToSlicing,
serialiseKeyMinimalSize, serialiseKeyPreservesOrdering,
serialiseValueIdentity, serialiseValueIdentityUpToSlicing,
toSnapshotName)
serialiseKeyPreservesOrdering, serialiseValueIdentity,
serialiseValueIdentityUpToSlicing, toSnapshotName)
import qualified Database.LSMTree as LSMT
import qualified Database.LSMTree.Internal.Types as LSMT
import qualified Database.LSMTree.Internal.Unsafe as Internal
3 changes: 0 additions & 3 deletions test/Test/Database/LSMTree/Generators.hs
Original file line number Diff line number Diff line change
@@ -64,9 +64,6 @@ tests = testGroup "Test.Database.LSMTree.Generators" [
prop_arbitraryAndShrinkPreserveInvariant
(\(LargeRawBytes rb) -> labelRawBytes rb)
(deepseqInvariant @LargeRawBytes)
, testGroup "KeyForIndexCompact" $
prop_arbitraryAndShrinkPreserveInvariant noTags $
isKeyForIndexCompact . getKeyForIndexCompact
, testGroup "BiasedKey" $
prop_arbitraryAndShrinkPreserveInvariant
(labelTestKOps @BiasedKey)
13 changes: 4 additions & 9 deletions test/Test/Database/LSMTree/Internal/Index/Compact.hs
Original file line number Diff line number Diff line change
@@ -28,7 +28,7 @@ import Data.Word
import Database.LSMTree.Extras
import Database.LSMTree.Extras.Generators (ChunkSize (..),
LogicalPageSummaries, LogicalPageSummary (..), Pages (..),
genRawBytes, isKeyForIndexCompact, labelPages, toAppends)
genRawBytes, labelPages, toAppends)
import Database.LSMTree.Extras.Index (Append (..), appendToCompact)
import Database.LSMTree.Internal.BitMath
import Database.LSMTree.Internal.Chunk as Chunk (toByteString)
@@ -54,9 +54,7 @@ import Text.Printf (printf)

tests :: TestTree
tests = testGroup "Test.Database.LSMTree.Internal.Index.Compact" [
testGroup "TestKey" $
prop_arbitraryAndShrinkPreserveInvariant @TestKey noTags isTestKey
, testProperty "prop_distribution @TestKey" $
testProperty "prop_distribution @TestKey" $
prop_distribution @TestKey
, testProperty "prop_searchMinMaxKeysAfterConstruction" $
prop_searchMinMaxKeysAfterConstruction @TestKey 100
@@ -173,15 +171,12 @@ instance Arbitrary TestKey where
-- Shrink keys extensively: most failures will occur in small counterexamples,
-- so we don't have to limit the number of shrinks as much.
shrink (TestKey bytes) = [
TestKey bytes'
testkey'
| let RawBytes vec = bytes
, vec' <- VP.fromList <$> shrink (VP.toList vec)
, let bytes' = RawBytes vec'
, isKeyForIndexCompact bytes'
, let testkey' = TestKey $ RawBytes vec'
]

isTestKey :: TestKey -> Bool
isTestKey (TestKey bytes) = isKeyForIndexCompact bytes

{-------------------------------------------------------------------------------
Properties
10 changes: 3 additions & 7 deletions test/Test/Database/LSMTree/Internal/Lookup.hs
Original file line number Diff line number Diff line change
@@ -60,7 +60,7 @@ import qualified Database.LSMTree.Internal.Run as Run
import Database.LSMTree.Internal.RunAcc as Run
import Database.LSMTree.Internal.RunBuilder
(RunDataCaching (CacheRunData), RunParams (RunParams))
import Database.LSMTree.Internal.Serialise
import Database.LSMTree.Internal.Serialise as Serialise
import Database.LSMTree.Internal.Serialise.Class
import Database.LSMTree.Internal.UniqCounter
import qualified Database.LSMTree.Internal.WriteBuffer as WB
@@ -569,14 +569,10 @@ liftShrink3InMemLookupData shrinkKey shrinkValue shrinkBlob InMemLookupData{ run
shrinkEntry = liftShrink2 shrinkValue shrinkBlob

genSerialisedKey :: Gen SerialisedKey
genSerialisedKey = frequency [
(9, arbitrary `suchThat` (\k -> sizeofKey k >= 8))
, (1, do x <- getSmall <$> arbitrary
pure $ SerialisedKey (RB.pack [0,0,0,0,0,0,0, x]))
]
genSerialisedKey = Serialise.serialiseKey <$> arbitraryBoundedIntegral @Word64

shrinkSerialisedKey :: SerialisedKey -> [SerialisedKey]
shrinkSerialisedKey k = [k' | k' <- shrink k, sizeofKey k' >= 8]
shrinkSerialisedKey k = Serialise.serialiseKey <$> shrink (Serialise.deserialiseKey k :: Word64)

genSerialisedValue :: Gen SerialisedValue
genSerialisedValue = frequency [ (50, arbitrary), (1, genLongValue) ]
32 changes: 29 additions & 3 deletions test/Test/Database/LSMTree/Internal/RawBytes.hs
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
{-# LANGUAGE OverloadedLists #-}

module Test.Database.LSMTree.Internal.RawBytes (tests) where

import Data.Bits (Bits (shiftL))
import qualified Data.List as List
import qualified Data.Vector.Primitive as VP
import Database.LSMTree.Extras.Generators ()
import Database.LSMTree.Internal.RawBytes (RawBytes)
import qualified Database.LSMTree.Internal.RawBytes as RB (size)
import Database.LSMTree.Internal.RawBytes (RawBytes (RawBytes))
import qualified Database.LSMTree.Internal.RawBytes as RB
import Test.QuickCheck (Property, classify, collect, mapSize,
withDiscardRatio, withMaxSuccess, (.||.), (===), (==>))
import Test.Tasty (TestTree, testGroup)
@@ -26,7 +31,9 @@ tests = testGroup "Test.Database.LSMTree.Internal.RawBytes" $
testProperty "Transitivity" prop_ordTransitivity,
testProperty "Reflexivity" prop_ordReflexivity,
testProperty "Antisymmetry" prop_ordAntisymmetry
]
],
testProperty "prop_topBits64" prop_topBits64,
testProperty "prop_topBits64_default0s" prop_topBits64_default0s
]

-- * Utilities
@@ -92,3 +99,22 @@ prop_ordAntisymmetry = mapSize (const 4) $
untunedProp block1 block2
= withFirstBlockSizeInfo block1 $
block1 <= block2 && block2 <= block1 ==> block1 === block2

{-------------------------------------------------------------------------------
Accessors
-------------------------------------------------------------------------------}

-- | Compare 'topBits64' against a model
prop_topBits64 :: RawBytes -> Property
prop_topBits64 x@(RawBytes v) =
expected === RB.topBits64 x
where
expected =
let ws = take 8 (VP.toList v ++ repeat 0)
in List.foldl' (\acc w -> acc `shiftL` 8 + fromIntegral w) 0 ws

-- | If @x@ has fewer than 8 bytes, then all missing bits in the result default
-- to 0s.
prop_topBits64_default0s :: RawBytes -> Property
prop_topBits64_default0s x =
RB.topBits64 x === RB.topBits64 (x <> mconcat (replicate 8 [0]))
5 changes: 2 additions & 3 deletions test/Test/Database/LSMTree/Internal/RunAcc.hs
Original file line number Diff line number Diff line change
@@ -143,8 +143,7 @@ fromProtoValue (Proto.Value bs) = SerialisedValue . RB.fromShortByteString $ SBS
fromProtoBlobRef :: Proto.BlobRef -> BlobSpan
fromProtoBlobRef (Proto.BlobRef x y) = BlobSpan x y

-- | Wrapper around 'PageLogical' that generates nearly-full pages, and
-- keys that are always large enough (>= 8 bytes) for the compact index.
-- | Wrapper around 'PageLogical' that generates nearly-full pages.
newtype PageLogical' = PageLogical' { getPrototypeKOps :: [(Proto.Key, Proto.Operation)] }
deriving stock Show

@@ -153,7 +152,7 @@ getRealKOps = fmap fromProtoKOp . getPrototypeKOps

instance Arbitrary PageLogical' where
arbitrary = PageLogical' <$>
Proto.genPageContentFits Proto.DiskPage4k (Proto.MinKeySize 8)
Proto.genPageContentFits Proto.DiskPage4k Proto.noMinKeySize
shrink (PageLogical' page) =
[ PageLogical' page' | page' <- shrink page ]

6 changes: 3 additions & 3 deletions test/Test/Database/LSMTree/StateMachine.hs
Original file line number Diff line number Diff line change
@@ -92,11 +92,11 @@ import qualified Database.LSMTree as R
import Database.LSMTree.Class (Entry (..), LookupResult (..))
import qualified Database.LSMTree.Class as Class
import Database.LSMTree.Extras (showPowersOf)
import Database.LSMTree.Extras.Generators (KeyForIndexCompact)
import Database.LSMTree.Extras.Generators ()
import Database.LSMTree.Extras.NoThunks (propNoThunks)
import qualified Database.LSMTree.Internal.Config as R (TableConfig (..))
import Database.LSMTree.Internal.Serialise (SerialisedBlob,
SerialisedValue)
SerialisedKey, SerialisedValue)
import qualified Database.LSMTree.Internal.Types as R.Types
import qualified Database.LSMTree.Internal.Unsafe as R.Unsafe
import qualified Database.LSMTree.Model.IO as ModelIO
@@ -574,7 +574,7 @@ handleFsError = Model.ErrFsError . displayException
Key and value types
-------------------------------------------------------------------------------}

newtype Key = Key KeyForIndexCompact
newtype Key = Key SerialisedKey
deriving stock (Show, Eq, Ord)
deriving newtype (Arbitrary, R.SerialiseKey)