Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 76 additions & 24 deletions src/Infidex.Tests/PersistenceTests.cs
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
using Microsoft.VisualStudio.TestTools.UnitTesting;
using Infidex.Core;
using CsvHelper;
using CsvHelper.Configuration;
using Infidex.Api;
using System.IO;
using Infidex.Core;
using System.Globalization;
using CsvHelper;
using CsvHelper.Configuration;

namespace Infidex.Tests;

Expand All @@ -25,15 +23,15 @@ public void SaveAndLoadIndex_PreservesData()
new Document(2L, "jumps over the lazy dog")
};
engine.IndexDocuments(documents);

// Verify search before save
var resultsBefore = engine.Search(new Query("fox", 10));
Assert.AreEqual(1, resultsBefore.Records.Length);
Assert.AreEqual(1L, resultsBefore.Records[0].DocumentId);

// 2. Save
engine.Save(filePath);

// 3. Load
// Load with the default config; it should generally match the config that was used when indexing
var config = ConfigurationParameters.GetConfig(400);
Expand All @@ -49,17 +47,17 @@ public void SaveAndLoadIndex_PreservesData()
config.StopTermLimit,
config.WordMatcherSetup
);

// 4. Verify search after load
var resultsAfter = loadedEngine.Search(new Query("fox", 10));
Assert.AreEqual(1, resultsAfter.Records.Length);
Assert.AreEqual(1L, resultsAfter.Records[0].DocumentId);

// Verify another term
var resultsDog = loadedEngine.Search(new Query("dog", 10));
Assert.AreEqual(1, resultsDog.Records.Length);
Assert.AreEqual(2L, resultsDog.Records[0].DocumentId);

// Verify statistics
var statsBefore = engine.GetStatistics();
var statsAfter = loadedEngine.GetStatistics();
Expand All @@ -72,7 +70,7 @@ public void SaveAndLoadIndex_PreservesData()
File.Delete(filePath);
}
}

[TestMethod]
public void SaveAndLoad40kMovies_MeasureIndexSize()
{
Expand All @@ -82,32 +80,32 @@ public void SaveAndLoad40kMovies_MeasureIndexSize()
// Load movies from CSV
var movies = LoadMovies();
Console.WriteLine($"Loaded {movies.Count} movies from CSV");

// Create and index
var engine = SearchEngine.CreateDefault();
var documents = movies.Select((m, i) =>
new Document((long)i, m.Title)).ToList();

Console.WriteLine($"Indexing {documents.Count} movie titles...");
engine.IndexDocuments(documents);

var stats = engine.GetStatistics();
Console.WriteLine($"Index stats: {stats.DocumentCount} documents, {stats.VocabularySize} unique terms");

// Test search before save
var testResults = engine.Search(new Query("redemption", 5));
Console.WriteLine($"Test search found {testResults.Records.Length} results");

// Save index
Console.WriteLine("Saving index to disk...");
engine.Save(filePath);

// Measure file size
var fileInfo = new FileInfo(filePath);
long fileSizeBytes = fileInfo.Length;
double fileSizeKB = fileSizeBytes / 1024.0;
double fileSizeMB = fileSizeKB / 1024.0;

Console.WriteLine($"\n=== INDEX FILE SIZE METRICS ===");
Console.WriteLine($"Documents indexed: {documents.Count:N0}");
Console.WriteLine($"Unique terms: {stats.VocabularySize:N0}");
Expand All @@ -116,7 +114,7 @@ public void SaveAndLoad40kMovies_MeasureIndexSize()
Console.WriteLine($"File size: {fileSizeMB:N2} MB");
Console.WriteLine($"Bytes per document: {fileSizeBytes / (double)documents.Count:N2}");
Console.WriteLine($"================================\n");

// Load back and verify
Console.WriteLine("Loading index from disk...");
var config = ConfigurationParameters.GetConfig(400);
Expand All @@ -132,17 +130,17 @@ public void SaveAndLoad40kMovies_MeasureIndexSize()
config.StopTermLimit,
config.WordMatcherSetup
);

// Verify loaded index works
var loadedStats = loadedEngine.GetStatistics();
Assert.AreEqual(stats.DocumentCount, loadedStats.DocumentCount);
Assert.AreEqual(stats.VocabularySize, loadedStats.VocabularySize);

// Verify search results match
var loadedResults = loadedEngine.Search(new Query("redemption", 5));
Assert.AreEqual(testResults.Records.Length, loadedResults.Records.Length);
Console.WriteLine($"Loaded index verified: search returned {loadedResults.Records.Length} results");

// Additional searches to verify quality
var searchTerms = new[] { "batman", "matrix", "star wars", "love", "action" };
foreach (var term in searchTerms)
Expand All @@ -157,7 +155,61 @@ public void SaveAndLoad40kMovies_MeasureIndexSize()
File.Delete(filePath);
}
}


[TestMethod]
public void SaveAndLoadIndex_UnicodeSurrogateCharacters()
{
    // One code point (U+1F50D) that UTF-16 represents as a surrogate pair;
    // it is used both as the indexed text and as the search query.
    const string surrogateText = "\uD83D\uDD0D";
    string filePath = "surrogates_index.bin";

    try
    {
        // Index a single document whose text is just the surrogate pair.
        var sourceEngine = SearchEngine.CreateDefault();
        sourceEngine.IndexDocuments(new[] { new Document(1L, surrogateText) });

        // Sanity check: the in-memory index finds the document before any persistence.
        var hitsBeforeSave = sourceEngine.Search(new Query(surrogateText, 10));
        Assert.HasCount(1, hitsBeforeSave.Records);
        Assert.AreEqual(1L, hitsBeforeSave.Records[0].DocumentId);

        // Round-trip the index through the file on disk.
        sourceEngine.Save(filePath);

        var loadConfig = ConfigurationParameters.GetConfig(400);
        var restoredEngine = SearchEngine.Load(
            filePath,
            loadConfig.IndexSizes,
            loadConfig.StartPadSize,
            loadConfig.StopPadSize,
            true,
            loadConfig.TextNormalizer,
            loadConfig.TokenizerSetup,
            null,
            loadConfig.StopTermLimit,
            loadConfig.WordMatcherSetup
        );

        // The reloaded index must return the same hit for the same query.
        var hitsAfterLoad = restoredEngine.Search(new Query(surrogateText, 10));
        Assert.HasCount(1, hitsAfterLoad.Records);
        Assert.AreEqual(1L, hitsAfterLoad.Records[0].DocumentId);

        // Document and vocabulary counts must be unchanged by the save/load cycle.
        var originalStats = sourceEngine.GetStatistics();
        var restoredStats = restoredEngine.GetStatistics();
        Assert.AreEqual(originalStats.DocumentCount, restoredStats.DocumentCount);
        Assert.AreEqual(originalStats.VocabularySize, restoredStats.VocabularySize);
    }
    finally
    {
        // Remove the temporary index file whether or not the assertions passed.
        if (File.Exists(filePath))
        {
            File.Delete(filePath);
        }
    }
}

private static List<MovieRecord> LoadMovies()
{
var config = new CsvConfiguration(CultureInfo.InvariantCulture)
Expand Down
48 changes: 23 additions & 25 deletions src/Infidex/Indexing/Fst/FstSerializer.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
using System.Runtime.InteropServices;

namespace Infidex.Indexing.Fst;

/// <summary>
Expand All @@ -10,7 +8,7 @@ internal static class FstSerializer
{
private const uint FST_MAGIC = 0x46535432; // ASCII "FST2", most-significant byte first; BinaryWriter emits it little-endian, so the file starts with bytes "2TSF"
private const ushort FST_VERSION = 1;

/// <summary>
/// Serializes an FST index to a binary writer.
/// </summary>
Expand All @@ -20,21 +18,21 @@ public static void Write(BinaryWriter writer, FstIndex index)
writer.Write(FST_MAGIC);
writer.Write(FST_VERSION);
writer.Write(index.TermCount);

(FstNode[] forwardNodes, FstArc[] forwardArcs, int forwardRoot) = index.GetForwardFst();
(FstNode[] reverseNodes, FstArc[] reverseArcs, int reverseRoot) = index.GetReverseFst();

// Write forward FST
WriteNodeArray(writer, forwardNodes);
WriteArcArray(writer, forwardArcs);
writer.Write(forwardRoot);

// Write reverse FST
WriteNodeArray(writer, reverseNodes);
WriteArcArray(writer, reverseArcs);
writer.Write(reverseRoot);
}

/// <summary>
/// Deserializes an FST index from a binary reader.
/// </summary>
Expand All @@ -44,33 +42,33 @@ public static FstIndex Read(BinaryReader reader)
uint magic = reader.ReadUInt32();
if (magic != FST_MAGIC)
throw new InvalidDataException($"Invalid FST magic number: 0x{magic:X8}");

ushort version = reader.ReadUInt16();
if (version != FST_VERSION)
throw new InvalidDataException($"Unsupported FST version: {version}");

int termCount = reader.ReadInt32();

// Read forward FST
FstNode[] forwardNodes = ReadNodeArray(reader);
FstArc[] forwardArcs = ReadArcArray(reader);
int forwardRoot = reader.ReadInt32();

// Read reverse FST
FstNode[] reverseNodes = ReadNodeArray(reader);
FstArc[] reverseArcs = ReadArcArray(reader);
int reverseRoot = reader.ReadInt32();

return new FstIndex(
forwardNodes, forwardArcs, forwardRoot,
reverseNodes, reverseArcs, reverseRoot,
termCount);
}

private static void WriteNodeArray(BinaryWriter writer, FstNode[] nodes)
{
writer.Write(nodes.Length);

foreach (ref readonly FstNode node in nodes.AsSpan())
{
writer.Write(node.ArcStartIndex);
Expand All @@ -79,12 +77,12 @@ private static void WriteNodeArray(BinaryWriter writer, FstNode[] nodes)
writer.Write(node.Output);
}
}

private static FstNode[] ReadNodeArray(BinaryReader reader)
{
int length = reader.ReadInt32();
FstNode[] nodes = new FstNode[length];

for (int i = 0; i < length; i++)
{
nodes[i] = new FstNode
Expand All @@ -95,41 +93,41 @@ private static FstNode[] ReadNodeArray(BinaryReader reader)
Output = reader.ReadInt32()
};
}

return nodes;
}

/// <summary>
/// Writes an arc array: an Int32 element count followed, for each arc, by its
/// label (as a UInt16), target node index, output and final flag.
/// </summary>
/// <param name="writer">Destination binary writer.</param>
/// <param name="arcs">Arcs to serialize; the array length is written first.</param>
private static void WriteArcArray(BinaryWriter writer, FstArc[] arcs)
{
    writer.Write(arcs.Length);

    foreach (ref readonly FstArc arc in arcs.AsSpan())
    {
        // Store the label exactly once, as its raw UTF-16 code unit.
        // BinaryWriter.Write(char) goes through the writer's text encoding and throws
        // ArgumentException for a lone surrogate, which is what a single arc label is
        // whenever a term contains a character outside the Basic Multilingual Plane.
        writer.Write((ushort)arc.Label);
        writer.Write(arc.TargetNodeIndex);
        writer.Write(arc.Output);
        writer.Write(arc.IsFinal);
    }
}

/// <summary>
/// Reads an arc array in the layout produced by <see cref="WriteArcArray"/>:
/// an Int32 element count, then per arc a UInt16 label, Int32 target node index,
/// Int32 output and Boolean final flag.
/// </summary>
/// <param name="reader">Source binary reader, positioned at the element count.</param>
/// <returns>The deserialized arcs, in the order they were stored.</returns>
private static FstArc[] ReadArcArray(BinaryReader reader)
{
    int length = reader.ReadInt32();
    FstArc[] arcs = new FstArc[length];

    for (int i = 0; i < length; i++)
    {
        arcs[i] = new FstArc
        {
            // Mirror the writer: the label is a fixed-width UTF-16 code unit, so it is read
            // as a UInt16 rather than via ReadChar(), which decodes through the text encoding.
            // NOTE(review): data written with Write(char) has a different label width;
            // confirm whether FST_VERSION needs to change for such files.
            Label = (char)reader.ReadUInt16(),
            TargetNodeIndex = reader.ReadInt32(),
            Output = reader.ReadInt32(),
            IsFinal = reader.ReadBoolean()
        };
    }

    return arcs;
}

}

Loading
Loading