Commit d99670c

fix Reranking if documents is too large

GetRelevanceScores previously queued every document into a single LLamaBatch, which fails once the combined query and document tokens exceed the context size. The batch is now flushed and scored through a new CalcRelevanceScores helper whenever the next document would overflow Context.ContextSize, and sequence ids are rebased after each flush so the returned scores stay aligned with document order.

1 parent c604359 commit d99670c

File tree

1 file changed: LLama/LLamaReranker.cs (+53, -29)
@@ -5,6 +5,7 @@
 using System.Text;
 using System.Threading;
 using System.Threading.Tasks;
+using System.Xml.Linq;
 using LLama.Abstractions;
 using LLama.Exceptions;
 using LLama.Native;
@@ -65,16 +66,52 @@ public void Dispose()
     public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOnlyList<string> documents, bool normalize = false, CancellationToken cancellationToken = default)
     {
         List<float> scores = new List<float>(documents.Count);
-        var batch = new LLamaBatch();
         var inputTokens = Context.Tokenize(input);
-        foreach (var (index, document) in documents.Select((item, index) => (index, item)))
+        var batch = new LLamaBatch();
+        var clearFlag = 0;
+
+        for (var idx = 0; idx < documents.Count; idx++)
         {
-            var docTokens = Context.Tokenize(document);
+            var docTokens = Context.Tokenize(documents[idx]);
             LLamaToken[] tokens = [.. inputTokens, .. docTokens];
+
+            if (batch.TokenCount + tokens.Length > Context.ContextSize)
+            {
+                scores.AddRange(await CalcRelevanceScores(batch, normalize, cancellationToken));
+                batch.Clear();
+                clearFlag = idx;
+            }
+
             for (var i = 0; i < tokens.Length; i++)
-                batch.Add(tokens[i], i, (LLamaSeqId)index, true);
+                batch.Add(tokens[i], i, (LLamaSeqId)(idx - clearFlag), true);
+        }
+        if (batch.LogitPositionCount > 0)
+        {
+            scores.AddRange(await CalcRelevanceScores(batch, normalize, cancellationToken));
+            batch.Clear();
         }

+        return scores;
+    }
+
+    /// <summary>
+    /// Retrieve relevance score for input and document by reranking
+    /// </summary>
+    /// <param name="input"></param>
+    /// <param name="document"></param>
+    /// <param name="cancellationToken"></param>
+    /// <returns></returns>
+    /// <exception cref="RuntimeError"></exception>
+    /// <exception cref="NotSupportedException"></exception>
+    public async Task<(float Score, int Tokens)> GetRelevanceScoreWithTokenCount(string input, string document, bool normalize = false, CancellationToken cancellationToken = default)
+    {
+        var inputTokens = Context.Tokenize(input);
+        var docTokens = Context.Tokenize(document);
+        LLamaToken[] tokens = [..inputTokens, ..docTokens];
+        var batch = new LLamaBatch();
+        for (var i = 0; i < tokens.Length; i++)
+            batch.Add(tokens[i], i, LLamaSeqId.Zero, true);
+
         // clear previous kv_cache values
         Context.NativeHandle.KvCacheClear();
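The subtle part of the hunk above is the sequence-id rebasing: after a flush, idx - clearFlag restarts sequence ids at zero for the new batch, while scores keeps accumulating in document order. A minimal standalone sketch of that bookkeeping with token counts as plain integers; the context size of 8 and the per-document lengths are made up for illustration:

    // Standalone illustration of the flush-and-rebase bookkeeping in this hunk.
    // The context size and per-document token counts are hypothetical.
    var contextSize = 8;
    var batchTokens = 0;
    var clearFlag = 0;
    var docLengths = new[] { 3, 3, 3, 3 };

    for (var idx = 0; idx < docLengths.Length; idx++)
    {
        if (batchTokens + docLengths[idx] > contextSize)
        {
            // Flush: sequences 0..(idx - clearFlag - 1) get scored, then the batch restarts.
            batchTokens = 0;
            clearFlag = idx;
        }
        var seqId = idx - clearFlag; // document idx occupies sequence (idx - clearFlag) of the current batch
        batchTokens += docLengths[idx];
        Console.WriteLine($"doc {idx} -> seq {seqId}");
    }
    // Prints: doc 0 -> seq 0, doc 1 -> seq 1, doc 2 -> seq 0, doc 3 -> seq 1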
@@ -104,35 +141,18 @@ public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOnlyList<string> documents, bool normalize = false, CancellationToken cancellationToken = default)
             throw new NotSupportedException("Unsupported model type");
         }

-        for (var i = 0; i < documents.Count; i++)
-        {
-            var score = Context.NativeHandle.GetEmbeddingsSeq((LLamaSeqId)i)[0];
-            scores.Add(normalize ? Sigmoid(score) : score);
-        }
+        var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0];

         Context.NativeHandle.KvCacheClear();

-        return scores;
+        return (normalize ? Sigmoid(score) : score, tokens.Length);
     }

-    /// <summary>
-    /// Retrieve relevance score for input and document by reranking
-    /// </summary>
-    /// <param name="input"></param>
-    /// <param name="document"></param>
-    /// <param name="cancellationToken"></param>
-    /// <returns></returns>
-    /// <exception cref="RuntimeError"></exception>
-    /// <exception cref="NotSupportedException"></exception>
-    public async Task<(float Score, int Tokens)> GetRelevanceScoreWithTokenCount(string input, string document, bool normalize = false, CancellationToken cancellationToken = default)
+    private async Task<IReadOnlyList<float>> CalcRelevanceScores(LLamaBatch batch, bool normalize = false, CancellationToken cancellationToken = default)
     {
-        var inputTokens = Context.Tokenize(input);
-        var docTokens = Context.Tokenize(document);
-        LLamaToken[] tokens = [..inputTokens, ..docTokens];
-        var batch = new LLamaBatch();
-        for (var i = 0; i < tokens.Length; i++)
-            batch.Add(tokens[i], i, LLamaSeqId.Zero, true);
-
+        var (logicCap, _) = batch.GetLogitPositions()[batch.LogitPositionCount - 1];
+        var seqNum = logicCap.Value + 1;
+        List<float> scores = new List<float>(seqNum);
         // clear previous kv_cache values
         Context.NativeHandle.KvCacheClear();
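A note on the new helper's first two lines: CalcRelevanceScores does not take the sequence count as a parameter but recovers it from the batch. Judging from the usage here, GetLogitPositions() yields one (sequence id, position) pair per logit-bearing token in insertion order, so the sequence id of the final entry is the highest id in the batch and logicCap.Value + 1 is the number of sequences to score. (The name logicCap is the commit's own; "logitCap" was presumably intended.)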
@@ -162,11 +182,15 @@ public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOnlyList<string> documents, bool normalize = false, CancellationToken cancellationToken = default)
             throw new NotSupportedException("Unsupported model type");
         }

-        var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0];
+        for (var seq = 0; seq < seqNum; seq++)
+        {
+            var score = Context.NativeHandle.GetEmbeddingsSeq((LLamaSeqId)seq)[0];
+            scores.Add(normalize ? Sigmoid(score) : score);
+        }

         Context.NativeHandle.KvCacheClear();

-        return (normalize ? Sigmoid(score) : score, tokens.Length);
+        return scores;
     }

     private float Sigmoid(float x)
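For context, a hypothetical usage sketch of the patched API (not part of the commit): the model file name, context size, and the constructor shape, assumed to mirror LLamaSharp's embedder-style setup, are placeholders.

    using LLama;
    using LLama.Common;

    // Hypothetical setup; the model path and parameter values are placeholders.
    var parameters = new ModelParams("bge-reranker-v2-m3.gguf")
    {
        ContextSize = 512, // small on purpose: a large document set now triggers mid-run flushes instead of failing
    };
    using var weights = LLamaWeights.LoadFromFile(parameters);
    using var reranker = new LLamaReranker(weights, parameters);

    var documents = new[] { "passage one ...", "passage two ...", "passage three ..." };

    // Scores come back in document order, even when the batch was flushed mid-loop.
    var scores = await reranker.GetRelevanceScores("what is llama.cpp?", documents, normalize: true);

    // The single-pair variant added by this commit also reports the token count consumed.
    var (score, tokens) = await reranker.GetRelevanceScoreWithTokenCount("what is llama.cpp?", documents[0], normalize: true);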
