From 2105dc3562f1a391704fc26fc729e2467325270d Mon Sep 17 00:00:00 2001
From: nipeone
Date: Thu, 27 Mar 2025 15:02:42 +0800
Subject: [PATCH 01/11] add support for linux-arm64

---
 .github/workflows/compile.yml             | 33 +++++++++++----
 LLama/LLamaSharp.Runtime.targets          | 18 ++++++++
 LLama/Native/Load/NativeLibraryUtils.cs   | 42 +++++++++++++------
 LLama/Native/Load/NativeLibraryWithAvx.cs | 16 ++++---
 .../build/LLamaSharp.Backend.Cpu.nuspec   |  8 +++-
 5 files changed, 91 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 5b84f6753..6226d7379 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -28,13 +28,25 @@ jobs:
         include:
           - build: 'noavx'
             defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
+            os: ubuntu-20.04
+            arch: x64
           - build: 'avx2'
            defines: ''
+            os: ubuntu-20.04
+            arch: x64
           - build: 'avx'
             defines: '-DGGML_AVX2=OFF'
+            os: ubuntu-20.04
+            arch: x64
           - build: 'avx512'
             defines: '-DGGML_AVX512=ON'
-    runs-on: ubuntu-20.04
+            os: ubuntu-20.04
+            arch: x64
+          - build: 'aarch64'
+            defines: '-DGGML_NATIVE=OFF -DGGML_CPU_AARCH64=ON -DGGML_CPU_ARM_ARCH=generic'
+            os: ubuntu-20.04-arm
+            arch: arm64
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
         with:
@@ -52,28 +64,28 @@ jobs:
      - uses: actions/upload-artifact@v4
        with:
          path: ./build/bin/libllama.so
-          name: llama-bin-linux-${{ matrix.build }}-x64.so
+          name: llama-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so
          if-no-files-found: error
      - uses: actions/upload-artifact@v4
        with:
          path: ./build/bin/libggml.so
-          name: ggml-bin-linux-${{ matrix.build }}-x64.so
+          name: ggml-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so
          if-no-files-found: error
      - uses: actions/upload-artifact@v4
        with:
          path: ./build/bin/libggml-base.so
-          name: ggml-base-bin-linux-${{ matrix.build }}-x64.so
+          name: ggml-base-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so
          if-no-files-found: error
      - uses: actions/upload-artifact@v4
        with:
          path: ./build/bin/libggml-cpu.so
-          name: ggml-cpu-bin-linux-${{ matrix.build }}-x64.so
+          name: ggml-cpu-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so
          if-no-files-found: error
      - name: Upload Llava
        uses: actions/upload-artifact@v4
        with:
          path: ./build/bin/libllava_shared.so
-          name: llava-bin-linux-${{ matrix.build }}-x64.so
+          name: llava-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so
          if-no-files-found: error

  compile-musl:
@@ -601,7 +613,7 @@ jobs:
      - name: Rearrange Files
        run: |
          # Make all directories at once
-          mkdir --parents deps/{noavx,avx,avx2,avx512,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64}
+          mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64}

          # Linux
          cp artifacts/ggml-bin-linux-noavx-x64.so/libggml.so deps/noavx/libggml.so
@@ -628,6 +640,13 @@ jobs:
          cp artifacts/llama-bin-linux-avx512-x64.so/libllama.so deps/avx512/libllama.so
          cp artifacts/llava-bin-linux-avx512-x64.so/libllava_shared.so deps/avx512/libllava_shared.so

+          # Arm64
+          cp artifacts/ggml-bin-linux-aarch64-arm64.so/libggml.so deps/linux-arm64/libggml.so
+          cp artifacts/ggml-base-bin-linux-aarch64-arm64.so/libggml-base.so deps/linux-arm64/libggml-base.so
+          cp artifacts/ggml-cpu-bin-linux-aarch64-arm64.so/libggml-cpu.so deps/linux-arm64/libggml-cpu.so
+          cp artifacts/llama-bin-linux-aarch64-arm64.so/libllama.so deps/linux-arm64/libllama.so
+          cp artifacts/llava-bin-linux-aarch64-arm64.so/libllava_shared.so deps/linux-arm64/libllava_shared.so
+
          # Musl
          cp artifacts/ggml-bin-musl-noavx-x64.so/libggml.so deps/musl-noavx/libggml.so
diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets
index 22a3e04e1..6714ddec0 100644
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -202,6 +202,24 @@
+    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/linux-arm64/libllama.so">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Link>runtimes/linux-arm64/native/libllama.so</Link>
+    </None>
+    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/linux-arm64/libggml.so">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Link>runtimes/linux-arm64/native/libggml.so</Link>
+    </None>
+    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/linux-arm64/libggml-base.so">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Link>runtimes/linux-arm64/native/libggml-base.so</Link>
+    </None>
+    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/linux-arm64/libggml-cpu.so">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Link>runtimes/linux-arm64/native/libggml-cpu.so</Link>
+    </None>
+
     <None Include="$(MSBuildThisFileDirectory)runtimes/deps/cu11.7.1/libllama.so">
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
       <Link>runtimes/linux-x64/native/cuda11/libllama.so</Link>
diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs
index b0e8a792a..9f6457cd1 100644
--- a/LLama/Native/Load/NativeLibraryUtils.cs
+++ b/LLama/Native/Load/NativeLibraryUtils.cs
@@ -88,19 +88,28 @@ internal static IntPtr TryLoadLibrary(NativeLibraryConfig config, out INativeLib
             // On other platforms (Windows, Linux), we need to load the CPU backend from the specified AVX level directory
             // We are using the AVX level supplied by NativeLibraryConfig, which automatically detects the highest supported AVX level for us

-            // ggml-cpu
-            dependencyPaths.Add(Path.Combine(
-                $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}",
-                $"{libPrefix}ggml-cpu{ext}"
-            ));
-
-            // ggml-cuda
-            if (library.Metadata.UseCuda)
-                dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}"));
-
-            // ggml-vulkan
-            if (library.Metadata.UseVulkan)
-                dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-vulkan{ext}"));
+            if (os == "linux-arm64")
+            {
+                // linux-arm64 ships a single CPU backend with no AVX level subdirectories
+                dependencyPaths.Add(Path.Combine(
+                    $"runtimes/{os}/native",
+                    $"{libPrefix}ggml-cpu{ext}"
+                ));
+            }
+            else
+            {
+                // ggml-cpu
+                dependencyPaths.Add(Path.Combine(
+                    $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}",
+                    $"{libPrefix}ggml-cpu{ext}"
+                ));
+
+                // ggml-cuda
+                if (library.Metadata.UseCuda)
+                    dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}"));
+
+                // ggml-vulkan
+                if (library.Metadata.UseVulkan)
+                    dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-vulkan{ext}"));
+            }
         }
     }
@@ -218,6 +227,13 @@ public static void GetPlatformPathParts(OSPlatform platform, out string os, out

         if (platform == OSPlatform.Linux)
         {
+            if (System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported)
+            {
+                // linux arm64
+                os = "linux-arm64";
+                fileExtension = ".so";
+                libPrefix = "lib";
+                return;
+            }
             if(RuntimeInformation.RuntimeIdentifier.ToLower().StartsWith("alpine"))
             {
                 // alpine linux distro
diff --git a/LLama/Native/Load/NativeLibraryWithAvx.cs b/LLama/Native/Load/NativeLibraryWithAvx.cs
index 932c49866..e6cbd86f3 100644
--- a/LLama/Native/Load/NativeLibraryWithAvx.cs
+++ b/LLama/Native/Load/NativeLibraryWithAvx.cs
@@ -50,11 +50,17 @@ public IEnumerable<string> Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaL
        private string? GetAvxPath(SystemInfo systemInfo, AvxLevel avxLevel, NativeLogConfig.LLamaLogCallback? logCallback)
        {
            NativeLibraryUtils.GetPlatformPathParts(systemInfo.OSPlatform, out var os, out var fileExtension, out var libPrefix);
-            var avxStr = NativeLibraryConfig.AvxLevelToString(avxLevel);
-            if (!string.IsNullOrEmpty(avxStr))
-                avxStr += "/";
-            var relativePath = $"runtimes/{os}/native/{avxStr}{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}";
-            return relativePath;
+            if (os != "linux-arm64")
+            {
+                var avxStr = NativeLibraryConfig.AvxLevelToString(avxLevel);
+                if (!string.IsNullOrEmpty(avxStr))
+                    avxStr += "/";
+                var relativePath = $"runtimes/{os}/native/{avxStr}{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}";
+                return relativePath;
+            }
+            else
+            {
+                // linux-arm64 has no AVX level subdirectories
+                var relativePath = $"runtimes/{os}/native/{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}";
+                return relativePath;
+            }
        }
    }
}
#endif
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
index 7c69534da..d173039a9 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
@@ -66,7 +66,13 @@
-
+
+    <file src="runtimes/deps/linux-arm64/libllama.so" target="runtimes/linux-arm64/native/libllama.so" />
+    <file src="runtimes/deps/linux-arm64/libggml.so" target="runtimes/linux-arm64/native/libggml.so" />
+    <file src="runtimes/deps/linux-arm64/libggml-base.so" target="runtimes/linux-arm64/native/libggml-base.so" />
+    <file src="runtimes/deps/linux-arm64/libggml-cpu.so" target="runtimes/linux-arm64/native/libggml-cpu.so" />
+    <file src="runtimes/deps/linux-arm64/libllava_shared.so" target="runtimes/linux-arm64/native/libllava_shared.so" />
+

From bc4dde86ace11771430072fce5f167d8a2a3f382 Mon Sep 17 00:00:00 2001
From: nipeone
Date: Thu, 27 Mar 2025 15:34:22 +0800
Subject: [PATCH 02/11] update compile.yml

---
 .github/workflows/compile.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 6226d7379..a472ad584 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -44,7 +44,7 @@ jobs:
           - build: 'aarch64'
             defines: '-DGGML_NATIVE=OFF -DGGML_CPU_AARCH64=ON -DGGML_CPU_ARM_ARCH=generic'
-            os: ubuntu-20.04-arm
+            os: ubuntu-22.04-arm
             arch: arm64
     runs-on: ${{ matrix.os }}

From aeef2eb23dd1921a85fcfd0e8137f19d28b100a5 Mon Sep 17 00:00:00 2001
From: nipeone
Date: Thu, 27 Mar 2025 15:45:20 +0800
Subject: [PATCH 03/11] update compile.yml: -DGGML_CPU_ARM_ARCH=armv8-a

---
 .github/workflows/compile.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index a472ad584..1448c5618 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -43,7 +43,7 @@ jobs:
             os: ubuntu-20.04
             arch: x64
           - build: 'aarch64'
-            defines: '-DGGML_NATIVE=OFF -DGGML_CPU_AARCH64=ON -DGGML_CPU_ARM_ARCH=generic'
+            defines: '-DGGML_NATIVE=OFF -DGGML_CPU_AARCH64=ON -DGGML_CPU_ARM_ARCH=armv8-a'
             os: ubuntu-22.04-arm
             arch: arm64

From 80d75d9281ea53ad49cc4c4e0a2b6b3bac9c84a9 Mon Sep 17 00:00:00 2001
From: nipeone
Date: Thu, 27 Mar 2025 16:08:55 +0800
Subject: [PATCH 04/11] update runtime.targets

---
 LLama/LLamaSharp.Runtime.targets | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets
index 6714ddec0..2972bcadd 100644
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -218,6 +218,10 @@
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
       <Link>runtimes/linux-arm64/native/libggml-cpu.so</Link>
     </None>
+    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/linux-arm64/libllava_shared.so">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Link>runtimes/linux-arm64/native/libllava_shared.so</Link>
+    </None>

From 6f4c53c6770b37e14d6aeb6315ab048c5a657caf Mon Sep 17 00:00:00 2001
From: nipeone
Date: Thu, 3 Apr 2025 12:35:40 +0800
Subject: [PATCH 05/11] add LLamaReranker and tests

---
 LLama.Unittest/Constants.cs                        |   1 +
 LLama.Unittest/LLama.Unittest.csproj               |  10 +-
 LLama.Unittest/LLamaRerankerTests.cs               |  74 ++++++++++
 .../SafeLlamaModelHandleVocabularyTests.cs         |  37 +++++
 LLama/LLamaReranker.cs                             | 137 ++++++++++++++++++
 LLama/Native/SafeLlamaModelHandle.cs               |  18 ++-
 6 files changed, 269 insertions(+), 8 deletions(-)
 create mode 100644 LLama.Unittest/LLamaRerankerTests.cs
 create mode 100644 LLama.Unittest/Native/SafeLlamaModelHandleVocabularyTests.cs
 create mode 100644 LLama/LLamaReranker.cs

diff --git a/LLama.Unittest/Constants.cs b/LLama.Unittest/Constants.cs
index a30951750..59585142a 100644
--- a/LLama.Unittest/Constants.cs
+++ b/LLama.Unittest/Constants.cs
@@ -7,6 +7,7 @@ internal static class Constants
     public static readonly string GenerativeModelPath = "Models/Llama-3.2-1B-Instruct-Q4_0.gguf";
     public static readonly string GenerativeModelPath2 = "Models/smollm-360m-instruct-add-basics-q8_0.gguf";
     public static readonly string EmbeddingModelPath = "Models/all-MiniLM-L12-v2.Q8_0.gguf";
+    public static readonly string RerankingModelPath = "Models/jina-reranker-v1-tiny-en-FP16.gguf";

     public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";
     public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf";
diff --git a/LLama.Unittest/LLama.Unittest.csproj b/LLama.Unittest/LLama.Unittest.csproj
index 11b65557e..f407a2e7a 100644
--- a/LLama.Unittest/LLama.Unittest.csproj
+++ b/LLama.Unittest/LLama.Unittest.csproj
@@ -34,8 +34,11 @@
-
-
+    <DownloadFileItem Include="jina-reranker-v1-tiny-en-FP16.gguf">
+      <SourceUrl>https://huggingface.co/gpustack/jina-reranker-v1-tiny-en-GGUF/resolve/main/jina-reranker-v1-tiny-en-FP16.gguf</SourceUrl>
+      <DestinationFolder>Models</DestinationFolder>
+      <DestinationFileName>jina-reranker-v1-tiny-en-FP16.gguf</DestinationFileName>
+    </DownloadFileItem>
@@ -63,6 +66,9 @@
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
     </None>
+    <None Include="Models/jina-reranker-v1-tiny-en-FP16.gguf">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
     <None Include="Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf">
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
diff --git a/LLama.Unittest/LLamaRerankerTests.cs b/LLama.Unittest/LLamaRerankerTests.cs
new file mode 100644
index 000000000..8b786ff9b
--- /dev/null
+++ b/LLama.Unittest/LLamaRerankerTests.cs
@@ -0,0 +1,74 @@
+using LLama.Common;
+using LLama.Extensions;
+using LLama.Native;
+using Microsoft.Extensions.AI;
+using System.Runtime.InteropServices;
+using Xunit.Abstractions;
+
+namespace LLama.Unittest;
+
+public sealed class LLamaRerankerTests
+{
+    private readonly ITestOutputHelper _testOutputHelper;
+    private readonly LLamaReranker _reranker;
+
+    public LLamaRerankerTests(ITestOutputHelper testOutputHelper)
+    {
+        _testOutputHelper = testOutputHelper;
+
+        var @params = new ModelParams(Constants.RerankingModelPath)
+        {
+            ContextSize = 0,
+            PoolingType = LLamaPoolingType.Rank,
+            GpuLayerCount = Constants.CIGpuLayerCount,
+        };
+        using var weights = LLamaWeights.LoadFromFile(@params);
+        _reranker = new LLamaReranker(weights, @params);
+    }
+
+    [Fact]
+    public async Task CompareRerankingScore()
+    {
+        var input = "what is panda?";
+        var documents = new string[] {
+            "hi",
+            "it's a bear",
+            string.Join(", ", "The giant panda (Ailuropoda melanoleuca)",
+                "sometimes called a panda bear or simply panda",
+                "is a bear species endemic to China.")
+        };
+        var scores = await _reranker.GetRelevanceScores(input, documents, normalize: false);
+
+        Assert.True(documents.Length == scores.Count);
+
+        _testOutputHelper.WriteLine($"Rerank score 0: {scores[0]:F4}");
+        _testOutputHelper.WriteLine($"Rerank score 1: {scores[1]:F4}");
+        _testOutputHelper.WriteLine($"Rerank score 2: {scores[2]:F4}");
+    }
+
+    [Fact]
+    public async Task MostRelevantDocument()
+    {
+        var input = "what is panda?";
+        var documents = new string[] {
+            "hi",
+            "it's a bear",
+            string.Join(", ", "The giant panda (Ailuropoda melanoleuca)",
+                "sometimes called a panda bear or simply panda",
+                "is a bear species endemic to China.")
+        };
+        var scores = await _reranker.GetRelevanceScores(input, documents, normalize: true);
+
+        Assert.True(documents.Length == scores.Count);
+
+        int maxIndex = scores
+            .Select((score, index) => new { Score = score, Index = index })
+            .MaxBy(x => x.Score)
+            .Index;
+
+        var maxScoreDocument = documents[maxIndex];
+        Assert.Equal(documents[2], maxScoreDocument);
+    }
+}
diff --git a/LLama.Unittest/Native/SafeLlamaModelHandleVocabularyTests.cs b/LLama.Unittest/Native/SafeLlamaModelHandleVocabularyTests.cs
new file mode 100644
index 000000000..5b8e12ac3
--- /dev/null
+++ b/LLama.Unittest/Native/SafeLlamaModelHandleVocabularyTests.cs
@@ -0,0 +1,37 @@
+using System.Text;
+using System.Xml.Linq;
+using LLama.Common;
+using LLama.Extensions;
+using Microsoft.Extensions.Logging;
+
+namespace LLama.Unittest.Native;
+
+public class SafeLlamaModelHandleVocabularyTests
+{
+    private readonly LLamaWeights _model;
+
+    public SafeLlamaModelHandleVocabularyTests()
+    {
+        var @params = new ModelParams(Constants.RerankingModelPath)
+        {
+            ContextSize = 0,
+            PoolingType = LLama.Native.LLamaPoolingType.Rank,
+            GpuLayerCount = Constants.CIGpuLayerCount
+        };
+        _model = LLamaWeights.LoadFromFile(@params);
+    }
+
+    [Fact]
+    public void GetLLamaTokenString()
+    {
+        var bos = _model.Vocab.BOS;
+        var eos = _model.Vocab.EOS;
+
+        var bosStr = _model.Vocab.LLamaTokenToString(bos, true);
+        var eosStr = _model.Vocab.LLamaTokenToString(eos, true);
+
+        Assert.Equal("<s>", bosStr);
+        Assert.Equal("</s>", eosStr);
+    }
+}
diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs
new file mode 100644
index 000000000..4c0aa2394
--- /dev/null
+++ b/LLama/LLamaReranker.cs
@@ -0,0 +1,137 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Threading;
+using System.Threading.Tasks;
+using LLama.Abstractions;
+using LLama.Exceptions;
+using LLama.Native;
+using Microsoft.Extensions.Logging;
+
+namespace LLama;
+
+/// <summary>
+/// Get rank scores between prompt and documents
+/// </summary>
+public sealed partial class LLamaReranker
+    : IDisposable
+{
+    /// <summary>
+    /// string BOS
+    /// </summary>
+    public string StrBOS { get; }
+
+    /// <summary>
+    /// string EOS
+    /// </summary>
+    public string StrEOS { get; }
+
+    /// <summary>
+    /// Dimension of embedding vectors
+    /// </summary>
+    public int EmbeddingSize => Context.EmbeddingSize;
+
+    /// <summary>
+    /// LLama Context
+    /// </summary>
+    public LLamaContext Context { get; }
+
+    /// <summary>
+    /// Create a new reranker, using the given LLamaWeights
+    /// </summary>
+    /// <param name="weights"></param>
+    /// <param name="params"></param>
+    /// <param name="logger"></param>
+    public LLamaReranker(LLamaWeights weights, IContextParams @params, ILogger? logger = null)
+    {
+        if (@params.UBatchSize != @params.BatchSize)
+            throw new ArgumentException("For non-causal models, batch size must be equal to ubatch size", nameof(@params));
+        if (weights.NativeHandle is { HasEncoder: true, HasDecoder: true })
+            throw new NotSupportedException("Computing rank in encoder-decoder models is not supported");
+        if (@params.PoolingType != LLamaPoolingType.Rank)
+            throw new NotSupportedException("Computing rank score, PoolingType must be equal to LLamaPoolingType.Rank");
+
+        Context = weights.CreateContext(@params, logger);
+        NativeApi.llama_set_embeddings(Context.NativeHandle, true);
+        StrBOS = Context.Vocab.LLamaTokenToString(Context.Vocab.BOS, true) ?? "";
+        StrEOS = Context.Vocab.LLamaTokenToString(Context.Vocab.EOS, true) ?? "";
+    }
+
+    /// <inheritdoc />
+    public void Dispose()
+    {
+        Context.Dispose();
+    }
+
+    /// <summary>
+    /// Retrieve relevance scores for input and document by reranking
+    /// </summary>
+    /// <param name="input"></param>
+    /// <param name="documents"></param>
+    /// <param name="normalize">Whether to normalize the score to the range (0, 1)</param>
+    /// <param name="cancellationToken"></param>
+    /// <returns></returns>
+    /// <exception cref="RuntimeError"></exception>
+    /// <exception cref="NotSupportedException"></exception>
+    public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOnlyList<string> documents, bool normalize = false, CancellationToken cancellationToken = default)
+    {
+        List<float> scores = new List<float>(documents.Count);
+        foreach (var document in documents)
+        {
+            var score = (await GetRelevanceScoreWithTokenCount(input, document, cancellationToken).ConfigureAwait(false)).Score;
+            scores.Add(normalize ? Sigmoid(score) : score);
+        }
+        return scores;
+    }
+
+    private async Task<(float Score, int Tokens)> GetRelevanceScoreWithTokenCount(string input, string document, CancellationToken cancellationToken = default)
+    {
+        var prompt = $"{input}{document}";
+        // Add all of the tokens to the batch
+        var tokens = Context.Tokenize(prompt, special: true);
+        var batch = new LLamaBatch();
+        for (var i = 0; i < tokens.Length; i++)
+            batch.Add(tokens[i], i, LLamaSeqId.Zero, true);
+
+        // clear previous kv_cache values
+        Context.NativeHandle.KvCacheClear();
+
+        // Check if we should cancel the work, just before doing anything expensive (encode/decode)
+        cancellationToken.ThrowIfCancellationRequested();
+
+        // Run model
+        switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder)
+        {
+            case (true, false):
+            {
+                var result = await Context.EncodeAsync(batch, cancellationToken);
+                if (result != EncodeResult.Ok)
+                    throw new RuntimeError($"Failed to encode: {result}");
+                break;
+            }
+
+            case (false, true):
+            {
+                var result = await Context.DecodeAsync(batch, cancellationToken);
+                if (result != DecodeResult.Ok)
+                    throw new RuntimeError($"Failed to decode: {result}");
+                break;
+            }
+
+            default:
+                throw new NotSupportedException("Unsupported model type");
+        }
+
+        var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0];
+
+        Context.NativeHandle.KvCacheClear();
+
+        return (score, tokens.Length);
+    }
+
+    private float Sigmoid(float x)
+    {
+        return (float)(1 / (1 + Math.Exp(-x)));
+    }
+}
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index db198ec30..801d25167 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -651,7 +651,18 @@ internal Vocabulary(SafeLlamaModelHandle model)
             _model = model;
         }

-        private string? LLamaTokenToString(LLamaToken? token, bool isSpecialToken)
+        private static LLamaToken? Normalize(LLamaToken token)
+        {
+            return token == -1 ? null : token;
+        }
+
+        /// <summary>
+        /// Translate LLamaToken to String
+        /// </summary>
+        /// <param name="token"></param>
+        /// <param name="isSpecialToken"></param>
+        /// <returns></returns>
+        public string? LLamaTokenToString(LLamaToken? token, bool isSpecialToken)
         {
             if (!token.HasValue)
                 return null;
@@ -676,11 +687,6 @@ internal Vocabulary(SafeLlamaModelHandle model)
             return Encoding.UTF8.GetStringFromSpan(slice);
         }

-        private static LLamaToken? Normalize(LLamaToken token)
-        {
-            return token == -1 ? null : token;
-        }
-
         /// <summary>
         /// Total number of tokens in this vocabulary
         /// </summary>

From c60435924440f5e7184f23028780ed4dda282abb Mon Sep 17 00:00:00 2001
From: nipeone
Date: Fri, 11 Apr 2025 12:55:19 +0800
Subject: [PATCH 06/11] optimize LLamaReranker function

---
 LLama.Unittest/LLamaRerankerTests.cs |  8 +--
 LLama/LLamaReranker.cs               | 83 ++++++++++++++++++++--------
 2 files changed, 65 insertions(+), 26 deletions(-)

diff --git a/LLama.Unittest/LLamaRerankerTests.cs b/LLama.Unittest/LLamaRerankerTests.cs
index 8b786ff9b..30753ffe6 100644
--- a/LLama.Unittest/LLamaRerankerTests.cs
+++ b/LLama.Unittest/LLamaRerankerTests.cs
@@ -61,12 +61,12 @@ public async Task MostRelevantDocument()
         };
         var scores = await _reranker.GetRelevanceScores(input, documents, normalize: true);

+        Assert.NotNull(scores);
         Assert.True(documents.Length == scores.Count);

-        int maxIndex = scores
-            .Select((score, index) => new { Score = score, Index = index })
-            .MaxBy(x => x.Score)
-            .Index;
+        int maxIndex = scores.Select((score, index) => (score, index))
+                             .MaxBy(x => x.score)
+                             .index;

         var maxScoreDocument = documents[maxIndex];
         Assert.Equal(documents[2], maxScoreDocument);
diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs
index 4c0aa2394..a113d2363 100644
--- a/LLama/LLamaReranker.cs
+++ b/LLama/LLamaReranker.cs
@@ -18,16 +18,6 @@ namespace LLama;
 public sealed partial class LLamaReranker
     : IDisposable
 {
-    /// <summary>
-    /// string BOS
-    /// </summary>
-    public string StrBOS { get; }
-
-    /// <summary>
-    /// string EOS
-    /// </summary>
-    public string StrEOS { get; }
-
     /// <summary>
     /// Dimension of embedding vectors
     /// </summary>
     public int EmbeddingSize => Context.EmbeddingSize;
@@ -54,8 +44,6 @@ public LLamaReranker(LLamaWeights weights, IContextParams @params, ILogger? logg
             throw new NotSupportedException("Computing rank score, PoolingType must be equal to LLamaPoolingType.Rank");
         Context = weights.CreateContext(@params, logger);
         NativeApi.llama_set_embeddings(Context.NativeHandle, true);
-        StrBOS = Context.Vocab.LLamaTokenToString(Context.Vocab.BOS, true) ?? "";
-        StrEOS = Context.Vocab.LLamaTokenToString(Context.Vocab.EOS, true) ?? "";
     }

     /// <inheritdoc />
@@ -65,7 +53,7 @@ public void Dispose()
     }

     /// <summary>
-    /// Retrieve relevance scores for input and document by reranking
+    /// Retrieve relevance scores for input and documents by reranking, executed in a single pass
     /// </summary>
     /// <param name="input"></param>
     /// <param name="documents"></param>
@@ -74,22 +62,73 @@ public void Dispose()
     /// <exception cref="RuntimeError"></exception>
     /// <exception cref="NotSupportedException"></exception>
-    public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOnlyList<string> documents, bool normalize = false, CancellationToken cancellationToken = default) {
+    public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOnlyList<string> documents, bool normalize = false, CancellationToken cancellationToken = default)
+    {
         List<float> scores = new List<float>(documents.Count);
-        foreach (var document in documents)
-        {
-            var score = (await GetRelevanceScoreWithTokenCount(input, document, cancellationToken).ConfigureAwait(false)).Score;
-            scores.Add(normalize ? Sigmoid(score) : score);
-        }
-        return scores;
-    }
+        var batch = new LLamaBatch();
+        var inputTokens = Context.Tokenize(input);
+        foreach (var (index, document) in documents.Select((item, index) => (index, item)))
+        {
+            var docTokens = Context.Tokenize(document);
+            LLamaToken[] tokens = [.. inputTokens, .. docTokens];
+            for (var i = 0; i < tokens.Length; i++)
+                batch.Add(tokens[i], i, (LLamaSeqId)index, true);
+        }
+
+        // clear previous kv_cache values
+        Context.NativeHandle.KvCacheClear();
+
+        // Check if we should cancel the work, just before doing anything expensive (encode/decode)
+        cancellationToken.ThrowIfCancellationRequested();
+
+        // Run model
+        switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder)
+        {
+            case (true, false):
+            {
+                var result = await Context.EncodeAsync(batch, cancellationToken);
+                if (result != EncodeResult.Ok)
+                    throw new RuntimeError($"Failed to encode: {result}");
+                break;
+            }
+
+            case (false, true):
+            {
+                var result = await Context.DecodeAsync(batch, cancellationToken);
+                if (result != DecodeResult.Ok)
+                    throw new RuntimeError($"Failed to decode: {result}");
+                break;
+            }
+
+            default:
+                throw new NotSupportedException("Unsupported model type");
+        }
+
+        for (var i = 0; i < documents.Count; i++)
+        {
+            var score = Context.NativeHandle.GetEmbeddingsSeq((LLamaSeqId)i)[0];
+            scores.Add(normalize ? Sigmoid(score) : score);
+        }
+
+        Context.NativeHandle.KvCacheClear();
+
+        return scores;
+    }

+    /// <summary>
+    /// Retrieve relevance score for input and document by reranking
+    /// </summary>
+    /// <param name="input"></param>
+    /// <param name="document"></param>
+    /// <param name="normalize"></param>
+    /// <param name="cancellationToken"></param>
+    /// <returns></returns>
+    /// <exception cref="RuntimeError"></exception>
+    /// <exception cref="NotSupportedException"></exception>
-    private async Task<(float Score, int Tokens)> GetRelevanceScoreWithTokenCount(string input, string document, CancellationToken cancellationToken = default)
+    public async Task<(float Score, int Tokens)> GetRelevanceScoreWithTokenCount(string input, string document, bool normalize = false, CancellationToken cancellationToken = default)
     {
-        var prompt = $"{input}{document}";
-        // Add all of the tokens to the batch
-        var tokens = Context.Tokenize(prompt, special: true);
+        var inputTokens = Context.Tokenize(input);
+        var docTokens = Context.Tokenize(document);
+        LLamaToken[] tokens = [..inputTokens, ..docTokens];
         var batch = new LLamaBatch();
         for (var i = 0; i < tokens.Length; i++)
             batch.Add(tokens[i], i, LLamaSeqId.Zero, true);
@@ -127,7 +166,7 @@ public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOn
         var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0];

         Context.NativeHandle.KvCacheClear();

-        return (score, tokens.Length);
+        return (normalize ? Sigmoid(score) : score, tokens.Length);
     }

     private float Sigmoid(float x)

From d99670c1e8f9b4360490f2b5ac0e794b8c40e02c Mon Sep 17 00:00:00 2001
From: nipeone
Date: Fri, 11 Apr 2025 16:37:42 +0800
Subject: [PATCH 07/11] fix reranking when documents are too large

---
 LLama/LLamaReranker.cs | 82 +++++++++++++++++++++++++++---------------
 1 file changed, 53 insertions(+), 29 deletions(-)

diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs
index a113d2363..389e44d86 100644
--- a/LLama/LLamaReranker.cs
+++ b/LLama/LLamaReranker.cs
@@ -5,6 +5,7 @@
 using System.Text;
 using System.Threading;
 using System.Threading.Tasks;
+using System.Xml.Linq;
 using LLama.Abstractions;
 using LLama.Exceptions;
 using LLama.Native;
@@ -65,16 +66,52 @@ public void Dispose()
     public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOnlyList<string> documents, bool normalize = false, CancellationToken cancellationToken = default)
     {
         List<float> scores = new List<float>(documents.Count);
-        var batch = new LLamaBatch();
         var inputTokens = Context.Tokenize(input);
-        foreach (var (index, document) in documents.Select((item, index) => (index, item)))
+        var batch = new LLamaBatch();
+        var clearFlag = 0;
+
+        for (var idx = 0; idx < documents.Count; idx++)
         {
-            var docTokens = Context.Tokenize(document);
+            var docTokens = Context.Tokenize(documents[idx]);
             LLamaToken[] tokens = [.. inputTokens, .. docTokens];
+
+            if (batch.TokenCount + tokens.Length > Context.ContextSize)
+            {
+                scores.AddRange(await CalcRelevanceScores(batch, normalize, cancellationToken));
+                batch.Clear();
+                clearFlag = idx;
+            }
+
             for (var i = 0; i < tokens.Length; i++)
-                batch.Add(tokens[i], i, (LLamaSeqId)index, true);
+                batch.Add(tokens[i], i, (LLamaSeqId)(idx - clearFlag), true);
         }
+        if (batch.LogitPositionCount > 0)
+        {
+            scores.AddRange(await CalcRelevanceScores(batch, normalize, cancellationToken));
+            batch.Clear();
+        }
+
+        return scores;
+    }
+
+    /// <summary>
+    /// Retrieve relevance score for input and document by reranking
+    /// </summary>
+    /// <param name="input"></param>
+    /// <param name="document"></param>
+    /// <param name="normalize"></param>
+    /// <param name="cancellationToken"></param>
+    /// <returns></returns>
+    /// <exception cref="RuntimeError"></exception>
+    /// <exception cref="NotSupportedException"></exception>
+    public async Task<(float Score, int Tokens)> GetRelevanceScoreWithTokenCount(string input, string document, bool normalize = false, CancellationToken cancellationToken = default)
+    {
+        var inputTokens = Context.Tokenize(input);
+        var docTokens = Context.Tokenize(document);
+        LLamaToken[] tokens = [..inputTokens, ..docTokens];
+        var batch = new LLamaBatch();
+        for (var i = 0; i < tokens.Length; i++)
+            batch.Add(tokens[i], i, LLamaSeqId.Zero, true);

         // clear previous kv_cache values
         Context.NativeHandle.KvCacheClear();

         // Check if we should cancel the work, just before doing anything expensive (encode/decode)
         cancellationToken.ThrowIfCancellationRequested();

         // Run model
         switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder)
         {
             case (true, false):
             {
                 var result = await Context.EncodeAsync(batch, cancellationToken);
                 if (result != EncodeResult.Ok)
                     throw new RuntimeError($"Failed to encode: {result}");
                 break;
             }

             case (false, true):
             {
                 var result = await Context.DecodeAsync(batch, cancellationToken);
                 if (result != DecodeResult.Ok)
                     throw new RuntimeError($"Failed to decode: {result}");
                 break;
             }

             default:
                 throw new NotSupportedException("Unsupported model type");
         }

         var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0];

         Context.NativeHandle.KvCacheClear();

         return (normalize ? Sigmoid(score) : score, tokens.Length);
     }

+    private async Task<List<float>> CalcRelevanceScores(LLamaBatch batch, bool normalize = false, CancellationToken cancellationToken = default)
+    {
+        var (logicCap, _) = batch.GetLogitPositions()[batch.LogitPositionCount - 1];
+        var seqNum = logicCap.Value + 1;
+        List<float> scores = new List<float>(seqNum);
+
+        // clear previous kv_cache values
+        Context.NativeHandle.KvCacheClear();
+
+        // Check if we should cancel the work, just before doing anything expensive (encode/decode)
+        cancellationToken.ThrowIfCancellationRequested();
+
+        // Run model
+        switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder)
+        {
+            case (true, false):
+            {
+                var result = await Context.EncodeAsync(batch, cancellationToken);
+                if (result != EncodeResult.Ok)
+                    throw new RuntimeError($"Failed to encode: {result}");
+                break;
+            }
+
+            case (false, true):
+            {
+                var result = await Context.DecodeAsync(batch, cancellationToken);
+                if (result != DecodeResult.Ok)
+                    throw new RuntimeError($"Failed to decode: {result}");
+                break;
+            }
+
+            default:
+                throw new NotSupportedException("Unsupported model type");
+        }
+
+        for (var seq = 0; seq < seqNum; seq++)
+        {
+            var score = Context.NativeHandle.GetEmbeddingsSeq((LLamaSeqId)seq)[0];
+            scores.Add(normalize ? Sigmoid(score) : score);
+        }
+
+        Context.NativeHandle.KvCacheClear();
+
+        return scores;
+    }

     private float Sigmoid(float x)
     {
         return (float)(1 / (1 + Math.Exp(-x)));

From 05677feef5e614e9455223b122a3dee9a1735d85 Mon Sep 17 00:00:00 2001
From: nipeone
Date: Tue, 15 Apr 2025 16:29:06 +0800
Subject: [PATCH 08/11] fix reranking when a document is null

---
 LLama/LLamaReranker.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs
index 389e44d86..71c111eb6 100644
--- a/LLama/LLamaReranker.cs
+++ b/LLama/LLamaReranker.cs
@@ -72,7 +72,7 @@ public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOn
         for (var idx = 0; idx < documents.Count; idx++)
         {
-            var docTokens = Context.Tokenize(documents[idx]);
+            var docTokens = Context.Tokenize(documents[idx] ?? "");
             LLamaToken[] tokens = [.. inputTokens, .. docTokens];
From c62980f23b434de5312fd0f8794855df4b329eac Mon Sep 17 00:00:00 2001
From: Martin Evans
Date: Mon, 21 Apr 2025 14:55:18 +0100
Subject: [PATCH 09/11] Apply suggestions from code review

Upgraded Linux runners to Ubuntu 24.04
---
 .github/workflows/compile.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 1448c5618..e91ce7179 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -28,15 +28,15 @@ jobs:
         include:
           - build: 'noavx'
             defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
-            os: ubuntu-20.04
+            os: ubuntu-24.04
             arch: x64
           - build: 'avx2'
             defines: ''
-            os: ubuntu-20.04
+            os: ubuntu-24.04
             arch: x64
           - build: 'avx'
             defines: '-DGGML_AVX2=OFF'
-            os: ubuntu-20.04
+            os: ubuntu-24.04
             arch: x64
           - build: 'avx512'
             defines: '-DGGML_AVX512=ON'
@@ -44,7 +44,7 @@ jobs:
             arch: x64
           - build: 'aarch64'
             defines: '-DGGML_NATIVE=OFF -DGGML_CPU_AARCH64=ON -DGGML_CPU_ARM_ARCH=armv8-a'
-            os: ubuntu-22.04-arm
+            os: ubuntu-24.04-arm
             arch: arm64
     runs-on: ${{ matrix.os }}

From dfb3cc9209b511ca047c5c5aca7dbecccf5d3f1b Mon Sep 17 00:00:00 2001
From: Martin Evans
Date: Mon, 21 Apr 2025 14:55:56 +0100
Subject: [PATCH 10/11] Update .github/workflows/compile.yml

Upgraded the Linux AVX512 runner to Ubuntu 24.04
---
 .github/workflows/compile.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index e91ce7179..b63c4fa85 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -40,7 +40,7 @@ jobs:
             arch: x64
           - build: 'avx512'
             defines: '-DGGML_AVX512=ON'
-            os: ubuntu-20.04
+            os: ubuntu-24.04
             arch: x64
           - build: 'aarch64'
             defines: '-DGGML_NATIVE=OFF -DGGML_CPU_AARCH64=ON -DGGML_CPU_ARM_ARCH=armv8-a'
             os: ubuntu-24.04-arm
             arch: arm64

From 9ed73783f7f0bce122756aaa2a92bf13f0917504 Mon Sep 17 00:00:00 2001
From: nipeone
Date: Tue, 6 May 2025 09:55:28 +0800
Subject: [PATCH 11/11] Merge upstream/master and resolve conflicts

---
 LLama.Unittest/LLama.Unittest.csproj | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LLama.Unittest/LLama.Unittest.csproj b/LLama.Unittest/LLama.Unittest.csproj
index 22289124d..6b0e0b8f4 100644
--- a/LLama.Unittest/LLama.Unittest.csproj
+++ b/LLama.Unittest/LLama.Unittest.csproj
@@ -46,7 +46,7 @@
       <DestinationFileName>smollm-360m-instruct-add-basics-q8_0.gguf</DestinationFileName>
-
+      <SourceUrl>https://huggingface.co/gpustack/jina-reranker-v1-tiny-en-GGUF/resolve/main/jina-reranker-v1-tiny-en-FP16.gguf</SourceUrl>
       <DestinationFolder>Models</DestinationFolder>
       <DestinationFileName>jina-reranker-v1-tiny-en-FP16.gguf</DestinationFileName>
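
---

A short usage sketch of the reranking API added in this series, distilled from the unit tests in PATCH 05. This is not part of the patches themselves: the model path and parameter values are illustrative stand-ins for whatever your application uses.

    using System;
    using LLama;
    using LLama.Common;
    using LLama.Native;

    // Load a reranking model. The LLamaReranker constructor requires
    // PoolingType == LLamaPoolingType.Rank and equal batch/ubatch sizes,
    // and it rejects encoder-decoder models.
    var @params = new ModelParams("Models/jina-reranker-v1-tiny-en-FP16.gguf")
    {
        ContextSize = 0,                     // 0 = use the model's native context size
        PoolingType = LLamaPoolingType.Rank,
    };
    using var weights = LLamaWeights.LoadFromFile(@params);
    using var reranker = new LLamaReranker(weights, @params);

    // Each document is tokenized together with the query and scored in a
    // batched pass (split across passes if a batch would exceed the context
    // size); normalize: true maps raw scores through a sigmoid into (0, 1).
    var query = "what is panda?";
    var documents = new[] { "hi", "it's a bear", "a bear species endemic to China" };
    var scores = await reranker.GetRelevanceScores(query, documents, normalize: true);

    for (var i = 0; i < documents.Length; i++)
        Console.WriteLine($"{scores[i]:F4}  {documents[i]}");

Higher scores indicate higher relevance, so the third document should win for this query; `GetRelevanceScoreWithTokenCount` is the single-document variant when the consumed token count is also needed.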