diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 5ade12316..cc7a750ec 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -28,13 +28,25 @@ jobs:
       include:
         - build: 'noavx'
           defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
+          os: ubuntu-24.04
+          arch: x64
         - build: 'avx2'
          defines: ''
+          os: ubuntu-24.04
+          arch: x64
         - build: 'avx'
           defines: '-DGGML_AVX2=OFF'
+          os: ubuntu-24.04
+          arch: x64
         - build: 'avx512'
           defines: '-DGGML_AVX512=ON'
-    runs-on: ubuntu-24.04
+          os: ubuntu-24.04
+          arch: x64
+        - build: 'aarch64'
+          defines: '-DGGML_NATIVE=OFF -DGGML_CPU_AARCH64=ON -DGGML_CPU_ARM_ARCH=armv8-a'
+          os: ubuntu-24.04-arm
+          arch: arm64
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
         with:
@@ -52,28 +64,28 @@ jobs:
       - uses: actions/upload-artifact@v4
         with:
           path: ./build/bin/libllama.so
-          name: llama-bin-linux-${{ matrix.build }}-x64.so
+          name: llama-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so
           if-no-files-found: error
       - uses: actions/upload-artifact@v4
         with:
           path: ./build/bin/libggml.so
-          name: ggml-bin-linux-${{ matrix.build }}-x64.so
+          name: ggml-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so
           if-no-files-found: error
       - uses: actions/upload-artifact@v4
         with:
           path: ./build/bin/libggml-base.so
-          name: ggml-base-bin-linux-${{ matrix.build }}-x64.so
+          name: ggml-base-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so
           if-no-files-found: error
       - uses: actions/upload-artifact@v4
         with:
           path: ./build/bin/libggml-cpu.so
-          name: ggml-cpu-bin-linux-${{ matrix.build }}-x64.so
+          name: ggml-cpu-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so
           if-no-files-found: error
       - name: Upload Llava
         uses: actions/upload-artifact@v4
         with:
           path: ./build/bin/libllava_shared.so
-          name: llava-bin-linux-${{ matrix.build }}-x64.so
+          name: llava-bin-linux-${{ matrix.build }}-${{ matrix.arch }}.so
           if-no-files-found: error
 
   compile-musl:
@@ -601,7 +613,7 @@ jobs:
       - name: Rearrange Files
         run: |
           # Make all directories at once
-          mkdir --parents deps/{noavx,avx,avx2,avx512,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64}
+          mkdir --parents deps/{noavx,avx,avx2,avx512,linux-arm64,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64}
 
           # Linux
           cp artifacts/ggml-bin-linux-noavx-x64.so/libggml.so deps/noavx/libggml.so
@@ -628,6 +640,13 @@ jobs:
           cp artifacts/llama-bin-linux-avx512-x64.so/libllama.so deps/avx512/libllama.so
           cp artifacts/llava-bin-linux-avx512-x64.so/libllava_shared.so deps/avx512/libllava_shared.so
 
+          # Arm64
+          cp artifacts/ggml-bin-linux-aarch64-arm64.so/libggml.so deps/linux-arm64/libggml.so
+          cp artifacts/ggml-base-bin-linux-aarch64-arm64.so/libggml-base.so deps/linux-arm64/libggml-base.so
+          cp artifacts/ggml-cpu-bin-linux-aarch64-arm64.so/libggml-cpu.so deps/linux-arm64/libggml-cpu.so
+          cp artifacts/llama-bin-linux-aarch64-arm64.so/libllama.so deps/linux-arm64/libllama.so
+          cp artifacts/llava-bin-linux-aarch64-arm64.so/libllava_shared.so deps/linux-arm64/libllava_shared.so
+
           # Musl
           cp artifacts/ggml-bin-musl-noavx-x64.so/libggml.so deps/musl-noavx/libggml.so
           cp artifacts/ggml-base-bin-musl-noavx-x64.so/libggml-base.so deps/musl-noavx/libggml-base.so
diff --git a/LLama.Unittest/Constants.cs b/LLama.Unittest/Constants.cs
index 3d81f23bf..d501b189b 100644
--- a/LLama.Unittest/Constants.cs
+++ b/LLama.Unittest/Constants.cs
@@ -7,6 +7,7 @@ internal static class Constants
         public static readonly string GenerativeModelPath = "Models/Llama-3.2-1B-Instruct-Q4_0.gguf";
         public static readonly string GenerativeModelPath2 = "Models/smollm-360m-instruct-add-basics-q8_0.gguf";
         public static readonly string EmbeddingModelPath = "Models/all-MiniLM-L12-v2.Q8_0.gguf";
+        public static readonly string RerankingModelPath = "Models/jina-reranker-v1-tiny-en-FP16.gguf";
 
         public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";
         public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf";
diff --git a/LLama.Unittest/LLama.Unittest.csproj b/LLama.Unittest/LLama.Unittest.csproj
index 2dd85e88f..6b0e0b8f4 100644
--- a/LLama.Unittest/LLama.Unittest.csproj
+++ b/LLama.Unittest/LLama.Unittest.csproj
@@ -46,6 +46,12 @@
       smollm-360m-instruct-add-basics-q8_0.gguf
+
+      https://huggingface.co/gpustack/jina-reranker-v1-tiny-en-GGUF/resolve/main/jina-reranker-v1-tiny-en-FP16.gguf
+      Models
+      jina-reranker-v1-tiny-en-FP16.gguf
+
+
       https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/llava-v1.6-mistral-7b.Q3_K_XS.gguf
       Models
@@ -130,6 +136,9 @@
       PreserveNewest
+
+      PreserveNewest
+
       PreserveNewest
diff --git a/LLama.Unittest/LLamaRerankerTests.cs b/LLama.Unittest/LLamaRerankerTests.cs
new file mode 100644
index 000000000..30753ffe6
--- /dev/null
+++ b/LLama.Unittest/LLamaRerankerTests.cs
@@ -0,0 +1,74 @@
+using LLama.Common;
+using LLama.Extensions;
+using LLama.Native;
+using Microsoft.Extensions.AI;
+using System.Runtime.InteropServices;
+using Xunit.Abstractions;
+
+namespace LLama.Unittest;
+
+public sealed class LLamaRerankerTests
+{
+    private readonly ITestOutputHelper _testOutputHelper;
+    private readonly LLamaReranker _reranker;
+    public LLamaRerankerTests(ITestOutputHelper testOutputHelper)
+    {
+        _testOutputHelper = testOutputHelper;
+
+        var @params = new ModelParams(Constants.RerankingModelPath)
+        {
+            ContextSize = 0,
+            PoolingType = LLamaPoolingType.Rank,
+            GpuLayerCount = Constants.CIGpuLayerCount,
+
+        };
+        using var weights = LLamaWeights.LoadFromFile(@params);
+        _reranker = new LLamaReranker(weights, @params);
+    }
+
+    [Fact]
+    public async Task CompareRerankingScore()
+    {
+
+
+        var input = "what is panda?";
+        var documents = new string[] {
+            "hi",
+            "it's a bear",
+            string.Join(", ","The giant panda (Ailuropoda melanoleuca)",
+                "sometimes called a panda bear or simply panda",
+                "is a bear species endemic to China.")
+        };
+        var scores = await _reranker.GetRelevanceScores(input, documents, normalize: false);
+
+        Assert.True(documents.Length == scores.Count);
+
+        _testOutputHelper.WriteLine($"Rerank score 0: {scores[0]:F4}");
+        _testOutputHelper.WriteLine($"Rerank score 1: {scores[1]:F4}");
+        _testOutputHelper.WriteLine($"Rerank score 2: {scores[2]:F4}");
+    }
+
+    [Fact]
+    public async Task MostRelevantDocument()
+    {
+        var input = "what is panda?";
+        var documents = new string[] {
+            "hi",
+            "it's a bear",
+            string.Join(", ","The giant panda (Ailuropoda melanoleuca)",
+                "sometimes called a panda bear or simply panda",
+                "is a bear species endemic to China.")
+        };
+        var scores = await _reranker.GetRelevanceScores(input, documents, normalize: true);
+
+        Assert.NotNull(scores);
+        Assert.True(documents.Length == scores.Count);
+
+        int maxIndex = scores.Select((score, index) => (score, index))
+                             .MaxBy(x => x.score)
+                             .index;
+
+        var maxScoreDocument = documents[maxIndex];
+        Assert.Equal(documents[2], maxScoreDocument);
+    }
+}
diff --git a/LLama.Unittest/Native/SafeLlamaModelHandleVocabularyTests.cs b/LLama.Unittest/Native/SafeLlamaModelHandleVocabularyTests.cs
new file mode 100644
index 000000000..5b8e12ac3
--- /dev/null
+++ b/LLama.Unittest/Native/SafeLlamaModelHandleVocabularyTests.cs
@@ -0,0 +1,37 @@
+using System.Text;
+using System.Xml.Linq;
+using LLama.Common;
+using LLama.Extensions;
+using Microsoft.Extensions.Logging;
+
+
+namespace LLama.Unittest.Native;
+
+public class SafeLlamaModelHandleVocabularyTests
+{
+    private readonly LLamaWeights _model;
+
+    public SafeLlamaModelHandleVocabularyTests()
+    {
+        var @params = new ModelParams(Constants.RerankingModelPath)
+        {
+            ContextSize = 0,
+            PoolingType = LLama.Native.LLamaPoolingType.Rank,
+            GpuLayerCount = Constants.CIGpuLayerCount
+        };
+        _model = LLamaWeights.LoadFromFile(@params);
+    }
+
+    [Fact]
+    public void GetLLamaTokenString()
+    {
+        var bos = _model.Vocab.BOS;
+        var eos = _model.Vocab.EOS;
+
+        var bosStr = _model.Vocab.LLamaTokenToString(bos, true);
+        var eosStr = _model.Vocab.LLamaTokenToString(eos, true);
+
+        Assert.Equal("", bosStr);
+        Assert.Equal("", eosStr);
+    }
+}
diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs
new file mode 100644
index 000000000..71c111eb6
--- /dev/null
+++ b/LLama/LLamaReranker.cs
@@ -0,0 +1,200 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Threading;
+using System.Threading.Tasks;
+using System.Xml.Linq;
+using LLama.Abstractions;
+using LLama.Exceptions;
+using LLama.Native;
+using Microsoft.Extensions.Logging;
+
+namespace LLama;
+
+///
+/// Get rank scores between prompt and documents
+///
+public sealed partial class LLamaReranker
+    : IDisposable
+{
+    ///
+    /// Dimension of embedding vectors
+    ///
+    public int EmbeddingSize => Context.EmbeddingSize;
+
+    ///
+    /// LLama Context
+    ///
+    public LLamaContext Context { get; }
+
+    ///
+    /// Create a new reranker, using the given LLamaWeights
+    ///
+    ///
+    ///
+    ///
+    public LLamaReranker(LLamaWeights weights, IContextParams @params, ILogger? logger = null)
+    {
+        if (@params.UBatchSize != @params.BatchSize)
+            throw new ArgumentException("For non-causal models, batch size must be equal to ubatch size", nameof(@params));
+        if (weights.NativeHandle is { HasEncoder: true, HasDecoder: true })
+            throw new NotSupportedException("Computing rank in encoder-decoder models is not supported");
+        if (@params.PoolingType != LLamaPoolingType.Rank)
+            throw new NotSupportedException("Computing rank score, PoolingType must be equal to LLamaPoolingType.Rank");
+        Context = weights.CreateContext(@params, logger);
+        NativeApi.llama_set_embeddings(Context.NativeHandle, true);
+    }
+
+    ///
+    public void Dispose()
+    {
+        Context.Dispose();
+    }
+
+    ///
+    /// Retrieve relevance scores for input and documents by reranking, execute once.
+    ///
+    ///
+    /// Whether to normalize the score to the range (0, 1)
+    ///
+    ///
+    ///
+    public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOnlyList<string> documents, bool normalize = false, CancellationToken cancellationToken = default)
+    {
+        List<float> scores = new List<float>(documents.Count);
+        var inputTokens = Context.Tokenize(input);
+        var batch = new LLamaBatch();
+        var clearFlag = 0;
+
+        for(var idx = 0; idx < documents.Count; idx++)
+        {
+            var docTokens = Context.Tokenize(documents[idx] ?? "");
+            LLamaToken[] tokens = [.. inputTokens, .. docTokens];
+
+            if (batch.TokenCount + tokens.Length > Context.ContextSize)
+            {
+                scores.AddRange(await CalcRelevanceScores(batch, normalize, cancellationToken));
+                batch.Clear();
+                clearFlag = idx;
+            }
+
+            for (var i = 0; i < tokens.Length; i++)
+                batch.Add(tokens[i], i, (LLamaSeqId)(idx - clearFlag), true);
+        }
+        if (batch.LogitPositionCount > 0)
+        {
+            scores.AddRange(await CalcRelevanceScores(batch, normalize, cancellationToken));
+            batch.Clear();
+        }
+
+        return scores;
+    }
+
+    ///
+    /// Retrieve relevance score for input and document by reranking
+    ///
+    ///
+    ///
+    ///
+    ///
+    ///
+    public async Task<(float Score, int Tokens)> GetRelevanceScoreWithTokenCount(string input, string document, bool normalize = false, CancellationToken cancellationToken = default)
+    {
+        var inputTokens = Context.Tokenize(input);
+        var docTokens = Context.Tokenize(document);
+        LLamaToken[] tokens = [..inputTokens, ..docTokens];
+        var batch = new LLamaBatch();
+        for (var i = 0; i < tokens.Length; i++)
+            batch.Add(tokens[i], i, LLamaSeqId.Zero, true);
+
+        // clear previous kv_cache values
+        Context.NativeHandle.KvCacheClear();
+
+        // Check if we should cancel the work, just before doing anything expensive (encode/decode)
+        cancellationToken.ThrowIfCancellationRequested();
+
+        // Run model
+        switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder)
+        {
+            case (true, false):
+            {
+                var result = await Context.EncodeAsync(batch, cancellationToken);
+                if (result != EncodeResult.Ok)
+                    throw new RuntimeError($"Failed to encode: {result}");
+                break;
+            }
+
+            case (false, true):
+            {
+                var result = await Context.DecodeAsync(batch, cancellationToken);
+                if (result != DecodeResult.Ok)
+                    throw new RuntimeError($"Failed to decode: {result}");
+                break;
+            }
+
+            default:
+                throw new NotSupportedException("Unsupported model type");
+        }
+
+        var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0];
+
+        Context.NativeHandle.KvCacheClear();
+
+        return (normalize ? Sigmoid(score) : score, tokens.Length);
+    }
+
+    private async Task<IReadOnlyList<float>> CalcRelevanceScores(LLamaBatch batch, bool normalize = false, CancellationToken cancellationToken = default)
+    {
+        var (logicCap, _) = batch.GetLogitPositions()[batch.LogitPositionCount - 1];
+        var seqNum = logicCap.Value + 1;
+        List<float> scores = new List<float>(seqNum);
+        // clear previous kv_cache values
+        Context.NativeHandle.KvCacheClear();
+
+        // Check if we should cancel the work, just before doing anything expensive (encode/decode)
+        cancellationToken.ThrowIfCancellationRequested();
+
+        // Run model
+        switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder)
+        {
+            case (true, false):
+            {
+                var result = await Context.EncodeAsync(batch, cancellationToken);
+                if (result != EncodeResult.Ok)
+                    throw new RuntimeError($"Failed to encode: {result}");
+                break;
+            }
+
+            case (false, true):
+            {
+                var result = await Context.DecodeAsync(batch, cancellationToken);
+                if (result != DecodeResult.Ok)
+                    throw new RuntimeError($"Failed to decode: {result}");
+                break;
+            }
+
+            default:
+                throw new NotSupportedException("Unsupported model type");
+        }
+
+        for (var seq = 0; seq < seqNum; seq++)
+        {
+            var score = Context.NativeHandle.GetEmbeddingsSeq((LLamaSeqId)seq)[0];
+            scores.Add(normalize ? Sigmoid(score) : score);
+        }
+
+        Context.NativeHandle.KvCacheClear();
+
+        return scores;
+    }
+
+    private float Sigmoid(float x)
+    {
+        return (float)(1 / (1 + Math.Exp(-x)));
+    }
+}
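Reviewer note (not part of the diff): the unit tests above already exercise the new API, but a minimal usage sketch may help readers evaluate it. This mirrors the test setup; the model path is illustrative and `normalize: true` simply passes the raw rank score through the sigmoid shown in `LLamaReranker.cs`.

```csharp
using LLama;
using LLama.Common;
using LLama.Native;

// Illustrative path; any GGUF reranking model that supports rank pooling should work.
var @params = new ModelParams("Models/jina-reranker-v1-tiny-en-FP16.gguf")
{
    ContextSize = 0,                      // 0 = take the size from the model
    PoolingType = LLamaPoolingType.Rank,  // required by the LLamaReranker constructor
};

using var weights = LLamaWeights.LoadFromFile(@params);
using var reranker = new LLamaReranker(weights, @params);

var query = "what is panda?";
var documents = new[] { "hi", "it's a bear", "The giant panda is a bear species endemic to China." };

// normalize: true maps each raw score into (0, 1) via a sigmoid
var scores = await reranker.GetRelevanceScores(query, documents, normalize: true);
for (var i = 0; i < documents.Length; i++)
    Console.WriteLine($"{scores[i]:F4}  {documents[i]}");
```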
diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets
index 22a3e04e1..2972bcadd 100644
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -202,6 +202,28 @@
+
+      PreserveNewest
+      runtimes/linux-arm64/native/libllama.so
+
+
+      PreserveNewest
+      runtimes/linux-arm64/native/libggml.so
+
+
+      PreserveNewest
+      runtimes/linux-arm64/native/libggml-base.so
+
+
+      PreserveNewest
+      runtimes/linux-arm64/native/libggml-cpu.so
+
+
+      PreserveNewest
+      runtimes/linux-arm64/native/libllava_shared.so
+
+
       PreserveNewest
       runtimes/linux-x64/native/cuda11/libllama.so
diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs
index b0e8a792a..9f6457cd1 100644
--- a/LLama/Native/Load/NativeLibraryUtils.cs
+++ b/LLama/Native/Load/NativeLibraryUtils.cs
@@ -88,19 +88,28 @@ internal static IntPtr TryLoadLibrary(NativeLibraryConfig config, out INativeLib
                 // On other platforms (Windows, Linux), we need to load the CPU backend from the specified AVX level directory
                 // We are using the AVX level supplied by NativeLibraryConfig, which automatically detects the highest supported AVX level for us
 
-                // ggml-cpu
-                dependencyPaths.Add(Path.Combine(
-                    $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}",
-                    $"{libPrefix}ggml-cpu{ext}"
-                ));
-
-                // ggml-cuda
-                if (library.Metadata.UseCuda)
-                    dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}"));
-
-                // ggml-vulkan
-                if (library.Metadata.UseVulkan)
-                    dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-vulkan{ext}"));
+                if (os == "linux-arm64"){
+                    dependencyPaths.Add(Path.Combine(
+                        $"runtimes/{os}/native",
+                        $"{libPrefix}ggml-cpu{ext}"
+                    ));
+                }
+                else{
+                    // ggml-cpu
+                    dependencyPaths.Add(Path.Combine(
+                        $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}",
+                        $"{libPrefix}ggml-cpu{ext}"
+                    ));
+
+                    // ggml-cuda
+                    if (library.Metadata.UseCuda)
+                        dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}"));
+
+                    // ggml-vulkan
+                    if (library.Metadata.UseVulkan)
+                        dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-vulkan{ext}"));
+                }
+            }
         }
@@ -218,6 +227,13 @@ public static void GetPlatformPathParts(OSPlatform platform, out string os, out
 
         if (platform == OSPlatform.Linux)
         {
+            if(System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported){
+                // linux arm64
+                os = "linux-arm64";
+                fileExtension = ".so";
+                libPrefix = "lib";
+                return;
+            }
             if(RuntimeInformation.RuntimeIdentifier.ToLower().StartsWith("alpine"))
             {
                 // alpine linux distro
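Reviewer note (not part of the diff): with this change the loader skips the per-AVX-level sub-directories on ARM64 Linux and probes a flat `runtimes/linux-arm64/native` folder instead. A standalone sketch of the same decision, using only standard .NET APIs (the folder strings are illustrative, not the library's public API):

```csharp
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics.Arm;

// Mirrors the check added in GetPlatformPathParts: on Linux, prefer the flat
// linux-arm64 runtime folder whenever ARM64 intrinsics are available.
static string ResolveLinuxRuntimeFolder()
{
    if (!RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
        throw new PlatformNotSupportedException("This sketch only covers Linux.");

    if (ArmBase.Arm64.IsSupported)
        return "runtimes/linux-arm64/native";           // no AVX sub-directories on ARM

    // x64 builds keep the per-AVX-level layout, e.g. runtimes/linux-x64/native/avx2
    return "runtimes/linux-x64/native/<avx-level>";
}

Console.WriteLine(ResolveLinuxRuntimeFolder());
```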
diff --git a/LLama/Native/Load/NativeLibraryWithAvx.cs b/LLama/Native/Load/NativeLibraryWithAvx.cs
index 932c49866..e6cbd86f3 100644
--- a/LLama/Native/Load/NativeLibraryWithAvx.cs
+++ b/LLama/Native/Load/NativeLibraryWithAvx.cs
@@ -50,11 +50,17 @@ public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaL
         private string? GetAvxPath(SystemInfo systemInfo, AvxLevel avxLevel, NativeLogConfig.LLamaLogCallback? logCallback)
         {
             NativeLibraryUtils.GetPlatformPathParts(systemInfo.OSPlatform, out var os, out var fileExtension, out var libPrefix);
-            var avxStr = NativeLibraryConfig.AvxLevelToString(avxLevel);
-            if (!string.IsNullOrEmpty(avxStr))
-                avxStr += "/";
-            var relativePath = $"runtimes/{os}/native/{avxStr}{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}";
-            return relativePath;
+            if (os != "linux-arm64"){
+                var avxStr = NativeLibraryConfig.AvxLevelToString(avxLevel);
+                if (!string.IsNullOrEmpty(avxStr))
+                    avxStr += "/";
+                var relativePath = $"runtimes/{os}/native/{avxStr}{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}";
+                return relativePath;
+            } else {
+                var relativePath = $"runtimes/{os}/native/{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}";
+                return relativePath;
+            }
+
         }
 #endif
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index db198ec30..801d25167 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -651,7 +651,18 @@ internal Vocabulary(SafeLlamaModelHandle model)
             _model = model;
         }
 
-        private string? LLamaTokenToString(LLamaToken? token, bool isSpecialToken)
+        private static LLamaToken? Normalize(LLamaToken token)
+        {
+            return token == -1 ? null : token;
+        }
+
+        ///
+        /// Translate LLamaToken to String
+        ///
+        ///
+        ///
+        ///
+        public string? LLamaTokenToString(LLamaToken? token, bool isSpecialToken)
         {
             if (!token.HasValue)
                 return null;
@@ -676,11 +687,6 @@ internal Vocabulary(SafeLlamaModelHandle model)
             return Encoding.UTF8.GetStringFromSpan(slice);
         }
 
-        private static LLamaToken? Normalize(LLamaToken token)
-        {
-            return token == -1 ? null : token;
-        }
-
         ///
         /// Total number of tokens in this vocabulary
        ///
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
index 7c69534da..d173039a9 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
@@ -66,7 +66,13 @@
-
+
+
+
+
+
+
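Reviewer note (not part of the diff): since `Vocabulary.LLamaTokenToString` is promoted from private to public, callers can now render special tokens directly, as the new vocabulary test does. A minimal sketch with an illustrative model path:

```csharp
using LLama;
using LLama.Common;

// Illustrative path; any GGUF model can be used to inspect its vocabulary.
var @params = new ModelParams("Models/Llama-3.2-1B-Instruct-Q4_0.gguf");
using var model = LLamaWeights.LoadFromFile(@params);

var bos = model.Vocab.BOS;
var eos = model.Vocab.EOS;

// isSpecialToken: true renders control tokens such as BOS/EOS as text
Console.WriteLine($"BOS: {model.Vocab.LLamaTokenToString(bos, true)}");
Console.WriteLine($"EOS: {model.Vocab.LLamaTokenToString(eos, true)}");
```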