diff --git a/src/benchmarks/micro/sve/Exponent.cs b/src/benchmarks/micro/sve/Exponent.cs
new file mode 100644
index 00000000000..08fdaa7dc51
--- /dev/null
+++ b/src/benchmarks/micro/sve/Exponent.cs
@@ -0,0 +1,206 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class Exponent
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private float[] _input;
+        private float[] _data_sve;
+        private float[] _data_neon;
+        private float[] _output;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _input = ValuesGenerator.Array<float>(Size);
+
+            _data_sve = new float[] {
+                // c1, c3, inv_ln2
+                BitConverter.UInt32BitsToSingle(0x3f000000),
+                BitConverter.UInt32BitsToSingle(0x3d2aaab5),
+                BitConverter.UInt32BitsToSingle(0x3fb8aa3b),
+                // ln2_lo, c0, c2, c4
+                BitConverter.UInt32BitsToSingle(0x35bfbe8e),
+                BitConverter.UInt32BitsToSingle(0x3f800000),
+                BitConverter.UInt32BitsToSingle(0x3e2aaaab),
+                BitConverter.UInt32BitsToSingle(0x3c057330),
+                // ln2_hi, shift
+                BitConverter.UInt32BitsToSingle(0x3f317200),
+                BitConverter.UInt32BitsToSingle(0x48401fc0),
+            };
+
+            _data_neon = new float[] {
+                // inv_ln2, ln2_lo, c0, c2
+                BitConverter.UInt32BitsToSingle(0x3fb8aa3b),
+                BitConverter.UInt32BitsToSingle(0x35bfbe8e),
+                BitConverter.UInt32BitsToSingle(0x3c07cfce),
+                BitConverter.UInt32BitsToSingle(0x3e2aad40),
+                // ln2_hi, shift, c1, c3, c4
+                BitConverter.UInt32BitsToSingle(0x3f317200),
+                BitConverter.UInt32BitsToSingle(0x4b40007f),
+                BitConverter.UInt32BitsToSingle(0x3d2b9d0d),
+                BitConverter.UInt32BitsToSingle(0x3efffee3),
+                BitConverter.UInt32BitsToSingle(0x3f7ffffb),
+            };
+
+            _output = new float[Size];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            float[] current = (float[])_output.Clone();
+            Setup();
+            Scalar();
+            float[] scalar = (float[])_output.Clone();
+
+            // Check that the result matches the scalar version (within 3 ULP).
+            for (int i = 0; i < Size; i++)
+            {
+                int e = (int)((BitConverter.SingleToUInt32Bits(scalar[i]) >> 23) & 0xff);
+                if (e == 0) e++;
+                float ulpScale = (float)Math.ScaleB(1.0, e - 127 - 23);
+                float ulpError = (float)Math.Abs(current[i] - scalar[i]) / ulpScale;
+                Debug.Assert(ulpError <= 3);
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (float* input = _input, output = _output)
+            {
+                for (int i = 0; i < Size; i++)
+                {
+                    output[i] = (float)Math.Exp(input[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128Exponent()
+        {
+            // Algorithm based on Arm Optimized-Routines:
+            // https://github.com/ARM-software/optimized-routines/blob/v25.07/math/aarch64/advsimd/expf.c
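+            //
+            // Sketch of the range reduction (the standard expf scheme, matching
+            // the constants above): split x = n*ln2 + r with n an integer and
+            // |r| <= ln2/2, so that exp(x) = 2^n * exp(r). The "shift" constant
+            // forces round-to-nearest, z << 23 moves n + 127 into the exponent
+            // field to build 2^n, and exp(r) - 1 is approximated by a degree-4
+            // polynomial in r. The SVE kernel below builds 2^n with the FEXPA
+            // instruction instead.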
+            fixed (float* input = _input, output = _output, d = _data_neon)
+            {
+                int i = 0;
+
+                Vector128<float> constVec = AdvSimd.LoadVector128(d);
+                Vector128<float> ln2hiVec = Vector128.Create(d[4]);
+                Vector128<float> shiftVec = Vector128.Create(d[5]);
+                Vector128<float> c1Vec = Vector128.Create(d[6]);
+                Vector128<float> c3Vec = Vector128.Create(d[7]);
+                Vector128<float> c4Vec = Vector128.Create(d[8]);
+
+                for (; i <= Size - 4; i += 4)
+                {
+                    Vector128<float> x = AdvSimd.LoadVector128(input + i);
+
+                    // z = shift + x * 1/ln2
+                    Vector128<float> z = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(shiftVec, x, constVec, 0);
+                    // -n = shift - z
+                    Vector128<float> neg_n = AdvSimd.Subtract(shiftVec, z);
+                    // scale = z << 23
+                    Vector128<float> scale = AdvSimd.ShiftLeftLogical(z.AsUInt32(), 23).AsSingle();
+
+                    // r = x - n * ln2_hi
+                    Vector128<float> r = AdvSimd.FusedMultiplyAdd(x, neg_n, ln2hiVec);
+                    // r = r - n * ln2_lo
+                    r = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(r, neg_n, constVec, 1);
+                    Vector128<float> r2 = AdvSimd.Multiply(r, r);
+
+                    // poly(r) = exp(r) - 1.
+                    Vector128<float> p10 = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(c1Vec, r, constVec, 2);
+                    Vector128<float> p32 = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(c3Vec, r, constVec, 3);
+                    Vector128<float> p30 = AdvSimd.FusedMultiplyAdd(p32, r2, p10);
+                    Vector128<float> p4 = AdvSimd.Multiply(r, c4Vec);
+                    Vector128<float> poly = AdvSimd.FusedMultiplyAdd(p4, r2, p30);
+
+                    // result = scale * (1 + poly).
+                    Vector128<float> result = AdvSimd.FusedMultiplyAdd(scale, poly, scale);
+                    AdvSimd.Store(output + i, result);
+                }
+                // Handle remaining elements.
+                for (; i < Size; i++)
+                {
+                    output[i] = (float)Math.Exp(input[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveExponent()
+        {
+            fixed (float* input = _input, output = _output, d = _data_sve)
+            {
+                int i = 0;
+                int cntw = (int)Sve.Count32BitElements();
+
+                Vector<float> c1Vec = new Vector<float>(d[0]);
+                Vector<float> c3Vec = new Vector<float>(d[1]);
+                Vector<float> invln2Vec = new Vector<float>(d[2]);
+                Vector<float> shiftVec = new Vector<float>(d[8]);
+                Vector<float> ln2hiVec = new Vector<float>(d[7]);
+                Vector<float> constVec = Sve.LoadVector(Sve.CreateTrueMaskSingle(), &d[3]);
+
+                Vector<uint> pTrue = Sve.CreateTrueMaskUInt32();
+                Vector<uint> pLoop = Sve.CreateWhileLessThanMask32Bit(0, Size);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    Vector<float> x = (Vector<float>)Sve.LoadVector(pLoop, (uint*)(input + i));
+
+                    // n = round(x/(ln2/N)).
+                    Vector<float> z = Sve.FusedMultiplyAdd(shiftVec, invln2Vec, x);
+                    Vector<float> n = Sve.Subtract(z, shiftVec);
+
+                    // r = x - n*ln2/N.
+                    Vector<float> r = Sve.FusedMultiplySubtract(x, ln2hiVec, n);
+                    r = Sve.FusedMultiplySubtractBySelectedScalar(r, n, constVec, 0);
+                    // scale = 2^(n/N).
+                    Vector<float> scale = Sve.FloatingPointExponentialAccelerator((Vector<uint>)z);
+
+                    // poly(r) = exp(r) - 1.
+                    Vector<float> p12 = Sve.FusedMultiplyAddBySelectedScalar(c1Vec, r, constVec, 2);
+                    Vector<float> p34 = Sve.FusedMultiplyAddBySelectedScalar(c3Vec, r, constVec, 3);
+                    Vector<float> r2 = Sve.Multiply(r, r);
+                    Vector<float> p14 = Sve.FusedMultiplyAdd(p12, p34, r2);
+                    Vector<float> p0 = Sve.MultiplyBySelectedScalar(r, constVec, 1);
+                    Vector<float> poly = Sve.FusedMultiplyAdd(p0, r2, p14);
+
+                    // result = scale * (1 + poly).
+                    Vector<float> result = Sve.FusedMultiplyAdd(scale, poly, scale);
+                    Sve.StoreAndZip(pLoop, (uint*)output + i, (Vector<uint>)result);
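+
+                    // Advance by the vector length and recompute the predicate.
+                    // CreateWhileLessThanMask32Bit returns a partial mask on the
+                    // final iteration, so the predicated loads and stores above
+                    // also cover the tail; no scalar remainder loop is needed.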
+                    i += cntw;
+                    pLoop = Sve.CreateWhileLessThanMask32Bit(i, Size);
+                }
+            }
+        }
+
+    }
+}
diff --git a/src/benchmarks/micro/sve/FP64Overflow.cs b/src/benchmarks/micro/sve/FP64Overflow.cs
new file mode 100644
index 00000000000..14a4647eb3b
--- /dev/null
+++ b/src/benchmarks/micro/sve/FP64Overflow.cs
@@ -0,0 +1,223 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class FP64Overflow
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve2.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private double[] _input1;
+        private double[] _input2;
+        private double[] _output;
+        private long[] _exponent;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _input1 = new double[Size];
+            _input2 = new double[Size];
+            double[] vals = ValuesGenerator.Array<double>(Size * 2);
+            for (int i = 0; i < Size; i++)
+            {
+                // Offset the inputs to the far negative end of the double range
+                // (Double.MinValue == -Double.MaxValue) to exercise large exponents.
+                _input1[i] = vals[i] + Double.MinValue;
+                _input2[i] = vals[Size + i];
+            }
+            _output = new double[Size];
+            _exponent = new long[Size];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            double[] current = (double[])_output.Clone();
+            long[] current_exp = (long[])_exponent.Clone();
+            Setup();
+            Scalar();
+            double[] scalar = (double[])_output.Clone();
+            long[] scalar_exp = (long[])_exponent.Clone();
+
+            // Check that the result matches the scalar version exactly.
+            for (int i = 0; i < Size; i++)
+            {
+                Debug.Assert(current[i] == scalar[i]);
+                Debug.Assert(current_exp[i] == scalar_exp[i]);
+            }
+        }
+
+        // The following algorithms are adapted from the Arm simd-loops repository:
+        // https://gitlab.arm.com/architecture/simd-loops/-/blob/main/loops/loop_111.c
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            fixed (long* exponent = _exponent)
+            {
+                long mask = 1023;
+
+                for (int i = 0; i < Size; i++)
+                {
+                    // Reinterpret the double's bits as ulong.
+                    ulong in1Bits = *(ulong*)&input1[i];
+                    // Extract the exponent bits by shifting left by 1 then right by 53.
+                    long exp = (long)((in1Bits << 1) >> 53);
+                    long scale = mask - exp;
+                    output[i] = Math.ScaleB(input1[i], (int)scale);
+                    output[i] *= input2[i];
+                    // Calculate exponent offset.
+                    exponent[i] = exp - mask;
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128FP64Overflow()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            fixed (long* exponent = _exponent)
+            {
+                int i = 0;
+                long mask = 1023;
+                Vector128<long> maskVec = Vector128.Create(mask);
+                Vector128<long> scaleMask = Vector128.Create(~(2047L << 52));
+
+                for (; i <= Size - 2; i += 2)
+                {
+                    Vector128<double> in1Vec = AdvSimd.LoadVector128(input1 + i);
+                    Vector128<double> in2Vec = AdvSimd.LoadVector128(input2 + i);
+                    Vector128<long> in1Bits = in1Vec.AsInt64();
+
+                    // Extract the exponent bits by shifting left by 1 then right by 53.
+                    Vector128<long> exp = AdvSimd.ShiftRightLogical(AdvSimd.ShiftLeftLogical(in1Vec.AsUInt64(), 1), 53).AsInt64();
+                    Vector128<long> scale = AdvSimd.Subtract(maskVec, exp);
+
+                    // Calculate ScaleB(in1Vec, scale).
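+                    // Sketch of the bit trick, assuming normal, in-range values
+                    // as in the reference loop: adding (scale << 52) to the raw
+                    // bits of a double adds `scale` to its biased exponent,
+                    // which is exactly ScaleB for normal results. Subnormals,
+                    // infinities and NaNs are not handled here.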
+                    scale = AdvSimd.ShiftLeftLogical(scale, 52);
+                    Vector128<long> outBits = AdvSimd.Add(in1Bits, scale);
+                    in1Bits = AdvSimd.And(in1Bits, scaleMask);
+                    outBits = AdvSimd.Or(in1Bits, outBits);
+                    Vector128<double> outVec = outBits.AsDouble();
+                    outVec = AdvSimd.Arm64.Multiply(outVec, in2Vec);
+
+                    // Store result to output array.
+                    AdvSimd.Store(output + i, outVec);
+
+                    // Calculate exponent offset.
+                    exp = AdvSimd.Subtract(exp, maskVec);
+                    // Store result to exponent array.
+                    AdvSimd.Store(exponent + i, exp);
+                }
+                // Handle tail.
+                for (; i < Size; i++)
+                {
+                    ulong in1Bits = *(ulong*)&input1[i];
+                    long exp = (long)((in1Bits << 1) >> 53);
+                    long scale = mask - exp;
+                    output[i] = Math.ScaleB(input1[i], (int)scale);
+                    output[i] *= input2[i];
+                    exponent[i] = exp - mask;
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveFP64Overflow()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            fixed (long* exponent = _exponent)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+
+                Vector<long> maskVec = new Vector<long>(1023);
+
+                Vector<ulong> pTrue = Sve.CreateTrueMaskUInt64();
+                Vector<ulong> pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    // Load as Vector<ulong>, then reinterpret as Vector<double>.
+                    Vector<ulong> in1Bits = Sve.LoadVector(pLoop, (ulong*)input1 + i);
+                    Vector<double> in1Vec = (Vector<double>)in1Bits;
+                    Vector<double> in2Vec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input2 + i);
+
+                    // Extract the exponent bits by shifting left by 1 then right by 53.
+                    Vector<long> exp = (Vector<long>)Sve.ShiftRightLogical(Sve.ShiftLeftLogical(in1Bits, new Vector<ulong>(1)), new Vector<ulong>(53));
+
+                    // Compute the output.
+                    Vector<long> scale = Sve.Subtract(maskVec, exp);
+                    Vector<double> outVec = Sve.Scale(in1Vec, scale);
+                    outVec = Sve.Multiply(outVec, in2Vec);
+                    // Store result to output array.
+                    Sve.StoreAndZip(pLoop, (ulong*)output + i, (Vector<ulong>)outVec);
+
+                    // Calculate exponent offset.
+                    exp = Sve.Subtract(exp, maskVec);
+                    // Store result to exponent array.
+                    Sve.StoreAndZip(pLoop, (ulong*)exponent + i, (Vector<ulong>)exp);
+
+                    // Advance by the vector length; the predicate handles the tail.
+                    i += cntd;
+                    pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Sve2FP64Overflow()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            fixed (long* exponent = _exponent)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+
+                Vector<ulong> pTrue = Sve.CreateTrueMaskUInt64();
+                Vector<ulong> pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    // Load input vectors.
+                    Vector<double> in1Vec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input1 + i);
+                    Vector<double> in2Vec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input2 + i);
+
+                    // Get the exponent by taking the base-2 logarithm.
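+                    // Sve2.Log2 maps to the SVE2 FLOGB instruction, which
+                    // returns the unbiased base-2 exponent as an integer
+                    // vector, so no bit twiddling or bias subtraction is
+                    // needed in this kernel.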
+                    Vector<long> exp = Sve2.Log2(in1Vec);
+
+                    // Compute the output.
+                    Vector<long> scale = Sve.Negate(exp);
+                    Vector<double> outVec = Sve.Scale(in1Vec, scale);
+                    outVec = Sve.Multiply(outVec, in2Vec);
+                    // Store result to output array.
+                    Sve.StoreAndZip(pLoop, (ulong*)output + i, (Vector<ulong>)outVec);
+                    // Store result to exponent array.
+                    Sve.StoreAndZip(pLoop, (ulong*)exponent + i, (Vector<ulong>)exp);
+
+                    // Advance by the vector length; the predicate handles the tail.
+                    i += cntd;
+                    pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                }
+            }
+        }
+
+    }
+}
diff --git a/src/benchmarks/micro/sve/FastDivision.cs b/src/benchmarks/micro/sve/FastDivision.cs
new file mode 100644
index 00000000000..e78acfafaea
--- /dev/null
+++ b/src/benchmarks/micro/sve/FastDivision.cs
@@ -0,0 +1,154 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class FastDivision
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private double[] _input1;
+        private double[] _input2;
+        private double[] _output;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _input1 = new double[Size];
+            _input2 = new double[Size];
+
+            double[] vals = ValuesGenerator.Array<double>(Size * 2);
+            for (int i = 0; i < Size; i++)
+            {
+                _input1[i] = vals[i];
+                _input2[i] = vals[Size + i];
+            }
+            _output = new double[Size];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            double[] current = (double[])_output.Clone();
+            Setup();
+            Scalar();
+            double[] scalar = (double[])_output.Clone();
+
+            // Check that the result matches the scalar version (within 3 ULP).
+            for (int i = 0; i < Size; i++)
+            {
+                int e = (int)((BitConverter.DoubleToUInt64Bits(scalar[i]) >> 52) & 0x7ff);
+                if (e == 0) e++;
+                double ulpScale = Math.ScaleB(1.0, e - 1023 - 52);
+                double ulpError = Math.Abs(current[i] - scalar[i]) / ulpScale;
+                Debug.Assert(ulpError <= 3);
+            }
+        }
+
+        // The following algorithms are adapted from the Arm simd-loops repository:
+        // https://gitlab.arm.com/architecture/simd-loops/-/blob/main/loops/loop_028.c
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            {
+                for (int i = 0; i < Size; i++)
+                {
+                    output[i] = input1[i] / input2[i];
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128FastDivision()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            {
+                int i = 0;
+                for (; i <= Size - 2; i += 2)
+                {
+                    Vector128<double> input1Vec = AdvSimd.LoadVector128(input1 + i);
+                    Vector128<double> input2Vec = AdvSimd.LoadVector128(input2 + i);
+
+                    // Estimate the reciprocal 1/input2Vec.
+                    Vector128<double> input2VecInv = AdvSimd.Arm64.ReciprocalEstimate(input2Vec);
+
+                    // Iteratively refine the estimate with the reciprocal step.
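+                    // Each FRECPS step computes (2 - d * est), the Newton-Raphson
+                    // correction factor; multiplying by it roughly doubles the
+                    // number of correct bits, so the ~8-bit FRECPE estimate
+                    // reaches near full double precision after three refinements.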
+                    Vector128<double> stp2;
+                    for (int j = 0; j < 3; j++)
+                    {
+                        stp2 = AdvSimd.Arm64.ReciprocalStep(input2Vec, input2VecInv);
+                        input2VecInv = AdvSimd.Arm64.Multiply(input2VecInv, stp2);
+                    }
+
+                    // Get the result of input1Vec * (1/input2Vec).
+                    Vector128<double> outVec = AdvSimd.Arm64.Multiply(input2VecInv, input1Vec);
+                    AdvSimd.Store(output + i, outVec);
+                }
+                // Handle tail.
+                for (; i < Size; i++)
+                {
+                    output[i] = input1[i] / input2[i];
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveFastDivision()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+
+                Vector<ulong> pTrue = Sve.CreateTrueMaskUInt64();
+                Vector<ulong> pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    Vector<double> input1Vec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input1 + i);
+                    Vector<double> input2Vec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input2 + i);
+
+                    // Estimate the reciprocal 1/input2Vec.
+                    Vector<double> input2VecInv = Sve.ReciprocalEstimate(input2Vec);
+
+                    // Iteratively refine the estimate with the reciprocal step.
+                    Vector<double> stp2;
+                    for (int j = 0; j < 3; j++)
+                    {
+                        stp2 = Sve.ReciprocalStep(input2Vec, input2VecInv);
+                        input2VecInv = Sve.Multiply(input2VecInv, stp2);
+                    }
+
+                    // Get the result of input1Vec * (1/input2Vec).
+                    Vector<double> outVec = Sve.Multiply(input2VecInv, input1Vec);
+                    Sve.StoreAndZip(pLoop, (ulong*)output + i, (Vector<ulong>)outVec);
+
+                    i += cntd;
+                    pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                }
+            }
+        }
+
+    }
+}
diff --git a/src/benchmarks/micro/sve/MultiplyPow2.cs b/src/benchmarks/micro/sve/MultiplyPow2.cs
new file mode 100644
index 00000000000..012401725b7
--- /dev/null
+++ b/src/benchmarks/micro/sve/MultiplyPow2.cs
@@ -0,0 +1,165 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class MultiplyPow2
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private double[] _input;
+        private long[] _scale;
+        private double[] _output;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _input = ValuesGenerator.Array<double>(Size);
+            _scale = ValuesGenerator.Array<long>(Size);
+
+            for (int i = 0; i < Size; i++)
+            {
+                // Map the scale into the range [-128, 128).
+                _scale[i] = _scale[i] % 256 - 128;
+            }
+            _output = new double[Size];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            double[] current = (double[])_output.Clone();
+            Setup();
+            Scalar();
+            double[] scalar = (double[])_output.Clone();
+
+            // Check that the result matches the scalar version exactly.
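+            // Scaling by a power of two only adjusts the exponent, so FSCALE
+            // and Math.ScaleB are expected to agree bit-for-bit here and no
+            // ULP tolerance is needed (unlike the division and exponent
+            // benchmarks).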
+            for (int i = 0; i < Size; i++)
+            {
+                Debug.Assert(current[i] == scalar[i]);
+            }
+        }
+
+        // The following algorithms are adapted from the Arm simd-loops repository:
+        // https://gitlab.arm.com/architecture/simd-loops/-/blob/main/loops/loop_029.c
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (double* input = _input, output = _output)
+            fixed (long* scale = _scale)
+            {
+                for (int i = 0; i < Size; i++)
+                {
+                    output[i] = Math.ScaleB(input[i], (int)scale[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128MultiplyPow2()
+        {
+            fixed (double* input = _input, output = _output)
+            fixed (long* scale = _scale)
+            {
+                Vector128<long> mask = Vector128.Create(~(2047L << 52));
+
+                int i = 0;
+                for (; i <= Size - 2; i += 2)
+                {
+                    Vector128<long> inVec = AdvSimd.LoadVector128((long*)input + i);
+                    Vector128<long> scaleVec = AdvSimd.LoadVector128(scale + i);
+
+                    // Add the scale directly to the biased exponent bits.
+                    scaleVec = AdvSimd.ShiftLeftLogical(scaleVec, 52);
+
+                    Vector128<long> outVec = AdvSimd.Add(inVec, scaleVec);
+                    inVec = AdvSimd.And(inVec, mask);
+                    outVec = AdvSimd.Or(inVec, outVec);
+
+                    AdvSimd.Store((long*)output + i, outVec);
+                }
+                // Handle tail.
+                for (; i < Size; i++)
+                {
+                    output[i] = Math.ScaleB(input[i], (int)scale[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveMultiplyPow2()
+        {
+            fixed (double* input = _input, output = _output)
+            fixed (long* scale = _scale)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+
+                Vector<ulong> pTrue = Sve.CreateTrueMaskUInt64();
+                Vector<ulong> pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    // Cast the array pointers to ulong so the predicate can be shared.
+                    // Avoid casting predicate vectors.
+                    Vector<double> inVec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input + i);
+                    Vector<long> scaleVec = (Vector<long>)Sve.LoadVector(pLoop, (ulong*)scale + i);
+
+                    Vector<double> outVec = Sve.Scale(inVec, scaleVec);
+                    Sve.StoreAndZip(pLoop, (ulong*)output + i, (Vector<ulong>)outVec);
+
+                    // Advance by the vector length; the predicate handles the tail.
+                    i += cntd;
+                    pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveTail()
+        {
+            fixed (double* input = _input, output = _output)
+            fixed (long* scale = _scale)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+
+                Vector<long> pTrue = Sve.CreateTrueMaskInt64();
+                Vector<double> pTrueD = Sve.CreateTrueMaskDouble();
+                for (; i <= Size - cntd; i += cntd)
+                {
+                    Vector<double> inVec = Sve.LoadVector(pTrueD, input + i);
+                    Vector<long> scaleVec = Sve.LoadVector(pTrue, scale + i);
+
+                    Vector<double> outVec = Sve.Scale(inVec, scaleVec);
+                    Sve.StoreAndZip(pTrueD, output + i, outVec);
+                }
+                // Handle tail.
+                for (; i < Size; i++)
+                {
+                    output[i] = Math.ScaleB(input[i], (int)scale[i]);
+                }
+            }
+        }
+
+    }
+}