diff --git a/src/benchmarks/micro/sve/Exponent.cs b/src/benchmarks/micro/sve/Exponent.cs
new file mode 100644
index 00000000000..08fdaa7dc51
--- /dev/null
+++ b/src/benchmarks/micro/sve/Exponent.cs
@@ -0,0 +1,206 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class Exponent
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private float[] _input;
+        private float[] _data_sve;
+        private float[] _data_neon;
+        private float[] _output;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _input = ValuesGenerator.Array<float>(Size);
+
+            _data_sve = new float[] {
+                // c1, c3, inv_ln2
+                BitConverter.UInt32BitsToSingle(0x3f000000),
+                BitConverter.UInt32BitsToSingle(0x3d2aaab5),
+                BitConverter.UInt32BitsToSingle(0x3fb8aa3b),
+                // ln2_lo, c0, c2, c4
+                BitConverter.UInt32BitsToSingle(0x35bfbe8e),
+                BitConverter.UInt32BitsToSingle(0x3f800000),
+                BitConverter.UInt32BitsToSingle(0x3e2aaaab),
+                BitConverter.UInt32BitsToSingle(0x3c057330),
+                // ln2_hi, shift
+                BitConverter.UInt32BitsToSingle(0x3f317200),
+                BitConverter.UInt32BitsToSingle(0x48401fc0),
+            };
+
+            _data_neon = new float[] {
+                // inv_ln2, ln2_lo, c0, c2
+                BitConverter.UInt32BitsToSingle(0x3fb8aa3b),
+                BitConverter.UInt32BitsToSingle(0x35bfbe8e),
+                BitConverter.UInt32BitsToSingle(0x3c07cfce),
+                BitConverter.UInt32BitsToSingle(0x3e2aad40),
+                // ln2_hi, shift, c1, c3, c4
+                BitConverter.UInt32BitsToSingle(0x3f317200),
+                BitConverter.UInt32BitsToSingle(0x4b40007f),
+                BitConverter.UInt32BitsToSingle(0x3d2b9d0d),
+                BitConverter.UInt32BitsToSingle(0x3efffee3),
+                BitConverter.UInt32BitsToSingle(0x3f7ffffb),
+            };
+
+            _output = new float[Size];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            float[] current = (float[])_output.Clone();
+            Setup();
+            Scalar();
+            float[] scalar = (float[])_output.Clone();
+
+            // Check that the result matches the scalar version (within 3 ULP).
+            for (int i = 0; i < Size; i++)
+            {
+                int e = (int)((BitConverter.SingleToUInt32Bits(scalar[i]) >> 23) & 0xff);
+                if (e == 0) e++;
+                float ulpScale = (float)Math.ScaleB(1.0, e - 127 - 23);
+                float ulpError = (float)Math.Abs(current[i] - scalar[i]) / ulpScale;
+                Debug.Assert(ulpError <= 3);
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (float* input = _input, output = _output)
+            {
+                for (int i = 0; i < Size; i++)
+                {
+                    output[i] = (float)Math.Exp(input[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128Exponent()
+        {
+            // Algorithm based on Arm Optimized-Routines:
+            // https://github.com/ARM-software/optimized-routines/blob/v25.07/math/aarch64/advsimd/expf.c
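+            //
+            // Sketch of the range reduction (the standard expf scheme, matching
+            // the constants above): split x = n*ln2 + r with n an integer and
+            // |r| <= ln2/2, so that exp(x) = 2^n * exp(r). The "shift" constant
+            // forces round-to-nearest, z << 23 moves n + 127 into the exponent
+            // field to build 2^n, and exp(r) - 1 is approximated by a degree-4
+            // polynomial in r. The SVE kernel below builds 2^n with the FEXPA
+            // instruction instead.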
+            fixed (float* input = _input, output = _output, d = _data_neon)
+            {
+                int i = 0;
+
+                Vector128<float> constVec = AdvSimd.LoadVector128(d);
+                Vector128<float> ln2hiVec = Vector128.Create(d[4]);
+                Vector128<float> shiftVec = Vector128.Create(d[5]);
+                Vector128<float> c1Vec = Vector128.Create(d[6]);
+                Vector128<float> c3Vec = Vector128.Create(d[7]);
+                Vector128<float> c4Vec = Vector128.Create(d[8]);
+
+                for (; i <= Size - 4; i += 4)
+                {
+                    Vector128<float> x = AdvSimd.LoadVector128(input + i);
+
+                    // z = shift + x * 1/ln2
+                    Vector128<float> z = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(shiftVec, x, constVec, 0);
+                    // -n = shift - z
+                    Vector128<float> neg_n = AdvSimd.Subtract(shiftVec, z);
+                    // scale = z << 23
+                    Vector128<float> scale = AdvSimd.ShiftLeftLogical(z.AsUInt32(), 23).AsSingle();
+
+                    // r = x - n * ln2_hi
+                    Vector128<float> r = AdvSimd.FusedMultiplyAdd(x, neg_n, ln2hiVec);
+                    // r = r - n * ln2_lo
+                    r = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(r, neg_n, constVec, 1);
+                    Vector128<float> r2 = AdvSimd.Multiply(r, r);
+
+                    // poly(r) = exp(r) - 1.
+                    Vector128<float> p10 = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(c1Vec, r, constVec, 2);
+                    Vector128<float> p32 = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(c3Vec, r, constVec, 3);
+                    Vector128<float> p30 = AdvSimd.FusedMultiplyAdd(p32, r2, p10);
+                    Vector128<float> p4 = AdvSimd.Multiply(r, c4Vec);
+                    Vector128<float> poly = AdvSimd.FusedMultiplyAdd(p4, r2, p30);
+
+                    // result = scale * (1 + poly).
+                    Vector128<float> result = AdvSimd.FusedMultiplyAdd(scale, poly, scale);
+                    AdvSimd.Store(output + i, result);
+                }
+                // Handle remaining elements.
+                for (; i < Size; i++)
+                {
+                    output[i] = (float)Math.Exp(input[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveExponent()
+        {
+            fixed (float* input = _input, output = _output, d = _data_sve)
+            {
+                int i = 0;
+                int cntw = (int)Sve.Count32BitElements();
+
+                Vector<float> c1Vec = new Vector<float>(d[0]);
+                Vector<float> c3Vec = new Vector<float>(d[1]);
+                Vector<float> invln2Vec = new Vector<float>(d[2]);
+                Vector<float> shiftVec = new Vector<float>(d[8]);
+                Vector<float> ln2hiVec = new Vector<float>(d[7]);
+                Vector<float> constVec = Sve.LoadVector(Sve.CreateTrueMaskSingle(), &d[3]);
+
+                Vector<uint> pTrue = Sve.CreateTrueMaskUInt32();
+                Vector<uint> pLoop = Sve.CreateWhileLessThanMask32Bit(0, Size);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    Vector<float> x = (Vector<float>)Sve.LoadVector(pLoop, (uint*)(input + i));
+
+                    // n = round(x/(ln2/N)).
+                    Vector<float> z = Sve.FusedMultiplyAdd(shiftVec, invln2Vec, x);
+                    Vector<float> n = Sve.Subtract(z, shiftVec);
+
+                    // r = x - n*ln2/N.
+                    Vector<float> r = Sve.FusedMultiplySubtract(x, ln2hiVec, n);
+                    r = Sve.FusedMultiplySubtractBySelectedScalar(r, n, constVec, 0);
+                    // scale = 2^(n/N).
+                    Vector<float> scale = Sve.FloatingPointExponentialAccelerator((Vector<uint>)z);
+
+                    // poly(r) = exp(r) - 1.
+                    Vector<float> p12 = Sve.FusedMultiplyAddBySelectedScalar(c1Vec, r, constVec, 2);
+                    Vector<float> p34 = Sve.FusedMultiplyAddBySelectedScalar(c3Vec, r, constVec, 3);
+                    Vector<float> r2 = Sve.Multiply(r, r);
+                    Vector<float> p14 = Sve.FusedMultiplyAdd(p12, p34, r2);
+                    Vector<float> p0 = Sve.MultiplyBySelectedScalar(r, constVec, 1);
+                    Vector<float> poly = Sve.FusedMultiplyAdd(p0, r2, p14);
+
+                    // result = scale * (1 + poly).
+                    Vector<float> result = Sve.FusedMultiplyAdd(scale, poly, scale);
+                    Sve.StoreAndZip(pLoop, (uint*)output + i, (Vector<uint>)result);
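+
+                    // Advance by the vector length and recompute the predicate.
+                    // CreateWhileLessThanMask32Bit returns a partial mask on the
+                    // final iteration, so the predicated loads and stores above
+                    // also cover the tail; no scalar remainder loop is needed.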
+                    i += cntw;
+                    pLoop = Sve.CreateWhileLessThanMask32Bit(i, Size);
+                }
+            }
+        }
+
+    }
+}
diff --git a/src/benchmarks/micro/sve/FP64Overflow.cs b/src/benchmarks/micro/sve/FP64Overflow.cs
new file mode 100644
index 00000000000..14a4647eb3b
--- /dev/null
+++ b/src/benchmarks/micro/sve/FP64Overflow.cs
@@ -0,0 +1,223 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class FP64Overflow
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve2.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private double[] _input1;
+        private double[] _input2;
+        private double[] _output;
+        private long[] _exponent;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _input1 = new double[Size];
+            _input2 = new double[Size];
+            double[] vals = ValuesGenerator.Array<double>(Size * 2);
+            for (int i = 0; i < Size; i++)
+            {
+                // Offset the inputs to the far negative end of the double range
+                // (Double.MinValue == -Double.MaxValue) to exercise large exponents.
+                _input1[i] = vals[i] + Double.MinValue;
+                _input2[i] = vals[Size + i];
+            }
+            _output = new double[Size];
+            _exponent = new long[Size];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            double[] current = (double[])_output.Clone();
+            long[] current_exp = (long[])_exponent.Clone();
+            Setup();
+            Scalar();
+            double[] scalar = (double[])_output.Clone();
+            long[] scalar_exp = (long[])_exponent.Clone();
+
+            // Check that the result matches the scalar version exactly.
+            for (int i = 0; i < Size; i++)
+            {
+                Debug.Assert(current[i] == scalar[i]);
+                Debug.Assert(current_exp[i] == scalar_exp[i]);
+            }
+        }
+
+        // The following algorithms are adapted from the Arm simd-loops repository:
+        // https://gitlab.arm.com/architecture/simd-loops/-/blob/main/loops/loop_111.c
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            fixed (long* exponent = _exponent)
+            {
+                long mask = 1023;
+
+                for (int i = 0; i < Size; i++)
+                {
+                    // Reinterpret the double's bits as ulong.
+                    ulong in1Bits = *(ulong*)&input1[i];
+                    // Extract the exponent bits by shifting left by 1 then right by 53.
+                    long exp = (long)((in1Bits << 1) >> 53);
+                    long scale = mask - exp;
+                    output[i] = Math.ScaleB(input1[i], (int)scale);
+                    output[i] *= input2[i];
+                    // Calculate exponent offset.
+                    exponent[i] = exp - mask;
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128FP64Overflow()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            fixed (long* exponent = _exponent)
+            {
+                int i = 0;
+                long mask = 1023;
+                Vector128<long> maskVec = Vector128.Create(mask);
+                Vector128<long> scaleMask = Vector128.Create(~(2047L << 52));
+
+                for (; i <= Size - 2; i += 2)
+                {
+                    Vector128<double> in1Vec = AdvSimd.LoadVector128(input1 + i);
+                    Vector128<double> in2Vec = AdvSimd.LoadVector128(input2 + i);
+                    Vector128<long> in1Bits = in1Vec.AsInt64();
+
+                    // Extract the exponent bits by shifting left by 1 then right by 53.
+                    Vector128<long> exp = AdvSimd.ShiftRightLogical(AdvSimd.ShiftLeftLogical(in1Vec.AsUInt64(), 1), 53).AsInt64();
+                    Vector128<long> scale = AdvSimd.Subtract(maskVec, exp);
+
+                    // Calculate ScaleB(in1Vec, scale).
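+                    // Sketch of the bit trick, assuming normal, in-range values
+                    // as in the reference loop: adding (scale << 52) to the raw
+                    // bits of a double adds `scale` to its biased exponent,
+                    // which is exactly ScaleB for normal results. Subnormals,
+                    // infinities and NaNs are not handled here.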
+                    scale = AdvSimd.ShiftLeftLogical(scale, 52);
+                    Vector128<long> outBits = AdvSimd.Add(in1Bits, scale);
+                    in1Bits = AdvSimd.And(in1Bits, scaleMask);
+                    outBits = AdvSimd.Or(in1Bits, outBits);
+                    Vector128<double> outVec = outBits.AsDouble();
+                    outVec = AdvSimd.Arm64.Multiply(outVec, in2Vec);
+
+                    // Store result to output array.
+                    AdvSimd.Store(output + i, outVec);
+
+                    // Calculate exponent offset.
+                    exp = AdvSimd.Subtract(exp, maskVec);
+                    // Store result to exponent array.
+                    AdvSimd.Store(exponent + i, exp);
+                }
+                // Handle tail.
+                for (; i < Size; i++)
+                {
+                    ulong in1Bits = *(ulong*)&input1[i];
+                    long exp = (long)((in1Bits << 1) >> 53);
+                    long scale = mask - exp;
+                    output[i] = Math.ScaleB(input1[i], (int)scale);
+                    output[i] *= input2[i];
+                    exponent[i] = exp - mask;
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveFP64Overflow()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            fixed (long* exponent = _exponent)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+
+                Vector<long> maskVec = new Vector<long>(1023);
+
+                Vector<ulong> pTrue = Sve.CreateTrueMaskUInt64();
+                Vector<ulong> pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    // Load as Vector<ulong>, then reinterpret as Vector<double>.
+                    Vector<ulong> in1Bits = Sve.LoadVector(pLoop, (ulong*)input1 + i);
+                    Vector<double> in1Vec = (Vector<double>)in1Bits;
+                    Vector<double> in2Vec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input2 + i);
+
+                    // Extract the exponent bits by shifting left by 1 then right by 53.
+                    Vector<long> exp = (Vector<long>)Sve.ShiftRightLogical(Sve.ShiftLeftLogical(in1Bits, new Vector<ulong>(1)), new Vector<ulong>(53));
+
+                    // Compute the output.
+                    Vector<long> scale = Sve.Subtract(maskVec, exp);
+                    Vector<double> outVec = Sve.Scale(in1Vec, scale);
+                    outVec = Sve.Multiply(outVec, in2Vec);
+                    // Store result to output array.
+                    Sve.StoreAndZip(pLoop, (ulong*)output + i, (Vector<ulong>)outVec);
+
+                    // Calculate exponent offset.
+                    exp = Sve.Subtract(exp, maskVec);
+                    // Store result to exponent array.
+                    Sve.StoreAndZip(pLoop, (ulong*)exponent + i, (Vector<ulong>)exp);
+
+                    // Advance by the vector length; the predicate handles the tail.
+                    i += cntd;
+                    pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Sve2FP64Overflow()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            fixed (long* exponent = _exponent)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+
+                Vector<ulong> pTrue = Sve.CreateTrueMaskUInt64();
+                Vector<ulong> pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    // Load input vectors.
+                    Vector<double> in1Vec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input1 + i);
+                    Vector<double> in2Vec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input2 + i);
+
+                    // Get the exponent by taking the base-2 logarithm.
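+                    // Sve2.Log2 maps to the SVE2 FLOGB instruction, which
+                    // returns the unbiased base-2 exponent as an integer
+                    // vector, so no bit twiddling or bias subtraction is
+                    // needed in this kernel.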
+                    Vector<long> exp = Sve2.Log2(in1Vec);
+
+                    // Compute the output.
+                    Vector<long> scale = Sve.Negate(exp);
+                    Vector<double> outVec = Sve.Scale(in1Vec, scale);
+                    outVec = Sve.Multiply(outVec, in2Vec);
+                    // Store result to output array.
+                    Sve.StoreAndZip(pLoop, (ulong*)output + i, (Vector<ulong>)outVec);
+                    // Store result to exponent array.
+                    Sve.StoreAndZip(pLoop, (ulong*)exponent + i, (Vector<ulong>)exp);
+
+                    // Advance by the vector length; the predicate handles the tail.
+                    i += cntd;
+                    pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                }
+            }
+        }
+
+    }
+}
diff --git a/src/benchmarks/micro/sve/FastDivision.cs b/src/benchmarks/micro/sve/FastDivision.cs
new file mode 100644
index 00000000000..e78acfafaea
--- /dev/null
+++ b/src/benchmarks/micro/sve/FastDivision.cs
@@ -0,0 +1,154 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class FastDivision
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private double[] _input1;
+        private double[] _input2;
+        private double[] _output;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _input1 = new double[Size];
+            _input2 = new double[Size];
+
+            double[] vals = ValuesGenerator.Array<double>(Size * 2);
+            for (int i = 0; i < Size; i++)
+            {
+                _input1[i] = vals[i];
+                _input2[i] = vals[Size + i];
+            }
+            _output = new double[Size];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            double[] current = (double[])_output.Clone();
+            Setup();
+            Scalar();
+            double[] scalar = (double[])_output.Clone();
+
+            // Check that the result matches the scalar version (within 3 ULP).
+            for (int i = 0; i < Size; i++)
+            {
+                int e = (int)((BitConverter.DoubleToUInt64Bits(scalar[i]) >> 52) & 0x7ff);
+                if (e == 0) e++;
+                double ulpScale = Math.ScaleB(1.0, e - 1023 - 52);
+                double ulpError = Math.Abs(current[i] - scalar[i]) / ulpScale;
+                Debug.Assert(ulpError <= 3);
+            }
+        }
+
+        // The following algorithms are adapted from the Arm simd-loops repository:
+        // https://gitlab.arm.com/architecture/simd-loops/-/blob/main/loops/loop_028.c
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            {
+                for (int i = 0; i < Size; i++)
+                {
+                    output[i] = input1[i] / input2[i];
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128FastDivision()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            {
+                int i = 0;
+                for (; i <= Size - 2; i += 2)
+                {
+                    Vector128<double> input1Vec = AdvSimd.LoadVector128(input1 + i);
+                    Vector128<double> input2Vec = AdvSimd.LoadVector128(input2 + i);
+
+                    // Estimate the reciprocal 1/input2Vec.
+                    Vector128<double> input2VecInv = AdvSimd.Arm64.ReciprocalEstimate(input2Vec);
+
+                    // Iteratively refine the estimate with the reciprocal step.
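+                    // Each FRECPS step computes (2 - d * est), the Newton-Raphson
+                    // correction factor; multiplying by it roughly doubles the
+                    // number of correct bits, so the ~8-bit FRECPE estimate
+                    // reaches near full double precision after three refinements.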
+                    Vector128<double> stp2;
+                    for (int j = 0; j < 3; j++)
+                    {
+                        stp2 = AdvSimd.Arm64.ReciprocalStep(input2Vec, input2VecInv);
+                        input2VecInv = AdvSimd.Arm64.Multiply(input2VecInv, stp2);
+                    }
+
+                    // Get the result of input1Vec * (1/input2Vec).
+                    Vector128<double> outVec = AdvSimd.Arm64.Multiply(input2VecInv, input1Vec);
+                    AdvSimd.Store(output + i, outVec);
+                }
+                // Handle tail.
+                for (; i < Size; i++)
+                {
+                    output[i] = input1[i] / input2[i];
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveFastDivision()
+        {
+            fixed (double* input1 = _input1, input2 = _input2, output = _output)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+
+                Vector<ulong> pTrue = Sve.CreateTrueMaskUInt64();
+                Vector<ulong> pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    Vector<double> input1Vec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input1 + i);
+                    Vector<double> input2Vec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input2 + i);
+
+                    // Estimate the reciprocal 1/input2Vec.
+                    Vector<double> input2VecInv = Sve.ReciprocalEstimate(input2Vec);
+
+                    // Iteratively refine the estimate with the reciprocal step.
+                    Vector<double> stp2;
+                    for (int j = 0; j < 3; j++)
+                    {
+                        stp2 = Sve.ReciprocalStep(input2Vec, input2VecInv);
+                        input2VecInv = Sve.Multiply(input2VecInv, stp2);
+                    }
+
+                    // Get the result of input1Vec * (1/input2Vec).
+                    Vector<double> outVec = Sve.Multiply(input2VecInv, input1Vec);
+                    Sve.StoreAndZip(pLoop, (ulong*)output + i, (Vector<ulong>)outVec);
+
+                    i += cntd;
+                    pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                }
+            }
+        }
+
+    }
+}
diff --git a/src/benchmarks/micro/sve/MultiplyPow2.cs b/src/benchmarks/micro/sve/MultiplyPow2.cs
new file mode 100644
index 00000000000..012401725b7
--- /dev/null
+++ b/src/benchmarks/micro/sve/MultiplyPow2.cs
@@ -0,0 +1,165 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Extensions;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Filters;
+using MicroBenchmarks;
+
+namespace SveBenchmarks
+{
+    [BenchmarkCategory(Categories.Runtime)]
+    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
+    [Config(typeof(Config))]
+    public class MultiplyPow2
+    {
+        private class Config : ManualConfig
+        {
+            public Config()
+            {
+                AddFilter(new SimpleFilter(_ => Sve.IsSupported));
+            }
+        }
+
+        [Params(15, 127, 527, 10015)]
+        public int Size;
+
+        private double[] _input;
+        private long[] _scale;
+        private double[] _output;
+
+        [GlobalSetup]
+        public virtual void Setup()
+        {
+            _input = ValuesGenerator.Array<double>(Size);
+            _scale = ValuesGenerator.Array<long>(Size);
+
+            for (int i = 0; i < Size; i++)
+            {
+                // Map the scale into the range [-128, 128).
+                _scale[i] = _scale[i] % 256 - 128;
+            }
+            _output = new double[Size];
+        }
+
+        [GlobalCleanup]
+        public virtual void Verify()
+        {
+            double[] current = (double[])_output.Clone();
+            Setup();
+            Scalar();
+            double[] scalar = (double[])_output.Clone();
+
+            // Check that the result matches the scalar version exactly.
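+            // Scaling by a power of two only adjusts the exponent, so FSCALE
+            // and Math.ScaleB are expected to agree bit-for-bit here and no
+            // ULP tolerance is needed (unlike the division and exponent
+            // benchmarks).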
+            for (int i = 0; i < Size; i++)
+            {
+                Debug.Assert(current[i] == scalar[i]);
+            }
+        }
+
+        // The following algorithms are adapted from the Arm simd-loops repository:
+        // https://gitlab.arm.com/architecture/simd-loops/-/blob/main/loops/loop_029.c
+
+        [Benchmark]
+        public unsafe void Scalar()
+        {
+            fixed (double* input = _input, output = _output)
+            fixed (long* scale = _scale)
+            {
+                for (int i = 0; i < Size; i++)
+                {
+                    output[i] = Math.ScaleB(input[i], (int)scale[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void Vector128MultiplyPow2()
+        {
+            fixed (double* input = _input, output = _output)
+            fixed (long* scale = _scale)
+            {
+                Vector128<long> mask = Vector128.Create(~(2047L << 52));
+
+                int i = 0;
+                for (; i <= Size - 2; i += 2)
+                {
+                    Vector128<long> inVec = AdvSimd.LoadVector128((long*)input + i);
+                    Vector128<long> scaleVec = AdvSimd.LoadVector128(scale + i);
+
+                    // Add the scale directly to the biased exponent bits.
+                    scaleVec = AdvSimd.ShiftLeftLogical(scaleVec, 52);
+
+                    Vector128<long> outVec = AdvSimd.Add(inVec, scaleVec);
+                    inVec = AdvSimd.And(inVec, mask);
+                    outVec = AdvSimd.Or(inVec, outVec);
+
+                    AdvSimd.Store((long*)output + i, outVec);
+                }
+                // Handle tail.
+                for (; i < Size; i++)
+                {
+                    output[i] = Math.ScaleB(input[i], (int)scale[i]);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveMultiplyPow2()
+        {
+            fixed (double* input = _input, output = _output)
+            fixed (long* scale = _scale)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+
+                Vector<ulong> pTrue = Sve.CreateTrueMaskUInt64();
+                Vector<ulong> pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                while (Sve.TestFirstTrue(pTrue, pLoop))
+                {
+                    // Cast the array pointers to ulong so the predicate can be shared.
+                    // Avoid casting predicate vectors.
+                    Vector<double> inVec = (Vector<double>)Sve.LoadVector(pLoop, (ulong*)input + i);
+                    Vector<long> scaleVec = (Vector<long>)Sve.LoadVector(pLoop, (ulong*)scale + i);
+
+                    Vector<double> outVec = Sve.Scale(inVec, scaleVec);
+                    Sve.StoreAndZip(pLoop, (ulong*)output + i, (Vector<ulong>)outVec);
+
+                    // Advance by the vector length; the predicate handles the tail.
+                    i += cntd;
+                    pLoop = Sve.CreateWhileLessThanMask64Bit(i, Size);
+                }
+            }
+        }
+
+        [Benchmark]
+        public unsafe void SveTail()
+        {
+            fixed (double* input = _input, output = _output)
+            fixed (long* scale = _scale)
+            {
+                int i = 0;
+                int cntd = (int)Sve.Count64BitElements();
+
+                Vector<long> pTrue = Sve.CreateTrueMaskInt64();
+                Vector<double> pTrueD = Sve.CreateTrueMaskDouble();
+                for (; i <= Size - cntd; i += cntd)
+                {
+                    Vector<double> inVec = Sve.LoadVector(pTrueD, input + i);
+                    Vector<long> scaleVec = Sve.LoadVector(pTrue, scale + i);
+
+                    Vector<double> outVec = Sve.Scale(inVec, scaleVec);
+                    Sve.StoreAndZip(pTrueD, output + i, outVec);
+                }
+                // Handle tail.
+                for (; i < Size; i++)
+                {
+                    output[i] = Math.ScaleB(input[i], (int)scale[i]);
+                }
+            }
+        }
+
+    }
+}