-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use AVX-512 in LowerCallMemcmp #84854
Conversation
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak. Issue Details: Contributes to #77034. Uses AVX-512 for bool Test(byte[] src, byte[] dst) =>
src.AsSpan(0, 100).SequenceEqual(dst); Old codegen (Main):; Method AlignedBenchmarks:Test(ubyte[],ubyte[]):bool:this
sub rsp, 40
test rdx, rdx
je SHORT G_M22779_IG09
cmp dword ptr [rdx+08H], 100
jb SHORT G_M22779_IG09
add rdx, 16
mov rcx, rdx
test r8, r8
jne SHORT G_M22779_IG04
xor rdx, rdx
xor eax, eax
jmp SHORT G_M22779_IG05
G_M22779_IG04:
lea rdx, bword ptr [r8+10H]
mov eax, dword ptr [r8+08H]
G_M22779_IG05:
cmp eax, 100
jne SHORT G_M22779_IG07
mov r8d, 100
call [System.SpanHelpers:SequenceEqual(byref,byref,ulong):bool]
jmp SHORT G_M22779_IG08
G_M22779_IG07:
xor eax, eax
G_M22779_IG08:
add rsp, 40
ret
G_M22779_IG09:
call [System.ThrowHelper:ThrowArgumentOutOfRangeException()]
int3
; Total bytes of code: 74 New codegen:; Method AlignedBenchmarks:Test(ubyte[],ubyte[]):bool:this
sub rsp, 40
vzeroupper
test rdx, rdx
je SHORT G_M22779_IG09
cmp dword ptr [rdx+08H], 100
jb SHORT G_M22779_IG09
add rdx, 16
test r8, r8
jne SHORT G_M22779_IG04
xor rax, rax
xor ecx, ecx
jmp SHORT G_M22779_IG05
G_M22779_IG04:
lea rax, bword ptr [r8+10H]
mov ecx, dword ptr [r8+08H]
G_M22779_IG05:
cmp ecx, 100
jne SHORT G_M22779_IG07
vmovups zmm0, zmmword ptr [rdx]
vmovups zmm1, zmmword ptr [rax]
vmovups zmm2, zmmword ptr [rdx+24H]
vmovups zmm3, zmmword ptr [rax+24H]
vpxorq zmm0, zmm0, zmm1
vpxorq zmm1, zmm2, zmm3
vporq zmm0, zmm0, zmm1
vxorps zmm1, zmm1, zmm1
vpcmpuq k1, zmm0, zmm1, 0
kortestb k1, k1
setb al
movzx rax, al
jmp SHORT G_M22779_IG08
G_M22779_IG07:
xor eax, eax
G_M22779_IG08:
vzeroupper
add rsp, 40
ret
G_M22779_IG09:
call [System.ThrowHelper:ThrowArgumentOutOfRangeException()]
int3
; Total bytes of code: 138 Benchmarks:using System;
using System.Runtime.InteropServices;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
// Benchmarks SequenceEqual over byte spans of varying lengths backed by
// 64-byte-aligned native memory, to measure the AVX-512 memcmp unrolling
// added in this PR. Results are exported as CSV via [CsvExporter].
[CsvExporter]
public unsafe class AlignedBenchmarks
{
// Raw pointers to the two native buffers being compared.
// NOTE(review): AlignedAlloc does not zero memory, so the buffers'
// contents — and thus the bool result — are unspecified; only the
// comparison *timing* is of interest here.
void* _src;
void* _dst;
[GlobalSetup]
public void GlobalSetup()
{
// Allocate two 1024-byte buffers with 64-byte alignment so every
// compared span starts on a 64-byte (ZMM-width) boundary.
_src = NativeMemory.AlignedAlloc(1024, 64);
_dst = NativeMemory.AlignedAlloc(1024, 64);
}
// Each benchmark compares the first N bytes of the two buffers.
// Lengths are chosen around the 32/64/128-byte vector-width boundaries
// to hit the best and worst cases of the unrolled comparison.
[Benchmark] public bool Compare_32_aligned() =>
new Span<byte>(_src, 32).SequenceEqual(new Span<byte>(_dst, 32));
[Benchmark] public bool Compare_48_aligned() =>
new Span<byte>(_src, 48).SequenceEqual(new Span<byte>(_dst, 48));
[Benchmark] public bool Compare_64_aligned() =>
new Span<byte>(_src, 64).SequenceEqual(new Span<byte>(_dst, 64));
// 65 bytes forces two overlapping loads (one of them misaligned) —
// expected to be the worst case just past the 64-byte boundary.
[Benchmark] public bool Compare_65_aligned() =>
new Span<byte>(_src, 65).SequenceEqual(new Span<byte>(_dst, 65));
[Benchmark] public bool Compare_100_aligned() =>
new Span<byte>(_src, 100).SequenceEqual(new Span<byte>(_dst, 100));
// 128 bytes = two perfectly aligned 64-byte loads — expected best case.
[Benchmark] public bool Compare_128_aligned() =>
new Span<byte>(_src, 128).SequenceEqual(new Span<byte>(_dst, 128));
[Benchmark] public bool Compare_129_aligned() =>
new Span<byte>(_src, 129).SequenceEqual(new Span<byte>(_dst, 129));
[Benchmark] public bool Compare_150_aligned() =>
new Span<byte>(_src, 150).SequenceEqual(new Span<byte>(_dst, 150));
[Benchmark] public bool Compare_200_aligned() =>
new Span<byte>(_src, 200).SequenceEqual(new Span<byte>(_dst, 200));
[GlobalCleanup]
public void GlobalCleanup()
{
// Release the native buffers allocated in GlobalSetup.
NativeMemory.Free(_src);
NativeMemory.Free(_dst);
}
} Also tested mis-aligned access and it was still faster than the baseline, although the penalty is noticeable.
|
/benchmark json aspnet-citrine-lin runtime |
Benchmark started for json on aspnet-citrine-lin with runtime. Logs: link |
A couple of diffs (size regressions obviously). @dotnet/avx512-contrib PTAL, I've checked that this has no negative impact on TE using crank and local builds |
# Conflicts: # src/coreclr/jit/emitxarch.cpp
cc @kunalspathak it never finished |
Contributes to #77034
Uses AVX-512 for
SequenceEqual
unrolling/vectorization for the [64..128]
range where we previously used to give up, e.g. Old codegen (Main):
New codegen:
Benchmarks:
(Ryzen 7950X with V512=V256*2)
Len=128 has the lowest (best) timing because it's the best case for the algorithm - the two 64-byte loads are perfectly aligned since the data is always 64-byte aligned. For the same reason, Len=65 (just past 64) has the worst timing. I also tested a case where the source data is misaligned, and AVX-512 was still faster than the baseline.