-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Add vectorization to improve CRC32 performance #83321
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 1 commit
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
03dbc18
Add x86 intrinsics to improve CRC32 performance
brantburnett 24114dc
Use vector operator overloads and ref byte indexing
brantburnett 7df1df4
Fix error and remove ref ROS
brantburnett ddfbe79
Drop aggressive inlining and legibility improvements
brantburnett 340317e
Don't overcheck intrinsics
brantburnett 49f970d
First pass at ARM support
brantburnett 6bacc2a
Merge branch 'main' into crc32-x86
brantburnett 068e79f
ARM tweaks
brantburnett c38eac8
A bit of cleanup for legibility
brantburnett 12563ec
A little more cleanup
brantburnett 3c9e1d7
Add license notices
brantburnett 3b7c981
Move vector shift right to helper function
brantburnett eebae5e
A bit of cleanup
brantburnett 7122e02
Use System.Diagnostics.UnreachableException
brantburnett ec5ed7c
Use ReadUnaligned for ARM CRC
brantburnett File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Use vector operator overloads and ref byte indexing
- Loading branch information
commit 24114dcae8b94d0e5f7bf989b02c9c26f9a466ef
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,113 +10,130 @@ namespace System.IO.Hashing | |
{ | ||
public partial class Crc32 | ||
{ | ||
private const int X86MinBufferSize = 64; | ||
private const int X86BlockSize = 64; | ||
|
||
// Processes the bytes in source in X86MinBufferSize chunks using x86 intrinsics. After completion source is updated | ||
// to refer to any remaining bytes (at most X86MinBufferSize-1). Requires support for Sse2 and Pclmulqdq intrinsics. | ||
private const byte CarrylessMultiplyLower = 0x00; | ||
private const byte CarrylessMultiplyUpper = 0x11; | ||
private const byte CarrylessMultiplyLeftLowerRightUpper = 0x10; | ||
|
||
// Processes the bytes in source in X86BlockSize chunks using x86 intrinsics, followed by processing 16 | ||
// byte chunks. After completion source is updated to refer to any remaining bytes (at most 15). | |
// Requires support for Sse2 and Pclmulqdq intrinsics. | ||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
|
||
private static uint UpdateX86(uint crc, ref ReadOnlySpan<byte> source) | ||
{ | ||
if (source.Length < X86MinBufferSize) | ||
if (source.Length < X86BlockSize) | ||
{ | ||
return crc; | ||
} | ||
|
||
// There's at least one block of 64. | ||
Vector128<ulong> x1 = Vector128.Create(MemoryMarshal.Cast<byte, ulong>(source)); | ||
Vector128<ulong> x2 = Vector128.Create(MemoryMarshal.Cast<byte, ulong>(source.Slice(16))); | ||
Vector128<ulong> x3 = Vector128.Create(MemoryMarshal.Cast<byte, ulong>(source.Slice(32))); | ||
Vector128<ulong> x4 = Vector128.Create(MemoryMarshal.Cast<byte, ulong>(source.Slice(48))); | ||
// Work with a reference to where we're at in the ReadOnlySpan and a local length | ||
// to avoid extraneous range checks. | ||
ref byte srcRef = ref MemoryMarshal.GetReference(source); | ||
int length = source.Length; | ||
|
||
Vector128<ulong> x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64(); | ||
Vector128<ulong> x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64(); | ||
Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64(); | ||
Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64(); | ||
Vector128<ulong> x5; | ||
|
||
x1 = Vector128.Xor(x1, Vector128.CreateScalar((ulong) crc)); | ||
x1 ^= Vector128.CreateScalar((ulong) crc); | ||
Vector128<ulong> x0 = Vector128.Create(0x0154442bd4, 0x01c6e41596).AsUInt64(); // k1, k2 | ||
|
||
source = source.Slice(64); | ||
srcRef = ref Unsafe.Add(ref srcRef, X86BlockSize); | ||
length -= X86BlockSize; | ||
|
||
// Parallel fold blocks of 64, if any. | ||
while (source.Length >= 64) | ||
while (length >= X86BlockSize) | ||
{ | ||
x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00); | ||
Vector128<ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00); | ||
Vector128<ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00); | ||
Vector128<ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00); | ||
|
||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11); | ||
x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11); | ||
x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11); | ||
x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11); | ||
|
||
Vector128<ulong> y5 = Vector128.Create(MemoryMarshal.Cast<byte, ulong>(source)); | ||
Vector128<ulong> y6 = Vector128.Create(MemoryMarshal.Cast<byte, ulong>(source.Slice(16))); | ||
Vector128<ulong> y7 = Vector128.Create(MemoryMarshal.Cast<byte, ulong>(source.Slice(32))); | ||
Vector128<ulong> y8 = Vector128.Create(MemoryMarshal.Cast<byte, ulong>(source.Slice(48))); | ||
|
||
x1 = Vector128.Xor(x1, x5); | ||
x2 = Vector128.Xor(x2, x6); | ||
x3 = Vector128.Xor(x3, x7); | ||
x4 = Vector128.Xor(x4, x8); | ||
|
||
x1 = Vector128.Xor(x1, y5); | ||
x2 = Vector128.Xor(x2, y6); | ||
x3 = Vector128.Xor(x3, y7); | ||
x4 = Vector128.Xor(x4, y8); | ||
|
||
source = source.Slice(64); | ||
x5 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower); | ||
Vector128<ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, CarrylessMultiplyLower); | ||
Vector128<ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, CarrylessMultiplyLower); | ||
Vector128<ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, CarrylessMultiplyLower); | ||
|
||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyUpper); | ||
x2 = Pclmulqdq.CarrylessMultiply(x2, x0, CarrylessMultiplyUpper); | ||
x3 = Pclmulqdq.CarrylessMultiply(x3, x0, CarrylessMultiplyUpper); | ||
x4 = Pclmulqdq.CarrylessMultiply(x4, x0, CarrylessMultiplyUpper); | ||
|
||
Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64(); | ||
Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64(); | ||
Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64(); | ||
Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64(); | ||
|
||
x1 ^= x5; | ||
x2 ^= x6; | ||
x3 ^= x7; | ||
x4 ^= x8; | ||
|
||
x1 ^= y5; | ||
x2 ^= y6; | ||
x3 ^= y7; | ||
x4 ^= y8; | ||
|
||
srcRef = ref Unsafe.Add(ref srcRef, X86BlockSize); | ||
length -= X86BlockSize; | ||
} | ||
|
||
// Fold into 128-bits. | ||
x0 = Vector128.Create(0x01751997d0, 0x00ccaa009e).AsUInt64(); // k3, k4 | ||
|
||
x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00); | ||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11); | ||
x1 = Vector128.Xor(x1, x2); | ||
x1 = Vector128.Xor(x1, x5); | ||
x5 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower); | ||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyUpper); | ||
x1 ^= x2; | ||
x1 ^= x5; | ||
|
||
x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00); | ||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11); | ||
x1 = Vector128.Xor(x1, x3); | ||
x1 = Vector128.Xor(x1, x5); | ||
x5 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower); | ||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyUpper); | ||
x1 ^= x3; | ||
x1 ^= x5; | ||
|
||
x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00); | ||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11); | ||
x1 = Vector128.Xor(x1, x4); | ||
x1 = Vector128.Xor(x1, x5); | ||
x5 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower); | ||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyUpper); | ||
x1 ^= x4; | ||
x1 ^= x5; | ||
|
||
// Single fold blocks of 16, if any. | ||
while (source.Length >= 16) | ||
while (length >= 16) | ||
{ | ||
x2 = Vector128.Create(MemoryMarshal.Cast<byte, ulong>(source)); | ||
x2 = Vector128.LoadUnsafe(ref srcRef).AsUInt64(); | ||
|
||
x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00); | ||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11); | ||
x1 = Vector128.Xor(x1, x2); | ||
x1 = Vector128.Xor(x1, x5); | ||
x5 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower); | ||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyUpper); | ||
x1 ^= x2; | ||
x1 ^= x5; | ||
|
||
source = source.Slice(16); | ||
srcRef = ref Unsafe.Add(ref srcRef, 16); | ||
length -= 16; | ||
} | ||
|
||
// Fold 128 - bits to 64 - bits. | ||
x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10); | ||
// Fold 128 bits to 64 bits. | ||
x2 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLeftLowerRightUpper); | ||
x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); | ||
x1 = Sse2.ShiftRightLogical128BitLane(x1, 8); | ||
x1 = Vector128.Xor(x1, x2); | ||
x1 ^= x2; | ||
|
||
x0 = Vector128.CreateScalar(0x0163cd6124).AsUInt64(); // k5, k0 | ||
|
||
x2 = Sse2.ShiftRightLogical128BitLane(x1, 4); | ||
x1 = Vector128.BitwiseAnd(x1, x3); | ||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00); | ||
x1 = Vector128.Xor(x1, x2); | ||
x1 &= x3; | ||
x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower); | ||
x1 ^= x2; | ||
|
||
// Reduce to 32-bits. | ||
// Reduce to 32 bits. | ||
x0 = Vector128.Create(0x01db710641, 0x01f7011641).AsUInt64(); // polynomial | ||
|
||
x2 = Vector128.BitwiseAnd(x1, x3); | ||
x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10); | ||
x2 = Vector128.BitwiseAnd(x2, x3); | ||
x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00); | ||
x1 = Vector128.Xor(x1, x2); | ||
x2 &= x3; | ||
x2 = Pclmulqdq.CarrylessMultiply(x2, x0, CarrylessMultiplyLeftLowerRightUpper); | ||
x2 &= x3; | ||
x2 = Pclmulqdq.CarrylessMultiply(x2, x0, CarrylessMultiplyLower); | ||
x1 ^= x2; | ||
|
||
// Update the source ReadOnlySpan to refer to the remaining data | ||
source = length > 0 | ||
? MemoryMarshal.CreateReadOnlySpan(ref srcRef, length) | ||
: ReadOnlySpan<byte>.Empty; | ||
|
||
return x1.AsUInt32().GetElement(1); | ||
} | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: I'm generally a fan of putting values like this into named consts, but in this particular case I think it actually muddies the water. There are a bunch of other related const values throughout the code, e.g. 16, 32, 48, that don't have or need such a name, but then when 64 is used there is a name, which to me at least makes it harder to understand the relationship and code. I'd just inline this number into where it's used, and put a comment on the very first use in the up-front guard check that explains where the number comes from.
You could also avoid the numbers and named consts and use things like `Vector128<byte>.Count` and `Vector128<byte>.Count * 4` throughout the code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After considering it, I agree. The constant was a holdover from a previous iteration where I was checking the length before calling the method, which made it harder to intuit the value. Since you requested that the length check be moved to the `Update` method, I've left the constant for that purpose only and renamed it appropriately. All the other sites use `Vector128<byte>.Count`.
Let me know if you still think the `Update` method should just use `Vector128<byte>.Count * 4`. I just thought it made things clearer when the logic is split between two files.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe I have this all resolved. Thanks.