Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Use vector operator overloads and ref byte indexing
  • Loading branch information
brantburnett committed Mar 13, 2023
commit 24114dcae8b94d0e5f7bf989b02c9c26f9a466ef
155 changes: 86 additions & 69 deletions src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32.X86.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,113 +10,130 @@ namespace System.IO.Hashing
{
public partial class Crc32
{
// Minimum source length, in bytes, for the x86 vectorized path.
private const int X86MinBufferSize = 64;
// Bytes handled per block by the x86 vectorized path (four 16-byte vectors).
private const int X86BlockSize = 64;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I'm generally a fan of putting values like this into named consts, but in this particular case I think it actually muddies the water. There are a bunch of other related const values throughout the code, e.g. 16, 32, 48, that don't have or need such a name, but then when 64 is used there is a name, which to me at least makes it harder to understand the relationship and code. I'd just inline this number into where it's used, and put a comment on the very first use in the up-front guard check that explains where the number comes from.

You could also avoid the numbers and named consts and use things like Vector128<byte>.Count and Vector128<byte>.Count * 4 throughout the code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After considering it, I agree. The constant was a holdover from a previous iteration where I was checking the length before calling the method, which made it harder to intuit the value. Since you requested that the length check be moved to the Update method, I've left the constant for that purpose only and renamed it appropriately. All the other sites use Vector128<byte>.Count.

Let me know if you still think the Update method should just use Vector128<byte>.Count * 4. I just thought it made things clearer when the logic is split between two files.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe I have this all resolved. Thanks.


// Processes the bytes in source in X86MinBufferSize chunks using x86 intrinsics. After completion source is updated
// to refer to any remaining bytes (at most X86MinBufferSize-1). Requires support for Sse2 and Pclmulqdq intrinsics.
// Control bytes passed as the third argument to Pclmulqdq.CarrylessMultiply. They select which 64-bit
// half of each operand takes part in the multiplication: lower x lower, upper x upper, and
// lower half of the left operand x upper half of the right operand, respectively.
private const byte CarrylessMultiplyLower = 0x00;
private const byte CarrylessMultiplyUpper = 0x11;
private const byte CarrylessMultiplyLeftLowerRightUpper = 0x10;

// Processes the bytes in source in X86BlockSize chunks using x86 intrinsics, followed by processing 16
// byte chunks. After completion source is updated to refer to any remaining bytes (at most 15).
// Requires support for Sse2 and Pclmulqdq intrinsics.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This results in ~800 bytes of asm. We don't want to inline it :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

// Folds as much of source as possible into crc using carry-less multiplication.
//
// crc:     the running value; it is XORed into the first 16 bytes of input before folding.
// source:  on entry, the data to process; on return, the unprocessed tail (fewer than 16 bytes).
// returns: crc unchanged when source is shorter than X86BlockSize (source is then left untouched),
//          otherwise the value obtained after folding and reducing the consumed bytes.
private static uint UpdateX86(uint crc, ref ReadOnlySpan<byte> source)
{
    if (source.Length < X86BlockSize)
    {
        return crc;
    }

    // Work with a reference to where we're at in the ReadOnlySpan and a local length
    // to avoid extraneous range checks.
    ref byte srcRef = ref MemoryMarshal.GetReference(source);
    int length = source.Length;

    // There's at least one block of 64.
    Vector128<ulong> x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
    Vector128<ulong> x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
    Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
    Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
    Vector128<ulong> x5;

    // Mix the incoming value into the lowest element of the first vector.
    x1 ^= Vector128.CreateScalar((ulong) crc);
    Vector128<ulong> x0 = Vector128.Create(0x0154442bd4, 0x01c6e41596).AsUInt64(); // k1, k2

    srcRef = ref Unsafe.Add(ref srcRef, X86BlockSize);
    length -= X86BlockSize;

    // Parallel fold blocks of 64, if any.
    while (length >= X86BlockSize)
    {
        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower);
        Vector128<ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, CarrylessMultiplyLower);
        Vector128<ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, CarrylessMultiplyLower);
        Vector128<ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, CarrylessMultiplyLower);

        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyUpper);
        x2 = Pclmulqdq.CarrylessMultiply(x2, x0, CarrylessMultiplyUpper);
        x3 = Pclmulqdq.CarrylessMultiply(x3, x0, CarrylessMultiplyUpper);
        x4 = Pclmulqdq.CarrylessMultiply(x4, x0, CarrylessMultiplyUpper);

        Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
        Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
        Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
        Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();

        x1 ^= x5;
        x2 ^= x6;
        x3 ^= x7;
        x4 ^= x8;

        x1 ^= y5;
        x2 ^= y6;
        x3 ^= y7;
        x4 ^= y8;

        srcRef = ref Unsafe.Add(ref srcRef, X86BlockSize);
        length -= X86BlockSize;
    }

    // Fold into 128-bits.
    x0 = Vector128.Create(0x01751997d0, 0x00ccaa009e).AsUInt64(); // k3, k4

    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower);
    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyUpper);
    x1 ^= x2;
    x1 ^= x5;

    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower);
    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyUpper);
    x1 ^= x3;
    x1 ^= x5;

    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower);
    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyUpper);
    x1 ^= x4;
    x1 ^= x5;

    // Single fold blocks of 16, if any.
    while (length >= 16)
    {
        x2 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();

        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyUpper);
        x1 ^= x2;
        x1 ^= x5;

        srcRef = ref Unsafe.Add(ref srcRef, 16);
        length -= 16;
    }

    // Fold 128 bits to 64 bits.
    x2 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLeftLowerRightUpper);
    x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // mask keeping the low 32 bits of each 64-bit element
    x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
    x1 ^= x2;

    x0 = Vector128.CreateScalar(0x0163cd6124).AsUInt64(); // k5, k0

    x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
    x1 &= x3;
    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, CarrylessMultiplyLower);
    x1 ^= x2;

    // Reduce to 32 bits.
    x0 = Vector128.Create(0x01db710641, 0x01f7011641).AsUInt64(); // polynomial

    x2 = Vector128.BitwiseAnd(x1, x3);
    x2 = Pclmulqdq.CarrylessMultiply(x2, x0, CarrylessMultiplyLeftLowerRightUpper);
    x2 &= x3;
    x2 = Pclmulqdq.CarrylessMultiply(x2, x0, CarrylessMultiplyLower);
    x1 ^= x2;

    // Update the source ReadOnlySpan to refer to the remaining data. When nothing is left, srcRef
    // points just past the consumed data, so hand back an empty span instead of building one from it.
    source = length > 0
        ? MemoryMarshal.CreateReadOnlySpan(ref srcRef, length)
        : ReadOnlySpan<byte>.Empty;

    return x1.AsUInt32().GetElement(1);
}
Expand Down