We have a non-trivial number of implementations that duplicate loops across multiple hardware instruction sets plus a generalized Vector path, e.g.
`runtime/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs`, lines 266 to 434 at commit `9f93bcb`:
```csharp
if (Avx2.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        if ((((nuint)(uint)Unsafe.AsPointer(ref searchSpace) + offset) & (nuint)(Vector256<byte>.Count - 1)) != 0)
        {
            // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
            // with no upper bound e.g. String.strlen.
            // Start with a check on Vector128 to align to Vector256, before moving to processing Vector256.
            // This ensures we do not fault across memory pages while searching for an end of string.
            Vector128<byte> values = Vector128.Create(value);
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as below
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
            }
            else
            {
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            }
        }
        lengthToExamine = GetByteVector256SpanLength(offset, length);
        if (lengthToExamine > offset)
        {
            Vector256<byte> values = Vector256.Create(value);
            do
            {
                Vector256<byte> search = LoadVector256(ref searchSpace, offset);
                int matches = Avx2.MoveMask(Avx2.CompareEqual(values, search));
                // Note that MoveMask has converted the equal vector elements into a set of bit flags,
                // So the bit position in 'matches' corresponds to the element offset.
                if (matches == 0)
                {
                    // Zero flags set so no matches
                    offset += (nuint)Vector256<byte>.Count;
                    continue;
                }
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            } while (lengthToExamine > offset);
        }
        lengthToExamine = GetByteVector128SpanLength(offset, length);
        if (lengthToExamine > offset)
        {
            Vector128<byte> values = Vector128.Create(value);
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as above
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
            }
            else
            {
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            }
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (Sse2.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVector128SpanLength(offset, length);
        Vector128<byte> values = Vector128.Create(value);
        while (lengthToExamine > offset)
        {
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as above
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
                continue;
            }
            // Find bitflag offset of first match and add to current offset
            return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (AdvSimd.Arm64.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVector128SpanLength(offset, length);
        // Mask to help find the first lane in compareResult that is set.
        // MSB 0x10 corresponds to 1st lane, 0x01 corresponds to 0th lane and so forth.
        Vector128<byte> mask = Vector128.Create((ushort)0x1001).AsByte();
        int matchedLane = 0;
        Vector128<byte> values = Vector128.Create(value);
        while (lengthToExamine > offset)
        {
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            Vector128<byte> compareResult = AdvSimd.CompareEqual(values, search);
            if (!TryFindFirstMatchedLane(mask, compareResult, ref matchedLane))
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
                continue;
            }
            return (int)(offset + (uint)matchedLane);
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (Vector.IsHardwareAccelerated)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVectorSpanLength(offset, length);
        Vector<byte> values = new Vector<byte>(value);
        while (lengthToExamine > offset)
        {
            var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset));
            if (Vector<byte>.Zero.Equals(matches))
            {
                offset += (nuint)Vector<byte>.Count;
                continue;
            }
            // Find offset of first match and add to current offset
            return (int)offset + LocateFirstFoundByte(matches);
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
```
This has a structure that's approximately:
```csharp
if (Avx2.IsSupported)
{
    ... // use Avx2
}
else if (Sse2.IsSupported)
{
    ... // use Sse2
}
else if (AdvSimd.Arm64.IsSupported)
{
    ... // use AdvSimd
}
else if (Vector.IsHardwareAccelerated)
{
    ... // use Vector / Vector<T>
}
```
We have others like:
`runtime/src/libraries/System.Collections/src/System/Collections/BitArray.cs`, lines 152 to 224 at commit `9f93bcb`:
```csharp
if (Avx2.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    Vector256<byte> zero = Vector256<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector256ByteCount) <= (uint)values.Length; i += Vector256ByteCount)
        {
            Vector256<byte> vector = Avx.LoadVector256((byte*)ptr + i);
            Vector256<byte> isFalse = Avx2.CompareEqual(vector, zero);
            int result = Avx2.MoveMask(isFalse);
            m_array[i / 32u] = ~result;
        }
    }
}
else if (Sse2.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    Vector128<byte> zero = Vector128<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u)
        {
            Vector128<byte> lowerVector = Sse2.LoadVector128((byte*)ptr + i);
            Vector128<byte> lowerIsFalse = Sse2.CompareEqual(lowerVector, zero);
            int lowerPackedIsFalse = Sse2.MoveMask(lowerIsFalse);
            Vector128<byte> upperVector = Sse2.LoadVector128((byte*)ptr + i + Vector128<byte>.Count);
            Vector128<byte> upperIsFalse = Sse2.CompareEqual(upperVector, zero);
            int upperPackedIsFalse = Sse2.MoveMask(upperIsFalse);
            m_array[i / 32u] = ~((upperPackedIsFalse << 16) | lowerPackedIsFalse);
        }
    }
}
else if (AdvSimd.Arm64.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    // However comparison against zero can be replaced to cmeq against zero (vceqzq_s8)
    // See dotnet/runtime#33972 for details
    Vector128<byte> zero = Vector128<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u)
        {
            // Same logic as SSE2 path, however we lack MoveMask (equivalent) instruction
            // As a workaround, mask out the relevant bit after comparison
            // and combine by ORing all of them together (In this case, adding all of them does the same thing)
            Vector128<byte> lowerVector = AdvSimd.LoadVector128((byte*)ptr + i);
            Vector128<byte> lowerIsFalse = AdvSimd.CompareEqual(lowerVector, zero);
            Vector128<byte> bitsExtracted1 = AdvSimd.And(lowerIsFalse, s_bitMask128);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            Vector128<short> lowerPackedIsFalse = bitsExtracted1.AsInt16();
            Vector128<byte> upperVector = AdvSimd.LoadVector128((byte*)ptr + i + Vector128<byte>.Count);
            Vector128<byte> upperIsFalse = AdvSimd.CompareEqual(upperVector, zero);
            Vector128<byte> bitsExtracted2 = AdvSimd.And(upperIsFalse, s_bitMask128);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            Vector128<short> upperPackedIsFalse = bitsExtracted2.AsInt16();
            int result = AdvSimd.Arm64.ZipLow(lowerPackedIsFalse, upperPackedIsFalse).AsInt32().ToScalar();
            if (!BitConverter.IsLittleEndian)
            {
                result = BinaryPrimitives.ReverseEndianness(result);
            }
            m_array[i / 32u] = ~result;
        }
    }
}
```
that similarly have paths for Avx2, Sse2, and AdvSimd but without a Vector path.
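For a case like this, adding the missing Vector branch would just mean prepending one more branch to the same ladder, in the same elided style as the sketch above. This is a hypothetical outline only; the branch body would need some way to pack the Vector.Equals result into bits, which the per-ISA paths above handle with MoveMask or the AddPairwise workaround:

```csharp
if (Vector.IsHardwareAccelerated)
{
    ... // pack Vector<byte>.Count bools per iteration: 32 on AVX2 hardware, 16 on SSE2/AdvSimd
}
else if (Avx2.IsSupported)
{
    ... // existing path, deletable once the Vector path proves equivalent
}
// and likewise for the Sse2 and AdvSimd branches
```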
This is a lot of complicated code being duplicated, and it should no longer be necessary. In most cases, the path using Vector<T> should produce code identical to what we'd otherwise write manually using hardware intrinsics, automatically picking 32-byte or 16-byte widths based on what the hardware supports. So, for any of these cases that have Avx2/Sse2/AdvSimd/Vector paths, we should theoretically be able to delete the Avx2/Sse2/AdvSimd code paths and leave only the Vector code path: if the Avx2 path would have been taken, Vector will similarly use 32-byte widths, and if Avx2 isn't supported but Sse2/AdvSimd is, Vector will similarly use 16-byte widths. And for the cases that today use Avx2/Sse2/AdvSimd without a Vector path, we should be able to add a Vector path and then delete the others, for the same reason.
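Concretely, the first example above would then collapse to just its final branch. A minimal sketch, reusing the helpers and locals (GetByteVectorSpanLength, LoadVector, LocateFirstFoundByte, offset, length, lengthToExamine) from the snippet above:

```csharp
if (Vector.IsHardwareAccelerated)
{
    if (offset < (nuint)(uint)length)
    {
        // Vector<byte>.Count is 32 on AVX2 hardware and 16 on SSE2/AdvSimd-only
        // hardware, so this single loop covers all three deleted branches.
        lengthToExamine = GetByteVectorSpanLength(offset, length);
        Vector<byte> values = new Vector<byte>(value);
        while (lengthToExamine > offset)
        {
            Vector<byte> matches = Vector.Equals(values, LoadVector(ref searchSpace, offset));
            if (Vector<byte>.Zero.Equals(matches))
            {
                // No lanes matched; advance by one full vector width
                offset += (nuint)Vector<byte>.Count;
                continue;
            }
            // Find offset of first match and add to current offset
            return (int)offset + LocateFirstFoundByte(matches);
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
```

One caveat when collapsing: the Avx2 branch also carries an alignment prologue (the initial Vector128 step) to avoid faulting across pages in unbounded searches; whether the plain Vector loop needs an equivalent would have to be considered.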
For implementations that have no Avx2 path and instead only have Sse2/AdvSimd/Vector paths, it's less clear whether we could delete the Sse2/AdvSimd implementations, as there could be performance differences (for better or worse) if the Vector path ended up using 32-byte widths.
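If in doubt, the width Vector<T> actually resolves to on a given machine is easy to check. A hypothetical diagnostic, not part of the proposed change:

```csharp
using System;
using System.Numerics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;

// Prints 32 on AVX2 hardware and 16 on SSE2-only or AdvSimd hardware; that
// width difference is what could help or hurt when a Vector path replaces a
// hand-written 16-byte Sse2/AdvSimd path.
Console.WriteLine($"Vector<byte>.Count = {Vector<byte>.Count}");
Console.WriteLine($"Avx2: {Avx2.IsSupported}, Sse2: {Sse2.IsSupported}, AdvSimd.Arm64: {AdvSimd.Arm64.IsSupported}");
```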
cc: @tannergooding, @benaadams