Delete redundant vectorization code paths #49846

@stephentoub

Description

We have a non-trivial number of implementations that duplicate loops across multiple hardware instruction sets plus a generalized Vector path, e.g.

if (Avx2.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        if ((((nuint)(uint)Unsafe.AsPointer(ref searchSpace) + offset) & (nuint)(Vector256<byte>.Count - 1)) != 0)
        {
            // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
            // with no upper bound e.g. String.strlen.
            // Start with a check on Vector128 to align to Vector256, before moving to processing Vector256.
            // This ensures we do not fault across memory pages while searching for an end of string.
            Vector128<byte> values = Vector128.Create(value);
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as below
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
            }
            else
            {
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            }
        }

        lengthToExamine = GetByteVector256SpanLength(offset, length);
        if (lengthToExamine > offset)
        {
            Vector256<byte> values = Vector256.Create(value);
            do
            {
                Vector256<byte> search = LoadVector256(ref searchSpace, offset);
                int matches = Avx2.MoveMask(Avx2.CompareEqual(values, search));
                // Note that MoveMask has converted the equal vector elements into a set of bit flags,
                // So the bit position in 'matches' corresponds to the element offset.
                if (matches == 0)
                {
                    // Zero flags set so no matches
                    offset += (nuint)Vector256<byte>.Count;
                    continue;
                }
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            } while (lengthToExamine > offset);
        }

        lengthToExamine = GetByteVector128SpanLength(offset, length);
        if (lengthToExamine > offset)
        {
            Vector128<byte> values = Vector128.Create(value);
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as above
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
            }
            else
            {
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            }
        }

        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (Sse2.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVector128SpanLength(offset, length);
        Vector128<byte> values = Vector128.Create(value);
        while (lengthToExamine > offset)
        {
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as above
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
                continue;
            }
            // Find bitflag offset of first match and add to current offset
            return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
        }

        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (AdvSimd.Arm64.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVector128SpanLength(offset, length);
        // Mask to help find the first lane in compareResult that is set.
        // MSB 0x10 corresponds to 1st lane, 0x01 corresponds to 0th lane and so forth.
        Vector128<byte> mask = Vector128.Create((ushort)0x1001).AsByte();
        int matchedLane = 0;
        Vector128<byte> values = Vector128.Create(value);
        while (lengthToExamine > offset)
        {
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            Vector128<byte> compareResult = AdvSimd.CompareEqual(values, search);
            if (!TryFindFirstMatchedLane(mask, compareResult, ref matchedLane))
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
                continue;
            }
            return (int)(offset + (uint)matchedLane);
        }

        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (Vector.IsHardwareAccelerated)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVectorSpanLength(offset, length);
        Vector<byte> values = new Vector<byte>(value);
        while (lengthToExamine > offset)
        {
            var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset));
            if (Vector<byte>.Zero.Equals(matches))
            {
                offset += (nuint)Vector<byte>.Count;
                continue;
            }
            // Find offset of first match and add to current offset
            return (int)offset + LocateFirstFoundByte(matches);
        }

        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}

has a structure that's approximately:

if (Avx2.IsSupported)
{
    ... // use Avx2
}
else if (Sse2.IsSupported)
{
    ... // use Sse2
}
else if (AdvSimd.Arm64.IsSupported)
{
    ... // use AdvSimd
}
else if (Vector.IsHardwareAccelerated)
{
    ... // use Vector / Vector<T>
}
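
Under the change proposed below, that whole ladder would (roughly speaking) collapse to just its final branch:

if (Vector.IsHardwareAccelerated)
{
    ... // use Vector / Vector<T>: 32-byte wide where Avx2 would have been used, 16-byte wide otherwise
}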

We have others like:

if (Avx2.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    Vector256<byte> zero = Vector256<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector256ByteCount) <= (uint)values.Length; i += Vector256ByteCount)
        {
            Vector256<byte> vector = Avx.LoadVector256((byte*)ptr + i);
            Vector256<byte> isFalse = Avx2.CompareEqual(vector, zero);
            int result = Avx2.MoveMask(isFalse);
            m_array[i / 32u] = ~result;
        }
    }
}
else if (Sse2.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    Vector128<byte> zero = Vector128<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u)
        {
            Vector128<byte> lowerVector = Sse2.LoadVector128((byte*)ptr + i);
            Vector128<byte> lowerIsFalse = Sse2.CompareEqual(lowerVector, zero);
            int lowerPackedIsFalse = Sse2.MoveMask(lowerIsFalse);

            Vector128<byte> upperVector = Sse2.LoadVector128((byte*)ptr + i + Vector128<byte>.Count);
            Vector128<byte> upperIsFalse = Sse2.CompareEqual(upperVector, zero);
            int upperPackedIsFalse = Sse2.MoveMask(upperIsFalse);

            m_array[i / 32u] = ~((upperPackedIsFalse << 16) | lowerPackedIsFalse);
        }
    }
}
else if (AdvSimd.Arm64.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    // However comparison against zero can be replaced to cmeq against zero (vceqzq_s8)
    // See dotnet/runtime#33972 for details
    Vector128<byte> zero = Vector128<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u)
        {
            // Same logic as SSE2 path, however we lack MoveMask (equivalent) instruction
            // As a workaround, mask out the relevant bit after comparison
            // and combine by ORing all of them together (In this case, adding all of them does the same thing)
            Vector128<byte> lowerVector = AdvSimd.LoadVector128((byte*)ptr + i);
            Vector128<byte> lowerIsFalse = AdvSimd.CompareEqual(lowerVector, zero);
            Vector128<byte> bitsExtracted1 = AdvSimd.And(lowerIsFalse, s_bitMask128);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            Vector128<short> lowerPackedIsFalse = bitsExtracted1.AsInt16();

            Vector128<byte> upperVector = AdvSimd.LoadVector128((byte*)ptr + i + Vector128<byte>.Count);
            Vector128<byte> upperIsFalse = AdvSimd.CompareEqual(upperVector, zero);
            Vector128<byte> bitsExtracted2 = AdvSimd.And(upperIsFalse, s_bitMask128);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            Vector128<short> upperPackedIsFalse = bitsExtracted2.AsInt16();

            int result = AdvSimd.Arm64.ZipLow(lowerPackedIsFalse, upperPackedIsFalse).AsInt32().ToScalar();
            if (!BitConverter.IsLittleEndian)
            {
                result = BinaryPrimitives.ReverseEndianness(result);
            }
            m_array[i / 32u] = ~result;
        }
    }
}

that similarly have paths for Avx2, Sse2, and AdvSimd but without a Vector path.

This is a lot of complicated code being duplicated, and it should no longer be necessary. In most cases, the path using Vector<T> should produce code identical to what we'd otherwise write manually using hardware intrinsics, automatically picking 32-byte or 16-byte widths based on what the hardware supports. So, for any of these cases that have Avx2/Sse2/AdvSimd/Vector paths, we should theoretically be able to simply delete the Avx2/Sse2/AdvSimd code paths and leave only the Vector code path: if Avx2 would be supported, Vector would similarly use 32-byte widths, and if Avx2 wouldn't be supported but Sse2/AdvSimd would, Vector would similarly use 16-byte widths. And for the cases that today use Avx2/Sse2/AdvSimd without a Vector path, we should be able to add a Vector path and then delete the others, for the same reason.
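
As a rough illustration of the shape this leaves behind, here's a minimal self-contained sketch of a Vector<T>-only byte search. This is not the actual SpanHelpers code above: the type/method names are made up, the per-lane scan stands in for LocateFirstFoundByte, and the scalar tail stands in for the SequentialScan label.

using System;
using System.Numerics;

internal static class VectorSketch
{
    // Sketch only: a single Vector<T> loop standing in for the Avx2/Sse2/AdvSimd/Vector ladder.
    // Vector<byte>.Count is 32 on AVX2-capable hardware and 16 when only SSE2/AdvSimd is available,
    // so the same loop runs at whichever width the machine supports.
    public static int IndexOf(ReadOnlySpan<byte> span, byte value)
    {
        int i = 0;

        if (Vector.IsHardwareAccelerated && span.Length >= Vector<byte>.Count)
        {
            Vector<byte> values = new Vector<byte>(value);
            int lastVectorStart = span.Length - Vector<byte>.Count;

            while (i <= lastVectorStart)
            {
                Vector<byte> matches = Vector.Equals(values, new Vector<byte>(span.Slice(i, Vector<byte>.Count)));
                if (!Vector<byte>.Zero.Equals(matches))
                {
                    // Some element matched; find the first set lane.
                    for (int j = 0; j < Vector<byte>.Count; j++)
                    {
                        if (matches[j] != 0)
                        {
                            return i + j;
                        }
                    }
                }

                i += Vector<byte>.Count;
            }
        }

        // Scalar tail (the real code jumps to its SequentialScan path instead).
        for (; i < span.Length; i++)
        {
            if (span[i] == value)
            {
                return i;
            }
        }

        return -1;
    }
}

On an AVX2 machine this compares 32 bytes per iteration; on SSE2/AdvSimd-only machines it compares 16, with no per-ISA branches in the source.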

For paths that don't have an Avx2 option, and instead just have paths for Sse2/AdvSimd/Vector, it's less clear whether we could delete the Sse2/AdvSimd implementations, as it's possible there could be performance differences (for better or worse) if the Vector path ended up using 32-byte widths.
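
If we wanted to sanity-check that concern on a given machine before benchmarking, a quick (hypothetical) diagnostic is just to print the widths involved:

using System;
using System.Numerics;
using System.Runtime.Intrinsics.X86;

// Hypothetical diagnostic: shows the width a Vector<T> path would run at on this machine,
// which is where an Sse2/AdvSimd (128-bit) path could silently become 256-bit.
Console.WriteLine($"Vector.IsHardwareAccelerated: {Vector.IsHardwareAccelerated}");
Console.WriteLine($"Vector<byte>.Count:           {Vector<byte>.Count}"); // 32 with AVX2, 16 with only SSE2/AdvSimd
Console.WriteLine($"Avx2.IsSupported:             {Avx2.IsSupported}");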

cc: @tannergooding, @benaadams
