We have a non-trivial number of implementations that duplicate loops across multiple hardware instruction sets plus a generalized Vector path, e.g.
`runtime/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs`, lines 266 to 434 at commit `9f93bcb`:
```csharp
if (Avx2.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        if ((((nuint)(uint)Unsafe.AsPointer(ref searchSpace) + offset) & (nuint)(Vector256<byte>.Count - 1)) != 0)
        {
            // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
            // with no upper bound e.g. String.strlen.
            // Start with a check on Vector128 to align to Vector256, before moving to processing Vector256.
            // This ensures we do not fault across memory pages while searching for an end of string.
            Vector128<byte> values = Vector128.Create(value);
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as below
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
            }
            else
            {
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            }
        }
        lengthToExamine = GetByteVector256SpanLength(offset, length);
        if (lengthToExamine > offset)
        {
            Vector256<byte> values = Vector256.Create(value);
            do
            {
                Vector256<byte> search = LoadVector256(ref searchSpace, offset);
                int matches = Avx2.MoveMask(Avx2.CompareEqual(values, search));
                // Note that MoveMask has converted the equal vector elements into a set of bit flags,
                // So the bit position in 'matches' corresponds to the element offset.
                if (matches == 0)
                {
                    // Zero flags set so no matches
                    offset += (nuint)Vector256<byte>.Count;
                    continue;
                }
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            } while (lengthToExamine > offset);
        }
        lengthToExamine = GetByteVector128SpanLength(offset, length);
        if (lengthToExamine > offset)
        {
            Vector128<byte> values = Vector128.Create(value);
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as above
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
            }
            else
            {
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            }
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (Sse2.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVector128SpanLength(offset, length);
        Vector128<byte> values = Vector128.Create(value);
        while (lengthToExamine > offset)
        {
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as above
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
                continue;
            }
            // Find bitflag offset of first match and add to current offset
            return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (AdvSimd.Arm64.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVector128SpanLength(offset, length);
        // Mask to help find the first lane in compareResult that is set.
        // MSB 0x10 corresponds to 1st lane, 0x01 corresponds to 0th lane and so forth.
        Vector128<byte> mask = Vector128.Create((ushort)0x1001).AsByte();
        int matchedLane = 0;
        Vector128<byte> values = Vector128.Create(value);
        while (lengthToExamine > offset)
        {
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            Vector128<byte> compareResult = AdvSimd.CompareEqual(values, search);
            if (!TryFindFirstMatchedLane(mask, compareResult, ref matchedLane))
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
                continue;
            }
            return (int)(offset + (uint)matchedLane);
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (Vector.IsHardwareAccelerated)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVectorSpanLength(offset, length);
        Vector<byte> values = new Vector<byte>(value);
        while (lengthToExamine > offset)
        {
            var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset));
            if (Vector<byte>.Zero.Equals(matches))
            {
                offset += (nuint)Vector<byte>.Count;
                continue;
            }
            // Find offset of first match and add to current offset
            return (int)offset + LocateFirstFoundByte(matches);
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
```
This has a structure that's approximately:
```csharp
if (Avx2.IsSupported)
{
    ... // use Avx2
}
else if (Sse2.IsSupported)
{
    ... // use Sse2
}
else if (AdvSimd.Arm64.IsSupported)
{
    ... // use AdvSimd
}
else if (Vector.IsHardwareAccelerated)
{
    ... // use Vector / Vector<T>
}
```
We have others like:
`runtime/src/libraries/System.Collections/src/System/Collections/BitArray.cs`, lines 152 to 224 at commit `9f93bcb`:
```csharp
if (Avx2.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    Vector256<byte> zero = Vector256<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector256ByteCount) <= (uint)values.Length; i += Vector256ByteCount)
        {
            Vector256<byte> vector = Avx.LoadVector256((byte*)ptr + i);
            Vector256<byte> isFalse = Avx2.CompareEqual(vector, zero);
            int result = Avx2.MoveMask(isFalse);
            m_array[i / 32u] = ~result;
        }
    }
}
else if (Sse2.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    Vector128<byte> zero = Vector128<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u)
        {
            Vector128<byte> lowerVector = Sse2.LoadVector128((byte*)ptr + i);
            Vector128<byte> lowerIsFalse = Sse2.CompareEqual(lowerVector, zero);
            int lowerPackedIsFalse = Sse2.MoveMask(lowerIsFalse);
            Vector128<byte> upperVector = Sse2.LoadVector128((byte*)ptr + i + Vector128<byte>.Count);
            Vector128<byte> upperIsFalse = Sse2.CompareEqual(upperVector, zero);
            int upperPackedIsFalse = Sse2.MoveMask(upperIsFalse);
            m_array[i / 32u] = ~((upperPackedIsFalse << 16) | lowerPackedIsFalse);
        }
    }
}
else if (AdvSimd.Arm64.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    // However comparison against zero can be replaced to cmeq against zero (vceqzq_s8)
    // See dotnet/runtime#33972 for details
    Vector128<byte> zero = Vector128<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u)
        {
            // Same logic as SSE2 path, however we lack MoveMask (equivalent) instruction
            // As a workaround, mask out the relevant bit after comparison
            // and combine by ORing all of them together (In this case, adding all of them does the same thing)
            Vector128<byte> lowerVector = AdvSimd.LoadVector128((byte*)ptr + i);
            Vector128<byte> lowerIsFalse = AdvSimd.CompareEqual(lowerVector, zero);
            Vector128<byte> bitsExtracted1 = AdvSimd.And(lowerIsFalse, s_bitMask128);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            Vector128<short> lowerPackedIsFalse = bitsExtracted1.AsInt16();
            Vector128<byte> upperVector = AdvSimd.LoadVector128((byte*)ptr + i + Vector128<byte>.Count);
            Vector128<byte> upperIsFalse = AdvSimd.CompareEqual(upperVector, zero);
            Vector128<byte> bitsExtracted2 = AdvSimd.And(upperIsFalse, s_bitMask128);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            Vector128<short> upperPackedIsFalse = bitsExtracted2.AsInt16();
            int result = AdvSimd.Arm64.ZipLow(lowerPackedIsFalse, upperPackedIsFalse).AsInt32().ToScalar();
            if (!BitConverter.IsLittleEndian)
            {
                result = BinaryPrimitives.ReverseEndianness(result);
            }
            m_array[i / 32u] = ~result;
        }
    }
}
```
that similarly have paths for Avx2, Sse2, and AdvSimd but without a Vector path.
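For a case like this, adding the missing Vector branch would just mean prepending one more branch to the same ladder, in the same elided style as the sketch above. This is a hypothetical outline only; the branch body would need some way to pack the Vector.Equals result into bits, which the per-ISA paths above handle with MoveMask or the AddPairwise workaround:

```csharp
if (Vector.IsHardwareAccelerated)
{
    ... // pack Vector<byte>.Count bools per iteration: 32 on AVX2 hardware, 16 on SSE2/AdvSimd
}
else if (Avx2.IsSupported)
{
    ... // existing path, deletable once the Vector path proves equivalent
}
// and likewise for the Sse2 and AdvSimd branches
```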
This is a lot of complicated code being duplicated, and it should no longer be necessary. In most cases, the path using Vector<T> should produce code identical to what we'd otherwise write manually using hardware intrinsics, automatically picking 32-byte or 16-byte widths based on what the hardware supports. So, for any of these cases that have Avx2/Sse2/AdvSimd/Vector paths, we should theoretically be able to delete the Avx2/Sse2/AdvSimd code paths and leave only the Vector code path: if the Avx2 path would have been taken, Vector will similarly use 32-byte widths, and if Avx2 isn't supported but Sse2/AdvSimd is, Vector will similarly use 16-byte widths. And for the cases that today use Avx2/Sse2/AdvSimd without a Vector path, we should be able to add a Vector path and then delete the others, for the same reason.
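Concretely, the first example above would then collapse to just its final branch. A minimal sketch, reusing the helpers and locals (GetByteVectorSpanLength, LoadVector, LocateFirstFoundByte, offset, length, lengthToExamine) from the snippet above:

```csharp
if (Vector.IsHardwareAccelerated)
{
    if (offset < (nuint)(uint)length)
    {
        // Vector<byte>.Count is 32 on AVX2 hardware and 16 on SSE2/AdvSimd-only
        // hardware, so this single loop covers all three deleted branches.
        lengthToExamine = GetByteVectorSpanLength(offset, length);
        Vector<byte> values = new Vector<byte>(value);
        while (lengthToExamine > offset)
        {
            Vector<byte> matches = Vector.Equals(values, LoadVector(ref searchSpace, offset));
            if (Vector<byte>.Zero.Equals(matches))
            {
                // No lanes matched; advance by one full vector width
                offset += (nuint)Vector<byte>.Count;
                continue;
            }
            // Find offset of first match and add to current offset
            return (int)offset + LocateFirstFoundByte(matches);
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
```

One caveat when collapsing: the Avx2 branch also carries an alignment prologue (the initial Vector128 step) to avoid faulting across pages in unbounded searches; whether the plain Vector loop needs an equivalent would have to be considered.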
For implementations that have no Avx2 path and instead only have Sse2/AdvSimd/Vector paths, it's less clear whether we could delete the Sse2/AdvSimd implementations, as there could be performance differences (for better or worse) if the Vector path ended up using 32-byte widths.
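If in doubt, the width Vector<T> actually resolves to on a given machine is easy to check. A hypothetical diagnostic, not part of the proposed change:

```csharp
using System;
using System.Numerics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;

// Prints 32 on AVX2 hardware and 16 on SSE2-only or AdvSimd hardware; that
// width difference is what could help or hurt when a Vector path replaces a
// hand-written 16-byte Sse2/AdvSimd path.
Console.WriteLine($"Vector<byte>.Count = {Vector<byte>.Count}");
Console.WriteLine($"Avx2: {Avx2.IsSupported}, Sse2: {Sse2.IsSupported}, AdvSimd.Arm64: {AdvSimd.Arm64.IsSupported}");
```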
cc: @tannergooding, @benaadams