diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 7bbb8b5415b01a..0f860979b74fa0 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -3231,6 +3231,18 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
     assert(size <= INT32_MAX);
     assert(dstOffset < (INT32_MAX - static_cast<int>(size)));
 
+    auto emitStore = [&](instruction ins, unsigned width, regNumber target) {
+        if (dstLclNum != BAD_VAR_NUM)
+        {
+            emit->emitIns_S_R(ins, EA_ATTR(width), target, dstLclNum, dstOffset);
+        }
+        else
+        {
+            emit->emitIns_ARX_R(ins, EA_ATTR(width), target, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale,
+                                dstOffset);
+        }
+    };
+
 #ifdef FEATURE_SIMD
     if (willUseSimdMov)
     {
@@ -3244,18 +3256,6 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         instruction simdMov      = simdUnalignedMovIns();
         unsigned    bytesWritten = 0;
 
-        auto emitSimdMovs = [&]() {
-            if (dstLclNum != BAD_VAR_NUM)
-            {
-                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
-            }
-            else
-            {
-                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
-                                    dstAddrIndexScale, dstOffset);
-            }
-        };
-
         while (bytesWritten < size)
         {
             if (bytesWritten + regSize > size)
@@ -3264,7 +3264,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
                 break;
             }
 
-            emitSimdMovs();
+            emitStore(simdMov, regSize, srcXmmReg);
             dstOffset += regSize;
             bytesWritten += regSize;
         }
@@ -3279,10 +3279,71 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
             // Rewind dstOffset so we can fit a vector for the whole remainder
             dstOffset -= (regSize - size);
-            emitSimdMovs();
+            emitStore(simdMov, regSize, srcXmmReg);
             size = 0;
         }
     }
+    else if (node->IsOnHeapAndContainsReferences() && ((internalRegisters.GetAll(node) & RBM_ALLFLOAT) != 0))
+    {
+        // For blocks with GC refs we can still use SIMD, but only for the contiguous
+        // non-GC parts, where the atomicity guarantees are less strict.
+        assert(!willUseSimdMov);
+        ClassLayout* layout = node->GetLayout();
+
+        regNumber simdZeroReg = REG_NA;
+        unsigned  slots       = layout->GetSlotCount();
+        unsigned  slot        = 0;
+        while (slot < slots)
+        {
+            if (!layout->IsGCPtr(slot))
+            {
+                // How many contiguous non-GC slots do we have?
+                unsigned nonGcSlotCount = 0;
+                do
+                {
+                    nonGcSlotCount++;
+                    slot++;
+                } while ((slot < slots) && !layout->IsGCPtr(slot));
+
+                for (unsigned nonGcSlot = 0; nonGcSlot < nonGcSlotCount; nonGcSlot++)
+                {
+                    // Are the remaining contiguous non-GC slots wide enough to use SIMD?
+                    unsigned simdSize = compiler->roundDownSIMDSize((nonGcSlotCount - nonGcSlot) * REGSIZE_BYTES);
+                    if (simdSize > 0)
+                    {
+                        // Initialize simdZeroReg with zero on demand
+                        if (simdZeroReg == REG_NA)
+                        {
+                            simdZeroReg = internalRegisters.GetSingle(node, RBM_ALLFLOAT);
+                            // A SIMD16 zero is sufficient for any SIMD size
+                            simd_t vecCon = {};
+                            genSetRegToConst(simdZeroReg, TYP_SIMD16, &vecCon);
+                        }
+
+                        emitStore(simdUnalignedMovIns(), simdSize, simdZeroReg);
+                        dstOffset += (int)simdSize;
+                        nonGcSlot += (simdSize / REGSIZE_BYTES) - 1;
+                    }
+                    else
+                    {
+                        emitStore(INS_mov, REGSIZE_BYTES, srcIntReg);
+                        dstOffset += REGSIZE_BYTES;
+                    }
+                }
+            }
+            else
+            {
+                // GC slot - update atomically
+                emitStore(INS_mov, REGSIZE_BYTES, srcIntReg);
+                dstOffset += REGSIZE_BYTES;
+                slot++;
+            }
+        }
+
+        // There are no trailing elements
+        assert((layout->GetSize() % TARGET_POINTER_SIZE) == 0);
+        size = 0;
+    }
 #endif // FEATURE_SIMD
 
     assert((srcIntReg != REG_NA) || (size == 0));
@@ -3298,15 +3359,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 
     for (; size > regSize; size -= regSize, dstOffset += regSize)
     {
-        if (dstLclNum != BAD_VAR_NUM)
-        {
-            emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
-        }
-        else
-        {
-            emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
-                                dstAddrIndexScale, dstOffset);
-        }
+        emitStore(INS_mov, regSize, srcIntReg);
     }
 
     // Handle the non-SIMD remainder by overlapping with previously processed data if needed
@@ -3322,15 +3375,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         assert(shiftBack <= regSize);
         dstOffset -= shiftBack;
 
-        if (dstLclNum != BAD_VAR_NUM)
-        {
-            emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
-        }
-        else
-        {
-            emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
-                                dstAddrIndexScale, dstOffset);
-        }
+        emitStore(INS_mov, regSize, srcIntReg);
     }
 #else // TARGET_X86
     for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize)
     {
         while (regSize > size)
         {
             regSize /= 2;
         }
-        if (dstLclNum != BAD_VAR_NUM)
-        {
-            emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
-        }
-        else
-        {
-            emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
-                                dstAddrIndexScale, dstOffset);
-        }
+
+        emitStore(INS_mov, regSize, srcIntReg);
     }
 #endif
 }
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 8d376cff4e725c..eabfc8b4afc8bb 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1430,9 +1430,30 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
         {
             case GenTreeBlk::BlkOpKindUnroll:
             {
-                const bool canUse16BytesSimdMov =
-                    !blkNode->IsOnHeapAndContainsReferences() && compiler->IsBaselineSimdIsaSupported();
-                const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
+                bool willUseSimdMov = compiler->IsBaselineSimdIsaSupported() && (size >= XMM_REGSIZE_BYTES);
+                if (willUseSimdMov && blkNode->IsOnHeapAndContainsReferences())
+                {
+                    ClassLayout* layout = blkNode->GetLayout();
+
+                    unsigned xmmCandidates   = 0;
+                    unsigned continuousNonGc = 0;
+                    for (unsigned slot = 0; slot < layout->GetSlotCount(); slot++)
+                    {
+                        if (layout->IsGCPtr(slot))
+                        {
+                            xmmCandidates += ((continuousNonGc * TARGET_POINTER_SIZE) / XMM_REGSIZE_BYTES);
+                            continuousNonGc = 0;
+                        }
+                        else
+                        {
+                            continuousNonGc++;
+                        }
+                    }
+                    xmmCandidates += ((continuousNonGc * TARGET_POINTER_SIZE) / XMM_REGSIZE_BYTES);
+
+                    // Just one XMM candidate is not profitable
+                    willUseSimdMov = xmmCandidates > 1;
+                }
 
                 if (willUseSimdMov)
                 {
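To make the new codegen path easier to follow, here is a minimal standalone sketch (not JIT code) of the slot walk in the codegenxarch.cpp hunk above: contiguous non-GC runs are zeroed with the widest SIMD store that fits, while GC slots are always written with individual pointer-sized movs so the GC never observes a torn pointer. The example layout, the fixed REGSIZE_BYTES, and the XMM-only roundDownSimd helper are assumptions for illustration; the real code queries ClassLayout::IsGCPtr and Compiler::roundDownSIMDSize.

// Standalone illustration of the GC-aware InitBlk unroll walk; not JIT code.
#include <cstdio>
#include <vector>

constexpr unsigned REGSIZE_BYTES     = 8;  // TARGET_POINTER_SIZE on x64
constexpr unsigned XMM_REGSIZE_BYTES = 16;

// Mimics Compiler::roundDownSIMDSize on a baseline-SIMD (XMM-only) target:
// round the byte count down to the largest supported SIMD width, or 0.
unsigned roundDownSimd(unsigned bytes)
{
    return (bytes >= XMM_REGSIZE_BYTES) ? XMM_REGSIZE_BYTES : 0;
}

int main()
{
    // Hypothetical layout, slot[i] == true means "GC pointer":
    // struct { object o; long a, b, c, d; object p; long e; }
    std::vector<bool> isGcPtr = {true, false, false, false, false, true, false};

    unsigned slots     = (unsigned)isGcPtr.size();
    unsigned slot      = 0;
    int      dstOffset = 0;
    while (slot < slots)
    {
        if (!isGcPtr[slot])
        {
            // Length of the contiguous non-GC run starting at this slot.
            unsigned run = 0;
            do
            {
                run++;
                slot++;
            } while ((slot < slots) && !isGcPtr[slot]);

            for (unsigned i = 0; i < run; i++)
            {
                unsigned simdSize = roundDownSimd((run - i) * REGSIZE_BYTES);
                if (simdSize > 0)
                {
                    printf("movups [dst+%02d], xmm0   ; %u-byte SIMD zero\n", dstOffset, simdSize);
                    dstOffset += (int)simdSize;
                    i += (simdSize / REGSIZE_BYTES) - 1; // skip the slots just covered
                }
                else
                {
                    printf("mov    [dst+%02d], rax    ; 8-byte zero\n", dstOffset);
                    dstOffset += REGSIZE_BYTES;
                }
            }
        }
        else
        {
            // GC slot: must remain a single pointer-sized (atomic) store.
            printf("mov    [dst+%02d], rax    ; GC slot, atomic\n", dstOffset);
            dstOffset += REGSIZE_BYTES;
            slot++;
        }
    }
    return 0;
}

On an AVX-capable target roundDownSIMDSize can return 32 (or 64 with AVX-512), so the four-slot run above would collapse into a single wider store; the sketch pins the width to XMM for simplicity.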
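The lsraxarch.cpp change gates all of this behind a cheap profitability count at register-allocation time. Below is a sketch of that heuristic under the same assumptions; wouldUseSimdMov and the boolean GC bitmaps are hypothetical stand-ins for the ClassLayout queries.

// Standalone illustration of the LSRA profitability check; not JIT code.
#include <cstdio>
#include <vector>

constexpr unsigned TARGET_POINTER_SIZE = 8; // x64
constexpr unsigned XMM_REGSIZE_BYTES   = 16;

// Count how many XMM-wide stores the contiguous non-GC runs of a layout
// would yield, mirroring the loop added to LinearScan::BuildBlockStore.
bool wouldUseSimdMov(const std::vector<bool>& isGcPtr)
{
    unsigned xmmCandidates   = 0;
    unsigned continuousNonGc = 0;
    for (bool gc : isGcPtr)
    {
        if (gc)
        {
            xmmCandidates += (continuousNonGc * TARGET_POINTER_SIZE) / XMM_REGSIZE_BYTES;
            continuousNonGc = 0;
        }
        else
        {
            continuousNonGc++;
        }
    }
    xmmCandidates += (continuousNonGc * TARGET_POINTER_SIZE) / XMM_REGSIZE_BYTES;

    // A lone XMM store does not pay for materializing the zero vector first.
    return xmmCandidates > 1;
}

int main()
{
    // { object; long; long }: one 16-byte run -> 1 candidate -> scalar movs win.
    printf("%d\n", wouldUseSimdMov({true, false, false}));
    // The layout from the first sketch: 2 candidates -> SIMD pays off.
    printf("%d\n", wouldUseSimdMov({true, false, false, false, false, true, false}));
    return 0;
}

The threshold matches the codegen side: a single XMM candidate would spend an internal float register and an xorps just to replace two scalar movs, so SIMD is only requested when the layout yields at least two XMM-wide candidates.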