Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3124,7 +3124,7 @@ void CodeGen::genLclHeap(GenTree* tree)

if (compiler->info.compInitMem)
{
if (amount <= LCLHEAP_UNROLL_LIMIT)
if (amount <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
{
// The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
// stp xzr, xzr, [sp, #-16]!
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/codegenloongarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2765,7 +2765,7 @@ void CodeGen::genCodeForDivMod(GenTreeOp* tree)
// Generate code for InitBlk by performing a loop unroll
// Preconditions:
// a) Both the size and fill byte value are integer constants.
// b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes.
// b) The size of the struct to initialize is smaller than getUnrollThreshold() bytes.
void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
{
assert(node->OperIs(GT_STORE_BLK));
Expand Down Expand Up @@ -6457,7 +6457,7 @@ void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode)
// None
//
// Assumption:
// The size argument of the CpBlk node is a constant and <= CPBLK_UNROLL_LIMIT bytes.
// The size argument of the CpBlk node is a constant and <= getUnrollThreshold() bytes.
//
void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
{
Expand Down
3 changes: 2 additions & 1 deletion src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3613,7 +3613,8 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
}

unsigned loadSize = putArgNode->GetArgLoadSize();
assert(!src->GetLayout(compiler)->HasGCPtr() && (loadSize <= CPBLK_UNROLL_LIMIT));
assert(!src->GetLayout(compiler)->HasGCPtr() &&
(loadSize <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memcpy)));

unsigned offset = 0;
regNumber xmmTmpReg = REG_NA;
Expand Down
47 changes: 47 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -8962,6 +8962,53 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#endif // FEATURE_SIMD

public:
// Kind of block operation being considered for unrolling.
// Passed to getUnrollThreshold() to select the size limit that applies.
enum UnrollKind
{
    Memset = 0, // Fill a block of memory with a given value
    Memcpy = 1  // Copy a block of memory from a source to a destination
};

//------------------------------------------------------------------------
// getUnrollThreshold: Get the size limit, in bytes, used to decide whether
//    a block operation of the given kind should be unrolled.
//
// Arguments:
//    type - the kind of block operation (Memset or Memcpy)
//
// Return Value:
//    The threshold in bytes. It is the per-step width chosen below
//    (pointer size, or a SIMD width when FEATURE_SIMD is defined),
//    doubled for Memset, and then multiplied by 4.
//
unsigned int getUnrollThreshold(UnrollKind type)
{
    // Baseline when no SIMD support is compiled in: one pointer-sized unit.
    unsigned threshold = TARGET_POINTER_SIZE;

#if defined(FEATURE_SIMD)
    // NOTE(review): presumably the widest SIMD vector size usable on the
    // current target, as implied by the table below - confirm.
    threshold = maxSIMDStructBytes();
#if defined(TARGET_ARM64)
    // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
    //
    //   ldp q0, q1, [x1]
    //   stp q0, q1, [x0]
    //
    threshold *= 2;
#elif defined(TARGET_XARCH)
    // Ignore AVX-512 for now: cap the per-step width at the YMM register size.
    // This must be an upper bound (not a lower bound), otherwise SSE-only
    // targets would be treated as if they had YMM-wide vectors and the values
    // would no longer match the table below.
    if (threshold > YMM_REGSIZE_BYTES)
    {
        threshold = YMM_REGSIZE_BYTES;
    }
#endif
#endif

    if (type == UnrollKind::Memset)
    {
        // Typically, memset-like operations require fewer instructions than memcpy
        threshold *= 2;
    }

    // Use 4 as a multiplier by default, thus, the final threshold will be:
    //
    // | arch        | memset | memcpy |
    // |-------------|--------|--------|
    // | x86 avx512  |   512  |   256  | (ignored for now)
    // | x86 avx     |   256  |   128  |
    // | x86 sse     |   128  |    64  |
    // | arm64       |   256  |   128  | ldp/stp (2x128bit)
    // | arm         |    32  |    16  | no SIMD support
    // | loongarch64 |    64  |    32  | no SIMD support
    //
    // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
    //
    return threshold * 4;
}

//------------------------------------------------------------------------
// largestEnregisterableStruct: The size in bytes of the largest struct that can be enregistered.
//
Expand Down
26 changes: 3 additions & 23 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,18 +527,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->SetOper(GT_STORE_BLK);
}

unsigned initBlockUnrollLimit = INITBLK_UNROLL_LIMIT;

#ifdef TARGET_ARM64
if (isDstAddrLocal)
{
// Since dstAddr points to the stack CodeGen can use more optimal
// quad-word store SIMD instructions for InitBlock.
initBlockUnrollLimit = INITBLK_LCL_UNROLL_LIMIT;
}
#endif

if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= initBlockUnrollLimit) && src->OperIs(GT_CNS_INT))
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)) &&
src->OperIs(GT_CNS_INT))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down Expand Up @@ -608,17 +598,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
}
}

unsigned copyBlockUnrollLimit = CPBLK_UNROLL_LIMIT;

#ifdef TARGET_ARM64
if (isSrcAddrLocal && isDstAddrLocal)
{
// Since both srcAddr and dstAddr point to the stack CodeGen can use more optimal
// quad-word load and store SIMD instructions for CopyBlock.
copyBlockUnrollLimit = CPBLK_LCL_UNROLL_LIMIT;
}
#endif

unsigned copyBlockUnrollLimit = comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy);
if (blkNode->OperIs(GT_STORE_OBJ))
{
if (!blkNode->AsObj()->GetLayout()->HasGCPtr())
Expand Down
7 changes: 4 additions & 3 deletions src/coreclr/jit/lowerloongarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->SetOper(GT_STORE_BLK);
}

if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT) && src->OperIs(GT_CNS_INT))
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= getUnrollThreshold(UnrollKind::Memset)) &&
src->OperIs(GT_CNS_INT))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down Expand Up @@ -353,7 +354,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
{
blkNode->SetOper(GT_STORE_BLK);
}
else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
else if (dstAddr->OperIsLocalAddr() && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
{
// If the size is small enough to unroll then we need to mark the block as non-interruptible
// to actually allow unrolling. The generated code does not report GC references loaded in the
Expand All @@ -371,7 +372,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down
8 changes: 4 additions & 4 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->SetOper(GT_STORE_BLK);
}

if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT))
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)))
{
if (!src->OperIs(GT_CNS_INT))
{
Expand Down Expand Up @@ -412,7 +412,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->SetOper(GT_STORE_BLK);
}
#ifndef JIT32_GCENCODER
else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
else if (dstAddr->OperIsLocalAddr() && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy)))
{
// If the size is small enough to unroll then we need to mark the block as non-interruptible
// to actually allow unrolling. The generated code does not report GC references loaded in the
Expand Down Expand Up @@ -472,7 +472,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
}
}
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy)))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down Expand Up @@ -655,7 +655,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
}
else
#endif // TARGET_X86
if (loadSize <= CPBLK_UNROLL_LIMIT)
if (loadSize <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy))
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
}
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/lsraarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,7 @@ int LinearScan::BuildNode(GenTree* tree)
// localloc.
sizeVal = AlignUp(sizeVal, STACK_ALIGN);

if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
if (sizeVal <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
{
// Need no internal registers
}
Expand Down
2 changes: 0 additions & 2 deletions src/coreclr/jit/targetamd64.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0

#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
#define INITBLK_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk.
#define CPOBJ_NONGC_SLOTS_LIMIT 4 // For CpObj code generation, this is the threshold of the number
// of contiguous non-gc slots that trigger generating rep movsq instead of
// sequences of movsq instructions
Expand Down
3 changes: 0 additions & 3 deletions src/coreclr/jit/targetarm.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0

#define CPBLK_UNROLL_LIMIT 32 // Upper bound to let the code generator to loop unroll CpBlk.
#define INITBLK_UNROLL_LIMIT 16 // Upper bound to let the code generator to loop unroll InitBlk.

#define FEATURE_FIXED_OUT_ARGS 1 // Preallocate the outgoing arg area in the prolog
#define FEATURE_STRUCTPROMOTE 1 // JIT Optimization to promote fields of structs into registers
#define FEATURE_MULTIREG_STRUCT_PROMOTE 0 // True when we want to promote fields of a multireg struct into registers
Expand Down
6 changes: 0 additions & 6 deletions src/coreclr/jit/targetarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0

#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk
#define CPBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack)
#define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk
#define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack)
#define LCLHEAP_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required)

#ifdef FEATURE_SIMD
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
#define FEATURE_PARTIAL_SIMD_CALLEE_SAVE 1 // Whether SIMD registers are partially saved at calls
Expand Down
3 changes: 0 additions & 3 deletions src/coreclr/jit/targetloongarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,6 @@
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0

#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
#define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk.

#ifdef FEATURE_SIMD
#pragma error("SIMD Unimplemented yet LOONGARCH")
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
Expand Down
2 changes: 0 additions & 2 deletions src/coreclr/jit/targetx86.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@

// TODO-CQ: Fine tune the following xxBlk threshold values:

#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
#define INITBLK_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk.
#define CPOBJ_NONGC_SLOTS_LIMIT 4 // For CpObj code generation, this is the threshold of the number
// of contiguous non-gc slots that trigger generating rep movsq instead of
// sequences of movsq instructions
Expand Down