Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for genReg1/genReg2->SIMD8 store on x86 windows. #52581

Merged
merged 3 commits into from
May 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 38 additions & 12 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1902,7 +1902,8 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
//
void CodeGen::genMultiRegStoreToSIMDLocal(GenTreeLclVar* lclNode)
{
#ifdef UNIX_AMD64_ABI
assert(varTypeIsSIMD(lclNode));

regNumber dst = lclNode->GetRegNum();
GenTree* op1 = lclNode->gtGetOp1();
GenTree* actualOp1 = op1->gtSkipReloadOrCopy();
Expand All @@ -1920,15 +1921,10 @@ void CodeGen::genMultiRegStoreToSIMDLocal(GenTreeLclVar* lclNode)
assert(retTypeDesc->GetReturnRegCount() == MAX_RET_REG_COUNT);

assert(regCount == 2);
assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(0)));
assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(1)));

// This is a case where the two 8-bytes that comprise the operand are in
// two different xmm registers and need to be assembled into a single
// xmm register.
regNumber targetReg = lclNode->GetRegNum();
regNumber reg0 = call->GetRegNumByIdx(0);
regNumber reg1 = call->GetRegNumByIdx(1);

regNumber reg0 = call->GetRegNumByIdx(0);
regNumber reg1 = call->GetRegNumByIdx(1);

if (op1->IsCopyOrReload())
{
Expand All @@ -1947,6 +1943,13 @@ void CodeGen::genMultiRegStoreToSIMDLocal(GenTreeLclVar* lclNode)
}
}

#ifdef UNIX_AMD64_ABI
assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(0)));
assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(1)));

// This is a case where the two 8-bytes that comprise the operand are in
// two different xmm registers and need to be assembled into a single
// xmm register.
if (targetReg != reg0 && targetReg != reg1)
{
// targetReg = reg0;
Expand Down Expand Up @@ -1979,9 +1982,32 @@ void CodeGen::genMultiRegStoreToSIMDLocal(GenTreeLclVar* lclNode)
inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, targetReg, 0x01);
}
genProduceReg(lclNode);
#else // !UNIX_AMD64_ABI
assert(!"Multireg store to SIMD reg not supported on Windows");
#endif // !UNIX_AMD64_ABI
#elif defined(TARGET_X86) && defined(TARGET_WINDOWS)
assert(varTypeIsIntegral(retTypeDesc->GetReturnRegType(0)));
assert(varTypeIsIntegral(retTypeDesc->GetReturnRegType(1)));
assert(lclNode->TypeIs(TYP_SIMD8));

// This is a case where a SIMD8 struct is returned as [EAX, EDX]
// and needs to be assembled into a single xmm register.
// Note that we can't check reg0=EAX, reg1=EDX because they could have already been moved.

inst_RV_RV(ins_Copy(reg0, TYP_FLOAT), targetReg, reg0, TYP_INT);
const emitAttr size = emitTypeSize(TYP_SIMD8);
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
GetEmitter()->emitIns_SIMD_R_R_R_I(INS_pinsrd, size, targetReg, targetReg, reg1, 1);
}
else
{
regNumber tempXmm = lclNode->GetSingleTempReg();
inst_RV_RV(ins_Copy(reg1, TYP_FLOAT), tempXmm, reg1, TYP_INT);
GetEmitter()->emitIns_SIMD_R_R_R(INS_punpckldq, size, targetReg, targetReg, tempXmm);
}
#elif defined(TARGET_WINDOWS) && defined(TARGET_AMD64)
assert(!"Multireg store to SIMD reg not supported on Windows x64");
#else
#error Unsupported or unset target architecture
#endif
}
#endif // FEATURE_SIMD

Expand Down
15 changes: 13 additions & 2 deletions src/coreclr/jit/instr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,11 +399,22 @@ void CodeGen::inst_RV(instruction ins, regNumber reg, var_types type, emitAttr s
* Generate a "op reg1, reg2" instruction.
*/

//------------------------------------------------------------------------
// inst_RV_RV: Generate a "op reg1, reg2" instruction.
//
// Arguments:
// ins - the instruction to generate;
// reg1 - the first register to use, the dst for most instructions;
// reg2 - the second register to use, the src for most instructions;
// type - the type used to derive the size attribute when size is not given, usually the type of the reg2 operand;
// size - the size attribute; the type arg is ignored if this arg is provided with an actual value;
// flags - whether flags are set for arm32.
//
void CodeGen::inst_RV_RV(instruction ins,
regNumber reg1,
regNumber reg2,
var_types type,
emitAttr size,
var_types type /* = TYP_I_IMPL */,
emitAttr size /* = EA_UNKNOWN */,
insFlags flags /* = INS_FLAGS_DONT_CARE */)
{
if (size == EA_UNKNOWN)
Expand Down
13 changes: 11 additions & 2 deletions src/coreclr/jit/lsrabuild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3338,12 +3338,11 @@ int LinearScan::BuildStoreLoc(GenTreeLclVarCommon* storeLoc)

// First, define internal registers.
#ifdef FEATURE_SIMD
RefPosition* internalFloatDef = nullptr;
if (varTypeIsSIMD(storeLoc) && !op1->IsCnsIntOrI() && (storeLoc->TypeGet() == TYP_SIMD12))
{
// Need an additional register to extract the upper 4 bytes of Vector3;
// it has to be a float register for x86.
internalFloatDef = buildInternalFloatRegisterDefForNode(storeLoc, allSIMDRegs());
buildInternalFloatRegisterDefForNode(storeLoc, allSIMDRegs());
}
#endif // FEATURE_SIMD

Expand All @@ -3360,6 +3359,16 @@ int LinearScan::BuildStoreLoc(GenTreeLclVarCommon* storeLoc)
{
BuildUse(op1, RBM_NONE, i);
}
#if defined(FEATURE_SIMD) && defined(TARGET_X86) && defined(TARGET_WINDOWS)
if (!compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
{
if (varTypeIsSIMD(storeLoc) && op1->IsCall())
{
// Need an additional register to create a SIMD8 from EAX/EDX without SSE4.1.
buildInternalFloatRegisterDefForNode(storeLoc, allSIMDRegs());
}
}
#endif // FEATURE_SIMD && TARGET_X86 && TARGET_WINDOWS
}
else if (op1->isContained() && op1->OperIs(GT_BITCAST))
{
Expand Down