Skip to content

Commit

Permalink
Allow multiple kmask registers to be allocated and cleanup some codeg…
Browse files Browse the repository at this point in the history
…en around them (#89059)

* Allow multiple kmask registers to be allocated and cleanup some codegen around them

* Apply formatting patch

* Fix an assert to include TYP_STRUCT

* Ensure kmask registers aren't in the default killset

* Apply formatting patch

* Move the kmask optimizations up to morph

* Ensure unique VN for ConvertMaskToVector

* Ensure some other basic handling for kmask testing is in place

* Improve the implementation for some managed Vector512 code paths

* Apply formatting patch

* Ensure that the knot intrinsic is inserted into the IR

* Apply formatting patch

* Ensure the conversion of CompareEqualMask(x, zero) to Test(x, x) doesn't happen for floating-point

* Have callee/callerSaveRegs() use an array based lookup

* Respond to PR feedback and try to reduce TP regression more

* Ensure PTEST doesn't try to handle something utilizing embedded broadcast
  • Loading branch information
tannergooding committed Jul 25, 2023
1 parent 764f774 commit 90df613
Show file tree
Hide file tree
Showing 30 changed files with 1,472 additions and 584 deletions.
9 changes: 7 additions & 2 deletions src/coreclr/jit/codegencommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,19 @@ CodeGenInterface::CodeGenInterface(Compiler* theCompiler)
{
}

#if defined(TARGET_AMD64)
#if defined(TARGET_XARCH)
void CodeGenInterface::CopyRegisterInfo()
{
#if defined(TARGET_AMD64)
rbmAllFloat = compiler->rbmAllFloat;
rbmFltCalleeTrash = compiler->rbmFltCalleeTrash;
}
#endif // TARGET_AMD64

rbmAllMask = compiler->rbmAllMask;
rbmMskCalleeTrash = compiler->rbmMskCalleeTrash;
}
#endif // TARGET_XARCH

/*****************************************************************************/

CodeGen::CodeGen(Compiler* theCompiler) : CodeGenInterface(theCompiler)
Expand Down
24 changes: 19 additions & 5 deletions src/coreclr/jit/codegeninterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,19 +63,33 @@ class CodeGenInterface
regMaskTP rbmAllFloat;
regMaskTP rbmFltCalleeTrash;

// Call this function after the equivalent fields in Compiler have been initialized.
void CopyRegisterInfo();

regMaskTP get_RBM_ALLFLOAT() const
FORCEINLINE regMaskTP get_RBM_ALLFLOAT() const
{
return this->rbmAllFloat;
}
regMaskTP get_RBM_FLT_CALLEE_TRASH() const
FORCEINLINE regMaskTP get_RBM_FLT_CALLEE_TRASH() const
{
return this->rbmFltCalleeTrash;
}
#endif // TARGET_AMD64

#if defined(TARGET_XARCH)
// CodeGen-side copies of the Compiler's mask-register sets.
// CopyRegisterInfo() assigns them from compiler->rbmAllMask and
// compiler->rbmMskCalleeTrash respectively.
regMaskTP rbmAllMask;
regMaskTP rbmMskCalleeTrash;

// Call this function after the equivalent fields in Compiler have been initialized.
void CopyRegisterInfo();

// Returns the value of rbmAllMask held by this CodeGenInterface.
FORCEINLINE regMaskTP get_RBM_ALLMASK() const
{
return this->rbmAllMask;
}
// Returns the value of rbmMskCalleeTrash held by this CodeGenInterface.
FORCEINLINE regMaskTP get_RBM_MSK_CALLEE_TRASH() const
{
return this->rbmMskCalleeTrash;
}
#endif // TARGET_XARCH

// genSpillVar is called by compUpdateLifeVar.
// TODO-Cleanup: We should handle the spill directly in CodeGen, rather than
// calling it from compUpdateLifeVar. Then this can be non-virtual.
Expand Down
33 changes: 28 additions & 5 deletions src/coreclr/jit/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,25 +106,25 @@ inline bool _our_GetThreadCycles(unsigned __int64* cycleOut)
#endif // which host OS

const BYTE genTypeSizes[] = {
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) sz,
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) sz,
#include "typelist.h"
#undef DEF_TP
};

const BYTE genTypeAlignments[] = {
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) al,
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) al,
#include "typelist.h"
#undef DEF_TP
};

const BYTE genTypeStSzs[] = {
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) st,
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) st,
#include "typelist.h"
#undef DEF_TP
};

const BYTE genActualTypes[] = {
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) jitType,
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) jitType,
#include "typelist.h"
#undef DEF_TP
};
Expand Down Expand Up @@ -3379,9 +3379,32 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
rbmFltCalleeTrash |= RBM_HIGHFLOAT;
cntCalleeTrashFloat += CNT_CALLEE_TRASH_HIGHFLOAT;
}
#endif // TARGET_AMD64

#if defined(TARGET_XARCH)
rbmAllMask = RBM_ALLMASK_INIT;
rbmMskCalleeTrash = RBM_MSK_CALLEE_TRASH_INIT;
cntCalleeTrashMask = CNT_CALLEE_TRASH_MASK_INIT;

if (canUseEvexEncoding())
{
rbmAllMask |= RBM_ALLMASK_EVEX;
rbmMskCalleeTrash |= RBM_MSK_CALLEE_TRASH_EVEX;
cntCalleeTrashMask += CNT_CALLEE_TRASH_MASK_EVEX;
}

// Make sure we copy the register info and initialize the
// trash regs after the underlying fields are initialized

const regMaskTP vtCalleeTrashRegs[TYP_COUNT]{
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) ctr,
#include "typelist.h"
#undef DEF_TP
};
memcpy(varTypeCalleeTrashRegs, vtCalleeTrashRegs, sizeof(regMaskTP) * TYP_COUNT);

codeGen->CopyRegisterInfo();
#endif // TARGET_AMD64
#endif // TARGET_XARCH
}

#ifdef DEBUG
Expand Down
44 changes: 41 additions & 3 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -10953,21 +10953,59 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
unsigned cntCalleeTrashFloat;

public:
regMaskTP get_RBM_ALLFLOAT() const
FORCEINLINE regMaskTP get_RBM_ALLFLOAT() const
{
return this->rbmAllFloat;
}
regMaskTP get_RBM_FLT_CALLEE_TRASH() const
FORCEINLINE regMaskTP get_RBM_FLT_CALLEE_TRASH() const
{
return this->rbmFltCalleeTrash;
}
unsigned get_CNT_CALLEE_TRASH_FLOAT() const
FORCEINLINE unsigned get_CNT_CALLEE_TRASH_FLOAT() const
{
return this->cntCalleeTrashFloat;
}

#endif // TARGET_AMD64

#if defined(TARGET_XARCH)
private:
// The following are for initializing register allocator "constants" defined in targetamd64.h
// that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which adds
// 8 mask registers for use.
//
// NOTE(review): this block is guarded by TARGET_XARCH rather than TARGET_AMD64, so the
// corresponding x86 target header presumably applies as well -- confirm.
//
// Users of these values need to define four accessor functions:
//
// regMaskTP get_RBM_ALLMASK();
// regMaskTP get_RBM_MSK_CALLEE_TRASH();
// unsigned get_CNT_CALLEE_TRASH_MASK();
// unsigned get_AVAILABLE_REG_COUNT();
//
// which return the values of these variables.
//
// This was done to avoid polluting all `targetXXX.h` macro definitions with a compiler parameter, where only
// TARGET_XARCH requires one.
//
// Set in compInitOptions: starts as RBM_ALLMASK_INIT and has RBM_ALLMASK_EVEX OR'ed in
// when canUseEvexEncoding() is true.
regMaskTP rbmAllMask;
// Set in compInitOptions: starts as RBM_MSK_CALLEE_TRASH_INIT and has
// RBM_MSK_CALLEE_TRASH_EVEX OR'ed in when canUseEvexEncoding() is true.
regMaskTP rbmMskCalleeTrash;
// Set in compInitOptions: starts as CNT_CALLEE_TRASH_MASK_INIT and is increased by
// CNT_CALLEE_TRASH_MASK_EVEX when canUseEvexEncoding() is true.
unsigned cntCalleeTrashMask;
// Per-var_types table, filled in compInitOptions by memcpy from an array built out of
// the `ctr` column of typelist.h.
regMaskTP varTypeCalleeTrashRegs[TYP_COUNT];

public:
// Returns the current value of rbmAllMask.
FORCEINLINE regMaskTP get_RBM_ALLMASK() const
{
return this->rbmAllMask;
}
// Returns the current value of rbmMskCalleeTrash.
FORCEINLINE regMaskTP get_RBM_MSK_CALLEE_TRASH() const
{
return this->rbmMskCalleeTrash;
}
// Returns the current value of cntCalleeTrashMask.
FORCEINLINE unsigned get_CNT_CALLEE_TRASH_MASK() const
{
return this->cntCalleeTrashMask;
}
#endif // TARGET_XARCH

}; // end of class Compiler

//---------------------------------------------------------------------------------------------------------------------
Expand Down
8 changes: 6 additions & 2 deletions src/coreclr/jit/emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -590,13 +590,13 @@ void emitterStats(FILE* fout)
/*****************************************************************************/

const unsigned short emitTypeSizes[] = {
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) sze,
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) sze,
#include "typelist.h"
#undef DEF_TP
};

const unsigned short emitTypeActSz[] = {
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) asze,
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) asze,
#include "typelist.h"
#undef DEF_TP
};
Expand Down Expand Up @@ -747,6 +747,10 @@ void emitter::emitBegCG(Compiler* comp, COMP_HANDLE cmpHandle)
#if defined(TARGET_AMD64)
rbmFltCalleeTrash = emitComp->rbmFltCalleeTrash;
#endif // TARGET_AMD64

#if defined(TARGET_XARCH)
rbmMskCalleeTrash = emitComp->rbmMskCalleeTrash;
#endif // TARGET_XARCH
}

void emitter::emitEndCG()
Expand Down
11 changes: 10 additions & 1 deletion src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -2305,12 +2305,21 @@ class emitter
#if defined(TARGET_AMD64)
regMaskTP rbmFltCalleeTrash;

regMaskTP get_RBM_FLT_CALLEE_TRASH() const
FORCEINLINE regMaskTP get_RBM_FLT_CALLEE_TRASH() const
{
return this->rbmFltCalleeTrash;
}
#endif // TARGET_AMD64

#if defined(TARGET_XARCH)
// Emitter-side copy of the mask-register callee-trash set; emitBegCG assigns it
// from emitComp->rbmMskCalleeTrash.
regMaskTP rbmMskCalleeTrash;

// Returns the value of rbmMskCalleeTrash held by this emitter.
FORCEINLINE regMaskTP get_RBM_MSK_CALLEE_TRASH() const
{
return this->rbmMskCalleeTrash;
}
#endif // TARGET_XARCH

CORINFO_FIELD_HANDLE emitFltOrDblConst(double constValue, emitAttr attr);
#if defined(FEATURE_SIMD)
CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue);
Expand Down
33 changes: 30 additions & 3 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6246,12 +6246,25 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size)
case INS_kmovb_msk:
case INS_kmovw_msk:
case INS_kmovd_msk:
{
// Zero-extends the source
hasSideEffect = true;
break;
}

case INS_kmovq_msk:
{
// No side effect, register is 64-bits
hasSideEffect = false;
break;
}

case INS_kmovb_gpr:
case INS_kmovw_gpr:
case INS_kmovd_gpr:
case INS_kmovq_gpr:
{
// Zero-extends the source
hasSideEffect = true;
break;
}
Expand Down Expand Up @@ -6977,7 +6990,7 @@ void emitter::emitIns_R_R_C(instruction ins,
void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2)
{
assert(IsAvx512OrPriorInstruction(ins));
assert(IsThreeOperandAVXInstruction(ins));
assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins));

instrDesc* id = emitNewInstr(attr);
id->idIns(ins);
Expand Down Expand Up @@ -11557,7 +11570,7 @@ void emitter::emitDispIns(
case IF_RWR_RWR_RRD:
{
assert(IsVexOrEvexEncodableInstruction(ins));
assert(IsThreeOperandAVXInstruction(ins));
assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins));
regNumber reg2 = id->idReg2();
regNumber reg3 = id->idReg3();
if (ins == INS_bextr || ins == INS_bzhi
Expand Down Expand Up @@ -14956,7 +14969,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id)

instruction ins = id->idIns();
assert(IsVexOrEvexEncodableInstruction(ins));
assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins));
assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins) || IsKInstruction(ins));
regNumber targetReg = id->idReg1();
regNumber src1 = id->idReg2();
regNumber src2 = id->idReg3();
Expand Down Expand Up @@ -19172,6 +19185,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_6C : PERFSCORE_LATENCY_4C;
break;

case INS_vptestmb:
case INS_vptestmd:
case INS_vptestmq:
case INS_vptestmw:
case INS_vptestnmb:
case INS_vptestnmd:
case INS_vptestnmq:
case INS_vptestnmw:
{
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_4C;
break;
}

case INS_mpsadbw:
result.insThroughput = PERFSCORE_THROUGHPUT_2C;
result.insLatency += PERFSCORE_LATENCY_4C;
Expand Down
Loading

0 comments on commit 90df613

Please sign in to comment.