From 654a8d5ff0cf1a7e0968c68a4b84aecf03ed9c1c Mon Sep 17 00:00:00 2001 From: Fei Peng Date: Fri, 19 Jan 2018 00:01:21 -0800 Subject: [PATCH] Merge SSE intrinsics into the table-driven framework --- src/jit/compiler.h | 2 +- src/jit/emitxarch.cpp | 44 +++-- src/jit/emitxarch.h | 19 +- src/jit/hwintrinsiccodegenxarch.cpp | 216 ++++++-------------- src/jit/hwintrinsiclistxarch.h | 132 ++++++------- src/jit/hwintrinsicxarch.cpp | 292 ++++++++-------------------- src/jit/instrsxarch.h | 1 - src/jit/lowerxarch.cpp | 12 +- src/jit/lsraxarch.cpp | 3 - src/jit/namedintrinsiclist.h | 26 ++- 10 files changed, 267 insertions(+), 480 deletions(-) diff --git a/src/jit/compiler.h b/src/jit/compiler.h index ea1c6b2ac443..f48a7b796561 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -3122,7 +3122,7 @@ class Compiler static int numArgsOfHWIntrinsic(NamedIntrinsic intrinsic); static instruction insOfHWIntrinsic(NamedIntrinsic intrinsic, var_types type); static HWIntrinsicCategory categoryOfHWIntrinsic(NamedIntrinsic intrinsic); - static HWIntrinsicFlag flagOfHWIntrinsic(NamedIntrinsic intrinsic); + static HWIntrinsicFlag flagsOfHWIntrinsic(NamedIntrinsic intrinsic); GenTree* getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass); GenTreeArgList* buildArgList(CORINFO_SIG_INFO* sig); #endif // _TARGET_XARCH_ diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index 0f6a9ff059b8..73f28d94c469 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -4228,6 +4228,7 @@ void emitter::emitIns_R_R_S_I( emitCurIGsize += sz; } +#ifdef DEBUG static bool isAvxBlendv(instruction ins) { return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb; @@ -4237,6 +4238,7 @@ static bool isSse41Blendv(instruction ins) { return ins == INS_blendvps || ins == INS_blendvpd || ins == INS_pblendvb; } +#endif void emitter::emitIns_R_R_R_R( instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, regNumber reg3) @@ -5216,23 +5218,23 @@ void emitter::emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, { emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_A(ins, emitTypeSize(simdtype), reg, indir, IF_RRW_ARD); + emitIns_R_A(ins, attr, reg, indir, IF_RRW_ARD); } } -void emitter::emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1, regNumber base, var_types simdtype) +void emitter::emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber base) { if (UseVEXEncoding()) { - emitIns_R_R_AR(ins, emitTypeSize(simdtype), reg, reg1, base, 0); + emitIns_R_R_AR(ins, attr, reg, reg1, base, 0); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_AR(ins, emitTypeSize(simdtype), reg, base, 0); + emitIns_R_AR(ins, attr, reg, base, 0); } } @@ -5325,70 +5327,70 @@ void emitter::emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, } void emitter::emitIns_SIMD_R_R_A_I( - instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival, var_types simdtype) + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival) { if (UseVEXEncoding()) { - emitIns_R_R_A_I(ins, emitTypeSize(simdtype), reg, reg1, indir, ival, IF_RWR_RRD_ARD_CNS); + emitIns_R_R_A_I(ins, attr, reg, reg1, indir, ival, IF_RWR_RRD_ARD_CNS); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_A_I(ins, emitTypeSize(simdtype), reg, indir, ival); + emitIns_R_A_I(ins, attr, reg, indir, ival); } } void emitter::emitIns_SIMD_R_R_C_I( - instruction ins, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival, var_types simdtype) + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival) { if (UseVEXEncoding()) { - emitIns_R_R_C_I(ins, emitTypeSize(simdtype), reg, reg1, fldHnd, offs, ival); + emitIns_R_R_C_I(ins, attr, reg, reg1, fldHnd, offs, ival); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_C_I(ins, emitTypeSize(simdtype), reg, fldHnd, offs, ival); + emitIns_R_C_I(ins, attr, reg, fldHnd, offs, ival); } } void emitter::emitIns_SIMD_R_R_R_I( - instruction ins, regNumber reg, regNumber reg1, regNumber reg2, int ival, var_types simdtype) + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int ival) { if (UseVEXEncoding()) { - emitIns_R_R_R_I(ins, emitTypeSize(simdtype), reg, reg1, reg2, ival); + emitIns_R_R_R_I(ins, attr, reg, reg1, reg2, ival); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_R_I(ins, emitTypeSize(simdtype), reg, reg2, ival); + emitIns_R_R_I(ins, attr, reg, reg2, ival); } } void emitter::emitIns_SIMD_R_R_S_I( - instruction ins, regNumber reg, regNumber reg1, int varx, int offs, int ival, var_types simdtype) + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs, int ival) { if (UseVEXEncoding()) { - emitIns_R_R_S_I(ins, emitTypeSize(simdtype), reg, reg1, varx, offs, ival); + emitIns_R_R_S_I(ins, attr, reg, reg1, varx, offs, ival); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_S_I(ins, emitTypeSize(simdtype), reg, varx, offs, ival); + emitIns_R_S_I(ins, attr, reg, varx, offs, ival); } } #endif diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h index 047344787116..f2e426d07a7c 100644 --- a/src/jit/emitxarch.h +++ b/src/jit/emitxarch.h @@ -455,19 +455,12 @@ void emitIns_R_AX(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp); #if FEATURE_HW_INTRINSICS -void emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1, regNumber base, var_types simdtype); -void emitIns_SIMD_R_R_A_I( - instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival, var_types simdtype); -void emitIns_SIMD_R_R_C_I(instruction ins, - regNumber reg, - regNumber reg1, - CORINFO_FIELD_HANDLE fldHnd, - int offs, - int ival, - var_types simdtype); -void emitIns_SIMD_R_R_R_I(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, int ival, var_types simdtype); -void emitIns_SIMD_R_R_S_I( - instruction ins, regNumber reg, regNumber reg1, int varx, int offs, int ival, var_types simdtype); +void emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber base); +void emitIns_SIMD_R_R_A_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival); +void emitIns_SIMD_R_R_C_I( + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival); +void emitIns_SIMD_R_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int ival); +void emitIns_SIMD_R_R_S_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs, int ival); void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir); void emitIns_SIMD_R_R_C( instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs); diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp index 6ea0de7e2332..69b3cf54baeb 100644 --- a/src/jit/hwintrinsiccodegenxarch.cpp +++ b/src/jit/hwintrinsiccodegenxarch.cpp @@ -33,12 +33,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // Return Value: // returns true if this category can be table-driven in CodeGen // -static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category) +static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags) { // TODO - make more categories to the table-driven framework - const bool tableDrivenIntrinsic = category == HW_Category_SimpleSIMD; - const bool nonTableDrivenIntrinsic = category == HW_Category_Special; - return tableDrivenIntrinsic && !nonTableDrivenIntrinsic; + // HW_Category_Helper and HW_Flag_MultiIns usually need manual codegen + const bool tableDrivenCategory = + category == HW_Category_SimpleSIMD || category == HW_Category_MemoryLoad || category == HW_Category_SIMDScalar; + const bool tableDrivenFlag = (flags & HW_Flag_MultiIns) == 0; + return tableDrivenCategory && tableDrivenFlag; } void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) @@ -46,12 +48,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) NamedIntrinsic intrinsicID = node->gtHWIntrinsicId; InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID); HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID); - HWIntrinsicFlag flag = Compiler::flagOfHWIntrinsic(intrinsicID); + HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID); + int ival = Compiler::ivalOfHWIntrinsic(intrinsicID); int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID); - assert((flag & HW_Flag_NoCodeGen) == 0); + assert((flags & HW_Flag_NoCodeGen) == 0); - if (genIsTableDrivenHWIntrinsic(category)) + if (genIsTableDrivenHWIntrinsic(category, flags)) { GenTree* op1 = node->gtGetOp1(); GenTree* op2 = node->gtGetOp2(); @@ -74,10 +77,35 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case 1: genConsumeOperands(node); op1Reg = op1->gtRegNum; - emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg); + if (category == HW_Category_MemoryLoad) + { + emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0); + } + else if (category == HW_Category_SIMDScalar && (flags & HW_Flag_CopyUpperBits) != 0) + { + emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg); + } + else + { + + emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg); + } break; + case 2: - genHWIntrinsic_R_R_RM(node, ins); + genConsumeOperands(node); + if (ival != -1) + { + genHWIntrinsic_R_R_RM_I(node, ins); + } + else if (category == HW_Category_MemoryLoad) + { + emit->emitIns_SIMD_R_R_AR(ins, emitTypeSize(TYP_SIMD16), targetReg, op1->gtRegNum, op2->gtRegNum); + } + else + { + genHWIntrinsic_R_R_RM(node, ins); + } break; case 3: { @@ -329,14 +357,14 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins) case GT_CLS_VAR_ADDR: { - emit->emitIns_SIMD_R_R_C_I(ins, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0, ival, - targetType); + emit->emitIns_SIMD_R_R_C_I(ins, emitTypeSize(targetType), targetReg, op1Reg, + memBase->gtClsVar.gtClsVarHnd, 0, ival); return; } default: { - emit->emitIns_SIMD_R_R_A_I(ins, targetReg, op1Reg, memIndir, ival, targetType); + emit->emitIns_SIMD_R_R_A_I(ins, emitTypeSize(targetType), targetReg, op1Reg, memIndir, ival); return; } } @@ -374,11 +402,11 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins) assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); assert(offset != (unsigned)-1); - emit->emitIns_SIMD_R_R_S_I(ins, targetReg, op1Reg, varNum, offset, ival, targetType); + emit->emitIns_SIMD_R_R_S_I(ins, emitTypeSize(targetType), targetReg, op1Reg, varNum, offset, ival); } else { - emit->emitIns_SIMD_R_R_R_I(ins, targetReg, op1Reg, op2->gtRegNum, ival, targetType); + emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(targetType), targetReg, op1Reg, op2->gtRegNum, ival); } } @@ -392,7 +420,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) regNumber targetReg = node->gtRegNum; var_types targetType = node->TypeGet(); var_types baseType = node->gtSIMDBaseType; - instruction ins = INS_invalid; + instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); regNumber op1Reg = REG_NA; regNumber op2Reg = REG_NA; @@ -408,28 +436,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) switch (intrinsicID) { - case NI_SSE_Add: - case NI_SSE_AddScalar: - case NI_SSE_And: - case NI_SSE_AndNot: case NI_SSE_ConvertToVector128SingleScalar: - case NI_SSE_Divide: - case NI_SSE_DivideScalar: - case NI_SSE_Max: - case NI_SSE_MaxScalar: - case NI_SSE_Min: - case NI_SSE_MinScalar: - case NI_SSE_MoveHighToLow: - case NI_SSE_MoveLowToHigh: - case NI_SSE_MoveScalar: - case NI_SSE_Multiply: - case NI_SSE_MultiplyScalar: - case NI_SSE_Or: - case NI_SSE_Subtract: - case NI_SSE_SubtractScalar: - case NI_SSE_UnpackHigh: - case NI_SSE_UnpackLow: - case NI_SSE_Xor: { assert(node->TypeGet() == TYP_SIMD16); assert(node->gtSIMDBaseType == TYP_FLOAT); @@ -439,51 +446,14 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) genHWIntrinsic_R_R_RM(node, ins); break; } - - case NI_SSE_CompareEqual: - case NI_SSE_CompareEqualScalar: - case NI_SSE_CompareGreaterThan: - case NI_SSE_CompareGreaterThanScalar: - case NI_SSE_CompareGreaterThanOrEqual: - case NI_SSE_CompareGreaterThanOrEqualScalar: - case NI_SSE_CompareLessThan: - case NI_SSE_CompareLessThanScalar: - case NI_SSE_CompareLessThanOrEqual: - case NI_SSE_CompareLessThanOrEqualScalar: - case NI_SSE_CompareNotEqual: - case NI_SSE_CompareNotEqualScalar: - case NI_SSE_CompareNotGreaterThan: - case NI_SSE_CompareNotGreaterThanScalar: - case NI_SSE_CompareNotGreaterThanOrEqual: - case NI_SSE_CompareNotGreaterThanOrEqualScalar: - case NI_SSE_CompareNotLessThan: - case NI_SSE_CompareNotLessThanScalar: - case NI_SSE_CompareNotLessThanOrEqual: - case NI_SSE_CompareNotLessThanOrEqualScalar: - case NI_SSE_CompareOrdered: - case NI_SSE_CompareOrderedScalar: - case NI_SSE_CompareUnordered: - case NI_SSE_CompareUnorderedScalar: - { - assert(node->TypeGet() == TYP_SIMD16); - assert(node->gtSIMDBaseType == TYP_FLOAT); - assert(Compiler::ivalOfHWIntrinsic(intrinsicID) != -1); - - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - genHWIntrinsic_R_R_RM_I(node, ins); - break; - } - case NI_SSE_CompareEqualOrderedScalar: case NI_SSE_CompareEqualUnorderedScalar: { assert(baseType == TYP_FLOAT); - op2Reg = op2->gtRegNum; - - regNumber tmpReg = node->GetSingleTempReg(); - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); + op2Reg = op2->gtRegNum; + regNumber tmpReg = node->GetSingleTempReg(); - emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg); emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg); emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg); emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg); @@ -498,8 +468,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); op2Reg = op2->gtRegNum; - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg); emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); break; @@ -511,8 +480,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); op2Reg = op2->gtRegNum; - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg); emit->emitIns_R(INS_setae, EA_1BYTE, targetReg); emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); break; @@ -524,8 +492,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); op2Reg = op2->gtRegNum; - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, op2Reg, op1Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg); emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); break; @@ -537,8 +504,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); op2Reg = op2->gtRegNum; - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, op2Reg, op1Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg); emit->emitIns_R(INS_setae, EA_1BYTE, targetReg); emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); break; @@ -550,10 +516,9 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); op2Reg = op2->gtRegNum; - regNumber tmpReg = node->GetSingleTempReg(); - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); + regNumber tmpReg = node->GetSingleTempReg(); - emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg); emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg); emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg); emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg); @@ -562,84 +527,23 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE_ConvertToInt32: - case NI_SSE_ConvertToInt32WithTruncation: - case NI_SSE_ConvertToInt64: - case NI_SSE_ConvertToInt64WithTruncation: - case NI_SSE_Reciprocal: - case NI_SSE_ReciprocalSqrt: - case NI_SSE_Sqrt: - { - assert(baseType == TYP_FLOAT); - assert(op2 == nullptr); - - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_SIMD16); - break; - } - case NI_SSE_ConvertToSingle: case NI_SSE_StaticCast: { assert(op2 == nullptr); if (op1Reg != targetReg) { - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg); } break; } - case NI_SSE_LoadAlignedVector128: - case NI_SSE_LoadScalar: - case NI_SSE_LoadVector128: - { - assert(baseType == TYP_FLOAT); - assert(op2 == nullptr); - - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_R_AR(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, 0); - break; - } - - case NI_SSE_LoadHigh: - case NI_SSE_LoadLow: - { - assert(baseType == TYP_FLOAT); - op2Reg = op2->gtRegNum; - - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R_AR(ins, targetReg, op1Reg, op2Reg, TYP_SIMD16); - break; - } - case NI_SSE_MoveMask: { assert(baseType == TYP_FLOAT); assert(op2 == nullptr); - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_INT); - break; - } - - case NI_SSE_ReciprocalScalar: - case NI_SSE_ReciprocalSqrtScalar: - case NI_SSE_SqrtScalar: - { - assert(baseType == TYP_FLOAT); - assert(op2 == nullptr); - - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R_R(ins, targetReg, op1Reg, op1Reg, TYP_SIMD16); - break; - } - - case NI_SSE_SetAllVector128: - { - assert(baseType == TYP_FLOAT); - assert(op2 == nullptr); - emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op1Reg, 0, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg); break; } @@ -651,12 +555,12 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) if (op1Reg == targetReg) { regNumber tmpReg = node->GetSingleTempReg(); - emit->emitIns_SIMD_R_R(INS_movaps, tmpReg, op1Reg, TYP_SIMD16); + emit->emitIns_R_R(INS_movaps, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg); op1Reg = tmpReg; } - emit->emitIns_SIMD_R_R_R(INS_xorps, targetReg, targetReg, targetReg, TYP_SIMD16); - emit->emitIns_SIMD_R_R_R(INS_movss, targetReg, targetReg, op1Reg, TYP_SIMD16); + emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg); + emit->emitIns_SIMD_R_R_R(INS_movss, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg); break; } @@ -665,7 +569,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); assert(op1 == nullptr); assert(op2 == nullptr); - emit->emitIns_SIMD_R_R_R(INS_xorps, targetReg, targetReg, targetReg, TYP_SIMD16); + emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg); break; } @@ -699,7 +603,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) if (op3->IsCnsIntOrI()) { ssize_t ival = op3->AsIntConCommon()->IconValue(); - emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op2Reg, (int)ival, TYP_SIMD16); + emit->emitIns_SIMD_R_R_R_I(INS_shufps, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, (int)ival); } else { @@ -749,7 +653,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) for (unsigned i = 0; i < jmpCount; i++) { genDefineTempLabel(jmpTable[i]); - emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op2Reg, i, TYP_SIMD16); + emit->emitIns_SIMD_R_R_R_I(INS_shufps, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, i); emit->emitIns_J(INS_jmp, switchTableEnd); } diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index f9ccf7fd8c27..ec9c9d70b054 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -15,7 +15,7 @@ 1) Each hardware intrinsic has a unique Intrinsic ID with type of `enum NamedIntrinsic` 2) All the overloads of an intrinsic in an ISA class share one Intrinsic ID 3) The intrinsic that generates instructions with a fixed imm8 operand has a `ival` field with "not -1" value, e.g., Sse.CompareEqual(v1,v2) -> cmpps xmm0, xmm1, 0 - 4) SIMD intrinsics have a non-zero `SIMD size` field based-on that operate over `Vector128` (16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_) or `Vector256` + 4) SIMD intrinsics have a non-zero `SIMD size` field based-on that operate over `Vector128`(16) or `Vector256`(32) 5) Scalar intrinsics that operate over general purpose registers (e.g., Sse41.Crc32) have `SIMD size` with 0 6) Each intrinsic has a `NumArg` for number of parameters, and some intrinsics that are overloaded on multiple parameter numbers have this field with -1 7) Each intrinsic has 10 `instructions` fields that list the instructions should be generated based-on the base type @@ -29,92 +29,86 @@ // SSE Intrinsics HARDWARE_INTRINSIC(SSE_IsSupported, "get_IsSupported", SSE, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_Add, "Add", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE_AddScalar, "AddScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_AddScalar, "AddScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE_And, "And", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE_AndNot, "AndNot", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Fixed) -HARDWARE_INTRINSIC(SSE_CompareEqual, "CompareEqual", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar, "CompareEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareEqualScalar, "CompareEqualScalar", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar, "CompareEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_AndNot, "AndNot", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareEqual, "CompareEqual", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar, "CompareEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareEqualScalar, "CompareEqualScalar", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar, "CompareEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE_CompareGreaterThan, "CompareGreaterThan", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar, "CompareGreaterThanScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar, "CompareGreaterThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar, "CompareGreaterThanScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar, "CompareGreaterThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqual, "CompareGreaterThanOrEqual", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar, "CompareGreaterThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar, "CompareGreaterThanOrEqualScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar, "CompareGreaterThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar, "CompareGreaterThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar, "CompareGreaterThanOrEqualScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar, "CompareGreaterThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE_CompareLessThan, "CompareLessThan", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar, "CompareLessThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanScalar, "CompareLessThanScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar, "CompareLessThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar, "CompareLessThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareLessThanScalar, "CompareLessThanScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar, "CompareLessThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqual, "CompareLessThanOrEqual", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar, "CompareLessThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar, "CompareLessThanOrEqualScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar, "CompareLessThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotEqual, "CompareNotEqual", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar, "CompareNotEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar, "CompareNotEqualScalar", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar, "CompareNotEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar, "CompareLessThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar, "CompareLessThanOrEqualScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar, "CompareLessThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareNotEqual, "CompareNotEqual", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar, "CompareNotEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar, "CompareNotEqualScalar", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar, "CompareNotEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE_CompareNotGreaterThan, "CompareNotGreaterThan", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar, "CompareNotGreaterThanScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar, "CompareNotGreaterThanScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqual, "CompareNotGreaterThanOrEqual", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar, "CompareNotGreaterThanOrEqualScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar, "CompareNotGreaterThanOrEqualScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_CompareNotLessThan, "CompareNotLessThan", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar, "CompareNotLessThanScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar, "CompareNotLessThanScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqual, "CompareNotLessThanOrEqual", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar, "CompareNotLessThanOrEqualScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar, "CompareNotLessThanOrEqualScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_CompareOrdered, "CompareOrdered", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareOrderedScalar, "CompareOrderedScalar", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareOrderedScalar, "CompareOrderedScalar", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_CompareUnordered, "CompareUnordered", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar, "CompareUnorderedScalar", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToInt32, "ConvertToInt32", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToInt64, "ConvertToInt64", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToSingle, "ConvertToSingle", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar, "ConvertToVector128SingleScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation, "ConvertToInt64WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_LoadScalar, "LoadScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Max, "Max", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Min, "Min", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MinScalar, "MinScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MoveHighToLow, "MoveHighToLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MoveLowToHigh, "MoveLowToHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MoveMask, "MoveMask", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MoveScalar, "MoveScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Multiply, "Multiply", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MultiplyScalar, "MultiplyScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar, "CompareUnorderedScalar", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToInt32, "ConvertToInt32", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_ConvertToInt64, "ConvertToInt64", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_ConvertToSingle, "ConvertToSingle", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_Helper, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar, "ConvertToVector128SingleScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss, INS_invalid}, HW_Category_Special, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation, "ConvertToInt64WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadScalar, "LoadScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Max, "Max", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_Min, "Min", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_MinScalar, "MinScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_MoveHighToLow, "MoveHighToLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_MoveLowToHigh, "MoveLowToHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_MoveMask, "MoveMask", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_MoveScalar, "MoveScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_Multiply, "Multiply", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_MultiplyScalar, "MultiplyScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE_Or, "Or", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_Reciprocal, "Reciprocal", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ReciprocalScalar, "ReciprocalScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ReciprocalScalar, "ReciprocalScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE_ReciprocalSqrt, "ReciprocalSqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalSqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SetAllVector128, "SetAllVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SetScalar, "SetScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SetVector128, "SetVector128", SSE, -1, 16, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SetZeroVector128, "SetZeroVector128", SSE, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalSqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE_SetAllVector128, "SetAllVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(SSE_SetScalar, "SetScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_Helper, HW_Flag_MultiIns) +HARDWARE_INTRINSIC(SSE_SetVector128, "SetVector128", SSE, -1, 16, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(SSE_SetZeroVector128, "SetZeroVector128", SSE, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_Helper, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_IMM, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_Sqrt, "Sqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_StaticCast, "StaticCast", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -//HARDWARE_INTRINSIC(SSE_Store, "Store", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -//HARDWARE_INTRINSIC(SSE_StoreAligned, "StoreAligned", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -//HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -//HARDWARE_INTRINSIC(SSE_StoreHigh, "StoreHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -//HARDWARE_INTRINSIC(SSE_StoreLow, "StoreLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -//HARDWARE_INTRINSIC(SSE_StoreScalar, "StoreScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE_StaticCast, "StaticCast", SSE, -1, 16, 1, {INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps}, HW_Category_Helper, HW_Flag_TwoTypeGeneric) HARDWARE_INTRINSIC(SSE_Subtract, "Subtract", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_UnpackHigh, "UnpackHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_UnpackLow, "UnpackLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Xor, "Xor", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Xor, "Xor", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) // SSE2 Intrinsics HARDWARE_INTRINSIC(SSE2_IsSupported, "get_IsSupported", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp index a96952db6cb6..80091f189fac 100644 --- a/src/jit/hwintrinsicxarch.cpp +++ b/src/jit/hwintrinsicxarch.cpp @@ -16,7 +16,7 @@ struct HWIntrinsicInfo int numArgs; instruction ins[10]; HWIntrinsicCategory category; - HWIntrinsicFlag flag; + HWIntrinsicFlag flags; }; static const HWIntrinsicInfo hwIntrinsicInfoArray[] = { @@ -192,7 +192,7 @@ int Compiler::ivalOfHWIntrinsic(NamedIntrinsic intrinsic) static unsigned simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig) { assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); - assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag & HW_Flag_UnfixedSIMDSize) == 0); + assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flags & HW_Flag_UnfixedSIMDSize) == 0); return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize; } @@ -248,19 +248,19 @@ HWIntrinsicCategory Compiler::categoryOfHWIntrinsic(NamedIntrinsic intrinsic) } //------------------------------------------------------------------------ -// HWIntrinsicFlag: get the flag of the given intrinsic +// HWIntrinsicFlag: get the flags of the given intrinsic // // Arguments: // intrinsic -- id of the intrinsic function. // // Return Value: -// the flag of the given intrinsic +// the flags of the given intrinsic // -HWIntrinsicFlag Compiler::flagOfHWIntrinsic(NamedIntrinsic intrinsic) +HWIntrinsicFlag Compiler::flagsOfHWIntrinsic(NamedIntrinsic intrinsic) { assert(intrinsic != NI_Illegal); assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); - return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag; + return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flags; } //------------------------------------------------------------------------ @@ -290,7 +290,7 @@ GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE assert(varTypeIsArithmetic(argType)); arg = impPopStack().val; assert(varTypeIsArithmetic(arg->TypeGet())); - assert(genTypeSize(argType) <= genTypeSize(arg->TypeGet())); + assert(genActualType(arg->gtType) == genActualType(argType)); } return arg; } @@ -436,12 +436,10 @@ GenTree* Compiler::impUnsupportedHWIntrinsic(unsigned helper, // Return Value: // returns true if this category can be table-driven in the importer // -static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category) +static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags) { - // TODO - make more categories to the table-driven framework - const bool tableDrivenIntrinsic = category == HW_Category_SimpleSIMD; - const bool nonTableDrivenIntrinsic = category == HW_Category_Special; - return tableDrivenIntrinsic && !nonTableDrivenIntrinsic; + // HW_Flag_NoCodeGen implies this intrinsic should be manually morphed in the importer. + return category != HW_Category_Special && category != HW_Category_Scalar && (flags & HW_Flag_NoCodeGen) == 0; } //------------------------------------------------------------------------ @@ -462,15 +460,24 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic, { InstructionSet isa = isaOfHWIntrinsic(intrinsic); HWIntrinsicCategory category = categoryOfHWIntrinsic(intrinsic); + HWIntrinsicFlag flags = flagsOfHWIntrinsic(intrinsic); int numArgs = sig->numArgs; - var_types callType = JITtype2varType(sig->retType); + var_types retType = JITtype2varType(sig->retType); + var_types baseType = TYP_UNKNOWN; + if (retType == TYP_STRUCT && featureSIMD) + { + unsigned int sizeBytes; + baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes); + retType = getSIMDTypeForSize(sizeBytes); + assert(sizeBytes != 0 && baseType != TYP_UNKNOWN); + } // This intrinsic is supported if // - the ISA is available on the underlying hardware (compSupports returns true) // - the compiler supports this hardware intrinsics (compSupportsHWIntrinsic returns true) // - intrinsics do not require 64-bit registers (r64) on 32-bit platforms (isTypeSupportedForIntrinsic returns // true) - bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa) && isTypeSupportedForIntrinsic(callType); + bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa) && isTypeSupportedForIntrinsic(retType); if (category == HW_Category_IsSupportedProperty) { @@ -481,22 +488,59 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic, { return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand); } + else if (category == HW_Category_IMM) + { + GenTree* lastOp = impStackTop().val; + if (!lastOp->IsCnsIntOrI() && !mustExpand) + { + // When the imm-argument is not a constant and we are not being forced to expand, we need to + // return nullptr so a GT_CALL to the intrinsic method is emitted instead. The + // intrinsic method is recursive and will be forced to expand, at which point + // we emit some less efficient fallback code. + return nullptr; + } + } + + if ((flags & HW_Flag_Generic) != 0) + { + assert(baseType != TYP_UNKNOWN); + // When the type argument is not a numeric type (and we are not being forced to expand), we need to + // return nullptr so a GT_CALL to the intrinsic method is emitted that will throw NotSupportedException + if (!varTypeIsArithmetic(baseType)) + { + assert(!mustExpand); + return nullptr; + } + + if ((flags & HW_Flag_TwoTypeGeneric) != 0) + { + // StaticCast has two type parameters. + assert(!mustExpand); + assert(numArgs == 1); + var_types srcType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)); + assert(srcType != TYP_UNKNOWN); + if (!varTypeIsArithmetic(srcType)) + { + return nullptr; + } + } + } // table-driven importer of simple intrinsics - if (impIsTableDrivenHWIntrinsic(category)) + if (impIsTableDrivenHWIntrinsic(category, flags)) { - unsigned int sizeBytes; - var_types baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes); - assert(baseType != TYP_UNKNOWN && sizeBytes != 0); - var_types retType = getSIMDTypeForSize(sizeBytes); + if (!varTypeIsSIMD(retType)) + { + baseType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)); + assert(baseType != TYP_UNKNOWN); + } + unsigned simdSize = simdSizeOfHWIntrinsic(intrinsic, sig); CORINFO_ARG_LIST_HANDLE argList = sig->args; CORINFO_CLASS_HANDLE argClass; var_types argType = TYP_UNKNOWN; - assert(numArgs > 0); - assert(retType != TYP_UNDEF); - assert(retType == TYP_SIMD16 || retType == TYP_SIMD32); + assert(numArgs >= 0); assert(insOfHWIntrinsic(intrinsic, baseType) != INS_invalid); assert(simdSize == 32 || simdSize == 16); @@ -506,10 +550,12 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic, switch (numArgs) { + case 0: + retNode = gtNewSimdHWIntrinsicNode(retType, intrinsic, baseType, simdSize); + break; case 1: argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); op1 = getArgForHWIntrinsic(argType, argClass); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); break; case 2: @@ -537,8 +583,7 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic, argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); op1 = getArgForHWIntrinsic(argType, argClass); - op1 = gtNewArgList(op1, op2, op3); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, intrinsic, baseType, simdSize); break; } default: @@ -653,11 +698,13 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig, bool mustExpand) { - GenTree* retNode = nullptr; - GenTree* op1 = nullptr; - GenTree* op2 = nullptr; - GenTree* op3 = nullptr; - GenTree* op4 = nullptr; + GenTree* retNode = nullptr; + GenTree* op1 = nullptr; + GenTree* op2 = nullptr; + GenTree* op3 = nullptr; + GenTree* op4 = nullptr; + int simdSize = simdSizeOfHWIntrinsic(intrinsic, sig); + assert(simdSize == 16); switch (intrinsic) { @@ -671,112 +718,14 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic, op2 = impPopStack().val; op1 = impPopStack().val; - GenTree* left = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op4, op3, NI_SSE_UnpackLow, TYP_FLOAT, 16); - GenTree* right = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, op1, NI_SSE_UnpackLow, TYP_FLOAT, 16); + GenTree* left = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op4, op3, NI_SSE_UnpackLow, TYP_FLOAT, simdSize); + GenTree* right = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, op1, NI_SSE_UnpackLow, TYP_FLOAT, simdSize); GenTree* control = gtNewIconNode(68, TYP_UBYTE); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, left, right, control, NI_SSE_Shuffle, TYP_FLOAT, 16); + retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, left, right, control, NI_SSE_Shuffle, TYP_FLOAT, simdSize); break; } - case NI_SSE_Shuffle: - { - assert(sig->numArgs == 3); - assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); - - op3 = impStackTop().val; - - if (op3->IsCnsIntOrI() || mustExpand) - { - impPopStack(); // Pop the value we peeked at - op2 = impSIMDPopStack(TYP_SIMD16); - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, op3, intrinsic, TYP_FLOAT, 16); - } - else - { - // When op3 is not a constant and we are not being forced to expand, we need to - // return nullptr so a GT_CALL to the intrinsic method is emitted instead. The - // intrinsic method is recursive and will be forced to expand, at which point - // we emit some less efficient fallback code. - - return nullptr; - } - break; - } - - case NI_SSE_Add: - case NI_SSE_AddScalar: - case NI_SSE_And: - case NI_SSE_AndNot: - case NI_SSE_CompareEqual: - case NI_SSE_CompareEqualScalar: - case NI_SSE_CompareGreaterThan: - case NI_SSE_CompareGreaterThanScalar: - case NI_SSE_CompareGreaterThanOrEqual: - case NI_SSE_CompareGreaterThanOrEqualScalar: - case NI_SSE_CompareLessThan: - case NI_SSE_CompareLessThanScalar: - case NI_SSE_CompareLessThanOrEqual: - case NI_SSE_CompareLessThanOrEqualScalar: - case NI_SSE_CompareNotEqual: - case NI_SSE_CompareNotEqualScalar: - case NI_SSE_CompareNotGreaterThan: - case NI_SSE_CompareNotGreaterThanScalar: - case NI_SSE_CompareNotGreaterThanOrEqual: - case NI_SSE_CompareNotGreaterThanOrEqualScalar: - case NI_SSE_CompareNotLessThan: - case NI_SSE_CompareNotLessThanScalar: - case NI_SSE_CompareNotLessThanOrEqual: - case NI_SSE_CompareNotLessThanOrEqualScalar: - case NI_SSE_CompareOrdered: - case NI_SSE_CompareOrderedScalar: - case NI_SSE_CompareUnordered: - case NI_SSE_CompareUnorderedScalar: - case NI_SSE_Divide: - case NI_SSE_DivideScalar: - case NI_SSE_Max: - case NI_SSE_MaxScalar: - case NI_SSE_Min: - case NI_SSE_MinScalar: - case NI_SSE_MoveHighToLow: - case NI_SSE_MoveLowToHigh: - case NI_SSE_MoveScalar: - case NI_SSE_Multiply: - case NI_SSE_MultiplyScalar: - case NI_SSE_Or: - case NI_SSE_Subtract: - case NI_SSE_SubtractScalar: - case NI_SSE_UnpackHigh: - case NI_SSE_UnpackLow: - case NI_SSE_Xor: - assert(sig->numArgs == 2); - assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); - op2 = impSIMDPopStack(TYP_SIMD16); - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16); - break; - - case NI_SSE_CompareEqualOrderedScalar: - case NI_SSE_CompareEqualUnorderedScalar: - case NI_SSE_CompareGreaterThanOrderedScalar: - case NI_SSE_CompareGreaterThanUnorderedScalar: - case NI_SSE_CompareGreaterThanOrEqualOrderedScalar: - case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar: - case NI_SSE_CompareLessThanOrderedScalar: - case NI_SSE_CompareLessThanUnorderedScalar: - case NI_SSE_CompareLessThanOrEqualOrderedScalar: - case NI_SSE_CompareLessThanOrEqualUnorderedScalar: - case NI_SSE_CompareNotEqualOrderedScalar: - case NI_SSE_CompareNotEqualUnorderedScalar: - assert(sig->numArgs == 2); - assert(JITtype2varType(sig->retType) == TYP_BOOL); - assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT); - op2 = impSIMDPopStack(TYP_SIMD16); - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_BOOL, op1, op2, intrinsic, TYP_FLOAT, 16); - break; - case NI_SSE_ConvertToVector128SingleScalar: { assert(sig->numArgs == 2); @@ -797,18 +746,7 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic, op2 = impPopStack().val; op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16); - break; - } - - case NI_SSE_LoadHigh: - case NI_SSE_LoadLow: - { - assert(sig->numArgs == 2); - assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); - op2 = impPopStack().val; - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16); + retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, simdSize); break; } @@ -817,77 +755,15 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic, assert(JITtype2varType(sig->retType) == TYP_INT); assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT); op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_INT, op1, intrinsic, TYP_FLOAT, 16); - break; - - case NI_SSE_StaticCast: - { - assert(sig->numArgs == 1); - var_types tgtType = getBaseTypeOfSIMDType(sig->retTypeSigClass); - var_types srcType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)); - - if (varTypeIsArithmetic(tgtType) && varTypeIsArithmetic(srcType)) - { - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, tgtType, 16); - } - else - { - return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand); - } + retNode = gtNewSimdHWIntrinsicNode(TYP_INT, op1, intrinsic, TYP_FLOAT, simdSize); break; - } - case NI_SSE_LoadAlignedVector128: - case NI_SSE_LoadScalar: - case NI_SSE_LoadVector128: case NI_SSE_SetAllVector128: - case NI_SSE_SetScalar: assert(sig->numArgs == 1); assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); op1 = impPopStack().val; - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, TYP_FLOAT, 16); - break; - - case NI_SSE_Reciprocal: - case NI_SSE_ReciprocalScalar: - case NI_SSE_ReciprocalSqrt: - case NI_SSE_ReciprocalSqrtScalar: - case NI_SSE_Sqrt: - case NI_SSE_SqrtScalar: - assert(sig->numArgs == 1); - assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, TYP_FLOAT, 16); - break; - - case NI_SSE_ConvertToInt32: - case NI_SSE_ConvertToInt32WithTruncation: - case NI_SSE_ConvertToInt64: - case NI_SSE_ConvertToInt64WithTruncation: - case NI_SSE_ConvertToSingle: - { - assert(sig->numArgs == 1); - assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT); - var_types callType = JITtype2varType(sig->retType); - -#ifdef _TARGET_X86_ - if (varTypeIsLong(callType)) - { - assert(intrinsic == NI_SSE_ConvertToInt64 || intrinsic == NI_SSE_ConvertToInt64WithTruncation); - return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand); - } -#endif // _TARGET_X86_ - - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(callType, op1, intrinsic, TYP_FLOAT, 16); - break; - } - - case NI_SSE_SetZeroVector128: - assert(sig->numArgs == 0); - assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, intrinsic, TYP_FLOAT, 16); + retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtCloneExpr(op1), gtNewIconNode(0), NI_SSE_Shuffle, + TYP_FLOAT, simdSize); break; default: diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index f48a6ce6ef4b..1b8533248e83 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -252,7 +252,6 @@ INST3( andnpd, "andnpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x55)) / INST3( orps, "orps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x56)) // Or packed singles INST3( orpd, "orpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x56)) // Or packed doubles INST3( haddpd, "haddpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x7C)) // Horizontal add packed doubles -INST3( rcpps, "rcpps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53)) // Reciprocals of Packed Singles // SSE 2 approx arith INST3( rcpps, "rcpps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53)) // Reciprocal of packed singles diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index 4169b4a4106e..a046704c91be 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -2305,14 +2305,17 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode) void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { NamedIntrinsic intrinsicID = node->gtHWIntrinsicId; - HWIntrinsicCategory category = comp->categoryOfHWIntrinsic(intrinsicID); - int numArgs = comp->numArgsOfHWIntrinsic(intrinsicID); + HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID); + HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID); + int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID); GenTree* op1 = node->gtGetOp1(); GenTree* op2 = node->gtGetOp2(); // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained // TODO-XArch-CQ: Non-VEX encoded instructions require memory ops to be aligned - if (category == HW_Category_SimpleSIMD && numArgs == 2 && comp->canUseVexEncoding()) + + if (comp->canUseVexEncoding() && numArgs == 2 && (flags & HW_Flag_NoContainment) == 0 && + category == HW_Category_SimpleSIMD) { if (IsContainableMemoryOp(op2)) { @@ -2325,7 +2328,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) } } - if (NamedIntrinsic == NI_SSE_Shuffle) + // TODO - change to all IMM intrinsics + if (intrinsicID == NI_SSE_Shuffle) { assert(op1->OperIsList()); GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current(); diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index f56a36a7d077..fbadd566c8ef 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -2564,7 +2564,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, info->internalIntCount = 2; info->setInternalCandidates(this, allRegs(TYP_INT)); - break; } break; } @@ -2577,7 +2576,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, break; case NI_SSE41_BlendVariable: - { if (!compiler->canUseVexEncoding()) { // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 @@ -2589,7 +2587,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, info->hasDelayFreeSrc = true; } break; - } #ifdef _TARGET_X86_ case NI_SSE42_Crc32: diff --git a/src/jit/namedintrinsiclist.h b/src/jit/namedintrinsiclist.h index 8d5aac28f3a2..6387f60cbe51 100644 --- a/src/jit/namedintrinsiclist.h +++ b/src/jit/namedintrinsiclist.h @@ -39,18 +39,33 @@ enum HWIntrinsicFlag : unsigned int // Generic // - must throw NotSupportException if the type argument is not numeric type HW_Flag_Generic = 0x4, + // Two-type Generic + // - the intrinsic has two type parameters + HW_Flag_TwoTypeGeneric = 0xC, // NoCodeGen // - should be transformed in the compiler front-end, cannot reach CodeGen - HW_Flag_NoCodeGen = 0x8, + HW_Flag_NoCodeGen = 0x10, // Unfixed SIMD-size // - overloaded on multiple vector sizes (SIMD size in the table is unreliable) - HW_Flag_UnfixedSIMDSize = 0x10, + HW_Flag_UnfixedSIMDSize = 0x20, // Complex overload // - the codegen of overloads cannot be determined by intrinsicID and base type - HW_Flag_ComplexOverloads = 0x20, + HW_Flag_ComplexOverloads = 0x40, + + // Multi-instruction + // - that one intrinsic can generate multiple instructions + HW_Flag_MultiIns = 0x80, + + // NoContainment + // the intrinsic cannot be contained + HW_Flag_NoContainment = 0x100, + + // Copy Upper bits + // some SIMD scalar intrinsics need the semantics of copying upper bits from the source operand + HW_Flag_CopyUpperBits = 0x200, }; inline HWIntrinsicFlag operator|(HWIntrinsicFlag c1, HWIntrinsicFlag c2) @@ -63,7 +78,6 @@ enum HWIntrinsicCategory : unsigned int // Simple SIMD intrinsics // - take Vector128/256 parameters // - return a Vector128/256 - // - generate single instruction // - the codegen of overloads can be determined by intrinsicID and base type of returned vector HW_Category_SimpleSIMD, @@ -79,6 +93,10 @@ enum HWIntrinsicCategory : unsigned int // - operate over general purpose registers, like crc32, lzcnt, popcnt, etc. HW_Category_Scalar, + // SIMD scalar + // - operate over vector registers(XMM), but just compute on the first element + HW_Category_SIMDScalar, + // Memory access intrinsics // - e.g., Avx.Load, Avx.Store, Sse.LoadAligned HW_Category_MemoryLoad,