From 1a7904ee34fe29d82fc605e0844f00e238d9d41b Mon Sep 17 00:00:00 2001
From: Khushal Modi
Date: Thu, 4 Apr 2024 23:46:15 -0700
Subject: [PATCH] Scalar/Packed conversions for floating point to integer (#97529)

* merging with main. Initial changes for scalar conversion double -> ulong
* Basic working version of double -> ulong saturation
* Moving the code into a do-while with proper checks to make sure we are adding the fixup node in all cases
* adjusting comments
* Merging with main. Saturating NaN to 0 and also adding Dbl2Ulng implementation in MathHelpers. Adding vector conversion support for double/float -> ulong conversion
* removing conflicts from gentree.h flags. merging with main. double to uint conversion
* float to uint conversion verified. removing commented code
* merging with main. Making changes to simdashwintrinsic.cpp and listxarch.h. float -> uint packed conversion
* progress on double to long morphing
* another attempt at double to long conversion
* Merge with main. Adding a new helper function for float to uint scalar conversion for SSE2.
* adding handling for scalar conversion cases for SSE2. Remaining float/double -> long/int for AVX512.
* partial changes for float to int conversion using double to int for avx512. vfixup not working. next step is to fix the vfixup instruction and get it working
* adding working float to int scalar conversion case. Working on the vector case from here on.
* partial work on float to int packed conversion
* partial version of float to int conversion
* working version of float to int scalar/packed for avx512
* complete conversions code for floating point to integral conversions for scalar/packed for SSE / avx512
* Merging with main. fixing out of range test case and adding conversion changes to simdashwintrinsic
* fixing debug checks hitting asserts for TYP_ULONG and TYP_UINT at IR level
* adding JIT_Dbl2Int for target_x86 and other architectures.
* Supporting x86 for saturating conversions as well
* fixing errors in packed conversion
* accommodate unsigned in IR
* adding evex support for cvttss2si
* Merge with main. Defining nativeaot helpers for x86
* Catch divide by zero exception
* Handle overflow cases
* Fix tests to check saturating behavior
* Correct mapping of instructions
* Convert float -> ulong / long as float -> double -> ulong / long
* Merging with main. Initial changes for scalar conversion double -> ulong
* Merging with main. Adjusting comments
* removing conflicts from gentree.h flags. merging with main. double to uint conversion
* merging with main. Making changes to simdashwintrinsic.cpp and listxarch.h. float -> uint packed conversion
* adding a new helper function for float to uint scalar conversion for SSE2.
* Merging with main. Adding handling for scalar conversion cases for SSE2. Remaining float/double -> long/int for AVX512.
* partial changes for float to int conversion using double to int for avx512. vfixup not working. next step is to fix the vfixup instruction and get it working
* partial version of float to int conversion
* working version of float to int scalar/packed for avx512
* Merging with main. fixing out of range test case and adding conversion changes to simdashwintrinsic
* Changing the way helper functions are handled in morph. Fixing debug checks hitting asserts for TYP_ULONG and TYP_UINT at IR level
* adding JIT_Dbl2Int for target_x86 and other architectures.
* Supporting x86 for saturating conversions as well
* fixing errors in packed conversion
* Correct mapping of instructions
* delete extra files
* Merging main review changes
* Merge with main and adding new helpers in nativeaot. Rebasing with main
* changing type of cast node to signed when making cast nodes
* Avoiding removing extra element from the stack
* Fix formatting, change comp->IsaSupportedDebugOnly to IsBaselineVector512SupportedDebugOnly
* Reverting some changes to maintain uniformity in code
* Handling cases where AVX512 is not supported in simdashwintrinsic.cpp
* fixing exit conditions for ConvertVectorT_ToDouble
* Check for AVX512 support for TARGET_XARCH
* Avoid avx512 path for x86
* Enable AVX512F codepath for conversions in x86 arch. Move x86 to using C++ helpers
* Add SSE41 path for scalar conversions and 128-bit float to int packed conversions
* Adding SSE41 path for floating point to UINT scalar conversions
* Add AVX path for ConvertToInt32
* Adding comments and cleaning the code
* Fix errors in double to ulong
* Addressing review comments
* Fix tests
* Reverse val < 0 check in dbltoUint and dbltoUlng helpers
* Add overflow conversions for x86/x64, remove FastDbl2Lng and inline it
* Apply suggestions from code review

Co-authored-by: Jan Kotas

* Correct Dbl2UlngOvf
* Apply suggestions from code review
* Apply suggestions from code review
* Update src/coreclr/vm/jithelpers.cpp
* Disable failing mono tests
* Working version of saturating logic moved to lowering for x86/x64
* Making changes for pre-SSE41
* Apply suggestions from code review

Co-authored-by: Jan Kotas

* Removing dead code
* Fix formatting
* Address review comments, add proper docstrings

---------

Co-authored-by: Jan Kotas
---
 src/coreclr/inc/jithelpers.h                  |   6 +-
 src/coreclr/jit/codegenxarch.cpp              |  11 +-
 src/coreclr/jit/compiler.h                    |   8 +
 src/coreclr/jit/emit.h                        |   6 +-
 src/coreclr/jit/emitxarch.cpp                 |  41 ++-
 src/coreclr/jit/gentree.cpp                   | 230 +++++++++++++
 src/coreclr/jit/hwintrinsiclistxarch.h        |  20 +-
 src/coreclr/jit/hwintrinsicxarch.cpp          | 105 ++++--
 src/coreclr/jit/instr.cpp                     |  16 +-
 src/coreclr/jit/instrsxarch.h                 |   9 +-
 src/coreclr/jit/lower.cpp                     |  12 +-
 src/coreclr/jit/lower.h                       |   2 +-
 src/coreclr/jit/lowerarmarch.cpp              |   6 +-
 src/coreclr/jit/lowerloongarch64.cpp          |   4 +-
 src/coreclr/jit/lowerriscv64.cpp              |   4 +-
 src/coreclr/jit/lowerxarch.cpp                | 321 +++++++++++++++++-
 src/coreclr/jit/morph.cpp                     |  42 ++-
 src/coreclr/jit/simdashwintrinsic.cpp         | 140 ++++++--
 src/coreclr/nativeaot/Runtime/MathHelpers.cpp |  60 ++--
 src/coreclr/vm/i386/jithelp.S                 |  81 -----
 src/coreclr/vm/i386/jithelp.asm               | 214 ------------
 src/coreclr/vm/i386/jitinterfacex86.cpp       |  46 ---
 src/coreclr/vm/jithelpers.cpp                 | 131 +++----
 src/coreclr/vm/jitinterface.h                 |  11 -
 .../out_of_range_fp_to_int_conversions.cpp    |  15 -
 .../out_of_range_fp_to_int_conversions.cs     |  29 +-
 .../CLR-x86-JIT/V1-M12-Beta2/b28598/b28598.il |   3 +
 .../CLR-x86-JIT/V1-M12-Beta2/b50027/b50027.il |   3 +
 .../JitBlue/Runtime_62692/Runtime_62692.cs    |   5 +-
 src/tests/issues.targets                      |   3 +
 30 files changed, 987 insertions(+), 597 deletions(-)

diff --git a/src/coreclr/inc/jithelpers.h b/src/coreclr/inc/jithelpers.h
index 746f02367f936..f1711a9acfd9b 100644
--- a/src/coreclr/inc/jithelpers.h
+++ b/src/coreclr/inc/jithelpers.h
@@ -55,11 +55,11 @@ JITHELPER(CORINFO_HELP_ULMOD, JIT_ULMod, CORINFO_HELP_SIG_16_STACK)
 JITHELPER(CORINFO_HELP_LNG2DBL, JIT_Lng2Dbl, CORINFO_HELP_SIG_8_STACK)
 JITHELPER(CORINFO_HELP_ULNG2DBL, JIT_ULng2Dbl, CORINFO_HELP_SIG_8_STACK)
- DYNAMICJITHELPER(CORINFO_HELP_DBL2INT, JIT_Dbl2Lng,
CORINFO_HELP_SIG_8_STACK) + JITHELPER(CORINFO_HELP_DBL2INT, JIT_Dbl2Int, CORINFO_HELP_SIG_8_STACK) JITHELPER(CORINFO_HELP_DBL2INT_OVF, JIT_Dbl2IntOvf, CORINFO_HELP_SIG_8_STACK) - DYNAMICJITHELPER(CORINFO_HELP_DBL2LNG, JIT_Dbl2Lng, CORINFO_HELP_SIG_8_STACK) + JITHELPER(CORINFO_HELP_DBL2LNG, JIT_Dbl2Lng, CORINFO_HELP_SIG_8_STACK) JITHELPER(CORINFO_HELP_DBL2LNG_OVF, JIT_Dbl2LngOvf, CORINFO_HELP_SIG_8_STACK) - DYNAMICJITHELPER(CORINFO_HELP_DBL2UINT, JIT_Dbl2Lng, CORINFO_HELP_SIG_8_STACK) + JITHELPER(CORINFO_HELP_DBL2UINT, JIT_Dbl2UInt, CORINFO_HELP_SIG_8_STACK) JITHELPER(CORINFO_HELP_DBL2UINT_OVF, JIT_Dbl2UIntOvf, CORINFO_HELP_SIG_8_STACK) JITHELPER(CORINFO_HELP_DBL2ULNG, JIT_Dbl2ULng, CORINFO_HELP_SIG_8_STACK) JITHELPER(CORINFO_HELP_DBL2ULNG_OVF, JIT_Dbl2ULngOvf, CORINFO_HELP_SIG_8_STACK) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index f25e5bb046d29..6f68063d0b4fa 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -7602,13 +7602,16 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode) noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG)))); // We shouldn't be seeing uint64 here as it should have been converted - // into a helper call by either front-end or lowering phase. - assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG)))); + // into a helper call by either front-end or lowering phase, unless we have AVX512F + // accelerated conversions. + assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG))) || + compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); // If the dstType is TYP_UINT, we have 32-bits to encode the // float number. Any of 33rd or above bits can be the sign bit. // To achieve it we pretend as if we are converting it to a long. - if (varTypeIsUnsigned(dstType) && (dstSize == EA_ATTR(genTypeSize(TYP_INT)))) + if (varTypeIsUnsigned(dstType) && (dstSize == EA_ATTR(genTypeSize(TYP_INT))) && + !compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) { dstType = TYP_LONG; } @@ -7616,7 +7619,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode) // Note that we need to specify dstType here so that it will determine // the size of destination integer register and also the rex.w prefix. 
genConsumeOperands(treeNode->AsOp()); - instruction ins = ins_FloatConv(TYP_INT, srcType, emitTypeSize(srcType)); + instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType)); GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); genProduceReg(treeNode); } diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 7e6b2c57c89dc..6d2a6068d11e5 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3204,6 +3204,14 @@ class Compiler CorInfoType simdBaseJitType, unsigned simdSize); +#if defined(TARGET_XARCH) + GenTree* gtNewSimdCvtNode(var_types type, + GenTree* op1, + CorInfoType simdTargetBaseJitType, + CorInfoType simdSourceBaseJitType, + unsigned simdSize); +#endif //TARGET_XARCH + GenTree* gtNewSimdCreateBroadcastNode( var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 4e37226e2b581..f3767fc6f1807 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -4012,7 +4012,8 @@ emitAttr emitter::emitGetBaseMemOpSize(instrDesc* id) const case INS_comiss: case INS_cvtss2sd: case INS_cvtss2si: - case INS_cvttss2si: + case INS_cvttss2si32: + case INS_cvttss2si64: case INS_divss: case INS_extractps: case INS_insertps: @@ -4055,7 +4056,8 @@ emitAttr emitter::emitGetBaseMemOpSize(instrDesc* id) const case INS_comisd: case INS_cvtsd2si: case INS_cvtsd2ss: - case INS_cvttsd2si: + case INS_cvttsd2si32: + case INS_cvttsd2si64: case INS_divsd: case INS_maxsd: case INS_minsd: diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 848ec0f479edd..424d93d865b7e 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -1522,9 +1522,11 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const switch (ins) { case INS_cvtss2si: - case INS_cvttss2si: + case INS_cvttss2si32: + case INS_cvttss2si64: case INS_cvtsd2si: - case INS_cvttsd2si: + case INS_cvttsd2si32: + case INS_cvttsd2si64: case INS_movd: case INS_movnti: case INS_andn: @@ -1544,7 +1546,6 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const #endif // TARGET_AMD64 case INS_vcvtsd2usi: case INS_vcvtss2usi: - case INS_vcvttsd2usi: { if (attr == EA_8BYTE) { @@ -2723,8 +2724,10 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id) case INS_blsmsk: case INS_blsr: case INS_bzhi: - case INS_cvttsd2si: - case INS_cvttss2si: + case INS_cvttsd2si32: + case INS_cvttsd2si64: + case INS_cvttss2si32: + case INS_cvttss2si64: case INS_cvtsd2si: case INS_cvtss2si: case INS_extractps: @@ -2748,7 +2751,8 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id) #endif case INS_vcvtsd2usi: case INS_vcvtss2usi: - case INS_vcvttsd2usi: + case INS_vcvttsd2usi32: + case INS_vcvttsd2usi64: case INS_vcvttss2usi32: case INS_vcvttss2usi64: { @@ -11605,22 +11609,20 @@ void emitter::emitDispIns( break; } - case INS_cvttsd2si: + case INS_cvttsd2si32: + case INS_cvttsd2si64: case INS_cvtss2si: case INS_cvtsd2si: - case INS_cvttss2si: + case INS_cvttss2si32: + case INS_cvttss2si64: case INS_vcvtsd2usi: case INS_vcvtss2usi: - case INS_vcvttsd2usi: - { - printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE)); - break; - } - + case INS_vcvttsd2usi32: + case INS_vcvttsd2usi64: case INS_vcvttss2usi32: case INS_vcvttss2usi64: { - printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_4BYTE)); + printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE)); 
break; } @@ -19048,7 +19050,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; } - case INS_cvttsd2si: + case INS_cvttsd2si32: + case INS_cvttsd2si64: case INS_cvtsd2si: case INS_cvtsi2sd32: case INS_cvtsi2ss32: @@ -19057,7 +19060,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vcvtsd2usi: case INS_vcvtusi2ss32: case INS_vcvtusi2ss64: - case INS_vcvttsd2usi: + case INS_vcvttsd2usi32: + case INS_vcvttsd2usi64: case INS_vcvttss2usi32: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_7C; @@ -19069,7 +19073,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insLatency += PERFSCORE_LATENCY_5C; break; - case INS_cvttss2si: + case INS_cvttss2si32: + case INS_cvttss2si64: case INS_cvtss2si: case INS_vcvtss2usi: result.insThroughput = PERFSCORE_THROUGHPUT_1C; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 9cf06c7bb1fcb..71d0d0e2dfe90 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21334,6 +21334,236 @@ GenTree* Compiler::gtNewSimdCeilNode(var_types type, GenTree* op1, CorInfoType s return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize); } +#if defined(TARGET_XARCH) +GenTree* Compiler::gtNewSimdCvtNode(var_types type, + GenTree* op1, + CorInfoType simdTargetBaseJitType, + CorInfoType simdSourceBaseJitType, + unsigned simdSize) +{ + assert(varTypeIsSIMD(type)); + assert(getSIMDTypeForSize(simdSize) == type); + assert(op1 != nullptr); + assert(op1->TypeIs(type)); + + var_types simdSourceBaseType = JitType2PreciseVarType(simdSourceBaseJitType); + var_types simdTargetBaseType = JitType2PreciseVarType(simdTargetBaseJitType); + assert(varTypeIsFloating(simdSourceBaseType)); + assert(varTypeIsIntegral(simdTargetBaseType)); + + assert(IsBaselineSimdIsaSupportedDebugOnly()); + assert(IsBaselineVector512IsaSupportedDebugOnly() || + ((simdTargetBaseType == TYP_INT) && ((simdSize == 16 && compIsaSupportedDebugOnly(InstructionSet_SSE41)) || + (simdSize == 32 && compIsaSupportedDebugOnly(InstructionSet_AVX))))); + + // Generate intrinsic needed for conversion + NamedIntrinsic hwIntrinsicID = NI_Illegal; + switch (simdSourceBaseJitType) + { + case CORINFO_TYPE_FLOAT: + { + switch (simdTargetBaseJitType) + { + case CORINFO_TYPE_INT: + { + switch (simdSize) + { + case 64: + { + hwIntrinsicID = NI_AVX512F_ConvertToVector512Int32WithTruncation; + break; + } + case 32: + { + hwIntrinsicID = NI_AVX_ConvertToVector256Int32WithTruncation; + break; + } + case 16: + { + hwIntrinsicID = NI_SSE2_ConvertToVector128Int32WithTruncation; + break; + } + default: + unreached(); + } + break; + } + case CORINFO_TYPE_UINT: + { + switch (simdSize) + { + case 64: + { + hwIntrinsicID = NI_AVX512F_ConvertToVector512UInt32WithTruncation; + break; + } + case 32: + { + hwIntrinsicID = NI_AVX512F_VL_ConvertToVector256UInt32WithTruncation; + break; + } + case 16: + { + hwIntrinsicID = NI_AVX512F_VL_ConvertToVector128UInt32WithTruncation; + break; + } + default: + unreached(); + } + break; + } + default: + unreached(); + } + break; + } + case CORINFO_TYPE_DOUBLE: + { + switch (simdTargetBaseJitType) + { + case CORINFO_TYPE_LONG: + { + switch (simdSize) + { + case 64: + { + hwIntrinsicID = NI_AVX512DQ_ConvertToVector512Int64WithTruncation; + break; + } + case 32: + { + hwIntrinsicID = NI_AVX512DQ_VL_ConvertToVector256Int64WithTruncation; + break; + } + case 16: + { + hwIntrinsicID = 
NI_AVX512DQ_VL_ConvertToVector128Int64WithTruncation;
+ break;
+ }
+ default:
+ unreached();
+ }
+ break;
+ }
+ case CORINFO_TYPE_ULONG:
+ {
+ switch (simdSize)
+ {
+ case 64:
+ {
+ hwIntrinsicID = NI_AVX512DQ_ConvertToVector512UInt64WithTruncation;
+ break;
+ }
+ case 32:
+ {
+ hwIntrinsicID = NI_AVX512DQ_VL_ConvertToVector256UInt64WithTruncation;
+ break;
+ }
+ case 16:
+ {
+ hwIntrinsicID = NI_AVX512DQ_VL_ConvertToVector128UInt64WithTruncation;
+ break;
+ }
+ default:
+ unreached();
+ }
+ break;
+ }
+ default:
+ unreached();
+ }
+ break;
+ }
+ default:
+ unreached();
+ }
+ assert(hwIntrinsicID != NI_Illegal);
+
+ GenTree* fixupVal;
+
+ if (IsBaselineVector512IsaSupportedOpportunistically())
+ {
+ /* Generate the control table for VFIXUPIMMPD/PS
+ - For conversion to unsigned
+ // QNAN: 0b1000: Saturate to Zero
+ // SNAN: 0b1000: Saturate to Zero
+ // ZERO: 0b0000
+ // +ONE: 0b0000
+ // -INF: 0b1000: Saturate to Zero
+ // +INF: 0b0000
+ // -VAL: 0b1000: Saturate to Zero
+ // +VAL: 0b0000
+ - For conversion to signed
+ // QNAN: 0b1000: Saturate to Zero
+ // SNAN: 0b1000: Saturate to Zero
+ // ZERO: 0b0000
+ // +ONE: 0b0000
+ // -INF: 0b0000
+ // +INF: 0b0000
+ // -VAL: 0b0000
+ // +VAL: 0b0000
+ Each token above selects a 4-bit response field in the immediate; response
+ 0b1000 writes +0.0, yielding 0x08080088 (unsigned) and 0x00000088 (signed).
+ */
+ int32_t iconVal = varTypeIsUnsigned(simdTargetBaseType) ? 0x08080088 : 0x00000088;
+ GenTree* tblCon = gtNewSimdCreateBroadcastNode(type, gtNewIconNode(iconVal), simdTargetBaseJitType, simdSize);
+
+ // We need op1Clone to run fixup
+ GenTree* op1Clone = fgMakeMultiUse(&op1);
+
+ // run vfixupimm based on the table and no flags reporting
+ fixupVal = gtNewSimdHWIntrinsicNode(type, op1, op1Clone, tblCon, gtNewIconNode(0), NI_AVX512F_Fixup,
+ simdSourceBaseJitType, simdSize);
+ }
+ else
+ {
+ // Zero out NaN values from the input.
+ // mask1 contains, per element, either all ones (0xFFFFFFFF) or 0.
+ // fixupVal zeros out any NaN values in the input by ANDing the input with mask1.
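+ // For example, for a lane x: (x == x) yields all ones when x is an ordered
+ // value and all zeros when x is NaN (NaN never compares equal to itself),
+ // so the AND either leaves x unchanged or forces the NaN lane to +0.0.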
+ GenTree* op1Clone1 = fgMakeMultiUse(&op1);
+ GenTree* op1Clone2 = fgMakeMultiUse(&op1);
+ GenTree* mask1 = gtNewSimdCmpOpNode(GT_EQ, type, op1, op1Clone1, simdSourceBaseJitType, simdSize);
+ fixupVal = gtNewSimdBinOpNode(GT_AND, type, op1Clone2, mask1, simdSourceBaseJitType, simdSize);
+ }
+
+ if (varTypeIsSigned(simdTargetBaseType))
+ {
+ GenTree* maxVal;
+ GenTree* maxValDup;
+ if (varTypeIsLong(simdTargetBaseType))
+ {
+ int64_t actualMaxVal = INT64_MAX;
+ maxVal = gtNewDconNode(static_cast<double>(actualMaxVal), simdSourceBaseType);
+ maxVal = gtNewSimdCreateBroadcastNode(type, maxVal, simdSourceBaseJitType, simdSize);
+ maxValDup =
+ gtNewSimdCreateBroadcastNode(type, gtNewLconNode(actualMaxVal), simdTargetBaseJitType, simdSize);
+ }
+ else
+ {
+ ssize_t actualMaxVal = INT32_MAX;
+ maxVal = gtNewDconNode(static_cast<double>(actualMaxVal), simdSourceBaseType);
+ maxVal = gtNewSimdCreateBroadcastNode(type, maxVal, simdSourceBaseJitType, simdSize);
+ maxValDup =
+ gtNewSimdCreateBroadcastNode(type, gtNewIconNode(actualMaxVal), simdTargetBaseJitType, simdSize);
+ }
+
+ // we will be using the input value twice
+ GenTree* fixupValDup = fgMakeMultiUse(&fixupVal);
+
+ // compare with max value of integer/long
+ fixupVal = gtNewSimdCmpOpNode(GT_GE, type, fixupVal, maxVal, simdSourceBaseJitType, simdSize);
+
+ // cast it
+ GenTree* castNode = gtNewSimdHWIntrinsicNode(type, fixupValDup, hwIntrinsicID, simdSourceBaseJitType, simdSize);
+
+ // use the fixupVal mask with input value and max value to blend
+ return gtNewSimdCndSelNode(type, fixupVal, maxValDup, castNode, simdTargetBaseJitType, simdSize);
+ }
+ else
+ {
+ return gtNewSimdHWIntrinsicNode(type, fixupVal, hwIntrinsicID, simdSourceBaseJitType, simdSize);
+ }
+}
+#endif // TARGET_XARCH
+
 GenTree* Compiler::gtNewSimdCmpOpNode(
 genTreeOps op, var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize)
 {
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index 3093c9ff71a56..07bc2e4838c88 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -273,8 +273,12 @@ HARDWARE_INTRINSIC(Vector512, Create,
 HARDWARE_INTRINSIC(Vector512, CreateScalar, 64, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512, CreateScalarUnsafe, 64, 1, true, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(Vector512, CreateSequence, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, ConvertToDouble, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector512, ConvertToSingle, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector512, ConvertToInt32, 64, 1,
false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector512, ConvertToInt64, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector512, ConvertToUInt32, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector512, ConvertToUInt64, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, Divide, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, Equals, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, EqualsAll, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) @@ -411,7 +415,7 @@ HARDWARE_INTRINSIC(SSE, CompareUnordered, HARDWARE_INTRINSIC(SSE, CompareScalarUnordered, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, ConvertToInt32, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, ConvertToInt32WithTruncation, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE, ConvertToInt32WithTruncation, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si32, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, Divide, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible) 
HARDWARE_INTRINSIC(SSE, DivideScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, LoadAlignedVector128, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) @@ -460,7 +464,7 @@ HARDWARE_INTRINSIC(SSE, Xor, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // SSE 64-bit-only Intrinsics HARDWARE_INTRINSIC(SSE_X64, ConvertToInt64, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(SSE_X64, ConvertToInt64WithTruncation, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(SSE_X64, ConvertToInt64WithTruncation, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si64, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(SSE_X64, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss64, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -511,7 +515,7 @@ HARDWARE_INTRINSIC(SSE2, CompareScalarOrdered, HARDWARE_INTRINSIC(SSE2, CompareUnordered, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE2, CompareScalarUnordered, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, ConvertToInt32, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, ConvertToInt32WithTruncation, 16, 1, false, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2, ConvertToInt32WithTruncation, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttsd2si32}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, ConvertToUInt32, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, ConvertToVector128Double, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2pd, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2pd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, ConvertScalarToVector128Double, 16, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2sd, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg) @@ -578,7 +582,7 @@ HARDWARE_INTRINSIC(SSE2, Xor, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // SSE2 64-bit-only Intrinsics HARDWARE_INTRINSIC(SSE2_X64, ConvertToInt64, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_cvtsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2_X64, ConvertToInt64WithTruncation, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE2_X64, ConvertToInt64WithTruncation, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttsd2si64}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_X64, ConvertToUInt64, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2_X64, ConvertScalarToVector128Double, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd64, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(SSE2_X64, ConvertScalarToVector128Int64, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) @@ -708,9 +712,9 @@ HARDWARE_INTRINSIC(AVX, CompareNotLessThanOrEqual, HARDWARE_INTRINSIC(AVX, CompareOrdered, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX, CompareUnordered, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX, ConvertToVector128Single, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX, ConvertToVector256Int32, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX, ConvertToVector256Int32, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX, ConvertToVector256Single, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2ps, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(AVX, ConvertToVector256Double, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2pd, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2pd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32WithTruncation, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -861,7 +865,7 @@ HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Double, HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Single, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss32, INS_vcvtusi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_cvtsd2ss}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) HARDWARE_INTRINSIC(AVX512F, ConvertToInt32, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_cvtsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_EmbRoundingCompatible) HARDWARE_INTRINSIC(AVX512F, 
ConvertToUInt32, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi32, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi32, INS_vcvttsd2usi32}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F, ConvertToVector128Byte, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdb, INS_vpmovdb, INS_vpmovqb, INS_vpmovqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F, ConvertToVector128ByteWithSaturation, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovusdb, INS_invalid, INS_vpmovusqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F, ConvertToVector128Int16, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovqw, INS_vpmovqw, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -1027,7 +1031,7 @@ HARDWARE_INTRINSIC(AVX512F_X64, ConvertScalarToVector128Double, HARDWARE_INTRINSIC(AVX512F_X64, ConvertScalarToVector128Single, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss64, INS_vcvtusi2ss64, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen|HW_Flag_EmbRoundingCompatible) HARDWARE_INTRINSIC(AVX512F_X64, ConvertToInt64, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_cvtsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_EmbRoundingCompatible) HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi64, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi64, INS_vcvttsd2usi64}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg EncodesExtraTypeArg Instructions Category Flags diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 87332c07f0113..fc3c01e4c31d2 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1415,15 +1415,70 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_ConvertToDouble: case NI_Vector256_ConvertToDouble: + case NI_Vector512_ConvertToDouble: + { + assert(sig->numArgs == 1); + assert(varTypeIsLong(simdBaseType)); + if (IsBaselineVector512IsaSupportedOpportunistically()) + { + if (simdSize == 64) + { + intrinsic = NI_AVX512DQ_ConvertToVector512Double; + } + else if (simdSize == 32) + { + intrinsic = NI_AVX512DQ_VL_ConvertToVector256Double; + } + else + { + assert(simdSize == 16); + intrinsic = NI_AVX512DQ_VL_ConvertToVector128Double; + } + op1 = impSIMDPopStack(); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); + } + break; + } + case NI_Vector128_ConvertToInt64: case NI_Vector256_ConvertToInt64: + case NI_Vector512_ConvertToInt64: + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_DOUBLE); + if (IsBaselineVector512IsaSupportedOpportunistically()) + { + op1 = impSIMDPopStack(); + retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_LONG, simdBaseJitType, simdSize); + } + break; + } + case NI_Vector128_ConvertToUInt32: case NI_Vector256_ConvertToUInt32: + case NI_Vector512_ConvertToUInt32: + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_FLOAT); + if (IsBaselineVector512IsaSupportedOpportunistically()) + { + op1 = impSIMDPopStack(); + retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_UINT, simdBaseJitType, simdSize); + } + break; + } + case NI_Vector128_ConvertToUInt64: case NI_Vector256_ConvertToUInt64: + case NI_Vector512_ConvertToUInt64: { assert(sig->numArgs == 1); - // TODO-XARCH-CQ: These intrinsics should be accelerated + assert(simdBaseType == TYP_DOUBLE); + if (IsBaselineVector512IsaSupportedOpportunistically()) + { + op1 = impSIMDPopStack(); + retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_ULONG, simdBaseJitType, simdSize); + } break; } @@ -1433,24 +1488,11 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); assert(simdBaseType == TYP_FLOAT); - - switch (simdSize) + if (compOpportunisticallyDependsOn(InstructionSet_SSE41)) { - case 16: - intrinsic = NI_SSE2_ConvertToVector128Int32WithTruncation; - break; - case 32: - intrinsic = NI_AVX_ConvertToVector256Int32WithTruncation; - break; - case 64: - intrinsic = NI_AVX512F_ConvertToVector512Int32WithTruncation; - break; - default: - unreached(); + op1 = impSIMDPopStack(); + retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_INT, simdBaseJitType, simdSize); } - - op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); break; } @@ -1459,7 +1501,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector512_ConvertToSingle: { assert(sig->numArgs == 1); - + 
assert(varTypeIsInt(simdBaseType)); + intrinsic = NI_Illegal; if (simdBaseType == TYP_INT) { switch (simdSize) @@ -1476,14 +1519,28 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, default: unreached(); } - - op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); } - else + else if (simdBaseType == TYP_UINT && IsBaselineVector512IsaSupportedOpportunistically()) { - // TODO-XARCH-CQ: These intrinsics should be accelerated - assert(simdBaseType == TYP_UINT); + switch (simdSize) + { + case 16: + intrinsic = NI_AVX512F_VL_ConvertToVector128Single; + break; + case 32: + intrinsic = NI_AVX512F_VL_ConvertToVector256Single; + break; + case 64: + intrinsic = NI_AVX512F_ConvertToVector512Single; + break; + default: + unreached(); + } + } + if (intrinsic != NI_Illegal) + { + op1 = impSIMDPopStack(); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); } break; } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 7866c8a5e7b0f..79aae2c334549 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -2408,13 +2408,17 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) switch (to) { case TYP_INT: - return INS_cvttss2si; + return INS_cvttss2si32; case TYP_LONG: - return INS_cvttss2si; + return INS_cvttss2si64; case TYP_FLOAT: return ins_Move_Extend(TYP_FLOAT, false); case TYP_DOUBLE: return INS_cvtss2sd; + case TYP_ULONG: + return INS_vcvttss2usi64; + case TYP_UINT: + return INS_vcvttss2usi32; default: unreached(); } @@ -2424,13 +2428,17 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) switch (to) { case TYP_INT: - return INS_cvttsd2si; + return INS_cvttsd2si32; case TYP_LONG: - return INS_cvttsd2si; + return INS_cvttsd2si64; case TYP_FLOAT: return INS_cvtsd2ss; case TYP_DOUBLE: return ins_Move_Extend(TYP_DOUBLE, false); + case TYP_ULONG: + return INS_vcvttsd2usi64; + case TYP_UINT: + return INS_vcvttsd2usi32; default: unreached(); } diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 440cc0033c82f..030bcffd41c6e 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -201,7 +201,8 @@ INST3(comiss, "comiss", IUM_RD, BAD_CODE, BAD_CODE, INST3(cvtsi2ss32, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar single INST3(cvtsi2ss64, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt QWORD to scalar single INST3(cvtss2si, "cvtss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2D), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // cvt scalar single to DWORD/QWORD -INST3(cvttss2si, "cvttss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2C), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // cvt with trunc scalar single to DWORD +INST3(cvttss2si32, "cvttss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2C), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // cvt with trunc scalar single to DWORD +INST3(cvttss2si64, "cvttss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2C), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // cvt with trunc scalar single to DWORD INST3(divps, "divps", IUM_WR, BAD_CODE, 
BAD_CODE, PCKFLT(0x5E), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Divide packed singles INST3(divss, "divss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5E), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar singles INST3(maxps, "maxps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5F), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Return Maximum packed singles @@ -260,7 +261,8 @@ INST3(cvtsi2sd64, "cvtsi2sd", IUM_WR, BAD_CODE, BAD_CODE, INST3(cvtss2sd, "cvtss2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5A), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar single to scalar doubles INST3(cvttpd2dq, "cvttpd2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE6), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // cvt with trunc packed doubles to DWORDs INST3(cvttps2dq, "cvttps2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5B), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // cvt with trunc packed singles to DWORDs -INST3(cvttsd2si, "cvttsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2C), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // cvt with trunc scalar double to signed DWORDs +INST3(cvttsd2si32, "cvttsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2C), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // cvt with trunc scalar double to signed DWORDs +INST3(cvttsd2si64, "cvttsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2C), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // cvt with trunc scalar double to signed DWORDs INST3(divpd, "divpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5E), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Divide packed doubles INST3(divsd, "divsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5E), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar doubles INST3(lfence, "lfence", IUM_RD, 0x000FE8AE, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG) @@ -640,7 +642,8 @@ INST3(vcvtsd2usi, "cvtsd2usi", IUM_WR, BAD_CODE, BAD_ INST3(vcvtss2usi, "cvtss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x79), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_WX | Encoding_EVEX) // cvt scalar single to unsigned DWORD/QWORD INST3(vcvttpd2udq, "cvttpd2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation packed doubles to unsigned DWORDs INST3(vcvttps2udq, "cvttps2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation packed singles to unsigned DWORDs -INST3(vcvttsd2usi, "cvttsd2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x78), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_WX | Encoding_EVEX) // cvt w/ truncation scalar double to unsigned DWORD/QWORD +INST3(vcvttsd2usi32, "cvttsd2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x78), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation scalar double to unsigned DWORD +INST3(vcvttsd2usi64, "cvttsd2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x78), INS_TT_TUPLE1_FIXED, 
Input_64Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation scalar double to unsigned QWORD INST3(vcvttss2usi32, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned DWORD/QWORD INST3(vcvttss2usi64, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned DWORD/QWORD INST3(vcvtudq2pd, "cvtudq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x7A), INS_TT_HALF, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed unsigned DWORDs to doubles diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 0cd0500a9d163..be0dde669e34b 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -529,8 +529,16 @@ GenTree* Lowering::LowerNode(GenTree* node) break; case GT_CAST: - LowerCast(node); - break; + { + GenTree* nextNode = LowerCast(node); +#if defined(TARGET_XARCH) + if (nextNode != nullptr) + { + return nextNode; + } +#endif // TARGET_XARCH + } + break; case GT_BITCAST: ContainCheckBitCast(node); diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 318a148ee9c1c..5d4cc1ac080a1 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -358,7 +358,7 @@ class Lowering final : public Phase GenTree* switchValue, weight_t defaultLikelihood); - void LowerCast(GenTree* node); + GenTree* LowerCast(GenTree* node); #if !CPU_LOAD_STORE_ARCH bool IsRMWIndirCandidate(GenTree* operand, GenTree* storeInd); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 498093ae6fc52..9731331885bec 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -778,7 +778,7 @@ void Lowering::LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode) // tree - GT_CAST node to be lowered // // Return Value: -// None. +// nextNode to be lowered if tree is modified else returns nullptr // // Notes: // Casts from float/double to a smaller int type are transformed as follows: @@ -791,7 +791,7 @@ void Lowering::LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode) // don't expect to see them here. // i) GT_CAST(float/double, int type with overflow detection) // -void Lowering::LowerCast(GenTree* tree) +GenTree* Lowering::LowerCast(GenTree* tree) { assert(tree->OperGet() == GT_CAST); @@ -814,6 +814,8 @@ void Lowering::LowerCast(GenTree* tree) // Now determine if we have operands that should be contained. ContainCheckCast(tree->AsCast()); + + return nullptr; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lowerloongarch64.cpp b/src/coreclr/jit/lowerloongarch64.cpp index f0b61aeba6630..4e826be0b2257 100644 --- a/src/coreclr/jit/lowerloongarch64.cpp +++ b/src/coreclr/jit/lowerloongarch64.cpp @@ -515,7 +515,7 @@ void Lowering::LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode) // i) GT_CAST(float/double, int type with overflow detection) // -void Lowering::LowerCast(GenTree* tree) +GenTree* Lowering::LowerCast(GenTree* tree) { assert(tree->OperGet() == GT_CAST); @@ -538,6 +538,8 @@ void Lowering::LowerCast(GenTree* tree) // Now determine if we have operands that should be contained. 
ContainCheckCast(tree->AsCast()); + + return nullptr; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lowerriscv64.cpp b/src/coreclr/jit/lowerriscv64.cpp index 22830e92ba25c..aa8342ee1af5a 100644 --- a/src/coreclr/jit/lowerriscv64.cpp +++ b/src/coreclr/jit/lowerriscv64.cpp @@ -434,7 +434,7 @@ void Lowering::LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode) // i) GT_CAST(float/double, int type with overflow detection) // -void Lowering::LowerCast(GenTree* tree) +GenTree* Lowering::LowerCast(GenTree* tree) { assert(tree->OperGet() == GT_CAST); @@ -457,6 +457,8 @@ void Lowering::LowerCast(GenTree* tree) // Now determine if we have operands that should be contained. ContainCheckCast(tree->AsCast()); + + return nullptr; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 999a3fc6d338c..5a9b12ca4aa27 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -815,12 +815,13 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk) * system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that * doing this optimization is a win, should consider generating in-lined code. */ -void Lowering::LowerCast(GenTree* tree) +GenTree* Lowering::LowerCast(GenTree* tree) { assert(tree->OperGet() == GT_CAST); GenTree* castOp = tree->AsCast()->CastOp(); var_types castToType = tree->CastToType(); + var_types dstType = castToType; var_types srcType = castOp->TypeGet(); var_types tmpType = TYP_UNDEF; @@ -843,7 +844,7 @@ void Lowering::LowerCast(GenTree* tree) if (varTypeIsFloating(srcType)) { noway_assert(!tree->gtOverflow()); - noway_assert(castToType != TYP_ULONG); + assert(castToType != TYP_ULONG || comp->IsBaselineVector512IsaSupportedDebugOnly()); } else if (srcType == TYP_UINT) { @@ -851,9 +852,322 @@ void Lowering::LowerCast(GenTree* tree) } else if (srcType == TYP_ULONG) { - assert(castToType != TYP_FLOAT || comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + assert(castToType != TYP_FLOAT || comp->IsBaselineVector512IsaSupportedDebugOnly()); } +#if defined(TARGET_AMD64) + // Handle saturation logic for X64 + if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType) && !varTypeIsSmall(dstType)) + { + // We should have filtered out float -> long conversion and + // converted it to float -> double -> long conversion. + assert((dstType != TYP_LONG) || (srcType != TYP_FLOAT)); + + // we should have handled overflow cases in morph itself + assert(!tree->gtOverflow()); + + CorInfoType fieldType = (srcType == TYP_DOUBLE) ? CORINFO_TYPE_DOUBLE : CORINFO_TYPE_FLOAT; + GenTree* castOutput = nullptr; + LIR::Use castOpUse(BlockRange(), &(tree->AsCast()->CastOp()), tree); + ReplaceWithLclVar(castOpUse); + castOp = tree->AsCast()->CastOp(); + /*The code below is to introduce saturating conversions on X86/X64. 
+ The C# equivalent of the code is given below -->
+
+ // Replace QNaN and SNaN with Zero
+ op1 = Avx512F.Fixup(op1, op1, Vector128.Create(0x88), 0);
+
+ // Convert from double to long, replacing any values that were greater than or equal to MaxValue
+ with MaxValue
+ // Values that were less than or equal to MinValue will already be MinValue
+ return Vector128.ConditionalSelect(
+ Vector128.LessThan(op1, Vector128.Create(long.MaxValue)).AsInt64(),
+ Avx512DQ.VL.ConvertToVector128Int64(op1),
+ Vector128.Create(long.MaxValue)
+ );
+ */
+ if (comp->IsBaselineVector512IsaSupportedOpportunistically())
+ {
+ // Clone the cast operand for usage.
+ GenTree* op1Clone1 = comp->gtClone(castOp);
+ BlockRange().InsertAfter(castOp, op1Clone1);
+
+ // Generate the control table for VFIXUPIMMSD
+ // The behavior we want is to saturate negative values to 0.
+ GenTreeVecCon* tbl = comp->gtNewVconNode(TYP_SIMD16);
+ tbl->gtSimdVal.i32[0] = (varTypeIsUnsigned(dstType)) ? 0x08080088 : 0x00000088;
+ BlockRange().InsertAfter(op1Clone1, tbl);
+
+ // get a zero int node for control table
+ GenTree* ctrlByte = comp->gtNewIconNode(0);
+ BlockRange().InsertAfter(tbl, ctrlByte);
+
+ if (varTypeIsUnsigned(dstType))
+ {
+ // run vfixupimmsd based on the table and no flags reporting
+ GenTree* oper1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castOp, op1Clone1, tbl, ctrlByte,
+ NI_AVX512F_FixupScalar, fieldType, 16);
+ BlockRange().InsertAfter(ctrlByte, oper1);
+ LowerNode(oper1);
+
+ // Convert to scalar
+ // Here, we try to insert a Vector128 to Scalar node so that the input
+ // can be provided to the scalar cast
+ GenTree* oper2 = comp->gtNewSimdHWIntrinsicNode(srcType, oper1, NI_Vector128_ToScalar, fieldType, 16);
+ BlockRange().InsertAfter(oper1, oper2);
+ LowerNode(oper2);
+
+ castOutput = comp->gtNewCastNode(genActualType(dstType), oper2, false, dstType);
+ BlockRange().InsertAfter(oper2, castOutput);
+ }
+ else
+ {
+ CorInfoType destFieldType = (dstType == TYP_INT) ? CORINFO_TYPE_INT : CORINFO_TYPE_LONG;
+
+ ssize_t actualMaxVal = (dstType == TYP_INT) ? INT32_MAX : INT64_MAX;
+
+ // run vfixupimmsd based on the table and no flags reporting
+ GenTree* fixupVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castOp, op1Clone1, tbl, ctrlByte,
+ NI_AVX512F_FixupScalar, fieldType, 16);
+ BlockRange().InsertAfter(ctrlByte, fixupVal);
+ LowerNode(fixupVal);
+
+ // get the max value vector
+ GenTree* maxValScalar = (srcType == TYP_DOUBLE)
+ ? comp->gtNewDconNodeD(static_cast<double>(actualMaxVal))
+ : comp->gtNewDconNodeF(static_cast<float>(actualMaxVal));
+ GenTree* maxVal = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxValScalar, fieldType, 16);
+ BlockRange().InsertAfter(fixupVal, maxVal);
+
+ GenTree* maxValDstTypeScalar = (dstType == TYP_INT) ?
comp->gtNewIconNode(actualMaxVal, dstType) + : comp->gtNewLconNode(actualMaxVal); + GenTree* maxValDstType = + comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxValDstTypeScalar, destFieldType, 16); + BlockRange().InsertAfter(maxVal, maxValDstType); + + // Usage 1: compare with the max value of the integer type + GenTree* compMask = comp->gtNewSimdCmpOpNode(GT_GE, TYP_SIMD16, fixupVal, maxVal, fieldType, 16); + BlockRange().InsertAfter(maxValDstType, compMask); + + // Convert fixupVal to a local variable and clone it for further use + LIR::Use fixupValUse(BlockRange(), &(compMask->AsHWIntrinsic()->Op(1)), compMask); + ReplaceWithLclVar(fixupValUse); + fixupVal = compMask->AsHWIntrinsic()->Op(1); + GenTree* fixupValClone = comp->gtClone(fixupVal); + LowerNode(compMask); + BlockRange().InsertAfter(fixupVal, fixupValClone); + + GenTree* FixupValCloneScalar = + comp->gtNewSimdHWIntrinsicNode(srcType, fixupValClone, NI_Vector128_ToScalar, fieldType, 16); + BlockRange().InsertAfter(compMask, FixupValCloneScalar); + LowerNode(FixupValCloneScalar); + + // Cast it to the destination type + GenTreeCast* newCast = comp->gtNewCastNode(dstType, FixupValCloneScalar, false, dstType); + BlockRange().InsertAfter(FixupValCloneScalar, newCast); + + GenTree* newTree = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, newCast, destFieldType, 16); + BlockRange().InsertAfter(newCast, newTree); + LowerNode(newTree); + + // Usage 2: use the compared mask to blend between the converted value and the max value + GenTree* control = comp->gtNewIconNode(0xCA); // (B & A) | (C & ~A), i.e. A ? B : C + BlockRange().InsertAfter(newTree, control); + GenTree* cndSelect = comp->gtNewSimdTernaryLogicNode(TYP_SIMD16, compMask, maxValDstType, newTree, + control, destFieldType, 16); + BlockRange().InsertAfter(control, cndSelect); + LowerNode(cndSelect); + + castOutput = + comp->gtNewSimdHWIntrinsicNode(dstType, cndSelect, NI_Vector128_ToScalar, destFieldType, 16); + BlockRange().InsertAfter(cndSelect, castOutput); + LowerNode(castOutput); + } + } + else if (varTypeIsSigned(dstType) && comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + CorInfoType destFieldType = (dstType == TYP_INT) ? CORINFO_TYPE_INT : CORINFO_TYPE_LONG; + + ssize_t actualMaxVal = (dstType == TYP_INT) ? INT32_MAX : INT64_MAX; + + // Create clones for later use + GenTree* castOpClone1 = comp->gtClone(castOp); + GenTree* castOpClone2 = comp->gtClone(castOp); + BlockRange().InsertAfter(castOp, castOpClone1); + BlockRange().InsertAfter(castOpClone1, castOpClone2); + + GenTree* oper = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOp, fieldType, 16); + BlockRange().InsertAfter(castOpClone2, oper); + LowerNode(oper); + GenTree* op1Clone1 = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOpClone1, fieldType, 16); + BlockRange().InsertAfter(oper, op1Clone1); + LowerNode(op1Clone1); + GenTree* op1Clone2 = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOpClone2, fieldType, 16); + BlockRange().InsertAfter(op1Clone1, op1Clone2); + LowerNode(op1Clone2); + + // Check for NaN: a NaN input fails the equality compare with itself + GenTree* mask1 = comp->gtNewSimdCmpOpNode(GT_EQ, TYP_SIMD16, oper, op1Clone1, fieldType, 16); + BlockRange().InsertAfter(op1Clone2, mask1); + LowerNode(mask1); + // inp = inp & mask (zeroes out NaN lanes) + GenTree* maskNaN = comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, op1Clone2, mask1, fieldType, 16); + BlockRange().InsertAfter(mask1, maskNaN); + LowerNode(maskNaN); + + // Get the max value vector + GenTree* maxVal = (srcType == TYP_DOUBLE) ?
comp->gtNewDconNodeD(static_cast<double>(actualMaxVal)) + : comp->gtNewDconNodeF(static_cast<float>(actualMaxVal)); + GenTree* maxValDup = + (dstType == TYP_INT) ? comp->gtNewIconNode(actualMaxVal) : comp->gtNewLconNode(actualMaxVal); + maxVal = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxVal, fieldType, 16); + BlockRange().InsertAfter(maskNaN, maxVal); + maxValDup = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxValDup, destFieldType, 16); + BlockRange().InsertAfter(maxVal, maxValDup); + + // Usage 1: compare with the max value of the integer type + GenTree* compMask = comp->gtNewSimdCmpOpNode(GT_GE, TYP_SIMD16, maskNaN, maxVal, fieldType, 16); + BlockRange().InsertAfter(maxValDup, compMask); + + // We will be using the maskNaN value twice + LIR::Use maskNaNUse(BlockRange(), &(compMask->AsHWIntrinsic()->Op(1)), compMask); + ReplaceWithLclVar(maskNaNUse); + maskNaN = compMask->AsHWIntrinsic()->Op(1); + GenTree* maskNaNClone = comp->gtClone(maskNaN); + LowerNode(compMask); + BlockRange().InsertAfter(maskNaN, maskNaNClone); + + // Convert to scalar for the conversion + GenTree* maskNaNCloneScalar = + comp->gtNewSimdHWIntrinsicNode(srcType, maskNaNClone, NI_Vector128_ToScalar, fieldType, 16); + BlockRange().InsertAfter(compMask, maskNaNCloneScalar); + LowerNode(maskNaNCloneScalar); + + // Cast it to the destination type + GenTreeCast* newCast = comp->gtNewCastNode(dstType, maskNaNCloneScalar, false, dstType); + BlockRange().InsertAfter(maskNaNCloneScalar, newCast); + GenTree* newTree = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, newCast, destFieldType, 16); + BlockRange().InsertAfter(newCast, newTree); + LowerNode(newTree); + + // Usage 2: use the compared mask to blend between the converted value and the max value + GenTree* cndSelect = comp->gtNewSimdCndSelNode(TYP_SIMD16, compMask, maxValDup, newTree, destFieldType, 16); + BlockRange().InsertAfter(newTree, cndSelect); + LowerNode(cndSelect); + + castOutput = comp->gtNewSimdHWIntrinsicNode(dstType, cndSelect, NI_Vector128_ToScalar, destFieldType, 16); + BlockRange().InsertAfter(cndSelect, castOutput); + LowerNode(castOutput); + } + else + { + // The remaining case not handled above should be the conversion + // to TYP_UINT in the case where SSE41 is supported. + // We should have converted the float -> uint conversion to + // float -> double -> uint during morph.
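+ /* Scalar sketch (illustrative only, not part of the patch) of what this SSE41 block + builds out of vector masks; it is equivalent to the saturating JIT_Dbl2UInt/RhpDbl2UInt + helpers elsewhere in this change: + result = (val != val || val < 0) ? 0 // NaN and negative inputs go to 0 + : (val >= 4294967295.0) ? UINT32_MAX // clamp at UINT32_MAX + : (uint32_t)val; // in range: truncate toward zero + */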
+ assert((dstType == TYP_UINT) && comp->compIsaSupportedDebugOnly(InstructionSet_SSE41) && + (srcType != TYP_FLOAT)); + + ssize_t actualMaxVal = UINT32_MAX; + CorInfoType destFieldType = CORINFO_TYPE_LONG; + + GenTree* castOpClone1 = comp->gtClone(castOp); + GenTree* castOpClone2 = comp->gtClone(castOp); + GenTree* castOpClone3 = comp->gtClone(castOp); + BlockRange().InsertAfter(castOp, castOpClone1); + BlockRange().InsertAfter(castOpClone1, castOpClone2); + BlockRange().InsertAfter(castOpClone2, castOpClone3); + + GenTree* oper = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOp, fieldType, 16); + BlockRange().InsertAfter(castOpClone3, oper); + LowerNode(oper); + GenTree* op1Clone1 = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOpClone1, fieldType, 16); + BlockRange().InsertAfter(oper, op1Clone1); + LowerNode(op1Clone1); + GenTree* op1Clone2 = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOpClone2, fieldType, 16); + BlockRange().InsertAfter(op1Clone1, op1Clone2); + LowerNode(op1Clone2); + GenTree* op1Clone3 = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOpClone3, fieldType, 16); + BlockRange().InsertAfter(op1Clone2, op1Clone3); + LowerNode(op1Clone3); + + // Get the min/max value vectors + GenTree* minVal = comp->gtNewDconNodeD(static_cast<double>(0)); + minVal = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, minVal, fieldType, 16); + BlockRange().InsertAfter(op1Clone3, minVal); + GenTree* maxVal = comp->gtNewDconNodeD(static_cast<double>(actualMaxVal)); + maxVal = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxVal, fieldType, 16); + BlockRange().InsertAfter(minVal, maxVal); + + // Check for NaN: a NaN input fails the equality compare with itself + GenTree* mask1 = comp->gtNewSimdCmpOpNode(GT_EQ, TYP_SIMD16, oper, op1Clone1, fieldType, 16); + BlockRange().InsertAfter(maxVal, mask1); + LowerNode(mask1); + + // Check for negative values + GenTree* mask2 = comp->gtNewSimdCmpOpNode(GT_GE, TYP_SIMD16, op1Clone2, minVal, fieldType, 16); + BlockRange().InsertAfter(mask1, mask2); + LowerNode(mask2); + + // AND the two masks + GenTree* mask12 = comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, mask1, mask2, fieldType, 16); + BlockRange().InsertAfter(mask2, mask12); + LowerNode(mask12); + + // inp = inp & mask (zeroes out NaN and negative lanes) + GenTree* saturatedVal = comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, op1Clone3, mask12, fieldType, 16); + BlockRange().InsertAfter(mask12, saturatedVal); + LowerNode(saturatedVal); + + // Compare with the max value of uint + GenTree* mask3 = comp->gtNewSimdCmpOpNode(GT_GE, TYP_SIMD16, saturatedVal, maxVal, fieldType, 16); + BlockRange().InsertAfter(saturatedVal, mask3); + + // Convert both operands of mask3 to local variables for reuse + LIR::Use saturatedValUse(BlockRange(), &(mask3->AsHWIntrinsic()->Op(1)), mask3); + ReplaceWithLclVar(saturatedValUse); + saturatedVal = mask3->AsHWIntrinsic()->Op(1); + GenTree* saturatedValDup = comp->gtClone(saturatedVal); + BlockRange().InsertAfter(saturatedVal, saturatedValDup); + + LIR::Use maxValUse(BlockRange(), &(mask3->AsHWIntrinsic()->Op(2)), mask3); + ReplaceWithLclVar(maxValUse); + maxVal = mask3->AsHWIntrinsic()->Op(2); + GenTree* maxValDup = comp->gtClone(maxVal); + LowerNode(mask3); + BlockRange().InsertAfter(maxVal, maxValDup); + + // Select based on mask3 + GenTree* castOpVal = + comp->gtNewSimdCndSelNode(TYP_SIMD16, mask3, maxValDup, saturatedValDup, fieldType, 16); + BlockRange().InsertAfter(mask3, castOpVal); + LowerNode(castOpVal); + + // Convert to scalar + GenTree* castOpValScalar = + comp->gtNewSimdHWIntrinsicNode(srcType, castOpVal, NI_Vector128_ToScalar, fieldType, 16); + BlockRange().InsertAfter(castOpVal,
castOpValScalar); + LowerNode(castOpValScalar); + + // Cast it to the destination type + castOutput = comp->gtNewCastNode(TYP_INT, castOpValScalar, false, dstType); + BlockRange().InsertAfter(castOpValScalar, castOutput); + } + assert(castOutput != nullptr); + LIR::Use use; + if (BlockRange().TryGetUse(tree, &use)) + { + use.ReplaceWith(castOutput); + } + else + { + castOutput->SetUnusedValue(); + } + BlockRange().Remove(tree); + return castOutput->gtNext; + } +#endif // TARGET_AMD64 + // Case of src is a small type and dst is a floating point type. if (varTypeIsSmall(srcType) && varTypeIsFloating(castToType)) { @@ -880,6 +1194,7 @@ void Lowering::LowerCast(GenTree* tree) // Now determine if we have operands that should be contained. ContainCheckCast(tree->AsCast()); + return nullptr; } #ifdef FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 4b301696a1eeb..e140c39550533 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -335,14 +335,20 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) && tree->gtOverflow() #elif defined(TARGET_AMD64) // Amd64: src = float, dst = uint64 or overflow conversion. - // This goes through helper and hence src needs to be converted to double. - && (tree->gtOverflow() || (dstType == TYP_ULONG)) + // The src needs to be converted to double except for the following cases: + // dstType = int/uint/ulong for AVX512F + // dstType = int for SSE41 + // For pre-SSE41, the src is always converted to TYP_DOUBLE + // and goes through helpers. + && (tree->gtOverflow() || (dstType == TYP_LONG) || + !(compOpportunisticallyDependsOn(InstructionSet_AVX512F) || + (dstType == TYP_INT && compOpportunisticallyDependsOn(InstructionSet_SSE41)))) #elif defined(TARGET_ARM) // Arm: src = float, dst = int64/uint64 or overflow conversion. && (tree->gtOverflow() || varTypeIsLong(dstType)) #else // x86: src = float, dst = uint32/int64/uint64 or overflow conversion. - && (tree->gtOverflow() || varTypeIsLong(dstType) || (dstType == TYP_UINT)) + && (tree->gtOverflow() || varTypeIsIntegral(dstType)) #endif ) { @@ -368,25 +374,39 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) #if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) return nullptr; #else +#if defined(TARGET_AMD64) + // The following casts are handled during lowering: + // float -> ulong/uint/int for AVX512F + // double -> ulong/uint/long/int for AVX512F + // float -> int for SSE41 + // double -> int/uint/long for SSE41 + // For all other conversions, we use helper functions.
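+ // For example, on an AVX512F machine a double -> ulong cast stays a GT_CAST and is expanded + // into saturating code by Lowering::LowerCast, while on a pre-SSE41 machine the same cast is + // morphed into a call to the CORINFO_HELP_DBL2ULNG helper by the switch below.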
+ if (compOpportunisticallyDependsOn(InstructionSet_AVX512F) || + ((dstType != TYP_ULONG) && compOpportunisticallyDependsOn(InstructionSet_SSE41))) + { + if (tree->CastOp() != oper) + { + tree->CastOp() = oper; + } + return nullptr; + } +#endif // TARGET_AMD64 switch (dstType) { case TYP_INT: +#ifdef TARGET_XARCH + return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2INT, oper); +#endif // TARGET_XARCH return nullptr; case TYP_UINT: -#if defined(TARGET_ARM) || defined(TARGET_AMD64) +#if defined(TARGET_ARM) return nullptr; -#else // TARGET_X86 +#endif return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2UINT, oper); -#endif // TARGET_X86 case TYP_LONG: -#ifdef TARGET_AMD64 - // SSE2 has instructions to convert a float/double directly to a long - return nullptr; -#else // !TARGET_AMD64 return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2LNG, oper); -#endif // !TARGET_AMD64 case TYP_ULONG: return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2ULNG, oper); diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index c22ebc7b63544..9ffd3b7b011d5 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -513,23 +513,44 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, switch (intrinsic) { #if defined(TARGET_XARCH) + case NI_VectorT_ConvertToDouble: + { + if (IsBaselineVector512IsaSupportedOpportunistically()) + { + break; + } + return nullptr; + } + case NI_VectorT_ConvertToInt64: case NI_VectorT_ConvertToUInt32: case NI_VectorT_ConvertToUInt64: { - // TODO-XARCH-CQ: These intrinsics should be accelerated + if (IsBaselineVector512IsaSupportedOpportunistically()) + { + break; + } + return nullptr; + } + + case NI_VectorT_ConvertToInt32: + { + if (compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + break; + } return nullptr; } case NI_VectorT_ConvertToSingle: { - if (simdBaseType == TYP_UINT) + if ((simdBaseType == TYP_INT) || + (simdBaseType == TYP_UINT && IsBaselineVector512IsaSupportedOpportunistically())) { - // TODO-XARCH-CQ: These intrinsics should be accelerated - return nullptr; + break; } - break; + return nullptr; } #endif // TARGET_XARCH @@ -1154,50 +1175,95 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, } #if defined(TARGET_XARCH) + + case NI_VectorT_ConvertToInt64: + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_DOUBLE); + return gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_LONG, simdBaseJitType, simdSize); + } + + case NI_VectorT_ConvertToUInt32: + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_FLOAT); + return gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_UINT, simdBaseJitType, simdSize); + } + + case NI_VectorT_ConvertToUInt64: + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_DOUBLE); + return gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_ULONG, simdBaseJitType, simdSize); + } + case NI_VectorT_ConvertToInt32: { assert(simdBaseType == TYP_FLOAT); - NamedIntrinsic convert; + return gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_INT, simdBaseJitType, simdSize); + } - switch (simdSize) + case NI_VectorT_ConvertToDouble: + { + assert(sig->numArgs == 1); + assert(varTypeIsLong(simdBaseType)); + NamedIntrinsic intrinsic = NI_Illegal; + if (simdSize == 64) { - case 16: - convert = NI_SSE2_ConvertToVector128Int32WithTruncation; - break; - case 32: - convert = NI_AVX_ConvertToVector256Int32WithTruncation; - break; - case 64: - convert = NI_AVX512F_ConvertToVector512Int32WithTruncation; - break; - default: - unreached(); + 
intrinsic = NI_AVX512DQ_ConvertToVector512Double; } - - return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize); + else if (simdSize == 32) + { + intrinsic = NI_AVX512DQ_VL_ConvertToVector256Double; + } + else + { + assert(simdSize == 16); + intrinsic = NI_AVX512DQ_VL_ConvertToVector128Double; + } + return gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); } case NI_VectorT_ConvertToSingle: { - assert(simdBaseType == TYP_INT); - NamedIntrinsic convert; - - switch (simdSize) + assert(varTypeIsInt(simdBaseType)); + NamedIntrinsic intrinsic = NI_Illegal; + if (simdBaseType == TYP_INT) { - case 16: - convert = NI_SSE2_ConvertToVector128Single; - break; - case 32: - convert = NI_AVX_ConvertToVector256Single; - break; - case 64: - convert = NI_AVX512F_ConvertToVector512Single; - break; - default: - unreached(); + switch (simdSize) + { + case 16: + intrinsic = NI_SSE2_ConvertToVector128Single; + break; + case 32: + intrinsic = NI_AVX_ConvertToVector256Single; + break; + case 64: + intrinsic = NI_AVX512F_ConvertToVector512Single; + break; + default: + unreached(); + } } - - return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize); + else if (simdBaseType == TYP_UINT) + { + switch (simdSize) + { + case 16: + intrinsic = NI_AVX512F_VL_ConvertToVector128Single; + break; + case 32: + intrinsic = NI_AVX512F_VL_ConvertToVector256Single; + break; + case 64: + intrinsic = NI_AVX512F_ConvertToVector512Single; + break; + default: + unreached(); + } + } + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); } #elif defined(TARGET_ARM64) case NI_VectorT_ConvertToDouble: diff --git a/src/coreclr/nativeaot/Runtime/MathHelpers.cpp b/src/coreclr/nativeaot/Runtime/MathHelpers.cpp index b5c4a251c82ba..73a5aa924794d 100644 --- a/src/coreclr/nativeaot/Runtime/MathHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/MathHelpers.cpp @@ -11,6 +11,10 @@ FCIMPL1_D(uint64_t, RhpDbl2ULng, double val) { +#if defined(HOST_X86) || defined(HOST_AMD64) + const double uint64_max_plus_1 = 4294967296.0 * 4294967296.0; + return (val > 0) ? ((val >= uint64_max_plus_1) ? UINT64_MAX : (uint64_t)val) : 0; +#else const double two63 = 2147483648.0 * 4294967296.0; uint64_t ret; if (val < two63) @@ -23,6 +27,42 @@ FCIMPL1_D(uint64_t, RhpDbl2ULng, double val) ret = (int64_t)(val - two63) + I64(0x8000000000000000); } return ret; +#endif //HOST_X86 || HOST_AMD64 +} +FCIMPLEND + +FCIMPL1_D(int64_t, RhpDbl2Lng, double val) +{ +#if defined(HOST_X86) || defined(HOST_AMD64) + const double int64_min = -2147483648.0 * 4294967296.0; + const double int64_max = 2147483648.0 * 4294967296.0; + return (val != val) ? 0 : (val <= int64_min) ? INT64_MIN : (val >= int64_max) ? INT64_MAX : (int64_t)val; +#else + return (int64_t)val; +#endif //HOST_X86 || HOST_AMD64 +} +FCIMPLEND + +FCIMPL1_D(int32_t, RhpDbl2Int, double val) +{ +#if defined(HOST_X86) || defined(HOST_AMD64) + const double int32_min = -2147483648.0; + const double int32_max_plus_1 = 2147483648.0; + return (val != val) ? 0 : (val <= int32_min) ? INT32_MIN : (val >= int32_max_plus_1) ? INT32_MAX : (int32_t)val; +#else + return (int32_t)val; +#endif //HOST_X86 || HOST_AMD64 +} +FCIMPLEND + +FCIMPL1_D(uint32_t, RhpDbl2UInt, double val) +{ +#if defined(HOST_X86) || defined(HOST_AMD64) + const double uint_max = 4294967295.0; + return (val > 0) ? ((val >= uint_max) ? 
UINT32_MAX : (uint32_t)val) : 0; +#else + return (uint32_t)val; +#endif //HOST_X86 || HOST_AMD64 } FCIMPLEND @@ -51,24 +91,6 @@ EXTERN_C uint64_t QCALLTYPE RhpULMod(uint64_t i, uint64_t j) return i % j; } -FCIMPL1_D(int64_t, RhpDbl2Lng, double val) -{ - return (int64_t)val; -} -FCIMPLEND - -FCIMPL1_D(int32_t, RhpDbl2Int, double val) -{ - return (int32_t)val; -} -FCIMPLEND - -FCIMPL1_D(uint32_t, RhpDbl2UInt, double val) -{ - return (uint32_t)val; -} -FCIMPLEND - FCIMPL1_L(double, RhpLng2Dbl, int64_t val) { return (double)val; @@ -336,4 +358,4 @@ FCIMPL2_FI(float, modff, float x, float* intptr) return std::modff(x, intptr); FCIMPLEND -#endif +#endif \ No newline at end of file diff --git a/src/coreclr/vm/i386/jithelp.S b/src/coreclr/vm/i386/jithelp.S index c1da6f4dcb801..d027525202781 100644 --- a/src/coreclr/vm/i386/jithelp.S +++ b/src/coreclr/vm/i386/jithelp.S @@ -551,87 +551,6 @@ LOCAL_LABEL(LRszMORE32): ret LEAF_END JIT_LRsz, _TEXT -// *********************************************************************/ -// JIT_Dbl2LngP4x87 -// -// Purpose: -// converts a double to a long truncating toward zero (C semantics) -// -// uses stdcall calling conventions -// -// This code is faster on a P4 than the Dbl2Lng code above, but is -// slower on a PIII. Hence we choose this code when on a P4 or above. -// -LEAF_ENTRY JIT_Dbl2LngP4x87, _TEXT - // get some local space - sub esp, 8 - - #define arg1 [esp + 0x0C] - fld QWORD PTR arg1 // fetch arg - fnstcw WORD PTR arg1 // store FPCW - movzx eax, WORD PTR arg1 // zero extend - wide - or ah, 0x0C // turn on OE and DE flags - mov DWORD PTR [esp], eax // store new FPCW bits - fldcw WORD PTR [esp] // reload FPCW with new bits - fistp QWORD PTR [esp] // convert - - // reload FP result - mov eax, DWORD PTR [esp] - mov edx, DWORD PTR [esp + 4] - - // reload original FPCW value - fldcw WORD PTR arg1 - #undef arg1 - - // restore stack - add esp, 8 - - ret -LEAF_END JIT_Dbl2LngP4x87, _TEXT - -// *********************************************************************/ -// JIT_Dbl2LngSSE3 -// -// Purpose: -// converts a double to a long truncating toward zero (C semantics) -// -// uses stdcall calling conventions -// -// This code is faster than the above P4 x87 code for Intel processors -// equal or later than Core2 and Atom that have SSE3 support -// -LEAF_ENTRY JIT_Dbl2LngSSE3, _TEXT - // get some local space - sub esp, 8 - - fld QWORD PTR [esp + 0x0C] // fetch arg - fisttp QWORD PTR [esp] // convert - mov eax, DWORD PTR [esp] // reload FP result - mov edx, DWORD PTR [esp + 4] - - // restore stack - add esp, 8 - - ret -LEAF_END JIT_Dbl2LngSSE3, _TEXT - -// *********************************************************************/ -// JIT_Dbl2IntSSE2 -// -// Purpose: -// converts a double to a long truncating toward zero (C semantics) -// -// uses stdcall calling conventions -// -// This code is even faster than the P4 x87 code for Dbl2LongP4x87, -// but only returns a 32 bit value (only good for int). 
-// -LEAF_ENTRY JIT_Dbl2IntSSE2, _TEXT - movsd xmm0, [esp + 4] - cvttsd2si eax, xmm0 - ret -LEAF_END JIT_Dbl2IntSSE2, _TEXT - // *********************************************************************/ // JIT_StackProbe // diff --git a/src/coreclr/vm/i386/jithelp.asm b/src/coreclr/vm/i386/jithelp.asm index 0faf7cde0e0b2..c2011190abc3f 100644 --- a/src/coreclr/vm/i386/jithelp.asm +++ b/src/coreclr/vm/i386/jithelp.asm @@ -36,11 +36,6 @@ JIT_LLsh TEXTEQU <_JIT_LLsh@0> JIT_LRsh TEXTEQU <_JIT_LRsh@0> JIT_LRsz TEXTEQU <_JIT_LRsz@0> JIT_LMul TEXTEQU <@JIT_LMul@16> -JIT_Dbl2LngOvf TEXTEQU <@JIT_Dbl2LngOvf@8> -JIT_Dbl2Lng TEXTEQU <@JIT_Dbl2Lng@8> -JIT_Dbl2IntSSE2 TEXTEQU <@JIT_Dbl2IntSSE2@8> -JIT_Dbl2LngP4x87 TEXTEQU <@JIT_Dbl2LngP4x87@8> -JIT_Dbl2LngSSE3 TEXTEQU <@JIT_Dbl2LngSSE3@8> JIT_InternalThrowFromHelper TEXTEQU <@JIT_InternalThrowFromHelper@4> JIT_WriteBarrierReg_PreGrow TEXTEQU <_JIT_WriteBarrierReg_PreGrow@0> JIT_WriteBarrierReg_PostGrow TEXTEQU <_JIT_WriteBarrierReg_PostGrow@0> @@ -637,182 +632,6 @@ LMul_hard: JIT_LMul ENDP -;*********************************************************************/ -; JIT_Dbl2LngOvf - -;Purpose: -; converts a double to a long truncating toward zero (C semantics) -; with check for overflow -; -; uses stdcall calling conventions -; -PUBLIC JIT_Dbl2LngOvf -JIT_Dbl2LngOvf PROC - fnclex - fld qword ptr [esp+4] - push ecx - push ecx - fstp qword ptr [esp] - call JIT_Dbl2Lng - mov ecx,eax - fnstsw ax - test ax,01h - jnz Dbl2LngOvf_throw - mov eax,ecx - ret 8 - -Dbl2LngOvf_throw: - mov ECX, CORINFO_OverflowException_ASM - call JIT_InternalThrowFromHelper - ret 8 -JIT_Dbl2LngOvf ENDP - -;*********************************************************************/ -; JIT_Dbl2Lng - -;Purpose: -; converts a double to a long truncating toward zero (C semantics) -; -; uses stdcall calling conventions -; -; note that changing the rounding mode is very expensive. This -; routine basiclly does the truncation semantics without changing -; the rounding mode, resulting in a win. 
-; -PUBLIC JIT_Dbl2Lng -JIT_Dbl2Lng PROC - fld qword ptr[ESP+4] ; fetch arg - lea ecx,[esp-8] - sub esp,16 ; allocate frame - and ecx,-8 ; align pointer on boundary of 8 - fld st(0) ; duplciate top of stack - fistp qword ptr[ecx] ; leave arg on stack, also save in temp - fild qword ptr[ecx] ; arg, round(arg) now on stack - mov edx,[ecx+4] ; high dword of integer - mov eax,[ecx] ; low dword of integer - test eax,eax - je integer_QNaN_or_zero - -arg_is_not_integer_QNaN: - fsubp st(1),st ; TOS=d-round(d), - ; { st(1)=st(1)-st & pop ST } - test edx,edx ; what's sign of integer - jns positive - ; number is negative - ; dead cycle - ; dead cycle - fstp dword ptr[ecx] ; result of subtraction - mov ecx,[ecx] ; dword of difference(single precision) - add esp,16 - xor ecx,80000000h - add ecx,7fffffffh ; if difference>0 then increment integer - adc eax,0 ; inc eax (add CARRY flag) - adc edx,0 ; propagate carry flag to upper bits - ret 8 - -positive: - fstp dword ptr[ecx] ;17-18 ; result of subtraction - mov ecx,[ecx] ; dword of difference (single precision) - add esp,16 - add ecx,7fffffffh ; if difference<0 then decrement integer - sbb eax,0 ; dec eax (subtract CARRY flag) - sbb edx,0 ; propagate carry flag to upper bits - ret 8 - -integer_QNaN_or_zero: - test edx,7fffffffh - jnz arg_is_not_integer_QNaN - fstp st(0) ;; pop round(arg) - fstp st(0) ;; arg - add esp,16 - ret 8 -JIT_Dbl2Lng ENDP - -;*********************************************************************/ -; JIT_Dbl2LngP4x87 - -;Purpose: -; converts a double to a long truncating toward zero (C semantics) -; -; uses stdcall calling conventions -; -; This code is faster on a P4 than the Dbl2Lng code above, but is -; slower on a PIII. Hence we choose this code when on a P4 or above. -; -PUBLIC JIT_Dbl2LngP4x87 -JIT_Dbl2LngP4x87 PROC -arg1 equ <[esp+0Ch]> - - sub esp, 8 ; get some local space - - fld qword ptr arg1 ; fetch arg - fnstcw word ptr arg1 ; store FPCW - movzx eax, word ptr arg1 ; zero extend - wide - or ah, 0Ch ; turn on OE and DE flags - mov dword ptr [esp], eax ; store new FPCW bits - fldcw word ptr [esp] ; reload FPCW with new bits - fistp qword ptr [esp] ; convert - mov eax, dword ptr [esp] ; reload FP result - mov edx, dword ptr [esp+4] ; - fldcw word ptr arg1 ; reload original FPCW value - - add esp, 8 ; restore stack - - ret 8 -JIT_Dbl2LngP4x87 ENDP - -;*********************************************************************/ -; JIT_Dbl2LngSSE3 - -;Purpose: -; converts a double to a long truncating toward zero (C semantics) -; -; uses stdcall calling conventions -; -; This code is faster than the above P4 x87 code for Intel processors -; equal or later than Core2 and Atom that have SSE3 support -; -.686P -.XMM -PUBLIC JIT_Dbl2LngSSE3 -JIT_Dbl2LngSSE3 PROC -arg1 equ <[esp+0Ch]> - - sub esp, 8 ; get some local space - - fld qword ptr arg1 ; fetch arg - fisttp qword ptr [esp] ; convert - mov eax, dword ptr [esp] ; reload FP result - mov edx, dword ptr [esp+4] - - add esp, 8 ; restore stack - - ret 8 -JIT_Dbl2LngSSE3 ENDP -.586 - -;*********************************************************************/ -; JIT_Dbl2IntSSE2 - -;Purpose: -; converts a double to a long truncating toward zero (C semantics) -; -; uses stdcall calling conventions -; -; This code is even faster than the P4 x87 code for Dbl2LongP4x87, -; but only returns a 32 bit value (only good for int). 
-; -.686P -.XMM -PUBLIC JIT_Dbl2IntSSE2 -JIT_Dbl2IntSSE2 PROC - $movsd xmm0, [esp+4] - cvttsd2si eax, xmm0 - ret 8 -JIT_Dbl2IntSSE2 ENDP -.586 - - ;*********************************************************************/ ; This is the small write barrier thunk we use when we know the ; ephemeral generation is higher in memory than older generations. @@ -1214,39 +1033,6 @@ JIT_TailCallVSDLeave: JIT_TailCall ENDP - -;------------------------------------------------------------------------------ - -; HCIMPL2_VV(float, JIT_FltRem, float dividend, float divisor) -@JIT_FltRem@8 proc public - fld dword ptr [esp+4] ; divisor - fld dword ptr [esp+8] ; dividend -fremloop: - fprem - fstsw ax - fwait - sahf - jp fremloop ; Continue while the FPU status bit C2 is set - fxch ; swap, so divisor is on top and result is in st(1) - fstp ST(0) ; Pop the divisor from the FP stack - retn 8 ; Return value is in st(0) -@JIT_FltRem@8 endp - -; HCIMPL2_VV(float, JIT_DblRem, float dividend, float divisor) -@JIT_DblRem@16 proc public - fld qword ptr [esp+4] ; divisor - fld qword ptr [esp+12] ; dividend -fremloopd: - fprem - fstsw ax - fwait - sahf - jp fremloopd ; Continue while the FPU status bit C2 is set - fxch ; swap, so divisor is on top and result is in st(1) - fstp ST(0) ; Pop the divisor from the FP stack - retn 16 ; Return value is in st(0) -@JIT_DblRem@16 endp - ;------------------------------------------------------------------------------ ; PatchedCodeStart and PatchedCodeEnd are used to determine bounds of patched code. diff --git a/src/coreclr/vm/i386/jitinterfacex86.cpp b/src/coreclr/vm/i386/jitinterfacex86.cpp index 08360e9ff0c06..bfc7c0abc674b 100644 --- a/src/coreclr/vm/i386/jitinterfacex86.cpp +++ b/src/coreclr/vm/i386/jitinterfacex86.cpp @@ -96,26 +96,6 @@ extern "C" void STDCALL WriteBarrierAssert(BYTE* ptr, Object* obj) #endif // _DEBUG -#ifndef TARGET_UNIX - -HCIMPL1_V(INT32, JIT_Dbl2IntOvf, double val) -{ - FCALL_CONTRACT; - - INT64 ret = HCCALL1_V(JIT_Dbl2Lng, val); - - if (ret != (INT32) ret) - goto THROW; - - return (INT32) ret; - -THROW: - FCThrow(kOverflowException); -} -HCIMPLEND -#endif // TARGET_UNIX - - FCDECL1(Object*, JIT_New, CORINFO_CLASS_HANDLE typeHnd_); @@ -961,32 +941,6 @@ void InitJITHelpers1() JIT_TrialAlloc::Flags flags = GCHeapUtilities::UseThreadAllocationContexts() ? JIT_TrialAlloc::MP_ALLOCATOR : JIT_TrialAlloc::NORMAL; - // Get CPU features and check for SSE2 support. - // This code should eventually probably be moved into codeman.cpp, - // where we set the cpu feature flags for the JIT based on CPU type and features. - int cpuFeatures[4]; - __cpuid(cpuFeatures, 1); - - DWORD dwCPUFeaturesECX = cpuFeatures[2]; - DWORD dwCPUFeaturesEDX = cpuFeatures[3]; - - // If bit 26 (SSE2) is set, then we can use the SSE2 flavors - // and faster x87 implementation for the P4 of Dbl2Lng. 
- if (dwCPUFeaturesEDX & (1<<26)) - { - SetJitHelperFunction(CORINFO_HELP_DBL2INT, JIT_Dbl2IntSSE2); - if (dwCPUFeaturesECX & 1) // check SSE3 - { - SetJitHelperFunction(CORINFO_HELP_DBL2UINT, JIT_Dbl2LngSSE3); - SetJitHelperFunction(CORINFO_HELP_DBL2LNG, JIT_Dbl2LngSSE3); - } - else - { - SetJitHelperFunction(CORINFO_HELP_DBL2UINT, JIT_Dbl2LngP4x87); // SSE2 only for signed - SetJitHelperFunction(CORINFO_HELP_DBL2LNG, JIT_Dbl2LngP4x87); - } - } - if (!(TrackAllocationsEnabled() || LoggingOn(LF_GCALLOC, LL_INFO10) #ifdef _DEBUG diff --git a/src/coreclr/vm/jithelpers.cpp b/src/coreclr/vm/jithelpers.cpp index 18629a0da2414..4614a89f403c4 100644 --- a/src/coreclr/vm/jithelpers.cpp +++ b/src/coreclr/vm/jithelpers.cpp @@ -514,46 +514,60 @@ HCIMPL1_V(double, JIT_Lng2Dbl, INT64 val) HCIMPLEND /*********************************************************************/ -// Call fast Dbl2Lng conversion - used by functions below -FORCEINLINE INT64 FastDbl2Lng(double val) +HCIMPL1_V(INT64, JIT_Dbl2Lng, double val) { -#ifdef TARGET_X86 FCALL_CONTRACT; - return HCCALL1_V(JIT_Dbl2Lng, val); + +#if defined(TARGET_X86) || defined(TARGET_AMD64) + const double int64_min = -2147483648.0 * 4294967296.0; + const double int64_max = 2147483648.0 * 4294967296.0; + return (val != val) ? 0 : (val <= int64_min) ? INT64_MIN : (val >= int64_max) ? INT64_MAX : (INT64)val; #else - FCALL_CONTRACT; - return((__int64) val); -#endif + return((INT64)val); +#endif // TARGET_X86 || TARGET_AMD64 } +HCIMPLEND /*********************************************************************/ HCIMPL1_V(UINT32, JIT_Dbl2UIntOvf, double val) { FCALL_CONTRACT; - // Note that this expression also works properly for val = NaN case + // Note that this expression also works properly for val = NaN case if (val > -1.0 && val < 4294967296.0) - return((UINT32)FastDbl2Lng(val)); + return((UINT32)val); FCThrow(kOverflowException); } HCIMPLEND /*********************************************************************/ -HCIMPL1_V(UINT64, JIT_Dbl2ULng, double val) +HCIMPL1_V(int, JIT_Dbl2IntOvf, double val) +{ + FCALL_CONTRACT; + + const double two31 = 2147483648.0; + // Note that this expression also works properly for val = NaN case + if (val > -two31 - 1 && val < two31) + return((INT32)val); + + FCThrow(kOverflowException); +} +HCIMPLEND + +/*********************************************************************/ +HCIMPL1_V(INT64, JIT_Dbl2LngOvf, double val) { FCALL_CONTRACT; const double two63 = 2147483648.0 * 4294967296.0; - UINT64 ret; - if (val < two63) { - ret = FastDbl2Lng(val); - } - else { - // subtract 0x8000000000000000, do the convert then add it back again - ret = FastDbl2Lng(val - two63) + I64(0x8000000000000000); - } - return ret; + + // Note that this expression also works properly for val = NaN case + // We need to compare with the very next double to two63. 0x402 is epsilon to get us there. 
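+ // (Near 2^63, the gap between adjacent doubles is 2^11 = 2048, so -two63 - 0x402 rounds to + // -(2^63 + 2048), the first representable double below -2^63; the open '>' compare therefore + // still admits -2^63 itself, i.e. INT64_MIN, while rejecting the first double that would overflow.)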
+ if (val > -two63 - 0x402 && val < two63) + return((INT64)val); + + FCThrow(kOverflowException); } HCIMPLEND @@ -563,69 +577,69 @@ HCIMPL1_V(UINT64, JIT_Dbl2ULngOvf, double val) FCALL_CONTRACT; const double two64 = 4294967296.0 * 4294967296.0; - // Note that this expression also works properly for val = NaN case - if (val > -1.0 && val < two64) { - const double two63 = 2147483648.0 * 4294967296.0; - UINT64 ret; - if (val < two63) { - ret = FastDbl2Lng(val); - } - else { - // subtract 0x8000000000000000, do the convert then add it back again - ret = FastDbl2Lng(val - two63) + I64(0x8000000000000000); - } -#ifdef _DEBUG - // since no overflow can occur, the value always has to be within 1 - double roundTripVal = HCCALL1_V(JIT_ULng2Dbl, ret); - _ASSERTE(val - 1.0 <= roundTripVal && roundTripVal <= val + 1.0); -#endif // _DEBUG - return ret; - } + // Note that this expression also works properly for val = NaN case + if (val > -1.0 && val < two64) + return (UINT64)val; FCThrow(kOverflowException); } HCIMPLEND - -#if !defined(TARGET_X86) || defined(TARGET_UNIX) - -HCIMPL1_V(INT64, JIT_Dbl2Lng, double val) +HCIMPL1_V(UINT32, JIT_Dbl2UInt, double val) { FCALL_CONTRACT; - return((INT64)val); +#if defined(TARGET_X86) || defined(TARGET_AMD64) + const double uint_max = 4294967295.0; + // Note that this expression also works properly for val = NaN case + return (val >= 0) ? ((val >= uint_max) ? UINT32_MAX : (UINT32)val) : 0; +#else + return((UINT32)val); +#endif //TARGET_X86 || TARGET_AMD64 } HCIMPLEND -HCIMPL1_V(int, JIT_Dbl2IntOvf, double val) +/*********************************************************************/ +HCIMPL1_V(INT32, JIT_Dbl2Int, double val) { FCALL_CONTRACT; - const double two31 = 2147483648.0; - - // Note that this expression also works properly for val = NaN case - if (val > -two31 - 1 && val < two31) - return((INT32)val); - - FCThrow(kOverflowException); +#if defined(TARGET_X86) || defined(TARGET_AMD64) + const double int32_min = -2147483648.0; + const double int32_max_plus_1 = 2147483648.0; + return (val != val) ? 0 : (val <= int32_min) ? INT32_MIN : (val >= int32_max_plus_1) ? INT32_MAX : (INT32)val; +#else + return((INT32)val); +#endif // TARGET_X86 || TARGET_AMD64 } HCIMPLEND -HCIMPL1_V(INT64, JIT_Dbl2LngOvf, double val) +/*********************************************************************/ +HCIMPL1_V(UINT64, JIT_Dbl2ULng, double val) { FCALL_CONTRACT; - const double two63 = 2147483648.0 * 4294967296.0; - +#if defined(TARGET_X86) || defined(TARGET_AMD64) + const double uint64_max_plus_1 = 4294967296.0 * 4294967296.0; // Note that this expression also works properly for val = NaN case - // We need to compare with the very next double to two63. 0x402 is epsilon to get us there. - if (val > -two63 - 0x402 && val < two63) - return((INT64)val); + return (val >= 0) ? ((val >= uint64_max_plus_1) ? 
UINT64_MAX : (UINT64)val) : 0; - FCThrow(kOverflowException); +#else + const double two63 = 2147483648.0 * 4294967296.0; + UINT64 ret; + if (val < two63) { + ret = (INT64)(val); + } + else { + // subtract 0x8000000000000000, do the convert then add it back again + ret = (INT64)(val - two63) + I64(0x8000000000000000); + } + return ret; +#endif // TARGET_X86 || TARGET_AMD64 } HCIMPLEND +/*********************************************************************/ HCIMPL2_VV(float, JIT_FltRem, float dividend, float divisor) { FCALL_CONTRACT; @@ -634,6 +648,7 @@ HCIMPL2_VV(float, JIT_FltRem, float dividend, float divisor) } HCIMPLEND +/*********************************************************************/ HCIMPL2_VV(double, JIT_DblRem, double dividend, double divisor) { FCALL_CONTRACT; @@ -642,8 +657,6 @@ HCIMPL2_VV(double, JIT_DblRem, double dividend, double divisor) } HCIMPLEND -#endif // !TARGET_X86 || TARGET_UNIX - #include diff --git a/src/coreclr/vm/jitinterface.h b/src/coreclr/vm/jitinterface.h index 90c3fdcc9f4d0..7429352a47de6 100644 --- a/src/coreclr/vm/jitinterface.h +++ b/src/coreclr/vm/jitinterface.h @@ -325,17 +325,6 @@ EXTERN_C FCDECL2(Object*, JIT_NewArr1OBJ_MP_InlineGetThread, CORINFO_CLASS_HANDL EXTERN_C FCDECL2_VV(INT64, JIT_LMul, INT64 val1, INT64 val2); -EXTERN_C FCDECL1_V(INT64, JIT_Dbl2Lng, double val); -EXTERN_C FCDECL1_V(INT64, JIT_Dbl2IntSSE2, double val); -EXTERN_C FCDECL1_V(INT64, JIT_Dbl2LngP4x87, double val); -EXTERN_C FCDECL1_V(INT64, JIT_Dbl2LngSSE3, double val); -EXTERN_C FCDECL1_V(INT64, JIT_Dbl2LngOvf, double val); - -EXTERN_C FCDECL1_V(INT32, JIT_Dbl2IntOvf, double val); - -EXTERN_C FCDECL2_VV(float, JIT_FltRem, float dividend, float divisor); -EXTERN_C FCDECL2_VV(double, JIT_DblRem, double dividend, double divisor); - #ifndef HOST_64BIT #ifdef TARGET_X86 // JIThelp.asm diff --git a/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cpp b/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cpp index eaf7f2fa1a9da..a16932e8a78a4 100644 --- a/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cpp +++ b/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cpp @@ -17,7 +17,6 @@ typedef enum { CONVERT_SENTINEL, CONVERT_SATURATING, CONVERT_NATIVECOMPILERBEHAVIOR, - CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64, CONVERT_MANAGED_BACKWARD_COMPATIBLE_ARM32, } FPtoIntegerConversionType; @@ -30,7 +29,6 @@ extern "C" DLLEXPORT int32_t ConvertDoubleToInt32(double x, FPtoIntegerConversio switch (t) { case CONVERT_BACKWARD_COMPATIBLE: - case CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: case CONVERT_SENTINEL: return ((x != x) || (x < INT32_MIN) || (x > INT32_MAX)) ? INT32_MIN : (int32_t)x; @@ -53,7 +51,6 @@ extern "C" DLLEXPORT uint32_t ConvertDoubleToUInt32(double x, FPtoIntegerConvers const double int64_max_plus_1 = 0x1.p63; // 0x43e0000000000000 // (uint64_t)INT64_MAX + 1; switch (t) { - case CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: case CONVERT_BACKWARD_COMPATIBLE: return ((x != x) || (x < INT64_MIN) || (x >= int64_max_plus_1)) ? 0 : (uint32_t)(int64_t)x; @@ -95,7 +92,6 @@ extern "C" DLLEXPORT int64_t ConvertDoubleToInt64(double x, FPtoIntegerConversio const double int32_max_plus1 = ((double)INT32_MAX) + 1; switch (t) { - case CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: case CONVERT_BACKWARD_COMPATIBLE: case CONVERT_SENTINEL: return ((x != x) || (x < INT64_MIN) || (x >= int64_max_plus_1)) ? 
INT64_MIN : (int64_t)x; @@ -154,17 +150,6 @@ extern "C" DLLEXPORT uint64_t ConvertDoubleToUInt64(double x, FPtoIntegerConver } } - case CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: - if (x < int64_max_plus_1) - { - return (x < INT64_MIN) ? (uint64_t)INT64_MIN : (uint64_t)(int64_t)x; - } - else - { - x -= int64_max_plus_1; - x = trunc(x); - return (uint64_t)(((x != x) || (x >= int64_max_plus_1)) ? INT64_MIN : (int64_t)x) + (0x8000000000000000); - } case CONVERT_NATIVECOMPILERBEHAVIOR: // handled above, but add case to silence warning return 0; } diff --git a/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cs b/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cs index 5b78783c09e4c..e61078a0e0501 100644 --- a/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cs +++ b/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cs @@ -19,7 +19,6 @@ public enum FPtoIntegerConversionType CONVERT_SENTINEL, CONVERT_SATURATING, CONVERT_NATIVECOMPILERBEHAVIOR, - CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64, CONVERT_MANAGED_BACKWARD_COMPATIBLE_ARM32, } @@ -87,7 +86,6 @@ public static int ConvertDoubleToInt32(double x, FPtoIntegerConversionType t) switch (t) { - case FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: case FPtoIntegerConversionType.CONVERT_BACKWARD_COMPATIBLE: case FPtoIntegerConversionType.CONVERT_SENTINEL: return (Double.IsNaN(x) || (x < int.MinValue) || (x > int.MaxValue)) ? int.MinValue: (int) x; @@ -109,7 +107,6 @@ public static uint ConvertDoubleToUInt32(double x, FPtoIntegerConversionType t) switch (t) { - case FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: case FPtoIntegerConversionType.CONVERT_BACKWARD_COMPATIBLE: return (Double.IsNaN(x) || (x < long.MinValue) || (x >= llong_max_plus_1)) ? 0 : (uint)(long)x; @@ -136,7 +133,6 @@ public static long ConvertDoubleToInt64(double x, FPtoIntegerConversionType t) switch (t) { - case FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: case FPtoIntegerConversionType.CONVERT_BACKWARD_COMPATIBLE: case FPtoIntegerConversionType.CONVERT_SENTINEL: return (Double.IsNaN(x) || (x < long.MinValue) || (x >= llong_max_plus_1)) ? long.MinValue : (long)x; @@ -199,21 +195,6 @@ public static ulong ConvertDoubleToUInt64(double x, FPtoIntegerConversionType t) return (ulong)ConvertDoubleToInt64(x - two63, FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_ARM32) + (0x8000000000000000); } } - - case FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: - - if (x < two63) - { - return (x < long.MinValue) ? unchecked((ulong)long.MinValue) : (ulong)(long)x; - } - else - { - // (double)LLONG_MAX cannot be represented exactly as double - const double llong_max_plus_1 = (double)((ulong)long.MaxValue + 1); - x -= two63; - x = Math.Truncate(x); - return (ulong)((Double.IsNaN(x) || (x >= llong_max_plus_1)) ? long.MinValue : (long)x) + (0x8000000000000000); - } } return 0; @@ -263,7 +244,7 @@ public static Vector ConvertToVectorUInt64(Vector vFloat, FPtoInt public class Program { static int failures = 0; - static FPtoIntegerConversionType ManagedConversionRule = FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64; + static FPtoIntegerConversionType ManagedConversionRule = FPtoIntegerConversionType.CONVERT_SATURATING; static void TestBitValue(uint value, double? dblValNullable = null, FPtoIntegerConversionType? tValue = null) { @@ -280,7 +261,6 @@ static void TestBitValue(uint value, double?
dblValNullable = null, FPtoIntegerC if (!tValue.HasValue) { - TestBitValue(value, dblVal, FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64); TestBitValue(value, dblVal, FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_ARM32); TestBitValue(value, dblVal, FPtoIntegerConversionType.CONVERT_BACKWARD_COMPATIBLE); TestBitValue(value, dblVal, FPtoIntegerConversionType.CONVERT_SATURATING); @@ -377,15 +357,12 @@ public static int TestEntryPoint() { switch (RuntimeInformation.ProcessArchitecture) { - case Architecture.X86: - case Architecture.X64: - Program.ManagedConversionRule = FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64; - break; - case Architecture.Arm: Program.ManagedConversionRule = FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_ARM32; break; + case Architecture.X86: + case Architecture.X64: case Architecture.Arm64: Program.ManagedConversionRule = FPtoIntegerConversionType.CONVERT_SATURATING; break; diff --git a/src/tests/JIT/Regression/CLR-x86-JIT/V1-M12-Beta2/b28598/b28598.il b/src/tests/JIT/Regression/CLR-x86-JIT/V1-M12-Beta2/b28598/b28598.il index b8ccece0a1d6f..ff132dd868596 100644 --- a/src/tests/JIT/Regression/CLR-x86-JIT/V1-M12-Beta2/b28598/b28598.il +++ b/src/tests/JIT/Regression/CLR-x86-JIT/V1-M12-Beta2/b28598/b28598.il @@ -48,6 +48,9 @@ End_Orphan_3: } catch [mscorlib]System.OverflowException { pop leave the_end +} catch [mscorlib]System.DivideByZeroException { + pop + leave the_end } the_end: ldc.i4 100 diff --git a/src/tests/JIT/Regression/CLR-x86-JIT/V1-M12-Beta2/b50027/b50027.il b/src/tests/JIT/Regression/CLR-x86-JIT/V1-M12-Beta2/b50027/b50027.il index 65f3bc2af34f6..0422a59b02052 100644 --- a/src/tests/JIT/Regression/CLR-x86-JIT/V1-M12-Beta2/b50027/b50027.il +++ b/src/tests/JIT/Regression/CLR-x86-JIT/V1-M12-Beta2/b50027/b50027.il @@ -684,6 +684,9 @@ leave END } catch [mscorlib]System.OverflowException { pop leave END +} catch [mscorlib]System.DivideByZeroException { + pop + leave END } END: ldc.i4 100 diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_62692/Runtime_62692.cs b/src/tests/JIT/Regression/JitBlue/Runtime_62692/Runtime_62692.cs index 5b85cbb0115a0..fe5105d7a91fb 100644 --- a/src/tests/JIT/Regression/JitBlue/Runtime_62692/Runtime_62692.cs +++ b/src/tests/JIT/Regression/JitBlue/Runtime_62692/Runtime_62692.cs @@ -5,6 +5,7 @@ using System; using System.Runtime.Intrinsics.X86; using Xunit; +using System.Runtime.InteropServices; public unsafe class Runtime_62692 { @@ -39,8 +40,8 @@ public static int TestEntryPoint() AssertEqual(Problem2(1111, 0xFFFF_FFFF_0000_0001), 3414328792); AssertEqual(Problem3(1, 0xFFFF_0001), 0); AssertEqual(Problem4(1111, 0xFFFF_FFFF_0000_0001), 3414328792); - AssertEqual(Problem5(1111, double.MaxValue), 3307008522); - AssertEqual(Problem6(1111, float.MaxValue), 3307008522); + AssertEqual(Problem5(1111, double.MaxValue), 1921271346); + AssertEqual(Problem6(1111, float.MaxValue), 1921271346); AssertEqual(Problem5(1111, double.MinValue), 3307008522); AssertEqual(Problem6(1111, float.MinValue), 3307008522); AssertEqual(Problem5(1111, -0.0), 3307008522); diff --git a/src/tests/issues.targets b/src/tests/issues.targets index 886d6dd9f743e..f0b1baef76e00 100644 --- a/src/tests/issues.targets +++ b/src/tests/issues.targets @@ -1166,6 +1166,9 @@ + + https://github.com/dotnet/runtime/issues/100368 + https://github.com/dotnet/runtime/issues/88775
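
As a quick sanity check of the saturating semantics the updated tests expect on x86/x64 (CONVERT_SATURATING), the following standalone sketch mirrors the JIT_Dbl2Lng helper added above; the function name and the spot-check values are illustrative only, not part of the change:

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Mirrors JIT_Dbl2Lng from src/coreclr/vm/jithelpers.cpp above:
    // NaN -> 0, out-of-range values clamp to INT64_MIN/INT64_MAX,
    // in-range values truncate toward zero.
    static int64_t Dbl2LngSaturating(double val)
    {
        const double int64_min = -2147483648.0 * 4294967296.0; // -2^63
        const double int64_max = 2147483648.0 * 4294967296.0;  // 2^63
        return (val != val) ? 0
               : (val <= int64_min) ? INT64_MIN
               : (val >= int64_max) ? INT64_MAX
               : (int64_t)val;
    }

    int main()
    {
        assert(Dbl2LngSaturating(std::nan("")) == 0);
        assert(Dbl2LngSaturating(1e30) == INT64_MAX);
        assert(Dbl2LngSaturating(-1e30) == INT64_MIN);
        assert(Dbl2LngSaturating(-2.5) == -2); // truncation toward zero
        return 0;
    }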