diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index e0d7c01ca33516..28f66e71c2f2de 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -1015,6 +1015,7 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat Number) { case APFloat::S_Float8E5M2FNUZ: case APFloat::S_Float8E4M3FNUZ: case APFloat::S_Float8E4M3B11FNUZ: + case APFloat::S_Float8E3M4: case APFloat::S_FloatTF32: case APFloat::S_Float6E3M2FN: case APFloat::S_Float6E2M3FN: diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index bff8e6490d1de9..7039e961bff82d 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -188,6 +188,9 @@ struct APFloatBase { // This format's exponent bias is 11, instead of the 7 (2 ** (4 - 1) - 1) // that IEEE precedent would imply. S_Float8E4M3B11FNUZ, + // 8-bit floating point number following IEEE-754 conventions with bit + // layout S1E3M4. + S_Float8E3M4, // Floating point number that occupies 32 bits or less of storage, providing // improved range compared to half (16-bit) formats, at (potentially) // greater throughput than single precision (32-bit) formats. 
@@ -224,6 +227,7 @@ struct APFloatBase { static const fltSemantics &Float8E4M3FN() LLVM_READNONE; static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE; static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE; + static const fltSemantics &Float8E3M4() LLVM_READNONE; static const fltSemantics &FloatTF32() LLVM_READNONE; static const fltSemantics &Float6E3M2FN() LLVM_READNONE; static const fltSemantics &Float6E2M3FN() LLVM_READNONE; @@ -646,6 +650,7 @@ class IEEEFloat final : public APFloatBase { APInt convertFloat8E4M3FNAPFloatToAPInt() const; APInt convertFloat8E4M3FNUZAPFloatToAPInt() const; APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const; + APInt convertFloat8E3M4APFloatToAPInt() const; APInt convertFloatTF32APFloatToAPInt() const; APInt convertFloat6E3M2FNAPFloatToAPInt() const; APInt convertFloat6E2M3FNAPFloatToAPInt() const; @@ -665,6 +670,7 @@ class IEEEFloat final : public APFloatBase { void initFromFloat8E4M3FNAPInt(const APInt &api); void initFromFloat8E4M3FNUZAPInt(const APInt &api); void initFromFloat8E4M3B11FNUZAPInt(const APInt &api); + void initFromFloat8E3M4APInt(const APInt &api); void initFromFloatTF32APInt(const APInt &api); void initFromFloat6E3M2FNAPInt(const APInt &api); void initFromFloat6E2M3FNAPInt(const APInt &api); diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 26b4f8e55448ff..7f68c5ab9b7cf7 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -143,6 +143,7 @@ static constexpr fltSemantics semFloat8E4M3FNUZ = { 7, -7, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; static constexpr fltSemantics semFloat8E4M3B11FNUZ = { 4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; +static constexpr fltSemantics semFloat8E3M4 = {3, -2, 5, 8}; static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19}; static constexpr fltSemantics semFloat6E3M2FN = { 4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly}; @@ -217,6 +218,8 @@ const 
llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) { return Float8E4M3FNUZ(); case S_Float8E4M3B11FNUZ: return Float8E4M3B11FNUZ(); + case S_Float8E3M4: + return Float8E3M4(); case S_FloatTF32: return FloatTF32(); case S_Float6E3M2FN: @@ -257,6 +260,8 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) { return S_Float8E4M3FNUZ; else if (&Sem == &llvm::APFloat::Float8E4M3B11FNUZ()) return S_Float8E4M3B11FNUZ; + else if (&Sem == &llvm::APFloat::Float8E3M4()) + return S_Float8E3M4; else if (&Sem == &llvm::APFloat::FloatTF32()) return S_FloatTF32; else if (&Sem == &llvm::APFloat::Float6E3M2FN()) @@ -287,6 +292,7 @@ const fltSemantics &APFloatBase::Float8E4M3FNUZ() { return semFloat8E4M3FNUZ; } const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() { return semFloat8E4M3B11FNUZ; } +const fltSemantics &APFloatBase::Float8E3M4() { return semFloat8E3M4; } const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; } const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; } const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; } @@ -3643,6 +3649,11 @@ APInt IEEEFloat::convertFloat8E4M3B11FNUZAPFloatToAPInt() const { return convertIEEEFloatToAPInt(); } +APInt IEEEFloat::convertFloat8E3M4APFloatToAPInt() const { + assert(partCount() == 1); + return convertIEEEFloatToAPInt(); +} + APInt IEEEFloat::convertFloatTF32APFloatToAPInt() const { assert(partCount() == 1); return convertIEEEFloatToAPInt(); @@ -3704,6 +3715,9 @@ APInt IEEEFloat::bitcastToAPInt() const { if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3B11FNUZ) return convertFloat8E4M3B11FNUZAPFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloat8E3M4) + return convertFloat8E3M4APFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloatTF32) return convertFloatTF32APFloatToAPInt(); @@ -3932,6 +3946,10 @@ void IEEEFloat::initFromFloat8E4M3B11FNUZAPInt(const APInt &api) { initFromIEEEAPInt(api); } +void 
IEEEFloat::initFromFloat8E3M4APInt(const APInt &api) { + initFromIEEEAPInt(api); +} + void IEEEFloat::initFromFloatTF32APInt(const APInt &api) { initFromIEEEAPInt(api); } @@ -3977,6 +3995,8 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) { return initFromFloat8E4M3FNUZAPInt(api); if (Sem == &semFloat8E4M3B11FNUZ) return initFromFloat8E4M3B11FNUZAPInt(api); + if (Sem == &semFloat8E3M4) + return initFromFloat8E3M4APInt(api); if (Sem == &semFloatTF32) return initFromFloatTF32APInt(api); if (Sem == &semFloat6E3M2FN) diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index d50bdf4a65dcbf..be675bb7fe5a53 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -2141,6 +2141,8 @@ TEST(APFloatTest, getZero) { {&APFloat::Float8E4M3FNUZ(), true, false, {0, 0}, 1}, {&APFloat::Float8E4M3B11FNUZ(), false, false, {0, 0}, 1}, {&APFloat::Float8E4M3B11FNUZ(), true, false, {0, 0}, 1}, + {&APFloat::Float8E3M4(), false, true, {0, 0}, 1}, + {&APFloat::Float8E3M4(), true, true, {0x80ULL, 0}, 1}, {&APFloat::FloatTF32(), false, true, {0, 0}, 1}, {&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1}, {&APFloat::Float6E3M2FN(), false, true, {0, 0}, 1}, @@ -6636,6 +6638,45 @@ TEST(APFloatTest, Float8E4M3FNUZToDouble) { EXPECT_TRUE(std::isnan(QNaN.convertToDouble())); } +TEST(APFloatTest, Float8E3M4ToDouble) { + APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false); + APFloat PosZeroToDouble(PosZero.convertToDouble()); + EXPECT_TRUE(PosZeroToDouble.isPosZero()); + APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true); + APFloat NegZeroToDouble(NegZero.convertToDouble()); + EXPECT_TRUE(NegZeroToDouble.isNegZero()); + + APFloat One(APFloat::Float8E3M4(), "1.0"); + EXPECT_EQ(1.0, One.convertToDouble()); + APFloat Two(APFloat::Float8E3M4(), "2.0"); + EXPECT_EQ(2.0, Two.convertToDouble()); + APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false); + 
EXPECT_EQ(15.5F, PosLargest.convertToDouble()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true); + EXPECT_EQ(-15.5F, NegLargest.convertToDouble()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false); + EXPECT_EQ(0x1.p-2, PosSmallest.convertToDouble()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true); + EXPECT_EQ(-0x1.p-2, NegSmallest.convertToDouble()); + + APFloat PosSmallestDenorm = + APFloat::getSmallest(APFloat::Float8E3M4(), false); + EXPECT_TRUE(PosSmallestDenorm.isDenormal()); + EXPECT_EQ(0x1.p-6, PosSmallestDenorm.convertToDouble()); + APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true); + EXPECT_TRUE(NegSmallestDenorm.isDenormal()); + EXPECT_EQ(-0x1.p-6, NegSmallestDenorm.convertToDouble()); + + APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4()); + EXPECT_EQ(std::numeric_limits<double>::infinity(), PosInf.convertToDouble()); + APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true); + EXPECT_EQ(-std::numeric_limits<double>::infinity(), NegInf.convertToDouble()); + APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4()); + EXPECT_TRUE(std::isnan(QNaN.convertToDouble())); +} + TEST(APFloatTest, FloatTF32ToDouble) { APFloat One(APFloat::FloatTF32(), "1.0"); EXPECT_EQ(1.0, One.convertToDouble()); @@ -6944,6 +6985,46 @@ TEST(APFloatTest, Float8E4M3FNToFloat) { EXPECT_TRUE(std::isnan(QNaN.convertToFloat())); } +TEST(APFloatTest, Float8E3M4ToFloat) { + APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false); + APFloat PosZeroToFloat(PosZero.convertToFloat()); + EXPECT_TRUE(PosZeroToFloat.isPosZero()); + APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true); + APFloat NegZeroToFloat(NegZero.convertToFloat()); + EXPECT_TRUE(NegZeroToFloat.isNegZero()); + + APFloat One(APFloat::Float8E3M4(), "1.0"); + EXPECT_EQ(1.0F, One.convertToFloat()); + APFloat Two(APFloat::Float8E3M4(), "2.0"); + EXPECT_EQ(2.0F, Two.convertToFloat()); + 
APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false); + EXPECT_EQ(15.5F, PosLargest.convertToFloat()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true); + EXPECT_EQ(-15.5F, NegLargest.convertToFloat()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false); + EXPECT_EQ(0x1.p-2, PosSmallest.convertToFloat()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true); + EXPECT_EQ(-0x1.p-2, NegSmallest.convertToFloat()); + + APFloat PosSmallestDenorm = + APFloat::getSmallest(APFloat::Float8E3M4(), false); + EXPECT_TRUE(PosSmallestDenorm.isDenormal()); + EXPECT_EQ(0x1.p-6, PosSmallestDenorm.convertToFloat()); + APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true); + EXPECT_TRUE(NegSmallestDenorm.isDenormal()); + EXPECT_EQ(-0x1.p-6, NegSmallestDenorm.convertToFloat()); + + APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4()); + EXPECT_EQ(std::numeric_limits<float>::infinity(), PosInf.convertToFloat()); + APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true); + EXPECT_EQ(-std::numeric_limits<float>::infinity(), NegInf.convertToFloat()); + APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4()); + EXPECT_TRUE(std::isnan(QNaN.convertToFloat())); +} + TEST(APFloatTest, FloatTF32ToFloat) { APFloat PosZero = APFloat::getZero(APFloat::FloatTF32()); APFloat PosZeroToFloat(PosZero.convertToFloat());