Implement float dataType

kuzudb · Feb 27, 2023 · ec1c8e4 · ec1c8e4
1 parent 7c9ccf9
commit ec1c8e4
Show file tree

Hide file tree

Showing 44 changed files with 576 additions and 380 deletions.
diff --git a/dataset/tinysnb/eMeets.csv b/dataset/tinysnb/eMeets.csv
@@ -1,7 +1,7 @@
-0,2
-2,5
-3,7
-7,3
-8,3
-9,3
-10,2
+0,2,"[7.82,3.54]"
+2,5,"[2.87,4.23]"
+3,7,"[3.65,8.44]"
+7,3,"[2.11,3.1]"
+8,3,"[2.2,9.0]"
+9,3,"[3,5.2]"
+10,2,"[3.5,1.1]"
diff --git a/dataset/tinysnb/eWorkAt.csv b/dataset/tinysnb/eWorkAt.csv
@@ -1,3 +1,3 @@
-3,4,2015,"[3.8,2.5]"
-5,6,2010,"[2.1,4.4]"
-7,6,2015,"[9.2,3.1]"
+3,4,2015,"[3.8,2.5]",8.2
+5,6,2010,"[2.1,4.4]",7.6
+7,6,2015,"[9.2,3.1]",9.2
diff --git a/dataset/tinysnb/schema.cypher b/dataset/tinysnb/schema.cypher
@@ -1,8 +1,8 @@
-create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4],PRIMARY KEY (ID));
+create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4], height float, PRIMARY KEY (ID));
 create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
 create node table movies (name STRING, PRIMARY KEY (name));
 create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
 create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], MANY_ONE);
-create rel table workAt (FROM person TO organisation, year INT64, grading DOUBLE[2], MANY_ONE);
-create rel table meets (FROM person TO person, MANY_ONE);
+create rel table workAt (FROM person TO organisation, year INT64, grading DOUBLE[2], rating float, MANY_ONE);
+create rel table meets (FROM person TO person, location FLOAT[2], MANY_ONE);
 create rel table marries (FROM person TO person, usedAddress STRING[], note STRING, ONE_ONE);
diff --git a/dataset/tinysnb/vPerson.csv b/dataset/tinysnb/vPerson.csv
@@ -1,9 +1,9 @@
-id,fname,Gender,ISStudent,isWorker,age,eyeSight,birthdate,registerTime,lastJobDuration,workedHours,usedNames,courseScoresPerTerm,grades
-0,Alice,1,true,false,35,5.0,1900-01-01,2011-08-20 11:25:30Z+00:00,3 years 2 days 13 hours 2 minutes,"[10,5]","[Aida]","[[10,8],[6,7,8]]","[96,54,86,92]"
-2,Bob,2,true,false,30,5.1,1900-01-01,2008-11-03 13:25:30.000526-02:00,10 years 5 months 13 hours 24 us,"[12,8]","[Bobby]","[[8,9],[9,10]]","[98,42,93,88]"
-3,Carol,1,false,true,45,5.0,1940-06-22,1911-08-20 02:32:21,48 hours 24 minutes 11 seconds,"[4,5]","[Carmen,Fred]","[[8,10]]","[91,75,21,95]"
-5,Dan,2,false,true,20,4.8,1950-7-23,2031-11-30 12:25:30Z,10 years 5 months 13 hours 24 us,"[1,9]","[Wolfeschlegelstein,Daniel]","[[7,4],[8,8],[9]]","[76,88,99,89]"
-7,Elizabeth,1,false,true,20,4.7,1980-10-26,1976-12-23 11:21:42,48 hours 24 minutes 11 seconds,"[2]","[Ein]","[[6],[7],[8]]","[96,59,65,88]"
-8,Farooq,2,true,false,25,4.5,1980-10-26,1972-07-31 13:22:30.678559,18 minutes 24 milliseconds,"[3,4,5,6,7]","[Fesdwe]","[[8]]","[80,78,34,83]"
-9,Greg,2,false,false,40,4.9,1980-10-26,1976-12-23 11:21:42Z+06:40,10 years 5 months 13 hours 24 us,"[1]","[Grad]","[[10]]","[43,83,67,43]"
-10,Hubert Blaine Wolfeschlegelsteinhausenbergerdorff,2,false,true,83,4.9,1990-11-27,2023-02-21 13:25:30,3 years 2 days 13 hours 2 minutes,"[10,11,12,3,4,5,6,7]","[Ad,De,Hi,Kye,Orlan]","[[7],[10],[6,7]]","[77,64,100,54]"
+id,fname,Gender,ISStudent,isWorker,age,eyeSight,birthdate,registerTime,lastJobDuration,workedHours,usedNames,courseScoresPerTerm,grades,height
+0,Alice,1,true,false,35,5.0,1900-01-01,2011-08-20 11:25:30Z+00:00,3 years 2 days 13 hours 2 minutes,"[10,5]","[Aida]","[[10,8],[6,7,8]]","[96,54,86,92]",1.731
+2,Bob,2,true,false,30,5.1,1900-01-01,2008-11-03 13:25:30.000526-02:00,10 years 5 months 13 hours 24 us,"[12,8]","[Bobby]","[[8,9],[9,10]]","[98,42,93,88]",0.99
+3,Carol,1,false,true,45,5.0,1940-06-22,1911-08-20 02:32:21,48 hours 24 minutes 11 seconds,"[4,5]","[Carmen,Fred]","[[8,10]]","[91,75,21,95]",1.00
+5,Dan,2,false,true,20,4.8,1950-7-23,2031-11-30 12:25:30Z,10 years 5 months 13 hours 24 us,"[1,9]","[Wolfeschlegelstein,Daniel]","[[7,4],[8,8],[9]]","[76,88,99,89]",1.30
+7,Elizabeth,1,false,true,20,4.7,1980-10-26,1976-12-23 11:21:42,48 hours 24 minutes 11 seconds,"[2]","[Ein]","[[6],[7],[8]]","[96,59,65,88]",1.463
+8,Farooq,2,true,false,25,4.5,1980-10-26,1972-07-31 13:22:30.678559,18 minutes 24 milliseconds,"[3,4,5,6,7]","[Fesdwe]","[[8]]","[80,78,34,83]",1.51
+9,Greg,2,false,false,40,4.9,1980-10-26,1976-12-23 11:21:42Z+06:40,10 years 5 months 13 hours 24 us,"[1]","[Grad]","[[10]]","[43,83,67,43]",1.6
+10,Hubert Blaine Wolfeschlegelsteinhausenbergerdorff,2,false,true,83,4.9,1990-11-27,2023-02-21 13:25:30,3 years 2 days 13 hours 2 minutes,"[10,11,12,3,4,5,6,7]","[Ad,De,Hi,Kye,Orlan]","[[7],[10],[6,7]]","[77,64,100,54]",1.323
diff --git a/src/common/csv_reader/csv_reader.cpp b/src/common/csv_reader/csv_reader.cpp
@@ -223,7 +223,7 @@ int64_t CSVReader::getInt64() {
 
 double_t CSVReader::getDouble() {
     setNextTokenIsProcessed();
-    return TypeUtils::convertToDouble(line + linePtrStart);
+    return TypeUtils::convertFloatingPointNumber<double_t>(line + linePtrStart, DOUBLE); // templated converter replaces the DOUBLE-only convertToDouble; DOUBLE is passed alongside the text
 }
 
 uint8_t CSVReader::getBoolean() {

diff --git a/src/common/type_utils.cpp b/src/common/type_utils.cpp
@@ -31,17 +31,6 @@ uint32_t TypeUtils::convertToUint32(const char* data) {
     return val;
 }
 
-double_t TypeUtils::convertToDouble(const char* data) {
-    char* eptr;
-    errno = 0;
-    auto retVal = strtod(data, &eptr);
-    throwConversionExceptionIfNoOrNotEveryCharacterIsConsumed(data, eptr, DOUBLE);
-    if ((HUGE_VAL == retVal || -HUGE_VAL == retVal) && errno == ERANGE) {
-        throwConversionExceptionOutOfRange(data, DOUBLE);
-    }
-    return retVal;
-};
-
 bool TypeUtils::convertToBoolean(const char* data) {
     auto len = strlen(data);
     if (len == 4 && 't' == tolower(data[0]) && 'r' == tolower(data[1]) && 'u' == tolower(data[2]) &&

diff --git a/src/common/types/types.cpp b/src/common/types/types.cpp
@@ -44,6 +44,7 @@ DataType::DataType(const DataType& other) {
     case DATE:
     case TIMESTAMP:
     case INTERVAL:
+    case FLOAT:
     case STRING: {
         typeID = other.typeID;
     } break;
@@ -57,13 +58,13 @@ DataType::DataType(DataType&& other) noexcept
       fixedNumElementsInList{other.fixedNumElementsInList} {}
 
 std::vector<DataTypeID> DataType::getNumericalTypeIDs() {
-    return std::vector<DataTypeID>{INT64, DOUBLE};
+    return std::vector<DataTypeID>{INT64, DOUBLE, FLOAT}; // FLOAT is now listed as a numerical type next to INT64 and DOUBLE
 }
 
 std::vector<DataTypeID> DataType::getAllValidTypeIDs() {
     // TODO(Ziyi): Add FIX_LIST type to allValidTypeID when we support functions on VAR_LIST.
     return std::vector<DataTypeID>{
-        INTERNAL_ID, BOOL, INT64, DOUBLE, STRING, DATE, TIMESTAMP, INTERVAL, VAR_LIST};
+        INTERNAL_ID, BOOL, INT64, DOUBLE, STRING, DATE, TIMESTAMP, INTERVAL, VAR_LIST, FLOAT}; // FLOAT is appended after the previously listed IDs
 }
 
 DataType& DataType::operator=(const DataType& other) {
@@ -84,6 +85,7 @@ DataType& DataType::operator=(const DataType& other) {
     case DATE:
     case TIMESTAMP:
     case INTERVAL:
+    case FLOAT:
     case STRING: {
         typeID = other.typeID;
     } break;
@@ -112,6 +114,7 @@ bool DataType::operator==(const DataType& other) const {
     case DATE:
     case TIMESTAMP:
     case INTERVAL:
+    case FLOAT:
     case STRING:
         return typeID == other.typeID;
     default:
@@ -149,6 +152,7 @@ std::unique_ptr<DataType> DataType::copy() {
     case TIMESTAMP:
     case INTERVAL:
     case STRING:
+    case FLOAT:
         return std::make_unique<DataType>(typeID);
     default:
         throw InternalException("Unsupported DataType: " + Types::dataTypeToString(typeID) + ".");
@@ -202,6 +206,8 @@ DataTypeID Types::dataTypeIDFromString(const std::string& dataTypeIDString) {
         return TIMESTAMP;
     } else if ("INTERVAL" == dataTypeIDString) {
         return INTERVAL;
+    } else if ("FLOAT" == dataTypeIDString) {
+        return FLOAT;
     } else {
         throw InternalException("Cannot parse dataTypeID: " + dataTypeIDString);
     }
@@ -225,6 +231,7 @@ std::string Types::dataTypeToString(const DataType& dataType) {
     case TIMESTAMP:
     case INTERVAL:
     case STRING:
+    case FLOAT:
         return dataTypeToString(dataType.typeID);
     default:
         throw InternalException("Unsupported DataType: " + Types::dataTypeToString(dataType) + ".");
@@ -259,6 +266,8 @@ std::string Types::dataTypeToString(DataTypeID dataTypeID) {
         return "VAR_LIST";
     case FIXED_LIST:
         return "FIXED_LIST";
+    case FLOAT:
+        return "FLOAT";
     default:
         throw InternalException(
             "Unsupported DataType: " + Types::dataTypeToString(dataTypeID) + ".");
@@ -305,6 +314,8 @@ uint32_t Types::getDataTypeSize(DataTypeID dataTypeID) {
         return sizeof(ku_string_t);
     case VAR_LIST:
         return sizeof(ku_list_t);
+    case FLOAT:
+        return sizeof(float_t);
     default:
         throw InternalException(
             "Cannot infer the size of dataTypeID: " + dataTypeToString(dataTypeID) + ".");
@@ -324,6 +335,7 @@ uint32_t Types::getDataTypeSize(const DataType& dataType) {
     case INTERVAL:
     case STRING:
     case VAR_LIST:
+    case FLOAT:
         return getDataTypeSize(dataType.typeID);
     default:
         throw InternalException(

diff --git a/src/common/types/value.cpp b/src/common/types/value.cpp
@@ -43,7 +43,7 @@ Value Value::createDefaultValue(const DataType& dataType) {
     case BOOL:
         return Value(true);
     case DOUBLE:
-        return Value(0.0);
+        return Value((double_t)0);
     case DATE:
         return Value(date_t());
     case TIMESTAMP:
@@ -54,6 +54,8 @@ Value Value::createDefaultValue(const DataType& dataType) {
         return Value(nodeID_t());
     case STRING:
         return Value(std::string(""));
+    case FLOAT:
+        return Value((float_t)0);
     case VAR_LIST:
     case FIXED_LIST:
         return Value(dataType, std::vector<std::unique_ptr<Value>>{});
@@ -108,6 +110,10 @@ Value::Value(DataType dataType, std::vector<std::unique_ptr<Value>> vals)
     listVal = std::move(vals);
 }
 
+Value::Value(float_t val_) : dataType{FLOAT}, isNull_{false} { // constructs a non-null Value tagged FLOAT
+    val.floatVal = val_; // NOTE(review): float_t is implementation-defined (equals float only when FLT_EVAL_METHOD == 0) — confirm this overload and the FLOAT storage size hold on all target platforms
+}
+
 Value::Value(std::unique_ptr<NodeVal> val_) : dataType{NODE}, isNull_{false} {
     nodeVal = std::move(val_);
 }
@@ -157,6 +163,9 @@ void Value::copyValueFrom(const uint8_t* value) {
     case FIXED_LIST: {
         listVal = convertKUFixedListToVector(value);
     } break;
+    case FLOAT: {
+        val.floatVal = *((float_t*)value);
+    } break;
     default:
         throw RuntimeException(
             "Data type " + Types::dataTypeToString(dataType) + " is not supported for Value::set");
@@ -207,6 +216,9 @@ void Value::copyValueFrom(const Value& other) {
     case REL: {
         relVal = other.relVal->copy();
     } break;
+    case FLOAT: {
+        val.floatVal = other.val.floatVal;
+    } break;
     default:
         throw NotImplementedException("Value::Value(const Value&) for type " +
                                       Types::dataTypeToString(dataType) + " is not implemented.");
@@ -251,6 +263,8 @@ std::string Value::toString() const {
         return nodeVal->toString();
     case REL:
         return relVal->toString();
+    case FLOAT:
+        return TypeUtils::toString(val.floatVal);
     default:
         throw NotImplementedException("Value::toString for type " +
                                       Types::dataTypeToString(dataType) + " is not implemented.");
@@ -287,20 +301,17 @@ std::vector<std::unique_ptr<Value>> Value::convertKUVarListToVector(ku_list_t& l
 
 std::vector<std::unique_ptr<Value>> Value::convertKUFixedListToVector(
     const uint8_t* fixedList) const {
-    std::vector<std::unique_ptr<Value>> fixedListResultVal;
+    std::vector<std::unique_ptr<Value>> fixedListResultVal{dataType.fixedNumElementsInList};
     auto numBytesPerElement = Types::getDataTypeSize(*dataType.childType);
     switch (dataType.childType->typeID) {
     case common::DataTypeID::INT64: {
-        for (auto i = 0; i < dataType.fixedNumElementsInList; ++i) {
-            fixedListResultVal.emplace_back(
-                std::make_unique<Value>(*(int64_t*)(fixedList + i * numBytesPerElement)));
-        }
+        putValuesIntoVector<int64_t>(fixedListResultVal, fixedList, numBytesPerElement);
     } break;
     case common::DataTypeID::DOUBLE: {
-        for (auto i = 0; i < dataType.fixedNumElementsInList; ++i) {
-            fixedListResultVal.emplace_back(
-                std::make_unique<Value>(*(double_t*)(fixedList + i * numBytesPerElement)));
-        }
+        putValuesIntoVector<double_t>(fixedListResultVal, fixedList, numBytesPerElement);
+    } break;
+    case common::DataTypeID::FLOAT: {
+        putValuesIntoVector<float_t>(fixedListResultVal, fixedList, numBytesPerElement);
     } break;
     default:
         assert(false);

diff --git a/src/function/aggregate_function.cpp b/src/function/aggregate_function.cpp
@@ -37,6 +37,10 @@ std::unique_ptr<AggregateFunction> AggregateFunctionUtil::getAvgFunction(
         return std::make_unique<AggregateFunction>(AvgFunction<double_t>::initialize,
             AvgFunction<double_t>::updateAll, AvgFunction<double_t>::updatePos,
             AvgFunction<double_t>::combine, AvgFunction<double_t>::finalize, inputType, isDistinct);
+    case FLOAT:
+        return std::make_unique<AggregateFunction>(AvgFunction<float_t>::initialize,
+            AvgFunction<float_t>::updateAll, AvgFunction<float_t>::updatePos,
+            AvgFunction<float_t>::combine, AvgFunction<float_t>::finalize, inputType, isDistinct);
     default:
         throw RuntimeException("Unsupported input data type " + Types::dataTypeToString(inputType) +
                                " for AggregateFunctionUtil::getAvgFunction.");
@@ -54,6 +58,10 @@ std::unique_ptr<AggregateFunction> AggregateFunctionUtil::getSumFunction(
         return std::make_unique<AggregateFunction>(SumFunction<double_t>::initialize,
             SumFunction<double_t>::updateAll, SumFunction<double_t>::updatePos,
             SumFunction<double_t>::combine, SumFunction<double_t>::finalize, inputType, isDistinct);
+    case FLOAT:
+        return std::make_unique<AggregateFunction>(SumFunction<float_t>::initialize,
+            SumFunction<float_t>::updateAll, SumFunction<float_t>::updatePos,
+            SumFunction<float_t>::combine, SumFunction<float_t>::finalize, inputType, isDistinct);
     default:
         throw RuntimeException("Unsupported input data type " + Types::dataTypeToString(inputType) +
                                " for AggregateFunctionUtil::getSumFunction.");

diff --git a/src/function/built_in_vector_operations.cpp b/src/function/built_in_vector_operations.cpp
@@ -86,9 +86,11 @@ uint32_t BuiltInVectorOperations::getCastCost(DataTypeID inputTypeID, DataTypeID
             // ANY type can be any type
             return 0;
         case common::INT64:
-            return implicitCastInt64(targetTypeID);
+            return castInt64(targetTypeID);
         case common::DOUBLE:
-            return implicitCastDouble(targetTypeID);
+            return castDouble(targetTypeID);
+        case common::FLOAT:
+            return castFloat(targetTypeID);
         default:
             return UINT32_MAX;
         }
@@ -110,22 +112,43 @@ uint32_t BuiltInVectorOperations::getCastCost(
     }
 }
 
-uint32_t BuiltInVectorOperations::implicitCastInt64(common::DataTypeID targetTypeID) {
-    switch (targetTypeID) {
+uint32_t BuiltInVectorOperations::getTargetTypeCost(common::DataTypeID typeID) { // returns the cast cost assigned to a target type; only FLOAT and DOUBLE are priced
+    switch (typeID) {
+    case common::FLOAT:
+        return 110; // FLOAT costs more than DOUBLE (102); presumably the cheaper target wins during function matching — confirm against getBestMatch
     case common::DOUBLE:
         return 102;
+    default:
+        throw InternalException("Unsupported casting operation."); // any other target type is rejected with an exception rather than a sentinel cost
+    }
+}
+
+uint32_t BuiltInVectorOperations::castInt64(common::DataTypeID targetTypeID) { // cast cost from INT64: FLOAT and DOUBLE targets are priced, every other target yields UINT32_MAX
+    switch (targetTypeID) {
+    case common::FLOAT:
+    case common::DOUBLE:
+        return getTargetTypeCost(targetTypeID); // the cost depends only on the target type
     default:
         return UINT32_MAX;
     }
 }
 
-uint32_t BuiltInVectorOperations::implicitCastDouble(common::DataTypeID targetTypeID) {
+uint32_t BuiltInVectorOperations::castDouble(common::DataTypeID targetTypeID) { // cast cost from DOUBLE: no target is priced, so every targetTypeID yields UINT32_MAX
     switch (targetTypeID) {
     default:
         return UINT32_MAX;
     }
 }
 
+uint32_t BuiltInVectorOperations::castFloat(common::DataTypeID targetTypeID) { // cast cost from FLOAT: only DOUBLE is priced, every other target yields UINT32_MAX
+    switch (targetTypeID) {
+    case common::DOUBLE:
+        return getTargetTypeCost(targetTypeID); // reuses the per-target cost table (102 for DOUBLE)
+    default:
+        return UINT32_MAX; // NOTE(review): UINT32_MAX presumably means "no implicit cast available" — confirm against getCastCost callers
+    }
+}
+
 // When there is multiple candidates functions, e.g. double + int and double + double for input
 // "1.5 + parameter", we prefer the one without any implicit casting i.e. double + double.
 VectorOperationDefinition* BuiltInVectorOperations::getBestMatch(
@@ -329,6 +352,8 @@ void BuiltInVectorOperations::registerCastOperations() {
         {CAST_TO_STRING_FUNC_NAME, CastToStringVectorOperation::getDefinitions()});
     vectorOperations.insert(
         {CAST_TO_DOUBLE_FUNC_NAME, CastToDoubleVectorOperation::getDefinitions()});
+    vectorOperations.insert(
+        {CAST_TO_FLOAT_FUNC_NAME, CastToFloatVectorOperation::getDefinitions()});
 }
 
 void BuiltInVectorOperations::registerListOperations() {