Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

replace cast string to union function in driver.cpp #2227

Merged
merged 1 commit into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions dataset/load-from-test/union/union_correct.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"false","","255","18446744073709551615",fsdfa
" true ","432","0","-1.43241543","543fasf"
" 34234 ","4294967295","65535",-128,432
" -42342345 ","-1","-1","-129",fasf
" T ","2022-06-06","4324.123","-32768",
"null","2019-03-19","-12.3432","32768",""
"","-2147483648","1970-01-01 00:00:00.004666-10","-32769",fsdxcv
"0","0","2014-05-12 12:11:59",4324254534123134324321.4343252435,"fsaf"
" F","-4325"," Null ",18446744073709551616," dfsa"
1 change: 1 addition & 0 deletions dataset/load-from-test/union/union_error.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fdsaf
2 changes: 0 additions & 2 deletions src/include/storage/store/table_copy_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,6 @@ class TableCopyUtils {

static std::shared_ptr<arrow::DataType> toArrowDataType(const common::LogicalType& dataType);

static bool tryCast(const common::LogicalType& targetType, const char* value, uint64_t length);

static std::vector<StructFieldIdxAndValue> parseStructFieldNameAndValues(
common::LogicalType& type, std::string_view structString,
const common::CSVReaderConfig& csvReaderConfig);
Expand Down
143 changes: 119 additions & 24 deletions src/processor/operator/persistent/reader/csv/driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,124 @@ static void castStringToStruct(const char* input, uint64_t len, ValueVector* vec
}
}

/**
 * Conditionally stores a parsed value into a vector.
 *
 * @param vector   destination vector.
 * @param rowToAdd position in `vector` to write to.
 * @param result   value to store; only used when `success` is true.
 * @param success  outcome of the preceding parse attempt. When false, `vector` is left
 *                 untouched.
 */
template<typename T>
static inline void testAndSetValue(ValueVector* vector, uint64_t rowToAdd, T result, bool success) {
    if (!success) {
        // The parse failed, so there is nothing meaningful to store.
        return;
    }
    vector->setValue(rowToAdd, result);
}

/**
 * Tries to parse the text `input[0, len)` as the data type of `vector` (one member of a union)
 * and, on success, stores the parsed value at position `rowToAdd`.
 *
 * Handled member types: BOOL, INT8/16/32/64, UINT8/16/32/64, DOUBLE, FLOAT, DATE, TIMESTAMP and
 * STRING. Every other type is reported as a failed cast.
 *
 * This function only writes the value; it never changes the null flag of `rowToAdd`.
 *
 * @param vector   field vector whose `dataType` selects the parser to try.
 * @param rowToAdd position in `vector` that receives the parsed value.
 * @param input    pointer to the first character of the text to parse.
 * @param len      number of characters available at `input`.
 * @return true if the text was parsed and written into `vector`, false otherwise (in which case
 *         `vector` is not written to).
 * @throws common::CopyException if the member type is STRING and the text is not valid UTF-8.
 */
static bool tryCastUnionField(
    ValueVector* vector, uint64_t rowToAdd, const char* input, uint64_t len) {
    auto& targetType = vector->dataType;
    bool success = false;
    // Every `result` below is value-initialized: it is handed to testAndSetValue by value even
    // when the cast fails, and copying an indeterminate value would be undefined behaviour.
    switch (targetType.getLogicalTypeID()) {
    case LogicalTypeID::BOOL: {
        bool result = false;
        success = function::tryCastToBool(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::INT64: {
        int64_t result = 0;
        success = function::trySimpleIntegerCast(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::INT32: {
        int32_t result = 0;
        success = function::trySimpleIntegerCast(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::INT16: {
        int16_t result = 0;
        success = function::trySimpleIntegerCast(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::INT8: {
        int8_t result = 0;
        success = function::trySimpleIntegerCast(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::UINT64: {
        uint64_t result = 0;
        success = function::trySimpleIntegerCast<uint64_t, false>(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::UINT32: {
        uint32_t result = 0;
        success = function::trySimpleIntegerCast<uint32_t, false>(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::UINT16: {
        uint16_t result = 0;
        success = function::trySimpleIntegerCast<uint16_t, false>(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::UINT8: {
        uint8_t result = 0;
        success = function::trySimpleIntegerCast<uint8_t, false>(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::DOUBLE: {
        double_t result{};
        success = function::tryDoubleCast(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::FLOAT: {
        float_t result{};
        success = function::tryDoubleCast(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::DATE: {
        date_t result{};
        // `pos` is an out-parameter required by tryConvertDate; its value is not used here.
        uint64_t pos = 0;
        success = Date::tryConvertDate(input, len, pos, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::TIMESTAMP: {
        timestamp_t result{};
        success = Timestamp::tryConvertTimestamp(input, len, result);
        testAndSetValue(vector, rowToAdd, result, success);
    } break;
    case LogicalTypeID::STRING: {
        // Any text is acceptable as a string as long as it is well-formed UTF-8; malformed
        // input aborts the copy instead of falling through to the next union member.
        if (!utf8proc::Utf8Proc::isValid(input, len)) {
            throw common::CopyException{"Invalid UTF8-encoded string."};
        }
        StringVector::addString(vector, rowToAdd, input, len);
        return true;
    }
    default: {
        // Member types without a parser above can never be selected.
        return false;
    }
    }
    return success;
}

/**
 * Parses `strVal` into the union-typed `vector` at position `rowToAdd`.
 *
 * The union's fields are tried in declaration order. Each field that fails to parse the text is
 * marked null at `rowToAdd`; the first field that parses it is marked non-null and ends the
 * search (later fields are not visited). The index of the winning field is then written to the
 * union's tag vector, which is marked non-null as well.
 *
 * @param vector   union vector to fill.
 * @param strVal   text to parse.
 * @param rowToAdd position in `vector` to write to.
 * @throws ConversionException if no field of the union accepts the text.
 */
static void castStringToUnion(ValueVector* vector, std::string_view strVal, uint64_t rowToAdd) {
    auto& type = vector->dataType;
    const auto numFields = UnionType::getNumFields(&type);
    union_field_idx_t selectedFieldIdx = INVALID_STRUCT_FIELD_IDX;

    for (auto fieldIdx = 0u; fieldIdx < numFields; fieldIdx++) {
        auto candidate =
            StructVector::getFieldVector(vector, UnionType::getInternalFieldIdx(fieldIdx)).get();
        const bool parsed =
            tryCastUnionField(candidate, rowToAdd, strVal.data(), strVal.length());
        // A field is null at this row exactly when it could not hold the value.
        candidate->setNull(rowToAdd, !parsed);
        if (parsed) {
            selectedFieldIdx = fieldIdx;
            break;
        }
    }

    if (selectedFieldIdx == INVALID_STRUCT_FIELD_IDX) {
        throw ConversionException{stringFormat("Could not convert to union type {}: {}.",
            LogicalTypeUtils::dataTypeToString(type), strVal)};
    }

    // Record which field holds the value for this row.
    auto tagVector = StructVector::getFieldVector(vector, UnionType::TAG_FIELD_IDX);
    tagVector->setValue(rowToAdd, selectedFieldIdx);
    tagVector->setNull(rowToAdd, false /* isNull */);
}

void copyStringToVector(ValueVector* vector, uint64_t rowToAdd, std::string_view strVal,
const CSVReaderConfig& csvReaderConfig) {
auto& type = vector->dataType;
Expand Down Expand Up @@ -569,30 +687,7 @@ void copyStringToVector(ValueVector* vector, uint64_t rowToAdd, std::string_view
castStringToStruct(strVal.data(), strVal.length(), vector, rowToAdd, csvReaderConfig);
} break;
case LogicalTypeID::UNION: {
union_field_idx_t selectedFieldIdx = INVALID_STRUCT_FIELD_IDX;
for (auto i = 0u; i < UnionType::getNumFields(&type); i++) {
auto internalFieldIdx = UnionType::getInternalFieldIdx(i);
if (storage::TableCopyUtils::tryCast(
*UnionType::getFieldType(&type, i), strVal.data(), strVal.length())) {
StructVector::getFieldVector(vector, internalFieldIdx)
->setNull(rowToAdd, false /* isNull */);
copyStringToVector(StructVector::getFieldVector(vector, internalFieldIdx).get(),
rowToAdd, strVal, csvReaderConfig);
selectedFieldIdx = i;
break;
} else {
StructVector::getFieldVector(vector, internalFieldIdx)
->setNull(rowToAdd, true /* isNull */);
}
}
if (selectedFieldIdx == INVALID_STRUCT_FIELD_IDX) {
throw ConversionException{stringFormat("Could not convert to union type {}: {}.",
LogicalTypeUtils::dataTypeToString(type), strVal)};
}
StructVector::getFieldVector(vector, UnionType::TAG_FIELD_IDX)
->setValue(rowToAdd, selectedFieldIdx);
StructVector::getFieldVector(vector, UnionType::TAG_FIELD_IDX)
->setNull(rowToAdd, false /* isNull */);
castStringToUnion(vector, strVal, rowToAdd);
} break;
default: { // LCOV_EXCL_START
throw NotImplementedException("BaseCSVReader::copyStringToVector");
Expand Down
65 changes: 0 additions & 65 deletions src/storage/store/table_copy_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@
reader->parquet_reader()->metadata()->schema()->group_node()->field_count();
if (config.getNumColumns() != actualNumColumns) {
// Note: Some parquet files may contain an index column.
throw CopyException(
stringFormat("Unmatched number of columns in parquet file. Expected: {}, got: {}.",
config.getNumColumns(), actualNumColumns));

Check warning on line 72 in src/storage/store/table_copy_utils.cpp

View check run for this annotation

Codecov / codecov/patch

src/storage/store/table_copy_utils.cpp#L70-L72

Added lines #L70 - L72 were not covered by tests
}
return reader;
}
Expand Down Expand Up @@ -306,71 +306,6 @@
}
}

/**
 * Reports whether the text `value[0, length)` can be parsed as `targetType`.
 *
 * The parsed value itself is discarded; only the outcome of the parse is returned. STRING
 * always succeeds, and any type not listed in the switch below always fails.
 *
 * @param targetType logical type to test the text against.
 * @param value      pointer to the first character of the text.
 * @param length     number of characters available at `value`.
 * @return true if the text parses as `targetType`, false otherwise.
 */
bool TableCopyUtils::tryCast(
    const common::LogicalType& targetType, const char* value, uint64_t length) {
    // Each probe parses into a throwaway local whose type is taken from the argument; the
    // parsed value is never inspected.
    const auto parsesAsSignedInt = [value, length](auto scratch) {
        return function::trySimpleIntegerCast(value, length, scratch);
    };
    const auto parsesAsUnsignedInt = [value, length](auto scratch) {
        return function::trySimpleIntegerCast<decltype(scratch), false>(value, length, scratch);
    };
    const auto parsesAsFloatingPoint = [value, length](auto scratch) {
        return function::tryDoubleCast(value, length, scratch);
    };

    switch (targetType.getLogicalTypeID()) {
    case LogicalTypeID::BOOL: {
        bool ignored;
        return function::tryCastToBool(value, length, ignored);
    }
    case LogicalTypeID::INT64:
        return parsesAsSignedInt(int64_t{});
    case LogicalTypeID::INT32:
        return parsesAsSignedInt(int32_t{});
    case LogicalTypeID::INT16:
        return parsesAsSignedInt(int16_t{});
    case LogicalTypeID::INT8:
        return parsesAsSignedInt(int8_t{});
    case LogicalTypeID::UINT64:
        return parsesAsUnsignedInt(uint64_t{});
    case LogicalTypeID::UINT32:
        return parsesAsUnsignedInt(uint32_t{});
    case LogicalTypeID::UINT16:
        return parsesAsUnsignedInt(uint16_t{});
    case LogicalTypeID::UINT8:
        return parsesAsUnsignedInt(uint8_t{});
    case LogicalTypeID::DOUBLE:
        return parsesAsFloatingPoint(double_t{});
    case LogicalTypeID::FLOAT:
        return parsesAsFloatingPoint(float_t{});
    case LogicalTypeID::DATE: {
        date_t ignoredDate;
        uint64_t ignoredPos;
        return Date::tryConvertDate(value, length, ignoredPos, ignoredDate);
    }
    case LogicalTypeID::TIMESTAMP: {
        timestamp_t ignoredTimestamp;
        return Timestamp::tryConvertTimestamp(value, length, ignoredTimestamp);
    }
    case LogicalTypeID::STRING:
        // Any text is accepted as a string.
        return true;
    default:
        return false;
    }
}

std::vector<StructFieldIdxAndValue> TableCopyUtils::parseStructFieldNameAndValues(
LogicalType& type, std::string_view structString, const CSVReaderConfig& csvReaderConfig) {
std::vector<StructFieldIdxAndValue> structFieldIdxAndValueParis;
Expand Down
Loading
Loading