From eb17b2e7aee96765beaa57d839201d3511b5a3ec Mon Sep 17 00:00:00 2001 From: Qi Chen Date: Tue, 16 Jul 2024 18:59:37 +0800 Subject: [PATCH] [Fix](multi-catalog) Fix some undefined behaviors. (#37845) ## Proposed changes - Null pointer of type 'doris::StringRef' in orc reader. The root cause is error will throw when `num_values == 0` in `_decode_string_non_dict_encoded_column`. ``` /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_vector.h:1046:9: runtime error: reference binding to null pointer of type 'doris::StringRef' #0 0x562516fa9770 in std::vector >::operator[](unsigned long) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_vector.h:1046:2 #1 0x562516fa9770 in doris::Status doris::vectorized::OrcReader::_decode_string_non_dict_encoded_column(std::__cxx11::basic_string, std::allocator > const&, COW::mutable_ptr const&, orc::TypeKind const&, orc::EncodedStringVectorBatch*, unsigned long) /root/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1164:39 #2 0x562516f9c08b in doris::Status doris::vectorized::OrcReader::_decode_string_column(std::__cxx11::basic_string, std::allocator > const&, COW::mutable_ptr const&, orc::TypeKind const&, orc::ColumnVectorBatch*, unsigned long) /root/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1116:16 #3 0x562516f91d73 in doris::Status doris::vectorized::OrcReader::_fill_doris_data_column(std::__cxx11::basic_string, std::allocator > const&, COW::mutable_ptr&, std::shared_ptr const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /root/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1357:16 #4 0x562516c79a0c in doris::Status doris::vectorized::OrcReader::_orc_column_to_doris_column(std::__cxx11::basic_string, std::allocator > const&, COW::immutable_ptr&, std::shared_ptr const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /root/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1524:5 #5 0x562516f9339a in doris::Status doris::vectorized::OrcReader::_fill_doris_data_column(std::__cxx11::basic_string, std::allocator > const&, COW::mutable_ptr&, std::shared_ptr const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /root/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1402:9 #6 0x562516c79a0c in doris::Status doris::vectorized::OrcReader::_orc_column_to_doris_column(std::__cxx11::basic_string, std::allocator > const&, COW::immutable_ptr&, std::shared_ptr const&, orc::Type const*, orc::ColumnVectorBatch*, unsigned long) /root/doris/be/src/vec/exec/format/orc/vorc_reader.cpp:1524:5 ... ``` - Shift exponent 128 is too large for 128-bit type 'ValueCopyType' (aka '__int128') in parquet reader. The root cause is error will throw when `len == 0`. ``` /root/doris/be/src/vec/exec/format/parquet/parquet_column_convert.h:413:27: runtime error: shift exponent 128 is too large for 128-bit type 'ValueCopyType' (aka '__int128') #0 0x56251760fbc7 in doris::vectorized::parquet::StringToDecimal::physical_convert(COW::immutable_ptr&, COW::immutable_ptr&) /root/doris/be/src/vec/exec/format/parquet/parquet_column_convert.h:413:27 #1 0x562517290dc4 in doris::vectorized::parquet::PhysicalToLogicalConverter::convert(COW::immutable_ptr&, doris::TypeDescriptor, std::shared_ptr const&, COW::immutable_ptr&, bool) /root/doris/be/src/vec/exec/format/parquet/parquet_column_convert.h:209:9 #2 0x562517284a6d in doris::vectorized::ScalarColumnReader::read_column_data(COW::immutable_ptr&, std::shared_ptr&, doris::vectorized::ColumnSelectVector&, unsigned long, unsigned long*, bool*, bool) /root/doris/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:569:24 #3 0x56251725ae7e in doris::vectorized::RowGroupReader::_read_column_data(doris::vectorized::Block*, std::vector, std::allocator >, std::allocator, std::allocator > > > const&, unsigned long, unsigned long*, bool*, doris::vectorized::ColumnSelectVector&) /root/doris/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:421:13 #4 0x56251724d6d2 in doris::vectorized::RowGroupReader::next_batch(doris::vectorized::Block*, unsigned long, unsigned long*, bool*) /root/doris/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:321:9 #5 0x56251708eb97 in doris::vectorized::ParquetReader::get_next_block(doris::vectorized::Block*, unsigned long*, bool*) /root/doris/be/src/vec/exec/format/parquet/vparquet_reader.cpp:530:36 #6 0x56253036772d in doris::vectorized::VFileScanner::_get_block_wrapped(doris::RuntimeState*, doris::vectorized::Block*, bool*) /root/doris/be/src/vec/exec/scan/vfile_scanner.cpp:311:13 #7 0x562530366549 in doris::vectorized::VFileScanner::_get_block_impl(doris::RuntimeState*, doris::vectorized::Block*, bool*) /root/doris/be/src/vec/exec/scan/vfile_scanner.cpp:253:17 #8 0x5625176e79c8 in doris::vectorized::VScanner::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) /root/doris/be/src/vec/exec/scan/vscanner.cpp:117:17 #9 0x5625176e6fc1 in doris::vectorized::VScanner::get_block_after_projects(doris::RuntimeState*, doris::vectorized::Block*, bool*) /root/doris/be/src/vec/exec/scan/vscanner.cpp:84:12 #10 0x562517698047 in doris::vectorized::ScannerScheduler::_scanner_scan(std::shared_ptr, std::shared_ptr) /root/doris/be/src/vec/exec/scan/scanner_scheduler.cpp:250:5 #11 0x56251769bc1f in doris::vectorized::ScannerScheduler::submit(std::shared_ptr, std::shared_ptr)::$_1::operator()() const::'lambda'()::operator()() const::'lambda'()::operator()() const /root/doris/be/src/vec/exec/scan/scanner_scheduler.cpp:172:25 #12 0x56251769bc1f in doris::vectorized::ScannerScheduler::submit(std::shared_ptr, std::shared_ptr)::$_1::operator()() const::'lambda'()::operator()() const /root/doris/be/src/vec/exec/scan/scanner_scheduler.cpp:171:35 ... ``` --- be/src/vec/exec/format/orc/vorc_reader.cpp | 4 +-- .../format/parquet/parquet_column_convert.h | 26 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index aba1511c854496..9c016a4565e4e8 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -1583,7 +1583,7 @@ Status OrcReader::get_next_block_impl(Block* block, size_t* read_rows, bool* eof _decimal_scale_params_index = 0; try { rr = _row_reader->nextBatch(*_batch, block); - if (rr == 0) { + if (rr == 0 || _batch->numElements == 0) { *eof = true; *read_rows = 0; return Status::OK(); @@ -1653,7 +1653,7 @@ Status OrcReader::get_next_block_impl(Block* block, size_t* read_rows, bool* eof _decimal_scale_params_index = 0; try { rr = _row_reader->nextBatch(*_batch, block); - if (rr == 0) { + if (rr == 0 || _batch->numElements == 0) { *eof = true; *read_rows = 0; return Status::OK(); diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index bc6bc2323275cf..551bf7e14edbc8 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -408,18 +408,20 @@ class StringToDecimal : public PhysicalToLogicalConverter { // When Decimal in parquet is stored in byte arrays, binary and fixed, // the unscaled number must be encoded as two's complement using big-endian byte order. ValueCopyType value = 0; - memcpy(reinterpret_cast(&value), buf + offset[i - 1], len); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - len) * 8); - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); + if (len > 0) { + memcpy(reinterpret_cast(&value), buf + offset[i - 1], len); + value = BitUtil::big_endian_to_host(value); + value = value >> ((sizeof(value) - len) * 8); + if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { + value *= scale_params.scale_factor; + } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { + value /= scale_params.scale_factor; + } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { + // do nothing + } else { + LOG(FATAL) << "__builtin_unreachable"; + __builtin_unreachable(); + } } auto& v = reinterpret_cast(data[start_idx + i]); v = (DecimalType)value;