23.3 Backports of #48294 - update arrow v11 #405

Open
wants to merge 5 commits into base: customizations/23.3.19
81 changes: 81 additions & 0 deletions base/glibc-compatibility/musl/expf.c
@@ -0,0 +1,81 @@
/* origin: FreeBSD /usr/src/lib/msun/src/e_expf.c */
/*
 * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
 */
/*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */

#include "libm.h"

static const float
half[2] = {0.5,-0.5},
ln2hi = 6.9314575195e-1f, /* 0x3f317200 */
ln2lo = 1.4286067653e-6f, /* 0x35bfbe8e */
invln2 = 1.4426950216e+0f, /* 0x3fb8aa3b */
/*
 * Domain [-0.34568, 0.34568], range ~[-4.278e-9, 4.447e-9]:
 * |x*(exp(x)+1)/(exp(x)-1) - p(x)| < 2**-27.74
 */
P1 = 1.6666625440e-1f, /* 0xaaaa8f.0p-26 */
P2 = -2.7667332906e-3f; /* -0xb55215.0p-32 */

float expf(float x)
{
    float_t hi, lo, c, xx, y;
    int k, sign;
    uint32_t hx;

    GET_FLOAT_WORD(hx, x);
    sign = hx >> 31;   /* sign bit of x */
    hx &= 0x7fffffff;  /* high word of |x| */

    /* special cases */
    if (hx >= 0x42aeac50) {  /* if |x| >= -87.33655f or NaN */
        if (hx >= 0x42b17218 && !sign) {  /* x >= 88.722839f */
            /* overflow */
            x *= 0x1p127f;
            return x;
        }
        if (sign) {
            /* underflow */
            FORCE_EVAL(-0x1p-149f/x);
            if (hx >= 0x42cff1b5)  /* x <= -103.972084f */
                return 0;
        }
    }

    /* argument reduction */
    if (hx > 0x3eb17218) {  /* if |x| > 0.5 ln2 */
        if (hx > 0x3f851592)  /* if |x| > 1.5 ln2 */
            k = invln2*x + half[sign];
        else
            k = 1 - sign - sign;
        hi = x - k*ln2hi;  /* k*ln2hi is exact here */
        lo = k*ln2lo;
        x = hi - lo;
    } else if (hx > 0x39000000) {  /* |x| > 2**-14 */
        k = 0;
        hi = x;
        lo = 0;
    } else {
        /* raise inexact */
        FORCE_EVAL(0x1p127f + x);
        return 1 + x;
    }

    /* x is now in primary range */
    xx = x*x;
    c = x - xx*(P1+xx*P2);
    y = 1 + (x*c/(2-c) - lo + hi);
    if (k == 0)
        return y;
    return scalbnf(y, k);
}
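The function follows the classic exp() scheme: reduce x to r = x - k*ln2 with |r| <= 0.5*ln2, approximate exp(r) with a short rational polynomial, then rescale by 2^k through scalbnf(). A quick numeric illustration of the reduction identity, not part of the PR, using the host <cmath> rather than this file:

```cpp
#include <cmath>
#include <cstdio>

int main()
{
    const float x = 10.5f;
    const double ln2 = 0.6931471805599453;
    const float invln2 = 1.4426950216e+0f;               // same constant as in expf.c above
    const int k = static_cast<int>(invln2 * x + 0.5f);   // nearest integer multiple of ln2
    const double r = x - k * ln2;                        // reduced argument, |r| <= 0.5*ln2
    // exp(x) reassembled as 2^k * exp(r); the two values agree up to rounding.
    std::printf("%.7g vs %.7g\n", std::scalbn(std::exp(r), k), std::exp(static_cast<double>(x)));
    return 0;
}
```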
31 changes: 31 additions & 0 deletions base/glibc-compatibility/musl/scalbnf.c
@@ -0,0 +1,31 @@
#include <math.h>
#include <stdint.h>

float scalbnf(float x, int n)
{
    union {float f; uint32_t i;} u;
    float_t y = x;

    if (n > 127) {
        y *= 0x1p127f;
        n -= 127;
        if (n > 127) {
            y *= 0x1p127f;
            n -= 127;
            if (n > 127)
                n = 127;
        }
    } else if (n < -126) {
        y *= 0x1p-126f;
        n += 126;
        if (n < -126) {
            y *= 0x1p-126f;
            n += 126;
            if (n < -126)
                n = -126;
        }
    }
    u.i = (uint32_t)(0x7f+n)<<23;
    x = y * u.f;
    return x;
}
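The final two statements of scalbnf() assemble the scale factor 2^n straight from its IEEE-754 bit pattern: biased exponent 127 + n shifted into bits 23..30, zero mantissa. A small sketch of that trick, not part of the PR, using memcpy for the type pun instead of the union:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Builds the float 2^n from its bit pattern, valid for normal results (-126 <= n <= 127);
// the staged clamping in scalbnf() above keeps the final n inside that range.
static float pow2f(int n)
{
    const uint32_t bits = static_cast<uint32_t>(127 + n) << 23;
    float f;
    std::memcpy(&f, &bits, sizeof f);
    return f;
}

int main()
{
    std::printf("2^10 = %g, 2^-3 = %g\n", pow2f(10), pow2f(-3));  // prints 1024 and 0.125
    return 0;
}
```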
2 changes: 1 addition & 1 deletion contrib/arrow
Submodule arrow updated 4489 files
25 changes: 21 additions & 4 deletions contrib/arrow-cmake/CMakeLists.txt
@@ -202,6 +202,7 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/builder.cc"
"${LIBRARY_DIR}/buffer.cc"
"${LIBRARY_DIR}/chunked_array.cc"
"${LIBRARY_DIR}/chunk_resolver.cc"
"${LIBRARY_DIR}/compare.cc"
"${LIBRARY_DIR}/config.cc"
"${LIBRARY_DIR}/datum.cc"
@@ -268,6 +269,10 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/util/uri.cc"
"${LIBRARY_DIR}/util/utf8.cc"
"${LIBRARY_DIR}/util/value_parsing.cc"
"${LIBRARY_DIR}/util/byte_size.cc"
"${LIBRARY_DIR}/util/debug.cc"
"${LIBRARY_DIR}/util/tracing.cc"
"${LIBRARY_DIR}/util/atfork_internal.cc"
"${LIBRARY_DIR}/vendored/base64.cpp"
"${LIBRARY_DIR}/vendored/datetime/tz.cpp"

@@ -301,9 +306,11 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/compute/exec/source_node.cc"
"${LIBRARY_DIR}/compute/exec/sink_node.cc"
"${LIBRARY_DIR}/compute/exec/order_by_impl.cc"
"${LIBRARY_DIR}/compute/exec/partition_util.cc"
"${LIBRARY_DIR}/compute/function.cc"
"${LIBRARY_DIR}/compute/function_internal.cc"
"${LIBRARY_DIR}/compute/kernel.cc"
"${LIBRARY_DIR}/compute/light_array.cc"
"${LIBRARY_DIR}/compute/registry.cc"
"${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc"
"${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc"
@@ -317,21 +324,28 @@
"${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_dictionary.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_internal.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_extension.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_nested.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_numeric.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_string.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_temporal.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_compare.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_nested.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_random.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_round.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_set_lookup.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_string.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_temporal_binary.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_temporal_unary.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_validity.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_if_else.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_string_ascii.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_string_utf8.cc"
"${LIBRARY_DIR}/compute/kernels/util_internal.cc"
"${LIBRARY_DIR}/compute/kernels/vector_array_sort.cc"
"${LIBRARY_DIR}/compute/kernels/vector_cumulative_ops.cc"
"${LIBRARY_DIR}/compute/kernels/vector_hash.cc"
"${LIBRARY_DIR}/compute/kernels/vector_rank.cc"
"${LIBRARY_DIR}/compute/kernels/vector_select_k.cc"
"${LIBRARY_DIR}/compute/kernels/vector_nested.cc"
"${LIBRARY_DIR}/compute/kernels/vector_replace.cc"
"${LIBRARY_DIR}/compute/kernels/vector_selection.cc"
@@ -340,13 +354,15 @@
"${LIBRARY_DIR}/compute/exec/union_node.cc"
"${LIBRARY_DIR}/compute/exec/key_hash.cc"
"${LIBRARY_DIR}/compute/exec/key_map.cc"
"${LIBRARY_DIR}/compute/exec/key_compare.cc"
"${LIBRARY_DIR}/compute/exec/key_encode.cc"
"${LIBRARY_DIR}/compute/exec/util.cc"
"${LIBRARY_DIR}/compute/exec/hash_join_dict.cc"
"${LIBRARY_DIR}/compute/exec/hash_join.cc"
"${LIBRARY_DIR}/compute/exec/hash_join_node.cc"
"${LIBRARY_DIR}/compute/exec/task_util.cc"
"${LIBRARY_DIR}/compute/row/encode_internal.cc"
"${LIBRARY_DIR}/compute/row/grouper.cc"
"${LIBRARY_DIR}/compute/row/compare_internal.cc"
"${LIBRARY_DIR}/compute/row/row_internal.cc"

"${LIBRARY_DIR}/ipc/dictionary.cc"
"${LIBRARY_DIR}/ipc/feather.cc"
@@ -357,7 +373,8 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/ipc/writer.cc"

"${ARROW_SRC_DIR}/arrow/adapters/orc/adapter.cc"
"${ARROW_SRC_DIR}/arrow/adapters/orc/adapter_util.cc"
"${ARROW_SRC_DIR}/arrow/adapters/orc/util.cc"
"${ARROW_SRC_DIR}/arrow/adapters/orc/options.cc"
)

add_definitions(-DARROW_WITH_LZ4)
18 changes: 5 additions & 13 deletions src/Processors/Formats/Impl/ArrowFieldIndexUtil.h
@@ -21,9 +21,7 @@ namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/// For ORC format, index_nested_type = true, a nested type takes one index count. And the
/// the start index for ORC format should be 1, since index 0 indicates to select all columns.
template<bool index_nested_type>

class ArrowFieldIndexUtil
{
public:
@@ -46,9 +44,7 @@ class ArrowFieldIndexUtil
calculateFieldIndices(const arrow::Schema & schema)
{
std::unordered_map<std::string, std::pair<int, int>> result;
// For format like ORC, index = 0 indicates to select all columns, so we skip 0 and start
// from 1.
int index_start = index_nested_type;
int index_start = 0;
for (int i = 0; i < schema.num_fields(); ++i)
{
const auto & field = schema.field(i);
@@ -94,17 +90,16 @@ class ArrowFieldIndexUtil
}

/// Count the number of indices for types.
/// For orc format, index_nested_type is true, a complex type takes one index.
size_t countIndicesForType(std::shared_ptr<arrow::DataType> type)
{
if (type->id() == arrow::Type::LIST)
{
return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type()) + index_nested_type;
return countIndicesForType(static_cast<arrow::ListType *>(type.get())->value_type());
}

if (type->id() == arrow::Type::STRUCT)
{
int indices = index_nested_type;
int indices = 0;
auto * struct_type = static_cast<arrow::StructType *>(type.get());
for (int i = 0; i != struct_type->num_fields(); ++i)
indices += countIndicesForType(struct_type->field(i)->type());
@@ -114,7 +109,7 @@ class ArrowFieldIndexUtil
if (type->id() == arrow::Type::MAP)
{
auto * map_type = static_cast<arrow::MapType *>(type.get());
return countIndicesForType(map_type->key_type()) + countIndicesForType(map_type->item_type()) + index_nested_type;
return countIndicesForType(map_type->key_type()) + countIndicesForType(map_type->item_type());
}

return 1;
@@ -144,8 +139,6 @@ class ArrowFieldIndexUtil
index_info.first = current_start_index;
if (field_type->id() == arrow::Type::STRUCT)
{
current_start_index += index_nested_type;

auto * struct_type = static_cast<arrow::StructType *>(field_type.get());
for (int i = 0, n = struct_type->num_fields(); i < n; ++i)
{
@@ -161,7 +154,6 @@ class ArrowFieldIndexUtil
const auto * list_type = static_cast<arrow::ListType *>(field_type.get());
const auto value_field = list_type->value_field();
auto index_snapshot = current_start_index;
current_start_index += index_nested_type;
calculateFieldIndices(*value_field, field_name, current_start_index, result, name_prefix);
// The nested struct field has the same name as this list field.
// rewrite it back to the original value.
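With the index_nested_type template parameter removed, only leaf fields consume an index; lists, structs and maps no longer add one for themselves (that was the ORC-specific behaviour, and the ORC reader stops using this class in the next file). A standalone sketch mirroring the new countIndicesForType() logic, not taken from the PR, using only public Arrow type APIs:

```cpp
#include <arrow/api.h>
#include <cstddef>
#include <iostream>

// Mirrors the simplified counting rule: every leaf field costs one index,
// containers themselves cost nothing.
static std::size_t countIndices(const std::shared_ptr<arrow::DataType> & type)
{
    if (type->id() == arrow::Type::LIST)
        return countIndices(static_cast<arrow::ListType *>(type.get())->value_type());

    if (type->id() == arrow::Type::STRUCT)
    {
        std::size_t indices = 0;
        auto * struct_type = static_cast<arrow::StructType *>(type.get());
        for (int i = 0; i != struct_type->num_fields(); ++i)
            indices += countIndices(struct_type->field(i)->type());
        return indices;
    }

    if (type->id() == arrow::Type::MAP)
    {
        auto * map_type = static_cast<arrow::MapType *>(type.get());
        return countIndices(map_type->key_type()) + countIndices(map_type->item_type());
    }

    return 1;
}

int main()
{
    // struct<a: int32, b: list<utf8>> has 2 leaf fields; the old ORC-style counting
    // (one extra index for the struct and one for the list) would have reported 4.
    auto nested = arrow::struct_({arrow::field("a", arrow::int32()),
                                  arrow::field("b", arrow::list(arrow::utf8()))});
    std::cout << countIndices(nested) << std::endl;  // prints 2
    return 0;
}
```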
15 changes: 11 additions & 4 deletions src/Processors/Formats/Impl/ORCBlockInputFormat.cpp
@@ -130,10 +130,17 @@ void ORCBlockInputFormat::prepareReader()
format_settings.null_as_default,
format_settings.orc.case_insensitive_column_matching);

ArrowFieldIndexUtil<true> field_util(
format_settings.orc.case_insensitive_column_matching,
format_settings.orc.allow_missing_columns);
include_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema);
const bool ignore_case = format_settings.orc.case_insensitive_column_matching;
std::unordered_set<String> nested_table_names;
if (format_settings.orc.import_nested)
nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case);

for (int i = 0; i < schema->num_fields(); ++i)
{
const auto & name = schema->field(i)->name();
if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name))
include_indices.push_back(i);
}
}

ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
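For ORC, prepareReader() now builds include_indices by matching top-level schema field names against the header (and against nested table names when import_nested is enabled). A hypothetical standalone sketch of that matching, with made-up column names and a simple dot-prefix stand-in for Nested::getAllTableNames():

```cpp
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

static std::string lowered(std::string s)
{
    std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); });
    return s;
}

int main()
{
    const bool ignore_case = true;                                           // orc.case_insensitive_column_matching
    const std::vector<std::string> header = {"ID", "Nested.value"};          // columns requested by the query
    const std::vector<std::string> orc_schema = {"id", "nested", "unused"};  // top-level fields in the file

    std::unordered_set<std::string> header_names;
    std::unordered_set<std::string> nested_table_names;                      // stand-in for Nested::getAllTableNames()
    for (const auto & column : header)
    {
        const auto name = ignore_case ? lowered(column) : column;
        header_names.insert(name);
        if (auto dot = name.find('.'); dot != std::string::npos)
            nested_table_names.insert(name.substr(0, dot));
    }

    std::vector<int> include_indices;
    for (int i = 0; i < static_cast<int>(orc_schema.size()); ++i)
    {
        const auto name = ignore_case ? lowered(orc_schema[i]) : orc_schema[i];
        if (header_names.count(name) || nested_table_names.count(name))
            include_indices.push_back(i);
    }

    for (int i : include_indices)
        std::cout << i << ' ';                                               // prints "0 1" -- "unused" is skipped
    std::cout << '\n';
    return 0;
}
```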
2 changes: 1 addition & 1 deletion src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp
@@ -139,7 +139,7 @@ void ParquetBlockInputFormat::prepareReader()
format_settings.null_as_default,
format_settings.parquet.case_insensitive_column_matching);

ArrowFieldIndexUtil<false> field_util(
ArrowFieldIndexUtil field_util(
format_settings.parquet.case_insensitive_column_matching,
format_settings.parquet.allow_missing_columns);
column_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema);
10 changes: 5 additions & 5 deletions src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp
@@ -95,14 +95,14 @@ void ParquetBlockOutputFormat::consume(Chunk chunk)
builder.version(getParquetVersion(format_settings));
builder.compression(getParquetCompression(format_settings.parquet.output_compression_method));
auto props = builder.build();
auto status = parquet::arrow::FileWriter::Open(
auto result = parquet::arrow::FileWriter::Open(
*arrow_table->schema(),
arrow::default_memory_pool(),
sink,
props, /*parquet::default_writer_properties(),*/
&file_writer);
if (!status.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", status.ToString());
props);
if (!result.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", result.status().ToString());
file_writer = std::move(result.ValueOrDie());
}

// TODO: calculate row_group_size depending on a number of rows and table size
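In Arrow 11, parquet::arrow::FileWriter::Open() returns an arrow::Result holding the writer instead of filling an out-parameter and returning arrow::Status, which is why the call above switches to result.status() and ValueOrDie(). A minimal sketch of the Result pattern itself, with a made-up ParsePositive helper standing in for the Arrow call:

```cpp
#include <arrow/result.h>
#include <arrow/status.h>
#include <iostream>

// Made-up helper returning arrow::Result<T>, mirroring the error handling used above.
static arrow::Result<int> ParsePositive(int v)
{
    if (v <= 0)
        return arrow::Status::Invalid("value must be positive, got ", v);
    return v;
}

int main()
{
    auto result = ParsePositive(-1);
    if (!result.ok())
        std::cerr << "error: " << result.status().ToString() << std::endl;  // same pattern as result.status().ToString()
    else
        std::cout << "value: " << result.ValueOrDie() << std::endl;         // same pattern as std::move(result).ValueOrDie()
    return 0;
}
```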
7 changes: 5 additions & 2 deletions tests/queries/0_stateless/00900_long_parquet_load.reference
@@ -92,8 +92,11 @@ idx10 ['This','is','a','test']
123 1
456 2
=== Try load data from datapage_v2.snappy.parquet
Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unknown encoding type.: While executing ParquetBlockInputFormat: data for INSERT was parsed from stdin: (in query: INSERT INTO parquet_load FORMAT Parquet). (CANNOT_READ_ALL_DATA)

abc 1 2 1 [1,2,3]
abc 2 3 1 []
abc 3 4 1 []
\N 4 5 0 [1,2,3]
abc 5 2 1 [1,2]
=== Try load data from datatype-date32.parquet
1925-01-01
1949-10-01
@@ -1 +1 @@
`a` Nullable(String), `b` Array(Nullable(Int32)), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Array(Nullable(Int32))
`a` Nullable(String), `b` Nullable(Int32), `c` Nullable(Float64), `d` Nullable(UInt8), `e` Array(Nullable(Int32))