From 24e119186bb3a001b3a7f4fe33097699011a69c4 Mon Sep 17 00:00:00 2001 From: Keith McHugh Date: Mon, 11 Sep 2023 07:59:20 -0700 Subject: [PATCH 1/5] Add ColumnValueParser to convert complex types into ruby objects --- lib/trino/client/column_value_parser.rb | 118 +++++++++++++++++++++ lib/trino/client/query.rb | 1 + spec/column_value_parser_spec.rb | 132 ++++++++++++++++++++++++ 3 files changed, 251 insertions(+) create mode 100644 lib/trino/client/column_value_parser.rb create mode 100644 spec/column_value_parser_spec.rb diff --git a/lib/trino/client/column_value_parser.rb b/lib/trino/client/column_value_parser.rb new file mode 100644 index 00000000..d7a500ae --- /dev/null +++ b/lib/trino/client/column_value_parser.rb @@ -0,0 +1,118 @@ +module Trino::Client + class ColumnValueParser + INSIDE_MATCHING_PARENS_REGEX = /\((?>[^)(]+|\g<0>)*\)/ + + attr_reader :name, :type + + def initialize(column) + @name = column.name + @type = prepare_type_for_parsing(column.type) + end + + # Public: Parse the value of a row's field by using its column's Trino type. + # Trino types can be scalars like VARCHAR and TIMESTAMP or complex types + # like ARRAY and ROW. ROW types are treated as objects. + # An ARRAY column's type is an array of types as you'd expect. A ROW + # column's type is a comma-separated list of space-separated (name, type) tuples. + # + # data - The value of a row's field. Can be a string, number, an array of those, + # or an array of arrays, etc. + # dtype - The Trino type string of the column. See above explanation. 
+ # + # Returns: + # - The given value for strings and numbers + # - A Time for timestamps + # - A Hash of { field1 => value1, field2 => value2, ...etc } for row types + # - An array of the above for array types + def value(data, dtype = type) + # Convert Trino ARRAY elements into Ruby Arrays + if starts_with?(dtype, 'array(') + return parse_array_element(data, dtype) + + # Convert Trino ROW elements into Ruby Hashes + elsif starts_with?(dtype, 'row(') + return parse_row_element(data, dtype) + + # Decode VARBINARY strings + elsif starts_with?(dtype, 'varbinary') + return blank?(data) ? nil : Base64.decode64(data) + + # Convert TIMESTAMP fields to Ruby Time objects + elsif starts_with?(dtype, 'timestamp') + return blank?(data) ? nil : Time.parse(data) + end + + # Other types are returned unaltered + data + end + + private + + # Private: Remove quotation marks and handle recent versions of + # Trino having a 'with time zone' suffix on some fields that breaks + # our assumption that types don't have spaces in them. + # + # Returns a string. + def prepare_type_for_parsing(type) + type.gsub('"', '').gsub(' with time zone', '_with_time_zone') + end + + def parse_array_element(data, dtype) + # If the element is empty, return an empty array + return [] if blank?(data) + + # Inner data type will be the current dtype with `array(` and `)` chopped off + inner_dtype = dtype.match(INSIDE_MATCHING_PARENS_REGEX)[0][1..-2] + + data.map { |inner_data| value(inner_data, inner_dtype) } + end + + def parse_row_element(data, dtype) + # If the element is empty, return an empty object + return {} if blank?(data) + + parsed_row_element = {} + + inner_dtype = dtype.match(INSIDE_MATCHING_PARENS_REGEX)[0][1..-2] + elems = inner_dtype.split(' ') + num_elems_to_skip = 0 + field_position = 0 + + # Iterate over each datatype of the row and mutate parsed_row_element + # to have a key of the field name and value for that field's value. 
+ elems.each_with_index do |field, i| + # We detected an array or row and are skipping all of the elements within it + # since its conversion was handled by calling `value` recursively. + if num_elems_to_skip.positive? + num_elems_to_skip -= 1 + next + end + + # Field names never have these characters and are never the last element. + next if field.include?(',') || field.include?('(') || field.include?(')') || i == elems.length - 1 + + type = elems[(i + 1)..].join(' ') + + # If this row has a nested array or row, the type of this field is that array or row's type. + if starts_with?(type, 'array(') || starts_with?(type, 'row(') + datatype = type.sub(/\(.*/, '') + type = "#{datatype}#{type.match(INSIDE_MATCHING_PARENS_REGEX)[0]}" + num_elems_to_skip = type.split(' ').length # see above comment about num_elems_to_skip + end + + parsed_row_element[field] = value(data[field_position], type) + field_position += 1 + end + + parsed_row_element + end + + def blank?(obj) + obj.respond_to?(:empty?) ? !!obj.empty? 
: !obj + end + + def starts_with?(str, prefix) + prefix.respond_to?(:to_str) && str[0, prefix.length] == prefix + end + end +end diff --git a/lib/trino/client/query.rb b/lib/trino/client/query.rb index c6e256ee..a5962d9e 100644 --- a/lib/trino/client/query.rb +++ b/lib/trino/client/query.rb @@ -18,6 +18,7 @@ module Trino::Client require 'faraday' require 'faraday/gzip' require 'faraday/follow_redirects' + require 'trino/client/column_value_parser' require 'trino/client/models' require 'trino/client/errors' require 'trino/client/faraday_client' diff --git a/spec/column_value_parser_spec.rb b/spec/column_value_parser_spec.rb new file mode 100644 index 00000000..1a0f3915 --- /dev/null +++ b/spec/column_value_parser_spec.rb @@ -0,0 +1,132 @@ +require 'spec_helper' + +describe Trino::Client::ColumnValueParser do + def column_value(data, type) + column = Struct.new(:type, :name).new(type) + Trino::Client::ColumnValueParser.new(column).value(data) + end + + it 'parses varchar values' do + data = 'a string' + type = 'varchar' + expected_value = 'a string' + expect(column_value(data, type)).to eq(expected_value) + end + + it 'parses timestamp values' do + data = '2022-07-01T14:53:02Z' + type = 'timestamp with time zone' + expected_value = Time.parse(data) + expect(column_value(data, type)).to eq(expected_value) + end + + it 'parses array type values' do + data = [1, 2, 3, 4] + type = 'array(integer, integer, integer, integer)' + expected_value = [1, 2, 3, 4] + expect(column_value(data, type)).to eq(expected_value) + end + + it 'parses row type values' do + data = [ + 'userId', + 'userLogin', + 'SKU_FREE', + 'TYPE_USER', + '2022-07-01T14:53:02Z', + '' + ] + type = 'row(id varchar, "name" varchar, plan_sku varchar, type varchar, created_at timestamp with time zone, organization_tenant_name varchar)' + expected_value = { + 'id' => 'userId', + 'name' => 'userLogin', + 'plan_sku' => 'SKU_FREE', + 'type' => 'TYPE_USER', + 'created_at' => Time.parse('2022-07-01T14:53:02Z'), + 
'organization_tenant_name' => '' + } + value = column_value(data, type) + expect(column_value(data, type)).to eq(expected_value) + expect(value['created_at'].is_a?(Time)).to eq true + end + + it 'parses an array of row type values' do + data = [[ + 'userId', + 'userLogin', + 'SKU_FREE', + 'TYPE_USER', + '2022-07-01T14:53:02Z', + '' + ]] + type = 'array(row(id varchar, "name" varchar, plan_sku varchar, type varchar, created_at timestamp with time zone, organization_tenant_name varchar))' + expected_value = [{ + 'id' => 'userId', + 'name' => 'userLogin', + 'plan_sku' => 'SKU_FREE', + 'type' => 'TYPE_USER', + 'created_at' => Time.parse('2022-07-01T14:53:02Z'), + 'organization_tenant_name' => '' + }] + value = column_value(data, type) + expect(column_value(data, type)).to eq(expected_value) + expect(value[0]['created_at'].is_a?(Time)).to eq true + end + + it 'parses row type values that have an array in them' do + data = [ + 'userId', + %w[userLogin1 userLogin2], + 'value' + ] + type = 'row(id varchar, logins array(varchar), onemore varchar)' + expected_value = { + 'id' => 'userId', + 'logins' => %w[userLogin1 userLogin2], + 'onemore' => 'value' + } + expect(column_value(data, type)).to eq(expected_value) + end + + it 'parses row type values that have a row in them' do + data = [ + 'userId', + ['userLogin', '2022-07-01T14:53:02Z', 1234], + 'value' + ] + type = 'row(id varchar, subobj row(login varchar, created_at timestamp with time zone, id integer), onemore varchar)' + expected_value = { + 'id' => 'userId', + 'subobj' => { + 'login' => 'userLogin', + 'created_at' => Time.parse('2022-07-01T14:53:02Z'), + 'id' => 1234 + }, + 'onemore' => 'value' + } + value = column_value(data, type) + expect(column_value(data, type)).to eq(expected_value) + expect(value['subobj']['created_at'].is_a?(Time)).to eq true + end + + it 'parses row type values that have nested rows in them' do + data = [ + 'userId', + ['userLogin', '2022-07-01T14:53:02Z', [1234]], + 'value' + ] + type = 
'row(id varchar, subobj row(login varchar, created_at timestamp with time zone, id row(subid integer)), onemore varchar)' + expected_value = { + 'id' => 'userId', + 'subobj' => { + 'login' => 'userLogin', + 'created_at' => Time.parse('2022-07-01T14:53:02Z'), + 'id' => { 'subid' => 1234 } + }, + 'onemore' => 'value' + } + value = column_value(data, type) + expect(column_value(data, type)).to eq(expected_value) + expect(value['subobj']['created_at'].is_a?(Time)).to eq true + end +end From 986d89c89bf3e5840b5e6ffd4b48bb67b7622000 Mon Sep 17 00:00:00 2001 From: Keith McHugh Date: Mon, 11 Sep 2023 08:47:29 -0700 Subject: [PATCH 2/5] Add specs showing that complex types are returned as arrays of values --- spec/client_spec.rb | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/spec/client_spec.rb b/spec/client_spec.rb index cb163b81..80b7787f 100644 --- a/spec/client_spec.rb +++ b/spec/client_spec.rb @@ -8,15 +8,17 @@ [ Models::Column.new(name: 'animal', type: 'string'), Models::Column.new(name: 'score', type: 'integer'), - Models::Column.new(name: 'name', type: 'string') + Models::Column.new(name: 'name', type: 'string'), + Models::Column.new(name: 'foods', type: 'array(string string)'), + Models::Column.new(name: 'traits', type: 'row(breed string, num_spots integer)') ] end it 'multiple rows' do rows = [ - ['dog', 1, 'Lassie'], - ['horse', 5, 'Mr. Ed'], - ['t-rex', 37, 'Doug'] + ['dog', 1, 'Lassie', ['kibble', 'peanut butter'], ['spaniel', 2]], + ['horse', 5, 'Mr. 
Ed', ['hay', 'sugar cubes'], ['some horse', 0]], + ['t-rex', 37, 'Doug', ['rodents', 'small dinos'], ['dino', 0]] ] client.stub(:run).and_return([columns, rows]) @@ -27,18 +29,26 @@ expect(rehashed[0]['animal']).to eq 'dog' expect(rehashed[0]['score']).to eq 1 expect(rehashed[0]['name']).to eq 'Lassie' + expect(rehashed[0]['foods']).to eq ['kibble', 'peanut butter'] + expect(rehashed[0]['traits']).to eq ['spaniel', 2] expect(rehashed[0].values[0]).to eq 'dog' expect(rehashed[0].values[1]).to eq 1 expect(rehashed[0].values[2]).to eq 'Lassie' + expect(rehashed[0].values[3]).to eq ['kibble', 'peanut butter'] + expect(rehashed[0].values[4]).to eq ['spaniel', 2] expect(rehashed[1]['animal']).to eq 'horse' expect(rehashed[1]['score']).to eq 5 expect(rehashed[1]['name']).to eq 'Mr. Ed' + expect(rehashed[1]['foods']).to eq ['hay', 'sugar cubes'] + expect(rehashed[1]['traits']).to eq ['some horse', 0] expect(rehashed[1].values[0]).to eq 'horse' expect(rehashed[1].values[1]).to eq 5 expect(rehashed[1].values[2]).to eq 'Mr. 
Ed' + expect(rehashed[1].values[3]).to eq ['hay', 'sugar cubes'] + expect(rehashed[1].values[4]).to eq ['some horse', 0] end it 'empty results' do @@ -58,17 +68,21 @@ "animal" => "wrong", "score" => "count", "name" => nil, + "foods" => nil, + "traits" => nil }] end it 'handles too many result columns' do - rows = [['wrong', 'count', 'too', 'much', 'columns']] + rows = [['wrong', 'count', 'too', 'too', 'too', 'much', 'columns']] client.stub(:run).and_return([columns, rows]) expect(client.run_with_names('fake query')).to eq [{ "animal" => "wrong", "score" => "count", - "name" => 'too', + "name" => "too", + "foods" => "too", + "traits" => "too" }] end end From 59438f7db03364b8e5403efb401548310bdfa228 Mon Sep 17 00:00:00 2001 From: Keith McHugh Date: Mon, 11 Sep 2023 10:04:30 -0700 Subject: [PATCH 3/5] Add methods for transforming rows into Ruby objects --- lib/trino/client/query.rb | 27 +++++++++++++++++++++++++++ spec/client_spec.rb | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/lib/trino/client/query.rb b/lib/trino/client/query.rb index a5962d9e..02d6ad39 100644 --- a/lib/trino/client/query.rb +++ b/lib/trino/client/query.rb @@ -45,6 +45,19 @@ def self.faraday_client(options) Trino::Client.faraday_client(options) end + def self.transform_row(column_value_parsers, row) + row_object = {} + + row.each_with_index do |element, i| + column = column_value_parsers[i] + value = column.value(element) + + row_object[column.name] = value + end + + row_object + end + def initialize(api) @api = api end @@ -87,6 +100,20 @@ def columns return @api.current_results.columns end + def column_value_parsers + @column_value_parsers ||= columns.map {|column| + ColumnValueParser.new(column) + } + end + + def transform_rows + rows.map(&:transform_row) + end + + def transform_row(row) + self.class.transform_row(column_value_parsers, row) + end + def rows rows = [] each_row_chunk {|chunk| diff --git a/spec/client_spec.rb b/spec/client_spec.rb index 
80b7787f..e0bc56f5 100644 --- a/spec/client_spec.rb +++ b/spec/client_spec.rb @@ -51,6 +51,41 @@ expect(rehashed[1].values[4]).to eq ['some horse', 0] end + it 'transforms rows into Ruby objects' do + rows = [ + ['dog', 1, 'Lassie', ['kibble', 'peanut butter'], ['spaniel', 2]], + ['horse', 5, 'Mr. Ed', ['hay', 'sugar cubes'], ['some horse', 0]], + ['t-rex', 37, 'Doug', ['rodents', 'small dinos'], ['dino', 0]] + ] + client.stub(:run).and_return([columns, rows]) + + columns, rows = client.run('fake query') + column_value_parsers = columns.map { |column| Trino::Client::ColumnValueParser.new(column) } + transformed_rows = rows.map { |row| Trino::Client::Query.transform_row(column_value_parsers, row) } + + expect(transformed_rows[0]).to eq({ + "animal" => "dog", + "score" => 1, + "name" => "Lassie", + "foods" => ["kibble", "peanut butter"], + "traits" => { + "breed" => "spaniel", + "num_spots" => 2, + }, + }) + + expect(transformed_rows[1]).to eq({ + "animal" => "horse", + "score" => 5, + "name" => "Mr. 
Ed", + "foods" => ["hay", "sugar cubes"], + "traits" => { + "breed" => "some horse", + "num_spots" => 0, + }, + }) + end + it 'empty results' do rows = [] client.stub(:run).and_return([columns, rows]) From 7efef67b247da14126f614b4d7b4394a52e6b1a4 Mon Sep 17 00:00:00 2001 From: Keith McHugh Date: Tue, 12 Sep 2023 07:02:29 -0700 Subject: [PATCH 4/5] bundle exec standardrb --fix --- spec/column_value_parser_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/column_value_parser_spec.rb b/spec/column_value_parser_spec.rb index 1a0f3915..442df930 100644 --- a/spec/column_value_parser_spec.rb +++ b/spec/column_value_parser_spec.rb @@ -121,7 +121,7 @@ def column_value(data, type) 'subobj' => { 'login' => 'userLogin', 'created_at' => Time.parse('2022-07-01T14:53:02Z'), - 'id' => { 'subid' => 1234 } + 'id' => {'subid' => 1234} }, 'onemore' => 'value' } From 91cabadb2cf9bf61043fcb50e9295862ad49b1ba Mon Sep 17 00:00:00 2001 From: Keith McHugh Date: Wed, 13 Sep 2023 09:21:05 -0700 Subject: [PATCH 5/5] Don't configure scalar types by default, let users configure how to do this --- lib/trino/client/column_value_parser.rb | 17 +++++++---------- spec/column_value_parser_spec.rb | 25 +++++++++---------------- 2 files changed, 16 insertions(+), 26 deletions(-) diff --git a/lib/trino/client/column_value_parser.rb b/lib/trino/client/column_value_parser.rb index d7a500ae..cfc9dff2 100644 --- a/lib/trino/client/column_value_parser.rb +++ b/lib/trino/client/column_value_parser.rb @@ -2,11 +2,12 @@ module Trino::Client class ColumnValueParser INSIDE_MATCHING_PARENS_REGEX = /\((?>[^)(]+|\g<0>)*\)/ - attr_reader :name, :type + attr_reader :name, :type, :scalar_parser - def initialize(column) + def initialize(column, scalar_parser = nil) @name = column.name @type = prepare_type_for_parsing(column.type) + @scalar_parser = scalar_parser end # Public: Parse the value of a row's field by using its column's Trino type. 
@@ -33,16 +34,12 @@ def value(data, dtype = type) elsif starts_with?(dtype, 'row(') return parse_row_element(data, dtype) - # Decode VARBINARY strings - elsif starts_with?(dtype, 'varbinary') - return blank?(data) ? nil : Base64.decode64(data) - - # Convert TIMESTAMP fields to Ruby Time objects - elsif starts_with?(dtype, 'timestamp') - return blank?(data) ? nil : Time.parse(data) + # If defined, use scalar_parser to convert scalar types + elsif !scalar_parser.nil? + return scalar_parser.call(data, dtype) end - # Other types are returned unaltered + # Otherwise, values are returned unaltered data end diff --git a/spec/column_value_parser_spec.rb b/spec/column_value_parser_spec.rb index 442df930..ea8dd955 100644 --- a/spec/column_value_parser_spec.rb +++ b/spec/column_value_parser_spec.rb @@ -1,9 +1,9 @@ require 'spec_helper' describe Trino::Client::ColumnValueParser do - def column_value(data, type) + def column_value(data, type, scalar_parser = nil) column = Struct.new(:type, :name).new(type) - Trino::Client::ColumnValueParser.new(column).value(data) + Trino::Client::ColumnValueParser.new(column, scalar_parser).value(data) end it 'parses varchar values' do @@ -13,11 +13,12 @@ def column_value(data, type) expect(column_value(data, type)).to eq(expected_value) end - it 'parses timestamp values' do + it 'converts scalar values if configured to do so' do data = '2022-07-01T14:53:02Z' type = 'timestamp with time zone' + scalar_parser = ->(value, _dtype) { Time.parse(value) } expected_value = Time.parse(data) - expect(column_value(data, type)).to eq(expected_value) + expect(column_value(data, type, scalar_parser)).to eq(expected_value) end it 'parses array type values' do @@ -42,12 +43,10 @@ def column_value(data, type) 'name' => 'userLogin', 'plan_sku' => 'SKU_FREE', 'type' => 'TYPE_USER', - 'created_at' => Time.parse('2022-07-01T14:53:02Z'), + 'created_at' => '2022-07-01T14:53:02Z', 'organization_tenant_name' => '' } - value = column_value(data, type) 
expect(column_value(data, type)).to eq(expected_value) - expect(value['created_at'].is_a?(Time)).to eq true end it 'parses an array of row type values' do @@ -65,12 +64,10 @@ def column_value(data, type) 'name' => 'userLogin', 'plan_sku' => 'SKU_FREE', 'type' => 'TYPE_USER', - 'created_at' => Time.parse('2022-07-01T14:53:02Z'), + 'created_at' => '2022-07-01T14:53:02Z', 'organization_tenant_name' => '' }] - value = column_value(data, type) expect(column_value(data, type)).to eq(expected_value) - expect(value[0]['created_at'].is_a?(Time)).to eq true end it 'parses row type values that have an array in them' do @@ -99,14 +96,12 @@ def column_value(data, type) 'id' => 'userId', 'subobj' => { 'login' => 'userLogin', - 'created_at' => Time.parse('2022-07-01T14:53:02Z'), + 'created_at' => '2022-07-01T14:53:02Z', 'id' => 1234 }, 'onemore' => 'value' } - value = column_value(data, type) expect(column_value(data, type)).to eq(expected_value) - expect(value['subobj']['created_at'].is_a?(Time)).to eq true end it 'parses row type values that have nested rows in them' do @@ -120,13 +115,11 @@ def column_value(data, type) 'id' => 'userId', 'subobj' => { 'login' => 'userLogin', - 'created_at' => Time.parse('2022-07-01T14:53:02Z'), + 'created_at' => '2022-07-01T14:53:02Z', 'id' => {'subid' => 1234} }, 'onemore' => 'value' } - value = column_value(data, type) expect(column_value(data, type)).to eq(expected_value) - expect(value['subobj']['created_at'].is_a?(Time)).to eq true end end