Add utility for transforming Trino ROW type columns into Ruby hashes (#…
kmcq authored Sep 19, 2023
1 parent ebd9d9e commit e14251c
Showing 4 changed files with 323 additions and 6 deletions.
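In practice the new methods are reached through a Query object. A minimal usage sketch (not part of the diff; the connection options and table are illustrative, and it assumes client.query returns a Query when called without a block):

require 'trino/client'

client = Trino::Client.new(server: 'localhost:8080', catalog: 'hive', schema: 'default', user: 'demo')
query = client.query('SELECT name, traits FROM pets') # traits is a row(breed varchar, num_spots integer) column

query.transform_rows
# => [{ 'name' => 'Lassie', 'traits' => { 'breed' => 'spaniel', 'num_spots' => 2 } }, ...]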
115 changes: 115 additions & 0 deletions lib/trino/client/column_value_parser.rb
@@ -0,0 +1,115 @@
module Trino::Client
class ColumnValueParser
INSIDE_MATCHING_PARENS_REGEX = /\((?>[^)(]+|\g<0>)*\)/
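# Matches one balanced parenthesized group; \g<0> recurses into nested parens.
# e.g. (illustrative) 'row(id varchar, tags array(varchar))'[INSIDE_MATCHING_PARENS_REGEX]
# # => '(id varchar, tags array(varchar))'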

attr_reader :name, :type, :scalar_parser

def initialize(column, scalar_parser = nil)
@name = column.name
@type = prepare_type_for_parsing(column.type)
@scalar_parser = scalar_parser
end

# Public: Parse the value of a row's field by using its column's Trino type.
# Trino types can be scalars like VARCHAR and TIMESTAMP or complex types
# like ARRAY and ROW. ROW types are treated as objects.
# An ARRAY column's type wraps its element type, e.g. array(varchar). A ROW
# column's type is a comma-separated list of space-separated (name, type) tuples.
#
# data - The value of a row's field. Can be a string, number, an array of those,
# or an array of arrays, etc.
# dtype - The Trino type string of the column. See above explanation.
#
# Returns:
# - The given value for strings and numbers
# - A Time for timestamps
# - A Hash of { field1 => value1, field2 => value2, ...etc } for row types
# - An array of the above for array types
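#
# Example (illustrative):
# value(['spaniel', 2], 'row(breed varchar, num_spots integer)')
# # => { 'breed' => 'spaniel', 'num_spots' => 2 }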
def value(data, dtype = type)
# Convert Trino ARRAY elements into Ruby Arrays
if starts_with?(dtype, 'array(')
return parse_array_element(data, dtype)

# Convert Trino ROW elements into Ruby Hashes
elsif starts_with?(dtype, 'row(')
return parse_row_element(data, dtype)

# If defined, use scalar_parser to convert scalar types
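# e.g. scalar_parser = ->(value, dtype) { Time.parse(value) } for timestamp columns (illustrative)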
elsif !scalar_parser.nil?
return scalar_parser.call(data, dtype)
end

# Otherwise, values are returned unaltered
data
end

private

# Private: Remove quotation marks and handle recent versions of
# Trino having a 'with time zone' suffix on some fields that breaks
# our assumption that types don't have spaces in them.
#
# Returns a string.
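#
# e.g. '"name" varchar' becomes 'name varchar', and
# 'created_at timestamp with time zone' becomes 'created_at timestamp_with_time_zone'.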
def prepare_type_for_parsing(type)
type.gsub('"', '').gsub(' with time zone', '_with_time_zone')
end

def parse_array_element(data, dtype)
# If the element is empty, return an empty array
return [] if blank?(data)

# Inner data type will be the current dtype with `array(` and `)` chopped off
inner_dtype = dtype.match(INSIDE_MATCHING_PARENS_REGEX)[0][1..-2]

data.map { |inner_data| value(inner_data, inner_dtype) }
end

def parse_row_element(data, dtype)
# If the element is empty, return an empty object
return {} if blank?(data)

parsed_row_element = {}

inner_dtype = dtype.match(INSIDE_MATCHING_PARENS_REGEX)[0][1..-2]
elems = inner_dtype.split(' ')
num_elems_to_skip = 0
field_position = 0

# Iterate over each datatype of the row and mutate parsed_row_element
# to have a key of the field name and value for that field's value.
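# e.g. for 'row(id varchar, tags array(varchar))' (illustrative), elems is
# ['id', 'varchar,', 'tags', 'array(varchar)'], and the loop pairs 'id' with
# data[0] and 'tags' with data[1].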
elems.each_with_index do |field, i|
# We detected an array or row and are skipping all of the elements within it
# since its conversion was handled by calling `value` recursively.
if num_elems_to_skip.positive?
num_elems_to_skip -= 1
next
end

# Field names never have these characters and are never the last element.
next if field.include?(',') || field.include?('(') || field.include?(')') || i == elems.length - 1

type = elems[(i + 1)..].join(' ')

# If this row has a nested array or row, the type of this field is that array or row's type.
if starts_with?(type, 'array(') || starts_with?(type, 'row(')
datatype = type.sub(/\(.*/, '')
type = "#{datatype}#{type.match(INSIDE_MATCHING_PARENS_REGEX)[0]}"
num_elems_to_skip = type.split(' ').length # see above comment about num_elems_to_skip
end

parsed_row_element[field] = value(data[field_position], type)
field_position += 1
end

parsed_row_element
end

def blank?(obj)
obj.respond_to?(:empty?) ? !!obj.empty? : !obj
end

def starts_with?(str, prefix)
prefix.respond_to?(:to_str) && str[0, prefix.length] == prefix
end
end
end
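For reference, the new parser can also be exercised on its own (an illustrative sketch mirroring the specs added below, not part of the diff):

column = Trino::Client::Models::Column.new(name: 'traits', type: 'row(breed varchar, num_spots integer)')
parser = Trino::Client::ColumnValueParser.new(column)
parser.value(['spaniel', 2]) # => { 'breed' => 'spaniel', 'num_spots' => 2 }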
28 changes: 28 additions & 0 deletions lib/trino/client/query.rb
@@ -18,6 +18,7 @@ module Trino::Client
require 'faraday'
require 'faraday/gzip'
require 'faraday/follow_redirects'
require 'trino/client/column_value_parser'
require 'trino/client/models'
require 'trino/client/errors'
require 'trino/client/faraday_client'
@@ -44,6 +45,19 @@ def self.faraday_client(options)
Trino::Client.faraday_client(options)
end

def self.transform_row(column_value_parsers, row)
row_object = {}

row.each_with_index do |element, i|
column = column_value_parsers[i]
value = column.value(element)

row_object[column.name] = value
end

row_object
end
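# e.g. (illustrative) transform_row(column_value_parsers, ['dog', 1])
# # => { 'animal' => 'dog', 'score' => 1 } when the parsers cover an 'animal' and a 'score' column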

def initialize(api)
@api = api
end
@@ -86,6 +100,20 @@ def columns
return @api.current_results.columns
end

def column_value_parsers
@column_value_parsers ||= columns.map {|column|
ColumnValueParser.new(column)
}
end

def transform_rows
rows.map { |row| transform_row(row) }
end

def transform_row(row)
self.class.transform_row(column_value_parsers, row)
end

def rows
rows = []
each_row_chunk {|chunk|
61 changes: 55 additions & 6 deletions spec/client_spec.rb
@@ -8,15 +8,17 @@
[
Models::Column.new(name: 'animal', type: 'string'),
Models::Column.new(name: 'score', type: 'integer'),
Models::Column.new(name: 'name', type: 'string')
Models::Column.new(name: 'name', type: 'string'),
Models::Column.new(name: 'foods', type: 'array(string string)'),
Models::Column.new(name: 'traits', type: 'row(breed string, num_spots integer)')
]
end

it 'multiple rows' do
rows = [
['dog', 1, 'Lassie'],
['horse', 5, 'Mr. Ed'],
['t-rex', 37, 'Doug']
['dog', 1, 'Lassie', ['kibble', 'peanut butter'], ['spaniel', 2]],
['horse', 5, 'Mr. Ed', ['hay', 'sugar cubes'], ['some horse', 0]],
['t-rex', 37, 'Doug', ['rodents', 'small dinos'], ['dino', 0]]
]
client.stub(:run).and_return([columns, rows])

@@ -27,18 +29,61 @@
expect(rehashed[0]['animal']).to eq 'dog'
expect(rehashed[0]['score']).to eq 1
expect(rehashed[0]['name']).to eq 'Lassie'
expect(rehashed[0]['foods']).to eq ['kibble', 'peanut butter']
expect(rehashed[0]['traits']).to eq ['spaniel', 2]

expect(rehashed[0].values[0]).to eq 'dog'
expect(rehashed[0].values[1]).to eq 1
expect(rehashed[0].values[2]).to eq 'Lassie'
expect(rehashed[0].values[3]).to eq ['kibble', 'peanut butter']
expect(rehashed[0].values[4]).to eq ['spaniel', 2]

expect(rehashed[1]['animal']).to eq 'horse'
expect(rehashed[1]['score']).to eq 5
expect(rehashed[1]['name']).to eq 'Mr. Ed'
expect(rehashed[1]['foods']).to eq ['hay', 'sugar cubes']
expect(rehashed[1]['traits']).to eq ['some horse', 0]

expect(rehashed[1].values[0]).to eq 'horse'
expect(rehashed[1].values[1]).to eq 5
expect(rehashed[1].values[2]).to eq 'Mr. Ed'
expect(rehashed[1].values[3]).to eq ['hay', 'sugar cubes']
expect(rehashed[1].values[4]).to eq ['some horse', 0]
end

it 'transforms rows into Ruby objects' do
rows = [
['dog', 1, 'Lassie', ['kibble', 'peanut butter'], ['spaniel', 2]],
['horse', 5, 'Mr. Ed', ['hay', 'sugar cubes'], ['some horse', 0]],
['t-rex', 37, 'Doug', ['rodents', 'small dinos'], ['dino', 0]]
]
client.stub(:run).and_return([columns, rows])

columns, rows = client.run('fake query')
column_value_parsers = columns.map { |column| Trino::Client::ColumnValueParser.new(column) }
transformed_rows = rows.map { |row| Trino::Client::Query.transform_row(column_value_parsers, row) }

expect(transformed_rows[0]).to eq({
"animal" => "dog",
"score" => 1,
"name" => "Lassie",
"foods" => ["kibble", "peanut butter"],
"traits" => {
"breed" => "spaniel",
"num_spots" => 2,
},
})

expect(transformed_rows[1]).to eq({
"animal" => "horse",
"score" => 5,
"name" => "Mr. Ed",
"foods" => ["hay", "sugar cubes"],
"traits" => {
"breed" => "some horse",
"num_spots" => 0,
},
})
end

it 'empty results' do
@@ -58,17 +103,21 @@
"animal" => "wrong",
"score" => "count",
"name" => nil,
"foods" => nil,
"traits" => nil
}]
end

it 'handles too many result columns' do
rows = [['wrong', 'count', 'too', 'much', 'columns']]
rows = [['wrong', 'count', 'too', 'too', 'too', 'much', 'columns']]
client.stub(:run).and_return([columns, rows])

expect(client.run_with_names('fake query')).to eq [{
"animal" => "wrong",
"score" => "count",
"name" => 'too',
"name" => "too",
"foods" => "too",
"traits" => "too"
}]
end
end
125 changes: 125 additions & 0 deletions spec/column_value_parser_spec.rb
@@ -0,0 +1,125 @@
require 'spec_helper'

describe Trino::Client::ColumnValueParser do
def column_value(data, type, scalar_parser = nil)
column = Struct.new(:type, :name).new(type)
Trino::Client::ColumnValueParser.new(column, scalar_parser).value(data)
end

it 'parses varchar values' do
data = 'a string'
type = 'varchar'
expected_value = 'a string'
expect(column_value(data, type)).to eq(expected_value)
end

it 'converts scalar values if configured to do so' do
data = '2022-07-01T14:53:02Z'
type = 'timestamp with time zone'
scalar_parser = ->(value, _dtype) { Time.parse(value) }
expected_value = Time.parse(data)
expect(column_value(data, type, scalar_parser)).to eq(expected_value)
end

it 'parses array type values' do
data = [1, 2, 3, 4]
type = 'array(integer, integer, integer, integer)'
expected_value = [1, 2, 3, 4]
expect(column_value(data, type)).to eq(expected_value)
end

it 'parses row type values' do
data = [
'userId',
'userLogin',
'SKU_FREE',
'TYPE_USER',
'2022-07-01T14:53:02Z',
''
]
type = 'row(id varchar, "name" varchar, plan_sku varchar, type varchar, created_at timestamp with time zone, organization_tenant_name varchar)'
expected_value = {
'id' => 'userId',
'name' => 'userLogin',
'plan_sku' => 'SKU_FREE',
'type' => 'TYPE_USER',
'created_at' => '2022-07-01T14:53:02Z',
'organization_tenant_name' => ''
}
expect(column_value(data, type)).to eq(expected_value)
end

it 'parses an array of row type values' do
data = [[
'userId',
'userLogin',
'SKU_FREE',
'TYPE_USER',
'2022-07-01T14:53:02Z',
''
]]
type = 'array(row(id varchar, "name" varchar, plan_sku varchar, type varchar, created_at timestamp with time zone, organization_tenant_name varchar))'
expected_value = [{
'id' => 'userId',
'name' => 'userLogin',
'plan_sku' => 'SKU_FREE',
'type' => 'TYPE_USER',
'created_at' => '2022-07-01T14:53:02Z',
'organization_tenant_name' => ''
}]
expect(column_value(data, type)).to eq(expected_value)
end

it 'parses row type values that have an array in them' do
data = [
'userId',
%w[userLogin1 userLogin2],
'value'
]
type = 'row(id varchar, logins array(varchar), onemore varchar)'
expected_value = {
'id' => 'userId',
'logins' => %w[userLogin1 userLogin2],
'onemore' => 'value'
}
expect(column_value(data, type)).to eq(expected_value)
end

it 'parses row type values that have a row in them' do
data = [
'userId',
['userLogin', '2022-07-01T14:53:02Z', 1234],
'value'
]
type = 'row(id varchar, subobj row(login varchar, created_at timestamp with time zone, id integer), onemore varchar)'
expected_value = {
'id' => 'userId',
'subobj' => {
'login' => 'userLogin',
'created_at' => '2022-07-01T14:53:02Z',
'id' => 1234
},
'onemore' => 'value'
}
expect(column_value(data, type)).to eq(expected_value)
end

it 'parses row type values that have nested rows in them' do
data = [
'userId',
['userLogin', '2022-07-01T14:53:02Z', [1234]],
'value'
]
type = 'row(id varchar, subobj row(login varchar, created_at timestamp with time zone, id row(subid integer)), onemore varchar)'
expected_value = {
'id' => 'userId',
'subobj' => {
'login' => 'userLogin',
'created_at' => '2022-07-01T14:53:02Z',
'id' => {'subid' => 1234}
},
'onemore' => 'value'
}
expect(column_value(data, type)).to eq(expected_value)
end
end
