Skip to content

Commit

Permalink
apacheGH-38916: [R] Simplify dataset and table print output (apache#3…
Browse files Browse the repository at this point in the history
…8917)

### Rationale for this change

When printing objects whose data has lots of columns, the output is long and unwieldy.

### What changes are included in this PR?

* Truncates long schema print output: only the first 20 fields are shown, followed by a count of the remaining columns.
* Adds the total number of columns to Dataset and RecordBatchReader print output so it's clear how many there are.

### Are these changes tested?

Yes

### Are there any user-facing changes?

Yes

Before:

``` r
library(arrow)
x <- tibble::tibble(!!!letters, .rows = 5)
InMemoryDataset$create(x)
#> InMemoryDataset
#> "a": string
#> "b": string
#> "c": string
#> "d": string
#> "e": string
#> "f": string
#> "g": string
#> "h": string
#> "i": string
#> "j": string
#> "k": string
#> "l": string
#> "m": string
#> "n": string
#> "o": string
#> "p": string
#> "q": string
#> "r": string
#> "s": string
#> "t": string
#> "u": string
#> "v": string
#> "w": string
#> "x": string
#> "y": string
#> "z": string
arrow_table(x)
#> Table
#> 5 rows x 26 columns
#> $"a" <string>
#> $"b" <string>
#> $"c" <string>
#> $"d" <string>
#> $"e" <string>
#> $"f" <string>
#> $"g" <string>
#> $"h" <string>
#> $"i" <string>
#> $"j" <string>
#> $"k" <string>
#> $"l" <string>
#> $"m" <string>
#> $"n" <string>
#> $"o" <string>
#> $"p" <string>
#> $"q" <string>
#> $"r" <string>
#> $"s" <string>
#> $"t" <string>
#> $"u" <string>
#> $"v" <string>
#> $"w" <string>
#> $"x" <string>
#> $"y" <string>
#> $"z" <string>
record_batch(x)
#> RecordBatch
#> 5 rows x 26 columns
#> $"a" <string>
#> $"b" <string>
#> $"c" <string>
#> $"d" <string>
#> $"e" <string>
#> $"f" <string>
#> $"g" <string>
#> $"h" <string>
#> $"i" <string>
#> $"j" <string>
#> $"k" <string>
#> $"l" <string>
#> $"m" <string>
#> $"n" <string>
#> $"o" <string>
#> $"p" <string>
#> $"q" <string>
#> $"r" <string>
#> $"s" <string>
#> $"t" <string>
#> $"u" <string>
#> $"v" <string>
#> $"w" <string>
#> $"x" <string>
#> $"y" <string>
#> $"z" <string>
```

After:

``` r
library(arrow)

x <- tibble::tibble(!!!letters, .rows = 5)
InMemoryDataset$create(x)
#> InMemoryDataset
#> 26 columns 
#> "a": string
#> "b": string
#> "c": string
#> "d": string
#> "e": string
#> "f": string
#> "g": string
#> "h": string
#> "i": string
#> "j": string
#> "k": string
#> "l": string
#> "m": string
#> "n": string
#> "o": string
#> "p": string
#> "q": string
#> "r": string
#> "s": string
#> "t": string
#> ...
#> 6 more columns
#> Use `schema()` to see entire schema
arrow_table(x)
#> Table
#> 5 rows x 26 columns
#> $"a" <string>
#> $"b" <string>
#> $"c" <string>
#> $"d" <string>
#> $"e" <string>
#> $"f" <string>
#> $"g" <string>
#> $"h" <string>
#> $"i" <string>
#> $"j" <string>
#> $"k" <string>
#> $"l" <string>
#> $"m" <string>
#> $"n" <string>
#> $"o" <string>
#> $"p" <string>
#> $"q" <string>
#> $"r" <string>
#> $"s" <string>
#> $"t" <string>
#> ...
#> 6 more columns
#> Use `schema()` to see entire schema
record_batch(x)
#> RecordBatch
#> 5 rows x 26 columns
#> $"a" <string>
#> $"b" <string>
#> $"c" <string>
#> $"d" <string>
#> $"e" <string>
#> $"f" <string>
#> $"g" <string>
#> $"h" <string>
#> $"i" <string>
#> $"j" <string>
#> $"k" <string>
#> $"l" <string>
#> $"m" <string>
#> $"n" <string>
#> $"o" <string>
#> $"p" <string>
#> $"q" <string>
#> $"r" <string>
#> $"s" <string>
#> $"t" <string>
#> ...
#> 6 more columns
#> Use `schema()` to see entire schema
```

* Closes: apache#38916

Lead-authored-by: Nic Crane <thisisnic@gmail.com>
Co-authored-by: Bryce Mecum <petridish@gmail.com>
Signed-off-by: Nic Crane <thisisnic@gmail.com>
  • Loading branch information
thisisnic and amoeba authored Mar 13, 2024
1 parent bd3fab4 commit ac1708c
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 8 deletions.
2 changes: 1 addition & 1 deletion r/R/arrow-tabular.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ ArrowTabular <- R6Class("ArrowTabular",
inherit = ArrowObject,
public = list(
ToString = function() {
sch <- unlist(strsplit(self$schema$ToString(), "\n"))
sch <- unlist(strsplit(self$schema$ToString(truncate = TRUE), "\n"))
sch <- sub("(.*): (.*)", "$\\1 <\\2>", sch)
dims <- sprintf("%s rows x %s columns", self$num_rows, self$num_columns)
paste(c(dims, sch), collapse = "\n")
Expand Down
2 changes: 1 addition & 1 deletion r/R/dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ Dataset <- R6Class("Dataset",
# Start a new scan of the data
# @return A [ScannerBuilder]
NewScan = function() dataset___Dataset__NewScan(self),
ToString = function() self$schema$ToString(),
ToString = function() format_schema(self),
WithSchema = function(schema) {
assert_is(schema, "Schema")
dataset___Dataset__ReplaceSchema(self, schema)
Expand Down
2 changes: 1 addition & 1 deletion r/R/record-batch-reader.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ RecordBatchReader <- R6Class("RecordBatchReader",
read_table = function() Table__from_RecordBatchReader(self),
Close = function() RecordBatchReader__Close(self),
export_to_c = function(stream_ptr) ExportRecordBatchReader(self, stream_ptr),
ToString = function() self$schema$ToString(),
ToString = function() format_schema(self),
.unsafe_delete = function() {
RecordBatchReader__UnsafeDelete(self)
super$.unsafe_delete()
Expand Down
31 changes: 26 additions & 5 deletions r/R/schema.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@
Schema <- R6Class("Schema",
inherit = ArrowObject,
public = list(
ToString = function() {
fields <- print_schema_fields(self)
ToString = function(truncate = FALSE) {
fields <- print_schema_fields(self, truncate)
if (self$HasMetadata) {
fields <- paste0(fields, "\n\nSee $metadata for additional Schema metadata")
}
Expand Down Expand Up @@ -224,9 +224,19 @@ prepare_key_value_metadata <- function(metadata) {
map_chr(metadata, as.character)
}

print_schema_fields <- function(s) {
# Alternative to Schema__ToString that doesn't print metadata
paste(map_chr(s$fields, ~ .$ToString()), collapse = "\n")
# Alternative to Schema__ToString that doesn't print metadata
print_schema_fields <- function(s, truncate = FALSE, max_fields = 20L) {
  # Builds a newline-separated listing of the fields of `s`, one per line,
  # using each field's own ToString(). When `truncate` is TRUE and there are
  # more than `max_fields` fields, only the first `max_fields` are listed and
  # a footer reports how many were left out.

  # The cap is validated whether or not truncation is requested
  assert_that(max_fields > 0)
  all_fields <- s$fields
  total <- length(all_fields)

  # Nothing to cut: list every field
  if (!(truncate && total > max_fields)) {
    return(paste(map_chr(all_fields, ~ .$ToString()), collapse = "\n"))
  }

  shown <- map_chr(all_fields[seq_len(max_fields)], ~ .$ToString())
  paste0(
    paste(shown, collapse = "\n"),
    "\n...\n",
    total - max_fields, " more columns\n",
    "Use `schema()` to see entire schema"
  )
}

#' Create a schema or extract one from an object.
Expand Down Expand Up @@ -460,3 +470,14 @@ as.data.frame.Schema <- function(x, row.names = NULL, optional = FALSE, ...) {

#' @export
`names<-.Schema` <- function(x, value) x$WithNames(value)

#' Format the schema of a Dataset or RecordBatchReader for printing
#'
#' The result starts with a line giving the total number of columns, followed
#' by the schema rendered with truncation enabled.
#' @param obj a Dataset or RecordBatchReader
#' @return A string containing a formatted representation of the schema of `obj`
#' @keywords internal
format_schema <- function(obj) {
  assert_is(obj, c("Dataset", "RecordBatchReader"))
  sch <- obj$schema
  # Column-count header comes first so the total is visible even when the
  # field listing below is truncated
  header <- paste0(length(sch$fields), " columns", "\n")
  paste0(header, sch$ToString(truncate = TRUE))
}
1 change: 1 addition & 0 deletions r/tests/testthat/_snaps/dplyr-glimpse.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
Cannot glimpse() data from a RecordBatchReader because it can only be read one time; call `as_arrow_table()` to consume it first.
Output
RecordBatchReader
7 columns
int: int32
dbl: double
dbl2: double
Expand Down
1 change: 1 addition & 0 deletions r/tests/testthat/test-dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -910,6 +910,7 @@ test_that("Dataset and query print methods", {
print(ds),
paste(
"FileSystemDataset with 2 Parquet files",
"8 columns",
"int: int32",
"dbl: double",
"lgl: bool",
Expand Down
1 change: 1 addition & 0 deletions r/tests/testthat/test-record-batch-reader.R
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ test_that("RBR methods", {
expect_output(
print(reader),
"RecordBatchStreamReader
2 columns
x: int32
y: string"
)
Expand Down
15 changes: 15 additions & 0 deletions r/tests/testthat/test-schema.R
Original file line number Diff line number Diff line change
Expand Up @@ -315,5 +315,20 @@ test_that("schema extraction", {

adq <- as_adq(example_data)
expect_equal(schema(adq), adq$.data$schema)
})

test_that("schema print truncation", {
  sch <- schema(arrow_table(example_data))

  # With a cap of one field, only the first field is listed, followed by the
  # truncation footer
  truncated <- print_schema_fields(sch, truncate = TRUE, max_fields = 1)
  expected <- "int: int32\n...\n6 more columns\nUse `schema()` to see entire schema"
  expect_output(cat(truncated), expected, fixed = TRUE)

  # A cap of zero fails the max_fields > 0 assertion
  expect_error(
    print_schema_fields(sch, truncate = TRUE, max_fields = 0),
    regexp = "max_fields not greater than 0"
  )
})

0 comments on commit ac1708c

Please sign in to comment.