Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions r/R/arrow-tabular.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,10 @@ ArrowTabular <- R6Class(
# Helper for the R metadata that handles the serialization
# See also method on Schema
if (missing(new)) {
self$metadata$r
self$metadata[["r"]]
} else {
# Set the R metadata
self$metadata$r <- new
self$metadata[["r"]] <- new

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I love partial matching so much.

self
}
}
Expand All @@ -95,7 +95,7 @@ ArrowTabular <- R6Class(
#' @export
as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) {
df <- x$to_data_frame()
out <- apply_arrow_r_metadata(df, x$metadata$r)
out <- apply_arrow_r_metadata(df, x$metadata[["r"]])
as.data.frame(out, row.names = row.names, optional = optional, ...)
}

Expand Down
16 changes: 8 additions & 8 deletions r/R/arrowExports.R

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/R/dataset-write.R
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ write_dataset <- function(
if (inherits(dataset, "grouped_df")) {
force(partitioning)
# Drop the grouping metadata before writing; we've already consumed it
# now to construct `partitioning` and don't want it in the metadata$r
# now to construct `partitioning` and don't want it in the metadata[["r"]]
dataset <- dplyr::ungroup(dataset)
}
dataset <- as_adq(dataset)
Expand Down
2 changes: 1 addition & 1 deletion r/R/dplyr-collect.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) {
collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) {
if (as_data_frame) {
df <- x$to_data_frame()
apply_arrow_r_metadata(df, x$metadata$r)
apply_arrow_r_metadata(df, x$metadata[["r"]])
} else {
x
}
Expand Down
8 changes: 4 additions & 4 deletions r/R/dplyr-group-by.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ group_vars.arrow_dplyr_query <- function(x) x$group_by_vars
group_vars.Dataset <- function(x) character()
group_vars.RecordBatchReader <- function(x) character()
group_vars.ArrowTabular <- function(x) {
x$metadata$r$attributes$.group_vars %||% character()
x$metadata[["r"]]$attributes$.group_vars %||% character()
}

# the logical literal in the two functions below controls the default value of
Expand All @@ -62,7 +62,7 @@ group_by_drop_default.arrow_dplyr_query <- function(.tbl) {
.tbl$drop_empty_groups %||% TRUE
}
group_by_drop_default.ArrowTabular <- function(.tbl) {
.tbl$metadata$r$attributes$.group_by_drop %||% TRUE
.tbl$metadata[["r"]]$attributes$.group_by_drop %||% TRUE
}
group_by_drop_default.Dataset <- group_by_drop_default.RecordBatchReader <-
function(.tbl) TRUE
Expand All @@ -84,11 +84,11 @@ set_group_attributes <- function(tab, group_vars, .drop) {
# so passing NULL means unset (ungroup)
if (is.null(group_vars) || length(group_vars)) {
# Since accessing schema metadata does some work, only overwrite if needed
new_atts <- old_atts <- tab$metadata$r$attributes %||% list()
new_atts <- old_atts <- tab$metadata[["r"]]$attributes %||% list()
new_atts[[".group_vars"]] <- group_vars
new_atts[[".group_by_drop"]] <- .drop
if (!identical(new_atts, old_atts)) {
tab$metadata$r$attributes <- new_atts
tab$metadata[["r"]]$attributes <- new_atts
}
}
tab
Expand Down
2 changes: 1 addition & 1 deletion r/R/feather.R
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T

if (isTRUE(as_data_frame)) {
df <- out$to_data_frame()
out <- apply_arrow_r_metadata(df, out$metadata$r)
out <- apply_arrow_r_metadata(df, out$metadata[["r"]])
}
out
}
Expand Down
6 changes: 3 additions & 3 deletions r/R/metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
if (getOption("arrow.debug", FALSE)) {
print(conditionMessage(e))
}
warning("Invalid metadata$r", call. = FALSE)
warning('Invalid metadata$[["r"]]', call. = FALSE)
NULL
})
}
Expand Down Expand Up @@ -228,7 +228,7 @@ apply_arrow_r_metadata <- function(x, r_metadata) {
}
},
error = function(e) {
warning("Invalid metadata$r", call. = FALSE)
warning('Invalid metadata$[["r"]]', call. = FALSE)
}
)
x
Expand Down Expand Up @@ -323,7 +323,7 @@ arrow_attributes <- function(x, only_top_level = FALSE) {
get_r_metadata_from_old_schema <- function(new_schema, old_schema) {
# TODO: do we care about other (non-R) metadata preservation?
# How would we know if it were meaningful?
r_meta <- old_schema$metadata$r
r_meta <- old_schema$metadata[["r"]]
if (!is.null(r_meta)) {
# Filter r_metadata$columns on columns with name _and_ type match
common_names <- intersect(names(r_meta$columns), names(new_schema))
Expand Down
10 changes: 5 additions & 5 deletions r/R/schema.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
#' schema (column names and types) which is compatible with other Arrow
#' clients. The R metadata is only read by R and is ignored by other clients
#' (e.g. Pandas has its own custom metadata). This metadata is stored in
#' `$metadata$r`.
#' `$metadata[["r"]]`.
#'
#' Since Schema metadata keys and values must be strings, this metadata is
#' saved by serializing R's attribute list structure to a string. If the
Expand Down Expand Up @@ -138,8 +138,8 @@ Schema <- R6Class(
renamed_schema <- Schema__WithNames(self, names)

# if we have R metadata containing column names, update names there too
if (!is.null(existing_metadata$r$columns)) {
names(existing_metadata$r$columns) <- names
if (!is.null(existing_metadata[["r"]]$columns)) {
names(existing_metadata[["r"]]$columns) <- names
}
renamed_schema$WithMetadata(existing_metadata)
}
Expand Down Expand Up @@ -176,10 +176,10 @@ Schema <- R6Class(
# Helper for the R metadata that handles the serialization
# See also method on ArrowTabular
if (missing(new)) {
self$metadata$r
self$metadata[["r"]]
} else {
# Set the R metadata
self$metadata$r <- new
self$metadata[["r"]] <- new
self
}
}
Expand Down
2 changes: 1 addition & 1 deletion r/extra-tests/test-read-files.R
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ test_that("Can see the extra metadata (parquet)", {
if (if_version_less_than("3.0.0")) {
expect_warning(
df <- read_parquet(pq_file),
"Invalid metadata$r",
"Invalid metadata",
fixed = TRUE
)
Comment thread
thisisnic marked this conversation as resolved.
expect_s3_class(df, "tbl")
Expand Down
2 changes: 1 addition & 1 deletion r/man/Schema-class.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 26 additions & 18 deletions r/tests/testthat/test-metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,16 +62,16 @@ test_that("Table R metadata", {

test_that("R metadata is not stored for types that map to Arrow types (factor, Date, etc.)", {
tab <- Table$create(example_data[1:6])
expect_null(tab$metadata$r)
expect_null(tab$metadata[["r"]])

expect_null(Table$create(example_with_times[1:3])$metadata$r)
expect_null(Table$create(example_with_times[1:3])$metadata[["r"]])
})

test_that("R metadata is not stored for ExtensionType columns", {
tab <- Table$create(
x = vctrs::new_vctr(1:5, class = "special_integer")
)
expect_null(tab$metadata$r)
expect_null(tab$metadata[["r"]])
})

test_that("classes are not stored for arrow_binary/arrow_large_binary/arrow_fixed_size_binary (ARROW-14140)", {
Expand All @@ -81,21 +81,21 @@ test_that("classes are not stored for arrow_binary/arrow_large_binary/arrow_fixe
large_binary <- Array$create(list(raws), large_binary())
fixed_size_binary <- Array$create(list(raws), fixed_size_binary(7L))

expect_null(RecordBatch$create(b = binary)$metadata$r)
expect_null(RecordBatch$create(b = large_binary)$metadata$r)
expect_null(RecordBatch$create(b = fixed_size_binary)$metadata$r)
expect_null(RecordBatch$create(b = binary)$metadata[["r"]])
expect_null(RecordBatch$create(b = large_binary)$metadata[["r"]])
expect_null(RecordBatch$create(b = fixed_size_binary)$metadata[["r"]])

expect_null(Table$create(b = binary)$metadata$r)
expect_null(Table$create(b = large_binary)$metadata$r)
expect_null(Table$create(b = fixed_size_binary)$metadata$r)
expect_null(Table$create(b = binary)$metadata[["r"]])
expect_null(Table$create(b = large_binary)$metadata[["r"]])
expect_null(Table$create(b = fixed_size_binary)$metadata[["r"]])
})

test_that("Garbage R metadata doesn't break things", {
tab <- Table$create(example_data[1:6])
tab$metadata$r <- "garbage"
expect_warning(
as.data.frame(tab),
"Invalid metadata$r",
'Invalid metadata$[["r"]]',
fixed = TRUE
)
# serialize data like .serialize_arrow_r_metadata does, but don't call that
Expand All @@ -104,7 +104,7 @@ test_that("Garbage R metadata doesn't break things", {
tab$metadata$r <- rawToChar(serialize("garbage", NULL, ascii = TRUE))
expect_warning(
as.data.frame(tab),
"Invalid metadata$r",
'Invalid metadata$[["r"]]',
fixed = TRUE
)

Expand All @@ -113,7 +113,7 @@ test_that("Garbage R metadata doesn't break things", {
tab$metadata <- list(r = rawToChar(serialize(bad, NULL, ascii = TRUE)))
expect_warning(
as.data.frame(tab),
"Invalid metadata$r",
'Invalid metadata$[["r"]]',
fixed = TRUE
)

Expand Down Expand Up @@ -144,7 +144,7 @@ arbitrary\040code\040was\040just\040executed
expect_message(
expect_warning(
as.data.frame(tab),
"Invalid metadata$r",
'Invalid metadata$[["r"]]',
fixed = TRUE
),
NA
Expand Down Expand Up @@ -465,7 +465,7 @@ test_that("grouped_df metadata is recorded (efficiently)", {
expect_s3_class(grouped, "grouped_df")
grouped_tab <- Table$create(grouped)
expect_r6_class(grouped_tab, "Table")
expect_equal(grouped_tab$metadata$r$attributes$.group_vars, "a")
expect_equal(grouped_tab$metadata[["r"]]$attributes$.group_vars, "a")
})

test_that("grouped_df non-arrow metadata is preserved", {
Expand Down Expand Up @@ -496,24 +496,32 @@ test_that("apply_arrow_r_metadata doesn't add in metadata from plain data.frame
plain_df <- data.frame(x = 1:5)
plain_df_arrow <- arrow_table(plain_df)

expect_equal(plain_df_arrow$metadata$r$columns, list(x = NULL))
expect_equal(plain_df_arrow$metadata[["r"]]$columns, list(x = NULL))

plain_df_no_metadata <- plain_df_arrow$to_data_frame()
plain_df_with_metadata <- apply_arrow_r_metadata(plain_df_no_metadata, plain_df_arrow$metadata$r)
plain_df_with_metadata <- apply_arrow_r_metadata(plain_df_no_metadata, plain_df_arrow$metadata[["r"]])

expect_identical(plain_df_no_metadata, plain_df_with_metadata)

# with more complex column metadata - it preserves it
spicy_df_arrow <- arrow_table(haven_data)

expect_equal(
spicy_df_arrow$metadata$r$columns,
spicy_df_arrow$metadata[["r"]]$columns,
list(num = list(attributes = list(format.spss = "F8.2"), columns = NULL), cat_int = NULL, cat_chr = NULL)
)

spicy_df_no_metadata <- spicy_df_arrow$to_data_frame()
spicy_df_with_metadata <- apply_arrow_r_metadata(spicy_df_no_metadata, spicy_df_arrow$metadata$r)
spicy_df_with_metadata <- apply_arrow_r_metadata(spicy_df_no_metadata, spicy_df_arrow$metadata[["r"]])

expect_null(attr(spicy_df_no_metadata$num, "format.spss"))
expect_equal(attr(spicy_df_with_metadata$num, "format.spss"), "F8.2")
})

test_that("metadata keys starting with 'r' don't cause partial matching - GH-50163", {
tbl <- arrow_table(x = 1:3)
tbl <- tbl$cast(tbl$schema$WithMetadata(list(rachel = "some_value")))

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for this!


expect_no_warning(as.data.frame(tbl))
expect_no_warning(collect.ArrowTabular(tbl))
})
Comment thread
thisisnic marked this conversation as resolved.
2 changes: 1 addition & 1 deletion r/vignettes/metadata.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ tb$metadata$new_key <- "new value"

Metadata attached to a Schema is preserved when writing the Table to Arrow/Feather or Parquet formats. When reading those files into R, or when calling `as.data.frame()` on a Table or RecordBatch, the column attributes are restored to the columns of the resulting `data.frame`. This means that custom data types, including `haven::labelled`, `vctrs` annotations, and others, are preserved when doing a round-trip through Arrow.

Note that the attributes stored in `$metadata$r` are only understood by R. If you write a `data.frame` with `haven` columns to a Feather file and read that in Pandas, the `haven` metadata won't be recognized there. Similarly, Pandas writes its own custom metadata, which the R package does not consume. You are free, however, to define custom metadata conventions for your application and assign any (string) values you want to other metadata keys.
Note that the attributes stored in `$metadata[["r"]]` are only understood by R. If you write a `data.frame` with `haven` columns to a Feather file and read that in Pandas, the `haven` metadata won't be recognized there. Similarly, Pandas writes its own custom metadata, which the R package does not consume. You are free, however, to define custom metadata conventions for your application and assign any (string) values you want to other metadata keys.

## Further reading

Expand Down
Loading