diff --git a/R/RcppExports.R b/R/RcppExports.R index b5c7b6d0..cfd5e074 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -53,7 +53,7 @@ parse_ref <- function(ref) { .Call('readxl_parse_ref', PACKAGE = 'readxl', ref) } -xlsx_col_types <- function(path, sheet = 0L, na = "", nskip = 0L, n = 100L) { +xlsx_col_types <- function(path, sheet = 0L, na = character(), nskip = 0L, n = 100L) { .Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet, na, nskip, n) } diff --git a/src/CellType.h b/src/CellType.h index 31f811d4..ef93b415 100644 --- a/src/CellType.h +++ b/src/CellType.h @@ -3,6 +3,7 @@ #include #include +#include "StringSet.h" enum CellType { CELL_BLANK, @@ -48,21 +49,21 @@ inline std::string cellTypeDesc(CellType type) { inline CellType cellType(xls::st_cell::st_cell_data cell, xls::st_xf* styles, const std::set& customDateFormats, - std::string na = "") { + const StringSet &na = "") { // Find codes in [MS-XLS] S2.3.2 (p175). // See xls_addCell for those used for cells switch(cell.id) { case 253: // LabelSst case 516: // Label - return (na.compare((char*) cell.str) == 0) ? CELL_BLANK : CELL_TEXT; + return na.contains((char*) cell.str) ? CELL_BLANK : CELL_TEXT; break; case 6: // formula case 1030: // formula (Apple Numbers Bug) if (cell.l == 0) { - return CELL_NUMERIC; + return na.contains(cell.d) ? CELL_BLANK : CELL_NUMERIC; } else { - if (na.compare((char*) cell.str) == 0) { + if (na.contains((char*) cell.str)) { return CELL_BLANK; } else { return CELL_TEXT; @@ -74,6 +75,9 @@ inline CellType cellType(xls::st_cell::st_cell_data cell, xls::st_xf* styles, case 515: // Number case 638: // Rk { + if (na.contains(cell.d)) + return CELL_BLANK; + if (styles == NULL) return CELL_NUMERIC; diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 99d3b726..029c0166 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -64,13 +64,13 @@ BEGIN_RCPP END_RCPP } // xls_col_types -CharacterVector xls_col_types(std::string path, std::string na, int sheet, int nskip, int n, bool has_col_names); +CharacterVector xls_col_types(std::string path, std::vector na, int sheet, int nskip, int n, bool has_col_names); RcppExport SEXP readxl_xls_col_types(SEXP pathSEXP, SEXP naSEXP, SEXP sheetSEXP, SEXP nskipSEXP, SEXP nSEXP, SEXP has_col_namesSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< std::string >::type path(pathSEXP); - Rcpp::traits::input_parameter< std::string >::type na(naSEXP); + Rcpp::traits::input_parameter< std::vector >::type na(naSEXP); Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP); Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP); Rcpp::traits::input_parameter< int >::type n(nSEXP); @@ -80,7 +80,7 @@ BEGIN_RCPP END_RCPP } // xls_cols -List xls_cols(std::string path, int i, CharacterVector col_names, CharacterVector col_types, std::string na, int nskip); +List xls_cols(std::string path, int i, CharacterVector col_names, CharacterVector col_types, std::vector na, int nskip); RcppExport SEXP readxl_xls_cols(SEXP pathSEXP, SEXP iSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP nskipSEXP) { BEGIN_RCPP Rcpp::RObject __result; @@ -89,7 +89,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< int >::type i(iSEXP); Rcpp::traits::input_parameter< CharacterVector >::type col_names(col_namesSEXP); Rcpp::traits::input_parameter< CharacterVector >::type col_types(col_typesSEXP); - Rcpp::traits::input_parameter< std::string >::type na(naSEXP); + Rcpp::traits::input_parameter< std::vector >::type na(naSEXP); Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP); __result = Rcpp::wrap(xls_cols(path, i, col_names, col_types, na, nskip)); return __result; @@ -163,14 +163,14 @@ BEGIN_RCPP END_RCPP } // xlsx_col_types -CharacterVector xlsx_col_types(std::string path, int sheet, std::string na, int nskip, int n); +CharacterVector xlsx_col_types(std::string path, int sheet, CharacterVector na, int nskip, int n); RcppExport SEXP readxl_xlsx_col_types(SEXP pathSEXP, SEXP sheetSEXP, SEXP naSEXP, SEXP nskipSEXP, SEXP nSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< std::string >::type path(pathSEXP); Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP); - Rcpp::traits::input_parameter< std::string >::type na(naSEXP); + Rcpp::traits::input_parameter< CharacterVector >::type na(naSEXP); Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP); Rcpp::traits::input_parameter< int >::type n(nSEXP); __result = Rcpp::wrap(xlsx_col_types(path, sheet, na, nskip, n)); @@ -191,7 +191,7 @@ BEGIN_RCPP END_RCPP } // read_xlsx_ -List read_xlsx_(std::string path, int sheet, RObject col_names, RObject col_types, std::string na, int nskip); +List read_xlsx_(std::string path, int sheet, RObject col_names, RObject col_types, std::vector na, int nskip); RcppExport SEXP readxl_read_xlsx_(SEXP pathSEXP, SEXP sheetSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP nskipSEXP) { BEGIN_RCPP Rcpp::RObject __result; @@ -200,7 +200,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP); Rcpp::traits::input_parameter< RObject >::type col_names(col_namesSEXP); Rcpp::traits::input_parameter< RObject >::type col_types(col_typesSEXP); - Rcpp::traits::input_parameter< std::string >::type na(naSEXP); + Rcpp::traits::input_parameter< std::vector >::type na(naSEXP); Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP); __result = Rcpp::wrap(read_xlsx_(path, sheet, col_names, col_types, na, nskip)); return __result; diff --git a/src/StringSet.h b/src/StringSet.h new file mode 100644 index 00000000..329ebf25 --- /dev/null +++ b/src/StringSet.h @@ -0,0 +1,31 @@ +#ifndef READXL_STRINGSET_ +#define READXL_STRINGSET_ + +#include + +class StringSet +{ + std::set set_; +public: + StringSet(const char *s = "") { + if (strlen(s) > 0) + set_.insert(s); + } + StringSet(const std::vector &s) { + for (std::vector::const_iterator i = s.begin(); i != s.end(); ++i) + set_.insert(*i); + } + StringSet(const Rcpp::CharacterVector &s) { + for (Rcpp::CharacterVector::const_iterator i = s.begin(); i != s.end(); ++i) + set_.insert(Rcpp::as(*i)); + } + bool contains(const std::string &s) const { + return set_.find(s) != set_.end(); + } + bool contains(const double d) const { + std::ostringstream str; str << d; + return contains(str.str()); + } +}; + +#endif diff --git a/src/XlsWorkSheet.cpp b/src/XlsWorkSheet.cpp index f4d57565..bb5fa337 100644 --- a/src/XlsWorkSheet.cpp +++ b/src/XlsWorkSheet.cpp @@ -13,7 +13,7 @@ CharacterVector xls_col_names(std::string path, int i = 0, int nskip = 0) { } // [[Rcpp::export]] -CharacterVector xls_col_types(std::string path, std::string na, int sheet = 0, +CharacterVector xls_col_types(std::string path, std::vector na, int sheet = 0, int nskip = 0, int n = 100, bool has_col_names = false) { XlsWorkBook wb = XlsWorkBook(path); std::vector types = wb.sheet(sheet).colTypes(na, nskip + has_col_names, n); @@ -37,7 +37,7 @@ CharacterVector xls_col_types(std::string path, std::string na, int sheet = 0, // [[Rcpp::export]] List xls_cols(std::string path, int i, CharacterVector col_names, - CharacterVector col_types, std::string na, int nskip = 0) { + CharacterVector col_types, std::vector na, int nskip = 0) { XlsWorkBook wb = XlsWorkBook(path); XlsWorkSheet sheet = wb.sheet(i); diff --git a/src/XlsWorkSheet.h b/src/XlsWorkSheet.h index 72721264..6fee5165 100644 --- a/src/XlsWorkSheet.h +++ b/src/XlsWorkSheet.h @@ -65,7 +65,7 @@ class XlsWorkSheet { return out; } - std::vector colTypes(std::string na, int nskip = 0, int n_max = 100) { + std::vector colTypes(const StringSet &na, int nskip = 0, int n_max = 100) { std::vector types(ncol_); for (int i = nskip; i < nrow_ && i < n_max; ++i) { @@ -88,7 +88,7 @@ class XlsWorkSheet { } Rcpp::List readCols(Rcpp::CharacterVector names, std::vector types, - std::string na, int nskip = 0) { + const StringSet &na, int nskip = 0) { if ((int) names.size() != ncol_ || (int) types.size() != ncol_) Rcpp::stop("Need one name and type for each column"); @@ -154,7 +154,7 @@ class XlsWorkSheet { SET_STRING_ELT(col, i, NA_STRING); } else { std::string stdString((char*) cell.str); - Rcpp::RObject rString = stdString == na ? NA_STRING : Rf_mkCharCE(stdString.c_str(), CE_UTF8); + Rcpp::RObject rString = na.contains(stdString) ? NA_STRING : Rf_mkCharCE(stdString.c_str(), CE_UTF8); SET_STRING_ELT(col, i, rString); } break; diff --git a/src/XlsxCell.h b/src/XlsxCell.h index 21314077..93719c46 100644 --- a/src/XlsxCell.h +++ b/src/XlsxCell.h @@ -107,31 +107,31 @@ class XlsxCell { return stringTable.at(id); } - double asDouble(const std::string& na) { + double asDouble(const StringSet& na) { rapidxml::xml_node<>* v = cell_->first_node("v"); - if (v == NULL || na.compare(v->value()) == 0) + if (v == NULL || na.contains(v->value())) return NA_REAL; return (v == NULL) ? 0 : atof(v->value()); } - double asDate(const std::string& na, int offset) { + double asDate(const StringSet& na, int offset) { rapidxml::xml_node<>* v = cell_->first_node("v"); - if (v == NULL || na.compare(v->value()) == 0) + if (v == NULL || na.contains(v->value())) return NA_REAL; double value = atof(v->value()); return (v == NULL) ? 0 : (value - offset) * 86400; } - Rcpp::RObject asCharSxp(const std::string& na, + Rcpp::RObject asCharSxp(const StringSet& na, const std::vector& stringTable) { // Is it an inline string? // 18.3.1.53 is (Rich Text Inline) [p1649] rapidxml::xml_node<>* is = cell_->first_node("is"); if (is != NULL) { std::string value; - if (!parseString(is, &value) || na.compare(value) == 0) { + if (!parseString(is, &value) || na.contains(value)) { return NA_STRING; } else { return Rf_mkCharCE(value.c_str(), CE_UTF8); @@ -148,7 +148,7 @@ class XlsxCell { if (t != NULL && strncmp(t->value(), "s", t->value_size()) == 0) { return stringFromTable(v->value(), na, stringTable); } else { - if (na.compare(v->value()) == 0) { + if (na.contains(v->value())) { return NA_STRING; } else { return Rf_mkCharCE(v->value(), CE_UTF8); @@ -156,7 +156,7 @@ class XlsxCell { } } - CellType type(const std::string& na, + CellType type(const StringSet& na, const std::vector& stringTable, const std::set& dateStyles) { rapidxml::xml_attribute<>* t = cell_->first_attribute("t"); @@ -182,13 +182,13 @@ class XlsxCell { int id = atoi(v->value()); const std::string& string = stringTable.at(id); - return (string == na) ? CELL_BLANK : CELL_TEXT; + return na.contains(string) ? CELL_BLANK : CELL_TEXT; } else if (strncmp(t->value(), "str", 5) == 0) { // formula rapidxml::xml_node<>* v = cell_->first_node("v"); if (v == NULL) return CELL_BLANK; - return (na.compare(v->value()) == 0) ? CELL_BLANK : CELL_TEXT; + return na.contains(v->value()) ? CELL_BLANK : CELL_TEXT; } else if (strncmp(t->value(), "inlineStr", 9) == 0) { // formula return CELL_TEXT; } else { @@ -203,7 +203,7 @@ class XlsxCell { private: - Rcpp::RObject stringFromTable(const char* val, const std::string& na, + Rcpp::RObject stringFromTable(const char* val, const StringSet& na, const std::vector& stringTable) { int id = atoi(val); if (id < 0 || id >= (int) stringTable.size()) { @@ -212,7 +212,7 @@ class XlsxCell { } const std::string& string = stringTable.at(id); - return (string == na) ? NA_STRING : Rf_mkCharCE(string.c_str(), CE_UTF8); + return na.contains(string) ? NA_STRING : Rf_mkCharCE(string.c_str(), CE_UTF8); } }; diff --git a/src/XlsxWorkSheet.cpp b/src/XlsxWorkSheet.cpp index a5128742..67796a1b 100644 --- a/src/XlsxWorkSheet.cpp +++ b/src/XlsxWorkSheet.cpp @@ -22,7 +22,7 @@ IntegerVector parse_ref(std::string ref) { // [[Rcpp::export]] CharacterVector xlsx_col_types(std::string path, int sheet = 0, - std::string na = "", int nskip = 0, + CharacterVector na = CharacterVector(), int nskip = 0, int n = 100) { XlsxWorkSheet ws(path, sheet); @@ -43,7 +43,7 @@ CharacterVector xlsx_col_names(std::string path, int sheet = 0, int nskip = 0) { // [[Rcpp::export]] List read_xlsx_(std::string path, int sheet, RObject col_names, - RObject col_types, std::string na, int nskip = 0) { + RObject col_types, std::vector na, int nskip = 0) { XlsxWorkSheet ws(path, sheet); diff --git a/src/XlsxWorkSheet.h b/src/XlsxWorkSheet.h index 8b0c59f2..a03f3f9e 100644 --- a/src/XlsxWorkSheet.h +++ b/src/XlsxWorkSheet.h @@ -63,7 +63,7 @@ class XlsxWorkSheet { } - std::vector colTypes(const std::string& na, int nskip = 0, int n_max = 100, bool has_col_names = false) { + std::vector colTypes(const StringSet& na, int nskip = 0, int n_max = 100, bool has_col_names = false) { rapidxml::xml_node<>* row = getRow(nskip + has_col_names); std::vector types; types.resize(ncol_); @@ -117,7 +117,7 @@ class XlsxWorkSheet { Rcpp::List readCols(Rcpp::CharacterVector names, const std::vector& types, - const std::string& na, int nskip = 0) { + const StringSet& na, int nskip = 0) { if ((int) names.size() != ncol_ || (int) types.size() != ncol_) Rcpp::stop("Need one name and type for each column"); diff --git a/tests/testthat/test-missing-values.R b/tests/testthat/test-missing-values.R index 9e771131..7a4c388f 100644 --- a/tests/testthat/test-missing-values.R +++ b/tests/testthat/test-missing-values.R @@ -23,12 +23,24 @@ test_that("na arg maps strings to to NA [xls]", { expect_equal(df$y, c(NA, 1, 1)) # formula column }) +test_that("na arg allows multiple strings [xls]", { + df <- read_excel("missing-values.xls", na = c("NA", "1")) + expect_true(all(is.na(df$x))) + expect_true(all(is.na(df$y))) # formula column +}) + test_that("na arg maps strings to to NA [xlsx]", { df <- read_excel("missing-values.xlsx", na = "NA") expect_equal(df$x, c(NA, 1, 1)) expect_equal(df$y, c(NA, 1, 1)) # formula column }) +test_that("na arg allows multiple strings [xlsx]", { + df <- read_excel("missing-values.xlsx", na = c("NA", "1")) + expect_true(all(is.na(df$x))) + expect_true(all(is.na(df$y))) # formula column +}) + test_that("text values in numeric column gives warning & NA", { expect_warning( df <- read_excel("missing-values.xls", col_types = rep("numeric", 2)),