Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support multiple na values #56

Merged
merged 3 commits into from
Jan 19, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ parse_ref <- function(ref) {
.Call('readxl_parse_ref', PACKAGE = 'readxl', ref)
}

xlsx_col_types <- function(path, sheet = 0L, na = "", nskip = 0L, n = 100L) {
xlsx_col_types <- function(path, sheet = 0L, na = character(), nskip = 0L, n = 100L) {
.Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet, na, nskip, n)
}

Expand Down
12 changes: 8 additions & 4 deletions src/CellType.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <Rcpp.h>
#include <libxls/xls.h>
#include "StringSet.h"

enum CellType {
CELL_BLANK,
Expand Down Expand Up @@ -48,21 +49,21 @@ inline std::string cellTypeDesc(CellType type) {

inline CellType cellType(xls::st_cell::st_cell_data cell, xls::st_xf* styles,
const std::set<int>& customDateFormats,
std::string na = "") {
const StringSet &na = "") {
// Find codes in [MS-XLS] S2.3.2 (p175).
// See xls_addCell for those used for cells
switch(cell.id) {
case 253: // LabelSst
case 516: // Label
return (na.compare((char*) cell.str) == 0) ? CELL_BLANK : CELL_TEXT;
return na.contains((char*) cell.str) ? CELL_BLANK : CELL_TEXT;
break;

case 6: // formula
case 1030: // formula (Apple Numbers Bug)
if (cell.l == 0) {
return CELL_NUMERIC;
return na.contains(cell.d) ? CELL_BLANK : CELL_NUMERIC;
} else {
if (na.compare((char*) cell.str) == 0) {
if (na.contains((char*) cell.str)) {
return CELL_BLANK;
} else {
return CELL_TEXT;
Expand All @@ -74,6 +75,9 @@ inline CellType cellType(xls::st_cell::st_cell_data cell, xls::st_xf* styles,
case 515: // Number
case 638: // Rk
{
if (na.contains(cell.d))
return CELL_BLANK;

if (styles == NULL)
return CELL_NUMERIC;

Expand Down
16 changes: 8 additions & 8 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,13 @@ BEGIN_RCPP
END_RCPP
}
// xls_col_types
CharacterVector xls_col_types(std::string path, std::string na, int sheet, int nskip, int n, bool has_col_names);
CharacterVector xls_col_types(std::string path, std::vector<std::string> na, int sheet, int nskip, int n, bool has_col_names);
RcppExport SEXP readxl_xls_col_types(SEXP pathSEXP, SEXP naSEXP, SEXP sheetSEXP, SEXP nskipSEXP, SEXP nSEXP, SEXP has_col_namesSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP);
Rcpp::traits::input_parameter< std::string >::type na(naSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP);
Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP);
Rcpp::traits::input_parameter< int >::type n(nSEXP);
Expand All @@ -80,7 +80,7 @@ BEGIN_RCPP
END_RCPP
}
// xls_cols
List xls_cols(std::string path, int i, CharacterVector col_names, CharacterVector col_types, std::string na, int nskip);
List xls_cols(std::string path, int i, CharacterVector col_names, CharacterVector col_types, std::vector<std::string> na, int nskip);
RcppExport SEXP readxl_xls_cols(SEXP pathSEXP, SEXP iSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP nskipSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Expand All @@ -89,7 +89,7 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< int >::type i(iSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type col_names(col_namesSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type col_types(col_typesSEXP);
Rcpp::traits::input_parameter< std::string >::type na(naSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP);
__result = Rcpp::wrap(xls_cols(path, i, col_names, col_types, na, nskip));
return __result;
Expand Down Expand Up @@ -163,14 +163,14 @@ BEGIN_RCPP
END_RCPP
}
// xlsx_col_types
CharacterVector xlsx_col_types(std::string path, int sheet, std::string na, int nskip, int n);
CharacterVector xlsx_col_types(std::string path, int sheet, CharacterVector na, int nskip, int n);
RcppExport SEXP readxl_xlsx_col_types(SEXP pathSEXP, SEXP sheetSEXP, SEXP naSEXP, SEXP nskipSEXP, SEXP nSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP);
Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP);
Rcpp::traits::input_parameter< std::string >::type na(naSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type na(naSEXP);
Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP);
Rcpp::traits::input_parameter< int >::type n(nSEXP);
__result = Rcpp::wrap(xlsx_col_types(path, sheet, na, nskip, n));
Expand All @@ -191,7 +191,7 @@ BEGIN_RCPP
END_RCPP
}
// read_xlsx_
List read_xlsx_(std::string path, int sheet, RObject col_names, RObject col_types, std::string na, int nskip);
List read_xlsx_(std::string path, int sheet, RObject col_names, RObject col_types, std::vector<std::string> na, int nskip);
RcppExport SEXP readxl_read_xlsx_(SEXP pathSEXP, SEXP sheetSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP nskipSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Expand All @@ -200,7 +200,7 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP);
Rcpp::traits::input_parameter< RObject >::type col_names(col_namesSEXP);
Rcpp::traits::input_parameter< RObject >::type col_types(col_typesSEXP);
Rcpp::traits::input_parameter< std::string >::type na(naSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP);
__result = Rcpp::wrap(read_xlsx_(path, sheet, col_names, col_types, na, nskip));
return __result;
Expand Down
31 changes: 31 additions & 0 deletions src/StringSet.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef READXL_STRINGSET_
#define READXL_STRINGSET_

#include <Rcpp.h>

class StringSet
{
std::set<std::string> set_;
public:
StringSet(const char *s = "") {
if (strlen(s) > 0)
set_.insert(s);
}
StringSet(const std::vector<std::string> &s) {
for (std::vector<std::string>::const_iterator i = s.begin(); i != s.end(); ++i)
set_.insert(*i);
}
StringSet(const Rcpp::CharacterVector &s) {
for (Rcpp::CharacterVector::const_iterator i = s.begin(); i != s.end(); ++i)
set_.insert(Rcpp::as<std::string>(*i));
}
bool contains(const std::string &s) const {
return set_.find(s) != set_.end();
}
bool contains(const double d) const {
std::ostringstream str; str << d;
return contains(str.str());
}
};

#endif
4 changes: 2 additions & 2 deletions src/XlsWorkSheet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ CharacterVector xls_col_names(std::string path, int i = 0, int nskip = 0) {
}

// [[Rcpp::export]]
CharacterVector xls_col_types(std::string path, std::string na, int sheet = 0,
CharacterVector xls_col_types(std::string path, std::vector<std::string> na, int sheet = 0,
int nskip = 0, int n = 100, bool has_col_names = false) {
XlsWorkBook wb = XlsWorkBook(path);
std::vector<CellType> types = wb.sheet(sheet).colTypes(na, nskip + has_col_names, n);
Expand All @@ -37,7 +37,7 @@ CharacterVector xls_col_types(std::string path, std::string na, int sheet = 0,

// [[Rcpp::export]]
List xls_cols(std::string path, int i, CharacterVector col_names,
CharacterVector col_types, std::string na, int nskip = 0) {
CharacterVector col_types, std::vector<std::string> na, int nskip = 0) {
XlsWorkBook wb = XlsWorkBook(path);
XlsWorkSheet sheet = wb.sheet(i);

Expand Down
6 changes: 3 additions & 3 deletions src/XlsWorkSheet.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class XlsWorkSheet {
return out;
}

std::vector<CellType> colTypes(std::string na, int nskip = 0, int n_max = 100) {
std::vector<CellType> colTypes(const StringSet &na, int nskip = 0, int n_max = 100) {
std::vector<CellType> types(ncol_);

for (int i = nskip; i < nrow_ && i < n_max; ++i) {
Expand All @@ -88,7 +88,7 @@ class XlsWorkSheet {
}

Rcpp::List readCols(Rcpp::CharacterVector names, std::vector<CellType> types,
std::string na, int nskip = 0) {
const StringSet &na, int nskip = 0) {
if ((int) names.size() != ncol_ || (int) types.size() != ncol_)
Rcpp::stop("Need one name and type for each column");

Expand Down Expand Up @@ -154,7 +154,7 @@ class XlsWorkSheet {
SET_STRING_ELT(col, i, NA_STRING);
} else {
std::string stdString((char*) cell.str);
Rcpp::RObject rString = stdString == na ? NA_STRING : Rf_mkCharCE(stdString.c_str(), CE_UTF8);
Rcpp::RObject rString = na.contains(stdString) ? NA_STRING : Rf_mkCharCE(stdString.c_str(), CE_UTF8);
SET_STRING_ELT(col, i, rString);
}
break;
Expand Down
24 changes: 12 additions & 12 deletions src/XlsxCell.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,31 +107,31 @@ class XlsxCell {
return stringTable.at(id);
}

double asDouble(const std::string& na) {
double asDouble(const StringSet& na) {
rapidxml::xml_node<>* v = cell_->first_node("v");
if (v == NULL || na.compare(v->value()) == 0)
if (v == NULL || na.contains(v->value()))
return NA_REAL;

return (v == NULL) ? 0 : atof(v->value());
}

double asDate(const std::string& na, int offset) {
double asDate(const StringSet& na, int offset) {
rapidxml::xml_node<>* v = cell_->first_node("v");
if (v == NULL || na.compare(v->value()) == 0)
if (v == NULL || na.contains(v->value()))
return NA_REAL;

double value = atof(v->value());
return (v == NULL) ? 0 : (value - offset) * 86400;
}

Rcpp::RObject asCharSxp(const std::string& na,
Rcpp::RObject asCharSxp(const StringSet& na,
const std::vector<std::string>& stringTable) {

// Is it an inline string? // 18.3.1.53 is (Rich Text Inline) [p1649]
rapidxml::xml_node<>* is = cell_->first_node("is");
if (is != NULL) {
std::string value;
if (!parseString(is, &value) || na.compare(value) == 0) {
if (!parseString(is, &value) || na.contains(value)) {
return NA_STRING;
} else {
return Rf_mkCharCE(value.c_str(), CE_UTF8);
Expand All @@ -148,15 +148,15 @@ class XlsxCell {
if (t != NULL && strncmp(t->value(), "s", t->value_size()) == 0) {
return stringFromTable(v->value(), na, stringTable);
} else {
if (na.compare(v->value()) == 0) {
if (na.contains(v->value())) {
return NA_STRING;
} else {
return Rf_mkCharCE(v->value(), CE_UTF8);
}
}
}

CellType type(const std::string& na,
CellType type(const StringSet& na,
const std::vector<std::string>& stringTable,
const std::set<int>& dateStyles) {
rapidxml::xml_attribute<>* t = cell_->first_attribute("t");
Expand All @@ -182,13 +182,13 @@ class XlsxCell {

int id = atoi(v->value());
const std::string& string = stringTable.at(id);
return (string == na) ? CELL_BLANK : CELL_TEXT;
return na.contains(string) ? CELL_BLANK : CELL_TEXT;
} else if (strncmp(t->value(), "str", 5) == 0) { // formula
rapidxml::xml_node<>* v = cell_->first_node("v");
if (v == NULL)
return CELL_BLANK;

return (na.compare(v->value()) == 0) ? CELL_BLANK : CELL_TEXT;
return na.contains(v->value()) ? CELL_BLANK : CELL_TEXT;
} else if (strncmp(t->value(), "inlineStr", 9) == 0) { // formula
return CELL_TEXT;
} else {
Expand All @@ -203,7 +203,7 @@ class XlsxCell {
private:


Rcpp::RObject stringFromTable(const char* val, const std::string& na,
Rcpp::RObject stringFromTable(const char* val, const StringSet& na,
const std::vector<std::string>& stringTable) {
int id = atoi(val);
if (id < 0 || id >= (int) stringTable.size()) {
Expand All @@ -212,7 +212,7 @@ class XlsxCell {
}

const std::string& string = stringTable.at(id);
return (string == na) ? NA_STRING : Rf_mkCharCE(string.c_str(), CE_UTF8);
return na.contains(string) ? NA_STRING : Rf_mkCharCE(string.c_str(), CE_UTF8);
}

};
Expand Down
4 changes: 2 additions & 2 deletions src/XlsxWorkSheet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ IntegerVector parse_ref(std::string ref) {

// [[Rcpp::export]]
CharacterVector xlsx_col_types(std::string path, int sheet = 0,
std::string na = "", int nskip = 0,
CharacterVector na = CharacterVector(), int nskip = 0,
int n = 100) {

XlsxWorkSheet ws(path, sheet);
Expand All @@ -43,7 +43,7 @@ CharacterVector xlsx_col_names(std::string path, int sheet = 0, int nskip = 0) {

// [[Rcpp::export]]
List read_xlsx_(std::string path, int sheet, RObject col_names,
RObject col_types, std::string na, int nskip = 0) {
RObject col_types, std::vector<std::string> na, int nskip = 0) {

XlsxWorkSheet ws(path, sheet);

Expand Down
4 changes: 2 additions & 2 deletions src/XlsxWorkSheet.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class XlsxWorkSheet {
}


std::vector<CellType> colTypes(const std::string& na, int nskip = 0, int n_max = 100, bool has_col_names = false) {
std::vector<CellType> colTypes(const StringSet& na, int nskip = 0, int n_max = 100, bool has_col_names = false) {
rapidxml::xml_node<>* row = getRow(nskip + has_col_names);
std::vector<CellType> types;
types.resize(ncol_);
Expand Down Expand Up @@ -117,7 +117,7 @@ class XlsxWorkSheet {

Rcpp::List readCols(Rcpp::CharacterVector names,
const std::vector<CellType>& types,
const std::string& na, int nskip = 0) {
const StringSet& na, int nskip = 0) {
if ((int) names.size() != ncol_ || (int) types.size() != ncol_)
Rcpp::stop("Need one name and type for each column");

Expand Down
12 changes: 12 additions & 0 deletions tests/testthat/test-missing-values.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,24 @@ test_that("na arg maps strings to to NA [xls]", {
expect_equal(df$y, c(NA, 1, 1)) # formula column
})

test_that("na arg allows multiple strings [xls]", {
df <- read_excel("missing-values.xls", na = c("NA", "1"))
expect_true(all(is.na(df$x)))
expect_true(all(is.na(df$y))) # formula column
})

test_that("na arg maps strings to to NA [xlsx]", {
df <- read_excel("missing-values.xlsx", na = "NA")
expect_equal(df$x, c(NA, 1, 1))
expect_equal(df$y, c(NA, 1, 1)) # formula column
})

test_that("na arg allows multiple strings [xlsx]", {
df <- read_excel("missing-values.xlsx", na = c("NA", "1"))
expect_true(all(is.na(df$x)))
expect_true(all(is.na(df$y))) # formula column
})

test_that("text values in numeric column gives warning & NA", {
expect_warning(
df <- read_excel("missing-values.xls", col_types = rep("numeric", 2)),
Expand Down