Skip to content

Commit

Permalink
add support for character vector for na values. Closes #13
Browse files Browse the repository at this point in the history
  • Loading branch information
Jonathan Marshall committed Apr 10, 2015
1 parent 5ef0ca5 commit 2dd82d6
Show file tree
Hide file tree
Showing 9 changed files with 65 additions and 33 deletions.
2 changes: 1 addition & 1 deletion R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ parse_ref <- function(ref) {
.Call('readxl_parse_ref', PACKAGE = 'readxl', ref)
}

xlsx_col_types <- function(path, sheet = 0L, na = "", nskip = 0L, n = 100L) {
xlsx_col_types <- function(path, sheet = 0L, na = character(), nskip = 0L, n = 100L) {
.Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet, na, nskip, n)
}

Expand Down
7 changes: 4 additions & 3 deletions src/CellType.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <Rcpp.h>
#include <libxls/xls.h>
#include "StringSet.h"

enum CellType {
CELL_BLANK,
Expand Down Expand Up @@ -48,21 +49,21 @@ inline std::string cellTypeDesc(CellType type) {

inline CellType cellType(xls::st_cell::st_cell_data cell, xls::st_xf* styles,
const std::set<int>& customDateFormats,
std::string na = "") {
const StringSet &na = "") {
// Find codes in [MS-XLS] S2.3.2 (p175).
// See xls_addCell for those used for cells
switch(cell.id) {
case 253: // LabelSst
case 516: // Label
return (na.compare((char*) cell.str) == 0) ? CELL_BLANK : CELL_TEXT;
return na.contains((char*) cell.str) ? CELL_BLANK : CELL_TEXT;
break;

case 6: // formula
case 1030: // formula (Apple Numbers Bug)
if (cell.l == 0) {
return CELL_NUMERIC;
} else {
if (na.compare((char*) cell.str) == 0) {
if (na.contains((char*) cell.str)) {
return CELL_BLANK;
} else {
return CELL_TEXT;
Expand Down
16 changes: 8 additions & 8 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,13 @@ BEGIN_RCPP
END_RCPP
}
// xls_col_types
CharacterVector xls_col_types(std::string path, std::string na, int sheet, int nskip, int n, bool has_col_names);
CharacterVector xls_col_types(std::string path, std::vector<std::string> na, int sheet, int nskip, int n, bool has_col_names);
RcppExport SEXP readxl_xls_col_types(SEXP pathSEXP, SEXP naSEXP, SEXP sheetSEXP, SEXP nskipSEXP, SEXP nSEXP, SEXP has_col_namesSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP);
Rcpp::traits::input_parameter< std::string >::type na(naSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP);
Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP);
Rcpp::traits::input_parameter< int >::type n(nSEXP);
Expand All @@ -80,7 +80,7 @@ BEGIN_RCPP
END_RCPP
}
// xls_cols
List xls_cols(std::string path, int i, CharacterVector col_names, CharacterVector col_types, std::string na, int nskip);
List xls_cols(std::string path, int i, CharacterVector col_names, CharacterVector col_types, std::vector<std::string> na, int nskip);
RcppExport SEXP readxl_xls_cols(SEXP pathSEXP, SEXP iSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP nskipSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Expand All @@ -89,7 +89,7 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< int >::type i(iSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type col_names(col_namesSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type col_types(col_typesSEXP);
Rcpp::traits::input_parameter< std::string >::type na(naSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP);
__result = Rcpp::wrap(xls_cols(path, i, col_names, col_types, na, nskip));
return __result;
Expand Down Expand Up @@ -163,14 +163,14 @@ BEGIN_RCPP
END_RCPP
}
// xlsx_col_types
CharacterVector xlsx_col_types(std::string path, int sheet, std::string na, int nskip, int n);
CharacterVector xlsx_col_types(std::string path, int sheet, CharacterVector na, int nskip, int n);
RcppExport SEXP readxl_xlsx_col_types(SEXP pathSEXP, SEXP sheetSEXP, SEXP naSEXP, SEXP nskipSEXP, SEXP nSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP);
Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP);
Rcpp::traits::input_parameter< std::string >::type na(naSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type na(naSEXP);
Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP);
Rcpp::traits::input_parameter< int >::type n(nSEXP);
__result = Rcpp::wrap(xlsx_col_types(path, sheet, na, nskip, n));
Expand All @@ -191,7 +191,7 @@ BEGIN_RCPP
END_RCPP
}
// read_xlsx_
List read_xlsx_(std::string path, int sheet, RObject col_names, RObject col_types, std::string na, int nskip);
List read_xlsx_(std::string path, int sheet, RObject col_names, RObject col_types, std::vector<std::string> na, int nskip);
RcppExport SEXP readxl_read_xlsx_(SEXP pathSEXP, SEXP sheetSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP nskipSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Expand All @@ -200,7 +200,7 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP);
Rcpp::traits::input_parameter< RObject >::type col_names(col_namesSEXP);
Rcpp::traits::input_parameter< RObject >::type col_types(col_typesSEXP);
Rcpp::traits::input_parameter< std::string >::type na(naSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP);
__result = Rcpp::wrap(read_xlsx_(path, sheet, col_names, col_types, na, nskip));
return __result;
Expand Down
31 changes: 31 additions & 0 deletions src/StringSet.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef READXL_STRINGSET_
#define READXL_STRINGSET_

#include <Rcpp.h>

// A small set of strings, used to hold the values that should be read as
// missing (NA).
//
// The converting constructors are deliberately NOT explicit: call sites rely
// on implicit conversion from a string literal or a std::vector<std::string>
// when a `const StringSet&` parameter is expected.
class StringSet
{
  std::set<std::string> set_;
public:
  // Builds a set containing `s`. A NULL pointer or an empty string produces
  // an empty set (the empty string is never stored by this constructor).
  StringSet(const char *s = "") {
    // Checking the first character avoids strlen() and tolerates NULL.
    if (s != NULL && *s != '\0')
      set_.insert(s);
  }

  // Builds a set from every element of `s`; duplicates collapse. Unlike the
  // const char* constructor, empty strings in `s` are stored as-is.
  StringSet(const std::vector<std::string> &s) : set_(s.begin(), s.end()) {}

  // Builds a set from every element of an R character vector, converting
  // each element to std::string with Rcpp::as.
  StringSet(const Rcpp::CharacterVector &s) {
    for (Rcpp::CharacterVector::const_iterator it = s.begin(); it != s.end(); ++it)
      set_.insert(Rcpp::as<std::string>(*it));
  }

  // Returns true when `s` is exactly equal to one of the stored strings.
  bool contains(const std::string &s) const {
    return set_.count(s) != 0;
  }

  // Returns true when the text form of `d` is one of the stored strings.
  // NOTE: `d` is formatted with the default ostream settings (6 significant
  // digits, scientific notation for large/small magnitudes), so only values
  // whose default rendering matches a stored string are found.
  bool contains(const double d) const {
    std::ostringstream formatted;
    formatted << d;
    return contains(formatted.str());
  }
};

#endif
4 changes: 2 additions & 2 deletions src/XlsWorkSheet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ CharacterVector xls_col_names(std::string path, int i = 0, int nskip = 0) {
}

// [[Rcpp::export]]
CharacterVector xls_col_types(std::string path, std::string na, int sheet = 0,
CharacterVector xls_col_types(std::string path, std::vector<std::string> na, int sheet = 0,
int nskip = 0, int n = 100, bool has_col_names = false) {
XlsWorkBook wb = XlsWorkBook(path);
std::vector<CellType> types = wb.sheet(sheet).colTypes(na, nskip + has_col_names, n);
Expand All @@ -37,7 +37,7 @@ CharacterVector xls_col_types(std::string path, std::string na, int sheet = 0,

// [[Rcpp::export]]
List xls_cols(std::string path, int i, CharacterVector col_names,
CharacterVector col_types, std::string na, int nskip = 0) {
CharacterVector col_types, std::vector<std::string> na, int nskip = 0) {
XlsWorkBook wb = XlsWorkBook(path);
XlsWorkSheet sheet = wb.sheet(i);

Expand Down
6 changes: 3 additions & 3 deletions src/XlsWorkSheet.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class XlsWorkSheet {
return out;
}

std::vector<CellType> colTypes(std::string na, int nskip = 0, int n_max = 100) {
std::vector<CellType> colTypes(const StringSet &na, int nskip = 0, int n_max = 100) {
std::vector<CellType> types(ncol_);

for (int i = nskip; i < nrow_ && i < n_max; ++i) {
Expand All @@ -88,7 +88,7 @@ class XlsWorkSheet {
}

Rcpp::List readCols(Rcpp::CharacterVector names, std::vector<CellType> types,
std::string na, int nskip = 0) {
const StringSet &na, int nskip = 0) {
if ((int) names.size() != ncol_ || (int) types.size() != ncol_)
Rcpp::stop("Need one name and type for each column");

Expand Down Expand Up @@ -154,7 +154,7 @@ class XlsWorkSheet {
SET_STRING_ELT(col, i, NA_STRING);
} else {
std::string stdString((char*) cell.str);
Rcpp::RObject rString = stdString == na ? NA_STRING : Rf_mkCharCE(stdString.c_str(), CE_UTF8);
Rcpp::RObject rString = na.contains(stdString) ? NA_STRING : Rf_mkCharCE(stdString.c_str(), CE_UTF8);
SET_STRING_ELT(col, i, rString);
}
break;
Expand Down
24 changes: 12 additions & 12 deletions src/XlsxCell.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,31 +107,31 @@ class XlsxCell {
return stringTable.at(id);
}

double asDouble(const std::string& na) {
double asDouble(const StringSet& na) {
rapidxml::xml_node<>* v = cell_->first_node("v");
if (v == NULL || na.compare(v->value()) == 0)
if (v == NULL || na.contains(v->value()))
return NA_REAL;

return (v == NULL) ? 0 : atof(v->value());
}

double asDate(const std::string& na, int offset) {
double asDate(const StringSet& na, int offset) {
rapidxml::xml_node<>* v = cell_->first_node("v");
if (v == NULL || na.compare(v->value()) == 0)
if (v == NULL || na.contains(v->value()))
return NA_REAL;

double value = atof(v->value());
return (v == NULL) ? 0 : (value - offset) * 86400;
}

Rcpp::RObject asCharSxp(const std::string& na,
Rcpp::RObject asCharSxp(const StringSet& na,
const std::vector<std::string>& stringTable) {

// Is it an inline string? // 18.3.1.53 is (Rich Text Inline) [p1649]
rapidxml::xml_node<>* is = cell_->first_node("is");
if (is != NULL) {
std::string value;
if (!parseString(is, &value) || na.compare(value) == 0) {
if (!parseString(is, &value) || na.contains(value)) {
return NA_STRING;
} else {
return Rf_mkCharCE(value.c_str(), CE_UTF8);
Expand All @@ -148,15 +148,15 @@ class XlsxCell {
if (t != NULL && strncmp(t->value(), "s", t->value_size()) == 0) {
return stringFromTable(v->value(), na, stringTable);
} else {
if (na.compare(v->value()) == 0) {
if (na.contains(v->value())) {
return NA_STRING;
} else {
return Rf_mkCharCE(v->value(), CE_UTF8);
}
}
}

CellType type(const std::string& na,
CellType type(const StringSet& na,
const std::vector<std::string>& stringTable,
const std::set<int>& dateStyles) {
rapidxml::xml_attribute<>* t = cell_->first_attribute("t");
Expand All @@ -182,13 +182,13 @@ class XlsxCell {

int id = atoi(v->value());
const std::string& string = stringTable.at(id);
return (string == na) ? CELL_BLANK : CELL_TEXT;
return na.contains(string) ? CELL_BLANK : CELL_TEXT;
} else if (strncmp(t->value(), "str", 5) == 0) { // formula
rapidxml::xml_node<>* v = cell_->first_node("v");
if (v == NULL)
return CELL_BLANK;

return (na.compare(v->value()) == 0) ? CELL_BLANK : CELL_TEXT;
return na.contains(v->value()) ? CELL_BLANK : CELL_TEXT;
} else if (strncmp(t->value(), "inlineStr", 9) == 0) { // formula
return CELL_TEXT;
} else {
Expand All @@ -203,7 +203,7 @@ class XlsxCell {
private:


Rcpp::RObject stringFromTable(const char* val, const std::string& na,
Rcpp::RObject stringFromTable(const char* val, const StringSet& na,
const std::vector<std::string>& stringTable) {
int id = atoi(val);
if (id < 0 || id >= (int) stringTable.size()) {
Expand All @@ -212,7 +212,7 @@ class XlsxCell {
}

const std::string& string = stringTable.at(id);
return (string == na) ? NA_STRING : Rf_mkCharCE(string.c_str(), CE_UTF8);
return na.contains(string) ? NA_STRING : Rf_mkCharCE(string.c_str(), CE_UTF8);
}

};
Expand Down
4 changes: 2 additions & 2 deletions src/XlsxWorkSheet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ IntegerVector parse_ref(std::string ref) {

// [[Rcpp::export]]
CharacterVector xlsx_col_types(std::string path, int sheet = 0,
std::string na = "", int nskip = 0,
CharacterVector na = CharacterVector(), int nskip = 0,
int n = 100) {

XlsxWorkSheet ws(path, sheet);
Expand All @@ -43,7 +43,7 @@ CharacterVector xlsx_col_names(std::string path, int sheet = 0, int nskip = 0) {

// [[Rcpp::export]]
List read_xlsx_(std::string path, int sheet, RObject col_names,
RObject col_types, std::string na, int nskip = 0) {
RObject col_types, std::vector<std::string> na, int nskip = 0) {

XlsxWorkSheet ws(path, sheet);

Expand Down
4 changes: 2 additions & 2 deletions src/XlsxWorkSheet.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class XlsxWorkSheet {
}


std::vector<CellType> colTypes(const std::string& na, int nskip = 0, int n_max = 100, bool has_col_names = false) {
std::vector<CellType> colTypes(const StringSet& na, int nskip = 0, int n_max = 100, bool has_col_names = false) {
rapidxml::xml_node<>* row = getRow(nskip + has_col_names);
std::vector<CellType> types;
types.resize(ncol_);
Expand Down Expand Up @@ -117,7 +117,7 @@ class XlsxWorkSheet {

Rcpp::List readCols(Rcpp::CharacterVector names,
const std::vector<CellType>& types,
const std::string& na, int nskip = 0) {
const StringSet& na, int nskip = 0) {
if ((int) names.size() != ncol_ || (int) types.size() != ncol_)
Rcpp::stop("Need one name and type for each column");

Expand Down

0 comments on commit 2dd82d6

Please sign in to comment.