From 57c24f8c6447c82d4e609e6b9e28680fa6bf5e97 Mon Sep 17 00:00:00 2001 From: Adrian Castravete Date: Tue, 29 May 2018 15:38:36 +0300 Subject: [PATCH] BUG: Fix handling of encoding for the StataReader #21244 --- doc/source/whatsnew/v0.24.0.txt | 4 ++-- pandas/io/stata.py | 5 +++-- pandas/tests/io/test_stata.py | 10 +++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index e931450cb5c01..ee33f31aad01b 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -146,7 +146,8 @@ MultiIndex I/O ^^^ -- +- :func:`pandas.read_stata` now honours the ``encoding`` parameter, and supports the 'utf-8' + encoding (:issue:`21244`). - - @@ -184,4 +185,3 @@ Other - - - - diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2797924985c70..f0e8b8d638d0d 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -37,7 +37,8 @@ from pandas.util._decorators import deprecate_kwarg VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1', - 'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1') + 'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1', + 'utf-8', 'utf8') _version_error = ("Version of given Stata file is not 104, 105, 108, " "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " @@ -1335,7 +1336,7 @@ def _calcsize(self, fmt): def _decode(self, s): s = s.partition(b"\0")[0] - return s.decode('utf-8') + return s.decode(self._encoding or self._default_encoding) def _null_terminate(self, s): if compat.PY3 or self._encoding is not None: diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index f3a465da4e87f..db38227155df4 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -99,9 +99,9 @@ def setup_method(self, method): self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta') - def read_dta(self, file): + def read_dta(self, file, encoding='latin-1'): # Legacy default reader configuration - 
return read_stata(file, convert_dates=True) + return read_stata(file, convert_dates=True, encoding=encoding) def read_csv(self, file): return read_csv(file, parse_dates=True) @@ -268,7 +268,7 @@ def test_read_dta12(self): tm.assert_frame_equal(parsed_117, expected, check_dtype=False) def test_read_dta18(self): - parsed_118 = self.read_dta(self.dta22_118) + parsed_118 = self.read_dta(self.dta22_118, encoding='utf-8') parsed_118["Bytes"] = parsed_118["Bytes"].astype('O') expected = DataFrame.from_records( [['Cat', 'Bogota', u'Bogotá', 1, 1.0, u'option b Ünicode', 1.0], @@ -283,7 +283,7 @@ def test_read_dta18(self): for col in parsed_118.columns: tm.assert_almost_equal(parsed_118[col], expected[col]) - with StataReader(self.dta22_118) as rdr: + with StataReader(self.dta22_118, encoding='utf-8') as rdr: vl = rdr.variable_labels() vl_expected = {u'Unicode_Cities_Strl': u'Here are some strls with Ünicode chars', @@ -1358,7 +1358,7 @@ def test_invalid_encoding(self): original = self.read_csv(self.csv3) with pytest.raises(ValueError): with tm.ensure_clean() as path: - original.to_stata(path, encoding='utf-8') + original.to_stata(path, encoding='pokemon') def test_path_pathlib(self): df = tm.makeDataFrame()