Skip to content

Commit

Permalink
BUG: Fix handling of encoding for the StataReader #21244
Browse files Browse the repository at this point in the history
  • Loading branch information
Adrian Castravete committed May 30, 2018
1 parent c85ab08 commit 2968c59
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 9 deletions.
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ MultiIndex
I/O
^^^

-
- :func:`pandas.read_stata` now honours the ``encoding`` parameter and supports the ``'utf-8'``
encoding (:issue:`21244`).
-
-

Expand Down Expand Up @@ -184,4 +185,3 @@ Other
-
-
-

5 changes: 3 additions & 2 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
from pandas.util._decorators import deprecate_kwarg

VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1',
'utf-8', 'utf8')

_version_error = ("Version of given Stata file is not 104, 105, 108, "
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
Expand Down Expand Up @@ -1335,7 +1336,7 @@ def _calcsize(self, fmt):

def _decode(self, s):
s = s.partition(b"\0")[0]
return s.decode('utf-8')
return s.decode(self._encoding or self._default_encoding)

def _null_terminate(self, s):
if compat.PY3 or self._encoding is not None:
Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ def setup_method(self, method):

self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')

def read_dta(self, file):
def read_dta(self, file, encoding='latin-1'):
# Legacy default reader configuration
return read_stata(file, convert_dates=True)
return read_stata(file, convert_dates=True, encoding=encoding)

def read_csv(self, file):
return read_csv(file, parse_dates=True)
Expand Down Expand Up @@ -268,7 +268,7 @@ def test_read_dta12(self):
tm.assert_frame_equal(parsed_117, expected, check_dtype=False)

def test_read_dta18(self):
parsed_118 = self.read_dta(self.dta22_118)
parsed_118 = self.read_dta(self.dta22_118, encoding='utf-8')
parsed_118["Bytes"] = parsed_118["Bytes"].astype('O')
expected = DataFrame.from_records(
[['Cat', 'Bogota', u'Bogotá', 1, 1.0, u'option b Ünicode', 1.0],
Expand All @@ -283,7 +283,7 @@ def test_read_dta18(self):
for col in parsed_118.columns:
tm.assert_almost_equal(parsed_118[col], expected[col])

with StataReader(self.dta22_118) as rdr:
with StataReader(self.dta22_118, encoding='utf-8') as rdr:
vl = rdr.variable_labels()
vl_expected = {u'Unicode_Cities_Strl':
u'Here are some strls with Ünicode chars',
Expand Down Expand Up @@ -1358,7 +1358,7 @@ def test_invalid_encoding(self):
original = self.read_csv(self.csv3)
with pytest.raises(ValueError):
with tm.ensure_clean() as path:
original.to_stata(path, encoding='utf-8')
original.to_stata(path, encoding='pokemon')

def test_path_pathlib(self):
df = tm.makeDataFrame()
Expand Down

0 comments on commit 2968c59

Please sign in to comment.