From 57c24f8c6447c82d4e609e6b9e28680fa6bf5e97 Mon Sep 17 00:00:00 2001
From: Adrian Castravete <adrian@figshare.com>
Date: Tue, 29 May 2018 15:38:36 +0300
Subject: [PATCH] BUG: Fix handling of encoding for the StataReader #21244

---
 doc/source/whatsnew/v0.24.0.txt |  4 ++--
 pandas/io/stata.py              |  5 +++--
 pandas/tests/io/test_stata.py   | 10 +++++-----
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index e931450cb5c01..ee33f31aad01b 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -146,7 +146,8 @@ MultiIndex
 I/O
 ^^^
 
--
+- :func:`pandas.read_stata` now honours the ``encoding`` parameter and supports the
+  ``'utf-8'`` encoding (:issue:`21244`)
 -
 -
 
@@ -184,4 +185,3 @@ Other
 -
 -
 -
-
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 2797924985c70..f0e8b8d638d0d 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -37,7 +37,8 @@
 from pandas.util._decorators import deprecate_kwarg
 
 VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
-                   'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
+                   'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1',
+                   'utf-8', 'utf8')
 
 _version_error = ("Version of given Stata file is not 104, 105, 108, "
                   "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
@@ -1335,7 +1336,7 @@ def _calcsize(self, fmt):
 
     def _decode(self, s):
         s = s.partition(b"\0")[0]
-        return s.decode('utf-8')
+        return s.decode(self._encoding or self._default_encoding)
 
     def _null_terminate(self, s):
         if compat.PY3 or self._encoding is not None:
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index f3a465da4e87f..db38227155df4 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -99,9 +99,9 @@ def setup_method(self, method):
 
         self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
 
-    def read_dta(self, file):
+    def read_dta(self, file, encoding='latin-1'):
         # Legacy default reader configuration
-        return read_stata(file, convert_dates=True)
+        return read_stata(file, convert_dates=True, encoding=encoding)
 
     def read_csv(self, file):
         return read_csv(file, parse_dates=True)
@@ -268,7 +268,7 @@ def test_read_dta12(self):
         tm.assert_frame_equal(parsed_117, expected, check_dtype=False)
 
     def test_read_dta18(self):
-        parsed_118 = self.read_dta(self.dta22_118)
+        parsed_118 = self.read_dta(self.dta22_118, encoding='utf-8')
         parsed_118["Bytes"] = parsed_118["Bytes"].astype('O')
         expected = DataFrame.from_records(
             [['Cat', 'Bogota', u'Bogotá', 1, 1.0, u'option b Ünicode', 1.0],
@@ -283,7 +283,7 @@ def test_read_dta18(self):
         for col in parsed_118.columns:
             tm.assert_almost_equal(parsed_118[col], expected[col])
 
-        with StataReader(self.dta22_118) as rdr:
+        with StataReader(self.dta22_118, encoding='utf-8') as rdr:
             vl = rdr.variable_labels()
             vl_expected = {u'Unicode_Cities_Strl':
                            u'Here are some strls with Ünicode chars',
@@ -1358,7 +1358,7 @@ def test_invalid_encoding(self):
         original = self.read_csv(self.csv3)
         with pytest.raises(ValueError):
             with tm.ensure_clean() as path:
-                original.to_stata(path, encoding='utf-8')
+                original.to_stata(path, encoding='pokemon')
 
     def test_path_pathlib(self):
         df = tm.makeDataFrame()