diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index de985d4db5fa32..68c1839221508c 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -45,7 +45,7 @@ Other API Changes Deprecations ~~~~~~~~~~~~ -- +- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`). - - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ca572e2e56b6c4..0985de3126c5a7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -80,7 +80,8 @@ from pandas.compat import PY36 from pandas.compat.numpy import function as nv from pandas.util._decorators import (Appender, Substitution, - rewrite_axis_style_signature) + rewrite_axis_style_signature, + deprecate_kwarg) from pandas.util._validators import (validate_bool_kwarg, validate_axis_style_args) @@ -1764,6 +1765,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', startcol=startcol, freeze_panes=freeze_panes, engine=engine) + @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) def to_stata(self, fname, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None, version=114, @@ -1869,9 +1871,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True, kwargs['convert_strl'] = convert_strl writer = statawriter(fname, self, convert_dates=convert_dates, - encoding=encoding, byteorder=byteorder, - time_stamp=time_stamp, data_label=data_label, - write_index=write_index, + byteorder=byteorder, time_stamp=time_stamp, + data_label=data_label, write_index=write_index, variable_labels=variable_labels, **kwargs) writer.write_file() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 8584e1f0e3f14b..b2a5bec2a48370 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -33,11 +33,7 @@ from pandas.core.series import Series from pandas.io.common import (get_filepath_or_buffer, BaseIterator, _stringify_path) -from pandas.util._decorators import Appender -from pandas.util._decorators import deprecate_kwarg - -VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1', - 'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1') +from pandas.util._decorators import Appender, deprecate_kwarg _version_error = ("Version of given Stata file is not 104, 105, 108, " "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " @@ -169,6 +165,7 @@ @Appender(_read_stata_doc) +@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col') def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index_col=None, @@ -952,6 +949,7 @@ def __init__(self): class StataReader(StataParser, BaseIterator): __doc__ = _stata_reader_doc + @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col') def __init__(self, path_or_buf, convert_dates=True, convert_categoricals=True, index_col=None, @@ -970,7 +968,7 @@ def __init__(self, path_or_buf, convert_dates=True, self._preserve_dtypes = preserve_dtypes self._columns = columns self._order_categoricals = order_categoricals - self._encoding = encoding + self._encoding = None self._chunksize = chunksize # State variables for the file @@ -1962,17 +1960,14 @@ class StataWriter(StataParser): _max_string_length = 244 + @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None): super(StataWriter, self).__init__() self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index - if encoding is not None: - if encoding not in VALID_ENCODINGS: - raise ValueError('Unknown encoding. Only latin-1 and ascii ' - 'supported.') - self._encoding = encoding + self._encoding = 'latin-1' self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels @@ -2731,6 +2726,7 @@ class StataWriter117(StataWriter): _max_string_length = 2045 + @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None, convert_strl=None): @@ -2738,9 +2734,10 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._convert_strl = [] if convert_strl is None else convert_strl[:] super(StataWriter117, self).__init__(fname, data, convert_dates, - write_index, encoding, byteorder, - time_stamp, data_label, - variable_labels) + write_index, byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + variable_labels=variable_labels) self._map = None self._strl_blob = None diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index e5585902a9dd69..6e77cfe52da27d 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -361,7 +361,9 @@ def test_encoding(self, version): # GH 4626, proper encoding handling raw = read_stata(self.dta_encoding) - encoded = read_stata(self.dta_encoding, encoding="latin-1") + with warnings.catch_warnings(record=True) as w: + encoded = read_stata(self.dta_encoding, encoding='latin-1') + assert len(w) == 1 result = encoded.kreis1849[0] expected = raw.kreis1849[0] @@ -369,8 +371,10 @@ def test_encoding(self, version): assert isinstance(result, compat.string_types) with tm.ensure_clean() as path: - encoded.to_stata(path, encoding='latin-1', - write_index=False, version=version) + with warnings.catch_warnings(record=True) as w: + encoded.to_stata(path, write_index=False, version=version, + encoding='latin-1') + assert len(w) == 1 reread_encoded = read_stata(path) tm.assert_frame_equal(encoded, reread_encoded) @@ -1349,13 +1353,6 @@ def test_out_of_range_float(self): assert 'ColumnTooBig' in cm.exception assert 'infinity' in cm.exception - def test_invalid_encoding(self): - # GH15723, validate encoding - original = self.read_csv(self.csv3) - with pytest.raises(ValueError): - with tm.ensure_clean() as path: - original.to_stata(path, encoding='utf-8') - def test_path_pathlib(self): df = tm.makeDataFrame() df.index.name = 'index'