Skip to content

Commit

Permalink
MAINT: Deprecate encoding from stata reader/writer (pandas-dev#21400)
Browse files Browse the repository at this point in the history
Deprecate the encoding parameter of all Stata reading and writing
methods and classes. The encoding depends only on the file format and
cannot be changed by users.
  • Loading branch information
bashtage authored and victor committed Sep 30, 2018
1 parent 766b9fd commit 0ade274
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 29 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Other API Changes
Deprecations
~~~~~~~~~~~~

-
- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`).
-
-

Expand Down
9 changes: 5 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@
from pandas.compat import PY36
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (Appender, Substitution,
rewrite_axis_style_signature)
rewrite_axis_style_signature,
deprecate_kwarg)
from pandas.util._validators import (validate_bool_kwarg,
validate_axis_style_args)

Expand Down Expand Up @@ -1764,6 +1765,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
startcol=startcol, freeze_panes=freeze_panes,
engine=engine)

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
def to_stata(self, fname, convert_dates=None, write_index=True,
encoding="latin-1", byteorder=None, time_stamp=None,
data_label=None, variable_labels=None, version=114,
Expand Down Expand Up @@ -1869,9 +1871,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
kwargs['convert_strl'] = convert_strl

writer = statawriter(fname, self, convert_dates=convert_dates,
encoding=encoding, byteorder=byteorder,
time_stamp=time_stamp, data_label=data_label,
write_index=write_index,
byteorder=byteorder, time_stamp=time_stamp,
data_label=data_label, write_index=write_index,
variable_labels=variable_labels, **kwargs)
writer.write_file()

Expand Down
25 changes: 11 additions & 14 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,7 @@
from pandas.core.series import Series
from pandas.io.common import (get_filepath_or_buffer, BaseIterator,
_stringify_path)
from pandas.util._decorators import Appender
from pandas.util._decorators import deprecate_kwarg

VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
from pandas.util._decorators import Appender, deprecate_kwarg

_version_error = ("Version of given Stata file is not 104, 105, 108, "
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
Expand Down Expand Up @@ -169,6 +165,7 @@


@Appender(_read_stata_doc)
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
def read_stata(filepath_or_buffer, convert_dates=True,
convert_categoricals=True, encoding=None, index_col=None,
Expand Down Expand Up @@ -952,6 +949,7 @@ def __init__(self):
class StataReader(StataParser, BaseIterator):
__doc__ = _stata_reader_doc

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
def __init__(self, path_or_buf, convert_dates=True,
convert_categoricals=True, index_col=None,
Expand All @@ -970,7 +968,7 @@ def __init__(self, path_or_buf, convert_dates=True,
self._preserve_dtypes = preserve_dtypes
self._columns = columns
self._order_categoricals = order_categoricals
self._encoding = encoding
self._encoding = None
self._chunksize = chunksize

# State variables for the file
Expand Down Expand Up @@ -1962,17 +1960,14 @@ class StataWriter(StataParser):

_max_string_length = 244

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
def __init__(self, fname, data, convert_dates=None, write_index=True,
encoding="latin-1", byteorder=None, time_stamp=None,
data_label=None, variable_labels=None):
super(StataWriter, self).__init__()
self._convert_dates = {} if convert_dates is None else convert_dates
self._write_index = write_index
if encoding is not None:
if encoding not in VALID_ENCODINGS:
raise ValueError('Unknown encoding. Only latin-1 and ascii '
'supported.')
self._encoding = encoding
self._encoding = 'latin-1'
self._time_stamp = time_stamp
self._data_label = data_label
self._variable_labels = variable_labels
Expand Down Expand Up @@ -2731,16 +2726,18 @@ class StataWriter117(StataWriter):

_max_string_length = 2045

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
def __init__(self, fname, data, convert_dates=None, write_index=True,
encoding="latin-1", byteorder=None, time_stamp=None,
data_label=None, variable_labels=None, convert_strl=None):
# Shallow copy since convert_strl might be modified later
self._convert_strl = [] if convert_strl is None else convert_strl[:]

super(StataWriter117, self).__init__(fname, data, convert_dates,
write_index, encoding, byteorder,
time_stamp, data_label,
variable_labels)
write_index, byteorder=byteorder,
time_stamp=time_stamp,
data_label=data_label,
variable_labels=variable_labels)
self._map = None
self._strl_blob = None

Expand Down
15 changes: 5 additions & 10 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,16 +361,18 @@ def test_encoding(self, version):

# GH 4626, proper encoding handling
raw = read_stata(self.dta_encoding)
encoded = read_stata(self.dta_encoding, encoding="latin-1")
with tm.assert_produces_warning(FutureWarning):
encoded = read_stata(self.dta_encoding, encoding='latin-1')
result = encoded.kreis1849[0]

expected = raw.kreis1849[0]
assert result == expected
assert isinstance(result, compat.string_types)

with tm.ensure_clean() as path:
encoded.to_stata(path, encoding='latin-1',
write_index=False, version=version)
with tm.assert_produces_warning(FutureWarning):
encoded.to_stata(path, write_index=False, version=version,
encoding='latin-1')
reread_encoded = read_stata(path)
tm.assert_frame_equal(encoded, reread_encoded)

Expand Down Expand Up @@ -1349,13 +1351,6 @@ def test_out_of_range_float(self):
assert 'ColumnTooBig' in cm.exception
assert 'infinity' in cm.exception

def test_invalid_encoding(self):
# GH15723, validate encoding
original = self.read_csv(self.csv3)
with pytest.raises(ValueError):
with tm.ensure_clean() as path:
original.to_stata(path, encoding='utf-8')

def test_path_pathlib(self):
df = tm.makeDataFrame()
df.index.name = 'index'
Expand Down

0 comments on commit 0ade274

Please sign in to comment.