Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT: Deprecate encoding from stata reader/writer #21400

Merged
merged 1 commit into from
Jun 12, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Other API Changes
Deprecations
~~~~~~~~~~~~

-
- :meth:`DataFrame.to_stata`, :func:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`).
-
-

Expand Down
9 changes: 5 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@
from pandas.compat import PY36
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (Appender, Substitution,
rewrite_axis_style_signature)
rewrite_axis_style_signature,
deprecate_kwarg)
from pandas.util._validators import (validate_bool_kwarg,
validate_axis_style_args)

Expand Down Expand Up @@ -1764,6 +1765,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
startcol=startcol, freeze_panes=freeze_panes,
engine=engine)

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
def to_stata(self, fname, convert_dates=None, write_index=True,
encoding="latin-1", byteorder=None, time_stamp=None,
data_label=None, variable_labels=None, version=114,
Expand Down Expand Up @@ -1869,9 +1871,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
kwargs['convert_strl'] = convert_strl

writer = statawriter(fname, self, convert_dates=convert_dates,
encoding=encoding, byteorder=byteorder,
time_stamp=time_stamp, data_label=data_label,
write_index=write_index,
byteorder=byteorder, time_stamp=time_stamp,
data_label=data_label, write_index=write_index,
variable_labels=variable_labels, **kwargs)
writer.write_file()

Expand Down
25 changes: 11 additions & 14 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,7 @@
from pandas.core.series import Series
from pandas.io.common import (get_filepath_or_buffer, BaseIterator,
_stringify_path)
from pandas.util._decorators import Appender
from pandas.util._decorators import deprecate_kwarg

VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
from pandas.util._decorators import Appender, deprecate_kwarg

_version_error = ("Version of given Stata file is not 104, 105, 108, "
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
Expand Down Expand Up @@ -169,6 +165,7 @@


@Appender(_read_stata_doc)
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
def read_stata(filepath_or_buffer, convert_dates=True,
convert_categoricals=True, encoding=None, index_col=None,
Expand Down Expand Up @@ -952,6 +949,7 @@ def __init__(self):
class StataReader(StataParser, BaseIterator):
__doc__ = _stata_reader_doc

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
def __init__(self, path_or_buf, convert_dates=True,
convert_categoricals=True, index_col=None,
Expand All @@ -970,7 +968,7 @@ def __init__(self, path_or_buf, convert_dates=True,
self._preserve_dtypes = preserve_dtypes
self._columns = columns
self._order_categoricals = order_categoricals
self._encoding = encoding
self._encoding = None
self._chunksize = chunksize

# State variables for the file
Expand Down Expand Up @@ -1962,17 +1960,14 @@ class StataWriter(StataParser):

_max_string_length = 244

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
def __init__(self, fname, data, convert_dates=None, write_index=True,
encoding="latin-1", byteorder=None, time_stamp=None,
data_label=None, variable_labels=None):
super(StataWriter, self).__init__()
self._convert_dates = {} if convert_dates is None else convert_dates
self._write_index = write_index
if encoding is not None:
if encoding not in VALID_ENCODINGS:
raise ValueError('Unknown encoding. Only latin-1 and ascii '
'supported.')
self._encoding = encoding
self._encoding = 'latin-1'
self._time_stamp = time_stamp
self._data_label = data_label
self._variable_labels = variable_labels
Expand Down Expand Up @@ -2731,16 +2726,18 @@ class StataWriter117(StataWriter):

_max_string_length = 2045

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
def __init__(self, fname, data, convert_dates=None, write_index=True,
encoding="latin-1", byteorder=None, time_stamp=None,
data_label=None, variable_labels=None, convert_strl=None):
# Shallow copy since convert_strl might be modified later
self._convert_strl = [] if convert_strl is None else convert_strl[:]

super(StataWriter117, self).__init__(fname, data, convert_dates,
write_index, encoding, byteorder,
time_stamp, data_label,
variable_labels)
write_index, byteorder=byteorder,
time_stamp=time_stamp,
data_label=data_label,
variable_labels=variable_labels)
self._map = None
self._strl_blob = None

Expand Down
15 changes: 5 additions & 10 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,16 +361,18 @@ def test_encoding(self, version):

# GH 4626, proper encoding handling
raw = read_stata(self.dta_encoding)
encoded = read_stata(self.dta_encoding, encoding="latin-1")
with tm.assert_produces_warning(FutureWarning):
encoded = read_stata(self.dta_encoding, encoding='latin-1')
result = encoded.kreis1849[0]

expected = raw.kreis1849[0]
assert result == expected
assert isinstance(result, compat.string_types)

with tm.ensure_clean() as path:
encoded.to_stata(path, encoding='latin-1',
write_index=False, version=version)
with tm.assert_produces_warning(FutureWarning):
encoded.to_stata(path, write_index=False, version=version,
encoding='latin-1')
reread_encoded = read_stata(path)
tm.assert_frame_equal(encoded, reread_encoded)

Expand Down Expand Up @@ -1349,13 +1351,6 @@ def test_out_of_range_float(self):
assert 'ColumnTooBig' in cm.exception
assert 'infinity' in cm.exception

def test_invalid_encoding(self):
# GH15723, validate encoding
original = self.read_csv(self.csv3)
with pytest.raises(ValueError):
with tm.ensure_clean() as path:
original.to_stata(path, encoding='utf-8')

def test_path_pathlib(self):
df = tm.makeDataFrame()
df.index.name = 'index'
Expand Down