Skip to content

Commit

Permalink
MAINT: Deprecate encoding from stata reader/writer
Browse files Browse the repository at this point in the history
Deprecate the ``encoding`` parameter in all Stata reading and writing
methods and classes. The encoding is determined solely by the dta file
format version and cannot be changed by users.
  • Loading branch information
bashtage committed Jun 9, 2018
1 parent 415012f commit dc00bd8
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 26 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Other API Changes
Deprecations
~~~~~~~~~~~~

-
- :meth:`DataFrame.to_stata`, :func:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`).
-
-

Expand Down
9 changes: 5 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@
from pandas.compat import PY36
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (Appender, Substitution,
rewrite_axis_style_signature)
rewrite_axis_style_signature,
deprecate_kwarg)
from pandas.util._validators import (validate_bool_kwarg,
validate_axis_style_args)

Expand Down Expand Up @@ -1764,6 +1765,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
startcol=startcol, freeze_panes=freeze_panes,
engine=engine)

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
def to_stata(self, fname, convert_dates=None, write_index=True,
encoding="latin-1", byteorder=None, time_stamp=None,
data_label=None, variable_labels=None, version=114,
Expand Down Expand Up @@ -1869,9 +1871,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
kwargs['convert_strl'] = convert_strl

writer = statawriter(fname, self, convert_dates=convert_dates,
encoding=encoding, byteorder=byteorder,
time_stamp=time_stamp, data_label=data_label,
write_index=write_index,
byteorder=byteorder, time_stamp=time_stamp,
data_label=data_label, write_index=write_index,
variable_labels=variable_labels, **kwargs)
writer.write_file()

Expand Down
22 changes: 11 additions & 11 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@
from pandas.core.series import Series
from pandas.io.common import (get_filepath_or_buffer, BaseIterator,
_stringify_path)
from pandas.util._decorators import Appender
from pandas.util._decorators import deprecate_kwarg
from pandas.util._decorators import Appender, deprecate_kwarg

VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
Expand Down Expand Up @@ -169,6 +168,7 @@


@Appender(_read_stata_doc)
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
def read_stata(filepath_or_buffer, convert_dates=True,
convert_categoricals=True, encoding=None, index_col=None,
Expand Down Expand Up @@ -952,6 +952,7 @@ def __init__(self):
class StataReader(StataParser, BaseIterator):
__doc__ = _stata_reader_doc

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
def __init__(self, path_or_buf, convert_dates=True,
convert_categoricals=True, index_col=None,
Expand All @@ -970,7 +971,7 @@ def __init__(self, path_or_buf, convert_dates=True,
self._preserve_dtypes = preserve_dtypes
self._columns = columns
self._order_categoricals = order_categoricals
self._encoding = encoding
self._encoding = None
self._chunksize = chunksize

# State variables for the file
Expand Down Expand Up @@ -1962,17 +1963,14 @@ class StataWriter(StataParser):

_max_string_length = 244

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
def __init__(self, fname, data, convert_dates=None, write_index=True,
encoding="latin-1", byteorder=None, time_stamp=None,
data_label=None, variable_labels=None):
super(StataWriter, self).__init__()
self._convert_dates = {} if convert_dates is None else convert_dates
self._write_index = write_index
if encoding is not None:
if encoding not in VALID_ENCODINGS:
raise ValueError('Unknown encoding. Only latin-1 and ascii '
'supported.')
self._encoding = encoding
self._encoding = 'latin-1'
self._time_stamp = time_stamp
self._data_label = data_label
self._variable_labels = variable_labels
Expand Down Expand Up @@ -2731,16 +2729,18 @@ class StataWriter117(StataWriter):

_max_string_length = 2045

@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
def __init__(self, fname, data, convert_dates=None, write_index=True,
encoding="latin-1", byteorder=None, time_stamp=None,
data_label=None, variable_labels=None, convert_strl=None):
# Shallow copy since convert_strl might be modified later
self._convert_strl = [] if convert_strl is None else convert_strl[:]

super(StataWriter117, self).__init__(fname, data, convert_dates,
write_index, encoding, byteorder,
time_stamp, data_label,
variable_labels)
write_index, byteorder=byteorder,
time_stamp=time_stamp,
data_label=data_label,
variable_labels=variable_labels)
self._map = None
self._strl_blob = None

Expand Down
17 changes: 7 additions & 10 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,16 +361,20 @@ def test_encoding(self, version):

# GH 4626, proper encoding handling
raw = read_stata(self.dta_encoding)
encoded = read_stata(self.dta_encoding, encoding="latin-1")
with warnings.catch_warnings(record=True) as w:
encoded = read_stata(self.dta_encoding, encoding='latin-1')
assert len(w) == 1
result = encoded.kreis1849[0]

expected = raw.kreis1849[0]
assert result == expected
assert isinstance(result, compat.string_types)

with tm.ensure_clean() as path:
encoded.to_stata(path, encoding='latin-1',
write_index=False, version=version)
with warnings.catch_warnings(record=True) as w:
encoded.to_stata(path, write_index=False, version=version,
encoding='latin-1')
assert len(w) == 1
reread_encoded = read_stata(path)
tm.assert_frame_equal(encoded, reread_encoded)

Expand Down Expand Up @@ -1349,13 +1353,6 @@ def test_out_of_range_float(self):
assert 'ColumnTooBig' in cm.exception
assert 'infinity' in cm.exception

def test_invalid_encoding(self):
# GH15723, validate encoding
original = self.read_csv(self.csv3)
with pytest.raises(ValueError):
with tm.ensure_clean() as path:
original.to_stata(path, encoding='utf-8')

def test_path_pathlib(self):
df = tm.makeDataFrame()
df.index.name = 'index'
Expand Down

0 comments on commit dc00bd8

Please sign in to comment.