From 69203636c2f661bbc167a5a4fc91ea7d30ff0b67 Mon Sep 17 00:00:00 2001 From: Justin Zheng Date: Wed, 14 Nov 2018 19:02:24 -0800 Subject: [PATCH 01/10] BUG-22984 Fix truncation of DataFrame representations (#22987) * BUG-22984 Fix truncation of DataFrame representations --- doc/source/whatsnew/v0.24.0.rst | 3 ++- pandas/io/formats/format.py | 5 ----- pandas/tests/io/formats/test_format.py | 17 ++++++++++++----- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 44c467795d1ed..9275357e5ad18 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1319,7 +1319,8 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) - :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) - Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`) -- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) +- Bug in :func:`DataFrame.to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) +- Bug in :func:`DataFrame.to_string()` that caused representations of :class:`DataFrame` to not take up the whole window (:issue:`22984`) - Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`). 
- Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`) - Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 6f64605bcf175..9857129f56b0c 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -608,11 +608,6 @@ def to_string(self): else: # max_cols == 0. Try to fit frame to terminal text = self.adj.adjoin(1, *strcols).split('\n') max_len = Series(text).str.len().max() - headers = [ele[0] for ele in strcols] - # Size of last col determines dot col size. See - # `self._to_str_columns - size_tr_col = len(headers[self.tr_size_col]) - max_len += size_tr_col # Need to make space for largest row # plus truncate dot col dif = max_len - self.w # '+ 1' to avoid too wide repr (GH PR #17023) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 28aa8a92cc410..b8ca8cb73c7e9 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -305,14 +305,10 @@ def test_repr_non_interactive(self): assert not has_truncated_repr(df) assert not has_expanded_repr(df) - def test_repr_truncates_terminal_size(self): + def test_repr_truncates_terminal_size(self, mock): # https://github.com/pandas-dev/pandas/issues/21180 # TODO: use mock fixutre. # This is being backported, so doing it directly here. 
- try: - from unittest import mock - except ImportError: - mock = pytest.importorskip("mock") terminal_size = (118, 96) p1 = mock.patch('pandas.io.formats.console.get_terminal_size', @@ -343,6 +339,17 @@ def test_repr_truncates_terminal_size(self): assert df2.columns[0] in result.split('\n')[0] + def test_repr_truncates_terminal_size_full(self, mock): + # GH 22984 ensure entire window is filled + terminal_size = (80, 24) + df = pd.DataFrame(np.random.rand(1, 7)) + p1 = mock.patch('pandas.io.formats.console.get_terminal_size', + return_value=terminal_size) + p2 = mock.patch('pandas.io.formats.format.get_terminal_size', + return_value=terminal_size) + with p1, p2: + assert "..." not in str(df) + def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: From c55057f79b0cbe47cf73f0fd8141e6c84f1e7370 Mon Sep 17 00:00:00 2001 From: dannyhyunkim <38394262+dannyhyunkim@users.noreply.github.com> Date: Thu, 15 Nov 2018 16:50:11 +1100 Subject: [PATCH 02/10] DOC: Validate space before colon docstring parameters #23483 (#23506) --- scripts/tests/test_validate_docstrings.py | 5 ++--- scripts/validate_docstrings.py | 8 +++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index c1bdab73c2671..60b60603f0289 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -761,9 +761,8 @@ def test_bad_generic_functions(self, func): ('BadParameters', 'missing_params', ('Parameters {**kwargs} not documented',)), ('BadParameters', 'bad_colon_spacing', - ('Parameters {kind} not documented', - 'Unknown parameters {kind: str}', - 'Parameter "kind: str" has no type')), + ('Parameter "kind" requires a space before the colon ' + 'separating the parameter name and type',)), ('BadParameters', 'no_description_period', ('Parameter "kind" description should finish with "."',)), 
('BadParameters', 'no_description_period_with_directive', diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 7da77a1f60ad5..873ba71d6539d 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -97,6 +97,8 @@ 'PR08': 'Parameter "{param_name}" description should start with a ' 'capital letter', 'PR09': 'Parameter "{param_name}" description should finish with "."', + 'PR10': 'Parameter "{param_name}" requires a space before the colon ' + 'separating the parameter name and type', 'RT01': 'No Returns section found', 'YD01': 'No Yields section found', 'SA01': 'See Also section not found', @@ -644,7 +646,11 @@ def validate_one(func_name): for param in doc.doc_parameters: if not param.startswith("*"): # Check can ignore var / kwargs if not doc.parameter_type(param): - errs.append(error('PR04', param_name=param)) + if ':' in param: + errs.append(error('PR10', + param_name=param.split(':')[0])) + else: + errs.append(error('PR04', param_name=param)) else: if doc.parameter_type(param)[-1] == '.': errs.append(error('PR05', param_name=param)) From 615f13725c647147b16520b099dfe65683db505e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Nov 2018 05:53:39 -0600 Subject: [PATCH 03/10] DOC: Handle exceptions when computing contributors. 
(#23714) * DOC: fix the build maybe --- doc/sphinxext/contributors.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index 0f04d47435699..8c9fa5bc961d1 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -10,6 +10,7 @@ """ from docutils import nodes from docutils.parsers.rst import Directive +import git from announce import build_components @@ -19,17 +20,25 @@ class ContributorsDirective(Directive): name = 'contributors' def run(self): - components = build_components(self.arguments[0]) - - message = nodes.paragraph() - message += nodes.Text(components['author_message']) - - listnode = nodes.bullet_list() - - for author in components['authors']: - para = nodes.paragraph() - para += nodes.Text(author) - listnode += nodes.list_item('', para) + range_ = self.arguments[0] + try: + components = build_components(range_) + except git.GitCommandError: + return [ + self.state.document.reporter.warning( + "Cannot find contributors for range '{}'".format(range_), + line=self.lineno) + ] + else: + message = nodes.paragraph() + message += nodes.Text(components['author_message']) + + listnode = nodes.bullet_list() + + for author in components['authors']: + para = nodes.paragraph() + para += nodes.Text(author) + listnode += nodes.list_item('', para) return [message, listnode] From ac8cb03ee25b9b624141ec9a002a3ae0f4aab880 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Thu, 15 Nov 2018 13:02:33 +0100 Subject: [PATCH 04/10] DOC: flake8-per-pr for windows users (#23707) --- doc/source/contributing.rst | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 7eb9a6cf815ba..b44bd1cfd9007 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -591,21 +591,14 @@ run this slightly 
modified command:: git diff master --name-only -- "*.py" | grep "pandas/" | xargs flake8 -Note that on Windows, these commands are unfortunately not possible because -commands like ``grep`` and ``xargs`` are not available natively. To imitate the -behavior with the commands above, you should run:: +Windows does not support the ``grep`` and ``xargs`` commands (unless installed +for example via the `MinGW `__ toolchain), but one can +imitate the behaviour as follows:: - git diff master --name-only -- "*.py" + for /f %i in ('git diff upstream/master --name-only ^| findstr pandas/') do flake8 %i -This will list all of the Python files that have been modified. The only ones -that matter during linting are any whose directory filepath begins with "pandas." -For each filepath, copy and paste it after the ``flake8`` command as shown below: - - flake8 - -Alternatively, you can install the ``grep`` and ``xargs`` commands via the -`MinGW `__ toolchain, and it will allow you to run the -commands above. +This will also get all the files being changed by the PR (and within the +``pandas/`` folder), and run ``flake8`` on them one after the other. .. 
_contributing.import-formatting: From fb68731cdfec6f3716ed13ced4d863fa8c2c8161 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Nov 2018 06:57:53 -0600 Subject: [PATCH 05/10] Ensure Index._data is an ndarray (#23628) --- pandas/core/indexes/base.py | 8 +++++++- pandas/tests/indexes/test_base.py | 7 +++++++ pandas/tests/series/test_operators.py | 28 +++++++++++++-------------- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2f449b4b33d8d..454d029c266f5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -19,7 +19,7 @@ ABCSeries, ABCDataFrame, ABCMultiIndex, ABCPeriodIndex, ABCTimedeltaIndex, ABCDatetimeIndex, - ABCDateOffset) + ABCDateOffset, ABCIndexClass) from pandas.core.dtypes.missing import isna, array_equivalent from pandas.core.dtypes.cast import maybe_cast_to_integer_array from pandas.core.dtypes.common import ( @@ -522,6 +522,12 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): values = cls(values, name=name, dtype=dtype, **kwargs)._ndarray_values + if isinstance(values, (ABCSeries, ABCIndexClass)): + # Index._data must always be an ndarray. + # This is no-copy for when _values is an ndarray, + # which should be always at this point. 
+ values = np.asarray(values._values) + result = object.__new__(cls) result._data = values result.name = name diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 619f60a42e0be..424f6b1f9a77a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -504,6 +504,13 @@ def test_constructor_cast(self): with pytest.raises(ValueError, match=msg): Index(["a", "b", "c"], dtype=float) + def test_constructor_unwraps_index(self, indices): + if isinstance(indices, pd.MultiIndex): + raise pytest.skip("MultiIndex has no ._data") + a = indices + b = type(a)(a) + tm.assert_equal(a._data, b._data) + def test_view_with_args(self): restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 4cce26d135443..bcecedc2bba97 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -189,20 +189,7 @@ def test_scalar_na_logical_ops_corners(self): operator.and_, operator.or_, operator.xor, - pytest.param(ops.rand_, - marks=pytest.mark.xfail(reason="GH#22092 Index " - "implementation returns " - "Index", - raises=AssertionError, - strict=True)), - pytest.param(ops.ror_, - marks=pytest.mark.xfail(reason="GH#22092 Index " - "implementation raises", - raises=ValueError, strict=True)), - pytest.param(ops.rxor, - marks=pytest.mark.xfail(reason="GH#22092 Index " - "implementation raises", - raises=TypeError, strict=True)) + ]) def test_logical_ops_with_index(self, op): # GH#22092, GH#19792 @@ -221,6 +208,19 @@ def test_logical_ops_with_index(self, op): result = op(ser, idx2) assert_series_equal(result, expected) + @pytest.mark.parametrize("op, expected", [ + (ops.rand_, pd.Index([False, True])), + (ops.ror_, pd.Index([False, True])), + (ops.rxor, pd.Index([])), + ]) + def test_reverse_ops_with_index(self, op, expected): + # https://github.com/pandas-dev/pandas/pull/23628 + # multi-set 
Index ops are buggy, so let's avoid duplicates... + ser = Series([True, False]) + idx = Index([False, True]) + result = op(ser, idx) + tm.assert_index_equal(result, expected) + def test_logical_ops_label_based(self): # GH#4947 # logical ops should be label based From 88cbce33d5aaf514e88f0c09cc2a63dcc529bd0a Mon Sep 17 00:00:00 2001 From: harisbal Date: Thu, 15 Nov 2018 14:08:29 +0100 Subject: [PATCH 06/10] ENH: Allow for join between two multi-index dataframe instances (#20356) --- doc/source/whatsnew/v0.24.0.rst | 41 ++ pandas/core/indexes/base.py | 98 ++-- pandas/core/reshape/merge.py | 89 +++ pandas/tests/reshape/merge/test_merge.py | 520 +----------------- pandas/tests/reshape/merge/test_multi.py | 672 +++++++++++++++++++++++ 5 files changed, 866 insertions(+), 554 deletions(-) create mode 100644 pandas/tests/reshape/merge/test_multi.py diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 9275357e5ad18..83f034014bb3b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -183,6 +183,47 @@ array, but rather an ``ExtensionArray``: This is the same behavior as ``Series.values`` for categorical data. See :ref:`whatsnew_0240.api_breaking.interval_values` for more. +.. _whatsnew_0240.enhancements.join_with_two_multiindexes: + +Joining with two multi-indexes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`Datafame.merge` and :func:`Dataframe.join` can now be used to join multi-indexed ``Dataframe`` instances on the overlaping index levels (:issue:`6360`) + +See the :ref:`Merge, join, and concatenate +` documentation section. + +.. 
ipython:: python + + index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), + ('K1', 'X2')], + names=['key', 'X']) + + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], + 'B': ['B0', 'B1', 'B2']}, + index=index_left) + + + index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), + ('K2', 'Y2'), ('K2', 'Y3')], + names=['key', 'Y']) + + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}, + index=index_right) + + + left.join(right) + +For earlier versions this can be done using the following. + +.. ipython:: python + + pd.merge(left.reset_index(), right.reset_index(), + on=['key'], how='inner').set_index(['key', 'X', 'Y']) + .. _whatsnew_0240.enhancements.rename_axis: Renaming names in a MultiIndex diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 454d029c266f5..0632198c77262 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3168,8 +3168,8 @@ def get_value(self, series, key): iloc = self.get_loc(key) return s[iloc] except KeyError: - if (len(self) > 0 - and (self.holds_integer() or self.is_boolean())): + if (len(self) > 0 and + (self.holds_integer() or self.is_boolean())): raise elif is_integer(key): return s[key] @@ -3957,46 +3957,72 @@ def join(self, other, how='left', level=None, return_indexers=False, def _join_multi(self, other, how, return_indexers=True): from .multi import MultiIndex + from pandas.core.reshape.merge import _restore_dropped_levels_multijoin + + # figure out join names + self_names = set(com._not_none(*self.names)) + other_names = set(com._not_none(*other.names)) + overlap = self_names & other_names + + # need at least 1 in common + if not overlap: + raise ValueError("cannot join with no overlapping index names") + self_is_mi = isinstance(self, MultiIndex) other_is_mi = isinstance(other, MultiIndex) - # figure out join names - self_names = com._not_none(*self.names) - other_names = com._not_none(*other.names) - overlap = 
list(set(self_names) & set(other_names)) - - # need at least 1 in common, but not more than 1 - if not len(overlap): - raise ValueError("cannot join with no level specified and no " - "overlapping names") - if len(overlap) > 1: - raise NotImplementedError("merging with more than one level " - "overlap on a multi-index is not " - "implemented") - jl = overlap[0] + if self_is_mi and other_is_mi: + + # Drop the non-matching levels from left and right respectively + ldrop_names = list(self_names - overlap) + rdrop_names = list(other_names - overlap) + + self_jnlevels = self.droplevel(ldrop_names) + other_jnlevels = other.droplevel(rdrop_names) + + # Join left and right + # Join on same leveled multi-index frames is supported + join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, + return_indexers=True) + + # Restore the dropped levels + # Returned index level order is + # common levels, ldrop_names, rdrop_names + dropped_names = ldrop_names + rdrop_names + + levels, labels, names = ( + _restore_dropped_levels_multijoin(self, other, + dropped_names, + join_idx, + lidx, ridx)) + # Re-create the multi-index + multi_join_idx = MultiIndex(levels=levels, labels=labels, + names=names, verify_integrity=False) + + multi_join_idx = multi_join_idx.remove_unused_levels() + + return multi_join_idx, lidx, ridx + + jl = list(overlap)[0] + + # Case where only one index is multi # make the indices into mi's that match - if not (self_is_mi and other_is_mi): - - flip_order = False - if self_is_mi: - self, other = other, self - flip_order = True - # flip if join method is right or left - how = {'right': 'left', 'left': 'right'}.get(how, how) - - level = other.names.index(jl) - result = self._join_level(other, level, how=how, - return_indexers=return_indexers) - - if flip_order: - if isinstance(result, tuple): - return result[0], result[2], result[1] - return result + flip_order = False + if self_is_mi: + self, other = other, self + flip_order = True + # flip if join method is 
right or left + how = {'right': 'left', 'left': 'right'}.get(how, how) + + level = other.names.index(jl) + result = self._join_level(other, level, how=how, + return_indexers=return_indexers) - # 2 multi-indexes - raise NotImplementedError("merging with both multi-indexes is not " - "implemented") + if flip_order: + if isinstance(result, tuple): + return result[0], result[2], result[1] + return result def _join_non_unique(self, other, how='left', return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3d6f55c907269..93a6e4538cbc1 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1122,6 +1122,95 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', return join_func(lkey, rkey, count, **kwargs) +def _restore_dropped_levels_multijoin(left, right, dropped_level_names, + join_index, lindexer, rindexer): + """ + *this is an internal non-public method* + + Returns the levels, labels and names of a multi-index to multi-index join. + Depending on the type of join, this method restores the appropriate + dropped levels of the joined multi-index. 
+ The method relies on lidx, rindexer which hold the index positions of + left and right, where a join was feasible + + Parameters + ---------- + left : MultiIndex + left index + right : MultiIndex + right index + dropped_level_names : str array + list of non-common level names + join_index : MultiIndex + the index of the join between the + common levels of left and right + lindexer : intp array + left indexer + rindexer : intp array + right indexer + + Returns + ------- + levels : list of Index + levels of combined multiindexes + labels : intp array + labels of combined multiindexes + names : str array + names of combined multiindexes + + """ + + def _convert_to_mulitindex(index): + if isinstance(index, MultiIndex): + return index + else: + return MultiIndex.from_arrays([index.values], + names=[index.name]) + + # For multi-multi joins with one overlapping level, + # the returned index if of type Index + # Assure that join_index is of type MultiIndex + # so that dropped levels can be appended + join_index = _convert_to_mulitindex(join_index) + + join_levels = join_index.levels + join_labels = join_index.labels + join_names = join_index.names + + # lindexer and rindexer hold the indexes where the join occurred + # for left and right respectively. 
If left/right is None then + # the join occurred on all indices of left/right + if lindexer is None: + lindexer = range(left.size) + + if rindexer is None: + rindexer = range(right.size) + + # Iterate through the levels that must be restored + for dropped_level_name in dropped_level_names: + if dropped_level_name in left.names: + idx = left + indexer = lindexer + else: + idx = right + indexer = rindexer + + # The index of the level name to be restored + name_idx = idx.names.index(dropped_level_name) + + restore_levels = idx.levels[name_idx] + # Inject -1 in the labels list where a join was not possible + # IOW indexer[i]=-1 + labels = idx.labels[name_idx] + restore_labels = algos.take_nd(labels, indexer, fill_value=-1) + + join_levels = join_levels + [restore_levels] + join_labels = join_labels + [restore_labels] + join_names = join_names + [dropped_level_name] + + return join_levels, join_labels, join_names + + class _OrderedMerge(_MergeOperation): _merge_type = 'ordered_merge' diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d9297cdc5ab3e..7ee88f223cd95 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -8,15 +8,14 @@ import numpy as np import pytest from numpy import nan -from numpy.random import randn import pandas as pd import pandas.util.testing as tm from pandas import (Categorical, CategoricalIndex, DataFrame, DatetimeIndex, - Float64Index, Index, Int64Index, MultiIndex, RangeIndex, + Float64Index, Int64Index, MultiIndex, RangeIndex, Series, UInt64Index) from pandas.api.types import CategoricalDtype as CDT -from pandas.compat import lrange, lzip +from pandas.compat import lrange from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.reshape.concat import concat @@ -920,521 +919,6 @@ def _check_merge(x, y): assert_frame_equal(result, expected, check_names=False) 
-class TestMergeMulti(object): - - def setup_method(self, method): - self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.to_join = DataFrame(np.random.randn(10, 3), index=self.index, - columns=['j_one', 'j_two', 'j_three']) - - # a little relevant example with NAs - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] - - data = np.random.randn(len(key1)) - self.data = DataFrame({'key1': key1, 'key2': key2, - 'data': data}) - - def test_merge_on_multikey(self): - joined = self.data.join(self.to_join, on=['key1', 'key2']) - - join_key = Index(lzip(self.data['key1'], self.data['key2'])) - indexer = self.to_join.index.get_indexer(join_key) - ex_values = self.to_join.values.take(indexer, axis=0) - ex_values[indexer == -1] = np.nan - expected = self.data.join(DataFrame(ex_values, - columns=self.to_join.columns)) - - # TODO: columns aren't in the same order yet - assert_frame_equal(joined, expected.loc[:, joined.columns]) - - left = self.data.join(self.to_join, on=['key1', 'key2'], sort=True) - right = expected.loc[:, joined.columns].sort_values(['key1', 'key2'], - kind='mergesort') - assert_frame_equal(left, right) - - def test_left_join_multi_index(self): - icols = ['1st', '2nd', '3rd'] - - def bind_cols(df): - iord = lambda a: 0 if a != a else ord(a) - f = lambda ts: ts.map(iord) - ord('a') - return (f(df['1st']) + f(df['3rd']) * 1e2 + - df['2nd'].fillna(0) * 1e4) - - def run_asserts(left, right): - for sort in [False, True]: - res = left.join(right, on=icols, how='left', sort=sort) - - assert len(left) < len(res) + 1 - assert not res['4th'].isna().any() - assert not res['5th'].isna().any() - - tm.assert_series_equal( - res['4th'], - res['5th'], check_names=False) - result = bind_cols(res.iloc[:, :-2]) - 
tm.assert_series_equal(res['4th'], result, check_names=False) - assert result.name is None - - if sort: - tm.assert_frame_equal( - res, res.sort_values(icols, kind='mergesort')) - - out = merge(left, right.reset_index(), on=icols, - sort=sort, how='left') - - res.index = np.arange(len(res)) - tm.assert_frame_equal(out, res) - - lc = list(map(chr, np.arange(ord('a'), ord('z') + 1))) - left = DataFrame(np.random.choice(lc, (5000, 2)), - columns=['1st', '3rd']) - left.insert(1, '2nd', np.random.randint(0, 1000, len(left))) - - i = np.random.permutation(len(left)) - right = left.iloc[i].copy() - - left['4th'] = bind_cols(left) - right['5th'] = - bind_cols(right) - right.set_index(icols, inplace=True) - - run_asserts(left, right) - - # inject some nulls - left.loc[1::23, '1st'] = np.nan - left.loc[2::37, '2nd'] = np.nan - left.loc[3::43, '3rd'] = np.nan - left['4th'] = bind_cols(left) - - i = np.random.permutation(len(left)) - right = left.iloc[i, :-1] - right['5th'] = - bind_cols(right) - right.set_index(icols, inplace=True) - - run_asserts(left, right) - - def test_merge_right_vs_left(self): - # compare left vs right merge with multikey - for sort in [False, True]: - merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'], - right_index=True, how='left', sort=sort) - - merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'], - left_index=True, how='right', - sort=sort) - - merged2 = merged2.loc[:, merged1.columns] - assert_frame_equal(merged1, merged2) - - def test_compress_group_combinations(self): - - # ~ 40000000 possible unique groups - key1 = tm.rands_array(10, 10000) - key1 = np.tile(key1, 2) - key2 = key1[::-1] - - df = DataFrame({'key1': key1, 'key2': key2, - 'value1': np.random.randn(20000)}) - - df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2], - 'value2': np.random.randn(10000)}) - - # just to hit the label compression code path - merge(df, df2, how='outer') - - def test_left_join_index_preserve_order(self): - - left = 
DataFrame({'k1': [0, 1, 2] * 8, - 'k2': ['foo', 'bar'] * 12, - 'v': np.array(np.arange(24), dtype=np.int64)}) - - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) - right = DataFrame({'v2': [5, 7]}, index=index) - - result = left.join(right, on=['k1', 'k2']) - - expected = left.copy() - expected['v2'] = np.nan - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 - - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal( - result.sort_values(['k1', 'k2'], kind='mergesort'), - left.join(right, on=['k1', 'k2'], sort=True)) - - # test join with multi dtypes blocks - left = DataFrame({'k1': [0, 1, 2] * 8, - 'k2': ['foo', 'bar'] * 12, - 'k3': np.array([0, 1, 2] * 8, dtype=np.float32), - 'v': np.array(np.arange(24), dtype=np.int32)}) - - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) - right = DataFrame({'v2': [5, 7]}, index=index) - - result = left.join(right, on=['k1', 'k2']) - - expected = left.copy() - expected['v2'] = np.nan - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 - - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal( - result.sort_values(['k1', 'k2'], kind='mergesort'), - left.join(right, on=['k1', 'k2'], sort=True)) - - # do a right join for an extra test - joined = merge(right, left, left_index=True, - right_on=['k1', 'k2'], how='right') - tm.assert_frame_equal(joined.loc[:, expected.columns], expected) - - def test_left_join_index_multi_match_multiindex(self): - left = DataFrame([ - ['X', 'Y', 'C', 'a'], - ['W', 'Y', 'C', 'e'], - ['V', 'Q', 'A', 'h'], - ['V', 'R', 'D', 'i'], - ['X', 'Y', 'D', 'b'], - ['X', 'Y', 'A', 'c'], - ['W', 'Q', 'B', 'f'], - ['W', 'R', 'C', 'g'], - ['V', 'Y', 'C', 'j'], - ['X', 'Y', 'B', 'd']], - columns=['cola', 'colb', 'colc', 'tag'], - index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8]) - - right = DataFrame([ - ['W', 'R', 
'C', 0], - ['W', 'Q', 'B', 3], - ['W', 'Q', 'B', 8], - ['X', 'Y', 'A', 1], - ['X', 'Y', 'A', 4], - ['X', 'Y', 'B', 5], - ['X', 'Y', 'C', 6], - ['X', 'Y', 'C', 9], - ['X', 'Q', 'C', -6], - ['X', 'R', 'C', -9], - ['V', 'Y', 'C', 7], - ['V', 'R', 'D', 2], - ['V', 'R', 'D', -1], - ['V', 'Q', 'A', -3]], - columns=['col1', 'col2', 'col3', 'val']) - - right.set_index(['col1', 'col2', 'col3'], inplace=True) - result = left.join(right, on=['cola', 'colb', 'colc'], how='left') - - expected = DataFrame([ - ['X', 'Y', 'C', 'a', 6], - ['X', 'Y', 'C', 'a', 9], - ['W', 'Y', 'C', 'e', nan], - ['V', 'Q', 'A', 'h', -3], - ['V', 'R', 'D', 'i', 2], - ['V', 'R', 'D', 'i', -1], - ['X', 'Y', 'D', 'b', nan], - ['X', 'Y', 'A', 'c', 1], - ['X', 'Y', 'A', 'c', 4], - ['W', 'Q', 'B', 'f', 3], - ['W', 'Q', 'B', 'f', 8], - ['W', 'R', 'C', 'g', 0], - ['V', 'Y', 'C', 'j', 7], - ['X', 'Y', 'B', 'd', 5]], - columns=['cola', 'colb', 'colc', 'tag', 'val'], - index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8]) - - tm.assert_frame_equal(result, expected) - - result = left.join(right, on=['cola', 'colb', 'colc'], - how='left', sort=True) - - tm.assert_frame_equal( - result, - expected.sort_values(['cola', 'colb', 'colc'], kind='mergesort')) - - # GH7331 - maintain left frame order in left merge - right.reset_index(inplace=True) - right.columns = left.columns[:3].tolist() + right.columns[-1:].tolist() - result = merge(left, right, how='left', on=left.columns[:-1].tolist()) - expected.index = np.arange(len(expected)) - tm.assert_frame_equal(result, expected) - - def test_left_join_index_multi_match(self): - left = DataFrame([ - ['c', 0], - ['b', 1], - ['a', 2], - ['b', 3]], - columns=['tag', 'val'], - index=[2, 0, 1, 3]) - - right = DataFrame([ - ['a', 'v'], - ['c', 'w'], - ['c', 'x'], - ['d', 'y'], - ['a', 'z'], - ['c', 'r'], - ['e', 'q'], - ['c', 's']], - columns=['tag', 'char']) - - right.set_index('tag', inplace=True) - result = left.join(right, on='tag', how='left') - - expected = DataFrame([ - ['c', 0, 
'w'], - ['c', 0, 'x'], - ['c', 0, 'r'], - ['c', 0, 's'], - ['b', 1, nan], - ['a', 2, 'v'], - ['a', 2, 'z'], - ['b', 3, nan]], - columns=['tag', 'val', 'char'], - index=[2, 2, 2, 2, 0, 1, 1, 3]) - - tm.assert_frame_equal(result, expected) - - result = left.join(right, on='tag', how='left', sort=True) - tm.assert_frame_equal( - result, expected.sort_values('tag', kind='mergesort')) - - # GH7331 - maintain left frame order in left merge - result = merge(left, right.reset_index(), how='left', on='tag') - expected.index = np.arange(len(expected)) - tm.assert_frame_equal(result, expected) - - def test_left_merge_na_buglet(self): - left = DataFrame({'id': list('abcde'), 'v1': randn(5), - 'v2': randn(5), 'dummy': list('abcde'), - 'v3': randn(5)}, - columns=['id', 'v1', 'v2', 'dummy', 'v3']) - right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan], - 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]}) - - merged = merge(left, right, on='id', how='left') - - rdf = right.drop(['id'], axis=1) - expected = left.join(rdf) - tm.assert_frame_equal(merged, expected) - - def test_merge_na_keys(self): - data = [[1950, "A", 1.5], - [1950, "B", 1.5], - [1955, "B", 1.5], - [1960, "B", np.nan], - [1970, "B", 4.], - [1950, "C", 4.], - [1960, "C", np.nan], - [1965, "C", 3.], - [1970, "C", 4.]] - - frame = DataFrame(data, columns=["year", "panel", "data"]) - - other_data = [[1960, 'A', np.nan], - [1970, 'A', np.nan], - [1955, 'A', np.nan], - [1965, 'A', np.nan], - [1965, 'B', np.nan], - [1955, 'C', np.nan]] - other = DataFrame(other_data, columns=['year', 'panel', 'data']) - - result = frame.merge(other, how='outer') - - expected = frame.fillna(-999).merge(other.fillna(-999), how='outer') - expected = expected.replace(-999, np.nan) - - tm.assert_frame_equal(result, expected) - - def test_join_multi_levels(self): - - # GH 3662 - # merge multi-levels - household = ( - DataFrame( - dict(household_id=[1, 2, 3], - male=[0, 1, 0], - wealth=[196087.3, 316478.7, 294750]), - 
columns=['household_id', 'male', 'wealth']) - .set_index('household_id')) - portfolio = ( - DataFrame( - dict(household_id=[1, 2, 2, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "nl0000289965", - np.nan], - name=["ABN Amro", "Robeco", "Royal Dutch Shell", - "Royal Dutch Shell", - "AAB Eastern Europe Equity Fund", - "Postbank BioTech Fonds", np.nan], - share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), - columns=['household_id', 'asset_id', 'name', 'share']) - .set_index(['household_id', 'asset_id'])) - result = household.join(portfolio, how='inner') - expected = ( - DataFrame( - dict(male=[0, 1, 1, 0, 0, 0], - wealth=[196087.3, 316478.7, 316478.7, - 294750.0, 294750.0, 294750.0], - name=['ABN Amro', 'Robeco', 'Royal Dutch Shell', - 'Royal Dutch Shell', - 'AAB Eastern Europe Equity Fund', - 'Postbank BioTech Fonds'], - share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], - household_id=[1, 2, 2, 3, 3, 3], - asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29', - 'gb00b03mlx29', 'lu0197800237', - 'nl0000289965'])) - .set_index(['household_id', 'asset_id']) - .reindex(columns=['male', 'wealth', 'name', 'share'])) - assert_frame_equal(result, expected) - - assert_frame_equal(result, expected) - - # equivalency - result2 = (merge(household.reset_index(), portfolio.reset_index(), - on=['household_id'], how='inner') - .set_index(['household_id', 'asset_id'])) - assert_frame_equal(result2, expected) - - result = household.join(portfolio, how='outer') - expected = (concat([ - expected, - (DataFrame( - dict(share=[1.00]), - index=MultiIndex.from_tuples( - [(4, np.nan)], - names=['household_id', 'asset_id']))) - ], axis=0, sort=True).reindex(columns=expected.columns)) - assert_frame_equal(result, expected) - - # invalid cases - household.index.name = 'foo' - - def f(): - household.join(portfolio, how='inner') - - pytest.raises(ValueError, f) - - portfolio2 = portfolio.copy() - portfolio2.index.set_names(['household_id', 
'foo']) - - def f(): - portfolio2.join(portfolio, how='inner') - - pytest.raises(ValueError, f) - - def test_join_multi_levels2(self): - - # some more advanced merges - # GH6360 - household = ( - DataFrame( - dict(household_id=[1, 2, 2, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "nl0000289965", - np.nan], - share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), - columns=['household_id', 'asset_id', 'share']) - .set_index(['household_id', 'asset_id'])) - - log_return = DataFrame(dict( - asset_id=["gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "lu0197800237"], - t=[233, 234, 235, 180, 181], - log_return=[.09604978, -.06524096, .03532373, .03025441, .036997] - )).set_index(["asset_id", "t"]) - - expected = ( - DataFrame(dict( - household_id=[2, 2, 2, 3, 3, 3, 3, 3], - asset_id=["gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", - "lu0197800237", "lu0197800237"], - t=[233, 234, 235, 233, 234, 235, 180, 181], - share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], - log_return=[.09604978, -.06524096, .03532373, - .09604978, -.06524096, .03532373, - .03025441, .036997] - )) - .set_index(["household_id", "asset_id", "t"]) - .reindex(columns=['share', 'log_return'])) - - def f(): - household.join(log_return, how='inner') - - pytest.raises(NotImplementedError, f) - - # this is the equivalency - result = (merge(household.reset_index(), log_return.reset_index(), - on=['asset_id'], how='inner') - .set_index(['household_id', 'asset_id', 't'])) - assert_frame_equal(result, expected) - - expected = ( - DataFrame(dict( - household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", - "lu0197800237", "lu0197800237", - "nl0000289965", None], - t=[None, None, 233, 234, 235, 233, 234, - 235, 180, 181, None, None], - share=[1.0, 0.4, 
0.6, 0.6, 0.6, 0.15, - 0.15, 0.15, 0.6, 0.6, 0.25, 1.0], - log_return=[None, None, .09604978, -.06524096, .03532373, - .09604978, -.06524096, .03532373, - .03025441, .036997, None, None] - )) - .set_index(["household_id", "asset_id", "t"])) - - def f(): - household.join(log_return, how='outer') - - pytest.raises(NotImplementedError, f) - - @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index]) - def test_merge_datetime_index(self, klass): - # see gh-19038 - df = DataFrame([1, 2, 3], - ["2016-01-01", "2017-01-01", "2018-01-01"], - columns=["a"]) - df.index = pd.to_datetime(df.index) - on_vector = df.index.year - - if klass is not None: - on_vector = klass(on_vector) - - expected = DataFrame( - OrderedDict([ - ("a", [1, 2, 3]), - ("key_1", [2016, 2017, 2018]), - ]) - ) - - result = df.merge(df, on=["a", on_vector], how="inner") - tm.assert_frame_equal(result, expected) - - expected = DataFrame( - OrderedDict([ - ("key_0", [2016, 2017, 2018]), - ("a_x", [1, 2, 3]), - ("a_y", [1, 2, 3]), - ]) - ) - - result = df.merge(df, on=[df.index.year], how="inner") - tm.assert_frame_equal(result, expected) - - class TestMergeDtypes(object): @pytest.mark.parametrize('right_vals', [ diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py new file mode 100644 index 0000000000000..a1158201844b0 --- /dev/null +++ b/pandas/tests/reshape/merge/test_multi.py @@ -0,0 +1,672 @@ +# pylint: disable=E1103 + +from collections import OrderedDict + +import numpy as np +from numpy import nan +from numpy.random import randn +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series +from pandas.core.reshape.concat import concat +from pandas.core.reshape.merge import merge +import pandas.util.testing as tm + + +@pytest.fixture +def left(): + """left dataframe (not multi-indexed) for multi-index join tests""" + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 
'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + return DataFrame({'key1': key1, 'key2': key2, 'data': data}) + + +@pytest.fixture +def right(): + """right dataframe (multi-indexed) for multi-index join tests""" + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['key1', 'key2']) + + return DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + +@pytest.fixture +def left_multi(): + return ( + DataFrame( + dict(Origin=['A', 'A', 'B', 'B', 'C'], + Destination=['A', 'B', 'A', 'C', 'A'], + Period=['AM', 'AM', 'IP', 'AM', 'OP'], + TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'], + Trips=[1987, 3647, 2470, 4296, 4444]), + columns=['Origin', 'Destination', 'Period', + 'TripPurp', 'Trips']) + .set_index(['Origin', 'Destination', 'Period', 'TripPurp'])) + + +@pytest.fixture +def right_multi(): + return ( + DataFrame( + dict(Origin=['A', 'A', 'B', 'B', 'C', 'C', 'E'], + Destination=['A', 'B', 'A', 'B', 'A', 'B', 'F'], + Period=['AM', 'AM', 'IP', 'AM', 'OP', 'IP', 'AM'], + LinkType=['a', 'b', 'c', 'b', 'a', 'b', 'a'], + Distance=[100, 80, 90, 80, 75, 35, 55]), + columns=['Origin', 'Destination', 'Period', + 'LinkType', 'Distance']) + .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) + + +@pytest.fixture +def on_cols_multi(): + return ['Origin', 'Destination', 'Period'] + + +@pytest.fixture +def idx_cols_multi(): + return ['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType'] + + +class TestMergeMulti(object): + + def setup_method(self): + self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.to_join = DataFrame(np.random.randn(10, 3), index=self.index, + 
columns=['j_one', 'j_two', 'j_three']) + + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + self.data = DataFrame({'key1': key1, 'key2': key2, + 'data': data}) + + def test_merge_on_multikey(self, left, right, join_type): + on_cols = ['key1', 'key2'] + result = (left.join(right, on=on_cols, how=join_type) + .reset_index(drop=True)) + + expected = pd.merge(left, right.reset_index(), + on=on_cols, how=join_type) + + tm.assert_frame_equal(result, expected) + + result = (left.join(right, on=on_cols, how=join_type, sort=True) + .reset_index(drop=True)) + + expected = pd.merge(left, right.reset_index(), + on=on_cols, how=join_type, sort=True) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("sort", [False, True]) + def test_left_join_multi_index(self, left, right, sort): + icols = ['1st', '2nd', '3rd'] + + def bind_cols(df): + iord = lambda a: 0 if a != a else ord(a) + f = lambda ts: ts.map(iord) - ord('a') + return (f(df['1st']) + f(df['3rd']) * 1e2 + + df['2nd'].fillna(0) * 1e4) + + def run_asserts(left, right, sort): + res = left.join(right, on=icols, how='left', sort=sort) + + assert len(left) < len(res) + 1 + assert not res['4th'].isna().any() + assert not res['5th'].isna().any() + + tm.assert_series_equal( + res['4th'], - res['5th'], check_names=False) + result = bind_cols(res.iloc[:, :-2]) + tm.assert_series_equal(res['4th'], result, check_names=False) + assert result.name is None + + if sort: + tm.assert_frame_equal( + res, res.sort_values(icols, kind='mergesort')) + + out = merge(left, right.reset_index(), on=icols, + sort=sort, how='left') + + res.index = np.arange(len(res)) + tm.assert_frame_equal(out, res) + + lc = list(map(chr, np.arange(ord('a'), ord('z') + 1))) + left = DataFrame(np.random.choice(lc, (5000, 2)), + columns=['1st', '3rd']) + 
left.insert(1, '2nd', np.random.randint(0, 1000, len(left))) + + i = np.random.permutation(len(left)) + right = left.iloc[i].copy() + + left['4th'] = bind_cols(left) + right['5th'] = - bind_cols(right) + right.set_index(icols, inplace=True) + + run_asserts(left, right, sort) + + # inject some nulls + left.loc[1::23, '1st'] = np.nan + left.loc[2::37, '2nd'] = np.nan + left.loc[3::43, '3rd'] = np.nan + left['4th'] = bind_cols(left) + + i = np.random.permutation(len(left)) + right = left.iloc[i, :-1] + right['5th'] = - bind_cols(right) + right.set_index(icols, inplace=True) + + run_asserts(left, right, sort) + + @pytest.mark.parametrize("sort", [False, True]) + def test_merge_right_vs_left(self, left, right, sort): + # compare left vs right merge with multikey + on_cols = ['key1', 'key2'] + merged_left_right = left.merge(right, + left_on=on_cols, right_index=True, + how='left', sort=sort) + + merge_right_left = right.merge(left, + right_on=on_cols, left_index=True, + how='right', sort=sort) + + # Reorder columns + merge_right_left = merge_right_left[merged_left_right.columns] + + tm.assert_frame_equal(merged_left_right, merge_right_left) + + def test_compress_group_combinations(self): + + # ~ 40000000 possible unique groups + key1 = tm.rands_array(10, 10000) + key1 = np.tile(key1, 2) + key2 = key1[::-1] + + df = DataFrame({'key1': key1, 'key2': key2, + 'value1': np.random.randn(20000)}) + + df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2], + 'value2': np.random.randn(10000)}) + + # just to hit the label compression code path + merge(df, df2, how='outer') + + def test_left_join_index_preserve_order(self): + + on_cols = ['k1', 'k2'] + left = DataFrame({'k1': [0, 1, 2] * 8, + 'k2': ['foo', 'bar'] * 12, + 'v': np.array(np.arange(24), dtype=np.int64)}) + + index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) + right = DataFrame({'v2': [5, 7]}, index=index) + + result = left.join(right, on=on_cols) + + expected = left.copy() + expected['v2'] = np.nan + 
expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 + + tm.assert_frame_equal(result, expected) + + result.sort_values(on_cols, kind='mergesort', inplace=True) + expected = left.join(right, on=on_cols, sort=True) + + tm.assert_frame_equal(result, expected) + + # test join with multi dtypes blocks + left = DataFrame({'k1': [0, 1, 2] * 8, + 'k2': ['foo', 'bar'] * 12, + 'k3': np.array([0, 1, 2] * 8, dtype=np.float32), + 'v': np.array(np.arange(24), dtype=np.int32)}) + + index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) + right = DataFrame({'v2': [5, 7]}, index=index) + + result = left.join(right, on=on_cols) + + expected = left.copy() + expected['v2'] = np.nan + expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 + + tm.assert_frame_equal(result, expected) + + result = result.sort_values(on_cols, kind='mergesort') + expected = left.join(right, on=on_cols, sort=True) + + tm.assert_frame_equal(result, expected) + + def test_left_join_index_multi_match_multiindex(self): + left = DataFrame([ + ['X', 'Y', 'C', 'a'], + ['W', 'Y', 'C', 'e'], + ['V', 'Q', 'A', 'h'], + ['V', 'R', 'D', 'i'], + ['X', 'Y', 'D', 'b'], + ['X', 'Y', 'A', 'c'], + ['W', 'Q', 'B', 'f'], + ['W', 'R', 'C', 'g'], + ['V', 'Y', 'C', 'j'], + ['X', 'Y', 'B', 'd']], + columns=['cola', 'colb', 'colc', 'tag'], + index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8]) + + right = (DataFrame([ + ['W', 'R', 'C', 0], + ['W', 'Q', 'B', 3], + ['W', 'Q', 'B', 8], + ['X', 'Y', 'A', 1], + ['X', 'Y', 'A', 4], + ['X', 'Y', 'B', 5], + ['X', 'Y', 'C', 6], + ['X', 'Y', 'C', 9], + ['X', 'Q', 'C', -6], + ['X', 'R', 'C', -9], + ['V', 'Y', 'C', 7], + ['V', 'R', 'D', 2], + ['V', 'R', 'D', -1], + ['V', 'Q', 'A', -3]], + columns=['col1', 'col2', 'col3', 'val']) + .set_index(['col1', 'col2', 'col3'])) + + result = left.join(right, on=['cola', 'colb', 'colc'], how='left') + + 
expected = DataFrame([ + ['X', 'Y', 'C', 'a', 6], + ['X', 'Y', 'C', 'a', 9], + ['W', 'Y', 'C', 'e', nan], + ['V', 'Q', 'A', 'h', -3], + ['V', 'R', 'D', 'i', 2], + ['V', 'R', 'D', 'i', -1], + ['X', 'Y', 'D', 'b', nan], + ['X', 'Y', 'A', 'c', 1], + ['X', 'Y', 'A', 'c', 4], + ['W', 'Q', 'B', 'f', 3], + ['W', 'Q', 'B', 'f', 8], + ['W', 'R', 'C', 'g', 0], + ['V', 'Y', 'C', 'j', 7], + ['X', 'Y', 'B', 'd', 5]], + columns=['cola', 'colb', 'colc', 'tag', 'val'], + index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8]) + + tm.assert_frame_equal(result, expected) + + result = left.join(right, on=['cola', 'colb', 'colc'], + how='left', sort=True) + + expected = expected.sort_values(['cola', 'colb', 'colc'], + kind='mergesort') + + tm.assert_frame_equal(result, expected) + + def test_left_join_index_multi_match(self): + left = DataFrame([ + ['c', 0], + ['b', 1], + ['a', 2], + ['b', 3]], + columns=['tag', 'val'], + index=[2, 0, 1, 3]) + + right = (DataFrame([ + ['a', 'v'], + ['c', 'w'], + ['c', 'x'], + ['d', 'y'], + ['a', 'z'], + ['c', 'r'], + ['e', 'q'], + ['c', 's']], + columns=['tag', 'char']) + .set_index('tag')) + + result = left.join(right, on='tag', how='left') + + expected = DataFrame([ + ['c', 0, 'w'], + ['c', 0, 'x'], + ['c', 0, 'r'], + ['c', 0, 's'], + ['b', 1, nan], + ['a', 2, 'v'], + ['a', 2, 'z'], + ['b', 3, nan]], + columns=['tag', 'val', 'char'], + index=[2, 2, 2, 2, 0, 1, 1, 3]) + + tm.assert_frame_equal(result, expected) + + result = left.join(right, on='tag', how='left', sort=True) + expected2 = expected.sort_values('tag', kind='mergesort') + + tm.assert_frame_equal(result, expected2) + + # GH7331 - maintain left frame order in left merge + result = merge(left, right.reset_index(), how='left', on='tag') + expected.index = np.arange(len(expected)) + tm.assert_frame_equal(result, expected) + + def test_left_merge_na_buglet(self): + left = DataFrame({'id': list('abcde'), 'v1': randn(5), + 'v2': randn(5), 'dummy': list('abcde'), + 'v3': randn(5)}, + columns=['id', 
'v1', 'v2', 'dummy', 'v3']) + right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan], + 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]}) + + result = merge(left, right, on='id', how='left') + + rdf = right.drop(['id'], axis=1) + expected = left.join(rdf) + tm.assert_frame_equal(result, expected) + + def test_merge_na_keys(self): + data = [[1950, "A", 1.5], + [1950, "B", 1.5], + [1955, "B", 1.5], + [1960, "B", np.nan], + [1970, "B", 4.], + [1950, "C", 4.], + [1960, "C", np.nan], + [1965, "C", 3.], + [1970, "C", 4.]] + + frame = DataFrame(data, columns=["year", "panel", "data"]) + + other_data = [[1960, 'A', np.nan], + [1970, 'A', np.nan], + [1955, 'A', np.nan], + [1965, 'A', np.nan], + [1965, 'B', np.nan], + [1955, 'C', np.nan]] + other = DataFrame(other_data, columns=['year', 'panel', 'data']) + + result = frame.merge(other, how='outer') + + expected = frame.fillna(-999).merge(other.fillna(-999), how='outer') + expected = expected.replace(-999, np.nan) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index]) + def test_merge_datetime_index(self, klass): + # see gh-19038 + df = DataFrame([1, 2, 3], + ["2016-01-01", "2017-01-01", "2018-01-01"], + columns=["a"]) + df.index = pd.to_datetime(df.index) + on_vector = df.index.year + + if klass is not None: + on_vector = klass(on_vector) + + expected = DataFrame( + OrderedDict([ + ("a", [1, 2, 3]), + ("key_1", [2016, 2017, 2018]), + ]) + ) + + result = df.merge(df, on=["a", on_vector], how="inner") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + OrderedDict([ + ("key_0", [2016, 2017, 2018]), + ("a_x", [1, 2, 3]), + ("a_y", [1, 2, 3]), + ]) + ) + + result = df.merge(df, on=[df.index.year], how="inner") + tm.assert_frame_equal(result, expected) + + def test_join_multi_levels(self): + + # GH 3662 + # merge multi-levels + household = ( + DataFrame( + dict(household_id=[1, 2, 3], + male=[0, 1, 0], + wealth=[196087.3, 316478.7, 294750]), + 
columns=['household_id', 'male', 'wealth']) + .set_index('household_id')) + portfolio = ( + DataFrame( + dict(household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29", + "gb00b03mlx29", "lu0197800237", "nl0000289965", + np.nan], + name=["ABN Amro", "Robeco", "Royal Dutch Shell", + "Royal Dutch Shell", + "AAB Eastern Europe Equity Fund", + "Postbank BioTech Fonds", np.nan], + share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), + columns=['household_id', 'asset_id', 'name', 'share']) + .set_index(['household_id', 'asset_id'])) + result = household.join(portfolio, how='inner') + expected = ( + DataFrame( + dict(male=[0, 1, 1, 0, 0, 0], + wealth=[196087.3, 316478.7, 316478.7, + 294750.0, 294750.0, 294750.0], + name=['ABN Amro', 'Robeco', 'Royal Dutch Shell', + 'Royal Dutch Shell', + 'AAB Eastern Europe Equity Fund', + 'Postbank BioTech Fonds'], + share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], + household_id=[1, 2, 2, 3, 3, 3], + asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29', + 'gb00b03mlx29', 'lu0197800237', + 'nl0000289965'])) + .set_index(['household_id', 'asset_id']) + .reindex(columns=['male', 'wealth', 'name', 'share'])) + tm.assert_frame_equal(result, expected) + + # equivalency + result = (merge(household.reset_index(), portfolio.reset_index(), + on=['household_id'], how='inner') + .set_index(['household_id', 'asset_id'])) + tm.assert_frame_equal(result, expected) + + result = household.join(portfolio, how='outer') + expected = (concat([ + expected, + (DataFrame( + dict(share=[1.00]), + index=MultiIndex.from_tuples( + [(4, np.nan)], + names=['household_id', 'asset_id']))) + ], axis=0, sort=True).reindex(columns=expected.columns)) + tm.assert_frame_equal(result, expected) + + # invalid cases + household.index.name = 'foo' + + def f(): + household.join(portfolio, how='inner') + + pytest.raises(ValueError, f) + + portfolio2 = portfolio.copy() + portfolio2.index.set_names(['household_id', 'foo']) + + def f(): + 
portfolio2.join(portfolio, how='inner') + + pytest.raises(ValueError, f) + + def test_join_multi_levels2(self): + + # some more advanced merges + # GH6360 + household = ( + DataFrame( + dict(household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", + "gb00b03mlx29", "lu0197800237", "nl0000289965", + np.nan], + share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), + columns=['household_id', 'asset_id', 'share']) + .set_index(['household_id', 'asset_id'])) + + log_return = DataFrame(dict( + asset_id=["gb00b03mlx29", "gb00b03mlx29", + "gb00b03mlx29", "lu0197800237", "lu0197800237"], + t=[233, 234, 235, 180, 181], + log_return=[.09604978, -.06524096, .03532373, .03025441, .036997] + )).set_index(["asset_id", "t"]) + + expected = ( + DataFrame(dict( + household_id=[2, 2, 2, 3, 3, 3, 3, 3], + asset_id=["gb00b03mlx29", "gb00b03mlx29", + "gb00b03mlx29", "gb00b03mlx29", + "gb00b03mlx29", "gb00b03mlx29", + "lu0197800237", "lu0197800237"], + t=[233, 234, 235, 233, 234, 235, 180, 181], + share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], + log_return=[.09604978, -.06524096, .03532373, + .09604978, -.06524096, .03532373, + .03025441, .036997] + )) + .set_index(["household_id", "asset_id", "t"]) + .reindex(columns=['share', 'log_return'])) + + # this is the equivalency + result = (merge(household.reset_index(), log_return.reset_index(), + on=['asset_id'], how='inner') + .set_index(['household_id', 'asset_id', 't'])) + tm.assert_frame_equal(result, expected) + + expected = ( + DataFrame(dict( + household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], + asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", + "gb00b03mlx29", "gb00b03mlx29", + "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", + "lu0197800237", "lu0197800237", + "nl0000289965", None], + t=[None, None, 233, 234, 235, 233, 234, + 235, 180, 181, None, None], + share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15, + 0.15, 0.15, 0.6, 0.6, 0.25, 1.0], + log_return=[None, None, .09604978, -.06524096, .03532373, + 
.09604978, -.06524096, .03532373, + .03025441, .036997, None, None] + )) + .set_index(["household_id", "asset_id", "t"]) + .reindex(columns=['share', 'log_return'])) + + result = (merge(household.reset_index(), log_return.reset_index(), + on=['asset_id'], how='outer') + .set_index(['household_id', 'asset_id', 't'])) + + tm.assert_frame_equal(result, expected) + + +class TestJoinMultiMulti(object): + + def test_join_multi_multi(self, left_multi, right_multi, join_type, + on_cols_multi, idx_cols_multi): + # Multi-index join tests + expected = (pd.merge(left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, on=on_cols_multi). + set_index(idx_cols_multi).sort_index()) + + result = left_multi.join(right_multi, how=join_type).sort_index() + tm.assert_frame_equal(result, expected) + + def test_join_multi_empty_frames(self, left_multi, right_multi, join_type, + on_cols_multi, idx_cols_multi): + + left_multi = left_multi.drop(columns=left_multi.columns) + right_multi = right_multi.drop(columns=right_multi.columns) + + expected = (pd.merge(left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, on=on_cols_multi) + .set_index(idx_cols_multi).sort_index()) + + result = left_multi.join(right_multi, how=join_type).sort_index() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("box", [None, np.asarray, Series, Index]) + def test_merge_datetime_index(self, box): + # see gh-19038 + df = DataFrame([1, 2, 3], + ["2016-01-01", "2017-01-01", "2018-01-01"], + columns=["a"]) + df.index = pd.to_datetime(df.index) + on_vector = df.index.year + + if box is not None: + on_vector = box(on_vector) + + expected = DataFrame( + OrderedDict([ + ("a", [1, 2, 3]), + ("key_1", [2016, 2017, 2018]), + ]) + ) + + result = df.merge(df, on=["a", on_vector], how="inner") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + OrderedDict([ + ("key_0", [2016, 2017, 2018]), + ("a_x", [1, 2, 3]), + ("a_y", [1, 2, 3]), + ]) + ) + + result = 
df.merge(df, on=[df.index.year], how="inner") + tm.assert_frame_equal(result, expected) + + def test_single_common_level(self): + index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), + ('K1', 'X2')], + names=['key', 'X']) + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], + 'B': ['B0', 'B1', 'B2']}, + index=index_left) + + index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), + ('K2', 'Y2'), ('K2', 'Y3')], + names=['key', 'Y']) + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}, + index=index_right) + + result = left.join(right) + expected = (pd.merge(left.reset_index(), right.reset_index(), + on=['key'], how='inner') + .set_index(['key', 'X', 'Y'])) + + tm.assert_frame_equal(result, expected) From e3e59bfcab6bfc43ba00f92137d71322430f2636 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Thu, 15 Nov 2018 14:23:07 +0100 Subject: [PATCH 07/10] CLN/CI: Catch that stderr-warning! 
(#23706) --- pandas/tests/io/parser/python_parser_only.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/io/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py index 590736f720e67..6a41b4636e532 100644 --- a/pandas/tests/io/parser/python_parser_only.py +++ b/pandas/tests/io/parser/python_parser_only.py @@ -8,6 +8,7 @@ """ import csv +import sys import pytest @@ -230,6 +231,7 @@ def test_multi_char_sep_quotes(self): self.read_csv(StringIO(data), sep=',,', quoting=csv.QUOTE_NONE) + @tm.capture_stderr def test_none_delimiter(self): # see gh-13374 and gh-17465 @@ -247,6 +249,9 @@ def test_none_delimiter(self): warn_bad_lines=True) tm.assert_frame_equal(result, expected) + warning = sys.stderr.getvalue() + assert 'Skipping line 3' in warning + def test_skipfooter_bad_row(self): # see gh-13879 # see gh-15910 From ee5e79f75fece9254b716fee6e596f72ebba9a07 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 15 Nov 2018 05:26:45 -0800 Subject: [PATCH 08/10] BUG: Append DataFrame to Series with dateutil timezone (#23685) --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/_libs/lib.pyx | 23 ++++++++++++----------- pandas/tests/reshape/test_concat.py | 15 +++++++++++++++ 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 83f034014bb3b..a48760a8a5d80 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1414,6 +1414,7 @@ Reshaping - Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`) - Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`) - Bug in :meth:`DataFrame.nsmallest` and :meth:`DataFrame.nlargest` for dataframes that have a :class:`MultiIndex` for columns (:issue:`23033`). 
+- Bug in :meth:`DataFrame.append` with a :class:`Series` with a dateutil timezone would raise a ``TypeError`` (:issue:`23682`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cfc60256e97a3..0088a698f49e0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -48,8 +48,7 @@ cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 cimport util -from util cimport (is_nan, - UINT8_MAX, UINT64_MAX, INT64_MAX, INT64_MIN) +from util cimport is_nan, UINT64_MAX, INT64_MAX, INT64_MIN from tslib import array_to_datetime from tslibs.nattype cimport NPY_NAT @@ -1642,20 +1641,22 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: if n == 0: return False - + # Get a reference timezone to compare with the rest of the tzs in the array for i in range(n): base_val = values[i] if base_val is not NaT: base_tz = get_timezone(getattr(base_val, 'tzinfo', None)) - - for j in range(i, n): - val = values[j] - if val is not NaT: - tz = getattr(val, 'tzinfo', None) - if not tz_compare(base_tz, tz): - return False break + for j in range(i, n): + # Compare val's timezone with the reference timezone + # NaT can coexist with tz-aware datetimes, so skip if encountered + val = values[j] + if val is not NaT: + tz = getattr(val, 'tzinfo', None) + if not tz_compare(base_tz, tz): + return False + return True @@ -2045,7 +2046,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, # we try to coerce datetime w/tz but must all have the same tz if seen.datetimetz_: - if len({getattr(val, 'tzinfo', None) for val in objects}) == 1: + if is_datetime_with_singletz_array(objects): from pandas import DatetimeIndex return DatetimeIndex(objects) seen.object_ = 1 diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index c7fba47a8f27c..07b00cef2669e 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ 
-1010,6 +1010,21 @@ def test_append_missing_column_proper_upcast(self, sort): assert appended['A'].dtype == 'f8' assert appended['B'].dtype == 'O' + def test_append_empty_frame_to_series_with_dateutil_tz(self): + # GH 23682 + date = Timestamp('2018-10-24 07:30:00', tz=dateutil.tz.tzutc()) + s = Series({'date': date, 'a': 1.0, 'b': 2.0}) + df = DataFrame(columns=['c', 'd']) + result = df.append(s, ignore_index=True) + expected = DataFrame([[np.nan, np.nan, 1., 2., date]], + columns=['c', 'd', 'a', 'b', 'date']) + # These columns get cast to object after append + object_cols = ['c', 'd', 'date'] + expected.loc[:, object_cols] = expected.loc[:, object_cols].astype( + object + ) + assert_frame_equal(result, expected) + class TestConcatenate(ConcatenateBase): From e50b5fee953472c4cff6ec0df5dfb0875f5a1a2b Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Thu, 15 Nov 2018 14:30:40 +0100 Subject: [PATCH 09/10] API/DEPR: replace "raise_conflict" with "errors" for df.update (#23657) --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/frame.py | 28 ++++++++++--- pandas/core/panel.py | 49 +++++++++++++++-------- pandas/tests/frame/test_combine_concat.py | 21 +++++++++- pandas/tests/test_panel.py | 23 +++++++++-- 5 files changed, 95 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a48760a8a5d80..52b002db06de0 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1022,6 +1022,7 @@ Deprecations - The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`). - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have deprecated the ``errors`` argument in favor of the ``nonexistent`` argument (:issue:`8917`) - The class ``FrozenNDArray`` has been deprecated. 
When unpickling, ``FrozenNDArray`` will be unpickled to ``np.ndarray`` once this class is removed (:issue:`9031`) +- The methods :meth:`DataFrame.update` and :meth:`Panel.update` have deprecated the ``raise_conflict=False|True`` keyword in favor of ``errors='ignore'|'raise'`` (:issue:`23585`) - Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`) - :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 511604517a84e..30ca7d82936c0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5213,8 +5213,10 @@ def combiner(x, y): return self.combine(other, combiner, overwrite=False) + @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors', + mapping={False: 'ignore', True: 'raise'}) def update(self, other, join='left', overwrite=True, filter_func=None, - raise_conflict=False): + errors='ignore'): """ Modify in place using non-NA values from another DataFrame. @@ -5238,17 +5240,28 @@ def update(self, other, join='left', overwrite=True, filter_func=None, * False: only update values that are NA in the original DataFrame. - filter_func : callable(1d-array) -> boolean 1d-array, optional + filter_func : callable(1d-array) -> bool 1d-array, optional Can choose to replace values other than NA. Return True for values that should be updated. - raise_conflict : bool, default False - If True, will raise a ValueError if the DataFrame and `other` + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise a ValueError if the DataFrame and `other` both contain non-NA data in the same place. + .. versionchanged :: 0.24.0 + Changed from `raise_conflict=False|True` + to `errors='ignore'|'raise'`. 
+ + Returns + ------- + None : method directly changes calling object + Raises ------ ValueError - When `raise_conflict` is True and there's overlapping non-NA data. + * When `errors='raise'` and there's overlapping non-NA data. + * When `errors` is not either `'ignore'` or `'raise'` + NotImplementedError + * If `join != 'left'` See Also -------- @@ -5319,6 +5332,9 @@ def update(self, other, join='left', overwrite=True, filter_func=None, # TODO: Support other joins if join != 'left': # pragma: no cover raise NotImplementedError("Only left join is supported") + if errors not in ['ignore', 'raise']: + raise ValueError("The parameter errors must be either " + "'ignore' or 'raise'") if not isinstance(other, DataFrame): other = DataFrame(other) @@ -5332,7 +5348,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, with np.errstate(all='ignore'): mask = ~filter_func(this) | isna(that) else: - if raise_conflict: + if errors == 'raise': mask_this = notna(that) mask_that = notna(this) if any(mask_this & mask_that): diff --git a/pandas/core/panel.py b/pandas/core/panel.py index c878d16fac2e9..5ae7848b5adc6 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -32,7 +32,7 @@ create_block_manager_from_blocks) from pandas.core.series import Series from pandas.core.reshape.util import cartesian_product -from pandas.util._decorators import Appender, Substitution +from pandas.util._decorators import Appender, Substitution, deprecate_kwarg from pandas.util._validators import validate_axis_style_args _shared_doc_kwargs = dict( @@ -1235,7 +1235,12 @@ def reindex(self, *args, **kwargs): kwargs.update(axes) kwargs.pop('axis', None) kwargs.pop('labels', None) - return super(Panel, self).reindex(**kwargs) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + # do not warn about constructing Panel when reindexing + result = super(Panel, self).reindex(**kwargs) + return result @Substitution(**_shared_doc_kwargs) 
@Appender(NDFrame.rename.__doc__) @@ -1377,25 +1382,37 @@ def join(self, other, how='left', lsuffix='', rsuffix=''): return concat([self] + list(other), axis=0, join=how, join_axes=join_axes, verify_integrity=True) + @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors', + mapping={False: 'ignore', True: 'raise'}) def update(self, other, join='left', overwrite=True, filter_func=None, - raise_conflict=False): + errors='ignore'): """ - Modify Panel in place using non-NA values from passed - Panel, or object coercible to Panel. Aligns on items + Modify Panel in place using non-NA values from other Panel. + + May also use object coercible to Panel. Will align on items. Parameters ---------- other : Panel, or object coercible to Panel - join : How to join individual DataFrames - {'left', 'right', 'outer', 'inner'}, default 'left' - overwrite : boolean, default True - If True then overwrite values for common keys in the calling panel - filter_func : callable(1d-array) -> 1d-array, default None + The object from which the caller will be updated. + join : {'left', 'right', 'outer', 'inner'}, default 'left' + How individual DataFrames are joined. + overwrite : bool, default True + If True then overwrite values for common keys in the calling Panel. + filter_func : callable(1d-array) -> 1d-array, default None Can choose to replace values other than NA. Return True for values - that should be updated - raise_conflict : bool - If True, will raise an error if a DataFrame and other both - contain data in the same place. + that should be updated. + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise an error if a DataFrame and other both contain non-NA data in the same place. + + .. versionchanged :: 0.24.0 + Changed from `raise_conflict=False|True` + to `errors='ignore'|'raise'`. + + See Also + -------- + DataFrame.update : Similar method for DataFrames. + dict.update : Similar method for dictionaries. 
""" if not isinstance(other, self._constructor): @@ -1406,8 +1423,8 @@ def update(self, other, join='left', overwrite=True, filter_func=None, other = other.reindex(**{axis_name: axis_values}) for frame in axis_values: - self[frame].update(other[frame], join, overwrite, filter_func, - raise_conflict) + self[frame].update(other[frame], join=join, overwrite=overwrite, + filter_func=filter_func, errors=errors) def _get_join_index(self, other, how): if how == 'left': diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 22c5d146e1a06..25c5222b5f03c 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -313,7 +313,17 @@ def test_update_filtered(self): [1.5, nan, 7.]]) assert_frame_equal(df, expected) - def test_update_raise(self): + @pytest.mark.parametrize('bad_kwarg, exception, msg', [ + # errors must be 'ignore' or 'raise' + ({'errors': 'something'}, ValueError, 'The parameter errors must.*'), + ({'join': 'inner'}, NotImplementedError, 'Only left join is supported') + ]) + def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): + df = DataFrame([[1.5, 1, 3.]]) + with pytest.raises(exception, match=msg): + df.update(df, **bad_kwarg) + + def test_update_raise_on_overlap(self): df = DataFrame([[1.5, 1, 3.], [1.5, nan, 3.], [1.5, nan, 3], @@ -322,7 +332,14 @@ def test_update_raise(self): other = DataFrame([[2., nan], [nan, 7]], index=[1, 3], columns=[1, 2]) with pytest.raises(ValueError, match="Data overlaps"): - df.update(other, raise_conflict=True) + df.update(other, errors='raise') + + @pytest.mark.parametrize('raise_conflict', [True, False]) + def test_update_deprecation(self, raise_conflict): + df = DataFrame([[1.5, 1, 3.]]) + other = DataFrame() + with tm.assert_produces_warning(FutureWarning): + df.update(other, raise_conflict=raise_conflict) def test_update_from_non_df(self): d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])} diff --git 
a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index bc644071e914f..0e45fd6411ac0 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2341,7 +2341,17 @@ def test_update_filtered(self): assert_panel_equal(pan, expected) - def test_update_raise(self): + @pytest.mark.parametrize('bad_kwarg, exception, msg', [ + # errors must be 'ignore' or 'raise' + ({'errors': 'something'}, ValueError, 'The parameter errors must.*'), + ({'join': 'inner'}, NotImplementedError, 'Only left join is supported') + ]) + def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): + pan = Panel([[[1.5, np.nan, 3.]]]) + with pytest.raises(exception, match=msg): + pan.update(pan, **bad_kwarg) + + def test_update_raise_on_overlap(self): pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.]], @@ -2349,8 +2359,15 @@ def test_update_raise(self): [1.5, np.nan, 3.], [1.5, np.nan, 3.]]]) - pytest.raises(Exception, pan.update, *(pan, ), - **{'raise_conflict': True}) + with pytest.raises(ValueError, match='Data overlaps'): + pan.update(pan, errors='raise') + + @pytest.mark.parametrize('raise_conflict', [True, False]) + def test_update_deprecation(self, raise_conflict): + pan = Panel([[[1.5, np.nan, 3.]]]) + other = Panel([[[]]]) + with tm.assert_produces_warning(FutureWarning): + pan.update(other, raise_conflict=raise_conflict) def test_all_any(self): assert (self.panel.all(axis=0).values == nanall( From 8af76370a10399b539e93e2533ad7f25d77263c5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 15 Nov 2018 13:49:16 +0000 Subject: [PATCH 10/10] BUG: to_html misses truncation indicators (...) 
when index=False (#22786) --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/io/formats/html.py | 35 ++++++++------- .../formats/data/gh15019_expected_output.html | 30 +++++++++++++ .../formats/data/gh22783_expected_output.html | 27 +++++++++++ pandas/tests/io/formats/test_to_html.py | 45 +++++++++++++++++++ 5 files changed, 122 insertions(+), 16 deletions(-) create mode 100644 pandas/tests/io/formats/data/gh15019_expected_output.html create mode 100644 pandas/tests/io/formats/data/gh22783_expected_output.html diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 52b002db06de0..1fb54ae55b90e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1361,6 +1361,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) - :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) - Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`) +- Bug in :func:`to_html()` with ``index=False`` misses truncation indicators (...) on truncated DataFrame (:issue:`15019`, :issue:`22783`) - Bug in :func:`DataFrame.to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) - Bug in :func:`DataFrame.to_string()` that caused representations of :class:`DataFrame` to not take up the whole window (:issue:`22984`) - Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`). 
diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 2a2a3e57729ec..967e5fca5f711 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -305,6 +305,8 @@ def _column_header(): align = self.fmt.justify if truncate_h: + if not self.fmt.index: + row_levels = 0 ins_col = row_levels + self.fmt.tr_col_num col_row.insert(ins_col, '...') @@ -336,15 +338,10 @@ def _write_body(self, indent): fmt_values[i] = self.fmt._format_col(i) # write values - if self.fmt.index: - if isinstance(self.frame.index, ABCMultiIndex): - self._write_hierarchical_rows(fmt_values, indent) - else: - self._write_regular_rows(fmt_values, indent) + if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): + self._write_hierarchical_rows(fmt_values, indent) else: - for i in range(min(len(self.frame), self.max_rows)): - row = [fmt_values[j][i] for j in range(len(self.columns))] - self.write_tr(row, indent, self.indent_delta, tags=None) + self._write_regular_rows(fmt_values, indent) indent -= self.indent_delta self.write('', indent) @@ -358,11 +355,16 @@ def _write_regular_rows(self, fmt_values, indent): ncols = len(self.fmt.tr_frame.columns) nrows = len(self.fmt.tr_frame) - fmt = self.fmt._get_formatter('__index__') - if fmt is not None: - index_values = self.fmt.tr_frame.index.map(fmt) + + if self.fmt.index: + fmt = self.fmt._get_formatter('__index__') + if fmt is not None: + index_values = self.fmt.tr_frame.index.map(fmt) + else: + index_values = self.fmt.tr_frame.index.format() + row_levels = 1 else: - index_values = self.fmt.tr_frame.index.format() + row_levels = 0 row = [] for i in range(nrows): @@ -370,17 +372,18 @@ def _write_regular_rows(self, fmt_values, indent): if truncate_v and i == (self.fmt.tr_row_num): str_sep_row = ['...'] * len(row) self.write_tr(str_sep_row, indent, self.indent_delta, - tags=None, nindex_levels=1) + tags=None, nindex_levels=row_levels) row = [] - row.append(index_values[i]) + if self.fmt.index: + 
row.append(index_values[i]) row.extend(fmt_values[j][i] for j in range(ncols)) if truncate_h: - dot_col_ix = self.fmt.tr_col_num + 1 + dot_col_ix = self.fmt.tr_col_num + row_levels row.insert(dot_col_ix, '...') self.write_tr(row, indent, self.indent_delta, tags=None, - nindex_levels=1) + nindex_levels=row_levels) def _write_hierarchical_rows(self, fmt_values, indent): template = 'rowspan="{span}" valign="top"' diff --git a/pandas/tests/io/formats/data/gh15019_expected_output.html b/pandas/tests/io/formats/data/gh15019_expected_output.html new file mode 100644 index 0000000000000..5fb9d960f4465 --- /dev/null +++ b/pandas/tests/io/formats/data/gh15019_expected_output.html @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
01
1.7640520.400157
0.9787382.240893
......
0.950088-0.151357
-0.1032190.410599
diff --git a/pandas/tests/io/formats/data/gh22783_expected_output.html b/pandas/tests/io/formats/data/gh22783_expected_output.html new file mode 100644 index 0000000000000..107db43c48639 --- /dev/null +++ b/pandas/tests/io/formats/data/gh22783_expected_output.html @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + +
01...34
1.7640520.400157...2.2408931.867558
-0.9772780.950088...-0.1032190.410599
diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 0416cf6da7912..32cf21ddf5f38 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -22,6 +22,28 @@ pass +def expected_html(datapath, name): + """ + Read HTML file from formats data directory. + + Parameters + ---------- + datapath : pytest fixture + The datapath fixture injected into a test by pytest. + name : str + The name of the HTML file without the suffix. + + Returns + ------- + str : contents of HTML file. + """ + filename = '.'.join([name, 'html']) + filepath = datapath('io', 'formats', 'data', filename) + with open(filepath) as f: + html = f.read() + return html.rstrip() + + class TestToHTML(object): def test_to_html_with_col_space(self): @@ -1881,6 +1903,29 @@ def test_to_html_multiindex_max_cols(self): """) assert result == expected + @pytest.mark.parametrize('index', [False, 0]) + def test_to_html_truncation_index_false_max_rows(self, datapath, index): + # GH 15019 + data = [[1.764052, 0.400157], + [0.978738, 2.240893], + [1.867558, -0.977278], + [0.950088, -0.151357], + [-0.103219, 0.410599]] + df = pd.DataFrame(data) + result = df.to_html(max_rows=4, index=index) + expected = expected_html(datapath, 'gh15019_expected_output') + assert result == expected + + @pytest.mark.parametrize('index', [False, 0]) + def test_to_html_truncation_index_false_max_cols(self, datapath, index): + # GH 22783 + data = [[1.764052, 0.400157, 0.978738, 2.240893, 1.867558], + [-0.977278, 0.950088, -0.151357, -0.103219, 0.410599]] + df = pd.DataFrame(data) + result = df.to_html(max_cols=4, index=index) + expected = expected_html(datapath, 'gh22783_expected_output') + assert result == expected + def test_to_html_notebook_has_style(self): df = pd.DataFrame({"A": [1, 2, 3]}) result = df.to_html(notebook=True)