diff --git a/doc/source/api.rst b/doc/source/api.rst index 0e893308dd935..0dde341d820e3 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -151,6 +151,8 @@ Data manipulations cut qcut merge + merge_ordered + merge_asof concat get_dummies factorize @@ -943,6 +945,7 @@ Time series-related :toctree: generated/ DataFrame.asfreq + DataFrame.asof DataFrame.shift DataFrame.first_valid_index DataFrame.last_valid_index diff --git a/doc/source/merging.rst b/doc/source/merging.rst index ba675d9aac830..74871fe68fc08 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -104,7 +104,7 @@ some configurable handling of "what to do with the other axes": - ``ignore_index`` : boolean, default False. If True, do not use the index values on the concatenation axis. The resulting axis will be labeled 0, ..., n - 1. This is useful if you are concatenating objects where the - concatenation axis does not have meaningful indexing information. Note + concatenation axis does not have meaningful indexing information. Note the index values on the other axes are still respected in the join. - ``copy`` : boolean, default True. If False, do not copy data unnecessarily. @@ -544,12 +544,12 @@ Here's a description of what each argument is for: can be avoided are somewhat pathological but this option is provided nonetheless. - ``indicator``: Add a column to the output DataFrame called ``_merge`` - with information on the source of each row. ``_merge`` is Categorical-type - and takes on a value of ``left_only`` for observations whose merge key - only appears in ``'left'`` DataFrame, ``right_only`` for observations whose - merge key only appears in ``'right'`` DataFrame, and ``both`` if the - observation's merge key is found in both. - + with information on the source of each row. ``_merge`` is Categorical-type + and takes on a value of ``left_only`` for observations whose merge key + only appears in ``'left'`` DataFrame, ``right_only`` for observations whose + merge key only appears in ``'right'`` DataFrame, and ``both`` if the + observation's merge key is found in both. + .. versionadded:: 0.17.0 @@ -718,7 +718,7 @@ The merge indicator df2 = DataFrame({'col1':[1,2,2],'col_right':[2,2,2]}) merge(df1, df2, on='col1', how='outer', indicator=True) -The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. +The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. .. ipython:: python @@ -1055,34 +1055,6 @@ them together on their indexes. The same is true for ``Panel.join``. labels=['left', 'right', 'right2'], vertical=False); plt.close('all'); -.. _merging.ordered_merge: - -Merging Ordered Data -~~~~~~~~~~~~~~~~~~~~ - -New in v0.8.0 is the ordered_merge function for combining time series and other -ordered data. In particular it has an optional ``fill_method`` keyword to -fill/interpolate missing data: - -.. ipython:: python - - left = DataFrame({'k': ['K0', 'K1', 'K1', 'K2'], - 'lv': [1, 2, 3, 4], - 's': ['a', 'b', 'c', 'd']}) - - right = DataFrame({'k': ['K1', 'K2', 'K4'], - 'rv': [1, 2, 3]}) - - result = ordered_merge(left, right, fill_method='ffill', left_by='s') - -.. ipython:: python - :suppress: - - @savefig merging_ordered_merge.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=True); - plt.close('all'); - .. 
_merging.combine_first.update:

Merging together values within Series or DataFrame columns

@@ -1132,4 +1104,124 @@ values inplace:

    @savefig merging_update.png
    p.plot([df1_copy, df2], df1,
           labels=['df1', 'df2'], vertical=False);
-   plt.close('all');
\ No newline at end of file
+   plt.close('all');
+
+.. _merging.time_series:
+
+Timeseries friendly merging
+---------------------------
+
+.. _merging.merge_ordered:
+
+Merging Ordered Data
+~~~~~~~~~~~~~~~~~~~~
+
+The ``pd.merge_ordered()`` function allows combining time series and other
+ordered data. In particular it has an optional ``fill_method`` keyword to
+fill/interpolate missing data:
+
+.. ipython:: python
+
+   left = DataFrame({'k': ['K0', 'K1', 'K1', 'K2'],
+                     'lv': [1, 2, 3, 4],
+                     's': ['a', 'b', 'c', 'd']})
+
+   right = DataFrame({'k': ['K1', 'K2', 'K4'],
+                      'rv': [1, 2, 3]})
+
+   result = pd.merge_ordered(left, right, fill_method='ffill', left_by='s')
+
+.. ipython:: python
+   :suppress:
+
+   @savefig merging_ordered_merge.png
+   p.plot([left, right], result,
+          labels=['left', 'right'], vertical=True);
+   plt.close('all');
+
+.. _merging.merge_asof:
+
+Merging AsOf
+~~~~~~~~~~~~
+
+.. versionadded:: 0.18.2
+
+A ``pd.merge_asof()`` is similar to an ordered left-join, except that we
+match on the nearest key rather than on equal keys.
+
+For each row in the ``left`` DataFrame, we select the last row in the ``right``
+DataFrame whose ``on`` key is less than or equal to the left's key. Both
+DataFrames must be sorted by the key.
+
+Optionally an asof merge can perform a group-wise merge. This matches on the
+``by`` key exactly, in addition to the nearest match on the ``on`` key.
+
+For example, we might have ``trades`` and ``quotes`` and we want to ``asof``
+merge them.
+
+.. ipython:: python
+
+   trades = pd.DataFrame({
+       'time': pd.to_datetime(['20160525 13:30:00.023',
+                               '20160525 13:30:00.038',
+                               '20160525 13:30:00.048',
+                               '20160525 13:30:00.048',
+                               '20160525 13:30:00.048']),
+       'ticker': ['MSFT', 'MSFT',
+                  'GOOG', 'GOOG', 'AAPL'],
+       'price': [51.95, 51.95,
+                 720.77, 720.92, 98.00],
+       'quantity': [75, 155,
+                    100, 100, 100]},
+       columns=['time', 'ticker', 'price', 'quantity'])
+
+   quotes = pd.DataFrame({
+       'time': pd.to_datetime(['20160525 13:30:00.023',
+                               '20160525 13:30:00.023',
+                               '20160525 13:30:00.030',
+                               '20160525 13:30:00.041',
+                               '20160525 13:30:00.048',
+                               '20160525 13:30:00.049',
+                               '20160525 13:30:00.072',
+                               '20160525 13:30:00.075']),
+       'ticker': ['GOOG', 'MSFT', 'MSFT',
+                  'MSFT', 'GOOG', 'AAPL', 'GOOG',
+                  'MSFT'],
+       'bid': [720.50, 51.95, 51.97, 51.99,
+               720.50, 97.99, 720.50, 52.01],
+       'ask': [720.93, 51.96, 51.98, 52.00,
+               720.93, 98.01, 720.88, 52.03]},
+       columns=['time', 'ticker', 'bid', 'ask'])
+
+.. ipython:: python
+
+   trades
+   quotes
+
+By default we are taking the asof of the quotes.
+
+.. ipython:: python
+
+   pd.merge_asof(trades, quotes,
+                 on='time',
+                 by='ticker')
+
+We only asof within ``2ms`` between the quote time and the trade time.
+
+.. ipython:: python
+
+   pd.merge_asof(trades, quotes,
+                 on='time',
+                 by='ticker',
+                 tolerance=pd.Timedelta('2ms'))
+
+We only asof within ``10ms`` between the quote time and the trade time, and we
+exclude exact matches on time. Note that though we exclude the exact matches
+(of the quotes), prior quotes *do* propagate to that point in time.
+
+.. ipython:: python
+
+   pd.merge_asof(trades, quotes,
+                 on='time',
+                 by='ticker',
+                 tolerance=pd.Timedelta('10ms'),
+                 allow_exact_matches=False)
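+
+For intuition, the row-wise lookup that ``merge_asof`` performs can be
+sketched with ``numpy.searchsorted``. This is an illustrative sketch only
+(``asof_indexer`` is a hypothetical helper, not part of pandas); it ignores
+the ``by`` grouping and ``tolerance`` handling, which the real implementation
+performs in Cython:
+
+.. code-block:: python
+
+   import numpy as np
+
+   def asof_indexer(left_on, right_on, allow_exact_matches=True):
+       # right_on must be sorted ascending; for each left key, find the
+       # position of the last right key that is <= (or strictly <) it
+       side = 'right' if allow_exact_matches else 'left'
+       locs = np.searchsorted(right_on, left_on, side=side) - 1
+       return locs  # -1 means no match, i.e. NaN in the merged result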
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 0d4f07d19f880..cd436aa18a68b 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -19,6 +19,97 @@ Highlights include:
 New features
 ~~~~~~~~~~~~
 
+.. _whatsnew_0182.enhancements.asof_merge:
+
+``pd.merge_asof()`` for asof-style time-series joining
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A long-requested feature has been added through the :func:`merge_asof`
+function, to support asof-style joining of time series (:issue:`1870`).
+Full documentation is :ref:`here <merging.merge_asof>`.
+
+The :func:`merge_asof` function performs an asof merge, which is similar to a
+left-join except that we match on the nearest key rather than on equal keys.
+
+.. ipython:: python
+
+   left = pd.DataFrame({'a': [1, 5, 10],
+                        'left_val': ['a', 'b', 'c']})
+   right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
+                         'right_val': [1, 2, 3, 6, 7]})
+
+   left
+   right
+
+We typically want to match exactly when possible, and use the most
+recent value otherwise.
+
+.. ipython:: python
+
+   pd.merge_asof(left, right, on='a')
+
+We can also match rows only against strictly prior data, excluding exact
+matches.
+
+.. ipython:: python
+
+   pd.merge_asof(left, right, on='a', allow_exact_matches=False)
+
+
+In a typical time-series example, we have ``trades`` and ``quotes`` and we
+want to ``asof-join`` them. This also illustrates using the ``by`` parameter
+to group data before merging.
+
+.. ipython:: python
+
+   trades = pd.DataFrame({
+       'time': pd.to_datetime(['20160525 13:30:00.023',
+                               '20160525 13:30:00.038',
+                               '20160525 13:30:00.048',
+                               '20160525 13:30:00.048',
+                               '20160525 13:30:00.048']),
+       'ticker': ['MSFT', 'MSFT',
+                  'GOOG', 'GOOG', 'AAPL'],
+       'price': [51.95, 51.95,
+                 720.77, 720.92, 98.00],
+       'quantity': [75, 155,
+                    100, 100, 100]},
+       columns=['time', 'ticker', 'price', 'quantity'])
+
+   quotes = pd.DataFrame({
+       'time': pd.to_datetime(['20160525 13:30:00.023',
+                               '20160525 13:30:00.023',
+                               '20160525 13:30:00.030',
+                               '20160525 13:30:00.041',
+                               '20160525 13:30:00.048',
+                               '20160525 13:30:00.049',
+                               '20160525 13:30:00.072',
+                               '20160525 13:30:00.075']),
+       'ticker': ['GOOG', 'MSFT', 'MSFT',
+                  'MSFT', 'GOOG', 'AAPL', 'GOOG',
+                  'MSFT'],
+       'bid': [720.50, 51.95, 51.97, 51.99,
+               720.50, 97.99, 720.50, 52.01],
+       'ask': [720.93, 51.96, 51.98, 52.00,
+               720.93, 98.01, 720.88, 52.03]},
+       columns=['time', 'ticker', 'bid', 'ask'])
+
+.. ipython:: python
+
+   trades
+   quotes
+
+An asof merge joins on the ``on`` key, typically a datetimelike field, which
+must be ordered; in this case we are also using a grouper in the ``by`` field.
+This is like a left-outer join, except that forward filling happens
+automatically, taking the most recent non-NaN value.
+
+.. ipython:: python
+
+   pd.merge_asof(trades, quotes,
+                 on='time',
+                 by='ticker')
+
+This returns a merged DataFrame with the entries in the same order as the
+originally passed left DataFrame (``trades`` in this case), with the fields
+of ``quotes`` merged in.
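+
+For intuition, the effect of ``tolerance`` and ``allow_exact_matches`` on a
+single left row can be sketched in plain Python. This is an illustrative
+sketch only (``asof_match`` is a hypothetical helper, not part of pandas);
+the real implementation is vectorized in Cython:
+
+.. code-block:: python
+
+   def asof_match(left_key, right_keys, tolerance=None,
+                  allow_exact_matches=True):
+       # right_keys must be sorted ascending
+       candidates = [k for k in right_keys
+                     if (k <= left_key if allow_exact_matches
+                         else k < left_key)]
+       if not candidates:
+           return None                 # no prior row -> NaN in the result
+       match = candidates[-1]          # the most recent prior key
+       if tolerance is not None and left_key - match > tolerance:
+           return None                 # match is too far away -> NaN
+       return match
+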
.. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support:

``pd.read_csv`` has improved support for duplicate column names
@@ -124,8 +215,8 @@ Other enhancements
   idx.where([True, False, True])
 
 - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
+- ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`)
 - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
-
 - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
 - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
 - A ``union_categorical`` function has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`)
@@ -335,6 +426,7 @@ Deprecations
 
 - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`)
 - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`)
 - ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`)
+- top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`)
 
 .. _whatsnew_0182.performance:
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 53642fdcfeb31..350898c9925e7 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -43,7 +43,8 @@
 from pandas.io.api import *
 from pandas.computation.api import *
 
-from pandas.tools.merge import merge, concat, ordered_merge
+from pandas.tools.merge import (merge, concat, ordered_merge,
+                                merge_ordered, merge_asof)
 from pandas.tools.pivot import pivot_table, crosstab
 from pandas.tools.plotting import scatter_matrix, plot_params
 from pandas.tools.tile import cut, qcut
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
index f1fd0204e2fd2..8e659a8566adb 100644
--- a/pandas/algos.pyx
+++ b/pandas/algos.pyx
@@ -1,3 +1,5 @@
+# cython: profile=False
+
 from numpy cimport *
 cimport numpy as np
 import numpy as np
@@ -982,21 +984,35 @@ def is_lexsorted(list list_of_arrays):
 
 @cython.boundscheck(False)
-def groupby_indices(ndarray values):
+def groupby_indices(dict ids, ndarray[int64_t] labels, ndarray[int64_t] counts):
+    """
+    Turn group_labels output into a combined indexer mapping the labels to
+    indexers.
+
+    Parameters
+    ----------
+    ids: dict
+        mapping of label -> group indexer
+    labels: ndarray
+        labels for positions
+    counts: ndarray
+        group counts
+
+    Returns
+    -------
+    list of ndarrays of indices
+
+    """
     cdef:
-        Py_ssize_t i, n = len(values)
-        ndarray[int64_t] labels, counts, arr, seen
+        Py_ssize_t i, n = len(labels)
+        ndarray[int64_t] arr, seen
         int64_t loc
-        dict ids = {}
-        object val
         int64_t k
+        dict result = {}
 
-    ids, labels, counts = group_labels(values)
     seen = np.zeros_like(counts)
 
-    # try not to get in trouble here...
    cdef int64_t **vecs = malloc(len(ids) * sizeof(int64_t*))
-    result = {}
     for i from 0 <= i < len(counts):
         arr = np.empty(counts[i], dtype=np.int64)
         result[ids[i]] = arr
@@ -1014,7 +1030,6 @@
             seen[k] = loc + 1
 
     free(vecs)
-
     return result
 
 @cython.wraparound(False)
@@ -1023,8 +1038,15 @@ def group_labels(ndarray[object] values):
     """
     Compute label vector from input values and associated useful data
 
+    Parameters
+    ----------
+    values: object ndarray
+
     Returns
     -------
+    tuple of (reverse mapping of label -> group indexer,
+              factorized labels ndarray,
+              group counts ndarray)
     """
     cdef:
         Py_ssize_t i, n = len(values)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 69def7502a6f7..b4b35953b4282 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -153,6 +153,12 @@
 merged : DataFrame
     The output type will the be same as 'left', if it is a subclass
     of DataFrame.
+
+See also
+--------
+merge_ordered
+merge_asof
+
 """
 
 # -----------------------------------------------------------------------
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 0852c5a293f4e..348281d1a7e30 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -13,7 +13,7 @@
                                InvalidIndexError)
 import pandas.core.indexing as indexing
 from pandas.tseries.index import DatetimeIndex
-from pandas.tseries.period import PeriodIndex
+from pandas.tseries.period import PeriodIndex, Period
 from pandas.core.internals import BlockManager
 import pandas.core.algorithms as algos
 import pandas.core.common as com
@@ -3629,6 +3629,93 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
             res = res.T
         return res
 
+    # ----------------------------------------------------------------------
+    # Timeseries methods
+
+    def asof(self, where, subset=None):
+        """
+        The last row without any NaN is taken (or the last row without
+        NaN considering only the subset of columns in the case of a
+        DataFrame).
+
+        .. versionadded:: 0.18.2 For DataFrame
+
+        If there is no good value, NaN is returned.
+
+        Parameters
+        ----------
+        where : date or array of dates
+        subset : string or list of strings, default None
+           if not None use these columns for NaN propagation
+
+        Notes
+        -----
+        Dates are assumed to be sorted.
+        Raises if this is not the case.
+
+        Returns
+        -------
+        where is scalar
+
+          - value or NaN if input is Series
+          - Series if input is DataFrame
+
+        where is Index: same shape object as input
+
+        See Also
+        --------
+        merge_asof
+
+        """
+
+        if isinstance(where, compat.string_types):
+            from pandas import to_datetime
+            where = to_datetime(where)
+
+        if not self.index.is_monotonic:
+            raise ValueError("asof requires a sorted index")
+
+        if isinstance(self, ABCSeries):
+            if subset is not None:
+                raise ValueError("subset is not valid for Series")
+            nulls = self.isnull()
+        elif self.ndim > 2:
+            raise NotImplementedError("asof is not implemented "
+                                      "for {type}".format(type=type(self)))
+        else:
+            if subset is None:
+                subset = self.columns
+            if not is_list_like(subset):
+                subset = [subset]
+            nulls = self[subset].isnull().any(1)
+
+        if not is_list_like(where):
+            start = self.index[0]
+            if isinstance(self.index, PeriodIndex):
+                where = Period(where, freq=self.index.freq).ordinal
+                start = start.ordinal
+
+            if where < start:
+                return np.nan
+
+            loc = self.index.searchsorted(where, side='right')
+            if loc > 0:
+                loc -= 1
+            while nulls[loc] and loc > 0:
+                loc -= 1
+            return self.iloc[loc]
+
+        if not isinstance(where, Index):
+            where = Index(where)
+
+        locs = self.index.asof_locs(where, ~(nulls.values))
+
+        # mask the missing
+        missing = locs == -1
+        data = self.take(locs, is_copy=False)
+        data.index = where
+        data.loc[missing] = np.nan
+        return data
+
     # ----------------------------------------------------------------------
     # Action Methods
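As an illustrative aside (not part of the patch): given the implementation
above, ``asof`` returns the last row at or before each requested date,
skipping back over rows that are NaN in the ``subset`` columns, or in any
column if ``subset`` is None::

    import numpy as np
    import pandas as pd

    rng = pd.date_range('2016-01-01', periods=5, freq='D')
    df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 4.0, 5.0]}, index=rng)

    df.asof('2016-01-03')   # the '2016-01-02' row, since A is NaN on the 3rd
    df.asof(rng[[2, 4]])    # a DataFrame indexed by the requested dates

The scalar form returns a Series of column values; the array form returns an
object of the same shape as the requested dates.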
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index bea62e98e4a2a..cc639b562dab8 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -4329,8 +4329,19 @@ def _reorder_by_uniques(uniques, labels):
 
 def _groupby_indices(values):
-    return _algos.groupby_indices(_values_from_object(
-        com._ensure_object(values)))
+
+    if is_categorical_dtype(values):
+
+        # we have a categorical, so we can do quite a bit
+        # better than factorizing again
+        reverse = dict(enumerate(values.categories))
+        codes = values.codes.astype('int64')
+        _, counts = _hash.value_count_scalar64(codes, False)
+    else:
+        reverse, codes, counts = _algos.group_labels(
+            _values_from_object(com._ensure_object(values)))
+
+    return _algos.groupby_indices(reverse, codes, counts)
 
 def numpy_groupby(data, labels, axis=0):
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 43b4ba3a51212..cf1639bacc3be 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -36,7 +36,7 @@
                                CombinedDatetimelikeProperties)
 from pandas.tseries.index import DatetimeIndex
 from pandas.tseries.tdi import TimedeltaIndex
-from pandas.tseries.period import PeriodIndex, Period
+from pandas.tseries.period import PeriodIndex
 from pandas import compat
 from pandas.util.terminal import get_terminal_size
 from pandas.compat import zip, u, OrderedDict, StringIO
@@ -46,7 +46,6 @@
 
 import pandas.core.algorithms as algos
 import pandas.core.common as com
-import pandas.core.datetools as datetools
 import pandas.core.nanops as nanops
 import pandas.formats.format as fmt
 from pandas.util.decorators import Appender, deprecate_kwarg, Substitution
@@ -2601,52 +2600,6 @@ def last_valid_index(self):
     # ----------------------------------------------------------------------
     # Time series-oriented methods
 
-    def 
asof(self, where): - """ - Return last good (non-NaN) value in Series if value is NaN for - requested date. - - If there is no good value, NaN is returned. - - Parameters - ---------- - where : date or array of dates - - Notes - ----- - Dates are assumed to be sorted - - Returns - ------- - value or NaN - """ - if isinstance(where, compat.string_types): - where = datetools.to_datetime(where) - - values = self._values - - if not hasattr(where, '__iter__'): - start = self.index[0] - if isinstance(self.index, PeriodIndex): - where = Period(where, freq=self.index.freq).ordinal - start = start.ordinal - - if where < start: - return np.nan - loc = self.index.searchsorted(where, side='right') - if loc > 0: - loc -= 1 - while isnull(values[loc]) and loc > 0: - loc -= 1 - return values[loc] - - if not isinstance(where, Index): - where = Index(where) - - locs = self.index.asof_locs(where, notnull(values)) - new_values = algos.take_1d(values, locs) - return self._constructor(new_values, index=where).__finalize__(self) - def to_timestamp(self, freq=None, how='start', copy=True): """ Cast to datetimeindex of timestamps, at *beginning* of period diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index f718c1ab0b8da..e1c3733a0449d 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1075,7 +1075,8 @@ def mode_int64(int64_t[:] values): @cython.wraparound(False) @cython.boundscheck(False) -def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'): +def duplicated_int64(ndarray[int64_t, ndim=1] values, + object keep='first'): cdef: int ret = 0, k int64_t value diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 4c9ca43f7f25d..3b7c660f5faa1 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -281,7 +281,8 @@ def is_unique(self): @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) def duplicated(self, keep='first'): from pandas.hashtable import duplicated_int64 - return duplicated_int64(self.codes.astype('i8'), keep) + codes = self.codes.astype('i8') + return duplicated_int64(codes, keep) def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index 8a9cf01375a68..a81ac0aa35d4e 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -125,6 +125,153 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, +def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, + Py_ssize_t max_groups, sort=True, + bint allow_exact_matches=1, + left_distance=None, + right_distance=None, + tolerance=None): + + cdef: + Py_ssize_t i, j, k, count = 0 + Py_ssize_t loc, left_pos, right_pos, position + Py_ssize_t offset + ndarray[int64_t] left_count, right_count + ndarray left_sorter, right_sorter, rev + ndarray[int64_t] left_indexer, right_indexer + int64_t lc, rc, tol, left_val, right_val, diff, indexer + ndarray[int64_t] ld, rd + bint has_tol = 0 + + # if we are using tolerance, set our objects + if left_distance is not None and right_distance is not None and tolerance is not None: + has_tol = 1 + ld = left_distance + rd = right_distance + tol = tolerance + + # NA group in location 0 + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) + + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + if right_count[i] > 0: + count += left_count[i] * right_count[i] + else: + count += left_count[i] + 
+ # group 0 is the NA group + left_pos = 0 + right_pos = 0 + position = 0 + + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] + + left_indexer = np.empty(count, dtype=np.int64) + right_indexer = np.empty(count, dtype=np.int64) + + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc == 0: + for j in range(lc): + indexer = position + j + left_indexer[indexer] = left_pos + j + + # take the most recent value + # if we are not the first + if right_pos: + + if has_tol: + + left_val = ld[left_pos + j] + right_val = rd[right_pos - 1] + diff = left_val - right_val + + # do we allow exact matches + if allow_exact_matches and diff > tol: + right_indexer[indexer] = -1 + continue + elif not allow_exact_matches: + if diff >= tol: + right_indexer[indexer] = -1 + continue + + right_indexer[indexer] = right_pos - 1 + else: + right_indexer[indexer] = -1 + position += lc + else: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + + indexer = offset + k + left_indexer[indexer] = left_pos + j + + if has_tol: + + left_val = ld[left_pos + j] + right_val = rd[right_pos + k] + diff = left_val - right_val + + # do we allow exact matches + if allow_exact_matches and diff > tol: + right_indexer[indexer] = -1 + continue + + # we don't allow exact matches + elif not allow_exact_matches: + if diff >= tol or not right_pos: + right_indexer[indexer] = -1 + else: + right_indexer[indexer] = right_pos - 1 + continue + + else: + + # do we allow exact matches + if not allow_exact_matches: + + if right_pos: + right_indexer[indexer] = right_pos - 1 + else: + right_indexer[indexer] = -1 + continue + + right_indexer[indexer] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc + + left_indexer = _get_result_indexer(left_sorter, left_indexer) + right_indexer = _get_result_indexer(right_sorter, right_indexer) + + if not sort: # if not asked to sort, revert to original order + if len(left) == len(left_indexer): + # no multiple matches for any row on the left + # this is a short-cut to avoid groupsort_indexer + # otherwise, the `else` path also works in this case + if left_sorter.dtype != np.int_: + left_sorter = left_sorter.astype(np.int_) + + rev = np.empty(len(left), dtype=np.int_) + rev.put(left_sorter, np.arange(len(left))) + else: + rev, _ = groupsort_indexer(left_indexer, len(left)) + + if rev.dtype != np.int_: + rev = rev.astype(np.int_) + right_indexer = right_indexer.take(rev) + left_indexer = left_indexer.take(rev) + + return left_indexer, right_indexer + + def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups): cdef: @@ -246,4 +393,3 @@ def ffill_by_group(ndarray[int64_t] indexer, ndarray[int64_t] group_ids, last_obs[gid] = val return result - diff --git a/pandas/tests/frame/test_asof.py b/pandas/tests/frame/test_asof.py new file mode 100644 index 0000000000000..6c15c75cb5427 --- /dev/null +++ b/pandas/tests/frame/test_asof.py @@ -0,0 +1,72 @@ +# coding=utf-8 + +import nose + +import numpy as np +from pandas import DataFrame, date_range + +from pandas.util.testing import assert_frame_equal +import pandas.util.testing as tm + +from .common import TestData + + +class TestFrameAsof(TestData, tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.N = N = 50 + rng = date_range('1/1/1990', periods=N, freq='53s') + self.df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, + index=rng) + + def test_basic(self): + + df = self.df.copy() + df.ix[15:30, 'A'] = np.nan + 
dates = date_range('1/1/1990', periods=self.N * 3, + freq='25s') + + result = df.asof(dates) + self.assertTrue(result.notnull().all(1).all()) + lb = df.index[14] + ub = df.index[30] + + dates = list(dates) + result = df.asof(dates) + self.assertTrue(result.notnull().all(1).all()) + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + self.assertTrue((rs == 14).all(1).all()) + + def test_subset(self): + + N = 10 + rng = date_range('1/1/1990', periods=N, freq='53s') + df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, + index=rng) + df.ix[4:8, 'A'] = np.nan + dates = date_range('1/1/1990', periods=N * 3, + freq='25s') + + # with a subset of A should be the same + result = df.asof(dates, subset='A') + expected = df.asof(dates) + assert_frame_equal(result, expected) + + # same with A/B + result = df.asof(dates, subset=['A', 'B']) + expected = df.asof(dates) + assert_frame_equal(result, expected) + + # B gives self.df.asof + result = df.asof(dates, subset='B') + expected = df.resample('25s', closed='right').ffill().reindex(dates) + expected.iloc[20:] = 9 + + assert_frame_equal(result, expected) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/series/test_asof.py b/pandas/tests/series/test_asof.py new file mode 100644 index 0000000000000..e2092feab9004 --- /dev/null +++ b/pandas/tests/series/test_asof.py @@ -0,0 +1,158 @@ +# coding=utf-8 + +import nose + +import numpy as np + +from pandas import (offsets, Series, notnull, + isnull, date_range, Timestamp) + +import pandas.util.testing as tm + +from .common import TestData + + +class TestSeriesAsof(TestData, tm.TestCase): + _multiprocess_can_split_ = True + + def test_basic(self): + + # array or list or dates + N = 50 + rng = date_range('1/1/1990', periods=N, freq='53s') + ts = Series(np.random.randn(N), index=rng) + ts[15:30] = np.nan + dates = date_range('1/1/1990', periods=N * 3, freq='25s') + + result = ts.asof(dates) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + result = ts.asof(list(dates)) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + self.assertTrue((rs == ts[lb]).all()) + + val = result[result.index[result.index >= ub][0]] + self.assertEqual(ts[ub], val) + + def test_scalar(self): + + N = 30 + rng = date_range('1/1/1990', periods=N, freq='53s') + ts = Series(np.arange(N), index=rng) + ts[5:10] = np.NaN + ts[15:20] = np.NaN + + val1 = ts.asof(ts.index[7]) + val2 = ts.asof(ts.index[19]) + + self.assertEqual(val1, ts[4]) + self.assertEqual(val2, ts[14]) + + # accepts strings + val1 = ts.asof(str(ts.index[7])) + self.assertEqual(val1, ts[4]) + + # in there + result = ts.asof(ts.index[3]) + self.assertEqual(result, ts[3]) + + # no as of value + d = ts.index[0] - offsets.BDay() + self.assertTrue(np.isnan(ts.asof(d))) + + def test_with_nan(self): + # basic asof test + rng = date_range('1/1/2000', '1/2/2000', freq='4h') + s = Series(np.arange(len(rng)), index=rng) + r = s.resample('2h').mean() + + result = r.asof(r.index) + expected = Series([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6.], + index=date_range('1/1/2000', '1/2/2000', freq='2h')) + tm.assert_series_equal(result, expected) + + r.iloc[3:5] = np.nan + result = r.asof(r.index) + expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 5, 5, 6.], + index=date_range('1/1/2000', '1/2/2000', freq='2h')) + tm.assert_series_equal(result, 
expected) + + r.iloc[-3:] = np.nan + result = r.asof(r.index) + expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4.], + index=date_range('1/1/2000', '1/2/2000', freq='2h')) + tm.assert_series_equal(result, expected) + + def test_periodindex(self): + from pandas import period_range, PeriodIndex + # array or list or dates + N = 50 + rng = period_range('1/1/1990', periods=N, freq='H') + ts = Series(np.random.randn(N), index=rng) + ts[15:30] = np.nan + dates = date_range('1/1/1990', periods=N * 3, freq='37min') + + result = ts.asof(dates) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + result = ts.asof(list(dates)) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + pix = PeriodIndex(result.index.values, freq='H') + mask = (pix >= lb) & (pix < ub) + rs = result[mask] + self.assertTrue((rs == ts[lb]).all()) + + ts[5:10] = np.nan + ts[15:20] = np.nan + + val1 = ts.asof(ts.index[7]) + val2 = ts.asof(ts.index[19]) + + self.assertEqual(val1, ts[4]) + self.assertEqual(val2, ts[14]) + + # accepts strings + val1 = ts.asof(str(ts.index[7])) + self.assertEqual(val1, ts[4]) + + # in there + self.assertEqual(ts.asof(ts.index[3]), ts[3]) + + # no as of value + d = ts.index[0].to_timestamp() - offsets.BDay() + self.assertTrue(isnull(ts.asof(d))) + + def test_errors(self): + + s = Series([1, 2, 3], + index=[Timestamp('20130101'), + Timestamp('20130103'), + Timestamp('20130102')]) + + # non-monotonic + self.assertFalse(s.index.is_monotonic) + with self.assertRaises(ValueError): + s.asof(s.index[0]) + + # subset with Series + N = 10 + rng = date_range('1/1/1990', periods=N, freq='53s') + s = Series(np.random.randn(N), index=rng) + with self.assertRaises(ValueError): + s.asof(s.index[0], subset='foo') + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 13b95ea97eedf..19acf54c7a3cb 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -3,10 +3,9 @@ from datetime import datetime -from numpy import nan import numpy as np -from pandas import Index, Series, notnull, date_range +from pandas import Index, Series, date_range from pandas.tseries.index import DatetimeIndex from pandas.tseries.tdi import TimedeltaIndex @@ -179,51 +178,6 @@ def test_truncate(self): before=self.ts.index[-1] + offset, after=self.ts.index[0] - offset) - def test_asof(self): - # array or list or dates - N = 50 - rng = date_range('1/1/1990', periods=N, freq='53s') - ts = Series(np.random.randn(N), index=rng) - ts[15:30] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='25s') - - result = ts.asof(dates) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - result = ts.asof(list(dates)) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - mask = (result.index >= lb) & (result.index < ub) - rs = result[mask] - self.assertTrue((rs == ts[lb]).all()) - - val = result[result.index[result.index >= ub][0]] - self.assertEqual(ts[ub], val) - - self.ts[5:10] = np.NaN - self.ts[15:20] = np.NaN - - val1 = self.ts.asof(self.ts.index[7]) - val2 = self.ts.asof(self.ts.index[19]) - - self.assertEqual(val1, self.ts[4]) - self.assertEqual(val2, self.ts[14]) - - # accepts strings - val1 = self.ts.asof(str(self.ts.index[7])) - self.assertEqual(val1, self.ts[4]) - - # in there - 
self.assertEqual(self.ts.asof(self.ts.index[3]), self.ts[3]) - - # no as of value - d = self.ts.index[0] - datetools.bday - self.assertTrue(np.isnan(self.ts.asof(d))) - def test_getitem_setitem_datetimeindex(self): from pandas import date_range @@ -424,68 +378,6 @@ def test_getitem_setitem_periodindex(self): result[4:8] = ts[4:8] assert_series_equal(result, ts) - def test_asof_periodindex(self): - from pandas import period_range, PeriodIndex - # array or list or dates - N = 50 - rng = period_range('1/1/1990', periods=N, freq='H') - ts = Series(np.random.randn(N), index=rng) - ts[15:30] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='37min') - - result = ts.asof(dates) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - result = ts.asof(list(dates)) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - pix = PeriodIndex(result.index.values, freq='H') - mask = (pix >= lb) & (pix < ub) - rs = result[mask] - self.assertTrue((rs == ts[lb]).all()) - - ts[5:10] = np.NaN - ts[15:20] = np.NaN - - val1 = ts.asof(ts.index[7]) - val2 = ts.asof(ts.index[19]) - - self.assertEqual(val1, ts[4]) - self.assertEqual(val2, ts[14]) - - # accepts strings - val1 = ts.asof(str(ts.index[7])) - self.assertEqual(val1, ts[4]) - - # in there - self.assertEqual(ts.asof(ts.index[3]), ts[3]) - - # no as of value - d = ts.index[0].to_timestamp() - datetools.bday - self.assertTrue(np.isnan(ts.asof(d))) - - def test_asof_more(self): - from pandas import date_range - - s = Series([nan, nan, 1, 2, nan, nan, 3, 4, 5], - index=date_range('1/1/2000', periods=9)) - - dates = s.index[[4, 5, 6, 2, 1]] - - result = s.asof(dates) - expected = Series([2, 2, 3, 1, np.nan], index=dates) - - assert_series_equal(result, expected) - - s = Series([1.5, 2.5, 1, 2, nan, nan, 3, 4, 5], - index=date_range('1/1/2000', periods=9)) - result = s.asof(s.index[0]) - self.assertEqual(result, s[0]) - def test_asfreq(self): ts = Series([0., 1., 2.], index=[datetime(2009, 10, 30), datetime( 2009, 11, 30), datetime(2009, 12, 31)]) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 182c0637ae29c..f963a271a767e 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -2,23 +2,30 @@ SQL-style merge routines """ +import copy import warnings import numpy as np from pandas.compat import range, lrange, lzip, zip, map, filter import pandas.compat as compat -from pandas.core.categorical import Categorical -from pandas.core.frame import DataFrame, _merge_doc +from pandas import (Categorical, DataFrame, Series, + Index, MultiIndex, Timedelta) +from pandas.core.frame import _merge_doc from pandas.core.generic import NDFrame -from pandas.core.series import Series -from pandas.core.index import (Index, MultiIndex, _get_combined_index, +from pandas.core.index import (_get_combined_index, _ensure_index, _get_consensus_names, _all_indexes_same) from pandas.core.internals import (items_overlap_with_suffix, concatenate_block_managers) from pandas.util.decorators import Appender, Substitution -from pandas.core.common import ABCSeries +from pandas.core.common import (ABCSeries, is_dtype_equal, + is_datetime64_dtype, + is_int64_dtype, + is_integer, + is_bool, + is_list_like, + needs_i8_conversion) import pandas.core.algorithms as algos import pandas.core.common as com @@ -47,9 +54,100 @@ class MergeError(ValueError): pass -def ordered_merge(left, right, on=None, left_by=None, right_by=None, +def _groupby_and_merge(by, on, left, right, _merge_pieces, + check_duplicates=True): + 
""" + groupby & merge; we are always performing a left-by type operation + + Parameters + ---------- + by: field to group + on: duplicates field + left: left frame + right: right frame + _merge_pieces: function for merging + check_duplicates: boolean, default True + should we check & clean duplicates + """ + + pieces = [] + if not isinstance(by, (list, tuple)): + by = [by] + + lby = left.groupby(by, sort=False) + + # if we can groupby the rhs + # then we can get vastly better perf + try: + + # we will check & remove duplicates if indicated + if check_duplicates: + if on is None: + on = [] + elif not isinstance(on, (list, tuple)): + on = [on] + + if right.duplicated(by + on).any(): + right = right.drop_duplicates(by + on, keep='last') + rby = right.groupby(by, sort=False) + except KeyError: + rby = None + + for key, lhs in lby: + + if rby is None: + rhs = right + else: + try: + rhs = right.take(rby.indices[key]) + except KeyError: + # key doesn't exist in left + lcols = lhs.columns.tolist() + cols = lcols + [r for r in right.columns + if r not in set(lcols)] + merged = lhs.reindex(columns=cols) + merged.index = range(len(merged)) + pieces.append(merged) + continue + + merged = _merge_pieces(lhs, rhs) + + # make sure join keys are in the merged + # TODO, should _merge_pieces do this? + for k in by: + try: + if k in merged: + merged[k] = key + except: + pass + + pieces.append(merged) + + # preserve the original order + # if we have a missing piece this can be reset + result = concat(pieces, ignore_index=True) + result = result.reindex(columns=pieces[0].columns, copy=False) + return result, lby + + +def ordered_merge(left, right, on=None, left_on=None, right_on=None, + left_by=None, right_by=None, fill_method=None, suffixes=('_x', '_y')): + + warnings.warn("ordered_merge is deprecated and replace by merged_ordered", + FutureWarning, stacklevel=2) + return merge_ordered(left, right, on=on, + left_on=left_on, right_on=right_on, + left_by=left_by, right_by=right_by, + fill_method=fill_method, suffixes=suffixes) + + +def merge_ordered(left, right, on=None, + left_on=None, right_on=None, + left_by=None, right_by=None, + fill_method=None, suffixes=('_x', '_y'), + how='outer'): """Perform merge with optional filling/interpolation designed for ordered data like time series data. Optionally perform group-wise merge (see examples) @@ -58,8 +156,6 @@ def ordered_merge(left, right, on=None, left_by=None, right_by=None, ---------- left : DataFrame right : DataFrame - fill_method : {'ffill', None}, default None - Interpolation method for data on : label or list Field names to join on. Must be found in both DataFrames. left_on : label or list, or array-like @@ -75,9 +171,18 @@ def ordered_merge(left, right, on=None, left_by=None, right_by=None, right_by : column name or list of column names Group right DataFrame by group columns and merge piece by piece with left DataFrame + fill_method : {'ffill', None}, default None + Interpolation method for data suffixes : 2-length sequence (tuple, list, ...) Suffix to apply to overlapping column names in the left and right side, respectively + how : {'left', 'right', 'outer', 'inner'}, default 'outer' + * left: use only keys from left frame (SQL: left outer join) + * right: use only keys from right frame (SQL: right outer join) + * outer: use union of keys from both frames (SQL: full outer join) + * inner: use intersection of keys from both frames (SQL: inner join) + + .. 
 
     Examples
     --------
@@ -110,46 +215,243 @@
     merged : DataFrame
         The output type will the be same as 'left', if it is a subclass
         of DataFrame.
+
+    See also
+    --------
+    merge
+    merge_asof
+
     """
     def _merger(x, y):
+        # perform the ordered merge operation
         op = _OrderedMerge(x, y, on=on, left_on=left_on,
                            right_on=right_on,
-                           # left_index=left_index, right_index=right_index,
-                           suffixes=suffixes, fill_method=fill_method)
+                           suffixes=suffixes, fill_method=fill_method,
+                           how=how)
         return op.get_result()
 
     if left_by is not None and right_by is not None:
         raise ValueError('Can only group either left or right frames')
     elif left_by is not None:
-        if not isinstance(left_by, (list, tuple)):
-            left_by = [left_by]
-        pieces = []
-        for key, xpiece in left.groupby(left_by):
-            merged = _merger(xpiece, right)
-            for k in left_by:
-                # May have passed ndarray
-                try:
-                    if k in merged:
-                        merged[k] = key
-                except:
-                    pass
-            pieces.append(merged)
-        return concat(pieces, ignore_index=True)
+        result, _ = _groupby_and_merge(left_by, on, left, right,
+                                       lambda x, y: _merger(x, y),
+                                       check_duplicates=False)
     elif right_by is not None:
-        if not isinstance(right_by, (list, tuple)):
-            right_by = [right_by]
-        pieces = []
-        for key, ypiece in right.groupby(right_by):
-            merged = _merger(left, ypiece)
-            for k in right_by:
-                try:
-                    if k in merged:
-                        merged[k] = key
-                except:
-                    pass
-            pieces.append(merged)
-        return concat(pieces, ignore_index=True)
+        result, _ = _groupby_and_merge(right_by, on, right, left,
+                                       lambda x, y: _merger(y, x),
+                                       check_duplicates=False)
     else:
-        return _merger(left, right)
+        result = _merger(left, right)
+    return result
+
+
+def merge_asof(left, right, on=None,
+               left_on=None, right_on=None,
+               by=None,
+               suffixes=('_x', '_y'),
+               tolerance=None,
+               allow_exact_matches=True,
+               check_duplicates=True):
+    """Perform an asof merge. This is similar to a left-join except that we
+    match on the nearest key rather than on equal keys.
+
+    For each row in the left DataFrame, we select the last row in the right
+    DataFrame whose 'on' key is less than or equal to the left's key. Both
+    DataFrames must be sorted by the key.
+
+    Optionally perform group-wise merge. This searches for the nearest match
+    on the 'on' key within the same group according to 'by'.
+
+    .. versionadded:: 0.18.2
+
+    Parameters
+    ----------
+    left : DataFrame
+    right : DataFrame
+    on : label or list
+        Field names to join on. Must be found in both DataFrames.
+        The data MUST be ordered. Furthermore this must be a numeric column,
+        typically a datetimelike or integer. Either 'on' or
+        'left_on'/'right_on' must be given.
+    left_on : label or list, or array-like
+        Field names to join on in left DataFrame. Can be a vector or list of
+        vectors of the length of the DataFrame to use a particular vector as
+        the join key instead of columns
+    right_on : label or list, or array-like
+        Field names to join on in right DataFrame or vector/list of vectors per
+        left_on docs
+    by : column name or list of column names
+        Group both the left and right DataFrames by the group columns; perform
+        the merge operation on these pieces and recombine.
+    suffixes : 2-length sequence (tuple, list, ...)
+        Suffix to apply to overlapping column names in the left and right
+        side, respectively
+    tolerance : integer or Timedelta, optional, default None
+        select asof tolerance within this range; must be compatible
+        with the merge index.
+    allow_exact_matches : boolean, default True
+
+        - If True, allow matching the same 'on' value
+          (i.e. less-than-or-equal-to)
+        - If False, don't match the same 'on' value
+          (i.e., strictly less-than)
+
+    check_duplicates : boolean, default True
+
+        - If True, check and remove duplicates for the right
+          DataFrame, on the [by, on] combination, keeping the last value.
+        - If False, no check for duplicates. If you *know* that
+          you don't have duplicates, then turning off the check for duplicates
+          can be more performant.
+
+    Returns
+    -------
+    merged : DataFrame
+
+    Examples
+    --------
+    >>> left
+        a left_val
+    0   1        a
+    1   5        b
+    2  10        c
+
+    >>> right
+       a  right_val
+    0  1          1
+    1  2          2
+    2  3          3
+    3  6          6
+    4  7          7
+
+    >>> pd.merge_asof(left, right, on='a')
+        a left_val  right_val
+    0   1        a          1
+    1   5        b          3
+    2  10        c          7
+
+    >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False)
+        a left_val  right_val
+    0   1        a        NaN
+    1   5        b        3.0
+    2  10        c        7.0
+
+    For this example, we can achieve a similar result through
+    ``pd.merge_ordered()``, though it's not nearly as performant.
+
+
+    >>> (pd.merge_ordered(left, right, on='a')
+    ...    .ffill()
+    ...    .drop_duplicates(['left_val'])
+    ... )
+        a left_val  right_val
+    0   1        a        1.0
+    3   5        b        3.0
+    6  10        c        7.0
+
+    Here is a real-world time-series example
+
+    >>> quotes
+                         time ticker     bid     ask
+    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
+    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
+    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
+    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
+    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
+    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
+    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
+    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03
+
+    >>> trades
+                         time ticker   price  quantity
+    0 2016-05-25 13:30:00.023   MSFT   51.95        75
+    1 2016-05-25 13:30:00.038   MSFT   51.95       155
+    2 2016-05-25 13:30:00.048   GOOG  720.77       100
+    3 2016-05-25 13:30:00.048   GOOG  720.92       100
+    4 2016-05-25 13:30:00.048   AAPL   98.00       100
+
+    # by default we are taking the asof of the quotes
+    >>> pd.merge_asof(trades, quotes,
+    ...               on='time',
+    ...               by='ticker')
+                         time ticker   price  quantity     bid     ask
+    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
+    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
+    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
+    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
+    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
+
+    # we only asof within 2ms between the quote time and the trade time
+    >>> pd.merge_asof(trades, quotes,
+    ...               on='time',
+    ...               by='ticker',
+    ...               tolerance=pd.Timedelta('2ms'))
+                         time ticker   price  quantity     bid     ask
+    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
+    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
+    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
+    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
+    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
+
+    # we only asof within 10ms between the quote time and the trade time
+    # and we exclude exact matches on time. However *prior* data will
+    # propagate forward
+    >>> pd.merge_asof(trades, quotes,
+    ...               on='time',
+    ...               by='ticker',
+    ...               tolerance=pd.Timedelta('10ms'),
+    ... 
allow_exact_matches=False) + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + + See also + -------- + merge + merge_ordered + + """ + def _merger(x, y): + # perform the ordered merge operation + op = _AsOfMerge(x, y, + on=on, left_on=left_on, right_on=right_on, + by=by, suffixes=suffixes, + how='asof', tolerance=tolerance, + allow_exact_matches=allow_exact_matches) + return op.get_result() + + if by is not None: + result, groupby = _groupby_and_merge(by, on, left, right, + lambda x, y: _merger(x, y), + check_duplicates=check_duplicates) + + # we want to preserve the original order + # we had grouped, so need to reverse this + # if we DO have duplicates, then + # we cannot guarantee order + + sorter = np.concatenate([groupby.indices[g] for g, _ in groupby]) + if len(result) != len(sorter): + if check_duplicates: + raise AssertionError("invalid reverse grouping") + return result + + rev = np.empty(len(sorter), dtype=np.int_) + rev.put(sorter, np.arange(len(sorter))) + return result.take(rev).reset_index(drop=True) + + if check_duplicates: + if on is None: + on = [] + elif not isinstance(on, (list, tuple)): + on = [on] + + if right.duplicated(on).any(): + right = right.drop_duplicates(on, keep='last') + + return _merger(left, right) # TODO: transformations?? @@ -159,6 +461,7 @@ class _MergeOperation(object): Perform a database (SQL) merge operation between two DataFrame objects using either columns as keys or their row indexes """ + _merge_type = 'merge' def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, @@ -206,6 +509,8 @@ def __init__(self, left, right, how='inner', on=None, msg = msg.format(left.columns.nlevels, right.columns.nlevels) warnings.warn(msg, UserWarning) + self._validate_specification() + # note this function has side effects (self.left_join_keys, self.right_join_keys, @@ -233,7 +538,7 @@ def get_result(self): concat_axis=0, copy=self.copy) typ = self.left._constructor - result = typ(result_data).__finalize__(self, method='merge') + result = typ(result_data).__finalize__(self, method=self._merge_type) if self.indicator: result = self._indicator_post_merge(result) @@ -304,8 +609,8 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if left_has_missing: take_right = self.right_join_keys[i] - if not com.is_dtype_equal(result[name].dtype, - self.left[name].dtype): + if not is_dtype_equal(result[name].dtype, + self.left[name].dtype): take_left = self.left[name]._values elif name in self.right: @@ -316,8 +621,8 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if right_has_missing: take_left = self.left_join_keys[i] - if not com.is_dtype_equal(result[name].dtype, - self.right[name].dtype): + if not is_dtype_equal(result[name].dtype, + self.right[name].dtype): take_right = self.right[name]._values elif left_indexer is not None \ @@ -355,6 +660,13 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): else: result.insert(i, name or 'key_%d' % i, key_col) + def _get_join_indexers(self): + """ return the join indexers """ + return _get_join_indexers(self.left_join_keys, + self.right_join_keys, + sort=self.sort, + how=self.how) + def _get_join_info(self): left_ax = self.left._data.axes[self.axis] right_ax = 
self.right._data.axes[self.axis] @@ -373,9 +685,8 @@ def _get_join_info(self): sort=self.sort) else: (left_indexer, - right_indexer) = _get_join_indexers(self.left_join_keys, - self.right_join_keys, - sort=self.sort, how=self.how) + right_indexer) = self._get_join_indexers() + if self.right_index: if len(self.left) > 0: join_index = self.left.index.take(left_indexer) @@ -429,8 +740,6 @@ def _get_merge_keys(self): ------- left_keys, right_keys """ - self._validate_specification() - left_keys = [] right_keys = [] join_names = [] @@ -549,7 +858,8 @@ def _validate_specification(self): raise ValueError("len(right_on) must equal len(left_on)") -def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'): +def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', + **kwargs): """ Parameters @@ -579,26 +889,27 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'): lkey, rkey, count = fkeys(lkey, rkey) # preserve left frame order if how == 'left' and sort == False - kwargs = {'sort': sort} if how == 'left' else {} + kwargs = copy.copy(kwargs) + if how == 'left': + kwargs['sort'] = sort join_func = _join_functions[how] + return join_func(lkey, rkey, count, **kwargs) class _OrderedMerge(_MergeOperation): + _merge_type = 'ordered_merge' - def __init__(self, left, right, on=None, by=None, left_on=None, - right_on=None, axis=1, left_index=False, right_index=False, + def __init__(self, left, right, on=None, left_on=None, + right_on=None, axis=1, suffixes=('_x', '_y'), copy=True, - fill_method=None): + fill_method=None, how='outer'): self.fill_method = fill_method - _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, right_on=right_on, axis=axis, - left_index=left_index, - right_index=right_index, - how='outer', suffixes=suffixes, - sort=True # sorts when factorizing + how=how, suffixes=suffixes, + sort=True # factorize sorts ) def get_result(self): @@ -629,13 +940,133 @@ def get_result(self): concat_axis=0, copy=self.copy) typ = self.left._constructor - result = typ(result_data).__finalize__(self, method='ordered_merge') + result = typ(result_data).__finalize__(self, method=self._merge_type) self._maybe_add_join_keys(result, left_indexer, right_indexer) return result +class _AsOfMerge(_OrderedMerge): + _merge_type = 'asof_merge' + + def __init__(self, left, right, on=None, by=None, left_on=None, + right_on=None, axis=1, + suffixes=('_x', '_y'), copy=True, + fill_method=None, + how='asof', tolerance=None, + allow_exact_matches=True): + + self.by = by + self.tolerance = tolerance + self.allow_exact_matches = allow_exact_matches + + _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, + right_on=right_on, axis=axis, + how=how, suffixes=suffixes, + fill_method=fill_method) + + def _validate_specification(self): + super(_AsOfMerge, self)._validate_specification() + + # we only allow on to be a single item for on + if len(self.left_on) != 1: + raise MergeError("can only asof on a key for left") + + if len(self.right_on) != 1: + raise MergeError("can only asof on a key for right") + + # add by to our key-list so we can have it in the + # output as a key + if self.by is not None: + if not is_list_like(self.by): + self.by = [self.by] + + self.left_on = self.by + list(self.left_on) + self.right_on = self.by + list(self.right_on) + + @property + def _asof_key(self): + """ This is our asof key, the 'on' """ + return self.left_on[-1] + + def _get_merge_keys(self): + + # note this function has side effects + (left_join_keys, + right_join_keys, + 
join_names) = super(_AsOfMerge, self)._get_merge_keys()
+
+        # validate index types are the same
+        for lk, rk in zip(left_join_keys, right_join_keys):
+            if not is_dtype_equal(lk.dtype, rk.dtype):
+                raise MergeError("incompatible merge keys, "
+                                 "must be the same type")
+
+        # validate tolerance; must be a Timedelta if we have a DTI
+        if self.tolerance is not None:
+
+            lt = left_join_keys[self.left_on.index(self._asof_key)]
+            msg = "incompatible tolerance, must be compatible " \
+                  "with type {0}".format(type(lt))
+
+            if is_datetime64_dtype(lt):
+                if not isinstance(self.tolerance, Timedelta):
+                    raise MergeError(msg)
+                if self.tolerance < Timedelta(0):
+                    raise MergeError("tolerance must be positive")
+
+            elif is_int64_dtype(lt):
+                if not is_integer(self.tolerance):
+                    raise MergeError(msg)
+                if self.tolerance < 0:
+                    raise MergeError("tolerance must be positive")
+
+            else:
+                raise MergeError(msg)
+
+        # validate allow_exact_matches
+        if not is_bool(self.allow_exact_matches):
+            raise MergeError("allow_exact_matches must be boolean, "
+                             "passed {0}".format(self.allow_exact_matches))
+
+        return left_join_keys, right_join_keys, join_names
+
+    def _get_join_indexers(self):
+        """ return the join indexers """
+
+        # we require sortedness in the join keys
+        msg = " keys must be sorted"
+        for lk in self.left_join_keys:
+            if not Index(lk).is_monotonic:
+                raise ValueError('left' + msg)
+        for rk in self.right_join_keys:
+            if not Index(rk).is_monotonic:
+                raise ValueError('right' + msg)
+
+        kwargs = {}
+
+        # tolerance
+        t = self.tolerance
+        if t is not None:
+            lt = self.left_join_keys[self.left_on.index(self._asof_key)]
+            rt = self.right_join_keys[self.right_on.index(self._asof_key)]
+            if needs_i8_conversion(lt):
+                lt = lt.view('i8')
+                t = t.value
+                rt = rt.view('i8')
+            kwargs['left_distance'] = lt
+            kwargs['right_distance'] = rt
+            kwargs['tolerance'] = t
+
+        return _get_join_indexers(self.left_join_keys,
+                                  self.right_join_keys,
+                                  sort=self.sort,
+                                  how=self.how,
+                                  allow_exact_matches=self.allow_exact_matches,
+                                  **kwargs)
+
+
 def _get_multiindex_indexer(join_keys, index, sort):
     from functools import partial
@@ -717,6 +1148,7 @@ def _right_outer_join(x, y, max_groups):
     'left': _algos.left_outer_join,
     'right': _right_outer_join,
     'outer': _algos.full_outer_join,
+    'asof': _algos.left_outer_asof_join,
 }
 
@@ -724,6 +1156,7 @@ def _factorize_keys(lk, rk, sort=True):
     if com.is_datetime64tz_dtype(lk) and com.is_datetime64tz_dtype(rk):
         lk = lk.values
         rk = rk.values
+
     if com.is_int_or_datetime_dtype(lk) and com.is_int_or_datetime_dtype(rk):
         klass = _hash.Int64Factorizer
         lk = com._ensure_int64(com._values_from_object(lk))
diff --git a/pandas/tools/tests/data/allow_exact_matches.csv b/pandas/tools/tests/data/allow_exact_matches.csv
new file mode 100644
index 0000000000000..0446fb744c540
--- /dev/null
+++ b/pandas/tools/tests/data/allow_exact_matches.csv
@@ -0,0 +1,28 @@
+time,ticker,price,quantity,marketCenter,bid,ask
+20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,,
+20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95
+20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93
+20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,,
+20160525 
diff --git a/pandas/tools/tests/data/allow_exact_matches.csv b/pandas/tools/tests/data/allow_exact_matches.csv
new file mode 100644
index 0000000000000..0446fb744c540
--- /dev/null
+++ b/pandas/tools/tests/data/allow_exact_matches.csv
@@ -0,0 +1,28 @@
+time,ticker,price,quantity,marketCenter,bid,ask
+20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,,
+20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95
+20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93
+20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.55,6,ARCA,,
+20160525 13:30:00.075,AAPL,98.55,6,ARCA,,
+20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56
+20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.95,51.95
+20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95
+20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95
diff --git a/pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv b/pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv
new file mode 100644
index 0000000000000..0446fb744c540
--- /dev/null
+++ b/pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv
@@ -0,0 +1,28 @@
+time,ticker,price,quantity,marketCenter,bid,ask
+20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,,
+20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95
+20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93
+20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.55,6,ARCA,,
+20160525 13:30:00.075,AAPL,98.55,6,ARCA,,
+20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56
+20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.95,51.95
+20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95
+20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95
diff --git a/pandas/tools/tests/data/asof.csv b/pandas/tools/tests/data/asof.csv
new file mode 100644
index 0000000000000..d7d061bc46ccc
--- /dev/null
+++ b/pandas/tools/tests/data/asof.csv
@@ -0,0 +1,28 @@
+time,ticker,price,quantity,marketCenter,bid,ask
+20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95
+20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95
+20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93
+20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56
+20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56
+20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95
+20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95
+20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95
diff --git a/pandas/tools/tests/data/asof2.csv b/pandas/tools/tests/data/asof2.csv
new file mode 100644
index 0000000000000..2c9c0392dd617
--- /dev/null
+++ b/pandas/tools/tests/data/asof2.csv
@@ -0,0 +1,78 @@
+time,ticker,price,quantity,marketCenter,bid,ask
+20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95
+20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95
+20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93
+20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56
+20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56
+20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95
+20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95
+20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95
+20160525 13:30:00.084,AAPL,98.64,40,NASDAQ,98.55,98.56
+20160525 13:30:00.084,AAPL,98.55,149,EDGX,98.55,98.56
+20160525 13:30:00.086,AAPL,98.56,500,ARCA,98.55,98.63
+20160525 13:30:00.104,AAPL,98.63,647,EDGX,98.62,98.63
+20160525 13:30:00.104,AAPL,98.63,300,EDGX,98.62,98.63
+20160525 13:30:00.104,AAPL,98.63,50,NASDAQ,98.62,98.63
+20160525 13:30:00.104,AAPL,98.63,50,NASDAQ,98.62,98.63
+20160525 13:30:00.104,AAPL,98.63,70,NASDAQ,98.62,98.63
+20160525 13:30:00.104,AAPL,98.63,70,NASDAQ,98.62,98.63
+20160525 13:30:00.104,AAPL,98.63,1,NASDAQ,98.62,98.63
+20160525 13:30:00.104,AAPL,98.63,62,NASDAQ,98.62,98.63
+20160525 13:30:00.104,AAPL,98.63,10,NASDAQ,98.62,98.63
+20160525 13:30:00.104,AAPL,98.63,100,ARCA,98.62,98.63
+20160525 13:30:00.105,AAPL,98.63,100,ARCA,98.62,98.63
+20160525 13:30:00.105,AAPL,98.63,700,ARCA,98.62,98.63
+20160525 13:30:00.106,AAPL,98.63,61,EDGX,98.62,98.63
+20160525 13:30:00.107,AAPL,98.63,100,ARCA,98.62,98.63
+20160525 13:30:00.107,AAPL,98.63,53,ARCA,98.62,98.63
+20160525 13:30:00.108,AAPL,98.63,100,ARCA,98.62,98.63
+20160525 13:30:00.108,AAPL,98.63,839,ARCA,98.62,98.63
+20160525 13:30:00.115,AAPL,98.63,5,EDGX,98.62,98.63
+20160525 13:30:00.118,AAPL,98.63,295,EDGX,98.62,98.63
+20160525 13:30:00.118,AAPL,98.63,5,EDGX,98.62,98.63
+20160525 13:30:00.128,AAPL,98.63,100,NASDAQ,98.62,98.63
+20160525 13:30:00.128,AAPL,98.63,100,NASDAQ,98.62,98.63
+20160525 13:30:00.128,MSFT,51.92,100,ARCA,51.92,51.95
+20160525 13:30:00.129,AAPL,98.62,100,NASDAQ,98.61,98.63
+20160525 13:30:00.129,AAPL,98.62,10,NASDAQ,98.61,98.63
+20160525 13:30:00.129,AAPL,98.62,59,NASDAQ,98.61,98.63
+20160525 13:30:00.129,AAPL,98.62,31,NASDAQ,98.61,98.63
+20160525 13:30:00.129,AAPL,98.62,69,NASDAQ,98.61,98.63
+20160525 13:30:00.129,AAPL,98.62,12,NASDAQ,98.61,98.63
+20160525 13:30:00.129,AAPL,98.62,12,EDGX,98.61,98.63
+20160525 13:30:00.129,AAPL,98.62,100,ARCA,98.61,98.63
+20160525 13:30:00.129,AAPL,98.62,100,ARCA,98.61,98.63
+20160525 13:30:00.130,MSFT,51.95,317,ARCA,51.93,51.95
+20160525 13:30:00.130,MSFT,51.95,283,ARCA,51.93,51.95
+20160525 13:30:00.135,MSFT,51.93,100,EDGX,51.92,51.95
+20160525 13:30:00.135,AAPL,98.62,100,ARCA,98.61,98.62
+20160525 13:30:00.144,AAPL,98.62,12,NASDAQ,98.61,98.62
+20160525 13:30:00.144,AAPL,98.62,88,NASDAQ,98.61,98.62
+20160525 13:30:00.144,AAPL,98.62,162,NASDAQ,98.61,98.62
+20160525 13:30:00.144,AAPL,98.61,100,BATS,98.61,98.62
+20160525 13:30:00.144,AAPL,98.62,61,ARCA,98.61,98.62
+20160525 13:30:00.144,AAPL,98.62,25,ARCA,98.61,98.62
+20160525 13:30:00.144,AAPL,98.62,14,ARCA,98.61,98.62
+20160525 13:30:00.145,AAPL,98.62,12,ARCA,98.6,98.63
+20160525 13:30:00.145,AAPL,98.62,100,ARCA,98.6,98.63
+20160525 13:30:00.145,AAPL,98.63,100,NASDAQ,98.6,98.63
+20160525 13:30:00.145,AAPL,98.63,100,NASDAQ,98.6,98.63
diff --git a/pandas/tools/tests/cut_data.csv b/pandas/tools/tests/data/cut_data.csv
similarity index 100%
rename from pandas/tools/tests/cut_data.csv
rename to pandas/tools/tests/data/cut_data.csv
diff --git a/pandas/tools/tests/data/quotes.csv b/pandas/tools/tests/data/quotes.csv
new file mode 100644
index 0000000000000..3f31d2cfffe1b
--- /dev/null
+++ b/pandas/tools/tests/data/quotes.csv
@@ -0,0 +1,17 @@
+time,ticker,bid,ask
+20160525 13:30:00.023,GOOG,720.50,720.93
+20160525 13:30:00.023,MSFT,51.95,51.95
+20160525 13:30:00.041,MSFT,51.95,51.95
+20160525 13:30:00.048,GOOG,720.50,720.93
+20160525 13:30:00.048,GOOG,720.50,720.93
+20160525 13:30:00.048,GOOG,720.50,720.93
+20160525 13:30:00.048,GOOG,720.50,720.93
+20160525 13:30:00.072,GOOG,720.50,720.88
+20160525 13:30:00.075,AAPL,98.55,98.56
+20160525 13:30:00.076,AAPL,98.55,98.56
+20160525 13:30:00.076,AAPL,98.55,98.56
+20160525 13:30:00.076,AAPL,98.55,98.56
+20160525 13:30:00.078,MSFT,51.95,51.95
+20160525 13:30:00.078,MSFT,51.95,51.95
+20160525 13:30:00.078,MSFT,51.95,51.95
+20160525 13:30:00.078,MSFT,51.92,51.95
diff --git a/pandas/tools/tests/data/quotes2.csv b/pandas/tools/tests/data/quotes2.csv
new file mode 100644
index 0000000000000..7ade1e7faf1ae
--- /dev/null
+++ b/pandas/tools/tests/data/quotes2.csv
@@ -0,0 +1,57 @@
+time,ticker,bid,ask
+20160525 13:30:00.023,GOOG,720.50,720.93
+20160525 13:30:00.023,MSFT,51.95,51.95
+20160525 13:30:00.041,MSFT,51.95,51.95
+20160525 13:30:00.048,GOOG,720.50,720.93
+20160525 13:30:00.048,GOOG,720.50,720.93
+20160525 13:30:00.048,GOOG,720.50,720.93
+20160525 13:30:00.048,GOOG,720.50,720.93
+20160525 13:30:00.072,GOOG,720.50,720.88
+20160525 13:30:00.075,AAPL,98.55,98.56
+20160525 13:30:00.076,AAPL,98.55,98.56
+20160525 13:30:00.076,AAPL,98.55,98.56
+20160525 13:30:00.076,AAPL,98.55,98.56
+20160525 13:30:00.078,MSFT,51.95,51.95
+20160525 13:30:00.078,MSFT,51.95,51.95
+20160525 13:30:00.078,MSFT,51.95,51.95
+20160525 13:30:00.078,MSFT,51.92,51.95
+20160525 13:30:00.079,MSFT,51.92,51.95
+20160525 13:30:00.080,AAPL,98.55,98.56
+20160525 13:30:00.084,AAPL,98.55,98.56
+20160525 13:30:00.086,AAPL,98.55,98.63
+20160525 13:30:00.088,AAPL,98.65,98.63
+20160525 13:30:00.089,AAPL,98.63,98.63
+20160525 13:30:00.104,AAPL,98.63,98.63
+20160525 13:30:00.104,AAPL,98.63,98.63
+20160525 13:30:00.104,AAPL,98.63,98.63
+20160525 13:30:00.104,AAPL,98.63,98.63
+20160525 13:30:00.104,AAPL,98.62,98.63
+20160525 13:30:00.105,AAPL,98.62,98.63
+20160525 13:30:00.107,AAPL,98.62,98.63
+20160525 13:30:00.115,AAPL,98.62,98.63
+20160525 13:30:00.115,AAPL,98.62,98.63
+20160525 13:30:00.118,AAPL,98.62,98.63
+20160525 13:30:00.128,AAPL,98.62,98.63
+20160525 13:30:00.128,AAPL,98.62,98.63
+20160525 13:30:00.129,AAPL,98.62,98.63
+20160525 13:30:00.129,AAPL,98.61,98.63
+20160525 13:30:00.129,AAPL,98.62,98.63
+20160525 13:30:00.129,AAPL,98.62,98.63
+20160525 13:30:00.129,AAPL,98.61,98.63
+20160525 13:30:00.130,MSFT,51.93,51.95
+20160525 13:30:00.130,MSFT,51.93,51.95
+20160525 13:30:00.130,AAPL,98.61,98.63
+20160525 13:30:00.131,AAPL,98.61,98.62
+20160525 13:30:00.131,AAPL,98.61,98.62
+20160525 13:30:00.135,MSFT,51.92,51.95
+20160525 13:30:00.135,AAPL,98.61,98.62
+20160525 13:30:00.136,AAPL,98.61,98.62
+20160525 13:30:00.136,AAPL,98.61,98.62
+20160525 13:30:00.144,AAPL,98.61,98.62
+20160525 13:30:00.144,AAPL,98.61,98.62
+20160525 13:30:00.145,AAPL,98.61,98.62
+20160525 13:30:00.145,AAPL,98.61,98.63
+20160525 13:30:00.145,AAPL,98.61,98.63
+20160525 13:30:00.145,AAPL,98.60,98.63
+20160525 13:30:00.145,AAPL,98.61,98.63
+20160525 13:30:00.145,AAPL,98.60,98.63
diff --git a/pandas/tools/tests/data/tolerance.csv b/pandas/tools/tests/data/tolerance.csv
new file mode 100644
index 0000000000000..d7d061bc46ccc
--- /dev/null
+++ b/pandas/tools/tests/data/tolerance.csv
@@ -0,0 +1,28 @@
+time,ticker,price,quantity,marketCenter,bid,ask
+20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95
+20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95
+20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93
+20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93
+20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,,
+20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56
+20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56
+20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56
+20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56
+20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95
+20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95
+20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95
diff --git a/pandas/tools/tests/data/trades.csv b/pandas/tools/tests/data/trades.csv
new file mode 100644
index 0000000000000..b26a4ce714255
--- /dev/null
+++ b/pandas/tools/tests/data/trades.csv
@@ -0,0 +1,28 @@
+time,ticker,price,quantity,marketCenter
+20160525 13:30:00.023,MSFT,51.9500,75,NASDAQ
+20160525 13:30:00.038,MSFT,51.9500,155,NASDAQ
+20160525 13:30:00.048,GOOG,720.7700,100,NASDAQ
+20160525 13:30:00.048,GOOG,720.9200,100,NASDAQ
+20160525 13:30:00.048,GOOG,720.9300,200,NASDAQ
+20160525 13:30:00.048,GOOG,720.9300,300,NASDAQ
+20160525 13:30:00.048,GOOG,720.9300,600,NASDAQ
+20160525 13:30:00.048,GOOG,720.9300,44,NASDAQ
+20160525 13:30:00.074,AAPL,98.6700,478343,NASDAQ
+20160525 13:30:00.075,AAPL,98.6700,478343,NASDAQ
+20160525 13:30:00.075,AAPL,98.6600,6,NASDAQ
+20160525 13:30:00.075,AAPL,98.6500,30,NASDAQ
+20160525 13:30:00.075,AAPL,98.6500,75,NASDAQ
+20160525 13:30:00.075,AAPL,98.6500,20,NASDAQ
+20160525 13:30:00.075,AAPL,98.6500,35,NASDAQ
+20160525 13:30:00.075,AAPL,98.6500,10,NASDAQ
+20160525 13:30:00.075,AAPL,98.5500,6,ARCA
+20160525 13:30:00.075,AAPL,98.5500,6,ARCA
+20160525 13:30:00.076,AAPL,98.5600,1000,ARCA
+20160525 13:30:00.076,AAPL,98.5600,200,ARCA
+20160525 13:30:00.076,AAPL,98.5600,300,ARCA
+20160525 13:30:00.076,AAPL,98.5600,400,ARCA
+20160525 13:30:00.076,AAPL,98.5600,600,ARCA
+20160525 13:30:00.076,AAPL,98.5600,200,ARCA
+20160525 13:30:00.078,MSFT,51.9500,783,NASDAQ
+20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ
+20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ
diff --git a/pandas/tools/tests/data/trades2.csv b/pandas/tools/tests/data/trades2.csv
new file mode 100644
index 0000000000000..64021faa68ce3
--- /dev/null
+++ b/pandas/tools/tests/data/trades2.csv
@@ -0,0 +1,78 @@
+time,ticker,price,quantity,marketCenter
+20160525 13:30:00.023,MSFT,51.9500,75,NASDAQ
+20160525 13:30:00.038,MSFT,51.9500,155,NASDAQ
+20160525 13:30:00.048,GOOG,720.7700,100,NASDAQ
+20160525 13:30:00.048,GOOG,720.9200,100,NASDAQ
+20160525 13:30:00.048,GOOG,720.9300,200,NASDAQ
+20160525 13:30:00.048,GOOG,720.9300,300,NASDAQ
+20160525 13:30:00.048,GOOG,720.9300,600,NASDAQ
+20160525 13:30:00.048,GOOG,720.9300,44,NASDAQ
+20160525 13:30:00.074,AAPL,98.6700,478343,NASDAQ
+20160525 13:30:00.075,AAPL,98.6700,478343,NASDAQ
+20160525 13:30:00.075,AAPL,98.6600,6,NASDAQ
+20160525 13:30:00.075,AAPL,98.6500,30,NASDAQ
+20160525 13:30:00.075,AAPL,98.6500,75,NASDAQ
+20160525 13:30:00.075,AAPL,98.6500,20,NASDAQ
+20160525 13:30:00.075,AAPL,98.6500,35,NASDAQ
+20160525 13:30:00.075,AAPL,98.6500,10,NASDAQ
+20160525 13:30:00.075,AAPL,98.5500,6,ARCA
+20160525 13:30:00.075,AAPL,98.5500,6,ARCA
+20160525 13:30:00.076,AAPL,98.5600,1000,ARCA
+20160525 13:30:00.076,AAPL,98.5600,200,ARCA
+20160525 13:30:00.076,AAPL,98.5600,300,ARCA
+20160525 13:30:00.076,AAPL,98.5600,400,ARCA
+20160525 13:30:00.076,AAPL,98.5600,600,ARCA
+20160525 13:30:00.076,AAPL,98.5600,200,ARCA
+20160525 13:30:00.078,MSFT,51.9500,783,NASDAQ
+20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ
+20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ
+20160525 13:30:00.084,AAPL,98.6400,40,NASDAQ
+20160525 13:30:00.084,AAPL,98.5500,149,EDGX
+20160525 13:30:00.086,AAPL,98.5600,500,ARCA
+20160525 13:30:00.104,AAPL,98.6300,647,EDGX
+20160525 13:30:00.104,AAPL,98.6300,300,EDGX
+20160525 13:30:00.104,AAPL,98.6300,50,NASDAQ
+20160525 13:30:00.104,AAPL,98.6300,50,NASDAQ
+20160525 13:30:00.104,AAPL,98.6300,70,NASDAQ
+20160525 13:30:00.104,AAPL,98.6300,70,NASDAQ
+20160525 13:30:00.104,AAPL,98.6300,1,NASDAQ
+20160525 13:30:00.104,AAPL,98.6300,62,NASDAQ
+20160525 13:30:00.104,AAPL,98.6300,10,NASDAQ
+20160525 13:30:00.104,AAPL,98.6300,100,ARCA
+20160525 13:30:00.105,AAPL,98.6300,100,ARCA
+20160525 13:30:00.105,AAPL,98.6300,700,ARCA
+20160525 13:30:00.106,AAPL,98.6300,61,EDGX
+20160525 13:30:00.107,AAPL,98.6300,100,ARCA
+20160525 13:30:00.107,AAPL,98.6300,53,ARCA
+20160525 13:30:00.108,AAPL,98.6300,100,ARCA
+20160525 13:30:00.108,AAPL,98.6300,839,ARCA
+20160525 13:30:00.115,AAPL,98.6300,5,EDGX
+20160525 13:30:00.118,AAPL,98.6300,295,EDGX
+20160525 13:30:00.118,AAPL,98.6300,5,EDGX
+20160525 13:30:00.128,AAPL,98.6300,100,NASDAQ
+20160525 13:30:00.128,AAPL,98.6300,100,NASDAQ
+20160525 13:30:00.128,MSFT,51.9200,100,ARCA
+20160525 13:30:00.129,AAPL,98.6200,100,NASDAQ
+20160525 13:30:00.129,AAPL,98.6200,10,NASDAQ
+20160525 13:30:00.129,AAPL,98.6200,59,NASDAQ
+20160525 13:30:00.129,AAPL,98.6200,31,NASDAQ
+20160525 13:30:00.129,AAPL,98.6200,69,NASDAQ
+20160525 13:30:00.129,AAPL,98.6200,12,NASDAQ
+20160525 13:30:00.129,AAPL,98.6200,12,EDGX
+20160525 13:30:00.129,AAPL,98.6200,100,ARCA
+20160525 13:30:00.129,AAPL,98.6200,100,ARCA
+20160525 13:30:00.130,MSFT,51.9500,317,ARCA
+20160525 13:30:00.130,MSFT,51.9500,283,ARCA
+20160525 13:30:00.135,MSFT,51.9300,100,EDGX
+20160525 13:30:00.135,AAPL,98.6200,100,ARCA
+20160525 13:30:00.144,AAPL,98.6200,12,NASDAQ
+20160525 13:30:00.144,AAPL,98.6200,88,NASDAQ
+20160525 13:30:00.144,AAPL,98.6200,162,NASDAQ
+20160525 13:30:00.144,AAPL,98.6100,100,BATS
+20160525 13:30:00.144,AAPL,98.6200,61,ARCA
+20160525 13:30:00.144,AAPL,98.6200,25,ARCA
+20160525 13:30:00.144,AAPL,98.6200,14,ARCA
+20160525 13:30:00.145,AAPL,98.6200,12,ARCA
+20160525 13:30:00.145,AAPL,98.6200,100,ARCA
+20160525 13:30:00.145,AAPL,98.6300,100,NASDAQ
+20160525 13:30:00.145,AAPL,98.6300,100,NASDAQ
diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py
new file mode 100644
index 0000000000000..5d78ccf199ed3
--- /dev/null
+++ b/pandas/tools/tests/test_merge_asof.py
@@ -0,0 +1,352 @@
+import nose
+import os
+
+import numpy as np
+import pandas as pd
+from pandas import (merge_asof, read_csv,
+                    to_datetime, Timedelta)
+from pandas.tools.merge import MergeError
+from pandas.util import testing as tm
+from pandas.util.testing import assert_frame_equal
+
+
+class TestAsOfMerge(tm.TestCase):
+    _multiprocess_can_split_ = True
+
+    def read_data(self, name, dedupe=False):
+        path = os.path.join(tm.get_data_path(), name)
+        x = read_csv(path)
+        if dedupe:
+            x = (x.drop_duplicates(['time', 'ticker'], keep='last')
+                  .reset_index(drop=True)
+                 )
+        x.time = to_datetime(x.time)
+        return x
+
+    def setUp(self):
+
+        self.trades = self.read_data('trades.csv')
+        self.quotes = self.read_data('quotes.csv', dedupe=True)
+        self.asof = self.read_data('asof.csv')
+        self.tolerance = self.read_data('tolerance.csv')
+        self.allow_exact_matches = self.read_data('allow_exact_matches.csv')
+        self.allow_exact_matches_and_tolerance = self.read_data(
+            'allow_exact_matches_and_tolerance.csv')
+
+    def test_examples1(self):
+        """ doc-string examples """
+
+        left = pd.DataFrame({'a': [1, 5, 10],
+                             'left_val': ['a', 'b', 'c']})
+        right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
+                              'right_val': [1, 2, 3, 6, 7]})
+
+        pd.merge_asof(left, right, on='a')
+
+    def test_examples2(self):
+        """ doc-string examples """
+
+        trades = pd.DataFrame({
+            'time': pd.to_datetime(['20160525 13:30:00.023',
+                                    '20160525 13:30:00.038',
+                                    '20160525 13:30:00.048',
+                                    '20160525 13:30:00.048',
+                                    '20160525 13:30:00.048']),
+            'ticker': ['MSFT', 'MSFT',
+                       'GOOG', 'GOOG', 'AAPL'],
+            'price': [51.95, 51.95,
+                      720.77, 720.92, 98.00],
+            'quantity': [75, 155,
+                         100, 100, 100]},
+            columns=['time', 'ticker', 'price', 'quantity'])
+
+        quotes = pd.DataFrame({
+            'time': pd.to_datetime(['20160525 13:30:00.023',
+                                    '20160525 13:30:00.023',
+                                    '20160525 13:30:00.030',
+                                    '20160525 13:30:00.041',
+                                    '20160525 13:30:00.048',
+                                    '20160525 13:30:00.049',
+                                    '20160525 13:30:00.072',
+                                    '20160525 13:30:00.075']),
+            'ticker': ['GOOG', 'MSFT', 'MSFT',
+                       'MSFT', 'GOOG', 'AAPL', 'GOOG',
+                       'MSFT'],
+            'bid': [720.50, 51.95, 51.97, 51.99,
+                    720.50, 97.99, 720.50, 52.01],
+            'ask': [720.93, 51.96, 51.98, 52.00,
+                    720.93, 98.01, 720.88, 52.03]},
+            columns=['time', 'ticker', 'bid', 'ask'])
+
+        pd.merge_asof(trades, quotes,
+                      on='time',
+                      by='ticker')
+
+        pd.merge_asof(trades, quotes,
+                      on='time',
+                      by='ticker',
+                      tolerance=pd.Timedelta('2ms'))
+
+        pd.merge_asof(trades, quotes,
+                      on='time',
+                      by='ticker',
+                      tolerance=pd.Timedelta('10ms'),
+                      allow_exact_matches=False)
+
+    def test_basic(self):
+
+        expected = self.asof
+        trades = self.trades
+        quotes = self.quotes
+
+        result = merge_asof(trades, quotes,
+                            on='time',
+                            by='ticker')
+        assert_frame_equal(result, expected)
+
+    def test_basic_categorical(self):
+
+        expected = self.asof
+        trades = self.trades.copy()
+        trades.ticker = trades.ticker.astype('category')
+        quotes = self.quotes.copy()
+        quotes.ticker = quotes.ticker.astype('category')
+
+        result = merge_asof(trades, quotes,
+                            on='time',
+                            by='ticker')
+        assert_frame_equal(result, expected)
+
+    def test_missing_right_by(self):
+
+        expected = self.asof
+        trades = self.trades
+        quotes = self.quotes
+
+        q = quotes[quotes.ticker != 'MSFT']
+        result = merge_asof(trades, q,
+                            on='time',
+                            by='ticker')
+        expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan
+        assert_frame_equal(result, expected)
+
+    def test_basic2(self):
+
+        expected = self.read_data('asof2.csv')
+        trades = self.read_data('trades2.csv')
+        quotes = self.read_data('quotes2.csv', dedupe=True)
+
+        result = merge_asof(trades, quotes,
+                            on='time',
+                            by='ticker')
+        assert_frame_equal(result, expected)
+
+    def test_basic_no_by(self):
+        f = lambda x: x[x.ticker == 'MSFT'].drop('ticker', axis=1) \
+            .reset_index(drop=True)
+
+        # just use a single ticker
+        expected = f(self.asof)
+        trades = f(self.trades)
+        quotes = f(self.quotes)
+
+        result = merge_asof(trades, quotes,
+                            on='time')
+        assert_frame_equal(result, expected)
+
+    def test_valid_join_keys(self):
+
+        trades = self.trades
+        quotes = self.quotes
+
+        with self.assertRaises(MergeError):
+            merge_asof(trades, quotes,
+                       left_on='time',
+                       right_on='bid',
+                       by='ticker')
+
+        with self.assertRaises(MergeError):
+            merge_asof(trades, quotes,
+                       on=['time', 'ticker'],
+                       by='ticker')
+
+        with self.assertRaises(MergeError):
+            merge_asof(trades, quotes,
+                       by='ticker')
+
+    def test_with_duplicates(self):
+
+        q = pd.concat([self.quotes, self.quotes]).sort_values(
+            ['time', 'ticker']).reset_index(drop=True)
+        result = merge_asof(self.trades, q,
+                            on='time',
+                            by='ticker')
+        expected = self.read_data('asof.csv')
+        assert_frame_equal(result, expected)
+
+        result = merge_asof(self.trades, q,
+                            on='time',
+                            by='ticker',
+                            check_duplicates=False)
+        expected = self.read_data('asof.csv')
+        expected = pd.concat([expected, expected]).sort_values(
+            ['time', 'ticker']).reset_index(drop=True)
+
+        # the results are not ordered in a meaningful way
+        # nor are the exact matches duplicated, so comparisons
+        # are pretty tricky here; however, the uniques are the same
+
+        def aligner(x, ticker):
+            return (x[x.ticker == ticker]
+                    .sort_values(['time', 'ticker', 'quantity', 'price',
+                                  'marketCenter', 'bid', 'ask'])
+                    .drop_duplicates(keep='last')
+                    .reset_index(drop=True)
+                    )
+
+        for ticker in expected.ticker.unique():
+            r = aligner(result, ticker)
+            e = aligner(expected, ticker)
+            assert_frame_equal(r, e)
+
+    def test_with_duplicates_no_on(self):
+
+        df1 = pd.DataFrame({'key': [1, 1, 3],
+                            'left_val': [1, 2, 3]})
+        df2 = pd.DataFrame({'key': [1, 3, 3],
+                            'right_val': [1, 2, 3]})
+        result = merge_asof(df1, df2, on='key', check_duplicates=False)
+        expected = pd.DataFrame({'key': [1, 1, 3, 3],
+                                 'left_val': [1, 2, 3, 3],
+                                 'right_val': [1, 1, 2, 3]})
+        assert_frame_equal(result, expected)
+
+        df1 = pd.DataFrame({'key': [1, 1, 3],
+                            'left_val': [1, 2, 3]})
+        df2 = pd.DataFrame({'key': [1, 2, 2],
+                            'right_val': [1, 2, 3]})
+        result = merge_asof(df1, df2, on='key')
+        expected = pd.DataFrame({'key': [1, 1, 3],
+                                 'left_val': [1, 2, 3],
+                                 'right_val': [1, 1, 3]})
+        assert_frame_equal(result, expected)
+
+    def test_valid_allow_exact_matches(self):
+
+        trades = self.trades
+        quotes = self.quotes
+
+        with self.assertRaises(MergeError):
+            merge_asof(trades, quotes,
+                       on='time',
+                       by='ticker',
+                       allow_exact_matches='foo')
+
+    def test_valid_tolerance(self):
+
+        trades = self.trades
+        quotes = self.quotes
+
+        # dti
+        merge_asof(trades, quotes,
+                   on='time',
+                   by='ticker',
+                   tolerance=Timedelta('1s'))
+
+        # integer
+        merge_asof(trades.reset_index(), quotes.reset_index(),
+                   on='index',
+                   by='ticker',
+                   tolerance=1)
+
+        # incompat
+        with self.assertRaises(MergeError):
+            merge_asof(trades, quotes,
+                       on='time',
+                       by='ticker',
+                       tolerance=1)
+
+        # invalid
+        with self.assertRaises(MergeError):
+            merge_asof(trades.reset_index(), quotes.reset_index(),
+                       on='index',
+                       by='ticker',
+                       tolerance=1.0)
+
+        # invalid negative
+        with self.assertRaises(MergeError):
+            merge_asof(trades, quotes,
+                       on='time',
+                       by='ticker',
+                       tolerance=-Timedelta('1s'))
+
+        with self.assertRaises(MergeError):
+            merge_asof(trades.reset_index(), quotes.reset_index(),
+                       on='index',
+                       by='ticker',
+                       tolerance=-1)
+
+    def test_non_sorted(self):
+
+        trades = self.trades.sort_values('time', ascending=False)
+        quotes = self.quotes.sort_values('time', ascending=False)
+
+        # we require that trades and quotes are already sorted by time
+        self.assertFalse(trades.time.is_monotonic)
+        self.assertFalse(quotes.time.is_monotonic)
+        with self.assertRaises(ValueError):
+            merge_asof(trades, quotes,
+                       on='time',
+                       by='ticker')
+
+        trades = self.trades.sort_values('time')
+        self.assertTrue(trades.time.is_monotonic)
+        self.assertFalse(quotes.time.is_monotonic)
+        with self.assertRaises(ValueError):
+            merge_asof(trades, quotes,
+                       on='time',
+                       by='ticker')
+
+        quotes = self.quotes.sort_values('time')
+        self.assertTrue(trades.time.is_monotonic)
+        self.assertTrue(quotes.time.is_monotonic)
+
+        # ok, though 'quotes' has dupes
+        merge_asof(trades, self.quotes,
+                   on='time',
+                   by='ticker')
+
+    def test_tolerance(self):
+
+        trades = self.trades
+        quotes = self.quotes
+
+        result = merge_asof(trades, quotes,
+                            on='time',
+                            by='ticker',
+                            tolerance=Timedelta('1day'))
+        expected = self.tolerance
+        assert_frame_equal(result, expected)
+
+    def test_allow_exact_matches(self):
+
+        result = merge_asof(self.trades, self.quotes,
+                            on='time',
+                            by='ticker',
+                            allow_exact_matches=False)
+        expected = self.allow_exact_matches
+        assert_frame_equal(result, expected)
+
+    def test_allow_exact_matches_and_tolerance(self):
+
+        result = merge_asof(self.trades, self.quotes,
+                            on='time',
+                            by='ticker',
+                            tolerance=Timedelta('100ms'),
+                            allow_exact_matches=False)
+        expected = self.allow_exact_matches_and_tolerance
+        assert_frame_equal(result, expected)
+
+if __name__ == '__main__':
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)
diff --git a/pandas/tools/tests/test_ordered_merge.py b/pandas/tools/tests/test_merge_ordered.py
similarity index 85%
rename from pandas/tools/tests/test_ordered_merge.py
rename to pandas/tools/tests/test_merge_ordered.py
index 53f00d9761f32..0511a0ca6d1cf 100644
--- a/pandas/tools/tests/test_ordered_merge.py
+++ b/pandas/tools/tests/test_merge_ordered.py
@@ -1,7 +1,7 @@
 import nose

 import pandas as pd
-from pandas import DataFrame, ordered_merge
+from pandas import DataFrame, merge_ordered
 from pandas.util import testing as tm
 from pandas.util.testing import assert_frame_equal
@@ -17,10 +17,15 @@ def setUp(self):
         self.right = DataFrame({'key': ['b', 'c', 'd', 'f'],
                                 'rvalue': [1, 2, 3., 4]})

+    def test_deprecation(self):
+
+        with tm.assert_produces_warning(FutureWarning):
+            pd.ordered_merge(self.left, self.right, on='key')
+
     # GH #813
     def test_basic(self):
-        result = ordered_merge(self.left, self.right, on='key')
+        result = merge_ordered(self.left, self.right, on='key')
         expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                               'lvalue': [1, nan, 2, nan, 3, nan],
                               'rvalue': [nan, 1, 2, 3, nan, 4]})
@@ -28,7 +33,7 @@ def test_basic(self):
         assert_frame_equal(result, expected)

     def test_ffill(self):
-        result = ordered_merge(
+        result = merge_ordered(
             self.left, self.right, on='key', fill_method='ffill')
         expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                               'lvalue': [1., 1, 2, 2, 3, 3.],
@@ -42,7 +47,7 @@ def test_multigroup(self):
         left['group'] = ['a'] * 3 + ['b'] * 3
         # right['group'] = ['a'] * 4 + ['b'] * 4

-        result = ordered_merge(left, self.right, on='key', left_by='group',
+        result = merge_ordered(left, self.right, on='key', left_by='group',
                                fill_method='ffill')
         expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2,
                               'lvalue': [1., 1, 2, 2, 3, 3.] * 2,
@@ -51,11 +56,11 @@ def test_multigroup(self):

         assert_frame_equal(result, expected.ix[:, result.columns])

-        result2 = ordered_merge(self.right, left, on='key', right_by='group',
+        result2 = merge_ordered(self.right, left, on='key', right_by='group',
                                 fill_method='ffill')
         assert_frame_equal(result, result2.ix[:, result.columns])

-        result = ordered_merge(left, self.right, on='key', left_by='group')
+        result = merge_ordered(left, self.right, on='key', left_by='group')
         self.assertTrue(result['group'].notnull().all())

     def test_merge_type(self):
diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py
index 0b91fd1ef1c02..bb5429b5e8836 100644
--- a/pandas/tools/tests/test_tile.py
+++ b/pandas/tools/tests/test_tile.py
@@ -216,8 +216,7 @@ def test_label_formatting(self):

     def test_qcut_binning_issues(self):
         # #1978, 1979
-        path = os.path.join(curpath(), 'cut_data.csv')
-
+        path = os.path.join(tm.get_data_path(), 'cut_data.csv')
         arr = np.loadtxt(path)

         result = qcut(arr, 20)
diff --git a/setup.py b/setup.py
index 1d189364239a9..adea92896d382 100755
--- a/setup.py
+++ b/setup.py
@@ -591,6 +591,7 @@ def pxd(name):
                                       'tests/data/*.xlsx',
                                       'tests/data/*.xlsm',
                                       'tests/data/*.table',
+                                      'tests/tools/data/*.csv',
                                       'tests/parser/data/*.csv',
                                       'tests/parser/data/*.gz',
                                       'tests/parser/data/*.bz2',
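As a quick sanity check of the semantics these tests pin down, here is a minimal interactive sketch. The frames are borrowed from `test_with_duplicates_no_on` above, and the output shown in comments is exactly what that test asserts; the snippet assumes a build of pandas containing this patch:

    import pandas as pd

    left = pd.DataFrame({'key': [1, 1, 3], 'left_val': [1, 2, 3]})
    right = pd.DataFrame({'key': [1, 2, 2], 'right_val': [1, 2, 3]})

    # for each left row, take the last right row whose key is <= the left key
    pd.merge_asof(left, right, on='key')
    #    key  left_val  right_val
    # 0    1         1          1
    # 1    1         2          1
    # 2    3         3          3

    # the old name still works but now raises a FutureWarning,
    # as test_deprecation in test_merge_ordered.py verifies
    pd.ordered_merge(left, right, on='key')   # deprecated
    pd.merge_ordered(left, right, on='key')   # preferred

Note the design choice the tests encode: `merge_asof` is a left join, so every left row survives, unmatched rows get NaN in the right-hand columns, and both inputs must already be sorted on the `on` key or a ValueError is raised.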