Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

df coerce #230

Merged
merged 5 commits into from
Sep 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions pyterrier/model.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
from typing import List, Sequence

Expand Down Expand Up @@ -165,6 +166,35 @@ def coerce_queries_dataframe(query):
raise ValueError("Could not coerce %s (type %s) into a DataFrame of queries" % (str(query), str(type(query))))


def coerce_dataframe_types(dataframe):
    """
    Casts the standard columns of a dataframe to their standard data types. The dataframe need
    not have all the columns, but those that are present are cast to the proper types:
     - ``qid`` -> ``str``
     - ``docno`` -> ``str``
     - ``score`` -> ``float``

    The dataframe passed in is never modified. Callers must use the returned dataframe.

    Args:
        dataframe: a Pandas dataframe

    Returns:
        dataframe with data types properly set. This is the input object itself when no column
        needed casting, otherwise a new dataframe with the offending columns cast.

    Raises:
        ValueError: if a value cannot be converted to the target type (e.g. a ``score`` of ``'A'``)
    """
    TYPE_MAP = {  # python type -> acceptable numpy types
        str: (np.dtype('O'),),
        float: (np.dtype('float32'), np.dtype('float64')),
    }
    COLUMN_MAP = {  # column name -> python type
        'qid': str,
        'docno': str,
        'score': float,
    }
    # Collect only the columns that are present AND whose dtype is not already acceptable,
    # so that well-typed dataframes pass through without any copy.
    casts = {
        column: py_type
        for column, py_type in COLUMN_MAP.items()
        if column in dataframe.columns and dataframe[column].dtype not in TYPE_MAP[py_type]
    }
    if not casts:
        return dataframe
    # astype returns a new dataframe, leaving the caller's object (which may be a slice of
    # another frame, or a user-supplied results frame) untouched.
    return dataframe.astype(casts)


def split_df(df : pd.DataFrame, N) -> List[pd.DataFrame]:
"""
splits a dataframe into N different chunks. Splitting will be sensitive to the primary datatype
Expand Down
6 changes: 5 additions & 1 deletion pyterrier/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Callable, Union, Dict, List, Tuple, Sequence, Any
from .utils import Utils
from .transformer import TransformerBase, EstimatorBase
from .model import add_ranks
from .model import add_ranks, coerce_dataframe_types
import deprecation
import ir_measures
from ir_measures.measures import BaseMeasure
Expand Down Expand Up @@ -137,6 +137,7 @@ def _run_and_evaluate(
# if its a DataFrame, use it as the results
if isinstance(system, pd.DataFrame):
res = system
res = coerce_dataframe_types(res)
if len(res) == 0:
raise ValueError("%d topics, but no results in dataframe" % len(topics))
evalMeasuresDict = _ir_measures_to_dict(
Expand All @@ -155,6 +156,8 @@ def _run_and_evaluate(
endtime = timer()
runtime = (endtime - starttime) * 1000.

res = coerce_dataframe_types(res)

if len(res) == 0:
raise ValueError("%d topics, but no results received from %s" % (len(topics), str(system)) )

Expand All @@ -176,6 +179,7 @@ def _run_and_evaluate(
raise ValueError("batch of %d topics, but no results received in batch %d from %s" % (len(batch_topics), i, str(system) ) )
endtime = timer()
runtime += (endtime - starttime) * 1000.
res = coerce_dataframe_types(res)
batch_qids = set(batch_topics.qid)
batch_qrels = qrels[qrels.query_id.isin(batch_qids)] # filter qrels down to just the qids that appear in this batch
remaining_qrel_qids.difference_update(batch_qids)
Expand Down
1 change: 1 addition & 0 deletions requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ sklearn
xgboost
ray
fastrank>=0.7.0
torch
36 changes: 35 additions & 1 deletion tests/test_model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .base import BaseTestCase
import pandas as pd
from pyterrier.model import add_ranks, FIRST_RANK, coerce_queries_dataframe, split_df
from pyterrier.model import add_ranks, FIRST_RANK, coerce_queries_dataframe, coerce_dataframe_types, split_df
import pyterrier as pt
class TestModel(BaseTestCase):

Expand Down Expand Up @@ -160,6 +160,40 @@ def test_coerce_dataframe_with_tuple(self):
result = coerce_queries_dataframe(input)
self.assertTrue(exp_result.equals(result))

def test_coerce_dataframe_types(self):
    """
    Checks that coerce_dataframe_types casts ``qid``/``docno`` to ``str`` and ``score`` to
    ``float``, tolerates missing columns, and raises ValueError for unparsable scores.
    """
    columns = ['qid', 'query', 'docno', 'score']
    with self.subTest('typical'):
        frame = pd.DataFrame([[1, 'query', 5, '1.3']], columns=columns)
        exp_result = pd.DataFrame([['1', 'query', '5', 1.3]], columns=columns)
        # sanity check: the fixture really does need coercing
        self.assertFalse(frame.equals(exp_result))
        result = coerce_dataframe_types(frame)
        pd.testing.assert_frame_equal(result, exp_result)
    with self.subTest('missing column'):
        frame = pd.DataFrame([['query', 5, '1.3']], columns=['query', 'docno', 'score'])
        exp_result = pd.DataFrame([['query', '5', 1.3]], columns=['query', 'docno', 'score'])
        self.assertFalse(frame.equals(exp_result))
        result = coerce_dataframe_types(frame)
        pd.testing.assert_frame_equal(result, exp_result)
    with self.subTest('score as integer'):
        frame = pd.DataFrame([[1, 'query', 5, 1]], columns=columns)
        exp_result = pd.DataFrame([['1', 'query', '5', 1.]], columns=columns)
        self.assertFalse(frame.equals(exp_result))
        result = coerce_dataframe_types(frame)
        pd.testing.assert_frame_equal(result, exp_result)
    with self.subTest('score as torch type'):
        # torch is only needed by this subtest, so it is imported here and nowhere else
        import torch
        frame = pd.DataFrame([[1, 'query', 5, torch.tensor(1.3)]], columns=columns)
        exp_result = pd.DataFrame([['1', 'query', '5', 1.3]], columns=columns)
        self.assertFalse(frame.equals(exp_result))
        result = coerce_dataframe_types(frame)
        pd.testing.assert_frame_equal(result, exp_result)
    with self.subTest('score not parsable as float'):
        frame = pd.DataFrame([[1, 'query', 5, 'A']], columns=columns)
        # no expected frame here: the only contract is that the cast fails loudly
        with self.assertRaises(ValueError):
            coerce_dataframe_types(frame)

def test_split_Q(self):
df = pt.new.queries(["a", "b", "c"])
dfs = split_df(df, 2)
Expand Down