No coerce, pt.Transformer #258

Merged: 12 commits, Dec 20, 2021
36 changes: 28 additions & 8 deletions docs/transformer.rst
@@ -5,12 +5,12 @@ PyTerrier Transformers

PyTerrier's retrieval architecture is based on three concepts:

- dataframes with pre-defined types (each with a minimum set of known attributes), as detailed in the data amodel.
- dataframes with pre-defined types (each with a minimum set of known attributes), as detailed in the data model.
- the *transformation* of those dataframes by standard information retrieval operations, defined as transformers.
- the composition of transformers, supported by the operators defined on transformers.

In essence, a PyTerrier transformer is a class with a ``transform()`` method, which takes as input a dataframe, and changes it,
before returning it.
before returning it.

+-------+---------+-------------+------------------+------------------------------+
+ Input | Output | Cardinality | Example | Concrete Transformer Example |
@@ -47,16 +47,16 @@ estimators within that pipeline.
Transformer base classes
========================

TransformerBase
Transformer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This class is the base class for all transformers.

.. autoclass:: pyterrier.transformer.TransformerBase
.. autoclass:: pyterrier.Transformer
:members:

Moreover, by extending TransformerBase, all transformer implementations gain the necessary "dunder" methods (e.g. `__rshift__()`)
to support the transformer operators (`>>`, `+` etc).
Moreover, by extending Transformer, all transformer implementations gain the necessary "dunder" methods (e.g. ``__rshift__()``)
to support the transformer operators (`>>`, `+` etc). NB: This class used to be called ``pyterrier.transformer.TransformerBase``.
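
For instance, classes that extend ``Transformer`` can be combined into new pipelines using these operators. A minimal sketch, assuming ``bm25`` and ``pl2`` are already-constructed transformer instances (e.g. created by ``pt.BatchRetrieve``)::

    candidates = bm25 % 100         # rank cutoff: keep the top 100 documents per query
    fused = bm25 + pl2              # linear combination (CombSum) of the two scorings
    pipeline = candidates >> fused  # sequential composition with >>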

.. _pt.transformer.estimatorbase:

@@ -129,10 +129,30 @@ Several common transformations are supported through the functions in the :ref:`
:ref:`pyterrier.apply` documentation.

However, if your transformer has state, such as an expensive model to be loaded at startup time, you may want to
extend TransformerBase directly.
extend ``pt.Transformer`` directly.

Here are some hints for writing Transformers (a minimal sketch follows this list):
- Unless your transformer is an indexer, you should implement a ``transform()`` method.
- If your approach ranks results, use ``pt.model.add_ranks()`` to add the rank column.
- If your approach can be trained, your transformer should extend EstimatorBase, and implement the ``fit()`` method.
- If your approach is an indexer, your transformer should extend IterDictIndexerBase and implement the ``index()`` method.
- If your approach is an indexer, your transformer should extend IterDictIndexerBase and implement the ``index()`` method.
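
A minimal sketch illustrating these hints (hypothetical, and assuming the input dataframe carries a ``text`` column for each retrieved document)::

    import pandas as pd
    import pyterrier as pt

    class TextLengthScorer(pt.Transformer):
        """Hypothetical scorer: ranks documents by the length of their text."""
        def transform(self, topics_or_res: pd.DataFrame) -> pd.DataFrame:
            res = topics_or_res.copy()
            res["score"] = res["text"].str.len().astype(float)
            # add_ranks() (re)computes the rank column from the new scores
            return pt.model.add_ranks(res)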


Mocking Transformers from DataFrames
====================================

You can make a Transformer object from dataframes. For instance, a uniform transformer will always return the input
dataframe any time ``transform()`` is called::

df = pt.new.ranked_documents([[1,2]])
uniformT = pt.Transformer.from_df(df, uniform=True)
# uniformT.transform() always returns df, regardless of arguments

You can also create a Transformer object from existing results, e.g. saved on disk using ``pt.io.write_results()``
etc. The resulting "source transformer" will return all results by matching on the qid of the input::

res = pt.io.read_results("/path/to/baseline.res.gz")
baselineT = pt.Transformer.from_df(res)

Q1 = pt.new.queries("test query", qid="Q1")
resQ1 = baselineT.transform(Q1)
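
Such a source transformer can be evaluated like any other transformer, for example (a sketch, assuming ``topics`` and ``qrels`` dataframes are available, e.g. from a PyTerrier dataset)::

    pt.Experiment([baselineT], topics, qrels, ["map"])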
4 changes: 4 additions & 0 deletions pyterrier/__init__.py
@@ -1,6 +1,7 @@
__version__ = "0.8.0-alpha"

import os

from .bootstrap import _logging, setup_terrier, setup_jnius, is_windows

import importlib
@@ -21,6 +22,7 @@
rewrite = None
text = None
transformer = None
Transformer = None

file_path = os.path.dirname(os.path.abspath(__file__))
firstInit = False
@@ -139,6 +141,7 @@ def init(version=None, mem=None, packages=[], jvm_opts=[], redirect_io=True, log
from .datasets import get_dataset, find_datasets, list_datasets
from .index import Indexer, FilesIndexer, TRECCollectionIndexer, DFIndexer, DFIndexUtils, IterDictIndexer, FlatJSONDocumentIterator, IndexingType
from .pipelines import Experiment, GridScan, GridSearch, KFoldGridSearch
from .transformer import Transformer

# Make imports global
globals()["autoclass"] = autoclass
@@ -190,6 +193,7 @@ def init(version=None, mem=None, packages=[], jvm_opts=[], redirect_io=True, log
globals()["GridScan"] = GridScan
globals()["GridSearch"] = GridSearch
globals()["KFoldGridSearch"] = KFoldGridSearch
globals()["Transformer"] = Transformer


# we save the pt.init() arguments so that other processes,
16 changes: 8 additions & 8 deletions pyterrier/apply.py
@@ -1,5 +1,5 @@
from typing import Callable, Any, Dict
from .transformer import ApplyDocumentScoringTransformer, ApplyQueryTransformer, ApplyDocFeatureTransformer, ApplyForEachQuery, ApplyGenericTransformer, TransformerBase
from .transformer import ApplyDocumentScoringTransformer, ApplyQueryTransformer, ApplyDocFeatureTransformer, ApplyForEachQuery, ApplyGenericTransformer, Transformer
from nptyping import NDArray
import numpy as np
import pandas as pd
@@ -16,7 +16,7 @@ def _bind(instance, func, as_name=None):
setattr(instance, as_name, bound_method)
return bound_method

def query(fn : Callable[..., str], *args, **kwargs) -> TransformerBase:
def query(fn : Callable[..., str], *args, **kwargs) -> Transformer:
"""
Create a transformer that takes as input a query, and applies a supplied function to compute a new query formulation.

@@ -58,7 +58,7 @@ def _remove_stops(q):
"""
return ApplyQueryTransformer(fn, *args, **kwargs)

def doc_score(fn : Callable[..., float], *args, **kwargs) -> TransformerBase:
def doc_score(fn : Callable[..., float], *args, **kwargs) -> Transformer:
"""
Create a transformer that takes as input a ranked documents dataframe, and applies a supplied function to compute a new score.
Ranks are automatically computed.
@@ -79,7 +79,7 @@ def doc_score(fn : Callable[..., float], *args, **kwargs) -> TransformerBase:
"""
return ApplyDocumentScoringTransformer(fn, *args, **kwargs)

def doc_features(fn : Callable[..., NDArray[Any]], *args, **kwargs) -> TransformerBase:
def doc_features(fn : Callable[..., NDArray[Any]], *args, **kwargs) -> Transformer:
"""
Create a transformer that takes as input a ranked documents dataframe, and applies the supplied function to each document to compute feature scores.

@@ -108,7 +108,7 @@ def _features(row):
"""
return ApplyDocFeatureTransformer(fn, *args, **kwargs)

def rename(columns : Dict[str,str], *args, **kwargs):
def rename(columns : Dict[str,str], *args, **kwargs) -> Transformer:
"""
Creates a transformer that renames columns in a dataframe.

@@ -121,7 +121,7 @@ def rename(columns : Dict[str,str], *args, **kwargs):
"""
return ApplyGenericTransformer(lambda df: df.rename(columns=columns), *args, **kwargs)

def generic(fn : Callable[[pd.DataFrame], pd.DataFrame], *args, **kwargs) -> TransformerBase:
def generic(fn : Callable[[pd.DataFrame], pd.DataFrame], *args, **kwargs) -> Transformer:
"""
Create a transformer that changes the input dataframe to another dataframe in an unspecified way.

@@ -142,7 +142,7 @@ def generic(fn : Callable[[pd.DataFrame], pd.DataFrame], *args, **kwargs) -> Tra
"""
return ApplyGenericTransformer(fn, *args, **kwargs)

def by_query(fn : Callable[[pd.DataFrame], pd.DataFrame], *args, **kwargs) -> TransformerBase:
def by_query(fn : Callable[[pd.DataFrame], pd.DataFrame], *args, **kwargs) -> Transformer:
"""
As `pt.apply.generic()` except that fn receives a dataframe for one query at at time, rather than all results at once.
"""
@@ -162,7 +162,7 @@ def __getattr__(self, item):
from functools import partial
return partial(generic_apply, item)

def generic_apply(name, *args, drop=False, **kwargs) -> TransformerBase:
def generic_apply(name, *args, drop=False, **kwargs) -> Transformer:
if drop:
return ApplyGenericTransformer(lambda df : df.drop(name, axis=1), *args, **kwargs)

57 changes: 39 additions & 18 deletions pyterrier/transformer.py
@@ -6,7 +6,7 @@
from .model import add_ranks
from . import tqdm
import deprecation
from typing import Iterable, Iterator
from typing import Iterable, Iterator, Union

LAMBDA = lambda:0
def is_lambda(v):
@@ -31,10 +31,13 @@ def get_transformer(v):
if is_transformer(v):
return v
if is_lambda(v):
warn('Coercion of a lambda into a transformer is deprecated; use a pt.apply instead')
return ApplyGenericTransformer(v)
if is_function(v):
warn('Coercion of a function into a transformer is deprecated; use a pt.apply instead')
return ApplyGenericTransformer(v)
if isinstance(v, pd.DataFrame):
warn('Coercion of a dataframe into a transformer is deprecated; use a pt.Transformer.from_df() instead')
return SourceTransformer(v)
raise ValueError("Passed parameter %s of type %s cannot be coerced into a transformer" % (str(v), type(v)))

@@ -111,14 +114,29 @@ def __init__(self, name, value):
super().__init__(name)
self.value = value

class TransformerBase:
name = "TransformerBase"
class Transformer:
name = "Transformer"
"""
Base class for all transformers. Implements the various operators ``>>`` ``+`` ``*`` ``|`` ``&``
as well as ``search()`` for executing a single query and ``compile()`` for rewriting complex pipelines into simpler ones.
"""

def transform(self, topics_or_res):
@staticmethod
def from_df(input : pd.DataFrame, uniform=False) -> 'Transformer':
"""
Instantiates a transformer from an input dataframe. Rows from the input dataframe are returned
in response to each call of the ``transform()`` method. Depending on the value of `uniform`, the dataframe
passed as an argument to ``transform()`` can affect which rows are selected.

If `uniform` is True, input will be returned in its entirety each time.
If `uniform` is False, only rows of input that match the qid values of the argument dataframe are returned.

"""
if uniform:
return UniformTransformer(input)
return SourceTransformer(input)

def transform(self, topics_or_res : pd.DataFrame) -> pd.DataFrame:
"""
Abstract method for all transformations. Typically takes as input a Pandas
DataFrame, and also returns one.
@@ -200,7 +218,7 @@ def search(self, query : str, qid : str = "1", sort=True) -> pd.DataFrame:
rtr = rtr.sort_values(["qid", "rank"], ascending=[True,True])
return rtr

def compile(self):
def compile(self) -> 'Transformer':
"""
Rewrites this pipeline by applying of the Matchpy rules in rewrite_rules. Pipeline
optimisation is discussed in the `ICTIR 2020 paper on PyTerrier <https://arxiv.org/abs/2007.14271>`_.
@@ -210,7 +228,7 @@ def compile(self):
print("Applying %d rules" % len(rewrite_rules))
return replace_all(self, rewrite_rules)

def parallel(self, N : int, backend='joblib'):
def parallel(self, N : int, backend='joblib') -> 'Transformer':
"""
Returns a parallelised version of this transformer. The underlying transformer must be "picklable".

@@ -244,52 +262,55 @@ def set_parameter(self, name : str, value):
raise ValueError(('Invalid parameter name %s for transformer %s. '+
'Check the list of available parameters') %(name, str(self)))

def __call__(self, *args, **kwargs):
def __call__(self, *args, **kwargs) -> pd.DataFrame:
"""
Sets up a default method for every transformer, which is aliased to transform().
"""
return self.transform(*args, **kwargs)

def __rshift__(self, right):
def __rshift__(self, right) -> 'Transformer':
return ComposedPipeline(self, right)

def __rrshift__(self, left):
def __rrshift__(self, left) -> 'Transformer':
return ComposedPipeline(left, self)

def __add__(self, right):
def __add__(self, right : 'Transformer') -> 'Transformer':
return CombSumTransformer(self, right)

def __pow__(self, right):
def __pow__(self, right : 'Transformer') -> 'Transformer':
return FeatureUnionPipeline(self, right)

def __mul__(self, rhs):
def __mul__(self, rhs : Union[float,int]) -> 'Transformer':
assert isinstance(rhs, int) or isinstance(rhs, float)
return ScalarProductTransformer(self, rhs)

def __rmul__(self, lhs):
def __rmul__(self, lhs : Union[float,int]) -> 'Transformer':
assert isinstance(lhs, int) or isinstance(lhs, float)
return ScalarProductTransformer(self, lhs)

def __or__(self, right):
def __or__(self, right : 'Transformer') -> 'Transformer':
return SetUnionTransformer(self, right)

def __and__(self, right):
def __and__(self, right : 'Transformer') -> 'Transformer':
return SetIntersectionTransformer(self, right)

def __mod__(self, right):
def __mod__(self, right : 'Transformer') -> 'Transformer':
assert isinstance(right, int)
return RankCutoffTransformer(self, right)

def __xor__(self, right):
def __xor__(self, right : 'Transformer') -> 'Transformer':
return ConcatenateTransformer(self, right)

def __invert__(self):
def __invert__(self : 'Transformer') -> 'Transformer':
from .cache import ChestCacheTransformer
return ChestCacheTransformer(self)

def __hash__(self):
return hash(repr(self))

class TransformerBase(Transformer):
pass

class IterDictIndexerBase(TransformerBase):
def index(self, iter : Iterable[dict], **kwargs):
"""
5 changes: 2 additions & 3 deletions tests/test_experiment.py
@@ -156,7 +156,7 @@ def test_save(self):

def test_empty(self):
df1 = pt.new.ranked_documents([[1]]).head(0)
t1 = pt.transformer.SourceTransformer(df1)
t1 = pt.Transformer.from_df(df1)

topics = pt.datasets.get_dataset("vaswani").get_topics().head(10)
qrels = pt.datasets.get_dataset("vaswani").get_qrels()
@@ -212,9 +212,8 @@ def test_differing_order(self):
res1 = pd.DataFrame([ ["q2", "d1", 2.0], ["q1", "d1", 1.0],], columns=["qid", "docno", "score"])
res2 = pd.DataFrame([["q1", "d1", 1.0], ["q2", "d1", 2.0] ], columns=["qid", "docno", "score"])
qrels = pd.DataFrame([["q1", "d1", 1], ["q2", "d3", 1] ], columns=["qid", "docno", "label"])
from pyterrier.transformer import UniformTransformer
measures = pt.Experiment(
[UniformTransformer(res1), UniformTransformer(res2)],
[pt.Transformer.from_df(res1, uniform=True), pt.Transformer.from_df(res2, uniform=True)],
topics,
qrels,
["map"],