Split convert observed data #7334

Merged
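For orientation, a minimal sketch of the API after this split, inferred from the diffs below: `convert_observed_data` remains as a dispatcher, array-like inputs go through the new `convert_data`, and generator inputs go through `convert_generator_data`, which now emits a `DeprecationWarning`. This is a sketch of expected behavior, not code from the PR itself.

import warnings

import numpy as np

from pymc.pytensorf import convert_data, convert_observed_data

# Array-like data dispatches to convert_data and keeps its dtype family:
# int stays int (as intX), float is normalized to floatX.
assert "int" in str(convert_observed_data(np.arange(5)).dtype)
assert convert_data(np.arange(5)).dtype == convert_observed_data(np.arange(5)).dtype

# Generators dispatch to convert_generator_data, which is now deprecated.
gen = (np.array([i], dtype="float64") for i in range(3))
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    convert_observed_data(gen)
assert any(issubclass(w.category, DeprecationWarning) for w in caught)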
4 changes: 2 additions & 2 deletions docs/source/api/pytensorf.rst
@@ -15,10 +15,10 @@ PyTensor utils
cont_inputs
floatX
intX
smartfloatX
constant_fold
CallableTensor
join_nonshared_inputs
make_shared_replacements
generator
convert_observed_data
convert_generator_data
convert_data
19 changes: 13 additions & 6 deletions pymc/data.py
@@ -37,7 +37,8 @@

import pymc as pm

from pymc.pytensorf import convert_observed_data
from pymc.pytensorf import convert_data, smarttypeX
from pymc.vartypes import isgenerator

__all__ = [
"get_data",
@@ -98,7 +99,7 @@ def make_variable(self, gop, name=None):
def __init__(self, generator):
if not pm.vartypes.isgenerator(generator):
raise TypeError("Object should be generator like")
self.test_value = pm.smartfloatX(copy(next(generator)))
self.test_value = smarttypeX(copy(next(generator)))
# make pickling potentially possible
self._yielded_test_value = False
self.gen = generator
@@ -110,7 +111,7 @@ def __next__(self):
self._yielded_test_value = True
return self.test_value
else:
return pm.smartfloatX(copy(next(self.gen)))
return smarttypeX(copy(next(self.gen)))

# python2 generator
next = __next__
@@ -403,9 +404,15 @@ def Data(
)
name = model.name_for(name)

# `convert_observed_data` takes care of parameter `value` and
# transforms it to something digestible for PyTensor.
arr = convert_observed_data(value)
# Transform `value` into something digestible for PyTensor.
if isgenerator(value):
raise NotImplementedError(
"Generator type data is no longer supported with pm.Data.",
# It messes up InferenceData and can't be the input to a SharedVariable.
)
else:
arr = convert_data(value)

if isinstance(arr, np.ma.MaskedArray):
raise NotImplementedError(
"Masked arrays or arrays with `nan` entries are not supported. "
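A hedged sketch of the new `pm.Data` behavior implemented in the hunk above: array-like values keep working through `convert_data`, while generators now raise immediately instead of being wrapped in a generator Op.

import numpy as np
import pymc as pm

with pm.Model():
    # Array-like values are converted via convert_data, as before.
    x = pm.Data("x", np.array([1.0, 2.0, 3.0]))

    # Generators are now rejected up front; previously they were wrapped
    # in a generator Op, which broke InferenceData and SharedVariables.
    gen = (np.array([i], dtype="float64") for i in range(10))
    try:
        pm.Data("g", gen)
    except NotImplementedError as err:
        print(err)  # Generator type data is no longer supported with pm.Data.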
42 changes: 30 additions & 12 deletions pymc/pytensorf.py
@@ -67,23 +67,37 @@
"cont_inputs",
"floatX",
"intX",
"smartfloatX",
"jacobian",
"CallableTensor",
"join_nonshared_inputs",
"make_shared_replacements",
"generator",
"convert_data",
"convert_generator_data",
"convert_observed_data",
"compile_pymc",
]


def convert_observed_data(data) -> np.ndarray | Variable:
"""Convert user provided dataset to accepted formats."""

if isgenerator(data):
return floatX(generator(data))
return convert_generator_data(data)
return convert_data(data)


def convert_generator_data(data) -> TensorVariable:
warnings.warn(
"Generator data is deprecated and we intend to remove it."
" If you disagree and need this, please get in touch via https://github.com/pymc-devs/pymc/issues.",
DeprecationWarning,
stacklevel=2,
)
return generator(data)


def convert_data(data) -> np.ndarray | Variable:
ret: np.ndarray | Variable
if hasattr(data, "to_numpy") and hasattr(data, "isnull"):
# typically, but not limited to pandas objects
vals = data.to_numpy()
@@ -122,16 +136,12 @@ def convert_observed_data(data) -> np.ndarray | Variable:
else:
ret = np.asarray(data)

# type handling to enable index variables when data is int:
if hasattr(data, "dtype"):
if "int" in str(data.dtype):
return intX(ret)
# otherwise, assume float:
else:
return floatX(ret)
# needed for uses of this function other than with pm.Data:
else:
# Data without dtype info is converted to float arrays by default.
# This is the most common case for simple examples.
if not hasattr(data, "dtype"):
return floatX(ret)
# Otherwise we only convert the precision.
return smarttypeX(ret)


@_as_tensor_variable.register(pd.Series)
@@ -297,6 +307,14 @@ def smartfloatX(x):
return x


def smarttypeX(x):
if str(x.dtype).startswith("float"):
x = floatX(x)
elif str(x.dtype).startswith("int"):
x = intX(x)
return x


"""
PyTensor derivative functions
"""
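A sketch of the dtype rules introduced above, assuming the default PyTensor configuration: unlike `smartfloatX`, which coerced everything to floats, the internal `smarttypeX` helper only normalizes precision within a dtype family, and `convert_data` relies on it so integer data stays integer.

import numpy as np
import pytensor

from pymc.pytensorf import convert_data, intX, smarttypeX

# Integer data keeps an integer dtype, normalized to pymc's intX...
ints = smarttypeX(np.array([1, 2], dtype="int64"))
assert ints.dtype == intX(np.zeros(1, dtype="int64")).dtype

# ...and float data is normalized to pytensor.config.floatX.
floats = smarttypeX(np.array([1.0], dtype="float32"))
assert str(floats.dtype) == pytensor.config.floatX

# Dtypes outside the int/float families (e.g. bool) pass through unchanged.
assert smarttypeX(np.array([True, False])).dtype == np.dtype(bool)

# convert_data follows the same rule, so integer data stays usable for
# index variables instead of being cast to float as smartfloatX would do.
assert "int" in str(convert_data(np.arange(3)).dtype)

# Inputs without dtype information (plain lists) default to floatX.
assert str(convert_data([1.0, 2.0]).dtype) == pytensor.config.floatX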
48 changes: 31 additions & 17 deletions tests/test_pytensorf.py
@@ -38,16 +38,19 @@
from pymc.exceptions import NotConstantValueError
from pymc.logprob.utils import ParameterValueError
from pymc.pytensorf import (
GeneratorOp,
collect_default_updates,
compile_pymc,
constant_fold,
convert_observed_data,
convert_data,
convert_generator_data,
extract_obs_data,
hessian,
hessian_diag,
replace_rng_nodes,
replace_vars_in_graphs,
reseed_rngs,
smarttypeX,
walk_model,
)
from pymc.vartypes import int_types
@@ -188,9 +191,9 @@ def test_extract_obs_data():


@pytest.mark.parametrize("input_dtype", ["int32", "int64", "float32", "float64"])
def test_convert_observed_data(input_dtype):
def test_convert_data(input_dtype):
"""
Ensure that convert_observed_data returns the dense array, masked array,
Ensure that convert_data returns the dense array, masked array,
graph variable, TensorVariable, or sparse matrix as appropriate.
"""
# Create the various inputs to the function
@@ -206,12 +209,8 @@ def test_convert_data(input_dtype):
missing_pandas_input = pd.DataFrame(missing_numpy_input)
masked_array_input = ma.array(dense_input, mask=(np.mod(dense_input, 2) == 0))

# Create a generator object. Apparently the generator object needs to
# yield numpy arrays.
square_generator = (np.array([i**2], dtype=int) for i in range(100))

# Alias the function to be tested
func = convert_observed_data
func = convert_data

#####
# Perform the various tests
@@ -255,21 +254,36 @@
else:
assert pytensor_output.dtype == intX

# Check function behavior with generator data
generator_output = func(square_generator)

# Output is wrapped with `pm.floatX`, and this unwraps
wrapped = generator_output.owner.inputs[0]
# Make sure the returned object has .set_gen and .set_default methods
assert hasattr(wrapped, "set_gen")
assert hasattr(wrapped, "set_default")
@pytest.mark.parametrize("input_dtype", ["int32", "int64", "float32", "float64"])
def test_convert_generator_data(input_dtype):
# Create a generator object producing NumPy arrays with the intended dtype.
# This is required to infer the correct dtype.
square_generator = (np.array([i**2], dtype=input_dtype) for i in range(100))

# Output is NOT wrapped with `pm.floatX`/`intX`,
# but produced from calling a special Op.
with pytest.warns(DeprecationWarning, match="get in touch"):
result = convert_generator_data(square_generator)
apply = result.owner
op = apply.op
# Make sure the returned object is a PyTensor TensorVariable
assert isinstance(wrapped, TensorVariable)
assert isinstance(result, TensorVariable)
assert isinstance(op, GeneratorOp), f"It's a {type(op)}"
# There are no inputs - because it generates...
assert apply.inputs == []

# Evaluation results should have the correct* dtype!
# (*intX/floatX will be enforced!)
evaled = result.eval()
expected_dtype = smarttypeX(np.array(1, dtype=input_dtype)).dtype
assert result.type.dtype == expected_dtype
assert evaled.dtype == np.dtype(expected_dtype)


def test_pandas_to_array_pandas_index():
data = pd.Index([1, 2, 3])
result = convert_observed_data(data)
result = convert_data(data)
expected = np.array([1, 2, 3])
np.testing.assert_array_equal(result, expected)
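The pandas branch of `convert_data` (the `to_numpy`/`isnull` duck-typing near the top of the pytensorf.py diff) is what the tests above exercise; a hedged sketch of its handling of missing values:

import numpy as np
import pandas as pd

from pymc.pytensorf import convert_data

# Complete pandas data comes back as a plain ndarray...
complete = convert_data(pd.Series([1.0, 2.0, 3.0]))
assert isinstance(complete, np.ndarray)

# ...while missing entries should yield a masked array, mirroring the
# missing_pandas_input case in test_convert_data above.
missing = convert_data(pd.Series([1.0, np.nan, 3.0]))
assert isinstance(missing, np.ma.MaskedArray)
assert missing.mask.sum() == 1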
