Skip to content

Commit

Permalink
String dtype: restrict options.mode.string_storage to python|pyarrow …
Browse files Browse the repository at this point in the history
…(remove pyarrow_numpy) (#59376)

* String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy)

* add type annotation
  • Loading branch information
jorisvandenbossche committed Oct 9, 2024
1 parent 014c487 commit fef0ea4
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 51 deletions.
2 changes: 0 additions & 2 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1248,7 +1248,6 @@ def nullable_string_dtype(request):
params=[
"python",
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")),
]
)
def string_storage(request):
Expand All @@ -1257,7 +1256,6 @@ def string_storage(request):
* 'python'
* 'pyarrow'
* 'pyarrow_numpy'
"""
return request.param

Expand Down
22 changes: 20 additions & 2 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
from __future__ import annotations

import os
from typing import Callable
from typing import (
Any,
Callable,
)

import pandas._config.config as cf
from pandas._config.config import (
Expand Down Expand Up @@ -506,12 +509,27 @@ def use_inf_as_na_cb(key) -> None:
``future.infer_string`` is set to True.
"""


def is_valid_string_storage(value: Any) -> None:
legal_values = ["python", "pyarrow"]
if value not in legal_values:
msg = "Value must be one of python|pyarrow"
if value == "pyarrow_numpy":
# TODO: we can remove extra message after 3.0
msg += (
". 'pyarrow_numpy' was specified, but this option should be "
"enabled using pandas.options.future.infer_string instead"
)
raise ValueError(msg)


with cf.config_prefix("mode"):
cf.register_option(
"string_storage",
"python",
string_storage_doc,
validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]),
# validator=is_one_of_factory(["python", "pyarrow"]),
validator=is_valid_string_storage,
)


Expand Down
32 changes: 8 additions & 24 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,50 +516,34 @@ def test_arrow_array(dtype):
assert arr.equals(expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
# roundtrip possible from arrow 1.0.0
pa = pytest.importorskip("pyarrow")

if using_infer_string and string_storage2 != "pyarrow_numpy":
request.applymarker(
pytest.mark.xfail(
reason="infer_string takes precedence over string storage"
)
)

data = pd.array(["a", "b", None], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
if dtype.storage == "python":
assert table.field("a").type == "string"
else:
assert table.field("a").type == "large_string"
with pd.option_context("string_storage", string_storage2):
with pd.option_context("string_storage", string_storage):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage2}]")
expected = df.astype(f"string[{string_storage}]")
tm.assert_frame_equal(result, expected)
# ensure the missing value is represented by NA and not np.nan or None
assert result.loc[2, "a"] is result["a"].dtype.na_value


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_load_from_zero_chunks(
dtype, string_storage2, request, using_infer_string
):
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
# GH-41040
pa = pytest.importorskip("pyarrow")

if using_infer_string and string_storage2 != "pyarrow_numpy":
request.applymarker(
pytest.mark.xfail(
reason="infer_string takes precedence over string storage"
)
)

data = pd.array([], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
Expand All @@ -569,10 +553,10 @@ def test_arrow_load_from_zero_chunks(
assert table.field("a").type == "large_string"
# Instantiate the same table with no chunks at all
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
with pd.option_context("string_storage", string_storage2):
with pd.option_context("string_storage", string_storage):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage2}]")
expected = df.astype(f"string[{string_storage}]")
tm.assert_frame_equal(result, expected)


Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,18 @@ def test_eq_all_na():


def test_config(string_storage, request, using_infer_string):
if using_infer_string and string_storage != "pyarrow_numpy":
request.applymarker(pytest.mark.xfail(reason="infer string takes precedence"))
if string_storage == "pyarrow_numpy":
if using_infer_string and string_storage == "python":
# python string storage with na_value=NaN is not yet implemented
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))

with pd.option_context("string_storage", string_storage):
assert StringDtype().storage == string_storage
result = pd.array(["a", "b"])
assert result.dtype.storage == string_storage

dtype = StringDtype(string_storage)
dtype = StringDtype(
string_storage, na_value=np.nan if using_infer_string else pd.NA
)
expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
tm.assert_equal(result, expected)

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/frame/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -912,3 +912,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val):
with option_context("mode.string_storage", string_storage):
df.astype("string", copy=False)
tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val):
# GH#51073 - variant of the above test with explicit dtype instances
df = DataFrame({"a": ["a", "b", val]})
expected = df.copy()
df.astype(any_string_dtype)
tm.assert_frame_equal(df, expected)
5 changes: 2 additions & 3 deletions pandas/tests/frame/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@


class TestConvertDtypes:
# TODO convert_dtypes should not use NaN variant of string dtype, but always NA
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize(
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
)
Expand All @@ -18,9 +20,6 @@ def test_convert_dtypes(
):
# Specific types are tested in tests/series/test_dtypes.py
# Just check that it works for DataFrame here
if using_infer_string:
string_storage = "pyarrow_numpy"

df = pd.DataFrame(
{
"a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
Expand Down
16 changes: 0 additions & 16 deletions pandas/tests/io/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,19 +224,3 @@ def compression_format(request):
@pytest.fixture(params=_compression_formats_params)
def compression_ext(request):
return request.param[0]


@pytest.fixture(
params=[
"python",
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
]
)
def string_storage(request):
"""
Parametrized fixture for pd.options.mode.string_storage.
* 'python'
* 'pyarrow'
"""
return request.param

0 comments on commit fef0ea4

Please sign in to comment.