First of all, I'm using pandera==0.23.1 on Python 3.12.9.
I have been following the examples in the pandera documentation, in particular the ones from the Data Synthesis Strategies section.
Here is the sample code, adapted from the pandera docs:
import hypothesis
import pandera as pa
# Input schema: each column is pinned to one exact value, so generated
# data is fully predictable.
schema = pa.DataFrameSchema(
    columns={
        "column1": pa.Column(int, checks=pa.Check.eq(10)),
        "column2": pa.Column(float, checks=pa.Check.eq(0.25)),
        "column3": pa.Column(str, checks=pa.Check.eq("foo")),
    },
)

# Output schema: the input schema plus the float column that
# processing_fn is expected to add.
out_schema = schema.add_columns(
    {
        "column4": pa.Column(float),
    }
)
def processing_fn(df):
    """Return a new dataframe with ``column4 = column1 * column2`` added.

    This function is deliberately left undecorated: it stands in for a
    function that would be imported from another module, so the schema
    check has to be applied at the call site.
    """
    return df.assign(column4=df.column1 * df.column2)
@hypothesis.given(schema.strategy(size=5))
def test_processing_fn(dataframe):
    """Validate the output of ``processing_fn`` against ``out_schema``."""
    # Wrapping at call time is equivalent to decorating a thin wrapper
    # and calling it:
    #
    #     @pa.check_output(out_schema)
    #     def dec_processing_fn(df):
    #         return processing_fn(df)
    #
    #     dec_processing_fn(dataframe)
    pa.check_output(out_schema)(processing_fn)(dataframe)
This works, so I tried applying the same behaviour to my use case, which contains nullable ints. Here is my MWE:
test_test.py
import pandera as pa
import pandas as pd
import hypothesis
# Input schema: a single nullable-integer column "a" restricted to
# 1, 2 or a missing value.
in_schema = pa.DataFrameSchema(
    columns={
        "a": pa.Column(
            pd.Int64Dtype,
            checks=pa.Check.isin([1, 2, pd.NA]),
            coerce=True,
        ),
    },
)

# Output schema: the input schema plus column "b", restricted to
# 10, 20 or a missing value.
out_schema = in_schema.add_columns(
    {
        "b": pa.Column(
            pd.Int64Dtype,
            checks=pa.Check.isin([10, 20, pd.NA]),
            coerce=True,
        ),
    }
)
def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new dataframe with column ``b = a * 10`` added.

    This function is deliberately left undecorated: it stands in for a
    function that would be imported from another module, so the schema
    check has to be applied at the call site.
    """
    return df.assign(b=df["a"] * 10)
def test_transform1():
    """Check ``transform`` against a hand-written sample dataframe."""
    df_in = pd.DataFrame({"a": [1, 2, pd.NA]})
    df_out = pd.DataFrame(
        {
            "a": [1, 2, pd.NA],
            "b": [10, 20, pd.NA],
        }
    )
    pd.testing.assert_frame_equal(df_out, transform(df_in))
This test passes, but in practice my dataframes and checks are more complex, and I have many more functions to test, so instead of crafting a dataframe for each test case I want to use schemas. I therefore wrote a second test:
@hypothesis.given(in_schema.strategy(size=5))
def test_transform2(df):
    """Validate the output of ``transform`` against ``out_schema``.

    This test should pass and doesn't.
    """
    pa.check_output(out_schema)(transform)(df)
This fails with the following error trace:
tests/test_test.py:43 (test_transform2)
@hypothesis.given(in_schema.strategy(size=5))
> def test_transform2(df):
test_test.py:45:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/engine.py:789: in run
self._run()
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/engine.py:1344: in _run
self.generate_new_examples()
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/engine.py:1100: in generate_new_examples
self.test_function(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/engine.py:451: in test_function
self.__stoppable_test_function(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/engine.py:344: in __stoppable_test_function
self._test_function(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/core.py:1091: in _execute_once_for_engine
result = self.execute_once(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/core.py:1028: in execute_once
result = self.test_runner(data, run)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/core.py:729: in default_executor
return function(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/core.py:939: in run
kw, argslices = context.prep_args_kwargs_from_strategies(
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/control.py:170: in prep_args_kwargs_from_strategies
obj = check(self.data.draw(s, observe_as=f"generate:{k}"))
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1114: in draw
v = strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/lazy.py:178: in do_draw
return data.draw(self.wrapped_strategy)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/core.py:1821: in do_draw
return self.definition(data.draw, *self.args, **self.kwargs)
../../../venv/3.12/lib/python3.12/site-packages/pandera/strategies/pandas_strategies.py:1179: in _dataframe_strategy
return draw(strategy)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/lazy.py:178: in do_draw
return data.draw(self.wrapped_strategy)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/lazy.py:178: in do_draw
return data.draw(self.wrapped_strategy)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/strategies.py:915: in do_draw
x = data.draw(self.mapped_strategy)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/core.py:1821: in do_draw
return self.definition(data.draw, *self.args, **self.kwargs)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/extra/pandas/impl.py:639: in just_draw_columns
value = draw(c.elements)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/strategies.py:607: in do_draw
result = self.do_filtered_draw(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/strategies.py:634: in do_filtered_draw
element = self.get_element(i)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/strategies.py:622: in get_element
return self._transform(self.elements[i])
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = sampled_from([1, 2, <NA>]).map(int64).map(convert_element)
element = <NA>
def _transform(
self,
# , we're not writing `element`
# anywhere in the class so this is still type-safe. mypy is being more
# conservative than necessary
element: Ex, # type: ignore
) -> Union[Ex, UniqueIdentifier]:
# Used in UniqueSampledListStrategy
for name, f in self._transformations:
if name == "map":
> result = f(element)
E TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NAType'
E while generating 'df' from _dataframe_strategy()
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/strategies.py:596: TypeError
So the issue is that in_schema.strategy tries to cast my values to int instead of the required pandas.Int64Dtype, which is nullable.
I tried the string alias "Int64" instead of the explicit type, but it gave the same result.
I also tried removing nullable=True and coerce=True, to no avail.
Here is what I have already tried or checked: SO:71395580; the issue in SO:78407951, which is fixed in my version; and multiple open and closed GitHub issues. The closest one I could find is #1903, but after investigating I'm unsure that my issue is caused by this bug.