最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

python - Nullable ints and pandera unit testing - Stack Overflow

programmeradmin5浏览0评论

First of all, I'm using pandera==0.23.1 on Python 3.12.9.

I have been following the examples in the pandera documentation, in particular those from the "Data Synthesis Strategies" section.

Here is the sample code, adapted from the pandera documentation:

import hypothesis
import pandera as pa

# Input schema: three columns, each constrained by a Check.eq(...) check
# to a single constant value.
schema = pa.DataFrameSchema(
    {
        "column1": pa.Column(int, pa.Check.eq(10)),
        "column2": pa.Column(float, pa.Check.eq(0.25)),
        "column3": pa.Column(str, pa.Check.eq("foo")),
    }
)

# Output schema: the input schema plus a float "column4" with no checks,
# matching the column that processing_fn adds.
out_schema = schema.add_columns({"column4": pa.Column(float)})

def processing_fn(df):
    """
    Return ``df`` with an added ``column4`` column holding the element-wise
    product of ``column1`` and ``column2``.

    Left undecorated on purpose: it stands in for a function that would be
    imported from another module.
    """
    product = df["column1"] * df["column2"]
    return df.assign(column4=product)

@hypothesis.given(schema.strategy(size=5))
def test_processing_fn(dataframe):
    """
    Validate the output of processing_fn against out_schema, using
    dataframes drawn from schema.strategy(size=5).
    """
    # The call below is equivalent to decorating a thin wrapper and
    # calling that wrapper:
    #
    #   @pa.check_output(out_schema)
    #   def dec_processing_fn(df):
    #       return processing_fn(df)
    #
    #   dec_processing_fn(dataframe)
    pa.check_output(out_schema)(processing_fn)(dataframe)

This works, so I tried applying the same behaviour to my use case, which contains nullable ints. Here is my MWE:

test_test.py

import pandera as pa
import pandas as pd
import hypothesis

# Input schema for the minimal example: a single column "a" declared with
# the pandas nullable integer dtype and coerced on validation.
in_schema = pa.DataFrameSchema(
    {
        "a": pa.Column(
            pd.Int64Dtype,
            checks=[
               # NOTE(review): pd.NA is listed as an allowed value here. The
               # reported traceback shows the generated strategy as
               # sampled_from([1, 2, <NA>]).map(int64), and the TypeError is
               # raised when <NA> is passed through that map.
               pa.Check.isin([1,2,pd.NA])
            ],
            coerce=True
        )
    }
)

# Output schema: in_schema plus a column "b", declared like "a" but with
# allowed values ten times larger, matching what transform produces.
out_schema = in_schema.add_columns(
    {
        "b": pa.Column(
            pd.Int64Dtype,
            checks=[
                # pd.NA is listed explicitly, mirroring the check on "a".
                pa.Check.isin([10,20,pd.NA])
            ],
            coerce=True
        )
    }
)

def transform(df):
    """
    Return ``df`` with an added ``b`` column equal to ``a`` multiplied by 10.

    Left undecorated on purpose: it stands in for a function that would be
    imported from another module.
    """
    scaled = df["a"] * 10
    return df.assign(b=scaled)

def test_transform1():
    """
    Check transform against a hand-written input frame and the
    corresponding expected output frame.
    """
    source = pd.DataFrame({"a": [1, 2, pd.NA]})
    expected = pd.DataFrame(
        {
            "a": [1, 2, pd.NA],
            "b": [10, 20, pd.NA],
        }
    )
    actual = transform(source)
    pd.testing.assert_frame_equal(expected, actual)

This test passes, but in practice my dataframes and checks are more complex, and I have many more functions to test, so instead of crafting a dataframe for each test case I want to use schemas. I therefore write a second test:

@hypothesis.given(in_schema.strategy(size=5))
def test_transform2(df):
    """
    Validate the output of transform against out_schema, using dataframes
    drawn from in_schema.strategy(size=5).

    This test is expected to pass but currently fails: per the reported
    traceback, a TypeError is raised while hypothesis is generating ``df``,
    i.e. before the test body runs.
    """
    pa.check_output(out_schema)(transform)(df)

This fails with the following error trace:

tests/test_test.py:43 (test_transform2)
@hypothesis.given(in_schema.strategy(size=5))
>   def test_transform2(df):

test_test.py:45: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/engine.py:789: in run
    self._run()
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/engine.py:1344: in _run
    self.generate_new_examples()
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/engine.py:1100: in generate_new_examples
    self.test_function(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/engine.py:451: in test_function
    self.__stoppable_test_function(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/engine.py:344: in __stoppable_test_function
    self._test_function(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/core.py:1091: in _execute_once_for_engine
    result = self.execute_once(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/core.py:1028: in execute_once
    result = self.test_runner(data, run)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/core.py:729: in default_executor
    return function(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/core.py:939: in run
    kw, argslices = context.prep_args_kwargs_from_strategies(
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/control.py:170: in prep_args_kwargs_from_strategies
    obj = check(self.data.draw(s, observe_as=f"generate:{k}"))
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1114: in draw
    v = strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/lazy.py:178: in do_draw
    return data.draw(self.wrapped_strategy)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
    return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/core.py:1821: in do_draw
    return self.definition(data.draw, *self.args, **self.kwargs)
../../../venv/3.12/lib/python3.12/site-packages/pandera/strategies/pandas_strategies.py:1179: in _dataframe_strategy
    return draw(strategy)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
    return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/lazy.py:178: in do_draw
    return data.draw(self.wrapped_strategy)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
    return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/lazy.py:178: in do_draw
    return data.draw(self.wrapped_strategy)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
    return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/strategies.py:915: in do_draw
    x = data.draw(self.mapped_strategy)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
    return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/core.py:1821: in do_draw
    return self.definition(data.draw, *self.args, **self.kwargs)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/extra/pandas/impl.py:639: in just_draw_columns
    value = draw(c.elements)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/internal/conjecture/data.py:1108: in draw
    return strategy.do_draw(self)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/strategies.py:607: in do_draw
    result = self.do_filtered_draw(data)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/strategies.py:634: in do_filtered_draw
    element = self.get_element(i)
../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/strategies.py:622: in get_element
    return self._transform(self.elements[i])
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = sampled_from([1, 2, <NA>]).map(int64).map(convert_element)
element = <NA>

    def _transform(
        self,
        # , we're not writing `element`
        # anywhere in the class so this is still type-safe. mypy is being more
        # conservative than necessary
        element: Ex,  # type: ignore
    ) -> Union[Ex, UniqueIdentifier]:
        # Used in UniqueSampledListStrategy
        for name, f in self._transformations:
            if name == "map":
>               result = f(element)
E               TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NAType'
E               while generating 'df' from _dataframe_strategy()

../../../venv/3.12/lib/python3.12/site-packages/hypothesis/strategies/_internal/strategies.py:596: TypeError

So the issue is that in_schema.strategy tries to cast my values to int instead of the required pandas.Int64Dtype, which is nullable. I tried the string alias "Int64" instead of the explicit type, but it gave the same result. I also tried removing nullable=True and coerce=True, to no avail.


Other things I tried include SO:71395580. The issue in SO:78407951 is already fixed in my version. I also checked multiple open and closed GitHub issues; the closest one I could find is #1903, but after investigating I'm unsure whether my issue is caused by that bug.

发布评论

评论列表(0)

  1. 暂无评论