pruned venvs

This commit is contained in:
d3m1g0d
2019-03-12 21:56:25 +01:00
parent 8ee094481c
commit 33f0511081
4095 changed files with 0 additions and 748399 deletions
@@ -1,52 +0,0 @@
"""Base test suite for extension arrays.
These tests are intended for third-party libraries to subclass to validate
that their extension arrays and dtypes satisfy the interface. Moving or
renaming the tests should not be done lightly.
Libraries are expected to implement a few pytest fixtures to provide data
for the tests. The fixtures may be located in either
* The same module as your test class.
* A ``conftest.py`` in the same directory as your test class.
The full list of fixtures may be found in the ``conftest.py`` next to this
file.
.. code-block:: python
import pytest
from pandas.tests.extension.base import BaseDtypeTests
@pytest.fixture
def dtype():
return MyDtype()
class TestMyDtype(BaseDtypeTests):
pass
Your class ``TestMyDtype`` will inherit all the tests defined on
``BaseDtypeTests``. pytest's fixture discovery will supply your ``dtype``
wherever the test requires it. You're free to implement additional tests.
All the tests in these modules use ``self.assert_frame_equal`` or
``self.assert_series_equal`` for dataframe or series comparisons. By default,
they use the usual ``pandas.testing.assert_frame_equal`` and
``pandas.testing.assert_series_equal``. You can override the checks used
by defining the staticmethods ``assert_frame_equal`` and
``assert_series_equal`` on your base test class.
"""
from .casting import BaseCastingTests # noqa
from .constructors import BaseConstructorsTests # noqa
from .dtype import BaseDtypeTests # noqa
from .getitem import BaseGetitemTests # noqa
from .groupby import BaseGroupbyTests # noqa
from .interface import BaseInterfaceTests # noqa
from .methods import BaseMethodsTests # noqa
from .missing import BaseMissingTests # noqa
from .reshaping import BaseReshapingTests # noqa
from .setitem import BaseSetitemTests # noqa
@@ -1,9 +0,0 @@
import pandas.util.testing as tm
class BaseExtensionTests(object):
    """Common base class carrying the comparison helpers used by the suite.

    Every helper is wrapped in ``staticmethod`` so that calling
    ``self.assert_...(left, right)`` does not pass ``self`` along.
    Subclasses can rebind these attributes to customise the comparisons.
    """

    assert_extension_array_equal = staticmethod(
        tm.assert_extension_array_equal)
    assert_frame_equal = staticmethod(tm.assert_frame_equal)
    assert_series_equal = staticmethod(tm.assert_series_equal)
@@ -1,23 +0,0 @@
import pandas as pd
from pandas.core.internals import ObjectBlock
from .base import BaseExtensionTests
class BaseCastingTests(BaseExtensionTests):
    """Casting to and from ExtensionDtypes"""

    def test_astype_object_series(self, all_data):
        """Casting a Series to ``object`` must store it in an ObjectBlock."""
        as_object = pd.Series({"A": all_data}).astype(object)
        first_block = as_object._data.blocks[0]
        assert isinstance(first_block, ObjectBlock)

    def test_tolist(self, data):
        """``Series.tolist`` must agree with ``list`` over the raw array."""
        assert pd.Series(data).tolist() == list(data)

    def test_astype_str(self, data):
        """``astype(str)`` via the Series equals ``astype(str)`` on the array."""
        head = data[:5]
        via_series = pd.Series(head).astype(str)
        via_array = pd.Series(head.astype(str))
        self.assert_series_equal(via_series, via_array)
@@ -1,47 +0,0 @@
import pytest
import pandas as pd
import pandas.util.testing as tm
from pandas.core.internals import ExtensionBlock
from .base import BaseExtensionTests
class BaseConstructorsTests(BaseExtensionTests):
    """Building arrays, Series and DataFrames from extension data."""

    def test_array_from_scalars(self, data):
        """``_from_sequence`` on a list of scalars returns the array type."""
        first_three = [data[0], data[1], data[2]]
        rebuilt = data._from_sequence(first_three)
        assert isinstance(rebuilt, type(data))

    def test_series_constructor(self, data):
        """A Series keeps the dtype, the length and the array object itself."""
        ser = pd.Series(data)
        assert ser.dtype == data.dtype
        assert len(ser) == len(data)
        block = ser._data.blocks[0]
        assert isinstance(block, ExtensionBlock)
        # The Series must hold the very same array object, not a copy.
        assert ser._data.blocks[0].values is data

        # Series[EA] is unboxed / boxed correctly
        rewrapped = pd.Series(ser)
        assert rewrapped.dtype == data.dtype
        assert isinstance(rewrapped._data.blocks[0], ExtensionBlock)

    @pytest.mark.parametrize("from_series", [True, False])
    def test_dataframe_constructor_from_dict(self, data, from_series):
        """A dict-built frame keeps the extension dtype and block type."""
        source = pd.Series(data) if from_series else data
        frame = pd.DataFrame({"A": source})
        assert frame.dtypes['A'] == source.dtype
        assert frame.shape == (len(source), 1)
        assert isinstance(frame._data.blocks[0], ExtensionBlock)

    def test_dataframe_from_series(self, data):
        """Wrapping a Series in a DataFrame keeps the extension dtype."""
        frame = pd.DataFrame(pd.Series(data))
        assert frame.dtypes[0] == data.dtype
        assert frame.shape == (len(data), 1)
        assert isinstance(frame._data.blocks[0], ExtensionBlock)

    def test_series_given_mismatched_index_raises(self, data):
        """Three values with a five-label index must raise ``ValueError``."""
        expected_message = 'Length of passed values is 3, index implies 5'
        with tm.assert_raises_regex(ValueError, expected_message):
            pd.Series(data[:3], index=[0, 1, 2, 3, 4])
@@ -1,48 +0,0 @@
import numpy as np
import pandas as pd
from .base import BaseExtensionTests
class BaseDtypeTests(BaseExtensionTests):
    """Base class for ExtensionDtype classes"""

    def test_name(self, dtype):
        """``dtype.name`` must be a ``str``."""
        name = dtype.name
        assert isinstance(name, str)

    def test_kind(self, dtype):
        """``dtype.kind``, when set, is one of the accepted kind codes."""
        allowed_kinds = set('biufcmMOSUV')
        kind = dtype.kind
        if kind is None:
            return
        assert kind in allowed_kinds

    def test_construct_from_string_own_name(self, dtype):
        """The dtype can be rebuilt from its own name."""
        dtype_cls = type(dtype)
        from_instance = dtype.construct_from_string(dtype.name)
        assert type(from_instance) is dtype_cls

        # check OK as classmethod
        from_class = dtype_cls.construct_from_string(dtype.name)
        assert type(from_class) is dtype_cls

    def test_is_dtype_from_name(self, dtype):
        """``is_dtype`` recognises the dtype's own name."""
        assert type(dtype).is_dtype(dtype.name) is True

    def test_is_dtype_unboxes_dtype(self, data, dtype):
        """``is_dtype`` accepts an array carrying the dtype."""
        assert dtype.is_dtype(data) is True

    def test_is_dtype_from_self(self, dtype):
        """``is_dtype`` recognises an instance of the dtype."""
        assert type(dtype).is_dtype(dtype) is True

    def test_is_not_string_type(self, dtype):
        # NOTE(review): the outcome is returned, not asserted, so this
        # check cannot fail on the value it computes -- confirm intent.
        looks_like_string = pd.api.types.is_string_dtype(dtype)
        return not looks_like_string

    def test_is_not_object_type(self, dtype):
        # NOTE(review): the outcome is returned, not asserted, so this
        # check cannot fail on the value it computes -- confirm intent.
        looks_like_object = pd.api.types.is_object_dtype(dtype)
        return not looks_like_object

    def test_eq_with_str(self, dtype):
        """The dtype equals its own name and differs from an altered one."""
        name = dtype.name
        assert dtype == name
        assert dtype != name + '-suffix'

    def test_eq_with_numpy_object(self, dtype):
        """The dtype must not compare equal to NumPy's object dtype."""
        numpy_object = np.dtype('object')
        assert dtype != numpy_object
@@ -1,242 +0,0 @@
import pytest
import numpy as np
import pandas as pd
import pandas.util.testing as tm
from .base import BaseExtensionTests
class BaseGetitemTests(BaseExtensionTests):
    """Tests for ExtensionArray.__getitem__."""

    def test_iloc_series(self, data):
        """``Series.iloc`` with a slice and with a list give equal results."""
        ser = pd.Series(data)
        result = ser.iloc[:4]
        expected = pd.Series(data[:4])
        self.assert_series_equal(result, expected)

        result = ser.iloc[[0, 1, 2, 3]]
        self.assert_series_equal(result, expected)

    def test_iloc_frame(self, data):
        """``DataFrame.iloc`` with slice and list row indexers."""
        df = pd.DataFrame({"A": data, 'B':
                           np.arange(len(data), dtype='int64')})
        expected = pd.DataFrame({"A": data[:4]})

        # slice -> frame
        result = df.iloc[:4, [0]]
        self.assert_frame_equal(result, expected)

        # sequence -> frame
        result = df.iloc[[0, 1, 2, 3], [0]]
        self.assert_frame_equal(result, expected)

        expected = pd.Series(data[:4], name='A')

        # slice -> series
        result = df.iloc[:4, 0]
        self.assert_series_equal(result, expected)

        # sequence -> series
        # Use a list row indexer here; repeating the slice would leave the
        # sequence path untested.
        result = df.iloc[[0, 1, 2, 3], 0]
        self.assert_series_equal(result, expected)

    def test_loc_series(self, data):
        """``Series.loc`` with a label slice and with a label list."""
        ser = pd.Series(data)
        result = ser.loc[:3]
        expected = pd.Series(data[:4])
        self.assert_series_equal(result, expected)

        result = ser.loc[[0, 1, 2, 3]]
        self.assert_series_equal(result, expected)

    def test_loc_frame(self, data):
        """``DataFrame.loc`` with slice and list row indexers."""
        df = pd.DataFrame({"A": data,
                           'B': np.arange(len(data), dtype='int64')})
        expected = pd.DataFrame({"A": data[:4]})

        # slice -> frame
        result = df.loc[:3, ['A']]
        self.assert_frame_equal(result, expected)

        # sequence -> frame
        result = df.loc[[0, 1, 2, 3], ['A']]
        self.assert_frame_equal(result, expected)

        expected = pd.Series(data[:4], name='A')

        # slice -> series
        result = df.loc[:3, 'A']
        self.assert_series_equal(result, expected)

        # sequence -> series
        # Use a list row indexer here; repeating the slice would leave the
        # sequence path untested.
        result = df.loc[[0, 1, 2, 3], 'A']
        self.assert_series_equal(result, expected)

    def test_getitem_scalar(self, data):
        """Scalar access returns an instance of ``dtype.type``."""
        result = data[0]
        assert isinstance(result, data.dtype.type)

        result = pd.Series(data)[0]
        assert isinstance(result, data.dtype.type)

    def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
        """Scalar access to a missing slot compares equal to ``na_value``."""
        result = data_missing[0]
        assert na_cmp(result, na_value)

    def test_getitem_mask(self, data):
        """Boolean-mask indexing on the raw array and inside a Series."""
        # Empty mask, raw array
        mask = np.zeros(len(data), dtype=bool)
        result = data[mask]
        assert len(result) == 0
        assert isinstance(result, type(data))

        # Empty mask, in series
        mask = np.zeros(len(data), dtype=bool)
        result = pd.Series(data)[mask]
        assert len(result) == 0
        assert result.dtype == data.dtype

        # non-empty mask, raw array
        mask[0] = True
        result = data[mask]
        assert len(result) == 1
        assert isinstance(result, type(data))

        # non-empty mask, in series
        result = pd.Series(data)[mask]
        assert len(result) == 1
        assert result.dtype == data.dtype

    def test_getitem_slice(self, data):
        """Slicing always returns an array of the same type."""
        # getitem[slice] should return an array
        result = data[slice(0)]  # empty
        assert isinstance(result, type(data))

        result = data[slice(1)]  # scalar
        assert isinstance(result, type(data))

    def test_get(self, data):
        """``Series.get`` with labels, lists, slices and missing keys."""
        # GH 20882
        s = pd.Series(data, index=[2 * i for i in range(len(data))])
        assert s.get(4) == s.iloc[2]

        result = s.get([4, 6])
        expected = s.iloc[[2, 3]]
        self.assert_series_equal(result, expected)

        result = s.get(slice(2))
        expected = s.iloc[[0, 1]]
        self.assert_series_equal(result, expected)

        assert s.get(-1) == s.iloc[-1]
        assert s.get(s.index.max() + 1) is None

        s = pd.Series(data[:6], index=list('abcdef'))
        assert s.get('c') == s.iloc[2]

        result = s.get(slice('b', 'd'))
        expected = s.iloc[[1, 2, 3]]
        self.assert_series_equal(result, expected)

        result = s.get('Z')
        assert result is None

        assert s.get(4) == s.iloc[4]
        assert s.get(-1) == s.iloc[-1]
        assert s.get(len(s)) is None

    def test_take_sequence(self, data):
        """List indexing on a Series selects the requested positions."""
        result = pd.Series(data)[[0, 1, 3]]
        assert result.iloc[0] == data[0]
        assert result.iloc[1] == data[1]
        assert result.iloc[2] == data[3]

    def test_take(self, data, na_value, na_cmp):
        """``take`` with and without ``allow_fill``, plus out-of-bounds."""
        result = data.take([0, -1])
        assert result.dtype == data.dtype
        assert result[0] == data[0]
        assert result[1] == data[-1]

        # With allow_fill=True, -1 marks a slot to fill instead of "last".
        result = data.take([0, -1], allow_fill=True, fill_value=na_value)
        assert result[0] == data[0]
        assert na_cmp(result[1], na_value)

        with tm.assert_raises_regex(IndexError, "out of bounds"):
            data.take([len(data) + 1])

    def test_take_empty(self, data, na_value, na_cmp):
        """``take`` on an empty array: only filling with NA is allowed."""
        empty = data[:0]

        result = empty.take([-1], allow_fill=True)
        assert na_cmp(result[0], na_value)

        with pytest.raises(IndexError):
            empty.take([-1])

        with tm.assert_raises_regex(IndexError, "cannot do a non-empty take"):
            empty.take([0, 1])

    def test_take_negative(self, data):
        """Negative positions count from the end when not filling."""
        # https://github.com/pandas-dev/pandas/issues/20640
        n = len(data)
        result = data.take([0, -n, n - 1, -1])
        expected = data.take([0, 0, n - 1, n - 1])
        self.assert_extension_array_equal(result, expected)

    def test_take_non_na_fill_value(self, data_missing):
        """``take`` honours a ``fill_value`` that is not NA."""
        fill_value = data_missing[1]  # valid
        na = data_missing[0]

        array = data_missing._from_sequence([na, fill_value, na])
        result = array.take([-1, 1], fill_value=fill_value, allow_fill=True)
        expected = array.take([1, 1])
        self.assert_extension_array_equal(result, expected)

    def test_take_pandas_style_negative_raises(self, data, na_value):
        """With ``allow_fill=True`` an index below -1 raises ``ValueError``."""
        with pytest.raises(ValueError):
            data.take([0, -2], fill_value=na_value, allow_fill=True)

    @pytest.mark.parametrize('allow_fill', [True, False])
    def test_take_out_of_bounds_raises(self, data, allow_fill):
        """An index past the end raises ``IndexError`` in both modes."""
        arr = data[:3]
        with pytest.raises(IndexError):
            arr.take(np.asarray([0, 3]), allow_fill=allow_fill)

    def test_take_series(self, data):
        """``Series.take`` keeps the original index labels."""
        s = pd.Series(data)
        result = s.take([0, -1])
        expected = pd.Series(
            data._from_sequence([data[0], data[len(data) - 1]]),
            index=[0, len(data) - 1])
        self.assert_series_equal(result, expected)

    def test_reindex(self, data, na_value):
        """``Series.reindex`` fills labels that are absent with NA."""
        s = pd.Series(data)
        result = s.reindex([0, 1, 3])
        expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3])
        self.assert_series_equal(result, expected)

        n = len(data)
        result = s.reindex([-1, 0, n])
        expected = pd.Series(
            data._from_sequence([na_value, data[0], na_value]),
            index=[-1, 0, n])
        self.assert_series_equal(result, expected)

        result = s.reindex([n, n + 1])
        expected = pd.Series(data._from_sequence([na_value, na_value]),
                             index=[n, n + 1])
        self.assert_series_equal(result, expected)

    def test_reindex_non_na_fill_value(self, data_missing):
        """``Series.reindex`` honours a ``fill_value`` that is not NA."""
        valid = data_missing[1]
        na = data_missing[0]

        array = data_missing._from_sequence([na, valid])
        ser = pd.Series(array)
        result = ser.reindex([0, 1, 2], fill_value=valid)
        expected = pd.Series(data_missing._from_sequence([na, valid, valid]))

        self.assert_series_equal(result, expected)
@@ -1,69 +0,0 @@
import pytest
import pandas.util.testing as tm
import pandas as pd
from .base import BaseExtensionTests
class BaseGroupbyTests(BaseExtensionTests):
    """Groupby-specific tests."""

    def test_grouping_grouper(self, data_for_grouping):
        """The grouping for each key column exposes the column's values."""
        frame = pd.DataFrame({
            "A": ["B", "B", None, None, "A", "A", "B", "C"],
            "B": data_for_grouping
        })
        by_a = frame.groupby("A").grouper.groupings[0]
        by_b = frame.groupby("B").grouper.groupings[0]

        tm.assert_numpy_array_equal(by_a.grouper, frame.A.values)
        tm.assert_extension_array_equal(by_b.grouper, data_for_grouping)

    @pytest.mark.parametrize('as_index', [True, False])
    def test_groupby_extension_agg(self, as_index, data_for_grouping):
        """Mean of ``A`` grouped by the extension column, sorted keys."""
        frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
                              "B": data_for_grouping})
        result = frame.groupby("B", as_index=as_index).A.mean()
        uniques = pd.factorize(data_for_grouping, sort=True)[1]

        # TODO(ExtensionIndex): remove astype
        keys = pd.Index(uniques.astype(object), name="B")
        expected = pd.Series([3, 1, 4], index=keys, name="A")
        if not as_index:
            # With as_index=False the keys come back as a regular column.
            expected = expected.reset_index()
            self.assert_frame_equal(result, expected)
            return
        self.assert_series_equal(result, expected)

    def test_groupby_extension_no_sort(self, data_for_grouping):
        """Mean of ``A`` grouped by the extension column, first-seen order."""
        frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
                              "B": data_for_grouping})
        result = frame.groupby("B", sort=False).A.mean()
        uniques = pd.factorize(data_for_grouping, sort=False)[1]

        # TODO(ExtensionIndex): remove astype
        keys = pd.Index(uniques.astype(object), name="B")
        expected = pd.Series([1, 3, 4], index=keys, name="A")
        self.assert_series_equal(result, expected)

    def test_groupby_extension_transform(self, data_for_grouping):
        """``transform(len)`` broadcasts each group's size to its rows."""
        not_missing = ~data_for_grouping.isna()
        valid = data_for_grouping[not_missing]
        frame = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4],
                              "B": valid})

        result = frame.groupby("B").A.transform(len)
        expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")

        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize('op', [
        lambda x: 1,
        lambda x: [1] * len(x),
        lambda x: pd.Series([1] * len(x)),
        lambda x: x,
    ], ids=['scalar', 'list', 'series', 'object'])
    def test_groupby_extension_apply(self, data_for_grouping, op):
        """Smoke test: the ``apply`` variants only have to run."""
        frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
                              "B": data_for_grouping})
        by_extension = frame.groupby("B")
        by_extension.apply(op)
        frame.groupby("B").A.apply(op)
        frame.groupby("A").apply(op)
        frame.groupby("A").B.apply(op)
@@ -1,59 +0,0 @@
import numpy as np
import pandas as pd
from pandas.compat import StringIO
from pandas.core.dtypes.common import is_extension_array_dtype
from pandas.core.dtypes.dtypes import ExtensionDtype
from .base import BaseExtensionTests
class BaseInterfaceTests(BaseExtensionTests):
    """Tests that the basic interface is satisfied."""
    # ------------------------------------------------------------------------
    # Interface
    # ------------------------------------------------------------------------

    def test_len(self, data):
        """The ``data`` fixture must hold exactly 100 elements."""
        expected_length = 100
        assert len(data) == expected_length

    def test_ndim(self, data):
        """Extension arrays report one dimension."""
        assert data.ndim == 1

    def test_can_hold_na_valid(self, data):
        """``_can_hold_na`` must be the literal ``True``."""
        # GH-20761
        can_hold_na = data._can_hold_na
        assert can_hold_na is True

    def test_memory_usage(self, data):
        """``memory_usage(index=False)`` equals the Series' ``nbytes``."""
        ser = pd.Series(data)
        reported = ser.memory_usage(index=False)
        assert reported == ser.nbytes

    def test_array_interface(self, data):
        """``np.array`` keeps the first element intact."""
        as_numpy = np.array(data)
        assert as_numpy[0] == data[0]

    def test_repr(self, data):
        """The Series repr names the dtype; the frame repr must not raise."""
        rendered = repr(pd.Series(data))
        assert data.dtype.name in rendered

        # Only checked for not raising.
        repr(pd.DataFrame({"A": data}))

    def test_dtype_name_in_info(self, data):
        """``DataFrame.info`` output names the dtype."""
        sink = StringIO()
        frame = pd.DataFrame({"A": data})
        frame.info(buf=sink)
        assert data.dtype.name in sink.getvalue()

    def test_is_extension_array_dtype(self, data):
        """The array, its dtype and a wrapping Series are all recognised."""
        for candidate in (data, data.dtype, pd.Series(data)):
            assert is_extension_array_dtype(candidate)
        assert isinstance(data.dtype, ExtensionDtype)

    def test_no_values_attribute(self, data):
        """The array exposes neither ``values`` nor ``_values``."""
        # GH-20735: EA's with .values attribute give problems with internal
        # code, disallowing this for now until solved
        for forbidden in ('values', '_values'):
            assert not hasattr(data, forbidden)
@@ -1,105 +0,0 @@
import pytest
import numpy as np
import pandas as pd
import pandas.util.testing as tm
from .base import BaseExtensionTests
class BaseMethodsTests(BaseExtensionTests):
    """Various Series and DataFrame methods."""

    @pytest.mark.parametrize('dropna', [True, False])
    def test_value_counts(self, all_data, dropna):
        """``value_counts`` on the array matches a reference Series."""
        subset = all_data[:10]
        # With dropna the reference is built from the non-missing values
        # converted to a NumPy array; otherwise from the array itself.
        reference = np.array(subset[~subset.isna()]) if dropna else subset

        result = pd.Series(subset).value_counts(dropna=dropna).sort_index()
        expected = pd.Series(reference).value_counts(
            dropna=dropna).sort_index()

        self.assert_series_equal(result, expected)

    def test_count(self, data_missing):
        """Row-wise ``count`` sees zero then one non-missing value."""
        counts = pd.DataFrame({"A": data_missing}).count(axis='columns')
        self.assert_series_equal(counts, pd.Series([0, 1]))

    def test_apply_simple_series(self, data):
        """``Series.apply`` with a plain callable returns a Series."""
        applied = pd.Series(data).apply(id)
        assert isinstance(applied, pd.Series)

    def test_argsort(self, data_for_sorting):
        """``argsort`` yields positions 2, 0, 1 for the sorting fixture."""
        positions = pd.Series(data_for_sorting).argsort()
        wanted = pd.Series(np.array([2, 0, 1], dtype=np.int64))
        self.assert_series_equal(positions, wanted)

    def test_argsort_missing(self, data_missing_for_sorting):
        """``argsort`` reports -1 for the missing slot."""
        positions = pd.Series(data_missing_for_sorting).argsort()
        wanted = pd.Series(np.array([1, -1, 0], dtype=np.int64))
        self.assert_series_equal(positions, wanted)

    @pytest.mark.parametrize('ascending', [True, False])
    def test_sort_values(self, data_for_sorting, ascending):
        """``sort_values`` in both directions."""
        ser = pd.Series(data_for_sorting)
        result = ser.sort_values(ascending=ascending)
        in_order = ser.iloc[[2, 0, 1]]
        expected = in_order if ascending else in_order[::-1]
        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize('ascending', [True, False])
    def test_sort_values_missing(self, data_missing_for_sorting, ascending):
        """The missing value is placed last in either direction."""
        ser = pd.Series(data_missing_for_sorting)
        result = ser.sort_values(ascending=ascending)
        positions = [2, 0, 1] if ascending else [0, 2, 1]
        self.assert_series_equal(result, ser.iloc[positions])

    @pytest.mark.parametrize('ascending', [True, False])
    def test_sort_values_frame(self, data_for_sorting, ascending):
        """Sorting a frame by a regular and an extension column."""
        # NOTE(review): ``ascending`` is parametrized but never passed to
        # ``sort_values``; both runs check the default order -- confirm.
        frame = pd.DataFrame({"A": [1, 2, 1],
                              "B": data_for_sorting})
        result = frame.sort_values(['A', 'B'])
        expected = pd.DataFrame({"A": [1, 1, 2],
                                 'B': data_for_sorting.take([2, 0, 1])},
                                index=[2, 0, 1])
        self.assert_frame_equal(result, expected)

    @pytest.mark.parametrize('box', [pd.Series, lambda x: x])
    @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique])
    def test_unique(self, data, box, method):
        """Two copies of one value collapse to a single-element array."""
        first = data[0]
        duplicated = box(data._from_sequence([first, data[0]]))

        uniques = method(duplicated)

        assert len(uniques) == 1
        assert isinstance(uniques, type(data))
        assert uniques[0] == duplicated[0]

    @pytest.mark.parametrize('na_sentinel', [-1, -2])
    def test_factorize(self, data_for_grouping, na_sentinel):
        """``pd.factorize`` labels missing slots with ``na_sentinel``."""
        labels, uniques = pd.factorize(data_for_grouping,
                                       na_sentinel=na_sentinel)
        wanted_labels = np.array(
            [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp)
        wanted_uniques = data_for_grouping.take([0, 4, 7])

        tm.assert_numpy_array_equal(labels, wanted_labels)
        self.assert_extension_array_equal(uniques, wanted_uniques)

    @pytest.mark.parametrize('na_sentinel', [-1, -2])
    def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
        """``pd.factorize`` and the array's ``factorize`` method agree."""
        top_level = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
        method = data_for_grouping.factorize(na_sentinel=na_sentinel)

        tm.assert_numpy_array_equal(top_level[0], method[0])
        self.assert_extension_array_equal(top_level[1], method[1])
@@ -1,126 +0,0 @@
import numpy as np
import pytest
import pandas as pd
import pandas.util.testing as tm
from .base import BaseExtensionTests
class BaseMissingTests(BaseExtensionTests):
    """Tests for missing-value handling (isna, dropna, fillna).

    The ``data_missing`` fixture is expected to be ``[NA, valid]``.
    """

    def test_isna(self, data_missing):
        """``isna`` on the array, on a Series and on an emptied Series."""
        expected = np.array([True, False])

        result = pd.isna(data_missing)
        tm.assert_numpy_array_equal(result, expected)

        result = pd.Series(data_missing).isna()
        expected = pd.Series(expected)
        self.assert_series_equal(result, expected)

        # GH 21189
        result = pd.Series(data_missing).drop([0, 1]).isna()
        expected = pd.Series([], dtype=bool)
        self.assert_series_equal(result, expected)

    def test_dropna_series(self, data_missing):
        """``Series.dropna`` keeps only the valid row."""
        ser = pd.Series(data_missing)
        result = ser.dropna()
        expected = ser.iloc[[1]]
        self.assert_series_equal(result, expected)

    def test_dropna_frame(self, data_missing):
        """``DataFrame.dropna`` along rows, along columns and combined."""
        df = pd.DataFrame({"A": data_missing})

        # defaults
        result = df.dropna()
        expected = df.iloc[[1]]
        self.assert_frame_equal(result, expected)

        # axis = 1
        result = df.dropna(axis='columns')
        expected = pd.DataFrame(index=[0, 1])
        self.assert_frame_equal(result, expected)

        # multiple
        df = pd.DataFrame({"A": data_missing,
                           "B": [1, np.nan]})
        result = df.dropna()
        expected = df.iloc[:0]
        self.assert_frame_equal(result, expected)

    def test_fillna_scalar(self, data_missing):
        """Filling with a scalar replaces the missing slot by that value."""
        valid = data_missing[1]
        result = data_missing.fillna(valid)
        # Build the expectation without calling ``fillna``; comparing
        # ``fillna`` against itself could never detect a wrong result.
        expected = data_missing.take([1, 1])
        self.assert_extension_array_equal(result, expected)

    def test_fillna_limit_pad(self, data_missing):
        """Forward fill stops after ``limit`` consecutive fills."""
        arr = data_missing.take([1, 0, 0, 0, 1])
        result = pd.Series(arr).fillna(method='ffill', limit=2)
        expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
        self.assert_series_equal(result, expected)

    def test_fillna_limit_backfill(self, data_missing):
        """Backward fill stops after ``limit`` consecutive fills."""
        arr = data_missing.take([1, 0, 0, 0, 1])
        result = pd.Series(arr).fillna(method='backfill', limit=2)
        expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
        self.assert_series_equal(result, expected)

    def test_fillna_series(self, data_missing):
        """``Series.fillna`` with a scalar and with another Series."""
        fill_value = data_missing[1]
        ser = pd.Series(data_missing)

        result = ser.fillna(fill_value)
        expected = pd.Series(
            data_missing._from_sequence([fill_value, fill_value]))
        self.assert_series_equal(result, expected)

        # Fill with a series
        result = ser.fillna(expected)
        self.assert_series_equal(result, expected)

        # Fill with a series not affecting the missing values
        result = ser.fillna(ser)
        self.assert_series_equal(result, ser)

    @pytest.mark.parametrize('method', ['ffill', 'bfill'])
    def test_fillna_series_method(self, data_missing, method):
        """``ffill`` / ``bfill`` propagate the single valid value."""
        fill_value = data_missing[1]

        if method == 'ffill':
            # Reverse to [valid, NA] so a forward fill has a value to carry.
            data_missing = type(data_missing)(data_missing[::-1])

        result = pd.Series(data_missing).fillna(method=method)
        expected = pd.Series(
            data_missing._from_sequence([fill_value, fill_value]))

        self.assert_series_equal(result, expected)

    def test_fillna_frame(self, data_missing):
        """``DataFrame.fillna`` with a scalar fills the extension column."""
        fill_value = data_missing[1]

        result = pd.DataFrame({
            "A": data_missing,
            "B": [1, 2]
        }).fillna(fill_value)

        expected = pd.DataFrame({
            "A": data_missing._from_sequence([fill_value, fill_value]),
            "B": [1, 2],
        })

        self.assert_frame_equal(result, expected)

    def test_fillna_fill_other(self, data):
        """Filling another column leaves the extension column untouched."""
        result = pd.DataFrame({
            "A": data,
            "B": [np.nan] * len(data)
        }).fillna({"B": 0.0})

        expected = pd.DataFrame({
            "A": data,
            "B": [0.0] * len(result),
        })

        self.assert_frame_equal(result, expected)
@@ -1,164 +0,0 @@
import pytest
import numpy as np
import pandas as pd
from pandas.core.internals import ExtensionBlock
from .base import BaseExtensionTests
class BaseReshapingTests(BaseExtensionTests):
    """Tests for reshaping and concatenation."""

    @pytest.mark.parametrize('in_frame', [True, False])
    def test_concat(self, data, in_frame):
        """Concatenating an object with itself keeps the extension dtype."""
        wrapped = pd.Series(data)
        if in_frame:
            wrapped = pd.DataFrame(wrapped)
        result = pd.concat([wrapped, wrapped], ignore_index=True)

        assert len(result) == len(data) * 2

        # A frame exposes per-column dtypes, a Series a single dtype.
        if in_frame:
            dtype = result.dtypes[0]
        else:
            dtype = result.dtype

        assert dtype == data.dtype
        assert isinstance(result._data.blocks[0], ExtensionBlock)

    @pytest.mark.parametrize('in_frame', [True, False])
    def test_concat_all_na_block(self, data_missing, in_frame):
        """Concatenating an all-valid piece with an all-NA piece."""
        valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1])
        na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3])
        if in_frame:
            valid_block = pd.DataFrame({"a": valid_block})
            na_block = pd.DataFrame({"a": na_block})
        result = pd.concat([valid_block, na_block])
        if in_frame:
            expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])})
            self.assert_frame_equal(result, expected)
        else:
            expected = pd.Series(data_missing.take([1, 1, 0, 0]))
            self.assert_series_equal(result, expected)

    def test_concat_mixed_dtypes(self, data):
        """Concatenating with other dtypes matches an object-dtype concat."""
        # https://github.com/pandas-dev/pandas/issues/20762
        df1 = pd.DataFrame({'A': data[:3]})
        df2 = pd.DataFrame({"A": [1, 2, 3]})
        df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category')
        df4 = pd.DataFrame({"A": pd.SparseArray([1, 2, 3])})
        dfs = [df1, df2, df3, df4]

        # dataframes
        result = pd.concat(dfs)
        expected = pd.concat([x.astype(object) for x in dfs])
        self.assert_frame_equal(result, expected)

        # series
        result = pd.concat([x['A'] for x in dfs])
        expected = pd.concat([x['A'].astype(object) for x in dfs])
        self.assert_series_equal(result, expected)

        # simple test for just EA and one other
        result = pd.concat([df1, df2])
        expected = pd.concat([df1.astype('object'), df2.astype('object')])
        self.assert_frame_equal(result, expected)

        result = pd.concat([df1['A'], df2['A']])
        expected = pd.concat([df1['A'].astype('object'),
                              df2['A'].astype('object')])
        self.assert_series_equal(result, expected)

    def test_concat_columns(self, data, na_value):
        """Column-wise concat, with aligned and non-aligned indexes."""
        df1 = pd.DataFrame({'A': data[:3]})
        df2 = pd.DataFrame({'B': [1, 2, 3]})

        expected = pd.DataFrame({'A': data[:3], 'B': [1, 2, 3]})
        result = pd.concat([df1, df2], axis=1)
        self.assert_frame_equal(result, expected)
        result = pd.concat([df1['A'], df2['B']], axis=1)
        self.assert_frame_equal(result, expected)

        # non-aligned
        df2 = pd.DataFrame({'B': [1, 2, 3]}, index=[1, 2, 3])
        expected = pd.DataFrame({
            'A': data._from_sequence(list(data[:3]) + [na_value]),
            'B': [np.nan, 1, 2, 3]})
        result = pd.concat([df1, df2], axis=1)
        self.assert_frame_equal(result, expected)
        result = pd.concat([df1['A'], df2['B']], axis=1)
        self.assert_frame_equal(result, expected)

    def test_align(self, data, na_value):
        """``Series.align`` pads both sides with ``na_value``."""
        a = data[:3]
        b = data[2:5]
        r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3]))

        # Assumes that the ctor can take a list of scalars of the type
        e1 = pd.Series(data._from_sequence(list(a) + [na_value]))
        e2 = pd.Series(data._from_sequence([na_value] + list(b)))
        self.assert_series_equal(r1, e1)
        self.assert_series_equal(r2, e2)

    def test_align_frame(self, data, na_value):
        """``DataFrame.align`` pads both sides with ``na_value``."""
        a = data[:3]
        b = data[2:5]
        r1, r2 = pd.DataFrame({'A': a}).align(
            pd.DataFrame({'A': b}, index=[1, 2, 3])
        )

        # Assumes that the ctor can take a list of scalars of the type
        e1 = pd.DataFrame({'A': data._from_sequence(list(a) + [na_value])})
        e2 = pd.DataFrame({'A': data._from_sequence([na_value] + list(b))})
        self.assert_frame_equal(r1, e1)
        self.assert_frame_equal(r2, e2)

    def test_align_series_frame(self, data, na_value):
        """Aligning a Series with a frame that has one extra row."""
        # https://github.com/pandas-dev/pandas/issues/20576
        ser = pd.Series(data, name='a')
        df = pd.DataFrame({"col": np.arange(len(ser) + 1)})
        r1, r2 = ser.align(df)

        # The Series gains one trailing NA; the frame comes back unchanged.
        e1 = pd.Series(data._from_sequence(list(data) + [na_value]),
                       name=ser.name)

        self.assert_series_equal(r1, e1)
        self.assert_frame_equal(r2, df)

    def test_set_frame_expand_regular_with_extension(self, data):
        """Adding an extension column to a frame of regular columns."""
        df = pd.DataFrame({"A": [1] * len(data)})
        df['B'] = data
        expected = pd.DataFrame({"A": [1] * len(data), "B": data})
        self.assert_frame_equal(df, expected)

    def test_set_frame_expand_extension_with_regular(self, data):
        """Adding a regular column to a frame of extension columns."""
        df = pd.DataFrame({'A': data})
        df['B'] = [1] * len(data)
        expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
        self.assert_frame_equal(df, expected)

    def test_set_frame_overwrite_object(self, data):
        """Overwriting an object column adopts the extension dtype."""
        # https://github.com/pandas-dev/pandas/issues/20555
        df = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
        df['A'] = data
        assert df.dtypes['A'] == data.dtype

    def test_merge(self, data, na_value):
        """Default and outer merges keep the extension column."""
        # GH-20743
        df1 = pd.DataFrame({'ext': data[:3], 'int1': [1, 2, 3],
                            'key': [0, 1, 2]})
        df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]})

        res = pd.merge(df1, df2)
        exp = pd.DataFrame(
            {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1],
             'ext': data._from_sequence([data[0], data[0], data[1]])})
        # Select the columns explicitly so the comparison uses a fixed order.
        self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])

        res = pd.merge(df1, df2, how='outer')
        exp = pd.DataFrame(
            {'int1': [1, 1, 2, 3, np.nan], 'int2': [1, 2, 3, np.nan, 4],
             'key': [0, 0, 1, 2, 3],
             'ext': data._from_sequence(
                 [data[0], data[0], data[1], data[2], na_value])})
        self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
@@ -1,167 +0,0 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas.util.testing as tm
from .base import BaseExtensionTests
class BaseSetitemTests(BaseExtensionTests):
def test_setitem_scalar_series(self, data):
arr = pd.Series(data)
arr[0] = data[1]
assert arr[0] == data[1]
def test_setitem_sequence(self, data):
arr = pd.Series(data)
original = data.copy()
arr[[0, 1]] = [data[1], data[0]]
assert arr[0] == original[1]
assert arr[1] == original[0]
@pytest.mark.parametrize('as_array', [True, False])
def test_setitem_sequence_mismatched_length_raises(self, data, as_array):
ser = pd.Series(data)
value = [data[0]]
if as_array:
value = data._from_sequence(value)
xpr = 'cannot set using a {} indexer with a different length'
with tm.assert_raises_regex(ValueError, xpr.format('list-like')):
ser[[0, 1]] = value
with tm.assert_raises_regex(ValueError, xpr.format('slice')):
ser[slice(3)] = value
def test_setitem_empty_indxer(self, data):
ser = pd.Series(data)
original = ser.copy()
ser[[]] = []
self.assert_series_equal(ser, original)
def test_setitem_sequence_broadcasts(self, data):
arr = pd.Series(data)
arr[[0, 1]] = data[2]
assert arr[0] == data[2]
assert arr[1] == data[2]
@pytest.mark.parametrize('setter', ['loc', 'iloc'])
def test_setitem_scalar(self, data, setter):
arr = pd.Series(data)
setter = getattr(arr, setter)
operator.setitem(setter, 0, data[1])
assert arr[0] == data[1]
def test_setitem_loc_scalar_mixed(self, data):
df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
df.loc[0, 'B'] = data[1]
assert df.loc[0, 'B'] == data[1]
def test_setitem_loc_scalar_single(self, data):
df = pd.DataFrame({"B": data})
df.loc[10, 'B'] = data[1]
assert df.loc[10, 'B'] == data[1]
def test_setitem_loc_scalar_multiple_homogoneous(self, data):
df = pd.DataFrame({"A": data, "B": data})
df.loc[10, 'B'] = data[1]
assert df.loc[10, 'B'] == data[1]
def test_setitem_iloc_scalar_mixed(self, data):
df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
df.iloc[0, 1] = data[1]
assert df.loc[0, 'B'] == data[1]
def test_setitem_iloc_scalar_single(self, data):
df = pd.DataFrame({"B": data})
df.iloc[10, 0] = data[1]
assert df.loc[10, 'B'] == data[1]
def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
df = pd.DataFrame({"A": data, "B": data})
df.iloc[10, 1] = data[1]
assert df.loc[10, 'B'] == data[1]
@pytest.mark.parametrize('as_callable', [True, False])
@pytest.mark.parametrize('setter', ['loc', None])
def test_setitem_mask_aligned(self, data, as_callable, setter):
ser = pd.Series(data)
mask = np.zeros(len(data), dtype=bool)
mask[:2] = True
if as_callable:
mask2 = lambda x: mask
else:
mask2 = mask
if setter:
# loc
target = getattr(ser, setter)
else:
# Series.__setitem__
target = ser
operator.setitem(target, mask2, data[5:7])
ser[mask2] = data[5:7]
assert ser[0] == data[5]
assert ser[1] == data[6]
@pytest.mark.parametrize('setter', ['loc', None])
def test_setitem_mask_broadcast(self, data, setter):
ser = pd.Series(data)
mask = np.zeros(len(data), dtype=bool)
mask[:2] = True
if setter: # loc
target = getattr(ser, setter)
else: # __setitem__
target = ser
operator.setitem(target, mask, data[10])
assert ser[0] == data[10]
assert ser[1] == data[10]
def test_setitem_expand_columns(self, data):
df = pd.DataFrame({"A": data})
result = df.copy()
result['B'] = 1
expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
self.assert_frame_equal(result, expected)
result = df.copy()
result.loc[:, 'B'] = 1
self.assert_frame_equal(result, expected)
# overwrite with new type
result['B'] = data
expected = pd.DataFrame({"A": data, "B": data})
self.assert_frame_equal(result, expected)
def test_setitem_expand_with_extension(self, data):
df = pd.DataFrame({"A": [1] * len(data)})
result = df.copy()
result['B'] = data
expected = pd.DataFrame({"A": [1] * len(data), "B": data})
self.assert_frame_equal(result, expected)
result = df.copy()
result.loc[:, 'B'] = data
self.assert_frame_equal(result, expected)
def test_setitem_frame_invalid_length(self, data):
    """Assigning a 5-element column to a longer frame raises ValueError."""
    expected_msg = "Length of values does not match length of index"
    frame = pd.DataFrame({"A": [1] * len(data)})
    with tm.assert_raises_regex(ValueError, expected_msg):
        frame['B'] = data[:5]
@pytest.mark.xfail(reason="GH-20441: setitem on extension types.")
def test_setitem_tuple_index(self, data):
    """Set a value through a tuple label on a tuple-labelled Series (marked xfail)."""
    tuple_labels = [(0, 0), (0, 1)]
    ser = pd.Series(data[:2], index=tuple_labels)
    expected = pd.Series(data.take([1, 1]), index=ser.index)
    ser[(0, 1)] = data[1]
    self.assert_series_equal(ser, expected)
@@ -1,159 +0,0 @@
import string
import pytest
import numpy as np
from pandas.api.types import CategoricalDtype
from pandas import Categorical
from pandas.tests.extension import base
def make_data():
    """Return an array of 100 randomly chosen ASCII letters."""
    letters = list(string.ascii_letters)
    return np.random.choice(letters, size=100)
@pytest.fixture
def dtype():
    """Provide a default-constructed ``CategoricalDtype``."""
    categorical_dtype = CategoricalDtype()
    return categorical_dtype
@pytest.fixture
def data():
    """Length-100 Categorical built from ``make_data()`` for semantics tests."""
    return Categorical(make_data())
@pytest.fixture
def data_missing():
    """Length-2 Categorical laid out as [NA, Valid]."""
    values = [np.nan, 'A']
    return Categorical(values)
@pytest.fixture
def data_for_sorting():
    """Ordered Categorical ['A', 'B', 'C'] whose categories are listed as C, A, B."""
    values = ['A', 'B', 'C']
    category_order = ['C', 'A', 'B']
    return Categorical(values, categories=category_order, ordered=True)
@pytest.fixture
def data_missing_for_sorting():
    """Ordered Categorical ['A', None, 'B'] whose categories are listed as B, A."""
    values = ['A', None, 'B']
    category_order = ['B', 'A']
    return Categorical(values, categories=category_order, ordered=True)
@pytest.fixture
def na_value():
    """The scalar missing value used by these tests: ``np.nan``."""
    missing = np.nan
    return missing
@pytest.fixture
def data_for_grouping():
    """Categorical with repeated values and two missing entries, for grouping tests."""
    values = ['a', 'a', None, None, 'b', 'b', 'a', 'c']
    return Categorical(values)
class TestDtype(base.BaseDtypeTests):
    """Run the inherited dtype tests unchanged."""
class TestInterface(base.BaseInterfaceTests):
    """Inherited interface tests, with the memory-usage check skipped."""

    @pytest.mark.skip(reason="Memory usage doesn't match")
    def test_memory_usage(self):
        """Skipped override of the inherited test."""
        # Is this deliberate?
class TestConstructors(base.BaseConstructorsTests):
    """Run the inherited constructor tests unchanged."""
class TestReshaping(base.BaseReshapingTests):
    """Inherited reshaping tests; four of them are skipped with a shared reason."""

    # One marker reused by every skipped override below.
    skip_unobserved = pytest.mark.skip(
        reason="Unobserved categories preseved in concat.")

    @skip_unobserved
    def test_concat_columns(self, data, na_value):
        """Skipped override of the inherited test."""

    @skip_unobserved
    def test_align(self, data, na_value):
        """Skipped override of the inherited test."""

    @skip_unobserved
    def test_align_frame(self, data, na_value):
        """Skipped override of the inherited test."""

    @skip_unobserved
    def test_merge(self, data, na_value):
        """Skipped override of the inherited test."""
class TestGetitem(base.BaseGetitemTests):
    """Inherited getitem tests; several are overridden to be skipped or xfailed."""

    # Shared marker for the take-related overrides.
    skip_take = pytest.mark.skip(reason="GH-20664.")

    @pytest.mark.skip(reason="Backwards compatibility")
    def test_getitem_scalar(self):
        """Skipped override of the inherited test."""
        # CategoricalDtype.type isn't "correct" since it should
        # be a parent of the elements (object). But don't want
        # to break things by changing.

    @skip_take
    def test_take(self):
        """Skipped override of the inherited test."""
        # TODO remove this once Categorical.take is fixed

    @skip_take
    def test_take_negative(self):
        """Skipped override of the inherited test."""

    @skip_take
    def test_take_pandas_style_negative_raises(self):
        """Skipped override of the inherited test."""

    @skip_take
    def test_take_non_na_fill_value(self):
        """Skipped override of the inherited test."""

    @skip_take
    def test_take_out_of_bounds_raises(self):
        """Skipped override of the inherited test."""

    @pytest.mark.skip(reason="GH-20747. Unobserved categories.")
    def test_take_series(self):
        """Skipped override of the inherited test."""

    @skip_take
    def test_reindex_non_na_fill_value(self):
        """Skipped override of the inherited test."""

    @pytest.mark.xfail(reason="Categorical.take buggy")
    def test_take_empty(self):
        """Empty override marked xfail."""

    @pytest.mark.xfail(reason="test not written correctly for categorical")
    def test_reindex(self):
        """Empty override marked xfail."""
class TestSetitem(base.BaseSetitemTests):
    """Run the inherited setitem tests unchanged."""
class TestMissing(base.BaseMissingTests):
    """Inherited missing-data tests; the limited-fillna tests are skipped."""

    @pytest.mark.skip(reason="Not implemented")
    def test_fillna_limit_pad(self):
        """Skipped override of the inherited test."""

    @pytest.mark.skip(reason="Not implemented")
    def test_fillna_limit_backfill(self):
        """Skipped override of the inherited test."""
class TestMethods(base.BaseMethodsTests):
    """Inherited method tests, with ``test_value_counts`` skipped."""

    @pytest.mark.skip(reason="Unobserved categories included")
    def test_value_counts(self, all_data, dropna):
        """Skipped override of the inherited test."""
class TestCasting(base.BaseCastingTests):
    """Run the inherited casting tests unchanged."""
@@ -1,79 +0,0 @@
import operator
import pytest
@pytest.fixture
def dtype():
    """Placeholder fixture: the ExtensionDtype under validation.

    Raises NotImplementedError until overridden.
    """
    raise NotImplementedError()
@pytest.fixture
def data():
    """Placeholder fixture: a length-100 array of the type under test.

    Raises NotImplementedError until overridden.
    """
    raise NotImplementedError()
@pytest.fixture
def data_missing():
    """Placeholder fixture: a length-2 array laid out as [NA, Valid].

    Raises NotImplementedError until overridden.
    """
    raise NotImplementedError()
@pytest.fixture(params=['data', 'data_missing'])
def all_data(request, data, data_missing):
    """Parametrized fixture yielding ``data`` and then ``data_missing``."""
    by_name = {'data': data, 'data_missing': data_missing}
    # Any other param name yields None, as with a fall-through if/elif.
    return by_name.get(request.param)
@pytest.fixture
def data_for_sorting():
    """Placeholder fixture: a length-3 array with a known sort order.

    Expected layout is [B, C, A] with A < B < C.
    Raises NotImplementedError until overridden.
    """
    raise NotImplementedError()
@pytest.fixture
def data_missing_for_sorting():
    """Placeholder fixture: a length-3 array with a missing value and a known order.

    Expected layout is [B, NA, A] with A < B and NA missing.
    Raises NotImplementedError until overridden.
    """
    raise NotImplementedError()
@pytest.fixture
def na_cmp():
    """Binary predicate used to compare two (scalar) NA values.

    The returned function takes two arguments and should be true when both
    are NA for the type under test. The default is ``operator.is_``.
    """
    identity_check = operator.is_
    return identity_check
@pytest.fixture
def na_value():
    """Scalar missing value for the type under test; ``None`` by default."""
    missing = None
    return missing
@pytest.fixture
def data_for_grouping():
    """Placeholder fixture: data for factorization, grouping, and unique tests.

    Expected layout is [B, B, NA, NA, A, A, B, C] where A < B < C and NA is
    missing. Raises NotImplementedError until overridden.
    """
    raise NotImplementedError()
@@ -1,105 +0,0 @@
import decimal
import numbers
import random
import sys
import numpy as np
import pandas as pd
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.base import ExtensionDtype
class DecimalDtype(ExtensionDtype):
    """ExtensionDtype whose scalar type is ``decimal.Decimal``."""
    type = decimal.Decimal
    name = 'decimal'
    na_value = decimal.Decimal('NaN')

    @classmethod
    def construct_from_string(cls, string):
        """Return a new instance when ``string`` equals ``cls.name``.

        Raises
        ------
        TypeError
            If ``string`` is anything else.
        """
        if string != cls.name:
            raise TypeError("Cannot construct a '{}' from "
                            "'{}'".format(cls, string))
        return cls()
class DecimalArray(ExtensionArray):
    """ExtensionArray of ``decimal.Decimal`` scalars held in an object ndarray."""
    dtype = DecimalDtype()

    def __init__(self, values):
        """Store ``values`` as an object ndarray; every element must be a Decimal."""
        assert all(isinstance(v, decimal.Decimal) for v in values)
        self._data = np.asarray(values, dtype=object)
        # Some aliases for common attribute names to ensure pandas supports
        # these
        self._items = self.data = self._data
        # those aliases are currently not working due to assumptions
        # in internal code (GH-20735)
        # self._values = self.values = self.data

    @classmethod
    def _from_sequence(cls, scalars):
        """Build an array directly from a sequence of scalars."""
        return cls(scalars)

    @classmethod
    def _from_factorized(cls, values, original):
        """Build an array from factorized ``values``; ``original`` is unused."""
        return cls(values)

    def __getitem__(self, item):
        """Return a scalar for an integer ``item``, otherwise a new array."""
        selected = self._data[item]
        if isinstance(item, numbers.Integral):
            return selected
        return type(self)(selected)

    def take(self, indexer, allow_fill=False, fill_value=None):
        """Take elements via ``pandas.api.extensions.take``.

        When ``allow_fill`` is set and no ``fill_value`` is given, the dtype's
        ``na_value`` is used as the fill.
        """
        from pandas.api.extensions import take

        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        taken = take(self._data, indexer, fill_value=fill_value,
                     allow_fill=allow_fill)
        return self._from_sequence(taken)

    def copy(self, deep=False):
        """Return a new array built from a copy of the data (deep) or from ``self``."""
        source = self._data.copy() if deep else self
        return type(self)(source)

    def __setitem__(self, key, value):
        """Convert ``value`` (scalar or list-like) to Decimal and store it at ``key``."""
        if pd.api.types.is_list_like(value):
            converted = [decimal.Decimal(v) for v in value]
        else:
            converted = decimal.Decimal(value)
        self._data[key] = converted

    def __len__(self):
        return len(self._data)

    def __repr__(self):
        return 'DecimalArray({!r})'.format(self._data)

    @property
    def nbytes(self):
        """Length times ``sys.getsizeof`` of the first element; 0 when empty."""
        count = len(self)
        if not count:
            return 0
        return count * sys.getsizeof(self[0])

    def isna(self):
        """Boolean ndarray marking the elements for which ``is_nan()`` is true."""
        nan_flags = [element.is_nan() for element in self._data]
        return np.array(nan_flags, dtype=bool)

    @property
    def _na_value(self):
        return decimal.Decimal('NaN')

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate the underlying data of several arrays into one array."""
        pieces = [arr._data for arr in to_concat]
        return cls(np.concatenate(pieces))
def make_data():
    """Return a list of 100 Decimals, each built from ``random.random()``."""
    floats = (random.random() for _ in range(100))
    return [decimal.Decimal(value) for value in floats]
@@ -1,185 +0,0 @@
import decimal
import numpy as np
import pandas as pd
import pandas.util.testing as tm
import pytest
from pandas.tests.extension import base
from .array import DecimalDtype, DecimalArray, make_data
@pytest.fixture
def dtype():
    """Provide a ``DecimalDtype`` instance."""
    decimal_dtype = DecimalDtype()
    return decimal_dtype
@pytest.fixture
def data():
    """DecimalArray built from ``make_data()``."""
    values = make_data()
    return DecimalArray(values)
@pytest.fixture
def data_missing():
    """Length-2 DecimalArray: Decimal NaN followed by Decimal(1)."""
    missing = decimal.Decimal('NaN')
    valid = decimal.Decimal(1)
    return DecimalArray([missing, valid])
@pytest.fixture
def data_for_sorting():
    """DecimalArray holding 1, 2, 0 in that order."""
    values = [decimal.Decimal(text) for text in ('1', '2', '0')]
    return DecimalArray(values)
@pytest.fixture
def data_missing_for_sorting():
    """DecimalArray holding 1, NaN, 0 in that order."""
    values = [decimal.Decimal(text) for text in ('1', 'NaN', '0')]
    return DecimalArray(values)
@pytest.fixture
def na_cmp():
    """Return a predicate that is true when both arguments report ``is_nan()``."""
    def both_nan(x, y):
        return x.is_nan() and y.is_nan()

    return both_nan
@pytest.fixture
def na_value():
    """The scalar missing value for these tests: Decimal NaN."""
    missing = decimal.Decimal("NaN")
    return missing
@pytest.fixture
def data_for_grouping():
    """DecimalArray laid out as [b, b, NA, NA, a, a, b, c] with a=0, b=1, c=2."""
    a, b, c = (decimal.Decimal(text) for text in ('0.0', '1.0', '2.0'))
    na = decimal.Decimal('NaN')
    return DecimalArray([b, b, na, na, a, a, b, c])
class BaseDecimal(object):
    """Mixin overriding the comparison helpers for frames/series with decimal data."""

    def assert_series_equal(self, left, right, *args, **kwargs):
        """Assert two Series are equal, comparing NA positions separately.

        The ``isna()`` masks must match; the remaining (non-NA) values are
        then compared with ``tm.assert_series_equal``, which receives the
        extra ``args``/``kwargs``.
        """
        left_na = left.isna()
        right_na = right.isna()

        tm.assert_series_equal(left_na, right_na)
        return tm.assert_series_equal(left[~left_na],
                                      right[~right_na],
                                      *args, **kwargs)

    def assert_frame_equal(self, left, right, *args, **kwargs):
        """Assert two DataFrames are equal.

        Columns labels are compared first. Columns whose dtype equals
        ``'decimal'`` go through ``self.assert_series_equal``; all other
        columns are compared with ``tm.assert_frame_equal``.
        """
        # TODO(EA): select_dtypes
        tm.assert_index_equal(
            left.columns, right.columns,
            exact=kwargs.get('check_column_type', 'equiv'),
            check_names=kwargs.get('check_names', True),
            check_exact=kwargs.get('check_exact', False),
            check_categorical=kwargs.get('check_categorical', True),
            obj='{obj}.columns'.format(obj=kwargs.get('obj', 'DataFrame')))

        # Select only the labels where the mask is True. Taking ``.index`` of
        # the mask itself would return every column.
        is_decimal = left.dtypes == 'decimal'
        decimals = is_decimal[is_decimal].index

        for col in decimals:
            self.assert_series_equal(left[col], right[col],
                                     *args, **kwargs)

        left = left.drop(columns=decimals)
        right = right.drop(columns=decimals)
        tm.assert_frame_equal(left, right, *args, **kwargs)
class TestDtype(BaseDecimal, base.BaseDtypeTests):
    """Run the inherited dtype tests with the decimal comparison helpers."""
class TestInterface(BaseDecimal, base.BaseInterfaceTests):
    """Run the inherited interface tests with the decimal comparison helpers."""
class TestConstructors(BaseDecimal, base.BaseConstructorsTests):
    """Run the inherited constructor tests with the decimal comparison helpers."""
class TestReshaping(BaseDecimal, base.BaseReshapingTests):
    """Run the inherited reshaping tests with the decimal comparison helpers."""
class TestGetitem(BaseDecimal, base.BaseGetitemTests):
    """Inherited getitem tests plus a take test with a custom Decimal fill value."""

    def test_take_na_value_other_decimal(self):
        """``take`` with ``allow_fill`` puts the given Decimal where the indexer is -1."""
        one, two = decimal.Decimal('1.0'), decimal.Decimal('2.0')
        fill = decimal.Decimal('-1.0')
        arr = DecimalArray([one, two])

        result = arr.take([0, -1], allow_fill=True, fill_value=fill)

        expected = DecimalArray([decimal.Decimal('1.0'),
                                 decimal.Decimal('-1.0')])
        self.assert_extension_array_equal(result, expected)
class TestMissing(BaseDecimal, base.BaseMissingTests):
    """Run the inherited missing-data tests with the decimal comparison helpers."""
class TestMethods(BaseDecimal, base.BaseMethodsTests):
    """Inherited method tests with a decimal-specific ``value_counts`` test."""

    @pytest.mark.parametrize('dropna', [True, False])
    @pytest.mark.xfail(reason="value_counts not implemented yet.")
    def test_value_counts(self, all_data, dropna):
        """Compare ``value_counts`` on the first ten values against a reference (xfail)."""
        subset = all_data[:10]
        # With dropna, the reference is built from the non-NA values only.
        other = np.array(subset[~subset.isna()]) if dropna else subset

        result = pd.Series(subset).value_counts(dropna=dropna).sort_index()
        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()

        tm.assert_series_equal(result, expected)
class TestCasting(BaseDecimal, base.BaseCastingTests):
    """Run the inherited casting tests with the decimal comparison helpers."""
class TestGroupby(BaseDecimal, base.BaseGroupbyTests):
    """Run the inherited groupby tests with the decimal comparison helpers."""
def test_series_constructor_coerce_data_to_extension_dtype_raises():
    """Building a Series of plain ints with ``dtype=DecimalDtype()`` raises ValueError."""
    expected_msg = ("Cannot cast data to extension dtype 'decimal'. Pass the "
                    "extension array directly.")
    plain_values = [0, 1, 2]
    with tm.assert_raises_regex(ValueError, expected_msg):
        pd.Series(plain_values, dtype=DecimalDtype())
def test_series_constructor_with_same_dtype_ok():
    """Passing the array's own dtype to the Series constructor is accepted."""
    values = DecimalArray([decimal.Decimal('10.0')])
    with_dtype = pd.Series(values, dtype=DecimalDtype())
    without_dtype = pd.Series(values)
    tm.assert_series_equal(with_dtype, without_dtype)
def test_series_constructor_coerce_extension_array_to_dtype_raises():
    """Passing ``dtype='int64'`` together with a DecimalArray raises ValueError."""
    expected_msg = r"Cannot specify a dtype 'int64' .* \('decimal'\)."
    values = DecimalArray([decimal.Decimal('10.0')])
    with tm.assert_raises_regex(ValueError, expected_msg):
        pd.Series(values, dtype='int64')
def test_dataframe_constructor_with_same_dtype_ok():
    """Passing the array's own dtype to the DataFrame constructor is accepted."""
    values = DecimalArray([decimal.Decimal('10.0')])
    with_dtype = pd.DataFrame({"A": values}, dtype=DecimalDtype())
    without_dtype = pd.DataFrame({"A": values})
    tm.assert_frame_equal(with_dtype, without_dtype)
def test_dataframe_constructor_with_different_dtype_raises():
    """Passing ``dtype='int64'`` with a DecimalArray column raises ValueError."""
    expected_msg = "Cannot coerce extension array to dtype 'int64'. "
    values = DecimalArray([decimal.Decimal('10.0')])
    with tm.assert_raises_regex(ValueError, expected_msg):
        pd.DataFrame({"A": values}, dtype='int64')
@@ -1,178 +0,0 @@
"""Test extension array for storing nested data in a pandas container.
The JSONArray stores lists of dictionaries. The storage mechanism is a list,
not an ndarray.
Note:
We currently store lists of UserDicts (Py3 only). Pandas has a few places
internally that specifically check for dicts, and does non-scalar things
in that case. We *want* the dictionaries to be treated as scalars, so we
hack around pandas by using UserDicts.
"""
import collections
import itertools
import numbers
import random
import string
import sys
import numpy as np
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.arrays import ExtensionArray
class JSONDtype(ExtensionDtype):
    """ExtensionDtype whose scalars are mappings."""
    try:
        # The ABC aliases on ``collections`` itself (``collections.Mapping``)
        # were removed in Python 3.10; take the ABC from ``collections.abc``.
        from collections.abc import Mapping as type
    except ImportError:
        # Py2 has no ``collections.abc`` submodule.
        type = collections.Mapping
    name = 'json'
    try:
        na_value = collections.UserDict()
    except AttributeError:
        # source compatibility with Py2.
        na_value = {}

    @classmethod
    def construct_from_string(cls, string):
        """Return a new instance when ``string`` equals ``cls.name``.

        Raises
        ------
        TypeError
            If ``string`` is anything else.
        """
        if string == cls.name:
            return cls()
        else:
            raise TypeError("Cannot construct a '{}' from "
                            "'{}'".format(cls, string))
class JSONArray(ExtensionArray):
    """Test ExtensionArray that keeps its mappings in a plain Python list."""
    dtype = JSONDtype()

    # The ABC aliases on ``collections`` itself (Iterable, Sequence, ...) were
    # removed in Python 3.10. Prefer ``collections.abc`` and fall back to
    # ``collections`` on Py2, which has no such submodule.
    try:
        from collections import abc as _collections_abc
    except ImportError:
        _collections_abc = collections

    def __init__(self, values):
        """Store ``values`` after checking each element against ``dtype.type``.

        Raises
        ------
        TypeError
            If any element is not an instance of ``self.dtype.type``.
        """
        for val in values:
            if not isinstance(val, self.dtype.type):
                raise TypeError
        self.data = values

        # Some aliases for common attribute names to ensure pandas supports
        # these
        self._items = self._data = self.data
        # those aliases are currently not working due to assumptions
        # in internal code (GH-20735)
        # self._values = self.values = self.data

    @classmethod
    def _from_sequence(cls, scalars):
        """Build an array directly from a sequence of scalars."""
        return cls(scalars)

    @classmethod
    def _from_factorized(cls, values, original):
        """Rebuild an array from factorized values, dropping the ``()`` entries."""
        return cls([collections.UserDict(x) for x in values if x != ()])

    def __getitem__(self, item):
        """Return a scalar for an integer key, otherwise a new array.

        Boolean ndarrays act as masks, other iterables as positional (fancy)
        indexers, and anything else is passed to the list as a slice.
        """
        if isinstance(item, numbers.Integral):
            return self.data[item]
        elif isinstance(item, np.ndarray) and item.dtype == 'bool':
            return self._from_sequence([x for x, m in zip(self, item) if m])
        elif isinstance(item, self._collections_abc.Iterable):
            # fancy indexing
            return type(self)([self.data[i] for i in item])
        else:
            # slice
            return type(self)(self.data[item])

    def __setitem__(self, key, value):
        """Set one element (integer key) or several (mask / iterable of positions).

        A ``value`` that is neither a JSONArray nor a Sequence is broadcast to
        every selected position.
        """
        if isinstance(key, numbers.Integral):
            self.data[key] = value
        else:
            if not isinstance(value, (type(self),
                                      self._collections_abc.Sequence)):
                # broadcast value
                value = itertools.cycle([value])

            if isinstance(key, np.ndarray) and key.dtype == 'bool':
                # masking
                for i, (k, v) in enumerate(zip(key, value)):
                    if k:
                        assert isinstance(v, self.dtype.type)
                        self.data[i] = v
            else:
                for k, v in zip(key, value):
                    assert isinstance(v, self.dtype.type)
                    self.data[k] = v

    def __len__(self):
        return len(self.data)

    def __repr__(self):
        return 'JSONArray({!r})'.format(self.data)

    @property
    def nbytes(self):
        """``sys.getsizeof`` of the backing list."""
        return sys.getsizeof(self.data)

    def isna(self):
        """Boolean ndarray marking elements equal to the dtype's ``na_value``."""
        return np.array([x == self.dtype.na_value for x in self.data],
                        dtype=bool)

    def take(self, indexer, allow_fill=False, fill_value=None):
        """Take elements by position.

        With ``allow_fill``, ``-1`` in ``indexer`` selects ``fill_value``
        (the dtype's ``na_value`` when not given) and anything below ``-1``
        raises ValueError. Out-of-bounds positions raise IndexError.
        """
        # re-implement here, since NumPy has trouble setting
        # sized objects like UserDicts into scalar slots of
        # an ndarray.
        indexer = np.asarray(indexer)
        msg = ("Index is out of bounds or cannot do a "
               "non-empty take from an empty array.")

        if allow_fill:
            if fill_value is None:
                fill_value = self.dtype.na_value
            # bounds check
            if (indexer < -1).any():
                raise ValueError
            try:
                output = [self.data[loc] if loc != -1 else fill_value
                          for loc in indexer]
            except IndexError:
                raise IndexError(msg)
        else:
            try:
                output = [self.data[loc] for loc in indexer]
            except IndexError:
                raise IndexError(msg)

        return self._from_sequence(output)

    def copy(self, deep=False):
        """Return a new array over a shallow copy of the list; ``deep`` is ignored."""
        return type(self)(self.data[:])

    def astype(self, dtype, copy=True):
        """Return an ndarray of plain dicts with the requested ``dtype``."""
        # NumPy has issues when all the dicts are the same length.
        # np.array([UserDict(...), UserDict(...)]) fails,
        # but np.array([{...}, {...}]) works, so cast.
        return np.array([dict(x) for x in self], dtype=dtype, copy=copy)

    def unique(self):
        """Return a new array of the distinct mappings, as plain dicts."""
        # Parent method doesn't work since np.array will try to infer
        # a 2-dim object.
        return type(self)([
            dict(x) for x in set(tuple(d.items()) for d in self.data)
        ])

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Chain the backing lists of several arrays into one array."""
        data = list(itertools.chain.from_iterable([x.data for x in to_concat]))
        return cls(data)

    def _values_for_factorize(self):
        """Return the argsort values together with ``()`` as the NA sentinel."""
        frozen = self._values_for_argsort()
        return frozen, ()

    def _values_for_argsort(self):
        """Return a 1-D object ndarray holding each mapping's items as a tuple."""
        # Disable NumPy's shape inference by including an empty tuple...
        # If all the elements of self are the same size P, NumPy will
        # cast them to an (N, P) array, instead of an (N,) array of tuples.
        frozen = [()] + list(tuple(x.items()) for x in self)
        return np.array(frozen, dtype=object)[1:]
def make_data():
    """Return 100 UserDicts, each built from 0-10 random (letter, int) pairs."""
    # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
    records = []
    for _ in range(100):
        n_items = random.randint(0, 10)
        pairs = [
            (random.choice(string.ascii_letters), random.randint(0, 100))
            for _ in range(n_items)
        ]
        records.append(collections.UserDict(pairs))
    return records
@@ -1,232 +0,0 @@
import operator
import collections
import pytest
import pandas as pd
import pandas.util.testing as tm
from pandas.compat import PY2, PY36
from pandas.tests.extension import base
from .array import JSONArray, JSONDtype, make_data
# Module-level mark: skip these tests on Python 2, which has no UserDict.
pytestmark = pytest.mark.skipif(PY2, reason="Py2 doesn't have a UserDict")
@pytest.fixture
def dtype():
    """Provide a ``JSONDtype`` instance."""
    json_dtype = JSONDtype()
    return json_dtype
@pytest.fixture
def data():
    """JSONArray built from ``make_data()`` whose first two elements differ in length."""
    data = make_data()

    # Why the while loop? NumPy is unable to construct an ndarray from
    # equal-length ndarrays. Many of our operations involve coercing the
    # EA to an ndarray of objects. To avoid random test failures, we ensure
    # that our data is coercible to an ndarray. Several tests deal with only
    # the first two elements, so that's what we'll check.

    while len(data[0]) == len(data[1]):
        data = make_data()

    return JSONArray(data)
@pytest.fixture
def data_missing():
    """Length-2 JSONArray laid out as [NA, Valid]."""
    records = [{}, {'a': 10}]
    return JSONArray(records)
@pytest.fixture
def data_for_sorting():
    """Three-element JSONArray used by the sorting tests."""
    records = [{'b': 1}, {'c': 4}, {'a': 2, 'c': 3}]
    return JSONArray(records)
@pytest.fixture
def data_missing_for_sorting():
    """Three-element JSONArray with an empty mapping in the middle."""
    records = [{'b': 1}, {}, {'a': 4}]
    return JSONArray(records)
@pytest.fixture
def na_value(dtype):
    """The scalar missing value, read from the ``dtype`` fixture."""
    missing = dtype.na_value
    return missing
@pytest.fixture
def na_cmp():
    """Compare NA values by equality (``operator.eq``)."""
    equality_check = operator.eq
    return equality_check
@pytest.fixture
def data_for_grouping():
    """Eight-element JSONArray with repeated and empty mappings, for grouping tests."""
    records = [
        {'b': 1}, {'b': 1},
        {}, {},
        {'a': 0, 'c': 2}, {'a': 0, 'c': 2},
        {'b': 1},
        {'c': 2},
    ]
    return JSONArray(records)
class BaseJSON(object):
    """Mixin overriding the comparison helpers for frames/series with JSON data."""
    # NumPy doesn't handle an array of equal-length UserDicts.
    # The default assert_series_equal eventually does a
    # Series.values, which raises. We work around it by
    # converting the UserDicts to dicts.

    def assert_series_equal(self, left, right, **kwargs):
        """Assert two Series are equal.

        When ``left`` has the ``'json'`` dtype, the dtypes must match and both
        sides are rebuilt from ``values.astype(object)`` before being compared
        with ``tm.assert_series_equal``.
        """
        if left.dtype.name == 'json':
            assert left.dtype == right.dtype
            left = pd.Series(JSONArray(left.values.astype(object)),
                             index=left.index, name=left.name)
            right = pd.Series(JSONArray(right.values.astype(object)),
                              index=right.index, name=right.name)
        tm.assert_series_equal(left, right, **kwargs)

    def assert_frame_equal(self, left, right, *args, **kwargs):
        """Assert two DataFrames are equal.

        Column labels are compared first. Columns whose dtype equals
        ``'json'`` go through ``self.assert_series_equal``; all other columns
        are compared with ``tm.assert_frame_equal``.
        """
        tm.assert_index_equal(
            left.columns, right.columns,
            exact=kwargs.get('check_column_type', 'equiv'),
            check_names=kwargs.get('check_names', True),
            check_exact=kwargs.get('check_exact', False),
            check_categorical=kwargs.get('check_categorical', True),
            obj='{obj}.columns'.format(obj=kwargs.get('obj', 'DataFrame')))

        # Select only the labels where the mask is True. Taking ``.index`` of
        # the mask itself would return every column.
        is_json = left.dtypes == 'json'
        jsons = is_json[is_json].index

        for col in jsons:
            self.assert_series_equal(left[col], right[col],
                                     *args, **kwargs)

        left = left.drop(columns=jsons)
        right = right.drop(columns=jsons)
        tm.assert_frame_equal(left, right, *args, **kwargs)
class TestDtype(BaseJSON, base.BaseDtypeTests):
    """Run the inherited dtype tests with the JSON comparison helpers."""
class TestInterface(BaseJSON, base.BaseInterfaceTests):
    """Inherited interface tests plus a check of the custom assert helpers."""

    def test_custom_asserts(self):
        """The custom asserts accept equal objects and reject different ones."""
        # This would always trigger the KeyError from trying to put
        # an array of equal-length UserDicts inside an ndarray.
        records = [collections.UserDict({'a': 1}),
                   collections.UserDict({'b': 2}),
                   collections.UserDict({'c': 3})]
        data = JSONArray(records)

        same = pd.Series(data)
        self.assert_series_equal(same, same)
        self.assert_frame_equal(same.to_frame(), same.to_frame())

        different = pd.Series(data.take([0, 0, 1]))
        with pytest.raises(AssertionError):
            self.assert_series_equal(same, different)

        with pytest.raises(AssertionError):
            self.assert_frame_equal(same.to_frame(), different.to_frame())
class TestConstructors(BaseJSON, base.BaseConstructorsTests):
    """Run the inherited constructor tests with the JSON comparison helpers."""
class TestReshaping(BaseJSON, base.BaseReshapingTests):
    """Run the inherited reshaping tests with the JSON comparison helpers."""
class TestGetitem(BaseJSON, base.BaseGetitemTests):
    """Run the inherited getitem tests with the JSON comparison helpers."""
class TestMissing(BaseJSON, base.BaseMissingTests):
    """Inherited missing-data tests; the two fillna tests are marked xfail."""

    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_series(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""

    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_frame(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""
# Marker: unconditionally skip a test, with "Unhashable" as the reason.
unhashable = pytest.mark.skip(reason="Unhashable")
# Marker: skip a test unless PY36 is true.
unstable = pytest.mark.skipif(not PY36, # 3.6 or higher
reason="Dictionary order unstable")
class TestMethods(BaseJSON, base.BaseMethodsTests):
    """Inherited method tests; overrides only attach skip markers."""

    @unhashable
    def test_value_counts(self, all_data, dropna):
        """Skipped override of the inherited test."""

    @unhashable
    def test_sort_values_frame(self):
        """Skipped override of the inherited test."""
        # TODO (EA.factorize): see if _values_for_factorize allows this.

    @unstable
    def test_argsort(self, data_for_sorting):
        """Delegate to the inherited test under the ``unstable`` marker."""
        parent = super(TestMethods, self)
        parent.test_argsort(data_for_sorting)

    @unstable
    def test_argsort_missing(self, data_missing_for_sorting):
        """Delegate to the inherited test under the ``unstable`` marker."""
        parent = super(TestMethods, self)
        parent.test_argsort_missing(data_missing_for_sorting)

    @unstable
    @pytest.mark.parametrize('ascending', [True, False])
    def test_sort_values(self, data_for_sorting, ascending):
        """Delegate to the inherited test under the ``unstable`` marker."""
        parent = super(TestMethods, self)
        parent.test_sort_values(data_for_sorting, ascending)

    @unstable
    @pytest.mark.parametrize('ascending', [True, False])
    def test_sort_values_missing(self, data_missing_for_sorting, ascending):
        """Delegate to the inherited test under the ``unstable`` marker."""
        parent = super(TestMethods, self)
        parent.test_sort_values_missing(data_missing_for_sorting, ascending)
class TestCasting(BaseJSON, base.BaseCastingTests):
    """Inherited casting tests, with ``test_astype_str`` marked xfail."""

    @pytest.mark.xfail
    def test_astype_str(self):
        """This currently fails in NumPy on np.array(self, dtype=str) with

        *** ValueError: setting an array element with a sequence
        """
# We intentionally don't run base.BaseSetitemTests because pandas'
# internals have trouble setting sequences of values into scalar positions.
class TestGroupby(BaseJSON, base.BaseGroupbyTests):
    """Inherited groupby tests; overrides only attach skip markers."""

    @unhashable
    def test_groupby_extension_transform(self):
        """
        This currently fails in Series.name.setter, since the
        name must be hashable, but the value is a dictionary.
        I think this is what we want, i.e. `.name` should be the original
        values, and not the values for factorization.
        """

    @unhashable
    def test_groupby_extension_apply(self):
        """
        This fails in Index._do_unique_check with

        >   hash(val)
        E   TypeError: unhashable type: 'UserDict' with

        I suspect that once we support Index[ExtensionArray],
        we'll be able to dispatch unique.
        """

    @unstable
    @pytest.mark.parametrize('as_index', [True, False])
    def test_groupby_extension_agg(self, as_index, data_for_grouping):
        """Delegate to the inherited test under the ``unstable`` marker."""
        parent = super(TestGroupby, self)
        parent.test_groupby_extension_agg(as_index, data_for_grouping)
@@ -1,85 +0,0 @@
import numpy as np
import pytest
import pandas as pd
import pandas.util.testing as tm
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.common import is_extension_array_dtype
from pandas.core.dtypes import dtypes
class DummyDtype(dtypes.ExtensionDtype):
    """Bare ``ExtensionDtype`` subclass that adds nothing of its own."""
class DummyArray(ExtensionArray):
    """Minimal ExtensionArray that wraps whatever ``data`` it is given."""

    def __init__(self, data):
        self.data = data

    def __array__(self, dtype=None):
        """Return the wrapped data unchanged; ``dtype`` is accepted but ignored.

        ``dtype`` defaults to ``None`` so the method can also be called with
        no argument.
        """
        return self.data

    @property
    def dtype(self):
        """The dtype of the wrapped data."""
        return self.data.dtype
class TestExtensionArrayDtype(object):
    """Check ``is_extension_array_dtype`` against several kinds of input."""

    @pytest.mark.parametrize('values', [
        pd.Categorical([]),
        pd.Categorical([]).dtype,
        pd.Series(pd.Categorical([])),
        DummyDtype(),
        DummyArray(np.array([1, 2])),
    ])
    def test_is_extension_array_dtype(self, values):
        """Each parametrized value is recognised as an extension array dtype."""
        recognised = is_extension_array_dtype(values)
        assert recognised

    @pytest.mark.parametrize('values', [
        np.array([]),
        pd.Series(np.array([])),
    ])
    def test_is_not_extension_array_dtype(self, values):
        """Plain NumPy-backed values are not recognised."""
        recognised = is_extension_array_dtype(values)
        assert not recognised
def test_astype():
    """``astype`` to object gives the same ndarray for the type and its string name."""
    arr = DummyArray(np.array([1, 2, 3]))
    expected = np.array([1, 2, 3], dtype=object)

    for target in (object, 'object'):
        result = arr.astype(target)
        tm.assert_numpy_array_equal(result, expected)
def test_astype_no_copy():
    """``copy=False`` returns the wrapped data itself; the default returns a new object."""
    arr = DummyArray(np.array([1, 2, 3], dtype=np.int64))

    uncopied = arr.astype(arr.dtype, copy=False)
    assert arr.data is uncopied

    copied = arr.astype(arr.dtype)
    assert arr.data is not copied
@pytest.mark.parametrize('dtype', [
    dtypes.DatetimeTZDtype('ns', 'US/Central'),
    dtypes.PeriodDtype("D"),
    dtypes.IntervalDtype(),
])
def test_is_not_extension_array_dtype(dtype):
    """These dtypes are neither ExtensionDtype instances nor extension array dtypes."""
    is_extension_instance = isinstance(dtype, dtypes.ExtensionDtype)
    assert not is_extension_instance
    assert not is_extension_array_dtype(dtype)
@pytest.mark.parametrize('dtype', [
    dtypes.CategoricalDtype(),
])
def test_is_extension_array_dtype(dtype):
    """CategoricalDtype is an ExtensionDtype instance and an extension array dtype."""
    is_extension_instance = isinstance(dtype, dtypes.ExtensionDtype)
    assert is_extension_instance
    assert is_extension_array_dtype(dtype)
@@ -1,77 +0,0 @@
# -*- coding: utf-8 -*-
# pylint: disable=W0102
import numpy as np
import pandas as pd
from pandas.core.internals import (
BlockManager, SingleBlockManager, NonConsolidatableMixIn, Block)
import pytest
class CustomBlock(NonConsolidatableMixIn, Block):
    """Block subclass with custom formatting and concatenation."""

    _holder = np.ndarray

    def formatting_values(self):
        """Return the values rendered as ``"Val: <value>"`` strings."""
        labels = ["Val: {}".format(value) for value in self.values]
        return np.array(labels)

    def concat_same_type(self, to_concat, placement=None):
        """
        Always concatenate disregarding self.ndim as the values are
        always 1D in this custom Block
        """
        pieces = [blk.values for blk in to_concat]
        values = np.concatenate(pieces)
        if not placement:
            # Fall back to a placement spanning every concatenated value.
            placement = slice(0, len(values), 1)
        return self.make_block_same_class(values, placement=placement)
@pytest.fixture
def df():
    """DataFrame with columns 'a' and 'b', where 'b' is backed by a CustomBlock."""
    base_frame = pd.DataFrame({'a': [1, 2, 3]})
    custom_block = CustomBlock(np.arange(3, dtype='int64'),
                               placement=slice(1, 2))
    all_blocks = base_frame._data.blocks + (custom_block,)
    axes = [pd.Index(['a', 'b']), base_frame.index]
    return pd.DataFrame(BlockManager(all_blocks, axes))
def test_custom_repr():
    """The block's ``formatting_values`` drive the Series and DataFrame reprs."""
    values = np.arange(3, dtype='int64')

    # series
    series_block = CustomBlock(values, placement=slice(0, 3))
    ser = pd.Series(SingleBlockManager(series_block, pd.RangeIndex(3)))
    expected_series_repr = '0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64'
    assert repr(ser) == expected_series_repr

    # dataframe
    frame_block = CustomBlock(values, placement=slice(0, 1))
    manager = BlockManager([frame_block], [['col'], range(3)])
    frame = pd.DataFrame(manager)
    expected_frame_repr = ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2'
    assert repr(frame) == expected_frame_repr
def test_concat_series():
    """Concatenating Series backed by a CustomBlock keeps the block type."""
    # GH17728
    block = CustomBlock(np.arange(3, dtype='int64'), placement=slice(0, 3))
    ser = pd.Series(block, pd.RangeIndex(3), fastpath=True)

    combined = pd.concat([ser, ser])
    first_block = combined._data.blocks[0]
    assert isinstance(first_block, CustomBlock)
def test_concat_dataframe(df):
    """Row-wise concat of the frame with itself keeps the CustomBlock."""
    # GH17728
    combined = pd.concat([df, df])
    second_block = combined._data.blocks[1]
    assert isinstance(second_block, CustomBlock)
def test_concat_axis1(df):
    """Column-wise concat with a float frame keeps the CustomBlock."""
    # GH17954
    float_frame = pd.DataFrame({'c': [.1, .2, .3]})
    combined = pd.concat([df, float_frame], axis=1)
    second_block = combined._data.blocks[1]
    assert isinstance(second_block, CustomBlock)