demo + utils venv

This commit is contained in:
d3m1g0d
2019-02-03 13:40:10 +01:00
parent 5fa112490b
commit cfa9c8ea23
5994 changed files with 1353819 additions and 0 deletions
@@ -0,0 +1,115 @@
import numpy as np
import pytest
from pandas import DataFrame, SparseArray, SparseDataFrame, bdate_range
# Shared raw material for the fixtures in this module: four columns of
# length 10 (three of them containing runs of NaN) ...
data = dict(
    A=[np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6],
    B=[0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6],
    C=np.arange(10, dtype=np.float64),
    D=[0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan],
)
# ... and the ten business days used as their common index.
dates = bdate_range(start='1/1/2011', periods=10)
# fixture names must be compatible with the tests in
# tests/frame/test_api.SharedWithSparse
@pytest.fixture
def float_frame_dense():
    """
    Dense DataFrame built from the module-level ``data`` and ``dates``.

    Columns are ['A', 'B', 'C', 'D'] on a DatetimeIndex; some entries
    are missing.
    """
    dense = DataFrame(data, index=dates)
    return dense
@pytest.fixture
def float_frame():
    """
    Sparse DataFrame built from the module-level ``data`` and ``dates``.

    Columns are ['A', 'B', 'C', 'D'] on a DatetimeIndex; some entries
    are missing.
    """
    # 'block' is already the default kind; it is spelled out on purpose
    frame_options = {'index': dates, 'default_kind': 'block'}
    return SparseDataFrame(data, **frame_options)
@pytest.fixture
def float_frame_int_kind():
    """
    Sparse DataFrame of floats constructed with default_kind='integer'.

    Columns are ['A', 'B', 'C', 'D'] on a DatetimeIndex; some entries
    are missing.
    """
    frame_options = {'index': dates, 'default_kind': 'integer'}
    return SparseDataFrame(data, **frame_options)
@pytest.fixture
def float_string_frame():
    """
    Sparse DataFrame mixing float columns with one string column.

    Columns are ['A', 'B', 'C', 'D', 'foo'] on a DatetimeIndex; some of
    the float entries are missing, while 'foo' holds 'bar' in every row.
    """
    n_rows = len(dates)
    string_column = SparseArray(['bar'] * n_rows)

    mixed = SparseDataFrame(data, index=dates)
    mixed['foo'] = string_column
    return mixed
@pytest.fixture
def float_frame_fill0_dense():
    """
    Dense DataFrame of floats in which every missing entry is 0.

    Columns are ['A', 'B', 'C', 'D'] on a DatetimeIndex.
    """
    column_names = ['A', 'B', 'C', 'D']
    dense_values = SparseDataFrame(data).values
    missing = np.isnan(dense_values)
    dense_values[missing] = 0
    return DataFrame(dense_values, columns=column_names, index=dates)
@pytest.fixture
def float_frame_fill0():
    """
    Sparse DataFrame of floats in which every missing entry is 0.

    Columns are ['A', 'B', 'C', 'D'] on a DatetimeIndex; the frame is
    created with default_fill_value=0.
    """
    column_names = ['A', 'B', 'C', 'D']
    dense_values = SparseDataFrame(data).values
    missing = np.isnan(dense_values)
    dense_values[missing] = 0
    return SparseDataFrame(
        dense_values,
        columns=column_names,
        default_fill_value=0,
        index=dates,
    )
@pytest.fixture
def float_frame_fill2_dense():
    """
    Dense DataFrame of floats in which every missing entry is 2.

    Columns are ['A', 'B', 'C', 'D'] on a DatetimeIndex.
    """
    column_names = ['A', 'B', 'C', 'D']
    dense_values = SparseDataFrame(data).values
    missing = np.isnan(dense_values)
    dense_values[missing] = 2
    return DataFrame(dense_values, columns=column_names, index=dates)
@pytest.fixture
def float_frame_fill2():
    """
    Sparse DataFrame of floats in which every missing entry is 2.

    Columns are ['A', 'B', 'C', 'D'] on a DatetimeIndex; the frame is
    created with default_fill_value=2.
    """
    column_names = ['A', 'B', 'C', 'D']
    dense_values = SparseDataFrame(data).values
    missing = np.isnan(dense_values)
    dense_values[missing] = 2
    return SparseDataFrame(
        dense_values,
        columns=column_names,
        default_fill_value=2,
        index=dates,
    )
@pytest.fixture
def empty_frame():
    """
    SparseDataFrame constructed without any data, index or columns.
    """
    blank = SparseDataFrame()
    return blank
@@ -0,0 +1,39 @@
import numpy as np
import pytest
from pandas import DataFrame, SparseDataFrame, SparseSeries
from pandas.util import testing as tm
@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
def test_quantile():
    """
    A scalar quantile of a sparse frame should match the dense result.

    See GH 17386; currently expected to fail.
    """
    rows = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]]
    quantile = 0.1

    result = SparseDataFrame(rows).quantile(quantile)

    dense_expected = DataFrame(rows).quantile(quantile)
    sparse_expected = SparseSeries(dense_expected)

    # The result is compared against both the dense and sparse expectation
    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
def test_quantile_multi():
    """
    A list of quantiles on a sparse frame should match the dense result.

    See GH 17386; currently expected to fail.
    """
    rows = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]]
    quantiles = [0.1, 0.5]

    result = SparseDataFrame(rows).quantile(quantiles)

    dense_expected = DataFrame(rows).quantile(quantiles)
    sparse_expected = SparseDataFrame(dense_expected)

    # The result is compared against both the dense and sparse expectation
    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
@@ -0,0 +1,105 @@
import numpy as np
import pytest
from pandas import DataFrame, Series, SparseDataFrame, bdate_range
from pandas.core import nanops
from pandas.core.sparse.api import SparseDtype
from pandas.util import testing as tm
@pytest.fixture
def dates():
    """
    Ten business days beginning at '1/1/2011', used as a frame index.
    """
    index = bdate_range('1/1/2011', periods=10)
    return index
@pytest.fixture
def empty():
    """
    SparseDataFrame constructed without any data.
    """
    blank = SparseDataFrame()
    return blank
@pytest.fixture
def frame(dates):
    """
    Sparse frame with columns 'A'-'D' indexed by the ``dates`` fixture.

    Columns 'A', 'B' and 'D' contain runs of NaN; 'C' is fully populated.
    """
    columns = dict(
        A=[np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6],
        B=[0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6],
        C=np.arange(10, dtype=np.float64),
        D=[0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan],
    )
    return SparseDataFrame(columns, index=dates)
@pytest.fixture
def fill_frame(frame):
    """
    Variant of ``frame`` whose NaN entries are replaced by 2.

    The result is a SparseDataFrame created with default_fill_value=2
    and sharing the index of ``frame``.
    """
    # Work on a copy so the values of the ``frame`` fixture stay untouched
    filled = frame.values.copy()
    missing = np.isnan(filled)
    filled[missing] = 2
    return SparseDataFrame(
        filled,
        columns=['A', 'B', 'C', 'D'],
        default_fill_value=2,
        index=frame.index,
    )
def test_apply(frame):
    """
    Check element-wise, broadcast and reducing ``apply`` on a sparse frame.
    """
    # Element-wise function: the result stays sparse
    rooted = frame.apply(np.sqrt)
    assert isinstance(rooted, SparseDataFrame)
    tm.assert_almost_equal(rooted.values, np.sqrt(frame.values))

    # agg / broadcast: both the sparse and the dense call are expected
    # to emit a FutureWarning
    with tm.assert_produces_warning(FutureWarning):
        broadcasted = frame.apply(np.sum, broadcast=True)
    assert isinstance(broadcasted, SparseDataFrame)

    with tm.assert_produces_warning(FutureWarning):
        dense_broadcasted = frame.to_dense().apply(np.sum, broadcast=True)
    tm.assert_frame_equal(broadcasted.to_dense(), dense_broadcasted)

    # Reduction: compared with the dense nansum converted back to sparse
    summed = frame.apply(np.sum)
    expected_sum = frame.to_dense().apply(nanops.nansum).to_sparse()
    tm.assert_series_equal(summed, expected_sum)
def test_apply_fill(fill_frame):
    """
    ``apply`` should transform the fill value along with the data.
    """
    rooted = fill_frame.apply(np.sqrt)
    column_fill = rooted['A'].fill_value
    assert column_fill == np.sqrt(2)
def test_apply_empty(empty):
    """
    Applying a function to an empty sparse frame returns the same object.
    """
    outcome = empty.apply(np.sqrt)
    assert outcome is empty
def test_apply_nonuq():
    """
    Row-wise ``apply`` on a sparse frame whose index has duplicate labels.
    """
    def first(values):
        return values[0]

    dense = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      index=['a', 'a', 'c'])
    sparse = dense.to_sparse()
    res = sparse.apply(first, axis=1)
    exp = dense.apply(first, axis=1)

    # dtype must be kept
    assert res.dtype == SparseDtype(np.int64)

    # ToDo: apply must return subclassed dtype
    assert isinstance(res, Series)
    tm.assert_series_equal(res.to_dense(), exp)

    # df.T breaks
    transposed = dense.T
    sparse = transposed.to_sparse()
    res = sparse.apply(first, axis=0)  # noqa
    exp = transposed.apply(first, axis=0)

    # TODO: no non-unique columns supported in sparse yet
    # tm.assert_series_equal(res.to_dense(), exp)
def test_applymap(frame):
    """
    Smoke test: ``applymap`` runs and keeps the frame sparse.
    """
    def double(value):
        return value * 2

    doubled = frame.applymap(double)
    assert isinstance(doubled, SparseDataFrame)
def test_apply_keep_sparse_dtype():
    """
    ``apply`` on a DataFrame built from a sparse frame matches the sparse
    frame's own ``apply`` result (GH 23744).
    """
    raw = np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]])
    sparse_frame = SparseDataFrame(raw, columns=['b', 'a', 'c'],
                                   default_fill_value=1)
    wrapped = DataFrame(sparse_frame)

    expected = sparse_frame.apply(np.exp)
    result = wrapped.apply(np.exp)

    tm.assert_frame_equal(expected, result)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,109 @@
import numpy as np
import pytest
from pandas import DataFrame, SparseDataFrame
from pandas.util import testing as tm
# Module-level mark: pytest applies ``pytestmark`` to every test collected
# from this module, so all tests below are skipped.
# NOTE(review): each test below also carries its own xfail mark with the
# same GH 17386 reason.
pytestmark = pytest.mark.skip("Wrong SparseBlock initialization (GH 17386)")
@pytest.mark.parametrize('data', [
    [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]],
    [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]],
    [
        [1.0, 1.0 + 1.0j],
        [2.0 + 2.0j, 2.0],
        [3.0, 3.0 + 3.0j],
        [4.0 + 4.0j, 4.0],
        [np.nan, np.nan]
    ]
])
@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
def test_where_with_numeric_data(data):
    """
    ``where`` on numeric sparse data should agree with the dense result.

    See GH 17386; currently expected to fail.
    """
    threshold = 1.5

    sparse_frame = SparseDataFrame(data)
    result = sparse_frame.where(sparse_frame > threshold)

    dense_frame = DataFrame(data)
    dense_expected = dense_frame.where(dense_frame > threshold)
    sparse_expected = SparseDataFrame(dense_expected)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
@pytest.mark.parametrize('data', [
    [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]],
    [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]],
    [
        [1.0, 1.0 + 1.0j],
        [2.0 + 2.0j, 2.0],
        [3.0, 3.0 + 3.0j],
        [4.0 + 4.0j, 4.0],
        [np.nan, np.nan]
    ]
])
@pytest.mark.parametrize('other', [
    True,
    -100,
    0.1,
    100.0 + 100.0j
])
@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
def test_where_with_numeric_data_and_other(data, other):
    """
    ``where`` with a replacement value on numeric sparse data should
    agree with the dense result.

    See GH 17386; currently expected to fail.
    """
    threshold = 1.5

    sparse_frame = SparseDataFrame(data)
    result = sparse_frame.where(sparse_frame > threshold, other)

    dense_frame = DataFrame(data)
    dense_expected = dense_frame.where(dense_frame > threshold, other)
    # The sparse expectation uses the replacement value as its fill value
    sparse_expected = SparseDataFrame(dense_expected,
                                      default_fill_value=other)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
def test_where_with_bool_data():
    """
    ``where`` on boolean sparse data should agree with the dense result.

    See GH 17386; currently expected to fail.
    """
    rows = [[False, False], [True, True], [False, False]]
    wanted = True

    sparse_frame = SparseDataFrame(rows)
    result = sparse_frame.where(sparse_frame == wanted)

    dense_frame = DataFrame(rows)
    dense_expected = dense_frame.where(dense_frame == wanted)
    sparse_expected = SparseDataFrame(dense_expected)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
@pytest.mark.parametrize('other', [
    True,
    0,
    0.1,
    100.0 + 100.0j
])
@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
def test_where_with_bool_data_and_other(other):
    """
    ``where`` with a replacement value on boolean sparse data should
    agree with the dense result.

    See GH 17386; currently expected to fail.
    """
    rows = [[False, False], [True, True], [False, False]]
    wanted = True

    sparse_frame = SparseDataFrame(rows)
    result = sparse_frame.where(sparse_frame == wanted, other)

    dense_frame = DataFrame(rows)
    dense_expected = dense_frame.where(dense_frame == wanted, other)
    # The sparse expectation uses the replacement value as its fill value
    sparse_expected = SparseDataFrame(dense_expected,
                                      default_fill_value=other)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
@@ -0,0 +1,21 @@
import numpy as np
import pytest
from pandas import SparseDataFrame, read_csv
from pandas.util import testing as tm
class TestSparseDataFrameToCsv(object):
    """Round-trip a SparseDataFrame through ``to_csv`` / ``read_csv``."""

    # Used both as the column content and as the parametrized fill values
    fill_values = [np.nan, 0, None, 1]

    @pytest.mark.parametrize('fill_value', fill_values)
    def test_to_csv_sparse_dataframe(self, fill_value):
        """
        Writing a sparse frame to CSV and reading it back should give an
        equal sparse frame for each fill value (GH19384).
        """
        column = type(self).fill_values
        original = SparseDataFrame({'a': column},
                                   default_fill_value=fill_value)

        with tm.ensure_clean('sparse_df.csv') as path:
            original.to_csv(path, index=False)
            # Blank lines are kept when reading the file back
            reloaded = read_csv(path, skip_blank_lines=False)
            round_tripped = reloaded.to_sparse(fill_value=fill_value)

            tm.assert_sp_frame_equal(round_tripped, original)
@@ -0,0 +1,185 @@
from distutils.version import LooseVersion
import numpy as np
import pytest
from pandas.core.dtypes.common import is_bool_dtype
import pandas as pd
from pandas import SparseDataFrame, SparseSeries
from pandas.core.sparse.api import SparseDtype
from pandas.util import testing as tm
# Import scipy through pytest so that the whole module is skipped when
# scipy cannot be imported.
scipy = pytest.importorskip('scipy')
# Reusable mark that silences PendingDeprecationWarning messages matching
# "the matrix subclass" for the tests it decorates.
ignore_matrix_warning = pytest.mark.filterwarnings(
"ignore:the matrix subclass:PendingDeprecationWarning"
)
@pytest.mark.parametrize('index', [None, list('abc')])  # noqa: F811
@pytest.mark.parametrize('columns', [None, list('def')])
@pytest.mark.parametrize('fill_value', [None, 0, np.nan])
@pytest.mark.parametrize('dtype', [bool, int, float, np.uint16])
@ignore_matrix_warning
def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype):
    """
    Round-trip between a sparse matrix and SparseDataFrame (GH 4343).

    ``spmatrix`` is the sparse-matrix constructor under test; ``index``,
    ``columns`` and ``fill_value`` are forwarded to SparseDataFrame and
    ``dtype`` is the element type of the source array.
    """
    # One ndarray and one sparse matrix derived from it serve as the
    # common source for building frames and for comparing results
    arr = np.eye(3, dtype=dtype)
    # GH 16179: add an off-diagonal entry
    arr[0, 1] = dtype(2)
    try:
        spm = spmatrix(arr)
        assert spm.dtype == arr.dtype
    except (TypeError, AssertionError):
        # The conversion failed or changed the dtype for this matrix
        # type / dtype pair: the combination is treated as unsupported
        # and the test ends here without further checks
        return

    sdf = SparseDataFrame(spm, index=index, columns=columns,
                          default_fill_value=fill_value)

    # Building the expectation for every dtype / fill_value pairing is
    # awkward, so everything is cast to object and zeros become NaN
    as_object = arr.astype(object)
    zero_mask = arr == 0
    as_object[zero_mask] = np.nan
    effective_fill = np.nan if fill_value is None else fill_value
    expected = SparseDataFrame(
        as_object, index=index, columns=columns).fillna(effective_fill)

    # Assert frame is as expected
    sdf_obj = sdf.astype(object)
    tm.assert_sp_frame_equal(sdf_obj, expected)
    tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense())

    # Assert spmatrices equal
    assert dict(sdf.to_coo().todok()) == dict(spm.todok())

    # Ensure dtype is preserved if possible
    # XXX: verify this
    res_dtype = bool if is_bool_dtype(dtype) else dtype
    column_subtypes = sdf.dtypes.apply(
        lambda column_dtype: column_dtype.subtype)
    tm.assert_contains_all(column_subtypes, {np.dtype(res_dtype)})
    assert sdf.to_coo().dtype == res_dtype

    # However, adding a str column results in an upcast to object
    sdf['strings'] = np.arange(len(sdf)).astype(str)
    assert sdf.to_coo().dtype == np.object_
@pytest.mark.parametrize('fill_value', [None, 0, np.nan])  # noqa: F811
@ignore_matrix_warning
@pytest.mark.filterwarnings("ignore:object dtype is not supp:UserWarning")
def test_from_to_scipy_object(spmatrix, fill_value):
    """
    Round-trip between an object-dtype sparse matrix and SparseDataFrame
    (GH 4343).
    """
    dtype = object
    columns = list('cd')
    index = list('ab')

    is_dok = spmatrix is scipy.sparse.dok_matrix
    if is_dok and (LooseVersion(scipy.__version__) >=
                   LooseVersion('0.19.0')):
        pytest.skip("dok_matrix from object does not work in SciPy >= 0.19")

    # One ndarray and one sparse matrix derived from it serve as the
    # common source for building frames and for comparing results
    arr = np.eye(2, dtype=dtype)
    try:
        spm = spmatrix(arr)
        assert spm.dtype == arr.dtype
    except (TypeError, AssertionError):
        # The conversion failed or changed the dtype for this matrix
        # type / dtype pair: the combination is treated as unsupported
        # and the test ends here without further checks
        return

    sdf = SparseDataFrame(spm, index=index, columns=columns,
                          default_fill_value=fill_value)

    # Building the expectation for every dtype / fill_value pairing is
    # awkward, so everything is cast to object and zeros become NaN
    as_object = arr.astype(object)
    zero_mask = arr == 0
    as_object[zero_mask] = np.nan
    effective_fill = np.nan if fill_value is None else fill_value
    expected = SparseDataFrame(
        as_object, index=index, columns=columns).fillna(effective_fill)

    # Assert frame is as expected
    sdf_obj = sdf.astype(SparseDtype(object, fill_value))
    tm.assert_sp_frame_equal(sdf_obj, expected)
    tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense())

    # Assert spmatrices equal
    assert dict(sdf.to_coo().todok()) == dict(spm.todok())

    # Ensure dtype is preserved if possible
    res_dtype = object
    column_subtypes = sdf.dtypes.apply(
        lambda column_dtype: column_dtype.subtype)
    tm.assert_contains_all(column_subtypes, {np.dtype(res_dtype)})
    assert sdf.to_coo().dtype == res_dtype
@ignore_matrix_warning
def test_from_scipy_correct_ordering(spmatrix):
    """
    A frame built from a sparse matrix should equal the frame built from
    the source ndarray (GH 16179).
    """
    source = np.arange(1, 5).reshape(2, 2)
    try:
        spm = spmatrix(source)
        assert spm.dtype == source.dtype
    except (TypeError, AssertionError):
        # The conversion failed or changed the dtype for this matrix
        # type: the combination is treated as unsupported and the test
        # ends here without further checks
        return

    from_matrix = SparseDataFrame(spm)
    from_array = SparseDataFrame(source)

    tm.assert_sp_frame_equal(from_matrix, from_array)
    tm.assert_frame_equal(from_matrix.to_dense(), from_array.to_dense())
@ignore_matrix_warning
def test_from_scipy_fillna(spmatrix):
    """
    ``fillna`` on a frame built from a sparse matrix containing NaN
    entries (GH 16112).
    """
    source = np.eye(3)
    source[1:, 0] = np.nan
    try:
        spm = spmatrix(source)
        assert spm.dtype == source.dtype
    except (TypeError, AssertionError):
        # The conversion failed or changed the dtype for this matrix
        # type: the combination is treated as unsupported and the test
        # ends here without further checks
        return

    filled = SparseDataFrame(spm).fillna(-1.0)

    # Returning frame should fill all nan values with -1.0
    expected_columns = {
        0: SparseSeries([1., -1, -1]),
        1: SparseSeries([np.nan, 1, np.nan]),
        2: SparseSeries([np.nan, np.nan, 1]),
    }
    expected = SparseDataFrame(expected_columns, default_fill_value=-1)

    # The fill_value should be whatever .fillna() was called with above.
    # The expected SparseSeries are not created with -1 as their initial
    # fill_value: setting it afterwards yields "compressed" SparseArrays
    # without having to build them by hand
    for label in expected:
        expected[label].fill_value = -1

    tm.assert_sp_frame_equal(filled, expected)
def test_index_names_multiple_nones():
    """
    ``to_coo`` on a sparse Series whose MultiIndex levels are unnamed.

    See https://github.com/pandas-dev/pandas/pull/24092
    """
    sparse = pytest.importorskip("scipy.sparse")

    levels = pd.MultiIndex.from_product([['A', 'B'], [0, 1]])
    series = pd.Series(1, index=levels).to_sparse()

    matrix, _, _ = series.to_coo()
    assert isinstance(matrix, sparse.coo_matrix)

    dense = matrix.toarray()
    expected = np.ones((2, 2), dtype="int64")
    tm.assert_numpy_array_equal(dense, expected)