pruned venvs
This commit is contained in:
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -1,812 +0,0 @@
|
||||
# pylint: disable=E1103
|
||||
|
||||
from warnings import catch_warnings
|
||||
from numpy.random import randn
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas.compat import lrange
|
||||
import pandas.compat as compat
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
from pandas import DataFrame, MultiIndex, Series, Index, merge, concat
|
||||
|
||||
from pandas._libs import join as libjoin
|
||||
import pandas.util.testing as tm
|
||||
from pandas.tests.reshape.merge.test_merge import get_test_data, N, NGROUPS
|
||||
|
||||
|
||||
a_ = np.array
|
||||
|
||||
|
||||
class TestJoin(object):
|
||||
|
||||
def setup_method(self, method):
|
||||
# aggregate multiple columns
|
||||
self.df = DataFrame({'key1': get_test_data(),
|
||||
'key2': get_test_data(),
|
||||
'data1': np.random.randn(N),
|
||||
'data2': np.random.randn(N)})
|
||||
|
||||
# exclude a couple keys for fun
|
||||
self.df = self.df[self.df['key2'] > 1]
|
||||
|
||||
self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
|
||||
'key2': get_test_data(ngroups=NGROUPS // 2,
|
||||
n=N // 5),
|
||||
'value': np.random.randn(N // 5)})
|
||||
|
||||
index, data = tm.getMixedTypeDict()
|
||||
self.target = DataFrame(data, index=index)
|
||||
|
||||
# Join on string value
|
||||
self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']},
|
||||
index=data['C'])
|
||||
|
||||
def test_cython_left_outer_join(self):
|
||||
left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
|
||||
right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
|
||||
max_group = 5
|
||||
|
||||
ls, rs = libjoin.left_outer_join(left, right, max_group)
|
||||
|
||||
exp_ls = left.argsort(kind='mergesort')
|
||||
exp_rs = right.argsort(kind='mergesort')
|
||||
|
||||
exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
|
||||
6, 6, 7, 7, 8, 8, 9, 10])
|
||||
exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
|
||||
4, 5, 4, 5, 4, 5, -1, -1])
|
||||
|
||||
exp_ls = exp_ls.take(exp_li)
|
||||
exp_ls[exp_li == -1] = -1
|
||||
|
||||
exp_rs = exp_rs.take(exp_ri)
|
||||
exp_rs[exp_ri == -1] = -1
|
||||
|
||||
tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
|
||||
tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
|
||||
|
||||
def test_cython_right_outer_join(self):
|
||||
left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
|
||||
right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
|
||||
max_group = 5
|
||||
|
||||
rs, ls = libjoin.left_outer_join(right, left, max_group)
|
||||
|
||||
exp_ls = left.argsort(kind='mergesort')
|
||||
exp_rs = right.argsort(kind='mergesort')
|
||||
|
||||
# 0 1 1 1
|
||||
exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
|
||||
# 2 2 4
|
||||
6, 7, 8, 6, 7, 8, -1])
|
||||
exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
|
||||
4, 4, 4, 5, 5, 5, 6])
|
||||
|
||||
exp_ls = exp_ls.take(exp_li)
|
||||
exp_ls[exp_li == -1] = -1
|
||||
|
||||
exp_rs = exp_rs.take(exp_ri)
|
||||
exp_rs[exp_ri == -1] = -1
|
||||
|
||||
tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
|
||||
tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
|
||||
|
||||
def test_cython_inner_join(self):
|
||||
left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
|
||||
right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
|
||||
max_group = 5
|
||||
|
||||
ls, rs = libjoin.inner_join(left, right, max_group)
|
||||
|
||||
exp_ls = left.argsort(kind='mergesort')
|
||||
exp_rs = right.argsort(kind='mergesort')
|
||||
|
||||
exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
|
||||
6, 6, 7, 7, 8, 8])
|
||||
exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
|
||||
4, 5, 4, 5, 4, 5])
|
||||
|
||||
exp_ls = exp_ls.take(exp_li)
|
||||
exp_ls[exp_li == -1] = -1
|
||||
|
||||
exp_rs = exp_rs.take(exp_ri)
|
||||
exp_rs[exp_ri == -1] = -1
|
||||
|
||||
tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
|
||||
tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
|
||||
|
||||
def test_left_outer_join(self):
|
||||
joined_key2 = merge(self.df, self.df2, on='key2')
|
||||
_check_join(self.df, self.df2, joined_key2, ['key2'], how='left')
|
||||
|
||||
joined_both = merge(self.df, self.df2)
|
||||
_check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
|
||||
how='left')
|
||||
|
||||
def test_right_outer_join(self):
|
||||
joined_key2 = merge(self.df, self.df2, on='key2', how='right')
|
||||
_check_join(self.df, self.df2, joined_key2, ['key2'], how='right')
|
||||
|
||||
joined_both = merge(self.df, self.df2, how='right')
|
||||
_check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
|
||||
how='right')
|
||||
|
||||
def test_full_outer_join(self):
|
||||
joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
|
||||
_check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')
|
||||
|
||||
joined_both = merge(self.df, self.df2, how='outer')
|
||||
_check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
|
||||
how='outer')
|
||||
|
||||
def test_inner_join(self):
|
||||
joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
|
||||
_check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')
|
||||
|
||||
joined_both = merge(self.df, self.df2, how='inner')
|
||||
_check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
|
||||
how='inner')
|
||||
|
||||
def test_handle_overlap(self):
|
||||
joined = merge(self.df, self.df2, on='key2',
|
||||
suffixes=['.foo', '.bar'])
|
||||
|
||||
assert 'key1.foo' in joined
|
||||
assert 'key1.bar' in joined
|
||||
|
||||
def test_handle_overlap_arbitrary_key(self):
|
||||
joined = merge(self.df, self.df2,
|
||||
left_on='key2', right_on='key1',
|
||||
suffixes=['.foo', '.bar'])
|
||||
assert 'key1.foo' in joined
|
||||
assert 'key2.bar' in joined
|
||||
|
||||
def test_join_on(self):
|
||||
target = self.target
|
||||
source = self.source
|
||||
|
||||
merged = target.join(source, on='C')
|
||||
tm.assert_series_equal(merged['MergedA'], target['A'],
|
||||
check_names=False)
|
||||
tm.assert_series_equal(merged['MergedD'], target['D'],
|
||||
check_names=False)
|
||||
|
||||
# join with duplicates (fix regression from DataFrame/Matrix merge)
|
||||
df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
|
||||
df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
|
||||
joined = df.join(df2, on='key')
|
||||
expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
|
||||
'value': [0, 0, 1, 1, 2]})
|
||||
assert_frame_equal(joined, expected)
|
||||
|
||||
# Test when some are missing
|
||||
df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
|
||||
columns=['one'])
|
||||
df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
|
||||
columns=['two'])
|
||||
df_c = DataFrame([[1], [2]], index=[1, 2],
|
||||
columns=['three'])
|
||||
joined = df_a.join(df_b, on='one')
|
||||
joined = joined.join(df_c, on='one')
|
||||
assert np.isnan(joined['two']['c'])
|
||||
assert np.isnan(joined['three']['c'])
|
||||
|
||||
# merge column not p resent
|
||||
pytest.raises(KeyError, target.join, source, on='E')
|
||||
|
||||
# overlap
|
||||
source_copy = source.copy()
|
||||
source_copy['A'] = 0
|
||||
pytest.raises(ValueError, target.join, source_copy, on='A')
|
||||
|
||||
def test_join_on_fails_with_different_right_index(self):
|
||||
with pytest.raises(ValueError):
|
||||
df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
|
||||
'b': np.random.randn(3)})
|
||||
df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
|
||||
'b': np.random.randn(10)},
|
||||
index=tm.makeCustomIndex(10, 2))
|
||||
merge(df, df2, left_on='a', right_index=True)
|
||||
|
||||
def test_join_on_fails_with_different_left_index(self):
|
||||
with pytest.raises(ValueError):
|
||||
df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
|
||||
'b': np.random.randn(3)},
|
||||
index=tm.makeCustomIndex(10, 2))
|
||||
df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
|
||||
'b': np.random.randn(10)})
|
||||
merge(df, df2, right_on='b', left_index=True)
|
||||
|
||||
def test_join_on_fails_with_different_column_counts(self):
|
||||
with pytest.raises(ValueError):
|
||||
df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
|
||||
'b': np.random.randn(3)})
|
||||
df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
|
||||
'b': np.random.randn(10)},
|
||||
index=tm.makeCustomIndex(10, 2))
|
||||
merge(df, df2, right_on='a', left_on=['a', 'b'])
|
||||
|
||||
def test_join_on_fails_with_wrong_object_type(self):
|
||||
# GH12081
|
||||
wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])]
|
||||
df = DataFrame({'a': [1, 1]})
|
||||
|
||||
for obj in wrongly_typed:
|
||||
with tm.assert_raises_regex(ValueError, str(type(obj))):
|
||||
merge(obj, df, left_on='a', right_on='a')
|
||||
with tm.assert_raises_regex(ValueError, str(type(obj))):
|
||||
merge(df, obj, left_on='a', right_on='a')
|
||||
|
||||
def test_join_on_pass_vector(self):
|
||||
expected = self.target.join(self.source, on='C')
|
||||
del expected['C']
|
||||
|
||||
join_col = self.target.pop('C')
|
||||
result = self.target.join(self.source, on=join_col)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_with_len0(self):
|
||||
# nothing to merge
|
||||
merged = self.target.join(self.source.reindex([]), on='C')
|
||||
for col in self.source:
|
||||
assert col in merged
|
||||
assert merged[col].isna().all()
|
||||
|
||||
merged2 = self.target.join(self.source.reindex([]), on='C',
|
||||
how='inner')
|
||||
tm.assert_index_equal(merged2.columns, merged.columns)
|
||||
assert len(merged2) == 0
|
||||
|
||||
def test_join_on_inner(self):
|
||||
df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
|
||||
df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])
|
||||
|
||||
joined = df.join(df2, on='key', how='inner')
|
||||
|
||||
expected = df.join(df2, on='key')
|
||||
expected = expected[expected['value'].notna()]
|
||||
tm.assert_series_equal(joined['key'], expected['key'],
|
||||
check_dtype=False)
|
||||
tm.assert_series_equal(joined['value'], expected['value'],
|
||||
check_dtype=False)
|
||||
tm.assert_index_equal(joined.index, expected.index)
|
||||
|
||||
def test_join_on_singlekey_list(self):
|
||||
df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
|
||||
df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
|
||||
|
||||
# corner cases
|
||||
joined = df.join(df2, on=['key'])
|
||||
expected = df.join(df2, on='key')
|
||||
|
||||
assert_frame_equal(joined, expected)
|
||||
|
||||
def test_join_on_series(self):
|
||||
result = self.target.join(self.source['MergedA'], on='C')
|
||||
expected = self.target.join(self.source[['MergedA']], on='C')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_on_series_buglet(self):
|
||||
# GH #638
|
||||
df = DataFrame({'a': [1, 1]})
|
||||
ds = Series([2], index=[1], name='b')
|
||||
result = df.join(ds, on='a')
|
||||
expected = DataFrame({'a': [1, 1],
|
||||
'b': [2, 2]}, index=df.index)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_index_mixed(self, join_type):
|
||||
# no overlapping blocks
|
||||
df1 = DataFrame(index=np.arange(10))
|
||||
df1['bool'] = True
|
||||
df1['string'] = 'foo'
|
||||
|
||||
df2 = DataFrame(index=np.arange(5, 15))
|
||||
df2['int'] = 1
|
||||
df2['float'] = 1.
|
||||
|
||||
joined = df1.join(df2, how=join_type)
|
||||
expected = _join_by_hand(df1, df2, how=join_type)
|
||||
assert_frame_equal(joined, expected)
|
||||
|
||||
joined = df2.join(df1, how=join_type)
|
||||
expected = _join_by_hand(df2, df1, how=join_type)
|
||||
assert_frame_equal(joined, expected)
|
||||
|
||||
def test_join_index_mixed_overlap(self):
|
||||
df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
|
||||
index=np.arange(10),
|
||||
columns=['A', 'B', 'C', 'D'])
|
||||
assert df1['B'].dtype == np.int64
|
||||
assert df1['D'].dtype == np.bool_
|
||||
|
||||
df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
|
||||
index=np.arange(0, 10, 2),
|
||||
columns=['A', 'B', 'C', 'D'])
|
||||
|
||||
# overlap
|
||||
joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
|
||||
expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
|
||||
'A_two', 'B_two', 'C_two', 'D_two']
|
||||
df1.columns = expected_columns[:4]
|
||||
df2.columns = expected_columns[4:]
|
||||
expected = _join_by_hand(df1, df2)
|
||||
assert_frame_equal(joined, expected)
|
||||
|
||||
def test_join_empty_bug(self):
|
||||
# generated an exception in 0.4.3
|
||||
x = DataFrame()
|
||||
x.join(DataFrame([3], index=[0], columns=['A']), how='outer')
|
||||
|
||||
def test_join_unconsolidated(self):
|
||||
# GH #331
|
||||
a = DataFrame(randn(30, 2), columns=['a', 'b'])
|
||||
c = Series(randn(30))
|
||||
a['c'] = c
|
||||
d = DataFrame(randn(30, 1), columns=['q'])
|
||||
|
||||
# it works!
|
||||
a.join(d)
|
||||
d.join(a)
|
||||
|
||||
def test_join_multiindex(self):
|
||||
index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
|
||||
[1, 2, 3, 1, 2, 3]],
|
||||
names=['first', 'second'])
|
||||
|
||||
index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
|
||||
[1, 2, 3, 1, 2, 3]],
|
||||
names=['first', 'second'])
|
||||
|
||||
df1 = DataFrame(data=np.random.randn(6), index=index1,
|
||||
columns=['var X'])
|
||||
df2 = DataFrame(data=np.random.randn(6), index=index2,
|
||||
columns=['var Y'])
|
||||
|
||||
df1 = df1.sort_index(level=0)
|
||||
df2 = df2.sort_index(level=0)
|
||||
|
||||
joined = df1.join(df2, how='outer')
|
||||
ex_index = Index(index1.values).union(Index(index2.values))
|
||||
expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
|
||||
expected.index.names = index1.names
|
||||
assert_frame_equal(joined, expected)
|
||||
assert joined.index.names == index1.names
|
||||
|
||||
df1 = df1.sort_index(level=1)
|
||||
df2 = df2.sort_index(level=1)
|
||||
|
||||
joined = df1.join(df2, how='outer').sort_index(level=0)
|
||||
ex_index = Index(index1.values).union(Index(index2.values))
|
||||
expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
|
||||
expected.index.names = index1.names
|
||||
|
||||
assert_frame_equal(joined, expected)
|
||||
assert joined.index.names == index1.names
|
||||
|
||||
def test_join_inner_multiindex(self):
|
||||
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
|
||||
'qux', 'snap']
|
||||
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
|
||||
'three', 'one']
|
||||
|
||||
data = np.random.randn(len(key1))
|
||||
data = DataFrame({'key1': key1, 'key2': key2,
|
||||
'data': data})
|
||||
|
||||
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
|
||||
['one', 'two', 'three']],
|
||||
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
|
||||
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
|
||||
names=['first', 'second'])
|
||||
to_join = DataFrame(np.random.randn(10, 3), index=index,
|
||||
columns=['j_one', 'j_two', 'j_three'])
|
||||
|
||||
joined = data.join(to_join, on=['key1', 'key2'], how='inner')
|
||||
expected = merge(data, to_join.reset_index(),
|
||||
left_on=['key1', 'key2'],
|
||||
right_on=['first', 'second'], how='inner',
|
||||
sort=False)
|
||||
|
||||
expected2 = merge(to_join, data,
|
||||
right_on=['key1', 'key2'], left_index=True,
|
||||
how='inner', sort=False)
|
||||
assert_frame_equal(joined, expected2.reindex_like(joined))
|
||||
|
||||
expected2 = merge(to_join, data, right_on=['key1', 'key2'],
|
||||
left_index=True, how='inner', sort=False)
|
||||
|
||||
expected = expected.drop(['first', 'second'], axis=1)
|
||||
expected.index = joined.index
|
||||
|
||||
assert joined.index.is_monotonic
|
||||
assert_frame_equal(joined, expected)
|
||||
|
||||
# _assert_same_contents(expected, expected2.loc[:, expected.columns])
|
||||
|
||||
def test_join_hierarchical_mixed(self):
|
||||
# GH 2024
|
||||
df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
|
||||
new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
|
||||
other_df = DataFrame(
|
||||
[(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
|
||||
other_df.set_index('a', inplace=True)
|
||||
# GH 9455, 12219
|
||||
with tm.assert_produces_warning(UserWarning):
|
||||
result = merge(new_df, other_df, left_index=True, right_index=True)
|
||||
assert ('b', 'mean') in result
|
||||
assert 'b' in result
|
||||
|
||||
def test_join_float64_float32(self):
|
||||
|
||||
a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
|
||||
b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
|
||||
joined = a.join(b)
|
||||
assert joined.dtypes['a'] == 'float64'
|
||||
assert joined.dtypes['b'] == 'float64'
|
||||
assert joined.dtypes['c'] == 'float32'
|
||||
|
||||
a = np.random.randint(0, 5, 100).astype('int64')
|
||||
b = np.random.random(100).astype('float64')
|
||||
c = np.random.random(100).astype('float32')
|
||||
df = DataFrame({'a': a, 'b': b, 'c': c})
|
||||
xpdf = DataFrame({'a': a, 'b': b, 'c': c})
|
||||
s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
|
||||
rs = df.merge(s, left_on='a', right_index=True)
|
||||
assert rs.dtypes['a'] == 'int64'
|
||||
assert rs.dtypes['b'] == 'float64'
|
||||
assert rs.dtypes['c'] == 'float32'
|
||||
assert rs.dtypes['md'] == 'float32'
|
||||
|
||||
xp = xpdf.merge(s, left_on='a', right_index=True)
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_join_many_non_unique_index(self):
|
||||
df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
|
||||
df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
|
||||
df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
|
||||
idf1 = df1.set_index(["a", "b"])
|
||||
idf2 = df2.set_index(["a", "b"])
|
||||
idf3 = df3.set_index(["a", "b"])
|
||||
|
||||
result = idf1.join([idf2, idf3], how='outer')
|
||||
|
||||
df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
|
||||
expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')
|
||||
|
||||
result = result.reset_index()
|
||||
expected = expected[result.columns]
|
||||
expected['a'] = expected.a.astype('int64')
|
||||
expected['b'] = expected.b.astype('int64')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
|
||||
df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
|
||||
df3 = DataFrame(
|
||||
{"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
|
||||
idf1 = df1.set_index(["a", "b"])
|
||||
idf2 = df2.set_index(["a", "b"])
|
||||
idf3 = df3.set_index(["a", "b"])
|
||||
result = idf1.join([idf2, idf3], how='inner')
|
||||
|
||||
df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
|
||||
expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')
|
||||
|
||||
result = result.reset_index()
|
||||
|
||||
assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
# GH 11519
|
||||
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
|
||||
'foo', 'bar', 'foo', 'foo'],
|
||||
'B': ['one', 'one', 'two', 'three',
|
||||
'two', 'two', 'one', 'three'],
|
||||
'C': np.random.randn(8),
|
||||
'D': np.random.randn(8)})
|
||||
s = Series(np.repeat(np.arange(8), 2),
|
||||
index=np.repeat(np.arange(8), 2), name='TEST')
|
||||
inner = df.join(s, how='inner')
|
||||
outer = df.join(s, how='outer')
|
||||
left = df.join(s, how='left')
|
||||
right = df.join(s, how='right')
|
||||
assert_frame_equal(inner, outer)
|
||||
assert_frame_equal(inner, left)
|
||||
assert_frame_equal(inner, right)
|
||||
|
||||
def test_join_sort(self):
|
||||
left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
|
||||
'value': [1, 2, 3, 4]})
|
||||
right = DataFrame({'value2': ['a', 'b', 'c']},
|
||||
index=['bar', 'baz', 'foo'])
|
||||
|
||||
joined = left.join(right, on='key', sort=True)
|
||||
expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
|
||||
'value': [2, 3, 1, 4],
|
||||
'value2': ['a', 'b', 'c', 'c']},
|
||||
index=[1, 2, 0, 3])
|
||||
assert_frame_equal(joined, expected)
|
||||
|
||||
# smoke test
|
||||
joined = left.join(right, on='key', sort=False)
|
||||
tm.assert_index_equal(joined.index, pd.Index(lrange(4)))
|
||||
|
||||
def test_join_mixed_non_unique_index(self):
|
||||
# GH 12814, unorderable types in py3 with a non-unique index
|
||||
df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
|
||||
df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
|
||||
result = df1.join(df2)
|
||||
expected = DataFrame({'a': [1, 2, 3, 3, 4],
|
||||
'b': [5, np.nan, 6, 7, np.nan]},
|
||||
index=[1, 2, 3, 3, 'a'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
|
||||
df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
|
||||
result = df3.join(df4)
|
||||
expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]},
|
||||
index=[1, 2, 2, 'a'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_non_unique_period_index(self):
|
||||
# GH #16871
|
||||
index = pd.period_range('2016-01-01', periods=16, freq='M')
|
||||
df = DataFrame([i for i in range(len(index))],
|
||||
index=index, columns=['pnum'])
|
||||
df2 = concat([df, df])
|
||||
result = df.join(df2, how='inner', rsuffix='_df2')
|
||||
expected = DataFrame(
|
||||
np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
|
||||
columns=['pnum', 'pnum_df2'], index=df2.sort_index().index)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_mixed_type_join_with_suffix(self):
|
||||
# GH #916
|
||||
df = DataFrame(np.random.randn(20, 6),
|
||||
columns=['a', 'b', 'c', 'd', 'e', 'f'])
|
||||
df.insert(0, 'id', 0)
|
||||
df.insert(5, 'dt', 'foo')
|
||||
|
||||
grouped = df.groupby('id')
|
||||
mn = grouped.mean()
|
||||
cn = grouped.count()
|
||||
|
||||
# it works!
|
||||
mn.join(cn, rsuffix='_right')
|
||||
|
||||
def test_join_many(self):
|
||||
df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
|
||||
df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]
|
||||
|
||||
joined = df_list[0].join(df_list[1:])
|
||||
tm.assert_frame_equal(joined, df)
|
||||
|
||||
df_list = [df[['a', 'b']][:-2],
|
||||
df[['c', 'd']][2:], df[['e', 'f']][1:9]]
|
||||
|
||||
def _check_diff_index(df_list, result, exp_index):
|
||||
reindexed = [x.reindex(exp_index) for x in df_list]
|
||||
expected = reindexed[0].join(reindexed[1:])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# different join types
|
||||
joined = df_list[0].join(df_list[1:], how='outer')
|
||||
_check_diff_index(df_list, joined, df.index)
|
||||
|
||||
joined = df_list[0].join(df_list[1:])
|
||||
_check_diff_index(df_list, joined, df_list[0].index)
|
||||
|
||||
joined = df_list[0].join(df_list[1:], how='inner')
|
||||
_check_diff_index(df_list, joined, df.index[2:8])
|
||||
|
||||
pytest.raises(ValueError, df_list[0].join, df_list[1:], on='a')
|
||||
|
||||
def test_join_many_mixed(self):
|
||||
df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
|
||||
df['key'] = ['foo', 'bar'] * 4
|
||||
df1 = df.loc[:, ['A', 'B']]
|
||||
df2 = df.loc[:, ['C', 'D']]
|
||||
df3 = df.loc[:, ['key']]
|
||||
|
||||
result = df1.join([df2, df3])
|
||||
assert_frame_equal(result, df)
|
||||
|
||||
def test_join_dups(self):
|
||||
|
||||
# joining dups
|
||||
df = concat([DataFrame(np.random.randn(10, 4),
|
||||
columns=['A', 'A', 'B', 'B']),
|
||||
DataFrame(np.random.randint(0, 10, size=20)
|
||||
.reshape(10, 2),
|
||||
columns=['A', 'C'])],
|
||||
axis=1)
|
||||
|
||||
expected = concat([df, df], axis=1)
|
||||
result = df.join(df, rsuffix='_2')
|
||||
result.columns = expected.columns
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# GH 4975, invalid join on dups
|
||||
w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
|
||||
x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
|
||||
y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
|
||||
z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
|
||||
|
||||
dta = x.merge(y, left_index=True, right_index=True).merge(
|
||||
z, left_index=True, right_index=True, how="outer")
|
||||
dta = dta.merge(w, left_index=True, right_index=True)
|
||||
expected = concat([x, y, z, w], axis=1)
|
||||
expected.columns = ['x_x', 'y_x', 'x_y',
|
||||
'y_y', 'x_x', 'y_x', 'x_y', 'y_y']
|
||||
assert_frame_equal(dta, expected)
|
||||
|
||||
def test_panel_join(self):
|
||||
with catch_warnings(record=True):
|
||||
panel = tm.makePanel()
|
||||
tm.add_nans(panel)
|
||||
|
||||
p1 = panel.iloc[:2, :10, :3]
|
||||
p2 = panel.iloc[2:, 5:, 2:]
|
||||
|
||||
# left join
|
||||
result = p1.join(p2)
|
||||
expected = p1.copy()
|
||||
expected['ItemC'] = p2['ItemC']
|
||||
tm.assert_panel_equal(result, expected)
|
||||
|
||||
# right join
|
||||
result = p1.join(p2, how='right')
|
||||
expected = p2.copy()
|
||||
expected['ItemA'] = p1['ItemA']
|
||||
expected['ItemB'] = p1['ItemB']
|
||||
expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
|
||||
tm.assert_panel_equal(result, expected)
|
||||
|
||||
# inner join
|
||||
result = p1.join(p2, how='inner')
|
||||
expected = panel.iloc[:, 5:10, 2:3]
|
||||
tm.assert_panel_equal(result, expected)
|
||||
|
||||
# outer join
|
||||
result = p1.join(p2, how='outer')
|
||||
expected = p1.reindex(major=panel.major_axis,
|
||||
minor=panel.minor_axis)
|
||||
expected = expected.join(p2.reindex(major=panel.major_axis,
|
||||
minor=panel.minor_axis))
|
||||
tm.assert_panel_equal(result, expected)
|
||||
|
||||
def test_panel_join_overlap(self):
|
||||
with catch_warnings(record=True):
|
||||
panel = tm.makePanel()
|
||||
tm.add_nans(panel)
|
||||
|
||||
p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
|
||||
p2 = panel.loc[['ItemB', 'ItemC']]
|
||||
|
||||
# Expected index is
|
||||
#
|
||||
# ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
|
||||
joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
|
||||
p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1')
|
||||
p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2')
|
||||
no_overlap = panel.loc[['ItemA']]
|
||||
expected = no_overlap.join(p1_suf.join(p2_suf))
|
||||
tm.assert_panel_equal(joined, expected)
|
||||
|
||||
def test_panel_join_many(self):
|
||||
with catch_warnings(record=True):
|
||||
tm.K = 10
|
||||
panel = tm.makePanel()
|
||||
tm.K = 4
|
||||
|
||||
panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]
|
||||
|
||||
joined = panels[0].join(panels[1:])
|
||||
tm.assert_panel_equal(joined, panel)
|
||||
|
||||
panels = [panel.iloc[:2, :-5],
|
||||
panel.iloc[2:6, 2:],
|
||||
panel.iloc[6:, 5:-7]]
|
||||
|
||||
data_dict = {}
|
||||
for p in panels:
|
||||
data_dict.update(p.iteritems())
|
||||
|
||||
joined = panels[0].join(panels[1:], how='inner')
|
||||
expected = pd.Panel.from_dict(data_dict, intersect=True)
|
||||
tm.assert_panel_equal(joined, expected)
|
||||
|
||||
joined = panels[0].join(panels[1:], how='outer')
|
||||
expected = pd.Panel.from_dict(data_dict, intersect=False)
|
||||
tm.assert_panel_equal(joined, expected)
|
||||
|
||||
# edge cases
|
||||
pytest.raises(ValueError, panels[0].join, panels[1:],
|
||||
how='outer', lsuffix='foo', rsuffix='bar')
|
||||
pytest.raises(ValueError, panels[0].join, panels[1:],
|
||||
how='right')
|
||||
|
||||
|
||||
def _check_join(left, right, result, join_col, how='left',
|
||||
lsuffix='_x', rsuffix='_y'):
|
||||
|
||||
# some smoke tests
|
||||
for c in join_col:
|
||||
assert(result[c].notna().all())
|
||||
|
||||
left_grouped = left.groupby(join_col)
|
||||
right_grouped = right.groupby(join_col)
|
||||
|
||||
for group_key, group in result.groupby(join_col):
|
||||
l_joined = _restrict_to_columns(group, left.columns, lsuffix)
|
||||
r_joined = _restrict_to_columns(group, right.columns, rsuffix)
|
||||
|
||||
try:
|
||||
lgroup = left_grouped.get_group(group_key)
|
||||
except KeyError:
|
||||
if how in ('left', 'inner'):
|
||||
raise AssertionError('key %s should not have been in the join'
|
||||
% str(group_key))
|
||||
|
||||
_assert_all_na(l_joined, left.columns, join_col)
|
||||
else:
|
||||
_assert_same_contents(l_joined, lgroup)
|
||||
|
||||
try:
|
||||
rgroup = right_grouped.get_group(group_key)
|
||||
except KeyError:
|
||||
if how in ('right', 'inner'):
|
||||
raise AssertionError('key %s should not have been in the join'
|
||||
% str(group_key))
|
||||
|
||||
_assert_all_na(r_joined, right.columns, join_col)
|
||||
else:
|
||||
_assert_same_contents(r_joined, rgroup)
|
||||
|
||||
|
||||
def _restrict_to_columns(group, columns, suffix):
|
||||
found = [c for c in group.columns
|
||||
if c in columns or c.replace(suffix, '') in columns]
|
||||
|
||||
# filter
|
||||
group = group.loc[:, found]
|
||||
|
||||
# get rid of suffixes, if any
|
||||
group = group.rename(columns=lambda x: x.replace(suffix, ''))
|
||||
|
||||
# put in the right order...
|
||||
group = group.loc[:, columns]
|
||||
|
||||
return group
|
||||
|
||||
|
||||
def _assert_same_contents(join_chunk, source):
|
||||
NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
|
||||
|
||||
jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
|
||||
svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
|
||||
|
||||
rows = {tuple(row) for row in jvalues}
|
||||
assert(len(rows) == len(source))
|
||||
assert(all(tuple(row) in rows for row in svalues))
|
||||
|
||||
|
||||
def _assert_all_na(join_chunk, source_columns, join_col):
|
||||
for c in source_columns:
|
||||
if c in join_col:
|
||||
continue
|
||||
assert(join_chunk[c].isna().all())
|
||||
|
||||
|
||||
def _join_by_hand(a, b, how='left'):
|
||||
join_index = a.index.join(b.index, how=how)
|
||||
|
||||
a_re = a.reindex(join_index)
|
||||
b_re = b.reindex(join_index)
|
||||
|
||||
result_columns = a.columns.append(b.columns)
|
||||
|
||||
for col, s in compat.iteritems(b_re):
|
||||
a_re[col] = s
|
||||
return a_re.reindex(columns=result_columns)
|
||||
File diff suppressed because it is too large
Load Diff
-1007
File diff suppressed because it is too large
Load Diff
-213
@@ -1,213 +0,0 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.util import testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df1():
|
||||
return DataFrame(dict(
|
||||
outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
|
||||
inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
|
||||
v1=np.linspace(0, 1, 11)))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df2():
|
||||
return DataFrame(dict(
|
||||
outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
|
||||
inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
|
||||
v2=np.linspace(10, 11, 12)))
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']])
|
||||
def left_df(request, df1):
|
||||
""" Construct left test DataFrame with specified levels
|
||||
(any of 'outer', 'inner', and 'v1')"""
|
||||
levels = request.param
|
||||
if levels:
|
||||
df1 = df1.set_index(levels)
|
||||
|
||||
return df1
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']])
|
||||
def right_df(request, df2):
|
||||
""" Construct right test DataFrame with specified levels
|
||||
(any of 'outer', 'inner', and 'v2')"""
|
||||
levels = request.param
|
||||
|
||||
if levels:
|
||||
df2 = df2.set_index(levels)
|
||||
|
||||
return df2
|
||||
|
||||
|
||||
def compute_expected(df_left, df_right,
|
||||
on=None, left_on=None, right_on=None, how=None):
|
||||
"""
|
||||
Compute the expected merge result for the test case.
|
||||
|
||||
This method computes the expected result of merging two DataFrames on
|
||||
a combination of their columns and index levels. It does so by
|
||||
explicitly dropping/resetting their named index levels, performing a
|
||||
merge on their columns, and then finally restoring the appropriate
|
||||
index in the result.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_left : DataFrame
|
||||
The left DataFrame (may have zero or more named index levels)
|
||||
df_right : DataFrame
|
||||
The right DataFrame (may have zero or more named index levels)
|
||||
on : list of str
|
||||
The on parameter to the merge operation
|
||||
left_on : list of str
|
||||
The left_on parameter to the merge operation
|
||||
right_on : list of str
|
||||
The right_on parameter to the merge operation
|
||||
how : str
|
||||
The how parameter to the merge operation
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
The expected merge result
|
||||
"""
|
||||
|
||||
# Handle on param if specified
|
||||
if on is not None:
|
||||
left_on, right_on = on, on
|
||||
|
||||
# Compute input named index levels
|
||||
left_levels = [n for n in df_left.index.names if n is not None]
|
||||
right_levels = [n for n in df_right.index.names if n is not None]
|
||||
|
||||
# Compute output named index levels
|
||||
output_levels = [i for i in left_on
|
||||
if i in right_levels and i in left_levels]
|
||||
|
||||
# Drop index levels that aren't involved in the merge
|
||||
drop_left = [n for n in left_levels if n not in left_on]
|
||||
if drop_left:
|
||||
df_left = df_left.reset_index(drop_left, drop=True)
|
||||
|
||||
drop_right = [n for n in right_levels if n not in right_on]
|
||||
if drop_right:
|
||||
df_right = df_right.reset_index(drop_right, drop=True)
|
||||
|
||||
# Convert remaining index levels to columns
|
||||
reset_left = [n for n in left_levels if n in left_on]
|
||||
if reset_left:
|
||||
df_left = df_left.reset_index(level=reset_left)
|
||||
|
||||
reset_right = [n for n in right_levels if n in right_on]
|
||||
if reset_right:
|
||||
df_right = df_right.reset_index(level=reset_right)
|
||||
|
||||
# Perform merge
|
||||
expected = df_left.merge(df_right,
|
||||
left_on=left_on,
|
||||
right_on=right_on,
|
||||
how=how)
|
||||
|
||||
# Restore index levels
|
||||
if output_levels:
|
||||
expected = expected.set_index(output_levels)
|
||||
|
||||
return expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize('on,how',
|
||||
[(['outer'], 'inner'),
|
||||
(['inner'], 'left'),
|
||||
(['outer', 'inner'], 'right'),
|
||||
(['inner', 'outer'], 'outer')])
|
||||
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
|
||||
|
||||
# Construct expected result
|
||||
expected = compute_expected(left_df, right_df, on=on, how=how)
|
||||
|
||||
# Perform merge
|
||||
result = left_df.merge(right_df, on=on, how=how)
|
||||
assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('left_on,right_on,how',
|
||||
[(['outer'], ['outer'], 'inner'),
|
||||
(['inner'], ['inner'], 'right'),
|
||||
(['outer', 'inner'], ['outer', 'inner'], 'left'),
|
||||
(['inner', 'outer'], ['inner', 'outer'], 'outer')])
|
||||
def test_merge_indexes_and_columns_lefton_righton(
|
||||
left_df, right_df, left_on, right_on, how):
|
||||
|
||||
# Construct expected result
|
||||
expected = compute_expected(left_df, right_df,
|
||||
left_on=left_on,
|
||||
right_on=right_on,
|
||||
how=how)
|
||||
|
||||
# Perform merge
|
||||
result = left_df.merge(right_df,
|
||||
left_on=left_on, right_on=right_on, how=how)
|
||||
assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('left_index',
|
||||
['inner', ['inner', 'outer']])
|
||||
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
|
||||
|
||||
# Construct left_df
|
||||
left_df = df1.set_index(left_index)
|
||||
|
||||
# Construct right_df
|
||||
right_df = df2.set_index(['outer', 'inner'])
|
||||
|
||||
# Result
|
||||
expected = (left_df.reset_index()
|
||||
.join(right_df, on=['outer', 'inner'], how=join_type,
|
||||
lsuffix='_x', rsuffix='_y')
|
||||
.set_index(left_index))
|
||||
|
||||
# Perform join
|
||||
result = left_df.join(right_df, on=['outer', 'inner'], how=join_type,
|
||||
lsuffix='_x', rsuffix='_y')
|
||||
|
||||
assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
def test_merge_index_column_precedence(df1, df2):
|
||||
|
||||
# Construct left_df with both an index and a column named 'outer'.
|
||||
# We make this 'outer' column equal to the 'inner' column so that we
|
||||
# can verify that the correct values are used by the merge operation
|
||||
left_df = df1.set_index('outer')
|
||||
left_df['outer'] = left_df['inner']
|
||||
|
||||
# Construct right_df with an index level named 'outer'
|
||||
right_df = df2.set_index('outer')
|
||||
|
||||
# Construct expected result.
|
||||
# The 'outer' column from left_df is chosen and the resulting
|
||||
# frame has no index levels
|
||||
expected = (left_df.reset_index(level='outer', drop=True)
|
||||
.merge(right_df.reset_index(), on=['outer', 'inner']))
|
||||
|
||||
# Merge left_df and right_df on 'outer' and 'inner'
|
||||
# 'outer' for left_df should refer to the 'outer' column, not the
|
||||
# 'outer' index level and a FutureWarning should be raised
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
result = left_df.merge(right_df, on=['outer', 'inner'])
|
||||
|
||||
# Check results
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Perform the same using the left_on and right_on parameters
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
result = left_df.merge(right_df,
|
||||
left_on=['outer', 'inner'],
|
||||
right_on=['outer', 'inner'])
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
-102
@@ -1,102 +0,0 @@
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, merge_ordered
|
||||
from pandas.util import testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
from numpy import nan
|
||||
|
||||
|
||||
class TestMergeOrdered(object):
|
||||
|
||||
def setup_method(self, method):
|
||||
self.left = DataFrame({'key': ['a', 'c', 'e'],
|
||||
'lvalue': [1, 2., 3]})
|
||||
|
||||
self.right = DataFrame({'key': ['b', 'c', 'd', 'f'],
|
||||
'rvalue': [1, 2, 3., 4]})
|
||||
|
||||
def test_basic(self):
|
||||
result = merge_ordered(self.left, self.right, on='key')
|
||||
expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
|
||||
'lvalue': [1, nan, 2, nan, 3, nan],
|
||||
'rvalue': [nan, 1, 2, 3, nan, 4]})
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_ffill(self):
|
||||
result = merge_ordered(
|
||||
self.left, self.right, on='key', fill_method='ffill')
|
||||
expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
|
||||
'lvalue': [1., 1, 2, 2, 3, 3.],
|
||||
'rvalue': [nan, 1, 2, 3, 3, 4]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_multigroup(self):
|
||||
left = pd.concat([self.left, self.left], ignore_index=True)
|
||||
|
||||
left['group'] = ['a'] * 3 + ['b'] * 3
|
||||
|
||||
result = merge_ordered(left, self.right, on='key', left_by='group',
|
||||
fill_method='ffill')
|
||||
expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2,
|
||||
'lvalue': [1., 1, 2, 2, 3, 3.] * 2,
|
||||
'rvalue': [nan, 1, 2, 3, 3, 4] * 2})
|
||||
expected['group'] = ['a'] * 6 + ['b'] * 6
|
||||
|
||||
assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
result2 = merge_ordered(self.right, left, on='key', right_by='group',
|
||||
fill_method='ffill')
|
||||
assert_frame_equal(result, result2.loc[:, result.columns])
|
||||
|
||||
result = merge_ordered(left, self.right, on='key', left_by='group')
|
||||
assert result['group'].notna().all()
|
||||
|
||||
def test_merge_type(self):
|
||||
class NotADataFrame(DataFrame):
|
||||
|
||||
@property
|
||||
def _constructor(self):
|
||||
return NotADataFrame
|
||||
|
||||
nad = NotADataFrame(self.left)
|
||||
result = nad.merge(self.right, on='key')
|
||||
|
||||
assert isinstance(result, NotADataFrame)
|
||||
|
||||
def test_empty_sequence_concat(self):
|
||||
# GH 9157
|
||||
empty_pat = "[Nn]o objects"
|
||||
none_pat = "objects.*None"
|
||||
test_cases = [
|
||||
((), empty_pat),
|
||||
([], empty_pat),
|
||||
({}, empty_pat),
|
||||
([None], none_pat),
|
||||
([None, None], none_pat)
|
||||
]
|
||||
for df_seq, pattern in test_cases:
|
||||
tm.assert_raises_regex(ValueError, pattern, pd.concat, df_seq)
|
||||
|
||||
pd.concat([pd.DataFrame()])
|
||||
pd.concat([None, pd.DataFrame()])
|
||||
pd.concat([pd.DataFrame(), None])
|
||||
|
||||
def test_doc_example(self):
|
||||
left = DataFrame({'group': list('aaabbb'),
|
||||
'key': ['a', 'c', 'e', 'a', 'c', 'e'],
|
||||
'lvalue': [1, 2, 3] * 2,
|
||||
})
|
||||
|
||||
right = DataFrame({'key': ['b', 'c', 'd'],
|
||||
'rvalue': [1, 2, 3]})
|
||||
|
||||
result = merge_ordered(left, right, fill_method='ffill',
|
||||
left_by='group')
|
||||
|
||||
expected = DataFrame({'group': list('aaaaabbbbb'),
|
||||
'key': ['a', 'b', 'c', 'd', 'e'] * 2,
|
||||
'lvalue': [1, 1, 2, 2, 3] * 2,
|
||||
'rvalue': [nan, 1, 2, 3, 3] * 2})
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,642 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# pylint: disable-msg=W0612,E1101
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas as pd
|
||||
|
||||
from numpy import nan
|
||||
import numpy as np
|
||||
|
||||
from pandas import melt, lreshape, wide_to_long
|
||||
import pandas.util.testing as tm
|
||||
from pandas.compat import range
|
||||
|
||||
|
||||
class TestMelt(object):
|
||||
|
||||
def setup_method(self, method):
|
||||
self.df = tm.makeTimeDataFrame()[:10]
|
||||
self.df['id1'] = (self.df['A'] > 0).astype(np.int64)
|
||||
self.df['id2'] = (self.df['B'] > 0).astype(np.int64)
|
||||
|
||||
self.var_name = 'var'
|
||||
self.value_name = 'val'
|
||||
|
||||
self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867
|
||||
], [-1.321405, 0.368915, -1.055342],
|
||||
[-0.807333, 0.08298, -0.873361]])
|
||||
self.df1.columns = [list('ABC'), list('abc')]
|
||||
self.df1.columns.names = ['CAP', 'low']
|
||||
|
||||
def test_top_level_method(self):
|
||||
result = melt(self.df)
|
||||
assert result.columns.tolist() == ['variable', 'value']
|
||||
|
||||
def test_method_signatures(self):
|
||||
tm.assert_frame_equal(self.df.melt(),
|
||||
melt(self.df))
|
||||
|
||||
tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'],
|
||||
value_vars=['A', 'B']),
|
||||
melt(self.df,
|
||||
id_vars=['id1', 'id2'],
|
||||
value_vars=['A', 'B']))
|
||||
|
||||
tm.assert_frame_equal(self.df.melt(var_name=self.var_name,
|
||||
value_name=self.value_name),
|
||||
melt(self.df,
|
||||
var_name=self.var_name,
|
||||
value_name=self.value_name))
|
||||
|
||||
tm.assert_frame_equal(self.df1.melt(col_level=0),
|
||||
melt(self.df1, col_level=0))
|
||||
|
||||
def test_default_col_names(self):
|
||||
result = self.df.melt()
|
||||
assert result.columns.tolist() == ['variable', 'value']
|
||||
|
||||
result1 = self.df.melt(id_vars=['id1'])
|
||||
assert result1.columns.tolist() == ['id1', 'variable', 'value']
|
||||
|
||||
result2 = self.df.melt(id_vars=['id1', 'id2'])
|
||||
assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value']
|
||||
|
||||
def test_value_vars(self):
|
||||
result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A')
|
||||
assert len(result3) == 10
|
||||
|
||||
result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'])
|
||||
expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2,
|
||||
'id2': self.df['id2'].tolist() * 2,
|
||||
'variable': ['A'] * 10 + ['B'] * 10,
|
||||
'value': (self.df['A'].tolist() +
|
||||
self.df['B'].tolist())},
|
||||
columns=['id1', 'id2', 'variable', 'value'])
|
||||
tm.assert_frame_equal(result4, expected4)
|
||||
|
||||
def test_value_vars_types(self):
|
||||
# GH 15348
|
||||
expected = DataFrame({'id1': self.df['id1'].tolist() * 2,
|
||||
'id2': self.df['id2'].tolist() * 2,
|
||||
'variable': ['A'] * 10 + ['B'] * 10,
|
||||
'value': (self.df['A'].tolist() +
|
||||
self.df['B'].tolist())},
|
||||
columns=['id1', 'id2', 'variable', 'value'])
|
||||
|
||||
for type_ in (tuple, list, np.array):
|
||||
result = self.df.melt(id_vars=['id1', 'id2'],
|
||||
value_vars=type_(('A', 'B')))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_vars_work_with_multiindex(self):
|
||||
expected = DataFrame({
|
||||
('A', 'a'): self.df1[('A', 'a')],
|
||||
'CAP': ['B'] * len(self.df1),
|
||||
'low': ['b'] * len(self.df1),
|
||||
'value': self.df1[('B', 'b')],
|
||||
}, columns=[('A', 'a'), 'CAP', 'low', 'value'])
|
||||
|
||||
result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_tuple_vars_fail_with_multiindex(self):
|
||||
# melt should fail with an informative error message if
|
||||
# the columns have a MultiIndex and a tuple is passed
|
||||
# for id_vars or value_vars.
|
||||
tuple_a = ('A', 'a')
|
||||
list_a = [tuple_a]
|
||||
tuple_b = ('B', 'b')
|
||||
list_b = [tuple_b]
|
||||
|
||||
for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b),
|
||||
(tuple_a, tuple_b)):
|
||||
with tm.assert_raises_regex(ValueError, r'MultiIndex'):
|
||||
self.df1.melt(id_vars=id_vars, value_vars=value_vars)
|
||||
|
||||
def test_custom_var_name(self):
|
||||
result5 = self.df.melt(var_name=self.var_name)
|
||||
assert result5.columns.tolist() == ['var', 'value']
|
||||
|
||||
result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name)
|
||||
assert result6.columns.tolist() == ['id1', 'var', 'value']
|
||||
|
||||
result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name)
|
||||
assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value']
|
||||
|
||||
result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
|
||||
var_name=self.var_name)
|
||||
assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value']
|
||||
|
||||
result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
|
||||
var_name=self.var_name)
|
||||
expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2,
|
||||
'id2': self.df['id2'].tolist() * 2,
|
||||
self.var_name: ['A'] * 10 + ['B'] * 10,
|
||||
'value': (self.df['A'].tolist() +
|
||||
self.df['B'].tolist())},
|
||||
columns=['id1', 'id2', self.var_name, 'value'])
|
||||
tm.assert_frame_equal(result9, expected9)
|
||||
|
||||
def test_custom_value_name(self):
|
||||
result10 = self.df.melt(value_name=self.value_name)
|
||||
assert result10.columns.tolist() == ['variable', 'val']
|
||||
|
||||
result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name)
|
||||
assert result11.columns.tolist() == ['id1', 'variable', 'val']
|
||||
|
||||
result12 = self.df.melt(id_vars=['id1', 'id2'],
|
||||
value_name=self.value_name)
|
||||
assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val']
|
||||
|
||||
result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
|
||||
value_name=self.value_name)
|
||||
assert result13.columns.tolist() == ['id1', 'id2', 'variable', 'val']
|
||||
|
||||
result14 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
|
||||
value_name=self.value_name)
|
||||
expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2,
|
||||
'id2': self.df['id2'].tolist() * 2,
|
||||
'variable': ['A'] * 10 + ['B'] * 10,
|
||||
self.value_name: (self.df['A'].tolist() +
|
||||
self.df['B'].tolist())},
|
||||
columns=['id1', 'id2', 'variable',
|
||||
self.value_name])
|
||||
tm.assert_frame_equal(result14, expected14)
|
||||
|
||||
def test_custom_var_and_value_name(self):
|
||||
|
||||
result15 = self.df.melt(var_name=self.var_name,
|
||||
value_name=self.value_name)
|
||||
assert result15.columns.tolist() == ['var', 'val']
|
||||
|
||||
result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name,
|
||||
value_name=self.value_name)
|
||||
assert result16.columns.tolist() == ['id1', 'var', 'val']
|
||||
|
||||
result17 = self.df.melt(id_vars=['id1', 'id2'],
|
||||
var_name=self.var_name,
|
||||
value_name=self.value_name)
|
||||
assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val']
|
||||
|
||||
result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
|
||||
var_name=self.var_name,
|
||||
value_name=self.value_name)
|
||||
assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val']
|
||||
|
||||
result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
|
||||
var_name=self.var_name,
|
||||
value_name=self.value_name)
|
||||
expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2,
|
||||
'id2': self.df['id2'].tolist() * 2,
|
||||
self.var_name: ['A'] * 10 + ['B'] * 10,
|
||||
self.value_name: (self.df['A'].tolist() +
|
||||
self.df['B'].tolist())},
|
||||
columns=['id1', 'id2', self.var_name,
|
||||
self.value_name])
|
||||
tm.assert_frame_equal(result19, expected19)
|
||||
|
||||
df20 = self.df.copy()
|
||||
df20.columns.name = 'foo'
|
||||
result20 = df20.melt()
|
||||
assert result20.columns.tolist() == ['foo', 'value']
|
||||
|
||||
def test_col_level(self):
|
||||
res1 = self.df1.melt(col_level=0)
|
||||
res2 = self.df1.melt(col_level='CAP')
|
||||
assert res1.columns.tolist() == ['CAP', 'value']
|
||||
assert res2.columns.tolist() == ['CAP', 'value']
|
||||
|
||||
def test_multiindex(self):
|
||||
res = self.df1.melt()
|
||||
assert res.columns.tolist() == ['CAP', 'low', 'value']
|
||||
|
||||
@pytest.mark.parametrize("col", [
|
||||
pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')),
|
||||
pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
|
||||
pd.Series([0, 1, 0, 0, 0])])
|
||||
def test_pandas_dtypes(self, col):
|
||||
# GH 15785
|
||||
df = DataFrame({'klass': range(5),
|
||||
'col': col,
|
||||
'attr1': [1, 0, 0, 0, 0],
|
||||
'attr2': col})
|
||||
expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col],
|
||||
ignore_index=True)
|
||||
result = melt(df, id_vars=['klass', 'col'], var_name='attribute',
|
||||
value_name='value')
|
||||
expected = DataFrame({0: list(range(5)) * 2,
|
||||
1: pd.concat([col] * 2, ignore_index=True),
|
||||
2: ['attr1'] * 5 + ['attr2'] * 5,
|
||||
3: expected_value})
|
||||
expected.columns = ['klass', 'col', 'attribute', 'value']
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestLreshape(object):
|
||||
|
||||
def test_pairs(self):
|
||||
data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
|
||||
'11jan2009'],
|
||||
'birthwt': [1766, 3301, 1454, 3139, 4133],
|
||||
'id': [101, 102, 103, 104, 105],
|
||||
'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
|
||||
'visitdt1': ['11jan2009', '22dec2008', '04jan2009',
|
||||
'29dec2008', '20jan2009'],
|
||||
'visitdt2':
|
||||
['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'],
|
||||
'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
|
||||
'wt1': [1823, 3338, 1549, 3298, 4306],
|
||||
'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
|
||||
'wt3': [2293.0, nan, nan, 3377.0, 4805.0]}
|
||||
|
||||
df = DataFrame(data)
|
||||
|
||||
spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
|
||||
'wt': ['wt%d' % i for i in range(1, 4)]}
|
||||
result = lreshape(df, spec)
|
||||
|
||||
exp_data = {'birthdt':
|
||||
['08jan2009', '20dec2008', '30dec2008', '21dec2008',
|
||||
'11jan2009', '08jan2009', '30dec2008', '21dec2008',
|
||||
'11jan2009', '08jan2009', '21dec2008', '11jan2009'],
|
||||
'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139,
|
||||
4133, 1766, 3139, 4133],
|
||||
'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101,
|
||||
104, 105],
|
||||
'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
|
||||
'Male', 'Female', 'Female', 'Female', 'Male',
|
||||
'Female', 'Female'],
|
||||
'visitdt': ['11jan2009', '22dec2008', '04jan2009',
|
||||
'29dec2008', '20jan2009', '21jan2009',
|
||||
'22jan2009', '31dec2008', '03feb2009',
|
||||
'05feb2009', '02jan2009', '15feb2009'],
|
||||
'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
|
||||
1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]}
|
||||
exp = DataFrame(exp_data, columns=result.columns)
|
||||
tm.assert_frame_equal(result, exp)
|
||||
|
||||
result = lreshape(df, spec, dropna=False)
|
||||
exp_data = {'birthdt':
|
||||
['08jan2009', '20dec2008', '30dec2008', '21dec2008',
|
||||
'11jan2009', '08jan2009', '20dec2008', '30dec2008',
|
||||
'21dec2008', '11jan2009', '08jan2009', '20dec2008',
|
||||
'30dec2008', '21dec2008', '11jan2009'],
|
||||
'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454,
|
||||
3139, 4133, 1766, 3301, 1454, 3139, 4133],
|
||||
'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105,
|
||||
101, 102, 103, 104, 105],
|
||||
'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
|
||||
'Male', 'Female', 'Female', 'Female', 'Female',
|
||||
'Male', 'Female', 'Female', 'Female', 'Female'],
|
||||
'visitdt': ['11jan2009', '22dec2008', '04jan2009',
|
||||
'29dec2008', '20jan2009', '21jan2009', nan,
|
||||
'22jan2009', '31dec2008', '03feb2009',
|
||||
'05feb2009', nan, nan, '02jan2009',
|
||||
'15feb2009'],
|
||||
'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan,
|
||||
1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0,
|
||||
4805.0]}
|
||||
exp = DataFrame(exp_data, columns=result.columns)
|
||||
tm.assert_frame_equal(result, exp)
|
||||
|
||||
spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)],
|
||||
'wt': ['wt%d' % i for i in range(1, 4)]}
|
||||
pytest.raises(ValueError, lreshape, df, spec)
|
||||
|
||||
|
||||
class TestWideToLong(object):
|
||||
|
||||
def test_simple(self):
|
||||
np.random.seed(123)
|
||||
x = np.random.randn(3)
|
||||
df = pd.DataFrame({"A1970": {0: "a",
|
||||
1: "b",
|
||||
2: "c"},
|
||||
"A1980": {0: "d",
|
||||
1: "e",
|
||||
2: "f"},
|
||||
"B1970": {0: 2.5,
|
||||
1: 1.2,
|
||||
2: .7},
|
||||
"B1980": {0: 3.2,
|
||||
1: 1.3,
|
||||
2: .1},
|
||||
"X": dict(zip(
|
||||
range(3), x))})
|
||||
df["id"] = df.index
|
||||
exp_data = {"X": x.tolist() + x.tolist(),
|
||||
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
|
||||
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
|
||||
"year": [1970, 1970, 1970, 1980, 1980, 1980],
|
||||
"id": [0, 1, 2, 0, 1, 2]}
|
||||
expected = DataFrame(exp_data)
|
||||
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
|
||||
result = wide_to_long(df, ["A", "B"], i="id", j="year")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_stubs(self):
|
||||
# GH9204
|
||||
df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
|
||||
df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2']
|
||||
stubs = ['inc', 'edu']
|
||||
|
||||
# TODO: unused?
|
||||
df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa
|
||||
|
||||
assert stubs == ['inc', 'edu']
|
||||
|
||||
def test_separating_character(self):
|
||||
# GH14779
|
||||
np.random.seed(123)
|
||||
x = np.random.randn(3)
|
||||
df = pd.DataFrame({"A.1970": {0: "a",
|
||||
1: "b",
|
||||
2: "c"},
|
||||
"A.1980": {0: "d",
|
||||
1: "e",
|
||||
2: "f"},
|
||||
"B.1970": {0: 2.5,
|
||||
1: 1.2,
|
||||
2: .7},
|
||||
"B.1980": {0: 3.2,
|
||||
1: 1.3,
|
||||
2: .1},
|
||||
"X": dict(zip(
|
||||
range(3), x))})
|
||||
df["id"] = df.index
|
||||
exp_data = {"X": x.tolist() + x.tolist(),
|
||||
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
|
||||
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
|
||||
"year": [1970, 1970, 1970, 1980, 1980, 1980],
|
||||
"id": [0, 1, 2, 0, 1, 2]}
|
||||
expected = DataFrame(exp_data)
|
||||
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
|
||||
result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_escapable_characters(self):
|
||||
np.random.seed(123)
|
||||
x = np.random.randn(3)
|
||||
df = pd.DataFrame({"A(quarterly)1970": {0: "a",
|
||||
1: "b",
|
||||
2: "c"},
|
||||
"A(quarterly)1980": {0: "d",
|
||||
1: "e",
|
||||
2: "f"},
|
||||
"B(quarterly)1970": {0: 2.5,
|
||||
1: 1.2,
|
||||
2: .7},
|
||||
"B(quarterly)1980": {0: 3.2,
|
||||
1: 1.3,
|
||||
2: .1},
|
||||
"X": dict(zip(
|
||||
range(3), x))})
|
||||
df["id"] = df.index
|
||||
exp_data = {"X": x.tolist() + x.tolist(),
|
||||
"A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
|
||||
"B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
|
||||
"year": [1970, 1970, 1970, 1980, 1980, 1980],
|
||||
"id": [0, 1, 2, 0, 1, 2]}
|
||||
expected = DataFrame(exp_data)
|
||||
expected = expected.set_index(
|
||||
['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
|
||||
result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
|
||||
i="id", j="year")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_unbalanced(self):
|
||||
# test that we can have a varying amount of time variables
|
||||
df = pd.DataFrame({'A2010': [1.0, 2.0],
|
||||
'A2011': [3.0, 4.0],
|
||||
'B2010': [5.0, 6.0],
|
||||
'X': ['X1', 'X2']})
|
||||
df['id'] = df.index
|
||||
exp_data = {'X': ['X1', 'X1', 'X2', 'X2'],
|
||||
'A': [1.0, 3.0, 2.0, 4.0],
|
||||
'B': [5.0, np.nan, 6.0, np.nan],
|
||||
'id': [0, 0, 1, 1],
|
||||
'year': [2010, 2011, 2010, 2011]}
|
||||
expected = pd.DataFrame(exp_data)
|
||||
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
|
||||
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_character_overlap(self):
|
||||
# Test we handle overlapping characters in both id_vars and value_vars
|
||||
df = pd.DataFrame({
|
||||
'A11': ['a11', 'a22', 'a33'],
|
||||
'A12': ['a21', 'a22', 'a23'],
|
||||
'B11': ['b11', 'b12', 'b13'],
|
||||
'B12': ['b21', 'b22', 'b23'],
|
||||
'BB11': [1, 2, 3],
|
||||
'BB12': [4, 5, 6],
|
||||
'BBBX': [91, 92, 93],
|
||||
'BBBZ': [91, 92, 93]
|
||||
})
|
||||
df['id'] = df.index
|
||||
expected = pd.DataFrame({
|
||||
'BBBX': [91, 92, 93, 91, 92, 93],
|
||||
'BBBZ': [91, 92, 93, 91, 92, 93],
|
||||
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
|
||||
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
|
||||
'BB': [1, 2, 3, 4, 5, 6],
|
||||
'id': [0, 1, 2, 0, 1, 2],
|
||||
'year': [11, 11, 11, 12, 12, 12]})
|
||||
expected = expected.set_index(['id', 'year'])[
|
||||
['BBBX', 'BBBZ', 'A', 'B', 'BB']]
|
||||
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
|
||||
tm.assert_frame_equal(result.sort_index(axis=1),
|
||||
expected.sort_index(axis=1))
|
||||
|
||||
def test_invalid_separator(self):
|
||||
# if an invalid separator is supplied a empty data frame is returned
|
||||
sep = 'nope!'
|
||||
df = pd.DataFrame({'A2010': [1.0, 2.0],
|
||||
'A2011': [3.0, 4.0],
|
||||
'B2010': [5.0, 6.0],
|
||||
'X': ['X1', 'X2']})
|
||||
df['id'] = df.index
|
||||
exp_data = {'X': '',
|
||||
'A2010': [],
|
||||
'A2011': [],
|
||||
'B2010': [],
|
||||
'id': [],
|
||||
'year': [],
|
||||
'A': [],
|
||||
'B': []}
|
||||
expected = pd.DataFrame(exp_data).astype({'year': 'int'})
|
||||
expected = expected.set_index(['id', 'year'])[[
|
||||
'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
|
||||
expected.index.set_levels([0, 1], level=0, inplace=True)
|
||||
result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
|
||||
tm.assert_frame_equal(result.sort_index(axis=1),
|
||||
expected.sort_index(axis=1))
|
||||
|
||||
def test_num_string_disambiguation(self):
|
||||
# Test that we can disambiguate number value_vars from
|
||||
# string value_vars
|
||||
df = pd.DataFrame({
|
||||
'A11': ['a11', 'a22', 'a33'],
|
||||
'A12': ['a21', 'a22', 'a23'],
|
||||
'B11': ['b11', 'b12', 'b13'],
|
||||
'B12': ['b21', 'b22', 'b23'],
|
||||
'BB11': [1, 2, 3],
|
||||
'BB12': [4, 5, 6],
|
||||
'Arating': [91, 92, 93],
|
||||
'Arating_old': [91, 92, 93]
|
||||
})
|
||||
df['id'] = df.index
|
||||
expected = pd.DataFrame({
|
||||
'Arating': [91, 92, 93, 91, 92, 93],
|
||||
'Arating_old': [91, 92, 93, 91, 92, 93],
|
||||
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
|
||||
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
|
||||
'BB': [1, 2, 3, 4, 5, 6],
|
||||
'id': [0, 1, 2, 0, 1, 2],
|
||||
'year': [11, 11, 11, 12, 12, 12]})
|
||||
expected = expected.set_index(['id', 'year'])[
|
||||
['Arating', 'Arating_old', 'A', 'B', 'BB']]
|
||||
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
|
||||
tm.assert_frame_equal(result.sort_index(axis=1),
|
||||
expected.sort_index(axis=1))
|
||||
|
||||
def test_invalid_suffixtype(self):
|
||||
# If all stubs names end with a string, but a numeric suffix is
|
||||
# assumed, an empty data frame is returned
|
||||
df = pd.DataFrame({'Aone': [1.0, 2.0],
|
||||
'Atwo': [3.0, 4.0],
|
||||
'Bone': [5.0, 6.0],
|
||||
'X': ['X1', 'X2']})
|
||||
df['id'] = df.index
|
||||
exp_data = {'X': '',
|
||||
'Aone': [],
|
||||
'Atwo': [],
|
||||
'Bone': [],
|
||||
'id': [],
|
||||
'year': [],
|
||||
'A': [],
|
||||
'B': []}
|
||||
expected = pd.DataFrame(exp_data).astype({'year': 'int'})
|
||||
|
||||
expected = expected.set_index(['id', 'year'])
|
||||
expected.index.set_levels([0, 1], level=0, inplace=True)
|
||||
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
|
||||
tm.assert_frame_equal(result.sort_index(axis=1),
|
||||
expected.sort_index(axis=1))
|
||||
|
||||
def test_multiple_id_columns(self):
|
||||
# Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
|
||||
df = pd.DataFrame({
|
||||
'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
||||
'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
|
||||
'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
|
||||
'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
|
||||
})
|
||||
expected = pd.DataFrame({
|
||||
'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8,
|
||||
2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
|
||||
'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
|
||||
'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
|
||||
'age': [1, 2, 1, 2, 1, 2, 1, 2, 1,
|
||||
2, 1, 2, 1, 2, 1, 2, 1, 2]
|
||||
})
|
||||
expected = expected.set_index(['famid', 'birth', 'age'])[['ht']]
|
||||
result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_non_unique_idvars(self):
|
||||
# GH16382
|
||||
# Raise an error message if non unique id vars (i) are passed
|
||||
df = pd.DataFrame({
|
||||
'A_A1': [1, 2, 3, 4, 5],
|
||||
'B_B1': [1, 2, 3, 4, 5],
|
||||
'x': [1, 1, 1, 1, 1]
|
||||
})
|
||||
with pytest.raises(ValueError):
|
||||
wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')
|
||||
|
||||
def test_cast_j_int(self):
|
||||
df = pd.DataFrame({
|
||||
'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'],
|
||||
'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
|
||||
'actor_fb_likes_1': [1000.0, 40000.0, 11000.0],
|
||||
'actor_fb_likes_2': [936.0, 5000.0, 393.0],
|
||||
'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']})
|
||||
|
||||
expected = pd.DataFrame({
|
||||
'actor': ['CCH Pounder',
|
||||
'Johnny Depp',
|
||||
'Christoph Waltz',
|
||||
'Joel David Moore',
|
||||
'Orlando Bloom',
|
||||
'Rory Kinnear'],
|
||||
'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
|
||||
'num': [1, 1, 1, 2, 2, 2],
|
||||
'title': ['Avatar',
|
||||
'Pirates of the Caribbean',
|
||||
'Spectre',
|
||||
'Avatar',
|
||||
'Pirates of the Caribbean',
|
||||
'Spectre']}).set_index(['title', 'num'])
|
||||
result = wide_to_long(df, ['actor', 'actor_fb_likes'],
|
||||
i='title', j='num', sep='_')
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_identical_stubnames(self):
|
||||
df = pd.DataFrame({'A2010': [1.0, 2.0],
|
||||
'A2011': [3.0, 4.0],
|
||||
'B2010': [5.0, 6.0],
|
||||
'A': ['X1', 'X2']})
|
||||
with pytest.raises(ValueError):
|
||||
wide_to_long(df, ['A', 'B'], i='A', j='colname')
|
||||
|
||||
def test_nonnumeric_suffix(self):
|
||||
df = pd.DataFrame({'treatment_placebo': [1.0, 2.0],
|
||||
'treatment_test': [3.0, 4.0],
|
||||
'result_placebo': [5.0, 6.0],
|
||||
'A': ['X1', 'X2']})
|
||||
expected = pd.DataFrame({
|
||||
'A': ['X1', 'X1', 'X2', 'X2'],
|
||||
'colname': ['placebo', 'test', 'placebo', 'test'],
|
||||
'result': [5.0, np.nan, 6.0, np.nan],
|
||||
'treatment': [1.0, 3.0, 2.0, 4.0]})
|
||||
expected = expected.set_index(['A', 'colname'])
|
||||
result = wide_to_long(df, ['result', 'treatment'],
|
||||
i='A', j='colname', suffix='[a-z]+', sep='_')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_mixed_type_suffix(self):
|
||||
df = pd.DataFrame({
|
||||
'A': ['X1', 'X2'],
|
||||
'result_1': [0, 9],
|
||||
'result_foo': [5.0, 6.0],
|
||||
'treatment_1': [1.0, 2.0],
|
||||
'treatment_foo': [3.0, 4.0]})
|
||||
expected = pd.DataFrame({
|
||||
'A': ['X1', 'X2', 'X1', 'X2'],
|
||||
'colname': ['1', '1', 'foo', 'foo'],
|
||||
'result': [0.0, 9.0, 5.0, 6.0],
|
||||
'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname'])
|
||||
result = wide_to_long(df, ['result', 'treatment'],
|
||||
i='A', j='colname', suffix='.+', sep='_')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_float_suffix(self):
|
||||
df = pd.DataFrame({
|
||||
'treatment_1.1': [1.0, 2.0],
|
||||
'treatment_2.1': [3.0, 4.0],
|
||||
'result_1.2': [5.0, 6.0],
|
||||
'result_1': [0, 9],
|
||||
'A': ['X1', 'X2']})
|
||||
expected = pd.DataFrame({
|
||||
'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'],
|
||||
'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
|
||||
'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
|
||||
'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]})
|
||||
expected = expected.set_index(['A', 'colname'])
|
||||
result = wide_to_long(df, ['result', 'treatment'],
|
||||
i='A', j='colname', suffix='[0-9.]+', sep='_')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,524 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# pylint: disable-msg=W0612,E1101
|
||||
|
||||
from warnings import catch_warnings
|
||||
import pytest
|
||||
from collections import OrderedDict
|
||||
|
||||
from pandas import DataFrame, Series
|
||||
import pandas as pd
|
||||
|
||||
from numpy import nan
|
||||
import numpy as np
|
||||
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
from pandas import get_dummies, Categorical, Index
|
||||
import pandas.util.testing as tm
|
||||
from pandas.compat import u
|
||||
|
||||
|
||||
class TestGetDummies(object):
|
||||
|
||||
@pytest.fixture
|
||||
def df(self):
|
||||
return DataFrame({'A': ['a', 'b', 'a'],
|
||||
'B': ['b', 'b', 'c'],
|
||||
'C': [1, 2, 3]})
|
||||
|
||||
@pytest.fixture(params=['uint8', 'i8', np.float64, bool, None])
|
||||
def dtype(self, request):
|
||||
return np.dtype(request.param)
|
||||
|
||||
@pytest.fixture(params=['dense', 'sparse'])
|
||||
def sparse(self, request):
|
||||
# params are strings to simplify reading test results,
|
||||
# e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
|
||||
return request.param == 'sparse'
|
||||
|
||||
def effective_dtype(self, dtype):
|
||||
if dtype is None:
|
||||
return np.uint8
|
||||
return dtype
|
||||
|
||||
def test_raises_on_dtype_object(self, df):
|
||||
with pytest.raises(ValueError):
|
||||
get_dummies(df, dtype='object')
|
||||
|
||||
def test_basic(self, sparse, dtype):
|
||||
s_list = list('abc')
|
||||
s_series = Series(s_list)
|
||||
s_series_index = Series(s_list, list('ABC'))
|
||||
|
||||
expected = DataFrame({'a': [1, 0, 0],
|
||||
'b': [0, 1, 0],
|
||||
'c': [0, 0, 1]},
|
||||
dtype=self.effective_dtype(dtype))
|
||||
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
expected.index = list('ABC')
|
||||
result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_basic_types(self, sparse, dtype):
|
||||
# GH 10531
|
||||
s_list = list('abc')
|
||||
s_series = Series(s_list)
|
||||
s_df = DataFrame({'a': [0, 1, 0, 1, 2],
|
||||
'b': ['A', 'A', 'B', 'C', 'C'],
|
||||
'c': [2, 3, 3, 3, 2]})
|
||||
|
||||
expected = DataFrame({'a': [1, 0, 0],
|
||||
'b': [0, 1, 0],
|
||||
'c': [0, 0, 1]},
|
||||
dtype=self.effective_dtype(dtype),
|
||||
columns=list('abc'))
|
||||
if not sparse:
|
||||
compare = tm.assert_frame_equal
|
||||
else:
|
||||
expected = expected.to_sparse(fill_value=0, kind='integer')
|
||||
compare = tm.assert_sp_frame_equal
|
||||
|
||||
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
|
||||
compare(result, expected)
|
||||
|
||||
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
|
||||
compare(result, expected)
|
||||
|
||||
result = get_dummies(s_df, columns=s_df.columns,
|
||||
sparse=sparse, dtype=dtype)
|
||||
tm.assert_series_equal(result.get_dtype_counts(),
|
||||
Series({dtype.name: 8}))
|
||||
|
||||
result = get_dummies(s_df, columns=['a'], sparse=sparse, dtype=dtype)
|
||||
dtype_name = self.effective_dtype(dtype).name
|
||||
|
||||
expected_counts = {'int64': 1, 'object': 1}
|
||||
expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)
|
||||
|
||||
expected = Series(expected_counts).sort_index()
|
||||
tm.assert_series_equal(result.get_dtype_counts().sort_index(),
|
||||
expected)
|
||||
|
||||
def test_just_na(self, sparse):
|
||||
just_na_list = [np.nan]
|
||||
just_na_series = Series(just_na_list)
|
||||
just_na_series_index = Series(just_na_list, index=['A'])
|
||||
|
||||
res_list = get_dummies(just_na_list, sparse=sparse)
|
||||
res_series = get_dummies(just_na_series, sparse=sparse)
|
||||
res_series_index = get_dummies(just_na_series_index, sparse=sparse)
|
||||
|
||||
assert res_list.empty
|
||||
assert res_series.empty
|
||||
assert res_series_index.empty
|
||||
|
||||
assert res_list.index.tolist() == [0]
|
||||
assert res_series.index.tolist() == [0]
|
||||
assert res_series_index.index.tolist() == ['A']
|
||||
|
||||
def test_include_na(self, sparse, dtype):
|
||||
if sparse:
|
||||
pytest.xfail(reason='nan in index is problematic (GH 16894)')
|
||||
|
||||
s = ['a', 'b', np.nan]
|
||||
res = get_dummies(s, sparse=sparse, dtype=dtype)
|
||||
exp = DataFrame({'a': [1, 0, 0],
|
||||
'b': [0, 1, 0]},
|
||||
dtype=self.effective_dtype(dtype))
|
||||
assert_frame_equal(res, exp)
|
||||
|
||||
# Sparse dataframes do not allow nan labelled columns, see #GH8822
|
||||
res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
|
||||
exp_na = DataFrame({nan: [0, 0, 1],
|
||||
'a': [1, 0, 0],
|
||||
'b': [0, 1, 0]},
|
||||
dtype=self.effective_dtype(dtype))
|
||||
exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
|
||||
# hack (NaN handling in assert_index_equal)
|
||||
exp_na.columns = res_na.columns
|
||||
assert_frame_equal(res_na, exp_na)
|
||||
|
||||
res_just_na = get_dummies([nan], dummy_na=True,
|
||||
sparse=sparse, dtype=dtype)
|
||||
exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
|
||||
dtype=self.effective_dtype(dtype))
|
||||
tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
|
||||
|
||||
def test_unicode(self, sparse):
|
||||
# See GH 6885 - get_dummies chokes on unicode values
|
||||
import unicodedata
|
||||
e = 'e'
|
||||
eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
|
||||
s = [e, eacute, eacute]
|
||||
res = get_dummies(s, prefix='letter', sparse=sparse)
|
||||
exp = DataFrame({'letter_e': [1, 0, 0],
|
||||
u('letter_%s') % eacute: [0, 1, 1]},
|
||||
dtype=np.uint8)
|
||||
assert_frame_equal(res, exp)
|
||||
|
||||
def test_dataframe_dummies_all_obj(self, df, sparse):
|
||||
df = df[['A', 'B']]
|
||||
result = get_dummies(df, sparse=sparse)
|
||||
expected = DataFrame({'A_a': [1, 0, 1],
|
||||
'A_b': [0, 1, 0],
|
||||
'B_b': [1, 1, 0],
|
||||
'B_c': [0, 0, 1]},
|
||||
dtype=np.uint8)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
|
||||
result = get_dummies(df, sparse=sparse, dtype=dtype)
|
||||
expected = DataFrame({'C': [1, 2, 3],
|
||||
'A_a': [1, 0, 1],
|
||||
'A_b': [0, 1, 0],
|
||||
'B_b': [1, 1, 0],
|
||||
'B_c': [0, 0, 1]})
|
||||
cols = ['A_a', 'A_b', 'B_b', 'B_c']
|
||||
expected[cols] = expected[cols].astype(dtype)
|
||||
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_prefix_list(self, df, sparse):
|
||||
prefixes = ['from_A', 'from_B']
|
||||
result = get_dummies(df, prefix=prefixes, sparse=sparse)
|
||||
expected = DataFrame({'C': [1, 2, 3],
|
||||
'from_A_a': [1, 0, 1],
|
||||
'from_A_b': [0, 1, 0],
|
||||
'from_B_b': [1, 1, 0],
|
||||
'from_B_c': [0, 0, 1]},
|
||||
dtype=np.uint8)
|
||||
expected[['C']] = df[['C']]
|
||||
expected = expected[['C', 'from_A_a', 'from_A_b',
|
||||
'from_B_b', 'from_B_c']]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_prefix_str(self, df, sparse):
|
||||
# not that you should do this...
|
||||
result = get_dummies(df, prefix='bad', sparse=sparse)
|
||||
bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c']
|
||||
expected = DataFrame([[1, 1, 0, 1, 0],
|
||||
[2, 0, 1, 1, 0],
|
||||
[3, 1, 0, 0, 1]],
|
||||
columns=['C'] + bad_columns,
|
||||
dtype=np.uint8)
|
||||
expected = expected.astype({"C": np.int64})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_subset(self, df, sparse):
|
||||
result = get_dummies(df, prefix=['from_A'], columns=['A'],
|
||||
sparse=sparse)
|
||||
expected = DataFrame({'B': ['b', 'b', 'c'],
|
||||
'C': [1, 2, 3],
|
||||
'from_A_a': [1, 0, 1],
|
||||
'from_A_b': [0, 1, 0]}, dtype=np.uint8)
|
||||
expected[['C']] = df[['C']]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_prefix_sep(self, df, sparse):
|
||||
result = get_dummies(df, prefix_sep='..', sparse=sparse)
|
||||
expected = DataFrame({'C': [1, 2, 3],
|
||||
'A..a': [1, 0, 1],
|
||||
'A..b': [0, 1, 0],
|
||||
'B..b': [1, 1, 0],
|
||||
'B..c': [0, 0, 1]},
|
||||
dtype=np.uint8)
|
||||
expected[['C']] = df[['C']]
|
||||
expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse)
|
||||
expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'},
|
||||
sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
|
||||
with pytest.raises(ValueError):
|
||||
get_dummies(df, prefix=['too few'], sparse=sparse)
|
||||
|
||||
def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
|
||||
with pytest.raises(ValueError):
|
||||
get_dummies(df, prefix_sep=['bad'], sparse=sparse)
|
||||
|
||||
def test_dataframe_dummies_prefix_dict(self, sparse):
|
||||
prefixes = {'A': 'from_A', 'B': 'from_B'}
|
||||
df = DataFrame({'C': [1, 2, 3],
|
||||
'A': ['a', 'b', 'a'],
|
||||
'B': ['b', 'b', 'c']})
|
||||
result = get_dummies(df, prefix=prefixes, sparse=sparse)
|
||||
|
||||
expected = DataFrame({'C': [1, 2, 3],
|
||||
'from_A_a': [1, 0, 1],
|
||||
'from_A_b': [0, 1, 0],
|
||||
'from_B_b': [1, 1, 0],
|
||||
'from_B_c': [0, 0, 1]})
|
||||
|
||||
columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
|
||||
expected[columns] = expected[columns].astype(np.uint8)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_with_na(self, df, sparse, dtype):
|
||||
df.loc[3, :] = [np.nan, np.nan, np.nan]
|
||||
result = get_dummies(df, dummy_na=True,
|
||||
sparse=sparse, dtype=dtype).sort_index(axis=1)
|
||||
expected = DataFrame({'C': [1, 2, 3, np.nan],
|
||||
'A_a': [1, 0, 1, 0],
|
||||
'A_b': [0, 1, 0, 0],
|
||||
'A_nan': [0, 0, 0, 1],
|
||||
'B_b': [1, 1, 0, 0],
|
||||
'B_c': [0, 0, 1, 0],
|
||||
'B_nan': [0, 0, 0, 1]}).sort_index(axis=1)
|
||||
|
||||
e_dtype = self.effective_dtype(dtype)
|
||||
columns = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']
|
||||
expected[columns] = expected[columns].astype(e_dtype)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
|
||||
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
|
||||
df['cat'] = pd.Categorical(['x', 'y', 'y'])
|
||||
result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
|
||||
expected = DataFrame({'C': [1, 2, 3],
|
||||
'A_a': [1, 0, 1],
|
||||
'A_b': [0, 1, 0],
|
||||
'B_b': [1, 1, 0],
|
||||
'B_c': [0, 0, 1],
|
||||
'cat_x': [1, 0, 0],
|
||||
'cat_y': [0, 1, 1]}).sort_index(axis=1)
|
||||
|
||||
columns = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']
|
||||
effective_dtype = self.effective_dtype(dtype)
|
||||
expected[columns] = expected[columns].astype(effective_dtype)
|
||||
expected.sort_index(axis=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_basic_drop_first(self, sparse):
|
||||
# GH12402 Add a new parameter `drop_first` to avoid collinearity
|
||||
# Basic case
|
||||
s_list = list('abc')
|
||||
s_series = Series(s_list)
|
||||
s_series_index = Series(s_list, list('ABC'))
|
||||
|
||||
expected = DataFrame({'b': [0, 1, 0],
|
||||
'c': [0, 0, 1]},
|
||||
dtype=np.uint8)
|
||||
|
||||
result = get_dummies(s_list, drop_first=True, sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(s_series, drop_first=True, sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
expected.index = list('ABC')
|
||||
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_basic_drop_first_one_level(self, sparse):
|
||||
# Test the case that categorical variable only has one level.
|
||||
s_list = list('aaa')
|
||||
s_series = Series(s_list)
|
||||
s_series_index = Series(s_list, list('ABC'))
|
||||
|
||||
expected = DataFrame(index=np.arange(3))
|
||||
|
||||
result = get_dummies(s_list, drop_first=True, sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(s_series, drop_first=True, sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame(index=list('ABC'))
|
||||
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_basic_drop_first_NA(self, sparse):
|
||||
# Test NA handling together with drop_first
|
||||
s_NA = ['a', 'b', np.nan]
|
||||
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
|
||||
exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
|
||||
assert_frame_equal(res, exp)
|
||||
|
||||
res_na = get_dummies(s_NA, dummy_na=True, drop_first=True,
|
||||
sparse=sparse)
|
||||
exp_na = DataFrame(
|
||||
{'b': [0, 1, 0],
|
||||
nan: [0, 0, 1]},
|
||||
dtype=np.uint8).reindex(['b', nan], axis=1)
|
||||
assert_frame_equal(res_na, exp_na)
|
||||
|
||||
res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
|
||||
sparse=sparse)
|
||||
exp_just_na = DataFrame(index=np.arange(1))
|
||||
assert_frame_equal(res_just_na, exp_just_na)
|
||||
|
||||
def test_dataframe_dummies_drop_first(self, df, sparse):
|
||||
df = df[['A', 'B']]
|
||||
result = get_dummies(df, drop_first=True, sparse=sparse)
|
||||
expected = DataFrame({'A_b': [0, 1, 0],
|
||||
'B_c': [0, 0, 1]},
|
||||
dtype=np.uint8)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_drop_first_with_categorical(
|
||||
self, df, sparse, dtype):
|
||||
df['cat'] = pd.Categorical(['x', 'y', 'y'])
|
||||
result = get_dummies(df, drop_first=True, sparse=sparse)
|
||||
expected = DataFrame({'C': [1, 2, 3],
|
||||
'A_b': [0, 1, 0],
|
||||
'B_c': [0, 0, 1],
|
||||
'cat_y': [0, 1, 1]})
|
||||
cols = ['A_b', 'B_c', 'cat_y']
|
||||
expected[cols] = expected[cols].astype(np.uint8)
|
||||
expected = expected[['C', 'A_b', 'B_c', 'cat_y']]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
|
||||
df.loc[3, :] = [np.nan, np.nan, np.nan]
|
||||
result = get_dummies(df, dummy_na=True, drop_first=True,
|
||||
sparse=sparse).sort_index(axis=1)
|
||||
expected = DataFrame({'C': [1, 2, 3, np.nan],
|
||||
'A_b': [0, 1, 0, 0],
|
||||
'A_nan': [0, 0, 0, 1],
|
||||
'B_c': [0, 0, 1, 0],
|
||||
'B_nan': [0, 0, 0, 1]})
|
||||
cols = ['A_b', 'A_nan', 'B_c', 'B_nan']
|
||||
expected[cols] = expected[cols].astype(np.uint8)
|
||||
expected = expected.sort_index(axis=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(df, dummy_na=False, drop_first=True,
|
||||
sparse=sparse)
|
||||
expected = expected[['C', 'A_b', 'B_c']]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_int_int(self):
|
||||
data = Series([1, 2, 1])
|
||||
result = pd.get_dummies(data)
|
||||
expected = DataFrame([[1, 0],
|
||||
[0, 1],
|
||||
[1, 0]],
|
||||
columns=[1, 2],
|
||||
dtype=np.uint8)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
data = Series(pd.Categorical(['a', 'b', 'a']))
|
||||
result = pd.get_dummies(data)
|
||||
expected = DataFrame([[1, 0],
|
||||
[0, 1],
|
||||
[1, 0]],
|
||||
columns=pd.Categorical(['a', 'b']),
|
||||
dtype=np.uint8)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_int_df(self, dtype):
|
||||
data = DataFrame(
|
||||
{'A': [1, 2, 1],
|
||||
'B': pd.Categorical(['a', 'b', 'a']),
|
||||
'C': [1, 2, 1],
|
||||
'D': [1., 2., 1.]
|
||||
}
|
||||
)
|
||||
columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b']
|
||||
expected = DataFrame([
|
||||
[1, 1., 1, 0, 1, 0],
|
||||
[2, 2., 0, 1, 0, 1],
|
||||
[1, 1., 1, 0, 1, 0]
|
||||
], columns=columns)
|
||||
expected[columns[2:]] = expected[columns[2:]].astype(dtype)
|
||||
result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
|
||||
# GH13854
|
||||
for ordered in [False, True]:
|
||||
cat = pd.Categorical(list("xy"), categories=list("xyz"),
|
||||
ordered=ordered)
|
||||
result = get_dummies(cat, dtype=dtype)
|
||||
|
||||
data = np.array([[1, 0, 0], [0, 1, 0]],
|
||||
dtype=self.effective_dtype(dtype))
|
||||
cols = pd.CategoricalIndex(cat.categories,
|
||||
categories=cat.categories,
|
||||
ordered=ordered)
|
||||
expected = DataFrame(data, columns=cols,
|
||||
dtype=self.effective_dtype(dtype))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('sparse', [True, False])
|
||||
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
|
||||
# GH18914
|
||||
df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]),
|
||||
('Nation', ['AB', 'CD'])]))
|
||||
df = get_dummies(df, columns=['Nation'], sparse=sparse)
|
||||
df2 = df.reindex(columns=['GDP'])
|
||||
|
||||
tm.assert_frame_equal(df[['GDP']], df2)
|
||||
|
||||
def test_get_dummies_duplicate_columns(self, df):
|
||||
# GH20839
|
||||
df.columns = ["A", "A", "A"]
|
||||
result = get_dummies(df).sort_index(axis=1)
|
||||
|
||||
expected = DataFrame([[1, 1, 0, 1, 0],
|
||||
[2, 0, 1, 1, 0],
|
||||
[3, 1, 0, 0, 1]],
|
||||
columns=['A', 'A_a', 'A_b', 'A_b', 'A_c'],
|
||||
dtype=np.uint8).sort_index(axis=1)
|
||||
|
||||
expected = expected.astype({"A": np.int64})
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestCategoricalReshape(object):
|
||||
|
||||
def test_reshaping_panel_categorical(self):
|
||||
|
||||
with catch_warnings(record=True):
|
||||
p = tm.makePanel()
|
||||
p['str'] = 'foo'
|
||||
df = p.to_frame()
|
||||
|
||||
df['category'] = df['str'].astype('category')
|
||||
result = df['category'].unstack()
|
||||
|
||||
c = Categorical(['foo'] * len(p.major_axis))
|
||||
expected = DataFrame({'A': c.copy(),
|
||||
'B': c.copy(),
|
||||
'C': c.copy(),
|
||||
'D': c.copy()},
|
||||
columns=Index(list('ABCD'), name='minor'),
|
||||
index=p.major_axis.set_names('major'))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestMakeAxisDummies(object):
|
||||
|
||||
def test_preserve_categorical_dtype(self):
|
||||
# GH13854
|
||||
for ordered in [False, True]:
|
||||
cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered)
|
||||
midx = pd.MultiIndex(levels=[['a'], cidx],
|
||||
labels=[[0, 0], [0, 1]])
|
||||
df = DataFrame([[10, 11]], index=midx)
|
||||
|
||||
expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
|
||||
index=midx, columns=cidx)
|
||||
|
||||
from pandas.core.reshape.reshape import make_axis_dummies
|
||||
result = make_axis_dummies(df)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = make_axis_dummies(df, transform=lambda x: x)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -1,607 +0,0 @@
|
||||
import os
|
||||
import pytest
|
||||
|
||||
import numpy as np
|
||||
from pandas.compat import zip
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index,
|
||||
Timestamp, Interval, IntervalIndex, Categorical,
|
||||
cut, qcut, date_range, NaT, TimedeltaIndex)
|
||||
from pandas.tseries.offsets import Nano, Day
|
||||
import pandas.util.testing as tm
|
||||
from pandas.api.types import CategoricalDtype as CDT
|
||||
|
||||
from pandas.core.algorithms import quantile
|
||||
import pandas.core.reshape.tile as tmod
|
||||
|
||||
|
||||
class TestCut(object):
|
||||
|
||||
def test_simple(self):
|
||||
data = np.ones(5, dtype='int64')
|
||||
result = cut(data, 4, labels=False)
|
||||
expected = np.array([1, 1, 1, 1, 1])
|
||||
tm.assert_numpy_array_equal(result, expected,
|
||||
check_dtype=False)
|
||||
|
||||
def test_bins(self):
|
||||
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1])
|
||||
result, bins = cut(data, 3, retbins=True)
|
||||
|
||||
intervals = IntervalIndex.from_breaks(bins.round(3))
|
||||
intervals = intervals.take([0, 0, 0, 1, 2, 0])
|
||||
expected = Categorical(intervals, ordered=True)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
|
||||
6.53333333, 9.7]))
|
||||
|
||||
def test_right(self):
|
||||
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
|
||||
result, bins = cut(data, 4, right=True, retbins=True)
|
||||
intervals = IntervalIndex.from_breaks(bins.round(3))
|
||||
expected = Categorical(intervals, ordered=True)
|
||||
expected = expected.take([0, 0, 0, 2, 3, 0, 0])
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95,
|
||||
7.325, 9.7]))
|
||||
|
||||
def test_noright(self):
|
||||
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
|
||||
result, bins = cut(data, 4, right=False, retbins=True)
|
||||
intervals = IntervalIndex.from_breaks(bins.round(3), closed='left')
|
||||
intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
|
||||
expected = Categorical(intervals, ordered=True)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95,
|
||||
7.325, 9.7095]))
|
||||
|
||||
def test_arraylike(self):
|
||||
data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
|
||||
result, bins = cut(data, 3, retbins=True)
|
||||
intervals = IntervalIndex.from_breaks(bins.round(3))
|
||||
intervals = intervals.take([0, 0, 0, 1, 2, 0])
|
||||
expected = Categorical(intervals, ordered=True)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
|
||||
6.53333333, 9.7]))
|
||||
|
||||
def test_bins_from_intervalindex(self):
|
||||
c = cut(range(5), 3)
|
||||
expected = c
|
||||
result = cut(range(5), bins=expected.categories)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
|
||||
expected = Categorical.from_codes(np.append(c.codes, -1),
|
||||
categories=c.categories,
|
||||
ordered=True)
|
||||
result = cut(range(6), bins=expected.categories)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
|
||||
# doc example
|
||||
# make sure we preserve the bins
|
||||
ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
|
||||
c = cut(ages, bins=[0, 18, 35, 70])
|
||||
expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
|
||||
tm.assert_index_equal(c.categories, expected)
|
||||
|
||||
result = cut([25, 20, 50], bins=c.categories)
|
||||
tm.assert_index_equal(result.categories, expected)
|
||||
tm.assert_numpy_array_equal(result.codes,
|
||||
np.array([1, 1, 2], dtype='int8'))
|
||||
|
||||
def test_bins_not_monotonic(self):
|
||||
data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
|
||||
pytest.raises(ValueError, cut, data, [0.1, 1.5, 1, 10])
|
||||
|
||||
def test_wrong_num_labels(self):
|
||||
data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
|
||||
pytest.raises(ValueError, cut, data, [0, 1, 10],
|
||||
labels=['foo', 'bar', 'baz'])
|
||||
|
||||
def test_cut_corner(self):
|
||||
# h3h
|
||||
pytest.raises(ValueError, cut, [], 2)
|
||||
|
||||
pytest.raises(ValueError, cut, [1, 2, 3], 0.5)
|
||||
|
||||
@pytest.mark.parametrize('arg', [2, np.eye(2), DataFrame(np.eye(2))])
|
||||
@pytest.mark.parametrize('cut_func', [cut, qcut])
|
||||
def test_cut_not_1d_arg(self, arg, cut_func):
|
||||
with pytest.raises(ValueError):
|
||||
cut_func(arg, 2)
|
||||
|
||||
def test_cut_out_of_range_more(self):
|
||||
# #1511
|
||||
s = Series([0, -1, 0, 1, -3], name='x')
|
||||
ind = cut(s, [0, 1], labels=False)
|
||||
exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name='x')
|
||||
tm.assert_series_equal(ind, exp)
|
||||
|
||||
def test_labels(self):
|
||||
arr = np.tile(np.arange(0, 1.01, 0.1), 4)
|
||||
|
||||
result, bins = cut(arr, 4, retbins=True)
|
||||
ex_levels = IntervalIndex.from_breaks([-1e-3, 0.25, 0.5, 0.75, 1])
|
||||
tm.assert_index_equal(result.categories, ex_levels)
|
||||
|
||||
result, bins = cut(arr, 4, retbins=True, right=False)
|
||||
ex_levels = IntervalIndex.from_breaks([0, 0.25, 0.5, 0.75, 1 + 1e-3],
|
||||
closed='left')
|
||||
tm.assert_index_equal(result.categories, ex_levels)
|
||||
|
||||
def test_cut_pass_series_name_to_factor(self):
|
||||
s = Series(np.random.randn(100), name='foo')
|
||||
|
||||
factor = cut(s, 4)
|
||||
assert factor.name == 'foo'
|
||||
|
||||
def test_label_precision(self):
|
||||
arr = np.arange(0, 0.73, 0.01)
|
||||
|
||||
result = cut(arr, 4, precision=2)
|
||||
ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36,
|
||||
0.54, 0.72])
|
||||
tm.assert_index_equal(result.categories, ex_levels)
|
||||
|
||||
def test_na_handling(self):
|
||||
arr = np.arange(0, 0.75, 0.01)
|
||||
arr[::3] = np.nan
|
||||
|
||||
result = cut(arr, 4)
|
||||
|
||||
result_arr = np.asarray(result)
|
||||
|
||||
ex_arr = np.where(isna(arr), np.nan, result_arr)
|
||||
|
||||
tm.assert_almost_equal(result_arr, ex_arr)
|
||||
|
||||
result = cut(arr, 4, labels=False)
|
||||
ex_result = np.where(isna(arr), np.nan, result)
|
||||
tm.assert_almost_equal(result, ex_result)
|
||||
|
||||
def test_inf_handling(self):
|
||||
data = np.arange(6)
|
||||
data_ser = Series(data, dtype='int64')
|
||||
|
||||
bins = [-np.inf, 2, 4, np.inf]
|
||||
result = cut(data, bins)
|
||||
result_ser = cut(data_ser, bins)
|
||||
|
||||
ex_uniques = IntervalIndex.from_breaks(bins)
|
||||
tm.assert_index_equal(result.categories, ex_uniques)
|
||||
assert result[5] == Interval(4, np.inf)
|
||||
assert result[0] == Interval(-np.inf, 2)
|
||||
assert result_ser[5] == Interval(4, np.inf)
|
||||
assert result_ser[0] == Interval(-np.inf, 2)
|
||||
|
||||
def test_qcut(self):
|
||||
arr = np.random.randn(1000)
|
||||
|
||||
# We store the bins as Index that have been rounded
|
||||
# to comparisons are a bit tricky.
|
||||
labels, bins = qcut(arr, 4, retbins=True)
|
||||
ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
|
||||
result = labels.categories.left.values
|
||||
assert np.allclose(result, ex_bins[:-1], atol=1e-2)
|
||||
result = labels.categories.right.values
|
||||
assert np.allclose(result, ex_bins[1:], atol=1e-2)
|
||||
|
||||
ex_levels = cut(arr, ex_bins, include_lowest=True)
|
||||
tm.assert_categorical_equal(labels, ex_levels)
|
||||
|
||||
def test_qcut_bounds(self):
|
||||
arr = np.random.randn(1000)
|
||||
|
||||
factor = qcut(arr, 10, labels=False)
|
||||
assert len(np.unique(factor)) == 10
|
||||
|
||||
def test_qcut_specify_quantiles(self):
|
||||
arr = np.random.randn(100)
|
||||
|
||||
factor = qcut(arr, [0, .25, .5, .75, 1.])
|
||||
expected = qcut(arr, 4)
|
||||
tm.assert_categorical_equal(factor, expected)
|
||||
|
||||
def test_qcut_all_bins_same(self):
|
||||
tm.assert_raises_regex(ValueError, "edges.*unique", qcut,
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)
|
||||
|
||||
def test_cut_out_of_bounds(self):
|
||||
arr = np.random.randn(100)
|
||||
|
||||
result = cut(arr, [-1, 0, 1])
|
||||
|
||||
mask = isna(result)
|
||||
ex_mask = (arr < -1) | (arr > 1)
|
||||
tm.assert_numpy_array_equal(mask, ex_mask)
|
||||
|
||||
def test_cut_pass_labels(self):
|
||||
arr = [50, 5, 10, 15, 20, 30, 70]
|
||||
bins = [0, 25, 50, 100]
|
||||
labels = ['Small', 'Medium', 'Large']
|
||||
|
||||
result = cut(arr, bins, labels=labels)
|
||||
exp = Categorical(['Medium'] + 4 * ['Small'] + ['Medium', 'Large'],
|
||||
categories=labels,
|
||||
ordered=True)
|
||||
tm.assert_categorical_equal(result, exp)
|
||||
|
||||
result = cut(arr, bins, labels=Categorical.from_codes([0, 1, 2],
|
||||
labels))
|
||||
exp = Categorical.from_codes([1] + 4 * [0] + [1, 2], labels)
|
||||
tm.assert_categorical_equal(result, exp)
|
||||
|
||||
# issue 16459
|
||||
labels = ['Good', 'Medium', 'Bad']
|
||||
result = cut(arr, 3, labels=labels)
|
||||
exp = cut(arr, 3, labels=Categorical(labels, categories=labels,
|
||||
ordered=True))
|
||||
tm.assert_categorical_equal(result, exp)
|
||||
|
||||
def test_qcut_include_lowest(self):
|
||||
values = np.arange(10)
|
||||
|
||||
ii = qcut(values, 4)
|
||||
|
||||
ex_levels = IntervalIndex(
|
||||
[Interval(-0.001, 2.25),
|
||||
Interval(2.25, 4.5),
|
||||
Interval(4.5, 6.75),
|
||||
Interval(6.75, 9)])
|
||||
tm.assert_index_equal(ii.categories, ex_levels)
|
||||
|
||||
def test_qcut_nas(self):
|
||||
arr = np.random.randn(100)
|
||||
arr[:20] = np.nan
|
||||
|
||||
result = qcut(arr, 4)
|
||||
assert isna(result[:20]).all()
|
||||
|
||||
def test_qcut_index(self):
|
||||
result = qcut([0, 2], 2)
|
||||
intervals = [Interval(-0.001, 1), Interval(1, 2)]
|
||||
expected = Categorical(intervals, ordered=True)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
|
||||
def test_round_frac(self):
|
||||
# it works
|
||||
result = cut(np.arange(11.), 2)
|
||||
|
||||
result = cut(np.arange(11.) / 1e10, 2)
|
||||
|
||||
# #1979, negative numbers
|
||||
|
||||
result = tmod._round_frac(-117.9998, precision=3)
|
||||
assert result == -118
|
||||
result = tmod._round_frac(117.9998, precision=3)
|
||||
assert result == 118
|
||||
|
||||
result = tmod._round_frac(117.9998, precision=2)
|
||||
assert result == 118
|
||||
result = tmod._round_frac(0.000123456, precision=2)
|
||||
assert result == 0.00012
|
||||
|
||||
def test_qcut_binning_issues(self, datapath):
|
||||
# #1978, 1979
|
||||
cut_file = datapath(os.path.join('reshape', 'data', 'cut_data.csv'))
|
||||
arr = np.loadtxt(cut_file)
|
||||
|
||||
result = qcut(arr, 20)
|
||||
|
||||
starts = []
|
||||
ends = []
|
||||
for lev in np.unique(result):
|
||||
s = lev.left
|
||||
e = lev.right
|
||||
assert s != e
|
||||
|
||||
starts.append(float(s))
|
||||
ends.append(float(e))
|
||||
|
||||
for (sp, sn), (ep, en) in zip(zip(starts[:-1], starts[1:]),
|
||||
zip(ends[:-1], ends[1:])):
|
||||
assert sp < sn
|
||||
assert ep < en
|
||||
assert ep <= sn
|
||||
|
||||
def test_cut_return_intervals(self):
    """cut on a Series yields interval categories with an ordered dtype."""
    s = Series(list(range(9)))

    res = cut(s, 3)

    breaks = np.linspace(0, 8, num=4).round(3)
    breaks[0] -= 0.008  # lowest edge nudged down to include the minimum
    exp = Series(
        IntervalIndex.from_breaks(breaks, closed='right')
        .take([0, 0, 0, 1, 1, 1, 2, 2, 2])
    ).astype(CDT(ordered=True))
    tm.assert_series_equal(res, exp)
def test_qcut_return_intervals(self):
    """qcut with explicit quantile edges yields interval categories."""
    s = Series(list(range(9)))

    res = qcut(s, [0, 0.333, 0.666, 1])

    levels = np.array([Interval(-0.001, 2.664),
                       Interval(2.664, 5.328),
                       Interval(5.328, 8)])
    exp = Series(
        levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])
    ).astype(CDT(ordered=True))
    tm.assert_series_equal(res, exp)
def test_series_retbins(self):
    """retbins=True still returns the categorised Series first (GH 8589)."""
    s = Series(np.arange(4))

    # cut: edges chosen so the minimum is inside the first bin
    result, _ = cut(s, 2, retbins=True)
    expected = Series(
        IntervalIndex.from_breaks([-0.003, 1.5, 3], closed='right').repeat(2)
    ).astype(CDT(ordered=True))
    tm.assert_series_equal(result, expected)

    # qcut: quantile edges, same shape of output
    result, _ = qcut(s, 2, retbins=True)
    expected = Series(
        IntervalIndex.from_breaks([-0.001, 1.5, 3], closed='right').repeat(2)
    ).astype(CDT(ordered=True))
    tm.assert_series_equal(result, expected)
def test_cut_duplicates_bin(self):
    """Duplicate bin edges: ``duplicates='drop'`` dedups them, the default
    (and ``'raise'``) errors, and unknown policies are rejected (GH 20947)."""
    values = Series(np.array([1, 3, 5, 7, 9]),
                    index=["a", "b", "c", "d", "e"])
    bins = [0, 2, 4, 6, 10, 10]

    # dropping duplicates is equivalent to cutting on the deduped edges
    result = cut(values, bins, duplicates='drop')
    expected = cut(values, pd.unique(bins))
    tm.assert_series_equal(result, expected)

    # duplicate edges raise by default and with an explicit 'raise'
    with pytest.raises(ValueError):
        cut(values, bins)
    with pytest.raises(ValueError):
        cut(values, bins, duplicates='raise')

    # an unrecognised duplicates policy is invalid
    with pytest.raises(ValueError):
        cut(values, bins, duplicates='foo')
def test_qcut_duplicates_bin(self):
    """Duplicate quantile edges: ``duplicates='drop'`` collapses them, the
    default (and ``'raise'``) errors, and unknown policies are rejected (GH 7751)."""
    values = [0, 0, 0, 0, 1, 2, 3]
    # 3 requested quantiles collapse to 2 bins once duplicates are dropped
    expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])

    result = qcut(values, 3, duplicates='drop')
    tm.assert_index_equal(result.categories, expected)

    # duplicate edges raise by default and with an explicit 'raise'
    with pytest.raises(ValueError):
        qcut(values, 3)
    with pytest.raises(ValueError):
        qcut(values, 3, duplicates='raise')

    # an unrecognised duplicates policy is invalid
    with pytest.raises(ValueError):
        qcut(values, 3, duplicates='foo')
def test_single_quantile(self):
    """A single quantile yields one degenerate bin containing all values
    (GH 15431); verified for repeated and singleton inputs around 9, -9, 0."""
    # (input data, left edge, right edge) of the single expected interval
    cases = [
        ([9., 9.], 8.999, 9.0),
        ([-9., -9.], -9.001, -9.0),
        ([0., 0.], -0.001, 0.0),
        ([9], 8.999, 9.0),
        ([-9], -9.001, -9.0),
        ([0], -0.001, 0.0),
    ]

    for data, left, right in cases:
        s = Series(data)

        # integer labels: everything lands in bin 0
        result = qcut(s, 1, labels=False)
        tm.assert_series_equal(result, Series([0] * len(data)))

        # interval labels: one right-closed interval repeated per element
        result = qcut(s, 1)
        intervals = IntervalIndex([Interval(left, right)] * len(data),
                                  closed='right')
        expected = Series(intervals).astype(CDT(ordered=True))
        tm.assert_series_equal(result, expected)
def test_single_bin(self):
    """cut with a single bin labels every element 0, for repeated and
    singleton inputs including all-zero data (GH 14652, GH 15428)."""
    for data in ([9., 9.], [-9., -9.], [9], [-9], [0., 0.], [0]):
        s = Series(data)
        result = cut(s, 1, labels=False)
        tm.assert_series_equal(result, Series([0] * len(data)))
@pytest.mark.parametrize(
    "array_1_writeable, array_2_writeable",
    [(True, True), (True, False), (False, False)])
def test_cut_read_only(self, array_1_writeable, array_2_writeable):
    """cut accepts read-only bin arrays and gives the same result
    regardless of writeability (GH 18773)."""
    bins_a = np.arange(0, 100, 10)
    bins_a.flags.writeable = array_1_writeable

    bins_b = np.arange(0, 100, 10)
    bins_b.flags.writeable = array_2_writeable

    data = np.arange(100)
    tm.assert_categorical_equal(cut(data, bins_a),
                                cut(data, bins_b))
class TestDatelike(object):
    """Tests of cut/qcut on datetime-like and timedelta-like inputs."""

    @pytest.mark.parametrize('s', [
        Series(DatetimeIndex(['20180101', NaT, '20180103'])),
        Series(TimedeltaIndex(['0 days', NaT, '2 days']))],
        ids=lambda x: str(x.dtype))
    def test_qcut_nat(self, s):
        # GH 19768: NaT passes through qcut as a missing category
        intervals = IntervalIndex.from_tuples(
            [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])])
        expected = Series(Categorical(intervals, ordered=True))
        result = qcut(s, 2)
        tm.assert_series_equal(result, expected)

    def test_datetime_cut(self):
        # GH 14714: datetime data accepted as Series, list, ndarray and Index
        data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03']))

        result, bins = cut(data, 3, retbins=True)
        expected = (
            Series(IntervalIndex([
                Interval(Timestamp('2012-12-31 23:57:07.200000'),
                         Timestamp('2013-01-01 16:00:00')),
                Interval(Timestamp('2013-01-01 16:00:00'),
                         Timestamp('2013-01-02 08:00:00')),
                Interval(Timestamp('2013-01-02 08:00:00'),
                         Timestamp('2013-01-03 00:00:00'))]))
            .astype(CDT(ordered=True)))

        tm.assert_series_equal(result, expected)

        # list of np.datetime64
        data = [np.datetime64('2013-01-01'), np.datetime64('2013-01-02'),
                np.datetime64('2013-01-03')]
        result, bins = cut(data, 3, retbins=True)
        tm.assert_series_equal(Series(result), expected)

        # ndarray of np.datetime64
        data = np.array([np.datetime64('2013-01-01'),
                         np.datetime64('2013-01-02'),
                         np.datetime64('2013-01-03')])
        result, bins = cut(data, 3, retbins=True)
        tm.assert_series_equal(Series(result), expected)

        # DatetimeIndex
        data = DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03'])
        result, bins = cut(data, 3, retbins=True)
        tm.assert_series_equal(Series(result), expected)

    @pytest.mark.parametrize('bins', [
        3, [Timestamp('2013-01-01 04:57:07.200000'),
            Timestamp('2013-01-01 21:00:00'),
            Timestamp('2013-01-02 13:00:00'),
            Timestamp('2013-01-03 05:00:00')]])
    @pytest.mark.parametrize('box', [list, np.array, Index, Series])
    def test_datetimetz_cut(self, bins, box):
        # GH 19872: tz-aware data keeps its tz through cut
        tz = 'US/Eastern'
        s = Series(date_range('20130101', periods=3, tz=tz))
        if not isinstance(bins, int):
            bins = box(bins)
        result = cut(s, bins)
        expected = (
            Series(IntervalIndex([
                Interval(Timestamp('2012-12-31 23:57:07.200000', tz=tz),
                         Timestamp('2013-01-01 16:00:00', tz=tz)),
                Interval(Timestamp('2013-01-01 16:00:00', tz=tz),
                         Timestamp('2013-01-02 08:00:00', tz=tz)),
                Interval(Timestamp('2013-01-02 08:00:00', tz=tz),
                         Timestamp('2013-01-03 00:00:00', tz=tz))]))
            .astype(CDT(ordered=True)))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)])
    def test_datetimetz_qcut(self, bins):
        # GH 19872: tz-aware data keeps its tz through qcut
        tz = 'US/Eastern'
        s = Series(date_range('20130101', periods=3, tz=tz))
        result = qcut(s, bins)
        expected = (
            Series(IntervalIndex([
                Interval(Timestamp('2012-12-31 23:59:59.999999999', tz=tz),
                         Timestamp('2013-01-01 16:00:00', tz=tz)),
                Interval(Timestamp('2013-01-01 16:00:00', tz=tz),
                         Timestamp('2013-01-02 08:00:00', tz=tz)),
                Interval(Timestamp('2013-01-02 08:00:00', tz=tz),
                         Timestamp('2013-01-03 00:00:00', tz=tz))]))
            .astype(CDT(ordered=True)))
        tm.assert_series_equal(result, expected)

    def test_datetime_bin(self):
        """Explicit datetime bin edges work whatever type the edges come in as."""
        data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')]
        bin_data = ['2012-12-12', '2012-12-14', '2012-12-16']
        expected = (
            Series(IntervalIndex([
                Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
                Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))]))
            .astype(CDT(ordered=True)))

        # BUG FIX: the original iterated [Timestamp, Timestamp, np.datetime64],
        # running the Timestamp conversion twice for no extra coverage.
        for conv in [Timestamp, np.datetime64]:
            bins = [conv(v) for v in bin_data]
            result = cut(data, bins=bins)
            tm.assert_series_equal(Series(result), expected)

        # edges as python datetime objects
        bin_pydatetime = [Timestamp(v).to_pydatetime() for v in bin_data]
        result = cut(data, bins=bin_pydatetime)
        tm.assert_series_equal(Series(result), expected)

        # edges as a DatetimeIndex from to_datetime
        # BUG FIX: the original passed bins=bin_pydatetime here, so the
        # to_datetime edges were computed but never exercised.
        bins = to_datetime(bin_data)
        result = cut(data, bins=bins)
        tm.assert_series_equal(Series(result), expected)

    def test_datetime_nan(self):
        """Numeric bins reject datetime data; out-of-range datetimes become NaN."""
        with pytest.raises(ValueError):
            cut(date_range('20130101', periods=3), bins=[0, 2, 4])

        result = cut(date_range('20130102', periods=5),
                     bins=date_range('20130101', periods=2))
        # the single category itself is not NaN ...
        mask = result.categories.isna()
        tm.assert_numpy_array_equal(mask, np.array([False]))
        # ... but every value past the last edge is
        mask = result.isna()
        tm.assert_numpy_array_equal(
            mask, np.array([False, True, True, True, True]))
-345
@@ -1,345 +0,0 @@
|
||||
import pytest
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas import Categorical, Series, CategoricalIndex
|
||||
from pandas.core.dtypes.concat import union_categoricals
|
||||
from pandas.util import testing as tm
|
||||
|
||||
|
||||
class TestUnionCategoricals(object):
    """Tests for pandas.core.dtypes.concat.union_categoricals."""

    def test_union_categorical(self):
        # GH 13361: codes are concatenated, categories merged by appearance
        cases = [
            (list('abc'), list('abd'), list('abcabd')),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),

            (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
             ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),

            (pd.date_range('2014-01-01', '2014-01-05'),
             pd.date_range('2014-01-06', '2014-01-07'),
             pd.date_range('2014-01-01', '2014-01-07')),

            (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
             pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
             pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),

            (pd.period_range('2014-01-01', '2014-01-05'),
             pd.period_range('2014-01-06', '2014-01-07'),
             pd.period_range('2014-01-01', '2014-01-07')),
        ]

        for left, right, combined in cases:
            # the union is the same whatever container wraps the categoricals
            for box in [Categorical, CategoricalIndex, Series]:
                result = union_categoricals([box(Categorical(left)),
                                             box(Categorical(right))])
                expected = Categorical(combined)
                tm.assert_categorical_equal(result, expected,
                                            check_category_order=True)

        # new categories are ordered by first appearance
        left = Categorical(['x', 'y', 'z'])
        right = Categorical(['a', 'b', 'c'])
        result = union_categoricals([left, right])
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # identical ordered inputs union cleanly
        left = Categorical([0, 1.2, 2], ordered=True)
        right = Categorical([0, 1.2, 2], ordered=True)
        result = union_categoricals([left, right])
        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
        tm.assert_categorical_equal(result, expected)

        # category dtypes must match exactly
        left = Categorical([0, 1.2, 2])
        right = Categorical([2, 3, 4])
        msg = 'dtype of categories must be the same'
        with tm.assert_raises_regex(TypeError, msg):
            union_categoricals([left, right])

        # an empty list of categoricals is rejected
        msg = 'No Categoricals to union'
        with tm.assert_raises_regex(ValueError, msg):
            union_categoricals([])

    def test_union_categoricals_nan(self):
        # GH 13759: NaN values survive the union
        res = union_categoricals([pd.Categorical([1, 2, np.nan]),
                                  pd.Categorical([3, 2, np.nan])])
        exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical(['A', 'B']),
                                  pd.Categorical(['B', 'B', np.nan])])
        exp = Categorical(['A', 'B', 'B', 'B', np.nan])
        tm.assert_categorical_equal(res, exp)

        val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'),
                pd.NaT]
        val2 = [pd.NaT, pd.Timestamp('2011-01-01'),
                pd.Timestamp('2011-02-01')]

        res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
        exp = Categorical(val1 + val2,
                          categories=[pd.Timestamp('2011-01-01'),
                                      pd.Timestamp('2011-03-01'),
                                      pd.Timestamp('2011-02-01')])
        tm.assert_categorical_equal(res, exp)

        # an all-NaN operand contributes no categories
        res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan],
                                                          dtype=object)),
                                  pd.Categorical(['X'])])
        exp = Categorical([np.nan, np.nan, 'X'])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical([np.nan, np.nan]),
                                  pd.Categorical([np.nan, np.nan])])
        exp = Categorical([np.nan, np.nan, np.nan, np.nan])
        tm.assert_categorical_equal(res, exp)

    def test_union_categoricals_empty(self):
        # GH 13759: empty operands are harmless
        res = union_categoricals([pd.Categorical([]),
                                  pd.Categorical([])])
        tm.assert_categorical_equal(res, Categorical([]))

        res = union_categoricals([Categorical([]),
                                  Categorical(['1'])])
        tm.assert_categorical_equal(res, Categorical(['1']))

    def test_union_categorical_same_category(self):
        # identical categories hit the fastpath
        first = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
        second = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        res = union_categoricals([first, second])
        exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
                          categories=[1, 2, 3, 4])
        tm.assert_categorical_equal(res, exp)

        first = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
        second = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
        res = union_categoricals([first, second])
        exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
                          categories=['x', 'y', 'z'])
        tm.assert_categorical_equal(res, exp)

    def test_union_categorical_same_categories_different_order(self):
        # GH 19096: same category sets in different order still union
        first = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])
        second = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([first, second])
        expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_ordered(self):
        # mixing ordered with unordered is an error
        ordered = Categorical([1, 2, 3], ordered=True)
        unordered = Categorical([1, 2, 3], ordered=False)

        msg = 'Categorical.ordered must be the same'
        with tm.assert_raises_regex(TypeError, msg):
            union_categoricals([ordered, unordered])

        # an ordered categorical unions with itself
        res = union_categoricals([ordered, ordered])
        exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        first = Categorical([1, 2, 3, np.nan], ordered=True)
        second = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([first, second])
        exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
        tm.assert_categorical_equal(res, exp)

        # ordered unions require identical category order
        first = Categorical([1, 2, 3], ordered=True)
        second = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        msg = "to union ordered Categoricals, all categories must be the same"
        with tm.assert_raises_regex(TypeError, msg):
            union_categoricals([first, second])

    def test_union_categoricals_ignore_order(self):
        # GH 15219: ignore_order=True drops orderedness constraints
        ordered = Categorical([1, 2, 3], ordered=True)
        unordered = Categorical([1, 2, 3], ordered=False)

        res = union_categoricals([ordered, unordered], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        msg = 'Categorical.ordered must be the same'
        with tm.assert_raises_regex(TypeError, msg):
            union_categoricals([ordered, unordered], ignore_order=False)

        # ignore_order strips orderedness even from matching inputs ...
        res = union_categoricals([ordered, ordered], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        # ... while ignore_order=False keeps it
        res = union_categoricals([ordered, ordered], ignore_order=False)
        exp = Categorical([1, 2, 3, 1, 2, 3],
                          categories=[1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        first = Categorical([1, 2, 3, np.nan], ordered=True)
        second = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([first, second], ignore_order=True)
        exp = Categorical([1, 2, 3, np.nan, 3, 2])
        tm.assert_categorical_equal(res, exp)

        first = Categorical([1, 2, 3], ordered=True)
        second = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        res = union_categoricals([first, second], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([second, first], ignore_order=True,
                                 sort_categories=True)
        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        # disjoint ordered categories union only when order is ignored
        first = Categorical([1, 2, 3], ordered=True)
        second = Categorical([4, 5, 6], ordered=True)
        result = union_categoricals([first, second], ignore_order=True)
        expected = Categorical([1, 2, 3, 4, 5, 6])
        tm.assert_categorical_equal(result, expected)

        msg = "to union ordered Categoricals, all categories must be the same"
        with tm.assert_raises_regex(TypeError, msg):
            union_categoricals([first, second], ignore_order=False)

        with tm.assert_raises_regex(TypeError, msg):
            union_categoricals([first, second])

    def test_union_categoricals_sort(self):
        # GH 13846: sort_categories=True sorts the merged categories
        first = Categorical(['x', 'y', 'z'])
        second = Categorical(['a', 'b', 'c'])
        result = union_categoricals([first, second], sort_categories=True)
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['a', 'b', 'c', 'x', 'y', 'z'])
        tm.assert_categorical_equal(result, expected)

        # fastpath (shared categories)
        first = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
        second = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([first, second], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        first = Categorical(['a', 'b'], categories=['c', 'a', 'b'])
        second = Categorical(['b', 'c'], categories=['c', 'a', 'b'])
        result = union_categoricals([first, second], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath - categories already sorted, no resort needed
        first = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
        second = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
        result = union_categoricals([first, second], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # NaNs are not categories and do not affect the sort
        first = Categorical(['x', np.nan])
        second = Categorical([np.nan, 'b'])
        result = union_categoricals([first, second], sort_categories=True)
        expected = Categorical(['x', np.nan, np.nan, 'b'],
                               categories=['b', 'x'])
        tm.assert_categorical_equal(result, expected)

        first = Categorical([np.nan])
        second = Categorical([np.nan])
        result = union_categoricals([first, second], sort_categories=True)
        tm.assert_categorical_equal(result, Categorical([np.nan, np.nan]))

        first = Categorical([])
        second = Categorical([])
        result = union_categoricals([first, second], sort_categories=True)
        tm.assert_categorical_equal(result, Categorical([]))

        # sorting is incompatible with ordered inputs
        first = Categorical(['b', 'a'], categories=['b', 'a', 'c'],
                            ordered=True)
        second = Categorical(['a', 'c'], categories=['b', 'a', 'c'],
                             ordered=True)
        with pytest.raises(TypeError):
            union_categoricals([first, second], sort_categories=True)

    def test_union_categoricals_sort_false(self):
        # GH 13846: sort_categories=False keeps appearance order
        first = Categorical(['x', 'y', 'z'])
        second = Categorical(['a', 'b', 'c'])
        result = union_categoricals([first, second], sort_categories=False)
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath (shared categories keep their existing order)
        first = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
        second = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([first, second], sort_categories=False)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['b', 'a', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath - categories already sorted
        first = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
        second = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
        result = union_categoricals([first, second], sort_categories=False)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        first = Categorical(['x', np.nan])
        second = Categorical([np.nan, 'b'])
        result = union_categoricals([first, second], sort_categories=False)
        expected = Categorical(['x', np.nan, np.nan, 'b'],
                               categories=['x', 'b'])
        tm.assert_categorical_equal(result, expected)

        first = Categorical([np.nan])
        second = Categorical([np.nan])
        result = union_categoricals([first, second], sort_categories=False)
        tm.assert_categorical_equal(result, Categorical([np.nan, np.nan]))

        first = Categorical([])
        second = Categorical([])
        result = union_categoricals([first, second], sort_categories=False)
        tm.assert_categorical_equal(result, Categorical([]))

        # ordered inputs with identical categories are allowed when not sorting
        first = Categorical(['b', 'a'], categories=['b', 'a', 'c'],
                            ordered=True)
        second = Categorical(['a', 'c'], categories=['b', 'a', 'c'],
                             ordered=True)
        result = union_categoricals([first, second], sort_categories=False)
        expected = Categorical(['b', 'a', 'a', 'c'],
                               categories=['b', 'a', 'c'], ordered=True)
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_unwrap(self):
        # GH 14173: Series/CategoricalIndex operands are unwrapped
        first = Categorical(['a', 'b'])
        second = pd.Series(['b', 'c'], dtype='category')
        result = union_categoricals([first, second])
        expected = Categorical(['a', 'b', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        second = CategoricalIndex(second)
        result = union_categoricals([first, second])
        tm.assert_categorical_equal(result, expected)

        first = Series(first)
        result = union_categoricals([first, second])
        tm.assert_categorical_equal(result, expected)

        # plain lists are not accepted
        with pytest.raises(TypeError):
            union_categoricals([first, ['a', 'b', 'c']])
@@ -1,49 +0,0 @@
|
||||
|
||||
import numpy as np
|
||||
from pandas import date_range, Index
|
||||
import pandas.util.testing as tm
|
||||
from pandas.core.reshape.util import cartesian_product
|
||||
|
||||
|
||||
class TestCartesianProduct(object):
    """Tests for pandas.core.reshape.util.cartesian_product."""

    def test_simple(self):
        letters, numbers = list('ABC'), [1, 22]
        result1, result2 = cartesian_product([letters, numbers])
        # first factor repeats each element, second factor tiles
        tm.assert_numpy_array_equal(
            result1, np.array(['A', 'A', 'B', 'B', 'C', 'C']))
        tm.assert_numpy_array_equal(
            result2, np.array([1, 22, 1, 22, 1, 22]))

    def test_datetimeindex(self):
        # GH 6439: ordering on DatetimeIndex must be consistent
        x = date_range('2000-01-01', periods=2)
        result1, result2 = [Index(y).day for y in cartesian_product([x, x])]
        tm.assert_index_equal(result1, Index([1, 1, 2, 2]))
        tm.assert_index_equal(result2, Index([1, 2, 1, 2]))

    def test_empty(self):
        # any empty factor makes every output empty, with matching dtypes
        X = [[], [0, 1], []]
        Y = [[], [], ['a', 'b', 'c']]
        for x, y in zip(X, Y):
            expected1 = np.array([], dtype=np.asarray(x).dtype)
            expected2 = np.array([], dtype=np.asarray(y).dtype)
            result1, result2 = cartesian_product([x, y])
            tm.assert_numpy_array_equal(result1, expected1)
            tm.assert_numpy_array_equal(result2, expected2)

        # empty input list -> empty product
        assert cartesian_product([]) == []

    def test_invalid_input(self):
        msg = "Input must be a list-like of list-likes"
        for bad in [1, [1], [1, 2], [[1], 2],
                    'a', ['a'], ['a', 'b'], [['a'], 'b']]:
            tm.assert_raises_regex(TypeError, msg, cartesian_product, X=bad)
Reference in New Issue
Block a user