Static code analysis and corrections
This commit is contained in:
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -0,0 +1,858 @@
|
||||
# pylint: disable=E1103
|
||||
|
||||
from warnings import catch_warnings
|
||||
|
||||
import numpy as np
|
||||
from numpy.random import randn
|
||||
import pytest
|
||||
|
||||
from pandas._libs import join as libjoin
|
||||
import pandas.compat as compat
|
||||
from pandas.compat import lrange
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, MultiIndex, Series, concat, merge
|
||||
from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
a_ = np.array
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
|
||||
class TestJoin(object):
|
||||
|
||||
def setup_method(self, method):
    """Build the shared fixtures: two key/value frames (df, df2) for
    merge-style tests and a mixed-type target/source pair for
    join-on-column tests."""
    # aggregate multiple columns
    self.df = DataFrame({'key1': get_test_data(),
                         'key2': get_test_data(),
                         'data1': np.random.randn(N),
                         'data2': np.random.randn(N)})

    # exclude a couple keys for fun
    self.df = self.df[self.df['key2'] > 1]

    self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
                          'key2': get_test_data(ngroups=NGROUPS // 2,
                                                n=N // 5),
                          'value': np.random.randn(N // 5)})

    index, data = tm.getMixedTypeDict()
    self.target = DataFrame(data, index=index)

    # Join on string value
    self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']},
                            index=data['C'])
|
||||
|
||||
def test_cython_left_outer_join(self):
    """Exercise libjoin.left_outer_join directly against hand-computed,
    mergesort-stable take-indexers (-1 marks an unmatched left row)."""
    left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
    right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
    max_group = 5

    ls, rs = libjoin.left_outer_join(left, right, max_group)

    # stable sort so equal keys keep their original relative order
    exp_ls = left.argsort(kind='mergesort')
    exp_rs = right.argsort(kind='mergesort')

    exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                 6, 6, 7, 7, 8, 8, 9, 10])
    exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                 4, 5, 4, 5, 4, 5, -1, -1])

    exp_ls = exp_ls.take(exp_li)
    exp_ls[exp_li == -1] = -1

    exp_rs = exp_rs.take(exp_ri)
    exp_rs[exp_ri == -1] = -1

    tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
    tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
|
||||
|
||||
def test_cython_right_outer_join(self):
    """A right outer join is left_outer_join with the operands swapped;
    verify against hand-computed indexers."""
    left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
    right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
    max_group = 5

    rs, ls = libjoin.left_outer_join(right, left, max_group)

    exp_ls = left.argsort(kind='mergesort')
    exp_rs = right.argsort(kind='mergesort')

    #            0        1    1    1
    exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
                 #            2        2           4
                 6, 7, 8, 6, 7, 8, -1])
    exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
                 4, 4, 4, 5, 5, 5, 6])

    exp_ls = exp_ls.take(exp_li)
    exp_ls[exp_li == -1] = -1

    exp_rs = exp_rs.take(exp_ri)
    exp_rs[exp_ri == -1] = -1

    tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
    tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
|
||||
|
||||
def test_cython_inner_join(self):
    """Exercise libjoin.inner_join against hand-computed indexers; note
    the unmatched key 4 on the right is dropped entirely."""
    left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
    right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
    max_group = 5

    ls, rs = libjoin.inner_join(left, right, max_group)

    exp_ls = left.argsort(kind='mergesort')
    exp_rs = right.argsort(kind='mergesort')

    exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                 6, 6, 7, 7, 8, 8])
    exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                 4, 5, 4, 5, 4, 5])

    exp_ls = exp_ls.take(exp_li)
    exp_ls[exp_li == -1] = -1

    exp_rs = exp_rs.take(exp_ri)
    exp_rs[exp_ri == -1] = -1

    tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
    tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
|
||||
|
||||
def test_left_outer_join(self):
    """Left-join expectations checked for a single-key merge and for the
    default merge on both shared keys."""
    for keys, merge_kwargs in ((['key2'], {'on': 'key2'}),
                               (['key1', 'key2'], {})):
        joined = merge(self.df, self.df2, **merge_kwargs)
        _check_join(self.df, self.df2, joined, keys, how='left')
|
||||
|
||||
def test_right_outer_join(self):
    """Right-join expectations checked for single-key and both-key merges."""
    for keys, merge_kwargs in ((['key2'], {'on': 'key2'}),
                               (['key1', 'key2'], {})):
        joined = merge(self.df, self.df2, how='right', **merge_kwargs)
        _check_join(self.df, self.df2, joined, keys, how='right')
|
||||
|
||||
def test_full_outer_join(self):
    """Outer-join expectations checked for single-key and both-key merges."""
    for keys, merge_kwargs in ((['key2'], {'on': 'key2'}),
                               (['key1', 'key2'], {})):
        joined = merge(self.df, self.df2, how='outer', **merge_kwargs)
        _check_join(self.df, self.df2, joined, keys, how='outer')
|
||||
|
||||
def test_inner_join(self):
    """Inner-join expectations checked for single-key and both-key merges."""
    for keys, merge_kwargs in ((['key2'], {'on': 'key2'}),
                               (['key1', 'key2'], {})):
        joined = merge(self.df, self.df2, how='inner', **merge_kwargs)
        _check_join(self.df, self.df2, joined, keys, how='inner')
|
||||
|
||||
def test_handle_overlap(self):
    """Overlapping non-key columns receive the caller-supplied suffixes."""
    merged = merge(self.df, self.df2, on='key2',
                   suffixes=['.foo', '.bar'])

    for suffixed in ('key1.foo', 'key1.bar'):
        assert suffixed in merged
|
||||
|
||||
def test_handle_overlap_arbitrary_key(self):
    """Suffixes are also applied when joining on differently named keys."""
    merged = merge(self.df, self.df2,
                   left_on='key2', right_on='key1',
                   suffixes=['.foo', '.bar'])
    assert all(col in merged for col in ('key1.foo', 'key2.bar'))
|
||||
|
||||
def test_join_on(self):
    """join(on=...) basics: value propagation, duplicate keys, missing
    keys producing NaN, a missing column raising KeyError, and a dtype
    mismatch between key column and index raising ValueError."""
    target = self.target
    source = self.source

    merged = target.join(source, on='C')
    tm.assert_series_equal(merged['MergedA'], target['A'],
                           check_names=False)
    tm.assert_series_equal(merged['MergedD'], target['D'],
                           check_names=False)

    # join with duplicates (fix regression from DataFrame/Matrix merge)
    df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
    df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
    joined = df.join(df2, on='key')
    expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
                          'value': [0, 0, 1, 1, 2]})
    assert_frame_equal(joined, expected)

    # Test when some are missing
    df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                     columns=['one'])
    df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                     columns=['two'])
    df_c = DataFrame([[1], [2]], index=[1, 2],
                     columns=['three'])
    joined = df_a.join(df_b, on='one')
    joined = joined.join(df_c, on='one')
    # 'c' has key 3, absent from df_b/df_c -> NaN in both joined columns
    assert np.isnan(joined['two']['c'])
    assert np.isnan(joined['three']['c'])

    # merge column not present
    with pytest.raises(KeyError, match="^'E'$"):
        target.join(source, on='E')

    # overlap: key column dtype (float64) incompatible with index (object)
    source_copy = source.copy()
    source_copy['A'] = 0
    msg = ("You are trying to merge on float64 and object columns. If"
           " you wish to proceed you should use pd.concat")
    with pytest.raises(ValueError, match=msg):
        target.join(source_copy, on='A')
|
||||
|
||||
def test_join_on_fails_with_different_right_index(self):
    """A scalar left_on against a 2-level right index must raise."""
    df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                    'b': np.random.randn(3)})
    df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                     'b': np.random.randn(10)},
                    index=tm.makeCustomIndex(10, 2))
    msg = (r'len\(left_on\) must equal the number of levels in the index'
           ' of "right"')
    with pytest.raises(ValueError, match=msg):
        merge(df, df2, left_on='a', right_index=True)
|
||||
|
||||
def test_join_on_fails_with_different_left_index(self):
    """A scalar right_on against a 2-level left index must raise."""
    df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                    'b': np.random.randn(3)},
                   index=tm.makeCustomIndex(3, 2))
    df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                     'b': np.random.randn(10)})
    msg = (r'len\(right_on\) must equal the number of levels in the index'
           ' of "left"')
    with pytest.raises(ValueError, match=msg):
        merge(df, df2, right_on='b', left_index=True)
|
||||
|
||||
def test_join_on_fails_with_different_column_counts(self):
    """left_on and right_on of different lengths must raise."""
    df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                    'b': np.random.randn(3)})
    df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                     'b': np.random.randn(10)},
                    index=tm.makeCustomIndex(10, 2))
    msg = r"len\(right_on\) must equal len\(left_on\)"
    with pytest.raises(ValueError, match=msg):
        merge(df, df2, right_on='a', left_on=['a', 'b'])
|
||||
|
||||
@pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])])
def test_join_on_fails_with_wrong_object_type(self, wrong_type):
    """merge() rejects non-(Series|DataFrame) operands on either side
    with a TypeError naming the offending type."""
    # GH12081 - original issue

    # GH21220 - merging of Series and DataFrame is now allowed
    # Edited test to remove the Series object from test parameters

    df = DataFrame({'a': [1, 1]})
    msg = ("Can only merge Series or DataFrame objects, a {} was passed"
           .format(str(type(wrong_type))))
    with pytest.raises(TypeError, match=msg):
        merge(wrong_type, df, left_on='a', right_on='a')
    with pytest.raises(TypeError, match=msg):
        merge(df, wrong_type, left_on='a', right_on='a')
|
||||
|
||||
def test_join_on_pass_vector(self):
    """Passing the key values as a Series behaves like on='C' by name."""
    expected = self.target.join(self.source, on='C')
    del expected['C']

    # pop the key column out so only the vector itself can be used
    key_values = self.target.pop('C')
    result = self.target.join(self.source, on=key_values)
    assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_with_len0(self):
    """Joining an emptied source: left join gives all-NA source columns,
    inner join gives an empty frame with the same columns."""
    # nothing to merge
    merged = self.target.join(self.source.reindex([]), on='C')
    for col in self.source:
        assert col in merged
        assert merged[col].isna().all()

    merged2 = self.target.join(self.source.reindex([]), on='C',
                               how='inner')
    tm.assert_index_equal(merged2.columns, merged.columns)
    assert len(merged2) == 0
|
||||
|
||||
def test_join_on_inner(self):
    """Inner join(on=...) equals the left join filtered to matched rows."""
    df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
    df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

    joined = df.join(df2, on='key', how='inner')

    expected = df.join(df2, on='key')
    expected = expected[expected['value'].notna()]
    tm.assert_series_equal(joined['key'], expected['key'],
                           check_dtype=False)
    tm.assert_series_equal(joined['value'], expected['value'],
                           check_dtype=False)
    tm.assert_index_equal(joined.index, expected.index)
|
||||
|
||||
def test_join_on_singlekey_list(self):
    """A one-element key list must be equivalent to the bare key name."""
    left = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
    right = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

    # corner case: list form vs scalar form of the same key
    via_list = left.join(right, on=['key'])
    via_scalar = left.join(right, on='key')

    assert_frame_equal(via_list, via_scalar)
|
||||
|
||||
def test_join_on_series(self):
    """Joining a Series equals joining the one-column DataFrame form."""
    as_series = self.target.join(self.source['MergedA'], on='C')
    as_frame = self.target.join(self.source[['MergedA']], on='C')
    assert_frame_equal(as_series, as_frame)
|
||||
|
||||
def test_join_on_series_buglet(self):
    """GH #638: join(on=...) against a named Series broadcasts its values."""
    left = DataFrame({'a': [1, 1]})
    keyed = Series([2], index=[1], name='b')
    result = left.join(keyed, on='a')
    expected = DataFrame({'a': [1, 1],
                          'b': [2, 2]}, index=left.index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_index_mixed(self, join_type):
    """Index joins across frames with disjoint dtypes (no shared blocks),
    both directions, validated against the _join_by_hand reference."""
    # no overlapping blocks
    df1 = DataFrame(index=np.arange(10))
    df1['bool'] = True
    df1['string'] = 'foo'

    df2 = DataFrame(index=np.arange(5, 15))
    df2['int'] = 1
    df2['float'] = 1.

    joined = df1.join(df2, how=join_type)
    expected = _join_by_hand(df1, df2, how=join_type)
    assert_frame_equal(joined, expected)

    joined = df2.join(df1, how=join_type)
    expected = _join_by_hand(df2, df1, how=join_type)
    assert_frame_equal(joined, expected)
|
||||
|
||||
def test_join_index_mixed_overlap(self):
    """Index join with fully overlapping column names: every column
    carries a suffix; checked against the _join_by_hand reference."""
    df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                    index=np.arange(10),
                    columns=['A', 'B', 'C', 'D'])
    assert df1['B'].dtype == np.int64
    assert df1['D'].dtype == np.bool_

    df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                    index=np.arange(0, 10, 2),
                    columns=['A', 'B', 'C', 'D'])

    # overlap
    joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
    expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
                        'A_two', 'B_two', 'C_two', 'D_two']
    df1.columns = expected_columns[:4]
    df2.columns = expected_columns[4:]
    expected = _join_by_hand(df1, df2)
    assert_frame_equal(joined, expected)
|
||||
|
||||
def test_join_empty_bug(self):
|
||||
# generated an exception in 0.4.3
|
||||
x = DataFrame()
|
||||
x.join(DataFrame([3], index=[0], columns=['A']), how='outer')
|
||||
|
||||
def test_join_unconsolidated(self):
|
||||
# GH #331
|
||||
a = DataFrame(randn(30, 2), columns=['a', 'b'])
|
||||
c = Series(randn(30))
|
||||
a['c'] = c
|
||||
d = DataFrame(randn(30, 1), columns=['q'])
|
||||
|
||||
# it works!
|
||||
a.join(d)
|
||||
d.join(a)
|
||||
|
||||
def test_join_multiindex(self):
    """Outer join of two MultiIndexed frames: names survive and the
    result matches explicit reindex-then-join, under sorting by either
    index level."""
    index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                     [1, 2, 3, 1, 2, 3]],
                                    names=['first', 'second'])

    index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                     [1, 2, 3, 1, 2, 3]],
                                    names=['first', 'second'])

    df1 = DataFrame(data=np.random.randn(6), index=index1,
                    columns=['var X'])
    df2 = DataFrame(data=np.random.randn(6), index=index2,
                    columns=['var Y'])

    df1 = df1.sort_index(level=0)
    df2 = df2.sort_index(level=0)

    joined = df1.join(df2, how='outer')
    # union of tuples, then re-join both frames on that union
    ex_index = Index(index1.values).union(Index(index2.values))
    expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
    expected.index.names = index1.names
    assert_frame_equal(joined, expected)
    assert joined.index.names == index1.names

    # repeat with the frames sorted on the second level
    df1 = df1.sort_index(level=1)
    df2 = df2.sort_index(level=1)

    joined = df1.join(df2, how='outer').sort_index(level=0)
    ex_index = Index(index1.values).union(Index(index2.values))
    expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
    expected.index.names = index1.names

    assert_frame_equal(joined, expected)
    assert joined.index.names == index1.names
|
||||
|
||||
def test_join_inner_multiindex(self):
    """Inner join(on=[k1, k2]) against a MultiIndexed frame equals the
    corresponding two-key merge."""
    key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
            'qux', 'snap']
    key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
            'three', 'one']

    data = np.random.randn(len(key1))
    data = DataFrame({'key1': key1, 'key2': key2,
                      'data': data})

    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                               ['one', 'two', 'three']],
                       codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                              [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['first', 'second'])
    to_join = DataFrame(np.random.randn(10, 3), index=index,
                        columns=['j_one', 'j_two', 'j_three'])

    joined = data.join(to_join, on=['key1', 'key2'], how='inner')
    expected = merge(data, to_join.reset_index(),
                     left_on=['key1', 'key2'],
                     right_on=['first', 'second'], how='inner',
                     sort=False)

    # cross-check the mirror-image merge as well
    expected2 = merge(to_join, data,
                      right_on=['key1', 'key2'], left_index=True,
                      how='inner', sort=False)
    assert_frame_equal(joined, expected2.reindex_like(joined))

    expected2 = merge(to_join, data, right_on=['key1', 'key2'],
                      left_index=True, how='inner', sort=False)

    expected = expected.drop(['first', 'second'], axis=1)
    expected.index = joined.index

    assert joined.index.is_monotonic
    assert_frame_equal(joined, expected)

    # _assert_same_contents(expected, expected2.loc[:, expected.columns])
|
||||
|
||||
def test_join_hierarchical_mixed(self):
    """GH 2024: merging a frame with hierarchical columns against a flat
    one works and warns (GH 9455, 12219) about the column-level merge."""
    df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
    # aggregation yields MultiIndex columns ('b', 'mean') / ('b', 'sum')
    new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
    other_df = DataFrame(
        [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
    other_df.set_index('a', inplace=True)
    # GH 9455, 12219
    with tm.assert_produces_warning(UserWarning):
        result = merge(new_df, other_df, left_index=True, right_index=True)
    assert ('b', 'mean') in result
    assert 'b' in result
|
||||
|
||||
def test_join_float64_float32(self):
    """Joins and merges must preserve each column's original dtype
    (no silent float32 -> float64 upcasting)."""
    a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
    b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
    joined = a.join(b)
    assert joined.dtypes['a'] == 'float64'
    assert joined.dtypes['b'] == 'float64'
    assert joined.dtypes['c'] == 'float32'

    a = np.random.randint(0, 5, 100).astype('int64')
    b = np.random.random(100).astype('float64')
    c = np.random.random(100).astype('float32')
    df = DataFrame({'a': a, 'b': b, 'c': c})
    xpdf = DataFrame({'a': a, 'b': b, 'c': c})
    s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
    rs = df.merge(s, left_on='a', right_index=True)
    assert rs.dtypes['a'] == 'int64'
    assert rs.dtypes['b'] == 'float64'
    assert rs.dtypes['c'] == 'float32'
    assert rs.dtypes['md'] == 'float32'

    xp = xpdf.merge(s, left_on='a', right_index=True)
    assert_frame_equal(rs, xp)
|
||||
|
||||
def test_join_many_non_unique_index(self):
    """Multi-frame join on a non-unique MultiIndex (outer and inner),
    cross-checked against chained pairwise merges; plus GH 11519 where
    all join flavors coincide on a fully matching duplicated index."""
    df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
    df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
    df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
    idf1 = df1.set_index(["a", "b"])
    idf2 = df2.set_index(["a", "b"])
    idf3 = df3.set_index(["a", "b"])

    result = idf1.join([idf2, idf3], how='outer')

    df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
    expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

    result = result.reset_index()
    expected = expected[result.columns]
    # outer merge upcasts the keys; realign dtypes for comparison
    expected['a'] = expected.a.astype('int64')
    expected['b'] = expected.b.astype('int64')
    assert_frame_equal(result, expected)

    df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
    df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
    df3 = DataFrame(
        {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
    idf1 = df1.set_index(["a", "b"])
    idf2 = df2.set_index(["a", "b"])
    idf3 = df3.set_index(["a", "b"])
    result = idf1.join([idf2, idf3], how='inner')

    df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
    expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

    result = result.reset_index()

    assert_frame_equal(result, expected.loc[:, result.columns])

    # GH 11519
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})
    s = Series(np.repeat(np.arange(8), 2),
               index=np.repeat(np.arange(8), 2), name='TEST')
    inner = df.join(s, how='inner')
    outer = df.join(s, how='outer')
    left = df.join(s, how='left')
    right = df.join(s, how='right')
    # every key matches, so all four join flavors agree
    assert_frame_equal(inner, outer)
    assert_frame_equal(inner, left)
    assert_frame_equal(inner, right)
|
||||
|
||||
def test_join_sort(self):
    """sort=True orders the result by key values; sort=False keeps the
    left frame's original row order."""
    left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                      'value': [1, 2, 3, 4]})
    right = DataFrame({'value2': ['a', 'b', 'c']},
                      index=['bar', 'baz', 'foo'])

    joined = left.join(right, on='key', sort=True)
    expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
                          'value': [2, 3, 1, 4],
                          'value2': ['a', 'b', 'c', 'c']},
                         index=[1, 2, 0, 3])
    assert_frame_equal(joined, expected)

    # smoke test
    joined = left.join(right, on='key', sort=False)
    tm.assert_index_equal(joined.index, pd.Index(lrange(4)))
|
||||
|
||||
def test_join_mixed_non_unique_index(self):
    """GH 12814: joins with unorderable (mixed int/str) and duplicated
    index entries must still align correctly."""
    left = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
    right = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
    expected = DataFrame({'a': [1, 2, 3, 3, 4],
                          'b': [5, np.nan, 6, 7, np.nan]},
                         index=[1, 2, 3, 3, 'a'])
    tm.assert_frame_equal(left.join(right), expected)

    # duplicates on the left side instead of the right
    left = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
    right = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
    expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]},
                         index=[1, 2, 2, 'a'])
    tm.assert_frame_equal(left.join(right), expected)
|
||||
|
||||
def test_join_non_unique_period_index(self):
    """GH #16871: inner self-join on a non-unique PeriodIndex."""
    index = pd.period_range('2016-01-01', periods=16, freq='M')
    # list(range(...)) replaces the redundant identity comprehension
    # `[i for i in range(len(index))]` -- same values, idiomatic form
    df = DataFrame(list(range(len(index))),
                   index=index, columns=['pnum'])
    df2 = concat([df, df])
    result = df.join(df2, how='inner', rsuffix='_df2')
    # each row matches twice, so both columns repeat every value
    expected = DataFrame(
        np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
        columns=['pnum', 'pnum_df2'], index=df2.sort_index().index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_mixed_type_join_with_suffix(self):
    """GH #916: joining grouped aggregates of a mixed-dtype frame with an
    rsuffix must not raise."""
    frame = DataFrame(np.random.randn(20, 6),
                      columns=['a', 'b', 'c', 'd', 'e', 'f'])
    frame.insert(0, 'id', 0)
    frame.insert(5, 'dt', 'foo')

    by_id = frame.groupby('id')
    mean_agg = by_id.mean()
    count_agg = by_id.count()

    # it works!
    mean_agg.join(count_agg, rsuffix='_right')
|
||||
|
||||
def test_join_many(self):
    """Joining a list of frames: exact reassembly when slices align,
    outer/left/inner semantics when they do not, and a ValueError when
    on= is combined with a list of frames."""
    df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
    df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]

    joined = df_list[0].join(df_list[1:])
    tm.assert_frame_equal(joined, df)

    # overlapping but unequal row slices
    df_list = [df[['a', 'b']][:-2],
               df[['c', 'd']][2:], df[['e', 'f']][1:9]]

    def _check_diff_index(df_list, result, exp_index):
        # reference: reindex every piece onto the expected index first
        reindexed = [x.reindex(exp_index) for x in df_list]
        expected = reindexed[0].join(reindexed[1:])
        tm.assert_frame_equal(result, expected)

    # different join types
    joined = df_list[0].join(df_list[1:], how='outer')
    _check_diff_index(df_list, joined, df.index)

    joined = df_list[0].join(df_list[1:])
    _check_diff_index(df_list, joined, df_list[0].index)

    joined = df_list[0].join(df_list[1:], how='inner')
    _check_diff_index(df_list, joined, df.index[2:8])

    msg = "Joining multiple DataFrames only supported for joining on index"
    with pytest.raises(ValueError, match=msg):
        df_list[0].join(df_list[1:], on='a')
|
||||
|
||||
def test_join_many_mixed(self):
    """Joining a list of column-slices reassembles the original frame."""
    frame = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
    frame['key'] = ['foo', 'bar'] * 4

    pieces = [frame.loc[:, ['A', 'B']],
              frame.loc[:, ['C', 'D']],
              frame.loc[:, ['key']]]

    reassembled = pieces[0].join(pieces[1:])
    assert_frame_equal(reassembled, frame)
|
||||
|
||||
def test_join_dups(self):
    """Self-join of a frame with duplicate column labels, plus GH 4975:
    chained merges on frames that all share the same column names."""
    # joining dups
    df = concat([DataFrame(np.random.randn(10, 4),
                           columns=['A', 'A', 'B', 'B']),
                 DataFrame(np.random.randint(0, 10, size=20)
                           .reshape(10, 2),
                           columns=['A', 'C'])],
                axis=1)

    expected = concat([df, df], axis=1)
    result = df.join(df, rsuffix='_2')
    # suffixing of duplicated labels differs from plain concat, so only
    # the values are compared after realigning the column labels
    result.columns = expected.columns
    assert_frame_equal(result, expected)

    # GH 4975, invalid join on dups
    w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

    dta = x.merge(y, left_index=True, right_index=True).merge(
        z, left_index=True, right_index=True, how="outer")
    dta = dta.merge(w, left_index=True, right_index=True)
    expected = concat([x, y, z, w], axis=1)
    expected.columns = ['x_x', 'y_x', 'x_y',
                        'y_y', 'x_x', 'y_x', 'x_y', 'y_y']
    assert_frame_equal(dta, expected)
|
||||
|
||||
def test_panel_join(self):
    """Legacy Panel.join for all four `how` values, compared against
    hand-built expectations (Panel is deprecated; warnings captured)."""
    with catch_warnings(record=True):
        panel = tm.makePanel()
        tm.add_nans(panel)

        p1 = panel.iloc[:2, :10, :3]
        p2 = panel.iloc[2:, 5:, 2:]

        # left join
        result = p1.join(p2)
        expected = p1.copy()
        expected['ItemC'] = p2['ItemC']
        tm.assert_panel_equal(result, expected)

        # right join
        result = p1.join(p2, how='right')
        expected = p2.copy()
        expected['ItemA'] = p1['ItemA']
        expected['ItemB'] = p1['ItemB']
        expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
        tm.assert_panel_equal(result, expected)

        # inner join
        result = p1.join(p2, how='inner')
        expected = panel.iloc[:, 5:10, 2:3]
        tm.assert_panel_equal(result, expected)

        # outer join
        result = p1.join(p2, how='outer')
        expected = p1.reindex(major=panel.major_axis,
                              minor=panel.minor_axis)
        expected = expected.join(p2.reindex(major=panel.major_axis,
                                            minor=panel.minor_axis))
        tm.assert_panel_equal(result, expected)
|
||||
|
||||
def test_panel_join_overlap(self):
    """Legacy Panel.join with overlapping items gets lsuffix/rsuffix
    applied to the shared item names."""
    with catch_warnings(record=True):
        panel = tm.makePanel()
        tm.add_nans(panel)

        p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
        p2 = panel.loc[['ItemB', 'ItemC']]

        # Expected index is
        #
        # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
        joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
        p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1')
        p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2')
        no_overlap = panel.loc[['ItemA']]
        expected = no_overlap.join(p1_suf.join(p2_suf))
        tm.assert_panel_equal(joined, expected)
|
||||
|
||||
def test_panel_join_many(self):
    """Legacy Panel.join with a list of panels: exact reassembly, inner
    and outer semantics via Panel.from_dict, and unsupported-option
    errors (suffixes, how='right')."""
    with catch_warnings(record=True):
        # temporarily widen tm's default panel width
        tm.K = 10
        panel = tm.makePanel()
        tm.K = 4

        panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]

        joined = panels[0].join(panels[1:])
        tm.assert_panel_equal(joined, panel)

        panels = [panel.iloc[:2, :-5],
                  panel.iloc[2:6, 2:],
                  panel.iloc[6:, 5:-7]]

        data_dict = {}
        for p in panels:
            data_dict.update(p.iteritems())

        joined = panels[0].join(panels[1:], how='inner')
        expected = pd.Panel.from_dict(data_dict, intersect=True)
        tm.assert_panel_equal(joined, expected)

        joined = panels[0].join(panels[1:], how='outer')
        expected = pd.Panel.from_dict(data_dict, intersect=False)
        tm.assert_panel_equal(joined, expected)

        # edge cases
        msg = "Suffixes not supported when passing multiple panels"
        with pytest.raises(ValueError, match=msg):
            panels[0].join(panels[1:], how='outer', lsuffix='foo',
                           rsuffix='bar')
        msg = "Right join not supported with multiple panels"
        with pytest.raises(ValueError, match=msg):
            panels[0].join(panels[1:], how='right')
|
||||
|
||||
def test_join_multi_to_multi(self, join_type):
    """GH 20475: join(on=[...]) between two MultiIndexed frames equals
    the reset_index/merge round trip; a partial key list must raise."""
    leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]],
                                        names=['abc', 'xy', 'num'])
    left = DataFrame({'v1': range(12)}, index=leftindex)

    rightindex = MultiIndex.from_product([list('abc'), list('xy')],
                                         names=['abc', 'xy'])
    right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                      index=rightindex)

    result = left.join(right, on=['abc', 'xy'], how=join_type)
    expected = (left.reset_index()
                    .merge(right.reset_index(),
                           on=['abc', 'xy'], how=join_type)
                    .set_index(['abc', 'xy', 'num'])
                )
    assert_frame_equal(expected, result)

    msg = (r'len\(left_on\) must equal the number of levels in the index'
           ' of "right"')
    with pytest.raises(ValueError, match=msg):
        left.join(right, on='xy', how=join_type)

    with pytest.raises(ValueError, match=msg):
        right.join(left, on=['abc', 'xy'], how=join_type)
|
||||
|
||||
|
||||
def _check_join(left, right, result, join_col, how='left',
                lsuffix='_x', rsuffix='_y'):
    """Validate a merge *result* group-by-group against its inputs.

    For every key group of *result*, the rows contributed by *left* and
    *right* (suffixes stripped) must match the corresponding input group;
    a key absent from one side must produce all-NA columns from that side,
    which is only legal for joins that keep unmatched keys.
    """
    # some smoke tests
    for c in join_col:
        assert(result[c].notna().all())

    left_grouped = left.groupby(join_col)
    right_grouped = right.groupby(join_col)

    for group_key, group in result.groupby(join_col):
        l_joined = _restrict_to_columns(group, left.columns, lsuffix)
        r_joined = _restrict_to_columns(group, right.columns, rsuffix)

        try:
            lgroup = left_grouped.get_group(group_key)
        except KeyError:
            # key missing from left: only allowed for right/outer joins
            if how in ('left', 'inner'):
                raise AssertionError('key %s should not have been in the join'
                                     % str(group_key))

            _assert_all_na(l_joined, left.columns, join_col)
        else:
            _assert_same_contents(l_joined, lgroup)

        try:
            rgroup = right_grouped.get_group(group_key)
        except KeyError:
            # key missing from right: only allowed for left/outer joins
            if how in ('right', 'inner'):
                raise AssertionError('key %s should not have been in the join'
                                     % str(group_key))

            _assert_all_na(r_joined, right.columns, join_col)
        else:
            _assert_same_contents(r_joined, rgroup)
|
||||
|
||||
|
||||
def _restrict_to_columns(group, columns, suffix):
|
||||
found = [c for c in group.columns
|
||||
if c in columns or c.replace(suffix, '') in columns]
|
||||
|
||||
# filter
|
||||
group = group.loc[:, found]
|
||||
|
||||
# get rid of suffixes, if any
|
||||
group = group.rename(columns=lambda x: x.replace(suffix, ''))
|
||||
|
||||
# put in the right order...
|
||||
group = group.loc[:, columns]
|
||||
|
||||
return group
|
||||
|
||||
|
||||
def _assert_same_contents(join_chunk, source):
|
||||
NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
|
||||
|
||||
jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
|
||||
svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
|
||||
|
||||
rows = {tuple(row) for row in jvalues}
|
||||
assert(len(rows) == len(source))
|
||||
assert(all(tuple(row) in rows for row in svalues))
|
||||
|
||||
|
||||
def _assert_all_na(join_chunk, source_columns, join_col):
|
||||
for c in source_columns:
|
||||
if c in join_col:
|
||||
continue
|
||||
assert(join_chunk[c].isna().all())
|
||||
|
||||
|
||||
def _join_by_hand(a, b, how='left'):
|
||||
join_index = a.index.join(b.index, how=how)
|
||||
|
||||
a_re = a.reindex(join_index)
|
||||
b_re = b.reindex(join_index)
|
||||
|
||||
result_columns = a.columns.append(b.columns)
|
||||
|
||||
for col, s in compat.iteritems(b_re):
|
||||
a_re[col] = s
|
||||
return a_re.reindex(columns=result_columns)
|
||||
File diff suppressed because it is too large
Load Diff
+1038
File diff suppressed because it is too large
Load Diff
+177
@@ -0,0 +1,177 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df1():
|
||||
return DataFrame(dict(
|
||||
outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
|
||||
inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
|
||||
v1=np.linspace(0, 1, 11)))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df2():
|
||||
return DataFrame(dict(
|
||||
outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
|
||||
inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
|
||||
v2=np.linspace(10, 11, 12)))
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']])
|
||||
def left_df(request, df1):
|
||||
""" Construct left test DataFrame with specified levels
|
||||
(any of 'outer', 'inner', and 'v1')"""
|
||||
levels = request.param
|
||||
if levels:
|
||||
df1 = df1.set_index(levels)
|
||||
|
||||
return df1
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']])
|
||||
def right_df(request, df2):
|
||||
""" Construct right test DataFrame with specified levels
|
||||
(any of 'outer', 'inner', and 'v2')"""
|
||||
levels = request.param
|
||||
|
||||
if levels:
|
||||
df2 = df2.set_index(levels)
|
||||
|
||||
return df2
|
||||
|
||||
|
||||
def compute_expected(df_left, df_right,
|
||||
on=None, left_on=None, right_on=None, how=None):
|
||||
"""
|
||||
Compute the expected merge result for the test case.
|
||||
|
||||
This method computes the expected result of merging two DataFrames on
|
||||
a combination of their columns and index levels. It does so by
|
||||
explicitly dropping/resetting their named index levels, performing a
|
||||
merge on their columns, and then finally restoring the appropriate
|
||||
index in the result.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_left : DataFrame
|
||||
The left DataFrame (may have zero or more named index levels)
|
||||
df_right : DataFrame
|
||||
The right DataFrame (may have zero or more named index levels)
|
||||
on : list of str
|
||||
The on parameter to the merge operation
|
||||
left_on : list of str
|
||||
The left_on parameter to the merge operation
|
||||
right_on : list of str
|
||||
The right_on parameter to the merge operation
|
||||
how : str
|
||||
The how parameter to the merge operation
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
The expected merge result
|
||||
"""
|
||||
|
||||
# Handle on param if specified
|
||||
if on is not None:
|
||||
left_on, right_on = on, on
|
||||
|
||||
# Compute input named index levels
|
||||
left_levels = [n for n in df_left.index.names if n is not None]
|
||||
right_levels = [n for n in df_right.index.names if n is not None]
|
||||
|
||||
# Compute output named index levels
|
||||
output_levels = [i for i in left_on
|
||||
if i in right_levels and i in left_levels]
|
||||
|
||||
# Drop index levels that aren't involved in the merge
|
||||
drop_left = [n for n in left_levels if n not in left_on]
|
||||
if drop_left:
|
||||
df_left = df_left.reset_index(drop_left, drop=True)
|
||||
|
||||
drop_right = [n for n in right_levels if n not in right_on]
|
||||
if drop_right:
|
||||
df_right = df_right.reset_index(drop_right, drop=True)
|
||||
|
||||
# Convert remaining index levels to columns
|
||||
reset_left = [n for n in left_levels if n in left_on]
|
||||
if reset_left:
|
||||
df_left = df_left.reset_index(level=reset_left)
|
||||
|
||||
reset_right = [n for n in right_levels if n in right_on]
|
||||
if reset_right:
|
||||
df_right = df_right.reset_index(level=reset_right)
|
||||
|
||||
# Perform merge
|
||||
expected = df_left.merge(df_right,
|
||||
left_on=left_on,
|
||||
right_on=right_on,
|
||||
how=how)
|
||||
|
||||
# Restore index levels
|
||||
if output_levels:
|
||||
expected = expected.set_index(output_levels)
|
||||
|
||||
return expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize('on,how',
|
||||
[(['outer'], 'inner'),
|
||||
(['inner'], 'left'),
|
||||
(['outer', 'inner'], 'right'),
|
||||
(['inner', 'outer'], 'outer')])
|
||||
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
|
||||
|
||||
# Construct expected result
|
||||
expected = compute_expected(left_df, right_df, on=on, how=how)
|
||||
|
||||
# Perform merge
|
||||
result = left_df.merge(right_df, on=on, how=how)
|
||||
assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('left_on,right_on,how',
|
||||
[(['outer'], ['outer'], 'inner'),
|
||||
(['inner'], ['inner'], 'right'),
|
||||
(['outer', 'inner'], ['outer', 'inner'], 'left'),
|
||||
(['inner', 'outer'], ['inner', 'outer'], 'outer')])
|
||||
def test_merge_indexes_and_columns_lefton_righton(
|
||||
left_df, right_df, left_on, right_on, how):
|
||||
|
||||
# Construct expected result
|
||||
expected = compute_expected(left_df, right_df,
|
||||
left_on=left_on,
|
||||
right_on=right_on,
|
||||
how=how)
|
||||
|
||||
# Perform merge
|
||||
result = left_df.merge(right_df,
|
||||
left_on=left_on, right_on=right_on, how=how)
|
||||
assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('left_index',
|
||||
['inner', ['inner', 'outer']])
|
||||
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
|
||||
|
||||
# Construct left_df
|
||||
left_df = df1.set_index(left_index)
|
||||
|
||||
# Construct right_df
|
||||
right_df = df2.set_index(['outer', 'inner'])
|
||||
|
||||
# Result
|
||||
expected = (left_df.reset_index()
|
||||
.join(right_df, on=['outer', 'inner'], how=join_type,
|
||||
lsuffix='_x', rsuffix='_y')
|
||||
.set_index(left_index))
|
||||
|
||||
# Perform join
|
||||
result = left_df.join(right_df, on=['outer', 'inner'], how=join_type,
|
||||
lsuffix='_x', rsuffix='_y')
|
||||
|
||||
assert_frame_equal(result, expected, check_like=True)
|
||||
+103
@@ -0,0 +1,103 @@
|
||||
from numpy import nan
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, merge_ordered
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
|
||||
class TestMergeOrdered(object):
|
||||
|
||||
def setup_method(self, method):
|
||||
self.left = DataFrame({'key': ['a', 'c', 'e'],
|
||||
'lvalue': [1, 2., 3]})
|
||||
|
||||
self.right = DataFrame({'key': ['b', 'c', 'd', 'f'],
|
||||
'rvalue': [1, 2, 3., 4]})
|
||||
|
||||
def test_basic(self):
|
||||
result = merge_ordered(self.left, self.right, on='key')
|
||||
expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
|
||||
'lvalue': [1, nan, 2, nan, 3, nan],
|
||||
'rvalue': [nan, 1, 2, 3, nan, 4]})
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_ffill(self):
|
||||
result = merge_ordered(
|
||||
self.left, self.right, on='key', fill_method='ffill')
|
||||
expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
|
||||
'lvalue': [1., 1, 2, 2, 3, 3.],
|
||||
'rvalue': [nan, 1, 2, 3, 3, 4]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_multigroup(self):
|
||||
left = pd.concat([self.left, self.left], ignore_index=True)
|
||||
|
||||
left['group'] = ['a'] * 3 + ['b'] * 3
|
||||
|
||||
result = merge_ordered(left, self.right, on='key', left_by='group',
|
||||
fill_method='ffill')
|
||||
expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2,
|
||||
'lvalue': [1., 1, 2, 2, 3, 3.] * 2,
|
||||
'rvalue': [nan, 1, 2, 3, 3, 4] * 2})
|
||||
expected['group'] = ['a'] * 6 + ['b'] * 6
|
||||
|
||||
assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
result2 = merge_ordered(self.right, left, on='key', right_by='group',
|
||||
fill_method='ffill')
|
||||
assert_frame_equal(result, result2.loc[:, result.columns])
|
||||
|
||||
result = merge_ordered(left, self.right, on='key', left_by='group')
|
||||
assert result['group'].notna().all()
|
||||
|
||||
def test_merge_type(self):
|
||||
class NotADataFrame(DataFrame):
|
||||
|
||||
@property
|
||||
def _constructor(self):
|
||||
return NotADataFrame
|
||||
|
||||
nad = NotADataFrame(self.left)
|
||||
result = nad.merge(self.right, on='key')
|
||||
|
||||
assert isinstance(result, NotADataFrame)
|
||||
|
||||
def test_empty_sequence_concat(self):
|
||||
# GH 9157
|
||||
empty_pat = "[Nn]o objects"
|
||||
none_pat = "objects.*None"
|
||||
test_cases = [
|
||||
((), empty_pat),
|
||||
([], empty_pat),
|
||||
({}, empty_pat),
|
||||
([None], none_pat),
|
||||
([None, None], none_pat)
|
||||
]
|
||||
for df_seq, pattern in test_cases:
|
||||
with pytest.raises(ValueError, match=pattern):
|
||||
pd.concat(df_seq)
|
||||
|
||||
pd.concat([pd.DataFrame()])
|
||||
pd.concat([None, pd.DataFrame()])
|
||||
pd.concat([pd.DataFrame(), None])
|
||||
|
||||
def test_doc_example(self):
|
||||
left = DataFrame({'group': list('aaabbb'),
|
||||
'key': ['a', 'c', 'e', 'a', 'c', 'e'],
|
||||
'lvalue': [1, 2, 3] * 2,
|
||||
})
|
||||
|
||||
right = DataFrame({'key': ['b', 'c', 'd'],
|
||||
'rvalue': [1, 2, 3]})
|
||||
|
||||
result = merge_ordered(left, right, fill_method='ffill',
|
||||
left_by='group')
|
||||
|
||||
expected = DataFrame({'group': list('aaaaabbbbb'),
|
||||
'key': ['a', 'b', 'c', 'd', 'e'] * 2,
|
||||
'lvalue': [1, 1, 2, 2, 3] * 2,
|
||||
'rvalue': [nan, 1, 2, 3, 3] * 2})
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,668 @@
|
||||
# pylint: disable=E1103
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
from numpy import nan
|
||||
from numpy.random import randn
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, MultiIndex, Series
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.reshape.merge import merge
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def left():
|
||||
"""left dataframe (not multi-indexed) for multi-index join tests"""
|
||||
# a little relevant example with NAs
|
||||
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
|
||||
'qux', 'snap']
|
||||
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
|
||||
'three', 'one']
|
||||
|
||||
data = np.random.randn(len(key1))
|
||||
return DataFrame({'key1': key1, 'key2': key2, 'data': data})
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def right():
|
||||
"""right dataframe (multi-indexed) for multi-index join tests"""
|
||||
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
|
||||
['one', 'two', 'three']],
|
||||
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
|
||||
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
|
||||
names=['key1', 'key2'])
|
||||
|
||||
return DataFrame(np.random.randn(10, 3), index=index,
|
||||
columns=['j_one', 'j_two', 'j_three'])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def left_multi():
|
||||
return (
|
||||
DataFrame(
|
||||
dict(Origin=['A', 'A', 'B', 'B', 'C'],
|
||||
Destination=['A', 'B', 'A', 'C', 'A'],
|
||||
Period=['AM', 'AM', 'IP', 'AM', 'OP'],
|
||||
TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
|
||||
Trips=[1987, 3647, 2470, 4296, 4444]),
|
||||
columns=['Origin', 'Destination', 'Period',
|
||||
'TripPurp', 'Trips'])
|
||||
.set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def right_multi():
|
||||
return (
|
||||
DataFrame(
|
||||
dict(Origin=['A', 'A', 'B', 'B', 'C', 'C', 'E'],
|
||||
Destination=['A', 'B', 'A', 'B', 'A', 'B', 'F'],
|
||||
Period=['AM', 'AM', 'IP', 'AM', 'OP', 'IP', 'AM'],
|
||||
LinkType=['a', 'b', 'c', 'b', 'a', 'b', 'a'],
|
||||
Distance=[100, 80, 90, 80, 75, 35, 55]),
|
||||
columns=['Origin', 'Destination', 'Period',
|
||||
'LinkType', 'Distance'])
|
||||
.set_index(['Origin', 'Destination', 'Period', 'LinkType']))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def on_cols_multi():
|
||||
return ['Origin', 'Destination', 'Period']
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def idx_cols_multi():
|
||||
return ['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType']
|
||||
|
||||
|
||||
class TestMergeMulti(object):
|
||||
|
||||
def setup_method(self):
|
||||
self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
|
||||
['one', 'two', 'three']],
|
||||
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
|
||||
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
|
||||
names=['first', 'second'])
|
||||
self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
|
||||
columns=['j_one', 'j_two', 'j_three'])
|
||||
|
||||
# a little relevant example with NAs
|
||||
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
|
||||
'qux', 'snap']
|
||||
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
|
||||
'three', 'one']
|
||||
|
||||
data = np.random.randn(len(key1))
|
||||
self.data = DataFrame({'key1': key1, 'key2': key2,
|
||||
'data': data})
|
||||
|
||||
def test_merge_on_multikey(self, left, right, join_type):
|
||||
on_cols = ['key1', 'key2']
|
||||
result = (left.join(right, on=on_cols, how=join_type)
|
||||
.reset_index(drop=True))
|
||||
|
||||
expected = pd.merge(left, right.reset_index(),
|
||||
on=on_cols, how=join_type)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = (left.join(right, on=on_cols, how=join_type, sort=True)
|
||||
.reset_index(drop=True))
|
||||
|
||||
expected = pd.merge(left, right.reset_index(),
|
||||
on=on_cols, how=join_type, sort=True)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("sort", [False, True])
|
||||
def test_left_join_multi_index(self, left, right, sort):
|
||||
icols = ['1st', '2nd', '3rd']
|
||||
|
||||
def bind_cols(df):
|
||||
iord = lambda a: 0 if a != a else ord(a)
|
||||
f = lambda ts: ts.map(iord) - ord('a')
|
||||
return (f(df['1st']) + f(df['3rd']) * 1e2 +
|
||||
df['2nd'].fillna(0) * 1e4)
|
||||
|
||||
def run_asserts(left, right, sort):
|
||||
res = left.join(right, on=icols, how='left', sort=sort)
|
||||
|
||||
assert len(left) < len(res) + 1
|
||||
assert not res['4th'].isna().any()
|
||||
assert not res['5th'].isna().any()
|
||||
|
||||
tm.assert_series_equal(
|
||||
res['4th'], - res['5th'], check_names=False)
|
||||
result = bind_cols(res.iloc[:, :-2])
|
||||
tm.assert_series_equal(res['4th'], result, check_names=False)
|
||||
assert result.name is None
|
||||
|
||||
if sort:
|
||||
tm.assert_frame_equal(
|
||||
res, res.sort_values(icols, kind='mergesort'))
|
||||
|
||||
out = merge(left, right.reset_index(), on=icols,
|
||||
sort=sort, how='left')
|
||||
|
||||
res.index = np.arange(len(res))
|
||||
tm.assert_frame_equal(out, res)
|
||||
|
||||
lc = list(map(chr, np.arange(ord('a'), ord('z') + 1)))
|
||||
left = DataFrame(np.random.choice(lc, (5000, 2)),
|
||||
columns=['1st', '3rd'])
|
||||
left.insert(1, '2nd', np.random.randint(0, 1000, len(left)))
|
||||
|
||||
i = np.random.permutation(len(left))
|
||||
right = left.iloc[i].copy()
|
||||
|
||||
left['4th'] = bind_cols(left)
|
||||
right['5th'] = - bind_cols(right)
|
||||
right.set_index(icols, inplace=True)
|
||||
|
||||
run_asserts(left, right, sort)
|
||||
|
||||
# inject some nulls
|
||||
left.loc[1::23, '1st'] = np.nan
|
||||
left.loc[2::37, '2nd'] = np.nan
|
||||
left.loc[3::43, '3rd'] = np.nan
|
||||
left['4th'] = bind_cols(left)
|
||||
|
||||
i = np.random.permutation(len(left))
|
||||
right = left.iloc[i, :-1]
|
||||
right['5th'] = - bind_cols(right)
|
||||
right.set_index(icols, inplace=True)
|
||||
|
||||
run_asserts(left, right, sort)
|
||||
|
||||
@pytest.mark.parametrize("sort", [False, True])
|
||||
def test_merge_right_vs_left(self, left, right, sort):
|
||||
# compare left vs right merge with multikey
|
||||
on_cols = ['key1', 'key2']
|
||||
merged_left_right = left.merge(right,
|
||||
left_on=on_cols, right_index=True,
|
||||
how='left', sort=sort)
|
||||
|
||||
merge_right_left = right.merge(left,
|
||||
right_on=on_cols, left_index=True,
|
||||
how='right', sort=sort)
|
||||
|
||||
# Reorder columns
|
||||
merge_right_left = merge_right_left[merged_left_right.columns]
|
||||
|
||||
tm.assert_frame_equal(merged_left_right, merge_right_left)
|
||||
|
||||
def test_compress_group_combinations(self):
|
||||
|
||||
# ~ 40000000 possible unique groups
|
||||
key1 = tm.rands_array(10, 10000)
|
||||
key1 = np.tile(key1, 2)
|
||||
key2 = key1[::-1]
|
||||
|
||||
df = DataFrame({'key1': key1, 'key2': key2,
|
||||
'value1': np.random.randn(20000)})
|
||||
|
||||
df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2],
|
||||
'value2': np.random.randn(10000)})
|
||||
|
||||
# just to hit the label compression code path
|
||||
merge(df, df2, how='outer')
|
||||
|
||||
def test_left_join_index_preserve_order(self):
|
||||
|
||||
on_cols = ['k1', 'k2']
|
||||
left = DataFrame({'k1': [0, 1, 2] * 8,
|
||||
'k2': ['foo', 'bar'] * 12,
|
||||
'v': np.array(np.arange(24), dtype=np.int64)})
|
||||
|
||||
index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
|
||||
right = DataFrame({'v2': [5, 7]}, index=index)
|
||||
|
||||
result = left.join(right, on=on_cols)
|
||||
|
||||
expected = left.copy()
|
||||
expected['v2'] = np.nan
|
||||
expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
|
||||
expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result.sort_values(on_cols, kind='mergesort', inplace=True)
|
||||
expected = left.join(right, on=on_cols, sort=True)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# test join with multi dtypes blocks
|
||||
left = DataFrame({'k1': [0, 1, 2] * 8,
|
||||
'k2': ['foo', 'bar'] * 12,
|
||||
'k3': np.array([0, 1, 2] * 8, dtype=np.float32),
|
||||
'v': np.array(np.arange(24), dtype=np.int32)})
|
||||
|
||||
index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
|
||||
right = DataFrame({'v2': [5, 7]}, index=index)
|
||||
|
||||
result = left.join(right, on=on_cols)
|
||||
|
||||
expected = left.copy()
|
||||
expected['v2'] = np.nan
|
||||
expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
|
||||
expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = result.sort_values(on_cols, kind='mergesort')
|
||||
expected = left.join(right, on=on_cols, sort=True)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_join_index_multi_match_multiindex(self):
|
||||
left = DataFrame([
|
||||
['X', 'Y', 'C', 'a'],
|
||||
['W', 'Y', 'C', 'e'],
|
||||
['V', 'Q', 'A', 'h'],
|
||||
['V', 'R', 'D', 'i'],
|
||||
['X', 'Y', 'D', 'b'],
|
||||
['X', 'Y', 'A', 'c'],
|
||||
['W', 'Q', 'B', 'f'],
|
||||
['W', 'R', 'C', 'g'],
|
||||
['V', 'Y', 'C', 'j'],
|
||||
['X', 'Y', 'B', 'd']],
|
||||
columns=['cola', 'colb', 'colc', 'tag'],
|
||||
index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8])
|
||||
|
||||
right = (DataFrame([
|
||||
['W', 'R', 'C', 0],
|
||||
['W', 'Q', 'B', 3],
|
||||
['W', 'Q', 'B', 8],
|
||||
['X', 'Y', 'A', 1],
|
||||
['X', 'Y', 'A', 4],
|
||||
['X', 'Y', 'B', 5],
|
||||
['X', 'Y', 'C', 6],
|
||||
['X', 'Y', 'C', 9],
|
||||
['X', 'Q', 'C', -6],
|
||||
['X', 'R', 'C', -9],
|
||||
['V', 'Y', 'C', 7],
|
||||
['V', 'R', 'D', 2],
|
||||
['V', 'R', 'D', -1],
|
||||
['V', 'Q', 'A', -3]],
|
||||
columns=['col1', 'col2', 'col3', 'val'])
|
||||
.set_index(['col1', 'col2', 'col3']))
|
||||
|
||||
result = left.join(right, on=['cola', 'colb', 'colc'], how='left')
|
||||
|
||||
expected = DataFrame([
|
||||
['X', 'Y', 'C', 'a', 6],
|
||||
['X', 'Y', 'C', 'a', 9],
|
||||
['W', 'Y', 'C', 'e', nan],
|
||||
['V', 'Q', 'A', 'h', -3],
|
||||
['V', 'R', 'D', 'i', 2],
|
||||
['V', 'R', 'D', 'i', -1],
|
||||
['X', 'Y', 'D', 'b', nan],
|
||||
['X', 'Y', 'A', 'c', 1],
|
||||
['X', 'Y', 'A', 'c', 4],
|
||||
['W', 'Q', 'B', 'f', 3],
|
||||
['W', 'Q', 'B', 'f', 8],
|
||||
['W', 'R', 'C', 'g', 0],
|
||||
['V', 'Y', 'C', 'j', 7],
|
||||
['X', 'Y', 'B', 'd', 5]],
|
||||
columns=['cola', 'colb', 'colc', 'tag', 'val'],
|
||||
index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = left.join(right, on=['cola', 'colb', 'colc'],
|
||||
how='left', sort=True)
|
||||
|
||||
expected = expected.sort_values(['cola', 'colb', 'colc'],
|
||||
kind='mergesort')
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_join_index_multi_match(self):
|
||||
left = DataFrame([
|
||||
['c', 0],
|
||||
['b', 1],
|
||||
['a', 2],
|
||||
['b', 3]],
|
||||
columns=['tag', 'val'],
|
||||
index=[2, 0, 1, 3])
|
||||
|
||||
right = (DataFrame([
|
||||
['a', 'v'],
|
||||
['c', 'w'],
|
||||
['c', 'x'],
|
||||
['d', 'y'],
|
||||
['a', 'z'],
|
||||
['c', 'r'],
|
||||
['e', 'q'],
|
||||
['c', 's']],
|
||||
columns=['tag', 'char'])
|
||||
.set_index('tag'))
|
||||
|
||||
result = left.join(right, on='tag', how='left')
|
||||
|
||||
expected = DataFrame([
|
||||
['c', 0, 'w'],
|
||||
['c', 0, 'x'],
|
||||
['c', 0, 'r'],
|
||||
['c', 0, 's'],
|
||||
['b', 1, nan],
|
||||
['a', 2, 'v'],
|
||||
['a', 2, 'z'],
|
||||
['b', 3, nan]],
|
||||
columns=['tag', 'val', 'char'],
|
||||
index=[2, 2, 2, 2, 0, 1, 1, 3])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = left.join(right, on='tag', how='left', sort=True)
|
||||
expected2 = expected.sort_values('tag', kind='mergesort')
|
||||
|
||||
tm.assert_frame_equal(result, expected2)
|
||||
|
||||
# GH7331 - maintain left frame order in left merge
|
||||
result = merge(left, right.reset_index(), how='left', on='tag')
|
||||
expected.index = np.arange(len(expected))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_merge_na_buglet(self):
|
||||
left = DataFrame({'id': list('abcde'), 'v1': randn(5),
|
||||
'v2': randn(5), 'dummy': list('abcde'),
|
||||
'v3': randn(5)},
|
||||
columns=['id', 'v1', 'v2', 'dummy', 'v3'])
|
||||
right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan],
|
||||
'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]})
|
||||
|
||||
result = merge(left, right, on='id', how='left')
|
||||
|
||||
rdf = right.drop(['id'], axis=1)
|
||||
expected = left.join(rdf)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_merge_na_keys(self):
|
||||
data = [[1950, "A", 1.5],
|
||||
[1950, "B", 1.5],
|
||||
[1955, "B", 1.5],
|
||||
[1960, "B", np.nan],
|
||||
[1970, "B", 4.],
|
||||
[1950, "C", 4.],
|
||||
[1960, "C", np.nan],
|
||||
[1965, "C", 3.],
|
||||
[1970, "C", 4.]]
|
||||
|
||||
frame = DataFrame(data, columns=["year", "panel", "data"])
|
||||
|
||||
other_data = [[1960, 'A', np.nan],
|
||||
[1970, 'A', np.nan],
|
||||
[1955, 'A', np.nan],
|
||||
[1965, 'A', np.nan],
|
||||
[1965, 'B', np.nan],
|
||||
[1955, 'C', np.nan]]
|
||||
other = DataFrame(other_data, columns=['year', 'panel', 'data'])
|
||||
|
||||
result = frame.merge(other, how='outer')
|
||||
|
||||
expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
|
||||
expected = expected.replace(-999, np.nan)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
|
||||
def test_merge_datetime_index(self, klass):
|
||||
# see gh-19038
|
||||
df = DataFrame([1, 2, 3],
|
||||
["2016-01-01", "2017-01-01", "2018-01-01"],
|
||||
columns=["a"])
|
||||
df.index = pd.to_datetime(df.index)
|
||||
on_vector = df.index.year
|
||||
|
||||
if klass is not None:
|
||||
on_vector = klass(on_vector)
|
||||
|
||||
expected = DataFrame(
|
||||
OrderedDict([
|
||||
("a", [1, 2, 3]),
|
||||
("key_1", [2016, 2017, 2018]),
|
||||
])
|
||||
)
|
||||
|
||||
result = df.merge(df, on=["a", on_vector], how="inner")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
OrderedDict([
|
||||
("key_0", [2016, 2017, 2018]),
|
||||
("a_x", [1, 2, 3]),
|
||||
("a_y", [1, 2, 3]),
|
||||
])
|
||||
)
|
||||
|
||||
result = df.merge(df, on=[df.index.year], how="inner")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_multi_levels(self):
|
||||
|
||||
# GH 3662
|
||||
# merge multi-levels
|
||||
household = (
|
||||
DataFrame(
|
||||
dict(household_id=[1, 2, 3],
|
||||
male=[0, 1, 0],
|
||||
wealth=[196087.3, 316478.7, 294750]),
|
||||
columns=['household_id', 'male', 'wealth'])
|
||||
.set_index('household_id'))
|
||||
portfolio = (
|
||||
DataFrame(
|
||||
dict(household_id=[1, 2, 2, 3, 3, 3, 4],
|
||||
asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
|
||||
"gb00b03mlx29", "lu0197800237", "nl0000289965",
|
||||
np.nan],
|
||||
name=["ABN Amro", "Robeco", "Royal Dutch Shell",
|
||||
"Royal Dutch Shell",
|
||||
"AAB Eastern Europe Equity Fund",
|
||||
"Postbank BioTech Fonds", np.nan],
|
||||
share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
|
||||
columns=['household_id', 'asset_id', 'name', 'share'])
|
||||
.set_index(['household_id', 'asset_id']))
|
||||
result = household.join(portfolio, how='inner')
|
||||
expected = (
|
||||
DataFrame(
|
||||
dict(male=[0, 1, 1, 0, 0, 0],
|
||||
wealth=[196087.3, 316478.7, 316478.7,
|
||||
294750.0, 294750.0, 294750.0],
|
||||
name=['ABN Amro', 'Robeco', 'Royal Dutch Shell',
|
||||
'Royal Dutch Shell',
|
||||
'AAB Eastern Europe Equity Fund',
|
||||
'Postbank BioTech Fonds'],
|
||||
share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
|
||||
household_id=[1, 2, 2, 3, 3, 3],
|
||||
asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29',
|
||||
'gb00b03mlx29', 'lu0197800237',
|
||||
'nl0000289965']))
|
||||
.set_index(['household_id', 'asset_id'])
|
||||
.reindex(columns=['male', 'wealth', 'name', 'share']))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# equivalency
|
||||
result = (merge(household.reset_index(), portfolio.reset_index(),
|
||||
on=['household_id'], how='inner')
|
||||
.set_index(['household_id', 'asset_id']))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = household.join(portfolio, how='outer')
|
||||
expected = (concat([
|
||||
expected,
|
||||
(DataFrame(
|
||||
dict(share=[1.00]),
|
||||
index=MultiIndex.from_tuples(
|
||||
[(4, np.nan)],
|
||||
names=['household_id', 'asset_id'])))
|
||||
], axis=0, sort=True).reindex(columns=expected.columns))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# invalid cases
|
||||
household.index.name = 'foo'
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
household.join(portfolio, how='inner')
|
||||
|
||||
portfolio2 = portfolio.copy()
|
||||
portfolio2.index.set_names(['household_id', 'foo'])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
portfolio2.join(portfolio, how='inner')
|
||||
|
||||
def test_join_multi_levels2(self):
|
||||
|
||||
# some more advanced merges
|
||||
# GH6360
|
||||
household = (
|
||||
DataFrame(
|
||||
dict(household_id=[1, 2, 2, 3, 3, 3, 4],
|
||||
asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
|
||||
"gb00b03mlx29", "lu0197800237", "nl0000289965",
|
||||
np.nan],
|
||||
share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
|
||||
columns=['household_id', 'asset_id', 'share'])
|
||||
.set_index(['household_id', 'asset_id']))
|
||||
|
||||
log_return = DataFrame(dict(
|
||||
asset_id=["gb00b03mlx29", "gb00b03mlx29",
|
||||
"gb00b03mlx29", "lu0197800237", "lu0197800237"],
|
||||
t=[233, 234, 235, 180, 181],
|
||||
log_return=[.09604978, -.06524096, .03532373, .03025441, .036997]
|
||||
)).set_index(["asset_id", "t"])
|
||||
|
||||
expected = (
|
||||
DataFrame(dict(
|
||||
household_id=[2, 2, 2, 3, 3, 3, 3, 3],
|
||||
asset_id=["gb00b03mlx29", "gb00b03mlx29",
|
||||
"gb00b03mlx29", "gb00b03mlx29",
|
||||
"gb00b03mlx29", "gb00b03mlx29",
|
||||
"lu0197800237", "lu0197800237"],
|
||||
t=[233, 234, 235, 233, 234, 235, 180, 181],
|
||||
share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
|
||||
log_return=[.09604978, -.06524096, .03532373,
|
||||
.09604978, -.06524096, .03532373,
|
||||
.03025441, .036997]
|
||||
))
|
||||
.set_index(["household_id", "asset_id", "t"])
|
||||
.reindex(columns=['share', 'log_return']))
|
||||
|
||||
# this is the equivalency
|
||||
result = (merge(household.reset_index(), log_return.reset_index(),
|
||||
on=['asset_id'], how='inner')
|
||||
.set_index(['household_id', 'asset_id', 't']))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = (
|
||||
DataFrame(dict(
|
||||
household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
|
||||
asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
|
||||
"gb00b03mlx29", "gb00b03mlx29",
|
||||
"gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
|
||||
"lu0197800237", "lu0197800237",
|
||||
"nl0000289965", None],
|
||||
t=[None, None, 233, 234, 235, 233, 234,
|
||||
235, 180, 181, None, None],
|
||||
share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15,
|
||||
0.15, 0.15, 0.6, 0.6, 0.25, 1.0],
|
||||
log_return=[None, None, .09604978, -.06524096, .03532373,
|
||||
.09604978, -.06524096, .03532373,
|
||||
.03025441, .036997, None, None]
|
||||
))
|
||||
.set_index(["household_id", "asset_id", "t"])
|
||||
.reindex(columns=['share', 'log_return']))
|
||||
|
||||
result = (merge(household.reset_index(), log_return.reset_index(),
|
||||
on=['asset_id'], how='outer')
|
||||
.set_index(['household_id', 'asset_id', 't']))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestJoinMultiMulti(object):
    """Joins between two MultiIndexed frames, validated against an
    equivalent explicit ``pd.merge`` on the index columns."""

    def test_join_multi_multi(self, left_multi, right_multi, join_type,
                              on_cols_multi, idx_cols_multi):
        # A MultiIndex-on-MultiIndex join must agree with a column-wise
        # merge followed by re-indexing on the same key columns.
        merged = pd.merge(left_multi.reset_index(),
                          right_multi.reset_index(),
                          how=join_type, on=on_cols_multi)
        expected = merged.set_index(idx_cols_multi).sort_index()

        result = left_multi.join(right_multi, how=join_type).sort_index()
        tm.assert_frame_equal(result, expected)

    def test_join_multi_empty_frames(self, left_multi, right_multi, join_type,
                                     on_cols_multi, idx_cols_multi):
        # Same equivalence, but with all data columns dropped so only the
        # index structure participates in the join.
        left_multi = left_multi.drop(columns=left_multi.columns)
        right_multi = right_multi.drop(columns=right_multi.columns)

        merged = pd.merge(left_multi.reset_index(),
                          right_multi.reset_index(),
                          how=join_type, on=on_cols_multi)
        expected = merged.set_index(idx_cols_multi).sort_index()

        result = left_multi.join(right_multi, how=join_type).sort_index()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
    def test_merge_datetime_index(self, box):
        # see gh-19038
        df = DataFrame([1, 2, 3],
                       ["2016-01-01", "2017-01-01", "2018-01-01"],
                       columns=["a"])
        df.index = pd.to_datetime(df.index)
        on_vector = df.index.year

        if box is not None:
            on_vector = box(on_vector)

        expected = DataFrame(
            OrderedDict([
                ("a", [1, 2, 3]),
                ("key_1", [2016, 2017, 2018]),
            ])
        )

        result = df.merge(df, on=["a", on_vector], how="inner")
        tm.assert_frame_equal(result, expected)

        expected = DataFrame(
            OrderedDict([
                ("key_0", [2016, 2017, 2018]),
                ("a_x", [1, 2, 3]),
                ("a_y", [1, 2, 3]),
            ])
        )

        result = df.merge(df, on=[df.index.year], how="inner")
        tm.assert_frame_equal(result, expected)

    def test_single_common_level(self):
        # Only the 'key' level is shared; join aligns on it and keeps the
        # remaining levels of both sides in the result index.
        index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'),
                                                ('K1', 'X2')],
                                               names=['key', 'X'])

        left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                             'B': ['B0', 'B1', 'B2']},
                            index=index_left)

        index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
                                                 ('K2', 'Y2'), ('K2', 'Y3')],
                                                names=['key', 'Y'])

        right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
                              'D': ['D0', 'D1', 'D2', 'D3']},
                             index=index_right)

        result = left.join(right)
        expected = (pd.merge(left.reset_index(), right.reset_index(),
                             on=['key'], how='inner')
                    .set_index(['key', 'X', 'Y']))

        tm.assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,458 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical, DataFrame, DatetimeIndex, Index, Interval, IntervalIndex,
|
||||
Series, TimedeltaIndex, Timestamp, cut, date_range, isna, qcut,
|
||||
timedelta_range, to_datetime)
|
||||
from pandas.api.types import CategoricalDtype as CDT
|
||||
import pandas.core.reshape.tile as tmod
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def test_simple():
    """cut with labels=False returns integer bin codes, not intervals."""
    data = np.ones(5, dtype="int64")
    result = cut(data, 4, labels=False)

    expected = np.array([1, 1, 1, 1, 1])
    tm.assert_numpy_array_equal(result, expected, check_dtype=False)


def test_bins():
    """retbins=True also returns the computed bin edges."""
    data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1])
    result, bins = cut(data, 3, retbins=True)

    intervals = IntervalIndex.from_breaks(bins.round(3)).take([0, 0, 0,
                                                               1, 2, 0])
    expected = Categorical(intervals, ordered=True)

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
                                           6.53333333, 9.7]))


def test_right():
    """With right=True the bins are right-closed."""
    data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
    result, bins = cut(data, 4, right=True, retbins=True)

    intervals = IntervalIndex.from_breaks(bins.round(3))
    expected = Categorical(intervals, ordered=True).take([0, 0, 0, 2,
                                                          3, 0, 0])

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7]))


def test_no_right():
    """With right=False the bins are left-closed instead."""
    data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
    result, bins = cut(data, 4, right=False, retbins=True)

    intervals = IntervalIndex.from_breaks(bins.round(3), closed="left")
    intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
    expected = Categorical(intervals, ordered=True)

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))


def test_array_like():
    """A plain Python list is treated like an ndarray input."""
    data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
    result, bins = cut(data, 3, retbins=True)

    intervals = IntervalIndex.from_breaks(bins.round(3)).take([0, 0, 0,
                                                               1, 2, 0])
    expected = Categorical(intervals, ordered=True)

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
                                           6.53333333, 9.7]))
|
||||
|
||||
|
||||
def test_bins_from_interval_index():
|
||||
c = cut(range(5), 3)
|
||||
expected = c
|
||||
result = cut(range(5), bins=expected.categories)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
|
||||
expected = Categorical.from_codes(np.append(c.codes, -1),
|
||||
categories=c.categories,
|
||||
ordered=True)
|
||||
result = cut(range(6), bins=expected.categories)
|
||||
tm.assert_categorical_equal(result, expected)
|
||||
|
||||
|
||||
def test_bins_from_interval_index_doc_example():
|
||||
# Make sure we preserve the bins.
|
||||
ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
|
||||
c = cut(ages, bins=[0, 18, 35, 70])
|
||||
expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
|
||||
tm.assert_index_equal(c.categories, expected)
|
||||
|
||||
result = cut([25, 20, 50], bins=c.categories)
|
||||
tm.assert_index_equal(result.categories, expected)
|
||||
tm.assert_numpy_array_equal(result.codes,
|
||||
np.array([1, 1, 2], dtype="int8"))
|
||||
|
||||
|
||||
def test_bins_not_overlapping_from_interval_index():
|
||||
# see gh-23980
|
||||
msg = "Overlapping IntervalIndex is not accepted"
|
||||
ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
cut([5, 6], bins=ii)
|
||||
|
||||
|
||||
def test_bins_not_monotonic():
    """Non-monotonic explicit bin edges are rejected."""
    msg = "bins must increase monotonically"
    data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]

    with pytest.raises(ValueError, match=msg):
        cut(data, [0.1, 1.5, 1, 10])


def test_wrong_num_labels():
    """len(labels) must be exactly len(bins) - 1."""
    msg = "Bin labels must be one fewer than the number of bin edges"
    data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]

    with pytest.raises(ValueError, match=msg):
        cut(data, [0, 1, 10], labels=["foo", "bar", "baz"])


@pytest.mark.parametrize("x,bins,msg", [
    ([], 2, "Cannot cut empty array"),
    ([1, 2, 3], 0.5, "`bins` should be a positive integer")
])
def test_cut_corner(x, bins, msg):
    """Degenerate inputs raise an informative ValueError."""
    with pytest.raises(ValueError, match=msg):
        cut(x, bins)


@pytest.mark.parametrize("arg", [2, np.eye(2), DataFrame(np.eye(2))])
@pytest.mark.parametrize("cut_func", [cut, qcut])
def test_cut_not_1d_arg(arg, cut_func):
    """cut/qcut only accept one-dimensional input."""
    msg = "Input array must be 1 dimensional"
    with pytest.raises(ValueError, match=msg):
        cut_func(arg, 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('data', [
    [0, 1, 2, 3, 4, np.inf],
    [-np.inf, 0, 1, 2, 3, 4],
    [-np.inf, 0, 1, 2, 3, 4, np.inf]])
def test_int_bins_with_inf(data):
    """An integer bin count cannot be combined with infinite data (GH 24314)."""
    msg = 'cannot specify integer `bins` when input data contains infinity'
    with pytest.raises(ValueError, match=msg):
        cut(data, bins=3)
|
||||
|
||||
|
||||
def test_cut_out_of_range_more():
    """Values outside the bin range become NaN; the name survives (gh-1511)."""
    name = "x"

    ser = Series([0, -1, 0, 1, -3], name=name)
    ind = cut(ser, [0, 1], labels=False)

    exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name=name)
    tm.assert_series_equal(ind, exp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("right,breaks,closed", [
    (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"),
    (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left")
])
def test_labels(right, breaks, closed):
    """The `right` flag controls which side of each bin is closed."""
    arr = np.tile(np.arange(0, 1.01, 0.1), 4)

    result, bins = cut(arr, 4, retbins=True, right=right)
    ex_levels = IntervalIndex.from_breaks(breaks, closed=closed)
    tm.assert_index_equal(result.categories, ex_levels)
|
||||
|
||||
|
||||
def test_cut_pass_series_name_to_factor():
    """The input Series name is propagated to the categorical result."""
    name = "foo"
    ser = Series(np.random.randn(100), name=name)

    factor = cut(ser, 4)
    assert factor.name == name


def test_label_precision():
    """`precision` controls the rounding of the generated bin edges."""
    arr = np.arange(0, 0.73, 0.01)
    result = cut(arr, 4, precision=2)

    ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72])
    tm.assert_index_equal(result.categories, ex_levels)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("labels", [None, False])
def test_na_handling(labels):
    """NaNs in the input stay NaN in the output, with or without labels."""
    arr = np.arange(0, 0.75, 0.01)
    arr[::3] = np.nan

    result = cut(arr, 4, labels=labels)
    result = np.asarray(result)

    # Every position that was NaN on input must be NaN on output; all
    # other positions are compared against themselves (always equal).
    expected = np.where(isna(arr), np.nan, result)
    tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
def test_inf_handling():
    """Infinite bin edges yield unbounded intervals for array and Series."""
    data = np.arange(6)
    data_ser = Series(data, dtype="int64")

    bins = [-np.inf, 2, 4, np.inf]
    result = cut(data, bins)
    result_ser = cut(data_ser, bins)

    ex_uniques = IntervalIndex.from_breaks(bins)
    tm.assert_index_equal(result.categories, ex_uniques)

    # The extreme values fall into the open-ended first/last intervals.
    assert result[5] == Interval(4, np.inf)
    assert result[0] == Interval(-np.inf, 2)
    assert result_ser[5] == Interval(4, np.inf)
    assert result_ser[0] == Interval(-np.inf, 2)


def test_cut_out_of_bounds():
    """Values left of the first or right of the last edge become NaN."""
    arr = np.random.randn(100)
    result = cut(arr, [-1, 0, 1])

    mask = isna(result)
    ex_mask = (arr < -1) | (arr > 1)
    tm.assert_numpy_array_equal(mask, ex_mask)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("get_labels,get_expected", [
    (lambda labels: labels,
     lambda labels: Categorical(["Medium"] + 4 * ["Small"] +
                                ["Medium", "Large"],
                                categories=labels, ordered=True)),
    (lambda labels: Categorical.from_codes([0, 1, 2], labels),
     lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels))
])
def test_cut_pass_labels(get_labels, get_expected):
    """Explicit labels (list or Categorical) are attached to the bins."""
    bins = [0, 25, 50, 100]
    arr = [50, 5, 10, 15, 20, 30, 70]
    labels = ["Small", "Medium", "Large"]

    result = cut(arr, bins, labels=get_labels(labels))
    tm.assert_categorical_equal(result, get_expected(labels))
|
||||
|
||||
|
||||
def test_cut_pass_labels_compat():
    """A plain list of labels behaves like an ordered Categorical (gh-16459)."""
    arr = [50, 5, 10, 15, 20, 30, 70]
    labels = ["Good", "Medium", "Bad"]

    result = cut(arr, 3, labels=labels)
    exp = cut(arr, 3, labels=Categorical(labels, categories=labels,
                                         ordered=True))
    tm.assert_categorical_equal(result, exp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("x", [np.arange(11.), np.arange(11.) / 1e10])
def test_round_frac_just_works(x):
    """Smoke test: edge rounding copes with tiny magnitudes."""
    cut(x, 2)


@pytest.mark.parametrize("val,precision,expected", [
    (-117.9998, 3, -118),
    (117.9998, 3, 118),
    (117.9998, 2, 118),
    (0.000123456, 2, 0.00012)
])
def test_round_frac(val, precision, expected):
    """_round_frac keeps `precision` significant fraction digits (gh-1979)."""
    result = tmod._round_frac(val, precision=precision)
    assert result == expected
|
||||
|
||||
|
||||
def test_cut_return_intervals():
    """By default cut returns an ordered categorical of intervals."""
    ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
    result = cut(ser, 3)

    exp_bins = np.linspace(0, 8, num=4).round(3)
    exp_bins[0] -= 0.008  # first edge is nudged left so the minimum is included

    expected = Series(IntervalIndex.from_breaks(exp_bins, closed="right").take(
        [0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True))
    tm.assert_series_equal(result, expected)


def test_series_ret_bins():
    """retbins=True works for Series input as well (gh-8589)."""
    ser = Series(np.arange(4))
    result, bins = cut(ser, 2, retbins=True)

    expected = Series(IntervalIndex.from_breaks(
        [-0.003, 1.5, 3], closed="right").repeat(2)).astype(CDT(ordered=True))
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs,msg", [
    (dict(duplicates="drop"), None),
    (dict(), "Bin edges must be unique"),
    (dict(duplicates="raise"), "Bin edges must be unique"),
    (dict(duplicates="foo"), "invalid value for 'duplicates' parameter")
])
def test_cut_duplicates_bin(kwargs, msg):
    """Duplicate bin edges raise unless duplicates='drop' (gh-20947)."""
    bins = [0, 2, 4, 6, 10, 10]
    values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])

    if msg is not None:
        with pytest.raises(ValueError, match=msg):
            cut(values, bins, **kwargs)
    else:
        # duplicates='drop' must behave as if the edges were de-duplicated.
        result = cut(values, bins, **kwargs)
        expected = cut(values, pd.unique(bins))
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data", [9.0, -9.0, 0.0])
@pytest.mark.parametrize("length", [1, 2])
def test_single_bin(data, length):
    """A constant column can still be cut into one bin (gh-14652, gh-15428)."""
    ser = Series([data] * length)
    result = cut(ser, 1, labels=False)

    expected = Series([0] * length)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "array_1_writeable,array_2_writeable",
    [(True, True), (True, False), (False, False)])
def test_cut_read_only(array_1_writeable, array_2_writeable):
    """cut must not attempt to mutate read-only bin arrays (issue 18773)."""
    array_1 = np.arange(0, 100, 10)
    array_1.flags.writeable = array_1_writeable

    array_2 = np.arange(0, 100, 10)
    array_2.flags.writeable = array_2_writeable

    hundred_elements = np.arange(100)
    tm.assert_categorical_equal(cut(hundred_elements, array_1),
                                cut(hundred_elements, array_2))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("conv", [
    lambda v: Timestamp(v),
    lambda v: to_datetime(v),
    lambda v: np.datetime64(v),
    lambda v: Timestamp(v).to_pydatetime(),
])
def test_datetime_bin(conv):
    """Datetime-like bin edges are accepted in several representations."""
    data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")]
    bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"]

    expected = Series(IntervalIndex([
        Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
        Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))])).astype(
        CDT(ordered=True))

    bins = [conv(v) for v in bin_data]
    result = Series(cut(data, bins=bins))
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("data", [
    to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])),
    [np.datetime64("2013-01-01"), np.datetime64("2013-01-02"),
     np.datetime64("2013-01-03")],
    np.array([np.datetime64("2013-01-01"), np.datetime64("2013-01-02"),
              np.datetime64("2013-01-03")]),
    DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"])
])
def test_datetime_cut(data):
    """Cutting time data works across collection types (gh-14714)."""
    result, _ = cut(data, 3, retbins=True)
    expected = Series(IntervalIndex([
        Interval(Timestamp("2012-12-31 23:57:07.200000"),
                 Timestamp("2013-01-01 16:00:00")),
        Interval(Timestamp("2013-01-01 16:00:00"),
                 Timestamp("2013-01-02 08:00:00")),
        Interval(Timestamp("2013-01-02 08:00:00"),
                 Timestamp("2013-01-03 00:00:00"))])).astype(CDT(ordered=True))
    tm.assert_series_equal(Series(result), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bins", [
    3, [Timestamp("2013-01-01 04:57:07.200000"),
        Timestamp("2013-01-01 21:00:00"),
        Timestamp("2013-01-02 13:00:00"),
        Timestamp("2013-01-03 05:00:00")]])
@pytest.mark.parametrize("box", [list, np.array, Index, Series])
def test_datetime_tz_cut(bins, box):
    """tz-aware data keeps its timezone through cut (gh-19872)."""
    tz = "US/Eastern"
    s = Series(date_range("20130101", periods=3, tz=tz))

    if not isinstance(bins, int):
        bins = box(bins)

    result = cut(s, bins)
    expected = Series(IntervalIndex([
        Interval(Timestamp("2012-12-31 23:57:07.200000", tz=tz),
                 Timestamp("2013-01-01 16:00:00", tz=tz)),
        Interval(Timestamp("2013-01-01 16:00:00", tz=tz),
                 Timestamp("2013-01-02 08:00:00", tz=tz)),
        Interval(Timestamp("2013-01-02 08:00:00", tz=tz),
                 Timestamp("2013-01-03 00:00:00", tz=tz))])).astype(
        CDT(ordered=True))
    tm.assert_series_equal(result, expected)


def test_datetime_nan_error():
    """Numeric bins cannot be applied to datetime data."""
    msg = "bins must be of datetime64 dtype"

    with pytest.raises(ValueError, match=msg):
        cut(date_range("20130101", periods=3), bins=[0, 2, 4])
|
||||
|
||||
|
||||
def test_datetime_nan_mask():
    """Out-of-range datetimes become NaN without poisoning the categories."""
    result = cut(date_range("20130102", periods=5),
                 bins=date_range("20130101", periods=2))

    # The single generated category itself must not be NaN ...
    mask = result.categories.isna()
    tm.assert_numpy_array_equal(mask, np.array([False]))

    # ... while every value past the last bin edge is missing.
    mask = result.isna()
    tm.assert_numpy_array_equal(mask, np.array([False, True, True,
                                                True, True]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
def test_datetime_cut_roundtrip(tz):
    """Bins returned by cut reproduce the same cut when fed back (gh-19891)."""
    ser = Series(date_range("20180101", periods=3, tz=tz))
    result, result_bins = cut(ser, 2, retbins=True)

    expected = cut(ser, result_bins)
    tm.assert_series_equal(result, expected)

    expected_bins = DatetimeIndex(["2017-12-31 23:57:07.200000",
                                   "2018-01-02 00:00:00",
                                   "2018-01-03 00:00:00"])
    expected_bins = expected_bins.tz_localize(tz)
    tm.assert_index_equal(result_bins, expected_bins)
|
||||
|
||||
|
||||
def test_timedelta_cut_roundtrip():
    """Same roundtrip guarantee for timedelta data (gh-19891)."""
    ser = Series(timedelta_range("1day", periods=3))
    result, result_bins = cut(ser, 2, retbins=True)

    expected = cut(ser, result_bins)
    tm.assert_series_equal(result, expected)

    expected_bins = TimedeltaIndex(["0 days 23:57:07.200000",
                                    "2 days 00:00:00",
                                    "3 days 00:00:00"])
    tm.assert_index_equal(result_bins, expected_bins)
|
||||
@@ -0,0 +1,718 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# pylint: disable-msg=W0612,E1101
|
||||
|
||||
import numpy as np
|
||||
from numpy import nan
|
||||
import pytest
|
||||
|
||||
from pandas.compat import range
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, lreshape, melt, wide_to_long
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
class TestMelt(object):
    """DataFrame.melt / pd.melt: column naming, MultiIndex handling,
    dtype preservation, and error reporting for missing columns."""

    def setup_method(self, method):
        # 10-row time frame plus two boolean-derived id columns.
        self.df = tm.makeTimeDataFrame()[:10]
        self.df['id1'] = (self.df['A'] > 0).astype(np.int64)
        self.df['id2'] = (self.df['B'] > 0).astype(np.int64)

        self.var_name = 'var'
        self.value_name = 'val'

        # Small frame with a two-level ('CAP', 'low') column MultiIndex.
        self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867],
                                 [-1.321405, 0.368915, -1.055342],
                                 [-0.807333, 0.08298, -0.873361]])
        self.df1.columns = [list('ABC'), list('abc')]
        self.df1.columns.names = ['CAP', 'low']

    def test_top_level_method(self):
        result = melt(self.df)
        assert result.columns.tolist() == ['variable', 'value']

    def test_method_signatures(self):
        # The DataFrame method and the top-level function must agree.
        tm.assert_frame_equal(self.df.melt(),
                              melt(self.df))

        tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'],
                                           value_vars=['A', 'B']),
                              melt(self.df,
                                   id_vars=['id1', 'id2'],
                                   value_vars=['A', 'B']))

        tm.assert_frame_equal(self.df.melt(var_name=self.var_name,
                                           value_name=self.value_name),
                              melt(self.df,
                                   var_name=self.var_name,
                                   value_name=self.value_name))

        tm.assert_frame_equal(self.df1.melt(col_level=0),
                              melt(self.df1, col_level=0))

    def test_default_col_names(self):
        result = self.df.melt()
        assert result.columns.tolist() == ['variable', 'value']

        result1 = self.df.melt(id_vars=['id1'])
        assert result1.columns.tolist() == ['id1', 'variable', 'value']

        result2 = self.df.melt(id_vars=['id1', 'id2'])
        assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value']

    def test_value_vars(self):
        result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A')
        assert len(result3) == 10

        result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'])
        expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                               'id2': self.df['id2'].tolist() * 2,
                               'variable': ['A'] * 10 + ['B'] * 10,
                               'value': (self.df['A'].tolist() +
                                         self.df['B'].tolist())},
                              columns=['id1', 'id2', 'variable', 'value'])
        tm.assert_frame_equal(result4, expected4)

    def test_value_vars_types(self):
        # GH 15348: tuple, list and ndarray are all valid value_vars.
        expected = DataFrame({'id1': self.df['id1'].tolist() * 2,
                              'id2': self.df['id2'].tolist() * 2,
                              'variable': ['A'] * 10 + ['B'] * 10,
                              'value': (self.df['A'].tolist() +
                                        self.df['B'].tolist())},
                             columns=['id1', 'id2', 'variable', 'value'])

        for type_ in (tuple, list, np.array):
            result = self.df.melt(id_vars=['id1', 'id2'],
                                  value_vars=type_(('A', 'B')))
            tm.assert_frame_equal(result, expected)

    def test_vars_work_with_multiindex(self):
        expected = DataFrame({
            ('A', 'a'): self.df1[('A', 'a')],
            'CAP': ['B'] * len(self.df1),
            'low': ['b'] * len(self.df1),
            'value': self.df1[('B', 'b')],
        }, columns=[('A', 'a'), 'CAP', 'low', 'value'])

        result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')])
        tm.assert_frame_equal(result, expected)

    def test_single_vars_work_with_multiindex(self):
        expected = DataFrame({
            'A': {0: 1.067683, 1: -1.321405, 2: -0.807333},
            'CAP': {0: 'B', 1: 'B', 2: 'B'},
            'value': {0: -1.110463, 1: 0.368915, 2: 0.08298}})
        result = self.df1.melt(['A'], ['B'], col_level=0)
        tm.assert_frame_equal(result, expected)

    def test_tuple_vars_fail_with_multiindex(self):
        # melt should fail with an informative error message if
        # the columns have a MultiIndex and a tuple is passed
        # for id_vars or value_vars.
        tuple_a = ('A', 'a')
        list_a = [tuple_a]
        tuple_b = ('B', 'b')
        list_b = [tuple_b]

        msg = (r"(id|value)_vars must be a list of tuples when columns are"
               " a MultiIndex")
        for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b),
                                    (tuple_a, tuple_b)):
            with pytest.raises(ValueError, match=msg):
                self.df1.melt(id_vars=id_vars, value_vars=value_vars)

    def test_custom_var_name(self):
        result5 = self.df.melt(var_name=self.var_name)
        assert result5.columns.tolist() == ['var', 'value']

        result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name)
        assert result6.columns.tolist() == ['id1', 'var', 'value']

        result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name)
        assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value']

        result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
                               var_name=self.var_name)
        assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value']

        result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                               var_name=self.var_name)
        expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                               'id2': self.df['id2'].tolist() * 2,
                               self.var_name: ['A'] * 10 + ['B'] * 10,
                               'value': (self.df['A'].tolist() +
                                         self.df['B'].tolist())},
                              columns=['id1', 'id2', self.var_name, 'value'])
        tm.assert_frame_equal(result9, expected9)

    def test_custom_value_name(self):
        result10 = self.df.melt(value_name=self.value_name)
        assert result10.columns.tolist() == ['variable', 'val']

        result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name)
        assert result11.columns.tolist() == ['id1', 'variable', 'val']

        result12 = self.df.melt(id_vars=['id1', 'id2'],
                                value_name=self.value_name)
        assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val']

        result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
                                value_name=self.value_name)
        assert result13.columns.tolist() == ['id1', 'id2', 'variable', 'val']

        result14 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                                value_name=self.value_name)
        expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                                'id2': self.df['id2'].tolist() * 2,
                                'variable': ['A'] * 10 + ['B'] * 10,
                                self.value_name: (self.df['A'].tolist() +
                                                  self.df['B'].tolist())},
                               columns=['id1', 'id2', 'variable',
                                        self.value_name])
        tm.assert_frame_equal(result14, expected14)

    def test_custom_var_and_value_name(self):

        result15 = self.df.melt(var_name=self.var_name,
                                value_name=self.value_name)
        assert result15.columns.tolist() == ['var', 'val']

        result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name,
                                value_name=self.value_name)
        assert result16.columns.tolist() == ['id1', 'var', 'val']

        result17 = self.df.melt(id_vars=['id1', 'id2'],
                                var_name=self.var_name,
                                value_name=self.value_name)
        assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val']

        result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
                                var_name=self.var_name,
                                value_name=self.value_name)
        assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val']

        result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                                var_name=self.var_name,
                                value_name=self.value_name)
        expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                                'id2': self.df['id2'].tolist() * 2,
                                self.var_name: ['A'] * 10 + ['B'] * 10,
                                self.value_name: (self.df['A'].tolist() +
                                                  self.df['B'].tolist())},
                               columns=['id1', 'id2', self.var_name,
                                        self.value_name])
        tm.assert_frame_equal(result19, expected19)

        # A named columns Index supplies the default var_name.
        df20 = self.df.copy()
        df20.columns.name = 'foo'
        result20 = df20.melt()
        assert result20.columns.tolist() == ['foo', 'value']

    def test_col_level(self):
        res1 = self.df1.melt(col_level=0)
        res2 = self.df1.melt(col_level='CAP')
        assert res1.columns.tolist() == ['CAP', 'value']
        assert res2.columns.tolist() == ['CAP', 'value']

    def test_multiindex(self):
        res = self.df1.melt()
        assert res.columns.tolist() == ['CAP', 'low', 'value']

    @pytest.mark.parametrize("col", [
        pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')),
        pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
        pd.Series([0, 1, 0, 0, 0])])
    def test_pandas_dtypes(self, col):
        # GH 15785: extension dtypes survive melting.
        df = DataFrame({'klass': range(5),
                        'col': col,
                        'attr1': [1, 0, 0, 0, 0],
                        'attr2': col})
        expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col],
                                   ignore_index=True)
        result = melt(df, id_vars=['klass', 'col'], var_name='attribute',
                      value_name='value')
        expected = DataFrame({0: list(range(5)) * 2,
                              1: pd.concat([col] * 2, ignore_index=True),
                              2: ['attr1'] * 5 + ['attr2'] * 5,
                              3: expected_value})
        expected.columns = ['klass', 'col', 'attribute', 'value']
        tm.assert_frame_equal(result, expected)

    def test_melt_missing_columns_raises(self):
        # GH-23575
        # This test is to ensure that pandas raises an error if melting is
        # attempted with column names absent from the dataframe

        # Generate data
        df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd'))

        # Try to melt with missing `value_vars` column name
        msg = "The following '{Var}' are not present in the DataFrame: {Col}"
        with pytest.raises(
                KeyError,
                match=msg.format(Var='value_vars', Col="\\['C'\\]")):
            df.melt(['a', 'b'], ['C', 'd'])

        # Try to melt with missing `id_vars` column name
        with pytest.raises(
                KeyError,
                match=msg.format(Var='id_vars', Col="\\['A'\\]")):
            df.melt(['A', 'b'], ['c', 'd'])

        # Multiple missing
        with pytest.raises(
                KeyError,
                match=msg.format(Var='id_vars',
                                 Col="\\['not_here', 'or_there'\\]")):
            df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd'])

        # Multiindex melt fails if column is missing from multilevel melt
        multi = df.copy()
        multi.columns = [list('ABCD'), list('abcd')]
        with pytest.raises(
                KeyError,
                match=msg.format(Var='id_vars',
                                 Col="\\['E'\\]")):
            multi.melt([('E', 'a')], [('B', 'b')])
        # Multiindex fails if column is missing from single level melt
        with pytest.raises(
                KeyError,
                match=msg.format(Var='value_vars',
                                 Col="\\['F'\\]")):
            multi.melt(['A'], ['F'], col_level=0)
|
||||
|
||||
|
||||
class TestLreshape(object):
    """lreshape: long-to-wide column groups with and without NaN dropping."""

    def test_pairs(self):
        data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                            '11jan2009'],
                'birthwt': [1766, 3301, 1454, 3139, 4133],
                'id': [101, 102, 103, 104, 105],
                'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
                'visitdt1': ['11jan2009', '22dec2008', '04jan2009',
                             '29dec2008', '20jan2009'],
                'visitdt2':
                ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'],
                'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
                'wt1': [1823, 3338, 1549, 3298, 4306],
                'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
                'wt3': [2293.0, nan, nan, 3377.0, 4805.0]}

        df = DataFrame(data)

        # Default: rows where the stacked columns are all-NaN are dropped.
        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
                'wt': ['wt%d' % i for i in range(1, 4)]}
        result = lreshape(df, spec)

        exp_data = {'birthdt':
                    ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                     '11jan2009', '08jan2009', '30dec2008', '21dec2008',
                     '11jan2009', '08jan2009', '21dec2008', '11jan2009'],
                    'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139,
                                4133, 1766, 3139, 4133],
                    'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101,
                           104, 105],
                    'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Male',
                            'Female', 'Female'],
                    'visitdt': ['11jan2009', '22dec2008', '04jan2009',
                                '29dec2008', '20jan2009', '21jan2009',
                                '22jan2009', '31dec2008', '03feb2009',
                                '05feb2009', '02jan2009', '15feb2009'],
                    'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
                           1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]}
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        # dropna=False keeps every (id, visit) combination.
        result = lreshape(df, spec, dropna=False)
        exp_data = {'birthdt':
                    ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                     '11jan2009', '08jan2009', '20dec2008', '30dec2008',
                     '21dec2008', '11jan2009', '08jan2009', '20dec2008',
                     '30dec2008', '21dec2008', '11jan2009'],
                    'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454,
                                3139, 4133, 1766, 3301, 1454, 3139, 4133],
                    'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105,
                           101, 102, 103, 104, 105],
                    'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Female'],
                    'visitdt': ['11jan2009', '22dec2008', '04jan2009',
                                '29dec2008', '20jan2009', '21jan2009', nan,
                                '22jan2009', '31dec2008', '03feb2009',
                                '05feb2009', nan, nan, '02jan2009',
                                '15feb2009'],
                    'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan,
                           1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0,
                           4805.0]}
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        # Column groups of unequal length cannot be reshaped together.
        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)],
                'wt': ['wt%d' % i for i in range(1, 4)]}
        msg = "All column lists must be same length"
        with pytest.raises(ValueError, match=msg):
            lreshape(df, spec)
|
||||
|
||||
|
||||
class TestWideToLong(object):
    """Tests for pd.wide_to_long: reshaping a wide DataFrame whose columns are
    stubname+suffix (e.g. A1970, A1980) into a long frame indexed by the id
    column(s) ``i`` and the extracted suffix column ``j``."""

    def test_simple(self):
        # Canonical example: stubs A/B with year suffixes 1970/1980.
        np.random.seed(123)
        x = np.random.randn(3)
        df = pd.DataFrame({"A1970": {0: "a",
                                     1: "b",
                                     2: "c"},
                           "A1980": {0: "d",
                                     1: "e",
                                     2: "f"},
                           "B1970": {0: 2.5,
                                     1: 1.2,
                                     2: .7},
                           "B1980": {0: 3.2,
                                     1: 1.3,
                                     2: .1},
                           "X": dict(zip(
                               range(3), x))})
        df["id"] = df.index
        # Non-stub column X is repeated once per suffix value.
        exp_data = {"X": x.tolist() + x.tolist(),
                    "A": ['a', 'b', 'c', 'd', 'e', 'f'],
                    "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
                    "year": [1970, 1970, 1970, 1980, 1980, 1980],
                    "id": [0, 1, 2, 0, 1, 2]}
        expected = DataFrame(exp_data)
        expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
        result = wide_to_long(df, ["A", "B"], i="id", j="year")
        tm.assert_frame_equal(result, expected)

    def test_stubs(self):
        # GH9204: wide_to_long must not mutate the caller's stubnames list.
        df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
        df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2']
        stubs = ['inc', 'edu']

        # TODO: unused?
        df_long = pd.wide_to_long(df, stubs, i='id', j='age')  # noqa

        assert stubs == ['inc', 'edu']

    def test_separating_character(self):
        # GH14779: columns like "A.1970" split on an explicit sep.
        np.random.seed(123)
        x = np.random.randn(3)
        df = pd.DataFrame({"A.1970": {0: "a",
                                      1: "b",
                                      2: "c"},
                           "A.1980": {0: "d",
                                      1: "e",
                                      2: "f"},
                           "B.1970": {0: 2.5,
                                      1: 1.2,
                                      2: .7},
                           "B.1980": {0: 3.2,
                                      1: 1.3,
                                      2: .1},
                           "X": dict(zip(
                               range(3), x))})
        df["id"] = df.index
        exp_data = {"X": x.tolist() + x.tolist(),
                    "A": ['a', 'b', 'c', 'd', 'e', 'f'],
                    "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
                    "year": [1970, 1970, 1970, 1980, 1980, 1980],
                    "id": [0, 1, 2, 0, 1, 2]}
        expected = DataFrame(exp_data)
        expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
        result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
        tm.assert_frame_equal(result, expected)

    def test_escapable_characters(self):
        # Stubnames containing regex metacharacters (parentheses) must be
        # escaped internally rather than treated as a pattern.
        np.random.seed(123)
        x = np.random.randn(3)
        df = pd.DataFrame({"A(quarterly)1970": {0: "a",
                                                1: "b",
                                                2: "c"},
                           "A(quarterly)1980": {0: "d",
                                                1: "e",
                                                2: "f"},
                           "B(quarterly)1970": {0: 2.5,
                                                1: 1.2,
                                                2: .7},
                           "B(quarterly)1980": {0: 3.2,
                                                1: 1.3,
                                                2: .1},
                           "X": dict(zip(
                               range(3), x))})
        df["id"] = df.index
        exp_data = {"X": x.tolist() + x.tolist(),
                    "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
                    "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
                    "year": [1970, 1970, 1970, 1980, 1980, 1980],
                    "id": [0, 1, 2, 0, 1, 2]}
        expected = DataFrame(exp_data)
        expected = expected.set_index(
            ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
        result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
                              i="id", j="year")
        tm.assert_frame_equal(result, expected)

    def test_unbalanced(self):
        # test that we can have a varying amount of time variables
        # (B has no 2011 column, so B is NaN for year 2011).
        df = pd.DataFrame({'A2010': [1.0, 2.0],
                           'A2011': [3.0, 4.0],
                           'B2010': [5.0, 6.0],
                           'X': ['X1', 'X2']})
        df['id'] = df.index
        exp_data = {'X': ['X1', 'X1', 'X2', 'X2'],
                    'A': [1.0, 3.0, 2.0, 4.0],
                    'B': [5.0, np.nan, 6.0, np.nan],
                    'id': [0, 0, 1, 1],
                    'year': [2010, 2011, 2010, 2011]}
        expected = pd.DataFrame(exp_data)
        expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
        result = wide_to_long(df, ['A', 'B'], i='id', j='year')
        tm.assert_frame_equal(result, expected)

    def test_character_overlap(self):
        # Test we handle overlapping characters in both id_vars and value_vars
        # (stub 'B' is a prefix of stub 'BB' and of id columns 'BBBX'/'BBBZ').
        df = pd.DataFrame({
            'A11': ['a11', 'a22', 'a33'],
            'A12': ['a21', 'a22', 'a23'],
            'B11': ['b11', 'b12', 'b13'],
            'B12': ['b21', 'b22', 'b23'],
            'BB11': [1, 2, 3],
            'BB12': [4, 5, 6],
            'BBBX': [91, 92, 93],
            'BBBZ': [91, 92, 93]
        })
        df['id'] = df.index
        expected = pd.DataFrame({
            'BBBX': [91, 92, 93, 91, 92, 93],
            'BBBZ': [91, 92, 93, 91, 92, 93],
            'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
            'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
            'BB': [1, 2, 3, 4, 5, 6],
            'id': [0, 1, 2, 0, 1, 2],
            'year': [11, 11, 11, 12, 12, 12]})
        expected = expected.set_index(['id', 'year'])[
            ['BBBX', 'BBBZ', 'A', 'B', 'BB']]
        result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
        tm.assert_frame_equal(result.sort_index(axis=1),
                              expected.sort_index(axis=1))

    def test_invalid_separator(self):
        # if an invalid separator is supplied a empty data frame is returned
        sep = 'nope!'
        df = pd.DataFrame({'A2010': [1.0, 2.0],
                           'A2011': [3.0, 4.0],
                           'B2010': [5.0, 6.0],
                           'X': ['X1', 'X2']})
        df['id'] = df.index
        exp_data = {'X': '',
                    'A2010': [],
                    'A2011': [],
                    'B2010': [],
                    'id': [],
                    'year': [],
                    'A': [],
                    'B': []}
        expected = pd.DataFrame(exp_data).astype({'year': 'int'})
        expected = expected.set_index(['id', 'year'])[[
            'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
        # index levels must still reflect the original id values
        expected.index.set_levels([0, 1], level=0, inplace=True)
        result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
        tm.assert_frame_equal(result.sort_index(axis=1),
                              expected.sort_index(axis=1))

    def test_num_string_disambiguation(self):
        # Test that we can disambiguate number value_vars from
        # string value_vars ('Arating' must not be consumed by stub 'A').
        df = pd.DataFrame({
            'A11': ['a11', 'a22', 'a33'],
            'A12': ['a21', 'a22', 'a23'],
            'B11': ['b11', 'b12', 'b13'],
            'B12': ['b21', 'b22', 'b23'],
            'BB11': [1, 2, 3],
            'BB12': [4, 5, 6],
            'Arating': [91, 92, 93],
            'Arating_old': [91, 92, 93]
        })
        df['id'] = df.index
        expected = pd.DataFrame({
            'Arating': [91, 92, 93, 91, 92, 93],
            'Arating_old': [91, 92, 93, 91, 92, 93],
            'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
            'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
            'BB': [1, 2, 3, 4, 5, 6],
            'id': [0, 1, 2, 0, 1, 2],
            'year': [11, 11, 11, 12, 12, 12]})
        expected = expected.set_index(['id', 'year'])[
            ['Arating', 'Arating_old', 'A', 'B', 'BB']]
        result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
        tm.assert_frame_equal(result.sort_index(axis=1),
                              expected.sort_index(axis=1))

    def test_invalid_suffixtype(self):
        # If all stubs names end with a string, but a numeric suffix is
        # assumed, an empty data frame is returned
        df = pd.DataFrame({'Aone': [1.0, 2.0],
                           'Atwo': [3.0, 4.0],
                           'Bone': [5.0, 6.0],
                           'X': ['X1', 'X2']})
        df['id'] = df.index
        exp_data = {'X': '',
                    'Aone': [],
                    'Atwo': [],
                    'Bone': [],
                    'id': [],
                    'year': [],
                    'A': [],
                    'B': []}
        expected = pd.DataFrame(exp_data).astype({'year': 'int'})

        expected = expected.set_index(['id', 'year'])
        expected.index.set_levels([0, 1], level=0, inplace=True)
        result = wide_to_long(df, ['A', 'B'], i='id', j='year')
        tm.assert_frame_equal(result.sort_index(axis=1),
                              expected.sort_index(axis=1))

    def test_multiple_id_columns(self):
        # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
        df = pd.DataFrame({
            'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
            'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
            'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
            'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
        })
        expected = pd.DataFrame({
            'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8,
                   2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
            'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
            'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
            'age': [1, 2, 1, 2, 1, 2, 1, 2, 1,
                    2, 1, 2, 1, 2, 1, 2, 1, 2]
        })
        expected = expected.set_index(['famid', 'birth', 'age'])[['ht']]
        result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
        tm.assert_frame_equal(result, expected)

    def test_non_unique_idvars(self):
        # GH16382
        # Raise an error message if non unique id vars (i) are passed
        df = pd.DataFrame({
            'A_A1': [1, 2, 3, 4, 5],
            'B_B1': [1, 2, 3, 4, 5],
            'x': [1, 1, 1, 1, 1]
        })
        msg = "the id variables need to uniquely identify each row"
        with pytest.raises(ValueError, match=msg):
            wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')

    def test_cast_j_int(self):
        # Numeric suffixes parsed from strings are cast to int in index level j.
        df = pd.DataFrame({
            'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'],
            'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
            'actor_fb_likes_1': [1000.0, 40000.0, 11000.0],
            'actor_fb_likes_2': [936.0, 5000.0, 393.0],
            'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']})

        expected = pd.DataFrame({
            'actor': ['CCH Pounder',
                      'Johnny Depp',
                      'Christoph Waltz',
                      'Joel David Moore',
                      'Orlando Bloom',
                      'Rory Kinnear'],
            'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
            'num': [1, 1, 1, 2, 2, 2],
            'title': ['Avatar',
                      'Pirates of the Caribbean',
                      'Spectre',
                      'Avatar',
                      'Pirates of the Caribbean',
                      'Spectre']}).set_index(['title', 'num'])
        result = wide_to_long(df, ['actor', 'actor_fb_likes'],
                              i='title', j='num', sep='_')

        tm.assert_frame_equal(result, expected)

    def test_identical_stubnames(self):
        # A stubname equal to an existing column name is ambiguous -> error.
        df = pd.DataFrame({'A2010': [1.0, 2.0],
                           'A2011': [3.0, 4.0],
                           'B2010': [5.0, 6.0],
                           'A': ['X1', 'X2']})
        msg = "stubname can't be identical to a column name"
        with pytest.raises(ValueError, match=msg):
            wide_to_long(df, ['A', 'B'], i='A', j='colname')

    def test_nonnumeric_suffix(self):
        # suffix regex '[a-z]+' allows purely textual suffixes.
        df = pd.DataFrame({'treatment_placebo': [1.0, 2.0],
                           'treatment_test': [3.0, 4.0],
                           'result_placebo': [5.0, 6.0],
                           'A': ['X1', 'X2']})
        expected = pd.DataFrame({
            'A': ['X1', 'X1', 'X2', 'X2'],
            'colname': ['placebo', 'test', 'placebo', 'test'],
            'result': [5.0, np.nan, 6.0, np.nan],
            'treatment': [1.0, 3.0, 2.0, 4.0]})
        expected = expected.set_index(['A', 'colname'])
        result = wide_to_long(df, ['result', 'treatment'],
                              i='A', j='colname', suffix='[a-z]+', sep='_')
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_suffix(self):
        # suffix '.+' matches both numeric ('1') and textual ('foo') suffixes;
        # suffixes stay as strings in that case.
        df = pd.DataFrame({
            'A': ['X1', 'X2'],
            'result_1': [0, 9],
            'result_foo': [5.0, 6.0],
            'treatment_1': [1.0, 2.0],
            'treatment_foo': [3.0, 4.0]})
        expected = pd.DataFrame({
            'A': ['X1', 'X2', 'X1', 'X2'],
            'colname': ['1', '1', 'foo', 'foo'],
            'result': [0.0, 9.0, 5.0, 6.0],
            'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname'])
        result = wide_to_long(df, ['result', 'treatment'],
                              i='A', j='colname', suffix='.+', sep='_')
        tm.assert_frame_equal(result, expected)

    def test_float_suffix(self):
        # suffix '[0-9.]+' parses mixed int/float suffixes as floats.
        df = pd.DataFrame({
            'treatment_1.1': [1.0, 2.0],
            'treatment_2.1': [3.0, 4.0],
            'result_1.2': [5.0, 6.0],
            'result_1': [0, 9],
            'A': ['X1', 'X2']})
        expected = pd.DataFrame({
            'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'],
            'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
            'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
            'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]})
        expected = expected.set_index(['A', 'colname'])
        result = wide_to_long(df, ['result', 'treatment'],
                              i='A', j='colname', suffix='[0-9.]+', sep='_')
        tm.assert_frame_equal(result, expected)

    def test_col_substring_of_stubname(self):
        # GH22468
        # Don't raise ValueError when a column name is a substring
        # of a stubname that's been passed as a string
        wide_data = {'node_id': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
                     'A': {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81},
                     'PA0': {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6},
                     'PA1': {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67},
                     'PA3': {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67}
                     }
        wide_df = pd.DataFrame.from_dict(wide_data)
        # Passing the stub as a one-element list and as a bare string must
        # give identical results.
        expected = pd.wide_to_long(wide_df,
                                   stubnames=['PA'],
                                   i=['node_id', 'A'],
                                   j='time')
        result = pd.wide_to_long(wide_df,
                                 stubnames='PA',
                                 i=['node_id', 'A'],
                                 j='time')
        tm.assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,199 @@
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import zip
|
||||
|
||||
from pandas import (
|
||||
Categorical, DatetimeIndex, Interval, IntervalIndex, NaT, Series,
|
||||
TimedeltaIndex, Timestamp, cut, date_range, isna, qcut, timedelta_range)
|
||||
from pandas.api.types import CategoricalDtype as CDT
|
||||
from pandas.core.algorithms import quantile
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.tseries.offsets import Day, Nano
|
||||
|
||||
|
||||
def test_qcut():
    """qcut with an integer bin count should place edges at the quantiles."""
    arr = np.random.randn(1000)

    # We store the bins as Index that have been
    # rounded to comparisons are a bit tricky.
    labels, bins = qcut(arr, 4, retbins=True)
    ex_bins = quantile(arr, [0, .25, .5, .75, 1.])

    # Left edges match the lower quantiles (within rounding tolerance).
    result = labels.categories.left.values
    assert np.allclose(result, ex_bins[:-1], atol=1e-2)

    # Right edges match the upper quantiles.
    result = labels.categories.right.values
    assert np.allclose(result, ex_bins[1:], atol=1e-2)

    # qcut over quantile edges is equivalent to cut with those edges.
    ex_levels = cut(arr, ex_bins, include_lowest=True)
    tm.assert_categorical_equal(labels, ex_levels)
|
||||
|
||||
|
||||
def test_qcut_bounds():
    """With ``labels=False``, qcut emits exactly one integer code per bin."""
    sample = np.random.randn(1000)
    codes = qcut(sample, 10, labels=False)
    distinct_codes = np.unique(codes)
    assert distinct_codes.size == 10
|
||||
|
||||
|
||||
def test_qcut_specify_quantiles():
    """Explicit quartile edges must equal qcut with an integer bin count of 4."""
    arr = np.random.randn(100)
    factor = qcut(arr, [0, .25, .5, .75, 1.])

    expected = qcut(arr, 4)
    tm.assert_categorical_equal(factor, expected)


def test_qcut_all_bins_same():
    """All-identical input collapses every quantile edge -> must raise."""
    with pytest.raises(ValueError, match="edges.*unique"):
        qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)


def test_qcut_include_lowest():
    """The first interval is nudged slightly below the minimum so the lowest
    value is included (left edge -0.001 for data starting at 0)."""
    values = np.arange(10)
    ii = qcut(values, 4)

    ex_levels = IntervalIndex([Interval(-0.001, 2.25), Interval(2.25, 4.5),
                               Interval(4.5, 6.75), Interval(6.75, 9)])
    tm.assert_index_equal(ii.categories, ex_levels)


def test_qcut_nas():
    """NaN input values stay NaN in the binned output."""
    arr = np.random.randn(100)
    arr[:20] = np.nan

    result = qcut(arr, 4)
    assert isna(result[:20]).all()


def test_qcut_index():
    """Two values / two bins produces one interval per value."""
    result = qcut([0, 2], 2)
    intervals = [Interval(-0.001, 1), Interval(1, 2)]

    expected = Categorical(intervals, ordered=True)
    tm.assert_categorical_equal(result, expected)
|
||||
|
||||
|
||||
def test_qcut_binning_issues(datapath):
    # see gh-1978, gh-1979
    # Regression data file exercises edge rounding: bins must be non-empty,
    # strictly increasing, and non-overlapping.
    cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv"))
    arr = np.loadtxt(cut_file)
    result = qcut(arr, 20)

    starts = []
    ends = []

    for lev in np.unique(result):
        s = lev.left
        e = lev.right
        # No degenerate (zero-width) interval.
        assert s != e

        starts.append(float(s))
        ends.append(float(e))

    # Consecutive intervals are ordered and do not overlap.
    for (sp, sn), (ep, en) in zip(zip(starts[:-1], starts[1:]),
                                  zip(ends[:-1], ends[1:])):
        assert sp < sn
        assert ep < en
        assert ep <= sn


def test_qcut_return_intervals():
    """Default qcut output is an ordered Categorical of Interval objects."""
    ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
    res = qcut(ser, [0, 0.333, 0.666, 1])

    exp_levels = np.array([Interval(-0.001, 2.664),
                           Interval(2.664, 5.328), Interval(5.328, 8)])
    exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
        CDT(ordered=True))
    tm.assert_series_equal(res, exp)


@pytest.mark.parametrize("kwargs,msg", [
    (dict(duplicates="drop"), None),
    (dict(), "Bin edges must be unique"),
    (dict(duplicates="raise"), "Bin edges must be unique"),
    (dict(duplicates="foo"), "invalid value for 'duplicates' parameter")
])
def test_qcut_duplicates_bin(kwargs, msg):
    # see gh-7751
    # Repeated values collapse quantile edges; behavior is governed by the
    # `duplicates` keyword ("raise" is the default, "drop" merges bins).
    values = [0, 0, 0, 0, 1, 2, 3]

    if msg is not None:
        with pytest.raises(ValueError, match=msg):
            qcut(values, 3, **kwargs)
    else:
        result = qcut(values, 3, **kwargs)
        expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
        tm.assert_index_equal(result.categories, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,start,end", [
    (9.0, 8.999, 9.0),
    (0.0, -0.001, 0.0),
    (-9.0, -9.001, -9.0),
])
@pytest.mark.parametrize("length", [1, 2])
@pytest.mark.parametrize("labels", [None, False])
def test_single_quantile(data, start, end, length, labels):
    # see gh-15431
    # A constant series with a single quantile yields one slightly widened
    # interval (or code 0 everywhere when labels=False).
    ser = Series([data] * length)
    result = qcut(ser, 1, labels=labels)

    if labels is None:
        intervals = IntervalIndex([Interval(start, end)] *
                                  length, closed="right")
        expected = Series(intervals).astype(CDT(ordered=True))
    else:
        expected = Series([0] * length)

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("ser", [
    Series(DatetimeIndex(["20180101", NaT, "20180103"])),
    Series(TimedeltaIndex(["0 days", NaT, "2 days"]))],
    ids=lambda x: str(x.dtype))
def test_qcut_nat(ser):
    # see gh-19768
    # NaT entries map to NaN bins; datetime/timedelta edges are nudged by
    # one nanosecond on the left, mirroring the numeric -0.001 adjustment.
    intervals = IntervalIndex.from_tuples([
        (ser[0] - Nano(), ser[2] - Day()),
        np.nan, (ser[2] - Day(), ser[2])])
    expected = Series(Categorical(intervals, ordered=True))

    result = qcut(ser, 2)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)])
def test_datetime_tz_qcut(bins):
    # see gh-19872
    # Timezone-aware datetimes keep their tz in the resulting intervals,
    # whether bins are given as a count or as explicit quantiles.
    tz = "US/Eastern"
    ser = Series(date_range("20130101", periods=3, tz=tz))

    result = qcut(ser, bins)
    expected = Series(IntervalIndex([
        Interval(Timestamp("2012-12-31 23:59:59.999999999", tz=tz),
                 Timestamp("2013-01-01 16:00:00", tz=tz)),
        Interval(Timestamp("2013-01-01 16:00:00", tz=tz),
                 Timestamp("2013-01-02 08:00:00", tz=tz)),
        Interval(Timestamp("2013-01-02 08:00:00", tz=tz),
                 Timestamp("2013-01-03 00:00:00", tz=tz))])).astype(
        CDT(ordered=True))
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("arg,expected_bins", [
    [timedelta_range("1day", periods=3),
     TimedeltaIndex(["1 days", "2 days", "3 days"])],
    [date_range("20180101", periods=3),
     DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"])]])
def test_date_like_qcut_bins(arg, expected_bins):
    # see gh-19891
    # retbins=True returns datetime-like bins with the input's own index type.
    ser = Series(arg)
    result, result_bins = qcut(ser, 2, retbins=True)
    tm.assert_index_equal(result_bins, expected_bins)
|
||||
@@ -0,0 +1,621 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# pylint: disable-msg=W0612,E1101
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
from numpy import nan
|
||||
import pytest
|
||||
|
||||
from pandas.compat import u
|
||||
|
||||
from pandas.core.dtypes.common import is_integer_dtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import Categorical, DataFrame, Index, Series, get_dummies
|
||||
from pandas.core.sparse.api import SparseArray, SparseDtype
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
|
||||
class TestGetDummies(object):
|
||||
|
||||
    @pytest.fixture
    def df(self):
        # Small mixed frame: two object columns to dummify plus one numeric
        # pass-through column.
        return DataFrame({'A': ['a', 'b', 'a'],
                          'B': ['b', 'b', 'c'],
                          'C': [1, 2, 3]})

    @pytest.fixture(params=['uint8', 'i8', np.float64, bool, None])
    def dtype(self, request):
        # Requested output dtype for the dummy columns; None means the
        # get_dummies default.
        return np.dtype(request.param)

    @pytest.fixture(params=['dense', 'sparse'])
    def sparse(self, request):
        # params are strings to simplify reading test results,
        # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
        return request.param == 'sparse'

    def effective_dtype(self, dtype):
        # Resolve the dtype fixture: None falls back to uint8, the
        # get_dummies default output dtype.
        if dtype is None:
            return np.uint8
        return dtype

    def test_raises_on_dtype_object(self, df):
        # object is not a valid dummy dtype.
        with pytest.raises(ValueError):
            get_dummies(df, dtype='object')

    def test_basic(self, sparse, dtype):
        # One indicator column per distinct value, for list, Series and
        # Series-with-custom-index inputs alike.
        s_list = list('abc')
        s_series = Series(s_list)
        s_series_index = Series(s_list, list('ABC'))

        expected = DataFrame({'a': [1, 0, 0],
                              'b': [0, 1, 0],
                              'c': [0, 0, 1]},
                             dtype=self.effective_dtype(dtype))
        if sparse:
            expected = expected.apply(pd.SparseArray, fill_value=0.0)
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        assert_frame_equal(result, expected)

        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        assert_frame_equal(result, expected)

        # A custom Series index is carried through to the dummy frame.
        expected.index = list('ABC')
        result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_basic_types(self, sparse, dtype):
        # GH 10531
        # Resulting dtypes must honor the requested dtype (and sparse storage)
        # for list, Series and DataFrame inputs.
        s_list = list('abc')
        s_series = Series(s_list)
        s_df = DataFrame({'a': [0, 1, 0, 1, 2],
                          'b': ['A', 'A', 'B', 'C', 'C'],
                          'c': [2, 3, 3, 3, 2]})

        expected = DataFrame({'a': [1, 0, 0],
                              'b': [0, 1, 0],
                              'c': [0, 0, 1]},
                             dtype=self.effective_dtype(dtype),
                             columns=list('abc'))
        if sparse:
            # fill_value must match the dtype family of the dummies.
            if is_integer_dtype(dtype):
                fill_value = 0
            elif dtype == bool:
                fill_value = False
            else:
                fill_value = 0.0

            expected = expected.apply(SparseArray, fill_value=fill_value)
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)

        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)

        result = get_dummies(s_df, columns=s_df.columns,
                             sparse=sparse, dtype=dtype)
        if sparse:
            dtype_name = 'Sparse[{}, {}]'.format(
                self.effective_dtype(dtype).name,
                fill_value
            )
        else:
            dtype_name = self.effective_dtype(dtype).name

        # Dummifying every column yields 8 columns of the requested dtype.
        expected = Series({dtype_name: 8})
        tm.assert_series_equal(result.get_dtype_counts(), expected)

        # Dummifying only 'a' leaves 'b' (object) and 'c' (int64) untouched.
        result = get_dummies(s_df, columns=['a'], sparse=sparse, dtype=dtype)

        expected_counts = {'int64': 1, 'object': 1}
        expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)

        expected = Series(expected_counts).sort_index()
        tm.assert_series_equal(result.get_dtype_counts().sort_index(),
                               expected)

    def test_just_na(self, sparse):
        # All-NaN input produces an empty dummy frame but keeps the index.
        just_na_list = [np.nan]
        just_na_series = Series(just_na_list)
        just_na_series_index = Series(just_na_list, index=['A'])

        res_list = get_dummies(just_na_list, sparse=sparse)
        res_series = get_dummies(just_na_series, sparse=sparse)
        res_series_index = get_dummies(just_na_series_index, sparse=sparse)

        assert res_list.empty
        assert res_series.empty
        assert res_series_index.empty

        assert res_list.index.tolist() == [0]
        assert res_series.index.tolist() == [0]
        assert res_series_index.index.tolist() == ['A']
|
||||
|
||||
    def test_include_na(self, sparse, dtype):
        # By default NaN rows get all-zero dummies; with dummy_na=True a
        # dedicated NaN indicator column is appended.
        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=sparse, dtype=dtype)
        exp = DataFrame({'a': [1, 0, 0],
                         'b': [0, 1, 0]},
                        dtype=self.effective_dtype(dtype))
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see #GH8822
        res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
        exp_na = DataFrame({nan: [0, 0, 1],
                            'a': [1, 0, 0],
                            'b': [0, 1, 0]},
                           dtype=self.effective_dtype(dtype))
        exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        if sparse:
            exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True,
                                  sparse=sparse, dtype=dtype)
        exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                                dtype=self.effective_dtype(dtype))
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)

    def test_unicode(self, sparse):
        # See GH 6885 - get_dummies chokes on unicode values
        import unicodedata
        e = 'e'
        eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
        s = [e, eacute, eacute]
        res = get_dummies(s, prefix='letter', sparse=sparse)
        exp = DataFrame({'letter_e': [1, 0, 0],
                         u('letter_%s') % eacute: [0, 1, 1]},
                        dtype=np.uint8)
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0)
        assert_frame_equal(res, exp)
|
||||
|
||||
    def test_dataframe_dummies_all_obj(self, df, sparse):
        # All-object frame: every column is dummified, names are
        # '<col>_<value>'.
        df = df[['A', 'B']]
        result = get_dummies(df, sparse=sparse)
        expected = DataFrame({'A_a': [1, 0, 1],
                              'A_b': [0, 1, 0],
                              'B_b': [1, 1, 0],
                              'B_c': [0, 0, 1]},
                             dtype=np.uint8)
        if sparse:
            expected = pd.DataFrame({
                "A_a": pd.SparseArray([1, 0, 1], dtype='uint8'),
                "A_b": pd.SparseArray([0, 1, 0], dtype='uint8'),
                "B_b": pd.SparseArray([1, 1, 0], dtype='uint8'),
                "B_c": pd.SparseArray([0, 0, 1], dtype='uint8'),
            })

        assert_frame_equal(result, expected)

    def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
        # Mixed frame: only object columns are dummified; numeric 'C' is
        # passed through first.
        result = get_dummies(df, sparse=sparse, dtype=dtype)
        if sparse:
            arr = SparseArray
            typ = SparseDtype(dtype, 0)
        else:
            arr = np.array
            typ = dtype
        expected = DataFrame({'C': [1, 2, 3],
                              'A_a': arr([1, 0, 1], dtype=typ),
                              'A_b': arr([0, 1, 0], dtype=typ),
                              'B_b': arr([1, 1, 0], dtype=typ),
                              'B_c': arr([0, 0, 1], dtype=typ)})
        expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_list(self, df, sparse):
        # A list of prefixes maps positionally onto the dummified columns.
        prefixes = ['from_A', 'from_B']
        result = get_dummies(df, prefix=prefixes, sparse=sparse)
        expected = DataFrame({'C': [1, 2, 3],
                              'from_A_a': [1, 0, 1],
                              'from_A_b': [0, 1, 0],
                              'from_B_b': [1, 1, 0],
                              'from_B_c': [0, 0, 1]},
                             dtype=np.uint8)
        expected[['C']] = df[['C']]
        cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
        expected = expected[['C'] + cols]

        typ = pd.SparseArray if sparse else pd.Series
        expected[cols] = expected[cols].apply(lambda x: typ(x))
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_str(self, df, sparse):
        # not that you should do this...
        # A single string prefix is applied to every column, which can
        # produce duplicate column names (bad_b appears twice).
        result = get_dummies(df, prefix='bad', sparse=sparse)
        bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c']
        expected = DataFrame([[1, 1, 0, 1, 0],
                              [2, 0, 1, 1, 0],
                              [3, 1, 0, 0, 1]],
                             columns=['C'] + bad_columns,
                             dtype=np.uint8)
        expected = expected.astype({"C": np.int64})
        if sparse:
            # work around astyping & assigning with duplicate columns
            # https://github.com/pandas-dev/pandas/issues/14427
            expected = pd.concat([
                pd.Series([1, 2, 3], name='C'),
                pd.Series([1, 0, 1], name='bad_a', dtype='Sparse[uint8]'),
                pd.Series([0, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
                pd.Series([1, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
                pd.Series([0, 0, 1], name='bad_c', dtype='Sparse[uint8]'),
            ], axis=1)

        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_dataframe_dummies_subset(self, df, sparse):
        # columns= restricts dummification to the listed columns only.
        result = get_dummies(df, prefix=['from_A'], columns=['A'],
                             sparse=sparse)
        expected = DataFrame({'B': ['b', 'b', 'c'],
                              'C': [1, 2, 3],
                              'from_A_a': [1, 0, 1],
                              'from_A_b': [0, 1, 0]}, dtype=np.uint8)
        expected[['C']] = df[['C']]
        if sparse:
            cols = ['from_A_a', 'from_A_b']
            expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x))
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_sep(self, df, sparse):
        # prefix_sep accepts a single string, a positional list, or a
        # per-column dict.
        result = get_dummies(df, prefix_sep='..', sparse=sparse)
        expected = DataFrame({'C': [1, 2, 3],
                              'A..a': [1, 0, 1],
                              'A..b': [0, 1, 0],
                              'B..b': [1, 1, 0],
                              'B..c': [0, 0, 1]},
                             dtype=np.uint8)
        expected[['C']] = df[['C']]
        expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
        if sparse:
            cols = ['A..a', 'A..b', 'B..b', 'B..c']
            expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x))

        assert_frame_equal(result, expected)

        result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse)
        expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
        assert_frame_equal(result, expected)

        result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'},
                             sparse=sparse)
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
        # Prefix list shorter than the number of encoded columns -> error.
        with pytest.raises(ValueError):
            get_dummies(df, prefix=['too few'], sparse=sparse)

    def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
        # Same length check applies to prefix_sep lists.
        with pytest.raises(ValueError):
            get_dummies(df, prefix_sep=['bad'], sparse=sparse)

    def test_dataframe_dummies_prefix_dict(self, sparse):
        # Dict prefixes map by column name rather than position.
        prefixes = {'A': 'from_A', 'B': 'from_B'}
        df = DataFrame({'C': [1, 2, 3],
                        'A': ['a', 'b', 'a'],
                        'B': ['b', 'b', 'c']})
        result = get_dummies(df, prefix=prefixes, sparse=sparse)

        expected = DataFrame({'C': [1, 2, 3],
                              'from_A_a': [1, 0, 1],
                              'from_A_b': [0, 1, 0],
                              'from_B_b': [1, 1, 0],
                              'from_B_c': [0, 0, 1]})

        columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
        expected[columns] = expected[columns].astype(np.uint8)
        if sparse:
            expected[columns] = expected[columns].apply(
                lambda x: pd.SparseSeries(x)
            )

        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_dataframe_dummies_with_na(self, df, sparse, dtype):
        # dummy_na=True adds an A_nan/B_nan indicator for the NaN row;
        # dummy_na=False drops those columns again.
        df.loc[3, :] = [np.nan, np.nan, np.nan]
        result = get_dummies(df, dummy_na=True,
                             sparse=sparse, dtype=dtype).sort_index(axis=1)

        if sparse:
            arr = SparseArray
            typ = SparseDtype(dtype, 0)
        else:
            arr = np.array
            typ = dtype

        expected = DataFrame({'C': [1, 2, 3, np.nan],
                              'A_a': arr([1, 0, 1, 0], dtype=typ),
                              'A_b': arr([0, 1, 0, 0], dtype=typ),
                              'A_nan': arr([0, 0, 0, 1], dtype=typ),
                              'B_b': arr([1, 1, 0, 0], dtype=typ),
                              'B_c': arr([0, 0, 1, 0], dtype=typ),
                              'B_nan': arr([0, 0, 0, 1], dtype=typ)
                              }).sort_index(axis=1)

        assert_frame_equal(result, expected)

        result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
        expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
        # Categorical columns are dummified alongside object columns.
        df['cat'] = pd.Categorical(['x', 'y', 'y'])
        result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
        if sparse:
            arr = SparseArray
            typ = SparseDtype(dtype, 0)
        else:
            arr = np.array
            typ = dtype

        expected = DataFrame({'C': [1, 2, 3],
                              'A_a': arr([1, 0, 1], dtype=typ),
                              'A_b': arr([0, 1, 0], dtype=typ),
                              'B_b': arr([1, 1, 0], dtype=typ),
                              'B_c': arr([0, 0, 1], dtype=typ),
                              'cat_x': arr([1, 0, 0], dtype=typ),
                              'cat_y': arr([0, 1, 1], dtype=typ)
                              }).sort_index(axis=1)

        assert_frame_equal(result, expected)

    @pytest.mark.parametrize('get_dummies_kwargs,expected', [
        ({'data': pd.DataFrame(({u'ä': ['a']}))},
         pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),

        ({'data': pd.DataFrame({'x': [u'ä']})},
         pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8)),

        ({'data': pd.DataFrame({'x': [u'a']}), 'prefix':u'ä'},
         pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),

        ({'data': pd.DataFrame({'x': [u'a']}), 'prefix_sep':u'ä'},
         pd.DataFrame({u'xäa': [1]}, dtype=np.uint8))])
    def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
        # GH22084 pd.get_dummies incorrectly encodes unicode characters
        # in dataframe column names
        result = get_dummies(**get_dummies_kwargs)
        assert_frame_equal(result, expected)
|
||||
|
||||
def test_basic_drop_first(self, sparse):
|
||||
# GH12402 Add a new parameter `drop_first` to avoid collinearity
|
||||
# Basic case
|
||||
s_list = list('abc')
|
||||
s_series = Series(s_list)
|
||||
s_series_index = Series(s_list, list('ABC'))
|
||||
|
||||
expected = DataFrame({'b': [0, 1, 0],
|
||||
'c': [0, 0, 1]},
|
||||
dtype=np.uint8)
|
||||
|
||||
result = get_dummies(s_list, drop_first=True, sparse=sparse)
|
||||
if sparse:
|
||||
expected = expected.apply(pd.SparseArray, fill_value=0)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(s_series, drop_first=True, sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
expected.index = list('ABC')
|
||||
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_basic_drop_first_one_level(self, sparse):
|
||||
# Test the case that categorical variable only has one level.
|
||||
s_list = list('aaa')
|
||||
s_series = Series(s_list)
|
||||
s_series_index = Series(s_list, list('ABC'))
|
||||
|
||||
expected = DataFrame(index=np.arange(3))
|
||||
|
||||
result = get_dummies(s_list, drop_first=True, sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(s_series, drop_first=True, sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame(index=list('ABC'))
|
||||
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_basic_drop_first_NA(self, sparse):
|
||||
# Test NA handling together with drop_first
|
||||
s_NA = ['a', 'b', np.nan]
|
||||
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
|
||||
exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
|
||||
if sparse:
|
||||
exp = exp.apply(pd.SparseArray, fill_value=0)
|
||||
|
||||
assert_frame_equal(res, exp)
|
||||
|
||||
res_na = get_dummies(s_NA, dummy_na=True, drop_first=True,
|
||||
sparse=sparse)
|
||||
exp_na = DataFrame(
|
||||
{'b': [0, 1, 0],
|
||||
nan: [0, 0, 1]},
|
||||
dtype=np.uint8).reindex(['b', nan], axis=1)
|
||||
if sparse:
|
||||
exp_na = exp_na.apply(pd.SparseArray, fill_value=0)
|
||||
assert_frame_equal(res_na, exp_na)
|
||||
|
||||
res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
|
||||
sparse=sparse)
|
||||
exp_just_na = DataFrame(index=np.arange(1))
|
||||
assert_frame_equal(res_just_na, exp_just_na)
|
||||
|
||||
def test_dataframe_dummies_drop_first(self, df, sparse):
|
||||
df = df[['A', 'B']]
|
||||
result = get_dummies(df, drop_first=True, sparse=sparse)
|
||||
expected = DataFrame({'A_b': [0, 1, 0],
|
||||
'B_c': [0, 0, 1]},
|
||||
dtype=np.uint8)
|
||||
if sparse:
|
||||
expected = expected.apply(pd.SparseArray, fill_value=0)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_drop_first_with_categorical(
|
||||
self, df, sparse, dtype):
|
||||
df['cat'] = pd.Categorical(['x', 'y', 'y'])
|
||||
result = get_dummies(df, drop_first=True, sparse=sparse)
|
||||
expected = DataFrame({'C': [1, 2, 3],
|
||||
'A_b': [0, 1, 0],
|
||||
'B_c': [0, 0, 1],
|
||||
'cat_y': [0, 1, 1]})
|
||||
cols = ['A_b', 'B_c', 'cat_y']
|
||||
expected[cols] = expected[cols].astype(np.uint8)
|
||||
expected = expected[['C', 'A_b', 'B_c', 'cat_y']]
|
||||
if sparse:
|
||||
for col in cols:
|
||||
expected[col] = pd.SparseSeries(expected[col])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
|
||||
df.loc[3, :] = [np.nan, np.nan, np.nan]
|
||||
result = get_dummies(df, dummy_na=True, drop_first=True,
|
||||
sparse=sparse).sort_index(axis=1)
|
||||
expected = DataFrame({'C': [1, 2, 3, np.nan],
|
||||
'A_b': [0, 1, 0, 0],
|
||||
'A_nan': [0, 0, 0, 1],
|
||||
'B_c': [0, 0, 1, 0],
|
||||
'B_nan': [0, 0, 0, 1]})
|
||||
cols = ['A_b', 'A_nan', 'B_c', 'B_nan']
|
||||
expected[cols] = expected[cols].astype(np.uint8)
|
||||
expected = expected.sort_index(axis=1)
|
||||
if sparse:
|
||||
for col in cols:
|
||||
expected[col] = pd.SparseSeries(expected[col])
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(df, dummy_na=False, drop_first=True,
|
||||
sparse=sparse)
|
||||
expected = expected[['C', 'A_b', 'B_c']]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_int_int(self):
|
||||
data = Series([1, 2, 1])
|
||||
result = pd.get_dummies(data)
|
||||
expected = DataFrame([[1, 0],
|
||||
[0, 1],
|
||||
[1, 0]],
|
||||
columns=[1, 2],
|
||||
dtype=np.uint8)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
data = Series(pd.Categorical(['a', 'b', 'a']))
|
||||
result = pd.get_dummies(data)
|
||||
expected = DataFrame([[1, 0],
|
||||
[0, 1],
|
||||
[1, 0]],
|
||||
columns=pd.Categorical(['a', 'b']),
|
||||
dtype=np.uint8)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_int_df(self, dtype):
|
||||
data = DataFrame(
|
||||
{'A': [1, 2, 1],
|
||||
'B': pd.Categorical(['a', 'b', 'a']),
|
||||
'C': [1, 2, 1],
|
||||
'D': [1., 2., 1.]
|
||||
}
|
||||
)
|
||||
columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b']
|
||||
expected = DataFrame([
|
||||
[1, 1., 1, 0, 1, 0],
|
||||
[2, 2., 0, 1, 0, 1],
|
||||
[1, 1., 1, 0, 1, 0]
|
||||
], columns=columns)
|
||||
expected[columns[2:]] = expected[columns[2:]].astype(dtype)
|
||||
result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
|
||||
# GH13854
|
||||
for ordered in [False, True]:
|
||||
cat = pd.Categorical(list("xy"), categories=list("xyz"),
|
||||
ordered=ordered)
|
||||
result = get_dummies(cat, dtype=dtype)
|
||||
|
||||
data = np.array([[1, 0, 0], [0, 1, 0]],
|
||||
dtype=self.effective_dtype(dtype))
|
||||
cols = pd.CategoricalIndex(cat.categories,
|
||||
categories=cat.categories,
|
||||
ordered=ordered)
|
||||
expected = DataFrame(data, columns=cols,
|
||||
dtype=self.effective_dtype(dtype))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('sparse', [True, False])
|
||||
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
|
||||
# GH18914
|
||||
df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]),
|
||||
('Nation', ['AB', 'CD'])]))
|
||||
df = get_dummies(df, columns=['Nation'], sparse=sparse)
|
||||
df2 = df.reindex(columns=['GDP'])
|
||||
|
||||
tm.assert_frame_equal(df[['GDP']], df2)
|
||||
|
||||
def test_get_dummies_duplicate_columns(self, df):
|
||||
# GH20839
|
||||
df.columns = ["A", "A", "A"]
|
||||
result = get_dummies(df).sort_index(axis=1)
|
||||
|
||||
expected = DataFrame([[1, 1, 0, 1, 0],
|
||||
[2, 0, 1, 1, 0],
|
||||
[3, 1, 0, 0, 1]],
|
||||
columns=['A', 'A_a', 'A_b', 'A_b', 'A_c'],
|
||||
dtype=np.uint8).sort_index(axis=1)
|
||||
|
||||
expected = expected.astype({"A": np.int64})
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestCategoricalReshape(object):
    """Reshaping operations on frames holding categorical data."""

    @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
    def test_reshaping_panel_categorical(self):
        # unstacking a categorical column of a Panel-derived frame must
        # keep the categorical dtype in every resulting column
        p = tm.makePanel()
        p['str'] = 'foo'
        df = p.to_frame()

        df['category'] = df['str'].astype('category')
        result = df['category'].unstack()

        c = Categorical(['foo'] * len(p.major_axis))
        expected = DataFrame({'A': c.copy(),
                              'B': c.copy(),
                              'C': c.copy(),
                              'D': c.copy()},
                             columns=Index(list('ABCD'), name='minor'),
                             index=p.major_axis.set_names('major'))
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestMakeAxisDummies(object):
    """Tests for the internal ``make_axis_dummies`` helper."""

    def test_preserve_categorical_dtype(self):
        # GH13854: the categorical dtype of the minor axis (including
        # ordered-ness) must carry through to the dummy columns
        for ordered in [False, True]:
            cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered)
            midx = pd.MultiIndex(levels=[['a'], cidx],
                                 codes=[[0, 0], [0, 1]])
            df = DataFrame([[10, 11]], index=midx)

            expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
                                 index=midx, columns=cidx)

            from pandas.core.reshape.reshape import make_axis_dummies
            result = make_axis_dummies(df)
            tm.assert_frame_equal(result, expected)

            # an identity transform must not change the result
            result = make_axis_dummies(df, transform=lambda x: x)
            tm.assert_frame_equal(result, expected)
|
||||
+346
@@ -0,0 +1,346 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.concat import union_categoricals
|
||||
|
||||
import pandas as pd
|
||||
from pandas import Categorical, CategoricalIndex, Series
|
||||
from pandas.util import testing as tm
|
||||
|
||||
|
||||
class TestUnionCategoricals(object):
    """Tests for ``pandas.core.dtypes.concat.union_categoricals``."""

    def test_union_categorical(self):
        # GH 13361: unions across a spread of category dtypes and boxes
        data = [
            (list('abc'), list('abd'), list('abcabd')),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),

            (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
             ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),

            (pd.date_range('2014-01-01', '2014-01-05'),
             pd.date_range('2014-01-06', '2014-01-07'),
             pd.date_range('2014-01-01', '2014-01-07')),

            (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
             pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
             pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),

            (pd.period_range('2014-01-01', '2014-01-05'),
             pd.period_range('2014-01-06', '2014-01-07'),
             pd.period_range('2014-01-01', '2014-01-07')),
        ]

        for left, right, combined in data:
            for box in [Categorical, CategoricalIndex, Series]:
                result = union_categoricals([box(Categorical(left)),
                                             box(Categorical(right))])
                expected = Categorical(combined)
                tm.assert_categorical_equal(result, expected,
                                            check_category_order=True)

        # new categories are ordered by appearance
        s = Categorical(['x', 'y', 'z'])
        s2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([s, s2])
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # ordered categoricals with identical categories concatenate
        s = Categorical([0, 1.2, 2], ordered=True)
        s2 = Categorical([0, 1.2, 2], ordered=True)
        result = union_categoricals([s, s2])
        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
        tm.assert_categorical_equal(result, expected)

        # category dtypes must match exactly
        s = Categorical([0, 1.2, 2])
        s2 = Categorical([2, 3, 4])
        msg = 'dtype of categories must be the same'
        with pytest.raises(TypeError, match=msg):
            union_categoricals([s, s2])

        # an empty input list is a ValueError
        msg = 'No Categoricals to union'
        with pytest.raises(ValueError, match=msg):
            union_categoricals([])

    def test_union_categoricals_nan(self):
        # GH 13759: NaN values survive the union
        res = union_categoricals([pd.Categorical([1, 2, np.nan]),
                                  pd.Categorical([3, 2, np.nan])])
        exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical(['A', 'B']),
                                  pd.Categorical(['B', 'B', np.nan])])
        exp = Categorical(['A', 'B', 'B', 'B', np.nan])
        tm.assert_categorical_equal(res, exp)

        val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'),
                pd.NaT]
        val2 = [pd.NaT, pd.Timestamp('2011-01-01'),
                pd.Timestamp('2011-02-01')]

        res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
        exp = Categorical(val1 + val2,
                          categories=[pd.Timestamp('2011-01-01'),
                                      pd.Timestamp('2011-03-01'),
                                      pd.Timestamp('2011-02-01')])
        tm.assert_categorical_equal(res, exp)

        # all-NaN inputs
        res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan],
                                                          dtype=object)),
                                  pd.Categorical(['X'])])
        exp = Categorical([np.nan, np.nan, 'X'])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical([np.nan, np.nan]),
                                  pd.Categorical([np.nan, np.nan])])
        exp = Categorical([np.nan, np.nan, np.nan, np.nan])
        tm.assert_categorical_equal(res, exp)

    def test_union_categoricals_empty(self):
        # GH 13759: empty inputs union cleanly
        res = union_categoricals([pd.Categorical([]),
                                  pd.Categorical([])])
        exp = Categorical([])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([Categorical([]),
                                  Categorical(['1'])])
        exp = Categorical(['1'])
        tm.assert_categorical_equal(res, exp)

    def test_union_categorical_same_category(self):
        # identical categories exercise the fastpath
        c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
        c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
                          categories=[1, 2, 3, 4])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
        c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
        res = union_categoricals([c1, c2])
        exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
                          categories=['x', 'y', 'z'])
        tm.assert_categorical_equal(res, exp)

    def test_union_categorical_same_categories_different_order(self):
        # https://github.com/pandas-dev/pandas/issues/19096
        c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])
        c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([c1, c2])
        expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_ordered(self):
        # mixing ordered and unordered is an error
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        msg = 'Categorical.ordered must be the same'
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

        res = union_categoricals([c1, c1])
        exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
        tm.assert_categorical_equal(res, exp)

        # ordered unions require identical category order
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

    def test_union_categoricals_ignore_order(self):
        # GH 15219: ignore_order=True discards ordered-ness
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        msg = 'Categorical.ordered must be the same'
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], ignore_order=False)

        res = union_categoricals([c1, c1], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c1, c1], ignore_order=False)
        exp = Categorical([1, 2, 3, 1, 2, 3],
                          categories=[1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, np.nan, 3, 2])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c2, c1], ignore_order=True,
                                 sort_categories=True)
        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([4, 5, 6], ordered=True)
        result = union_categoricals([c1, c2], ignore_order=True)
        expected = Categorical([1, 2, 3, 4, 5, 6])
        tm.assert_categorical_equal(result, expected)

        # without ignore_order, mismatched ordered categories still raise
        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], ignore_order=False)

        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

    def test_union_categoricals_sort(self):
        # GH 13846: sort_categories=True sorts the resulting categories
        c1 = Categorical(['x', 'y', 'z'])
        c2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['a', 'b', 'c', 'x', 'y', 'z'])
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b'])
        c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath - categories already sorted, resort skipped
        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['x', np.nan])
        c2 = Categorical([np.nan, 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', np.nan, np.nan, 'b'],
                               categories=['b', 'x'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([np.nan, np.nan])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        # sorting ordered categoricals is not supported
        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
        with pytest.raises(TypeError):
            union_categoricals([c1, c2], sort_categories=True)

    def test_union_categoricals_sort_false(self):
        # GH 13846: sort_categories=False keeps appearance order
        c1 = Categorical(['x', 'y', 'z'])
        c2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['b', 'a', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath - categories already sorted, resort skipped
        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['x', np.nan])
        c2 = Categorical([np.nan, 'b'])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(['x', np.nan, np.nan, 'b'],
                               categories=['x', 'b'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical([np.nan, np.nan])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        # ordered inputs are allowed when no resort is requested
        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(['b', 'a', 'a', 'c'],
                               categories=['b', 'a', 'c'], ordered=True)
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_unwrap(self):
        # GH 14173: Series/CategoricalIndex inputs are unwrapped
        c1 = Categorical(['a', 'b'])
        c2 = pd.Series(['b', 'c'], dtype='category')
        result = union_categoricals([c1, c2])
        expected = Categorical(['a', 'b', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c2 = CategoricalIndex(c2)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        c1 = Series(c1)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        # plain lists are not accepted
        with pytest.raises(TypeError):
            union_categoricals([c1, ['a', 'b', 'c']])
|
||||
@@ -0,0 +1,53 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import Index, date_range
|
||||
from pandas.core.reshape.util import cartesian_product
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
class TestCartesianProduct(object):
    """Tests for ``pandas.core.reshape.util.cartesian_product``."""

    def test_simple(self):
        # the first factor varies slowest, the second fastest
        letters, numbers = list('ABC'), [1, 22]
        result1, result2 = cartesian_product([letters, numbers])
        expected1 = np.array(['A', 'A', 'B', 'B', 'C', 'C'])
        expected2 = np.array([1, 22, 1, 22, 1, 22])
        tm.assert_numpy_array_equal(result1, expected1)
        tm.assert_numpy_array_equal(result2, expected2)

    def test_datetimeindex(self):
        # regression test for GitHub issue #6439:
        # ordering over a DatetimeIndex must stay consistent
        x = date_range('2000-01-01', periods=2)
        result1, result2 = [Index(y).day for y in cartesian_product([x, x])]
        expected1 = Index([1, 1, 2, 2])
        expected2 = Index([1, 2, 1, 2])
        tm.assert_index_equal(result1, expected1)
        tm.assert_index_equal(result2, expected2)

    def test_empty(self):
        # any empty factor makes the whole product empty
        X = [[], [0, 1], []]
        Y = [[], [], ['a', 'b', 'c']]
        for x, y in zip(X, Y):
            expected1 = np.array([], dtype=np.asarray(x).dtype)
            expected2 = np.array([], dtype=np.asarray(y).dtype)
            result1, result2 = cartesian_product([x, y])
            tm.assert_numpy_array_equal(result1, expected1)
            tm.assert_numpy_array_equal(result2, expected2)

        # an empty input list yields an empty product
        result = cartesian_product([])
        expected = []
        assert result == expected

    @pytest.mark.parametrize("X", [
        1, [1], [1, 2], [[1], 2],
        'a', ['a'], ['a', 'b'], [['a'], 'b']
    ])
    def test_invalid_input(self, X):
        # non list-of-list-likes inputs are rejected
        msg = "Input must be a list-like of list-likes"

        with pytest.raises(TypeError, match=msg):
            cartesian_product(X=X)
|
||||
Reference in New Issue
Block a user