pruned venvs

This commit is contained in:
d3m1g0d
2019-03-12 21:56:25 +01:00
parent 8ee094481c
commit 33f0511081
4095 changed files with 0 additions and 748399 deletions
@@ -1,812 +0,0 @@
# pylint: disable=E1103
from warnings import catch_warnings
from numpy.random import randn
import numpy as np
import pytest
import pandas as pd
from pandas.compat import lrange
import pandas.compat as compat
from pandas.util.testing import assert_frame_equal
from pandas import DataFrame, MultiIndex, Series, Index, merge, concat
from pandas._libs import join as libjoin
import pandas.util.testing as tm
from pandas.tests.reshape.merge.test_merge import get_test_data, N, NGROUPS
# Short alias for np.array, used to build the index arrays in the cython
# join tests of this module.
a_ = np.array
class TestJoin(object):
def setup_method(self, method):
    """Create the frames shared by the merge and join tests.

    Sets ``self.df`` / ``self.df2`` (keyed random data) and
    ``self.target`` / ``self.source`` (mixed-type frames for ``join``).
    """
    # aggregate multiple columns
    full = DataFrame({'key1': get_test_data(),
                      'key2': get_test_data(),
                      'data1': np.random.randn(N),
                      'data2': np.random.randn(N)})

    # exclude a couple keys for fun
    self.df = full[full['key2'] > 1]

    small_n = N // 5
    self.df2 = DataFrame({'key1': get_test_data(n=small_n),
                          'key2': get_test_data(ngroups=NGROUPS // 2,
                                                n=small_n),
                          'value': np.random.randn(small_n)})

    mixed_index, mixed_data = tm.getMixedTypeDict()
    self.target = DataFrame(mixed_data, index=mixed_index)

    # Join on string value
    self.source = DataFrame({'MergedA': mixed_data['A'],
                             'MergedD': mixed_data['D']},
                            index=mixed_data['C'])
def test_cython_left_outer_join(self):
    """Compare ``libjoin.left_outer_join`` indexers with hand-built ones."""
    left_keys = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
    right_keys = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
    n_groups = 5

    ls, rs = libjoin.left_outer_join(left_keys, right_keys, n_groups)

    left_order = left_keys.argsort(kind='mergesort')
    right_order = right_keys.argsort(kind='mergesort')

    # Positions into the stably sorted inputs.
    left_pos = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                   6, 6, 7, 7, 8, 8, 9, 10])
    right_pos = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                    4, 5, 4, 5, 4, 5, -1, -1])

    # A position of -1 is kept as -1 instead of being used as an index.
    exp_ls = np.where(left_pos == -1, -1, left_order.take(left_pos))
    exp_rs = np.where(right_pos == -1, -1, right_order.take(right_pos))

    tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
    tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
def test_cython_right_outer_join(self):
    """Check a right outer join obtained by swapping the join inputs."""
    left_keys = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
    right_keys = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
    n_groups = 5

    # Both the arguments and the returned pair are swapped.
    rs, ls = libjoin.left_outer_join(right_keys, left_keys, n_groups)

    left_order = left_keys.argsort(kind='mergesort')
    right_order = right_keys.argsort(kind='mergesort')

    # Sorted right keys are 0, 1, 1, 1, 2, 2, 4; each is paired with every
    # matching sorted left position.
    left_pos = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
                   6, 7, 8, 6, 7, 8, -1])
    right_pos = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
                    4, 4, 4, 5, 5, 5, 6])

    # A position of -1 is kept as -1 instead of being used as an index.
    exp_ls = np.where(left_pos == -1, -1, left_order.take(left_pos))
    exp_rs = np.where(right_pos == -1, -1, right_order.take(right_pos))

    tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
    tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
def test_cython_inner_join(self):
    """Compare ``libjoin.inner_join`` indexers with hand-built ones."""
    left_keys = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
    right_keys = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
    n_groups = 5

    ls, rs = libjoin.inner_join(left_keys, right_keys, n_groups)

    left_order = left_keys.argsort(kind='mergesort')
    right_order = right_keys.argsort(kind='mergesort')

    # Positions into the stably sorted inputs.
    left_pos = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                   6, 6, 7, 7, 8, 8])
    right_pos = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                    4, 5, 4, 5, 4, 5])

    # Same masking step as the outer-join tests; no -1 occurs here.
    exp_ls = np.where(left_pos == -1, -1, left_order.take(left_pos))
    exp_rs = np.where(right_pos == -1, -1, right_order.take(right_pos))

    tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
    tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
def test_left_outer_join(self):
    """Left-merge ``df`` with ``df2`` and validate the result per key group.

    ``how='left'`` is passed explicitly. ``merge`` defaults to an inner
    join, so without it the left-join code path is never exercised even
    though the result is validated with ``how='left'``.
    """
    joined_key2 = merge(self.df, self.df2, on='key2', how='left')
    _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

    # No ``on``: merge on the columns the two frames share.
    joined_both = merge(self.df, self.df2, how='left')
    _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                how='left')
def test_right_outer_join(self):
    """Right-merge ``df`` with ``df2`` and validate the result per key group."""
    how = 'right'

    by_key2 = merge(self.df, self.df2, on='key2', how=how)
    _check_join(self.df, self.df2, by_key2, ['key2'], how=how)

    by_both_keys = merge(self.df, self.df2, how=how)
    _check_join(self.df, self.df2, by_both_keys, ['key1', 'key2'],
                how=how)
def test_full_outer_join(self):
    """Outer-merge ``df`` with ``df2`` and validate the result per key group."""
    how = 'outer'

    by_key2 = merge(self.df, self.df2, on='key2', how=how)
    _check_join(self.df, self.df2, by_key2, ['key2'], how=how)

    by_both_keys = merge(self.df, self.df2, how=how)
    _check_join(self.df, self.df2, by_both_keys, ['key1', 'key2'],
                how=how)
def test_inner_join(self):
    """Inner-merge ``df`` with ``df2`` and validate the result per key group."""
    how = 'inner'

    by_key2 = merge(self.df, self.df2, on='key2', how=how)
    _check_join(self.df, self.df2, by_key2, ['key2'], how=how)

    by_both_keys = merge(self.df, self.df2, how=how)
    _check_join(self.df, self.df2, by_both_keys, ['key1', 'key2'],
                how=how)
def test_handle_overlap(self):
    """Overlapping non-key columns receive the custom suffixes."""
    joined = merge(self.df, self.df2, on='key2',
                   suffixes=['.foo', '.bar'])

    result_columns = joined.columns
    assert 'key1.foo' in result_columns
    assert 'key1.bar' in result_columns
def test_handle_overlap_arbitrary_key(self):
    """Suffixes are applied when joining on differently named keys."""
    joined = merge(self.df, self.df2,
                   left_on='key2', right_on='key1',
                   suffixes=['.foo', '.bar'])

    result_columns = joined.columns
    assert 'key1.foo' in result_columns
    assert 'key2.bar' in result_columns
def test_join_on(self):
    """Exercise ``DataFrame.join`` with a column passed through ``on``."""
    target, source = self.target, self.source

    merged = target.join(source, on='C')
    tm.assert_series_equal(merged['MergedA'], target['A'],
                           check_names=False)
    tm.assert_series_equal(merged['MergedD'], target['D'],
                           check_names=False)

    # join with duplicates (fix regression from DataFrame/Matrix merge)
    dup_keys = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
    lookup = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
    joined = dup_keys.join(lookup, on='key')
    expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
                          'value': [0, 0, 1, 1, 2]})
    assert_frame_equal(joined, expected)

    # Test when some are missing
    df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                     columns=['one'])
    df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                     columns=['two'])
    df_c = DataFrame([[1], [2]], index=[1, 2],
                     columns=['three'])
    joined = df_a.join(df_b, on='one').join(df_c, on='one')
    assert np.isnan(joined['two']['c'])
    assert np.isnan(joined['three']['c'])

    # merge column not present
    with pytest.raises(KeyError):
        target.join(source, on='E')

    # overlap
    overlapping = source.copy()
    overlapping['A'] = 0
    with pytest.raises(ValueError):
        target.join(overlapping, on='A')
def test_join_on_fails_with_different_right_index(self):
    """``merge`` must reject a single ``left_on`` key with ``right_index``.

    The right frame is indexed by ``tm.makeCustomIndex(10, 2)``
    (presumably a two-level index -- confirm against the helper).
    """
    df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                    'b': np.random.randn(3)})
    df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                     'b': np.random.randn(10)},
                    index=tm.makeCustomIndex(10, 2))

    # Only the merge call is inside the context manager, so an error raised
    # while building the inputs cannot satisfy the expectation.
    with pytest.raises(ValueError):
        merge(df, df2, left_on='a', right_index=True)
def test_join_on_fails_with_different_left_index(self):
    """``merge`` must reject a single ``right_on`` key with ``left_index``.

    The left frame now gets an index of matching length (3 entries).
    Previously a ten-entry index was paired with three-row data, so the
    ``DataFrame`` constructor raised ``ValueError`` inside the
    ``pytest.raises`` block and ``merge`` was never called.
    """
    df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                    'b': np.random.randn(3)},
                   index=tm.makeCustomIndex(3, 2))
    df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                     'b': np.random.randn(10)})

    # Only the merge call is inside the context manager, so an error raised
    # while building the inputs cannot satisfy the expectation.
    with pytest.raises(ValueError):
        merge(df, df2, right_on='b', left_index=True)
def test_join_on_fails_with_different_column_counts(self):
    """``merge`` must reject ``left_on``/``right_on`` of unequal length."""
    df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                    'b': np.random.randn(3)})
    df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                     'b': np.random.randn(10)},
                    index=tm.makeCustomIndex(10, 2))

    # Only the merge call is inside the context manager, so an error raised
    # while building the inputs cannot satisfy the expectation.
    with pytest.raises(ValueError):
        merge(df, df2, right_on='a', left_on=['a', 'b'])
def test_join_on_fails_with_wrong_object_type(self):
    """Non-DataFrame operands make ``merge`` raise, naming their type."""
    # GH12081
    not_frames = [Series([0, 1]), 2, 'str', None, np.array([0, 1])]
    frame = DataFrame({'a': [1, 1]})

    for bad in not_frames:
        # The error message is expected to contain the offending type.
        type_text = str(type(bad))
        with tm.assert_raises_regex(ValueError, type_text):
            merge(bad, frame, left_on='a', right_on='a')
        with tm.assert_raises_regex(ValueError, type_text):
            merge(frame, bad, left_on='a', right_on='a')
def test_join_on_pass_vector(self):
    """Passing the key values as ``on`` matches joining on the column."""
    expected = self.target.join(self.source, on='C').drop('C', axis=1)

    # ``pop`` removes 'C' from the target; its values become the join key.
    key_values = self.target.pop('C')
    result = self.target.join(self.source, on=key_values)

    assert_frame_equal(result, expected)
def test_join_with_len0(self):
    """Joining against an empty frame adds all-NA columns (or no rows)."""
    # nothing to merge
    left_joined = self.target.join(self.source.reindex([]), on='C')
    for col in self.source:
        assert col in left_joined.columns
        assert left_joined[col].isna().all()

    inner_joined = self.target.join(self.source.reindex([]), on='C',
                                    how='inner')
    tm.assert_index_equal(inner_joined.columns, left_joined.columns)
    assert len(inner_joined) == 0
def test_join_on_inner(self):
df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])
joined = df.join(df2, on='key', how='inner')
expected = df.join(df2, on='key')
expected = expected[expected['value'].notna()]
tm.assert_series_equal(joined['key'], expected['key'],
check_dtype=False)
tm.assert_series_equal(joined['value'], expected['value'],
check_dtype=False)
tm.assert_index_equal(joined.index, expected.index)
def test_join_on_singlekey_list(self):
df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
# corner cases
joined = df.join(df2, on=['key'])
expected = df.join(df2, on='key')
assert_frame_equal(joined, expected)
def test_join_on_series(self):
    """Joining a Series equals joining the matching one-column frame."""
    as_series = self.source['MergedA']
    as_frame = self.source[['MergedA']]

    result = self.target.join(as_series, on='C')
    expected = self.target.join(as_frame, on='C')

    assert_frame_equal(result, expected)
def test_join_on_series_buglet(self):
# GH #638
df = DataFrame({'a': [1, 1]})
ds = Series([2], index=[1], name='b')
result = df.join(ds, on='a')
expected = DataFrame({'a': [1, 1],
'b': [2, 2]}, index=df.index)
tm.assert_frame_equal(result, expected)
def test_join_index_mixed(self, join_type):
    """Index join of frames with disjoint dtypes, in both directions."""
    # no overlapping blocks
    df1 = DataFrame(index=np.arange(10))
    df1['bool'] = True
    df1['string'] = 'foo'

    df2 = DataFrame(index=np.arange(5, 15))
    df2['int'] = 1
    df2['float'] = 1.

    # Check df1.join(df2) first, then the reversed direction.
    for caller, other in ((df1, df2), (df2, df1)):
        joined = caller.join(other, how=join_type)
        expected = _join_by_hand(caller, other, how=join_type)
        assert_frame_equal(joined, expected)
def test_join_index_mixed_overlap(self):
    """Index join of mixed-dtype frames whose column names all overlap."""
    names = ['A', 'B', 'C', 'D']
    values = {'A': 1., 'B': 2, 'C': 'foo', 'D': True}

    df1 = DataFrame(values, index=np.arange(10), columns=names)
    assert df1['B'].dtype == np.int64
    assert df1['D'].dtype == np.bool_

    df2 = DataFrame(values, index=np.arange(0, 10, 2), columns=names)

    # overlap
    joined = df1.join(df2, lsuffix='_one', rsuffix='_two')

    # Rename the inputs to the suffixed names and join them by hand.
    left_names = ['A_one', 'B_one', 'C_one', 'D_one']
    right_names = ['A_two', 'B_two', 'C_two', 'D_two']
    df1.columns = left_names
    df2.columns = right_names
    expected = _join_by_hand(df1, df2)

    assert_frame_equal(joined, expected)
def test_join_empty_bug(self):
# generated an exception in 0.4.3
x = DataFrame()
x.join(DataFrame([3], index=[0], columns=['A']), how='outer')
def test_join_unconsolidated(self):
# GH #331
a = DataFrame(randn(30, 2), columns=['a', 'b'])
c = Series(randn(30))
a['c'] = c
d = DataFrame(randn(30, 1), columns=['q'])
# it works!
a.join(d)
d.join(a)
def test_join_multiindex(self):
index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
[1, 2, 3, 1, 2, 3]],
names=['first', 'second'])
index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
[1, 2, 3, 1, 2, 3]],
names=['first', 'second'])
df1 = DataFrame(data=np.random.randn(6), index=index1,
columns=['var X'])
df2 = DataFrame(data=np.random.randn(6), index=index2,
columns=['var Y'])
df1 = df1.sort_index(level=0)
df2 = df2.sort_index(level=0)
joined = df1.join(df2, how='outer')
ex_index = Index(index1.values).union(Index(index2.values))
expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
expected.index.names = index1.names
assert_frame_equal(joined, expected)
assert joined.index.names == index1.names
df1 = df1.sort_index(level=1)
df2 = df2.sort_index(level=1)
joined = df1.join(df2, how='outer').sort_index(level=0)
ex_index = Index(index1.values).union(Index(index2.values))
expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
expected.index.names = index1.names
assert_frame_equal(joined, expected)
assert joined.index.names == index1.names
def test_join_inner_multiindex(self):
    """Inner-join two key columns against a two-level MultiIndex.

    The result of ``DataFrame.join`` is compared with two equivalent
    ``merge`` calls: one against the right frame with its index reset,
    and one using ``left_index=True``.
    """
    key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
            'qux', 'snap']
    key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
            'three', 'one']

    data = np.random.randn(len(key1))
    data = DataFrame({'key1': key1, 'key2': key2,
                      'data': data})

    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                               ['one', 'two', 'three']],
                       labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                               [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['first', 'second'])
    to_join = DataFrame(np.random.randn(10, 3), index=index,
                        columns=['j_one', 'j_two', 'j_three'])

    joined = data.join(to_join, on=['key1', 'key2'], how='inner')

    expected = merge(data, to_join.reset_index(),
                     left_on=['key1', 'key2'],
                     right_on=['first', 'second'], how='inner',
                     sort=False)

    expected2 = merge(to_join, data,
                      right_on=['key1', 'key2'], left_index=True,
                      how='inner', sort=False)
    assert_frame_equal(joined, expected2.reindex_like(joined))

    # The identical second computation of ``expected2`` was dropped: its
    # value was never used again.
    expected = expected.drop(['first', 'second'], axis=1)
    expected.index = joined.index

    assert joined.index.is_monotonic
    assert_frame_equal(joined, expected)
def test_join_hierarchical_mixed(self):
    """Merge a frame with two-level columns and one with flat columns."""
    # GH 2024
    base = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
    new_df = base.groupby(['a']).agg({'b': [np.mean, np.sum]})

    other_df = DataFrame(
        [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']).set_index('a')

    # GH 9455, 12219
    with tm.assert_produces_warning(UserWarning):
        result = merge(new_df, other_df, left_index=True, right_index=True)

    result_columns = result.columns
    assert ('b', 'mean') in result_columns
    assert 'b' in result_columns
def test_join_float64_float32(self):
a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
joined = a.join(b)
assert joined.dtypes['a'] == 'float64'
assert joined.dtypes['b'] == 'float64'
assert joined.dtypes['c'] == 'float32'
a = np.random.randint(0, 5, 100).astype('int64')
b = np.random.random(100).astype('float64')
c = np.random.random(100).astype('float32')
df = DataFrame({'a': a, 'b': b, 'c': c})
xpdf = DataFrame({'a': a, 'b': b, 'c': c})
s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
rs = df.merge(s, left_on='a', right_index=True)
assert rs.dtypes['a'] == 'int64'
assert rs.dtypes['b'] == 'float64'
assert rs.dtypes['c'] == 'float32'
assert rs.dtypes['md'] == 'float32'
xp = xpdf.merge(s, left_on='a', right_index=True)
assert_frame_equal(rs, xp)
def test_join_many_non_unique_index(self):
df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
idf1 = df1.set_index(["a", "b"])
idf2 = df2.set_index(["a", "b"])
idf3 = df3.set_index(["a", "b"])
result = idf1.join([idf2, idf3], how='outer')
df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')
result = result.reset_index()
expected = expected[result.columns]
expected['a'] = expected.a.astype('int64')
expected['b'] = expected.b.astype('int64')
assert_frame_equal(result, expected)
df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
df3 = DataFrame(
{"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
idf1 = df1.set_index(["a", "b"])
idf2 = df2.set_index(["a", "b"])
idf3 = df3.set_index(["a", "b"])
result = idf1.join([idf2, idf3], how='inner')
df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')
result = result.reset_index()
assert_frame_equal(result, expected.loc[:, result.columns])
# GH 11519
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})
s = Series(np.repeat(np.arange(8), 2),
index=np.repeat(np.arange(8), 2), name='TEST')
inner = df.join(s, how='inner')
outer = df.join(s, how='outer')
left = df.join(s, how='left')
right = df.join(s, how='right')
assert_frame_equal(inner, outer)
assert_frame_equal(inner, left)
assert_frame_equal(inner, right)
def test_join_sort(self):
left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
'value': [1, 2, 3, 4]})
right = DataFrame({'value2': ['a', 'b', 'c']},
index=['bar', 'baz', 'foo'])
joined = left.join(right, on='key', sort=True)
expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
'value': [2, 3, 1, 4],
'value2': ['a', 'b', 'c', 'c']},
index=[1, 2, 0, 3])
assert_frame_equal(joined, expected)
# smoke test
joined = left.join(right, on='key', sort=False)
tm.assert_index_equal(joined.index, pd.Index(lrange(4)))
def test_join_mixed_non_unique_index(self):
# GH 12814, unorderable types in py3 with a non-unique index
df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
result = df1.join(df2)
expected = DataFrame({'a': [1, 2, 3, 3, 4],
'b': [5, np.nan, 6, 7, np.nan]},
index=[1, 2, 3, 3, 'a'])
tm.assert_frame_equal(result, expected)
df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
result = df3.join(df4)
expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]},
index=[1, 2, 2, 'a'])
tm.assert_frame_equal(result, expected)
def test_join_non_unique_period_index(self):
# GH #16871
index = pd.period_range('2016-01-01', periods=16, freq='M')
df = DataFrame([i for i in range(len(index))],
index=index, columns=['pnum'])
df2 = concat([df, df])
result = df.join(df2, how='inner', rsuffix='_df2')
expected = DataFrame(
np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
columns=['pnum', 'pnum_df2'], index=df2.sort_index().index)
tm.assert_frame_equal(result, expected)
def test_mixed_type_join_with_suffix(self):
    """Smoke test: join grouped mean and count using ``rsuffix``."""
    # GH #916
    frame = DataFrame(np.random.randn(20, 6),
                      columns=['a', 'b', 'c', 'd', 'e', 'f'])
    frame.insert(0, 'id', 0)
    frame.insert(5, 'dt', 'foo')

    by_id = frame.groupby('id')
    means = by_id.mean()
    counts = by_id.count()

    # it works!
    means.join(counts, rsuffix='_right')
def test_join_many(self):
    """Join a list of frames, with aligned and with differing indexes."""
    df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))

    pieces = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]
    joined = pieces[0].join(pieces[1:])
    tm.assert_frame_equal(joined, df)

    # Same column groups, each covering a different slice of rows.
    pieces = [df[['a', 'b']][:-2],
              df[['c', 'd']][2:], df[['e', 'f']][1:9]]
    first, rest = pieces[0], pieces[1:]

    def check_against_reindexed(result, exp_index):
        # Expected: every piece reindexed to ``exp_index``, then joined.
        aligned = [piece.reindex(exp_index) for piece in pieces]
        expected = aligned[0].join(aligned[1:])
        tm.assert_frame_equal(result, expected)

    # different join types
    check_against_reindexed(first.join(rest, how='outer'), df.index)
    check_against_reindexed(first.join(rest), first.index)
    check_against_reindexed(first.join(rest, how='inner'), df.index[2:8])

    with pytest.raises(ValueError):
        first.join(rest, on='a')
def test_join_many_mixed(self):
df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df['key'] = ['foo', 'bar'] * 4
df1 = df.loc[:, ['A', 'B']]
df2 = df.loc[:, ['C', 'D']]
df3 = df.loc[:, ['key']]
result = df1.join([df2, df3])
assert_frame_equal(result, df)
def test_join_dups(self):
    """Join and merge frames that contain duplicate column names."""
    # joining dups
    floats = DataFrame(np.random.randn(10, 4),
                       columns=['A', 'A', 'B', 'B'])
    ints = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                     columns=['A', 'C'])
    df = concat([floats, ints], axis=1)

    expected = concat([df, df], axis=1)
    result = df.join(df, rsuffix='_2')
    # Only the values are compared; the column labels are overwritten.
    result.columns = expected.columns
    assert_frame_equal(result, expected)

    # GH 4975, invalid join on dups
    w, x, y, z = [DataFrame(np.random.randn(4, 2), columns=["x", "y"])
                  for _ in range(4)]

    dta = x.merge(y, left_index=True, right_index=True)
    dta = dta.merge(z, left_index=True, right_index=True, how="outer")
    dta = dta.merge(w, left_index=True, right_index=True)

    expected = concat([x, y, z, w], axis=1)
    expected.columns = ['x_x', 'y_x', 'x_y',
                        'y_y', 'x_x', 'y_x', 'x_y', 'y_y']
    assert_frame_equal(dta, expected)
def test_panel_join(self):
    """Left, right, inner and outer joins of two Panel slices."""
    with catch_warnings(record=True):
        panel = tm.makePanel()
        tm.add_nans(panel)

        first = panel.iloc[:2, :10, :3]
        second = panel.iloc[2:, 5:, 2:]

        # left join
        result = first.join(second)
        expected = first.copy()
        expected['ItemC'] = second['ItemC']
        tm.assert_panel_equal(result, expected)

        # right join
        result = first.join(second, how='right')
        expected = second.copy()
        for item in ('ItemA', 'ItemB'):
            expected[item] = first[item]
        expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
        tm.assert_panel_equal(result, expected)

        # inner join
        result = first.join(second, how='inner')
        expected = panel.iloc[:, 5:10, 2:3]
        tm.assert_panel_equal(result, expected)

        # outer join
        result = first.join(second, how='outer')
        first_full = first.reindex(major=panel.major_axis,
                                   minor=panel.minor_axis)
        second_full = second.reindex(major=panel.major_axis,
                                     minor=panel.minor_axis)
        expected = first_full.join(second_full)
        tm.assert_panel_equal(result, expected)
def test_panel_join_overlap(self):
    """Join Panels that share items, using ``lsuffix``/``rsuffix``."""
    with catch_warnings(record=True):
        panel = tm.makePanel()
        tm.add_nans(panel)

        shared = ['ItemB', 'ItemC']
        p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
        p2 = panel.loc[['ItemB', 'ItemC']]

        # Expected index is
        #
        # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
        joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')

        left_suffixed = p1.loc[shared].add_suffix('_p1')
        right_suffixed = p2.loc[shared].add_suffix('_p2')
        unshared = panel.loc[['ItemA']]
        expected = unshared.join(left_suffixed.join(right_suffixed))
        tm.assert_panel_equal(joined, expected)
def test_panel_join_many(self):
    """Join a list of Panels with inner and outer alignment.

    ``tm.K`` is temporarily set to 10 while the panel is built. The
    previous value is restored in a ``finally`` block so that a failure in
    ``tm.makePanel`` cannot leak the modified module-level setting into
    other tests.
    """
    with catch_warnings(record=True):
        previous_k = tm.K
        tm.K = 10
        try:
            panel = tm.makePanel()
        finally:
            tm.K = previous_k

        panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]

        joined = panels[0].join(panels[1:])
        tm.assert_panel_equal(joined, panel)

        # Same item groups, each restricted to a different major-axis range.
        panels = [panel.iloc[:2, :-5],
                  panel.iloc[2:6, 2:],
                  panel.iloc[6:, 5:-7]]

        data_dict = {}
        for p in panels:
            data_dict.update(p.iteritems())

        joined = panels[0].join(panels[1:], how='inner')
        expected = pd.Panel.from_dict(data_dict, intersect=True)
        tm.assert_panel_equal(joined, expected)

        joined = panels[0].join(panels[1:], how='outer')
        expected = pd.Panel.from_dict(data_dict, intersect=False)
        tm.assert_panel_equal(joined, expected)

        # edge cases
        pytest.raises(ValueError, panels[0].join, panels[1:],
                      how='outer', lsuffix='foo', rsuffix='bar')
        pytest.raises(ValueError, panels[0].join, panels[1:],
                      how='right')
def _check_join(left, right, result, join_col, how='left',
                lsuffix='_x', rsuffix='_y'):
    """Validate a merge ``result`` against its inputs, one key group at a time.

    For each key group in ``result``, the columns that came from ``left``
    and from ``right`` are compared with the matching group of the input.
    When an input has no such group, the join type must allow that and the
    corresponding result columns must be all-NA.

    Raises
    ------
    AssertionError
        If any check fails.
    """
    # some smoke tests
    for key_col in join_col:
        assert result[key_col].notna().all()

    groups_left = left.groupby(join_col)
    groups_right = right.groupby(join_col)

    def verify_side(chunk, grouped, frame, key, needs_match):
        # ``needs_match``: join types for which ``frame`` must contain
        # every key that appears in the result.
        try:
            original = grouped.get_group(key)
        except KeyError:
            if how in needs_match:
                raise AssertionError('key %s should not have been in the join'
                                     % str(key))

            _assert_all_na(chunk, frame.columns, join_col)
        else:
            _assert_same_contents(chunk, original)

    for group_key, group in result.groupby(join_col):
        left_chunk = _restrict_to_columns(group, left.columns, lsuffix)
        right_chunk = _restrict_to_columns(group, right.columns, rsuffix)

        verify_side(left_chunk, groups_left, left, group_key,
                    ('left', 'inner'))
        verify_side(right_chunk, groups_right, right, group_key,
                    ('right', 'inner'))
def _restrict_to_columns(group, columns, suffix):
found = [c for c in group.columns
if c in columns or c.replace(suffix, '') in columns]
# filter
group = group.loc[:, found]
# get rid of suffixes, if any
group = group.rename(columns=lambda x: x.replace(suffix, ''))
# put in the right order...
group = group.loc[:, columns]
return group
def _assert_same_contents(join_chunk, source):
NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
rows = {tuple(row) for row in jvalues}
assert(len(rows) == len(source))
assert(all(tuple(row) in rows for row in svalues))
def _assert_all_na(join_chunk, source_columns, join_col):
for c in source_columns:
if c in join_col:
continue
assert(join_chunk[c].isna().all())
def _join_by_hand(a, b, how='left'):
join_index = a.index.join(b.index, how=how)
a_re = a.reindex(join_index)
b_re = b.reindex(join_index)
result_columns = a.columns.append(b.columns)
for col, s in compat.iteritems(b_re):
a_re[col] = s
return a_re.reindex(columns=result_columns)
@@ -1,213 +0,0 @@
import numpy as np
import pytest
from pandas import DataFrame
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal
@pytest.fixture
def df1():
    """Eleven-row frame with key columns 'outer'/'inner' and value 'v1'."""
    columns = {
        'outer': [1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
        'inner': [1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
        'v1': np.linspace(0, 1, 11),
    }
    return DataFrame(columns)
@pytest.fixture
def df2():
    """Twelve-row frame with key columns 'outer'/'inner' and value 'v2'."""
    columns = {
        'outer': [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
        'inner': [1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
        'v2': np.linspace(10, 11, 12),
    }
    return DataFrame(columns)
@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']])
def left_df(request, df1):
    """Left test frame with the parametrized columns moved into the index.

    An empty parameter leaves ``df1`` unchanged.
    """
    levels = request.param
    return df1.set_index(levels) if levels else df1
@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']])
def right_df(request, df2):
    """Right test frame with the parametrized columns moved into the index.

    An empty parameter leaves ``df2`` unchanged.
    """
    levels = request.param
    return df2.set_index(levels) if levels else df2
def compute_expected(df_left, df_right,
                     on=None, left_on=None, right_on=None, how=None):
    """
    Build the expected result of merging on columns and/or index levels.

    Named index levels that are not merge keys are dropped, those that are
    merge keys are turned into columns, the frames are merged on columns,
    and keys that were index levels on both sides become the index of the
    result.

    Parameters
    ----------
    df_left : DataFrame
        The left DataFrame (may have zero or more named index levels)
    df_right : DataFrame
        The right DataFrame (may have zero or more named index levels)
    on : list of str
        The on parameter to the merge operation
    left_on : list of str
        The left_on parameter to the merge operation
    right_on : list of str
        The right_on parameter to the merge operation
    how : str
        The how parameter to the merge operation

    Returns
    -------
    DataFrame
        The expected merge result
    """
    # ``on`` stands for both sides when given.
    if on is not None:
        left_on = right_on = on

    def named_levels(frame):
        return [name for name in frame.index.names if name is not None]

    def levels_to_columns(frame, levels, keys):
        # Discard index levels that are not merge keys ...
        unused = [lvl for lvl in levels if lvl not in keys]
        if unused:
            frame = frame.reset_index(unused, drop=True)
        # ... and turn the remaining ones into ordinary columns.
        used = [lvl for lvl in levels if lvl in keys]
        if used:
            frame = frame.reset_index(level=used)
        return frame

    left_levels = named_levels(df_left)
    right_levels = named_levels(df_right)

    # Keys that are index levels on both inputs index the output.
    output_levels = [key for key in left_on
                     if key in right_levels and key in left_levels]

    df_left = levels_to_columns(df_left, left_levels, left_on)
    df_right = levels_to_columns(df_right, right_levels, right_on)

    expected = df_left.merge(df_right,
                             left_on=left_on,
                             right_on=right_on,
                             how=how)

    if output_levels:
        expected = expected.set_index(output_levels)

    return expected
@pytest.mark.parametrize('on,how',
                         [(['outer'], 'inner'),
                          (['inner'], 'left'),
                          (['outer', 'inner'], 'right'),
                          (['inner', 'outer'], 'outer')])
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
    """``merge(on=...)`` accepts a mix of column names and index levels."""
    merge_kwargs = {'on': on, 'how': how}

    # Reference result, built with compute_expected
    expected = compute_expected(left_df, right_df, **merge_kwargs)

    # Result under test
    result = left_df.merge(right_df, **merge_kwargs)

    assert_frame_equal(result, expected, check_like=True)
@pytest.mark.parametrize('left_on,right_on,how',
                         [(['outer'], ['outer'], 'inner'),
                          (['inner'], ['inner'], 'right'),
                          (['outer', 'inner'], ['outer', 'inner'], 'left'),
                          (['inner', 'outer'], ['inner', 'outer'], 'outer')])
def test_merge_indexes_and_columns_lefton_righton(
        left_df, right_df, left_on, right_on, how):
    """``left_on``/``right_on`` accept column names and index levels."""
    merge_kwargs = {'left_on': left_on, 'right_on': right_on, 'how': how}

    # Reference result, built with compute_expected
    expected = compute_expected(left_df, right_df, **merge_kwargs)

    # Result under test
    result = left_df.merge(right_df, **merge_kwargs)

    assert_frame_equal(result, expected, check_like=True)
@pytest.mark.parametrize('left_index',
                         ['inner', ['inner', 'outer']])
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
    """``join(on=...)`` may name index levels of the calling frame."""
    keys = ['outer', 'inner']
    join_kwargs = {'on': keys, 'how': join_type,
                   'lsuffix': '_x', 'rsuffix': '_y'}

    left_df = df1.set_index(left_index)
    right_df = df2.set_index(keys)

    # Reference: join with every key as a column, then restore the index.
    expected = left_df.reset_index().join(right_df, **join_kwargs)
    expected = expected.set_index(left_index)

    # Result under test
    result = left_df.join(right_df, **join_kwargs)

    assert_frame_equal(result, expected, check_like=True)
def test_merge_index_column_precedence(df1, df2):
    """A column takes precedence over an index level of the same name."""
    keys = ['outer', 'inner']

    # left_df has both an index and a column named 'outer'. The column is
    # a copy of 'inner', so the values show which of the two was used.
    left_df = df1.set_index('outer')
    left_df['outer'] = left_df['inner']

    # right_df only has an index level named 'outer'.
    right_df = df2.set_index('outer')

    # Expected: the left 'outer' column is used and the result has no
    # index levels.
    left_without_level = left_df.reset_index(level='outer', drop=True)
    expected = left_without_level.merge(right_df.reset_index(), on=keys)

    # Merging on the ambiguous name should emit a FutureWarning.
    with tm.assert_produces_warning(FutureWarning):
        result = left_df.merge(right_df, on=keys)
    assert_frame_equal(result, expected)

    # Same check through left_on / right_on.
    with tm.assert_produces_warning(FutureWarning):
        result = left_df.merge(right_df,
                               left_on=['outer', 'inner'],
                               right_on=['outer', 'inner'])
    assert_frame_equal(result, expected)
@@ -1,102 +0,0 @@
import pandas as pd
from pandas import DataFrame, merge_ordered
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal
from numpy import nan
class TestMergeOrdered(object):
def setup_method(self, method):
    """Create the ``left`` and ``right`` frames used by the tests."""
    left_data = {'key': ['a', 'c', 'e'],
                 'lvalue': [1, 2., 3]}
    right_data = {'key': ['b', 'c', 'd', 'f'],
                  'rvalue': [1, 2, 3., 4]}

    self.left = DataFrame(left_data)
    self.right = DataFrame(right_data)
def test_basic(self):
result = merge_ordered(self.left, self.right, on='key')
expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
'lvalue': [1, nan, 2, nan, 3, nan],
'rvalue': [nan, 1, 2, 3, nan, 4]})
assert_frame_equal(result, expected)
def test_ffill(self):
result = merge_ordered(
self.left, self.right, on='key', fill_method='ffill')
expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
'lvalue': [1., 1, 2, 2, 3, 3.],
'rvalue': [nan, 1, 2, 3, 3, 4]})
assert_frame_equal(result, expected)
def test_multigroup(self):
left = pd.concat([self.left, self.left], ignore_index=True)
left['group'] = ['a'] * 3 + ['b'] * 3
result = merge_ordered(left, self.right, on='key', left_by='group',
fill_method='ffill')
expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2,
'lvalue': [1., 1, 2, 2, 3, 3.] * 2,
'rvalue': [nan, 1, 2, 3, 3, 4] * 2})
expected['group'] = ['a'] * 6 + ['b'] * 6
assert_frame_equal(result, expected.loc[:, result.columns])
result2 = merge_ordered(self.right, left, on='key', right_by='group',
fill_method='ffill')
assert_frame_equal(result, result2.loc[:, result.columns])
result = merge_ordered(left, self.right, on='key', left_by='group')
assert result['group'].notna().all()
def test_merge_type(self):
class NotADataFrame(DataFrame):
@property
def _constructor(self):
return NotADataFrame
nad = NotADataFrame(self.left)
result = nad.merge(self.right, on='key')
assert isinstance(result, NotADataFrame)
def test_empty_sequence_concat(self):
# GH 9157
empty_pat = "[Nn]o objects"
none_pat = "objects.*None"
test_cases = [
((), empty_pat),
([], empty_pat),
({}, empty_pat),
([None], none_pat),
([None, None], none_pat)
]
for df_seq, pattern in test_cases:
tm.assert_raises_regex(ValueError, pattern, pd.concat, df_seq)
pd.concat([pd.DataFrame()])
pd.concat([None, pd.DataFrame()])
pd.concat([pd.DataFrame(), None])
def test_doc_example(self):
left = DataFrame({'group': list('aaabbb'),
'key': ['a', 'c', 'e', 'a', 'c', 'e'],
'lvalue': [1, 2, 3] * 2,
})
right = DataFrame({'key': ['b', 'c', 'd'],
'rvalue': [1, 2, 3]})
result = merge_ordered(left, right, fill_method='ffill',
left_by='group')
expected = DataFrame({'group': list('aaaaabbbbb'),
'key': ['a', 'b', 'c', 'd', 'e'] * 2,
'lvalue': [1, 1, 2, 2, 3] * 2,
'rvalue': [nan, 1, 2, 3, 3] * 2})
assert_frame_equal(result, expected)
File diff suppressed because it is too large Load Diff
@@ -1,642 +0,0 @@
# -*- coding: utf-8 -*-
# pylint: disable-msg=W0612,E1101
import pytest
from pandas import DataFrame
import pandas as pd
from numpy import nan
import numpy as np
from pandas import melt, lreshape, wide_to_long
import pandas.util.testing as tm
from pandas.compat import range
class TestMelt(object):
    """Tests for ``melt`` as a top-level function and a DataFrame method."""

    def setup_method(self, method):
        """Create the single-level and MultiIndex-column frames under test."""
        frame = tm.makeTimeDataFrame()[:10]
        frame['id1'] = (frame['A'] > 0).astype(np.int64)
        frame['id2'] = (frame['B'] > 0).astype(np.int64)
        self.df = frame

        self.var_name = 'var'
        self.value_name = 'val'

        multi = pd.DataFrame([[1.067683, -1.110463, 0.20867],
                              [-1.321405, 0.368915, -1.055342],
                              [-0.807333, 0.08298, -0.873361]])
        multi.columns = [list('ABC'), list('abc')]
        multi.columns.names = ['CAP', 'low']
        self.df1 = multi

    def _expected_ab(self, var_col='variable', value_col='value'):
        """Return the long frame expected when 'A' and 'B' are melted.

        Both id columns are kept; ``var_col`` and ``value_col`` name the
        variable and value columns of the expected result.
        """
        stacked_values = self.df['A'].tolist() + self.df['B'].tolist()
        return DataFrame({'id1': self.df['id1'].tolist() * 2,
                          'id2': self.df['id2'].tolist() * 2,
                          var_col: ['A'] * 10 + ['B'] * 10,
                          value_col: stacked_values},
                         columns=['id1', 'id2', var_col, value_col])

    def test_top_level_method(self):
        result = melt(self.df)
        assert result.columns.tolist() == ['variable', 'value']

    def test_method_signatures(self):
        """The method and the function agree for each argument set."""
        cases = [
            (self.df, {}),
            (self.df, {'id_vars': ['id1', 'id2'],
                       'value_vars': ['A', 'B']}),
            (self.df, {'var_name': self.var_name,
                       'value_name': self.value_name}),
            (self.df1, {'col_level': 0}),
        ]
        for frame, kwargs in cases:
            tm.assert_frame_equal(frame.melt(**kwargs),
                                  melt(frame, **kwargs))

    def test_default_col_names(self):
        cases = [
            ({}, ['variable', 'value']),
            ({'id_vars': ['id1']}, ['id1', 'variable', 'value']),
            ({'id_vars': ['id1', 'id2']},
             ['id1', 'id2', 'variable', 'value']),
        ]
        for kwargs, columns in cases:
            melted = self.df.melt(**kwargs)
            assert melted.columns.tolist() == columns

    def test_value_vars(self):
        single = self.df.melt(id_vars=['id1', 'id2'], value_vars='A')
        assert len(single) == 10

        both = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'])
        tm.assert_frame_equal(both, self._expected_ab())

    def test_value_vars_types(self):
        # GH 15348
        expected = self._expected_ab()

        for container in (tuple, list, np.array):
            value_vars = container(('A', 'B'))
            result = self.df.melt(id_vars=['id1', 'id2'],
                                  value_vars=value_vars)
            tm.assert_frame_equal(result, expected)

    def test_vars_work_with_multiindex(self):
        id_col = ('A', 'a')
        value_col = ('B', 'b')
        n_rows = len(self.df1)
        expected = DataFrame({
            ('A', 'a'): self.df1[id_col],
            'CAP': ['B'] * n_rows,
            'low': ['b'] * n_rows,
            'value': self.df1[value_col],
        }, columns=[('A', 'a'), 'CAP', 'low', 'value'])

        result = self.df1.melt(id_vars=[id_col], value_vars=[value_col])
        tm.assert_frame_equal(result, expected)

    def test_tuple_vars_fail_with_multiindex(self):
        # melt should fail with an informative error message if
        # the columns have a MultiIndex and a tuple is passed
        # for id_vars or value_vars.
        tuple_a = ('A', 'a')
        tuple_b = ('B', 'b')
        bad_combinations = (
            (tuple_a, [tuple_b]),
            ([tuple_a], tuple_b),
            (tuple_a, tuple_b),
        )

        for id_vars, value_vars in bad_combinations:
            with tm.assert_raises_regex(ValueError, r'MultiIndex'):
                self.df1.melt(id_vars=id_vars, value_vars=value_vars)

    def test_custom_var_name(self):
        name = self.var_name
        cases = [
            ({}, ['var', 'value']),
            ({'id_vars': ['id1']}, ['id1', 'var', 'value']),
            ({'id_vars': ['id1', 'id2']}, ['id1', 'id2', 'var', 'value']),
            ({'id_vars': ['id1', 'id2'], 'value_vars': 'A'},
             ['id1', 'id2', 'var', 'value']),
        ]
        for kwargs, columns in cases:
            melted = self.df.melt(var_name=name, **kwargs)
            assert melted.columns.tolist() == columns

        result = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                              var_name=name)
        tm.assert_frame_equal(result, self._expected_ab(var_col=name))

    def test_custom_value_name(self):
        name = self.value_name
        cases = [
            ({}, ['variable', 'val']),
            ({'id_vars': ['id1']}, ['id1', 'variable', 'val']),
            ({'id_vars': ['id1', 'id2']},
             ['id1', 'id2', 'variable', 'val']),
            ({'id_vars': ['id1', 'id2'], 'value_vars': 'A'},
             ['id1', 'id2', 'variable', 'val']),
        ]
        for kwargs, columns in cases:
            melted = self.df.melt(value_name=name, **kwargs)
            assert melted.columns.tolist() == columns

        result = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                              value_name=name)
        tm.assert_frame_equal(result, self._expected_ab(value_col=name))

    def test_custom_var_and_value_name(self):
        names = {'var_name': self.var_name, 'value_name': self.value_name}
        cases = [
            ({}, ['var', 'val']),
            ({'id_vars': ['id1']}, ['id1', 'var', 'val']),
            ({'id_vars': ['id1', 'id2']}, ['id1', 'id2', 'var', 'val']),
            ({'id_vars': ['id1', 'id2'], 'value_vars': 'A'},
             ['id1', 'id2', 'var', 'val']),
        ]
        for kwargs, columns in cases:
            melted = self.df.melt(var_name=self.var_name,
                                  value_name=self.value_name,
                                  **kwargs)
            assert melted.columns.tolist() == columns

        result = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                              **names)
        expected = self._expected_ab(var_col=self.var_name,
                                     value_col=self.value_name)
        tm.assert_frame_equal(result, expected)

        # A named column index supplies the variable column's name.
        named = self.df.copy()
        named.columns.name = 'foo'
        assert named.melt().columns.tolist() == ['foo', 'value']

    def test_col_level(self):
        by_position = self.df1.melt(col_level=0)
        by_name = self.df1.melt(col_level='CAP')
        assert by_position.columns.tolist() == ['CAP', 'value']
        assert by_name.columns.tolist() == ['CAP', 'value']

    def test_multiindex(self):
        melted = self.df1.melt()
        assert melted.columns.tolist() == ['CAP', 'low', 'value']

    @pytest.mark.parametrize("col", [
        pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')),
        pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
        pd.Series([0, 1, 0, 0, 0])])
    def test_pandas_dtypes(self, col):
        # GH 15785
        df = DataFrame({'klass': range(5),
                        'col': col,
                        'attr1': [1, 0, 0, 0, 0],
                        'attr2': col})
        expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col],
                                   ignore_index=True)

        result = melt(df, id_vars=['klass', 'col'], var_name='attribute',
                      value_name='value')

        doubled_col = pd.concat([col] * 2, ignore_index=True)
        expected = DataFrame({0: list(range(5)) * 2,
                              1: doubled_col,
                              2: ['attr1'] * 5 + ['attr2'] * 5,
                              3: expected_value})
        expected.columns = ['klass', 'col', 'attribute', 'value']
        tm.assert_frame_equal(result, expected)
class TestLreshape(object):
    """Tests for ``lreshape``."""

    def test_pairs(self):
        """Reshape three visit-date/weight column pairs into long format."""
        data = {
            'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                        '11jan2009'],
            'birthwt': [1766, 3301, 1454, 3139, 4133],
            'id': [101, 102, 103, 104, 105],
            'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
            'visitdt1': ['11jan2009', '22dec2008', '04jan2009',
                         '29dec2008', '20jan2009'],
            'visitdt2': ['21jan2009', nan, '22jan2009', '31dec2008',
                         '03feb2009'],
            'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
            'wt1': [1823, 3338, 1549, 3298, 4306],
            'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
            'wt3': [2293.0, nan, nan, 3377.0, 4805.0],
        }
        df = DataFrame(data)

        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
                'wt': ['wt%d' % i for i in range(1, 4)]}

        # Default call: the expected frame has 12 of the 15 stacked rows.
        result = lreshape(df, spec)
        exp_data = {
            'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                        '11jan2009', '08jan2009', '30dec2008', '21dec2008',
                        '11jan2009', '08jan2009', '21dec2008', '11jan2009'],
            'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139,
                        4133, 1766, 3139, 4133],
            'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101,
                   104, 105],
            'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
                    'Male', 'Female', 'Female', 'Female', 'Male',
                    'Female', 'Female'],
            'visitdt': ['11jan2009', '22dec2008', '04jan2009',
                        '29dec2008', '20jan2009', '21jan2009',
                        '22jan2009', '31dec2008', '03feb2009',
                        '05feb2009', '02jan2009', '15feb2009'],
            'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
                   1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0],
        }
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        # With dropna=False the id columns simply repeat once per pair.
        result = lreshape(df, spec, dropna=False)
        exp_data = {
            'birthdt': data['birthdt'] * 3,
            'birthwt': data['birthwt'] * 3,
            'id': data['id'] * 3,
            'sex': data['sex'] * 3,
            'visitdt': ['11jan2009', '22dec2008', '04jan2009',
                        '29dec2008', '20jan2009', '21jan2009', nan,
                        '22jan2009', '31dec2008', '03feb2009',
                        '05feb2009', nan, nan, '02jan2009',
                        '15feb2009'],
            'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan,
                   1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0,
                   4805.0],
        }
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        # Groups of unequal length (two date columns, three weights) fail.
        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)],
                'wt': ['wt%d' % i for i in range(1, 4)]}
        with pytest.raises(ValueError):
            lreshape(df, spec)
class TestWideToLong(object):
def test_simple(self):
np.random.seed(123)
x = np.random.randn(3)
df = pd.DataFrame({"A1970": {0: "a",
1: "b",
2: "c"},
"A1980": {0: "d",
1: "e",
2: "f"},
"B1970": {0: 2.5,
1: 1.2,
2: .7},
"B1980": {0: 3.2,
1: 1.3,
2: .1},
"X": dict(zip(
range(3), x))})
df["id"] = df.index
exp_data = {"X": x.tolist() + x.tolist(),
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2]}
expected = DataFrame(exp_data)
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
result = wide_to_long(df, ["A", "B"], i="id", j="year")
tm.assert_frame_equal(result, expected)
def test_stubs(self):
# GH9204
df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2']
stubs = ['inc', 'edu']
# TODO: unused?
df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa
assert stubs == ['inc', 'edu']
def test_separating_character(self):
# GH14779
np.random.seed(123)
x = np.random.randn(3)
df = pd.DataFrame({"A.1970": {0: "a",
1: "b",
2: "c"},
"A.1980": {0: "d",
1: "e",
2: "f"},
"B.1970": {0: 2.5,
1: 1.2,
2: .7},
"B.1980": {0: 3.2,
1: 1.3,
2: .1},
"X": dict(zip(
range(3), x))})
df["id"] = df.index
exp_data = {"X": x.tolist() + x.tolist(),
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2]}
expected = DataFrame(exp_data)
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
tm.assert_frame_equal(result, expected)
def test_escapable_characters(self):
np.random.seed(123)
x = np.random.randn(3)
df = pd.DataFrame({"A(quarterly)1970": {0: "a",
1: "b",
2: "c"},
"A(quarterly)1980": {0: "d",
1: "e",
2: "f"},
"B(quarterly)1970": {0: 2.5,
1: 1.2,
2: .7},
"B(quarterly)1980": {0: 3.2,
1: 1.3,
2: .1},
"X": dict(zip(
range(3), x))})
df["id"] = df.index
exp_data = {"X": x.tolist() + x.tolist(),
"A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
"B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2]}
expected = DataFrame(exp_data)
expected = expected.set_index(
['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
i="id", j="year")
tm.assert_frame_equal(result, expected)
def test_unbalanced(self):
# test that we can have a varying amount of time variables
df = pd.DataFrame({'A2010': [1.0, 2.0],
'A2011': [3.0, 4.0],
'B2010': [5.0, 6.0],
'X': ['X1', 'X2']})
df['id'] = df.index
exp_data = {'X': ['X1', 'X1', 'X2', 'X2'],
'A': [1.0, 3.0, 2.0, 4.0],
'B': [5.0, np.nan, 6.0, np.nan],
'id': [0, 0, 1, 1],
'year': [2010, 2011, 2010, 2011]}
expected = pd.DataFrame(exp_data)
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
tm.assert_frame_equal(result, expected)
def test_character_overlap(self):
# Test we handle overlapping characters in both id_vars and value_vars
df = pd.DataFrame({
'A11': ['a11', 'a22', 'a33'],
'A12': ['a21', 'a22', 'a23'],
'B11': ['b11', 'b12', 'b13'],
'B12': ['b21', 'b22', 'b23'],
'BB11': [1, 2, 3],
'BB12': [4, 5, 6],
'BBBX': [91, 92, 93],
'BBBZ': [91, 92, 93]
})
df['id'] = df.index
expected = pd.DataFrame({
'BBBX': [91, 92, 93, 91, 92, 93],
'BBBZ': [91, 92, 93, 91, 92, 93],
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
'BB': [1, 2, 3, 4, 5, 6],
'id': [0, 1, 2, 0, 1, 2],
'year': [11, 11, 11, 12, 12, 12]})
expected = expected.set_index(['id', 'year'])[
['BBBX', 'BBBZ', 'A', 'B', 'BB']]
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))
def test_invalid_separator(self):
# if an invalid separator is supplied a empty data frame is returned
sep = 'nope!'
df = pd.DataFrame({'A2010': [1.0, 2.0],
'A2011': [3.0, 4.0],
'B2010': [5.0, 6.0],
'X': ['X1', 'X2']})
df['id'] = df.index
exp_data = {'X': '',
'A2010': [],
'A2011': [],
'B2010': [],
'id': [],
'year': [],
'A': [],
'B': []}
expected = pd.DataFrame(exp_data).astype({'year': 'int'})
expected = expected.set_index(['id', 'year'])[[
'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
expected.index.set_levels([0, 1], level=0, inplace=True)
result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))
def test_num_string_disambiguation(self):
# Test that we can disambiguate number value_vars from
# string value_vars
df = pd.DataFrame({
'A11': ['a11', 'a22', 'a33'],
'A12': ['a21', 'a22', 'a23'],
'B11': ['b11', 'b12', 'b13'],
'B12': ['b21', 'b22', 'b23'],
'BB11': [1, 2, 3],
'BB12': [4, 5, 6],
'Arating': [91, 92, 93],
'Arating_old': [91, 92, 93]
})
df['id'] = df.index
expected = pd.DataFrame({
'Arating': [91, 92, 93, 91, 92, 93],
'Arating_old': [91, 92, 93, 91, 92, 93],
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
'BB': [1, 2, 3, 4, 5, 6],
'id': [0, 1, 2, 0, 1, 2],
'year': [11, 11, 11, 12, 12, 12]})
expected = expected.set_index(['id', 'year'])[
['Arating', 'Arating_old', 'A', 'B', 'BB']]
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))
def test_invalid_suffixtype(self):
# If all stubs names end with a string, but a numeric suffix is
# assumed, an empty data frame is returned
df = pd.DataFrame({'Aone': [1.0, 2.0],
'Atwo': [3.0, 4.0],
'Bone': [5.0, 6.0],
'X': ['X1', 'X2']})
df['id'] = df.index
exp_data = {'X': '',
'Aone': [],
'Atwo': [],
'Bone': [],
'id': [],
'year': [],
'A': [],
'B': []}
expected = pd.DataFrame(exp_data).astype({'year': 'int'})
expected = expected.set_index(['id', 'year'])
expected.index.set_levels([0, 1], level=0, inplace=True)
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))
def test_multiple_id_columns(self):
# Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
df = pd.DataFrame({
'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
})
expected = pd.DataFrame({
'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8,
2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
'age': [1, 2, 1, 2, 1, 2, 1, 2, 1,
2, 1, 2, 1, 2, 1, 2, 1, 2]
})
expected = expected.set_index(['famid', 'birth', 'age'])[['ht']]
result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
tm.assert_frame_equal(result, expected)
def test_non_unique_idvars(self):
# GH16382
# Raise an error message if non unique id vars (i) are passed
df = pd.DataFrame({
'A_A1': [1, 2, 3, 4, 5],
'B_B1': [1, 2, 3, 4, 5],
'x': [1, 1, 1, 1, 1]
})
with pytest.raises(ValueError):
wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')
def test_cast_j_int(self):
df = pd.DataFrame({
'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'],
'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
'actor_fb_likes_1': [1000.0, 40000.0, 11000.0],
'actor_fb_likes_2': [936.0, 5000.0, 393.0],
'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']})
expected = pd.DataFrame({
'actor': ['CCH Pounder',
'Johnny Depp',
'Christoph Waltz',
'Joel David Moore',
'Orlando Bloom',
'Rory Kinnear'],
'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
'num': [1, 1, 1, 2, 2, 2],
'title': ['Avatar',
'Pirates of the Caribbean',
'Spectre',
'Avatar',
'Pirates of the Caribbean',
'Spectre']}).set_index(['title', 'num'])
result = wide_to_long(df, ['actor', 'actor_fb_likes'],
i='title', j='num', sep='_')
tm.assert_frame_equal(result, expected)
def test_identical_stubnames(self):
df = pd.DataFrame({'A2010': [1.0, 2.0],
'A2011': [3.0, 4.0],
'B2010': [5.0, 6.0],
'A': ['X1', 'X2']})
with pytest.raises(ValueError):
wide_to_long(df, ['A', 'B'], i='A', j='colname')
def test_nonnumeric_suffix(self):
df = pd.DataFrame({'treatment_placebo': [1.0, 2.0],
'treatment_test': [3.0, 4.0],
'result_placebo': [5.0, 6.0],
'A': ['X1', 'X2']})
expected = pd.DataFrame({
'A': ['X1', 'X1', 'X2', 'X2'],
'colname': ['placebo', 'test', 'placebo', 'test'],
'result': [5.0, np.nan, 6.0, np.nan],
'treatment': [1.0, 3.0, 2.0, 4.0]})
expected = expected.set_index(['A', 'colname'])
result = wide_to_long(df, ['result', 'treatment'],
i='A', j='colname', suffix='[a-z]+', sep='_')
tm.assert_frame_equal(result, expected)
def test_mixed_type_suffix(self):
df = pd.DataFrame({
'A': ['X1', 'X2'],
'result_1': [0, 9],
'result_foo': [5.0, 6.0],
'treatment_1': [1.0, 2.0],
'treatment_foo': [3.0, 4.0]})
expected = pd.DataFrame({
'A': ['X1', 'X2', 'X1', 'X2'],
'colname': ['1', '1', 'foo', 'foo'],
'result': [0.0, 9.0, 5.0, 6.0],
'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname'])
result = wide_to_long(df, ['result', 'treatment'],
i='A', j='colname', suffix='.+', sep='_')
tm.assert_frame_equal(result, expected)
def test_float_suffix(self):
df = pd.DataFrame({
'treatment_1.1': [1.0, 2.0],
'treatment_2.1': [3.0, 4.0],
'result_1.2': [5.0, 6.0],
'result_1': [0, 9],
'A': ['X1', 'X2']})
expected = pd.DataFrame({
'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'],
'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]})
expected = expected.set_index(['A', 'colname'])
result = wide_to_long(df, ['result', 'treatment'],
i='A', j='colname', suffix='[0-9.]+', sep='_')
tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large Load Diff
@@ -1,524 +0,0 @@
# -*- coding: utf-8 -*-
# pylint: disable-msg=W0612,E1101
from warnings import catch_warnings
import pytest
from collections import OrderedDict
from pandas import DataFrame, Series
import pandas as pd
from numpy import nan
import numpy as np
from pandas.util.testing import assert_frame_equal
from pandas import get_dummies, Categorical, Index
import pandas.util.testing as tm
from pandas.compat import u
class TestGetDummies(object):
@pytest.fixture
def df(self):
    """Provide a frame with two string columns and one integer column."""
    columns = {'A': ['a', 'b', 'a'],
               'B': ['b', 'b', 'c'],
               'C': [1, 2, 3]}
    return DataFrame(columns)
@pytest.fixture(params=['uint8', 'i8', np.float64, bool, None])
def dtype(self, request):
    """Provide each parametrized dtype spec passed through ``np.dtype``."""
    spec = request.param
    return np.dtype(spec)
@pytest.fixture(params=['dense', 'sparse'])
def sparse(self, request):
    """Provide the ``sparse`` flag: True for 'sparse', False for 'dense'."""
    # params are strings to simplify reading test results,
    # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
    label = request.param
    return label == 'sparse'
def effective_dtype(self, dtype):
    """Return ``dtype`` unchanged, or ``np.uint8`` when it is None."""
    return np.uint8 if dtype is None else dtype
def test_raises_on_dtype_object(self, df):
    """``get_dummies`` must raise ValueError for ``dtype='object'``."""
    pytest.raises(ValueError, get_dummies, df, dtype='object')
def test_basic(self, sparse, dtype):
s_list = list('abc')
s_series = Series(s_list)
s_series_index = Series(s_list, list('ABC'))
expected = DataFrame({'a': [1, 0, 0],
'b': [0, 1, 0],
'c': [0, 0, 1]},
dtype=self.effective_dtype(dtype))
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
assert_frame_equal(result, expected)
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
assert_frame_equal(result, expected)
expected.index = list('ABC')
result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
assert_frame_equal(result, expected)
def test_basic_types(self, sparse, dtype):
    """Check the dtypes of the dummy columns for list, Series and frame input."""
    # GH 10531
    letters = list('abc')
    letter_series = Series(letters)
    frame = DataFrame({'a': [0, 1, 0, 1, 2],
                       'b': ['A', 'A', 'B', 'C', 'C'],
                       'c': [2, 3, 3, 3, 2]})

    expected = DataFrame({'a': [1, 0, 0],
                          'b': [0, 1, 0],
                          'c': [0, 0, 1]},
                         dtype=self.effective_dtype(dtype),
                         columns=list('abc'))
    if sparse:
        expected = expected.to_sparse(fill_value=0, kind='integer')
        compare = tm.assert_sp_frame_equal
    else:
        compare = tm.assert_frame_equal

    for data in (letters, letter_series):
        result = get_dummies(data, sparse=sparse, dtype=dtype)
        compare(result, expected)

    # Encoding every column yields eight dummy columns of the given dtype.
    result = get_dummies(frame, columns=frame.columns,
                         sparse=sparse, dtype=dtype)
    tm.assert_series_equal(result.get_dtype_counts(),
                           Series({dtype.name: 8}))

    # Encoding only 'a' keeps 'b' (object) and 'c' (int64) and adds three
    # dummy columns; the counts merge when the dummy dtype is also int64.
    result = get_dummies(frame, columns=['a'], sparse=sparse, dtype=dtype)
    dtype_name = self.effective_dtype(dtype).name
    expected_counts = {'int64': 1, 'object': 1}
    expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)
    expected = Series(expected_counts).sort_index()
    tm.assert_series_equal(result.get_dtype_counts().sort_index(),
                           expected)
def test_just_na(self, sparse):
just_na_list = [np.nan]
just_na_series = Series(just_na_list)
just_na_series_index = Series(just_na_list, index=['A'])
res_list = get_dummies(just_na_list, sparse=sparse)
res_series = get_dummies(just_na_series, sparse=sparse)
res_series_index = get_dummies(just_na_series_index, sparse=sparse)
assert res_list.empty
assert res_series.empty
assert res_series_index.empty
assert res_list.index.tolist() == [0]
assert res_series.index.tolist() == [0]
assert res_series_index.index.tolist() == ['A']
def test_include_na(self, sparse, dtype):
    """Check NaN handling with and without ``dummy_na=True``."""
    if sparse:
        pytest.xfail(reason='nan in index is problematic (GH 16894)')

    values = ['a', 'b', np.nan]

    # Without dummy_na the NaN entry is simply all zeros.
    res = get_dummies(values, sparse=sparse, dtype=dtype)
    exp = DataFrame({'a': [1, 0, 0],
                     'b': [0, 1, 0]},
                    dtype=self.effective_dtype(dtype))
    assert_frame_equal(res, exp)

    # Sparse dataframes do not allow nan labelled columns, see #GH8822
    res_na = get_dummies(values, dummy_na=True, sparse=sparse, dtype=dtype)
    exp_na = DataFrame({nan: [0, 0, 1],
                        'a': [1, 0, 0],
                        'b': [0, 1, 0]},
                       dtype=self.effective_dtype(dtype))
    exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
    # hack (NaN handling in assert_index_equal)
    exp_na.columns = res_na.columns
    assert_frame_equal(res_na, exp_na)

    # A lone NaN with dummy_na gives a single column; compare values only.
    res_just_na = get_dummies([nan], dummy_na=True,
                              sparse=sparse, dtype=dtype)
    exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                            dtype=self.effective_dtype(dtype))
    tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def test_unicode(self, sparse):
    """Non-ASCII values must produce correctly prefixed column names."""
    # See GH 6885 - get_dummies chokes on unicode values
    import unicodedata

    plain_e = 'e'
    eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
    letters = [plain_e, eacute, eacute]

    exp = DataFrame({'letter_e': [1, 0, 0],
                     u('letter_%s') % eacute: [0, 1, 1]},
                    dtype=np.uint8)

    res = get_dummies(letters, prefix='letter', sparse=sparse)
    assert_frame_equal(res, exp)
def test_dataframe_dummies_all_obj(self, df, sparse):
df = df[['A', 'B']]
result = get_dummies(df, sparse=sparse)
expected = DataFrame({'A_a': [1, 0, 1],
'A_b': [0, 1, 0],
'B_b': [1, 1, 0],
'B_c': [0, 0, 1]},
dtype=np.uint8)
assert_frame_equal(result, expected)
def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
result = get_dummies(df, sparse=sparse, dtype=dtype)
expected = DataFrame({'C': [1, 2, 3],
'A_a': [1, 0, 1],
'A_b': [0, 1, 0],
'B_b': [1, 1, 0],
'B_c': [0, 0, 1]})
cols = ['A_a', 'A_b', 'B_b', 'B_c']
expected[cols] = expected[cols].astype(dtype)
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_list(self, df, sparse):
prefixes = ['from_A', 'from_B']
result = get_dummies(df, prefix=prefixes, sparse=sparse)
expected = DataFrame({'C': [1, 2, 3],
'from_A_a': [1, 0, 1],
'from_A_b': [0, 1, 0],
'from_B_b': [1, 1, 0],
'from_B_c': [0, 0, 1]},
dtype=np.uint8)
expected[['C']] = df[['C']]
expected = expected[['C', 'from_A_a', 'from_A_b',
'from_B_b', 'from_B_c']]
assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_str(self, df, sparse):
# not that you should do this...
result = get_dummies(df, prefix='bad', sparse=sparse)
bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c']
expected = DataFrame([[1, 1, 0, 1, 0],
[2, 0, 1, 1, 0],
[3, 1, 0, 0, 1]],
columns=['C'] + bad_columns,
dtype=np.uint8)
expected = expected.astype({"C": np.int64})
assert_frame_equal(result, expected)
def test_dataframe_dummies_subset(self, df, sparse):
result = get_dummies(df, prefix=['from_A'], columns=['A'],
sparse=sparse)
expected = DataFrame({'B': ['b', 'b', 'c'],
'C': [1, 2, 3],
'from_A_a': [1, 0, 1],
'from_A_b': [0, 1, 0]}, dtype=np.uint8)
expected[['C']] = df[['C']]
assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_sep(self, df, sparse):
result = get_dummies(df, prefix_sep='..', sparse=sparse)
expected = DataFrame({'C': [1, 2, 3],
'A..a': [1, 0, 1],
'A..b': [0, 1, 0],
'B..b': [1, 1, 0],
'B..c': [0, 0, 1]},
dtype=np.uint8)
expected[['C']] = df[['C']]
expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
assert_frame_equal(result, expected)
result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse)
expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
assert_frame_equal(result, expected)
result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'},
sparse=sparse)
assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
    """A one-element prefix list for this frame must raise ValueError."""
    pytest.raises(ValueError, get_dummies, df,
                  prefix=['too few'], sparse=sparse)
def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
    """A one-element ``prefix_sep`` list for this frame must raise ValueError."""
    pytest.raises(ValueError, get_dummies, df,
                  prefix_sep=['bad'], sparse=sparse)
def test_dataframe_dummies_prefix_dict(self, sparse):
prefixes = {'A': 'from_A', 'B': 'from_B'}
df = DataFrame({'C': [1, 2, 3],
'A': ['a', 'b', 'a'],
'B': ['b', 'b', 'c']})
result = get_dummies(df, prefix=prefixes, sparse=sparse)
expected = DataFrame({'C': [1, 2, 3],
'from_A_a': [1, 0, 1],
'from_A_b': [0, 1, 0],
'from_B_b': [1, 1, 0],
'from_B_c': [0, 0, 1]})
columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
expected[columns] = expected[columns].astype(np.uint8)
assert_frame_equal(result, expected)
def test_dataframe_dummies_with_na(self, df, sparse, dtype):
df.loc[3, :] = [np.nan, np.nan, np.nan]
result = get_dummies(df, dummy_na=True,
sparse=sparse, dtype=dtype).sort_index(axis=1)
expected = DataFrame({'C': [1, 2, 3, np.nan],
'A_a': [1, 0, 1, 0],
'A_b': [0, 1, 0, 0],
'A_nan': [0, 0, 0, 1],
'B_b': [1, 1, 0, 0],
'B_c': [0, 0, 1, 0],
'B_nan': [0, 0, 0, 1]}).sort_index(axis=1)
e_dtype = self.effective_dtype(dtype)
columns = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']
expected[columns] = expected[columns].astype(e_dtype)
assert_frame_equal(result, expected)
result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
assert_frame_equal(result, expected)
def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
df['cat'] = pd.Categorical(['x', 'y', 'y'])
result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
expected = DataFrame({'C': [1, 2, 3],
'A_a': [1, 0, 1],
'A_b': [0, 1, 0],
'B_b': [1, 1, 0],
'B_c': [0, 0, 1],
'cat_x': [1, 0, 0],
'cat_y': [0, 1, 1]}).sort_index(axis=1)
columns = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']
effective_dtype = self.effective_dtype(dtype)
expected[columns] = expected[columns].astype(effective_dtype)
expected.sort_index(axis=1)
assert_frame_equal(result, expected)
def test_basic_drop_first(self, sparse):
# GH12402 Add a new parameter `drop_first` to avoid collinearity
# Basic case
s_list = list('abc')
s_series = Series(s_list)
s_series_index = Series(s_list, list('ABC'))
expected = DataFrame({'b': [0, 1, 0],
'c': [0, 0, 1]},
dtype=np.uint8)
result = get_dummies(s_list, drop_first=True, sparse=sparse)
assert_frame_equal(result, expected)
result = get_dummies(s_series, drop_first=True, sparse=sparse)
assert_frame_equal(result, expected)
expected.index = list('ABC')
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
assert_frame_equal(result, expected)
def test_basic_drop_first_one_level(self, sparse):
# Test the case that categorical variable only has one level.
s_list = list('aaa')
s_series = Series(s_list)
s_series_index = Series(s_list, list('ABC'))
expected = DataFrame(index=np.arange(3))
result = get_dummies(s_list, drop_first=True, sparse=sparse)
assert_frame_equal(result, expected)
result = get_dummies(s_series, drop_first=True, sparse=sparse)
assert_frame_equal(result, expected)
expected = DataFrame(index=list('ABC'))
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
assert_frame_equal(result, expected)
def test_basic_drop_first_NA(self, sparse):
# Test NA handling together with drop_first
s_NA = ['a', 'b', np.nan]
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
assert_frame_equal(res, exp)
res_na = get_dummies(s_NA, dummy_na=True, drop_first=True,
sparse=sparse)
exp_na = DataFrame(
{'b': [0, 1, 0],
nan: [0, 0, 1]},
dtype=np.uint8).reindex(['b', nan], axis=1)
assert_frame_equal(res_na, exp_na)
res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
sparse=sparse)
exp_just_na = DataFrame(index=np.arange(1))
assert_frame_equal(res_just_na, exp_just_na)
def test_dataframe_dummies_drop_first(self, df, sparse):
    """Check ``drop_first=True`` on the 'A' and 'B' columns of ``df``."""
    subset = df[['A', 'B']]
    expected = DataFrame({'A_b': [0, 1, 0],
                          'B_c': [0, 0, 1]},
                         dtype=np.uint8)
    encoded = get_dummies(subset, drop_first=True, sparse=sparse)
    assert_frame_equal(encoded, expected)
def test_dataframe_dummies_drop_first_with_categorical(
        self, df, sparse, dtype):
    """Check ``drop_first=True`` when ``df`` also has a categorical column."""
    df['cat'] = pd.Categorical(['x', 'y', 'y'])
    encoded = get_dummies(df, drop_first=True, sparse=sparse)

    dummy_cols = ['A_b', 'B_c', 'cat_y']
    expected = DataFrame({'C': [1, 2, 3],
                          'A_b': [0, 1, 0],
                          'B_c': [0, 0, 1],
                          'cat_y': [0, 1, 1]})
    # Only the indicator columns are cast; 'C' keeps its own dtype.
    expected[dummy_cols] = expected[dummy_cols].astype(np.uint8)
    expected = expected[['C'] + dummy_cols]
    assert_frame_equal(encoded, expected)
def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
    """Check ``drop_first=True`` with and without ``dummy_na`` on a frame.

    A fourth, all-NaN row is appended to ``df`` before encoding.
    """
    df.loc[3, :] = [np.nan, np.nan, np.nan]

    dummy_cols = ['A_b', 'A_nan', 'B_c', 'B_nan']
    expected = DataFrame({'C': [1, 2, 3, np.nan],
                          'A_b': [0, 1, 0, 0],
                          'A_nan': [0, 0, 0, 1],
                          'B_c': [0, 0, 1, 0],
                          'B_nan': [0, 0, 0, 1]})
    expected[dummy_cols] = expected[dummy_cols].astype(np.uint8)
    expected = expected.sort_index(axis=1)

    # Column order is normalised on both sides before comparing.
    encoded = get_dummies(df, dummy_na=True, drop_first=True,
                          sparse=sparse)
    encoded = encoded.sort_index(axis=1)
    assert_frame_equal(encoded, expected)

    encoded = get_dummies(df, dummy_na=False, drop_first=True,
                          sparse=sparse)
    assert_frame_equal(encoded, expected[['C', 'A_b', 'B_c']])
def test_int_int(self):
    """Check dummy column labels for an integer and a categorical Series."""
    indicator_rows = [[1, 0],
                      [0, 1],
                      [1, 0]]
    cases = [
        (Series([1, 2, 1]), [1, 2]),
        (Series(pd.Categorical(['a', 'b', 'a'])),
         pd.Categorical(['a', 'b'])),
    ]
    for data, column_labels in cases:
        encoded = pd.get_dummies(data)
        expected = DataFrame(indicator_rows,
                             columns=column_labels,
                             dtype=np.uint8)
        tm.assert_frame_equal(encoded, expected)
def test_int_df(self, dtype):
    """Check ``columns=['A', 'B']`` encodes only those columns of a frame.

    'C' and 'D' are expected unchanged, followed by the indicator
    columns cast to ``dtype``.
    """
    frame = DataFrame(
        {'A': [1, 2, 1],
         'B': pd.Categorical(['a', 'b', 'a']),
         'C': [1, 2, 1],
         'D': [1., 2., 1.]
         }
    )
    untouched = ['C', 'D']
    indicators = ['A_1', 'A_2', 'B_a', 'B_b']
    expected = DataFrame([
        [1, 1., 1, 0, 1, 0],
        [2, 2., 0, 1, 0, 1],
        [1, 1., 1, 0, 1, 0]
    ], columns=untouched + indicators)
    expected[indicators] = expected[indicators].astype(dtype)

    encoded = pd.get_dummies(frame, columns=['A', 'B'], dtype=dtype)
    tm.assert_frame_equal(encoded, expected)
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
    """Check the dummy columns form a CategoricalIndex (GH13854).

    Run for both unordered and ordered input; the unused category 'z'
    is expected as an all-zero column.
    """
    for is_ordered in (False, True):
        cat = pd.Categorical(list("xy"), categories=list("xyz"),
                             ordered=is_ordered)
        encoded = get_dummies(cat, dtype=dtype)

        indicator = np.array([[1, 0, 0], [0, 1, 0]],
                             dtype=self.effective_dtype(dtype))
        column_index = pd.CategoricalIndex(cat.categories,
                                           categories=cat.categories,
                                           ordered=is_ordered)
        expected = DataFrame(indicator, columns=column_index,
                             dtype=self.effective_dtype(dtype))

        tm.assert_frame_equal(encoded, expected)
@pytest.mark.parametrize('sparse', [True, False])
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
    """Check the untouched 'GDP' column survives encoding intact (GH18914).

    Selecting 'GDP' and reindexing to 'GDP' must give equal frames.
    """
    raw = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]),
                                           ('Nation', ['AB', 'CD'])]))
    encoded = get_dummies(raw, columns=['Nation'], sparse=sparse)
    reindexed = encoded.reindex(columns=['GDP'])

    tm.assert_frame_equal(encoded[['GDP']], reindexed)
def test_get_dummies_duplicate_columns(self, df):
    """Check encoding a frame whose columns are all named 'A' (GH20839)."""
    df.columns = ["A", "A", "A"]
    encoded = get_dummies(df).sort_index(axis=1)

    expected_rows = [[1, 1, 0, 1, 0],
                     [2, 0, 1, 1, 0],
                     [3, 1, 0, 0, 1]]
    expected = DataFrame(expected_rows,
                         columns=['A', 'A_a', 'A_b', 'A_b', 'A_c'],
                         dtype=np.uint8)
    # The pass-through 'A' column is integer, not an indicator.
    expected = expected.sort_index(axis=1).astype({"A": np.int64})

    tm.assert_frame_equal(encoded, expected)
class TestCategoricalReshape(object):
    """Reshaping of categorical data derived from a Panel."""

    def test_reshaping_panel_categorical(self):
        """Unstack a categorical column built from a Panel's frame."""
        # Warnings raised while building the Panel are recorded, not shown.
        with catch_warnings(record=True):
            panel = tm.makePanel()
            panel['str'] = 'foo'
            frame = panel.to_frame()

        frame['category'] = frame['str'].astype('category')
        unstacked = frame['category'].unstack()

        foo_column = Categorical(['foo'] * len(panel.major_axis))
        minor_labels = Index(list('ABCD'), name='minor')
        major_labels = panel.major_axis.set_names('major')
        expected = DataFrame({'A': foo_column.copy(),
                              'B': foo_column.copy(),
                              'C': foo_column.copy(),
                              'D': foo_column.copy()},
                             columns=minor_labels,
                             index=major_labels)
        tm.assert_frame_equal(unstacked, expected)
class TestMakeAxisDummies(object):
    """Tests for ``make_axis_dummies``."""

    def test_preserve_categorical_dtype(self):
        """Check the result columns are the CategoricalIndex level (GH13854).

        Run for unordered and ordered levels, with and without an
        identity ``transform``.
        """
        from pandas.core.reshape.reshape import make_axis_dummies

        for is_ordered in (False, True):
            level = pd.CategoricalIndex(list("xyz"), ordered=is_ordered)
            row_index = pd.MultiIndex(levels=[['a'], level],
                                      labels=[[0, 0], [0, 1]])
            frame = DataFrame([[10, 11]], index=row_index)

            expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
                                 index=row_index, columns=level)

            dummies = make_axis_dummies(frame)
            tm.assert_frame_equal(dummies, expected)

            dummies = make_axis_dummies(frame, transform=lambda x: x)
            tm.assert_frame_equal(dummies, expected)
@@ -1,607 +0,0 @@
import os
import pytest
import numpy as np
from pandas.compat import zip
import pandas as pd
from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index,
Timestamp, Interval, IntervalIndex, Categorical,
cut, qcut, date_range, NaT, TimedeltaIndex)
from pandas.tseries.offsets import Nano, Day
import pandas.util.testing as tm
from pandas.api.types import CategoricalDtype as CDT
from pandas.core.algorithms import quantile
import pandas.core.reshape.tile as tmod
class TestCut(object):
def test_simple(self):
data = np.ones(5, dtype='int64')
result = cut(data, 4, labels=False)
expected = np.array([1, 1, 1, 1, 1])
tm.assert_numpy_array_equal(result, expected,
check_dtype=False)
def test_bins(self):
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1])
result, bins = cut(data, 3, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3))
intervals = intervals.take([0, 0, 0, 1, 2, 0])
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
6.53333333, 9.7]))
def test_right(self):
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
result, bins = cut(data, 4, right=True, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3))
expected = Categorical(intervals, ordered=True)
expected = expected.take([0, 0, 0, 2, 3, 0, 0])
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95,
7.325, 9.7]))
def test_noright(self):
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
result, bins = cut(data, 4, right=False, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3), closed='left')
intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95,
7.325, 9.7095]))
def test_arraylike(self):
data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
result, bins = cut(data, 3, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3))
intervals = intervals.take([0, 0, 0, 1, 2, 0])
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
6.53333333, 9.7]))
def test_bins_from_intervalindex(self):
c = cut(range(5), 3)
expected = c
result = cut(range(5), bins=expected.categories)
tm.assert_categorical_equal(result, expected)
expected = Categorical.from_codes(np.append(c.codes, -1),
categories=c.categories,
ordered=True)
result = cut(range(6), bins=expected.categories)
tm.assert_categorical_equal(result, expected)
# doc example
# make sure we preserve the bins
ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
c = cut(ages, bins=[0, 18, 35, 70])
expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
tm.assert_index_equal(c.categories, expected)
result = cut([25, 20, 50], bins=c.categories)
tm.assert_index_equal(result.categories, expected)
tm.assert_numpy_array_equal(result.codes,
np.array([1, 1, 2], dtype='int8'))
def test_bins_not_monotonic(self):
    """Bin edges that are not increasing must raise ValueError."""
    values = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
    with pytest.raises(ValueError):
        cut(values, [0.1, 1.5, 1, 10])
def test_wrong_num_labels(self):
    """Three labels for two bins must raise ValueError."""
    values = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
    with pytest.raises(ValueError):
        cut(values, [0, 1, 10], labels=['foo', 'bar', 'baz'])
def test_cut_corner(self):
    """Corner cases that must raise ValueError."""
    # empty input
    with pytest.raises(ValueError):
        cut([], 2)

    # fractional bin count
    with pytest.raises(ValueError):
        cut([1, 2, 3], 0.5)
@pytest.mark.parametrize('arg', [2, np.eye(2), DataFrame(np.eye(2))])
@pytest.mark.parametrize('cut_func', [cut, qcut])
def test_cut_not_1d_arg(self, arg, cut_func):
    """Scalar and 2-D inputs must raise ValueError for cut and qcut."""
    pytest.raises(ValueError, cut_func, arg, 2)
def test_cut_out_of_range_more(self):
# #1511
s = Series([0, -1, 0, 1, -3], name='x')
ind = cut(s, [0, 1], labels=False)
exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name='x')
tm.assert_series_equal(ind, exp)
def test_labels(self):
arr = np.tile(np.arange(0, 1.01, 0.1), 4)
result, bins = cut(arr, 4, retbins=True)
ex_levels = IntervalIndex.from_breaks([-1e-3, 0.25, 0.5, 0.75, 1])
tm.assert_index_equal(result.categories, ex_levels)
result, bins = cut(arr, 4, retbins=True, right=False)
ex_levels = IntervalIndex.from_breaks([0, 0.25, 0.5, 0.75, 1 + 1e-3],
closed='left')
tm.assert_index_equal(result.categories, ex_levels)
def test_cut_pass_series_name_to_factor(self):
s = Series(np.random.randn(100), name='foo')
factor = cut(s, 4)
assert factor.name == 'foo'
def test_label_precision(self):
arr = np.arange(0, 0.73, 0.01)
result = cut(arr, 4, precision=2)
ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36,
0.54, 0.72])
tm.assert_index_equal(result.categories, ex_levels)
def test_na_handling(self):
arr = np.arange(0, 0.75, 0.01)
arr[::3] = np.nan
result = cut(arr, 4)
result_arr = np.asarray(result)
ex_arr = np.where(isna(arr), np.nan, result_arr)
tm.assert_almost_equal(result_arr, ex_arr)
result = cut(arr, 4, labels=False)
ex_result = np.where(isna(arr), np.nan, result)
tm.assert_almost_equal(result, ex_result)
def test_inf_handling(self):
data = np.arange(6)
data_ser = Series(data, dtype='int64')
bins = [-np.inf, 2, 4, np.inf]
result = cut(data, bins)
result_ser = cut(data_ser, bins)
ex_uniques = IntervalIndex.from_breaks(bins)
tm.assert_index_equal(result.categories, ex_uniques)
assert result[5] == Interval(4, np.inf)
assert result[0] == Interval(-np.inf, 2)
assert result_ser[5] == Interval(4, np.inf)
assert result_ser[0] == Interval(-np.inf, 2)
def test_qcut(self):
    """Quartile ``qcut`` agrees with ``cut`` on explicitly computed edges."""
    values = np.random.randn(1000)

    # The bins are stored as an Index that has been rounded, so the
    # edge comparison below uses a tolerance.
    binned, _ = qcut(values, 4, retbins=True)
    quartiles = quantile(values, [0, .25, .5, .75, 1.])

    left_edges = binned.categories.left.values
    assert np.allclose(left_edges, quartiles[:-1], atol=1e-2)

    right_edges = binned.categories.right.values
    assert np.allclose(right_edges, quartiles[1:], atol=1e-2)

    expected = cut(values, quartiles, include_lowest=True)
    tm.assert_categorical_equal(binned, expected)
def test_qcut_bounds(self):
arr = np.random.randn(1000)
factor = qcut(arr, 10, labels=False)
assert len(np.unique(factor)) == 10
def test_qcut_specify_quantiles(self):
arr = np.random.randn(100)
factor = qcut(arr, [0, .25, .5, .75, 1.])
expected = qcut(arr, 4)
tm.assert_categorical_equal(factor, expected)
def test_qcut_all_bins_same(self):
    """All-identical data must raise ValueError about non-unique edges."""
    zeros = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    with tm.assert_raises_regex(ValueError, "edges.*unique"):
        qcut(zeros, 3)
def test_cut_out_of_bounds(self):
arr = np.random.randn(100)
result = cut(arr, [-1, 0, 1])
mask = isna(result)
ex_mask = (arr < -1) | (arr > 1)
tm.assert_numpy_array_equal(mask, ex_mask)
def test_cut_pass_labels(self):
arr = [50, 5, 10, 15, 20, 30, 70]
bins = [0, 25, 50, 100]
labels = ['Small', 'Medium', 'Large']
result = cut(arr, bins, labels=labels)
exp = Categorical(['Medium'] + 4 * ['Small'] + ['Medium', 'Large'],
categories=labels,
ordered=True)
tm.assert_categorical_equal(result, exp)
result = cut(arr, bins, labels=Categorical.from_codes([0, 1, 2],
labels))
exp = Categorical.from_codes([1] + 4 * [0] + [1, 2], labels)
tm.assert_categorical_equal(result, exp)
# issue 16459
labels = ['Good', 'Medium', 'Bad']
result = cut(arr, 3, labels=labels)
exp = cut(arr, 3, labels=Categorical(labels, categories=labels,
ordered=True))
tm.assert_categorical_equal(result, exp)
def test_qcut_include_lowest(self):
values = np.arange(10)
ii = qcut(values, 4)
ex_levels = IntervalIndex(
[Interval(-0.001, 2.25),
Interval(2.25, 4.5),
Interval(4.5, 6.75),
Interval(6.75, 9)])
tm.assert_index_equal(ii.categories, ex_levels)
def test_qcut_nas(self):
arr = np.random.randn(100)
arr[:20] = np.nan
result = qcut(arr, 4)
assert isna(result[:20]).all()
def test_qcut_index(self):
result = qcut([0, 2], 2)
intervals = [Interval(-0.001, 1), Interval(1, 2)]
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
def test_round_frac(self):
# it works
result = cut(np.arange(11.), 2)
result = cut(np.arange(11.) / 1e10, 2)
# #1979, negative numbers
result = tmod._round_frac(-117.9998, precision=3)
assert result == -118
result = tmod._round_frac(117.9998, precision=3)
assert result == 118
result = tmod._round_frac(117.9998, precision=2)
assert result == 118
result = tmod._round_frac(0.000123456, precision=2)
assert result == 0.00012
def test_qcut_binning_issues(self, datapath):
    """Bins from the ``cut_data.csv`` fixture are non-empty and ordered.

    Regression test for #1978 and #1979.
    """
    cut_file = datapath(os.path.join('reshape', 'data', 'cut_data.csv'))
    values = np.loadtxt(cut_file)

    binned = qcut(values, 20)

    lefts = []
    rights = []
    for interval in np.unique(binned):
        left = interval.left
        right = interval.right
        # No bin may collapse to a single point.
        assert left != right

        lefts.append(float(left))
        rights.append(float(right))

    left_pairs = zip(lefts[:-1], lefts[1:])
    right_pairs = zip(rights[:-1], rights[1:])
    for (left_prev, left_next), (right_prev, right_next) in zip(
            left_pairs, right_pairs):
        # Edges strictly increase and consecutive bins do not overlap.
        assert left_prev < left_next
        assert right_prev < right_next
        assert right_prev <= left_next
def test_cut_return_intervals(self):
s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
res = cut(s, 3)
exp_bins = np.linspace(0, 8, num=4).round(3)
exp_bins[0] -= 0.008
exp = Series(IntervalIndex.from_breaks(exp_bins, closed='right').take(
[0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True))
tm.assert_series_equal(res, exp)
def test_qcut_return_intervals(self):
s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
res = qcut(s, [0, 0.333, 0.666, 1])
exp_levels = np.array([Interval(-0.001, 2.664),
Interval(2.664, 5.328), Interval(5.328, 8)])
exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
CDT(ordered=True))
tm.assert_series_equal(res, exp)
def test_series_retbins(self):
# GH 8589
s = Series(np.arange(4))
result, bins = cut(s, 2, retbins=True)
expected = Series(IntervalIndex.from_breaks(
[-0.003, 1.5, 3], closed='right').repeat(2)).astype(
CDT(ordered=True))
tm.assert_series_equal(result, expected)
result, bins = qcut(s, 2, retbins=True)
expected = Series(IntervalIndex.from_breaks(
[-0.001, 1.5, 3], closed='right').repeat(2)).astype(
CDT(ordered=True))
tm.assert_series_equal(result, expected)
def test_cut_duplicates_bin(self):
    """Check the ``duplicates`` option of ``cut`` (issue 20947)."""
    values = Series(np.array([1, 3, 5, 7, 9]),
                    index=["a", "b", "c", "d", "e"])
    edges = [0, 2, 4, 6, 10, 10]

    # 'drop' behaves like cutting on the de-duplicated edges.
    dropped = cut(values, edges, duplicates='drop')
    expected = cut(values, pd.unique(edges))
    tm.assert_series_equal(dropped, expected)

    # The default and the explicit 'raise' both reject duplicate edges.
    with pytest.raises(ValueError):
        cut(values, edges)
    with pytest.raises(ValueError):
        cut(values, edges, duplicates='raise')

    # invalid
    with pytest.raises(ValueError):
        cut(values, edges, duplicates='foo')
def test_qcut_duplicates_bin(self):
    """Check the ``duplicates`` option of ``qcut`` (GH 7751)."""
    values = [0, 0, 0, 0, 1, 2, 3]

    dropped = qcut(values, 3, duplicates='drop')
    expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
    tm.assert_index_equal(dropped.categories, expected)

    # The default and the explicit 'raise' both reject duplicate edges.
    with pytest.raises(ValueError):
        qcut(values, 3)
    with pytest.raises(ValueError):
        qcut(values, 3, duplicates='raise')

    # invalid
    with pytest.raises(ValueError):
        qcut(values, 3, duplicates='foo')
def test_single_quantile(self):
# issue 15431
expected = Series([0, 0])
s = Series([9., 9.])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(8.999, 9.0),
Interval(8.999, 9.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
s = Series([-9., -9.])
expected = Series([0, 0])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(-9.001, -9.0),
Interval(-9.001, -9.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
s = Series([0., 0.])
expected = Series([0, 0])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(-0.001, 0.0),
Interval(-0.001, 0.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
s = Series([9])
expected = Series([0])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(8.999, 9.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
s = Series([-9])
expected = Series([0])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(-9.001, -9.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
s = Series([0])
expected = Series([0])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(-0.001, 0.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
def test_single_bin(self):
# issue 14652
expected = Series([0, 0])
s = Series([9., 9.])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
s = Series([-9., -9.])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
expected = Series([0])
s = Series([9])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
s = Series([-9])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
# issue 15428
expected = Series([0, 0])
s = Series([0., 0.])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
expected = Series([0])
s = Series([0])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "array_1_writeable, array_2_writeable",
    [(True, True), (True, False), (False, False)])
def test_cut_read_only(self, array_1_writeable, array_2_writeable):
    """Read-only bin arrays give the same result as writeable ones.

    Regression test for issue 18773.
    """
    first_edges = np.arange(0, 100, 10)
    second_edges = np.arange(0, 100, 10)
    first_edges.flags.writeable = array_1_writeable
    second_edges.flags.writeable = array_2_writeable

    values = np.arange(100)
    tm.assert_categorical_equal(cut(values, first_edges),
                                cut(values, second_edges))
class TestDatelike(object):
@pytest.mark.parametrize('s', [
    Series(DatetimeIndex(['20180101', NaT, '20180103'])),
    Series(TimedeltaIndex(['0 days', NaT, '2 days']))],
    ids=lambda x: str(x.dtype))
def test_qcut_nat(self, s):
    """``qcut`` keeps NaT as a missing entry (GH 19768)."""
    first, last = s[0], s[2]
    midpoint = last - Day()
    # The lowest edge is expected one nanosecond below the first value.
    intervals = IntervalIndex.from_tuples(
        [(first - Nano(), midpoint), np.nan, (midpoint, last)])
    expected = Series(Categorical(intervals, ordered=True))

    binned = qcut(s, 2)
    tm.assert_series_equal(binned, expected)
def test_datetime_cut(self):
# GH 14714
# testing for time data to be present as series
data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03']))
result, bins = cut(data, 3, retbins=True)
expected = (
Series(IntervalIndex([
Interval(Timestamp('2012-12-31 23:57:07.200000'),
Timestamp('2013-01-01 16:00:00')),
Interval(Timestamp('2013-01-01 16:00:00'),
Timestamp('2013-01-02 08:00:00')),
Interval(Timestamp('2013-01-02 08:00:00'),
Timestamp('2013-01-03 00:00:00'))]))
.astype(CDT(ordered=True)))
tm.assert_series_equal(result, expected)
# testing for time data to be present as list
data = [np.datetime64('2013-01-01'), np.datetime64('2013-01-02'),
np.datetime64('2013-01-03')]
result, bins = cut(data, 3, retbins=True)
tm.assert_series_equal(Series(result), expected)
# testing for time data to be present as ndarray
data = np.array([np.datetime64('2013-01-01'),
np.datetime64('2013-01-02'),
np.datetime64('2013-01-03')])
result, bins = cut(data, 3, retbins=True)
tm.assert_series_equal(Series(result), expected)
# testing for time data to be present as datetime index
data = DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03'])
result, bins = cut(data, 3, retbins=True)
tm.assert_series_equal(Series(result), expected)
@pytest.mark.parametrize('bins', [
    3, [Timestamp('2013-01-01 04:57:07.200000'),
        Timestamp('2013-01-01 21:00:00'),
        Timestamp('2013-01-02 13:00:00'),
        Timestamp('2013-01-03 05:00:00')]])
@pytest.mark.parametrize('box', [list, np.array, Index, Series])
def test_datetimetz_cut(self, bins, box):
    """``cut`` on tz-aware data, by bin count or explicit edges (GH 19872).

    Explicit edges are first wrapped in ``box``.
    """
    tz = 'US/Eastern'
    values = Series(date_range('20130101', periods=3, tz=tz))

    # An integer bin count is passed through unchanged.
    bins = bins if isinstance(bins, int) else box(bins)
    binned = cut(values, bins)

    edges = [Timestamp('2012-12-31 23:57:07.200000', tz=tz),
             Timestamp('2013-01-01 16:00:00', tz=tz),
             Timestamp('2013-01-02 08:00:00', tz=tz),
             Timestamp('2013-01-03 00:00:00', tz=tz)]
    intervals = IntervalIndex([Interval(edges[0], edges[1]),
                               Interval(edges[1], edges[2]),
                               Interval(edges[2], edges[3])])
    expected = Series(intervals).astype(CDT(ordered=True))
    tm.assert_series_equal(binned, expected)
@pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)])
def test_datetimetz_qcut(self, bins):
    """``qcut`` on tz-aware data, by count or by fractions (GH 19872)."""
    tz = 'US/Eastern'
    values = Series(date_range('20130101', periods=3, tz=tz))
    binned = qcut(values, bins)

    # The lowest edge is one nanosecond before the first timestamp.
    edges = [Timestamp('2012-12-31 23:59:59.999999999', tz=tz),
             Timestamp('2013-01-01 16:00:00', tz=tz),
             Timestamp('2013-01-02 08:00:00', tz=tz),
             Timestamp('2013-01-03 00:00:00', tz=tz)]
    intervals = IntervalIndex([Interval(edges[0], edges[1]),
                               Interval(edges[1], edges[2]),
                               Interval(edges[2], edges[3])])
    expected = Series(intervals).astype(CDT(ordered=True))
    tm.assert_series_equal(binned, expected)
def test_datetime_bin(self):
data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')]
bin_data = ['2012-12-12', '2012-12-14', '2012-12-16']
expected = (
Series(IntervalIndex([
Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))]))
.astype(CDT(ordered=True)))
for conv in [Timestamp, Timestamp, np.datetime64]:
bins = [conv(v) for v in bin_data]
result = cut(data, bins=bins)
tm.assert_series_equal(Series(result), expected)
bin_pydatetime = [Timestamp(v).to_pydatetime() for v in bin_data]
result = cut(data, bins=bin_pydatetime)
tm.assert_series_equal(Series(result), expected)
bins = to_datetime(bin_data)
result = cut(data, bins=bin_pydatetime)
tm.assert_series_equal(Series(result), expected)
def test_datetime_nan(self):
    """Numeric bins on datetime data raise; out-of-range dates are NaN."""
    with pytest.raises(ValueError):
        cut(date_range('20130101', periods=3), bins=[0, 2, 4])

    binned = cut(date_range('20130102', periods=5),
                 bins=date_range('20130101', periods=2))

    # The single category itself is not missing ...
    category_missing = binned.categories.isna()
    tm.assert_numpy_array_equal(category_missing, np.array([False]))

    # ... but only the first date falls inside it.
    value_missing = binned.isna()
    tm.assert_numpy_array_equal(
        value_missing, np.array([False, True, True, True, True]))
@@ -1,345 +0,0 @@
import pytest
import numpy as np
import pandas as pd
from pandas import Categorical, Series, CategoricalIndex
from pandas.core.dtypes.concat import union_categoricals
from pandas.util import testing as tm
class TestUnionCategoricals(object):
def test_union_categorical(self):
    """Basic ``union_categoricals`` behaviour (GH 13361).

    Covers several value types and container boxes, category ordering
    by appearance, ordered inputs, mismatched dtypes and empty input.
    """
    # (first values, second values, expected combined values)
    cases = [
        (list('abc'), list('abd'), list('abcabd')),
        ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
        ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),

        (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
         ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),

        (pd.date_range('2014-01-01', '2014-01-05'),
         pd.date_range('2014-01-06', '2014-01-07'),
         pd.date_range('2014-01-01', '2014-01-07')),

        (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
         pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
         pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),

        (pd.period_range('2014-01-01', '2014-01-05'),
         pd.period_range('2014-01-06', '2014-01-07'),
         pd.period_range('2014-01-01', '2014-01-07')),
    ]

    for first, second, combined in cases:
        for box in (Categorical, CategoricalIndex, Series):
            boxed = [box(Categorical(first)), box(Categorical(second))]
            result = union_categoricals(boxed)
            expected = Categorical(combined)
            tm.assert_categorical_equal(result, expected,
                                        check_category_order=True)

    # new categories ordered by appearance
    left = Categorical(['x', 'y', 'z'])
    right = Categorical(['a', 'b', 'c'])
    result = union_categoricals([left, right])
    expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                           categories=['x', 'y', 'z', 'a', 'b', 'c'])
    tm.assert_categorical_equal(result, expected)

    left = Categorical([0, 1.2, 2], ordered=True)
    right = Categorical([0, 1.2, 2], ordered=True)
    result = union_categoricals([left, right])
    expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
    tm.assert_categorical_equal(result, expected)

    # must exactly match types
    left = Categorical([0, 1.2, 2])
    right = Categorical([2, 3, 4])
    msg = 'dtype of categories must be the same'
    with tm.assert_raises_regex(TypeError, msg):
        union_categoricals([left, right])

    msg = 'No Categoricals to union'
    with tm.assert_raises_regex(ValueError, msg):
        union_categoricals([])
def test_union_categoricals_nan(self):
# GH 13759
res = union_categoricals([pd.Categorical([1, 2, np.nan]),
pd.Categorical([3, 2, np.nan])])
exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
tm.assert_categorical_equal(res, exp)
res = union_categoricals([pd.Categorical(['A', 'B']),
pd.Categorical(['B', 'B', np.nan])])
exp = Categorical(['A', 'B', 'B', 'B', np.nan])
tm.assert_categorical_equal(res, exp)
val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'),
pd.NaT]
val2 = [pd.NaT, pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-02-01')]
res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
exp = Categorical(val1 + val2,
categories=[pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-03-01'),
pd.Timestamp('2011-02-01')])
tm.assert_categorical_equal(res, exp)
# all NaN
res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan],
dtype=object)),
pd.Categorical(['X'])])
exp = Categorical([np.nan, np.nan, 'X'])
tm.assert_categorical_equal(res, exp)
res = union_categoricals([pd.Categorical([np.nan, np.nan]),
pd.Categorical([np.nan, np.nan])])
exp = Categorical([np.nan, np.nan, np.nan, np.nan])
tm.assert_categorical_equal(res, exp)
def test_union_categoricals_empty(self):
# GH 13759
res = union_categoricals([pd.Categorical([]),
pd.Categorical([])])
exp = Categorical([])
tm.assert_categorical_equal(res, exp)
res = union_categoricals([Categorical([]),
Categorical(['1'])])
exp = Categorical(['1'])
tm.assert_categorical_equal(res, exp)
def test_union_categorical_same_category(self):
# check fastpath
c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
res = union_categoricals([c1, c2])
exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
categories=[1, 2, 3, 4])
tm.assert_categorical_equal(res, exp)
c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
res = union_categoricals([c1, c2])
exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
categories=['x', 'y', 'z'])
tm.assert_categorical_equal(res, exp)
def test_union_categorical_same_categories_different_order(self):
# https://github.com/pandas-dev/pandas/issues/19096
c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])
c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])
result = union_categoricals([c1, c2])
expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
categories=['a', 'b', 'c'])
tm.assert_categorical_equal(result, expected)
def test_union_categoricals_ordered(self):
    """Rules for unioning ordered Categoricals."""
    ordered = Categorical([1, 2, 3], ordered=True)
    unordered = Categorical([1, 2, 3], ordered=False)

    # Mixing ordered with unordered is rejected.
    msg = 'Categorical.ordered must be the same'
    with tm.assert_raises_regex(TypeError, msg):
        union_categoricals([ordered, unordered])

    result = union_categoricals([ordered, ordered])
    expected = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
    tm.assert_categorical_equal(result, expected)

    # Same ordered categories, one input containing NaN.
    with_nan = Categorical([1, 2, 3, np.nan], ordered=True)
    partial = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
    result = union_categoricals([with_nan, partial])
    expected = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
    tm.assert_categorical_equal(result, expected)

    # Ordered inputs whose category order differs are rejected.
    ascending = Categorical([1, 2, 3], ordered=True)
    descending = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
    msg = "to union ordered Categoricals, all categories must be the same"
    with tm.assert_raises_regex(TypeError, msg):
        union_categoricals([ascending, descending])
def test_union_categoricals_ignore_order(self):
    """Check the ``ignore_order`` option of ``union_categoricals`` (GH 15219)."""
    ordered = Categorical([1, 2, 3], ordered=True)
    unordered = Categorical([1, 2, 3], ordered=False)

    # Ordered + unordered is allowed only when order is ignored.
    result = union_categoricals([ordered, unordered], ignore_order=True)
    tm.assert_categorical_equal(result, Categorical([1, 2, 3, 1, 2, 3]))

    msg = 'Categorical.ordered must be the same'
    with tm.assert_raises_regex(TypeError, msg):
        union_categoricals([ordered, unordered], ignore_order=False)

    # Ordered + ordered: the result is unordered when order is ignored.
    result = union_categoricals([ordered, ordered], ignore_order=True)
    tm.assert_categorical_equal(result, Categorical([1, 2, 3, 1, 2, 3]))

    result = union_categoricals([ordered, ordered], ignore_order=False)
    expected = Categorical([1, 2, 3, 1, 2, 3],
                           categories=[1, 2, 3], ordered=True)
    tm.assert_categorical_equal(result, expected)

    with_nan = Categorical([1, 2, 3, np.nan], ordered=True)
    partial = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
    result = union_categoricals([with_nan, partial], ignore_order=True)
    tm.assert_categorical_equal(
        result, Categorical([1, 2, 3, np.nan, 3, 2]))

    # Differently ordered categories are accepted when order is ignored.
    ascending = Categorical([1, 2, 3], ordered=True)
    descending = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
    result = union_categoricals([ascending, descending], ignore_order=True)
    tm.assert_categorical_equal(result, Categorical([1, 2, 3, 1, 2, 3]))

    result = union_categoricals([descending, ascending], ignore_order=True,
                                sort_categories=True)
    expected = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
    tm.assert_categorical_equal(result, expected)

    # Disjoint ordered categories: accepted only when order is ignored.
    low = Categorical([1, 2, 3], ordered=True)
    high = Categorical([4, 5, 6], ordered=True)
    result = union_categoricals([low, high], ignore_order=True)
    tm.assert_categorical_equal(result, Categorical([1, 2, 3, 4, 5, 6]))

    msg = "to union ordered Categoricals, all categories must be the same"
    with tm.assert_raises_regex(TypeError, msg):
        union_categoricals([low, high], ignore_order=False)

    with tm.assert_raises_regex(TypeError, msg):
        union_categoricals([low, high])
def test_union_categoricals_sort(self):
    """Check ``sort_categories=True`` sorts the result categories (GH 13846)."""
    # (first input, second input, expected union)
    cases = [
        (Categorical(['x', 'y', 'z']),
         Categorical(['a', 'b', 'c']),
         Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                     categories=['a', 'b', 'c', 'x', 'y', 'z'])),

        # fastpath
        (Categorical(['a', 'b'], categories=['b', 'a', 'c']),
         Categorical(['b', 'c'], categories=['b', 'a', 'c']),
         Categorical(['a', 'b', 'b', 'c'],
                     categories=['a', 'b', 'c'])),

        (Categorical(['a', 'b'], categories=['c', 'a', 'b']),
         Categorical(['b', 'c'], categories=['c', 'a', 'b']),
         Categorical(['a', 'b', 'b', 'c'],
                     categories=['a', 'b', 'c'])),

        # fastpath - skip resort
        (Categorical(['a', 'b'], categories=['a', 'b', 'c']),
         Categorical(['b', 'c'], categories=['a', 'b', 'c']),
         Categorical(['a', 'b', 'b', 'c'],
                     categories=['a', 'b', 'c'])),

        (Categorical(['x', np.nan]),
         Categorical([np.nan, 'b']),
         Categorical(['x', np.nan, np.nan, 'b'],
                     categories=['b', 'x'])),

        (Categorical([np.nan]),
         Categorical([np.nan]),
         Categorical([np.nan, np.nan])),

        (Categorical([]),
         Categorical([]),
         Categorical([])),
    ]
    for first, second, expected in cases:
        result = union_categoricals([first, second], sort_categories=True)
        tm.assert_categorical_equal(result, expected)

    # Sorting the categories of ordered inputs is rejected.
    first = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
    second = Categorical(['a', 'c'], categories=['b', 'a', 'c'],
                         ordered=True)
    with pytest.raises(TypeError):
        union_categoricals([first, second], sort_categories=True)
def test_union_categoricals_sort_false(self):
# GH 13846
c1 = Categorical(['x', 'y', 'z'])
c2 = Categorical(['a', 'b', 'c'])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
categories=['x', 'y', 'z', 'a', 'b', 'c'])
tm.assert_categorical_equal(result, expected)
# fastpath
c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(['a', 'b', 'b', 'c'],
categories=['b', 'a', 'c'])
tm.assert_categorical_equal(result, expected)
# fastpath - skip resort
c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(['a', 'b', 'b', 'c'],
categories=['a', 'b', 'c'])
tm.assert_categorical_equal(result, expected)
c1 = Categorical(['x', np.nan])
c2 = Categorical([np.nan, 'b'])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(['x', np.nan, np.nan, 'b'],
categories=['x', 'b'])
tm.assert_categorical_equal(result, expected)
c1 = Categorical([np.nan])
c2 = Categorical([np.nan])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical([np.nan, np.nan])
tm.assert_categorical_equal(result, expected)
c1 = Categorical([])
c2 = Categorical([])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical([])
tm.assert_categorical_equal(result, expected)
c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(['b', 'a', 'a', 'c'],
categories=['b', 'a', 'c'], ordered=True)
tm.assert_categorical_equal(result, expected)
def test_union_categorical_unwrap(self):
    """Check ``union_categoricals`` with Series / CategoricalIndex inputs.

    The same expected Categorical is asserted for every combination of
    wrapped inputs; a plain list input must raise ``TypeError``.
    Regression test for GH 14173.
    """
    wanted = Categorical(['a', 'b', 'b', 'c'])

    plain = Categorical(['a', 'b'])
    as_series = pd.Series(['b', 'c'], dtype='category')
    as_index = CategoricalIndex(as_series)
    wrapped_plain = Series(plain)

    # Categorical + Series, Categorical + CategoricalIndex,
    # Series + CategoricalIndex
    combos = (
        [plain, as_series],
        [plain, as_index],
        [wrapped_plain, as_index],
    )
    for pair in combos:
        tm.assert_categorical_equal(union_categoricals(pair), wanted)

    # a bare list is not an accepted component
    with pytest.raises(TypeError):
        union_categoricals([wrapped_plain, ['a', 'b', 'c']])
@@ -1,49 +0,0 @@
import numpy as np
from pandas import date_range, Index
import pandas.util.testing as tm
from pandas.core.reshape.util import cartesian_product
class TestCartesianProduct(object):
def test_simple(self):
x, y = list('ABC'), [1, 22]
result1, result2 = cartesian_product([x, y])
expected1 = np.array(['A', 'A', 'B', 'B', 'C', 'C'])
expected2 = np.array([1, 22, 1, 22, 1, 22])
tm.assert_numpy_array_equal(result1, expected1)
tm.assert_numpy_array_equal(result2, expected2)
def test_datetimeindex(self):
# regression test for GitHub issue #6439
# make sure that the ordering on datetimeindex is consistent
x = date_range('2000-01-01', periods=2)
result1, result2 = [Index(y).day for y in cartesian_product([x, x])]
expected1 = Index([1, 1, 2, 2])
expected2 = Index([1, 2, 1, 2])
tm.assert_index_equal(result1, expected1)
tm.assert_index_equal(result2, expected2)
def test_empty(self):
# product of empty factors
X = [[], [0, 1], []]
Y = [[], [], ['a', 'b', 'c']]
for x, y in zip(X, Y):
expected1 = np.array([], dtype=np.asarray(x).dtype)
expected2 = np.array([], dtype=np.asarray(y).dtype)
result1, result2 = cartesian_product([x, y])
tm.assert_numpy_array_equal(result1, expected1)
tm.assert_numpy_array_equal(result2, expected2)
# empty product (empty input):
result = cartesian_product([])
expected = []
assert result == expected
def test_invalid_input(self):
invalid_inputs = [1, [1], [1, 2], [[1], 2],
'a', ['a'], ['a', 'b'], [['a'], 'b']]
msg = "Input must be a list-like of list-likes"
for X in invalid_inputs:
tm.assert_raises_regex(TypeError, msg, cartesian_product, X=X)