pruned venvs
21 binary files changed (contents not shown).
@@ -1,289 +0,0 @@
# -*- coding: utf-8 -*-

"""
test .agg behavior / note that .apply is tested generally in test_groupby.py
"""

import numpy as np
import pytest

from pandas.compat import OrderedDict

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, concat
from pandas.core.base import SpecificationError
from pandas.core.groupby.grouper import Grouping
import pandas.util.testing as tm


def test_agg_regression1(tsframe):
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)


def test_agg_must_agg(df):
    grouped = df.groupby('A')['C']

    msg = "Must produce aggregated value"
    with pytest.raises(Exception, match=msg):
        grouped.agg(lambda x: x.describe())
    with pytest.raises(Exception, match=msg):
        grouped.agg(lambda x: x.index[:2])


def test_agg_ser_multi_key(df):
    # TODO(wesm): unused
    ser = df.C  # noqa

    f = lambda x: x.sum()
    results = df.C.groupby([df.A, df.B]).aggregate(f)
    expected = df.groupby(['A', 'B']).sum()['C']
    tm.assert_series_equal(results, expected)


def test_groupby_aggregation_mixed_dtype():

    # GH 6212
    expected = DataFrame({
        'v1': [5, 5, 7, np.nan, 3, 3, 4, 1],
        'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]},
        index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99),
                                      ('big', 'damp'),
                                      ('blue', 'dry'),
                                      ('red', 'red'), ('red', 'wet')],
                                     names=['by1', 'by2']))

    df = DataFrame({
        'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
        'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
        'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan,
                12],
        'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99,
                np.nan, np.nan]
    })

    g = df.groupby(['by1', 'by2'])
    result = g[['v1', 'v2']].mean()
    tm.assert_frame_equal(result, expected)


def test_agg_apply_corner(ts, tsframe):
    # nothing to group, all NA
    grouped = ts.groupby(ts * np.nan)
    assert ts.dtype == np.float64

    # groupby float64 values results in Float64Index
    exp = Series([], dtype=np.float64,
                 index=pd.Index([], dtype=np.float64))
    tm.assert_series_equal(grouped.sum(), exp)
    tm.assert_series_equal(grouped.agg(np.sum), exp)
    tm.assert_series_equal(grouped.apply(np.sum), exp,
                           check_index_type=False)

    # DataFrame
    grouped = tsframe.groupby(tsframe['A'] * np.nan)
    exp_df = DataFrame(columns=tsframe.columns, dtype=float,
                       index=pd.Index([], dtype=np.float64))
    tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False)
    tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
    tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0],
                          check_names=False)


def test_agg_grouping_is_list_tuple(ts):
    df = tm.makeTimeDataFrame()

    grouped = df.groupby(lambda x: x.year)
    grouper = grouped.grouper.groupings[0].grouper
    grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper))

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper))

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)


def test_agg_python_multiindex(mframe):
    grouped = mframe.groupby(['A', 'B'])

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize('groupbyfunc', [
    lambda x: x.weekday(),
    [lambda x: x.month, lambda x: x.weekday()],
])
def test_aggregate_str_func(tsframe, groupbyfunc):
    grouped = tsframe.groupby(groupbyfunc)

    # single series
    result = grouped['A'].agg('std')
    expected = grouped['A'].std()
    tm.assert_series_equal(result, expected)

    # group frame by function name
    result = grouped.aggregate('var')
    expected = grouped.var()
    tm.assert_frame_equal(result, expected)

    # group frame by function dict
    result = grouped.agg(OrderedDict([['A', 'var'],
                                      ['B', 'std'],
                                      ['C', 'mean'],
                                      ['D', 'sem']]))
    expected = DataFrame(OrderedDict([['A', grouped['A'].var()],
                                      ['B', grouped['B'].std()],
                                      ['C', grouped['C'].mean()],
                                      ['D', grouped['D'].sem()]]))
    tm.assert_frame_equal(result, expected)


def test_aggregate_item_by_item(df):
    grouped = df.groupby('A')

    aggfun = lambda ser: ser.size
    result = grouped.agg(aggfun)
    foo = (df.A == 'foo').sum()
    bar = (df.A == 'bar').sum()
    K = len(result.columns)

    # GH5782
    # odd comparisons can result here, so cast to make easy
    exp = pd.Series(np.array([foo] * K), index=list('BCD'),
                    dtype=np.float64, name='foo')
    tm.assert_series_equal(result.xs('foo'), exp)

    exp = pd.Series(np.array([bar] * K), index=list('BCD'),
                    dtype=np.float64, name='bar')
    tm.assert_almost_equal(result.xs('bar'), exp)

    def aggfun(ser):
        return ser.size

    result = DataFrame().groupby(df.A).agg(aggfun)
    assert isinstance(result, DataFrame)
    assert len(result) == 0


def test_wrap_agg_out(three_group):
    grouped = three_group.groupby(['A', 'B'])

    def func(ser):
        if ser.dtype == np.object:
            raise TypeError
        else:
            return ser.sum()

    result = grouped.aggregate(func)
    exp_grouped = three_group.loc[:, three_group.columns != 'C']
    expected = exp_grouped.groupby(['A', 'B']).aggregate(func)
    tm.assert_frame_equal(result, expected)


def test_agg_multiple_functions_maintain_order(df):
    # GH #610
    funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)]
    result = df.groupby('A')['C'].agg(funcs)
    exp_cols = Index(['mean', 'max', 'min'])

    tm.assert_index_equal(result.columns, exp_cols)


def test_multiple_functions_tuples_and_non_tuples(df):
    # #1359
    funcs = [('foo', 'mean'), 'std']
    ex_funcs = [('foo', 'mean'), ('std', 'std')]

    result = df.groupby('A')['C'].agg(funcs)
    expected = df.groupby('A')['C'].agg(ex_funcs)
    tm.assert_frame_equal(result, expected)

    result = df.groupby('A').agg(funcs)
    expected = df.groupby('A').agg(ex_funcs)
    tm.assert_frame_equal(result, expected)


def test_agg_multiple_functions_too_many_lambdas(df):
    grouped = df.groupby('A')
    funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]

    msg = 'Function names must be unique, found multiple named <lambda>'
    with pytest.raises(SpecificationError, match=msg):
        grouped.agg(funcs)


def test_more_flexible_frame_multi_function(df):
    grouped = df.groupby('A')

    exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]]))
    exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]]))

    expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
    expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)

    d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]])
    result = grouped.aggregate(d)

    tm.assert_frame_equal(result, expected)

    # be careful
    result = grouped.aggregate(OrderedDict([['C', np.mean],
                                            ['D', [np.mean, np.std]]]))
    expected = grouped.aggregate(OrderedDict([['C', np.mean],
                                              ['D', [np.mean, np.std]]]))
    tm.assert_frame_equal(result, expected)

    def foo(x):
        return np.mean(x)

    def bar(x):
        return np.std(x, ddof=1)

    # this uses column selection & renaming
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        d = OrderedDict([['C', np.mean],
                         ['D', OrderedDict([['foo', np.mean],
                                            ['bar', np.std]])]])
        result = grouped.aggregate(d)

        d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]])
        expected = grouped.aggregate(d)

        tm.assert_frame_equal(result, expected)


def test_multi_function_flexible_mix(df):
    # GH #1268
    grouped = df.groupby('A')

    # Expected
    d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
                     ['D', {'sum': 'sum'}]])
    # this uses column selection & renaming
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = grouped.aggregate(d)

    # Test 1
    d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
                     ['D', 'sum']])
    # this uses column selection & renaming
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = grouped.aggregate(d)
    tm.assert_frame_equal(result, expected)

    # Test 2
    d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
                     ['D', ['sum']]])
    # this uses column selection & renaming
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = grouped.aggregate(d)
    tm.assert_frame_equal(result, expected)
@@ -1,218 +0,0 @@
# -*- coding: utf-8 -*-

"""
test cython .agg behavior
"""

from __future__ import print_function

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range)
from pandas.core.groupby.groupby import DataError
import pandas.util.testing as tm


@pytest.mark.parametrize('op_name', [
    'count',
    'sum',
    'std',
    'var',
    'sem',
    'mean',
    pytest.param('median',
                 # ignore mean of empty slice
                 # and all-NaN
                 marks=[pytest.mark.filterwarnings(
                     "ignore::RuntimeWarning"
                 )]),
    'prod',
    'min',
    'max',
])
def test_cythonized_aggers(op_name):
    data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., np.nan, np.nan],
            'B': ['A', 'B'] * 6,
            'C': np.random.randn(12)}
    df = DataFrame(data)
    df.loc[2:10:2, 'C'] = np.nan

    op = lambda x: getattr(x, op_name)()

    # single column
    grouped = df.drop(['B'], axis=1).groupby('A')
    exp = {cat: op(group['C']) for cat, group in grouped}
    exp = DataFrame({'C': exp})
    exp.index.name = 'A'
    result = op(grouped)
    tm.assert_frame_equal(result, exp)

    # multiple columns
    grouped = df.groupby(['A', 'B'])
    expd = {}
    for (cat1, cat2), group in grouped:
        expd.setdefault(cat1, {})[cat2] = op(group['C'])
    exp = DataFrame(expd).T.stack(dropna=False)
    exp.index.names = ['A', 'B']
    exp.name = 'C'

    result = op(grouped)['C']
    if op_name in ['sum', 'prod']:
        tm.assert_series_equal(result, exp)


def test_cython_agg_boolean():
    frame = DataFrame({'a': np.random.randint(0, 5, 50),
                       'b': np.random.randint(0, 2, 50).astype('bool')})
    result = frame.groupby('a')['b'].mean()
    expected = frame.groupby('a')['b'].agg(np.mean)

    tm.assert_series_equal(result, expected)


def test_cython_agg_nothing_to_agg():
    frame = DataFrame({'a': np.random.randint(0, 5, 50),
                       'b': ['foo', 'bar'] * 25})
    msg = "No numeric types to aggregate"

    with pytest.raises(DataError, match=msg):
        frame.groupby('a')['b'].mean()

    frame = DataFrame({'a': np.random.randint(0, 5, 50),
                       'b': ['foo', 'bar'] * 25})
    with pytest.raises(DataError, match=msg):
        frame[['b']].groupby(frame['a']).mean()


def test_cython_agg_nothing_to_agg_with_dates():
    frame = DataFrame({'a': np.random.randint(0, 5, 50),
                       'b': ['foo', 'bar'] * 25,
                       'dates': pd.date_range('now', periods=50, freq='T')})
    msg = "No numeric types to aggregate"
    with pytest.raises(DataError, match=msg):
        frame.groupby('b').dates.mean()


def test_cython_agg_frame_columns():
    # #2113
    df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})

    df.groupby(level=0, axis='columns').mean()
    df.groupby(level=0, axis='columns').mean()
    df.groupby(level=0, axis='columns').mean()
    df.groupby(level=0, axis='columns').mean()


def test_cython_agg_return_dict():
    # GH 16741
    df = DataFrame(
        {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
         'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
         'C': np.random.randn(8),
         'D': np.random.randn(8)})

    ts = df.groupby('A')['B'].agg(lambda x: x.value_counts().to_dict())
    expected = Series([{'two': 1, 'one': 1, 'three': 1},
                       {'two': 2, 'one': 2, 'three': 1}],
                      index=Index(['bar', 'foo'], name='A'),
                      name='B')
    tm.assert_series_equal(ts, expected)


def test_cython_fail_agg():
    dr = bdate_range('1/1/2000', periods=50)
    ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr)

    grouped = ts.groupby(lambda x: x.month)
    summed = grouped.sum()
    expected = grouped.agg(np.sum)
    tm.assert_series_equal(summed, expected)


@pytest.mark.parametrize('op, targop', [
    ('mean', np.mean),
    ('median', np.median),
    ('var', np.var),
    ('add', np.sum),
    ('prod', np.prod),
    ('min', np.min),
    ('max', np.max),
    ('first', lambda x: x.iloc[0]),
    ('last', lambda x: x.iloc[-1]),
])
def test__cython_agg_general(op, targop):
    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    result = df.groupby(labels)._cython_agg_general(op)
    expected = df.groupby(labels).agg(targop)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize('op, targop', [
    ('mean', np.mean),
    ('median', lambda x: np.median(x) if len(x) > 0 else np.nan),
    ('var', lambda x: np.var(x, ddof=1)),
    ('min', np.min),
    ('max', np.max),
])
def test_cython_agg_empty_buckets(op, targop, observed):
    df = pd.DataFrame([11, 12, 13])
    grps = range(0, 55, 5)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    g = df.groupby(pd.cut(df[0], grps), observed=observed)
    result = g._cython_agg_general(op)

    g = df.groupby(pd.cut(df[0], grps), observed=observed)
    expected = g.agg(lambda x: targop(x))
    tm.assert_frame_equal(result, expected)


def test_cython_agg_empty_buckets_nanops(observed):
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    df = pd.DataFrame([11, 12, 13], columns=['a'])
    grps = range(0, 25, 5)
    # add / sum
    result = df.groupby(pd.cut(df['a'], grps),
                        observed=observed)._cython_agg_general('add')
    intervals = pd.interval_range(0, 20, freq=5)
    expected = pd.DataFrame(
        {"a": [0, 0, 36, 0]},
        index=pd.CategoricalIndex(intervals, name='a', ordered=True))
    if observed:
        expected = expected[expected.a != 0]

    tm.assert_frame_equal(result, expected)

    # prod
    result = df.groupby(pd.cut(df['a'], grps),
                        observed=observed)._cython_agg_general('prod')
    expected = pd.DataFrame(
        {"a": [1, 1, 1716, 1]},
        index=pd.CategoricalIndex(intervals, name='a', ordered=True))
    if observed:
        expected = expected[expected.a != 1]

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize('op', ['first', 'last', 'max', 'min'])
@pytest.mark.parametrize('data', [
    Timestamp('2016-10-14 21:00:44.557'),
    Timedelta('17088 days 21:00:44.557'), ])
def test_cython_with_timestamp_and_nat(op, data):
    # https://github.com/pandas-dev/pandas/issues/19526
    df = DataFrame({'a': [0, 1], 'b': [data, NaT]})
    index = Index([0, 1], name='a')

    # We will group by a and test the cython aggregations
    expected = DataFrame({'b': [data, NaT]}, index=index)

    result = df.groupby('a').aggregate(op)
    tm.assert_frame_equal(expected, result)
@@ -1,514 +0,0 @@
# -*- coding: utf-8 -*-

"""
test all other .agg behavior
"""

from __future__ import print_function

from collections import OrderedDict
import datetime as dt
from functools import partial

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame, Index, MultiIndex, PeriodIndex, Series, date_range,
    period_range)
from pandas.core.groupby.groupby import SpecificationError
import pandas.util.testing as tm

from pandas.io.formats.printing import pprint_thing


def test_agg_api():
    # GH 6337
    # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
    # different api for agg when passed custom function with mixed frame

    df = DataFrame({'data1': np.random.randn(5),
                    'data2': np.random.randn(5),
                    'key1': ['a', 'a', 'b', 'b', 'a'],
                    'key2': ['one', 'two', 'one', 'two', 'one']})
    grouped = df.groupby('key1')

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    expected = grouped.agg([peak_to_peak])
    expected.columns = ['data1', 'data2']
    result = grouped.agg(peak_to_peak)
    tm.assert_frame_equal(result, expected)


def test_agg_datetimes_mixed():
    data = [[1, '2012-01-01', 1.0],
            [2, '2012-01-02', 2.0],
            [3, None, 3.0]]

    df1 = DataFrame({'key': [x[0] for x in data],
                     'date': [x[1] for x in data],
                     'value': [x[2] for x in data]})

    data = [[row[0],
             (dt.datetime.strptime(row[1], '%Y-%m-%d').date()
              if row[1] else None),
             row[2]]
            for row in data]

    df2 = DataFrame({'key': [x[0] for x in data],
                     'date': [x[1] for x in data],
                     'value': [x[2] for x in data]})

    df1['weights'] = df1['value'] / df1['value'].sum()
    gb1 = df1.groupby('date').aggregate(np.sum)

    df2['weights'] = df1['value'] / df1['value'].sum()
    gb2 = df2.groupby('date').aggregate(np.sum)

    assert (len(gb1) == len(gb2))


def test_agg_period_index():
    prng = period_range('2012-1-1', freq='M', periods=3)
    df = DataFrame(np.random.randn(3, 2), index=prng)
    rs = df.groupby(level=0).sum()
    assert isinstance(rs.index, PeriodIndex)

    # GH 3579
    index = period_range(start='1999-01', periods=5, freq='M')
    s1 = Series(np.random.rand(len(index)), index=index)
    s2 = Series(np.random.rand(len(index)), index=index)
    series = [('s1', s1), ('s2', s2)]
    df = DataFrame.from_dict(OrderedDict(series))
    grouped = df.groupby(df.index.month)
    list(grouped)


def test_agg_dict_parameter_cast_result_dtypes():
    # GH 12821

    df = DataFrame({'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
                    'time': date_range('1/1/2011', periods=8, freq='H')})
    df.loc[[0, 1, 2, 5], 'time'] = None

    # test for `first` function
    exp = df.loc[[0, 3, 4, 6]].set_index('class')
    grouped = df.groupby('class')
    tm.assert_frame_equal(grouped.first(), exp)
    tm.assert_frame_equal(grouped.agg('first'), exp)
    tm.assert_frame_equal(grouped.agg({'time': 'first'}), exp)
    tm.assert_series_equal(grouped.time.first(), exp['time'])
    tm.assert_series_equal(grouped.time.agg('first'), exp['time'])

    # test for `last` function
    exp = df.loc[[0, 3, 4, 7]].set_index('class')
    grouped = df.groupby('class')
    tm.assert_frame_equal(grouped.last(), exp)
    tm.assert_frame_equal(grouped.agg('last'), exp)
    tm.assert_frame_equal(grouped.agg({'time': 'last'}), exp)
    tm.assert_series_equal(grouped.time.last(), exp['time'])
    tm.assert_series_equal(grouped.time.agg('last'), exp['time'])

    # count
    exp = pd.Series([2, 2, 2, 2],
                    index=Index(list('ABCD'), name='class'),
                    name='time')
    tm.assert_series_equal(grouped.time.agg(len), exp)
    tm.assert_series_equal(grouped.time.size(), exp)

    exp = pd.Series([0, 1, 1, 2],
                    index=Index(list('ABCD'), name='class'),
                    name='time')
    tm.assert_series_equal(grouped.time.count(), exp)


def test_agg_cast_results_dtypes():
    # similar to GH12821
    # xref #11444
    u = [dt.datetime(2015, x + 1, 1) for x in range(12)]
    v = list('aaabbbbbbccd')
    df = pd.DataFrame({'X': v, 'Y': u})

    result = df.groupby('X')['Y'].agg(len)
    expected = df.groupby('X')['Y'].count()
    tm.assert_series_equal(result, expected)


def test_aggregate_float64_no_int64():
    # see gh-11199
    df = DataFrame({"a": [1, 2, 3, 4, 5],
                    "b": [1, 2, 2, 4, 5],
                    "c": [1, 2, 3, 4, 5]})

    expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
    expected.index.name = "b"

    result = df.groupby("b")[["a"]].mean()
    tm.assert_frame_equal(result, expected)

    expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]},
                         index=[1, 2, 4, 5])
    expected.index.name = "b"

    result = df.groupby("b")[["a", "c"]].mean()
    tm.assert_frame_equal(result, expected)


def test_aggregate_api_consistency():
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': np.random.randn(8) + 1.0,
                    'D': np.arange(8)})

    grouped = df.groupby(['A', 'B'])
    c_mean = grouped['C'].mean()
    c_sum = grouped['C'].sum()
    d_mean = grouped['D'].mean()
    d_sum = grouped['D'].sum()

    result = grouped['D'].agg(['sum', 'mean'])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ['sum', 'mean']
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg([np.sum, np.mean])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([['C', 'D'],
                                                ['sum', 'mean']])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped[['D', 'C']].agg([np.sum, np.mean])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([['D', 'C'],
                                                ['sum', 'mean']])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({'C': 'mean', 'D': 'sum'})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({'C': ['mean', 'sum'],
                          'D': ['mean', 'sum']})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([['C', 'D'],
                                                ['mean', 'sum']])

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = grouped[['D', 'C']].agg({'r': np.sum,
                                          'r2': np.mean})
    expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([['r', 'r2'],
                                                ['D', 'C']])
    tm.assert_frame_equal(result, expected, check_like=True)


def test_agg_dict_renaming_deprecation():
    # 15931
    df = pd.DataFrame({'A': [1, 1, 1, 2, 2],
                       'B': range(5),
                       'C': range(5)})

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False) as w:
        df.groupby('A').agg({'B': {'foo': ['sum', 'max']},
                             'C': {'bar': ['count', 'min']}})
        assert "using a dict with renaming" in str(w[0].message)

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        df.groupby('A')[['B', 'C']].agg({'ma': 'max'})

    with tm.assert_produces_warning(FutureWarning) as w:
        df.groupby('A').B.agg({'foo': 'count'})
        assert "using a dict on a Series for aggregation" in str(w[0].message)


def test_agg_compat():
    # GH 12334
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': np.random.randn(8) + 1.0,
                    'D': np.arange(8)})

    g = df.groupby(['A', 'B'])

    expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
    expected.columns = MultiIndex.from_tuples([('C', 'sum'),
                                               ('C', 'std')])
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = g['D'].agg({'C': ['sum', 'std']})
    tm.assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
    expected.columns = ['C', 'D']

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = g['D'].agg({'C': 'sum', 'D': 'std'})
    tm.assert_frame_equal(result, expected, check_like=True)


def test_agg_nested_dicts():
    # API change for disallowing these types of nested dicts
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': np.random.randn(8) + 1.0,
                    'D': np.arange(8)})

    g = df.groupby(['A', 'B'])

    msg = r'cannot perform renaming for r[1-2] with a nested dictionary'
    with pytest.raises(SpecificationError, match=msg):
        g.aggregate({'r1': {'C': ['mean', 'sum']},
                     'r2': {'D': ['mean', 'sum']}})

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = g.agg({'C': {'ra': ['mean', 'std']},
                        'D': {'rb': ['mean', 'std']}})
    expected = pd.concat([g['C'].mean(), g['C'].std(),
                          g['D'].mean(), g['D'].std()],
                         axis=1)
    expected.columns = pd.MultiIndex.from_tuples(
        [('ra', 'mean'), ('ra', 'std'),
         ('rb', 'mean'), ('rb', 'std')])
    tm.assert_frame_equal(result, expected, check_like=True)

    # same name as the original column
    # GH9052
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = g['D'].agg({'result1': np.sum, 'result2': np.mean})
        expected = expected.rename(columns={'result1': 'D'})

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = g['D'].agg({'D': np.sum, 'result2': np.mean})
    tm.assert_frame_equal(result, expected, check_like=True)


def test_agg_item_by_item_raise_typeerror():
    df = DataFrame(np.random.randint(10, size=(20, 10)))

    def raiseException(df):
        pprint_thing('----------------------------------------')
        pprint_thing(df.to_string())
        raise TypeError('test')

    with pytest.raises(TypeError, match='test'):
        df.groupby(0).agg(raiseException)


def test_series_agg_multikey():
    ts = tm.makeTimeSeries()
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])

    result = grouped.agg(np.sum)
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)


def test_series_agg_multi_pure_python():
    data = DataFrame(
        {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
               'foo', 'foo', 'foo'],
         'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
               'two', 'two', 'one'],
         'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
               'dull', 'shiny', 'shiny', 'shiny'],
         'D': np.random.randn(11),
         'E': np.random.randn(11),
         'F': np.random.randn(11)})

    def bad(x):
        assert (len(x.values.base) > 0)
        return 'foo'

    result = data.groupby(['A', 'B']).agg(bad)
    expected = data.groupby(['A', 'B']).agg(lambda x: 'foo')
    tm.assert_frame_equal(result, expected)


def test_agg_consistency():
    # agg with ([]) and () not consistent
    # GH 6715
    def P1(a):
        try:
            return np.percentile(a.dropna(), q=1)
        except Exception:
            return np.nan

    df = DataFrame({'col1': [1, 2, 3, 4],
                    'col2': [10, 25, 26, 31],
                    'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10),
                             dt.date(2013, 2, 11), dt.date(2013, 2, 11)]})

    g = df.groupby('date')

    expected = g.agg([P1])
    expected.columns = expected.columns.levels[0]

    result = g.agg(P1)
    tm.assert_frame_equal(result, expected)


def test_agg_callables():
    # GH 7929
    df = DataFrame({'foo': [1, 2], 'bar': [3, 4]}).astype(np.int64)

    class fn_class(object):

        def __call__(self, x):
            return sum(x)

    equiv_callables = [sum,
                       np.sum,
                       lambda x: sum(x),
                       lambda x: x.sum(),
                       partial(sum),
                       fn_class()]

    expected = df.groupby("foo").agg(sum)
    for ecall in equiv_callables:
        result = df.groupby('foo').agg(ecall)
        tm.assert_frame_equal(result, expected)


def test_agg_over_numpy_arrays():
    # GH 3788
    df = pd.DataFrame([[1, np.array([10, 20, 30])],
                       [1, np.array([40, 50, 60])],
                       [2, np.array([20, 30, 40])]],
                      columns=['category', 'arraydata'])
    result = df.groupby('category').agg(sum)

    expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
    expected_index = pd.Index([1, 2], name='category')
    expected_column = ['arraydata']
    expected = pd.DataFrame(expected_data,
                            index=expected_index,
                            columns=expected_column)

    tm.assert_frame_equal(result, expected)


def test_agg_timezone_round_trip():
    # GH 15426
    ts = pd.Timestamp("2016-01-01 12:00:00", tz='US/Pacific')
    df = pd.DataFrame({'a': 1,
                       'b': [ts + dt.timedelta(minutes=nn)
                             for nn in range(10)]})

    result1 = df.groupby('a')['b'].agg(np.min).iloc[0]
    result2 = df.groupby('a')['b'].agg(lambda x: np.min(x)).iloc[0]
    result3 = df.groupby('a')['b'].min().iloc[0]

    assert result1 == ts
    assert result2 == ts
    assert result3 == ts

    dates = [pd.Timestamp("2016-01-0%d 12:00:00" % i, tz='US/Pacific')
             for i in range(1, 5)]
    df = pd.DataFrame({'A': ['a', 'b'] * 2, 'B': dates})
    grouped = df.groupby('A')

    ts = df['B'].iloc[0]
    assert ts == grouped.nth(0)['B'].iloc[0]
    assert ts == grouped.head(1)['B'].iloc[0]
    assert ts == grouped.first()['B'].iloc[0]
    assert ts == grouped.apply(lambda x: x.iloc[0])[0]

    ts = df['B'].iloc[2]
    assert ts == grouped.last()['B'].iloc[0]
    assert ts == grouped.apply(lambda x: x.iloc[-1])[0]


def test_sum_uint64_overflow():
    # see gh-14758
    # Convert to uint64 and don't overflow
    df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
    df = df + 9223372036854775807

    index = pd.Index([9223372036854775808,
                      9223372036854775810,
                      9223372036854775812],
                     dtype=np.uint64)
    expected = pd.DataFrame({1: [9223372036854775809,
                                 9223372036854775811,
                                 9223372036854775813]},
                            index=index)

    expected.index.name = 0
    result = df.groupby(0).sum()
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("structure, expected", [
    (tuple, pd.DataFrame({'C': {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
    (list, pd.DataFrame({'C': {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
    (lambda x: tuple(x), pd.DataFrame({'C': {(1, 1): (1, 1, 1),
                                             (3, 4): (3, 4, 4)}})),
    (lambda x: list(x), pd.DataFrame({'C': {(1, 1): [1, 1, 1],
                                            (3, 4): [3, 4, 4]}}))
])
def test_agg_structs_dataframe(structure, expected):
    df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3],
                       'B': [1, 1, 1, 4, 4, 4],
                       'C': [1, 1, 1, 3, 4, 4]})

    result = df.groupby(['A', 'B']).aggregate(structure)
    expected.index.names = ['A', 'B']
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("structure, expected", [
    (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name='C')),
    (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name='C')),
    (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)],
                                   index=[1, 3], name='C')),
    (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]],
                                  index=[1, 3], name='C'))
])
def test_agg_structs_series(structure, expected):
    # Issue #18079
    df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3],
                       'B': [1, 1, 1, 4, 4, 4],
                       'C': [1, 1, 1, 3, 4, 4]})

    result = df.groupby('A')['C'].aggregate(structure)
    expected.index.name = 'A'
    tm.assert_series_equal(result, expected)


def test_agg_category_nansum(observed):
    categories = ['a', 'b', 'c']
    df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
                                           categories=categories),
                       'B': [1, 2, 3]})
    result = df.groupby("A", observed=observed).B.agg(np.nansum)
    expected = pd.Series([3, 3, 0],
                         index=pd.CategoricalIndex(['a', 'b', 'c'],
                                                   categories=categories,
                                                   name='A'),
                         name='B')
    if observed:
        expected = expected[expected != 0]
    tm.assert_series_equal(result, expected)


def test_agg_list_like_func():
    # GH 18473
    df = pd.DataFrame({'A': [str(x) for x in range(3)],
                       'B': [str(x) for x in range(3)]})
    grouped = df.groupby('A', as_index=False, sort=False)
    result = grouped.agg({'B': lambda x: list(x)})
    expected = pd.DataFrame({'A': [str(x) for x in range(3)],
                             'B': [[str(x)] for x in range(3)]})
    tm.assert_frame_equal(result, expected)
@@ -1,78 +0,0 @@
import numpy as np
import pytest

from pandas import DataFrame, MultiIndex
from pandas.util import testing as tm


@pytest.fixture
def mframe():
    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
                                                              'three']],
                       codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                              [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['first', 'second'])
    return DataFrame(np.random.randn(10, 3), index=index,
                     columns=['A', 'B', 'C'])


@pytest.fixture
def df():
    return DataFrame(
        {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
         'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
         'C': np.random.randn(8),
         'D': np.random.randn(8)})


@pytest.fixture
def ts():
    return tm.makeTimeSeries()


@pytest.fixture
def seriesd():
    return tm.getSeriesData()


@pytest.fixture
def tsd():
    return tm.getTimeSeriesData()


@pytest.fixture
def frame(seriesd):
    return DataFrame(seriesd)


@pytest.fixture
def tsframe(tsd):
    return DataFrame(tsd)


@pytest.fixture
def df_mixed_floats():
    return DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                            'foo', 'bar', 'foo', 'foo'],
                      'B': ['one', 'one', 'two', 'three',
                            'two', 'two', 'one', 'three'],
                      'C': np.random.randn(8),
                      'D': np.array(
                          np.random.randn(8), dtype='float32')})


@pytest.fixture
def three_group():
    return DataFrame({'A': ['foo', 'foo', 'foo',
                            'foo', 'bar', 'bar',
                            'bar', 'bar',
                            'foo', 'foo', 'foo'],
                      'B': ['one', 'one', 'one',
                            'two', 'one', 'one', 'one', 'two',
                            'two', 'two', 'one'],
                      'C': ['dull', 'dull', 'shiny',
                            'dull', 'dull', 'shiny', 'shiny',
                            'dull', 'shiny', 'shiny', 'shiny'],
                      'D': np.random.randn(11),
                      'E': np.random.randn(11),
                      'F': np.random.randn(11)})
@@ -1,542 +0,0 @@
from datetime import datetime

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, bdate_range, compat
from pandas.util import testing as tm


def test_apply_issues():
    # GH 5788

    s = """2011.05.16,00:00,1.40893
2011.05.16,01:00,1.40760
2011.05.16,02:00,1.40750
2011.05.16,03:00,1.40649
2011.05.17,02:00,1.40893
2011.05.17,03:00,1.40760
2011.05.17,04:00,1.40750
2011.05.17,05:00,1.40649
2011.05.18,02:00,1.40893
2011.05.18,03:00,1.40760
2011.05.18,04:00,1.40750
2011.05.18,05:00,1.40649"""

    df = pd.read_csv(
        compat.StringIO(s), header=None, names=['date', 'time', 'value'],
        parse_dates=[['date', 'time']])
    df = df.set_index('date_time')

    expected = df.groupby(df.index.date).idxmax()
    result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
    tm.assert_frame_equal(result, expected)

    # GH 5789
    # don't auto coerce dates
    df = pd.read_csv(
        compat.StringIO(s), header=None, names=['date', 'time', 'value'])
    exp_idx = pd.Index(['2011.05.16', '2011.05.17', '2011.05.18'],
                       dtype=object, name='date')
    expected = Series(['00:00', '02:00', '02:00'], index=exp_idx)
    result = df.groupby('date').apply(
        lambda x: x['time'][x['value'].idxmax()])
    tm.assert_series_equal(result, expected)


def test_apply_trivial():
    # GH 20066
    # trivial apply: ignore input and return a constant dataframe.
    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
                       'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
                      columns=['key', 'data'])
    expected = pd.concat([df.iloc[1:], df.iloc[1:]],
                         axis=1, keys=['float64', 'object'])
    result = df.groupby([str(x) for x in df.dtypes],
                        axis=1).apply(lambda x: df.iloc[1:])

    tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(reason="GH#20066; function passed into apply "
                          "returns a DataFrame with the same index "
                          "as the one to create GroupBy object.")
def test_apply_trivial_fail():
    # GH 20066
    # trivial apply fails if the constant dataframe has the same index
    # with the one used to create GroupBy object.
    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
                       'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
                      columns=['key', 'data'])
    expected = pd.concat([df, df],
                         axis=1, keys=['float64', 'object'])
    result = df.groupby([str(x) for x in df.dtypes],
                        axis=1).apply(lambda x: df)

    tm.assert_frame_equal(result, expected)


def test_fast_apply():
    # make sure that fast apply is correctly called
    # rather than raising any kind of error
    # otherwise the python path will be called
    # which slows things down
    N = 1000
    labels = np.random.randint(0, 2000, size=N)
    labels2 = np.random.randint(0, 3, size=N)
    df = DataFrame({'key': labels,
                    'key2': labels2,
                    'value1': np.random.randn(N),
                    'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})

    def f(g):
        return 1

    g = df.groupby(['key', 'key2'])

    grouper = g.grouper

    splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
    group_keys = grouper._get_group_keys()

    values, mutated = splitter.fast_apply(f, group_keys)
    assert not mutated


def test_apply_with_mixed_dtype():
    # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
    df = DataFrame({'foo1': np.random.randn(6),
                    'foo2': ['one', 'two', 'two', 'three', 'one', 'two']})
    result = df.apply(lambda x: x, axis=1)
    tm.assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts())

    # GH 3610 incorrect dtype conversion with as_index=False
    df = DataFrame({"c1": [1, 2, 6, 6, 8]})
    df["c2"] = df.c1 / 2.0
    result1 = df.groupby("c2").mean().reset_index().c2
    result2 = df.groupby("c2", as_index=False).mean().c2
    tm.assert_series_equal(result1, result2)


def test_groupby_as_index_apply(df):
    # GH #4648 and #3417
    df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'user_id': [1, 2, 1, 1, 3, 1],
                    'time': range(6)})

    g_as = df.groupby('user_id', as_index=True)
    g_not_as = df.groupby('user_id', as_index=False)

    res_as = g_as.head(2).index
    res_not_as = g_not_as.head(2).index
    exp = Index([0, 1, 2, 4])
    tm.assert_index_equal(res_as, exp)
    tm.assert_index_equal(res_not_as, exp)

    res_as_apply = g_as.apply(lambda x: x.head(2)).index
    res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index

    # apply doesn't maintain the original ordering
    # changed in GH5610 as the as_index=False returns a MI here
    exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1),
                                               (2, 4)])
    tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
    exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None])

    tm.assert_index_equal(res_as_apply, exp_as_apply)
    tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)

    ind = Index(list('abcde'))
    df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
    res = df.groupby(0, as_index=False).apply(lambda x: x).index
    tm.assert_index_equal(res, ind)


def test_apply_concat_preserve_names(three_group):
    grouped = three_group.groupby(['A', 'B'])

    def desc(group):
        result = group.describe()
        result.index.name = 'stat'
        return result

    def desc2(group):
        result = group.describe()
        result.index.name = 'stat'
        result = result[:len(group)]
        # weirdo
        return result

    def desc3(group):
        result = group.describe()

        # names are different
        result.index.name = 'stat_%d' % len(group)

        result = result[:len(group)]
        # weirdo
        return result

    result = grouped.apply(desc)
    assert result.index.names == ('A', 'B', 'stat')

    result2 = grouped.apply(desc2)
    assert result2.index.names == ('A', 'B', 'stat')

    result3 = grouped.apply(desc3)
    assert result3.index.names == ('A', 'B', None)


def test_apply_series_to_frame():
    def f(piece):
        with np.errstate(invalid='ignore'):
            logged = np.log(piece)
        return DataFrame({'value': piece,
                          'demeaned': piece - piece.mean(),
                          'logged': logged})

    dr = bdate_range('1/1/2000', periods=100)
    ts = Series(np.random.randn(100), index=dr)

    grouped = ts.groupby(lambda x: x.month)
    result = grouped.apply(f)

    assert isinstance(result, DataFrame)
    tm.assert_index_equal(result.index, ts.index)


def test_apply_series_yield_constant(df):
    result = df.groupby(['A', 'B'])['C'].apply(len)
    assert result.index.names[:2] == ('A', 'B')


def test_apply_frame_yield_constant(df):
    # GH13568
    result = df.groupby(['A', 'B']).apply(len)
    assert isinstance(result, Series)
    assert result.name is None

    result = df.groupby(['A', 'B'])[['C', 'D']].apply(len)
    assert isinstance(result, Series)
    assert result.name is None


def test_apply_frame_to_series(df):
    grouped = df.groupby(['A', 'B'])
    result = grouped.apply(len)
    expected = grouped.count()['C']
    tm.assert_index_equal(result.index, expected.index)
    tm.assert_numpy_array_equal(result.values, expected.values)


def test_apply_frame_concat_series():
    def trans(group):
        return group.groupby('B')['C'].sum().sort_values()[:2]

    def trans2(group):
        grouped = group.groupby(df.reindex(group.index)['B'])
        return grouped.sum().sort_values()[:2]

    df = DataFrame({'A': np.random.randint(0, 5, 1000),
                    'B': np.random.randint(0, 5, 1000),
                    'C': np.random.randn(1000)})

    result = df.groupby('A').apply(trans)
    exp = df.groupby('A')['C'].apply(trans2)
    tm.assert_series_equal(result, exp, check_names=False)
    assert result.name == 'C'


def test_apply_transform(ts):
    grouped = ts.groupby(lambda x: x.month)
    result = grouped.apply(lambda x: x * 2)
    expected = grouped.transform(lambda x: x * 2)
    tm.assert_series_equal(result, expected)


def test_apply_multikey_corner(tsframe):
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])

    def f(group):
        return group.sort_values('A')[-5:]

    result = grouped.apply(f)
    for key, group in grouped:
        tm.assert_frame_equal(result.loc[key], f(group))


def test_apply_chunk_view():
    # Low level tinkering could be unsafe, make sure not
    df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
                    'value': compat.lrange(9)})

    result = df.groupby('key', group_keys=False).apply(lambda x: x[:2])
    expected = df.take([0, 1, 3, 4, 6, 7])
    tm.assert_frame_equal(result, expected)


def test_apply_no_name_column_conflict():
    df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
                    'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
                    'value': compat.lrange(10)[::-1]})

    # it works! #2605
    grouped = df.groupby(['name', 'name2'])
    grouped.apply(lambda x: x.sort_values('value', inplace=True))


def test_apply_typecast_fail():
    df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
                    'c': np.tile(['a', 'b', 'c'], 2),
                    'v': np.arange(1., 7.)})

    def f(group):
        v = group['v']
        group['v2'] = (v - v.min()) / (v.max() - v.min())
        return group

    result = df.groupby('d').apply(f)

    expected = df.copy()
    expected['v2'] = np.tile([0., 0.5, 1], 2)

    tm.assert_frame_equal(result, expected)


def test_apply_multiindex_fail():
    index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]])
    df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
                    'c': np.tile(['a', 'b', 'c'], 2),
                    'v': np.arange(1., 7.)}, index=index)

    def f(group):
        v = group['v']
        group['v2'] = (v - v.min()) / (v.max() - v.min())
        return group

    result = df.groupby('d').apply(f)

    expected = df.copy()
    expected['v2'] = np.tile([0., 0.5, 1], 2)

    tm.assert_frame_equal(result, expected)


def test_apply_corner(tsframe):
    result = tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2)
    expected = tsframe * 2
    tm.assert_frame_equal(result, expected)


def test_apply_without_copy():
    # GH 5545
    # returning a non-copy in an applied function fails

    data = DataFrame({'id_field': [100, 100, 200, 300],
                      'category': ['a', 'b', 'c', 'c'],
                      'value': [1, 2, 3, 4]})

    def filt1(x):
        if x.shape[0] == 1:
            return x.copy()
        else:
            return x[x.category == 'c']

    def filt2(x):
        if x.shape[0] == 1:
            return x
        else:
            return x[x.category == 'c']

    expected = data.groupby('id_field').apply(filt1)
    result = data.groupby('id_field').apply(filt2)
    tm.assert_frame_equal(result, expected)


def test_apply_corner_cases():
    # #535, can't use sliding iterator

    N = 1000
    labels = np.random.randint(0, 100, size=N)
    df = DataFrame({'key': labels,
                    'value1': np.random.randn(N),
                    'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})

    grouped = df.groupby('key')

    def f(g):
        g['value3'] = g['value1'] * 2
        return g

    result = grouped.apply(f)
    assert 'value3' in result


def test_apply_numeric_coercion_when_datetime():
    # In the past, group-by/apply operations have been over-eager
    # in converting dtypes to numeric, in the presence of datetime
    # columns. Various GH issues were filed, the reproductions
    # for which are here.

    # GH 15670
    df = pd.DataFrame({'Number': [1, 2],
                       'Date': ["2017-03-02"] * 2,
                       'Str': ["foo", "inf"]})
    expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
    df.Date = pd.to_datetime(df.Date)
    result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
    tm.assert_series_equal(result['Str'], expected['Str'])

    # GH 15421
    df = pd.DataFrame({'A': [10, 20, 30],
                       'B': ['foo', '3', '4'],
                       'T': [pd.Timestamp("12:31:22")] * 3})

    def get_B(g):
        return g.iloc[0][['B']]
    result = df.groupby('A').apply(get_B)['B']
    expected = df.B
    expected.index = df.A
    tm.assert_series_equal(result, expected)

    # GH 14423
    def predictions(tool):
        out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object)
        if 'step1' in list(tool.State):
            out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0])
        if 'step2' in list(tool.State):
            out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0])
            out['useTime'] = str(
                tool[tool.State == 'step2'].oTime.values[0])
        return out
    df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'],
                        'State': ['step1', 'step2', 'step1', 'step2'],
                        'oTime': ['', '2016-09-19 05:24:33',
                                  '', '2016-09-19 23:59:04'],
                        'Machine': ['23', '36L', '36R', '36R']})
    df2 = df1.copy()
    df2.oTime = pd.to_datetime(df2.oTime)
    expected = df1.groupby('Key').apply(predictions).p1
    result = df2.groupby('Key').apply(predictions).p1
    tm.assert_series_equal(expected, result)


def test_time_field_bug():
    # Test a fix for the following error related to GH issue 11324 When
    # non-key fields in a group-by dataframe contained time-based fields
    # that were not returned by the apply function, an exception would be
    # raised.

    df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]})

    def func_with_no_date(batch):
        return pd.Series({'c': 2})

    def func_with_date(batch):
        return pd.Series({'b': datetime(2015, 1, 1), 'c': 2})

    dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date)
    dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1])
    dfg_no_conversion_expected.index.name = 'a'

    dfg_conversion = df.groupby(by=['a']).apply(func_with_date)
    dfg_conversion_expected = pd.DataFrame(
        {'b': datetime(2015, 1, 1),
         'c': 2}, index=[1])
    dfg_conversion_expected.index.name = 'a'

    tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
    tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected)


def test_gb_apply_list_of_unequal_len_arrays():

    # GH1738
    df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a',
                               'b', 'b', 'b'],
                    'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd',
                               'd', 'd', 'e'],
                    'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
                    'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]})
    df = df.set_index(['group1', 'group2'])
    df_grouped = df.groupby(level=['group1', 'group2'], sort=True)

    def noddy(value, weight):
        out = np.array(value * weight).repeat(3)
        return out

    # the kernel function returns arrays of unequal length
    # pandas sniffs the first one, sees it's an array and not
    # a list, and assumes the rest are of equal length
    # and so tries a vstack

    # don't die
    df_grouped.apply(lambda x: noddy(x.value, x.weight))


def test_groupby_apply_all_none():
    # Tests to make sure no errors if apply function returns all None
    # values. Issue 9684.
    test_df = DataFrame({'groups': [0, 0, 1, 1],
                         'random_vars': [8, 7, 4, 5]})

    def test_func(x):
        pass

    result = test_df.groupby('groups').apply(test_func)
    expected = DataFrame()
    tm.assert_frame_equal(result, expected)


def test_groupby_apply_none_first():
    # GH 12824. Tests if apply returns None first.
    test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]})
    test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]})

    def test_func(x):
        if x.shape[0] < 2:
            return None
        return x.iloc[[0, -1]]

    result1 = test_df1.groupby('groups').apply(test_func)
    result2 = test_df2.groupby('groups').apply(test_func)
    index1 = MultiIndex.from_arrays([[1, 1], [0, 2]],
                                    names=['groups', None])
    index2 = MultiIndex.from_arrays([[2, 2], [1, 3]],
                                    names=['groups', None])
    expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]},
                          index=index1)
    expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]},
                          index=index2)
    tm.assert_frame_equal(result1, expected1)
    tm.assert_frame_equal(result2, expected2)


def test_groupby_apply_return_empty_chunk():
    # GH 22221: apply filter which returns some empty groups
    df = pd.DataFrame(dict(value=[0, 1], group=['filled', 'empty']))
    groups = df.groupby('group')
    result = groups.apply(lambda group: group[group.value != 1]['value'])
    expected = pd.Series([0], name='value',
                         index=MultiIndex.from_product([['empty', 'filled'],
                                                        [0]],
                                                       names=['group', None]
                                                       ).drop('empty'))
    tm.assert_series_equal(result, expected)


def test_apply_with_mixed_types():
    # gh-20949
    df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1, 2, 3], 'C': [4, 6, 5]})
    g = df.groupby('A')

    result = g.transform(lambda x: x / x.sum())
    expected = pd.DataFrame({'B': [1 / 3., 2 / 3., 1], 'C': [0.4, 0.6, 1.0]})
    tm.assert_frame_equal(result, expected)

    result = g.apply(lambda x: x / x.sum())
    tm.assert_frame_equal(result, expected)
@@ -1,157 +0,0 @@
# -*- coding: utf-8 -*-

import numpy as np
from numpy import nan
import pytest

from pandas._libs import groupby, lib, reduction

from pandas.core.dtypes.common import ensure_int64

from pandas import Index, isna
from pandas.core.groupby.ops import generate_bins_generic
import pandas.util.testing as tm
from pandas.util.testing import assert_almost_equal


def test_series_grouper():
    from pandas import Series
    obj = Series(np.random.randn(10))
    dummy = obj[:0]

    labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)

    grouper = reduction.SeriesGrouper(obj, np.mean, labels, 2, dummy)
    result, counts = grouper.get_result()

    expected = np.array([obj[3:6].mean(), obj[6:].mean()])
    assert_almost_equal(result, expected)

    exp_counts = np.array([3, 4], dtype=np.int64)
    assert_almost_equal(counts, exp_counts)
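
    # In the label array above, -1 marks rows that belong to no group, so
    # obj[:3] is skipped entirely; that is why `expected` only covers
    # obj[3:6] and obj[6:] and the counts are [3, 4] rather than [3, 3, 4].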


def test_series_bin_grouper():
    from pandas import Series
    obj = Series(np.random.randn(10))
    dummy = obj[:0]

    bins = np.array([3, 6])

    grouper = reduction.SeriesBinGrouper(obj, np.mean, bins, dummy)
    result, counts = grouper.get_result()

    expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()])
    assert_almost_equal(result, expected)

    exp_counts = np.array([3, 3, 4], dtype=np.int64)
    assert_almost_equal(counts, exp_counts)


class TestBinGroupers(object):

    def setup_method(self, method):
        self.obj = np.random.randn(10, 1)
        self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2],
                               dtype=np.int64)
        self.bins = np.array([3, 6], dtype=np.int64)

    def test_generate_bins(self):
        values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
        binner = np.array([0, 3, 6, 9], dtype=np.int64)

        for func in [lib.generate_bins_dt64, generate_bins_generic]:
            bins = func(values, binner, closed='left')
            assert ((bins == np.array([2, 5, 6])).all())

            bins = func(values, binner, closed='right')
            assert ((bins == np.array([3, 6, 6])).all())

        for func in [lib.generate_bins_dt64, generate_bins_generic]:
            values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
            binner = np.array([0, 3, 6], dtype=np.int64)

            bins = func(values, binner, closed='right')
            assert ((bins == np.array([3, 6])).all())

        msg = "Invalid length for values or for binner"
        with pytest.raises(ValueError, match=msg):
            generate_bins_generic(values, [], 'right')
        with pytest.raises(ValueError, match=msg):
            generate_bins_generic(values[:0], binner, 'right')

        msg = "Values falls before first bin"
        with pytest.raises(ValueError, match=msg):
            generate_bins_generic(values, [4], 'right')
        msg = "Values falls after last bin"
        with pytest.raises(ValueError, match=msg):
            generate_bins_generic(values, [-3, -1], 'right')


def test_group_ohlc():
    def _check(dtype):
        obj = np.array(np.random.randn(20), dtype=dtype)

        bins = np.array([6, 12, 20])
        out = np.zeros((3, 4), dtype)
        counts = np.zeros(len(out), dtype=np.int64)
        labels = ensure_int64(np.repeat(np.arange(3),
                                        np.diff(np.r_[0, bins])))

        func = getattr(groupby, 'group_ohlc_%s' % dtype)
        func(out, counts, obj[:, None], labels)

        def _ohlc(group):
            if isna(group).all():
                return np.repeat(nan, 4)
            return [group[0], group.max(), group.min(), group[-1]]

        expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]),
                             _ohlc(obj[12:])])

        assert_almost_equal(out, expected)
        tm.assert_numpy_array_equal(counts,
                                    np.array([6, 6, 8], dtype=np.int64))

        obj[:6] = nan
        func(out, counts, obj[:, None], labels)
        expected[0] = nan
        assert_almost_equal(out, expected)

    _check('float32')
    _check('float64')
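
    # The four output columns per bin are open/high/low/close, matching the
    # _ohlc helper above: first value, max, min, last value; a group that is
    # entirely NaN yields a NaN row.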


class TestMoments(object):
    pass


class TestReducer(object):

    def test_int_index(self):
        from pandas.core.series import Series

        arr = np.random.randn(100, 4)
        result = reduction.reduce(arr, np.sum, labels=Index(np.arange(4)))
        expected = arr.sum(0)
        assert_almost_equal(result, expected)

        result = reduction.reduce(arr, np.sum, axis=1,
                                  labels=Index(np.arange(100)))
        expected = arr.sum(1)
        assert_almost_equal(result, expected)

        dummy = Series(0., index=np.arange(100))
        result = reduction.reduce(arr, np.sum, dummy=dummy,
                                  labels=Index(np.arange(4)))
        expected = arr.sum(0)
        assert_almost_equal(result, expected)

        dummy = Series(0., index=np.arange(4))
        result = reduction.reduce(arr, np.sum, axis=1, dummy=dummy,
                                  labels=Index(np.arange(100)))
        expected = arr.sum(1)
        assert_almost_equal(result, expected)

        result = reduction.reduce(arr, np.sum, axis=1, dummy=dummy,
                                  labels=Index(np.arange(100)))
        assert_almost_equal(result, expected)
@@ -1,936 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function

from datetime import datetime

import numpy as np
import pytest

from pandas.compat import PY37

import pandas as pd
from pandas import (
    Categorical, CategoricalIndex, DataFrame, Index, MultiIndex, Series, qcut)
import pandas.util.testing as tm
from pandas.util.testing import (
    assert_equal, assert_frame_equal, assert_series_equal)


def cartesian_product_for_groupers(result, args, names):
    """ Reindex to a cartesian product of the groupers,
    preserving the nature (Categorical) of each grouper """

    def f(a):
        if isinstance(a, (CategoricalIndex, Categorical)):
            categories = a.categories
            a = Categorical.from_codes(np.arange(len(categories)),
                                       categories=categories,
                                       ordered=a.ordered)
        return a

    index = pd.MultiIndex.from_product(map(f, args), names=names)
    return result.reindex(index).sort_index()
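

# A minimal usage sketch (a hypothetical extra test, assuming only the
# helper signature above): expanding two observed combinations onto the
# full 3x3 category grid yields 9 rows, with unobserved cells NaN-filled.
def test_cartesian_product_for_groupers_sketch():
    cat = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    res = DataFrame({'values': [1, 2]},
                    index=pd.MultiIndex.from_arrays([cat, cat],
                                                    names=['x', 'y']))
    full = cartesian_product_for_groupers(res, [cat, cat], ['x', 'y'])
    assert len(full) == 9
    assert full['values'].count() == 2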


def test_apply_use_categorical_name(df):
    cats = qcut(df.C, 4)

    def get_stats(group):
        return {'min': group.min(),
                'max': group.max(),
                'count': group.count(),
                'mean': group.mean()}

    result = df.groupby(cats, observed=False).D.apply(get_stats)
    assert result.index.names[0] == 'C'


def test_basic():

    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"], ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                   [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).filter(np.all),
        df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(
        c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels, categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels, ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(
        exp_cats, sort=False, observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8),
                                  levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(0)), exp)
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                 '75%', 'max'] * 4)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(1)), exp)


def test_level_get_group(observed):
    # GH15155
    df = DataFrame(data=np.arange(2, 22, 2),
                   index=MultiIndex(
                       levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
                       codes=[[0] * 5 + [1] * 5, range(10)],
                       names=["Index1", "Index2"]))
    g = df.groupby(level=["Index1"], observed=observed)

    # expected should equal test.loc[["a"]]
    # GH15166
    expected = DataFrame(data=np.arange(2, 12, 2),
                         index=pd.MultiIndex(
                             levels=[pd.CategoricalIndex(["a", "b"]),
                                     range(5)],
                             codes=[[0] * 5, range(5)],
                             names=["Index1", "Index2"]))
    result = g.get_group('a')

    assert_frame_equal(result, expected)


@pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636", strict=False)
@pytest.mark.parametrize('ordered', [True, False])
def test_apply(ordered):
    # GH 10138

    dense = Categorical(list('abc'), ordered=ordered)

    # 'b' is in the categories but not in the list
    missing = Categorical(
        list('aaa'), categories=['a', 'b'], ordered=ordered)
    values = np.arange(len(dense))
    df = DataFrame({'missing': missing,
                    'dense': dense,
                    'values': values})
    grouped = df.groupby(['missing', 'dense'], observed=True)

    # missing category 'b' should still exist in the output index
    idx = MultiIndex.from_arrays(
        [missing, dense], names=['missing', 'dense'])
    expected = DataFrame([0, 1, 2.],
                         index=idx,
                         columns=['values'])

    result = grouped.apply(lambda x: np.mean(x))
    assert_frame_equal(result, expected)

    # we coerce back to ints
    expected = expected.astype('int')
    result = grouped.mean()
    assert_frame_equal(result, expected)

    result = grouped.agg(np.mean)
    assert_frame_equal(result, expected)

    # but for transform we should still get back the original index
    idx = MultiIndex.from_arrays([missing, dense],
                                 names=['missing', 'dense'])
    expected = Series(1, index=idx)
    result = grouped.apply(lambda x: 1)
    assert_series_equal(result, expected)


def test_observed(observed):
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df['C'] = ['foo', 'bar'] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(['A', 'B', 'C'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays(
        [cat1, cat2, ['foo', 'bar'] * 2],
        names=['A', 'B', 'C'])
    expected = DataFrame({'values': Series(
        [1, 2, 3, 4], index=exp_index)}).sort_index()
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [cat1, cat2, ['foo', 'bar']],
            list('ABC'))

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(['A', 'B'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays(
        [cat1, cat2],
        names=['A', 'B'])
    expected = DataFrame({'values': [1, 2, 3, 4]},
                         index=exp_index)
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [cat1, cat2],
            list('AB'))

    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {'cat':
         pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
                        ordered=True),
         'ints': [1, 1, 2, 2],
         'val': [10, 20, 30, 40]}
    df = pd.DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = pd.CategoricalIndex(list('ab'), name="cat",
                                    categories=list('abc'),
                                    ordered=True)
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]},
                         index=exp_index)
    if not observed:
        index = pd.CategoricalIndex(list('abc'), name="cat",
                                    categories=list('abc'),
                                    ordered=True)
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg('mean')
    expected = DataFrame(
        {"val": [10, 30, 20, 40],
         "cat": pd.Categorical(['a', 'a', 'b', 'b'],
                               categories=['a', 'b', 'c'],
                               ordered=True),
         "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [df.cat.values, [1, 2]],
            ['cat', 'ints'])

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70],
         'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']}
    df = pd.DataFrame(d)
    cat = pd.cut(df['foo'], np.linspace(0, 10, 3))
    df['range'] = cat
    groups = df.groupby(['range', 'baz'], as_index=False, observed=observed)
    result = groups.agg('mean')

    groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed)
    expected = groups2.agg('mean').reset_index()
    tm.assert_frame_equal(result, expected)


def test_observed_codes_remap(observed):
    d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
    df = pd.DataFrame(d)
    values = pd.cut(df['C1'], [1, 2, 3, 6])
    values.name = "cat"
    groups_double_key = df.groupby([values, 'C2'], observed=observed)

    idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]],
                                 names=["cat", "C2"])
    expected = DataFrame({"C1": [3, 3, 4, 5],
                          "C3": [10, 100, 200, 34]}, index=idx)
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [values.values, [1, 2, 3, 4]],
            ['cat', 'C2'])

    result = groups_double_key.agg('mean')
    tm.assert_frame_equal(result, expected)


def test_observed_perf():
    # we create a cartesian product, so this is
    # non-performant if we don't use observed values
    # gh-14942
    df = DataFrame({
        'cat': np.random.randint(0, 255, size=30000),
        'int_id': np.random.randint(0, 255, size=30000),
        'other_id': np.random.randint(0, 10000, size=30000),
        'foo': 0})
    df['cat'] = df.cat.astype(str).astype('category')

    grouped = df.groupby(['cat', 'int_id', 'other_id'], observed=True)
    result = grouped.count()
    assert result.index.levels[0].nunique() == df.cat.nunique()
    assert result.index.levels[1].nunique() == df.int_id.nunique()
    assert result.index.levels[2].nunique() == df.other_id.nunique()
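
    # Rough scale of the grid a full re-expansion would imply here (an
    # estimate, assuming every level combination gets an index slot):
    # up to 255 * 255 * 10000 ~= 650 million entries, versus at most 30000
    # observed rows, which is why only per-level cardinalities are checked.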


def test_observed_groups(observed):
    # gh-20583
    # test that we have the appropriate groups

    cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c'])
    df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]})
    g = df.groupby('cat', observed=observed)

    result = g.groups
    if observed:
        expected = {'a': Index([0, 2], dtype='int64'),
                    'c': Index([1], dtype='int64')}
    else:
        expected = {'a': Index([0, 2], dtype='int64'),
                    'b': Index([], dtype='int64'),
                    'c': Index([1], dtype='int64')}

    tm.assert_dict_equal(result, expected)


def test_observed_groups_with_nan(observed):
    # GH 24740
    df = pd.DataFrame({'cat': pd.Categorical(['a', np.nan, 'a'],
                                             categories=['a', 'b', 'd']),
                       'vals': [1, 2, 3]})
    g = df.groupby('cat', observed=observed)
    result = g.groups
    if observed:
        expected = {'a': Index([0, 2], dtype='int64')}
    else:
        expected = {'a': Index([0, 2], dtype='int64'),
                    'b': Index([], dtype='int64'),
                    'd': Index([], dtype='int64')}
    tm.assert_dict_equal(result, expected)


def test_dataframe_categorical_with_nan(observed):
    # GH 21151
    s1 = pd.Categorical([np.nan, 'a', np.nan, 'a'],
                        categories=['a', 'b', 'c'])
    s2 = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({'s1': s1, 's2': s2})
    result = df.groupby('s1', observed=observed).first().reset_index()
    if observed:
        expected = DataFrame({'s1': pd.Categorical(
            ['a'], categories=['a', 'b', 'c']), 's2': [2]})
    else:
        expected = DataFrame({'s1': pd.Categorical(
            ['a', 'b', 'c'], categories=['a', 'b', 'c']),
            's2': [2, np.nan, np.nan]})
    tm.assert_frame_equal(result, expected)


def test_datetime():
    # GH9049: ensure backward compatibility
    levels = pd.date_range('2014-01-01', periods=4)
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))
    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    expected = expected.reindex(levels)
    expected.index = CategoricalIndex(expected.index,
                                      categories=expected.index,
                                      ordered=True)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = cats.take_nd(idx)
    ord_data = data.take(idx)
    expected = ord_data.groupby(ord_labels, observed=False).describe()
    assert_frame_equal(desc_result, expected)
    tm.assert_index_equal(desc_result.index, expected.index)
    tm.assert_index_equal(
        desc_result.index.get_level_values(0),
        expected.index.get_level_values(0))

    # GH 10460
    expc = Categorical.from_codes(
        np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(0)), exp)
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                 '75%', 'max'] * 4)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(1)), exp)


def test_categorical_index():

    s = np.random.RandomState(12345)
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = s.randint(0, 4, size=20)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    df = DataFrame(
        np.repeat(
            np.arange(20), 4).reshape(-1, 4), columns=list('abcd'))
    df['cats'] = cats

    # with a cat index
    result = df.set_index('cats').groupby(level=0, observed=False).sum()
    expected = df[list('abcd')].groupby(cats.codes, observed=False).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes(
            [0, 1, 2, 3], levels, ordered=True), name='cats')
    assert_frame_equal(result, expected)

    # with a cat column, should produce a cat index
    result = df.groupby('cats', observed=False).sum()
    expected = df[list('abcd')].groupby(cats.codes, observed=False).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes(
            [0, 1, 2, 3], levels, ordered=True), name='cats')
    assert_frame_equal(result, expected)


def test_describe_categorical_columns():
    # GH 11558
    cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'],
                               categories=['foo', 'bar', 'baz', 'qux'],
                               ordered=True)
    df = DataFrame(np.random.randn(20, 4), columns=cats)
    result = df.groupby([1, 2, 3, 4] * 5).describe()

    tm.assert_index_equal(result.stack().columns, cats)
    tm.assert_categorical_equal(result.stack().columns.values, cats.values)


def test_unstack_categorical():
    # GH11558 (example is taken from the original issue)
    df = pd.DataFrame({'a': range(10),
                       'medium': ['A', 'B'] * 5,
                       'artist': list('XYXXY') * 2})
    df['medium'] = df['medium'].astype('category')

    gcat = df.groupby(
        ['artist', 'medium'], observed=False)['a'].count().unstack()
    result = gcat.describe()

    exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False,
                                      name='medium')
    tm.assert_index_equal(result.columns, exp_columns)
    tm.assert_categorical_equal(result.columns.values, exp_columns.values)

    result = gcat['A'] + gcat['B']
    expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist'))
    tm.assert_series_equal(result, expected)


def test_bins_unequal_len():
    # GH3011
    series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
    bins = pd.cut(series.dropna().values, 4)

    # len(bins) != len(series) here
    with pytest.raises(ValueError):
        series.groupby(bins).mean()


def test_as_index():
    # GH13204
    df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
                    'A': [10, 11, 11],
                    'B': [101, 102, 103]})
    result = df.groupby(['cat', 'A'], as_index=False, observed=True).sum()
    expected = DataFrame(
        {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
         'A': [10, 11],
         'B': [101, 205]},
        columns=['cat', 'A', 'B'])
    tm.assert_frame_equal(result, expected)

    # function grouper
    f = lambda r: df.loc[r, 'A']
    result = df.groupby(['cat', f], as_index=False, observed=True).sum()
    expected = DataFrame(
        {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
         'A': [10, 22],
         'B': [101, 205]},
        columns=['cat', 'A', 'B'])
    tm.assert_frame_equal(result, expected)

    # another not in-axis grouper (conflicting names in index)
    s = Series(['a', 'b', 'b'], name='cat')
    result = df.groupby(['cat', s], as_index=False, observed=True).sum()
    tm.assert_frame_equal(result, expected)

    # is original index dropped?
    group_columns = ['cat', 'A']
    expected = DataFrame(
        {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
         'A': [10, 11],
         'B': [101, 205]},
        columns=['cat', 'A', 'B'])

    for name in [None, 'X', 'B']:
        df.index = Index(list("abc"), name=name)
        result = df.groupby(group_columns, as_index=False,
                            observed=True).sum()

        tm.assert_frame_equal(result, expected)


def test_preserve_categories():
    # GH-13179
    categories = list('abc')

    # ordered=True
    df = DataFrame({'A': pd.Categorical(list('ba'),
                                        categories=categories,
                                        ordered=True)})
    index = pd.CategoricalIndex(categories, categories, ordered=True)
    tm.assert_index_equal(
        df.groupby('A', sort=True, observed=False).first().index, index)
    tm.assert_index_equal(
        df.groupby('A', sort=False, observed=False).first().index, index)

    # ordered=False
    df = DataFrame({'A': pd.Categorical(list('ba'),
                                        categories=categories,
                                        ordered=False)})
    sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
    nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
                                       ordered=False)
    tm.assert_index_equal(
        df.groupby('A', sort=True, observed=False).first().index,
        sort_index)
    tm.assert_index_equal(
        df.groupby('A', sort=False, observed=False).first().index,
        nosort_index)


def test_preserve_categorical_dtype():
    # GH13743, GH13854
    df = DataFrame({'A': [1, 2, 1, 1, 2],
                    'B': [10, 16, 22, 28, 34],
                    'C1': Categorical(list("abaab"),
                                      categories=list("bac"),
                                      ordered=False),
                    'C2': Categorical(list("abaab"),
                                      categories=list("bac"),
                                      ordered=True)})
    # single grouper
    exp_full = DataFrame({'A': [2.0, 1.0, np.nan],
                          'B': [25.0, 20.0, np.nan],
                          'C1': Categorical(list("bac"),
                                            categories=list("bac"),
                                            ordered=False),
                          'C2': Categorical(list("bac"),
                                            categories=list("bac"),
                                            ordered=True)})
    for col in ['C1', 'C2']:
        result1 = df.groupby(by=col, as_index=False, observed=False).mean()
        result2 = df.groupby(
            by=col, as_index=True, observed=False).mean().reset_index()
        expected = exp_full.reindex(columns=result1.columns)
        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)


def test_categorical_no_compress():
    data = Series(np.random.randn(9))

    codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)

    result = data.groupby(cats, observed=False).mean()
    exp = data.groupby(codes, observed=False).mean()

    exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                 ordered=cats.ordered)
    assert_series_equal(result, exp)

    codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
    cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)

    result = data.groupby(cats, observed=False).mean()
    exp = data.groupby(codes, observed=False).mean().reindex(cats.categories)
    exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                 ordered=cats.ordered)
    assert_series_equal(result, exp)

    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"], ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    result = data.groupby("b", observed=False).mean()
    result = result["a"].values
    exp = np.array([1, 2, 4, np.nan])
    tm.assert_numpy_array_equal(result, exp)


def test_sort():

    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby  # noqa: flake8
    # This should result in a properly sorted Series so that the plot
    # has a sorted x axis
    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

    df = DataFrame({'value': np.random.randint(0, 10000, 100)})
    labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=['value'], ascending=True)
    df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                               right=False, labels=cat_labels)

    res = df.groupby(['value_group'], observed=False)['value_group'].count()
    exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
    tm.assert_series_equal(res, exp)
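
    # The sort key above reads the numeric left edge out of labels like
    # "0 - 499", which coincides with the categorical order of cat_labels,
    # so groupby(sort=True) should already return `res` in that order.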


def test_sort2():
    # dataframe groupby sort was being ignored # GH 8868
    df = DataFrame([['(7.5, 10]', 10, 10],
                    ['(7.5, 10]', 8, 20],
                    ['(2.5, 5]', 5, 30],
                    ['(5, 7.5]', 6, 40],
                    ['(2.5, 5]', 4, 50],
                    ['(0, 2.5]', 1, 60],
                    ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar'])
    df['range'] = Categorical(df['range'], ordered=True)
    index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
                              '(7.5, 10]'], name='range', ordered=True)
    expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                              columns=['foo', 'bar'], index=index)

    col = 'range'
    result_sort = df.groupby(col, sort=True, observed=False).first()
    assert_frame_equal(result_sort, expected_sort)

    # when categories is ordered, group is ordered by category's order
    expected_sort = result_sort
    result_sort = df.groupby(col, sort=False, observed=False).first()
    assert_frame_equal(result_sort, expected_sort)

    df['range'] = Categorical(df['range'], ordered=False)
    index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
                              '(7.5, 10]'], name='range')
    expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                              columns=['foo', 'bar'], index=index)

    index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
                              '(0, 2.5]'],
                             categories=['(7.5, 10]', '(2.5, 5]',
                                         '(5, 7.5]', '(0, 2.5]'],
                             name='range')
    expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                                index=index, columns=['foo', 'bar'])

    col = 'range'

    # this is an unordered categorical, but we allow this ####
    result_sort = df.groupby(col, sort=True, observed=False).first()
    assert_frame_equal(result_sort, expected_sort)

    result_nosort = df.groupby(col, sort=False, observed=False).first()
    assert_frame_equal(result_nosort, expected_nosort)


def test_sort_datetimelike():
    # GH10505

    # use the same data as test_groupby_sort_categorical, whose categories
    # correspond to datetime.month
    df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
                           datetime(2011, 2, 1), datetime(2011, 5, 1),
                           datetime(2011, 2, 1), datetime(2011, 1, 1),
                           datetime(2011, 5, 1)],
                    'foo': [10, 8, 5, 6, 4, 1, 7],
                    'bar': [10, 20, 30, 40, 50, 60, 70]},
                   columns=['dt', 'foo', 'bar'])

    # ordered=True
    df['dt'] = Categorical(df['dt'], ordered=True)
    index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 7, 1)]
    result_sort = DataFrame(
        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(index, name='dt', ordered=True)

    index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 1, 1)]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=['foo', 'bar'])
    result_nosort.index = CategoricalIndex(index, categories=index,
                                           name='dt', ordered=True)

    col = 'dt'
    assert_frame_equal(
        result_sort, df.groupby(col, sort=True, observed=False).first())

    # when categories is ordered, group is ordered by category's order
    assert_frame_equal(
        result_sort, df.groupby(col, sort=False, observed=False).first())

    # ordered = False
    df['dt'] = Categorical(df['dt'], ordered=False)
    index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 7, 1)]
    result_sort = DataFrame(
        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(index, name='dt')

    index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 1, 1)]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=['foo', 'bar'])
    result_nosort.index = CategoricalIndex(index, categories=index,
                                           name='dt')

    col = 'dt'
    assert_frame_equal(
        result_sort, df.groupby(col, sort=True, observed=False).first())
    assert_frame_equal(
        result_nosort, df.groupby(col, sort=False, observed=False).first())


def test_empty_sum():
    # https://github.com/pandas-dev/pandas/issues/18678
    df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
                                           categories=['a', 'b', 'c']),
                       'B': [1, 2, 1]})
    expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')

    # 0 by default
    result = df.groupby("A", observed=False).B.sum()
    expected = pd.Series([3, 1, 0], expected_idx, name='B')
    tm.assert_series_equal(result, expected)

    # min_count=0
    result = df.groupby("A", observed=False).B.sum(min_count=0)
    expected = pd.Series([3, 1, 0], expected_idx, name='B')
    tm.assert_series_equal(result, expected)

    # min_count=1
    result = df.groupby("A", observed=False).B.sum(min_count=1)
    expected = pd.Series([3, 1, np.nan], expected_idx, name='B')
    tm.assert_series_equal(result, expected)

    # min_count>1
    result = df.groupby("A", observed=False).B.sum(min_count=2)
    expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B')
    tm.assert_series_equal(result, expected)
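
    # min_count in one line: a group contributes a sum only if it has at
    # least min_count non-NA values, otherwise it gets NaN; empty category
    # 'c' fails every threshold >= 1 and single-valued 'b' fails >= 2.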


def test_empty_prod():
    # https://github.com/pandas-dev/pandas/issues/18678
    df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
                                           categories=['a', 'b', 'c']),
                       'B': [1, 2, 1]})

    expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')

    # 1 by default
    result = df.groupby("A", observed=False).B.prod()
    expected = pd.Series([2, 1, 1], expected_idx, name='B')
    tm.assert_series_equal(result, expected)

    # min_count=0
    result = df.groupby("A", observed=False).B.prod(min_count=0)
    expected = pd.Series([2, 1, 1], expected_idx, name='B')
    tm.assert_series_equal(result, expected)

    # min_count=1
    result = df.groupby("A", observed=False).B.prod(min_count=1)
    expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
    tm.assert_series_equal(result, expected)


def test_groupby_multiindex_categorical_datetime():
    # https://github.com/pandas-dev/pandas/issues/21390

    df = pd.DataFrame({
        'key1': pd.Categorical(list('abcbabcba')),
        'key2': pd.Categorical(
            list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3),
        'values': np.arange(9),
    })
    result = df.groupby(['key1', 'key2']).mean()

    idx = pd.MultiIndex.from_product(
        [pd.Categorical(['a', 'b', 'c']),
         pd.Categorical(pd.date_range('2018-06-01 00', freq='1T',
                                      periods=3))],
        names=['key1', 'key2'])
    expected = pd.DataFrame(
        {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("as_index, expected", [
    (True, pd.Series(
        index=pd.MultiIndex.from_arrays(
            [pd.Series([1, 1, 2], dtype='category'),
             [1, 2, 2]], names=['a', 'b']
        ),
        data=[1, 2, 3], name='x'
    )),
    (False, pd.DataFrame({
        'a': pd.Series([1, 1, 2], dtype='category'),
        'b': [1, 2, 2],
        'x': [1, 2, 3]
    }))
])
def test_groupby_agg_observed_true_single_column(as_index, expected):
    # GH-23970
    df = pd.DataFrame({
        'a': pd.Series([1, 1, 2], dtype='category'),
        'b': [1, 2, 2],
        'x': [1, 2, 3]
    })

    result = df.groupby(
        ['a', 'b'], as_index=as_index, observed=True)['x'].sum()

    assert_equal(result, expected)


@pytest.mark.parametrize('fill_value', [None, np.nan, pd.NaT])
def test_shift(fill_value):
    ct = pd.Categorical(['a', 'b', 'c', 'd'],
                        categories=['a', 'b', 'c', 'd'], ordered=False)
    expected = pd.Categorical([None, 'a', 'b', 'c'],
                              categories=['a', 'b', 'c', 'd'], ordered=False)
    res = ct.shift(1, fill_value=fill_value)
    assert_equal(res, expected)
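
    # For a Categorical, every null-like fill_value in the parametrization
    # (None, np.nan, NaT) is treated the same way, as "missing", so all
    # three cases shift a single NaN slot in at the front rather than
    # adding a new category.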
@@ -1,224 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function

import numpy as np
import pytest

from pandas.compat import product as cart_product, range

from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp
from pandas.util.testing import assert_frame_equal, assert_series_equal


class TestCounting(object):

    def test_cumcount(self):
        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
        g = df.groupby('A')
        sg = g.A

        expected = Series([0, 1, 2, 0, 3])

        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, sg.cumcount())
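
        # Worked by hand for the frame above: 'a' occurs at positions
        # 0, 1, 2, 4 (running counts 0, 1, 2, 3) and 'b' once at position 3
        # (running count 0), giving [0, 1, 2, 0, 3].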

    def test_cumcount_empty(self):
        ge = DataFrame().groupby(level=0)
        se = Series().groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype='int64')

        assert_series_equal(e, ge.cumcount())
        assert_series_equal(e, se.cumcount())

    def test_cumcount_dupe_index(self):
        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
                       index=[0] * 5)
        g = df.groupby('A')
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, sg.cumcount())

    def test_cumcount_mi(self):
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
                       index=mi)
        g = df.groupby('A')
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=mi)

        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, sg.cumcount())

    def test_cumcount_groupby_not_col(self):
        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
                       index=[0] * 5)
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, sg.cumcount())

    def test_ngroup(self):
        df = DataFrame({'A': list('aaaba')})
        g = df.groupby('A')
        sg = g.A

        expected = Series([0, 0, 0, 1, 0])

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_distinct(self):
        df = DataFrame({'A': list('abcde')})
        g = df.groupby('A')
        sg = g.A

        expected = Series(range(5), dtype='int64')

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_one_group(self):
        df = DataFrame({'A': [0] * 5})
        g = df.groupby('A')
        sg = g.A

        expected = Series([0] * 5)

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_empty(self):
        ge = DataFrame().groupby(level=0)
        se = Series().groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype='int64')

        assert_series_equal(e, ge.ngroup())
        assert_series_equal(e, se.ngroup())

    def test_ngroup_series_matches_frame(self):
        df = DataFrame({'A': list('aaaba')})
        s = Series(list('aaaba'))

        assert_series_equal(df.groupby(s).ngroup(),
                            s.groupby(s).ngroup())

    def test_ngroup_dupe_index(self):
        df = DataFrame({'A': list('aaaba')}, index=[0] * 5)
        g = df.groupby('A')
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_mi(self):
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame({'A': list('aaaba')}, index=mi)
        g = df.groupby('A')
        sg = g.A
        expected = Series([0, 0, 0, 1, 0], index=mi)

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_groupby_not_col(self):
        df = DataFrame({'A': list('aaaba')}, index=[0] * 5)
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_descending(self):
        df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A'])
        g = df.groupby(['A'])

        ascending = Series([0, 0, 1, 0, 1])
        descending = Series([1, 1, 0, 1, 0])

        assert_series_equal(descending, (g.ngroups - 1) - ascending)
        assert_series_equal(ascending, g.ngroup(ascending=True))
        assert_series_equal(descending, g.ngroup(ascending=False))
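
        # The first assertion states the general identity: with k groups,
        # reverse numbering is (k - 1) - ngroup(ascending=True); here k == 2.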

    def test_ngroup_matches_cumcount(self):
        # verify one manually-worked out case works
        df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'],
                        ['a', 'x'], ['b', 'y']], columns=['A', 'X'])
        g = df.groupby(['A', 'X'])
        g_ngroup = g.ngroup()
        g_cumcount = g.cumcount()
        expected_ngroup = Series([0, 1, 2, 0, 3])
        expected_cumcount = Series([0, 0, 0, 1, 0])

        assert_series_equal(g_ngroup, expected_ngroup)
        assert_series_equal(g_cumcount, expected_cumcount)

    def test_ngroup_cumcount_pair(self):
        # brute force comparison for all small series
        for p in cart_product(range(3), repeat=4):
            df = DataFrame({'a': p})
            g = df.groupby(['a'])

            order = sorted(set(p))
            ngroupd = [order.index(val) for val in p]
            cumcounted = [p[:i].count(val) for i, val in enumerate(p)]

            assert_series_equal(g.ngroup(), Series(ngroupd))
            assert_series_equal(g.cumcount(), Series(cumcounted))

    def test_ngroup_respects_groupby_order(self):
        np.random.seed(0)
        df = DataFrame({'a': np.random.choice(list('abcdef'), 100)})
        for sort_flag in (False, True):
            g = df.groupby(['a'], sort=sort_flag)
            df['group_id'] = -1
            df['group_index'] = -1

            for i, (_, group) in enumerate(g):
                df.loc[group.index, 'group_id'] = i
                for j, ind in enumerate(group.index):
                    df.loc[ind, 'group_index'] = j

            assert_series_equal(Series(df['group_id'].values),
                                g.ngroup())
            assert_series_equal(Series(df['group_index'].values),
                                g.cumcount())

    @pytest.mark.parametrize('datetimelike', [
        [Timestamp('2016-05-%02d 20:09:25+00:00' % i) for i in range(1, 4)],
        [Timestamp('2016-05-%02d 20:09:25' % i) for i in range(1, 4)],
        [Timedelta(x, unit="h") for x in range(1, 4)],
        [Period(freq="2W", year=2017, month=x) for x in range(1, 4)]])
    def test_count_with_datetimelike(self, datetimelike):
        # test for #13393, where DataFrameGroupBy.count() fails
        # when counting a datetimelike column.

        df = DataFrame({'x': ['a', 'a', 'b'], 'y': datetimelike})
        res = df.groupby('x').count()
        expected = DataFrame({'y': [2, 1]}, index=['a', 'b'])
        expected.index.name = "x"
        assert_frame_equal(expected, res)

    def test_count_with_only_nans_in_first_group(self):
        # GH21956
        df = DataFrame({'A': [np.nan, np.nan], 'B': ['a', 'b'], 'C': [1, 2]})
        result = df.groupby(['A', 'B']).C.count()
        mi = MultiIndex(levels=[[], ['a', 'b']],
                        codes=[[], []],
                        names=['A', 'B'])
        expected = Series([], index=mi, dtype=np.int64, name='C')
        assert_series_equal(result, expected, check_index_type=False)
@@ -1,588 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Series, Timestamp
import pandas.util.testing as tm


def test_filter_series():
    s = pd.Series([1, 3, 20, 5, 22, 24, 7])
    expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = pd.Series([20, 22, 24], index=[2, 4, 5])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() < 10), expected_odd)
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() > 10), expected_even)
    # Test dropna=False.
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() < 10, dropna=False),
        expected_odd.reindex(s.index))
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() > 10, dropna=False),
        expected_even.reindex(s.index))
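
    # dropna=False keeps the original shape: rows from rejected groups stay
    # in place as NaN rather than being dropped, which is exactly what
    # reindexing the expectations against s.index encodes above.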


def test_filter_single_column_df():
    df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7])
    expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5])
    grouper = df[0].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    tm.assert_frame_equal(
        grouped.filter(lambda x: x.mean() < 10), expected_odd)
    tm.assert_frame_equal(
        grouped.filter(lambda x: x.mean() > 10), expected_even)
    # Test dropna=False.
    tm.assert_frame_equal(
        grouped.filter(lambda x: x.mean() < 10, dropna=False),
        expected_odd.reindex(df.index))
    tm.assert_frame_equal(
        grouped.filter(lambda x: x.mean() > 10, dropna=False),
        expected_even.reindex(df.index))


def test_filter_multi_column_df():
    df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]})
    grouper = df['A'].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2])
    tm.assert_frame_equal(
        grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10),
        expected)


def test_filter_mixed_df():
    df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
    grouper = df['A'].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2])
    tm.assert_frame_equal(
        grouped.filter(lambda x: x['A'].sum() > 10), expected)


def test_filter_out_all_groups():
    s = pd.Series([1, 3, 20, 5, 22, 24, 7])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])
    df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
    grouper = df['A'].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    tm.assert_frame_equal(
        grouped.filter(lambda x: x['A'].sum() > 1000), df.loc[[]])


def test_filter_out_no_groups():
    s = pd.Series([1, 3, 20, 5, 22, 24, 7])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    filtered = grouped.filter(lambda x: x.mean() > 0)
    tm.assert_series_equal(filtered, s)
    df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
    grouper = df['A'].apply(lambda x: x % 2)
    grouped = df.groupby(grouper)
    filtered = grouped.filter(lambda x: x['A'].mean() > 0)
    tm.assert_frame_equal(filtered, df)


def test_filter_out_all_groups_in_df():
    # GH12768
    df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]})
    res = df.groupby('a')
    res = res.filter(lambda x: x['b'].sum() > 5, dropna=False)
    expected = pd.DataFrame({'a': [np.nan] * 3, 'b': [np.nan] * 3})
    tm.assert_frame_equal(expected, res)

    df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]})
    res = df.groupby('a')
    res = res.filter(lambda x: x['b'].sum() > 5, dropna=True)
    expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64")
    tm.assert_frame_equal(expected, res)


def test_filter_condition_raises():
    def raise_if_sum_is_zero(x):
        if x.sum() == 0:
            raise ValueError
        else:
            return x.sum() > 0

    s = pd.Series([-1, 0, 1, 2])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        grouped.filter(raise_if_sum_is_zero)


def test_filter_with_axis_in_groupby():
    # issue 11041
    index = pd.MultiIndex.from_product([range(10), [0, 1]])
    data = pd.DataFrame(
        np.arange(100).reshape(-1, 20), columns=index, dtype='int64')
    result = data.groupby(level=0,
                          axis=1).filter(lambda x: x.iloc[0, 0] > 10)
    expected = data.iloc[:, 12:20]
    tm.assert_frame_equal(result, expected)


def test_filter_bad_shapes():
    df = DataFrame({'A': np.arange(8),
                    'B': list('aabbbbcc'),
                    'C': np.arange(8)})
    s = df['B']
    g_df = df.groupby('B')
    g_s = s.groupby(s)

    f = lambda x: x
    msg = "filter function returned a DataFrame, but expected a scalar bool"
    with pytest.raises(TypeError, match=msg):
        g_df.filter(f)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        g_s.filter(f)

    f = lambda x: x == 1
    msg = "filter function returned a DataFrame, but expected a scalar bool"
    with pytest.raises(TypeError, match=msg):
        g_df.filter(f)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        g_s.filter(f)

    f = lambda x: np.outer(x, x)
    msg = "can't multiply sequence by non-int of type 'str'"
    with pytest.raises(TypeError, match=msg):
        g_df.filter(f)
    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        g_s.filter(f)


def test_filter_nan_is_false():
    df = DataFrame({'A': np.arange(8),
                    'B': list('aabbbbcc'),
                    'C': np.arange(8)})
    s = df['B']
    g_df = df.groupby(df['B'])
    g_s = s.groupby(s)

    f = lambda x: np.nan
    tm.assert_frame_equal(g_df.filter(f), df.loc[[]])
    tm.assert_series_equal(g_s.filter(f), s[[]])


def test_filter_against_workaround():
    np.random.seed(0)
    # Series of ints
    s = Series(np.random.randint(0, 100, 1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10

    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Series of floats
    s = 100 * Series(np.random.random(1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Set up DataFrame of ints, floats, strings.
    from string import ascii_lowercase
    letters = np.array(list(ascii_lowercase))
    N = 1000
    random_letters = letters.take(np.random.randint(0, 26, N))
    df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
                    'floats': N / 10 * Series(np.random.random(N)),
                    'letters': Series(random_letters)})

    # Group by ints; filter on floats.
    grouped = df.groupby('ints')
    old_way = df[grouped.floats.
                 transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)

    # Group by floats (rounded); filter on strings.
    grouper = df.floats.apply(lambda x: np.round(x, -1))
    grouped = df.groupby(grouper)
    old_way = df[grouped.letters.
                 transform(lambda x: len(x) < N / 10).astype('bool')]
    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
    tm.assert_frame_equal(new_way, old_way)

    # Group by strings; filter on ints.
    grouped = df.groupby('letters')
    old_way = df[grouped.ints.
                 transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)
|
||||
|
||||
|
||||
def test_filter_using_len():
    # BUG GH4447
    df = DataFrame({'A': np.arange(8),
                    'B': list('aabbbbcc'),
                    'C': np.arange(8)})
    grouped = df.groupby('B')
    actual = grouped.filter(lambda x: len(x) > 2)
    expected = DataFrame(
        {'A': np.arange(2, 6),
         'B': list('bbbb'),
         'C': np.arange(2, 6)}, index=np.arange(2, 6))
    tm.assert_frame_equal(actual, expected)

    actual = grouped.filter(lambda x: len(x) > 4)
    expected = df.loc[[]]
    tm.assert_frame_equal(actual, expected)

    # Series have always worked properly, but we'll test anyway.
    s = df['B']
    grouped = s.groupby(s)
    actual = grouped.filter(lambda x: len(x) > 2)
    expected = Series(4 * ['b'], index=np.arange(2, 6), name='B')
    tm.assert_series_equal(actual, expected)

    actual = grouped.filter(lambda x: len(x) > 4)
    expected = s[[]]
    tm.assert_series_equal(actual, expected)


def test_filter_maintains_ordering():
    # Simple case: index is sequential. #4621
    df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                    'tag': [23, 45, 62, 24, 45, 34, 25, 62]})
    s = df['pid']
    grouped = df.groupby('tag')
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = df.iloc[[1, 2, 4, 7]]
    tm.assert_frame_equal(actual, expected)

    grouped = s.groupby(df['tag'])
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = s.iloc[[1, 2, 4, 7]]
    tm.assert_series_equal(actual, expected)

    # Now index is sequentially decreasing.
    df.index = np.arange(len(df) - 1, -1, -1)
    s = df['pid']
    grouped = df.groupby('tag')
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = df.iloc[[1, 2, 4, 7]]
    tm.assert_frame_equal(actual, expected)

    grouped = s.groupby(df['tag'])
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = s.iloc[[1, 2, 4, 7]]
    tm.assert_series_equal(actual, expected)

    # Index is shuffled.
    SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
    df.index = df.index[SHUFFLED]
    s = df['pid']
    grouped = df.groupby('tag')
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = df.iloc[[1, 2, 4, 7]]
    tm.assert_frame_equal(actual, expected)

    grouped = s.groupby(df['tag'])
    actual = grouped.filter(lambda x: len(x) > 1)
    expected = s.iloc[[1, 2, 4, 7]]
    tm.assert_series_equal(actual, expected)


def test_filter_multiple_timestamp():
    # GH 10114
    df = DataFrame({'A': np.arange(5, dtype='int64'),
                    'B': ['foo', 'bar', 'foo', 'bar', 'bar'],
                    'C': Timestamp('20130101')})

    grouped = df.groupby(['B', 'C'])

    result = grouped['A'].filter(lambda x: True)
    tm.assert_series_equal(df['A'], result)

    result = grouped['A'].transform(len)
    expected = Series([2, 3, 2, 3, 3], name='A')
    tm.assert_series_equal(result, expected)

    result = grouped.filter(lambda x: True)
    tm.assert_frame_equal(df, result)

    result = grouped.transform('sum')
    expected = DataFrame({'A': [2, 8, 2, 8, 8]})
    tm.assert_frame_equal(result, expected)

    result = grouped.transform(len)
    expected = DataFrame({'A': [2, 3, 2, 3, 3]})
    tm.assert_frame_equal(result, expected)


def test_filter_and_transform_with_non_unique_int_index():
    # GH4620
    index = [1, 1, 1, 2, 1, 1, 0, 1]
    df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                    'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
    grouped_df = df.groupby('tag')
    ser = df['pid']
    grouped_ser = ser.groupby(df['tag'])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    expected = df.copy()
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_and_transform_with_multiple_non_unique_int_index():
    # GH4620
    index = [1, 1, 1, 2, 0, 0, 0, 1]
    df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                    'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
    grouped_df = df.groupby('tag')
    ser = df['pid']
    grouped_ser = ser.groupby(df['tag'])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    expected = df.copy()
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_and_transform_with_non_unique_float_index():
    # GH4620
    index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
    df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                    'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
    grouped_df = df.groupby('tag')
    ser = df['pid']
    grouped_ser = ser.groupby(df['tag'])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    expected = df.copy()
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_and_transform_with_non_unique_timestamp_index():
    # GH4620
    t0 = Timestamp('2013-09-30 00:05:00')
    t1 = Timestamp('2013-10-30 00:05:00')
    t2 = Timestamp('2013-11-30 00:05:00')
    index = [t1, t1, t1, t2, t1, t1, t0, t1]
    df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                    'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
    grouped_df = df.groupby('tag')
    ser = df['pid']
    grouped_ser = ser.groupby(df['tag'])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    expected = df.copy()
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_and_transform_with_non_unique_string_index():
    # GH4620
    index = list('bbbcbbab')
    df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                    'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
    grouped_df = df.groupby('tag')
    ser = df['pid']
    grouped_ser = ser.groupby(df['tag'])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    expected = df.copy()
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)


def test_filter_has_access_to_grouped_cols():
    df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')
    # previously didn't have access to col A #????
    filt = g.filter(lambda x: x['A'].sum() == 2)
    tm.assert_frame_equal(filt, df.iloc[[0, 1]])


def test_filter_enforces_scalarness():
    df = pd.DataFrame([
        ['best', 'a', 'x'],
        ['worst', 'b', 'y'],
        ['best', 'c', 'x'],
        ['best', 'd', 'y'],
        ['worst', 'd', 'y'],
        ['worst', 'd', 'y'],
        ['best', 'd', 'z'],
    ], columns=['a', 'b', 'c'])
    with pytest.raises(TypeError, match='filter function returned a.*'):
        df.groupby('c').filter(lambda g: g['a'] == 'best')


def test_filter_non_bool_raises():
    df = pd.DataFrame([
        ['best', 'a', 1],
        ['worst', 'b', 1],
        ['best', 'c', 1],
        ['best', 'd', 1],
        ['worst', 'd', 1],
        ['worst', 'd', 1],
        ['best', 'd', 1],
    ], columns=['a', 'b', 'c'])
    with pytest.raises(TypeError, match='filter function returned a.*'):
        df.groupby('a').filter(lambda g: g.c.mean())


def test_filter_dropna_with_empty_groups():
    # GH 10780
    data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3))
    grouped = data.groupby(level=0)
    result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
    expected_false = pd.Series([np.nan] * 9,
                               index=np.repeat([1, 2, 3], 3))
    tm.assert_series_equal(result_false, expected_false)

    result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
    expected_true = pd.Series(index=pd.Index([], dtype=int))
    tm.assert_series_equal(result_true, expected_true)
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,838 +0,0 @@
# -*- coding: utf-8 -*-

""" test where we are determining what we are grouping, or getting groups """

import numpy as np
import pytest

from pandas.compat import long, lrange

import pandas as pd
from pandas import (
    CategoricalIndex, DataFrame, Index, MultiIndex, Series, Timestamp, compat,
    date_range)
from pandas.core.groupby.grouper import Grouping
import pandas.util.testing as tm
from pandas.util.testing import (
    assert_almost_equal, assert_frame_equal, assert_panel_equal,
    assert_series_equal)

# selection
# --------------------------------


class TestSelection(object):

    def test_select_bad_cols(self):
        df = DataFrame([[1, 2]], columns=['A', 'B'])
        g = df.groupby('A')
        with pytest.raises(KeyError, match='"Columns not found: \'C\'"'):
            g[['C']]

        with pytest.raises(KeyError, match='^[^A]+$'):
            # A should not be referenced as a bad column...
            # will have to rethink regex if you change message!
            g[['A', 'C']]

    def test_groupby_duplicated_column_errormsg(self):
        # GH7511
        df = DataFrame(columns=['A', 'B', 'A', 'C'],
                       data=[range(4), range(2, 6), range(0, 8, 2)])

        msg = "Grouper for 'A' not 1-dimensional"
        with pytest.raises(ValueError, match=msg):
            df.groupby('A')
        with pytest.raises(ValueError, match=msg):
            df.groupby(['A', 'B'])

        grouped = df.groupby('B')
        c = grouped.count()
        assert c.columns.nlevels == 1
        assert c.columns.size == 3

    def test_column_select_via_attr(self, df):
        result = df.groupby('A').C.sum()
        expected = df.groupby('A')['C'].sum()
        assert_series_equal(result, expected)

        df['mean'] = 1.5
        result = df.groupby('A').mean()
        expected = df.groupby('A').agg(np.mean)
        assert_frame_equal(result, expected)

    def test_getitem_list_of_columns(self):
        df = DataFrame(
            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
             'C': np.random.randn(8),
             'D': np.random.randn(8),
             'E': np.random.randn(8)})

        result = df.groupby('A')[['C', 'D']].mean()
        result2 = df.groupby('A')['C', 'D'].mean()
        result3 = df.groupby('A')[df.columns[2:4]].mean()

        expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean()

        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
        assert_frame_equal(result3, expected)

    def test_getitem_numeric_column_names(self):
        # GH #13731
        df = DataFrame({0: list('abcd') * 2,
                        2: np.random.randn(8),
                        4: np.random.randn(8),
                        6: np.random.randn(8)})
        result = df.groupby(0)[df.columns[1:3]].mean()
        result2 = df.groupby(0)[2, 4].mean()
        result3 = df.groupby(0)[[2, 4]].mean()

        expected = df.loc[:, [0, 2, 4]].groupby(0).mean()

        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
        assert_frame_equal(result3, expected)


# grouping
# --------------------------------

class TestGrouping():

    def test_grouper_index_types(self):
        # related GH5375
        # groupby misbehaving when using a Floatlike index
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
        for index in [tm.makeFloatIndex, tm.makeStringIndex,
                      tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex,
                      tm.makePeriodIndex]:

            df.index = index(len(df))
            df.groupby(list('abcde')).apply(lambda x: x)

            df.index = list(reversed(df.index.tolist()))
            df.groupby(list('abcde')).apply(lambda x: x)

    def test_grouper_multilevel_freq(self):

        # GH 7885
        # with level and freq specified in a pd.Grouper
        from datetime import date, timedelta
        d0 = date.today() - timedelta(days=14)
        dates = date_range(d0, date.today())
        date_index = pd.MultiIndex.from_product(
            [dates, dates], names=['foo', 'bar'])
        df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)

        # Check string level
        expected = df.reset_index().groupby([pd.Grouper(
            key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum()
        # reset index changes columns dtype to object
        expected.columns = pd.Index([0], dtype='int64')

        result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper(
            level='bar', freq='W')]).sum()
        assert_frame_equal(result, expected)

        # Check integer level
        result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper(
            level=1, freq='W')]).sum()
        assert_frame_equal(result, expected)

    def test_grouper_creation_bug(self):

        # GH 8795
        df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
        g = df.groupby('A')
        expected = g.sum()

        g = df.groupby(pd.Grouper(key='A'))
        result = g.sum()
        assert_frame_equal(result, expected)

        result = g.apply(lambda x: x.sum())
        assert_frame_equal(result, expected)

        g = df.groupby(pd.Grouper(key='A', axis=0))
        result = g.sum()
        assert_frame_equal(result, expected)

        # GH14334
        # pd.Grouper(key=...) may be passed in a list
        df = DataFrame({'A': [0, 0, 0, 1, 1, 1],
                        'B': [1, 1, 2, 2, 3, 3],
                        'C': [1, 2, 3, 4, 5, 6]})
        # Group by single column
        expected = df.groupby('A').sum()
        g = df.groupby([pd.Grouper(key='A')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(['A', 'B']).sum()

        # Group with two Grouper objects
        g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(['A', pd.Grouper(key='B')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([pd.Grouper(key='A'), 'B'])
        result = g.sum()
        assert_frame_equal(result, expected)

        # GH8866
        s = Series(np.arange(8, dtype='int64'),
                   index=pd.MultiIndex.from_product(
                       [list('ab'), range(2),
                        date_range('20130101', periods=2)],
                       names=['one', 'two', 'three']))
        result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
        expected = Series([28], index=Index(
            [Timestamp('2013-01-31')], freq='M', name='three'))
        assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level='one')).sum()
        expected = s.groupby(level='one').sum()
        assert_series_equal(result, expected)

    def test_grouper_column_and_index(self):
        # GH 14327

        # Grouping a multi-index frame by a column and an index level should
        # be equivalent to resetting the index and grouping by two columns
        idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
                                         ('b', 1), ('b', 2), ('b', 3)])
        idx.names = ['outer', 'inner']
        df_multi = pd.DataFrame({"A": np.arange(6),
                                 'B': ['one', 'one', 'two',
                                       'two', 'one', 'one']},
                                index=idx)
        result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean()
        expected = df_multi.reset_index().groupby(['B', 'inner']).mean()
        assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean()
        expected = df_multi.reset_index().groupby(['inner', 'B']).mean()
        assert_frame_equal(result, expected)

        # Grouping a single-index frame by a column and the index should
        # be equivalent to resetting the index and grouping by two columns
        df_single = df_multi.reset_index('outer')
        result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean()
        expected = df_single.reset_index().groupby(['B', 'inner']).mean()
        assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean()
        expected = df_single.reset_index().groupby(['inner', 'B']).mean()
        assert_frame_equal(result, expected)

    def test_groupby_levels_and_columns(self):
        # GH9344, GH9049
        idx_names = ['x', 'y']
        idx = pd.MultiIndex.from_tuples(
            [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
        df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)

        by_levels = df.groupby(level=idx_names).mean()
        # reset_index changes columns dtype to object
        by_columns = df.reset_index().groupby(idx_names).mean()

        tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)

        by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
        tm.assert_frame_equal(by_levels, by_columns)

    def test_groupby_categorical_index_and_columns(self, observed):
        # GH18432
        columns = ['A', 'B', 'A', 'B']
        categories = ['B', 'A']
        data = np.ones((5, 4), int)
        cat_columns = CategoricalIndex(columns,
                                       categories=categories,
                                       ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected_data = 2 * np.ones((5, 2), int)

        if observed:
            # if we are not-observed we undergo a reindex
            # so need to adjust the output as our expected sets us up
            # to be non-observed
            expected_columns = CategoricalIndex(['A', 'B'],
                                                categories=categories,
                                                ordered=True)
        else:
            expected_columns = CategoricalIndex(categories,
                                                categories=categories,
                                                ordered=True)
        expected = DataFrame(data=expected_data, columns=expected_columns)
        assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        assert_frame_equal(result, expected)

    def test_grouper_getting_correct_binner(self):

        # GH 10063
        # using a non-time-based grouper and a time-based grouper
        # and specifying levels
        df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product(
            [list('ab'), date_range('20130101', periods=80)], names=['one',
                                                                     'two']))
        result = df.groupby([pd.Grouper(level='one'), pd.Grouper(
            level='two', freq='M')]).sum()
        expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]},
                             index=MultiIndex.from_product(
                                 [list('ab'),
                                  date_range('20130101', freq='M', periods=3)],
                                 names=['one', 'two']))
        assert_frame_equal(result, expected)

    def test_grouper_iter(self, df):
        assert sorted(df.groupby('A').grouper) == ['bar', 'foo']

    def test_empty_groups(self, df):
        # see gh-1048
        with pytest.raises(ValueError, match="No group keys passed!"):
            df.groupby([])

    def test_groupby_grouper(self, df):
        grouped = df.groupby('A')

        result = df.groupby(grouped.grouper).mean()
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_dict_mapping(self):
        # GH #679
        from pandas import Series
        s = Series({'T1': 5})
        result = s.groupby({'T1': 'T2'}).agg(sum)
        expected = s.groupby(['T2']).agg(sum)
        assert_series_equal(result, expected)

        s = Series([1., 2., 3., 4.], index=list('abcd'))
        mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}

        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        expected = s.groupby([0, 0, 1, 1]).mean()
        expected2 = s.groupby([0, 0, 1, 1]).mean()
        assert_series_equal(result, expected)
        assert_series_equal(result, result2)
        assert_series_equal(result, expected2)

    def test_groupby_grouper_f_sanity_checked(self):
        dates = date_range('01-Jan-2013', periods=12, freq='MS')
        ts = Series(np.random.randn(12), index=dates)

        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather than str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.

        msg = r"Grouper result violates len\(labels\) == len\(data\)"
        with pytest.raises(AssertionError, match=msg):
            ts.groupby(lambda key: key[0:6])

    def test_grouping_error_on_multidim_input(self, df):
        msg = ("Grouper for '<class 'pandas.core.frame.DataFrame'>'"
               " not 1-dimensional")
        with pytest.raises(ValueError, match=msg):
            Grouping(df.index, df[['A', 'A']])

    def test_multiindex_passthru(self):

        # GH 7997
        # regression from 0.14.1
        df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])

        result = df.groupby(axis=1, level=[0, 1]).first()
        assert_frame_equal(result, df)

    def test_multiindex_negative_level(self, mframe):
        # GH 13901
        result = mframe.groupby(level=-1).sum()
        expected = mframe.groupby(level='second').sum()
        assert_frame_equal(result, expected)

        result = mframe.groupby(level=-2).sum()
        expected = mframe.groupby(level='first').sum()
        assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-2, -1]).sum()
        expected = mframe
        assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-1, 'first']).sum()
        expected = mframe.groupby(level=['second', 'first']).sum()
        assert_frame_equal(result, expected)

    def test_multifunc_select_col_integer_cols(self, df):
        df.columns = np.arange(len(df.columns))

        # it works!
        df.groupby(1, as_index=False)[2].agg({'Q': np.mean})

    def test_multiindex_columns_empty_level(self):
        lst = [['count', 'values'], ['to filter', '']]
        midx = MultiIndex.from_tuples(lst)

        df = DataFrame([[long(1), 'A']], columns=midx)

        grouped = df.groupby('to filter').groups
        assert grouped['A'] == [0]

        grouped = df.groupby([('to filter', '')]).groups
        assert grouped['A'] == [0]

        df = DataFrame([[long(1), 'A'], [long(2), 'B']], columns=midx)

        expected = df.groupby('to filter').groups
        result = df.groupby([('to filter', '')]).groups
        assert result == expected

        df = DataFrame([[long(1), 'A'], [long(2), 'A']], columns=midx)

        expected = df.groupby('to filter').groups
        result = df.groupby([('to filter', '')]).groups
        tm.assert_dict_equal(result, expected)

    def test_groupby_multiindex_tuple(self):
        # GH 17979
        df = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
                          columns=pd.MultiIndex.from_arrays(
                              [['a', 'b', 'b', 'c'],
                               [1, 1, 2, 2]]))
        expected = df.groupby([('b', 1)]).groups
        result = df.groupby(('b', 1)).groups
        tm.assert_dict_equal(expected, result)

        df2 = pd.DataFrame(df.values,
                           columns=pd.MultiIndex.from_arrays(
                               [['a', 'b', 'b', 'c'],
                                ['d', 'd', 'e', 'e']]))
        expected = df2.groupby([('b', 'd')]).groups
        result = df.groupby(('b', 1)).groups
        tm.assert_dict_equal(expected, result)

        df3 = pd.DataFrame(df.values,
                           columns=[('a', 'd'), ('b', 'd'), ('b', 'e'), 'c'])
        expected = df3.groupby([('b', 'd')]).groups
        result = df.groupby(('b', 1)).groups
        tm.assert_dict_equal(expected, result)

    @pytest.mark.parametrize('sort', [True, False])
    def test_groupby_level(self, sort, mframe, df):
        # GH 17537
        frame = mframe
        deleveled = frame.reset_index()

        result0 = frame.groupby(level=0, sort=sort).sum()
        result1 = frame.groupby(level=1, sort=sort).sum()

        expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum()
        expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum()

        expected0.index.name = 'first'
        expected1.index.name = 'second'

        assert result0.index.name == 'first'
        assert result1.index.name == 'second'

        assert_frame_equal(result0, expected0)
        assert_frame_equal(result1, expected1)
        assert result0.index.name == frame.index.names[0]
        assert result1.index.name == frame.index.names[1]

        # groupby level name
        result0 = frame.groupby(level='first', sort=sort).sum()
        result1 = frame.groupby(level='second', sort=sort).sum()
        assert_frame_equal(result0, expected0)
        assert_frame_equal(result1, expected1)

        # axis=1

        result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
        result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
        assert_frame_equal(result0, expected0.T)
        assert_frame_equal(result1, expected1.T)

        # raise exception for non-MultiIndex
        msg = "level > 0 or level < -1 only valid with MultiIndex"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level=1)

    def test_groupby_level_index_names(self):
        # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
        df = DataFrame({'exp': ['A'] * 3 + ['B'] * 3,
                        'var1': lrange(6), }).set_index('exp')
        df.groupby(level='exp')
        msg = "level name foo is not the name of the index"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level='foo')

    @pytest.mark.parametrize('sort', [True, False])
    def test_groupby_level_with_nas(self, sort):
        # GH 17537
        index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                           codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1,
                                                             2, 3]])

        # factorizing doesn't confuse things
        s = Series(np.arange(8.), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6., 22.], index=[0, 1])
        assert_series_equal(result, expected)

        index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                           codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0,
                                                              1, 2, 3]])

        # factorizing doesn't confuse things
        s = Series(np.arange(8.), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6., 18.], index=[0.0, 1.0])
        assert_series_equal(result, expected)

    def test_groupby_args(self, mframe):
        # PR8618 and issue 8015
        frame = mframe

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby()

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby(by=None, level=None)

    @pytest.mark.parametrize('sort,labels', [
        [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
        [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]]
    ])
    def test_level_preserve_order(self, sort, labels, mframe):
        # GH 17537
        grouped = mframe.groupby(level=0, sort=sort)
        exp_labels = np.array(labels, np.intp)
        assert_almost_equal(grouped.grouper.labels[0], exp_labels)

    def test_grouping_labels(self, mframe):
        grouped = mframe.groupby(mframe.index.get_level_values(0))
        exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
        assert_almost_equal(grouped.grouper.labels[0], exp_labels)

    def test_list_grouper_with_nat(self):
        # GH 14715
        df = pd.DataFrame({'date': pd.date_range('1/1/2011',
                                                 periods=365, freq='D')})
        df.iloc[-1] = pd.NaT
        grouper = pd.Grouper(key='date', freq='AS')

        # Grouper in a list grouping
        result = df.groupby([grouper])
        expected = {pd.Timestamp('2011-01-01'): pd.Index(list(range(364)))}
        tm.assert_dict_equal(result.groups, expected)

        # Test case without a list
        result = df.groupby(grouper)
        expected = {pd.Timestamp('2011-01-01'): 365}
        tm.assert_dict_equal(result.groups, expected)


# get_group
# --------------------------------

class TestGetGroup():

    @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
    def test_get_group(self):
        wp = tm.makePanel()
        grouped = wp.groupby(lambda x: x.month, axis='major')

        gp = grouped.get_group(1)
        expected = wp.reindex(
            major=[x for x in wp.major_axis if x.month == 1])
        assert_panel_equal(gp, expected)

        # GH 5267
        # be datelike friendly
        df = DataFrame({'DATE': pd.to_datetime(
            ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013',
             '11-Oct-2013', '11-Oct-2013']),
            'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'],
            'VAL': [1, 2, 3, 4, 5, 6]})

        g = df.groupby('DATE')
        key = list(g.groups)[0]
        result1 = g.get_group(key)
        result2 = g.get_group(Timestamp(key).to_pydatetime())
        result3 = g.get_group(str(Timestamp(key)))
        assert_frame_equal(result1, result2)
        assert_frame_equal(result1, result3)

        g = df.groupby(['DATE', 'label'])

        key = list(g.groups)[0]
        result1 = g.get_group(key)
        result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1]))
        result3 = g.get_group((str(Timestamp(key[0])), key[1]))
        assert_frame_equal(result1, result2)
        assert_frame_equal(result1, result3)

        # must pass a same-length tuple with multiple keys
        msg = "must supply a tuple to get_group with multiple grouping keys"
        with pytest.raises(ValueError, match=msg):
            g.get_group('foo')
        with pytest.raises(ValueError, match=msg):
            g.get_group(('foo'))
        msg = ("must supply a same-length tuple to get_group with multiple"
               " grouping keys")
        with pytest.raises(ValueError, match=msg):
            g.get_group(('foo', 'bar', 'baz'))

    def test_get_group_empty_bins(self, observed):

        d = pd.DataFrame([3, 1, 7, 6])
        bins = [0, 5, 10, 15]
        g = d.groupby(pd.cut(d[0], bins), observed=observed)

        # TODO: should probably allow a str of Interval to work as well
        # IOW '(0, 5]'
        result = g.get_group(pd.Interval(0, 5))
        expected = DataFrame([3, 1], index=[0, 1])
        assert_frame_equal(result, expected)

        msg = r"Interval\(10, 15, closed='right'\)"
        with pytest.raises(KeyError, match=msg):
            g.get_group(pd.Interval(10, 15))

    def test_get_group_grouped_by_tuple(self):
        # GH 8121
        df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T
        gr = df.groupby('ids')
        expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2])
        result = gr.get_group((1, ))
        assert_frame_equal(result, expected)

        dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01',
                             '2010-01-02'])
        df = DataFrame({'ids': [(x, ) for x in dt]})
        gr = df.groupby('ids')
        result = gr.get_group(('2010-01-01', ))
        expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2])
        assert_frame_equal(result, expected)

    def test_groupby_with_empty(self):
        index = pd.DatetimeIndex(())
        data = ()
        series = pd.Series(data, index)
        grouper = pd.Grouper(freq='D')
        grouped = series.groupby(grouper)
        assert next(iter(grouped), None) is None

    def test_groupby_with_single_column(self):
        df = pd.DataFrame({'a': list('abssbab')})
        tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]])
        # GH 13530
        exp = pd.DataFrame([], index=pd.Index(['a', 'b', 's'], name='a'))
        tm.assert_frame_equal(df.groupby('a').count(), exp)
        tm.assert_frame_equal(df.groupby('a').sum(), exp)
        tm.assert_frame_equal(df.groupby('a').nth(1), exp)

    def test_gb_key_len_equal_axis_len(self):
        # GH16843
        # test ensures that index and column keys are recognized correctly
        # when number of keys equals axis length of groupby
        df = pd.DataFrame([['foo', 'bar', 'B', 1],
                           ['foo', 'bar', 'B', 2],
                           ['foo', 'baz', 'C', 3]],
                          columns=['first', 'second', 'third', 'one'])
        df = df.set_index(['first', 'second'])
        df = df.groupby(['first', 'second', 'third']).size()
        assert df.loc[('foo', 'bar', 'B')] == 2
        assert df.loc[('foo', 'baz', 'C')] == 1


# groups & iteration
# --------------------------------

class TestIteration():

    def test_groups(self, df):
        grouped = df.groupby(['A'])
        groups = grouped.groups
        assert groups is grouped.groups  # caching works

        for k, v in compat.iteritems(grouped.groups):
            assert (df.loc[v]['A'] == k).all()

        grouped = df.groupby(['A', 'B'])
        groups = grouped.groups
        assert groups is grouped.groups  # caching works

        for k, v in compat.iteritems(grouped.groups):
            assert (df.loc[v]['A'] == k[0]).all()
            assert (df.loc[v]['B'] == k[1]).all()

    def test_grouping_is_iterable(self, tsframe):
        # this code path isn't used anywhere else
        # not sure it's useful
        grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year])

        # test it works
        for g in grouped.grouper.groupings[0]:
            pass

    def test_multi_iter(self):
        s = Series(np.arange(6))
        k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
        k2 = np.array(['1', '2', '1', '2', '1', '2'])

        grouped = s.groupby([k1, k2])

        iterated = list(grouped)
        expected = [('a', '1', s[[0, 2]]), ('a', '2', s[[1]]),
                    ('b', '1', s[[4]]), ('b', '2', s[[3, 5]])]
        for i, ((one, two), three) in enumerate(iterated):
            e1, e2, e3 = expected[i]
            assert e1 == one
            assert e2 == two
            assert_series_equal(three, e3)

    def test_multi_iter_frame(self, three_group):
        k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        k2 = np.array(['1', '2', '1', '2', '1', '2'])
        df = DataFrame({'v1': np.random.randn(6),
                        'v2': np.random.randn(6),
                        'k1': k1, 'k2': k2},
                       index=['one', 'two', 'three', 'four', 'five', 'six'])

        grouped = df.groupby(['k1', 'k2'])

        # things get sorted!
        iterated = list(grouped)
        idx = df.index
        expected = [('a', '1', df.loc[idx[[4]]]),
                    ('a', '2', df.loc[idx[[3, 5]]]),
                    ('b', '1', df.loc[idx[[0, 2]]]),
                    ('b', '2', df.loc[idx[[1]]])]
        for i, ((one, two), three) in enumerate(iterated):
            e1, e2, e3 = expected[i]
            assert e1 == one
            assert e2 == two
            assert_frame_equal(three, e3)

        # don't iterate through groups with no data
        df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
        grouped = df.groupby(['k1', 'k2'])
        groups = {key: gp for key, gp in grouped}
        assert len(groups) == 2

        # axis = 1
        three_levels = three_group.groupby(['A', 'B', 'C']).mean()
        grouped = three_levels.T.groupby(axis=1, level=(1, 2))
        for key, group in grouped:
            pass

    @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
    def test_multi_iter_panel(self):
        wp = tm.makePanel()
        grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()],
                             axis=1)

        for (month, wd), group in grouped:
            exp_axis = [x
                        for x in wp.major_axis
                        if x.month == month and x.weekday() == wd]
            expected = wp.reindex(major=exp_axis)
            assert_panel_equal(group, expected)

    def test_dictify(self, df):
        dict(iter(df.groupby('A')))
        dict(iter(df.groupby(['A', 'B'])))
        dict(iter(df['C'].groupby(df['A'])))
        dict(iter(df['C'].groupby([df['A'], df['B']])))
        dict(iter(df.groupby('A')['C']))
        dict(iter(df.groupby(['A', 'B'])['C']))

    def test_groupby_with_small_elem(self):
        # GH 8542
        # length=2
        df = pd.DataFrame({'event': ['start', 'start'],
                           'change': [1234, 5678]},
                          index=pd.DatetimeIndex(['2014-09-10', '2013-10-10']))
        grouped = df.groupby([pd.Grouper(freq='M'), 'event'])
        assert len(grouped.groups) == 2
        assert grouped.ngroups == 2
        assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups
        assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups

        res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[0], :])
        res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[1], :])

        df = pd.DataFrame({'event': ['start', 'start', 'start'],
                           'change': [1234, 5678, 9123]},
                          index=pd.DatetimeIndex(['2014-09-10', '2013-10-10',
                                                  '2014-09-15']))
        grouped = df.groupby([pd.Grouper(freq='M'), 'event'])
        assert len(grouped.groups) == 2
        assert grouped.ngroups == 2
        assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups
        assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups

        res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[0, 2], :])
        res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[1], :])

        # length=3
        df = pd.DataFrame({'event': ['start', 'start', 'start'],
                           'change': [1234, 5678, 9123]},
                          index=pd.DatetimeIndex(['2014-09-10', '2013-10-10',
                                                  '2014-08-05']))
        grouped = df.groupby([pd.Grouper(freq='M'), 'event'])
        assert len(grouped.groups) == 3
        assert grouped.ngroups == 3
        assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups
        assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups
        assert (pd.Timestamp('2014-08-31'), 'start') in grouped.groups

        res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[0], :])
        res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[1], :])
        res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start'))
        tm.assert_frame_equal(res, df.iloc[[2], :])

    def test_grouping_string_repr(self):
        # GH 13394
        mi = MultiIndex.from_arrays([list("AAB"), list("aba")])
        df = DataFrame([[1, 2, 3]], columns=mi)
        gr = df.groupby(df[('A', 'a')])

        result = gr.grouper.groupings[0].__repr__()
        expected = "Grouping(('A', 'a'))"
        assert result == expected
@@ -1,68 +0,0 @@
import numpy as np
import pytest

import pandas as pd
from pandas.util.testing import assert_frame_equal, assert_series_equal


@pytest.fixture(params=[['inner'], ['inner', 'outer']])
def frame(request):
    levels = request.param
    df = pd.DataFrame({'outer': ['a', 'a', 'a', 'b', 'b', 'b'],
                       'inner': [1, 2, 3, 1, 2, 3],
                       'A': np.arange(6),
                       'B': ['one', 'one', 'two', 'two', 'one', 'one']})
    if levels:
        df = df.set_index(levels)

    return df


@pytest.fixture()
def series():
    df = pd.DataFrame({'outer': ['a', 'a', 'a', 'b', 'b', 'b'],
                       'inner': [1, 2, 3, 1, 2, 3],
                       'A': np.arange(6),
                       'B': ['one', 'one', 'two', 'two', 'one', 'one']})
    s = df.set_index(['outer', 'inner', 'B'])['A']

    return s


@pytest.mark.parametrize('key_strs,groupers', [
    ('inner',  # Index name
     pd.Grouper(level='inner')
     ),
    (['inner'],  # List of index name
     [pd.Grouper(level='inner')]
     ),
    (['B', 'inner'],  # Column and index
     ['B', pd.Grouper(level='inner')]
     ),
    (['inner', 'B'],  # Index and column
     [pd.Grouper(level='inner'), 'B'])])
def test_grouper_index_level_as_string(frame, key_strs, groupers):
    result = frame.groupby(key_strs).mean()
    expected = frame.groupby(groupers).mean()
    assert_frame_equal(result, expected)


@pytest.mark.parametrize('levels', [
    'inner', 'outer', 'B',
    ['inner'], ['outer'], ['B'],
    ['inner', 'outer'], ['outer', 'inner'],
    ['inner', 'outer', 'B'], ['B', 'outer', 'inner']
])
def test_grouper_index_level_as_string_series(series, levels):

    # Compute expected result
    if isinstance(levels, list):
        groupers = [pd.Grouper(level=lv) for lv in levels]
    else:
        groupers = pd.Grouper(level=levels)

    expected = series.groupby(groupers).mean()

    # Compute and check result
    result = series.groupby(levels).mean()
    assert_series_equal(result, expected)
@@ -1,416 +0,0 @@
import numpy as np
import pytest

from pandas.compat import lrange

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna
from pandas.util.testing import (
    assert_frame_equal, assert_produces_warning, assert_series_equal)


def test_first_last_nth(df):
    # tests for first / last / nth
    grouped = df.groupby('A')
    first = grouped.first()
    expected = df.loc[[1, 0], ['B', 'C', 'D']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(first, expected)

    nth = grouped.nth(0)
    assert_frame_equal(nth, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ['B', 'C', 'D']]
    expected.index = Index(['bar', 'foo'], name='A')
    assert_frame_equal(last, expected)

    nth = grouped.nth(-1)
    assert_frame_equal(nth, expected)

    nth = grouped.nth(1)
    expected = df.loc[[2, 3], ['B', 'C', 'D']].copy()
    expected.index = Index(['foo', 'bar'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(nth, expected)

    # it works!
    grouped['B'].first()
    grouped['B'].last()
    grouped['B'].nth(0)

    df.loc[df['A'] == 'foo', 'B'] = np.nan
    assert isna(grouped['B'].first()['foo'])
    assert isna(grouped['B'].last()['foo'])
    assert isna(grouped['B'].nth(0)['foo'])

    # v0.14.0 whatsnew
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')
    result = g.first()
    expected = df.iloc[[1, 2]].set_index('A')
    assert_frame_equal(result, expected)

    expected = df.iloc[[1, 2]].set_index('A')
    result = g.nth(0, dropna='any')
    assert_frame_equal(result, expected)


def test_first_last_nth_dtypes(df_mixed_floats):

    df = df_mixed_floats.copy()
    df['E'] = True
    df['F'] = 1

    # tests for first / last / nth
    grouped = df.groupby('A')
    first = grouped.first()
    expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(first, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(last, expected)

    nth = grouped.nth(1)
    expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(nth, expected)

    # GH 2763, first/last shifting dtypes
    idx = lrange(10)
    idx.append(9)
    s = Series(data=lrange(11), index=idx, name='IntCol')
    assert s.dtype == 'int64'
    f = s.groupby(level=0).first()
    assert f.dtype == 'int64'


def test_nth():
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')

    assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
    assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
    assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
    assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
    assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
    assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
    assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
    assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
    assert_frame_equal(g[['B']].nth(0),
                       df.loc[[0, 2], ['A', 'B']].set_index('A'))

    exp = df.set_index('A')
    assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
    assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])

    exp['B'] = np.nan
    assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
    assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])

    # out of bounds, regression from 0.13.1
    # GH 6621
    df = DataFrame({'color': {0: 'green',
                              1: 'green',
                              2: 'red',
                              3: 'red',
                              4: 'red'},
                    'food': {0: 'ham',
                             1: 'eggs',
                             2: 'eggs',
                             3: 'ham',
                             4: 'pork'},
                    'two': {0: 1.5456590000000001,
                            1: -0.070345000000000005,
                            2: -2.4004539999999999,
                            3: 0.46206000000000003,
                            4: 0.52350799999999997},
                    'one': {0: 0.56573799999999996,
                            1: -0.9742360000000001,
                            2: 1.033801,
                            3: -0.78543499999999999,
                            4: 0.70422799999999997}}).set_index(['color',
                                                                 'food'])

    result = df.groupby(level=0, as_index=False).nth(2)
    expected = df.iloc[[-1]]
    assert_frame_equal(result, expected)

    result = df.groupby(level=0, as_index=False).nth(3)
    expected = df.loc[[]]
    assert_frame_equal(result, expected)

    # GH 7559
    # from the vbench
    df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
    s = df[1]
    g = df[0]
    expected = s.groupby(g).first()
    expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
    assert_series_equal(expected2, expected, check_names=False)
    assert expected.name == 1
    assert expected2.name == 1

    # validate first
    v = s[g == 1].iloc[0]
    assert expected.iloc[0] == v
    assert expected2.iloc[0] == v

    # this is NOT the same as .first (as sorted is default!)
    # as it keeps the order in the series (and not the group order)
    # related GH 7287
    expected = s.groupby(g, sort=False).first()
    result = s.groupby(g, sort=False).nth(0, dropna='all')
    assert_series_equal(result, expected)

    # doc example
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')
    # PR 17493, related to issue 11038
    # test Series.nth with True for dropna produces FutureWarning
    with assert_produces_warning(FutureWarning):
        result = g.B.nth(0, dropna=True)
    expected = g.B.first()
    assert_series_equal(result, expected)

    # test multiple nth values
    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
                   columns=['A', 'B'])
    g = df.groupby('A')

    assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
    assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
    assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
    assert_frame_equal(
        g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
    assert_frame_equal(
        g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
    assert_frame_equal(
        g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
    assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
    assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))

    business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
                                   freq='B')
    df = DataFrame(1, index=business_dates, columns=['a', 'b'])
    # get the first, fourth and last two business days for each month
    key = [df.index.year, df.index.month]
    result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
    expected_dates = pd.to_datetime(
        ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
         '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
         '2014/6/27', '2014/6/30'])
    expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
    assert_frame_equal(result, expected)


def test_nth_multi_index(three_group):
|
||||
# PR 9090, related to issue 8979
|
||||
# test nth on MultiIndex, should match .first()
|
||||
grouped = three_group.groupby(['A', 'B'])
|
||||
result = grouped.nth(0)
|
||||
expected = grouped.first()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
|
||||


@pytest.mark.parametrize('data, expected_first, expected_last', [
    ({'id': ['A'],
      'time': Timestamp('2012-02-01 14:00:00', tz='US/Central'),
      'foo': [1]},
     {'id': ['A'],
      'time': Timestamp('2012-02-01 14:00:00', tz='US/Central'),
      'foo': [1]},
     {'id': ['A'],
      'time': Timestamp('2012-02-01 14:00:00', tz='US/Central'),
      'foo': [1]}),
    ({'id': ['A', 'B', 'A'],
      'time': [Timestamp('2012-01-01 13:00:00', tz='America/New_York'),
               Timestamp('2012-02-01 14:00:00', tz='US/Central'),
               Timestamp('2012-03-01 12:00:00', tz='Europe/London')],
      'foo': [1, 2, 3]},
     {'id': ['A', 'B'],
      'time': [Timestamp('2012-01-01 13:00:00', tz='America/New_York'),
               Timestamp('2012-02-01 14:00:00', tz='US/Central')],
      'foo': [1, 2]},
     {'id': ['A', 'B'],
      'time': [Timestamp('2012-03-01 12:00:00', tz='Europe/London'),
               Timestamp('2012-02-01 14:00:00', tz='US/Central')],
      'foo': [3, 2]})
])
def test_first_last_tz(data, expected_first, expected_last):
    # GH 15884
    # Test that the timezone is retained when calling first
    # or last on groupby with as_index=False

    df = DataFrame(data)

    result = df.groupby('id', as_index=False).first()
    expected = DataFrame(expected_first)
    cols = ['id', 'time', 'foo']
    assert_frame_equal(result[cols], expected[cols])

    result = df.groupby('id', as_index=False)['time'].first()
    assert_frame_equal(result, expected[['id', 'time']])

    result = df.groupby('id', as_index=False).last()
    expected = DataFrame(expected_last)
    cols = ['id', 'time', 'foo']
    assert_frame_equal(result[cols], expected[cols])

    result = df.groupby('id', as_index=False)['time'].last()
    assert_frame_equal(result, expected[['id', 'time']])
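

# A brief sketch, assuming the imports above: first()/last() keep the
# timezone-aware dtype instead of converting to UTC or dropping the tz:
#
#   >>> times = pd.to_datetime(['2012-01-01', '2012-01-02'])
#   >>> df = pd.DataFrame({'id': ['A', 'A'],
#   ...                    'time': times.tz_localize('US/Central')})
#   >>> df.groupby('id', as_index=False).first()['time'].dtype
#   datetime64[ns, US/Central]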


def test_nth_multi_index_as_expected():
    # PR 9090, related to issue 8979
    # test nth on MultiIndex
    three_group = DataFrame(
        {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
               'foo', 'foo', 'foo'],
         'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
               'two', 'two', 'one'],
         'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
               'dull', 'shiny', 'shiny', 'shiny']})
    grouped = three_group.groupby(['A', 'B'])
    result = grouped.nth(0)
    expected = DataFrame(
        {'C': ['dull', 'dull', 'dull', 'dull']},
        index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'],
                                      ['one', 'two', 'one', 'two']],
                                     names=['A', 'B']))
    assert_frame_equal(result, expected)


def test_groupby_head_tail():
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
    g_as = df.groupby('A', as_index=True)
    g_not_as = df.groupby('A', as_index=False)

    # as_index=False, much easier
    assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
    assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))

    empty_not_as = DataFrame(columns=df.columns,
                             index=pd.Index([], dtype=df.index.dtype))
    empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype)
    empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype)
    assert_frame_equal(empty_not_as, g_not_as.head(0))
    assert_frame_equal(empty_not_as, g_not_as.tail(0))
    assert_frame_equal(empty_not_as, g_not_as.head(-1))
    assert_frame_equal(empty_not_as, g_not_as.tail(-1))

    assert_frame_equal(df, g_not_as.head(7))  # contains all
    assert_frame_equal(df, g_not_as.tail(7))

    # as_index=True (used to be different)
    df_as = df

    assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
    assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))

    empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
    empty_as['A'] = empty_not_as['A'].astype(df.A.dtype)
    empty_as['B'] = empty_not_as['B'].astype(df.B.dtype)
    assert_frame_equal(empty_as, g_as.head(0))
    assert_frame_equal(empty_as, g_as.tail(0))
    assert_frame_equal(empty_as, g_as.head(-1))
    assert_frame_equal(empty_as, g_as.tail(-1))

    assert_frame_equal(df_as, g_as.head(7))  # contains all
    assert_frame_equal(df_as, g_as.tail(7))

    # test with selection
    assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []])
    assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
    assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
    assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]])

    assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []])
    assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
    assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
    assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]])
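

# A short usage sketch, assuming the imports above: unlike nth()/first(),
# head() and tail() keep the original row order and index, and they ignore
# the as_index setting:
#
#   >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
#   >>> df.groupby('A').head(1)   # rows 0 and 2, index unchanged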


def test_group_selection_cache():
    # GH 12839: nth, head, and tail should return same result consistently
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
    expected = df.iloc[[0, 2]].set_index('A')

    g = df.groupby('A')
    result1 = g.head(n=2)
    result2 = g.nth(0)
    assert_frame_equal(result1, df)
    assert_frame_equal(result2, expected)

    g = df.groupby('A')
    result1 = g.tail(n=2)
    result2 = g.nth(0)
    assert_frame_equal(result1, df)
    assert_frame_equal(result2, expected)

    g = df.groupby('A')
    result1 = g.nth(0)
    result2 = g.head(n=2)
    assert_frame_equal(result1, expected)
    assert_frame_equal(result2, df)

    g = df.groupby('A')
    result1 = g.nth(0)
    result2 = g.tail(n=2)
    assert_frame_equal(result1, expected)
    assert_frame_equal(result2, df)


def test_nth_empty():
    # GH 16064
    df = DataFrame(index=[0], columns=['a', 'b', 'c'])
    result = df.groupby('a').nth(10)
    expected = DataFrame(index=Index([], name='a'), columns=['b', 'c'])
    assert_frame_equal(result, expected)

    result = df.groupby(['a', 'b']).nth(10)
    expected = DataFrame(index=MultiIndex([[], []], [[], []],
                                          names=['a', 'b']),
                         columns=['c'])
    assert_frame_equal(result, expected)


def test_nth_column_order():
    # GH 20760
    # Check that nth preserves column order
    df = DataFrame([[1, 'b', 100],
                    [1, 'a', 50],
                    [1, 'a', np.nan],
                    [2, 'c', 200],
                    [2, 'd', 150]],
                   columns=['A', 'C', 'B'])
    result = df.groupby('A').nth(0)
    expected = DataFrame([['b', 100.0],
                          ['c', 200.0]],
                         columns=['C', 'B'],
                         index=Index([1, 2], name='A'))
    assert_frame_equal(result, expected)

    result = df.groupby('A').nth(-1, dropna='any')
    expected = DataFrame([['a', 50.0],
                          ['d', 150.0]],
                         columns=['C', 'B'],
                         index=Index([1, 2], name='A'))
    assert_frame_equal(result, expected)

@@ -1,306 +0,0 @@
import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Series, concat
from pandas.util import testing as tm


def test_rank_apply():
    lev1 = tm.rands_array(10, 100)
    lev2 = tm.rands_array(10, 130)
    lab1 = np.random.randint(0, 100, size=500)
    lab2 = np.random.randint(0, 130, size=500)

    df = DataFrame({'value': np.random.randn(500),
                    'key1': lev1.take(lab1),
                    'key2': lev2.take(lab2)})

    result = df.groupby(['key1', 'key2']).value.rank()

    expected = [piece.value.rank()
                for key, piece in df.groupby(['key1', 'key2'])]
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)

    result = df.groupby(['key1', 'key2']).value.rank(pct=True)

    expected = [piece.value.rank(pct=True)
                for key, piece in df.groupby(['key1', 'key2'])]
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("grps", [
    ['qux'], ['qux', 'quux']])
@pytest.mark.parametrize("vals", [
    [2, 2, 8, 2, 6],
    [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'),
     pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
     pd.Timestamp('2018-01-06')]])
@pytest.mark.parametrize("ties_method,ascending,pct,exp", [
    ('average', True, False, [2., 2., 5., 2., 4.]),
    ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
    ('average', False, False, [4., 4., 1., 4., 2.]),
    ('average', False, True, [.8, .8, .2, .8, .4]),
    ('min', True, False, [1., 1., 5., 1., 4.]),
    ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
    ('min', False, False, [3., 3., 1., 3., 2.]),
    ('min', False, True, [.6, .6, .2, .6, .4]),
    ('max', True, False, [3., 3., 5., 3., 4.]),
    ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
    ('max', False, False, [5., 5., 1., 5., 2.]),
    ('max', False, True, [1., 1., .2, 1., .4]),
    ('first', True, False, [1., 2., 5., 3., 4.]),
    ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
    ('first', False, False, [3., 4., 1., 5., 2.]),
    ('first', False, True, [.6, .8, .2, 1., .4]),
    ('dense', True, False, [1., 1., 3., 1., 2.]),
    ('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]),
    ('dense', False, False, [3., 3., 1., 3., 2.]),
    ('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. / 3.]),
])
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
    key = np.repeat(grps, len(vals))
    vals = vals * len(grps)
    df = DataFrame({'key': key, 'val': vals})
    result = df.groupby('key').rank(method=ties_method,
                                    ascending=ascending, pct=pct)

    exp_df = DataFrame(exp * len(grps), columns=['val'])
    tm.assert_frame_equal(result, exp_df)
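

# A brief sketch of the rank methods exercised above, assuming the imports
# at the top of this file:
#
#   >>> df = pd.DataFrame({'key': ['q'] * 5, 'val': [2, 2, 8, 2, 6]})
#   >>> df.groupby('key').rank(method='min')['val'].tolist()
#   [1.0, 1.0, 5.0, 1.0, 4.0]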


@pytest.mark.parametrize("grps", [
    ['qux'], ['qux', 'quux']])
@pytest.mark.parametrize("vals", [
    [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf],
])
@pytest.mark.parametrize("ties_method,ascending,na_option,exp", [
    ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
    ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]),
    ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]),
    ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
    ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]),
    ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]),
    ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]),
    ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]),
    ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]),
    ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]),
    ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]),
    ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]),
    ('max', True, 'keep', [2., 2., np.nan, 3., np.nan, 5., 5.]),
    ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]),
    ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]),
    ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]),
    ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]),
    ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]),
    ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]),
    ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]),
    ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]),
    ('first', False, 'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]),
    ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]),
    ('first', False, 'bottom', [4., 5., 6., 3., 7., 1., 2.]),
    ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]),
    ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]),
    ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]),
    ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]),
    ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]),
    ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.])
])
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
    # GH 20561
    key = np.repeat(grps, len(vals))
    vals = vals * len(grps)
    df = DataFrame({'key': key, 'val': vals})
    result = df.groupby('key').rank(method=ties_method,
                                    ascending=ascending,
                                    na_option=na_option)
    exp_df = DataFrame(exp * len(grps), columns=['val'])
    tm.assert_frame_equal(result, exp_df)
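

# A minimal sketch of na_option, assuming the imports above: 'keep' leaves
# missing ranks as NaN, while 'top'/'bottom' rank missing values first/last:
#
#   >>> s = pd.Series([np.nan, 1., 2.])
#   >>> s.groupby([0, 0, 0]).rank(na_option='bottom').tolist()
#   [3.0, 1.0, 2.0]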


@pytest.mark.parametrize("grps", [
    ['qux'], ['qux', 'quux']])
@pytest.mark.parametrize("vals", [
    [2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
    [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
     pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
     pd.Timestamp('2018-01-06'), np.nan, np.nan]
])
@pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [
    ('average', True, 'keep', False,
     [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]),
    ('average', True, 'keep', True,
     [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]),
    ('average', False, 'keep', False,
     [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]),
    ('average', False, 'keep', True,
     [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]),
    ('min', True, 'keep', False,
     [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]),
    ('min', True, 'keep', True,
     [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
    ('min', False, 'keep', False,
     [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
    ('min', False, 'keep', True,
     [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
    ('max', True, 'keep', False,
     [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]),
    ('max', True, 'keep', True,
     [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
    ('max', False, 'keep', False,
     [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]),
    ('max', False, 'keep', True,
     [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
    ('first', True, 'keep', False,
     [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]),
    ('first', True, 'keep', True,
     [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
    ('first', False, 'keep', False,
     [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]),
    ('first', False, 'keep', True,
     [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
    ('dense', True, 'keep', False,
     [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]),
    ('dense', True, 'keep', True,
     [1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]),
    ('dense', False, 'keep', False,
     [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
    ('dense', False, 'keep', True,
     [3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. / 3., np.nan, np.nan]),
    ('average', True, 'bottom', False, [2., 2., 7., 5., 2., 4., 7., 7.]),
    ('average', True, 'bottom', True,
     [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]),
    ('average', False, 'bottom', False, [4., 4., 7., 1., 4., 2., 7., 7.]),
    ('average', False, 'bottom', True,
     [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]),
    ('min', True, 'bottom', False, [1., 1., 6., 5., 1., 4., 6., 6.]),
    ('min', True, 'bottom', True,
     [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]),
    ('min', False, 'bottom', False, [3., 3., 6., 1., 3., 2., 6., 6.]),
    ('min', False, 'bottom', True,
     [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]),
    ('max', True, 'bottom', False, [3., 3., 8., 5., 3., 4., 8., 8.]),
    ('max', True, 'bottom', True,
     [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]),
    ('max', False, 'bottom', False, [5., 5., 8., 1., 5., 2., 8., 8.]),
    ('max', False, 'bottom', True,
     [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]),
    ('first', True, 'bottom', False, [1., 2., 6., 5., 3., 4., 7., 8.]),
    ('first', True, 'bottom', True,
     [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]),
    ('first', False, 'bottom', False, [3., 4., 6., 1., 5., 2., 7., 8.]),
    ('first', False, 'bottom', True,
     [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]),
    ('dense', True, 'bottom', False, [1., 1., 4., 3., 1., 2., 4., 4.]),
    ('dense', True, 'bottom', True,
     [0.25, 0.25, 1., 0.75, 0.25, 0.5, 1., 1.]),
    ('dense', False, 'bottom', False, [3., 3., 4., 1., 3., 2., 4., 4.]),
    ('dense', False, 'bottom', True,
     [0.75, 0.75, 1., 0.25, 0.75, 0.5, 1., 1.])
])
def test_rank_args_missing(grps, vals, ties_method, ascending,
                           na_option, pct, exp):
    key = np.repeat(grps, len(vals))
    vals = vals * len(grps)
    df = DataFrame({'key': key, 'val': vals})
    result = df.groupby('key').rank(method=ties_method,
                                    ascending=ascending,
                                    na_option=na_option, pct=pct)

    exp_df = DataFrame(exp * len(grps), columns=['val'])
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize("pct,exp", [
    (False, [3., 3., 3., 3., 3.]),
    (True, [.6, .6, .6, .6, .6])])
def test_rank_resets_each_group(pct, exp):
    df = DataFrame(
        {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'],
         'val': [1] * 10}
    )
    result = df.groupby('key').rank(pct=pct)
    exp_df = DataFrame(exp * 2, columns=['val'])
    tm.assert_frame_equal(result, exp_df)


def test_rank_avg_even_vals():
    df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4})
    result = df.groupby('key').rank()
    exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val'])
    tm.assert_frame_equal(result, exp_df)


@pytest.mark.parametrize("ties_method", [
    'average', 'min', 'max', 'first', 'dense'])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize("vals", [
    ['bar', 'bar', 'foo', 'bar', 'baz'],
    ['bar', np.nan, 'foo', np.nan, 'baz']
])
def test_rank_object_raises(ties_method, ascending, na_option,
                            pct, vals):
    df = DataFrame({'key': ['foo'] * 5, 'val': vals})

    with pytest.raises(TypeError, match="not callable"):
        df.groupby('key').rank(method=ties_method,
                               ascending=ascending,
                               na_option=na_option, pct=pct)


@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("ties_method", [
    'average', 'min', 'max', 'first', 'dense'])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize("vals", [
    ['bar', 'bar', 'foo', 'bar', 'baz'],
    ['bar', np.nan, 'foo', np.nan, 'baz'],
    [1, np.nan, 2, np.nan, 3]
])
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
    df = DataFrame({'key': ['foo'] * 5, 'val': vals})
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"

    with pytest.raises(ValueError, match=msg):
        df.groupby('key').rank(method=ties_method,
                               ascending=ascending,
                               na_option=na_option, pct=pct)


def test_rank_empty_group():
    # see gh-22519
    column = "A"
    df = DataFrame({
        "A": [0, 1, 0],
        "B": [1., np.nan, 2.]
    })

    result = df.groupby(column).B.rank(pct=True)
    expected = Series([0.5, np.nan, 1.0], name="B")
    tm.assert_series_equal(result, expected)

    result = df.groupby(column).rank(pct=True)
    expected = DataFrame({"B": [0.5, np.nan, 1.0]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("input_key,input_value,output_value", [
    ([1, 2], [1, 1], [1.0, 1.0]),
    ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
    ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
    ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan])
])
def test_rank_zero_div(input_key, input_value, output_value):
    # GH 23666
    df = DataFrame({"A": input_key, "B": input_value})

    result = df.groupby("A").rank(method="dense", pct=True)
    expected = DataFrame({"B": output_value})
    tm.assert_frame_equal(result, expected)

@@ -1,652 +0,0 @@
""" test with the TimeGrouper / grouping with datetimes """

from datetime import datetime

import numpy as np
from numpy import nan
import pytest
import pytz

from pandas.compat import StringIO

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range
from pandas.core.groupby.ops import BinGrouper
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal


class TestGroupBy(object):

    def test_groupby_with_timegrouper(self):
        # GH 4161
        # TimeGrouper requires a sorted index
        # also verifies that the resultant index has the correct name
        df_original = DataFrame({
            'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [
                datetime(2013, 9, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 3, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 9, 2, 14, 0),
            ]
        })

        # GH 6908 change target column's order
        df_reordered = df_original.sort_values(by='Quantity')

        for df in [df_original, df_reordered]:
            df = df.set_index(['Date'])

            expected = DataFrame(
                {'Quantity': 0},
                index=date_range('20130901',
                                 '20131205', freq='5D',
                                 name='Date', closed='left'))
            expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64')

            result1 = df.resample('5D').sum()
            assert_frame_equal(result1, expected)

            df_sorted = df.sort_index()
            result2 = df_sorted.groupby(pd.Grouper(freq='5D')).sum()
            assert_frame_equal(result2, expected)

            result3 = df.groupby(pd.Grouper(freq='5D')).sum()
            assert_frame_equal(result3, expected)
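
    # A minimal usage sketch, assuming the imports above: for a sorted
    # DatetimeIndex, grouping with pd.Grouper(freq=...) bins dates the same
    # way resample() does:
    #
    #   >>> s = pd.Series(1, index=pd.date_range('2013-09-01', periods=6))
    #   >>> s.groupby(pd.Grouper(freq='2D')).sum()
    #   ... # identical to s.resample('2D').sum()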

    @pytest.mark.parametrize("should_sort", [True, False])
    def test_groupby_with_timegrouper_methods(self, should_sort):
        # GH 3881
        # make sure API of timegrouper conforms

        df = pd.DataFrame({
            'Branch': 'A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 8, 9, 3],
            'Date': [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ]
        })

        if should_sort:
            df = df.sort_values(by='Quantity', ascending=False)

        df = df.set_index('Date', drop=False)
        g = df.groupby(pd.Grouper(freq='6M'))
        assert g.group_keys

        assert isinstance(g.grouper, BinGrouper)
        groups = g.groups
        assert isinstance(groups, dict)
        assert len(groups) == 3

    def test_timegrouper_with_reg_groups(self):

        # GH 3794
        # allow combination of timegrouper/reg groups

        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)

        for df in [df_original, df_sorted]:
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame({
                'Buyer': 'Carl Mark Carl Joe'.split(),
                'Quantity': [1, 3, 9, 18],
                'Date': [
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 10, 1, 13, 0),
                datetime(2013, 10, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 10, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)
        for df in [df_original, df_sorted]:

            expected = DataFrame({
                'Buyer': 'Carl Joe Mark Carl Joe'.split(),
                'Quantity': [6, 8, 3, 4, 10],
                'Date': [
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # passing the name
            df = df.reset_index()
            result = df.groupby([pd.Grouper(freq='1M', key='Date'),
                                 'Buyer']).sum()
            assert_frame_equal(result, expected)

            with pytest.raises(KeyError):
                df.groupby([pd.Grouper(freq='1M', key='foo'),
                            'Buyer']).sum()

            # passing the level
            df = df.set_index('Date')
            result = df.groupby([pd.Grouper(freq='1M', level='Date'),
                                 'Buyer']).sum()
            assert_frame_equal(result, expected)
            result = df.groupby([pd.Grouper(freq='1M', level=0),
                                 'Buyer']).sum()
            assert_frame_equal(result, expected)

            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq='1M', level='foo'),
                            'Buyer']).sum()

            # multi names
            df = df.copy()
            df['Date'] = df.index + pd.offsets.MonthEnd(2)
            result = df.groupby([pd.Grouper(freq='1M', key='Date'),
                                 'Buyer']).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq='1M', key='Date',
                                       level='Date'), 'Buyer']).sum()

            # single groupers
            expected = DataFrame(
                {'Quantity': [31],
                 'Date': [datetime(2013, 10, 31, 0, 0)]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M')]).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame(
                {'Quantity': [31],
                 'Date': [datetime(2013, 11, 30, 0, 0)]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
            assert_frame_equal(result, expected)
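
    # A brief sketch, assuming the imports above: a Grouper can name its
    # target either with key= (a column) or level= (an index level);
    # passing both raises ValueError when the groupby is evaluated:
    #
    #   >>> df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer']).sum()
    #   >>> df.groupby(pd.Grouper(freq='1M', key='Date', level='Date'))
    #   ... # ValueError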

    @pytest.mark.parametrize('freq', ['D', 'M', 'A', 'Q-APR'])
    def test_timegrouper_with_reg_groups_freq(self, freq):
        # GH 6764 multiple grouping with/without sort
        df = DataFrame({
            'date': pd.to_datetime([
                '20121002', '20121007', '20130130', '20130202', '20130305',
                '20121002', '20121207', '20130130', '20130202', '20130305',
                '20130202', '20130305'
            ]),
            'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                           359, 801],
            'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
        }).set_index('date')

        expected = (
            df.groupby('user_id')['whole_cost']
            .resample(freq)
            .sum(min_count=1)  # XXX
            .dropna()
            .reorder_levels(['date', 'user_id'])
            .sort_index()
            .astype('int64')
        )
        expected.name = 'whole_cost'

        result1 = df.sort_index().groupby([pd.Grouper(freq=freq),
                                           'user_id'])['whole_cost'].sum()
        assert_series_equal(result1, expected)

        result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[
            'whole_cost'].sum()
        assert_series_equal(result2, expected)

    def test_timegrouper_get_group(self):
        # GH 6914

        df_original = DataFrame({
            'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [datetime(2013, 9, 1, 13, 0),
                     datetime(2013, 9, 1, 13, 5),
                     datetime(2013, 10, 1, 20, 0),
                     datetime(2013, 10, 3, 10, 0),
                     datetime(2013, 12, 2, 12, 0),
                     datetime(2013, 9, 2, 14, 0)]
        })
        df_reordered = df_original.sort_values(by='Quantity')

        # single grouping
        expected_list = [df_original.iloc[[0, 1, 5]],
                         df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]
        dt_list = ['2013-09-30', '2013-10-31', '2013-12-31']

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M', key='Date'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

        # multiple grouping
        expected_list = [df_original.iloc[[1]], df_original.iloc[[3]],
                         df_original.iloc[[4]]]
        g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'),
                  ('Joe', '2013-12-31')]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')])
            for (b, t), expected in zip(g_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group((b, dt))
                assert_frame_equal(result, expected)

        # with index
        df_original = df_original.set_index('Date')
        df_reordered = df_original.sort_values(by='Quantity')

        expected_list = [df_original.iloc[[0, 1, 5]],
                         df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

    def test_timegrouper_apply_return_type_series(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
                           'value': [10, 13]})
        df_dt = df.copy()
        df_dt['date'] = pd.to_datetime(df_dt['date'])

        def sumfunc_series(x):
            return pd.Series([x['value'].sum()], ('sum',))

        expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_series)
        result = (df_dt.groupby(pd.Grouper(freq='M', key='date'))
                  .apply(sumfunc_series))
        assert_frame_equal(result.reset_index(drop=True),
                           expected.reset_index(drop=True))

    def test_timegrouper_apply_return_type_value(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
                           'value': [10, 13]})
        df_dt = df.copy()
        df_dt['date'] = pd.to_datetime(df_dt['date'])

        def sumfunc_value(x):
            return x.value.sum()

        expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value)
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date'))
                      .apply(sumfunc_value))
        assert_series_equal(result.reset_index(drop=True),
                            expected.reset_index(drop=True))

    def test_groupby_groups_datetimeindex(self):
        # GH#1430
        periods = 1000
        ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods)
        df = DataFrame({'high': np.arange(periods),
                        'low': np.arange(periods)}, index=ind)
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # it works!
        groups = grouped.groups
        assert isinstance(list(groups.keys())[0], datetime)

        # GH#11442
        index = pd.date_range('2015/01/01', periods=5, name='date')
        df = pd.DataFrame({'A': [5, 6, 7, 8, 9],
                           'B': [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level='date').groups
        dates = ['2015-01-05', '2015-01-04', '2015-01-03',
                 '2015-01-02', '2015-01-01']
        expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date')
                    for date in dates}
        tm.assert_dict_equal(result, expected)

        grouped = df.groupby(level='date')
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, 'A'], df.loc[date, 'B']]]
            expected_index = pd.DatetimeIndex([date], name='date')
            expected = pd.DataFrame(data,
                                    columns=list('AB'),
                                    index=expected_index)
            tm.assert_frame_equal(result, expected)

    def test_groupby_groups_datetimeindex_tz(self):
        # GH 3950
        dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                 '2011-07-19 09:00:00', '2011-07-19 07:00:00',
                 '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'datetime': dates,
                        'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2] * 3})
        df['datetime'] = df['datetime'].apply(
            lambda d: Timestamp(d, tz='US/Pacific'))

        exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00',
                                     '2011-07-19 07:00:00',
                                     '2011-07-19 08:00:00',
                                     '2011-07-19 08:00:00',
                                     '2011-07-19 09:00:00',
                                     '2011-07-19 09:00:00'],
                                    tz='US/Pacific', name='datetime')
        exp_idx2 = Index(['a', 'b'] * 3, name='label')
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5],
                              'value2': [1, 2, 2, 1, 1, 2]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(['datetime', 'label']).sum()
        assert_frame_equal(result, expected)

        # by level
        didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo')
        df = DataFrame({'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2, 3, 1, 2, 3]},
                       index=didx)

        exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00',
                                    '2011-07-19 08:00:00',
                                    '2011-07-19 09:00:00'], tz='Asia/Tokyo')
        expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(level=0).sum()
        assert_frame_equal(result, expected)

    def test_frame_datetime64_handling_groupby(self):
        # it works!
        df = DataFrame([(3, np.datetime64('2012-07-03')),
                        (3, np.datetime64('2012-07-04'))],
                       columns=['a', 'date'])
        result = df.groupby('a').first()
        assert result['date'][3] == Timestamp('2012-07-03')

    def test_groupby_multi_timezone(self):

        # combining multiple / different timezones yields UTC

        data = """0,2000-01-28 16:47:00,America/Chicago
1,2000-01-29 16:48:00,America/Chicago
2,2000-01-30 16:49:00,America/Los_Angeles
3,2000-01-31 16:50:00,America/Chicago
4,2000-01-01 16:50:00,America/New_York"""

        df = pd.read_csv(StringIO(data), header=None,
                         names=['value', 'date', 'tz'])
        result = df.groupby('tz').date.apply(
            lambda x: pd.to_datetime(x).dt.tz_localize(x.name))

        expected = Series([Timestamp('2000-01-28 16:47:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-29 16:48:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-30 16:49:00-0800',
                                     tz='America/Los_Angeles'),
                           Timestamp('2000-01-31 16:50:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-01 16:50:00-0500',
                                     tz='America/New_York')],
                          name='date',
                          dtype=object)
        assert_series_equal(result, expected)

        tz = 'America/Chicago'
        res_values = df.groupby('tz').date.get_group(tz)
        result = pd.to_datetime(res_values).dt.tz_localize(tz)
        exp_values = Series(['2000-01-28 16:47:00', '2000-01-29 16:48:00',
                             '2000-01-31 16:50:00'],
                            index=[0, 1, 3], name='date')
        expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
        assert_series_equal(result, expected)
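
    # A short sketch, assuming the imports above: localizing each group to
    # its own zone via apply yields an object-dtype Series of tz-aware
    # Timestamps rather than a single tz-aware datetime64 column:
    #
    #   >>> df.groupby('tz').date.apply(
    #   ...     lambda x: pd.to_datetime(x).dt.tz_localize(x.name)).dtype
    #   dtype('O')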

    def test_groupby_groups_periods(self):
        dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                 '2011-07-19 09:00:00', '2011-07-19 07:00:00',
                 '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'period': [pd.Period(d, freq='H') for d in dates],
                        'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2] * 3})

        exp_idx1 = pd.PeriodIndex(['2011-07-19 07:00:00',
                                   '2011-07-19 07:00:00',
                                   '2011-07-19 08:00:00',
                                   '2011-07-19 08:00:00',
                                   '2011-07-19 09:00:00',
                                   '2011-07-19 09:00:00'],
                                  freq='H', name='period')
        exp_idx2 = Index(['a', 'b'] * 3, name='label')
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5],
                              'value2': [1, 2, 2, 1, 1, 2]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(['period', 'label']).sum()
        assert_frame_equal(result, expected)

        # by level
        didx = pd.PeriodIndex(dates, freq='H')
        df = DataFrame({'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2, 3, 1, 2, 3]},
                       index=didx)

        exp_idx = pd.PeriodIndex(['2011-07-19 07:00:00',
                                  '2011-07-19 08:00:00',
                                  '2011-07-19 09:00:00'], freq='H')
        expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(level=0).sum()
        assert_frame_equal(result, expected)

    def test_groupby_first_datetime64(self):
        df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
        df[1] = df[1].view('M8[ns]')

        assert issubclass(df[1].dtype.type, np.datetime64)

        result = df.groupby(level=0).first()
        got_dt = result[1].dtype
        assert issubclass(got_dt.type, np.datetime64)

        result = df[1].groupby(level=0).first()
        got_dt = result.dtype
        assert issubclass(got_dt.type, np.datetime64)

    def test_groupby_max_datetime64(self):
        # GH 5869
        # datetimelike dtype conversion from int
        df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
        expected = df.groupby('A')['A'].apply(lambda x: x.max())
        result = df.groupby('A')['A'].max()
        assert_series_equal(result, expected)

    def test_groupby_datetime64_32_bit(self):
        # GH 6410 / numpy 4328
        # 32-bit under 1.9-dev indexing issue

        df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2})
        result = df.groupby("A")["B"].transform(min)
        expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B')
        assert_series_equal(result, expected)

    def test_groupby_with_timezone_selection(self):
        # GH 11616
        # Test that column selection returns output in correct timezone.
        np.random.seed(42)
        df = pd.DataFrame({
            'factor': np.random.randint(0, 3, size=60),
            'time': pd.date_range('01/01/2000 00:00', periods=60,
                                  freq='s', tz='UTC')
        })
        df1 = df.groupby('factor').max()['time']
        df2 = df.groupby('factor')['time'].max()
        tm.assert_series_equal(df1, df2)

    def test_timezone_info(self):
        # see gh-11682: Timezone info lost when broadcasting
        # scalar datetime to DataFrame

        df = pd.DataFrame({'a': [1], 'b': [datetime.now(pytz.utc)]})
        assert df['b'][0].tzinfo == pytz.utc
        df = pd.DataFrame({'a': [1, 2, 3]})
        df['b'] = datetime.now(pytz.utc)
        assert df['b'][0].tzinfo == pytz.utc

    def test_datetime_count(self):
        df = DataFrame({'a': [1, 2, 3] * 2,
                        'dates': pd.date_range('now', periods=6, freq='T')})
        result = df.groupby('a').dates.count()
        expected = Series([2, 2, 2],
                          index=Index([1, 2, 3], name='a'), name='dates')
        tm.assert_series_equal(result, expected)

    def test_first_last_max_min_on_time_data(self):
        # GH 10295
        # Verify that NaT is not in the result of max, min, first and last on
        # DataFrame with datetime or timedelta values.
        from datetime import timedelta as td
        df_test = DataFrame(
            {'dt': [nan, '2015-07-24 10:10', '2015-07-25 11:11',
                    '2015-07-23 12:12', nan],
             'td': [nan, td(days=1), td(days=2), td(days=3), nan]})
        df_test.dt = pd.to_datetime(df_test.dt)
        df_test['group'] = 'A'
        df_ref = df_test[df_test.dt.notna()]

        grouped_test = df_test.groupby('group')
        grouped_ref = df_ref.groupby('group')

        assert_frame_equal(grouped_ref.max(), grouped_test.max())
        assert_frame_equal(grouped_ref.min(), grouped_test.min())
        assert_frame_equal(grouped_ref.first(), grouped_test.first())
        assert_frame_equal(grouped_ref.last(), grouped_test.last())

    def test_nunique_with_timegrouper_and_nat(self):
        # GH 17575
        test = pd.DataFrame({
            'time': [Timestamp('2016-06-28 09:35:35'),
                     pd.NaT,
                     Timestamp('2016-06-28 16:46:28')],
            'data': ['1', '2', '3']})

        grouper = pd.Grouper(key='time', freq='h')
        result = test.groupby(grouper)['data'].nunique()
        expected = test[test.time.notnull()].groupby(grouper)['data'].nunique()
        tm.assert_series_equal(result, expected)

    def test_scalar_call_versus_list_call(self):
        # Issue: 17530
        data_frame = {
            'location': ['shanghai', 'beijing', 'shanghai'],
            'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15',
                               '2017-08-11 22:23:15'],
                              dtype='datetime64[ns]'),
            'value': [1, 2, 3]
        }
        data_frame = pd.DataFrame(data_frame).set_index('time')
        grouper = pd.Grouper(freq='D')

        grouped = data_frame.groupby(grouper)
        result = grouped.count()
        grouped = data_frame.groupby([grouper])
        expected = grouped.count()

        assert_frame_equal(result, expected)

@@ -1,836 +0,0 @@
""" test with the .transform """

import numpy as np
import pytest

from pandas._libs import groupby
from pandas.compat import StringIO

from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype

import pandas as pd
from pandas import DataFrame, MultiIndex, Series, Timestamp, concat, date_range
from pandas.core.config import option_context
from pandas.core.groupby.groupby import DataError
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal


def assert_fp_equal(a, b):
    assert (np.abs(a - b) < 1e-12).all()


def test_transform():
    data = Series(np.arange(9) // 3, index=np.arange(9))

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    grouped = data.groupby(lambda x: x // 3)

    transformed = grouped.transform(lambda x: x * x.sum())
    assert transformed[7] == 12

    # GH 8046
    # make sure that we preserve the input order
    df = DataFrame(
        np.arange(6, dtype='int64').reshape(3, 2),
        columns=["a", "b"], index=[0, 2, 1])
    key = [0, 0, 1]
    expected = df.sort_index().groupby(key).transform(
        lambda x: x - x.mean()).groupby(key).mean()
    result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(
        key).mean()
    assert_frame_equal(result, expected)

    def demean(arr):
        return arr - arr.mean()

    people = DataFrame(np.random.randn(5, 5),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
    key = ['one', 'two', 'one', 'two', 'one']
    result = people.groupby(key).transform(demean).groupby(key).mean()
    expected = people.groupby(key).apply(demean).groupby(key).mean()
    assert_frame_equal(result, expected)

    # GH 8430
    df = tm.makeTimeDataFrame()
    g = df.groupby(pd.Grouper(freq='M'))
    g.transform(lambda x: x - 1)

    # GH 9700
    df = DataFrame({'a': range(5, 10), 'b': range(5)})
    result = df.groupby('a').transform(max)
    expected = DataFrame({'b': range(5)})
    tm.assert_frame_equal(result, expected)
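

# A minimal sketch of the broadcast contract checked above, assuming the
# imports at the top of this file: transform returns an object indexed like
# its input, with each group's reduction repeated across the group's rows:
#
#   >>> s = pd.Series([1, 2, 10], index=[0, 2, 1])
#   >>> s.groupby([0, 0, 1]).transform('sum').tolist()
#   [3, 3, 10]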


def test_transform_fast():

    df = DataFrame({'id': np.arange(100000) / 3,
                    'val': np.random.randn(100000)})

    grp = df.groupby('id')['val']

    values = np.repeat(grp.mean().values,
                       ensure_platform_int(grp.count().values))
    expected = pd.Series(values, index=df.index, name='val')

    result = grp.transform(np.mean)
    assert_series_equal(result, expected)

    result = grp.transform('mean')
    assert_series_equal(result, expected)

    # GH 12737
    df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
                       'd': pd.date_range('2014-1-1', '2014-1-4'),
                       'i': [1, 2, 3, 4]},
                      columns=['grouping', 'f', 'i', 'd'])
    result = df.groupby('grouping').transform('first')

    dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
             pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
    expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
                             'd': dates,
                             'i': [1, 2, 2, 4]},
                            columns=['f', 'i', 'd'])
    assert_frame_equal(result, expected)

    # selection
    result = df.groupby('grouping')[['f', 'i']].transform('first')
    expected = expected[['f', 'i']]
    assert_frame_equal(result, expected)

    # dup columns
    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
    result = df.groupby('g').transform('first')
    expected = df.drop('g', axis=1)
    assert_frame_equal(result, expected)


def test_transform_broadcast(tsframe, ts):
    grouped = ts.groupby(lambda x: x.month)
    result = grouped.transform(np.mean)

    tm.assert_index_equal(result.index, ts.index)
    for _, gp in grouped:
        assert_fp_equal(result.reindex(gp.index), gp.mean())

    grouped = tsframe.groupby(lambda x: x.month)
    result = grouped.transform(np.mean)
    tm.assert_index_equal(result.index, tsframe.index)
    for _, gp in grouped:
        agged = gp.mean()
        res = result.reindex(gp.index)
        for col in tsframe:
            assert_fp_equal(res[col], agged[col])

    # group columns
    grouped = tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                              axis=1)
    result = grouped.transform(np.mean)
    tm.assert_index_equal(result.index, tsframe.index)
    tm.assert_index_equal(result.columns, tsframe.columns)
    for _, gp in grouped:
        agged = gp.mean(1)
        res = result.reindex(columns=gp.columns)
        for idx in gp.index:
            assert_fp_equal(res.xs(idx), agged[idx])


def test_transform_axis(tsframe):

    # make sure that we are setting the axes
    # correctly when on axis=0 or 1
    # in the presence of a non-monotonic indexer
    # GH 12713

    base = tsframe.iloc[0:5]
    r = len(base.index)
    c = len(base.columns)
    tso = DataFrame(np.random.randn(r, c),
                    index=base.index,
                    columns=base.columns,
                    dtype='float64')
    # monotonic
    ts = tso
    grouped = ts.groupby(lambda x: x.weekday())
    result = ts - grouped.transform('mean')
    expected = grouped.apply(lambda x: x - x.mean())
    assert_frame_equal(result, expected)

    ts = ts.T
    grouped = ts.groupby(lambda x: x.weekday(), axis=1)
    result = ts - grouped.transform('mean')
    expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
    assert_frame_equal(result, expected)

    # non-monotonic
    ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
    grouped = ts.groupby(lambda x: x.weekday())
    result = ts - grouped.transform('mean')
    expected = grouped.apply(lambda x: x - x.mean())
    assert_frame_equal(result, expected)

    ts = ts.T
    grouped = ts.groupby(lambda x: x.weekday(), axis=1)
    result = ts - grouped.transform('mean')
    expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
    assert_frame_equal(result, expected)


def test_transform_dtype():
    # GH 9807
    # Check transform dtype output is preserved
    df = DataFrame([[1, 3], [2, 3]])
    result = df.groupby(1).transform('mean')
    expected = DataFrame([[1.5], [1.5]])
    assert_frame_equal(result, expected)


def test_transform_bug():
    # GH 5712
    # transforming on a datetime column
    df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
    result = df.groupby('A')['B'].transform(
        lambda x: x.rank(ascending=False))
    expected = Series(np.arange(5, 0, step=-1), name='B')
    assert_series_equal(result, expected)


def test_transform_numeric_to_boolean():
    # GH 16875
    # inconsistency in transforming boolean values
    expected = pd.Series([True, True], name='A')

    df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]})
    result = df.groupby('B').A.transform(lambda x: True)
    assert_series_equal(result, expected)

    df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]})
    result = df.groupby('B').A.transform(lambda x: True)
    assert_series_equal(result, expected)


def test_transform_datetime_to_timedelta():
    # GH 15429
    # transforming a datetime to timedelta
    df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
    expected = pd.Series([
        Timestamp('20130101') - Timestamp('20130101')] * 5, name='A')

    # this does date math without changing result type in transform
    base_time = df['A'][0]
    result = df.groupby('A')['A'].transform(
        lambda x: x.max() - x.min() + base_time) - base_time
    assert_series_equal(result, expected)

    # this does date math and causes the transform to return timedelta
    result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min())
    assert_series_equal(result, expected)


def test_transform_datetime_to_numeric():
    # GH 10972
    # convert dt to float
    df = DataFrame({
        'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
    result = df.groupby('a').b.transform(
        lambda x: x.dt.dayofweek - x.dt.dayofweek.mean())

    expected = Series([-0.5, 0.5], name='b')
    assert_series_equal(result, expected)

    # convert dt to int
    df = DataFrame({
        'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
    result = df.groupby('a').b.transform(
        lambda x: x.dt.dayofweek - x.dt.dayofweek.min())

    expected = Series([0, 1], name='b')
    assert_series_equal(result, expected)


def test_transform_casting():
    # GH 13046
    data = """
    idx     A         ID3              DATETIME
    0   B-028  b76cd912ff "2014-10-08 13:43:27"
    1   B-054  4a57ed0b02 "2014-10-08 14:26:19"
    2   B-076  1a682034f8 "2014-10-08 14:29:01"
    3   B-023  b76cd912ff "2014-10-08 18:39:34"
    4   B-023  f88g8d7sds "2014-10-08 18:40:18"
    5   B-033  b76cd912ff "2014-10-08 18:44:30"
    6   B-032  b76cd912ff "2014-10-08 18:46:00"
    7   B-037  b76cd912ff "2014-10-08 18:52:15"
    8   B-046  db959faf02 "2014-10-08 18:59:59"
    9   B-053  b76cd912ff "2014-10-08 19:17:48"
    10  B-065  b76cd912ff "2014-10-08 19:21:38"
    """
    df = pd.read_csv(StringIO(data), sep=r'\s+',
                     index_col=[0], parse_dates=['DATETIME'])

    result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff())
    assert is_timedelta64_dtype(result.dtype)

    result = df[['ID3', 'DATETIME']].groupby('ID3').transform(
        lambda x: x.diff())
    assert is_timedelta64_dtype(result.DATETIME.dtype)


def test_transform_multiple(ts):
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])

    grouped.transform(lambda x: x * 2)
    grouped.transform(np.mean)


def test_dispatch_transform(tsframe):
    df = tsframe[::5].reindex(tsframe.index)

    grouped = df.groupby(lambda x: x.month)

    filled = grouped.fillna(method='pad')
    fillit = lambda x: x.fillna(method='pad')
    expected = df.groupby(lambda x: x.month).transform(fillit)
    assert_frame_equal(filled, expected)


def test_transform_select_columns(df):
    f = lambda x: x.mean()
    result = df.groupby('A')['C', 'D'].transform(f)

    selection = df[['C', 'D']]
    expected = selection.groupby(df['A']).transform(f)

    assert_frame_equal(result, expected)


def test_transform_exclude_nuisance(df):

    # this also tests orderings in transform between
    # series/frame to make sure it's consistent
    expected = {}
    grouped = df.groupby('A')
    expected['C'] = grouped['C'].transform(np.mean)
    expected['D'] = grouped['D'].transform(np.mean)
    expected = DataFrame(expected)
    result = df.groupby('A').transform(np.mean)

    assert_frame_equal(result, expected)


def test_transform_function_aliases(df):
    result = df.groupby('A').transform('mean')
    expected = df.groupby('A').transform(np.mean)
    assert_frame_equal(result, expected)

    result = df.groupby('A')['C'].transform('mean')
    expected = df.groupby('A')['C'].transform(np.mean)
    assert_series_equal(result, expected)


def test_series_fast_transform_date():
    # GH 13191
    df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
                       'd': pd.date_range('2014-1-1', '2014-1-4')})
    result = df.groupby('grouping')['d'].transform('first')
    dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'),
             pd.Timestamp('2014-1-4')]
    expected = pd.Series(dates, name='d')
    assert_series_equal(result, expected)
def test_transform_length():
    # GH 9697
    df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
    expected = pd.Series([3.0] * 4)

    def nsum(x):
        return np.nansum(x)

    results = [df.groupby('col1').transform(sum)['col2'],
               df.groupby('col1')['col2'].transform(sum),
               df.groupby('col1').transform(nsum)['col2'],
               df.groupby('col1')['col2'].transform(nsum)]
    for result in results:
        assert_series_equal(result, expected, check_names=False)

def test_transform_coercion():

    # 14457
    # when we are transforming be sure to not coerce
    # via assignment
    df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1]))
    g = df.groupby('A')

    expected = g.transform(np.mean)
    result = g.transform(lambda x: np.mean(x))
    assert_frame_equal(result, expected)

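def _demo_transform_no_coercion():  # editor's sketch, hypothetical name
    # Illustrative sketch, not part of the original test file: the "no
    # coercion" behavior exercised above means an integer column transformed
    # by a mean-like UDF comes back as float64 instead of being cast back
    # to int.
    df = pd.DataFrame({'A': ['a', 'a', 'b'], 'B': [0, 1, 5]})
    out = df.groupby('A').transform(lambda x: np.mean(x))
    # group means: a -> 0.5, b -> 5.0, broadcast back to the original rows
    assert out['B'].dtype == np.float64
    assert out['B'].tolist() == [0.5, 0.5, 5.0]
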
def test_groupby_transform_with_int():

    # GH 3740, make sure that we upcast on item-by-item transform
    # when the result requires it

    # floats
    df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'),
                        C=Series(
                            [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo'))
    with np.errstate(all='ignore'):
        result = df.groupby('A').transform(
            lambda x: (x - x.mean()) / x.std())
    expected = DataFrame(dict(B=np.nan, C=Series(
        [-1, 0, 1, -1, 0, 1], dtype='float64')))
    assert_frame_equal(result, expected)

    # int case
    df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1,
                        C=[1, 2, 3, 1, 2, 3], D='foo'))
    with np.errstate(all='ignore'):
        result = df.groupby('A').transform(
            lambda x: (x - x.mean()) / x.std())
    expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1]))
    assert_frame_equal(result, expected)

    # int that needs float conversion
    s = Series([2, 3, 4, 10, 5, -1])
    df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo'))
    with np.errstate(all='ignore'):
        result = df.groupby('A').transform(
            lambda x: (x - x.mean()) / x.std())

    s1 = s.iloc[0:3]
    s1 = (s1 - s1.mean()) / s1.std()
    s2 = s.iloc[3:6]
    s2 = (s2 - s2.mean()) / s2.std()
    expected = DataFrame(dict(B=np.nan, C=concat([s1, s2])))
    assert_frame_equal(result, expected)

    # int downcasting
    result = df.groupby('A').transform(lambda x: x * 2 / 2)
    expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1]))
    assert_frame_equal(result, expected)

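def _demo_transform_upcast():  # editor's sketch, hypothetical name
    # Illustrative sketch, not part of the original test file: a compact
    # analogue of the "int that needs float conversion" case above -- int
    # columns are upcast to float when the UDF result requires it (here a
    # z-score).
    df = pd.DataFrame({'A': [1, 1, 2, 2], 'C': [1, 2, 3, 4]})
    out = df.groupby('A').transform(lambda x: (x - x.mean()) / x.std())
    assert out['C'].dtype == np.float64
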
def test_groupby_transform_with_nan_group():
    # GH 9941
    df = pd.DataFrame({'a': range(10),
                       'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]})
    result = df.groupby(df.b)['a'].transform(max)
    expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.],
                         name='a')
    assert_series_equal(result, expected)

def test_transform_mixed_type():
    index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
                                    ])
    df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
                    'c': np.tile(['a', 'b', 'c'], 2),
                    'v': np.arange(1., 7.)}, index=index)

    def f(group):
        group['g'] = group['d'] * 2
        return group[:1]

    grouped = df.groupby('c')
    result = grouped.apply(f)

    assert result['d'].dtype == np.float64

    # this is by definition a mutating operation!
    with option_context('mode.chained_assignment', None):
        for key, group in grouped:
            res = f(group)
            assert_frame_equal(res, result.loc[key])

def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
    """
    Check a group transform that executes a cumulative function.

    Parameters
    ----------
    pd_op : callable
        The pandas cumulative function.
    np_op : callable
        The analogous one in NumPy.
    dtype : type
        The specified dtype of the data.
    """

    is_datetimelike = False

    data = np.array([[1], [2], [3], [4]], dtype=dtype)
    ans = np.zeros_like(data)

    labels = np.array([0, 0, 0, 0], dtype=np.int64)
    pd_op(ans, data, labels, is_datetimelike)

    tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
                                check_dtype=False)

def test_cython_group_transform_cumsum(any_real_dtype):
    # see gh-4095
    dtype = np.dtype(any_real_dtype).type
    pd_op, np_op = groupby.group_cumsum, np.cumsum
    _check_cython_group_transform_cumulative(pd_op, np_op, dtype)

def test_cython_group_transform_cumprod():
    # see gh-4095
    dtype = np.float64
    pd_op, np_op = groupby.group_cumprod_float64, np.cumprod
    _check_cython_group_transform_cumulative(pd_op, np_op, dtype)

def test_cython_group_transform_algos():
    # see gh-4095
    is_datetimelike = False

    # with nans
    labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)

    data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
    actual = np.zeros_like(data)
    actual.fill(np.nan)
    groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
    expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
    tm.assert_numpy_array_equal(actual[:, 0], expected)

    actual = np.zeros_like(data)
    actual.fill(np.nan)
    groupby.group_cumsum(actual, data, labels, is_datetimelike)
    expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
    tm.assert_numpy_array_equal(actual[:, 0], expected)

    # timedelta
    is_datetimelike = True
    data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
    actual = np.zeros_like(data, dtype='int64')
    groupby.group_cumsum(actual, data.view('int64'), labels,
                         is_datetimelike)
    expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
        2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
        np.timedelta64(5, 'ns')])
    tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected)

@pytest.mark.parametrize(
    "op, args, targop",
    [('cumprod', (), lambda x: x.cumprod()),
     ('cumsum', (), lambda x: x.cumsum()),
     ('shift', (-1, ), lambda x: x.shift(-1)),
     ('shift', (1, ), lambda x: x.shift())])
def test_cython_transform_series(op, args, targop):
    # GH 4095
    s = Series(np.random.randn(1000))
    s_missing = s.copy()
    s_missing.iloc[2:10] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)

    # series
    for data in [s, s_missing]:
        expected = data.groupby(labels).transform(targop)

        tm.assert_series_equal(
            expected,
            data.groupby(labels).transform(op, *args))
        tm.assert_series_equal(expected, getattr(
            data.groupby(labels), op)(*args))

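def _demo_transform_string_op():  # editor's sketch, hypothetical name
    # Illustrative sketch, not part of the original test file: the
    # equivalence the parametrized test above relies on -- passing a
    # cythonized op name to ``transform`` matches calling the groupby
    # method directly.
    s = Series([1.0, 2.0, 3.0, 4.0])
    labels = [0, 0, 1, 1]
    via_transform = s.groupby(labels).transform('cumsum')
    via_method = s.groupby(labels).cumsum()
    tm.assert_series_equal(via_transform, via_method)
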
@pytest.mark.parametrize("op", ['cumprod', 'cumsum'])
@pytest.mark.parametrize("skipna", [False, True])
@pytest.mark.parametrize('input, exp', [
    # When everything is NaN
    ({'key': ['b'] * 10, 'value': np.nan},
     pd.Series([np.nan] * 10, name='value')),
    # When there is a single NaN
    ({'key': ['b'] * 10 + ['a'] * 2,
      'value': [3] * 3 + [np.nan] + [3] * 8},
     {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
      ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729.,
                          2187., 6561., 19683., 3.0, 9.0],
      ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
      ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18.,
                         21., 24., 27., 3.0, 6.0]})])
def test_groupby_cum_skipna(op, skipna, input, exp):
    df = pd.DataFrame(input)
    result = df.groupby('key')['value'].transform(op, skipna=skipna)
    if isinstance(exp, dict):
        expected = exp[(op, skipna)]
    else:
        expected = exp
    expected = pd.Series(expected, name='value')
    tm.assert_series_equal(expected, result)

@pytest.mark.parametrize(
    "op, args, targop",
    [('cumprod', (), lambda x: x.cumprod()),
     ('cumsum', (), lambda x: x.cumsum()),
     ('shift', (-1, ), lambda x: x.shift(-1)),
     ('shift', (1, ), lambda x: x.shift())])
def test_cython_transform_frame(op, args, targop):
    s = Series(np.random.randn(1000))
    s_missing = s.copy()
    s_missing.iloc[2:10] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)
    strings = list('qwertyuiopasdfghjklz')
    strings_missing = strings[:]
    strings_missing[5] = np.nan
    df = DataFrame({'float': s,
                    'float_missing': s_missing,
                    'int': [1, 1, 1, 1, 2] * 200,
                    'datetime': pd.date_range('1990-1-1', periods=1000),
                    'timedelta': pd.timedelta_range(1, freq='s',
                                                    periods=1000),
                    'string': strings * 50,
                    'string_missing': strings_missing * 50},
                   columns=['float', 'float_missing', 'int', 'datetime',
                            'timedelta', 'string', 'string_missing'])
    df['cat'] = df['string'].astype('category')

    df2 = df.copy()
    df2.index = pd.MultiIndex.from_product([range(100), range(10)])

    # DataFrame - Single and MultiIndex,
    # group by values, index level, columns
    for df in [df, df2]:
        for gb_target in [dict(by=labels), dict(level=0), dict(by='string')
                          ]:  # dict(by='string_missing')]:
            # dict(by=['int','string'])]:

            gb = df.groupby(**gb_target)
            # whitelisted methods set the selection before applying
            # a bit of a hack to make sure the cythonized shift
            # is equivalent to pre 0.17.1 behavior
            if op == 'shift':
                gb._set_group_selection()

            if op != 'shift' and 'int' not in gb_target:
                # numeric apply fastpath promotes dtype so have
                # to apply separately and concat
                i = gb[['int']].apply(targop)
                f = gb[['float', 'float_missing']].apply(targop)
                expected = pd.concat([f, i], axis=1)
            else:
                expected = gb.apply(targop)

            expected = expected.sort_index(axis=1)
            tm.assert_frame_equal(expected,
                                  gb.transform(op, *args).sort_index(
                                      axis=1))
            tm.assert_frame_equal(
                expected,
                getattr(gb, op)(*args).sort_index(axis=1))
            # individual columns
            for c in df:
                if c not in ['float', 'int', 'float_missing'
                             ] and op != 'shift':
                    msg = "No numeric types to aggregate"
                    with pytest.raises(DataError, match=msg):
                        gb[c].transform(op)
                    with pytest.raises(DataError, match=msg):
                        getattr(gb[c], op)()
                else:
                    expected = gb[c].apply(targop)
                    expected.name = c
                    tm.assert_series_equal(expected,
                                           gb[c].transform(op, *args))
                    tm.assert_series_equal(expected,
                                           getattr(gb[c], op)(*args))

def test_transform_with_non_scalar_group():
    # GH 10165
    cols = pd.MultiIndex.from_tuples([
        ('syn', 'A'), ('mis', 'A'), ('non', 'A'),
        ('syn', 'C'), ('mis', 'C'), ('non', 'C'),
        ('syn', 'T'), ('mis', 'T'), ('non', 'T'),
        ('syn', 'G'), ('mis', 'G'), ('non', 'G')])
    df = pd.DataFrame(np.random.randint(1, 10, (4, 12)),
                      columns=cols,
                      index=['A', 'C', 'G', 'T'])

    msg = 'transform must return a scalar value for each group.*'
    with pytest.raises(ValueError, match=msg):
        df.groupby(axis=1, level=1).transform(
            lambda z: z.div(z.sum(axis=1), axis=0))

@pytest.mark.parametrize('cols,exp,comp_func', [
    ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal),
    (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}),
     tm.assert_frame_equal)
])
@pytest.mark.parametrize('agg_func', [
    'count', 'rank', 'size'])
def test_transform_numeric_ret(cols, exp, comp_func, agg_func):
    if agg_func == 'size' and isinstance(cols, list):
        pytest.xfail("'size' transformation not supported with "
                     "NDFrameGroupBy")

    # GH 19200
    df = pd.DataFrame(
        {'a': pd.date_range('2018-01-01', periods=3),
         'b': range(3),
         'c': range(7, 10)})

    result = df.groupby('b')[cols].transform(agg_func)

    if agg_func == 'rank':
        exp = exp.astype('float')

    comp_func(result, exp)

@pytest.mark.parametrize("mix_groupings", [True, False])
|
||||
@pytest.mark.parametrize("as_series", [True, False])
|
||||
@pytest.mark.parametrize("val1,val2", [
|
||||
('foo', 'bar'), (1, 2), (1., 2.)])
|
||||
@pytest.mark.parametrize("fill_method,limit,exp_vals", [
|
||||
("ffill", None,
|
||||
[np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']),
|
||||
("ffill", 1,
|
||||
[np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]),
|
||||
("bfill", None,
|
||||
['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]),
|
||||
("bfill", 1,
|
||||
[np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan])
|
||||
])
|
||||
def test_group_fill_methods(mix_groupings, as_series, val1, val2,
|
||||
fill_method, limit, exp_vals):
|
||||
vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]
|
||||
_exp_vals = list(exp_vals)
|
||||
# Overwrite placeholder values
|
||||
for index, exp_val in enumerate(_exp_vals):
|
||||
if exp_val == 'val1':
|
||||
_exp_vals[index] = val1
|
||||
elif exp_val == 'val2':
|
||||
_exp_vals[index] = val2
|
||||
|
||||
# Need to modify values and expectations depending on the
|
||||
# Series / DataFrame that we ultimately want to generate
|
||||
if mix_groupings: # ['a', 'b', 'a, 'b', ...]
|
||||
keys = ['a', 'b'] * len(vals)
|
||||
|
||||
def interweave(list_obj):
|
||||
temp = list()
|
||||
for x in list_obj:
|
||||
temp.extend([x, x])
|
||||
|
||||
return temp
|
||||
|
||||
_exp_vals = interweave(_exp_vals)
|
||||
vals = interweave(vals)
|
||||
else: # ['a', 'a', 'a', ... 'b', 'b', 'b']
|
||||
keys = ['a'] * len(vals) + ['b'] * len(vals)
|
||||
_exp_vals = _exp_vals * 2
|
||||
vals = vals * 2
|
||||
|
||||
df = DataFrame({'key': keys, 'val': vals})
|
||||
if as_series:
|
||||
result = getattr(
|
||||
df.groupby('key')['val'], fill_method)(limit=limit)
|
||||
exp = Series(_exp_vals, name='val')
|
||||
assert_series_equal(result, exp)
|
||||
else:
|
||||
result = getattr(df.groupby('key'), fill_method)(limit=limit)
|
||||
exp = DataFrame({'key': keys, 'val': _exp_vals})
|
||||
assert_frame_equal(result, exp)
|
||||
|
||||
|
||||
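def _demo_group_ffill_limit():  # editor's sketch, hypothetical name
    # Illustrative sketch, not part of the original test file: within each
    # group, ``ffill(limit=1)`` propagates a value at most one row forward
    # and never across group boundaries -- a minimal analogue of the
    # parametrized cases above.
    df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b'],
                    'val': [1.0, np.nan, np.nan, np.nan, 2.0]})
    result = df.groupby('key')['val'].ffill(limit=1)
    exp = Series([1.0, 1.0, np.nan, np.nan, 2.0], name='val')
    assert_series_equal(result, exp)
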
@pytest.mark.parametrize("fill_method", ['ffill', 'bfill'])
|
||||
def test_pad_stable_sorting(fill_method):
|
||||
# GH 21207
|
||||
x = [0] * 20
|
||||
y = [np.nan] * 10 + [1] * 10
|
||||
|
||||
if fill_method == 'bfill':
|
||||
y = y[::-1]
|
||||
|
||||
df = pd.DataFrame({'x': x, 'y': y})
|
||||
expected = df.copy()
|
||||
|
||||
result = getattr(df.groupby('x'), fill_method)()
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_series", [True, False])
|
||||
@pytest.mark.parametrize("freq", [
|
||||
None,
|
||||
pytest.param('D', marks=pytest.mark.xfail(
|
||||
reason='GH#23918 before method uses freq in vectorized approach'))])
|
||||
@pytest.mark.parametrize("periods,fill_method,limit", [
|
||||
(1, 'ffill', None), (1, 'ffill', 1),
|
||||
(1, 'bfill', None), (1, 'bfill', 1),
|
||||
(-1, 'ffill', None), (-1, 'ffill', 1),
|
||||
(-1, 'bfill', None), (-1, 'bfill', 1),
|
||||
])
|
||||
def test_pct_change(test_series, freq, periods, fill_method, limit):
|
||||
# GH 21200, 21621
|
||||
vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4]
|
||||
keys = ['a', 'b']
|
||||
key_v = np.repeat(keys, len(vals))
|
||||
df = DataFrame({'key': key_v, 'vals': vals * 2})
|
||||
|
||||
df_g = getattr(df.groupby('key'), fill_method)(limit=limit)
|
||||
grp = df_g.groupby('key')
|
||||
|
||||
expected = grp['vals'].obj / grp['vals'].shift(periods) - 1
|
||||
|
||||
if test_series:
|
||||
result = df.groupby('key')['vals'].pct_change(
|
||||
periods=periods, fill_method=fill_method, limit=limit, freq=freq)
|
||||
tm.assert_series_equal(result, expected)
|
||||
else:
|
||||
result = df.groupby('key').pct_change(
|
||||
periods=periods, fill_method=fill_method, limit=limit, freq=freq)
|
||||
tm.assert_frame_equal(result, expected.to_frame('vals'))
|
||||
|
||||
|
||||
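def _demo_groupby_pct_change():  # editor's sketch, hypothetical name
    # Illustrative sketch, not part of the original test file: groupwise
    # ``pct_change`` is just x / x.shift(periods) - 1 computed per group,
    # which is the identity the test above checks against.
    df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b'],
                    'vals': [1.0, 2.0, 4.0, 10.0, 15.0]})
    result = df.groupby('key')['vals'].pct_change()
    expected = df['vals'] / df.groupby('key')['vals'].shift(1) - 1
    tm.assert_series_equal(result, expected)
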
@pytest.mark.parametrize("func", [np.any, np.all])
|
||||
def test_any_all_np_func(func):
|
||||
# GH 20653
|
||||
df = pd.DataFrame([['foo', True],
|
||||
[np.nan, True],
|
||||
['foo', True]], columns=['key', 'val'])
|
||||
|
||||
exp = pd.Series([True, np.nan, True], name='val')
|
||||
|
||||
res = df.groupby('key')['val'].transform(func)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
|
||||
def test_groupby_transform_rename():
    # https://github.com/pandas-dev/pandas/issues/23461
    def demean_rename(x):
        result = x - x.mean()

        if isinstance(x, pd.Series):
            return result

        result = result.rename(
            columns={c: '{}_demeaned'.format(c) for c in result.columns})

        return result

    df = pd.DataFrame({'group': list('ababa'),
                       'value': [1, 1, 1, 2, 2]})
    expected = pd.DataFrame({'value': [-1. / 3, -0.5, -1. / 3, 0.5, 2. / 3]})

    result = df.groupby('group').transform(demean_rename)
    tm.assert_frame_equal(result, expected)
    result_single = df.groupby('group').value.transform(demean_rename)
    tm.assert_series_equal(result_single, expected['value'])
@@ -1,76 +0,0 @@
"""
|
||||
these are systematically testing all of the args to value_counts
|
||||
with different size combinations. This is to ensure stability of the sorting
|
||||
and proper parameter handling
|
||||
"""
|
||||
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, MultiIndex, Series, date_range
|
||||
from pandas.util import testing as tm
|
||||
|
||||
|
||||
# our starting frame
|
||||
def seed_df(seed_nans, n, m):
    np.random.seed(1234)
    days = date_range('2015-08-24', periods=10)

    frame = DataFrame({
        '1st': np.random.choice(
            list('abcd'), n),
        '2nd': np.random.choice(days, n),
        '3rd': np.random.randint(1, m + 1, n)
    })

    if seed_nans:
        frame.loc[1::11, '1st'] = np.nan
        frame.loc[3::17, '2nd'] = np.nan
        frame.loc[7::19, '3rd'] = np.nan
        frame.loc[8::19, '3rd'] = np.nan
        frame.loc[9::19, '3rd'] = np.nan

    return frame

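def _demo_value_counts_args():  # editor's sketch, hypothetical name
    # Illustrative sketch, not part of the original test file, of the knobs
    # the parametrized test below sweeps: ``normalize`` turns counts into
    # fractions, ``bins`` buckets numeric values before counting, and
    # ``dropna`` keeps or drops NaN keys.
    s = Series([1, 1, 2, np.nan])
    assert s.value_counts().loc[1] == 2
    assert len(s.value_counts(dropna=False)) == 3  # NaN kept as a key
    assert abs(s.value_counts(normalize=True).loc[1] - 2. / 3) < 1e-12
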
# create input df, keys, and the bins
binned = []
ids = []
for seed_nans in [True, False]:
    for n, m in product((100, 1000), (5, 20)):

        df = seed_df(seed_nans, n, m)
        bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2)
        keys = '1st', '2nd', ['1st', '2nd']
        for k, b in product(keys, bins):
            binned.append((df, k, b, n, m))
            ids.append("{}-{}-{}".format(k, n, m))

@pytest.mark.slow
@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
def test_series_groupby_value_counts(df, keys, bins, n, m):

    def rebuild_index(df):
        arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
        df.index = MultiIndex.from_arrays(arr, names=df.index.names)
        return df

    for isort, normalize, sort, ascending, dropna \
            in product((False, True), repeat=5):

        kwargs = dict(normalize=normalize, sort=sort,
                      ascending=ascending, dropna=dropna, bins=bins)

        gr = df.groupby(keys, sort=isort)
        left = gr['3rd'].value_counts(**kwargs)

        gr = df.groupby(keys, sort=isort)
        right = gr['3rd'].apply(Series.value_counts, **kwargs)
        right.index.names = right.index.names[:-1] + ['3rd']

        # have to sort on index because of unstable sort on values
        left, right = map(rebuild_index, (left, right))  # xref GH9212
        tm.assert_series_equal(left.sort_index(), right.sort_index())
@@ -1,297 +0,0 @@
"""
|
||||
test methods relating to generic function evaluation
|
||||
the so-called white/black lists
|
||||
"""
|
||||
|
||||
from string import ascii_lowercase
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex, Series, compat, date_range
|
||||
from pandas.util import testing as tm
|
||||
|
||||
AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
|
||||
'mad', 'std', 'var', 'sem']
|
||||
AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad']
|
||||
|
||||
df_whitelist = [
    'quantile',
    'fillna',
    'mad',
    'take',
    'idxmax',
    'idxmin',
    'tshift',
    'skew',
    'plot',
    'hist',
    'dtypes',
    'corrwith',
    'corr',
    'cov',
    'diff',
]


@pytest.fixture(params=df_whitelist)
def df_whitelist_fixture(request):
    return request.param

s_whitelist = [
    'quantile',
    'fillna',
    'mad',
    'take',
    'idxmax',
    'idxmin',
    'tshift',
    'skew',
    'plot',
    'hist',
    'dtype',
    'corr',
    'cov',
    'diff',
    'unique',
    'nlargest',
    'nsmallest',
    'is_monotonic_increasing',
    'is_monotonic_decreasing',
]


@pytest.fixture(params=s_whitelist)
def s_whitelist_fixture(request):
    return request.param

@pytest.fixture
def mframe():
    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
                                                              'three']],
                       codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                              [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['first', 'second'])
    return DataFrame(np.random.randn(10, 3), index=index,
                     columns=['A', 'B', 'C'])

@pytest.fixture
def df():
    return DataFrame(
        {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
         'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
         'C': np.random.randn(8),
         'D': np.random.randn(8)})

@pytest.fixture
def df_letters():
    letters = np.array(list(ascii_lowercase))
    N = 10
    random_letters = letters.take(np.random.randint(0, 26, N))
    df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),
                    'letters': Series(random_letters)})
    return df

@pytest.mark.parametrize("whitelist", [df_whitelist, s_whitelist])
|
||||
def test_groupby_whitelist(df_letters, whitelist):
|
||||
df = df_letters
|
||||
if whitelist == df_whitelist:
|
||||
# dataframe
|
||||
obj = df_letters
|
||||
else:
|
||||
obj = df_letters['floats']
|
||||
|
||||
gb = obj.groupby(df.letters)
|
||||
|
||||
assert set(whitelist) == set(gb._apply_whitelist)
|
||||
|
||||
|
||||
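def _demo_whitelist_dispatch():  # editor's sketch, hypothetical name
    # Illustrative sketch, not part of the original test file, of what the
    # whitelist means in practice: listed methods are reachable on the
    # groupby object and dispatch per group, while blacklisted ones raise
    # AttributeError (see test_groupby_blacklist below).
    df = DataFrame({'letters': ['a', 'a', 'b'], 'floats': [1.0, 2.0, 3.0]})
    gb = df.groupby('letters')
    gb.fillna(0)  # 'fillna' is whitelisted
    try:
        gb.eval  # 'eval' is not allowed on a groupby
    except AttributeError:
        pass
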
def check_whitelist(obj, df, m):
    # check the obj for a particular whitelist m

    gb = obj.groupby(df.letters)

    f = getattr(type(gb), m)

    # name
    try:
        n = f.__name__
    except AttributeError:
        return
    assert n == m

    # qualname
    if compat.PY3:
        try:
            n = f.__qualname__
        except AttributeError:
            return
        assert n.endswith(m)

def test_groupby_series_whitelist(df_letters, s_whitelist_fixture):
    m = s_whitelist_fixture
    df = df_letters
    check_whitelist(df.letters, df, m)


def test_groupby_frame_whitelist(df_letters, df_whitelist_fixture):
    m = df_whitelist_fixture
    df = df_letters
    check_whitelist(df, df, m)

@pytest.fixture
def raw_frame():
    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
                                                              'three']],
                       codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                              [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['first', 'second'])
    raw_frame = DataFrame(np.random.randn(10, 3), index=index,
                          columns=Index(['A', 'B', 'C'], name='exp'))
    raw_frame.iloc[1, [1, 2]] = np.nan
    raw_frame.iloc[7, [0, 1]] = np.nan
    return raw_frame

@pytest.mark.parametrize('op', AGG_FUNCTIONS)
@pytest.mark.parametrize('level', [0, 1])
@pytest.mark.parametrize('axis', [0, 1])
@pytest.mark.parametrize('skipna', [True, False])
@pytest.mark.parametrize('sort', [True, False])
def test_regression_whitelist_methods(
        raw_frame, op, level,
        axis, skipna, sort):
    # GH6944
    # GH 17537
    # explicitly test the whitelist methods

    if axis == 0:
        frame = raw_frame
    else:
        frame = raw_frame.T

    if op in AGG_FUNCTIONS_WITH_SKIPNA:
        grouped = frame.groupby(level=level, axis=axis, sort=sort)
        result = getattr(grouped, op)(skipna=skipna)
        expected = getattr(frame, op)(level=level, axis=axis,
                                      skipna=skipna)
        if sort:
            expected = expected.sort_index(axis=axis, level=level)
        tm.assert_frame_equal(result, expected)
    else:
        grouped = frame.groupby(level=level, axis=axis, sort=sort)
        result = getattr(grouped, op)()
        expected = getattr(frame, op)(level=level, axis=axis)
        if sort:
            expected = expected.sort_index(axis=axis, level=level)
        tm.assert_frame_equal(result, expected)

def test_groupby_blacklist(df_letters):
    df = df_letters
    s = df_letters.floats

    blacklist = [
        'eval', 'query', 'abs', 'where',
        'mask', 'align', 'groupby', 'clip', 'astype',
        'at', 'combine', 'consolidate', 'convert_objects',
    ]
    to_methods = [method for method in dir(df) if method.startswith('to_')]

    blacklist.extend(to_methods)

    # e.g., to_csv
    defined_but_not_allowed = ("(?:^Cannot.+{0!r}.+{1!r}.+try using the "
                               "'apply' method$)")

    # e.g., query, eval
    not_defined = "(?:^{1!r} object has no attribute {0!r}$)"
    fmt = defined_but_not_allowed + '|' + not_defined
    for bl in blacklist:
        for obj in (df, s):
            gb = obj.groupby(df.letters)
            msg = fmt.format(bl, type(gb).__name__)
            with pytest.raises(AttributeError, match=msg):
                getattr(gb, bl)

def test_tab_completion(mframe):
    grp = mframe.groupby(level='second')
    results = {v for v in dir(grp) if not v.startswith('_')}
    expected = {
        'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter',
        'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
        'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot',
        'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
        'nunique', 'head', 'describe', 'cummax', 'quantile',
        'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
        'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew',
        'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
        'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
        'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe',
    }
    assert results == expected

def test_groupby_function_rename(mframe):
    grp = mframe.groupby(level='second')
    for name in ['sum', 'prod', 'min', 'max', 'first', 'last']:
        f = getattr(grp, name)
        assert f.__name__ == name

def test_groupby_selection_with_methods(df):
    # some methods which require DatetimeIndex
    rng = date_range('2014', periods=len(df))
    df.index = rng

    g = df.groupby(['A'])[['C']]
    g_exp = df[['C']].groupby(df['A'])
    # TODO check groupby with > 1 col ?

    # methods which are called as .foo()
    methods = ['count',
               'corr',
               'cummax',
               'cummin',
               'cumprod',
               'describe',
               'rank',
               'quantile',
               'diff',
               'shift',
               'all',
               'any',
               'idxmin',
               'idxmax',
               'ffill',
               'bfill',
               'pct_change',
               'tshift']

    for m in methods:
        res = getattr(g, m)()
        exp = getattr(g_exp, m)()

        # should always be frames!
        tm.assert_frame_equal(res, exp)

    # methods which aren't just .foo()
    tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
    tm.assert_frame_equal(g.dtypes, g_exp.dtypes)
    tm.assert_frame_equal(g.apply(lambda x: x.sum()),
                          g_exp.apply(lambda x: x.sum()))

    tm.assert_frame_equal(g.resample('D').mean(), g_exp.resample('D').mean())
    tm.assert_frame_equal(g.resample('D').ohlc(),
                          g_exp.resample('D').ohlc())

    tm.assert_frame_equal(g.filter(lambda x: len(x) == 3),
                          g_exp.filter(lambda x: len(x) == 3))