pruned venvs
This commit is contained in:
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -1,140 +0,0 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas import compat
|
||||
from pandas.util._decorators import cache_readonly
|
||||
import pandas.util.testing as tm
|
||||
import pandas as pd
|
||||
|
||||
_seriesd = tm.getSeriesData()
|
||||
_tsd = tm.getTimeSeriesData()
|
||||
|
||||
_frame = pd.DataFrame(_seriesd)
|
||||
_frame2 = pd.DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])
|
||||
_intframe = pd.DataFrame({k: v.astype(int)
|
||||
for k, v in compat.iteritems(_seriesd)})
|
||||
|
||||
_tsframe = pd.DataFrame(_tsd)
|
||||
|
||||
_mixed_frame = _frame.copy()
|
||||
_mixed_frame['foo'] = 'bar'
|
||||
|
||||
|
||||
class TestData(object):
|
||||
|
||||
@cache_readonly
|
||||
def frame(self):
|
||||
return _frame.copy()
|
||||
|
||||
@cache_readonly
|
||||
def frame2(self):
|
||||
return _frame2.copy()
|
||||
|
||||
@cache_readonly
|
||||
def intframe(self):
|
||||
# force these all to int64 to avoid platform testing issues
|
||||
return pd.DataFrame({c: s for c, s in compat.iteritems(_intframe)},
|
||||
dtype=np.int64)
|
||||
|
||||
@cache_readonly
|
||||
def tsframe(self):
|
||||
return _tsframe.copy()
|
||||
|
||||
@cache_readonly
|
||||
def mixed_frame(self):
|
||||
return _mixed_frame.copy()
|
||||
|
||||
@cache_readonly
|
||||
def mixed_float(self):
|
||||
return pd.DataFrame({'A': _frame['A'].copy().astype('float32'),
|
||||
'B': _frame['B'].copy().astype('float32'),
|
||||
'C': _frame['C'].copy().astype('float16'),
|
||||
'D': _frame['D'].copy().astype('float64')})
|
||||
|
||||
@cache_readonly
|
||||
def mixed_float2(self):
|
||||
return pd.DataFrame({'A': _frame2['A'].copy().astype('float32'),
|
||||
'B': _frame2['B'].copy().astype('float32'),
|
||||
'C': _frame2['C'].copy().astype('float16'),
|
||||
'D': _frame2['D'].copy().astype('float64')})
|
||||
|
||||
@cache_readonly
|
||||
def mixed_int(self):
|
||||
return pd.DataFrame({'A': _intframe['A'].copy().astype('int32'),
|
||||
'B': np.ones(len(_intframe['B']), dtype='uint64'),
|
||||
'C': _intframe['C'].copy().astype('uint8'),
|
||||
'D': _intframe['D'].copy().astype('int64')})
|
||||
|
||||
@cache_readonly
|
||||
def all_mixed(self):
|
||||
return pd.DataFrame({'a': 1., 'b': 2, 'c': 'foo',
|
||||
'float32': np.array([1.] * 10, dtype='float32'),
|
||||
'int32': np.array([1] * 10, dtype='int32')},
|
||||
index=np.arange(10))
|
||||
|
||||
@cache_readonly
|
||||
def tzframe(self):
|
||||
result = pd.DataFrame({'A': pd.date_range('20130101', periods=3),
|
||||
'B': pd.date_range('20130101', periods=3,
|
||||
tz='US/Eastern'),
|
||||
'C': pd.date_range('20130101', periods=3,
|
||||
tz='CET')})
|
||||
result.iloc[1, 1] = pd.NaT
|
||||
result.iloc[1, 2] = pd.NaT
|
||||
return result
|
||||
|
||||
@cache_readonly
|
||||
def empty(self):
|
||||
return pd.DataFrame({})
|
||||
|
||||
@cache_readonly
|
||||
def ts1(self):
|
||||
return tm.makeTimeSeries(nper=30)
|
||||
|
||||
@cache_readonly
|
||||
def ts2(self):
|
||||
return tm.makeTimeSeries(nper=30)[5:]
|
||||
|
||||
@cache_readonly
|
||||
def simple(self):
|
||||
arr = np.array([[1., 2., 3.],
|
||||
[4., 5., 6.],
|
||||
[7., 8., 9.]])
|
||||
|
||||
return pd.DataFrame(arr, columns=['one', 'two', 'three'],
|
||||
index=['a', 'b', 'c'])
|
||||
|
||||
# self.ts3 = tm.makeTimeSeries()[-5:]
|
||||
# self.ts4 = tm.makeTimeSeries()[1:-1]
|
||||
|
||||
|
||||
def _check_mixed_float(df, dtype=None):
|
||||
# float16 are most likely to be upcasted to float32
|
||||
dtypes = dict(A='float32', B='float32', C='float16', D='float64')
|
||||
if isinstance(dtype, compat.string_types):
|
||||
dtypes = {k: dtype for k, v in dtypes.items()}
|
||||
elif isinstance(dtype, dict):
|
||||
dtypes.update(dtype)
|
||||
if dtypes.get('A'):
|
||||
assert(df.dtypes['A'] == dtypes['A'])
|
||||
if dtypes.get('B'):
|
||||
assert(df.dtypes['B'] == dtypes['B'])
|
||||
if dtypes.get('C'):
|
||||
assert(df.dtypes['C'] == dtypes['C'])
|
||||
if dtypes.get('D'):
|
||||
assert(df.dtypes['D'] == dtypes['D'])
|
||||
|
||||
|
||||
def _check_mixed_int(df, dtype=None):
|
||||
dtypes = dict(A='int32', B='uint64', C='uint8', D='int64')
|
||||
if isinstance(dtype, compat.string_types):
|
||||
dtypes = {k: dtype for k, v in dtypes.items()}
|
||||
elif isinstance(dtype, dict):
|
||||
dtypes.update(dtype)
|
||||
if dtypes.get('A'):
|
||||
assert(df.dtypes['A'] == dtypes['A'])
|
||||
if dtypes.get('B'):
|
||||
assert(df.dtypes['B'] == dtypes['B'])
|
||||
if dtypes.get('C'):
|
||||
assert(df.dtypes['C'] == dtypes['C'])
|
||||
if dtypes.get('D'):
|
||||
assert(df.dtypes['D'] == dtypes['D'])
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,515 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
|
||||
# pylint: disable-msg=W0612,E1101
|
||||
from copy import deepcopy
|
||||
import pydoc
|
||||
|
||||
from pandas.compat import range, lrange, long
|
||||
from pandas import compat
|
||||
|
||||
from numpy.random import randn
|
||||
import numpy as np
|
||||
|
||||
from pandas import (DataFrame, Series, date_range, timedelta_range,
|
||||
Categorical, SparseDataFrame)
|
||||
import pandas as pd
|
||||
|
||||
from pandas.util.testing import (assert_almost_equal,
|
||||
assert_series_equal,
|
||||
assert_frame_equal)
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
|
||||
class SharedWithSparse(object):
|
||||
"""
|
||||
A collection of tests DataFrame and SparseDataFrame can share.
|
||||
|
||||
In generic tests on this class, use ``self._assert_frame_equal()`` and
|
||||
``self._assert_series_equal()`` which are implemented in sub-classes
|
||||
and dispatch correctly.
|
||||
"""
|
||||
def _assert_frame_equal(self, left, right):
|
||||
"""Dispatch to frame class dependent assertion"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _assert_series_equal(self, left, right):
|
||||
"""Dispatch to series class dependent assertion"""
|
||||
raise NotImplementedError
|
||||
|
||||
def test_copy_index_name_checking(self):
|
||||
# don't want to be able to modify the index stored elsewhere after
|
||||
# making a copy
|
||||
for attr in ('index', 'columns'):
|
||||
ind = getattr(self.frame, attr)
|
||||
ind.name = None
|
||||
cp = self.frame.copy()
|
||||
getattr(cp, attr).name = 'foo'
|
||||
assert getattr(self.frame, attr).name is None
|
||||
|
||||
def test_getitem_pop_assign_name(self):
|
||||
s = self.frame['A']
|
||||
assert s.name == 'A'
|
||||
|
||||
s = self.frame.pop('A')
|
||||
assert s.name == 'A'
|
||||
|
||||
s = self.frame.loc[:, 'B']
|
||||
assert s.name == 'B'
|
||||
|
||||
s2 = s.loc[:]
|
||||
assert s2.name == 'B'
|
||||
|
||||
def test_get_value(self):
|
||||
for idx in self.frame.index:
|
||||
for col in self.frame.columns:
|
||||
with tm.assert_produces_warning(FutureWarning,
|
||||
check_stacklevel=False):
|
||||
result = self.frame.get_value(idx, col)
|
||||
expected = self.frame[col][idx]
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_add_prefix_suffix(self):
|
||||
with_prefix = self.frame.add_prefix('foo#')
|
||||
expected = pd.Index(['foo#%s' % c for c in self.frame.columns])
|
||||
tm.assert_index_equal(with_prefix.columns, expected)
|
||||
|
||||
with_suffix = self.frame.add_suffix('#foo')
|
||||
expected = pd.Index(['%s#foo' % c for c in self.frame.columns])
|
||||
tm.assert_index_equal(with_suffix.columns, expected)
|
||||
|
||||
with_pct_prefix = self.frame.add_prefix('%')
|
||||
expected = pd.Index(['%{}'.format(c) for c in self.frame.columns])
|
||||
tm.assert_index_equal(with_pct_prefix.columns, expected)
|
||||
|
||||
with_pct_suffix = self.frame.add_suffix('%')
|
||||
expected = pd.Index(['{}%'.format(c) for c in self.frame.columns])
|
||||
tm.assert_index_equal(with_pct_suffix.columns, expected)
|
||||
|
||||
def test_get_axis(self):
|
||||
f = self.frame
|
||||
assert f._get_axis_number(0) == 0
|
||||
assert f._get_axis_number(1) == 1
|
||||
assert f._get_axis_number('index') == 0
|
||||
assert f._get_axis_number('rows') == 0
|
||||
assert f._get_axis_number('columns') == 1
|
||||
|
||||
assert f._get_axis_name(0) == 'index'
|
||||
assert f._get_axis_name(1) == 'columns'
|
||||
assert f._get_axis_name('index') == 'index'
|
||||
assert f._get_axis_name('rows') == 'index'
|
||||
assert f._get_axis_name('columns') == 'columns'
|
||||
|
||||
assert f._get_axis(0) is f.index
|
||||
assert f._get_axis(1) is f.columns
|
||||
|
||||
tm.assert_raises_regex(
|
||||
ValueError, 'No axis named', f._get_axis_number, 2)
|
||||
tm.assert_raises_regex(
|
||||
ValueError, 'No axis.*foo', f._get_axis_name, 'foo')
|
||||
tm.assert_raises_regex(
|
||||
ValueError, 'No axis.*None', f._get_axis_name, None)
|
||||
tm.assert_raises_regex(ValueError, 'No axis named',
|
||||
f._get_axis_number, None)
|
||||
|
||||
def test_keys(self):
|
||||
getkeys = self.frame.keys
|
||||
assert getkeys() is self.frame.columns
|
||||
|
||||
def test_column_contains_typeerror(self):
|
||||
try:
|
||||
self.frame.columns in self.frame
|
||||
except TypeError:
|
||||
pass
|
||||
|
||||
def test_tab_completion(self):
|
||||
# DataFrame whose columns are identifiers shall have them in __dir__.
|
||||
df = pd.DataFrame([list('abcd'), list('efgh')], columns=list('ABCD'))
|
||||
for key in list('ABCD'):
|
||||
assert key in dir(df)
|
||||
assert isinstance(df.__getitem__('A'), pd.Series)
|
||||
|
||||
# DataFrame whose first-level columns are identifiers shall have
|
||||
# them in __dir__.
|
||||
df = pd.DataFrame(
|
||||
[list('abcd'), list('efgh')],
|
||||
columns=pd.MultiIndex.from_tuples(list(zip('ABCD', 'EFGH'))))
|
||||
for key in list('ABCD'):
|
||||
assert key in dir(df)
|
||||
for key in list('EFGH'):
|
||||
assert key not in dir(df)
|
||||
assert isinstance(df.__getitem__('A'), pd.DataFrame)
|
||||
|
||||
def test_not_hashable(self):
|
||||
df = self.klass([1])
|
||||
pytest.raises(TypeError, hash, df)
|
||||
pytest.raises(TypeError, hash, self.empty)
|
||||
|
||||
def test_new_empty_index(self):
|
||||
df1 = self.klass(randn(0, 3))
|
||||
df2 = self.klass(randn(0, 3))
|
||||
df1.index.name = 'foo'
|
||||
assert df2.index.name is None
|
||||
|
||||
def test_array_interface(self):
|
||||
with np.errstate(all='ignore'):
|
||||
result = np.sqrt(self.frame)
|
||||
assert isinstance(result, type(self.frame))
|
||||
assert result.index is self.frame.index
|
||||
assert result.columns is self.frame.columns
|
||||
|
||||
self._assert_frame_equal(result, self.frame.apply(np.sqrt))
|
||||
|
||||
def test_get_agg_axis(self):
|
||||
cols = self.frame._get_agg_axis(0)
|
||||
assert cols is self.frame.columns
|
||||
|
||||
idx = self.frame._get_agg_axis(1)
|
||||
assert idx is self.frame.index
|
||||
|
||||
pytest.raises(ValueError, self.frame._get_agg_axis, 2)
|
||||
|
||||
def test_nonzero(self):
|
||||
assert self.empty.empty
|
||||
|
||||
assert not self.frame.empty
|
||||
assert not self.mixed_frame.empty
|
||||
|
||||
# corner case
|
||||
df = DataFrame({'A': [1., 2., 3.],
|
||||
'B': ['a', 'b', 'c']},
|
||||
index=np.arange(3))
|
||||
del df['A']
|
||||
assert not df.empty
|
||||
|
||||
def test_iteritems(self):
|
||||
df = self.klass([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
|
||||
for k, v in compat.iteritems(df):
|
||||
assert isinstance(v, self.klass._constructor_sliced)
|
||||
|
||||
def test_items(self):
|
||||
# issue #17213, #13918
|
||||
cols = ['a', 'b', 'c']
|
||||
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
|
||||
for c, (k, v) in zip(cols, df.items()):
|
||||
assert c == k
|
||||
assert isinstance(v, Series)
|
||||
assert (df[k] == v).all()
|
||||
|
||||
def test_iter(self):
|
||||
assert tm.equalContents(list(self.frame), self.frame.columns)
|
||||
|
||||
def test_iterrows(self):
|
||||
for k, v in self.frame.iterrows():
|
||||
exp = self.frame.loc[k]
|
||||
self._assert_series_equal(v, exp)
|
||||
|
||||
for k, v in self.mixed_frame.iterrows():
|
||||
exp = self.mixed_frame.loc[k]
|
||||
self._assert_series_equal(v, exp)
|
||||
|
||||
def test_iterrows_iso8601(self):
|
||||
# GH19671
|
||||
if self.klass == SparseDataFrame:
|
||||
pytest.xfail(reason='SparseBlock datetime type not implemented.')
|
||||
|
||||
s = self.klass(
|
||||
{'non_iso8601': ['M1701', 'M1802', 'M1903', 'M2004'],
|
||||
'iso8601': date_range('2000-01-01', periods=4, freq='M')})
|
||||
for k, v in s.iterrows():
|
||||
exp = s.loc[k]
|
||||
self._assert_series_equal(v, exp)
|
||||
|
||||
def test_itertuples(self):
|
||||
for i, tup in enumerate(self.frame.itertuples()):
|
||||
s = self.klass._constructor_sliced(tup[1:])
|
||||
s.name = tup[0]
|
||||
expected = self.frame.iloc[i, :].reset_index(drop=True)
|
||||
self._assert_series_equal(s, expected)
|
||||
|
||||
df = self.klass({'floats': np.random.randn(5),
|
||||
'ints': lrange(5)}, columns=['floats', 'ints'])
|
||||
|
||||
for tup in df.itertuples(index=False):
|
||||
assert isinstance(tup[1], (int, long))
|
||||
|
||||
df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
|
||||
dfaa = df[['a', 'a']]
|
||||
|
||||
assert (list(dfaa.itertuples()) ==
|
||||
[(0, 1, 1), (1, 2, 2), (2, 3, 3)])
|
||||
|
||||
# repr with be int/long on 32-bit/windows
|
||||
if not (compat.is_platform_windows() or compat.is_platform_32bit()):
|
||||
assert (repr(list(df.itertuples(name=None))) ==
|
||||
'[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')
|
||||
|
||||
tup = next(df.itertuples(name='TestName'))
|
||||
assert tup._fields == ('Index', 'a', 'b')
|
||||
assert (tup.Index, tup.a, tup.b) == tup
|
||||
assert type(tup).__name__ == 'TestName'
|
||||
|
||||
df.columns = ['def', 'return']
|
||||
tup2 = next(df.itertuples(name='TestName'))
|
||||
assert tup2 == (0, 1, 4)
|
||||
assert tup2._fields == ('Index', '_1', '_2')
|
||||
|
||||
df3 = DataFrame({'f' + str(i): [i] for i in range(1024)})
|
||||
# will raise SyntaxError if trying to create namedtuple
|
||||
tup3 = next(df3.itertuples())
|
||||
assert not hasattr(tup3, '_fields')
|
||||
assert isinstance(tup3, tuple)
|
||||
|
||||
def test_sequence_like_with_categorical(self):
|
||||
|
||||
# GH 7839
|
||||
# make sure can iterate
|
||||
df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
|
||||
"raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
|
||||
df['grade'] = Categorical(df['raw_grade'])
|
||||
|
||||
# basic sequencing testing
|
||||
result = list(df.grade.values)
|
||||
expected = np.array(df.grade.values).tolist()
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
# iteration
|
||||
for t in df.itertuples(index=False):
|
||||
str(t)
|
||||
|
||||
for row, s in df.iterrows():
|
||||
str(s)
|
||||
|
||||
for c, col in df.iteritems():
|
||||
str(s)
|
||||
|
||||
def test_len(self):
|
||||
assert len(self.frame) == len(self.frame.index)
|
||||
|
||||
def test_values(self):
|
||||
frame = self.frame
|
||||
arr = frame.values
|
||||
|
||||
frame_cols = frame.columns
|
||||
for i, row in enumerate(arr):
|
||||
for j, value in enumerate(row):
|
||||
col = frame_cols[j]
|
||||
if np.isnan(value):
|
||||
assert np.isnan(frame[col][i])
|
||||
else:
|
||||
assert value == frame[col][i]
|
||||
|
||||
# mixed type
|
||||
arr = self.mixed_frame[['foo', 'A']].values
|
||||
assert arr[0, 0] == 'bar'
|
||||
|
||||
df = self.klass({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]})
|
||||
arr = df.values
|
||||
assert arr[0, 0] == 1j
|
||||
|
||||
# single block corner case
|
||||
arr = self.frame[['A', 'B']].values
|
||||
expected = self.frame.reindex(columns=['A', 'B']).values
|
||||
assert_almost_equal(arr, expected)
|
||||
|
||||
def test_transpose(self):
|
||||
frame = self.frame
|
||||
dft = frame.T
|
||||
for idx, series in compat.iteritems(dft):
|
||||
for col, value in compat.iteritems(series):
|
||||
if np.isnan(value):
|
||||
assert np.isnan(frame[col][idx])
|
||||
else:
|
||||
assert value == frame[col][idx]
|
||||
|
||||
# mixed type
|
||||
index, data = tm.getMixedTypeDict()
|
||||
mixed = self.klass(data, index=index)
|
||||
|
||||
mixed_T = mixed.T
|
||||
for col, s in compat.iteritems(mixed_T):
|
||||
assert s.dtype == np.object_
|
||||
|
||||
def test_swapaxes(self):
|
||||
df = self.klass(np.random.randn(10, 5))
|
||||
self._assert_frame_equal(df.T, df.swapaxes(0, 1))
|
||||
self._assert_frame_equal(df.T, df.swapaxes(1, 0))
|
||||
self._assert_frame_equal(df, df.swapaxes(0, 0))
|
||||
pytest.raises(ValueError, df.swapaxes, 2, 5)
|
||||
|
||||
def test_axis_aliases(self):
|
||||
f = self.frame
|
||||
|
||||
# reg name
|
||||
expected = f.sum(axis=0)
|
||||
result = f.sum(axis='index')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
expected = f.sum(axis=1)
|
||||
result = f.sum(axis='columns')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_class_axis(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/18147
|
||||
# no exception and no empty docstring
|
||||
assert pydoc.getdoc(DataFrame.index)
|
||||
assert pydoc.getdoc(DataFrame.columns)
|
||||
|
||||
def test_more_values(self):
|
||||
values = self.mixed_frame.values
|
||||
assert values.shape[1] == len(self.mixed_frame.columns)
|
||||
|
||||
def test_repr_with_mi_nat(self):
|
||||
df = self.klass({'X': [1, 2]},
|
||||
index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']])
|
||||
res = repr(df)
|
||||
exp = ' X\nNaT a 1\n2013-01-01 b 2'
|
||||
assert res == exp
|
||||
|
||||
def test_iteritems_names(self):
|
||||
for k, v in compat.iteritems(self.mixed_frame):
|
||||
assert v.name == k
|
||||
|
||||
def test_series_put_names(self):
|
||||
series = self.mixed_frame._series
|
||||
for k, v in compat.iteritems(series):
|
||||
assert v.name == k
|
||||
|
||||
def test_empty_nonzero(self):
|
||||
df = self.klass([1, 2, 3])
|
||||
assert not df.empty
|
||||
df = self.klass(index=[1], columns=[1])
|
||||
assert not df.empty
|
||||
df = self.klass(index=['a', 'b'], columns=['c', 'd']).dropna()
|
||||
assert df.empty
|
||||
assert df.T.empty
|
||||
empty_frames = [self.klass(),
|
||||
self.klass(index=[1]),
|
||||
self.klass(columns=[1]),
|
||||
self.klass({1: []})]
|
||||
for df in empty_frames:
|
||||
assert df.empty
|
||||
assert df.T.empty
|
||||
|
||||
def test_with_datetimelikes(self):
|
||||
|
||||
df = self.klass({'A': date_range('20130101', periods=10),
|
||||
'B': timedelta_range('1 day', periods=10)})
|
||||
t = df.T
|
||||
|
||||
result = t.get_dtype_counts()
|
||||
expected = Series({'object': 10})
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameMisc(SharedWithSparse, TestData):
|
||||
|
||||
klass = DataFrame
|
||||
# SharedWithSparse tests use generic, klass-agnostic assertion
|
||||
_assert_frame_equal = staticmethod(assert_frame_equal)
|
||||
_assert_series_equal = staticmethod(assert_series_equal)
|
||||
|
||||
def test_values(self):
|
||||
self.frame.values[:, 0] = 5.
|
||||
assert (self.frame.values[:, 0] == 5).all()
|
||||
|
||||
def test_as_matrix_deprecated(self):
|
||||
# GH18458
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
result = self.frame.as_matrix(columns=self.frame.columns.tolist())
|
||||
expected = self.frame.values
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_deepcopy(self):
|
||||
cp = deepcopy(self.frame)
|
||||
series = cp['A']
|
||||
series[:] = 10
|
||||
for idx, value in compat.iteritems(series):
|
||||
assert self.frame['A'][idx] != value
|
||||
|
||||
def test_transpose_get_view(self):
|
||||
dft = self.frame.T
|
||||
dft.values[:, 5:10] = 5
|
||||
|
||||
assert (self.frame.values[5:10] == 5).all()
|
||||
|
||||
def test_inplace_return_self(self):
|
||||
# re #1893
|
||||
|
||||
data = DataFrame({'a': ['foo', 'bar', 'baz', 'qux'],
|
||||
'b': [0, 0, 1, 1],
|
||||
'c': [1, 2, 3, 4]})
|
||||
|
||||
def _check_f(base, f):
|
||||
result = f(base)
|
||||
assert result is None
|
||||
|
||||
# -----DataFrame-----
|
||||
|
||||
# set_index
|
||||
f = lambda x: x.set_index('a', inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# reset_index
|
||||
f = lambda x: x.reset_index(inplace=True)
|
||||
_check_f(data.set_index('a'), f)
|
||||
|
||||
# drop_duplicates
|
||||
f = lambda x: x.drop_duplicates(inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# sort
|
||||
f = lambda x: x.sort_values('b', inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# sort_index
|
||||
f = lambda x: x.sort_index(inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# fillna
|
||||
f = lambda x: x.fillna(0, inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# replace
|
||||
f = lambda x: x.replace(1, 0, inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# rename
|
||||
f = lambda x: x.rename({1: 'foo'}, inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# -----Series-----
|
||||
d = data.copy()['c']
|
||||
|
||||
# reset_index
|
||||
f = lambda x: x.reset_index(inplace=True, drop=True)
|
||||
_check_f(data.set_index('a')['c'], f)
|
||||
|
||||
# fillna
|
||||
f = lambda x: x.fillna(0, inplace=True)
|
||||
_check_f(d.copy(), f)
|
||||
|
||||
# replace
|
||||
f = lambda x: x.replace(1, 0, inplace=True)
|
||||
_check_f(d.copy(), f)
|
||||
|
||||
# rename
|
||||
f = lambda x: x.rename({1: 'foo'}, inplace=True)
|
||||
_check_f(d.copy(), f)
|
||||
|
||||
def test_tab_complete_warning(self, ip):
|
||||
# https://github.com/pandas-dev/pandas/issues/16409
|
||||
pytest.importorskip('IPython', minversion="6.0.0")
|
||||
from IPython.core.completer import provisionalcompleter
|
||||
|
||||
code = "import pandas as pd; df = pd.DataFrame()"
|
||||
ip.run_code(code)
|
||||
with tm.assert_produces_warning(None):
|
||||
with provisionalcompleter('ignore'):
|
||||
list(ip.Completer.completions('df.', 1))
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,277 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat import range
|
||||
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Comparisons
|
||||
|
||||
class TestFrameComparisons(object):
|
||||
def test_df_boolean_comparison_error(self):
|
||||
# GH#4576
|
||||
# boolean comparisons with a tuple/list give unexpected results
|
||||
df = pd.DataFrame(np.arange(6).reshape((3, 2)))
|
||||
|
||||
# not shape compatible
|
||||
with pytest.raises(ValueError):
|
||||
df == (2, 2)
|
||||
with pytest.raises(ValueError):
|
||||
df == [2, 2]
|
||||
|
||||
def test_df_float_none_comparison(self):
|
||||
df = pd.DataFrame(np.random.randn(8, 3), index=range(8),
|
||||
columns=['A', 'B', 'C'])
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
df.__eq__(None)
|
||||
|
||||
def test_df_string_comparison(self):
|
||||
df = pd.DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}])
|
||||
mask_a = df.a > 1
|
||||
tm.assert_frame_equal(df[mask_a], df.loc[1:1, :])
|
||||
tm.assert_frame_equal(df[-mask_a], df.loc[0:0, :])
|
||||
|
||||
mask_b = df.b == "foo"
|
||||
tm.assert_frame_equal(df[mask_b], df.loc[0:0, :])
|
||||
tm.assert_frame_equal(df[-mask_b], df.loc[1:1, :])
|
||||
|
||||
@pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
|
||||
def test_df_flex_cmp_constant_return_types(self, opname):
|
||||
# GH#15077, non-empty DataFrame
|
||||
df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
|
||||
const = 2
|
||||
|
||||
result = getattr(df, opname)(const).get_dtype_counts()
|
||||
tm.assert_series_equal(result, pd.Series([2], ['bool']))
|
||||
|
||||
@pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
|
||||
def test_df_flex_cmp_constant_return_types_empty(self, opname):
|
||||
# GH#15077 empty DataFrame
|
||||
df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
|
||||
const = 2
|
||||
|
||||
empty = df.iloc[:0]
|
||||
result = getattr(empty, opname)(const).get_dtype_counts()
|
||||
tm.assert_series_equal(result, pd.Series([2], ['bool']))
|
||||
|
||||
@pytest.mark.parametrize('timestamps', [
|
||||
[pd.Timestamp('2012-01-01 13:00:00+00:00')] * 2,
|
||||
[pd.Timestamp('2012-01-01 13:00:00')] * 2])
|
||||
def test_tz_aware_scalar_comparison(self, timestamps):
|
||||
# Test for issue #15966
|
||||
df = pd.DataFrame({'test': timestamps})
|
||||
expected = pd.DataFrame({'test': [False, False]})
|
||||
tm.assert_frame_equal(df == -1, expected)
|
||||
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Arithmetic
|
||||
|
||||
class TestFrameFlexArithmetic(object):
|
||||
def test_df_add_flex_filled_mixed_dtypes(self):
|
||||
# GH#19611
|
||||
dti = pd.date_range('2016-01-01', periods=3)
|
||||
ser = pd.Series(['1 Day', 'NaT', '2 Days'], dtype='timedelta64[ns]')
|
||||
df = pd.DataFrame({'A': dti, 'B': ser})
|
||||
other = pd.DataFrame({'A': ser, 'B': ser})
|
||||
fill = pd.Timedelta(days=1).to_timedelta64()
|
||||
result = df.add(other, fill_value=fill)
|
||||
|
||||
expected = pd.DataFrame(
|
||||
{'A': pd.Series(['2016-01-02', '2016-01-03', '2016-01-05'],
|
||||
dtype='datetime64[ns]'),
|
||||
'B': ser * 2})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestFrameMulDiv(object):
|
||||
"""Tests for DataFrame multiplication and division"""
|
||||
# ------------------------------------------------------------------
|
||||
# Mod By Zero
|
||||
|
||||
def test_df_mod_zero_df(self):
|
||||
# GH#3590, modulo as ints
|
||||
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
|
||||
|
||||
# this is technically wrong, as the integer portion is coerced to float
|
||||
# ###
|
||||
first = pd.Series([0, 0, 0, 0], dtype='float64')
|
||||
second = pd.Series([np.nan, np.nan, np.nan, 0])
|
||||
expected = pd.DataFrame({'first': first, 'second': second})
|
||||
result = df % df
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_df_mod_zero_array(self):
|
||||
# GH#3590, modulo as ints
|
||||
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
|
||||
|
||||
# this is technically wrong, as the integer portion is coerced to float
|
||||
# ###
|
||||
first = pd.Series([0, 0, 0, 0], dtype='float64')
|
||||
second = pd.Series([np.nan, np.nan, np.nan, 0])
|
||||
expected = pd.DataFrame({'first': first, 'second': second})
|
||||
|
||||
# numpy has a slightly different (wrong) treatment
|
||||
with np.errstate(all='ignore'):
|
||||
arr = df.values % df.values
|
||||
result2 = pd.DataFrame(arr, index=df.index,
|
||||
columns=df.columns, dtype='float64')
|
||||
result2.iloc[0:3, 1] = np.nan
|
||||
tm.assert_frame_equal(result2, expected)
|
||||
|
||||
def test_df_mod_zero_int(self):
|
||||
# GH#3590, modulo as ints
|
||||
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
|
||||
|
||||
result = df % 0
|
||||
expected = pd.DataFrame(np.nan, index=df.index, columns=df.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# numpy has a slightly different (wrong) treatment
|
||||
with np.errstate(all='ignore'):
|
||||
arr = df.values.astype('float64') % 0
|
||||
result2 = pd.DataFrame(arr, index=df.index, columns=df.columns)
|
||||
tm.assert_frame_equal(result2, expected)
|
||||
|
||||
def test_df_mod_zero_series_does_not_commute(self):
|
||||
# GH#3590, modulo as ints
|
||||
# not commutative with series
|
||||
df = pd.DataFrame(np.random.randn(10, 5))
|
||||
ser = df[0]
|
||||
res = ser % df
|
||||
res2 = df % ser
|
||||
assert not res.fillna(0).equals(res2.fillna(0))
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Division By Zero
|
||||
|
||||
def test_df_div_zero_df(self):
|
||||
# integer div, but deal with the 0's (GH#9144)
|
||||
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
|
||||
result = df / df
|
||||
|
||||
first = pd.Series([1.0, 1.0, 1.0, 1.0])
|
||||
second = pd.Series([np.nan, np.nan, np.nan, 1])
|
||||
expected = pd.DataFrame({'first': first, 'second': second})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_df_div_zero_array(self):
|
||||
# integer div, but deal with the 0's (GH#9144)
|
||||
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
|
||||
|
||||
first = pd.Series([1.0, 1.0, 1.0, 1.0])
|
||||
second = pd.Series([np.nan, np.nan, np.nan, 1])
|
||||
expected = pd.DataFrame({'first': first, 'second': second})
|
||||
|
||||
with np.errstate(all='ignore'):
|
||||
arr = df.values.astype('float') / df.values
|
||||
result = pd.DataFrame(arr, index=df.index,
|
||||
columns=df.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_df_div_zero_int(self):
|
||||
# integer div, but deal with the 0's (GH#9144)
|
||||
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
|
||||
|
||||
result = df / 0
|
||||
expected = pd.DataFrame(np.inf, index=df.index, columns=df.columns)
|
||||
expected.iloc[0:3, 1] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# numpy has a slightly different (wrong) treatment
|
||||
with np.errstate(all='ignore'):
|
||||
arr = df.values.astype('float64') / 0
|
||||
result2 = pd.DataFrame(arr, index=df.index,
|
||||
columns=df.columns)
|
||||
tm.assert_frame_equal(result2, expected)
|
||||
|
||||
def test_df_div_zero_series_does_not_commute(self):
|
||||
# integer div, but deal with the 0's (GH#9144)
|
||||
df = pd.DataFrame(np.random.randn(10, 5))
|
||||
ser = df[0]
|
||||
res = ser / df
|
||||
res2 = df / ser
|
||||
assert not res.fillna(0).equals(res2.fillna(0))
|
||||
|
||||
|
||||
class TestFrameArithmetic(object):
|
||||
|
||||
@pytest.mark.xfail(reason='GH#7996 datetime64 units not converted to nano')
|
||||
def test_df_sub_datetime64_not_ns(self):
|
||||
df = pd.DataFrame(pd.date_range('20130101', periods=3))
|
||||
dt64 = np.datetime64('2013-01-01')
|
||||
assert dt64.dtype == 'datetime64[D]'
|
||||
res = df - dt64
|
||||
expected = pd.DataFrame([pd.Timedelta(days=0), pd.Timedelta(days=1),
|
||||
pd.Timedelta(days=2)])
|
||||
tm.assert_frame_equal(res, expected)
|
||||
|
||||
@pytest.mark.parametrize('data', [
|
||||
[1, 2, 3],
|
||||
[1.1, 2.2, 3.3],
|
||||
[pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), pd.NaT],
|
||||
['x', 'y', 1]])
|
||||
@pytest.mark.parametrize('dtype', [None, object])
|
||||
def test_df_radd_str_invalid(self, dtype, data):
|
||||
df = pd.DataFrame(data, dtype=dtype)
|
||||
with pytest.raises(TypeError):
|
||||
'foo_' + df
|
||||
|
||||
@pytest.mark.parametrize('dtype', [None, object])
|
||||
def test_df_with_dtype_radd_int(self, dtype):
|
||||
df = pd.DataFrame([1, 2, 3], dtype=dtype)
|
||||
expected = pd.DataFrame([2, 3, 4], dtype=dtype)
|
||||
result = 1 + df
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = df + 1
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('dtype', [None, object])
|
||||
def test_df_with_dtype_radd_nan(self, dtype):
|
||||
df = pd.DataFrame([1, 2, 3], dtype=dtype)
|
||||
expected = pd.DataFrame([np.nan, np.nan, np.nan], dtype=dtype)
|
||||
result = np.nan + df
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = df + np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_df_radd_str(self):
|
||||
df = pd.DataFrame(['x', np.nan, 'x'])
|
||||
tm.assert_frame_equal('a' + df, pd.DataFrame(['ax', np.nan, 'ax']))
|
||||
tm.assert_frame_equal(df + 'a', pd.DataFrame(['xa', np.nan, 'xa']))
|
||||
|
||||
|
||||
class TestPeriodFrameArithmetic(object):
|
||||
|
||||
def test_ops_frame_period(self):
|
||||
# GH 13043
|
||||
df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'),
|
||||
pd.Period('2015-02', freq='M')],
|
||||
'B': [pd.Period('2014-01', freq='M'),
|
||||
pd.Period('2014-02', freq='M')]})
|
||||
assert df['A'].dtype == object
|
||||
assert df['B'].dtype == object
|
||||
|
||||
p = pd.Period('2015-03', freq='M')
|
||||
# dtype will be object because of original dtype
|
||||
exp = pd.DataFrame({'A': np.array([2, 1], dtype=object),
|
||||
'B': np.array([14, 13], dtype=object)})
|
||||
tm.assert_frame_equal(p - df, exp)
|
||||
tm.assert_frame_equal(df - p, -1 * exp)
|
||||
|
||||
df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'),
|
||||
pd.Period('2015-06', freq='M')],
|
||||
'B': [pd.Period('2015-05', freq='M'),
|
||||
pd.Period('2015-06', freq='M')]})
|
||||
assert df2['A'].dtype == object
|
||||
assert df2['B'].dtype == object
|
||||
|
||||
exp = pd.DataFrame({'A': np.array([4, 4], dtype=object),
|
||||
'B': np.array([16, 16], dtype=object)})
|
||||
tm.assert_frame_equal(df2 - df, exp)
|
||||
tm.assert_frame_equal(df - df2, -1 * exp)
|
||||
@@ -1,108 +0,0 @@
|
||||
# coding=utf-8
|
||||
|
||||
import numpy as np
|
||||
from pandas import (DataFrame, date_range, Timestamp, Series,
|
||||
to_datetime)
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from .common import TestData
|
||||
|
||||
|
||||
class TestFrameAsof(TestData):
|
||||
def setup_method(self, method):
|
||||
self.N = N = 50
|
||||
self.rng = date_range('1/1/1990', periods=N, freq='53s')
|
||||
self.df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
|
||||
index=self.rng)
|
||||
|
||||
def test_basic(self):
|
||||
df = self.df.copy()
|
||||
df.loc[15:30, 'A'] = np.nan
|
||||
dates = date_range('1/1/1990', periods=self.N * 3,
|
||||
freq='25s')
|
||||
|
||||
result = df.asof(dates)
|
||||
assert result.notna().all(1).all()
|
||||
lb = df.index[14]
|
||||
ub = df.index[30]
|
||||
|
||||
dates = list(dates)
|
||||
result = df.asof(dates)
|
||||
assert result.notna().all(1).all()
|
||||
|
||||
mask = (result.index >= lb) & (result.index < ub)
|
||||
rs = result[mask]
|
||||
assert (rs == 14).all(1).all()
|
||||
|
||||
def test_subset(self):
|
||||
N = 10
|
||||
rng = date_range('1/1/1990', periods=N, freq='53s')
|
||||
df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
|
||||
index=rng)
|
||||
df.loc[4:8, 'A'] = np.nan
|
||||
dates = date_range('1/1/1990', periods=N * 3,
|
||||
freq='25s')
|
||||
|
||||
# with a subset of A should be the same
|
||||
result = df.asof(dates, subset='A')
|
||||
expected = df.asof(dates)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# same with A/B
|
||||
result = df.asof(dates, subset=['A', 'B'])
|
||||
expected = df.asof(dates)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# B gives self.df.asof
|
||||
result = df.asof(dates, subset='B')
|
||||
expected = df.resample('25s', closed='right').ffill().reindex(dates)
|
||||
expected.iloc[20:] = 9
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_missing(self):
|
||||
# GH 15118
|
||||
# no match found - `where` value before earliest date in index
|
||||
N = 10
|
||||
rng = date_range('1/1/1990', periods=N, freq='53s')
|
||||
df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
|
||||
index=rng)
|
||||
result = df.asof('1989-12-31')
|
||||
|
||||
expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31'))
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = df.asof(to_datetime(['1989-12-31']))
|
||||
expected = DataFrame(index=to_datetime(['1989-12-31']),
|
||||
columns=['A', 'B'], dtype='float64')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_all_nans(self):
|
||||
# GH 15713
|
||||
# DataFrame is all nans
|
||||
result = DataFrame([np.nan]).asof([0])
|
||||
expected = DataFrame([np.nan])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# testing non-default indexes, multiple inputs
|
||||
dates = date_range('1/1/1990', periods=self.N * 3, freq='25s')
|
||||
result = DataFrame(np.nan, index=self.rng, columns=['A']).asof(dates)
|
||||
expected = DataFrame(np.nan, index=dates, columns=['A'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# testing multiple columns
|
||||
dates = date_range('1/1/1990', periods=self.N * 3, freq='25s')
|
||||
result = DataFrame(np.nan, index=self.rng,
|
||||
columns=['A', 'B', 'C']).asof(dates)
|
||||
expected = DataFrame(np.nan, index=dates, columns=['A', 'B', 'C'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# testing scalar input
|
||||
result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof([3])
|
||||
expected = DataFrame(np.nan, index=[3], columns=['A', 'B'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof(3)
|
||||
expected = Series(np.nan, index=['A', 'B'], name=3)
|
||||
tm.assert_series_equal(result, expected)
|
||||
-1181
File diff suppressed because it is too large
Load Diff
@@ -1,560 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
import itertools
|
||||
|
||||
from numpy import nan
|
||||
import numpy as np
|
||||
|
||||
from pandas import (DataFrame, Series, Timestamp, date_range, compat,
|
||||
option_context)
|
||||
from pandas.compat import StringIO
|
||||
import pandas as pd
|
||||
|
||||
from pandas.util.testing import (assert_almost_equal,
|
||||
assert_series_equal,
|
||||
assert_frame_equal)
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
|
||||
# Segregated collection of methods that require the BlockManager internal data
|
||||
# structure
|
||||
|
||||
|
||||
class TestDataFrameBlockInternals(TestData):
|
||||
|
||||
def test_cast_internals(self):
|
||||
casted = DataFrame(self.frame._data, dtype=int)
|
||||
expected = DataFrame(self.frame._series, dtype=int)
|
||||
assert_frame_equal(casted, expected)
|
||||
|
||||
casted = DataFrame(self.frame._data, dtype=np.int32)
|
||||
expected = DataFrame(self.frame._series, dtype=np.int32)
|
||||
assert_frame_equal(casted, expected)
|
||||
|
||||
def test_consolidate(self):
|
||||
self.frame['E'] = 7.
|
||||
consolidated = self.frame._consolidate()
|
||||
assert len(consolidated._data.blocks) == 1
|
||||
|
||||
# Ensure copy, do I want this?
|
||||
recons = consolidated._consolidate()
|
||||
assert recons is not consolidated
|
||||
tm.assert_frame_equal(recons, consolidated)
|
||||
|
||||
self.frame['F'] = 8.
|
||||
assert len(self.frame._data.blocks) == 3
|
||||
|
||||
self.frame._consolidate(inplace=True)
|
||||
assert len(self.frame._data.blocks) == 1
|
||||
|
||||
def test_consolidate_deprecation(self):
|
||||
self.frame['E'] = 7
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
self.frame.consolidate()
|
||||
|
||||
def test_consolidate_inplace(self):
|
||||
frame = self.frame.copy() # noqa
|
||||
|
||||
# triggers in-place consolidation
|
||||
for letter in range(ord('A'), ord('Z')):
|
||||
self.frame[chr(letter)] = chr(letter)
|
||||
|
||||
def test_values_consolidate(self):
|
||||
self.frame['E'] = 7.
|
||||
assert not self.frame._data.is_consolidated()
|
||||
_ = self.frame.values # noqa
|
||||
assert self.frame._data.is_consolidated()
|
||||
|
||||
def test_modify_values(self):
|
||||
self.frame.values[5] = 5
|
||||
assert (self.frame.values[5] == 5).all()
|
||||
|
||||
# unconsolidated
|
||||
self.frame['E'] = 7.
|
||||
self.frame.values[6] = 6
|
||||
assert (self.frame.values[6] == 6).all()
|
||||
|
||||
def test_boolean_set_uncons(self):
|
||||
self.frame['E'] = 7.
|
||||
|
||||
expected = self.frame.values.copy()
|
||||
expected[expected > 1] = 2
|
||||
|
||||
self.frame[self.frame > 1] = 2
|
||||
assert_almost_equal(expected, self.frame.values)
|
||||
|
||||
def test_values_numeric_cols(self):
|
||||
self.frame['foo'] = 'bar'
|
||||
|
||||
values = self.frame[['A', 'B', 'C', 'D']].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
def test_values_lcd(self):
|
||||
|
||||
# mixed lcd
|
||||
values = self.mixed_float[['A', 'B', 'C', 'D']].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
values = self.mixed_float[['A', 'B', 'C']].values
|
||||
assert values.dtype == np.float32
|
||||
|
||||
values = self.mixed_float[['C']].values
|
||||
assert values.dtype == np.float16
|
||||
|
||||
# GH 10364
|
||||
# B uint64 forces float because there are other signed int types
|
||||
values = self.mixed_int[['A', 'B', 'C', 'D']].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
values = self.mixed_int[['A', 'D']].values
|
||||
assert values.dtype == np.int64
|
||||
|
||||
# B uint64 forces float because there are other signed int types
|
||||
values = self.mixed_int[['A', 'B', 'C']].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
# as B and C are both unsigned, no forcing to float is needed
|
||||
values = self.mixed_int[['B', 'C']].values
|
||||
assert values.dtype == np.uint64
|
||||
|
||||
values = self.mixed_int[['A', 'C']].values
|
||||
assert values.dtype == np.int32
|
||||
|
||||
values = self.mixed_int[['C', 'D']].values
|
||||
assert values.dtype == np.int64
|
||||
|
||||
values = self.mixed_int[['A']].values
|
||||
assert values.dtype == np.int32
|
||||
|
||||
values = self.mixed_int[['C']].values
|
||||
assert values.dtype == np.uint8
|
||||
|
||||
def test_constructor_with_convert(self):
|
||||
# this is actually mostly a test of lib.maybe_convert_objects
|
||||
# #2845
|
||||
df = DataFrame({'A': [2 ** 63 - 1]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([2 ** 63 - 1], np.int64), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [2 ** 63]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([2 ** 63], np.uint64), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [datetime(2005, 1, 1), True]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([datetime(2005, 1, 1), True], np.object_),
|
||||
name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [None, 1]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([np.nan, 1], np.float_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0, 2]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([1.0, 2], np.float_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0 + 2.0j, 3]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0 + 2.0j, 3.0]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0 + 2.0j, True]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0, None]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([1.0, np.nan], np.float_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0 + 2.0j, None]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray(
|
||||
[1.0 + 2.0j, np.nan], np.complex_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [2.0, 1, True, None]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray(
|
||||
[2.0, 1, True, None], np.object_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [2.0, 1, datetime(2006, 1, 1), None]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([2.0, 1, datetime(2006, 1, 1),
|
||||
None], np.object_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_construction_with_mixed(self):
|
||||
# test construction edge cases with mixed types
|
||||
|
||||
# f7u12, this does not work without extensive workaround
|
||||
data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
|
||||
[datetime(2000, 1, 2), datetime(2000, 1, 3),
|
||||
datetime(2000, 1, 1)]]
|
||||
df = DataFrame(data)
|
||||
|
||||
# check dtypes
|
||||
result = df.get_dtype_counts().sort_values()
|
||||
expected = Series({'datetime64[ns]': 3})
|
||||
|
||||
# mixed-type frames
|
||||
self.mixed_frame['datetime'] = datetime.now()
|
||||
self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)
|
||||
assert self.mixed_frame['datetime'].dtype == 'M8[ns]'
|
||||
assert self.mixed_frame['timedelta'].dtype == 'm8[ns]'
|
||||
result = self.mixed_frame.get_dtype_counts().sort_values()
|
||||
expected = Series({'float64': 4,
|
||||
'object': 1,
|
||||
'datetime64[ns]': 1,
|
||||
'timedelta64[ns]': 1}).sort_values()
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_construction_with_conversions(self):
|
||||
|
||||
# convert from a numpy array of non-ns timedelta64
|
||||
arr = np.array([1, 2, 3], dtype='timedelta64[s]')
|
||||
df = DataFrame(index=range(3))
|
||||
df['A'] = arr
|
||||
expected = DataFrame({'A': pd.timedelta_range('00:00:01', periods=3,
|
||||
freq='s')},
|
||||
index=range(3))
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
expected = DataFrame({
|
||||
'dt1': Timestamp('20130101'),
|
||||
'dt2': date_range('20130101', periods=3),
|
||||
# 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
|
||||
}, index=range(3))
|
||||
|
||||
df = DataFrame(index=range(3))
|
||||
df['dt1'] = np.datetime64('2013-01-01')
|
||||
df['dt2'] = np.array(['2013-01-01', '2013-01-02', '2013-01-03'],
|
||||
dtype='datetime64[D]')
|
||||
|
||||
# df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01
|
||||
# 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')
|
||||
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_constructor_compound_dtypes(self):
|
||||
# GH 5191
|
||||
# compound dtypes should raise not-implementederror
|
||||
|
||||
def f(dtype):
|
||||
data = list(itertools.repeat((datetime(2001, 1, 1),
|
||||
"aa", 20), 9))
|
||||
return DataFrame(data=data,
|
||||
columns=["A", "B", "C"],
|
||||
dtype=dtype)
|
||||
|
||||
pytest.raises(NotImplementedError, f,
|
||||
[("A", "datetime64[h]"),
|
||||
("B", "str"),
|
||||
("C", "int32")])
|
||||
|
||||
# these work (though results may be unexpected)
|
||||
f('int64')
|
||||
f('float64')
|
||||
|
||||
# 10822
|
||||
# invalid error message on dt inference
|
||||
if not compat.is_platform_windows():
|
||||
f('M8[ns]')
|
||||
|
||||
def test_equals_different_blocks(self):
|
||||
# GH 9330
|
||||
df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2],
|
||||
"C": ["w", "z"]})
|
||||
df1 = df0.reset_index()[["A", "B", "C"]]
|
||||
# this assert verifies that the above operations have
|
||||
# induced a block rearrangement
|
||||
assert (df0._data.blocks[0].dtype != df1._data.blocks[0].dtype)
|
||||
|
||||
# do the real tests
|
||||
assert_frame_equal(df0, df1)
|
||||
assert df0.equals(df1)
|
||||
assert df1.equals(df0)
|
||||
|
||||
def test_copy_blocks(self):
|
||||
# API/ENH 9607
|
||||
df = DataFrame(self.frame, copy=True)
|
||||
column = df.columns[0]
|
||||
|
||||
# use the default copy=True, change a column
|
||||
|
||||
# deprecated 0.21.0
|
||||
with tm.assert_produces_warning(FutureWarning,
|
||||
check_stacklevel=False):
|
||||
blocks = df.as_blocks()
|
||||
for dtype, _df in blocks.items():
|
||||
if column in _df:
|
||||
_df.loc[:, column] = _df[column] + 1
|
||||
|
||||
# make sure we did not change the original DataFrame
|
||||
assert not _df[column].equals(df[column])
|
||||
|
||||
def test_no_copy_blocks(self):
|
||||
# API/ENH 9607
|
||||
df = DataFrame(self.frame, copy=True)
|
||||
column = df.columns[0]
|
||||
|
||||
# use the copy=False, change a column
|
||||
|
||||
# deprecated 0.21.0
|
||||
with tm.assert_produces_warning(FutureWarning,
|
||||
check_stacklevel=False):
|
||||
blocks = df.as_blocks(copy=False)
|
||||
for dtype, _df in blocks.items():
|
||||
if column in _df:
|
||||
_df.loc[:, column] = _df[column] + 1
|
||||
|
||||
# make sure we did change the original DataFrame
|
||||
assert _df[column].equals(df[column])
|
||||
|
||||
def test_copy(self):
|
||||
cop = self.frame.copy()
|
||||
cop['E'] = cop['A']
|
||||
assert 'E' not in self.frame
|
||||
|
||||
# copy objects
|
||||
copy = self.mixed_frame.copy()
|
||||
assert copy._data is not self.mixed_frame._data
|
||||
|
||||
def test_pickle(self):
|
||||
unpickled = tm.round_trip_pickle(self.mixed_frame)
|
||||
assert_frame_equal(self.mixed_frame, unpickled)
|
||||
|
||||
# buglet
|
||||
self.mixed_frame._data.ndim
|
||||
|
||||
# empty
|
||||
unpickled = tm.round_trip_pickle(self.empty)
|
||||
repr(unpickled)
|
||||
|
||||
# tz frame
|
||||
unpickled = tm.round_trip_pickle(self.tzframe)
|
||||
assert_frame_equal(self.tzframe, unpickled)
|
||||
|
||||
def test_consolidate_datetime64(self):
|
||||
# numpy vstack bug
|
||||
|
||||
data = """\
|
||||
starting,ending,measure
|
||||
2012-06-21 00:00,2012-06-23 07:00,77
|
||||
2012-06-23 07:00,2012-06-23 16:30,65
|
||||
2012-06-23 16:30,2012-06-25 08:00,77
|
||||
2012-06-25 08:00,2012-06-26 12:00,0
|
||||
2012-06-26 12:00,2012-06-27 08:00,77
|
||||
"""
|
||||
df = pd.read_csv(StringIO(data), parse_dates=[0, 1])
|
||||
|
||||
ser_starting = df.starting
|
||||
ser_starting.index = ser_starting.values
|
||||
ser_starting = ser_starting.tz_localize('US/Eastern')
|
||||
ser_starting = ser_starting.tz_convert('UTC')
|
||||
ser_starting.index.name = 'starting'
|
||||
|
||||
ser_ending = df.ending
|
||||
ser_ending.index = ser_ending.values
|
||||
ser_ending = ser_ending.tz_localize('US/Eastern')
|
||||
ser_ending = ser_ending.tz_convert('UTC')
|
||||
ser_ending.index.name = 'ending'
|
||||
|
||||
df.starting = ser_starting.index
|
||||
df.ending = ser_ending.index
|
||||
|
||||
tm.assert_index_equal(pd.DatetimeIndex(
|
||||
df.starting), ser_starting.index)
|
||||
tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index)
|
||||
|
||||
def test_is_mixed_type(self):
|
||||
assert not self.frame._is_mixed_type
|
||||
assert self.mixed_frame._is_mixed_type
|
||||
|
||||
def test_get_numeric_data(self):
|
||||
# TODO(wesm): unused?
|
||||
intname = np.dtype(np.int_).name # noqa
|
||||
floatname = np.dtype(np.float_).name # noqa
|
||||
|
||||
datetime64name = np.dtype('M8[ns]').name
|
||||
objectname = np.dtype(np.object_).name
|
||||
|
||||
df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
|
||||
'f': Timestamp('20010102')},
|
||||
index=np.arange(10))
|
||||
result = df.get_dtype_counts()
|
||||
expected = Series({'int64': 1, 'float64': 1,
|
||||
datetime64name: 1, objectname: 1})
|
||||
result = result.sort_index()
|
||||
expected = expected.sort_index()
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
|
||||
'd': np.array([1.] * 10, dtype='float32'),
|
||||
'e': np.array([1] * 10, dtype='int32'),
|
||||
'f': np.array([1] * 10, dtype='int16'),
|
||||
'g': Timestamp('20010102')},
|
||||
index=np.arange(10))
|
||||
|
||||
result = df._get_numeric_data()
|
||||
expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
only_obj = df.loc[:, ['c', 'g']]
|
||||
result = only_obj._get_numeric_data()
|
||||
expected = df.loc[:, []]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame.from_dict(
|
||||
{'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]})
|
||||
result = df._get_numeric_data()
|
||||
expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df = result.copy()
|
||||
result = df._get_numeric_data()
|
||||
expected = df
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_convert_objects(self):
|
||||
|
||||
oops = self.mixed_frame.T.T
|
||||
converted = oops._convert(datetime=True)
|
||||
assert_frame_equal(converted, self.mixed_frame)
|
||||
assert converted['A'].dtype == np.float64
|
||||
|
||||
# force numeric conversion
|
||||
self.mixed_frame['H'] = '1.'
|
||||
self.mixed_frame['I'] = '1'
|
||||
|
||||
# add in some items that will be nan
|
||||
length = len(self.mixed_frame)
|
||||
self.mixed_frame['J'] = '1.'
|
||||
self.mixed_frame['K'] = '1'
|
||||
self.mixed_frame.loc[0:5, ['J', 'K']] = 'garbled'
|
||||
converted = self.mixed_frame._convert(datetime=True, numeric=True)
|
||||
assert converted['H'].dtype == 'float64'
|
||||
assert converted['I'].dtype == 'int64'
|
||||
assert converted['J'].dtype == 'float64'
|
||||
assert converted['K'].dtype == 'float64'
|
||||
assert len(converted['J'].dropna()) == length - 5
|
||||
assert len(converted['K'].dropna()) == length - 5
|
||||
|
||||
# via astype
|
||||
converted = self.mixed_frame.copy()
|
||||
converted['H'] = converted['H'].astype('float64')
|
||||
converted['I'] = converted['I'].astype('int64')
|
||||
assert converted['H'].dtype == 'float64'
|
||||
assert converted['I'].dtype == 'int64'
|
||||
|
||||
# via astype, but errors
|
||||
converted = self.mixed_frame.copy()
|
||||
with tm.assert_raises_regex(ValueError, 'invalid literal'):
|
||||
converted['H'].astype('int32')
|
||||
|
||||
# mixed in a single column
|
||||
df = DataFrame(dict(s=Series([1, 'na', 3, 4])))
|
||||
result = df._convert(datetime=True, numeric=True)
|
||||
expected = DataFrame(dict(s=Series([1, np.nan, 3, 4])))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_convert_objects_no_conversion(self):
|
||||
mixed1 = DataFrame(
|
||||
{'a': [1, 2, 3], 'b': [4.0, 5, 6], 'c': ['x', 'y', 'z']})
|
||||
mixed2 = mixed1._convert(datetime=True)
|
||||
assert_frame_equal(mixed1, mixed2)
|
||||
|
||||
def test_infer_objects(self):
|
||||
# GH 11221
|
||||
df = DataFrame({'a': ['a', 1, 2, 3],
|
||||
'b': ['b', 2.0, 3.0, 4.1],
|
||||
'c': ['c', datetime(2016, 1, 1),
|
||||
datetime(2016, 1, 2),
|
||||
datetime(2016, 1, 3)],
|
||||
'd': [1, 2, 3, 'd']},
|
||||
columns=['a', 'b', 'c', 'd'])
|
||||
df = df.iloc[1:].infer_objects()
|
||||
|
||||
assert df['a'].dtype == 'int64'
|
||||
assert df['b'].dtype == 'float64'
|
||||
assert df['c'].dtype == 'M8[ns]'
|
||||
assert df['d'].dtype == 'object'
|
||||
|
||||
expected = DataFrame({'a': [1, 2, 3],
|
||||
'b': [2.0, 3.0, 4.1],
|
||||
'c': [datetime(2016, 1, 1),
|
||||
datetime(2016, 1, 2),
|
||||
datetime(2016, 1, 3)],
|
||||
'd': [2, 3, 'd']},
|
||||
columns=['a', 'b', 'c', 'd'])
|
||||
# reconstruct frame to verify inference is same
|
||||
tm.assert_frame_equal(df.reset_index(drop=True), expected)
|
||||
|
||||
def test_stale_cached_series_bug_473(self):
|
||||
|
||||
# this is chained, but ok
|
||||
with option_context('chained_assignment', None):
|
||||
Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'),
|
||||
columns=('e', 'f', 'g', 'h'))
|
||||
repr(Y)
|
||||
Y['e'] = Y['e'].astype('object')
|
||||
Y['g']['c'] = np.NaN
|
||||
repr(Y)
|
||||
result = Y.sum() # noqa
|
||||
exp = Y['g'].sum() # noqa
|
||||
assert pd.isna(Y['g']['c'])
|
||||
|
||||
def test_get_X_columns(self):
|
||||
# numeric and object columns
|
||||
|
||||
df = DataFrame({'a': [1, 2, 3],
|
||||
'b': [True, False, True],
|
||||
'c': ['foo', 'bar', 'baz'],
|
||||
'd': [None, None, None],
|
||||
'e': [3.14, 0.577, 2.773]})
|
||||
|
||||
tm.assert_index_equal(df._get_numeric_data().columns,
|
||||
pd.Index(['a', 'b', 'e']))
|
||||
|
||||
def test_strange_column_corruption_issue(self):
|
||||
# (wesm) Unclear how exactly this is related to internal matters
|
||||
df = DataFrame(index=[0, 1])
|
||||
df[0] = nan
|
||||
wasCol = {}
|
||||
# uncommenting these makes the results match
|
||||
# for col in xrange(100, 200):
|
||||
# wasCol[col] = 1
|
||||
# df[col] = nan
|
||||
|
||||
for i, dt in enumerate(df.index):
|
||||
for col in range(100, 200):
|
||||
if col not in wasCol:
|
||||
wasCol[col] = 1
|
||||
df[col] = nan
|
||||
df[col][dt] = i
|
||||
|
||||
myid = 100
|
||||
|
||||
first = len(df.loc[pd.isna(df[myid]), [myid]])
|
||||
second = len(df.loc[pd.isna(df[myid]), [myid]])
|
||||
assert first == second == 0
|
||||
@@ -1,788 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
from numpy import nan
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from pandas import DataFrame, Index, Series, Timestamp, date_range
|
||||
from pandas.compat import lrange
|
||||
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestDataFrameConcatCommon(TestData):
|
||||
|
||||
def test_concat_multiple_frames_dtypes(self):
|
||||
|
||||
# GH 2759
|
||||
A = DataFrame(data=np.ones((10, 2)), columns=[
|
||||
'foo', 'bar'], dtype=np.float64)
|
||||
B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
|
||||
results = pd.concat((A, B), axis=1).get_dtype_counts()
|
||||
expected = Series(dict(float64=2, float32=2))
|
||||
assert_series_equal(results, expected)
|
||||
|
||||
def test_concat_multiple_tzs(self):
|
||||
# GH 12467
|
||||
# combining datetime tz-aware and naive DataFrames
|
||||
ts1 = Timestamp('2015-01-01', tz=None)
|
||||
ts2 = Timestamp('2015-01-01', tz='UTC')
|
||||
ts3 = Timestamp('2015-01-01', tz='EST')
|
||||
|
||||
df1 = DataFrame(dict(time=[ts1]))
|
||||
df2 = DataFrame(dict(time=[ts2]))
|
||||
df3 = DataFrame(dict(time=[ts3]))
|
||||
|
||||
results = pd.concat([df1, df2]).reset_index(drop=True)
|
||||
expected = DataFrame(dict(time=[ts1, ts2]), dtype=object)
|
||||
assert_frame_equal(results, expected)
|
||||
|
||||
results = pd.concat([df1, df3]).reset_index(drop=True)
|
||||
expected = DataFrame(dict(time=[ts1, ts3]), dtype=object)
|
||||
assert_frame_equal(results, expected)
|
||||
|
||||
results = pd.concat([df2, df3]).reset_index(drop=True)
|
||||
expected = DataFrame(dict(time=[ts2, ts3]))
|
||||
assert_frame_equal(results, expected)
|
||||
|
||||
def test_concat_tuple_keys(self):
|
||||
# GH 14438
|
||||
df1 = pd.DataFrame(np.ones((2, 2)), columns=list('AB'))
|
||||
df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list('AB'))
|
||||
results = pd.concat((df1, df2), keys=[('bee', 'bah'), ('bee', 'boo')])
|
||||
expected = pd.DataFrame(
|
||||
{'A': {('bee', 'bah', 0): 1.0,
|
||||
('bee', 'bah', 1): 1.0,
|
||||
('bee', 'boo', 0): 2.0,
|
||||
('bee', 'boo', 1): 2.0,
|
||||
('bee', 'boo', 2): 2.0},
|
||||
'B': {('bee', 'bah', 0): 1.0,
|
||||
('bee', 'bah', 1): 1.0,
|
||||
('bee', 'boo', 0): 2.0,
|
||||
('bee', 'boo', 1): 2.0,
|
||||
('bee', 'boo', 2): 2.0}})
|
||||
assert_frame_equal(results, expected)
|
||||
|
||||
def test_append_series_dict(self):
|
||||
df = DataFrame(np.random.randn(5, 4),
|
||||
columns=['foo', 'bar', 'baz', 'qux'])
|
||||
|
||||
series = df.loc[4]
|
||||
with tm.assert_raises_regex(ValueError,
|
||||
'Indexes have overlapping values'):
|
||||
df.append(series, verify_integrity=True)
|
||||
series.name = None
|
||||
with tm.assert_raises_regex(TypeError,
|
||||
'Can only append a Series if '
|
||||
'ignore_index=True'):
|
||||
df.append(series, verify_integrity=True)
|
||||
|
||||
result = df.append(series[::-1], ignore_index=True)
|
||||
expected = df.append(DataFrame({0: series[::-1]}, index=df.columns).T,
|
||||
ignore_index=True)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# dict
|
||||
result = df.append(series.to_dict(), ignore_index=True)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.append(series[::-1][:3], ignore_index=True)
|
||||
expected = df.append(DataFrame({0: series[::-1][:3]}).T,
|
||||
ignore_index=True, sort=True)
|
||||
assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
# can append when name set
|
||||
row = df.loc[4]
|
||||
row.name = 5
|
||||
result = df.append(row)
|
||||
expected = df.append(df[-1:], ignore_index=True)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_append_list_of_series_dicts(self):
|
||||
df = DataFrame(np.random.randn(5, 4),
|
||||
columns=['foo', 'bar', 'baz', 'qux'])
|
||||
|
||||
dicts = [x.to_dict() for idx, x in df.iterrows()]
|
||||
|
||||
result = df.append(dicts, ignore_index=True)
|
||||
expected = df.append(df, ignore_index=True)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# different columns
|
||||
dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
|
||||
{'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
|
||||
result = df.append(dicts, ignore_index=True, sort=True)
|
||||
expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_append_empty_dataframe(self):
|
||||
|
||||
# Empty df append empty df
|
||||
df1 = DataFrame([])
|
||||
df2 = DataFrame([])
|
||||
result = df1.append(df2)
|
||||
expected = df1.copy()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Non-empty df append empty df
|
||||
df1 = DataFrame(np.random.randn(5, 2))
|
||||
df2 = DataFrame()
|
||||
result = df1.append(df2)
|
||||
expected = df1.copy()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Empty df with columns append empty df
|
||||
df1 = DataFrame(columns=['bar', 'foo'])
|
||||
df2 = DataFrame()
|
||||
result = df1.append(df2)
|
||||
expected = df1.copy()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Non-Empty df with columns append empty df
|
||||
df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo'])
|
||||
df2 = DataFrame()
|
||||
result = df1.append(df2)
|
||||
expected = df1.copy()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_append_dtypes(self):
|
||||
|
||||
# GH 5754
|
||||
# row appends of different dtypes (so need to do by-item)
|
||||
# can sometimes infer the correct type
|
||||
|
||||
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(5))
|
||||
df2 = DataFrame()
|
||||
result = df1.append(df2)
|
||||
expected = df1.copy()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
|
||||
df2 = DataFrame({'bar': 'foo'}, index=lrange(1, 2))
|
||||
result = df1.append(df2)
|
||||
expected = DataFrame({'bar': [Timestamp('20130101'), 'foo']})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
|
||||
df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2))
|
||||
result = df1.append(df2)
|
||||
expected = DataFrame(
|
||||
{'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
|
||||
df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2), dtype=object)
|
||||
result = df1.append(df2)
|
||||
expected = DataFrame(
|
||||
{'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({'bar': np.nan}, index=lrange(1))
|
||||
df2 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1, 2))
|
||||
result = df1.append(df2)
|
||||
expected = DataFrame(
|
||||
{'bar': Series([np.nan, Timestamp('20130101')], dtype='M8[ns]')})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
|
||||
df2 = DataFrame({'bar': 1}, index=lrange(1, 2), dtype=object)
|
||||
result = df1.append(df2)
|
||||
expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_update(self):
|
||||
df = DataFrame([[1.5, nan, 3.],
|
||||
[1.5, nan, 3.],
|
||||
[1.5, nan, 3],
|
||||
[1.5, nan, 3]])
|
||||
|
||||
other = DataFrame([[3.6, 2., np.nan],
|
||||
[np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other)
|
||||
|
||||
expected = DataFrame([[1.5, nan, 3],
|
||||
[3.6, 2, 3],
|
||||
[1.5, nan, 3],
|
||||
[1.5, nan, 7.]])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_dtypes(self):
|
||||
|
||||
# gh 3016
|
||||
df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
|
||||
columns=['A', 'B', 'bool1', 'bool2'])
|
||||
|
||||
other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
|
||||
df.update(other)
|
||||
|
||||
expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
|
||||
columns=['A', 'B', 'bool1', 'bool2'])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_nooverwrite(self):
|
||||
df = DataFrame([[1.5, nan, 3.],
|
||||
[1.5, nan, 3.],
|
||||
[1.5, nan, 3],
|
||||
[1.5, nan, 3]])
|
||||
|
||||
other = DataFrame([[3.6, 2., np.nan],
|
||||
[np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other, overwrite=False)
|
||||
|
||||
expected = DataFrame([[1.5, nan, 3],
|
||||
[1.5, 2, 3],
|
||||
[1.5, nan, 3],
|
||||
[1.5, nan, 3.]])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_filtered(self):
|
||||
df = DataFrame([[1.5, nan, 3.],
|
||||
[1.5, nan, 3.],
|
||||
[1.5, nan, 3],
|
||||
[1.5, nan, 3]])
|
||||
|
||||
other = DataFrame([[3.6, 2., np.nan],
|
||||
[np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other, filter_func=lambda x: x > 2)
|
||||
|
||||
expected = DataFrame([[1.5, nan, 3],
|
||||
[1.5, nan, 3],
|
||||
[1.5, nan, 3],
|
||||
[1.5, nan, 7.]])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_raise(self):
|
||||
df = DataFrame([[1.5, 1, 3.],
|
||||
[1.5, nan, 3.],
|
||||
[1.5, nan, 3],
|
||||
[1.5, nan, 3]])
|
||||
|
||||
other = DataFrame([[2., nan],
|
||||
[nan, 7]], index=[1, 3], columns=[1, 2])
|
||||
with tm.assert_raises_regex(ValueError, "Data overlaps"):
|
||||
df.update(other, raise_conflict=True)
|
||||
|
||||
def test_update_from_non_df(self):
|
||||
d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])}
|
||||
df = DataFrame(d)
|
||||
|
||||
d['a'] = Series([5, 6, 7, 8])
|
||||
df.update(d)
|
||||
|
||||
expected = DataFrame(d)
|
||||
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}
|
||||
df = DataFrame(d)
|
||||
|
||||
d['a'] = [5, 6, 7, 8]
|
||||
df.update(d)
|
||||
|
||||
expected = DataFrame(d)
|
||||
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_join_str_datetime(self):
|
||||
str_dates = ['20120209', '20120222']
|
||||
dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
|
||||
|
||||
A = DataFrame(str_dates, index=lrange(2), columns=['aa'])
|
||||
C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)
|
||||
|
||||
tst = A.join(C, on='aa')
|
||||
|
||||
assert len(tst.columns) == 3
|
||||
|
||||
def test_join_multiindex_leftright(self):
|
||||
# GH 10741
|
||||
df1 = (pd.DataFrame([['a', 'x', 0.471780], ['a', 'y', 0.774908],
|
||||
['a', 'z', 0.563634], ['b', 'x', -0.353756],
|
||||
['b', 'y', 0.368062], ['b', 'z', -1.721840],
|
||||
['c', 'x', 1], ['c', 'y', 2], ['c', 'z', 3]],
|
||||
columns=['first', 'second', 'value1'])
|
||||
.set_index(['first', 'second']))
|
||||
|
||||
df2 = (pd.DataFrame([['a', 10], ['b', 20]],
|
||||
columns=['first', 'value2'])
|
||||
.set_index(['first']))
|
||||
|
||||
exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
|
||||
[-0.353756, 20], [0.368062, 20],
|
||||
[-1.721840, 20],
|
||||
[1.000000, np.nan], [2.000000, np.nan],
|
||||
[3.000000, np.nan]],
|
||||
index=df1.index, columns=['value1', 'value2'])
|
||||
|
||||
# these must be the same results (but columns are flipped)
|
||||
assert_frame_equal(df1.join(df2, how='left'), exp)
|
||||
assert_frame_equal(df2.join(df1, how='right'),
|
||||
exp[['value2', 'value1']])
|
||||
|
||||
exp_idx = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y', 'z']],
|
||||
names=['first', 'second'])
|
||||
exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
|
||||
[-0.353756, 20], [0.368062, 20], [-1.721840, 20]],
|
||||
index=exp_idx, columns=['value1', 'value2'])
|
||||
|
||||
assert_frame_equal(df1.join(df2, how='right'), exp)
|
||||
assert_frame_equal(df2.join(df1, how='left'),
|
||||
exp[['value2', 'value1']])
|
||||
|
||||
def test_concat_named_keys(self):
|
||||
# GH 14252
|
||||
df = pd.DataFrame({'foo': [1, 2], 'bar': [0.1, 0.2]})
|
||||
index = Index(['a', 'b'], name='baz')
|
||||
concatted_named_from_keys = pd.concat([df, df], keys=index)
|
||||
expected_named = pd.DataFrame(
|
||||
{'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
|
||||
index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
|
||||
names=['baz', None]))
|
||||
assert_frame_equal(concatted_named_from_keys, expected_named)
|
||||
|
||||
index_no_name = Index(['a', 'b'], name=None)
|
||||
concatted_named_from_names = pd.concat(
|
||||
[df, df], keys=index_no_name, names=['baz'])
|
||||
assert_frame_equal(concatted_named_from_names, expected_named)
|
||||
|
||||
concatted_unnamed = pd.concat([df, df], keys=index_no_name)
|
||||
expected_unnamed = pd.DataFrame(
|
||||
{'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
|
||||
index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
|
||||
names=[None, None]))
|
||||
assert_frame_equal(concatted_unnamed, expected_unnamed)
|
||||
|
||||
def test_concat_axis_parameter(self):
|
||||
# GH 14369
|
||||
df1 = pd.DataFrame({'A': [0.1, 0.2]}, index=range(2))
|
||||
df2 = pd.DataFrame({'A': [0.3, 0.4]}, index=range(2))
|
||||
|
||||
# Index/row/0 DataFrame
|
||||
expected_index = pd.DataFrame(
|
||||
{'A': [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
|
||||
|
||||
concatted_index = pd.concat([df1, df2], axis='index')
|
||||
assert_frame_equal(concatted_index, expected_index)
|
||||
|
||||
concatted_row = pd.concat([df1, df2], axis='rows')
|
||||
assert_frame_equal(concatted_row, expected_index)
|
||||
|
||||
concatted_0 = pd.concat([df1, df2], axis=0)
|
||||
assert_frame_equal(concatted_0, expected_index)
|
||||
|
||||
# Columns/1 DataFrame
|
||||
expected_columns = pd.DataFrame(
|
||||
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=['A', 'A'])
|
||||
|
||||
concatted_columns = pd.concat([df1, df2], axis='columns')
|
||||
assert_frame_equal(concatted_columns, expected_columns)
|
||||
|
||||
concatted_1 = pd.concat([df1, df2], axis=1)
|
||||
assert_frame_equal(concatted_1, expected_columns)
|
||||
|
||||
series1 = pd.Series([0.1, 0.2])
|
||||
series2 = pd.Series([0.3, 0.4])
|
||||
|
||||
# Index/row/0 Series
|
||||
expected_index_series = pd.Series(
|
||||
[0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
|
||||
|
||||
concatted_index_series = pd.concat([series1, series2], axis='index')
|
||||
assert_series_equal(concatted_index_series, expected_index_series)
|
||||
|
||||
concatted_row_series = pd.concat([series1, series2], axis='rows')
|
||||
assert_series_equal(concatted_row_series, expected_index_series)
|
||||
|
||||
concatted_0_series = pd.concat([series1, series2], axis=0)
|
||||
assert_series_equal(concatted_0_series, expected_index_series)
|
||||
|
||||
# Columns/1 Series
|
||||
expected_columns_series = pd.DataFrame(
|
||||
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1])
|
||||
|
||||
concatted_columns_series = pd.concat(
|
||||
[series1, series2], axis='columns')
|
||||
assert_frame_equal(concatted_columns_series, expected_columns_series)
|
||||
|
||||
concatted_1_series = pd.concat([series1, series2], axis=1)
|
||||
assert_frame_equal(concatted_1_series, expected_columns_series)
|
||||
|
||||
# Testing ValueError
|
||||
with tm.assert_raises_regex(ValueError, 'No axis named'):
|
||||
pd.concat([series1, series2], axis='something')
|
||||
|
||||
def test_concat_numerical_names(self):
|
||||
# #15262 # #12223
|
||||
df = pd.DataFrame({'col': range(9)},
|
||||
dtype='int32',
|
||||
index=(pd.MultiIndex
|
||||
.from_product([['A0', 'A1', 'A2'],
|
||||
['B0', 'B1', 'B2']],
|
||||
names=[1, 2])))
|
||||
result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
|
||||
expected = pd.DataFrame({'col': [0, 1, 7, 8]},
|
||||
dtype='int32',
|
||||
index=pd.MultiIndex.from_tuples([('A0', 'B0'),
|
||||
('A0', 'B1'),
|
||||
('A2', 'B1'),
|
||||
('A2', 'B2')],
|
||||
names=[1, 2]))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameCombineFirst(TestData):
|
||||
|
||||
def test_combine_first_mixed(self):
|
||||
a = Series(['a', 'b'], index=lrange(2))
|
||||
b = Series(lrange(2), index=lrange(2))
|
||||
f = DataFrame({'A': a, 'B': b})
|
||||
|
||||
a = Series(['a', 'b'], index=lrange(5, 7))
|
||||
b = Series(lrange(2), index=lrange(5, 7))
|
||||
g = DataFrame({'A': a, 'B': b})
|
||||
|
||||
exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]},
|
||||
index=[0, 1, 5, 6])
|
||||
combined = f.combine_first(g)
|
||||
tm.assert_frame_equal(combined, exp)
|
||||
|
||||
def test_combine_first(self):
|
||||
# disjoint
|
||||
head, tail = self.frame[:5], self.frame[5:]
|
||||
|
||||
combined = head.combine_first(tail)
|
||||
reordered_frame = self.frame.reindex(combined.index)
|
||||
assert_frame_equal(combined, reordered_frame)
|
||||
assert tm.equalContents(combined.columns, self.frame.columns)
|
||||
assert_series_equal(combined['A'], reordered_frame['A'])
|
||||
|
||||
# same index
|
||||
fcopy = self.frame.copy()
|
||||
fcopy['A'] = 1
|
||||
del fcopy['C']
|
||||
|
||||
fcopy2 = self.frame.copy()
|
||||
fcopy2['B'] = 0
|
||||
del fcopy2['D']
|
||||
|
||||
combined = fcopy.combine_first(fcopy2)
|
||||
|
||||
assert (combined['A'] == 1).all()
|
||||
assert_series_equal(combined['B'], fcopy['B'])
|
||||
assert_series_equal(combined['C'], fcopy2['C'])
|
||||
assert_series_equal(combined['D'], fcopy['D'])
|
||||
|
||||
# overlap
|
||||
head, tail = reordered_frame[:10].copy(), reordered_frame
|
||||
head['A'] = 1
|
||||
|
||||
combined = head.combine_first(tail)
|
||||
assert (combined['A'][:10] == 1).all()
|
||||
|
||||
# reverse overlap
|
||||
tail['A'][:10] = 0
|
||||
combined = tail.combine_first(head)
|
||||
assert (combined['A'][:10] == 0).all()
|
||||
|
||||
# no overlap
|
||||
f = self.frame[:10]
|
||||
g = self.frame[10:]
|
||||
combined = f.combine_first(g)
|
||||
assert_series_equal(combined['A'].reindex(f.index), f['A'])
|
||||
assert_series_equal(combined['A'].reindex(g.index), g['A'])
|
||||
|
||||
# corner cases
|
||||
comb = self.frame.combine_first(self.empty)
|
||||
assert_frame_equal(comb, self.frame)
|
||||
|
||||
comb = self.empty.combine_first(self.frame)
|
||||
assert_frame_equal(comb, self.frame)
|
||||
|
||||
comb = self.frame.combine_first(DataFrame(index=["faz", "boo"]))
|
||||
assert "faz" in comb.index
|
||||
|
||||
# #2525
|
||||
df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)])
|
||||
df2 = DataFrame({}, columns=['b'])
|
||||
result = df.combine_first(df2)
|
||||
assert 'b' in result
|
||||
|
||||
def test_combine_first_mixed_bug(self):
|
||||
idx = Index(['a', 'b', 'c', 'e'])
|
||||
ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
|
||||
ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
|
||||
ser3 = Series([12, 4, 5, 97], index=idx)
|
||||
|
||||
frame1 = DataFrame({"col0": ser1,
|
||||
"col2": ser2,
|
||||
"col3": ser3})
|
||||
|
||||
idx = Index(['a', 'b', 'c', 'f'])
|
||||
ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
|
||||
ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
|
||||
ser3 = Series([12, 4, 5, 97], index=idx)
|
||||
|
||||
frame2 = DataFrame({"col1": ser1,
|
||||
"col2": ser2,
|
||||
"col5": ser3})
|
||||
|
||||
combined = frame1.combine_first(frame2)
|
||||
assert len(combined.columns) == 5
|
||||
|
||||
# gh 3016 (same as in update)
|
||||
df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
|
||||
columns=['A', 'B', 'bool1', 'bool2'])
|
||||
|
||||
other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
|
||||
result = df.combine_first(other)
|
||||
assert_frame_equal(result, df)
|
||||
|
||||
df.loc[0, 'A'] = np.nan
|
||||
result = df.combine_first(other)
|
||||
df.loc[0, 'A'] = 45
|
||||
assert_frame_equal(result, df)
|
||||
|
||||
# doc example
|
||||
df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
|
||||
'B': [np.nan, 2., 3., np.nan, 6.]})
|
||||
|
||||
df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
|
||||
'B': [np.nan, np.nan, 3., 4., 6., 8.]})
|
||||
|
||||
result = df1.combine_first(df2)
|
||||
expected = DataFrame(
|
||||
{'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# GH3552, return object dtype with bools
|
||||
df1 = DataFrame(
|
||||
[[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
|
||||
df2 = DataFrame(
|
||||
[[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2])
|
||||
|
||||
result = df1.combine_first(df2)[2]
|
||||
expected = Series([True, True, False], name=2)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# GH 3593, converting datetime64[ns] incorrecly
|
||||
df0 = DataFrame({"a": [datetime(2000, 1, 1),
|
||||
datetime(2000, 1, 2),
|
||||
datetime(2000, 1, 3)]})
|
||||
df1 = DataFrame({"a": [None, None, None]})
|
||||
df2 = df1.combine_first(df0)
|
||||
assert_frame_equal(df2, df0)
|
||||
|
||||
df2 = df0.combine_first(df1)
|
||||
assert_frame_equal(df2, df0)
|
||||
|
||||
df0 = DataFrame({"a": [datetime(2000, 1, 1),
|
||||
datetime(2000, 1, 2),
|
||||
datetime(2000, 1, 3)]})
|
||||
df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
|
||||
df2 = df1.combine_first(df0)
|
||||
result = df0.copy()
|
||||
result.iloc[0, :] = df1.iloc[0, :]
|
||||
assert_frame_equal(df2, result)
|
||||
|
||||
df2 = df0.combine_first(df1)
|
||||
assert_frame_equal(df2, df0)
|
||||
|
||||
def test_combine_first_align_nan(self):
|
||||
# GH 7509 (not fixed)
|
||||
dfa = pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]],
|
||||
columns=['a', 'b'])
|
||||
dfb = pd.DataFrame([[4], [5]], columns=['b'])
|
||||
assert dfa['a'].dtype == 'datetime64[ns]'
|
||||
assert dfa['b'].dtype == 'int64'
|
||||
|
||||
res = dfa.combine_first(dfb)
|
||||
exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT],
|
||||
'b': [2., 5.]}, columns=['a', 'b'])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res['a'].dtype == 'datetime64[ns]'
|
||||
# ToDo: this must be int64
|
||||
assert res['b'].dtype == 'float64'
|
||||
|
||||
res = dfa.iloc[:0].combine_first(dfb)
|
||||
exp = pd.DataFrame({'a': [np.nan, np.nan],
|
||||
'b': [4, 5]}, columns=['a', 'b'])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
# ToDo: this must be datetime64
|
||||
assert res['a'].dtype == 'float64'
|
||||
# ToDo: this must be int64
|
||||
assert res['b'].dtype == 'int64'
|
||||
|
||||
def test_combine_first_timezone(self):
|
||||
# see gh-7630
|
||||
data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC')
|
||||
df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'],
|
||||
data=data1,
|
||||
index=pd.date_range('20140627', periods=1))
|
||||
data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC')
|
||||
df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'],
|
||||
data=data2,
|
||||
index=pd.date_range('20140628', periods=1))
|
||||
res = df2[['UTCdatetime']].combine_first(df1)
|
||||
exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01',
|
||||
tz='UTC'),
|
||||
pd.Timestamp('2012-12-12 12:12',
|
||||
tz='UTC')],
|
||||
'abc': [pd.Timestamp('2010-01-01 01:01:00',
|
||||
tz='UTC'), pd.NaT]},
|
||||
columns=['UTCdatetime', 'abc'],
|
||||
index=pd.date_range('20140627', periods=2,
|
||||
freq='D'))
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]'
|
||||
assert res['abc'].dtype == 'datetime64[ns, UTC]'
|
||||
|
||||
# see gh-10567
|
||||
dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC')
|
||||
df1 = pd.DataFrame({'DATE': dts1})
|
||||
dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC')
|
||||
df2 = pd.DataFrame({'DATE': dts2})
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(res, df1)
|
||||
assert res['DATE'].dtype == 'datetime64[ns, UTC]'
|
||||
|
||||
dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03',
|
||||
'2011-01-04'], tz='US/Eastern')
|
||||
df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7])
|
||||
dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02',
|
||||
'2012-01-03'], tz='US/Eastern')
|
||||
df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT',
|
||||
'2012-01-02', '2011-01-03', '2011-01-04'],
|
||||
tz='US/Eastern')
|
||||
exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# different tz
|
||||
dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern')
|
||||
df1 = pd.DataFrame({'DATE': dts1})
|
||||
dts2 = pd.date_range('2015-01-03', '2015-01-05')
|
||||
df2 = pd.DataFrame({'DATE': dts2})
|
||||
|
||||
# if df1 doesn't have NaN, keep its dtype
|
||||
res = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(res, df1)
|
||||
assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]'
|
||||
|
||||
dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern')
|
||||
df1 = pd.DataFrame({'DATE': dts1})
|
||||
dts2 = pd.date_range('2015-01-01', '2015-01-03')
|
||||
df2 = pd.DataFrame({'DATE': dts2})
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'),
|
||||
pd.Timestamp('2015-01-02', tz='US/Eastern'),
|
||||
pd.Timestamp('2015-01-03')]
|
||||
exp = pd.DataFrame({'DATE': exp_dts})
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res['DATE'].dtype == 'object'
|
||||
|
||||
def test_combine_first_timedelta(self):
|
||||
data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day'])
|
||||
df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7])
|
||||
data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day'])
|
||||
df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT',
|
||||
'11 day', '3 day', '4 day'])
|
||||
exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res['TD'].dtype == 'timedelta64[ns]'
|
||||
|
||||
def test_combine_first_period(self):
|
||||
data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03',
|
||||
'2011-04'], freq='M')
|
||||
df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7])
|
||||
data2 = pd.PeriodIndex(['2012-01-01', '2012-02',
|
||||
'2012-03'], freq='M')
|
||||
df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT',
|
||||
'2012-02', '2011-03', '2011-04'],
|
||||
freq='M')
|
||||
exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res['P'].dtype == 'object'
|
||||
|
||||
# different freq
|
||||
dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02',
|
||||
'2012-01-03'], freq='D')
|
||||
df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = [pd.Period('2011-01', freq='M'),
|
||||
pd.Period('2012-01-01', freq='D'),
|
||||
pd.NaT,
|
||||
pd.Period('2012-01-02', freq='D'),
|
||||
pd.Period('2011-03', freq='M'),
|
||||
pd.Period('2011-04', freq='M')]
|
||||
exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res['P'].dtype == 'object'
|
||||
|
||||
def test_combine_first_int(self):
|
||||
# GH14687 - integer series that do no align exactly
|
||||
|
||||
df1 = pd.DataFrame({'a': [0, 1, 3, 5]}, dtype='int64')
|
||||
df2 = pd.DataFrame({'a': [1, 4]}, dtype='int64')
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(res, df1)
|
||||
assert res['a'].dtype == 'int64'
|
||||
|
||||
def test_concat_datetime_datetime64_frame(self):
|
||||
# #2624
|
||||
rows = []
|
||||
rows.append([datetime(2010, 1, 1), 1])
|
||||
rows.append([datetime(2010, 1, 2), 'hi'])
|
||||
|
||||
df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
|
||||
|
||||
ind = date_range(start="2000/1/1", freq="D", periods=10)
|
||||
df1 = DataFrame({'date': ind, 'test': lrange(10)})
|
||||
|
||||
# it works!
|
||||
pd.concat([df1, df2_obj])
|
||||
|
||||
|
||||
class TestDataFrameUpdate(TestData):
|
||||
|
||||
def test_update_nan(self):
|
||||
# #15593 #15617
|
||||
# test 1
|
||||
df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)})
|
||||
df2 = DataFrame({'A': [None, 2, 3]})
|
||||
expected = df1.copy()
|
||||
df1.update(df2, overwrite=False)
|
||||
|
||||
tm.assert_frame_equal(df1, expected)
|
||||
|
||||
# test 2
|
||||
df1 = DataFrame({'A': [1.0, None, 3],
|
||||
'B': date_range('2000', periods=3)})
|
||||
df2 = DataFrame({'A': [None, 2, 3]})
|
||||
expected = DataFrame({'A': [1.0, 2, 3],
|
||||
'B': date_range('2000', periods=3)})
|
||||
df1.update(df2, overwrite=False)
|
||||
|
||||
tm.assert_frame_equal(df1, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,330 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import pytest
|
||||
import pytz
|
||||
import collections
|
||||
from collections import OrderedDict, defaultdict
|
||||
import numpy as np
|
||||
|
||||
from pandas import compat
|
||||
from pandas.compat import long
|
||||
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
|
||||
date_range)
|
||||
|
||||
import pandas.util.testing as tm
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameConvertTo(TestData):
|
||||
|
||||
def test_to_dict_timestamp(self):
|
||||
|
||||
# GH11247
|
||||
# split/records producing np.datetime64 rather than Timestamps
|
||||
# on datetime64[ns] dtypes only
|
||||
|
||||
tsmp = Timestamp('20130101')
|
||||
test_data = DataFrame({'A': [tsmp, tsmp], 'B': [tsmp, tsmp]})
|
||||
test_data_mixed = DataFrame({'A': [tsmp, tsmp], 'B': [1, 2]})
|
||||
|
||||
expected_records = [{'A': tsmp, 'B': tsmp},
|
||||
{'A': tsmp, 'B': tsmp}]
|
||||
expected_records_mixed = [{'A': tsmp, 'B': 1},
|
||||
{'A': tsmp, 'B': 2}]
|
||||
|
||||
assert (test_data.to_dict(orient='records') ==
|
||||
expected_records)
|
||||
assert (test_data_mixed.to_dict(orient='records') ==
|
||||
expected_records_mixed)
|
||||
|
||||
expected_series = {
|
||||
'A': Series([tsmp, tsmp], name='A'),
|
||||
'B': Series([tsmp, tsmp], name='B'),
|
||||
}
|
||||
expected_series_mixed = {
|
||||
'A': Series([tsmp, tsmp], name='A'),
|
||||
'B': Series([1, 2], name='B'),
|
||||
}
|
||||
|
||||
tm.assert_dict_equal(test_data.to_dict(orient='series'),
|
||||
expected_series)
|
||||
tm.assert_dict_equal(test_data_mixed.to_dict(orient='series'),
|
||||
expected_series_mixed)
|
||||
|
||||
expected_split = {
|
||||
'index': [0, 1],
|
||||
'data': [[tsmp, tsmp],
|
||||
[tsmp, tsmp]],
|
||||
'columns': ['A', 'B']
|
||||
}
|
||||
expected_split_mixed = {
|
||||
'index': [0, 1],
|
||||
'data': [[tsmp, 1],
|
||||
[tsmp, 2]],
|
||||
'columns': ['A', 'B']
|
||||
}
|
||||
|
||||
tm.assert_dict_equal(test_data.to_dict(orient='split'),
|
||||
expected_split)
|
||||
tm.assert_dict_equal(test_data_mixed.to_dict(orient='split'),
|
||||
expected_split_mixed)
|
||||
|
||||
def test_to_dict_invalid_orient(self):
|
||||
df = DataFrame({'A': [0, 1]})
|
||||
pytest.raises(ValueError, df.to_dict, orient='xinvalid')
|
||||
|
||||
def test_to_records_dt64(self):
|
||||
df = DataFrame([["one", "two", "three"],
|
||||
["four", "five", "six"]],
|
||||
index=date_range("2012-01-01", "2012-01-02"))
|
||||
|
||||
# convert_datetime64 defaults to None
|
||||
expected = df.index.values[0]
|
||||
result = df.to_records()['index'][0]
|
||||
assert expected == result
|
||||
|
||||
# check for FutureWarning if convert_datetime64=False is passed
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
expected = df.index.values[0]
|
||||
result = df.to_records(convert_datetime64=False)['index'][0]
|
||||
assert expected == result
|
||||
|
||||
# check for FutureWarning if convert_datetime64=True is passed
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
expected = df.index[0]
|
||||
result = df.to_records(convert_datetime64=True)['index'][0]
|
||||
assert expected == result
|
||||
|
||||
def test_to_records_with_multindex(self):
|
||||
# GH3189
|
||||
index = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
|
||||
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
|
||||
data = np.zeros((8, 4))
|
||||
df = DataFrame(data, index=index)
|
||||
r = df.to_records(index=True)['level_0']
|
||||
assert 'bar' in r
|
||||
assert 'one' not in r
|
||||
|
||||
def test_to_records_with_Mapping_type(self):
|
||||
import email
|
||||
from email.parser import Parser
|
||||
import collections
|
||||
|
||||
collections.Mapping.register(email.message.Message)
|
||||
|
||||
headers = Parser().parsestr('From: <user@example.com>\n'
|
||||
'To: <someone_else@example.com>\n'
|
||||
'Subject: Test message\n'
|
||||
'\n'
|
||||
'Body would go here\n')
|
||||
|
||||
frame = DataFrame.from_records([headers])
|
||||
all(x in frame for x in ['Type', 'Subject', 'From'])
|
||||
|
||||
def test_to_records_floats(self):
|
||||
df = DataFrame(np.random.rand(10, 10))
|
||||
df.to_records()
|
||||
|
||||
def test_to_records_index_name(self):
|
||||
df = DataFrame(np.random.randn(3, 3))
|
||||
df.index.name = 'X'
|
||||
rs = df.to_records()
|
||||
assert 'X' in rs.dtype.fields
|
||||
|
||||
df = DataFrame(np.random.randn(3, 3))
|
||||
rs = df.to_records()
|
||||
assert 'index' in rs.dtype.fields
|
||||
|
||||
df.index = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')])
|
||||
df.index.names = ['A', None]
|
||||
rs = df.to_records()
|
||||
assert 'level_0' in rs.dtype.fields
|
||||
|
||||
def test_to_records_with_unicode_index(self):
|
||||
# GH13172
|
||||
# unicode_literals conflict with to_records
|
||||
result = DataFrame([{u'a': u'x', u'b': 'y'}]).set_index(u'a')\
|
||||
.to_records()
|
||||
expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')])
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_to_records_with_unicode_column_names(self):
|
||||
# xref issue: https://github.com/numpy/numpy/issues/2407
|
||||
# Issue #11879. to_records used to raise an exception when used
|
||||
# with column names containing non-ascii characters in Python 2
|
||||
result = DataFrame(data={u"accented_name_é": [1.0]}).to_records()
|
||||
|
||||
# Note that numpy allows for unicode field names but dtypes need
|
||||
# to be specified using dictionary instead of list of tuples.
|
||||
expected = np.rec.array(
|
||||
[(0, 1.0)],
|
||||
dtype={"names": ["index", u"accented_name_é"],
|
||||
"formats": ['=i8', '=f8']}
|
||||
)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_to_records_with_categorical(self):
|
||||
|
||||
# GH8626
|
||||
|
||||
# dict creation
|
||||
df = DataFrame({'A': list('abc')}, dtype='category')
|
||||
expected = Series(list('abc'), dtype='category', name='A')
|
||||
tm.assert_series_equal(df['A'], expected)
|
||||
|
||||
# list-like creation
|
||||
df = DataFrame(list('abc'), dtype='category')
|
||||
expected = Series(list('abc'), dtype='category', name=0)
|
||||
tm.assert_series_equal(df[0], expected)
|
||||
|
||||
# to record array
|
||||
# this coerces
|
||||
result = df.to_records()
|
||||
expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
|
||||
dtype=[('index', '=i8'), ('0', 'O')])
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('mapping', [
|
||||
dict,
|
||||
collections.defaultdict(list),
|
||||
collections.OrderedDict])
|
||||
def test_to_dict(self, mapping):
|
||||
test_data = {
|
||||
'A': {'1': 1, '2': 2},
|
||||
'B': {'1': '1', '2': '2', '3': '3'},
|
||||
}
|
||||
|
||||
# GH16122
|
||||
recons_data = DataFrame(test_data).to_dict(into=mapping)
|
||||
|
||||
for k, v in compat.iteritems(test_data):
|
||||
for k2, v2 in compat.iteritems(v):
|
||||
assert (v2 == recons_data[k][k2])
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("l", mapping)
|
||||
|
||||
for k, v in compat.iteritems(test_data):
|
||||
for k2, v2 in compat.iteritems(v):
|
||||
assert (v2 == recons_data[k][int(k2) - 1])
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("s", mapping)
|
||||
|
||||
for k, v in compat.iteritems(test_data):
|
||||
for k2, v2 in compat.iteritems(v):
|
||||
assert (v2 == recons_data[k][k2])
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("sp", mapping)
|
||||
expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
|
||||
'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
|
||||
tm.assert_dict_equal(recons_data, expected_split)
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("r", mapping)
|
||||
expected_records = [{'A': 1.0, 'B': '1'},
|
||||
{'A': 2.0, 'B': '2'},
|
||||
{'A': np.nan, 'B': '3'}]
|
||||
assert isinstance(recons_data, list)
|
||||
assert (len(recons_data) == 3)
|
||||
for l, r in zip(recons_data, expected_records):
|
||||
tm.assert_dict_equal(l, r)
|
||||
|
||||
# GH10844
|
||||
recons_data = DataFrame(test_data).to_dict("i")
|
||||
|
||||
for k, v in compat.iteritems(test_data):
|
||||
for k2, v2 in compat.iteritems(v):
|
||||
assert (v2 == recons_data[k2][k])
|
||||
|
||||
df = DataFrame(test_data)
|
||||
df['duped'] = df[df.columns[0]]
|
||||
recons_data = df.to_dict("i")
|
||||
comp_data = test_data.copy()
|
||||
comp_data['duped'] = comp_data[df.columns[0]]
|
||||
for k, v in compat.iteritems(comp_data):
|
||||
for k2, v2 in compat.iteritems(v):
|
||||
assert (v2 == recons_data[k2][k])
|
||||
|
||||
@pytest.mark.parametrize('mapping', [
|
||||
list,
|
||||
collections.defaultdict,
|
||||
[]])
|
||||
def test_to_dict_errors(self, mapping):
|
||||
# GH16122
|
||||
df = DataFrame(np.random.randn(3, 3))
|
||||
with pytest.raises(TypeError):
|
||||
df.to_dict(into=mapping)
|
||||
|
||||
def test_to_dict_not_unique_warning(self):
|
||||
# GH16927: When converting to a dict, if a column has a non-unique name
|
||||
# it will be dropped, throwing a warning.
|
||||
df = DataFrame([[1, 2, 3]], columns=['a', 'a', 'b'])
|
||||
with tm.assert_produces_warning(UserWarning):
|
||||
df.to_dict()
|
||||
|
||||
@pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
|
||||
def test_to_records_datetimeindex_with_tz(self, tz):
|
||||
# GH13937
|
||||
dr = date_range('2016-01-01', periods=10,
|
||||
freq='S', tz=tz)
|
||||
|
||||
df = DataFrame({'datetime': dr}, index=dr)
|
||||
|
||||
expected = df.to_records()
|
||||
result = df.tz_convert("UTC").to_records()
|
||||
|
||||
# both converted to UTC, so they are equal
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_to_dict_box_scalars(self):
|
||||
# 14216
|
||||
# make sure that we are boxing properly
|
||||
d = {'a': [1], 'b': ['b']}
|
||||
|
||||
result = DataFrame(d).to_dict()
|
||||
assert isinstance(list(result['a'])[0], (int, long))
|
||||
assert isinstance(list(result['b'])[0], (int, long))
|
||||
|
||||
result = DataFrame(d).to_dict(orient='records')
|
||||
assert isinstance(result[0]['a'], (int, long))
|
||||
|
||||
def test_frame_to_dict_tz(self):
|
||||
# GH18372 When converting to dict with orient='records' columns of
|
||||
# datetime that are tz-aware were not converted to required arrays
|
||||
data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
|
||||
(datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)]
|
||||
df = DataFrame(list(data), columns=["d", ])
|
||||
|
||||
result = df.to_dict(orient='records')
|
||||
expected = [
|
||||
{'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)},
|
||||
{'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)},
|
||||
]
|
||||
tm.assert_dict_equal(result[0], expected[0])
|
||||
tm.assert_dict_equal(result[1], expected[1])
|
||||
|
||||
@pytest.mark.parametrize('into, expected', [
|
||||
(dict, {0: {'int_col': 1, 'float_col': 1.0},
|
||||
1: {'int_col': 2, 'float_col': 2.0},
|
||||
2: {'int_col': 3, 'float_col': 3.0}}),
|
||||
(OrderedDict, OrderedDict([(0, {'int_col': 1, 'float_col': 1.0}),
|
||||
(1, {'int_col': 2, 'float_col': 2.0}),
|
||||
(2, {'int_col': 3, 'float_col': 3.0})])),
|
||||
(defaultdict(list), defaultdict(list,
|
||||
{0: {'int_col': 1, 'float_col': 1.0},
|
||||
1: {'int_col': 2, 'float_col': 2.0},
|
||||
2: {'int_col': 3, 'float_col': 3.0}}))
|
||||
])
|
||||
def test_to_dict_index_dtypes(self, into, expected):
|
||||
# GH 18580
|
||||
# When using to_dict(orient='index') on a dataframe with int
|
||||
# and float columns only the int columns were cast to float
|
||||
|
||||
df = DataFrame({'int_col': [1, 2, 3],
|
||||
'float_col': [1.0, 2.0, 3.0]})
|
||||
|
||||
result = df.to_dict(orient='index', into=into)
|
||||
cols = ['int_col', 'float_col']
|
||||
result = DataFrame.from_dict(result, orient='index')[cols]
|
||||
expected = DataFrame.from_dict(expected, orient='index')[cols]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -1,906 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
|
||||
from datetime import timedelta
|
||||
|
||||
import numpy as np
|
||||
from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp,
|
||||
Categorical, compat, concat, option_context)
|
||||
from pandas.compat import u
|
||||
from pandas import _np_version_under1p14
|
||||
|
||||
from pandas.core.dtypes.dtypes import DatetimeTZDtype, CategoricalDtype
|
||||
from pandas.tests.frame.common import TestData
|
||||
from pandas.util.testing import (assert_series_equal,
|
||||
assert_frame_equal,
|
||||
makeCustomDataframe as mkdf)
|
||||
import pandas.util.testing as tm
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class TestDataFrameDataTypes(TestData):
|
||||
|
||||
def test_concat_empty_dataframe_dtypes(self):
|
||||
df = DataFrame(columns=list("abc"))
|
||||
df['a'] = df['a'].astype(np.bool_)
|
||||
df['b'] = df['b'].astype(np.int32)
|
||||
df['c'] = df['c'].astype(np.float64)
|
||||
|
||||
result = pd.concat([df, df])
|
||||
assert result['a'].dtype == np.bool_
|
||||
assert result['b'].dtype == np.int32
|
||||
assert result['c'].dtype == np.float64
|
||||
|
||||
result = pd.concat([df, df.astype(np.float64)])
|
||||
assert result['a'].dtype == np.object_
|
||||
assert result['b'].dtype == np.float64
|
||||
assert result['c'].dtype == np.float64
|
||||
|
||||
def test_empty_frame_dtypes_ftypes(self):
|
||||
empty_df = pd.DataFrame()
|
||||
assert_series_equal(empty_df.dtypes, pd.Series(dtype=np.object))
|
||||
assert_series_equal(empty_df.ftypes, pd.Series(dtype=np.object))
|
||||
|
||||
nocols_df = pd.DataFrame(index=[1, 2, 3])
|
||||
assert_series_equal(nocols_df.dtypes, pd.Series(dtype=np.object))
|
||||
assert_series_equal(nocols_df.ftypes, pd.Series(dtype=np.object))
|
||||
|
||||
norows_df = pd.DataFrame(columns=list("abc"))
|
||||
assert_series_equal(norows_df.dtypes, pd.Series(
|
||||
np.object, index=list("abc")))
|
||||
assert_series_equal(norows_df.ftypes, pd.Series(
|
||||
'object:dense', index=list("abc")))
|
||||
|
||||
norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32)
|
||||
assert_series_equal(norows_int_df.dtypes, pd.Series(
|
||||
np.dtype('int32'), index=list("abc")))
|
||||
assert_series_equal(norows_int_df.ftypes, pd.Series(
|
||||
'int32:dense', index=list("abc")))
|
||||
|
||||
odict = compat.OrderedDict
|
||||
df = pd.DataFrame(odict([('a', 1), ('b', True), ('c', 1.0)]),
|
||||
index=[1, 2, 3])
|
||||
ex_dtypes = pd.Series(odict([('a', np.int64),
|
||||
('b', np.bool),
|
||||
('c', np.float64)]))
|
||||
ex_ftypes = pd.Series(odict([('a', 'int64:dense'),
|
||||
('b', 'bool:dense'),
|
||||
('c', 'float64:dense')]))
|
||||
assert_series_equal(df.dtypes, ex_dtypes)
|
||||
assert_series_equal(df.ftypes, ex_ftypes)
|
||||
|
||||
# same but for empty slice of df
|
||||
assert_series_equal(df[:0].dtypes, ex_dtypes)
|
||||
assert_series_equal(df[:0].ftypes, ex_ftypes)
|
||||
|
||||
def test_datetime_with_tz_dtypes(self):
|
||||
tzframe = DataFrame({'A': date_range('20130101', periods=3),
|
||||
'B': date_range('20130101', periods=3,
|
||||
tz='US/Eastern'),
|
||||
'C': date_range('20130101', periods=3, tz='CET')})
|
||||
tzframe.iloc[1, 1] = pd.NaT
|
||||
tzframe.iloc[1, 2] = pd.NaT
|
||||
result = tzframe.dtypes.sort_index()
|
||||
expected = Series([np.dtype('datetime64[ns]'),
|
||||
DatetimeTZDtype('datetime64[ns, US/Eastern]'),
|
||||
DatetimeTZDtype('datetime64[ns, CET]')],
|
||||
['A', 'B', 'C'])
|
||||
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_dtypes_are_correct_after_column_slice(self):
|
||||
# GH6525
|
||||
df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_)
|
||||
odict = compat.OrderedDict
|
||||
assert_series_equal(df.dtypes,
|
||||
pd.Series(odict([('a', np.float_),
|
||||
('b', np.float_),
|
||||
('c', np.float_)])))
|
||||
assert_series_equal(df.iloc[:, 2:].dtypes,
|
||||
pd.Series(odict([('c', np.float_)])))
|
||||
assert_series_equal(df.dtypes,
|
||||
pd.Series(odict([('a', np.float_),
|
||||
('b', np.float_),
|
||||
('c', np.float_)])))
|
||||
|
||||
def test_select_dtypes_include_using_list_like(self):
|
||||
df = DataFrame({'a': list('abc'),
|
||||
'b': list(range(1, 4)),
|
||||
'c': np.arange(3, 6).astype('u1'),
|
||||
'd': np.arange(4.0, 7.0, dtype='float64'),
|
||||
'e': [True, False, True],
|
||||
'f': pd.Categorical(list('abc')),
|
||||
'g': pd.date_range('20130101', periods=3),
|
||||
'h': pd.date_range('20130101', periods=3,
|
||||
tz='US/Eastern'),
|
||||
'i': pd.date_range('20130101', periods=3,
|
||||
tz='CET'),
|
||||
'j': pd.period_range('2013-01', periods=3,
|
||||
freq='M'),
|
||||
'k': pd.timedelta_range('1 day', periods=3)})
|
||||
|
||||
ri = df.select_dtypes(include=[np.number])
|
||||
ei = df[['b', 'c', 'd', 'k']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
ri = df.select_dtypes(include=[np.number], exclude=['timedelta'])
|
||||
ei = df[['b', 'c', 'd']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
ri = df.select_dtypes(include=[np.number, 'category'],
|
||||
exclude=['timedelta'])
|
||||
ei = df[['b', 'c', 'd', 'f']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
ri = df.select_dtypes(include=['datetime'])
|
||||
ei = df[['g']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
ri = df.select_dtypes(include=['datetime64'])
|
||||
ei = df[['g']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
ri = df.select_dtypes(include=['datetimetz'])
|
||||
ei = df[['h', 'i']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
pytest.raises(NotImplementedError,
|
||||
lambda: df.select_dtypes(include=['period']))
|
||||
|
||||
def test_select_dtypes_exclude_using_list_like(self):
|
||||
df = DataFrame({'a': list('abc'),
|
||||
'b': list(range(1, 4)),
|
||||
'c': np.arange(3, 6).astype('u1'),
|
||||
'd': np.arange(4.0, 7.0, dtype='float64'),
|
||||
'e': [True, False, True]})
|
||||
re = df.select_dtypes(exclude=[np.number])
|
||||
ee = df[['a', 'e']]
|
||||
assert_frame_equal(re, ee)
|
||||
|
||||
def test_select_dtypes_exclude_include_using_list_like(self):
|
||||
df = DataFrame({'a': list('abc'),
|
||||
'b': list(range(1, 4)),
|
||||
'c': np.arange(3, 6).astype('u1'),
|
||||
'd': np.arange(4.0, 7.0, dtype='float64'),
|
||||
'e': [True, False, True],
|
||||
'f': pd.date_range('now', periods=3).values})
|
||||
exclude = np.datetime64,
|
||||
include = np.bool_, 'integer'
|
||||
r = df.select_dtypes(include=include, exclude=exclude)
|
||||
e = df[['b', 'c', 'e']]
|
||||
assert_frame_equal(r, e)
|
||||
|
||||
exclude = 'datetime',
|
||||
include = 'bool', 'int64', 'int32'
|
||||
r = df.select_dtypes(include=include, exclude=exclude)
|
||||
e = df[['b', 'e']]
|
||||
assert_frame_equal(r, e)
|
||||
|
||||
def test_select_dtypes_include_using_scalars(self):
|
||||
df = DataFrame({'a': list('abc'),
|
||||
'b': list(range(1, 4)),
|
||||
'c': np.arange(3, 6).astype('u1'),
|
||||
'd': np.arange(4.0, 7.0, dtype='float64'),
|
||||
'e': [True, False, True],
|
||||
'f': pd.Categorical(list('abc')),
|
||||
'g': pd.date_range('20130101', periods=3),
|
||||
'h': pd.date_range('20130101', periods=3,
|
||||
tz='US/Eastern'),
|
||||
'i': pd.date_range('20130101', periods=3,
|
||||
tz='CET'),
|
||||
'j': pd.period_range('2013-01', periods=3,
|
||||
freq='M'),
|
||||
'k': pd.timedelta_range('1 day', periods=3)})
|
||||
|
||||
ri = df.select_dtypes(include=np.number)
|
||||
ei = df[['b', 'c', 'd', 'k']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
ri = df.select_dtypes(include='datetime')
|
||||
ei = df[['g']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
ri = df.select_dtypes(include='datetime64')
|
||||
ei = df[['g']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
ri = df.select_dtypes(include='category')
|
||||
ei = df[['f']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
pytest.raises(NotImplementedError,
|
||||
lambda: df.select_dtypes(include='period'))
|
||||
|
||||
def test_select_dtypes_exclude_using_scalars(self):
|
||||
df = DataFrame({'a': list('abc'),
|
||||
'b': list(range(1, 4)),
|
||||
'c': np.arange(3, 6).astype('u1'),
|
||||
'd': np.arange(4.0, 7.0, dtype='float64'),
|
||||
'e': [True, False, True],
|
||||
'f': pd.Categorical(list('abc')),
|
||||
'g': pd.date_range('20130101', periods=3),
|
||||
'h': pd.date_range('20130101', periods=3,
|
||||
tz='US/Eastern'),
|
||||
'i': pd.date_range('20130101', periods=3,
|
||||
tz='CET'),
|
||||
'j': pd.period_range('2013-01', periods=3,
|
||||
freq='M'),
|
||||
'k': pd.timedelta_range('1 day', periods=3)})
|
||||
|
||||
ri = df.select_dtypes(exclude=np.number)
|
||||
ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
ri = df.select_dtypes(exclude='category')
|
||||
ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
pytest.raises(NotImplementedError,
|
||||
lambda: df.select_dtypes(exclude='period'))
|
||||
|
||||
def test_select_dtypes_include_exclude_using_scalars(self):
|
||||
df = DataFrame({'a': list('abc'),
|
||||
'b': list(range(1, 4)),
|
||||
'c': np.arange(3, 6).astype('u1'),
|
||||
'd': np.arange(4.0, 7.0, dtype='float64'),
|
||||
'e': [True, False, True],
|
||||
'f': pd.Categorical(list('abc')),
|
||||
'g': pd.date_range('20130101', periods=3),
|
||||
'h': pd.date_range('20130101', periods=3,
|
||||
tz='US/Eastern'),
|
||||
'i': pd.date_range('20130101', periods=3,
|
||||
tz='CET'),
|
||||
'j': pd.period_range('2013-01', periods=3,
|
||||
freq='M'),
|
||||
'k': pd.timedelta_range('1 day', periods=3)})
|
||||
|
||||
ri = df.select_dtypes(include=np.number, exclude='floating')
|
||||
ei = df[['b', 'c', 'k']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
|
||||
df = DataFrame({'a': list('abc'),
|
||||
'b': list(range(1, 4)),
|
||||
'c': np.arange(3, 6).astype('u1'),
|
||||
'd': np.arange(4.0, 7.0, dtype='float64'),
|
||||
'e': [True, False, True],
|
||||
'f': pd.Categorical(list('abc')),
|
||||
'g': pd.date_range('20130101', periods=3),
|
||||
'h': pd.date_range('20130101', periods=3,
|
||||
tz='US/Eastern'),
|
||||
'i': pd.date_range('20130101', periods=3,
|
||||
tz='CET'),
|
||||
'j': pd.period_range('2013-01', periods=3,
|
||||
freq='M'),
|
||||
'k': pd.timedelta_range('1 day', periods=3)})
|
||||
|
||||
ri = df.select_dtypes(include=np.number,
|
||||
exclude=['floating', 'timedelta'])
|
||||
ei = df[['b', 'c']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
ri = df.select_dtypes(include=[np.number, 'category'],
|
||||
exclude='floating')
|
||||
ei = df[['b', 'c', 'f', 'k']]
|
||||
assert_frame_equal(ri, ei)
|
||||
|
||||
def test_select_dtypes_duplicate_columns(self):
|
||||
# GH20839
|
||||
odict = compat.OrderedDict
|
||||
df = DataFrame(odict([('a', list('abc')),
|
||||
('b', list(range(1, 4))),
|
||||
('c', np.arange(3, 6).astype('u1')),
|
||||
('d', np.arange(4.0, 7.0, dtype='float64')),
|
||||
('e', [True, False, True]),
|
||||
('f', pd.date_range('now', periods=3).values)]))
|
||||
df.columns = ['a', 'a', 'b', 'b', 'b', 'c']
|
||||
|
||||
expected = DataFrame({'a': list(range(1, 4)),
|
||||
'b': np.arange(3, 6).astype('u1')})
|
||||
|
||||
result = df.select_dtypes(include=[np.number], exclude=['floating'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
|
||||
df = DataFrame({'a': list('abc'),
|
||||
'b': list(range(1, 4)),
|
||||
'c': np.arange(3, 6).astype('u1'),
|
||||
'd': np.arange(4.0, 7.0, dtype='float64'),
|
||||
'e': [True, False, True],
|
||||
'f': pd.date_range('now', periods=3).values})
|
||||
df['g'] = df.f.diff()
|
||||
assert not hasattr(np, 'u8')
|
||||
r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta'])
|
||||
e = df[['a', 'b']]
|
||||
assert_frame_equal(r, e)
|
||||
|
||||
r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]'])
|
||||
e = df[['a', 'b', 'g']]
|
||||
assert_frame_equal(r, e)
|
||||
|
||||
def test_select_dtypes_empty(self):
|
||||
df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
|
||||
with tm.assert_raises_regex(ValueError, 'at least one of '
|
||||
'include or exclude '
|
||||
'must be nonempty'):
|
||||
df.select_dtypes()
|
||||
|
||||
def test_select_dtypes_bad_datetime64(self):
|
||||
df = DataFrame({'a': list('abc'),
|
||||
'b': list(range(1, 4)),
|
||||
'c': np.arange(3, 6).astype('u1'),
|
||||
'd': np.arange(4.0, 7.0, dtype='float64'),
|
||||
'e': [True, False, True],
|
||||
'f': pd.date_range('now', periods=3).values})
|
||||
with tm.assert_raises_regex(ValueError, '.+ is too specific'):
|
||||
df.select_dtypes(include=['datetime64[D]'])
|
||||
|
||||
with tm.assert_raises_regex(ValueError, '.+ is too specific'):
|
||||
df.select_dtypes(exclude=['datetime64[as]'])
|
||||
|
||||
def test_select_dtypes_datetime_with_tz(self):
|
||||
|
||||
df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
|
||||
B=Timestamp('20130603', tz='CET')),
|
||||
index=range(5))
|
||||
df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
|
||||
result = df3.select_dtypes(include=['datetime64[ns]'])
|
||||
expected = df3.reindex(columns=[])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_select_dtypes_str_raises(self):
|
||||
df = DataFrame({'a': list('abc'),
|
||||
'g': list(u('abc')),
|
||||
'b': list(range(1, 4)),
|
||||
'c': np.arange(3, 6).astype('u1'),
|
||||
'd': np.arange(4.0, 7.0, dtype='float64'),
|
||||
'e': [True, False, True],
|
||||
'f': pd.date_range('now', periods=3).values})
|
||||
string_dtypes = set((str, 'str', np.string_, 'S1',
|
||||
'unicode', np.unicode_, 'U1'))
|
||||
try:
|
||||
string_dtypes.add(unicode)
|
||||
except NameError:
|
||||
pass
|
||||
for dt in string_dtypes:
|
||||
with tm.assert_raises_regex(TypeError,
|
||||
'string dtypes are not allowed'):
|
||||
df.select_dtypes(include=[dt])
|
||||
with tm.assert_raises_regex(TypeError,
|
||||
'string dtypes are not allowed'):
|
||||
df.select_dtypes(exclude=[dt])
|
||||
|
||||
def test_select_dtypes_bad_arg_raises(self):
|
||||
df = DataFrame({'a': list('abc'),
|
||||
'g': list(u('abc')),
|
||||
'b': list(range(1, 4)),
|
||||
'c': np.arange(3, 6).astype('u1'),
|
||||
'd': np.arange(4.0, 7.0, dtype='float64'),
|
||||
'e': [True, False, True],
|
||||
'f': pd.date_range('now', periods=3).values})
|
||||
with tm.assert_raises_regex(TypeError, 'data type.'
|
||||
'*not understood'):
|
||||
df.select_dtypes(['blargy, blarg, blarg'])
|
||||
|
||||
def test_select_dtypes_typecodes(self):
|
||||
# GH 11990
|
||||
df = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
|
||||
expected = df
|
||||
FLOAT_TYPES = list(np.typecodes['AllFloat'])
|
||||
assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected)
|
||||
|
||||
def test_dtypes_gh8722(self):
|
||||
self.mixed_frame['bool'] = self.mixed_frame['A'] > 0
|
||||
result = self.mixed_frame.dtypes
|
||||
expected = Series(dict((k, v.dtype)
|
||||
for k, v in compat.iteritems(self.mixed_frame)),
|
||||
index=result.index)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# compat, GH 8722
|
||||
with option_context('use_inf_as_na', True):
|
||||
df = DataFrame([[1]])
|
||||
result = df.dtypes
|
||||
assert_series_equal(result, Series({0: np.dtype('int64')}))
|
||||
|
||||
def test_ftypes(self):
|
||||
frame = self.mixed_float
|
||||
expected = Series(dict(A='float32:dense',
|
||||
B='float32:dense',
|
||||
C='float16:dense',
|
||||
D='float64:dense')).sort_values()
|
||||
result = frame.ftypes.sort_values()
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_astype(self):
|
||||
casted = self.frame.astype(int)
|
||||
expected = DataFrame(self.frame.values.astype(int),
|
||||
index=self.frame.index,
|
||||
columns=self.frame.columns)
|
||||
assert_frame_equal(casted, expected)
|
||||
|
||||
casted = self.frame.astype(np.int32)
|
||||
expected = DataFrame(self.frame.values.astype(np.int32),
|
||||
index=self.frame.index,
|
||||
columns=self.frame.columns)
|
||||
assert_frame_equal(casted, expected)
|
||||
|
||||
self.frame['foo'] = '5'
|
||||
casted = self.frame.astype(int)
|
||||
expected = DataFrame(self.frame.values.astype(int),
|
||||
index=self.frame.index,
|
||||
columns=self.frame.columns)
|
||||
assert_frame_equal(casted, expected)
|
||||
|
||||
# mixed casting
|
||||
def _check_cast(df, v):
|
||||
assert (list(set(s.dtype.name for
|
||||
_, s in compat.iteritems(df)))[0] == v)
|
||||
|
||||
mn = self.all_mixed._get_numeric_data().copy()
|
||||
mn['little_float'] = np.array(12345., dtype='float16')
|
||||
mn['big_float'] = np.array(123456789101112., dtype='float64')
|
||||
|
||||
casted = mn.astype('float64')
|
||||
_check_cast(casted, 'float64')
|
||||
|
||||
casted = mn.astype('int64')
|
||||
_check_cast(casted, 'int64')
|
||||
|
||||
casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float32')
|
||||
_check_cast(casted, 'float32')
|
||||
|
||||
casted = mn.reindex(columns=['little_float']).astype('float16')
|
||||
_check_cast(casted, 'float16')
|
||||
|
||||
casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float16')
|
||||
_check_cast(casted, 'float16')
|
||||
|
||||
casted = mn.astype('float32')
|
||||
_check_cast(casted, 'float32')
|
||||
|
||||
casted = mn.astype('int32')
|
||||
_check_cast(casted, 'int32')
|
||||
|
||||
# to object
|
||||
casted = mn.astype('O')
|
||||
_check_cast(casted, 'object')
|
||||
|
||||
def test_astype_with_exclude_string(self):
|
||||
df = self.frame.copy()
|
||||
expected = self.frame.astype(int)
|
||||
df['string'] = 'foo'
|
||||
casted = df.astype(int, errors='ignore')
|
||||
|
||||
expected['string'] = 'foo'
|
||||
assert_frame_equal(casted, expected)
|
||||
|
||||
df = self.frame.copy()
|
||||
expected = self.frame.astype(np.int32)
|
||||
df['string'] = 'foo'
|
||||
casted = df.astype(np.int32, errors='ignore')
|
||||
|
||||
expected['string'] = 'foo'
|
||||
assert_frame_equal(casted, expected)
|
||||
|
||||
def test_astype_with_view(self):
|
||||
|
||||
tf = self.mixed_float.reindex(columns=['A', 'B', 'C'])
|
||||
|
||||
casted = tf.astype(np.int64)
|
||||
|
||||
casted = tf.astype(np.float32)
|
||||
|
||||
# this is the only real reason to do it this way
|
||||
tf = np.round(self.frame).astype(np.int32)
|
||||
casted = tf.astype(np.float32, copy=False)
|
||||
|
||||
# TODO(wesm): verification?
|
||||
tf = self.frame.astype(np.float64)
|
||||
casted = tf.astype(np.int64, copy=False) # noqa
|
||||
|
||||
def test_astype_cast_nan_inf_int(self):
|
||||
# GH14265, check nan and inf raise error when converting to int
|
||||
types = [np.int32, np.int64]
|
||||
values = [np.nan, np.inf]
|
||||
msg = 'Cannot convert non-finite values \\(NA or inf\\) to integer'
|
||||
|
||||
for this_type in types:
|
||||
for this_val in values:
|
||||
df = DataFrame([this_val])
|
||||
with tm.assert_raises_regex(ValueError, msg):
|
||||
df.astype(this_type)
|
||||
|
||||
def test_astype_str(self):
|
||||
# GH9757
|
||||
a = Series(date_range('2010-01-04', periods=5))
|
||||
b = Series(date_range('3/6/2012 00:00', periods=5, tz='US/Eastern'))
|
||||
c = Series([Timedelta(x, unit='d') for x in range(5)])
|
||||
d = Series(range(5))
|
||||
e = Series([0.0, 0.2, 0.4, 0.6, 0.8])
|
||||
|
||||
df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d, 'e': e})
|
||||
|
||||
# datetimelike
|
||||
# Test str and unicode on python 2.x and just str on python 3.x
|
||||
for tt in set([str, compat.text_type]):
|
||||
result = df.astype(tt)
|
||||
|
||||
expected = DataFrame({
|
||||
'a': list(map(tt, map(lambda x: Timestamp(x)._date_repr,
|
||||
a._values))),
|
||||
'b': list(map(tt, map(Timestamp, b._values))),
|
||||
'c': list(map(tt, map(lambda x: Timedelta(x)
|
||||
._repr_base(format='all'), c._values))),
|
||||
'd': list(map(tt, d._values)),
|
||||
'e': list(map(tt, e._values)),
|
||||
})
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# float/nan
|
||||
# 11302
|
||||
# consistency in astype(str)
|
||||
for tt in set([str, compat.text_type]):
|
||||
result = DataFrame([np.NaN]).astype(tt)
|
||||
expected = DataFrame(['nan'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = DataFrame([1.12345678901234567890]).astype(tt)
|
||||
if _np_version_under1p14:
|
||||
# < 1.14 truncates
|
||||
expected = DataFrame(['1.12345678901'])
|
||||
else:
|
||||
# >= 1.14 preserves the full repr
|
||||
expected = DataFrame(['1.1234567890123457'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype_class", [dict, Series])
|
||||
def test_astype_dict_like(self, dtype_class):
|
||||
# GH7271 & GH16717
|
||||
a = Series(date_range('2010-01-04', periods=5))
|
||||
b = Series(range(5))
|
||||
c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
|
||||
d = Series(['1.0', '2', '3.14', '4', '5.4'])
|
||||
df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
|
||||
original = df.copy(deep=True)
|
||||
|
||||
# change type of a subset of columns
|
||||
dt1 = dtype_class({'b': 'str', 'd': 'float32'})
|
||||
result = df.astype(dt1)
|
||||
expected = DataFrame({
|
||||
'a': a,
|
||||
'b': Series(['0', '1', '2', '3', '4']),
|
||||
'c': c,
|
||||
'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
|
||||
assert_frame_equal(result, expected)
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64})
|
||||
result = df.astype(dt2)
|
||||
expected = DataFrame({
|
||||
'a': a,
|
||||
'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
|
||||
'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
|
||||
'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
|
||||
assert_frame_equal(result, expected)
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
# change all columns
|
||||
dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str})
|
||||
assert_frame_equal(df.astype(dt3),
|
||||
df.astype(str))
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
# error should be raised when using something other than column labels
|
||||
# in the keys of the dtype dict
|
||||
dt4 = dtype_class({'b': str, 2: str})
|
||||
dt5 = dtype_class({'e': str})
|
||||
pytest.raises(KeyError, df.astype, dt4)
|
||||
pytest.raises(KeyError, df.astype, dt5)
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
# if the dtypes provided are the same as the original dtypes, the
|
||||
# resulting DataFrame should be the same as the original DataFrame
|
||||
dt6 = dtype_class({col: df[col].dtype for col in df.columns})
|
||||
equiv = df.astype(dt6)
|
||||
assert_frame_equal(df, equiv)
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
# GH 16717
|
||||
# if dtypes provided is empty, the resulting DataFrame
|
||||
# should be the same as the original DataFrame
|
||||
dt7 = dtype_class({})
|
||||
result = df.astype(dt7)
|
||||
assert_frame_equal(df, equiv)
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
def test_astype_duplicate_col(self):
|
||||
a1 = Series([1, 2, 3, 4, 5], name='a')
|
||||
b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b')
|
||||
a2 = Series([0, 1, 2, 3, 4], name='a')
|
||||
df = concat([a1, b, a2], axis=1)
|
||||
|
||||
result = df.astype(str)
|
||||
a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a')
|
||||
b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str,
|
||||
name='b')
|
||||
a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a')
|
||||
expected = concat([a1_str, b_str, a2_str], axis=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.astype({'a': 'str'})
|
||||
expected = concat([a1_str, b, a2_str], axis=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('dtype', [
|
||||
'category',
|
||||
CategoricalDtype(),
|
||||
CategoricalDtype(ordered=True),
|
||||
CategoricalDtype(ordered=False),
|
||||
CategoricalDtype(categories=list('abcdef')),
|
||||
CategoricalDtype(categories=list('edba'), ordered=False),
|
||||
CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr)
|
||||
def test_astype_categorical(self, dtype):
|
||||
# GH 18099
|
||||
d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')}
|
||||
df = DataFrame(d)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("cls", [
|
||||
pd.api.types.CategoricalDtype,
|
||||
pd.api.types.DatetimeTZDtype,
|
||||
pd.api.types.IntervalDtype
|
||||
])
|
||||
def test_astype_categoricaldtype_class_raises(self, cls):
|
||||
df = DataFrame({"A": ['a', 'a', 'b', 'c']})
|
||||
xpr = "Expected an instance of {}".format(cls.__name__)
|
||||
with tm.assert_raises_regex(TypeError, xpr):
|
||||
df.astype({"A": cls})
|
||||
|
||||
with tm.assert_raises_regex(TypeError, xpr):
|
||||
df['A'].astype(cls)
|
||||
|
||||
@pytest.mark.parametrize('dtype', [
|
||||
{100: 'float64', 200: 'uint64'}, 'category', 'float64'])
|
||||
def test_astype_column_metadata(self, dtype):
|
||||
# GH 19920
|
||||
columns = pd.UInt64Index([100, 200, 300], name='foo')
|
||||
df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
|
||||
df = df.astype(dtype)
|
||||
tm.assert_index_equal(df.columns, columns)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["M8", "m8"])
|
||||
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
|
||||
def test_astype_from_datetimelike_to_objectt(self, dtype, unit):
|
||||
# tests astype to object dtype
|
||||
# gh-19223 / gh-12425
|
||||
dtype = "{}[{}]".format(dtype, unit)
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(object)
|
||||
assert (result.dtypes == object).all()
|
||||
|
||||
if dtype.startswith('M8'):
|
||||
assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit)
|
||||
else:
|
||||
assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit)
|
||||
|
||||
@pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
|
||||
@pytest.mark.parametrize("dtype", ["M8", "m8"])
|
||||
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
|
||||
def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
|
||||
# tests all units from numeric origination
|
||||
# gh-19223 / gh-12425
|
||||
dtype = "{}[{}]".format(dtype, unit)
|
||||
arr = np.array([[1, 2, 3]], dtype=arr_dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame(arr.astype(dtype))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
|
||||
def test_astype_to_datetime_unit(self, unit):
|
||||
# tests all units from datetime origination
|
||||
# gh-19223
|
||||
dtype = "M8[{}]".format(unit)
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame(arr.astype(dtype))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ['ns'])
|
||||
def test_astype_to_timedelta_unit_ns(self, unit):
|
||||
# preserver the timedelta conversion
|
||||
# gh-19223
|
||||
dtype = "m8[{}]".format(unit)
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame(arr.astype(dtype))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D'])
|
||||
def test_astype_to_timedelta_unit(self, unit):
|
||||
# coerce to float
|
||||
# gh-19223
|
||||
dtype = "m8[{}]".format(unit)
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame(df.values.astype(dtype).astype(float))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
|
||||
def test_astype_to_incorrect_datetimelike(self, unit):
|
||||
# trying to astype a m to a M, or vice-versa
|
||||
# gh-19224
|
||||
dtype = "M8[{}]".format(unit)
|
||||
other = "m8[{}]".format(unit)
|
||||
|
||||
df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
|
||||
with pytest.raises(TypeError):
|
||||
df.astype(other)
|
||||
|
||||
df = DataFrame(np.array([[1, 2, 3]], dtype=other))
|
||||
with pytest.raises(TypeError):
|
||||
df.astype(dtype)
|
||||
|
||||
def test_timedeltas(self):
|
||||
df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3,
|
||||
freq='D')),
|
||||
B=Series([timedelta(days=i) for i in range(3)])))
|
||||
result = df.get_dtype_counts().sort_index()
|
||||
expected = Series(
|
||||
{'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_index()
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df['C'] = df['A'] + df['B']
|
||||
expected = Series(
|
||||
{'datetime64[ns]': 2, 'timedelta64[ns]': 1}).sort_values()
|
||||
result = df.get_dtype_counts().sort_values()
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# mixed int types
|
||||
df['D'] = 1
|
||||
expected = Series({'datetime64[ns]': 2,
|
||||
'timedelta64[ns]': 1,
|
||||
'int64': 1}).sort_values()
|
||||
result = df.get_dtype_counts().sort_values()
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_arg_for_errors_in_astype(self):
|
||||
# issue #14878
|
||||
|
||||
df = DataFrame([1, 2, 3])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
df.astype(np.float64, errors=True)
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.astype(np.int8, raise_on_error=False)
|
||||
|
||||
df.astype(np.int8, errors='ignore')
|
||||
|
||||
@pytest.mark.parametrize('input_vals', [
|
||||
([1, 2]),
|
||||
(['1', '2']),
|
||||
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
|
||||
(list(pd.date_range('1/1/2011', periods=2, freq='H',
|
||||
tz='US/Eastern'))),
|
||||
([pd.Interval(left=0, right=5)]),
|
||||
])
|
||||
def test_constructor_list_str(self, input_vals, string_dtype):
|
||||
# GH 16605
|
||||
# Ensure that data elements are converted to strings when
|
||||
# dtype is str, 'str', or 'U'
|
||||
|
||||
result = DataFrame({'A': input_vals}, dtype=string_dtype)
|
||||
expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_constructor_list_str_na(self, string_dtype):
|
||||
|
||||
result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
|
||||
expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameDatetimeWithTZ(TestData):
|
||||
|
||||
def test_interleave(self):
|
||||
|
||||
# interleave with object
|
||||
result = self.tzframe.assign(D='foo').values
|
||||
expected = np.array([[Timestamp('2013-01-01 00:00:00'),
|
||||
Timestamp('2013-01-02 00:00:00'),
|
||||
Timestamp('2013-01-03 00:00:00')],
|
||||
[Timestamp('2013-01-01 00:00:00-0500',
|
||||
tz='US/Eastern'),
|
||||
pd.NaT,
|
||||
Timestamp('2013-01-03 00:00:00-0500',
|
||||
tz='US/Eastern')],
|
||||
[Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
|
||||
pd.NaT,
|
||||
Timestamp('2013-01-03 00:00:00+0100', tz='CET')],
|
||||
['foo', 'foo', 'foo']], dtype=object).T
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
# interleave with only datetime64[ns]
|
||||
result = self.tzframe.values
|
||||
expected = np.array([[Timestamp('2013-01-01 00:00:00'),
|
||||
Timestamp('2013-01-02 00:00:00'),
|
||||
Timestamp('2013-01-03 00:00:00')],
|
||||
[Timestamp('2013-01-01 00:00:00-0500',
|
||||
tz='US/Eastern'),
|
||||
pd.NaT,
|
||||
Timestamp('2013-01-03 00:00:00-0500',
|
||||
tz='US/Eastern')],
|
||||
[Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
|
||||
pd.NaT,
|
||||
Timestamp('2013-01-03 00:00:00+0100',
|
||||
tz='CET')]], dtype=object).T
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_astype(self):
|
||||
# astype
|
||||
expected = np.array([[Timestamp('2013-01-01 00:00:00'),
|
||||
Timestamp('2013-01-02 00:00:00'),
|
||||
Timestamp('2013-01-03 00:00:00')],
|
||||
[Timestamp('2013-01-01 00:00:00-0500',
|
||||
tz='US/Eastern'),
|
||||
pd.NaT,
|
||||
Timestamp('2013-01-03 00:00:00-0500',
|
||||
tz='US/Eastern')],
|
||||
[Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
|
||||
pd.NaT,
|
||||
Timestamp('2013-01-03 00:00:00+0100',
|
||||
tz='CET')]],
|
||||
dtype=object).T
|
||||
result = self.tzframe.astype(object)
|
||||
assert_frame_equal(result, DataFrame(
|
||||
expected, index=self.tzframe.index, columns=self.tzframe.columns))
|
||||
|
||||
result = self.tzframe.astype('datetime64[ns]')
|
||||
expected = DataFrame({'A': date_range('20130101', periods=3),
|
||||
'B': (date_range('20130101', periods=3,
|
||||
tz='US/Eastern')
|
||||
.tz_convert('UTC')
|
||||
.tz_localize(None)),
|
||||
'C': (date_range('20130101', periods=3,
|
||||
tz='CET')
|
||||
.tz_convert('UTC')
|
||||
.tz_localize(None))})
|
||||
expected.iloc[1, 1] = pd.NaT
|
||||
expected.iloc[1, 2] = pd.NaT
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_str(self):
|
||||
# str formatting
|
||||
result = self.tzframe.astype(str)
|
||||
expected = DataFrame([['2013-01-01', '2013-01-01 00:00:00-05:00',
|
||||
'2013-01-01 00:00:00+01:00'],
|
||||
['2013-01-02', 'NaT', 'NaT'],
|
||||
['2013-01-03', '2013-01-03 00:00:00-05:00',
|
||||
'2013-01-03 00:00:00+01:00']],
|
||||
columns=self.tzframe.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with option_context('display.max_columns', 20):
|
||||
result = str(self.tzframe)
|
||||
assert ('0 2013-01-01 2013-01-01 00:00:00-05:00 '
|
||||
'2013-01-01 00:00:00+01:00') in result
|
||||
assert ('1 2013-01-02 '
|
||||
'NaT NaT') in result
|
||||
assert ('2 2013-01-03 2013-01-03 00:00:00-05:00 '
|
||||
'2013-01-03 00:00:00+01:00') in result
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,184 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from pandas import DataFrame, Index, PeriodIndex
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def frame_with_period_index():
|
||||
return DataFrame(
|
||||
data=np.arange(20).reshape(4, 5),
|
||||
columns=list('abcde'),
|
||||
index=PeriodIndex(start='2000', freq='A', periods=4))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def frame():
|
||||
return TestData().frame
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def left():
|
||||
return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def right():
|
||||
return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"how, sort, expected",
|
||||
[('inner', False, DataFrame({'a': [20, 10],
|
||||
'b': [200, 100]},
|
||||
index=[2, 1])),
|
||||
('inner', True, DataFrame({'a': [10, 20],
|
||||
'b': [100, 200]},
|
||||
index=[1, 2])),
|
||||
('left', False, DataFrame({'a': [20, 10, 0],
|
||||
'b': [200, 100, np.nan]},
|
||||
index=[2, 1, 0])),
|
||||
('left', True, DataFrame({'a': [0, 10, 20],
|
||||
'b': [np.nan, 100, 200]},
|
||||
index=[0, 1, 2])),
|
||||
('right', False, DataFrame({'a': [np.nan, 10, 20],
|
||||
'b': [300, 100, 200]},
|
||||
index=[3, 1, 2])),
|
||||
('right', True, DataFrame({'a': [10, 20, np.nan],
|
||||
'b': [100, 200, 300]},
|
||||
index=[1, 2, 3])),
|
||||
('outer', False, DataFrame({'a': [0, 10, 20, np.nan],
|
||||
'b': [np.nan, 100, 200, 300]},
|
||||
index=[0, 1, 2, 3])),
|
||||
('outer', True, DataFrame({'a': [0, 10, 20, np.nan],
|
||||
'b': [np.nan, 100, 200, 300]},
|
||||
index=[0, 1, 2, 3]))])
|
||||
def test_join(left, right, how, sort, expected):
|
||||
|
||||
result = left.join(right, how=how, sort=sort)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_join_index(frame):
|
||||
# left / right
|
||||
|
||||
f = frame.loc[frame.index[:10], ['A', 'B']]
|
||||
f2 = frame.loc[frame.index[5:], ['C', 'D']].iloc[::-1]
|
||||
|
||||
joined = f.join(f2)
|
||||
tm.assert_index_equal(f.index, joined.index)
|
||||
expected_columns = Index(['A', 'B', 'C', 'D'])
|
||||
tm.assert_index_equal(joined.columns, expected_columns)
|
||||
|
||||
joined = f.join(f2, how='left')
|
||||
tm.assert_index_equal(joined.index, f.index)
|
||||
tm.assert_index_equal(joined.columns, expected_columns)
|
||||
|
||||
joined = f.join(f2, how='right')
|
||||
tm.assert_index_equal(joined.index, f2.index)
|
||||
tm.assert_index_equal(joined.columns, expected_columns)
|
||||
|
||||
# inner
|
||||
|
||||
joined = f.join(f2, how='inner')
|
||||
tm.assert_index_equal(joined.index, f.index[5:10])
|
||||
tm.assert_index_equal(joined.columns, expected_columns)
|
||||
|
||||
# outer
|
||||
|
||||
joined = f.join(f2, how='outer')
|
||||
tm.assert_index_equal(joined.index, frame.index.sort_values())
|
||||
tm.assert_index_equal(joined.columns, expected_columns)
|
||||
|
||||
tm.assert_raises_regex(
|
||||
ValueError, 'join method', f.join, f2, how='foo')
|
||||
|
||||
# corner case - overlapping columns
|
||||
for how in ('outer', 'left', 'inner'):
|
||||
with tm.assert_raises_regex(ValueError, 'columns overlap but '
|
||||
'no suffix'):
|
||||
frame.join(frame, how=how)
|
||||
|
||||
|
||||
def test_join_index_more(frame):
|
||||
af = frame.loc[:, ['A', 'B']]
|
||||
bf = frame.loc[::2, ['C', 'D']]
|
||||
|
||||
expected = af.copy()
|
||||
expected['C'] = frame['C'][::2]
|
||||
expected['D'] = frame['D'][::2]
|
||||
|
||||
result = af.join(bf)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = af.join(bf, how='right')
|
||||
tm.assert_frame_equal(result, expected[::2])
|
||||
|
||||
result = bf.join(af, how='right')
|
||||
tm.assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
|
||||
def test_join_index_series(frame):
|
||||
df = frame.copy()
|
||||
s = df.pop(frame.columns[-1])
|
||||
joined = df.join(s)
|
||||
|
||||
# TODO should this check_names ?
|
||||
tm.assert_frame_equal(joined, frame, check_names=False)
|
||||
|
||||
s.name = None
|
||||
tm.assert_raises_regex(ValueError, 'must have a name', df.join, s)
|
||||
|
||||
|
||||
def test_join_overlap(frame):
|
||||
df1 = frame.loc[:, ['A', 'B', 'C']]
|
||||
df2 = frame.loc[:, ['B', 'C', 'D']]
|
||||
|
||||
joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2')
|
||||
df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1')
|
||||
df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2')
|
||||
|
||||
no_overlap = frame.loc[:, ['A', 'D']]
|
||||
expected = df1_suf.join(df2_suf).join(no_overlap)
|
||||
|
||||
# column order not necessarily sorted
|
||||
tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
|
||||
|
||||
|
||||
def test_join_period_index(frame_with_period_index):
|
||||
other = frame_with_period_index.rename(
|
||||
columns=lambda x: '{key}{key}'.format(key=x))
|
||||
|
||||
joined_values = np.concatenate(
|
||||
[frame_with_period_index.values] * 2, axis=1)
|
||||
|
||||
joined_cols = frame_with_period_index.columns.append(other.columns)
|
||||
|
||||
joined = frame_with_period_index.join(other)
|
||||
expected = DataFrame(
|
||||
data=joined_values,
|
||||
columns=joined_cols,
|
||||
index=frame_with_period_index.index)
|
||||
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
|
||||
|
||||
def test_join_left_sequence_non_unique_index():
|
||||
# https://github.com/pandas-dev/pandas/issues/19607
|
||||
df1 = DataFrame({'a': [0, 10, 20]}, index=[1, 2, 3])
|
||||
df2 = DataFrame({'b': [100, 200, 300]}, index=[4, 3, 2])
|
||||
df3 = DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4])
|
||||
|
||||
joined = df1.join([df2, df3], how='left')
|
||||
|
||||
expected = DataFrame({
|
||||
'a': [0, 10, 10, 20],
|
||||
'b': [np.nan, 300, 300, 200],
|
||||
'c': [np.nan, 400, 500, np.nan]
|
||||
}, index=[1, 2, 2, 3])
|
||||
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
@@ -1,846 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
|
||||
from distutils.version import LooseVersion
|
||||
from numpy import nan, random
|
||||
import numpy as np
|
||||
|
||||
import datetime
|
||||
import dateutil
|
||||
|
||||
from pandas.compat import lrange
|
||||
from pandas import (DataFrame, Series, Timestamp,
|
||||
date_range, Categorical)
|
||||
import pandas as pd
|
||||
|
||||
from pandas.util.testing import assert_series_equal, assert_frame_equal
|
||||
|
||||
import pandas.util.testing as tm
|
||||
import pandas.util._test_decorators as td
|
||||
from pandas.tests.frame.common import TestData, _check_mixed_float
|
||||
|
||||
|
||||
try:
|
||||
import scipy
|
||||
_is_scipy_ge_0190 = (LooseVersion(scipy.__version__) >=
|
||||
LooseVersion('0.19.0'))
|
||||
except:
|
||||
_is_scipy_ge_0190 = False
|
||||
|
||||
|
||||
def _skip_if_no_pchip():
|
||||
try:
|
||||
from scipy.interpolate import pchip_interpolate # noqa
|
||||
except ImportError:
|
||||
import pytest
|
||||
pytest.skip('scipy.interpolate.pchip missing')
|
||||
|
||||
|
||||
class TestDataFrameMissingData(TestData):
|
||||
|
||||
def test_dropEmptyRows(self):
|
||||
N = len(self.frame.index)
|
||||
mat = random.randn(N)
|
||||
mat[:5] = nan
|
||||
|
||||
frame = DataFrame({'foo': mat}, index=self.frame.index)
|
||||
original = Series(mat, index=self.frame.index, name='foo')
|
||||
expected = original.dropna()
|
||||
inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
|
||||
|
||||
smaller_frame = frame.dropna(how='all')
|
||||
# check that original was preserved
|
||||
assert_series_equal(frame['foo'], original)
|
||||
inplace_frame1.dropna(how='all', inplace=True)
|
||||
assert_series_equal(smaller_frame['foo'], expected)
|
||||
assert_series_equal(inplace_frame1['foo'], expected)
|
||||
|
||||
smaller_frame = frame.dropna(how='all', subset=['foo'])
|
||||
inplace_frame2.dropna(how='all', subset=['foo'], inplace=True)
|
||||
assert_series_equal(smaller_frame['foo'], expected)
|
||||
assert_series_equal(inplace_frame2['foo'], expected)
|
||||
|
||||
def test_dropIncompleteRows(self):
|
||||
N = len(self.frame.index)
|
||||
mat = random.randn(N)
|
||||
mat[:5] = nan
|
||||
|
||||
frame = DataFrame({'foo': mat}, index=self.frame.index)
|
||||
frame['bar'] = 5
|
||||
original = Series(mat, index=self.frame.index, name='foo')
|
||||
inp_frame1, inp_frame2 = frame.copy(), frame.copy()
|
||||
|
||||
smaller_frame = frame.dropna()
|
||||
assert_series_equal(frame['foo'], original)
|
||||
inp_frame1.dropna(inplace=True)
|
||||
|
||||
exp = Series(mat[5:], index=self.frame.index[5:], name='foo')
|
||||
tm.assert_series_equal(smaller_frame['foo'], exp)
|
||||
tm.assert_series_equal(inp_frame1['foo'], exp)
|
||||
|
||||
samesize_frame = frame.dropna(subset=['bar'])
|
||||
assert_series_equal(frame['foo'], original)
|
||||
assert (frame['bar'] == 5).all()
|
||||
inp_frame2.dropna(subset=['bar'], inplace=True)
|
||||
tm.assert_index_equal(samesize_frame.index, self.frame.index)
|
||||
tm.assert_index_equal(inp_frame2.index, self.frame.index)
|
||||
|
||||
def test_dropna(self):
|
||||
df = DataFrame(np.random.randn(6, 4))
|
||||
df[2][:2] = nan
|
||||
|
||||
dropped = df.dropna(axis=1)
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=1, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
dropped = df.dropna(axis=0)
|
||||
expected = df.loc[lrange(2, 6)]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=0, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
# threshold
|
||||
dropped = df.dropna(axis=1, thresh=5)
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=1, thresh=5, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
dropped = df.dropna(axis=0, thresh=4)
|
||||
expected = df.loc[lrange(2, 6)]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=0, thresh=4, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
dropped = df.dropna(axis=1, thresh=4)
|
||||
assert_frame_equal(dropped, df)
|
||||
|
||||
dropped = df.dropna(axis=1, thresh=3)
|
||||
assert_frame_equal(dropped, df)
|
||||
|
||||
# subset
|
||||
dropped = df.dropna(axis=0, subset=[0, 1, 3])
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
|
||||
assert_frame_equal(dropped, df)
|
||||
assert_frame_equal(inp, df)
|
||||
|
||||
# all
|
||||
dropped = df.dropna(axis=1, how='all')
|
||||
assert_frame_equal(dropped, df)
|
||||
|
||||
df[2] = nan
|
||||
dropped = df.dropna(axis=1, how='all')
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
assert_frame_equal(dropped, expected)
|
||||
|
||||
# bad input
|
||||
pytest.raises(ValueError, df.dropna, axis=3)
|
||||
|
||||
def test_drop_and_dropna_caching(self):
|
||||
# tst that cacher updates
|
||||
original = Series([1, 2, np.nan], name='A')
|
||||
expected = Series([1, 2], dtype=original.dtype, name='A')
|
||||
df = pd.DataFrame({'A': original.values.copy()})
|
||||
df2 = df.copy()
|
||||
df['A'].dropna()
|
||||
assert_series_equal(df['A'], original)
|
||||
df['A'].dropna(inplace=True)
|
||||
assert_series_equal(df['A'], expected)
|
||||
df2['A'].drop([1])
|
||||
assert_series_equal(df2['A'], original)
|
||||
df2['A'].drop([1], inplace=True)
|
||||
assert_series_equal(df2['A'], original.drop([1]))
|
||||
|
||||
def test_dropna_corner(self):
|
||||
# bad input
|
||||
pytest.raises(ValueError, self.frame.dropna, how='foo')
|
||||
pytest.raises(TypeError, self.frame.dropna, how=None)
|
||||
# non-existent column - 8303
|
||||
pytest.raises(KeyError, self.frame.dropna, subset=['A', 'X'])
|
||||
|
||||
def test_dropna_multiple_axes(self):
|
||||
df = DataFrame([[1, np.nan, 2, 3],
|
||||
[4, np.nan, 5, 6],
|
||||
[np.nan, np.nan, np.nan, np.nan],
|
||||
[7, np.nan, 8, 9]])
|
||||
cp = df.copy()
|
||||
|
||||
# GH20987
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
result = df.dropna(how='all', axis=[0, 1])
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
result2 = df.dropna(how='all', axis=(0, 1))
|
||||
expected = df.dropna(how='all').dropna(how='all', axis=1)
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
assert_frame_equal(result2, expected)
|
||||
assert_frame_equal(df, cp)
|
||||
|
||||
inp = df.copy()
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
inp.dropna(how='all', axis=(0, 1), inplace=True)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
def test_dropna_tz_aware_datetime(self):
|
||||
# GH13407
|
||||
df = DataFrame()
|
||||
dt1 = datetime.datetime(2015, 1, 1,
|
||||
tzinfo=dateutil.tz.tzutc())
|
||||
dt2 = datetime.datetime(2015, 2, 2,
|
||||
tzinfo=dateutil.tz.tzutc())
|
||||
df['Time'] = [dt1]
|
||||
result = df.dropna(axis=0)
|
||||
expected = DataFrame({'Time': [dt1]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Ex2
|
||||
df = DataFrame({'Time': [dt1, None, np.nan, dt2]})
|
||||
result = df.dropna(axis=0)
|
||||
expected = DataFrame([dt1, dt2],
|
||||
columns=['Time'],
|
||||
index=[0, 3])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna(self):
|
||||
tf = self.tsframe
|
||||
tf.loc[tf.index[:5], 'A'] = nan
|
||||
tf.loc[tf.index[-5:], 'A'] = nan
|
||||
|
||||
zero_filled = self.tsframe.fillna(0)
|
||||
assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all()
|
||||
|
||||
padded = self.tsframe.fillna(method='pad')
|
||||
assert np.isnan(padded.loc[padded.index[:5], 'A']).all()
|
||||
assert (padded.loc[padded.index[-5:], 'A'] ==
|
||||
padded.loc[padded.index[-5], 'A']).all()
|
||||
|
||||
# mixed type
|
||||
mf = self.mixed_frame
|
||||
mf.loc[mf.index[5:20], 'foo'] = nan
|
||||
mf.loc[mf.index[-10:], 'A'] = nan
|
||||
result = self.mixed_frame.fillna(value=0)
|
||||
result = self.mixed_frame.fillna(method='pad')
|
||||
|
||||
pytest.raises(ValueError, self.tsframe.fillna)
|
||||
pytest.raises(ValueError, self.tsframe.fillna, 5, method='ffill')
|
||||
|
||||
# mixed numeric (but no float16)
|
||||
mf = self.mixed_float.reindex(columns=['A', 'B', 'D'])
|
||||
mf.loc[mf.index[-10:], 'A'] = nan
|
||||
result = mf.fillna(value=0)
|
||||
_check_mixed_float(result, dtype=dict(C=None))
|
||||
|
||||
result = mf.fillna(method='pad')
|
||||
_check_mixed_float(result, dtype=dict(C=None))
|
||||
|
||||
# empty frame (GH #2778)
|
||||
df = DataFrame(columns=['x'])
|
||||
for m in ['pad', 'backfill']:
|
||||
df.x.fillna(method=m, inplace=True)
|
||||
df.x.fillna(method=m)
|
||||
|
||||
# with different dtype (GH3386)
|
||||
df = DataFrame([['a', 'a', np.nan, 'a'], [
|
||||
'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']])
|
||||
|
||||
result = df.fillna({2: 'foo'})
|
||||
expected = DataFrame([['a', 'a', 'foo', 'a'],
|
||||
['b', 'b', 'foo', 'b'],
|
||||
['c', 'c', 'foo', 'c']])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df.fillna({2: 'foo'}, inplace=True)
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
# limit and value
|
||||
df = DataFrame(np.random.randn(10, 3))
|
||||
df.iloc[2:7, 0] = np.nan
|
||||
df.iloc[3:5, 2] = np.nan
|
||||
|
||||
expected = df.copy()
|
||||
expected.iloc[2, 0] = 999
|
||||
expected.iloc[3, 2] = 999
|
||||
result = df.fillna(999, limit=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# with datelike
|
||||
# GH 6344
|
||||
df = DataFrame({
|
||||
'Date': [pd.NaT, Timestamp("2014-1-1")],
|
||||
'Date2': [Timestamp("2013-1-1"), pd.NaT]
|
||||
})
|
||||
|
||||
expected = df.copy()
|
||||
expected['Date'] = expected['Date'].fillna(
|
||||
df.loc[df.index[0], 'Date2'])
|
||||
result = df.fillna(value={'Date': df['Date2']})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# with timezone
|
||||
# GH 15855
|
||||
df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
|
||||
pd.NaT]})
|
||||
exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
|
||||
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
|
||||
assert_frame_equal(df.fillna(method='pad'), exp)
|
||||
|
||||
df = pd.DataFrame({'A': [pd.NaT,
|
||||
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
|
||||
exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
|
||||
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
|
||||
assert_frame_equal(df.fillna(method='bfill'), exp)
|
||||
|
||||
def test_na_actions_categorical(self):
|
||||
|
||||
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
|
||||
vals = ["a", "b", np.nan, "d"]
|
||||
df = DataFrame({"cats": cat, "vals": vals})
|
||||
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
|
||||
vals2 = ["a", "b", "b", "d"]
|
||||
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
|
||||
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
|
||||
vals3 = ["a", "b", np.nan]
|
||||
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
|
||||
cat4 = Categorical([1, 2], categories=[1, 2, 3])
|
||||
vals4 = ["a", "b"]
|
||||
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
|
||||
|
||||
# fillna
|
||||
res = df.fillna(value={"cats": 3, "vals": "b"})
|
||||
tm.assert_frame_equal(res, df_exp_fill)
|
||||
|
||||
with tm.assert_raises_regex(ValueError, "fill value must be "
|
||||
"in categories"):
|
||||
df.fillna(value={"cats": 4, "vals": "c"})
|
||||
|
||||
res = df.fillna(method='pad')
|
||||
tm.assert_frame_equal(res, df_exp_fill)
|
||||
|
||||
# dropna
|
||||
res = df.dropna(subset=["cats"])
|
||||
tm.assert_frame_equal(res, df_exp_drop_cats)
|
||||
|
||||
res = df.dropna()
|
||||
tm.assert_frame_equal(res, df_exp_drop_all)
|
||||
|
||||
# make sure that fillna takes missing values into account
|
||||
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
|
||||
df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
|
||||
|
||||
cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
|
||||
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
|
||||
|
||||
res = df.fillna("a")
|
||||
tm.assert_frame_equal(res, df_exp)
|
||||
|
||||
def test_fillna_categorical_nan(self):
|
||||
# GH 14021
|
||||
# np.nan should always be a valid filler
|
||||
cat = Categorical([np.nan, 2, np.nan])
|
||||
val = Categorical([np.nan, np.nan, np.nan])
|
||||
df = DataFrame({"cats": cat, "vals": val})
|
||||
res = df.fillna(df.median())
|
||||
v_exp = [np.nan, np.nan, np.nan]
|
||||
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
|
||||
dtype='category')
|
||||
tm.assert_frame_equal(res, df_exp)
|
||||
|
||||
result = df.cats.fillna(np.nan)
|
||||
tm.assert_series_equal(result, df.cats)
|
||||
result = df.vals.fillna(np.nan)
|
||||
tm.assert_series_equal(result, df.vals)
|
||||
|
||||
idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
|
||||
'2011-01-01 09:00', pd.NaT, pd.NaT])
|
||||
df = DataFrame({'a': Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||||
|
||||
idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
|
||||
pd.NaT, pd.NaT], freq='M')
|
||||
df = DataFrame({'a': Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||||
|
||||
idx = pd.TimedeltaIndex(['1 days', '2 days',
|
||||
'1 days', pd.NaT, pd.NaT])
|
||||
df = DataFrame({'a': Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||||
|
||||
def test_fillna_downcast(self):
|
||||
# GH 15277
|
||||
# infer int64 from float64
|
||||
df = pd.DataFrame({'a': [1., np.nan]})
|
||||
result = df.fillna(0, downcast='infer')
|
||||
expected = pd.DataFrame({'a': [1, 0]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# infer int64 from float64 when fillna value is a dict
|
||||
df = pd.DataFrame({'a': [1., np.nan]})
|
||||
result = df.fillna({'a': 0}, downcast='infer')
|
||||
expected = pd.DataFrame({'a': [1, 0]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_dtype_conversion(self):
|
||||
# make sure that fillna on an empty frame works
|
||||
df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
|
||||
result = df.get_dtype_counts().sort_values()
|
||||
expected = Series({'object': 5})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df.fillna(1)
|
||||
expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
|
||||
result = result.get_dtype_counts().sort_values()
|
||||
expected = Series({'int64': 5})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# empty block
|
||||
df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
|
||||
result = df.fillna('nan')
|
||||
expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# equiv of replace
|
||||
df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
|
||||
for v in ['', 1, np.nan, 1.0]:
|
||||
expected = df.replace(np.nan, v)
|
||||
result = df.fillna(v)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_datetime_columns(self):
|
||||
# GH 7095
|
||||
df = pd.DataFrame({'A': [-1, -2, np.nan],
|
||||
'B': date_range('20130101', periods=3),
|
||||
'C': ['foo', 'bar', None],
|
||||
'D': ['foo2', 'bar2', None]},
|
||||
index=date_range('20130110', periods=3))
|
||||
result = df.fillna('?')
|
||||
expected = pd.DataFrame({'A': [-1, -2, '?'],
|
||||
'B': date_range('20130101', periods=3),
|
||||
'C': ['foo', 'bar', '?'],
|
||||
'D': ['foo2', 'bar2', '?']},
|
||||
index=date_range('20130110', periods=3))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = pd.DataFrame({'A': [-1, -2, np.nan],
|
||||
'B': [pd.Timestamp('2013-01-01'),
|
||||
pd.Timestamp('2013-01-02'), pd.NaT],
|
||||
'C': ['foo', 'bar', None],
|
||||
'D': ['foo2', 'bar2', None]},
|
||||
index=date_range('20130110', periods=3))
|
||||
result = df.fillna('?')
|
||||
expected = pd.DataFrame({'A': [-1, -2, '?'],
|
||||
'B': [pd.Timestamp('2013-01-01'),
|
||||
pd.Timestamp('2013-01-02'), '?'],
|
||||
'C': ['foo', 'bar', '?'],
|
||||
'D': ['foo2', 'bar2', '?']},
|
||||
index=pd.date_range('20130110', periods=3))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_ffill(self):
|
||||
self.tsframe['A'][:5] = nan
|
||||
self.tsframe['A'][-5:] = nan
|
||||
|
||||
assert_frame_equal(self.tsframe.ffill(),
|
||||
self.tsframe.fillna(method='ffill'))
|
||||
|
||||
def test_bfill(self):
|
||||
self.tsframe['A'][:5] = nan
|
||||
self.tsframe['A'][-5:] = nan
|
||||
|
||||
assert_frame_equal(self.tsframe.bfill(),
|
||||
self.tsframe.fillna(method='bfill'))
|
||||
|
||||
def test_frame_pad_backfill_limit(self):
|
||||
index = np.arange(10)
|
||||
df = DataFrame(np.random.randn(10, 4), index=index)
|
||||
|
||||
result = df[:2].reindex(index, method='pad', limit=5)
|
||||
|
||||
expected = df[:2].reindex(index).fillna(method='pad')
|
||||
expected.values[-3:] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df[-2:].reindex(index, method='backfill', limit=5)
|
||||
|
||||
expected = df[-2:].reindex(index).fillna(method='backfill')
|
||||
expected.values[:3] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_frame_fillna_limit(self):
|
||||
index = np.arange(10)
|
||||
df = DataFrame(np.random.randn(10, 4), index=index)
|
||||
|
||||
result = df[:2].reindex(index)
|
||||
result = result.fillna(method='pad', limit=5)
|
||||
|
||||
expected = df[:2].reindex(index).fillna(method='pad')
|
||||
expected.values[-3:] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df[-2:].reindex(index)
|
||||
result = result.fillna(method='backfill', limit=5)
|
||||
|
||||
expected = df[-2:].reindex(index).fillna(method='backfill')
|
||||
expected.values[:3] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_skip_certain_blocks(self):
|
||||
# don't try to fill boolean, int blocks
|
||||
|
||||
df = DataFrame(np.random.randn(10, 4).astype(int))
|
||||
|
||||
# it works!
|
||||
df.fillna(np.nan)
|
||||
|
||||
def test_fillna_inplace(self):
|
||||
df = DataFrame(np.random.randn(10, 4))
|
||||
df[1][:4] = np.nan
|
||||
df[3][-4:] = np.nan
|
||||
|
||||
expected = df.fillna(value=0)
|
||||
assert expected is not df
|
||||
|
||||
df.fillna(value=0, inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
expected = df.fillna(value={0: 0}, inplace=True)
|
||||
assert expected is None
|
||||
|
||||
df[1][:4] = np.nan
|
||||
df[3][-4:] = np.nan
|
||||
expected = df.fillna(method='ffill')
|
||||
assert expected is not df
|
||||
|
||||
df.fillna(method='ffill', inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_fillna_dict_series(self):
|
||||
df = DataFrame({'a': [nan, 1, 2, nan, nan],
|
||||
'b': [1, 2, 3, nan, nan],
|
||||
'c': [nan, 1, 2, 3, 4]})
|
||||
|
||||
result = df.fillna({'a': 0, 'b': 5})
|
||||
|
||||
expected = df.copy()
|
||||
expected['a'] = expected['a'].fillna(0)
|
||||
expected['b'] = expected['b'].fillna(5)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# it works
|
||||
result = df.fillna({'a': 0, 'b': 5, 'd': 7})
|
||||
|
||||
# Series treated same as dict
|
||||
result = df.fillna(df.max())
|
||||
expected = df.fillna(df.max().to_dict())
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# disable this for now
|
||||
with tm.assert_raises_regex(NotImplementedError,
|
||||
'column by column'):
|
||||
df.fillna(df.max(1), axis=1)
|
||||
|
||||
def test_fillna_dataframe(self):
|
||||
# GH 8377
|
||||
df = DataFrame({'a': [nan, 1, 2, nan, nan],
|
||||
'b': [1, 2, 3, nan, nan],
|
||||
'c': [nan, 1, 2, 3, 4]},
|
||||
index=list('VWXYZ'))
|
||||
|
||||
# df2 may have different index and columns
|
||||
df2 = DataFrame({'a': [nan, 10, 20, 30, 40],
|
||||
'b': [50, 60, 70, 80, 90],
|
||||
'foo': ['bar'] * 5},
|
||||
index=list('VWXuZ'))
|
||||
|
||||
result = df.fillna(df2)
|
||||
|
||||
# only those columns and indices which are shared get filled
|
||||
expected = DataFrame({'a': [nan, 1, 2, nan, 40],
|
||||
'b': [1, 2, 3, nan, 90],
|
||||
'c': [nan, 1, 2, 3, 4]},
|
||||
index=list('VWXYZ'))
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_columns(self):
|
||||
df = DataFrame(np.random.randn(10, 10))
|
||||
df.values[:, ::2] = np.nan
|
||||
|
||||
result = df.fillna(method='ffill', axis=1)
|
||||
expected = df.T.fillna(method='pad').T
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df.insert(6, 'foo', 5)
|
||||
result = df.fillna(method='ffill', axis=1)
|
||||
expected = df.astype(float).fillna(method='ffill', axis=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_invalid_method(self):
|
||||
with tm.assert_raises_regex(ValueError, 'ffil'):
|
||||
self.frame.fillna(method='ffil')
|
||||
|
||||
def test_fillna_invalid_value(self):
|
||||
# list
|
||||
pytest.raises(TypeError, self.frame.fillna, [1, 2])
|
||||
# tuple
|
||||
pytest.raises(TypeError, self.frame.fillna, (1, 2))
|
||||
# frame with series
|
||||
pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame)
|
||||
|
||||
def test_fillna_col_reordering(self):
|
||||
cols = ["COL." + str(i) for i in range(5, 0, -1)]
|
||||
data = np.random.rand(20, 5)
|
||||
df = DataFrame(index=lrange(20), columns=cols, data=data)
|
||||
filled = df.fillna(method='ffill')
|
||||
assert df.columns.tolist() == filled.columns.tolist()
|
||||
|
||||
def test_fill_corner(self):
|
||||
mf = self.mixed_frame
|
||||
mf.loc[mf.index[5:20], 'foo'] = nan
|
||||
mf.loc[mf.index[-10:], 'A'] = nan
|
||||
|
||||
filled = self.mixed_frame.fillna(value=0)
|
||||
assert (filled.loc[filled.index[5:20], 'foo'] == 0).all()
|
||||
del self.mixed_frame['foo']
|
||||
|
||||
empty_float = self.frame.reindex(columns=[])
|
||||
|
||||
# TODO(wesm): unused?
|
||||
result = empty_float.fillna(value=0) # noqa
|
||||
|
||||
def test_fill_value_when_combine_const(self):
|
||||
# GH12723
|
||||
dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
|
||||
df = DataFrame({'foo': dat}, index=range(6))
|
||||
|
||||
exp = df.fillna(0).add(2)
|
||||
res = df.add(2, fill_value=0)
|
||||
assert_frame_equal(res, exp)
|
||||
|
||||
|
||||
class TestDataFrameInterpolate(TestData):
|
||||
|
||||
def test_interp_basic(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||||
'B': [1, 4, 9, np.nan],
|
||||
'C': [1, 2, 3, 5],
|
||||
'D': list('abcd')})
|
||||
expected = DataFrame({'A': [1., 2., 3., 4.],
|
||||
'B': [1., 4., 9., 9.],
|
||||
'C': [1, 2, 3, 5],
|
||||
'D': list('abcd')})
|
||||
result = df.interpolate()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.set_index('C').interpolate()
|
||||
expected = df.set_index('C')
|
||||
expected.loc[3, 'A'] = 3
|
||||
expected.loc[5, 'B'] = 9
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_bad_method(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||||
'B': [1, 4, 9, np.nan],
|
||||
'C': [1, 2, 3, 5],
|
||||
'D': list('abcd')})
|
||||
with pytest.raises(ValueError):
|
||||
df.interpolate(method='not_a_method')
|
||||
|
||||
def test_interp_combo(self):
|
||||
df = DataFrame({'A': [1., 2., np.nan, 4.],
|
||||
'B': [1, 4, 9, np.nan],
|
||||
'C': [1, 2, 3, 5],
|
||||
'D': list('abcd')})
|
||||
|
||||
result = df['A'].interpolate()
|
||||
expected = Series([1., 2., 3., 4.], name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df['A'].interpolate(downcast='infer')
|
||||
expected = Series([1, 2, 3, 4], name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_interp_nan_idx(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
|
||||
df = df.set_index('A')
|
||||
with pytest.raises(NotImplementedError):
|
||||
df.interpolate(method='values')
|
||||
|
||||
@td.skip_if_no_scipy
|
||||
def test_interp_various(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
|
||||
'C': [1, 2, 3, 5, 8, 13, 21]})
|
||||
df = df.set_index('C')
|
||||
expected = df.copy()
|
||||
result = df.interpolate(method='polynomial', order=1)
|
||||
|
||||
expected.A.loc[3] = 2.66666667
|
||||
expected.A.loc[13] = 5.76923076
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method='cubic')
|
||||
# GH #15662.
|
||||
# new cubic and quadratic interpolation algorithms from scipy 0.19.0.
|
||||
# previously `splmake` was used. See scipy/scipy#6710
|
||||
if _is_scipy_ge_0190:
|
||||
expected.A.loc[3] = 2.81547781
|
||||
expected.A.loc[13] = 5.52964175
|
||||
else:
|
||||
expected.A.loc[3] = 2.81621174
|
||||
expected.A.loc[13] = 5.64146581
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method='nearest')
|
||||
expected.A.loc[3] = 2
|
||||
expected.A.loc[13] = 5
|
||||
assert_frame_equal(result, expected, check_dtype=False)
|
||||
|
||||
result = df.interpolate(method='quadratic')
|
||||
if _is_scipy_ge_0190:
|
||||
expected.A.loc[3] = 2.82150771
|
||||
expected.A.loc[13] = 6.12648668
|
||||
else:
|
||||
expected.A.loc[3] = 2.82533638
|
||||
expected.A.loc[13] = 6.02817974
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method='slinear')
|
||||
expected.A.loc[3] = 2.66666667
|
||||
expected.A.loc[13] = 5.76923077
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method='zero')
|
||||
expected.A.loc[3] = 2.
|
||||
expected.A.loc[13] = 5
|
||||
assert_frame_equal(result, expected, check_dtype=False)
|
||||
|
||||
@td.skip_if_no_scipy
|
||||
def test_interp_alt_scipy(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
|
||||
'C': [1, 2, 3, 5, 8, 13, 21]})
|
||||
result = df.interpolate(method='barycentric')
|
||||
expected = df.copy()
|
||||
expected.loc[2, 'A'] = 3
|
||||
expected.loc[5, 'A'] = 6
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method='barycentric', downcast='infer')
|
||||
assert_frame_equal(result, expected.astype(np.int64))
|
||||
|
||||
result = df.interpolate(method='krogh')
|
||||
expectedk = df.copy()
|
||||
expectedk['A'] = expected['A']
|
||||
assert_frame_equal(result, expectedk)
|
||||
|
||||
_skip_if_no_pchip()
|
||||
import scipy
|
||||
result = df.interpolate(method='pchip')
|
||||
expected.loc[2, 'A'] = 3
|
||||
|
||||
if LooseVersion(scipy.__version__) >= LooseVersion('0.17.0'):
|
||||
expected.loc[5, 'A'] = 6.0
|
||||
else:
|
||||
expected.loc[5, 'A'] = 6.125
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_rowwise(self):
|
||||
df = DataFrame({0: [1, 2, np.nan, 4],
|
||||
1: [2, 3, 4, np.nan],
|
||||
2: [np.nan, 4, 5, 6],
|
||||
3: [4, np.nan, 6, 7],
|
||||
4: [1, 2, 3, 4]})
|
||||
result = df.interpolate(axis=1)
|
||||
expected = df.copy()
|
||||
expected.loc[3, 1] = 5
|
||||
expected.loc[0, 2] = 3
|
||||
expected.loc[1, 3] = 3
|
||||
expected[4] = expected[4].astype(np.float64)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(axis=1, method='values')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(axis=0)
|
||||
expected = df.interpolate()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_rowwise_alt(self):
|
||||
df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64],
|
||||
1: [1, 2, 3, 4, 3, 2, 1, 0, -1]})
|
||||
df.interpolate(axis=0)
|
||||
|
||||
@pytest.mark.parametrize("check_scipy", [
|
||||
False, pytest.param(True, marks=td.skip_if_no_scipy)
|
||||
])
|
||||
def test_interp_leading_nans(self, check_scipy):
|
||||
df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
|
||||
"B": [np.nan, -3, -3.5, np.nan, -4]})
|
||||
result = df.interpolate()
|
||||
expected = df.copy()
|
||||
expected['B'].loc[3] = -3.75
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
if check_scipy:
|
||||
result = df.interpolate(method='polynomial', order=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_raise_on_only_mixed(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||||
'B': ['a', 'b', 'c', 'd'],
|
||||
'C': [np.nan, 2, 5, 7],
|
||||
'D': [np.nan, np.nan, 9, 9],
|
||||
'E': [1, 2, 3, 4]})
|
||||
with pytest.raises(TypeError):
|
||||
df.interpolate(axis=1)
|
||||
|
||||
def test_interp_inplace(self):
|
||||
df = DataFrame({'a': [1., 2., np.nan, 4.]})
|
||||
expected = DataFrame({'a': [1., 2., 3., 4.]})
|
||||
result = df.copy()
|
||||
result['a'].interpolate(inplace=True)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.copy()
|
||||
result['a'].interpolate(inplace=True, downcast='infer')
|
||||
assert_frame_equal(result, expected.astype('int64'))
|
||||
|
||||
def test_interp_inplace_row(self):
|
||||
# GH 10395
|
||||
result = DataFrame({'a': [1., 2., 3., 4.],
|
||||
'b': [np.nan, 2., 3., 4.],
|
||||
'c': [3, 2, 2, 2]})
|
||||
expected = result.interpolate(method='linear', axis=1, inplace=False)
|
||||
result.interpolate(method='linear', axis=1, inplace=True)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_ignore_all_good(self):
|
||||
# GH
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||||
'B': [1, 2, 3, 4],
|
||||
'C': [1., 2., np.nan, 4.],
|
||||
'D': [1., 2., 3., 4.]})
|
||||
expected = DataFrame({'A': np.array(
|
||||
[1, 2, 3, 4], dtype='float64'),
|
||||
'B': np.array(
|
||||
[1, 2, 3, 4], dtype='int64'),
|
||||
'C': np.array(
|
||||
[1., 2., 3, 4.], dtype='float64'),
|
||||
'D': np.array(
|
||||
[1., 2., 3., 4.], dtype='float64')})
|
||||
|
||||
result = df.interpolate(downcast=None)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# all good
|
||||
result = df[['B', 'D']].interpolate(downcast=None)
|
||||
assert_frame_equal(result, df[['B', 'D']])
|
||||
@@ -1,283 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
import pytest
|
||||
from pandas.compat import range, lrange
|
||||
import numpy as np
|
||||
from pandas.compat import PY36
|
||||
|
||||
from pandas import DataFrame, Series, Index, MultiIndex
|
||||
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
|
||||
# Column add, remove, delete.
|
||||
|
||||
|
||||
class TestDataFrameMutateColumns(TestData):
|
||||
|
||||
def test_assign(self):
|
||||
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
||||
original = df.copy()
|
||||
result = df.assign(C=df.B / df.A)
|
||||
expected = df.copy()
|
||||
expected['C'] = [4, 2.5, 2]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# lambda syntax
|
||||
result = df.assign(C=lambda x: x.B / x.A)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# original is unmodified
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
# Non-Series array-like
|
||||
result = df.assign(C=[4, 2.5, 2])
|
||||
assert_frame_equal(result, expected)
|
||||
# original is unmodified
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
result = df.assign(B=df.B / df.A)
|
||||
expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# overwrite
|
||||
result = df.assign(A=df.A + df.B)
|
||||
expected = df.copy()
|
||||
expected['A'] = [5, 7, 9]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# lambda
|
||||
result = df.assign(A=lambda x: x.A + x.B)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_multiple(self):
|
||||
df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B'])
|
||||
result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
|
||||
expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5],
|
||||
[3, 6, 9, 3, 6]], columns=list('ABCDE'))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_order(self):
|
||||
# GH 9818
|
||||
df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
|
||||
result = df.assign(D=df.A + df.B, C=df.A - df.B)
|
||||
|
||||
if PY36:
|
||||
expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]],
|
||||
columns=list('ABDC'))
|
||||
else:
|
||||
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
|
||||
columns=list('ABCD'))
|
||||
assert_frame_equal(result, expected)
|
||||
result = df.assign(C=df.A - df.B, D=df.A + df.B)
|
||||
|
||||
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
|
||||
columns=list('ABCD'))
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_bad(self):
|
||||
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
||||
|
||||
# non-keyword argument
|
||||
with pytest.raises(TypeError):
|
||||
df.assign(lambda x: x.A)
|
||||
with pytest.raises(AttributeError):
|
||||
df.assign(C=df.A, D=df.A + df.C)
|
||||
|
||||
@pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python
|
||||
3.6 and above""")
|
||||
def test_assign_dependent_old_python(self):
|
||||
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
||||
|
||||
# Key C does not exist at definition time of df
|
||||
with pytest.raises(KeyError):
|
||||
df.assign(C=lambda df: df.A,
|
||||
D=lambda df: df['A'] + df['C'])
|
||||
with pytest.raises(KeyError):
|
||||
df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
|
||||
|
||||
@pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for
|
||||
python 3.5 and below""")
|
||||
def test_assign_dependent(self):
|
||||
df = DataFrame({'A': [1, 2], 'B': [3, 4]})
|
||||
|
||||
result = df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
|
||||
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
|
||||
columns=list('ABCD'))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.assign(C=lambda df: df.A,
|
||||
D=lambda df: df['A'] + df['C'])
|
||||
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
|
||||
columns=list('ABCD'))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_insert_error_msmgs(self):
|
||||
|
||||
# GH 7432
|
||||
df = DataFrame({'foo': ['a', 'b', 'c'], 'bar': [
|
||||
1, 2, 3], 'baz': ['d', 'e', 'f']}).set_index('foo')
|
||||
s = DataFrame({'foo': ['a', 'b', 'c', 'a'], 'fiz': [
|
||||
'g', 'h', 'i', 'j']}).set_index('foo')
|
||||
msg = 'cannot reindex from a duplicate axis'
|
||||
with tm.assert_raises_regex(ValueError, msg):
|
||||
df['newcol'] = s
|
||||
|
||||
# GH 4107, more descriptive error message
|
||||
df = DataFrame(np.random.randint(0, 2, (4, 4)),
|
||||
columns=['a', 'b', 'c', 'd'])
|
||||
|
||||
msg = 'incompatible index of inserted column with frame index'
|
||||
with tm.assert_raises_regex(TypeError, msg):
|
||||
df['gr'] = df.groupby(['b', 'c']).count()
|
||||
|
||||
def test_insert_benchmark(self):
|
||||
# from the vb_suite/frame_methods/frame_insert_columns
|
||||
N = 10
|
||||
K = 5
|
||||
df = DataFrame(index=lrange(N))
|
||||
new_col = np.random.randn(N)
|
||||
for i in range(K):
|
||||
df[i] = new_col
|
||||
expected = DataFrame(np.repeat(new_col, K).reshape(N, K),
|
||||
index=lrange(N))
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_insert(self):
|
||||
df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
|
||||
columns=['c', 'b', 'a'])
|
||||
|
||||
df.insert(0, 'foo', df['a'])
|
||||
tm.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a']))
|
||||
tm.assert_series_equal(df['a'], df['foo'], check_names=False)
|
||||
|
||||
df.insert(2, 'bar', df['c'])
|
||||
tm.assert_index_equal(df.columns,
|
||||
Index(['foo', 'c', 'bar', 'b', 'a']))
|
||||
tm.assert_almost_equal(df['c'], df['bar'], check_names=False)
|
||||
|
||||
# diff dtype
|
||||
|
||||
# new item
|
||||
df['x'] = df['a'].astype('float32')
|
||||
result = Series(dict(float32=1, float64=5))
|
||||
assert (df.get_dtype_counts().sort_index() == result).all()
|
||||
|
||||
# replacing current (in different block)
|
||||
df['a'] = df['a'].astype('float32')
|
||||
result = Series(dict(float32=2, float64=4))
|
||||
assert (df.get_dtype_counts().sort_index() == result).all()
|
||||
|
||||
df['y'] = df['a'].astype('int32')
|
||||
result = Series(dict(float32=2, float64=4, int32=1))
|
||||
assert (df.get_dtype_counts().sort_index() == result).all()
|
||||
|
||||
with tm.assert_raises_regex(ValueError, 'already exists'):
|
||||
df.insert(1, 'a', df['b'])
|
||||
pytest.raises(ValueError, df.insert, 1, 'c', df['b'])
|
||||
|
||||
df.columns.name = 'some_name'
|
||||
# preserve columns name field
|
||||
df.insert(0, 'baz', df['c'])
|
||||
assert df.columns.name == 'some_name'
|
||||
|
||||
# GH 13522
|
||||
df = DataFrame(index=['A', 'B', 'C'])
|
||||
df['X'] = df.index
|
||||
df['X'] = ['x', 'y', 'z']
|
||||
exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
|
||||
assert_frame_equal(df, exp)
|
||||
|
||||
def test_delitem(self):
|
||||
del self.frame['A']
|
||||
assert 'A' not in self.frame
|
||||
|
||||
def test_delitem_multiindex(self):
|
||||
midx = MultiIndex.from_product([['A', 'B'], [1, 2]])
|
||||
df = DataFrame(np.random.randn(4, 4), columns=midx)
|
||||
assert len(df.columns) == 4
|
||||
assert ('A', ) in df.columns
|
||||
assert 'A' in df.columns
|
||||
|
||||
result = df['A']
|
||||
assert isinstance(result, DataFrame)
|
||||
del df['A']
|
||||
|
||||
assert len(df.columns) == 2
|
||||
|
||||
# A still in the levels, BUT get a KeyError if trying
|
||||
# to delete
|
||||
assert ('A', ) not in df.columns
|
||||
with pytest.raises(KeyError):
|
||||
del df[('A',)]
|
||||
|
||||
# behavior of dropped/deleted MultiIndex levels changed from
|
||||
# GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
|
||||
# levels which are dropped/deleted
|
||||
assert 'A' not in df.columns
|
||||
with pytest.raises(KeyError):
|
||||
del df['A']
|
||||
|
||||
def test_pop(self):
|
||||
self.frame.columns.name = 'baz'
|
||||
|
||||
self.frame.pop('A')
|
||||
assert 'A' not in self.frame
|
||||
|
||||
self.frame['foo'] = 'bar'
|
||||
self.frame.pop('foo')
|
||||
assert 'foo' not in self.frame
|
||||
# TODO assert self.frame.columns.name == 'baz'
|
||||
|
||||
# gh-10912: inplace ops cause caching issue
|
||||
a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[
|
||||
'A', 'B', 'C'], index=['X', 'Y'])
|
||||
b = a.pop('B')
|
||||
b += 1
|
||||
|
||||
# original frame
|
||||
expected = DataFrame([[1, 3], [4, 6]], columns=[
|
||||
'A', 'C'], index=['X', 'Y'])
|
||||
tm.assert_frame_equal(a, expected)
|
||||
|
||||
# result
|
||||
expected = Series([2, 5], index=['X', 'Y'], name='B') + 1
|
||||
tm.assert_series_equal(b, expected)
|
||||
|
||||
def test_pop_non_unique_cols(self):
|
||||
df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
|
||||
df.columns = ["a", "b", "a"]
|
||||
|
||||
res = df.pop("a")
|
||||
assert type(res) == DataFrame
|
||||
assert len(res) == 2
|
||||
assert len(df.columns) == 1
|
||||
assert "b" in df.columns
|
||||
assert "a" not in df.columns
|
||||
assert len(df.index) == 2
|
||||
|
||||
def test_insert_column_bug_4032(self):
|
||||
|
||||
# GH4032, inserting a column and renaming causing errors
|
||||
df = DataFrame({'b': [1.1, 2.2]})
|
||||
df = df.rename(columns={})
|
||||
df.insert(0, 'a', [1, 2])
|
||||
|
||||
result = df.rename(columns={})
|
||||
str(result)
|
||||
expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
|
||||
assert_frame_equal(result, expected)
|
||||
df.insert(0, 'c', [1.3, 2.3])
|
||||
|
||||
result = df.rename(columns={})
|
||||
str(result)
|
||||
|
||||
expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
|
||||
columns=['c', 'a', 'b'])
|
||||
assert_frame_equal(result, expected)
|
||||
-478
@@ -1,478 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat import lrange, u
|
||||
from pandas import DataFrame, Series, MultiIndex, date_range
|
||||
import pandas as pd
|
||||
|
||||
from pandas.util.testing import assert_series_equal, assert_frame_equal
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameNonuniqueIndexes(TestData):
|
||||
|
||||
def test_column_dups_operations(self):
|
||||
|
||||
def check(result, expected=None):
|
||||
if expected is not None:
|
||||
assert_frame_equal(result, expected)
|
||||
result.dtypes
|
||||
str(result)
|
||||
|
||||
# assignment
|
||||
# GH 3687
|
||||
arr = np.random.randn(3, 2)
|
||||
idx = lrange(2)
|
||||
df = DataFrame(arr, columns=['A', 'A'])
|
||||
df.columns = idx
|
||||
expected = DataFrame(arr, columns=idx)
|
||||
check(df, expected)
|
||||
|
||||
idx = date_range('20130101', periods=4, freq='Q-NOV')
|
||||
df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
|
||||
columns=['a', 'a', 'a', 'a'])
|
||||
df.columns = idx
|
||||
expected = DataFrame(
|
||||
[[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
|
||||
check(df, expected)
|
||||
|
||||
# insert
|
||||
df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
|
||||
columns=['foo', 'bar', 'foo', 'hello'])
|
||||
df['string'] = 'bah'
|
||||
expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
|
||||
[2, 1, 3, 5, 'bah']],
|
||||
columns=['foo', 'bar', 'foo', 'hello', 'string'])
|
||||
check(df, expected)
|
||||
with tm.assert_raises_regex(ValueError, 'Length of value'):
|
||||
df.insert(0, 'AnotherColumn', range(len(df.index) - 1))
|
||||
|
||||
# insert same dtype
|
||||
df['foo2'] = 3
|
||||
expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
|
||||
[2, 1, 3, 5, 'bah', 3]],
|
||||
columns=['foo', 'bar', 'foo', 'hello',
|
||||
'string', 'foo2'])
|
||||
check(df, expected)
|
||||
|
||||
# set (non-dup)
|
||||
df['foo2'] = 4
|
||||
expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
|
||||
[2, 1, 3, 5, 'bah', 4]],
|
||||
columns=['foo', 'bar', 'foo', 'hello',
|
||||
'string', 'foo2'])
|
||||
check(df, expected)
|
||||
df['foo2'] = 3
|
||||
|
||||
# delete (non dup)
|
||||
del df['bar']
|
||||
expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
|
||||
[2, 3, 5, 'bah', 3]],
|
||||
columns=['foo', 'foo', 'hello', 'string', 'foo2'])
|
||||
check(df, expected)
|
||||
|
||||
# try to delete again (its not consolidated)
|
||||
del df['hello']
|
||||
expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
|
||||
[2, 3, 'bah', 3]],
|
||||
columns=['foo', 'foo', 'string', 'foo2'])
|
||||
check(df, expected)
|
||||
|
||||
# consolidate
|
||||
df = df._consolidate()
|
||||
expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
|
||||
[2, 3, 'bah', 3]],
|
||||
columns=['foo', 'foo', 'string', 'foo2'])
|
||||
check(df, expected)
|
||||
|
||||
# insert
|
||||
df.insert(2, 'new_col', 5.)
|
||||
expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
|
||||
[2, 3, 5., 'bah', 3]],
|
||||
columns=['foo', 'foo', 'new_col', 'string',
|
||||
'foo2'])
|
||||
check(df, expected)
|
||||
|
||||
# insert a dup
|
||||
tm.assert_raises_regex(ValueError, 'cannot insert',
|
||||
df.insert, 2, 'new_col', 4.)
|
||||
df.insert(2, 'new_col', 4., allow_duplicates=True)
|
||||
expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
|
||||
[1, 2, 4., 5., 'bah', 3],
|
||||
[2, 3, 4., 5., 'bah', 3]],
|
||||
columns=['foo', 'foo', 'new_col',
|
||||
'new_col', 'string', 'foo2'])
|
||||
check(df, expected)
|
||||
|
||||
# delete (dup)
|
||||
del df['foo']
|
||||
expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
|
||||
[4., 5., 'bah', 3]],
|
||||
columns=['new_col', 'new_col', 'string', 'foo2'])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
# dup across dtypes
|
||||
df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
|
||||
columns=['foo', 'bar', 'foo', 'hello'])
|
||||
check(df)
|
||||
|
||||
df['foo2'] = 7.
|
||||
expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
|
||||
[2, 1, 3., 5, 7.]],
|
||||
columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
|
||||
check(df, expected)
|
||||
|
||||
result = df['foo']
|
||||
expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
|
||||
columns=['foo', 'foo'])
|
||||
check(result, expected)
|
||||
|
||||
# multiple replacements
|
||||
df['foo'] = 'string'
|
||||
expected = DataFrame([['string', 1, 'string', 5, 7.],
|
||||
['string', 1, 'string', 5, 7.],
|
||||
['string', 1, 'string', 5, 7.]],
|
||||
columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
|
||||
check(df, expected)
|
||||
|
||||
del df['foo']
|
||||
expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
|
||||
'bar', 'hello', 'foo2'])
|
||||
check(df, expected)
|
||||
|
||||
# values
|
||||
df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
|
||||
result = df.values
|
||||
expected = np.array([[1, 2.5], [3, 4.5]])
|
||||
assert (result == expected).all().all()
|
||||
|
||||
# rename, GH 4403
|
||||
df4 = DataFrame(
|
||||
{'RT': [0.0454],
|
||||
'TClose': [22.02],
|
||||
'TExg': [0.0422]},
|
||||
index=MultiIndex.from_tuples([(600809, 20130331)],
|
||||
names=['STK_ID', 'RPT_Date']))
|
||||
|
||||
df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331],
|
||||
'STK_ID': [600809] * 3,
|
||||
'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
|
||||
'TClose': [38.05, 41.66, 30.01]},
|
||||
index=MultiIndex.from_tuples(
|
||||
[(600809, 20120930),
|
||||
(600809, 20121231),
|
||||
(600809, 20130331)],
|
||||
names=['STK_ID', 'RPT_Date']))
|
||||
|
||||
k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
|
||||
result = k.rename(
|
||||
columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
|
||||
str(result)
|
||||
result.dtypes
|
||||
|
||||
expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
|
||||
u('饡驦'), 30.01]],
|
||||
columns=['RT', 'TClose', 'TExg',
|
||||
'RPT_Date', 'STK_ID', 'STK_Name',
|
||||
'QT_Close'])
|
||||
.set_index(['STK_ID', 'RPT_Date'], drop=False))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# reindex is invalid!
|
||||
df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
|
||||
columns=['bar', 'a', 'a'])
|
||||
pytest.raises(ValueError, df.reindex, columns=['bar'])
|
||||
pytest.raises(ValueError, df.reindex, columns=['bar', 'foo'])
|
||||
|
||||
# drop
|
||||
df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
|
||||
columns=['bar', 'a', 'a'])
|
||||
result = df.drop(['a'], axis=1)
|
||||
expected = DataFrame([[1], [1], [1]], columns=['bar'])
|
||||
check(result, expected)
|
||||
result = df.drop('a', axis=1)
|
||||
check(result, expected)
|
||||
|
||||
# describe
|
||||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
|
||||
columns=['bar', 'a', 'a'], dtype='float64')
|
||||
result = df.describe()
|
||||
s = df.iloc[:, 0].describe()
|
||||
expected = pd.concat([s, s, s], keys=df.columns, axis=1)
|
||||
check(result, expected)
|
||||
|
||||
# check column dups with index equal and not equal to df's index
|
||||
df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
|
||||
columns=['A', 'B', 'A'])
|
||||
for index in [df.index, pd.Index(list('edcba'))]:
|
||||
this_df = df.copy()
|
||||
expected_ser = pd.Series(index.values, index=this_df.index)
|
||||
expected_df = DataFrame({'A': expected_ser,
|
||||
'B': this_df['B'],
|
||||
'A': expected_ser},
|
||||
columns=['A', 'B', 'A'])
|
||||
this_df['A'] = index
|
||||
check(this_df, expected_df)
|
||||
|
||||
# operations
|
||||
for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
|
||||
df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
|
||||
expected = getattr(df, op)(df)
|
||||
expected.columns = ['A', 'A']
|
||||
df.columns = ['A', 'A']
|
||||
result = getattr(df, op)(df)
|
||||
check(result, expected)
|
||||
|
||||
# multiple assignments that change dtypes
|
||||
# the location indexer is a slice
|
||||
# GH 6120
|
||||
df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
|
||||
expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])
|
||||
|
||||
df['that'] = 1.0
|
||||
check(df, expected)
|
||||
|
||||
df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
|
||||
expected = DataFrame(1, index=range(5), columns=['that', 'that'])
|
||||
|
||||
df['that'] = 1
|
||||
check(df, expected)
|
||||
|
||||
def test_column_dups2(self):
|
||||
|
||||
# drop buggy GH 6240
|
||||
df = DataFrame({'A': np.random.randn(5),
|
||||
'B': np.random.randn(5),
|
||||
'C': np.random.randn(5),
|
||||
'D': ['a', 'b', 'c', 'd', 'e']})
|
||||
|
||||
expected = df.take([0, 1, 1], axis=1)
|
||||
df2 = df.take([2, 0, 1, 2, 1], axis=1)
|
||||
result = df2.drop('C', axis=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# dropna
|
||||
df = DataFrame({'A': np.random.randn(5),
|
||||
'B': np.random.randn(5),
|
||||
'C': np.random.randn(5),
|
||||
'D': ['a', 'b', 'c', 'd', 'e']})
|
||||
df.iloc[2, [0, 1, 2]] = np.nan
|
||||
df.iloc[0, 0] = np.nan
|
||||
df.iloc[1, 1] = np.nan
|
||||
df.iloc[:, 3] = np.nan
|
||||
expected = df.dropna(subset=['A', 'B', 'C'], how='all')
|
||||
expected.columns = ['A', 'A', 'B', 'C']
|
||||
|
||||
df.columns = ['A', 'A', 'B', 'C']
|
||||
|
||||
result = df.dropna(subset=['A', 'C'], how='all')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_column_dups_indexing(self):
|
||||
def check(result, expected=None):
|
||||
if expected is not None:
|
||||
assert_frame_equal(result, expected)
|
||||
result.dtypes
|
||||
str(result)
|
||||
|
||||
# boolean indexing
|
||||
# GH 4879
|
||||
dups = ['A', 'A', 'C', 'D']
|
||||
df = DataFrame(np.arange(12).reshape(3, 4), columns=[
|
||||
'A', 'B', 'C', 'D'], dtype='float64')
|
||||
expected = df[df.C > 6]
|
||||
expected.columns = dups
|
||||
df = DataFrame(np.arange(12).reshape(3, 4),
|
||||
columns=dups, dtype='float64')
|
||||
result = df[df.C > 6]
|
||||
check(result, expected)
|
||||
|
||||
# where
|
||||
df = DataFrame(np.arange(12).reshape(3, 4), columns=[
|
||||
'A', 'B', 'C', 'D'], dtype='float64')
|
||||
expected = df[df > 6]
|
||||
expected.columns = dups
|
||||
df = DataFrame(np.arange(12).reshape(3, 4),
|
||||
columns=dups, dtype='float64')
|
||||
result = df[df > 6]
|
||||
check(result, expected)
|
||||
|
||||
# boolean with the duplicate raises
|
||||
df = DataFrame(np.arange(12).reshape(3, 4),
|
||||
columns=dups, dtype='float64')
|
||||
pytest.raises(ValueError, lambda: df[df.A > 6])
|
||||
|
||||
# dup aligining operations should work
|
||||
# GH 5185
|
||||
df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
|
||||
df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
|
||||
expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
|
||||
result = df1.sub(df2)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# equality
|
||||
df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
|
||||
columns=['A', 'B'])
|
||||
df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
|
||||
columns=['A', 'A'])
|
||||
|
||||
# not-comparing like-labelled
|
||||
pytest.raises(ValueError, lambda: df1 == df2)
|
||||
|
||||
df1r = df1.reindex_like(df2)
|
||||
result = df1r == df2
|
||||
expected = DataFrame([[False, True], [True, False], [False, False], [
|
||||
True, False]], columns=['A', 'A'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# mixed column selection
|
||||
# GH 5639
|
||||
dfbool = DataFrame({'one': Series([True, True, False],
|
||||
index=['a', 'b', 'c']),
|
||||
'two': Series([False, False, True, False],
|
||||
index=['a', 'b', 'c', 'd']),
|
||||
'three': Series([False, True, True, True],
|
||||
index=['a', 'b', 'c', 'd'])})
|
||||
expected = pd.concat(
|
||||
[dfbool['one'], dfbool['three'], dfbool['one']], axis=1)
|
||||
result = dfbool[['one', 'three', 'one']]
|
||||
check(result, expected)
|
||||
|
||||
# multi-axis dups
|
||||
# GH 6121
|
||||
df = DataFrame(np.arange(25.).reshape(5, 5),
|
||||
index=['a', 'b', 'c', 'd', 'e'],
|
||||
columns=['A', 'B', 'C', 'D', 'E'])
|
||||
z = df[['A', 'C', 'A']].copy()
|
||||
expected = z.loc[['a', 'c', 'a']]
|
||||
|
||||
df = DataFrame(np.arange(25.).reshape(5, 5),
|
||||
index=['a', 'b', 'c', 'd', 'e'],
|
||||
columns=['A', 'B', 'C', 'D', 'E'])
|
||||
z = df[['A', 'C', 'A']]
|
||||
result = z.loc[['a', 'c', 'a']]
|
||||
check(result, expected)
|
||||
|
||||
def test_column_dups_indexing2(self):
|
||||
|
||||
# GH 8363
|
||||
# datetime ops with a non-unique index
|
||||
df = DataFrame({'A': np.arange(5, dtype='int64'),
|
||||
'B': np.arange(1, 6, dtype='int64')},
|
||||
index=[2, 2, 3, 3, 4])
|
||||
result = df.B - df.A
|
||||
expected = Series(1, index=[2, 2, 3, 3, 4])
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': date_range('20130101', periods=5),
|
||||
'B': date_range('20130101 09:00:00', periods=5)},
|
||||
index=[2, 2, 3, 3, 4])
|
||||
result = df.B - df.A
|
||||
expected = Series(pd.Timedelta('9 hours'), index=[2, 2, 3, 3, 4])
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_columns_with_dups(self):
|
||||
# GH 3468 related
|
||||
|
||||
# basic
|
||||
df = DataFrame([[1, 2]], columns=['a', 'a'])
|
||||
df.columns = ['a', 'a.1']
|
||||
str(df)
|
||||
expected = DataFrame([[1, 2]], columns=['a', 'a.1'])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
df = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a'])
|
||||
df.columns = ['b', 'a', 'a.1']
|
||||
str(df)
|
||||
expected = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a.1'])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
# with a dup index
|
||||
df = DataFrame([[1, 2]], columns=['a', 'a'])
|
||||
df.columns = ['b', 'b']
|
||||
str(df)
|
||||
expected = DataFrame([[1, 2]], columns=['b', 'b'])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
# multi-dtype
|
||||
df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
|
||||
columns=['a', 'a', 'b', 'b', 'd', 'c', 'c'])
|
||||
df.columns = list('ABCDEFG')
|
||||
str(df)
|
||||
expected = DataFrame(
|
||||
[[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('ABCDEFG'))
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
# this is an error because we cannot disambiguate the dup columns
|
||||
pytest.raises(Exception, lambda x: DataFrame(
|
||||
[[1, 2, 'foo', 'bar']], columns=['a', 'a', 'a', 'a']))
|
||||
|
||||
# dups across blocks
|
||||
df_float = DataFrame(np.random.randn(10, 3), dtype='float64')
|
||||
df_int = DataFrame(np.random.randn(10, 3), dtype='int64')
|
||||
df_bool = DataFrame(True, index=df_float.index,
|
||||
columns=df_float.columns)
|
||||
df_object = DataFrame('foo', index=df_float.index,
|
||||
columns=df_float.columns)
|
||||
df_dt = DataFrame(pd.Timestamp('20010101'),
|
||||
index=df_float.index,
|
||||
columns=df_float.columns)
|
||||
df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
|
||||
|
||||
assert len(df._data._blknos) == len(df.columns)
|
||||
assert len(df._data._blklocs) == len(df.columns)
|
||||
|
||||
# testing iloc
|
||||
for i in range(len(df.columns)):
|
||||
df.iloc[:, i]
|
||||
|
||||
# dup columns across dtype GH 2079/2194
|
||||
vals = [[1, -1, 2.], [2, -2, 3.]]
|
||||
rs = DataFrame(vals, columns=['A', 'A', 'B'])
|
||||
xp = DataFrame(vals)
|
||||
xp.columns = ['A', 'A', 'B']
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_values_duplicates(self):
|
||||
df = DataFrame([[1, 2, 'a', 'b'],
|
||||
[1, 2, 'a', 'b']],
|
||||
columns=['one', 'one', 'two', 'two'])
|
||||
|
||||
result = df.values
|
||||
expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']],
|
||||
dtype=object)
|
||||
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_set_value_by_index(self):
|
||||
# See gh-12344
|
||||
df = DataFrame(np.arange(9).reshape(3, 3).T)
|
||||
df.columns = list('AAA')
|
||||
expected = df.iloc[:, 2]
|
||||
|
||||
df.iloc[:, 0] = 3
|
||||
assert_series_equal(df.iloc[:, 2], expected)
|
||||
|
||||
df = DataFrame(np.arange(9).reshape(3, 3).T)
|
||||
df.columns = [2, float(2), str(2)]
|
||||
expected = df.iloc[:, 1]
|
||||
|
||||
df.iloc[:, 0] = 3
|
||||
assert_series_equal(df.iloc[:, 1], expected)
|
||||
|
||||
def test_insert_with_columns_dups(self):
|
||||
# GH 14291
|
||||
df = pd.DataFrame()
|
||||
df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True)
|
||||
df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True)
|
||||
df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True)
|
||||
exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'],
|
||||
['c', 'f', 'i']], columns=['A', 'A', 'A'])
|
||||
assert_frame_equal(df, exp)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,140 +0,0 @@
|
||||
import numpy as np
|
||||
from numpy.random import randn
|
||||
from datetime import timedelta
|
||||
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
from pandas import (PeriodIndex, period_range, DataFrame, date_range,
|
||||
Index, to_datetime, DatetimeIndex)
|
||||
|
||||
|
||||
def _permute(obj):
|
||||
return obj.take(np.random.permutation(len(obj)))
|
||||
|
||||
|
||||
class TestPeriodIndex(object):
|
||||
|
||||
def setup_method(self, method):
|
||||
pass
|
||||
|
||||
def test_as_frame_columns(self):
|
||||
rng = period_range('1/1/2000', periods=5)
|
||||
df = DataFrame(randn(10, 5), columns=rng)
|
||||
|
||||
ts = df[rng[0]]
|
||||
tm.assert_series_equal(ts, df.iloc[:, 0])
|
||||
|
||||
# GH # 1211
|
||||
repr(df)
|
||||
|
||||
ts = df['1/1/2000']
|
||||
tm.assert_series_equal(ts, df.iloc[:, 0])
|
||||
|
||||
def test_frame_setitem(self):
|
||||
rng = period_range('1/1/2000', periods=5, name='index')
|
||||
df = DataFrame(randn(5, 3), index=rng)
|
||||
|
||||
df['Index'] = rng
|
||||
rs = Index(df['Index'])
|
||||
tm.assert_index_equal(rs, rng, check_names=False)
|
||||
assert rs.name == 'Index'
|
||||
assert rng.name == 'index'
|
||||
|
||||
rs = df.reset_index().set_index('index')
|
||||
assert isinstance(rs.index, PeriodIndex)
|
||||
tm.assert_index_equal(rs.index, rng)
|
||||
|
||||
def test_frame_to_time_stamp(self):
|
||||
K = 5
|
||||
index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009')
|
||||
df = DataFrame(randn(len(index), K), index=index)
|
||||
df['mix'] = 'a'
|
||||
|
||||
exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC')
|
||||
result = df.to_timestamp('D', 'end')
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
tm.assert_numpy_array_equal(result.values, df.values)
|
||||
|
||||
exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN')
|
||||
result = df.to_timestamp('D', 'start')
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
def _get_with_delta(delta, freq='A-DEC'):
|
||||
return date_range(to_datetime('1/1/2001') + delta,
|
||||
to_datetime('12/31/2009') + delta, freq=freq)
|
||||
|
||||
delta = timedelta(hours=23)
|
||||
result = df.to_timestamp('H', 'end')
|
||||
exp_index = _get_with_delta(delta)
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
delta = timedelta(hours=23, minutes=59)
|
||||
result = df.to_timestamp('T', 'end')
|
||||
exp_index = _get_with_delta(delta)
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
result = df.to_timestamp('S', 'end')
|
||||
delta = timedelta(hours=23, minutes=59, seconds=59)
|
||||
exp_index = _get_with_delta(delta)
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
# columns
|
||||
df = df.T
|
||||
|
||||
exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC')
|
||||
result = df.to_timestamp('D', 'end', axis=1)
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
tm.assert_numpy_array_equal(result.values, df.values)
|
||||
|
||||
exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN')
|
||||
result = df.to_timestamp('D', 'start', axis=1)
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
delta = timedelta(hours=23)
|
||||
result = df.to_timestamp('H', 'end', axis=1)
|
||||
exp_index = _get_with_delta(delta)
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
delta = timedelta(hours=23, minutes=59)
|
||||
result = df.to_timestamp('T', 'end', axis=1)
|
||||
exp_index = _get_with_delta(delta)
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
result = df.to_timestamp('S', 'end', axis=1)
|
||||
delta = timedelta(hours=23, minutes=59, seconds=59)
|
||||
exp_index = _get_with_delta(delta)
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
# invalid axis
|
||||
tm.assert_raises_regex(
|
||||
ValueError, 'axis', df.to_timestamp, axis=2)
|
||||
|
||||
result1 = df.to_timestamp('5t', axis=1)
|
||||
result2 = df.to_timestamp('t', axis=1)
|
||||
expected = pd.date_range('2001-01-01', '2009-01-01', freq='AS')
|
||||
assert isinstance(result1.columns, DatetimeIndex)
|
||||
assert isinstance(result2.columns, DatetimeIndex)
|
||||
tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8)
|
||||
tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8)
|
||||
# PeriodIndex.to_timestamp always use 'infer'
|
||||
assert result1.columns.freqstr == 'AS-JAN'
|
||||
assert result2.columns.freqstr == 'AS-JAN'
|
||||
|
||||
def test_frame_index_to_string(self):
|
||||
index = PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M')
|
||||
frame = DataFrame(np.random.randn(3, 4), index=index)
|
||||
|
||||
# it works!
|
||||
frame.to_string()
|
||||
|
||||
def test_align_frame(self):
|
||||
rng = period_range('1/1/2000', '1/1/2010', freq='A')
|
||||
ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
|
||||
|
||||
result = ts + ts[::2]
|
||||
expected = ts + ts
|
||||
expected.values[1::2] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = ts + _permute(ts[::2])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -1,393 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from pandas import (DataFrame, Series, Timestamp, _np_version_under1p11)
|
||||
import pandas as pd
|
||||
|
||||
from pandas.util.testing import assert_series_equal, assert_frame_equal
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameQuantile(TestData):
|
||||
|
||||
def test_quantile(self):
|
||||
from numpy import percentile
|
||||
|
||||
q = self.tsframe.quantile(0.1, axis=0)
|
||||
assert q['A'] == percentile(self.tsframe['A'], 10)
|
||||
tm.assert_index_equal(q.index, self.tsframe.columns)
|
||||
|
||||
q = self.tsframe.quantile(0.9, axis=1)
|
||||
assert (q['2000-01-17'] ==
|
||||
percentile(self.tsframe.loc['2000-01-17'], 90))
|
||||
tm.assert_index_equal(q.index, self.tsframe.index)
|
||||
|
||||
# test degenerate case
|
||||
q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
|
||||
assert(np.isnan(q['x']) and np.isnan(q['y']))
|
||||
|
||||
# non-numeric exclusion
|
||||
df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
|
||||
rs = df.quantile(0.5)
|
||||
xp = df.median().rename(0.5)
|
||||
assert_series_equal(rs, xp)
|
||||
|
||||
# axis
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
|
||||
result = df.quantile(.5, axis=1)
|
||||
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df.quantile([.5, .75], axis=1)
|
||||
expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75],
|
||||
3: [3.5, 3.75]}, index=[0.5, 0.75])
|
||||
assert_frame_equal(result, expected, check_index_type=True)
|
||||
|
||||
# We may want to break API in the future to change this
|
||||
# so that we exclude non-numeric along the same axis
|
||||
# See GH #7312
|
||||
df = DataFrame([[1, 2, 3],
|
||||
['a', 'b', 4]])
|
||||
result = df.quantile(.5, axis=1)
|
||||
expected = Series([3., 4.], index=[0, 1], name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_quantile_axis_mixed(self):
|
||||
|
||||
# mixed on axis=1
|
||||
df = DataFrame({"A": [1, 2, 3],
|
||||
"B": [2., 3., 4.],
|
||||
"C": pd.date_range('20130101', periods=3),
|
||||
"D": ['foo', 'bar', 'baz']})
|
||||
result = df.quantile(.5, axis=1)
|
||||
expected = Series([1.5, 2.5, 3.5], name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# must raise
|
||||
def f():
|
||||
df.quantile(.5, axis=1, numeric_only=False)
|
||||
pytest.raises(TypeError, f)
|
||||
|
||||
def test_quantile_axis_parameter(self):
|
||||
# GH 9543/9544
|
||||
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
|
||||
|
||||
result = df.quantile(.5, axis=0)
|
||||
|
||||
expected = Series([2., 3.], index=["A", "B"], name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
expected = df.quantile(.5, axis="index")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df.quantile(.5, axis=1)
|
||||
|
||||
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df.quantile(.5, axis="columns")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
pytest.raises(ValueError, df.quantile, 0.1, axis=-1)
|
||||
pytest.raises(ValueError, df.quantile, 0.1, axis="column")
|
||||
|
||||
def test_quantile_interpolation(self):
|
||||
# see gh-10174
|
||||
from numpy import percentile
|
||||
|
||||
# interpolation = linear (default case)
|
||||
q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
|
||||
assert q['A'] == percentile(self.tsframe['A'], 10)
|
||||
q = self.intframe.quantile(0.1)
|
||||
assert q['A'] == percentile(self.intframe['A'], 10)
|
||||
|
||||
# test with and without interpolation keyword
|
||||
q1 = self.intframe.quantile(0.1)
|
||||
assert q1['A'] == np.percentile(self.intframe['A'], 10)
|
||||
tm.assert_series_equal(q, q1)
|
||||
|
||||
# interpolation method other than default linear
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
|
||||
result = df.quantile(.5, axis=1, interpolation='nearest')
|
||||
expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# cross-check interpolation=nearest results in original dtype
|
||||
exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5,
|
||||
axis=0, interpolation='nearest')
|
||||
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64')
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# float
|
||||
df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3])
|
||||
result = df.quantile(.5, axis=1, interpolation='nearest')
|
||||
expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5)
|
||||
tm.assert_series_equal(result, expected)
|
||||
exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5,
|
||||
axis=0, interpolation='nearest')
|
||||
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# axis
|
||||
result = df.quantile([.5, .75], axis=1, interpolation='lower')
|
||||
expected = DataFrame({1: [1., 1.], 2: [2., 2.],
|
||||
3: [3., 3.]}, index=[0.5, 0.75])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# test degenerate case
|
||||
df = DataFrame({'x': [], 'y': []})
|
||||
q = df.quantile(0.1, axis=0, interpolation='higher')
|
||||
assert(np.isnan(q['x']) and np.isnan(q['y']))
|
||||
|
||||
# multi
|
||||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
|
||||
columns=['a', 'b', 'c'])
|
||||
result = df.quantile([.25, .5], interpolation='midpoint')
|
||||
|
||||
# https://github.com/numpy/numpy/issues/7163
|
||||
if _np_version_under1p11:
|
||||
expected = DataFrame([[1.5, 1.5, 1.5], [2.5, 2.5, 2.5]],
|
||||
index=[.25, .5], columns=['a', 'b', 'c'])
|
||||
else:
|
||||
expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
|
||||
index=[.25, .5], columns=['a', 'b', 'c'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_quantile_multi(self):
|
||||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
|
||||
columns=['a', 'b', 'c'])
|
||||
result = df.quantile([.25, .5])
|
||||
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
|
||||
index=[.25, .5], columns=['a', 'b', 'c'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# axis = 1
|
||||
result = df.quantile([.25, .5], axis=1)
|
||||
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
|
||||
index=[.25, .5], columns=[0, 1, 2])
|
||||
|
||||
# empty
|
||||
result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
|
||||
expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]},
|
||||
index=[.1, .9])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_quantile_datetime(self):
|
||||
df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]})
|
||||
|
||||
# exclude datetime
|
||||
result = df.quantile(.5)
|
||||
expected = Series([2.5], index=['b'])
|
||||
|
||||
# datetime
|
||||
result = df.quantile(.5, numeric_only=False)
|
||||
expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5],
|
||||
index=['a', 'b'],
|
||||
name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# datetime w/ multi
|
||||
result = df.quantile([.5], numeric_only=False)
|
||||
expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]],
|
||||
index=[.5], columns=['a', 'b'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# axis = 1
|
||||
df['c'] = pd.to_datetime(['2011', '2012'])
|
||||
result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False)
|
||||
expected = Series([Timestamp('2010-07-02 12:00:00'),
|
||||
Timestamp('2011-07-02 12:00:00')],
|
||||
index=[0, 1],
|
||||
name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False)
|
||||
expected = DataFrame([[Timestamp('2010-07-02 12:00:00'),
|
||||
Timestamp('2011-07-02 12:00:00')]],
|
||||
index=[0.5], columns=[0, 1])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# empty when numeric_only=True
|
||||
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
||||
# result = df[['a', 'c']].quantile(.5)
|
||||
# result = df[['a', 'c']].quantile([.5])
|
||||
|
||||
def test_quantile_invalid(self):
|
||||
msg = 'percentiles should all be in the interval \\[0, 1\\]'
|
||||
for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
|
||||
with tm.assert_raises_regex(ValueError, msg):
|
||||
self.tsframe.quantile(invalid)
|
||||
|
||||
def test_quantile_box(self):
|
||||
df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
|
||||
pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-03')],
|
||||
'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-03', tz='US/Eastern')],
|
||||
'C': [pd.Timedelta('1 days'),
|
||||
pd.Timedelta('2 days'),
|
||||
pd.Timedelta('3 days')]})
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
|
||||
exp = pd.Series([pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timedelta('2 days')],
|
||||
name=0.5, index=['A', 'B', 'C'])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timedelta('2 days')]],
|
||||
index=[0.5], columns=['A', 'B', 'C'])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# DatetimeBlock may be consolidated and contain NaT in different loc
|
||||
df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
|
||||
pd.NaT,
|
||||
pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-03')],
|
||||
'a': [pd.Timestamp('2011-01-01'),
|
||||
pd.Timestamp('2011-01-02'),
|
||||
pd.NaT,
|
||||
pd.Timestamp('2011-01-03')],
|
||||
'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
|
||||
pd.NaT,
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-03', tz='US/Eastern')],
|
||||
'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.NaT,
|
||||
pd.Timestamp('2011-01-03', tz='US/Eastern')],
|
||||
'C': [pd.Timedelta('1 days'),
|
||||
pd.Timedelta('2 days'),
|
||||
pd.Timedelta('3 days'),
|
||||
pd.NaT],
|
||||
'c': [pd.NaT,
|
||||
pd.Timedelta('1 days'),
|
||||
pd.Timedelta('2 days'),
|
||||
pd.Timedelta('3 days')]},
|
||||
columns=list('AaBbCc'))
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
exp = pd.Series([pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timedelta('2 days'),
|
||||
pd.Timedelta('2 days')],
|
||||
name=0.5, index=list('AaBbCc'))
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timedelta('2 days'),
|
||||
pd.Timedelta('2 days')]],
|
||||
index=[0.5], columns=list('AaBbCc'))
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_nan(self):
|
||||
|
||||
# GH 14357 - float block where some cols have missing values
|
||||
df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
|
||||
df.iloc[-1, 1] = np.nan
|
||||
|
||||
res = df.quantile(0.5)
|
||||
exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5, 0.75])
|
||||
exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.quantile(0.5, axis=1)
|
||||
exp = Series(np.arange(1.0, 6.0), name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5, 0.75], axis=1)
|
||||
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# full-nan column
|
||||
df['b'] = np.nan
|
||||
|
||||
res = df.quantile(0.5)
|
||||
exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5, 0.75])
|
||||
exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]},
|
||||
index=[0.5, 0.75])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_nat(self):
|
||||
|
||||
# full NaT column
|
||||
df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]})
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
exp = Series([pd.NaT], index=['a'], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = DataFrame({'a': [pd.NaT]}, index=[0.5])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# mixed non-null / full null column
|
||||
df = DataFrame({'a': [pd.Timestamp('2012-01-01'),
|
||||
pd.Timestamp('2012-01-02'),
|
||||
pd.Timestamp('2012-01-03')],
|
||||
'b': [pd.NaT, pd.NaT, pd.NaT]})
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'],
|
||||
name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5],
|
||||
columns=['a', 'b'])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_empty(self):
|
||||
|
||||
# floats
|
||||
df = DataFrame(columns=['a', 'b'], dtype='float64')
|
||||
|
||||
res = df.quantile(0.5)
|
||||
exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5])
|
||||
exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
||||
# res = df.quantile(0.5, axis=1)
|
||||
# res = df.quantile([0.5], axis=1)
|
||||
|
||||
# ints
|
||||
df = DataFrame(columns=['a', 'b'], dtype='int64')
|
||||
|
||||
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
||||
# res = df.quantile(0.5)
|
||||
|
||||
# datetimes
|
||||
df = DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
|
||||
|
||||
# FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
|
||||
# res = df.quantile(0.5, numeric_only=False)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,299 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from distutils.version import LooseVersion
|
||||
from datetime import timedelta, datetime
|
||||
from numpy import nan
|
||||
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
from pandas.tests.frame.common import TestData
|
||||
from pandas import Series, DataFrame
|
||||
from pandas.compat import product
|
||||
|
||||
|
||||
class TestRank(TestData):
|
||||
s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])
|
||||
df = DataFrame({'A': s, 'B': s})
|
||||
|
||||
results = {
|
||||
'average': np.array([1.5, 5.5, 7.0, 3.5, nan,
|
||||
3.5, 1.5, 8.0, nan, 5.5]),
|
||||
'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
|
||||
'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
|
||||
'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]),
|
||||
'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]),
|
||||
}
|
||||
|
||||
def test_rank(self):
|
||||
rankdata = pytest.importorskip('scipy.stats.rankdata')
|
||||
|
||||
self.frame['A'][::2] = np.nan
|
||||
self.frame['B'][::3] = np.nan
|
||||
self.frame['C'][::4] = np.nan
|
||||
self.frame['D'][::5] = np.nan
|
||||
|
||||
ranks0 = self.frame.rank()
|
||||
ranks1 = self.frame.rank(1)
|
||||
mask = np.isnan(self.frame.values)
|
||||
|
||||
fvals = self.frame.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, fvals)
|
||||
exp0[mask] = np.nan
|
||||
|
||||
exp1 = np.apply_along_axis(rankdata, 1, fvals)
|
||||
exp1[mask] = np.nan
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# integers
|
||||
df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
|
||||
|
||||
result = df.rank()
|
||||
exp = df.astype(float).rank()
|
||||
tm.assert_frame_equal(result, exp)
|
||||
|
||||
result = df.rank(1)
|
||||
exp = df.astype(float).rank(1)
|
||||
tm.assert_frame_equal(result, exp)
|
||||
|
||||
def test_rank2(self):
|
||||
df = DataFrame([[1, 3, 2], [1, 2, 3]])
|
||||
expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
|
||||
result = df.rank(1, pct=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([[1, 3, 2], [1, 2, 3]])
|
||||
expected = df.rank(0) / 2.0
|
||||
result = df.rank(0, pct=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
|
||||
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
|
||||
result = df.rank(1, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
|
||||
result = df.rank(0, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
|
||||
expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]])
|
||||
result = df.rank(1, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]])
|
||||
result = df.rank(0, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# f7u12, this does not work without extensive workaround
|
||||
data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
|
||||
[datetime(2000, 1, 2), datetime(2000, 1, 3),
|
||||
datetime(2000, 1, 1)]]
|
||||
df = DataFrame(data)
|
||||
|
||||
# check the rank
|
||||
expected = DataFrame([[2., nan, 1.],
|
||||
[2., 3., 1.]])
|
||||
result = df.rank(1, numeric_only=False, ascending=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[1., nan, 2.],
|
||||
[2., 1., 3.]])
|
||||
result = df.rank(1, numeric_only=False, ascending=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# mixed-type frames
|
||||
self.mixed_frame['datetime'] = datetime.now()
|
||||
self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)
|
||||
|
||||
result = self.mixed_frame.rank(1)
|
||||
expected = self.mixed_frame.rank(1, numeric_only=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
|
||||
1e60, 1e80, 1e-30]})
|
||||
exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]})
|
||||
tm.assert_frame_equal(df.rank(), exp)
|
||||
|
||||
def test_rank_na_option(self):
|
||||
rankdata = pytest.importorskip('scipy.stats.rankdata')
|
||||
|
||||
self.frame['A'][::2] = np.nan
|
||||
self.frame['B'][::3] = np.nan
|
||||
self.frame['C'][::4] = np.nan
|
||||
self.frame['D'][::5] = np.nan
|
||||
|
||||
# bottom
|
||||
ranks0 = self.frame.rank(na_option='bottom')
|
||||
ranks1 = self.frame.rank(1, na_option='bottom')
|
||||
|
||||
fvals = self.frame.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, fvals)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, fvals)
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# top
|
||||
ranks0 = self.frame.rank(na_option='top')
|
||||
ranks1 = self.frame.rank(1, na_option='top')
|
||||
|
||||
fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
|
||||
fval1 = self.frame.T
|
||||
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
|
||||
fval1 = fval1.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, fval0)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, fval1)
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# descending
|
||||
|
||||
# bottom
|
||||
ranks0 = self.frame.rank(na_option='top', ascending=False)
|
||||
ranks1 = self.frame.rank(1, na_option='top', ascending=False)
|
||||
|
||||
fvals = self.frame.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, -fvals)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, -fvals)
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# descending
|
||||
|
||||
# top
|
||||
ranks0 = self.frame.rank(na_option='bottom', ascending=False)
|
||||
ranks1 = self.frame.rank(1, na_option='bottom', ascending=False)
|
||||
|
||||
fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
|
||||
fval1 = self.frame.T
|
||||
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
|
||||
fval1 = fval1.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, -fval0)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, -fval1)
|
||||
|
||||
tm.assert_numpy_array_equal(ranks0.values, exp0)
|
||||
tm.assert_numpy_array_equal(ranks1.values, exp1)
|
||||
|
||||
def test_rank_axis(self):
|
||||
# check if using axes' names gives the same result
|
||||
df = DataFrame([[2, 1], [4, 3]])
|
||||
tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index'))
|
||||
tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns'))
|
||||
|
||||
def test_rank_methods_frame(self):
|
||||
pytest.importorskip('scipy.stats.special')
|
||||
rankdata = pytest.importorskip('scipy.stats.rankdata')
|
||||
import scipy
|
||||
|
||||
xs = np.random.randint(0, 21, (100, 26))
|
||||
xs = (xs - 10.0) / 10.0
|
||||
cols = [chr(ord('z') - i) for i in range(xs.shape[1])]
|
||||
|
||||
for vals in [xs, xs + 1e6, xs * 1e-6]:
|
||||
df = DataFrame(vals, columns=cols)
|
||||
|
||||
for ax in [0, 1]:
|
||||
for m in ['average', 'min', 'max', 'first', 'dense']:
|
||||
result = df.rank(axis=ax, method=m)
|
||||
sprank = np.apply_along_axis(
|
||||
rankdata, ax, vals,
|
||||
m if m != 'first' else 'ordinal')
|
||||
sprank = sprank.astype(np.float64)
|
||||
expected = DataFrame(sprank, columns=cols)
|
||||
|
||||
if (LooseVersion(scipy.__version__) >=
|
||||
LooseVersion('0.17.0')):
|
||||
expected = expected.astype('float64')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_rank_descending(self):
|
||||
dtypes = ['O', 'f8', 'i8']
|
||||
|
||||
for dtype, method in product(dtypes, self.results):
|
||||
if 'i' in dtype:
|
||||
df = self.df.dropna()
|
||||
else:
|
||||
df = self.df.astype(dtype)
|
||||
|
||||
res = df.rank(ascending=False)
|
||||
expected = (df.max() - df).rank()
|
||||
assert_frame_equal(res, expected)
|
||||
|
||||
if method == 'first' and dtype == 'O':
|
||||
continue
|
||||
|
||||
expected = (df.max() - df).rank(method=method)
|
||||
|
||||
if dtype != 'O':
|
||||
res2 = df.rank(method=method, ascending=False,
|
||||
numeric_only=True)
|
||||
assert_frame_equal(res2, expected)
|
||||
|
||||
res3 = df.rank(method=method, ascending=False,
|
||||
numeric_only=False)
|
||||
assert_frame_equal(res3, expected)
|
||||
|
||||
def test_rank_2d_tie_methods(self):
|
||||
df = self.df
|
||||
|
||||
def _check2d(df, expected, method='average', axis=0):
|
||||
exp_df = DataFrame({'A': expected, 'B': expected})
|
||||
|
||||
if axis == 1:
|
||||
df = df.T
|
||||
exp_df = exp_df.T
|
||||
|
||||
result = df.rank(method=method, axis=axis)
|
||||
assert_frame_equal(result, exp_df)
|
||||
|
||||
dtypes = [None, object]
|
||||
disabled = set([(object, 'first')])
|
||||
results = self.results
|
||||
|
||||
for method, axis, dtype in product(results, [0, 1], dtypes):
|
||||
if (dtype, method) in disabled:
|
||||
continue
|
||||
frame = df if dtype is None else df.astype(dtype)
|
||||
_check2d(frame, results[method], method=method, axis=axis)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"method,exp", [("dense",
|
||||
[[1., 1., 1.],
|
||||
[1., 0.5, 2. / 3],
|
||||
[1., 0.5, 1. / 3]]),
|
||||
("min",
|
||||
[[1. / 3, 1., 1.],
|
||||
[1. / 3, 1. / 3, 2. / 3],
|
||||
[1. / 3, 1. / 3, 1. / 3]]),
|
||||
("max",
|
||||
[[1., 1., 1.],
|
||||
[1., 2. / 3, 2. / 3],
|
||||
[1., 2. / 3, 1. / 3]]),
|
||||
("average",
|
||||
[[2. / 3, 1., 1.],
|
||||
[2. / 3, 0.5, 2. / 3],
|
||||
[2. / 3, 0.5, 1. / 3]]),
|
||||
("first",
|
||||
[[1. / 3, 1., 1.],
|
||||
[2. / 3, 1. / 3, 2. / 3],
|
||||
[3. / 3, 2. / 3, 1. / 3]])])
|
||||
def test_rank_pct_true(method, exp):
|
||||
# see gh-15630.
|
||||
|
||||
df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
|
||||
result = df.rank(method=method, pct=True)
|
||||
|
||||
expected = DataFrame(exp)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,525 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
import re
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
from numpy import nan
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (DataFrame, Series, compat, option_context,
|
||||
date_range, period_range, Categorical)
|
||||
from pandas.compat import StringIO, lrange, u, PYPY
|
||||
import pandas.io.formats.format as fmt
|
||||
import pandas as pd
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
|
||||
# Segregated collection of methods that require the BlockManager internal data
|
||||
# structure
|
||||
|
||||
|
||||
class TestDataFrameReprInfoEtc(TestData):
|
||||
|
||||
    def test_repr_empty(self):
        """repr() of an empty frame (with or without an index) must not raise."""
        # empty
        foo = repr(self.empty)  # noqa

        # empty with index
        frame = DataFrame(index=np.arange(1000))
        foo = repr(frame)  # noqa
|
||||
|
||||
    def test_repr_mixed(self):
        """repr() and non-verbose info() of a mixed-dtype frame must not raise."""
        buf = StringIO()

        # mixed
        foo = repr(self.mixed_frame)  # noqa
        self.mixed_frame.info(verbose=False, buf=buf)
|
||||
|
||||
    @pytest.mark.slow
    def test_repr_mixed_big(self):
        """repr() of a 200-row mixed frame with leading NaN runs must not raise."""
        # big mixed
        biggie = DataFrame({'A': np.random.randn(200),
                            'B': tm.makeStringIndex(200)},
                           index=lrange(200))
        biggie.loc[:20, 'A'] = nan
        biggie.loc[:20, 'B'] = nan

        foo = repr(biggie)  # noqa
|
||||
|
||||
    def test_repr(self):
        """Smoke-test repr()/info() over small, reindexed and empty frames,
        and check that control characters in labels/values are escaped.
        """
        buf = StringIO()

        # small one
        foo = repr(self.frame)
        self.frame.info(verbose=False, buf=buf)

        # even smaller
        self.frame.reindex(columns=['A']).info(verbose=False, buf=buf)
        self.frame.reindex(columns=['A', 'B']).info(verbose=False, buf=buf)

        # exhausting cases in DataFrame.info

        # columns but no index
        no_index = DataFrame(columns=[0, 1, 3])
        foo = repr(no_index)  # noqa

        # no columns or index
        self.empty.info(buf=buf)

        # tab / CR / raw newline in labels and cells must not leak into repr
        df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
        assert "\t" not in repr(df)
        assert "\r" not in repr(df)
        assert "a\n" not in repr(df)
|
||||
|
||||
    def test_repr_dimensions(self):
        """'display.show_dimensions' controls the trailing "N rows x M columns" line."""
        df = DataFrame([[1, 2, ], [3, 4]])
        with option_context('display.show_dimensions', True):
            assert "2 rows x 2 columns" in repr(df)

        with option_context('display.show_dimensions', False):
            assert "2 rows x 2 columns" not in repr(df)

        # 'truncate' only prints dimensions when the repr itself is truncated;
        # this 2x2 frame is shown in full, so no dimension line.
        with option_context('display.show_dimensions', 'truncate'):
            assert "2 rows x 2 columns" not in repr(df)
|
||||
|
||||
    @pytest.mark.slow
    def test_repr_big(self):
        """repr() of a 200x4 numeric frame must not raise."""
        # big one
        biggie = DataFrame(np.zeros((200, 4)), columns=lrange(4),
                           index=lrange(200))
        repr(biggie)
|
||||
|
||||
    def test_repr_unsortable(self):
        """repr() must cope with unsortable (mixed str/datetime) column labels
        and with various display option settings; options are reset at the end.
        """
        # columns are not sortable
        import warnings
        warn_filters = warnings.filters
        warnings.filterwarnings('ignore',
                                category=FutureWarning,
                                module=".*format")

        unsortable = DataFrame({'foo': [1] * 50,
                                datetime.today(): [1] * 50,
                                'bar': ['bar'] * 50,
                                datetime.today() + timedelta(1): ['bar'] * 50},
                               index=np.arange(50))
        repr(unsortable)

        fmt.set_option('display.precision', 3, 'display.column_space', 10)
        repr(self.frame)

        fmt.set_option('display.max_rows', 10, 'display.max_columns', 2)
        repr(self.frame)

        fmt.set_option('display.max_rows', 1000, 'display.max_columns', 1000)
        repr(self.frame)

        # restore display defaults and the warning filter state
        tm.reset_display_options()

        warnings.filters = warn_filters
|
||||
|
||||
    def test_repr_unicode(self):
        """repr() of a frame holding unicode values keeps the header row intact."""
        uval = u('\u03c3\u03c3\u03c3\u03c3')

        # TODO(wesm): is this supposed to be used?
        bval = uval.encode('utf-8')  # noqa

        df = DataFrame({'A': [uval, uval]})

        result = repr(df)
        # NOTE(review): header padding may have been collapsed by extraction
        # of this source — confirm the expected leading spaces.
        ex_top = ' A'
        assert result.split('\n')[0].rstrip() == ex_top

        # same check on a freshly-built frame
        df = DataFrame({'A': [uval, uval]})
        result = repr(df)
        assert result.split('\n')[0].rstrip() == ex_top
|
||||
|
||||
    def test_unicode_string_with_unicode(self):
        """Stringifying a frame with non-ASCII text must not raise (py2/py3)."""
        df = DataFrame({'A': [u("\u05d0")]})

        if compat.PY3:
            str(df)
        else:
            # on py2, str() would try to encode; use the unicode type instead
            compat.text_type(df)
|
||||
|
||||
    def test_bytestring_with_unicode(self):
        """Byte-stringifying a frame with non-ASCII text must not raise (py2/py3)."""
        df = DataFrame({'A': [u("\u05d0")]})
        if compat.PY3:
            bytes(df)
        else:
            str(df)
|
||||
|
||||
    def test_very_wide_info_repr(self):
        """repr() of a frame with 20 random-string columns must not raise."""
        df = DataFrame(np.random.randn(10, 20),
                       columns=tm.rands_array(10, 20))
        repr(df)
|
||||
|
||||
    def test_repr_column_name_unicode_truncation_bug(self):
        """A very long cell containing raw UTF-8 bytes must not truncate away
        the column header in repr() (#1906).
        """
        df = DataFrame({'Id': [7117434],
                        'StringCol': ('Is it possible to modify drop plot code'
                                      ' so that the output graph is displayed '
                                      'in iphone simulator, Is it possible to '
                                      'modify drop plot code so that the '
                                      'output graph is \xe2\x80\xa8displayed '
                                      'in iphone simulator.Now we are adding '
                                      'the CSV file externally. I want to Call'
                                      ' the File through the code..')})

        with option_context('display.max_columns', 20):
            assert 'StringCol' in repr(df)
|
||||
|
||||
    def test_latex_repr(self):
        """_repr_latex_() emits a tabular when 'display.latex.repr' is on,
        and returns None when it is off (GH 12182).
        """
        # NOTE(review): intra-cell alignment spaces may have been collapsed
        # by extraction of this source — confirm the expected padding.
        result = r"""\begin{tabular}{llll}
\toprule
{} & 0 & 1 & 2 \\
\midrule
0 & $\alpha$ & b & c \\
1 & 1 & 2 & 3 \\
\bottomrule
\end{tabular}
"""
        with option_context("display.latex.escape", False,
                            'display.latex.repr', True):
            df = DataFrame([[r'$\alpha$', 'b', 'c'], [1, 2, 3]])
            assert result == df._repr_latex_()

        # GH 12182: outside the option context the hook is disabled again
        assert df._repr_latex_() is None
|
||||
|
||||
    @tm.capture_stdout
    def test_info(self):
        """info() works both with an explicit buffer and writing to stdout."""
        io = StringIO()
        self.frame.info(buf=io)
        self.tsframe.info(buf=io)

        frame = DataFrame(np.random.randn(5, 3))

        # no buffer: output goes to (captured) stdout
        frame.info()
        frame.info(verbose=False)
|
||||
|
||||
    def test_info_memory(self):
        """info() reports the exact byte count for a small RangeIndex frame.

        https://github.com/pandas-dev/pandas/issues/21056
        """
        df = pd.DataFrame({'a': pd.Series([1, 2], dtype='i8')})
        buf = StringIO()
        df.info(buf=buf)
        result = buf.getvalue()
        bytes = float(df.memory_usage().sum())

        # NOTE(review): column alignment spaces inside this expected block may
        # have been collapsed by extraction of this source — confirm padding.
        expected = textwrap.dedent("""\
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
a 2 non-null int64
dtypes: int64(1)
memory usage: {} bytes
""".format(bytes))

        assert result == expected
|
||||
|
||||
def test_info_wide(self):
|
||||
from pandas import set_option, reset_option
|
||||
io = StringIO()
|
||||
df = DataFrame(np.random.randn(5, 101))
|
||||
df.info(buf=io)
|
||||
|
||||
io = StringIO()
|
||||
df.info(buf=io, max_cols=101)
|
||||
rs = io.getvalue()
|
||||
assert len(rs.splitlines()) > 100
|
||||
xp = rs
|
||||
|
||||
set_option('display.max_info_columns', 101)
|
||||
io = StringIO()
|
||||
df.info(buf=io)
|
||||
assert rs == xp
|
||||
reset_option('display.max_info_columns')
|
||||
|
||||
    def test_info_duplicate_columns(self):
        """info() must not raise when column labels are duplicated."""
        io = StringIO()

        # it works!
        frame = DataFrame(np.random.randn(1500, 4),
                          columns=['a', 'a', 'b', 'b'])
        frame.info(buf=io)
|
||||
|
||||
    def test_info_duplicate_columns_shows_correct_dtypes(self):
        """Duplicate column labels each keep their own dtype line in info().

        GH11761
        """
        io = StringIO()

        frame = DataFrame([[1, 2.0]],
                          columns=['a', 'a'])
        frame.info(buf=io)
        io.seek(0)
        lines = io.readlines()
        # NOTE(review): alignment spaces in the expected lines may have been
        # collapsed by extraction of this source — confirm padding.
        assert 'a 1 non-null int64\n' == lines[3]
        assert 'a 1 non-null float64\n' == lines[4]
|
||||
|
||||
    def test_info_shows_column_dtypes(self):
        """Every column's dtype appears in the verbose info() listing."""
        dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
                  'complex128', 'object', 'bool']
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()
        df.info(buf=buf)
        res = buf.getvalue()
        for i, dtype in enumerate(dtypes):
            # NOTE(review): the field separator here may have lost alignment
            # whitespace during extraction — confirm against real output.
            name = '%d %d non-null %s' % (i, n, dtype)
            assert name in res
|
||||
|
||||
    def test_info_max_cols(self):
        """Interaction of the 'max_info_columns' option with the ``verbose``
        and ``max_cols`` arguments: expected line counts distinguish the
        summarized (5-line) from the full verbose (10-line) output.
        """
        df = DataFrame(np.random.randn(10, 5))
        for len_, verbose in [(5, None), (5, False), (10, True)]:
            # For verbose always  ^ setting  ^ summarize  ^ full output
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                assert len(res.strip().split('\n')) == len_

        for len_, verbose in [(10, None), (5, False), (10, True)]:

            # max_cols no exceeded
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                assert len(res.strip().split('\n')) == len_

        for len_, max_cols in [(10, 5), (5, 4)]:
            # setting truncates
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                assert len(res.strip().split('\n')) == len_

            # setting wouldn't truncate
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                assert len(res.strip().split('\n')) == len_
|
||||
|
||||
    def test_info_memory_usage(self):
        """memory_usage reporting in info(): presence/absence of the line,
        the '+' lower-bound qualifier for object data, exact sizes for
        fixed-width dtypes, and shape of the memory_usage() Series.
        """
        # Ensure memory usage is displayed, when asserted, on the last line
        dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
                  'complex128', 'object', 'bool']
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()

        # display memory usage case
        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert "memory usage: " in res[-1]

        # do not display memory usage case
        df.info(buf=buf, memory_usage=False)
        res = buf.getvalue().splitlines()
        assert "memory usage: " not in res[-1]

        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # memory usage is a lower bound, so print it as XYZ+ MB
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df.iloc[:, :5].info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # excluded column with object dtype, so estimate is accurate
        assert not re.match(r"memory usage: [^+]+\+", res[-1])

        # Test a DataFrame with duplicate columns
        dtypes = ['int64', 'int64', 'int64', 'float64']
        data = {}
        n = 100
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        df.columns = dtypes

        # object index => lower-bound estimate, hence the '+'
        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
        df_with_object_index.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        # deep introspection gives an exact figure: no '+'
        df_with_object_index.info(buf=buf, memory_usage='deep')
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+$", res[-1])

        # Ensure df size is as expected
        # (cols * rows * bytes) + index size
        df_size = df.memory_usage().sum()
        exp_size = len(dtypes) * n * 8 + df.index.nbytes
        assert df_size == exp_size

        # Ensure number of cols in memory_usage is the same as df
        size_df = np.size(df.columns.values) + 1  # index=True; default
        assert size_df == np.size(df.memory_usage())

        # assert deep works only on object
        assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

        # test for validity
        DataFrame(1, index=['a'], columns=['A']
                  ).memory_usage(index=True)
        DataFrame(1, index=['a'], columns=['A']
                  ).index.nbytes
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product(
                [['a'], range(1000)]),
            columns=['A']
        )
        df.index.nbytes
        df.memory_usage(index=True)
        df.index.values.nbytes

        mem = df.memory_usage(deep=True).sum()
        assert mem > 0
|
||||
|
||||
    @pytest.mark.skipif(PYPY,
                        reason="on PyPy deep=True doesn't change result")
    def test_info_memory_usage_deep_not_pypy(self):
        """On CPython, deep=True counts object payloads, so it reports more."""
        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
        assert (df_with_object_index.memory_usage(
                index=True, deep=True).sum() >
                df_with_object_index.memory_usage(
                index=True).sum())

        df_object = pd.DataFrame({'a': ['a']})
        assert (df_object.memory_usage(deep=True).sum() >
                df_object.memory_usage().sum())
|
||||
|
||||
    @pytest.mark.skipif(not PYPY,
                        reason="on PyPy deep=True does not change result")
    def test_info_memory_usage_deep_pypy(self):
        """On PyPy, getsizeof-style introspection is unavailable, so
        deep=True reports the same figure as the shallow estimate.
        """
        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
        assert (df_with_object_index.memory_usage(
                index=True, deep=True).sum() ==
                df_with_object_index.memory_usage(
                index=True).sum())

        df_object = pd.DataFrame({'a': ['a']})
        assert (df_object.memory_usage(deep=True).sum() ==
                df_object.memory_usage().sum())
|
||||
|
||||
    @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
    def test_usage_via_getsizeof(self):
        """sys.getsizeof(df) should agree with memory_usage(deep=True)
        up to a small GC-overhead margin.
        """
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product(
                [['a'], range(1000)]),
            columns=['A']
        )
        mem = df.memory_usage(deep=True).sum()
        # sys.getsizeof will call the .memory_usage with
        # deep=True, and add on some GC overhead
        diff = mem - sys.getsizeof(df)
        assert abs(diff) < 100
|
||||
|
||||
    def test_info_memory_usage_qualified(self):
        """The '+' qualifier appears exactly when the index holds object data
        (a lower-bound estimate), including object levels of a MultiIndex.
        """
        # integer index: exact figure, no '+'
        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=[1, 2, 3])
        df.info(buf=buf)
        assert '+' not in buf.getvalue()

        # object (string) index: lower bound, '+'
        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=list('ABC'))
        df.info(buf=buf)
        assert '+' in buf.getvalue()

        # all-integer MultiIndex: exact, no '+'
        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=pd.MultiIndex.from_product(
                           [range(3), range(3)]))
        df.info(buf=buf)
        assert '+' not in buf.getvalue()

        # MultiIndex with an object level: lower bound, '+'
        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=pd.MultiIndex.from_product(
                           [range(3), ['foo', 'bar']]))
        df.info(buf=buf)
        assert '+' in buf.getvalue()
|
||||
|
||||
    def test_info_memory_usage_bug_on_multiindex(self):
        """Deep memory_usage on a MultiIndex must not materialize .values.

        GH 14308
        """
        # memory usage introspection should not materialize .values

        from string import ascii_uppercase as uppercase

        def memory_usage(f):
            # total deep memory usage of a frame, in bytes
            return f.memory_usage(deep=True).sum()

        N = 100
        M = len(uppercase)
        index = pd.MultiIndex.from_product([list(uppercase),
                                            pd.date_range('20160101',
                                                          periods=N)],
                                           names=['id', 'date'])
        df = DataFrame({'value': np.random.randn(N * M)}, index=index)

        # unstacking spreads the same values over a flat index; data size is
        # identical, but the MultiIndex frame must not report *less* usage
        unstacked = df.unstack('id')
        assert df.values.nbytes == unstacked.values.nbytes
        assert memory_usage(df) > memory_usage(unstacked)

        # high upper bound
        assert memory_usage(unstacked) - memory_usage(df) < 2000
|
||||
|
||||
    def test_info_categorical(self):
        """info() must not raise on a frame with CategoricalIndex axes.

        GH14298
        """
        idx = pd.CategoricalIndex(['a', 'b'])
        df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)

        buf = StringIO()
        df.info(buf=buf)
|
||||
|
||||
    def test_info_categorical_column(self):
        """info() must not raise on a categorical column, including after
        boolean filtering (which can drop some categories' rows).
        """
        # make sure it works
        n = 2500
        df = DataFrame({'int64': np.random.randint(100, size=n)})
        df['category'] = Series(np.array(list('abcdefghij')).take(
            np.random.randint(0, 10, size=n))).astype('category')
        df.isna()
        buf = StringIO()
        df.info(buf=buf)

        df2 = df[df['category'] == 'd']
        buf = compat.StringIO()
        df2.info(buf=buf)
|
||||
|
||||
    def test_repr_categorical_dates_periods(self):
        """Categoricals of tz-aware datetimes / periods repr like the raw types."""
        # normal DataFrame
        dt = date_range('2011-01-01 09:00', freq='H', periods=5,
                        tz='US/Eastern')
        p = period_range('2011-01', freq='M', periods=5)
        df = DataFrame({'dt': dt, 'p': p})
        # NOTE(review): column alignment spaces in this expected block may have
        # been collapsed by extraction of this source — confirm padding.
        # NOTE(review): the plain (non-categorical) frame's repr is never
        # compared against ``exp`` before ``df`` is rebuilt below — confirm
        # whether an ``assert repr(df) == exp`` was intended here.
        exp = """ dt p
0 2011-01-01 09:00:00-05:00 2011-01
1 2011-01-01 10:00:00-05:00 2011-02
2 2011-01-01 11:00:00-05:00 2011-03
3 2011-01-01 12:00:00-05:00 2011-04
4 2011-01-01 13:00:00-05:00 2011-05"""

        df = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)})
        assert repr(df) == exp
|
||||
@@ -1,912 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from warnings import catch_warnings
|
||||
from datetime import datetime
|
||||
|
||||
import itertools
|
||||
import pytest
|
||||
|
||||
from numpy.random import randn
|
||||
from numpy import nan
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat import u
|
||||
from pandas import (DataFrame, Index, Series, MultiIndex, date_range,
|
||||
Timedelta, Period)
|
||||
import pandas as pd
|
||||
|
||||
from pandas.util.testing import assert_series_equal, assert_frame_equal
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameReshape(TestData):
|
||||
|
||||
    def test_pivot(self):
        """Basic pivot(): values, index/columns name tracking, and the
        no-values form producing a MultiIndex of columns.
        """
        data = {
            'index': ['A', 'B', 'C', 'C', 'B', 'A'],
            'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
            'values': [1., 2., 3., 3., 2., 1.]
        }

        frame = DataFrame(data)
        pivoted = frame.pivot(
            index='index', columns='columns', values='values')

        expected = DataFrame({
            'One': {'A': 1., 'B': 2., 'C': 3.},
            'Two': {'A': 1., 'B': 2., 'C': 3.}
        })

        expected.index.name, expected.columns.name = 'index', 'columns'
        tm.assert_frame_equal(pivoted, expected)

        # name tracking
        assert pivoted.index.name == 'index'
        assert pivoted.columns.name == 'columns'

        # don't specify values
        pivoted = frame.pivot(index='index', columns='columns')
        assert pivoted.index.name == 'index'
        assert pivoted.columns.names == (None, 'columns')

        with catch_warnings(record=True):
            # pivot multiple columns
            wp = tm.makePanel()
            lp = wp.to_frame()
            df = lp.reset_index()
            tm.assert_frame_equal(df.pivot('major', 'minor'), lp.unstack())
|
||||
|
||||
    def test_pivot_duplicates(self):
        """pivot() raises on duplicate (index, columns) pairs."""
        data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'],
                          'b': ['one', 'two', 'one', 'one', 'two'],
                          'c': [1., 2., 3., 3., 4.]})
        # ('foo', 'one') appears twice with different values
        with tm.assert_raises_regex(ValueError, 'duplicate entries'):
            data.pivot('a', 'b', 'c')
|
||||
|
||||
    def test_pivot_empty(self):
        """pivot() of an empty frame yields an empty frame."""
        df = DataFrame({}, columns=['a', 'b', 'c'])
        result = df.pivot('a', 'b', 'c')
        expected = DataFrame({})
        tm.assert_frame_equal(result, expected, check_names=False)
|
||||
|
||||
    def test_pivot_integer_bug(self):
        """pivot() with integer positional column labels must not raise and
        must propagate the source column name onto the result's columns.
        """
        df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")])

        result = df.pivot(index=1, columns=0, values=2)
        repr(result)
        tm.assert_index_equal(result.columns, Index(['A', 'B'], name=0))
|
||||
|
||||
    def test_pivot_index_none(self):
        """pivot() without ``index=`` uses the existing index (gh-3962),
        both with and without an explicit ``values=``.
        """
        data = {
            'index': ['A', 'B', 'C', 'C', 'B', 'A'],
            'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
            'values': [1., 2., 3., 3., 2., 1.]
        }

        frame = DataFrame(data).set_index('index')
        result = frame.pivot(columns='columns', values='values')
        expected = DataFrame({
            'One': {'A': 1., 'B': 2., 'C': 3.},
            'Two': {'A': 1., 'B': 2., 'C': 3.}
        })

        expected.index.name, expected.columns.name = 'index', 'columns'
        assert_frame_equal(result, expected)

        # omit values: remaining columns become the top level of a MultiIndex
        result = frame.pivot(columns='columns')

        expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
                                                      ('values', 'Two')],
                                                     names=[None, 'columns'])
        expected.index.name = 'index'
        tm.assert_frame_equal(result, expected, check_names=False)
        assert result.index.name == 'index'
        assert result.columns.names == (None, 'columns')
        expected.columns = expected.columns.droplevel(0)
        result = frame.pivot(columns='columns', values='values')

        expected.columns.name = 'columns'
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
    def test_stack_unstack(self):
        """stack() followed by unstack() round-trips a frame, for both the
        default (innermost) level and level 0.
        """
        df = self.frame.copy()
        df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)

        stacked = df.stack()
        stacked_df = DataFrame({'foo': stacked, 'bar': stacked})

        unstacked = stacked.unstack()
        unstacked_df = stacked_df.unstack()

        assert_frame_equal(unstacked, df)
        assert_frame_equal(unstacked_df['bar'], df)

        # unstacking level 0 transposes the round-trip result
        unstacked_cols = stacked.unstack(0)
        unstacked_cols_df = stacked_df.unstack(0)
        assert_frame_equal(unstacked_cols.T, df)
        assert_frame_equal(unstacked_cols_df['bar'].T, df)
|
||||
|
||||
    def test_stack_mixed_level(self):
        """stack() works when a column level holds mixed-type labels.

        GH 18310
        """
        levels = [range(3), [3, 'a', 'b'], [1, 2]]

        # flat columns:
        df = DataFrame(1, index=levels[0], columns=levels[1])
        result = df.stack()
        expected = Series(1, index=MultiIndex.from_product(levels[:2]))
        assert_series_equal(result, expected)

        # MultiIndex columns:
        df = DataFrame(1, index=levels[0],
                       columns=MultiIndex.from_product(levels[1:]))
        result = df.stack(1)
        expected = DataFrame(1, index=MultiIndex.from_product([levels[0],
                                                               levels[2]]),
                             columns=levels[1])
        assert_frame_equal(result, expected)

        # as above, but used labels in level are actually of homogeneous type
        result = df[['a', 'b']].stack(1)
        expected = expected[['a', 'b']]
        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_unstack_fill(self):
        """``fill_value`` for Series/DataFrame unstack (GH #9746), including
        multi-level unstacking (GH #13971) and the stack() round-trip.
        """
        # GH #9746: fill_value keyword argument for Series
        # and DataFrame unstack

        # From a series
        data = Series([1, 2, 4, 5], dtype=np.int16)
        data.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = data.unstack(fill_value=-1)
        expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
                             index=['x', 'y', 'z'], dtype=np.int16)
        assert_frame_equal(result, expected)

        # From a series with incorrect data type for fill_value
        # (a float fill upcasts the int16 data)
        result = data.unstack(fill_value=0.5)
        expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
                             index=['x', 'y', 'z'], dtype=np.float)
        assert_frame_equal(result, expected)

        # GH #13971: fill_value when unstacking multiple levels:
        df = DataFrame({'x': ['a', 'a', 'b'],
                        'y': ['j', 'k', 'j'],
                        'z': [0, 1, 2],
                        'w': [0, 1, 2]}).set_index(['x', 'y', 'z'])
        unstacked = df.unstack(['x', 'y'], fill_value=0)
        key = ('w', 'b', 'j')
        expected = unstacked[key]
        result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
        assert_series_equal(result, expected)

        # stacking back must reproduce the original frame
        stacked = unstacked.stack(['x', 'y'])
        stacked.index = stacked.index.reorder_levels(df.index.names)
        # Workaround for GH #17886 (unnecessarily casts to float):
        stacked = stacked.astype(np.int64)
        result = stacked.loc[df.index]
        assert_frame_equal(result, df)

        # From a series
        s = df['w']
        result = s.unstack(['x', 'y'], fill_value=0)
        expected = unstacked['w']
        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_unstack_fill_frame(self):
        """DataFrame.unstack with ``fill_value``: homogeneous, mixed-dtype,
        and dtype-mismatched fill (which upcasts to float).
        """
        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
        df.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df['A'] = df['A'].astype(np.int16)
        df['B'] = df['B'].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected['A'] = expected['A'].astype(np.int16)
        expected['B'] = expected['B'].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_unstack_fill_frame_datetime(self):
        """unstack of datetime64 data: missing slots become NaT by default,
        or the supplied ``fill_value`` timestamp.
        """
        # Test unstacking with date times
        dv = pd.date_range('2012-01-01', periods=4).values
        data = Series(dv)
        data.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = data.unstack()
        expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
                              'b': [dv[1], dv[2], pd.NaT]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)

        result = data.unstack(fill_value=dv[0])
        expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
                              'b': [dv[1], dv[2], dv[0]]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_unstack_fill_frame_timedelta(self):
        """unstack of timedelta64 data: missing slots become NaT by default,
        or the supplied ``fill_value`` timedelta.
        """
        # Test unstacking with time deltas
        td = [Timedelta(days=i) for i in range(4)]
        data = Series(td)
        data.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = data.unstack()
        expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
                              'b': [td[1], td[2], pd.NaT]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)

        result = data.unstack(fill_value=td[1])
        expected = DataFrame({'a': [td[0], td[1], td[3]],
                              'b': [td[1], td[2], td[1]]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_unstack_fill_frame_period(self):
        """unstack of Period data: missing slots become None by default,
        or the supplied ``fill_value`` period.
        """
        # Test unstacking with period
        periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
                   Period('2012-04')]
        data = Series(periods)
        data.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = data.unstack()
        expected = DataFrame({'a': [periods[0], None, periods[3]],
                              'b': [periods[1], periods[2], None]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)

        result = data.unstack(fill_value=periods[1])
        expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
                              'b': [periods[1], periods[2], periods[1]]},
                             index=['x', 'y', 'z'])
        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_unstack_fill_frame_categorical(self):
        """unstack of categorical data: default NaN fill, an out-of-category
        ``fill_value`` behaving like NaN, and an in-category fill.
        """
        # Test unstacking with categorical
        data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
        data.index = pd.MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        # By default missing values will be NaN
        result = data.unstack()
        expected = DataFrame({'a': pd.Categorical(list('axa'),
                                                  categories=list('abc')),
                              'b': pd.Categorical(list('bcx'),
                                                  categories=list('abc'))},
                             index=list('xyz'))
        assert_frame_equal(result, expected)

        # Fill with non-category results in NaN entries similar to above
        result = data.unstack(fill_value='d')
        assert_frame_equal(result, expected)

        # Fill with category value replaces missing values as expected
        result = data.unstack(fill_value='c')
        expected = DataFrame({'a': pd.Categorical(list('aca'),
                                                  categories=list('abc')),
                              'b': pd.Categorical(list('bcc'),
                                                  categories=list('abc'))},
                             index=list('xyz'))
        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_unstack_preserve_dtypes(self):
        """unstack([name]) and unstack(name) agree across many column dtypes.

        Checks fix for #11847.
        """
        df = pd.DataFrame(dict(state=['IL', 'MI', 'NC'],
                               index=['a', 'b', 'c'],
                               some_categories=pd.Series(['a', 'b', 'c']
                                                         ).astype('category'),
                               A=np.random.rand(3),
                               B=1,
                               C='foo',
                               D=pd.Timestamp('20010102'),
                               E=pd.Series([1.0, 50.0, 100.0]
                                           ).astype('float32'),
                               F=pd.Series([3.0, 4.0, 5.0]).astype('float64'),
                               G=False,
                               H=pd.Series([1, 200, 923442], dtype='int8')))

        def unstack_and_compare(df, column_name):
            # list-of-one and scalar level arguments must produce equal results
            unstacked1 = df.unstack([column_name])
            unstacked2 = df.unstack(column_name)
            assert_frame_equal(unstacked1, unstacked2)

        df1 = df.set_index(['state', 'index'])
        unstack_and_compare(df1, 'index')

        df1 = df.set_index(['state', 'some_categories'])
        unstack_and_compare(df1, 'some_categories')

        df1 = df.set_index(['F', 'C'])
        unstack_and_compare(df1, 'F')

        df1 = df.set_index(['G', 'B', 'state'])
        unstack_and_compare(df1, 'B')

        df1 = df.set_index(['E', 'A'])
        unstack_and_compare(df1, 'E')

        # same check for a Series
        df1 = df.set_index(['state', 'index'])
        s = df1['A']
        unstack_and_compare(s, 'index')
|
||||
|
||||
    def test_stack_ints(self):
        """Stacking two levels at once equals stacking them one at a time,
        using positive positions, negative positions, and integer names.
        """
        columns = MultiIndex.from_tuples(list(itertools.product(range(3),
                                                                repeat=3)))
        df = DataFrame(np.random.randn(30, 27), columns=columns)

        assert_frame_equal(df.stack(level=[1, 2]),
                           df.stack(level=1).stack(level=1))
        assert_frame_equal(df.stack(level=[-2, -1]),
                           df.stack(level=1).stack(level=1))

        df_named = df.copy()
        df_named.columns.set_names(range(3), inplace=True)

        assert_frame_equal(df_named.stack(level=[1, 2]),
                           df_named.stack(level=1).stack(level=1))
|
||||
|
||||
    def test_stack_mixed_levels(self):
        """stack() with a mix of string and integer level identifiers:
        ints that are valid level *names* are honoured; ints that are only
        positions raise (GH #8584).
        """
        columns = MultiIndex.from_tuples(
            [('A', 'cat', 'long'), ('B', 'cat', 'long'),
             ('A', 'dog', 'short'), ('B', 'dog', 'short')],
            names=['exp', 'animal', 'hair_length']
        )
        df = DataFrame(randn(4, 4), columns=columns)

        animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
        exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

        # GH #8584: Need to check that stacking works when a number
        # is passed that is both a level name and in the range of
        # the level numbers
        df2 = df.copy()
        df2.columns.names = ['exp', 'animal', 1]
        assert_frame_equal(df2.stack(level=['animal', 1]),
                           animal_hair_stacked, check_names=False)
        assert_frame_equal(df2.stack(level=['exp', 1]),
                           exp_hair_stacked, check_names=False)

        # When mixed types are passed and the ints are not level
        # names, raise
        pytest.raises(ValueError, df2.stack, level=['animal', 0])

        # GH #8584: Having 0 in the level names could raise a
        # strange error about lexsort depth
        df3 = df.copy()
        df3.columns.names = ['exp', 'animal', 0]
        assert_frame_equal(df3.stack(level=['animal', 0]),
                           animal_hair_stacked, check_names=False)
|
||||
|
||||
def test_stack_int_level_names(self):
|
||||
columns = MultiIndex.from_tuples(
|
||||
[('A', 'cat', 'long'), ('B', 'cat', 'long'),
|
||||
('A', 'dog', 'short'), ('B', 'dog', 'short')],
|
||||
names=['exp', 'animal', 'hair_length']
|
||||
)
|
||||
df = DataFrame(randn(4, 4), columns=columns)
|
||||
|
||||
exp_animal_stacked = df.stack(level=['exp', 'animal'])
|
||||
animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
|
||||
exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
|
||||
|
||||
df2 = df.copy()
|
||||
df2.columns.names = [0, 1, 2]
|
||||
assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked,
|
||||
check_names=False)
|
||||
assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked,
|
||||
check_names=False)
|
||||
assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked,
|
||||
check_names=False)
|
||||
|
||||
# Out-of-order int column names
|
||||
df3 = df.copy()
|
||||
df3.columns.names = [2, 0, 1]
|
||||
assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked,
|
||||
check_names=False)
|
||||
assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked,
|
||||
check_names=False)
|
||||
assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked,
|
||||
check_names=False)
|
||||
|
||||
def test_unstack_bool(self):
|
||||
df = DataFrame([False, False],
|
||||
index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),
|
||||
columns=['col'])
|
||||
rs = df.unstack()
|
||||
xp = DataFrame(np.array([[False, np.nan], [np.nan, False]],
|
||||
dtype=object),
|
||||
index=['a', 'b'],
|
||||
columns=MultiIndex.from_arrays([['col', 'col'],
|
||||
['c', 'l']]))
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_unstack_level_binding(self):
|
||||
# GH9856
|
||||
mi = pd.MultiIndex(
|
||||
levels=[[u('foo'), u('bar')], [u('one'), u('two')],
|
||||
[u('a'), u('b')]],
|
||||
labels=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
|
||||
names=[u('first'), u('second'), u('third')])
|
||||
s = pd.Series(0, index=mi)
|
||||
result = s.unstack([1, 2]).stack(0)
|
||||
|
||||
expected_mi = pd.MultiIndex(
|
||||
levels=[['foo', 'bar'], ['one', 'two']],
|
||||
labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
|
||||
names=['first', 'second'])
|
||||
|
||||
expected = pd.DataFrame(np.array([[np.nan, 0],
|
||||
[0, np.nan],
|
||||
[np.nan, 0],
|
||||
[0, np.nan]],
|
||||
dtype=np.float64),
|
||||
index=expected_mi,
|
||||
columns=pd.Index(['a', 'b'], name='third'))
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_unstack_to_series(self):
|
||||
# check reversibility
|
||||
data = self.frame.unstack()
|
||||
|
||||
assert isinstance(data, Series)
|
||||
undo = data.unstack().T
|
||||
assert_frame_equal(undo, self.frame)
|
||||
|
||||
# check NA handling
|
||||
data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]})
|
||||
data.index = Index(['a', 'b', 'c'])
|
||||
result = data.unstack()
|
||||
|
||||
midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']],
|
||||
labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
|
||||
expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)
|
||||
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# check composability of unstack
|
||||
old_data = data.copy()
|
||||
for _ in range(4):
|
||||
data = data.unstack()
|
||||
assert_frame_equal(old_data, data)
|
||||
|
||||
def test_unstack_dtypes(self):
|
||||
|
||||
# GH 2929
|
||||
rows = [[1, 1, 3, 4],
|
||||
[1, 2, 3, 4],
|
||||
[2, 1, 3, 4],
|
||||
[2, 2, 3, 4]]
|
||||
|
||||
df = DataFrame(rows, columns=list('ABCD'))
|
||||
result = df.get_dtype_counts()
|
||||
expected = Series({'int64': 4})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# single dtype
|
||||
df2 = df.set_index(['A', 'B'])
|
||||
df3 = df2.unstack('B')
|
||||
result = df3.get_dtype_counts()
|
||||
expected = Series({'int64': 4})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# mixed
|
||||
df2 = df.set_index(['A', 'B'])
|
||||
df2['C'] = 3.
|
||||
df3 = df2.unstack('B')
|
||||
result = df3.get_dtype_counts()
|
||||
expected = Series({'int64': 2, 'float64': 2})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df2['D'] = 'foo'
|
||||
df3 = df2.unstack('B')
|
||||
result = df3.get_dtype_counts()
|
||||
expected = Series({'float64': 2, 'object': 2})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# GH7405
|
||||
for c, d in (np.zeros(5), np.zeros(5)), \
|
||||
(np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')):
|
||||
|
||||
df = DataFrame({'A': ['a'] * 5, 'C': c, 'D': d,
|
||||
'B': pd.date_range('2012-01-01', periods=5)})
|
||||
|
||||
right = df.iloc[:3].copy(deep=True)
|
||||
|
||||
df = df.set_index(['A', 'B'])
|
||||
df['D'] = df['D'].astype('int64')
|
||||
|
||||
left = df.iloc[:3].unstack(0)
|
||||
right = right.set_index(['A', 'B']).unstack(0)
|
||||
right[('D', 'a')] = right[('D', 'a')].astype('int64')
|
||||
|
||||
assert left.shape == (3, 2)
|
||||
tm.assert_frame_equal(left, right)
|
||||
|
||||
def test_unstack_non_unique_index_names(self):
|
||||
idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
|
||||
names=['c1', 'c1'])
|
||||
df = DataFrame([1, 2], index=idx)
|
||||
with pytest.raises(ValueError):
|
||||
df.unstack('c1')
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
df.T.stack('c1')
|
||||
|
||||
def test_unstack_unused_levels(self):
|
||||
# GH 17845: unused labels in index make unstack() cast int to float
|
||||
idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]
|
||||
df = pd.DataFrame([[1, 0]] * 3, index=idx)
|
||||
|
||||
result = df.unstack()
|
||||
exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']])
|
||||
expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'],
|
||||
columns=exp_col)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
assert((result.columns.levels[1] == idx.levels[1]).all())
|
||||
|
||||
# Unused items on both levels
|
||||
levels = [[0, 1, 7], [0, 1, 2, 3]]
|
||||
labels = [[0, 0, 1, 1], [0, 2, 0, 2]]
|
||||
idx = pd.MultiIndex(levels, labels)
|
||||
block = np.arange(4).reshape(2, 2)
|
||||
df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx)
|
||||
result = df.unstack()
|
||||
expected = pd.DataFrame(np.concatenate([block * 2, block * 2 + 1],
|
||||
axis=1),
|
||||
columns=idx)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
assert((result.columns.levels[1] == idx.levels[1]).all())
|
||||
|
||||
# With mixed dtype and NaN
|
||||
levels = [['a', 2, 'c'], [1, 3, 5, 7]]
|
||||
labels = [[0, -1, 1, 1], [0, 2, -1, 2]]
|
||||
idx = pd.MultiIndex(levels, labels)
|
||||
data = np.arange(8)
|
||||
df = pd.DataFrame(data.reshape(4, 2), index=idx)
|
||||
|
||||
cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11],
|
||||
[np.nan, 'a', 2], [np.nan, 5, 1]),
|
||||
(1, [8, 11, 1, 4, 12, 15, 13, 16],
|
||||
[np.nan, 5, 1], [np.nan, 'a', 2]))
|
||||
for level, idces, col_level, idx_level in cases:
|
||||
result = df.unstack(level=level)
|
||||
exp_data = np.zeros(18) * np.nan
|
||||
exp_data[idces] = data
|
||||
cols = pd.MultiIndex.from_product([[0, 1], col_level])
|
||||
expected = pd.DataFrame(exp_data.reshape(3, 6),
|
||||
index=idx_level, columns=cols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("cols", [['A', 'C'], slice(None)])
|
||||
def test_unstack_unused_level(self, cols):
|
||||
# GH 18562 : unused labels on the unstacked level
|
||||
df = pd.DataFrame([[2010, 'a', 'I'],
|
||||
[2011, 'b', 'II']],
|
||||
columns=['A', 'B', 'C'])
|
||||
|
||||
ind = df.set_index(['A', 'B', 'C'], drop=False)
|
||||
selection = ind.loc[(slice(None), slice(None), 'I'), cols]
|
||||
result = selection.unstack()
|
||||
|
||||
expected = ind.iloc[[0]][cols]
|
||||
expected.columns = MultiIndex.from_product([expected.columns, ['I']],
|
||||
names=[None, 'C'])
|
||||
expected.index = expected.index.droplevel('C')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_unstack_nan_index(self): # GH7466
|
||||
cast = lambda val: '{0:1}'.format('' if val != val else val)
|
||||
nan = np.nan
|
||||
|
||||
def verify(df):
|
||||
mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
|
||||
rows, cols = df.notna().values.nonzero()
|
||||
for i, j in zip(rows, cols):
|
||||
left = sorted(df.iloc[i, j].split('.'))
|
||||
right = mk_list(df.index[i]) + mk_list(df.columns[j])
|
||||
right = sorted(list(map(cast, right)))
|
||||
assert left == right
|
||||
|
||||
df = DataFrame({'jim': ['a', 'b', nan, 'd'],
|
||||
'joe': ['w', 'x', 'y', 'z'],
|
||||
'jolie': ['a.w', 'b.x', ' .y', 'd.z']})
|
||||
|
||||
left = df.set_index(['jim', 'joe']).unstack()['jolie']
|
||||
right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
for idx in itertools.permutations(df.columns[:2]):
|
||||
mi = df.set_index(list(idx))
|
||||
for lev in range(2):
|
||||
udf = mi.unstack(level=lev)
|
||||
assert udf.notna().values.sum() == len(df)
|
||||
verify(udf['jolie'])
|
||||
|
||||
df = DataFrame({'1st': ['d'] * 3 + [nan] * 5 + ['a'] * 2 +
|
||||
['c'] * 3 + ['e'] * 2 + ['b'] * 5,
|
||||
'2nd': ['y'] * 2 + ['w'] * 3 + [nan] * 3 +
|
||||
['z'] * 4 + [nan] * 3 + ['x'] * 3 + [nan] * 2,
|
||||
'3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59,
|
||||
50, 62, 59, 76, 52, 14, 53, 60, 51]})
|
||||
|
||||
df['4th'], df['5th'] = \
|
||||
df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \
|
||||
df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1)
|
||||
|
||||
for idx in itertools.permutations(['1st', '2nd', '3rd']):
|
||||
mi = df.set_index(list(idx))
|
||||
for lev in range(3):
|
||||
udf = mi.unstack(level=lev)
|
||||
assert udf.notna().values.sum() == 2 * len(df)
|
||||
for col in ['4th', '5th']:
|
||||
verify(udf[col])
|
||||
|
||||
# GH7403
|
||||
df = pd.DataFrame(
|
||||
{'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)})
|
||||
df.iloc[3, 1] = np.NaN
|
||||
left = df.set_index(['A', 'B']).unstack(0)
|
||||
|
||||
vals = [[3, 0, 1, 2, nan, nan, nan, nan],
|
||||
[nan, nan, nan, nan, 4, 5, 6, 7]]
|
||||
vals = list(map(list, zip(*vals)))
|
||||
idx = Index([nan, 0, 1, 2, 4, 5, 6, 7], name='B')
|
||||
cols = MultiIndex(levels=[['C'], ['a', 'b']],
|
||||
labels=[[0, 0], [0, 1]],
|
||||
names=[None, 'A'])
|
||||
|
||||
right = DataFrame(vals, columns=cols, index=idx)
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
|
||||
'C': range(8)})
|
||||
df.iloc[2, 1] = np.NaN
|
||||
left = df.set_index(['A', 'B']).unstack(0)
|
||||
|
||||
vals = [[2, nan], [0, 4], [1, 5], [nan, 6], [3, 7]]
|
||||
cols = MultiIndex(levels=[['C'], ['a', 'b']],
|
||||
labels=[[0, 0], [0, 1]],
|
||||
names=[None, 'A'])
|
||||
idx = Index([nan, 0, 1, 2, 3], name='B')
|
||||
right = DataFrame(vals, columns=cols, index=idx)
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
|
||||
'C': range(8)})
|
||||
df.iloc[3, 1] = np.NaN
|
||||
left = df.set_index(['A', 'B']).unstack(0)
|
||||
|
||||
vals = [[3, nan], [0, 4], [1, 5], [2, 6], [nan, 7]]
|
||||
cols = MultiIndex(levels=[['C'], ['a', 'b']],
|
||||
labels=[[0, 0], [0, 1]],
|
||||
names=[None, 'A'])
|
||||
idx = Index([nan, 0, 1, 2, 3], name='B')
|
||||
right = DataFrame(vals, columns=cols, index=idx)
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
# GH7401
|
||||
df = pd.DataFrame({'A': list('aaaaabbbbb'),
|
||||
'B': (date_range('2012-01-01', periods=5)
|
||||
.tolist() * 2),
|
||||
'C': np.arange(10)})
|
||||
|
||||
df.iloc[3, 1] = np.NaN
|
||||
left = df.set_index(['A', 'B']).unstack()
|
||||
|
||||
vals = np.array([[3, 0, 1, 2, nan, 4], [nan, 5, 6, 7, 8, 9]])
|
||||
idx = Index(['a', 'b'], name='A')
|
||||
cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)],
|
||||
labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
|
||||
names=[None, 'B'])
|
||||
|
||||
right = DataFrame(vals, columns=cols, index=idx)
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
# GH4862
|
||||
vals = [['Hg', nan, nan, 680585148],
|
||||
['U', 0.0, nan, 680585148],
|
||||
['Pb', 7.07e-06, nan, 680585148],
|
||||
['Sn', 2.3614e-05, 0.0133, 680607017],
|
||||
['Ag', 0.0, 0.0133, 680607017],
|
||||
['Hg', -0.00015, 0.0133, 680607017]]
|
||||
df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'],
|
||||
index=[17263, 17264, 17265, 17266, 17267, 17268])
|
||||
|
||||
left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack()
|
||||
|
||||
vals = [[nan, nan, 7.07e-06, nan, 0.0],
|
||||
[0.0, -0.00015, nan, 2.3614e-05, nan]]
|
||||
|
||||
idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]],
|
||||
labels=[[0, 1], [-1, 0]],
|
||||
names=['s_id', 'dosage'])
|
||||
|
||||
cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']],
|
||||
labels=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
|
||||
names=[None, 'agent'])
|
||||
|
||||
right = DataFrame(vals, columns=cols, index=idx)
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent'])
|
||||
assert_frame_equal(left.unstack(), right)
|
||||
|
||||
# GH9497 - multiple unstack with nulls
|
||||
df = DataFrame({'1st': [1, 2, 1, 2, 1, 2],
|
||||
'2nd': pd.date_range('2014-02-01', periods=6,
|
||||
freq='D'),
|
||||
'jim': 100 + np.arange(6),
|
||||
'joe': (np.random.randn(6) * 10).round(2)})
|
||||
|
||||
df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
|
||||
df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan
|
||||
df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan
|
||||
|
||||
left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
|
||||
assert left.notna().values.sum() == 2 * len(df)
|
||||
|
||||
for col in ['jim', 'joe']:
|
||||
for _, r in df.iterrows():
|
||||
key = r['1st'], (col, r['2nd'], r['3rd'])
|
||||
assert r[col] == left.loc[key]
|
||||
|
||||
def test_stack_datetime_column_multiIndex(self):
|
||||
# GH 8039
|
||||
t = datetime(2014, 1, 1)
|
||||
df = DataFrame(
|
||||
[1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, 'A', 'B')]))
|
||||
result = df.stack()
|
||||
|
||||
eidx = MultiIndex.from_product([(0, 1, 2, 3), ('B',)])
|
||||
ecols = MultiIndex.from_tuples([(t, 'A')])
|
||||
expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_stack_partial_multiIndex(self):
|
||||
# GH 8844
|
||||
def _test_stack_with_multiindex(multiindex):
|
||||
df = DataFrame(np.arange(3 * len(multiindex))
|
||||
.reshape(3, len(multiindex)),
|
||||
columns=multiindex)
|
||||
for level in (-1, 0, 1, [0, 1], [1, 0]):
|
||||
result = df.stack(level=level, dropna=False)
|
||||
|
||||
if isinstance(level, int):
|
||||
# Stacking a single level should not make any all-NaN rows,
|
||||
# so df.stack(level=level, dropna=False) should be the same
|
||||
# as df.stack(level=level, dropna=True).
|
||||
expected = df.stack(level=level, dropna=True)
|
||||
if isinstance(expected, Series):
|
||||
assert_series_equal(result, expected)
|
||||
else:
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df.columns = MultiIndex.from_tuples(df.columns.get_values(),
|
||||
names=df.columns.names)
|
||||
expected = df.stack(level=level, dropna=False)
|
||||
if isinstance(expected, Series):
|
||||
assert_series_equal(result, expected)
|
||||
else:
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'),
|
||||
('A', 'y'),
|
||||
('C', 'x'), ('C', 'u')],
|
||||
names=['Upper', 'Lower'])
|
||||
for multiindex_columns in ([0, 1, 2, 3, 4],
|
||||
[0, 1, 2, 3], [0, 1, 2, 4],
|
||||
[0, 1, 2], [1, 2, 3], [2, 3, 4],
|
||||
[0, 1], [0, 2], [0, 3],
|
||||
[0], [2], [4]):
|
||||
_test_stack_with_multiindex(full_multiindex[multiindex_columns])
|
||||
if len(multiindex_columns) > 1:
|
||||
multiindex_columns.reverse()
|
||||
_test_stack_with_multiindex(
|
||||
full_multiindex[multiindex_columns])
|
||||
|
||||
df = DataFrame(np.arange(6).reshape(2, 3),
|
||||
columns=full_multiindex[[0, 1, 3]])
|
||||
result = df.stack(dropna=False)
|
||||
expected = DataFrame([[0, 2], [1, nan], [3, 5], [4, nan]],
|
||||
index=MultiIndex(
|
||||
levels=[[0, 1], ['u', 'x', 'y', 'z']],
|
||||
labels=[[0, 0, 1, 1],
|
||||
[1, 3, 1, 3]],
|
||||
names=[None, 'Lower']),
|
||||
columns=Index(['B', 'C'], name='Upper'),
|
||||
dtype=df.dtypes[0])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_stack_preserve_categorical_dtype(self):
|
||||
# GH13854
|
||||
for ordered in [False, True]:
|
||||
for labels in [list("yxz"), list("yxy")]:
|
||||
cidx = pd.CategoricalIndex(labels, categories=list("xyz"),
|
||||
ordered=ordered)
|
||||
df = DataFrame([[10, 11, 12]], columns=cidx)
|
||||
result = df.stack()
|
||||
|
||||
# `MutliIndex.from_product` preserves categorical dtype -
|
||||
# it's tested elsewhere.
|
||||
midx = pd.MultiIndex.from_product([df.index, cidx])
|
||||
expected = Series([10, 11, 12], index=midx)
|
||||
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("level", [0, 'baz'])
|
||||
def test_unstack_swaplevel_sortlevel(self, level):
|
||||
# GH 20994
|
||||
mi = pd.MultiIndex.from_product([[0], ['d', 'c']],
|
||||
names=['bar', 'baz'])
|
||||
df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A'])
|
||||
df.columns.name = 'foo'
|
||||
|
||||
expected = pd.DataFrame([
|
||||
[3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([
|
||||
('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[
|
||||
'baz', 'foo']))
|
||||
expected.index.name = 'bar'
|
||||
|
||||
result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_unstack_fill_frame_object():
|
||||
# GH12815 Test unstacking with object.
|
||||
data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')
|
||||
data.index = pd.MultiIndex.from_tuples(
|
||||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
||||
|
||||
# By default missing values will be NaN
|
||||
result = data.unstack()
|
||||
expected = pd.DataFrame(
|
||||
{'a': ['a', np.nan, 'a'], 'b': ['b', 'c', np.nan]},
|
||||
index=list('xyz')
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Fill with any value replaces missing values as expected
|
||||
result = data.unstack(fill_value='d')
|
||||
expected = pd.DataFrame(
|
||||
{'a': ['a', 'd', 'a'], 'b': ['b', 'c', 'd']},
|
||||
index=list('xyz')
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
-126
@@ -1,126 +0,0 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Index
|
||||
from pandas.errors import PerformanceWarning
|
||||
from pandas.util import testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_none():
|
||||
return DataFrame({
|
||||
'outer': ['a', 'a', 'a', 'b', 'b', 'b'],
|
||||
'inner': [1, 2, 2, 2, 1, 1],
|
||||
'A': np.arange(6, 0, -1),
|
||||
('B', 5): ['one', 'one', 'two', 'two', 'one', 'one']})
|
||||
|
||||
|
||||
@pytest.fixture(params=[
|
||||
['outer'],
|
||||
['outer', 'inner']
|
||||
])
|
||||
def df_idx(request, df_none):
|
||||
levels = request.param
|
||||
return df_none.set_index(levels)
|
||||
|
||||
|
||||
@pytest.fixture(params=[
|
||||
'inner', # index level
|
||||
['outer'], # list of index level
|
||||
'A', # column
|
||||
[('B', 5)], # list of column
|
||||
['inner', 'outer'], # two index levels
|
||||
[('B', 5), 'outer'], # index level and column
|
||||
['A', ('B', 5)], # Two columns
|
||||
['inner', 'outer'] # two index levels and column
|
||||
])
|
||||
def sort_names(request):
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
|
||||
def ascending(request):
|
||||
return request.param
|
||||
|
||||
|
||||
def test_sort_index_level_and_column_label(
|
||||
df_none, df_idx, sort_names, ascending):
|
||||
|
||||
# GH 14353
|
||||
|
||||
# Get index levels from df_idx
|
||||
levels = df_idx.index.names
|
||||
|
||||
# Compute expected by sorting on columns and the setting index
|
||||
expected = df_none.sort_values(by=sort_names,
|
||||
ascending=ascending,
|
||||
axis=0).set_index(levels)
|
||||
|
||||
# Compute result sorting on mix on columns and index levels
|
||||
result = df_idx.sort_values(by=sort_names,
|
||||
ascending=ascending,
|
||||
axis=0)
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_sort_column_level_and_index_label(
|
||||
df_none, df_idx, sort_names, ascending):
|
||||
|
||||
# GH 14353
|
||||
|
||||
# Get levels from df_idx
|
||||
levels = df_idx.index.names
|
||||
|
||||
# Compute expected by sorting on axis=0, setting index levels, and then
|
||||
# transposing. For some cases this will result in a frame with
|
||||
# multiple column levels
|
||||
expected = df_none.sort_values(by=sort_names,
|
||||
ascending=ascending,
|
||||
axis=0).set_index(levels).T
|
||||
|
||||
# Compute result by transposing and sorting on axis=1.
|
||||
result = df_idx.T.sort_values(by=sort_names,
|
||||
ascending=ascending,
|
||||
axis=1)
|
||||
|
||||
if len(levels) > 1:
|
||||
# Accessing multi-level columns that are not lexsorted raises a
|
||||
# performance warning
|
||||
with tm.assert_produces_warning(PerformanceWarning,
|
||||
check_stacklevel=False):
|
||||
assert_frame_equal(result, expected)
|
||||
else:
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_sort_values_column_index_level_precedence():
|
||||
# GH 14353, when a string passed as the `by` parameter
|
||||
# matches a column and an index level the column takes
|
||||
# precedence
|
||||
|
||||
# Construct DataFrame with index and column named 'idx'
|
||||
idx = Index(np.arange(1, 7), name='idx')
|
||||
df = DataFrame({'A': np.arange(11, 17),
|
||||
'idx': np.arange(6, 0, -1)},
|
||||
index=idx)
|
||||
|
||||
# Sorting by 'idx' should sort by the idx column and raise a
|
||||
# FutureWarning
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
result = df.sort_values(by='idx')
|
||||
|
||||
# This should be equivalent to sorting by the 'idx' index level in
|
||||
# descending order
|
||||
expected = df.sort_index(level='idx', ascending=False)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Perform same test with MultiIndex
|
||||
df_multi = df.set_index('A', append=True)
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
result = df_multi.sort_values(by='idx')
|
||||
|
||||
expected = df_multi.sort_index(level='idx', ascending=False)
|
||||
assert_frame_equal(result, expected)
|
||||
@@ -1,600 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
import pandas as pd
|
||||
from pandas.compat import lrange
|
||||
from pandas.api.types import CategoricalDtype
|
||||
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
|
||||
date_range, NaT, IntervalIndex)
|
||||
|
||||
from pandas.util.testing import assert_series_equal, assert_frame_equal
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameSorting(TestData):
|
||||
|
||||
def test_sort(self):
|
||||
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
|
||||
columns=['A', 'B', 'C', 'D'])
|
||||
|
||||
# see gh-9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
frame.sortlevel()
|
||||
|
||||
def test_sort_values(self):
|
||||
frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]],
|
||||
index=[1, 2, 3], columns=list('ABC'))
|
||||
|
||||
# by column (axis=0)
|
||||
sorted_df = frame.sort_values(by='A')
|
||||
indexer = frame['A'].argsort().values
|
||||
expected = frame.loc[frame.index[indexer]]
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by='A', ascending=False)
|
||||
indexer = indexer[::-1]
|
||||
expected = frame.loc[frame.index[indexer]]
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by='A', ascending=False)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# GH4839
|
||||
sorted_df = frame.sort_values(by=['A'], ascending=[False])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# multiple bys
|
||||
sorted_df = frame.sort_values(by=['B', 'C'])
|
||||
expected = frame.loc[[2, 1, 3]]
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by=['B', 'C'], ascending=False)
|
||||
assert_frame_equal(sorted_df, expected[::-1])
|
||||
|
||||
sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
pytest.raises(ValueError, lambda: frame.sort_values(
|
||||
by=['A', 'B'], axis=2, inplace=True))
|
||||
|
||||
# by row (axis=1): GH 10806
|
||||
sorted_df = frame.sort_values(by=3, axis=1)
|
||||
expected = frame
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
|
||||
expected = frame.reindex(columns=['C', 'B', 'A'])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by=[1, 2], axis='columns')
|
||||
expected = frame.reindex(columns=['B', 'A', 'C'])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by=[1, 3], axis=1,
|
||||
ascending=[True, False])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
|
||||
expected = frame.reindex(columns=['C', 'B', 'A'])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
msg = r'Length of ascending \(5\) != length of by \(2\)'
|
||||
with tm.assert_raises_regex(ValueError, msg):
|
||||
frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5)
|
||||
|
||||
def test_sort_values_inplace(self):
|
||||
frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
|
||||
columns=['A', 'B', 'C', 'D'])
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by='A', inplace=True)
|
||||
expected = frame.sort_values(by='A')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by=1, axis=1, inplace=True)
|
||||
expected = frame.sort_values(by=1, axis=1)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by='A', ascending=False, inplace=True)
|
||||
expected = frame.sort_values(by='A', ascending=False)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True)
|
||||
expected = frame.sort_values(by=['A', 'B'], ascending=False)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_nan(self):
|
||||
# GH3917
|
||||
nan = np.nan
|
||||
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
|
||||
'B': [9, nan, 5, 2, 5, 4, 5]})
|
||||
|
||||
# sort one column only
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 1, 1, 2, 4, 6, 8],
|
||||
'B': [5, 9, 2, nan, 5, 5, 4]},
|
||||
index=[2, 0, 3, 1, 6, 4, 5])
|
||||
sorted_df = df.sort_values(['A'], na_position='first')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 8, 6, 4, 2, 1, 1],
|
||||
'B': [5, 4, 5, 5, nan, 9, 2]},
|
||||
index=[2, 5, 4, 6, 1, 0, 3])
|
||||
sorted_df = df.sort_values(['A'], na_position='first', ascending=False)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
expected = df.reindex(columns=['B', 'A'])
|
||||
sorted_df = df.sort_values(by=1, axis=1, na_position='first')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='last', order
|
||||
expected = DataFrame(
|
||||
{'A': [1, 1, 2, 4, 6, 8, nan],
|
||||
'B': [2, 9, nan, 5, 5, 4, 5]},
|
||||
index=[3, 0, 1, 6, 4, 5, 2])
|
||||
sorted_df = df.sort_values(['A', 'B'])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='first', order
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 1, 1, 2, 4, 6, 8],
|
||||
'B': [5, 2, 9, nan, 5, 5, 4]},
|
||||
index=[2, 3, 0, 1, 6, 4, 5])
|
||||
sorted_df = df.sort_values(['A', 'B'], na_position='first')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='first', not order
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 1, 1, 2, 4, 6, 8],
|
||||
'B': [5, 9, 2, nan, 5, 5, 4]},
|
||||
index=[2, 0, 3, 1, 6, 4, 5])
|
||||
sorted_df = df.sort_values(['A', 'B'], ascending=[
|
||||
1, 0], na_position='first')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='last', not order
|
||||
expected = DataFrame(
|
||||
{'A': [8, 6, 4, 2, 1, 1, nan],
|
||||
'B': [4, 5, 5, nan, 2, 9, 5]},
|
||||
index=[5, 4, 6, 1, 3, 0, 2])
|
||||
sorted_df = df.sort_values(['A', 'B'], ascending=[
|
||||
0, 1], na_position='last')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# Test DataFrame with nan label
|
||||
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
|
||||
'B': [9, nan, 5, 2, 5, 4, 5]},
|
||||
index=[1, 2, 3, 4, 5, 6, nan])
|
||||
|
||||
# NaN label, ascending=True, na_position='last'
|
||||
sorted_df = df.sort_index(
|
||||
kind='quicksort', ascending=True, na_position='last')
|
||||
expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
|
||||
'B': [9, nan, 5, 2, 5, 4, 5]},
|
||||
index=[1, 2, 3, 4, 5, 6, nan])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# NaN label, ascending=True, na_position='first'
|
||||
sorted_df = df.sort_index(na_position='first')
|
||||
expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8],
|
||||
'B': [5, 9, nan, 5, 2, 5, 4]},
|
||||
index=[nan, 1, 2, 3, 4, 5, 6])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# NaN label, ascending=False, na_position='last'
|
||||
sorted_df = df.sort_index(kind='quicksort', ascending=False)
|
||||
expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4],
|
||||
'B': [4, 5, 2, 5, nan, 9, 5]},
|
||||
index=[6, 5, 4, 3, 2, 1, nan])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# NaN label, ascending=False, na_position='first'
|
||||
sorted_df = df.sort_index(
|
||||
kind='quicksort', ascending=False, na_position='first')
|
||||
expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1],
|
||||
'B': [5, 4, 5, 2, 5, nan, 9]},
|
||||
index=[nan, 6, 5, 4, 3, 2, 1])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_stable_descending_sort(self):
|
||||
# GH #6399
|
||||
df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']],
|
||||
columns=['sort_col', 'order'])
|
||||
sorted_df = df.sort_values(by='sort_col', kind='mergesort',
|
||||
ascending=False)
|
||||
assert_frame_equal(df, sorted_df)
|
||||
|
||||
def test_stable_descending_multicolumn_sort(self):
|
||||
nan = np.nan
|
||||
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
|
||||
'B': [9, nan, 5, 2, 5, 4, 5]})
|
||||
# test stable mergesort
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 8, 6, 4, 2, 1, 1],
|
||||
'B': [5, 4, 5, 5, nan, 2, 9]},
|
||||
index=[2, 5, 4, 6, 1, 3, 0])
|
||||
sorted_df = df.sort_values(['A', 'B'], ascending=[0, 1],
|
||||
na_position='first',
|
||||
kind='mergesort')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 8, 6, 4, 2, 1, 1],
|
||||
'B': [5, 4, 5, 5, nan, 9, 2]},
|
||||
index=[2, 5, 4, 6, 1, 0, 3])
|
||||
sorted_df = df.sort_values(['A', 'B'], ascending=[0, 0],
|
||||
na_position='first',
|
||||
kind='mergesort')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_stable_categorial(self):
|
||||
# GH 16793
|
||||
df = DataFrame({
|
||||
'x': pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)
|
||||
})
|
||||
expected = df.copy()
|
||||
sorted_df = df.sort_values('x', kind='mergesort')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_datetimes(self):
|
||||
|
||||
# GH 3461, argsort / lexsort differences for a datetime column
|
||||
df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],
|
||||
columns=['A'],
|
||||
index=date_range('20130101', periods=9))
|
||||
dts = [Timestamp(x)
|
||||
for x in ['2004-02-11', '2004-01-21', '2004-01-26',
|
||||
'2005-09-20', '2010-10-04', '2009-05-12',
|
||||
'2008-11-12', '2010-09-28', '2010-09-28']]
|
||||
df['B'] = dts[::2] + dts[1::2]
|
||||
df['C'] = 2.
|
||||
df['A1'] = 3.
|
||||
|
||||
df1 = df.sort_values(by='A')
|
||||
df2 = df.sort_values(by=['A'])
|
||||
assert_frame_equal(df1, df2)
|
||||
|
||||
df1 = df.sort_values(by='B')
|
||||
df2 = df.sort_values(by=['B'])
|
||||
assert_frame_equal(df1, df2)
|
||||
|
||||
df1 = df.sort_values(by='B')
|
||||
|
||||
df2 = df.sort_values(by=['C', 'B'])
|
||||
assert_frame_equal(df1, df2)
|
||||
|
||||
def test_frame_column_inplace_sort_exception(self):
|
||||
s = self.frame['A']
|
||||
with tm.assert_raises_regex(ValueError, "This Series is a view"):
|
||||
s.sort_values(inplace=True)
|
||||
|
||||
cp = s.copy()
|
||||
cp.sort_values() # it works!
|
||||
|
||||
def test_sort_nat_values_in_int_column(self):
|
||||
|
||||
# GH 14922: "sorting with large float and multiple columns incorrect"
|
||||
|
||||
# cause was that the int64 value NaT was considered as "na". Which is
|
||||
# only correct for datetime64 columns.
|
||||
|
||||
int_values = (2, int(NaT))
|
||||
float_values = (2.0, -1.797693e308)
|
||||
|
||||
df = DataFrame(dict(int=int_values, float=float_values),
|
||||
columns=["int", "float"])
|
||||
|
||||
df_reversed = DataFrame(dict(int=int_values[::-1],
|
||||
float=float_values[::-1]),
|
||||
columns=["int", "float"],
|
||||
index=[1, 0])
|
||||
|
||||
# NaT is not a "na" for int64 columns, so na_position must not
|
||||
# influence the result:
|
||||
df_sorted = df.sort_values(["int", "float"], na_position="last")
|
||||
assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
df_sorted = df.sort_values(["int", "float"], na_position="first")
|
||||
assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
# reverse sorting order
|
||||
df_sorted = df.sort_values(["int", "float"], ascending=False)
|
||||
assert_frame_equal(df_sorted, df)
|
||||
|
||||
# and now check if NaT is still considered as "na" for datetime64
|
||||
# columns:
|
||||
df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT],
|
||||
float=float_values), columns=["datetime", "float"])
|
||||
|
||||
df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")],
|
||||
float=float_values[::-1]),
|
||||
columns=["datetime", "float"],
|
||||
index=[1, 0])
|
||||
|
||||
df_sorted = df.sort_values(["datetime", "float"], na_position="first")
|
||||
assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
df_sorted = df.sort_values(["datetime", "float"], na_position="last")
|
||||
assert_frame_equal(df_sorted, df)
|
||||
|
||||
# Ascending should not affect the results.
|
||||
df_sorted = df.sort_values(["datetime", "float"], ascending=False)
|
||||
assert_frame_equal(df_sorted, df)
|
||||
|
||||
def test_sort_nat(self):
|
||||
|
||||
# GH 16836
|
||||
|
||||
d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01',
|
||||
np.nan, '2016-01-01']]
|
||||
d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01',
|
||||
'2016-01-01', '2015-01-01']]
|
||||
df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3])
|
||||
|
||||
d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01',
|
||||
'2016-01-01', np.nan]]
|
||||
d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01',
|
||||
'2017-01-01', '2016-01-01']]
|
||||
expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2])
|
||||
sorted_df = df.sort_values(by=['a', 'b'], )
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
|
||||
class TestDataFrameSortIndexKinds(TestData):
|
||||
|
||||
def test_sort_index_multicolumn(self):
|
||||
A = np.arange(5).repeat(20)
|
||||
B = np.tile(np.arange(5), 20)
|
||||
random.shuffle(A)
|
||||
random.shuffle(B)
|
||||
frame = DataFrame({'A': A, 'B': B,
|
||||
'C': np.random.randn(100)})
|
||||
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
frame.sort_index(by=['A', 'B'])
|
||||
result = frame.sort_values(by=['A', 'B'])
|
||||
indexer = np.lexsort((frame['B'], frame['A']))
|
||||
expected = frame.take(indexer)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
frame.sort_index(by=['A', 'B'], ascending=False)
|
||||
result = frame.sort_values(by=['A', 'B'], ascending=False)
|
||||
indexer = np.lexsort((frame['B'].rank(ascending=False),
|
||||
frame['A'].rank(ascending=False)))
|
||||
expected = frame.take(indexer)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
frame.sort_index(by=['B', 'A'])
|
||||
result = frame.sort_values(by=['B', 'A'])
|
||||
indexer = np.lexsort((frame['A'], frame['B']))
|
||||
expected = frame.take(indexer)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_index_inplace(self):
|
||||
frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
|
||||
columns=['A', 'B', 'C', 'D'])
|
||||
|
||||
# axis=0
|
||||
unordered = frame.loc[[3, 2, 4, 1]]
|
||||
a_id = id(unordered['A'])
|
||||
df = unordered.copy()
|
||||
df.sort_index(inplace=True)
|
||||
expected = frame
|
||||
assert_frame_equal(df, expected)
|
||||
assert a_id != id(df['A'])
|
||||
|
||||
df = unordered.copy()
|
||||
df.sort_index(ascending=False, inplace=True)
|
||||
expected = frame[::-1]
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
# axis=1
|
||||
unordered = frame.loc[:, ['D', 'B', 'C', 'A']]
|
||||
df = unordered.copy()
|
||||
df.sort_index(axis=1, inplace=True)
|
||||
expected = frame
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
df = unordered.copy()
|
||||
df.sort_index(axis=1, ascending=False, inplace=True)
|
||||
expected = frame.iloc[:, ::-1]
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_sort_index_different_sortorder(self):
|
||||
A = np.arange(20).repeat(5)
|
||||
B = np.tile(np.arange(5), 20)
|
||||
|
||||
indexer = np.random.permutation(100)
|
||||
A = A.take(indexer)
|
||||
B = B.take(indexer)
|
||||
|
||||
df = DataFrame({'A': A, 'B': B,
|
||||
'C': np.random.randn(100)})
|
||||
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by=['A', 'B'], ascending=[1, 0])
|
||||
result = df.sort_values(by=['A', 'B'], ascending=[1, 0])
|
||||
|
||||
ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
|
||||
expected = df.take(ex_indexer)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# test with multiindex, too
|
||||
idf = df.set_index(['A', 'B'])
|
||||
|
||||
result = idf.sort_index(ascending=[1, 0])
|
||||
expected = idf.take(ex_indexer)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# also, Series!
|
||||
result = idf['C'].sort_index(ascending=[1, 0])
|
||||
assert_series_equal(result, expected['C'])
|
||||
|
||||
def test_sort_index_duplicates(self):
|
||||
|
||||
# with 9816, these are all translated to .sort_values
|
||||
|
||||
df = DataFrame([lrange(5, 9), lrange(4)],
|
||||
columns=['a', 'a', 'b', 'b'])
|
||||
|
||||
with tm.assert_raises_regex(ValueError, 'not unique'):
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by='a')
|
||||
with tm.assert_raises_regex(ValueError, 'not unique'):
|
||||
df.sort_values(by='a')
|
||||
|
||||
with tm.assert_raises_regex(ValueError, 'not unique'):
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by=['a'])
|
||||
with tm.assert_raises_regex(ValueError, 'not unique'):
|
||||
df.sort_values(by=['a'])
|
||||
|
||||
with tm.assert_raises_regex(ValueError, 'not unique'):
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
# multi-column 'by' is separate codepath
|
||||
df.sort_index(by=['a', 'b'])
|
||||
with tm.assert_raises_regex(ValueError, 'not unique'):
|
||||
# multi-column 'by' is separate codepath
|
||||
df.sort_values(by=['a', 'b'])
|
||||
|
||||
# with multi-index
|
||||
# GH4370
|
||||
df = DataFrame(np.random.randn(4, 2),
|
||||
columns=MultiIndex.from_tuples([('a', 0), ('a', 1)]))
|
||||
with tm.assert_raises_regex(ValueError, 'level'):
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by='a')
|
||||
with tm.assert_raises_regex(ValueError, 'level'):
|
||||
df.sort_values(by='a')
|
||||
|
||||
# convert tuples to a list of tuples
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by=[('a', 1)])
|
||||
expected = df.sort_values(by=[('a', 1)])
|
||||
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by=('a', 1))
|
||||
result = df.sort_values(by=('a', 1))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_index_level(self):
|
||||
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
|
||||
df = DataFrame([[1, 2], [3, 4]], mi)
|
||||
res = df.sort_index(level='A', sort_remaining=False)
|
||||
assert_frame_equal(df, res)
|
||||
|
||||
res = df.sort_index(level=['A', 'B'], sort_remaining=False)
|
||||
assert_frame_equal(df, res)
|
||||
|
||||
def test_sort_index_categorical_index(self):
|
||||
|
||||
df = (DataFrame({'A': np.arange(6, dtype='int64'),
|
||||
'B': Series(list('aabbca'))
|
||||
.astype(CategoricalDtype(list('cab')))})
|
||||
.set_index('B'))
|
||||
|
||||
result = df.sort_index()
|
||||
expected = df.iloc[[4, 0, 1, 5, 2, 3]]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_index(ascending=False)
|
||||
expected = df.iloc[[3, 2, 5, 1, 0, 4]]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_index(self):
|
||||
# GH13496
|
||||
|
||||
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
|
||||
columns=['A', 'B', 'C', 'D'])
|
||||
|
||||
# axis=0 : sort rows by index labels
|
||||
unordered = frame.loc[[3, 2, 4, 1]]
|
||||
result = unordered.sort_index(axis=0)
|
||||
expected = frame
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = unordered.sort_index(ascending=False)
|
||||
expected = frame[::-1]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# axis=1 : sort columns by column names
|
||||
unordered = frame.iloc[:, [2, 1, 3, 0]]
|
||||
result = unordered.sort_index(axis=1)
|
||||
assert_frame_equal(result, frame)
|
||||
|
||||
result = unordered.sort_index(axis=1, ascending=False)
|
||||
expected = frame.iloc[:, ::-1]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("level", ['A', 0]) # GH 21052
|
||||
def test_sort_index_multiindex(self, level):
|
||||
# GH13496
|
||||
|
||||
# sort rows by specified level of multi-index
|
||||
mi = MultiIndex.from_tuples([
|
||||
[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC'))
|
||||
df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)
|
||||
|
||||
expected_mi = MultiIndex.from_tuples([
|
||||
[1, 1, 1],
|
||||
[2, 1, 2],
|
||||
[2, 1, 3]], names=list('ABC'))
|
||||
expected = pd.DataFrame([
|
||||
[5, 6],
|
||||
[3, 4],
|
||||
[1, 2]], index=expected_mi)
|
||||
result = df.sort_index(level=level)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# sort_remaining=False
|
||||
expected_mi = MultiIndex.from_tuples([
|
||||
[1, 1, 1],
|
||||
[2, 1, 3],
|
||||
[2, 1, 2]], names=list('ABC'))
|
||||
expected = pd.DataFrame([
|
||||
[5, 6],
|
||||
[1, 2],
|
||||
[3, 4]], index=expected_mi)
|
||||
result = df.sort_index(level=level, sort_remaining=False)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_index_intervalindex(self):
|
||||
# this is a de-facto sort via unstack
|
||||
# confirming that we sort in the order of the bins
|
||||
y = Series(np.random.randn(100))
|
||||
x1 = Series(np.sign(np.random.randn(100)))
|
||||
x2 = pd.cut(Series(np.random.randn(100)),
|
||||
bins=[-3, -0.5, 0, 0.5, 3])
|
||||
model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])
|
||||
|
||||
result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
|
||||
expected = IntervalIndex.from_tuples(
|
||||
[(-3.0, -0.5), (-0.5, 0.0),
|
||||
(0.0, 0.5), (0.5, 3.0)],
|
||||
closed='right')
|
||||
result = result.columns.levels[1].categories
|
||||
tm.assert_index_equal(result, expected)
|
||||
@@ -1,572 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from warnings import catch_warnings
|
||||
import numpy as np
|
||||
|
||||
from pandas import DataFrame, Series, MultiIndex, Panel, Index
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameSubclassing(TestData):
|
||||
|
||||
def test_frame_subclassing_and_slicing(self):
|
||||
# Subclass frame and ensure it returns the right class on slicing it
|
||||
# In reference to PR 9632
|
||||
|
||||
class CustomSeries(Series):
|
||||
|
||||
@property
|
||||
def _constructor(self):
|
||||
return CustomSeries
|
||||
|
||||
def custom_series_function(self):
|
||||
return 'OK'
|
||||
|
||||
class CustomDataFrame(DataFrame):
|
||||
"""
|
||||
Subclasses pandas DF, fills DF with simulation results, adds some
|
||||
custom plotting functions.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kw):
|
||||
super(CustomDataFrame, self).__init__(*args, **kw)
|
||||
|
||||
@property
|
||||
def _constructor(self):
|
||||
return CustomDataFrame
|
||||
|
||||
_constructor_sliced = CustomSeries
|
||||
|
||||
def custom_frame_function(self):
|
||||
return 'OK'
|
||||
|
||||
data = {'col1': range(10),
|
||||
'col2': range(10)}
|
||||
cdf = CustomDataFrame(data)
|
||||
|
||||
# Did we get back our own DF class?
|
||||
assert isinstance(cdf, CustomDataFrame)
|
||||
|
||||
# Do we get back our own Series class after selecting a column?
|
||||
cdf_series = cdf.col1
|
||||
assert isinstance(cdf_series, CustomSeries)
|
||||
assert cdf_series.custom_series_function() == 'OK'
|
||||
|
||||
# Do we get back our own DF class after slicing row-wise?
|
||||
cdf_rows = cdf[1:5]
|
||||
assert isinstance(cdf_rows, CustomDataFrame)
|
||||
assert cdf_rows.custom_frame_function() == 'OK'
|
||||
|
||||
# Make sure sliced part of multi-index frame is custom class
|
||||
mcol = pd.MultiIndex.from_tuples([('A', 'A'), ('A', 'B')])
|
||||
cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
|
||||
assert isinstance(cdf_multi['A'], CustomDataFrame)
|
||||
|
||||
mcol = pd.MultiIndex.from_tuples([('A', ''), ('B', '')])
|
||||
cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
|
||||
assert isinstance(cdf_multi2['A'], CustomSeries)
|
||||
|
||||
def test_dataframe_metadata(self):
|
||||
df = tm.SubclassedDataFrame({'X': [1, 2, 3], 'Y': [1, 2, 3]},
|
||||
index=['a', 'b', 'c'])
|
||||
df.testattr = 'XXX'
|
||||
|
||||
assert df.testattr == 'XXX'
|
||||
assert df[['X']].testattr == 'XXX'
|
||||
assert df.loc[['a', 'b'], :].testattr == 'XXX'
|
||||
assert df.iloc[[0, 1], :].testattr == 'XXX'
|
||||
|
||||
# see gh-9776
|
||||
assert df.iloc[0:1, :].testattr == 'XXX'
|
||||
|
||||
# see gh-10553
|
||||
unpickled = tm.round_trip_pickle(df)
|
||||
tm.assert_frame_equal(df, unpickled)
|
||||
assert df._metadata == unpickled._metadata
|
||||
assert df.testattr == unpickled.testattr
|
||||
|
||||
def test_indexing_sliced(self):
|
||||
# GH 11559
|
||||
df = tm.SubclassedDataFrame({'X': [1, 2, 3],
|
||||
'Y': [4, 5, 6],
|
||||
'Z': [7, 8, 9]},
|
||||
index=['a', 'b', 'c'])
|
||||
res = df.loc[:, 'X']
|
||||
exp = tm.SubclassedSeries([1, 2, 3], index=list('abc'), name='X')
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
res = df.iloc[:, 1]
|
||||
exp = tm.SubclassedSeries([4, 5, 6], index=list('abc'), name='Y')
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
res = df.loc[:, 'Z']
|
||||
exp = tm.SubclassedSeries([7, 8, 9], index=list('abc'), name='Z')
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
res = df.loc['a', :]
|
||||
exp = tm.SubclassedSeries([1, 4, 7], index=list('XYZ'), name='a')
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
res = df.iloc[1, :]
|
||||
exp = tm.SubclassedSeries([2, 5, 8], index=list('XYZ'), name='b')
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
res = df.loc['c', :]
|
||||
exp = tm.SubclassedSeries([3, 6, 9], index=list('XYZ'), name='c')
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
def test_to_panel_expanddim(self):
|
||||
# GH 9762
|
||||
|
||||
with catch_warnings(record=True):
|
||||
class SubclassedFrame(DataFrame):
|
||||
|
||||
@property
|
||||
def _constructor_expanddim(self):
|
||||
return SubclassedPanel
|
||||
|
||||
class SubclassedPanel(Panel):
|
||||
pass
|
||||
|
||||
index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)])
|
||||
df = SubclassedFrame({'X': [1, 2, 3], 'Y': [4, 5, 6]}, index=index)
|
||||
result = df.to_panel()
|
||||
assert isinstance(result, SubclassedPanel)
|
||||
expected = SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]],
|
||||
items=['X', 'Y'], major_axis=[0],
|
||||
minor_axis=[0, 1, 2],
|
||||
dtype='int64')
|
||||
tm.assert_panel_equal(result, expected)
|
||||
|
||||
def test_subclass_attr_err_propagation(self):
|
||||
# GH 11808
|
||||
class A(DataFrame):
|
||||
|
||||
@property
|
||||
def bar(self):
|
||||
return self.i_dont_exist
|
||||
with tm.assert_raises_regex(AttributeError, '.*i_dont_exist.*'):
|
||||
A().bar
|
||||
|
||||
def test_subclass_align(self):
|
||||
# GH 12983
|
||||
df1 = tm.SubclassedDataFrame({'a': [1, 3, 5],
|
||||
'b': [1, 3, 5]}, index=list('ACE'))
|
||||
df2 = tm.SubclassedDataFrame({'c': [1, 2, 4],
|
||||
'd': [1, 2, 4]}, index=list('ABD'))
|
||||
|
||||
res1, res2 = df1.align(df2, axis=0)
|
||||
exp1 = tm.SubclassedDataFrame({'a': [1, np.nan, 3, np.nan, 5],
|
||||
'b': [1, np.nan, 3, np.nan, 5]},
|
||||
index=list('ABCDE'))
|
||||
exp2 = tm.SubclassedDataFrame({'c': [1, 2, np.nan, 4, np.nan],
|
||||
'd': [1, 2, np.nan, 4, np.nan]},
|
||||
index=list('ABCDE'))
|
||||
assert isinstance(res1, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(res1, exp1)
|
||||
assert isinstance(res2, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(res2, exp2)
|
||||
|
||||
res1, res2 = df1.a.align(df2.c)
|
||||
assert isinstance(res1, tm.SubclassedSeries)
|
||||
tm.assert_series_equal(res1, exp1.a)
|
||||
assert isinstance(res2, tm.SubclassedSeries)
|
||||
tm.assert_series_equal(res2, exp2.c)
|
||||
|
||||
def test_subclass_align_combinations(self):
|
||||
# GH 12983
|
||||
df = tm.SubclassedDataFrame({'a': [1, 3, 5],
|
||||
'b': [1, 3, 5]}, index=list('ACE'))
|
||||
s = tm.SubclassedSeries([1, 2, 4], index=list('ABD'), name='x')
|
||||
|
||||
# frame + series
|
||||
res1, res2 = df.align(s, axis=0)
|
||||
exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5],
|
||||
'b': [1, np.nan, 3, np.nan, 5]},
|
||||
index=list('ABCDE'))
|
||||
# name is lost when
|
||||
exp2 = pd.Series([1, 2, np.nan, 4, np.nan],
|
||||
index=list('ABCDE'), name='x')
|
||||
|
||||
assert isinstance(res1, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(res1, exp1)
|
||||
assert isinstance(res2, tm.SubclassedSeries)
|
||||
tm.assert_series_equal(res2, exp2)
|
||||
|
||||
# series + frame
|
||||
res1, res2 = s.align(df)
|
||||
assert isinstance(res1, tm.SubclassedSeries)
|
||||
tm.assert_series_equal(res1, exp2)
|
||||
assert isinstance(res2, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(res2, exp1)
|
||||
|
||||
def test_subclass_iterrows(self):
|
||||
# GH 13977
|
||||
df = tm.SubclassedDataFrame({'a': [1]})
|
||||
for i, row in df.iterrows():
|
||||
assert isinstance(row, tm.SubclassedSeries)
|
||||
tm.assert_series_equal(row, df.loc[i])
|
||||
|
||||
def test_subclass_sparse_slice(self):
|
||||
rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
|
||||
ssdf = tm.SubclassedSparseDataFrame(rows)
|
||||
ssdf.testattr = "testattr"
|
||||
|
||||
tm.assert_sp_frame_equal(ssdf.loc[:2],
|
||||
tm.SubclassedSparseDataFrame(rows[:3]))
|
||||
tm.assert_sp_frame_equal(ssdf.iloc[:2],
|
||||
tm.SubclassedSparseDataFrame(rows[:2]))
|
||||
tm.assert_sp_frame_equal(ssdf[:2],
|
||||
tm.SubclassedSparseDataFrame(rows[:2]))
|
||||
assert ssdf.loc[:2].testattr == "testattr"
|
||||
assert ssdf.iloc[:2].testattr == "testattr"
|
||||
assert ssdf[:2].testattr == "testattr"
|
||||
|
||||
tm.assert_sp_series_equal(ssdf.loc[1],
|
||||
tm.SubclassedSparseSeries(rows[1]),
|
||||
check_names=False)
|
||||
tm.assert_sp_series_equal(ssdf.iloc[1],
|
||||
tm.SubclassedSparseSeries(rows[1]),
|
||||
check_names=False)
|
||||
|
||||
def test_subclass_sparse_transpose(self):
|
||||
ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3],
|
||||
[4, 5, 6]])
|
||||
essdf = tm.SubclassedSparseDataFrame([[1, 4],
|
||||
[2, 5],
|
||||
[3, 6]])
|
||||
tm.assert_sp_frame_equal(ossdf.T, essdf)
|
||||
|
||||
def test_subclass_stack(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
index=['a', 'b', 'c'],
|
||||
columns=['X', 'Y', 'Z'])
|
||||
|
||||
res = df.stack()
|
||||
exp = tm.SubclassedSeries(
|
||||
[1, 2, 3, 4, 5, 6, 7, 8, 9],
|
||||
index=[list('aaabbbccc'), list('XYZXYZXYZ')])
|
||||
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_subclass_stack_multi(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame([
|
||||
[10, 11, 12, 13],
|
||||
[20, 21, 22, 23],
|
||||
[30, 31, 32, 33],
|
||||
[40, 41, 42, 43]],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list('AABB'), list('cdcd'))),
|
||||
names=['aaa', 'ccc']),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list('WWXX'), list('yzyz'))),
|
||||
names=['www', 'yyy']))
|
||||
|
||||
exp = tm.SubclassedDataFrame([
|
||||
[10, 12],
|
||||
[11, 13],
|
||||
[20, 22],
|
||||
[21, 23],
|
||||
[30, 32],
|
||||
[31, 33],
|
||||
[40, 42],
|
||||
[41, 43]],
|
||||
index=MultiIndex.from_tuples(list(zip(
|
||||
list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
|
||||
names=['aaa', 'ccc', 'yyy']),
|
||||
columns=Index(['W', 'X'], name='www'))
|
||||
|
||||
res = df.stack()
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.stack('yyy')
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
exp = tm.SubclassedDataFrame([
|
||||
[10, 11],
|
||||
[12, 13],
|
||||
[20, 21],
|
||||
[22, 23],
|
||||
[30, 31],
|
||||
[32, 33],
|
||||
[40, 41],
|
||||
[42, 43]],
|
||||
index=MultiIndex.from_tuples(list(zip(
|
||||
list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
|
||||
names=['aaa', 'ccc', 'www']),
|
||||
columns=Index(['y', 'z'], name='yyy'))
|
||||
|
||||
res = df.stack('www')
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_stack_multi_mixed(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame([
|
||||
[10, 11, 12.0, 13.0],
|
||||
[20, 21, 22.0, 23.0],
|
||||
[30, 31, 32.0, 33.0],
|
||||
[40, 41, 42.0, 43.0]],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list('AABB'), list('cdcd'))),
|
||||
names=['aaa', 'ccc']),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list('WWXX'), list('yzyz'))),
|
||||
names=['www', 'yyy']))
|
||||
|
||||
exp = tm.SubclassedDataFrame([
|
||||
[10, 12.0],
|
||||
[11, 13.0],
|
||||
[20, 22.0],
|
||||
[21, 23.0],
|
||||
[30, 32.0],
|
||||
[31, 33.0],
|
||||
[40, 42.0],
|
||||
[41, 43.0]],
|
||||
index=MultiIndex.from_tuples(list(zip(
|
||||
list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
|
||||
names=['aaa', 'ccc', 'yyy']),
|
||||
columns=Index(['W', 'X'], name='www'))
|
||||
|
||||
res = df.stack()
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.stack('yyy')
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
exp = tm.SubclassedDataFrame([
|
||||
[10.0, 11.0],
|
||||
[12.0, 13.0],
|
||||
[20.0, 21.0],
|
||||
[22.0, 23.0],
|
||||
[30.0, 31.0],
|
||||
[32.0, 33.0],
|
||||
[40.0, 41.0],
|
||||
[42.0, 43.0]],
|
||||
index=MultiIndex.from_tuples(list(zip(
|
||||
list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
|
||||
names=['aaa', 'ccc', 'www']),
|
||||
columns=Index(['y', 'z'], name='yyy'))
|
||||
|
||||
res = df.stack('www')
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_unstack(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
index=['a', 'b', 'c'],
|
||||
columns=['X', 'Y', 'Z'])
|
||||
|
||||
res = df.unstack()
|
||||
exp = tm.SubclassedSeries(
|
||||
[1, 4, 7, 2, 5, 8, 3, 6, 9],
|
||||
index=[list('XXXYYYZZZ'), list('abcabcabc')])
|
||||
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_subclass_unstack_multi(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame([
|
||||
[10, 11, 12, 13],
|
||||
[20, 21, 22, 23],
|
||||
[30, 31, 32, 33],
|
||||
[40, 41, 42, 43]],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list('AABB'), list('cdcd'))),
|
||||
names=['aaa', 'ccc']),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list('WWXX'), list('yzyz'))),
|
||||
names=['www', 'yyy']))
|
||||
|
||||
exp = tm.SubclassedDataFrame([
|
||||
[10, 20, 11, 21, 12, 22, 13, 23],
|
||||
[30, 40, 31, 41, 32, 42, 33, 43]],
|
||||
index=Index(['A', 'B'], name='aaa'),
|
||||
columns=MultiIndex.from_tuples(list(zip(
|
||||
list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))),
|
||||
names=['www', 'yyy', 'ccc']))
|
||||
|
||||
res = df.unstack()
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.unstack('ccc')
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
exp = tm.SubclassedDataFrame([
|
||||
[10, 30, 11, 31, 12, 32, 13, 33],
|
||||
[20, 40, 21, 41, 22, 42, 23, 43]],
|
||||
index=Index(['c', 'd'], name='ccc'),
|
||||
columns=MultiIndex.from_tuples(list(zip(
|
||||
list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))),
|
||||
names=['www', 'yyy', 'aaa']))
|
||||
|
||||
res = df.unstack('aaa')
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_unstack_multi_mixed(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame([
|
||||
[10, 11, 12.0, 13.0],
|
||||
[20, 21, 22.0, 23.0],
|
||||
[30, 31, 32.0, 33.0],
|
||||
[40, 41, 42.0, 43.0]],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list('AABB'), list('cdcd'))),
|
||||
names=['aaa', 'ccc']),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list('WWXX'), list('yzyz'))),
|
||||
names=['www', 'yyy']))
|
||||
|
||||
exp = tm.SubclassedDataFrame([
|
||||
[10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0],
|
||||
[30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0]],
|
||||
index=Index(['A', 'B'], name='aaa'),
|
||||
columns=MultiIndex.from_tuples(list(zip(
|
||||
list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))),
|
||||
names=['www', 'yyy', 'ccc']))
|
||||
|
||||
res = df.unstack()
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.unstack('ccc')
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
exp = tm.SubclassedDataFrame([
|
||||
[10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0],
|
||||
[20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0]],
|
||||
index=Index(['c', 'd'], name='ccc'),
|
||||
columns=MultiIndex.from_tuples(list(zip(
|
||||
list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))),
|
||||
names=['www', 'yyy', 'aaa']))
|
||||
|
||||
res = df.unstack('aaa')
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_pivot(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame({
|
||||
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
|
||||
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
|
||||
'values': [1., 2., 3., 3., 2., 1.]})
|
||||
|
||||
pivoted = df.pivot(
|
||||
index='index', columns='columns', values='values')
|
||||
|
||||
expected = tm.SubclassedDataFrame({
|
||||
'One': {'A': 1., 'B': 2., 'C': 3.},
|
||||
'Two': {'A': 1., 'B': 2., 'C': 3.}})
|
||||
|
||||
expected.index.name, expected.columns.name = 'index', 'columns'
|
||||
|
||||
tm.assert_frame_equal(pivoted, expected)
|
||||
|
||||
def test_subclassed_melt(self):
|
||||
# GH 15564
|
||||
cheese = tm.SubclassedDataFrame({
|
||||
'first': ['John', 'Mary'],
|
||||
'last': ['Doe', 'Bo'],
|
||||
'height': [5.5, 6.0],
|
||||
'weight': [130, 150]})
|
||||
|
||||
melted = pd.melt(cheese, id_vars=['first', 'last'])
|
||||
|
||||
expected = tm.SubclassedDataFrame([
|
||||
['John', 'Doe', 'height', 5.5],
|
||||
['Mary', 'Bo', 'height', 6.0],
|
||||
['John', 'Doe', 'weight', 130],
|
||||
['Mary', 'Bo', 'weight', 150]],
|
||||
columns=['first', 'last', 'variable', 'value'])
|
||||
|
||||
tm.assert_frame_equal(melted, expected)
|
||||
|
||||
def test_subclassed_wide_to_long(self):
|
||||
# GH 9762
|
||||
|
||||
np.random.seed(123)
|
||||
x = np.random.randn(3)
|
||||
df = tm.SubclassedDataFrame({
|
||||
"A1970": {0: "a", 1: "b", 2: "c"},
|
||||
"A1980": {0: "d", 1: "e", 2: "f"},
|
||||
"B1970": {0: 2.5, 1: 1.2, 2: .7},
|
||||
"B1980": {0: 3.2, 1: 1.3, 2: .1},
|
||||
"X": dict(zip(range(3), x))})
|
||||
|
||||
df["id"] = df.index
|
||||
exp_data = {"X": x.tolist() + x.tolist(),
|
||||
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
|
||||
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
|
||||
"year": [1970, 1970, 1970, 1980, 1980, 1980],
|
||||
"id": [0, 1, 2, 0, 1, 2]}
|
||||
expected = tm.SubclassedDataFrame(exp_data)
|
||||
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
|
||||
long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year")
|
||||
|
||||
tm.assert_frame_equal(long_frame, expected)
|
||||
|
||||
def test_subclassed_apply(self):
|
||||
# GH 19822
|
||||
|
||||
def check_row_subclass(row):
|
||||
assert isinstance(row, tm.SubclassedSeries)
|
||||
|
||||
def strech(row):
|
||||
if row["variable"] == "height":
|
||||
row["value"] += 0.5
|
||||
return row
|
||||
|
||||
df = tm.SubclassedDataFrame([
|
||||
['John', 'Doe', 'height', 5.5],
|
||||
['Mary', 'Bo', 'height', 6.0],
|
||||
['John', 'Doe', 'weight', 130],
|
||||
['Mary', 'Bo', 'weight', 150]],
|
||||
columns=['first', 'last', 'variable', 'value'])
|
||||
|
||||
df.apply(lambda x: check_row_subclass(x))
|
||||
df.apply(lambda x: check_row_subclass(x), axis=1)
|
||||
|
||||
expected = tm.SubclassedDataFrame([
|
||||
['John', 'Doe', 'height', 6.0],
|
||||
['Mary', 'Bo', 'height', 6.5],
|
||||
['John', 'Doe', 'weight', 130],
|
||||
['Mary', 'Bo', 'weight', 150]],
|
||||
columns=['first', 'last', 'variable', 'value'])
|
||||
|
||||
result = df.apply(lambda x: strech(x), axis=1)
|
||||
assert isinstance(result, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = tm.SubclassedDataFrame([
|
||||
[1, 2, 3],
|
||||
[1, 2, 3],
|
||||
[1, 2, 3],
|
||||
[1, 2, 3]])
|
||||
|
||||
result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1)
|
||||
assert isinstance(result, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
|
||||
assert isinstance(result, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = tm.SubclassedSeries([
|
||||
[1, 2, 3],
|
||||
[1, 2, 3],
|
||||
[1, 2, 3],
|
||||
[1, 2, 3]])
|
||||
|
||||
result = df.apply(lambda x: [1, 2, 3], axis=1)
|
||||
assert not isinstance(result, tm.SubclassedDataFrame)
|
||||
tm.assert_series_equal(result, expected)
|
||||
@@ -1,847 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import datetime, time
|
||||
|
||||
import pytest
|
||||
|
||||
from numpy import nan
|
||||
from numpy.random import randn
|
||||
import numpy as np
|
||||
|
||||
from pandas import (DataFrame, Series, Index,
|
||||
Timestamp, DatetimeIndex, MultiIndex,
|
||||
to_datetime, date_range, period_range)
|
||||
import pandas as pd
|
||||
import pandas.tseries.offsets as offsets
|
||||
|
||||
from pandas.util.testing import (assert_series_equal,
|
||||
assert_frame_equal,
|
||||
assert_index_equal,
|
||||
assert_raises_regex)
|
||||
|
||||
import pandas.util.testing as tm
|
||||
from pandas.compat import product
|
||||
|
||||
from pandas.tests.frame.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameTimeSeriesMethods(TestData):
|
||||
|
||||
def test_diff(self):
|
||||
the_diff = self.tsframe.diff(1)
|
||||
|
||||
assert_series_equal(the_diff['A'],
|
||||
self.tsframe['A'] - self.tsframe['A'].shift(1))
|
||||
|
||||
# int dtype
|
||||
a = 10000000000000000
|
||||
b = a + 1
|
||||
s = Series([a, b])
|
||||
|
||||
rs = DataFrame({'s': s}).diff()
|
||||
assert rs.s[1] == 1
|
||||
|
||||
# mixed numeric
|
||||
tf = self.tsframe.astype('float32')
|
||||
the_diff = tf.diff(1)
|
||||
assert_series_equal(the_diff['A'],
|
||||
tf['A'] - tf['A'].shift(1))
|
||||
|
||||
# issue 10907
|
||||
df = pd.DataFrame({'y': pd.Series([2]), 'z': pd.Series([3])})
|
||||
df.insert(0, 'x', 1)
|
||||
result = df.diff(axis=1)
|
||||
expected = pd.DataFrame({'x': np.nan, 'y': pd.Series(
|
||||
1), 'z': pd.Series(1)}).astype('float64')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('tz', [None, 'UTC'])
|
||||
def test_diff_datetime_axis0(self, tz):
|
||||
# GH 18578
|
||||
df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
|
||||
1: date_range('2010', freq='D', periods=2, tz=tz)})
|
||||
|
||||
result = df.diff(axis=0)
|
||||
expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']),
|
||||
1: pd.TimedeltaIndex(['NaT', '1 days'])})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('tz', [None, 'UTC'])
|
||||
def test_diff_datetime_axis1(self, tz):
|
||||
# GH 18578
|
||||
df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
|
||||
1: date_range('2010', freq='D', periods=2, tz=tz)})
|
||||
if tz is None:
|
||||
result = df.diff(axis=1)
|
||||
expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']),
|
||||
1: pd.TimedeltaIndex(['0 days',
|
||||
'0 days'])})
|
||||
assert_frame_equal(result, expected)
|
||||
else:
|
||||
with pytest.raises(NotImplementedError):
|
||||
result = df.diff(axis=1)
|
||||
|
||||
def test_diff_timedelta(self):
|
||||
# GH 4533
|
||||
df = DataFrame(dict(time=[Timestamp('20130101 9:01'),
|
||||
Timestamp('20130101 9:02')],
|
||||
value=[1.0, 2.0]))
|
||||
|
||||
res = df.diff()
|
||||
exp = DataFrame([[pd.NaT, np.nan],
|
||||
[pd.Timedelta('00:01:00'), 1]],
|
||||
columns=['time', 'value'])
|
||||
assert_frame_equal(res, exp)
|
||||
|
||||
def test_diff_mixed_dtype(self):
|
||||
df = DataFrame(np.random.randn(5, 3))
|
||||
df['A'] = np.array([1, 2, 3, 4, 5], dtype=object)
|
||||
|
||||
result = df.diff()
|
||||
assert result[0].dtype == np.float64
|
||||
|
||||
def test_diff_neg_n(self):
|
||||
rs = self.tsframe.diff(-1)
|
||||
xp = self.tsframe - self.tsframe.shift(-1)
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_diff_float_n(self):
|
||||
rs = self.tsframe.diff(1.)
|
||||
xp = self.tsframe.diff(1)
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_diff_axis(self):
|
||||
# GH 9727
|
||||
df = DataFrame([[1., 2.], [3., 4.]])
|
||||
assert_frame_equal(df.diff(axis=1), DataFrame(
|
||||
[[np.nan, 1.], [np.nan, 1.]]))
|
||||
assert_frame_equal(df.diff(axis=0), DataFrame(
|
||||
[[np.nan, np.nan], [2., 2.]]))
|
||||
|
||||
def test_pct_change(self):
|
||||
rs = self.tsframe.pct_change(fill_method=None)
|
||||
assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1)
|
||||
|
||||
rs = self.tsframe.pct_change(2)
|
||||
filled = self.tsframe.fillna(method='pad')
|
||||
assert_frame_equal(rs, filled / filled.shift(2) - 1)
|
||||
|
||||
rs = self.tsframe.pct_change(fill_method='bfill', limit=1)
|
||||
filled = self.tsframe.fillna(method='bfill', limit=1)
|
||||
assert_frame_equal(rs, filled / filled.shift(1) - 1)
|
||||
|
||||
rs = self.tsframe.pct_change(freq='5D')
|
||||
filled = self.tsframe.fillna(method='pad')
|
||||
assert_frame_equal(rs,
|
||||
(filled / filled.shift(freq='5D') - 1)
|
||||
.reindex_like(filled))
|
||||
|
||||
def test_pct_change_shift_over_nas(self):
|
||||
s = Series([1., 1.5, np.nan, 2.5, 3.])
|
||||
|
||||
df = DataFrame({'a': s, 'b': s})
|
||||
|
||||
chg = df.pct_change()
|
||||
expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2])
|
||||
edf = DataFrame({'a': expected, 'b': expected})
|
||||
assert_frame_equal(chg, edf)
|
||||
|
||||
@pytest.mark.parametrize("freq, periods, fill_method, limit",
|
||||
[('5B', 5, None, None),
|
||||
('3B', 3, None, None),
|
||||
('3B', 3, 'bfill', None),
|
||||
('7B', 7, 'pad', 1),
|
||||
('7B', 7, 'bfill', 3),
|
||||
('14B', 14, None, None)])
|
||||
def test_pct_change_periods_freq(self, freq, periods, fill_method, limit):
|
||||
# GH 7292
|
||||
rs_freq = self.tsframe.pct_change(freq=freq,
|
||||
fill_method=fill_method,
|
||||
limit=limit)
|
||||
rs_periods = self.tsframe.pct_change(periods,
|
||||
fill_method=fill_method,
|
||||
limit=limit)
|
||||
assert_frame_equal(rs_freq, rs_periods)
|
||||
|
||||
empty_ts = DataFrame(index=self.tsframe.index,
|
||||
columns=self.tsframe.columns)
|
||||
rs_freq = empty_ts.pct_change(freq=freq,
|
||||
fill_method=fill_method,
|
||||
limit=limit)
|
||||
rs_periods = empty_ts.pct_change(periods,
|
||||
fill_method=fill_method,
|
||||
limit=limit)
|
||||
assert_frame_equal(rs_freq, rs_periods)
|
||||
|
||||
def test_frame_ctor_datetime64_column(self):
|
||||
rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
|
||||
dates = np.asarray(rng)
|
||||
|
||||
df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates})
|
||||
assert np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))
|
||||
|
||||
def test_frame_add_datetime64_column(self):
|
||||
rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
|
||||
df = DataFrame(index=np.arange(len(rng)))
|
||||
|
||||
df['A'] = rng
|
||||
assert np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))
|
||||
|
||||
def test_frame_datetime64_pre1900_repr(self):
|
||||
df = DataFrame({'year': date_range('1/1/1700', periods=50,
|
||||
freq='A-DEC')})
|
||||
# it works!
|
||||
repr(df)
|
||||
|
||||
def test_frame_add_datetime64_col_other_units(self):
|
||||
n = 100
|
||||
|
||||
units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y']
|
||||
|
||||
ns_dtype = np.dtype('M8[ns]')
|
||||
|
||||
for unit in units:
|
||||
dtype = np.dtype('M8[%s]' % unit)
|
||||
vals = np.arange(n, dtype=np.int64).view(dtype)
|
||||
|
||||
df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
|
||||
df[unit] = vals
|
||||
|
||||
ex_vals = to_datetime(vals.astype('O')).values
|
||||
|
||||
assert df[unit].dtype == ns_dtype
|
||||
assert (df[unit].values == ex_vals).all()
|
||||
|
||||
# Test insertion into existing datetime64 column
|
||||
df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
|
||||
df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype)
|
||||
|
||||
for unit in units:
|
||||
dtype = np.dtype('M8[%s]' % unit)
|
||||
vals = np.arange(n, dtype=np.int64).view(dtype)
|
||||
|
||||
tmp = df.copy()
|
||||
|
||||
tmp['dates'] = vals
|
||||
ex_vals = to_datetime(vals.astype('O')).values
|
||||
|
||||
assert (tmp['dates'].values == ex_vals).all()
|
||||
|
||||
def test_shift(self):
|
||||
# naive shift
|
||||
shiftedFrame = self.tsframe.shift(5)
|
||||
tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
|
||||
|
||||
shiftedSeries = self.tsframe['A'].shift(5)
|
||||
assert_series_equal(shiftedFrame['A'], shiftedSeries)
|
||||
|
||||
shiftedFrame = self.tsframe.shift(-5)
|
||||
tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
|
||||
|
||||
shiftedSeries = self.tsframe['A'].shift(-5)
|
||||
assert_series_equal(shiftedFrame['A'], shiftedSeries)
|
||||
|
||||
# shift by 0
|
||||
unshifted = self.tsframe.shift(0)
|
||||
assert_frame_equal(unshifted, self.tsframe)
|
||||
|
||||
# shift by DateOffset
|
||||
shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay())
|
||||
assert len(shiftedFrame) == len(self.tsframe)
|
||||
|
||||
shiftedFrame2 = self.tsframe.shift(5, freq='B')
|
||||
assert_frame_equal(shiftedFrame, shiftedFrame2)
|
||||
|
||||
d = self.tsframe.index[0]
|
||||
shifted_d = d + offsets.BDay(5)
|
||||
assert_series_equal(self.tsframe.xs(d),
|
||||
shiftedFrame.xs(shifted_d), check_names=False)
|
||||
|
||||
# shift int frame
|
||||
int_shifted = self.intframe.shift(1) # noqa
|
||||
|
||||
# Shifting with PeriodIndex
|
||||
ps = tm.makePeriodFrame()
|
||||
shifted = ps.shift(1)
|
||||
unshifted = shifted.shift(-1)
|
||||
tm.assert_index_equal(shifted.index, ps.index)
|
||||
tm.assert_index_equal(unshifted.index, ps.index)
|
||||
tm.assert_numpy_array_equal(unshifted.iloc[:, 0].dropna().values,
|
||||
ps.iloc[:-1, 0].values)
|
||||
|
||||
shifted2 = ps.shift(1, 'B')
|
||||
shifted3 = ps.shift(1, offsets.BDay())
|
||||
assert_frame_equal(shifted2, shifted3)
|
||||
assert_frame_equal(ps, shifted2.shift(-1, 'B'))
|
||||
|
||||
tm.assert_raises_regex(ValueError,
|
||||
'does not match PeriodIndex freq',
|
||||
ps.shift, freq='D')
|
||||
|
||||
# shift other axis
|
||||
# GH 6371
|
||||
df = DataFrame(np.random.rand(10, 5))
|
||||
expected = pd.concat([DataFrame(np.nan, index=df.index,
|
||||
columns=[0]),
|
||||
df.iloc[:, 0:-1]],
|
||||
ignore_index=True, axis=1)
|
||||
result = df.shift(1, axis=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# shift named axis
|
||||
df = DataFrame(np.random.rand(10, 5))
|
||||
expected = pd.concat([DataFrame(np.nan, index=df.index,
|
||||
columns=[0]),
|
||||
df.iloc[:, 0:-1]],
|
||||
ignore_index=True, axis=1)
|
||||
result = df.shift(1, axis='columns')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_shift_bool(self):
|
||||
df = DataFrame({'high': [True, False],
|
||||
'low': [False, False]})
|
||||
rs = df.shift(1)
|
||||
xp = DataFrame(np.array([[np.nan, np.nan],
|
||||
[True, False]], dtype=object),
|
||||
columns=['high', 'low'])
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_shift_categorical(self):
|
||||
# GH 9416
|
||||
s1 = pd.Series(['a', 'b', 'c'], dtype='category')
|
||||
s2 = pd.Series(['A', 'B', 'C'], dtype='category')
|
||||
df = DataFrame({'one': s1, 'two': s2})
|
||||
rs = df.shift(1)
|
||||
xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)})
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_shift_empty(self):
|
||||
# Regression test for #8019
|
||||
df = DataFrame({'foo': []})
|
||||
rs = df.shift(-1)
|
||||
|
||||
assert_frame_equal(df, rs)
|
||||
|
||||
def test_shift_duplicate_columns(self):
|
||||
# GH 9092; verify that position-based shifting works
|
||||
# in the presence of duplicate columns
|
||||
column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]]
|
||||
data = np.random.randn(20, 5)
|
||||
|
||||
shifted = []
|
||||
for columns in column_lists:
|
||||
df = pd.DataFrame(data.copy(), columns=columns)
|
||||
for s in range(5):
|
||||
df.iloc[:, s] = df.iloc[:, s].shift(s + 1)
|
||||
df.columns = range(5)
|
||||
shifted.append(df)
|
||||
|
||||
# sanity check the base case
|
||||
nulls = shifted[0].isna().sum()
|
||||
assert_series_equal(nulls, Series(range(1, 6), dtype='int64'))
|
||||
|
||||
# check all answers are the same
|
||||
assert_frame_equal(shifted[0], shifted[1])
|
||||
assert_frame_equal(shifted[0], shifted[2])
|
||||
|
||||
def test_tshift(self):
|
||||
# PeriodIndex
|
||||
ps = tm.makePeriodFrame()
|
||||
shifted = ps.tshift(1)
|
||||
unshifted = shifted.tshift(-1)
|
||||
|
||||
assert_frame_equal(unshifted, ps)
|
||||
|
||||
shifted2 = ps.tshift(freq='B')
|
||||
assert_frame_equal(shifted, shifted2)
|
||||
|
||||
shifted3 = ps.tshift(freq=offsets.BDay())
|
||||
assert_frame_equal(shifted, shifted3)
|
||||
|
||||
tm.assert_raises_regex(
|
||||
ValueError, 'does not match', ps.tshift, freq='M')
|
||||
|
||||
# DatetimeIndex
|
||||
shifted = self.tsframe.tshift(1)
|
||||
unshifted = shifted.tshift(-1)
|
||||
|
||||
assert_frame_equal(self.tsframe, unshifted)
|
||||
|
||||
shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq)
|
||||
assert_frame_equal(shifted, shifted2)
|
||||
|
||||
inferred_ts = DataFrame(self.tsframe.values,
|
||||
Index(np.asarray(self.tsframe.index)),
|
||||
columns=self.tsframe.columns)
|
||||
shifted = inferred_ts.tshift(1)
|
||||
unshifted = shifted.tshift(-1)
|
||||
assert_frame_equal(shifted, self.tsframe.tshift(1))
|
||||
assert_frame_equal(unshifted, inferred_ts)
|
||||
|
||||
no_freq = self.tsframe.iloc[[0, 5, 7], :]
|
||||
pytest.raises(ValueError, no_freq.tshift)
|
||||
|
||||
def test_truncate(self):
|
||||
ts = self.tsframe[::3]
|
||||
|
||||
start, end = self.tsframe.index[3], self.tsframe.index[6]
|
||||
|
||||
start_missing = self.tsframe.index[2]
|
||||
end_missing = self.tsframe.index[7]
|
||||
|
||||
# neither specified
|
||||
truncated = ts.truncate()
|
||||
assert_frame_equal(truncated, ts)
|
||||
|
||||
# both specified
|
||||
expected = ts[1:3]
|
||||
|
||||
truncated = ts.truncate(start, end)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(start_missing, end_missing)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
# start specified
|
||||
expected = ts[1:]
|
||||
|
||||
truncated = ts.truncate(before=start)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(before=start_missing)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
# end specified
|
||||
expected = ts[:3]
|
||||
|
||||
truncated = ts.truncate(after=end)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(after=end_missing)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
pytest.raises(ValueError, ts.truncate,
|
||||
before=ts.index[-1] - 1,
|
||||
after=ts.index[0] + 1)
|
||||
|
||||
def test_truncate_copy(self):
|
||||
index = self.tsframe.index
|
||||
truncated = self.tsframe.truncate(index[5], index[10])
|
||||
truncated.values[:] = 5.
|
||||
assert not (self.tsframe.values[5:11] == 5).any()
|
||||
|
||||
def test_truncate_nonsortedindex(self):
|
||||
# GH 17935
|
||||
|
||||
df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e']},
|
||||
index=[5, 3, 2, 9, 0])
|
||||
with tm.assert_raises_regex(ValueError,
|
||||
'truncate requires a sorted index'):
|
||||
df.truncate(before=3, after=9)
|
||||
|
||||
rng = pd.date_range('2011-01-01', '2012-01-01', freq='W')
|
||||
ts = pd.DataFrame({'A': np.random.randn(len(rng)),
|
||||
'B': np.random.randn(len(rng))},
|
||||
index=rng)
|
||||
with tm.assert_raises_regex(ValueError,
|
||||
'truncate requires a sorted index'):
|
||||
ts.sort_values('A', ascending=False).truncate(before='2011-11',
|
||||
after='2011-12')
|
||||
|
||||
df = pd.DataFrame({3: np.random.randn(5),
|
||||
20: np.random.randn(5),
|
||||
2: np.random.randn(5),
|
||||
0: np.random.randn(5)},
|
||||
columns=[3, 20, 2, 0])
|
||||
with tm.assert_raises_regex(ValueError,
|
||||
'truncate requires a sorted index'):
|
||||
df.truncate(before=2, after=20, axis=1)
|
||||
|
||||
def test_asfreq(self):
|
||||
offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd())
|
||||
rule_monthly = self.tsframe.asfreq('BM')
|
||||
|
||||
tm.assert_almost_equal(offset_monthly['A'], rule_monthly['A'])
|
||||
|
||||
filled = rule_monthly.asfreq('B', method='pad') # noqa
|
||||
# TODO: actually check that this worked.
|
||||
|
||||
# don't forget!
|
||||
filled_dep = rule_monthly.asfreq('B', method='pad') # noqa
|
||||
|
||||
# test does not blow up on length-0 DataFrame
|
||||
zero_length = self.tsframe.reindex([])
|
||||
result = zero_length.asfreq('BM')
|
||||
assert result is not zero_length
|
||||
|
||||
def test_asfreq_datetimeindex(self):
|
||||
df = DataFrame({'A': [1, 2, 3]},
|
||||
index=[datetime(2011, 11, 1), datetime(2011, 11, 2),
|
||||
datetime(2011, 11, 3)])
|
||||
df = df.asfreq('B')
|
||||
assert isinstance(df.index, DatetimeIndex)
|
||||
|
||||
ts = df['A'].asfreq('B')
|
||||
assert isinstance(ts.index, DatetimeIndex)
|
||||
|
||||
def test_asfreq_fillvalue(self):
|
||||
# test for fill value during upsampling, related to issue 3715
|
||||
|
||||
# setup
|
||||
rng = pd.date_range('1/1/2016', periods=10, freq='2S')
|
||||
ts = pd.Series(np.arange(len(rng)), index=rng)
|
||||
df = pd.DataFrame({'one': ts})
|
||||
|
||||
# insert pre-existing missing value
|
||||
df.loc['2016-01-01 00:00:08', 'one'] = None
|
||||
|
||||
actual_df = df.asfreq(freq='1S', fill_value=9.0)
|
||||
expected_df = df.asfreq(freq='1S').fillna(9.0)
|
||||
expected_df.loc['2016-01-01 00:00:08', 'one'] = None
|
||||
assert_frame_equal(expected_df, actual_df)
|
||||
|
||||
expected_series = ts.asfreq(freq='1S').fillna(9.0)
|
||||
actual_series = ts.asfreq(freq='1S', fill_value=9.0)
|
||||
assert_series_equal(expected_series, actual_series)
|
||||
|
||||
@pytest.mark.parametrize("data,idx,expected_first,expected_last", [
|
||||
({'A': [1, 2, 3]}, [1, 1, 2], 1, 2),
|
||||
({'A': [1, 2, 3]}, [1, 2, 2], 1, 2),
|
||||
({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'),
|
||||
({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2),
|
||||
({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2),
|
||||
({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)])
|
||||
def test_first_last_valid(self, data, idx,
|
||||
expected_first, expected_last):
|
||||
N = len(self.frame.index)
|
||||
mat = randn(N)
|
||||
mat[:5] = nan
|
||||
mat[-5:] = nan
|
||||
|
||||
frame = DataFrame({'foo': mat}, index=self.frame.index)
|
||||
index = frame.first_valid_index()
|
||||
|
||||
assert index == frame.index[5]
|
||||
|
||||
index = frame.last_valid_index()
|
||||
assert index == frame.index[-6]
|
||||
|
||||
# GH12800
|
||||
empty = DataFrame()
|
||||
assert empty.last_valid_index() is None
|
||||
assert empty.first_valid_index() is None
|
||||
|
||||
# GH17400: no valid entries
|
||||
frame[:] = nan
|
||||
assert frame.last_valid_index() is None
|
||||
assert frame.first_valid_index() is None
|
||||
|
||||
# GH20499: its preserves freq with holes
|
||||
frame.index = date_range("20110101", periods=N, freq="B")
|
||||
frame.iloc[1] = 1
|
||||
frame.iloc[-2] = 1
|
||||
assert frame.first_valid_index() == frame.index[1]
|
||||
assert frame.last_valid_index() == frame.index[-2]
|
||||
assert frame.first_valid_index().freq == frame.index.freq
|
||||
assert frame.last_valid_index().freq == frame.index.freq
|
||||
|
||||
# GH 21441
|
||||
df = DataFrame(data, index=idx)
|
||||
assert expected_first == df.first_valid_index()
|
||||
assert expected_last == df.last_valid_index()
|
||||
|
||||
def test_first_subset(self):
|
||||
ts = tm.makeTimeDataFrame(freq='12h')
|
||||
result = ts.first('10d')
|
||||
assert len(result) == 20
|
||||
|
||||
ts = tm.makeTimeDataFrame(freq='D')
|
||||
result = ts.first('10d')
|
||||
assert len(result) == 10
|
||||
|
||||
result = ts.first('3M')
|
||||
expected = ts[:'3/31/2000']
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts.first('21D')
|
||||
expected = ts[:21]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts[:0].first('3M')
|
||||
assert_frame_equal(result, ts[:0])
|
||||
|
||||
def test_first_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.first('1D')
|
||||
|
||||
def test_last_subset(self):
|
||||
ts = tm.makeTimeDataFrame(freq='12h')
|
||||
result = ts.last('10d')
|
||||
assert len(result) == 20
|
||||
|
||||
ts = tm.makeTimeDataFrame(nper=30, freq='D')
|
||||
result = ts.last('10d')
|
||||
assert len(result) == 10
|
||||
|
||||
result = ts.last('21D')
|
||||
expected = ts['2000-01-10':]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts.last('21D')
|
||||
expected = ts[-21:]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts[:0].last('3M')
|
||||
assert_frame_equal(result, ts[:0])
|
||||
|
||||
def test_last_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.last('1D')
|
||||
|
||||
def test_at_time(self):
|
||||
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
|
||||
rs = ts.at_time(rng[1])
|
||||
assert (rs.index.hour == rng[1].hour).all()
|
||||
assert (rs.index.minute == rng[1].minute).all()
|
||||
assert (rs.index.second == rng[1].second).all()
|
||||
|
||||
result = ts.at_time('9:30')
|
||||
expected = ts.at_time(time(9, 30))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts.loc[time(9, 30)]
|
||||
expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)]
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# midnight, everything
|
||||
rng = date_range('1/1/2000', '1/31/2000')
|
||||
ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
|
||||
|
||||
result = ts.at_time(time(0, 0))
|
||||
assert_frame_equal(result, ts)
|
||||
|
||||
# time doesn't exist
|
||||
rng = date_range('1/1/2012', freq='23Min', periods=384)
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), rng)
|
||||
rs = ts.at_time('16:00')
|
||||
assert len(rs) == 0
|
||||
|
||||
def test_at_time_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.at_time('00:00')
|
||||
|
||||
def test_between_time(self):
|
||||
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
|
||||
stime = time(0, 0)
|
||||
etime = time(1, 0)
|
||||
|
||||
close_open = product([True, False], [True, False])
|
||||
for inc_start, inc_end in close_open:
|
||||
filtered = ts.between_time(stime, etime, inc_start, inc_end)
|
||||
exp_len = 13 * 4 + 1
|
||||
if not inc_start:
|
||||
exp_len -= 5
|
||||
if not inc_end:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inc_start:
|
||||
assert t >= stime
|
||||
else:
|
||||
assert t > stime
|
||||
|
||||
if inc_end:
|
||||
assert t <= etime
|
||||
else:
|
||||
assert t < etime
|
||||
|
||||
result = ts.between_time('00:00', '01:00')
|
||||
expected = ts.between_time(stime, etime)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# across midnight
|
||||
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
|
||||
stime = time(22, 0)
|
||||
etime = time(9, 0)
|
||||
|
||||
close_open = product([True, False], [True, False])
|
||||
for inc_start, inc_end in close_open:
|
||||
filtered = ts.between_time(stime, etime, inc_start, inc_end)
|
||||
exp_len = (12 * 11 + 1) * 4 + 1
|
||||
if not inc_start:
|
||||
exp_len -= 4
|
||||
if not inc_end:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inc_start:
|
||||
assert (t >= stime) or (t <= etime)
|
||||
else:
|
||||
assert (t > stime) or (t <= etime)
|
||||
|
||||
if inc_end:
|
||||
assert (t <= etime) or (t >= stime)
|
||||
else:
|
||||
assert (t < etime) or (t >= stime)
|
||||
|
||||
def test_between_time_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.between_time(start_time='00:00', end_time='12:00')
|
||||
|
||||
def test_operation_on_NaT(self):
|
||||
# Both NaT and Timestamp are in DataFrame.
|
||||
df = pd.DataFrame({'foo': [pd.NaT, pd.NaT,
|
||||
pd.Timestamp('2012-05-01')]})
|
||||
|
||||
res = df.min()
|
||||
exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.max()
|
||||
exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
# GH12941, only NaTs are in DataFrame.
|
||||
df = pd.DataFrame({'foo': [pd.NaT, pd.NaT]})
|
||||
|
||||
res = df.min()
|
||||
exp = pd.Series([pd.NaT], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.max()
|
||||
exp = pd.Series([pd.NaT], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_datetime_assignment_with_NaT_and_diff_time_units(self):
|
||||
# GH 7492
|
||||
data_ns = np.array([1, 'nat'], dtype='datetime64[ns]')
|
||||
result = pd.Series(data_ns).to_frame()
|
||||
result['new'] = data_ns
|
||||
expected = pd.DataFrame({0: [1, None],
|
||||
'new': [1, None]}, dtype='datetime64[ns]')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
# OutOfBoundsDatetime error shouldn't occur
|
||||
data_s = np.array([1, 'nat'], dtype='datetime64[s]')
|
||||
result['new'] = data_s
|
||||
expected = pd.DataFrame({0: [1, None],
|
||||
'new': [1e9, None]}, dtype='datetime64[ns]')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_frame_to_period(self):
|
||||
K = 5
|
||||
from pandas.core.indexes.period import period_range
|
||||
|
||||
dr = date_range('1/1/2000', '1/1/2001')
|
||||
pr = period_range('1/1/2000', '1/1/2001')
|
||||
df = DataFrame(randn(len(dr), K), index=dr)
|
||||
df['mix'] = 'a'
|
||||
|
||||
pts = df.to_period()
|
||||
exp = df.copy()
|
||||
exp.index = pr
|
||||
assert_frame_equal(pts, exp)
|
||||
|
||||
pts = df.to_period('M')
|
||||
tm.assert_index_equal(pts.index, exp.index.asfreq('M'))
|
||||
|
||||
df = df.T
|
||||
pts = df.to_period(axis=1)
|
||||
exp = df.copy()
|
||||
exp.columns = pr
|
||||
assert_frame_equal(pts, exp)
|
||||
|
||||
pts = df.to_period('M', axis=1)
|
||||
tm.assert_index_equal(pts.columns, exp.columns.asfreq('M'))
|
||||
|
||||
pytest.raises(ValueError, df.to_period, axis=2)
|
||||
|
||||
@pytest.mark.parametrize("fn", ['tz_localize', 'tz_convert'])
|
||||
def test_tz_convert_and_localize(self, fn):
|
||||
l0 = date_range('20140701', periods=5, freq='D')
|
||||
|
||||
# TODO: l1 should be a PeriodIndex for testing
|
||||
# after GH2106 is addressed
|
||||
with pytest.raises(NotImplementedError):
|
||||
period_range('20140701', periods=1).tz_convert('UTC')
|
||||
with pytest.raises(NotImplementedError):
|
||||
period_range('20140701', periods=1).tz_localize('UTC')
|
||||
# l1 = period_range('20140701', periods=5, freq='D')
|
||||
l1 = date_range('20140701', periods=5, freq='D')
|
||||
|
||||
int_idx = Index(range(5))
|
||||
|
||||
if fn == 'tz_convert':
|
||||
l0 = l0.tz_localize('UTC')
|
||||
l1 = l1.tz_localize('UTC')
|
||||
|
||||
for idx in [l0, l1]:
|
||||
|
||||
l0_expected = getattr(idx, fn)('US/Pacific')
|
||||
l1_expected = getattr(idx, fn)('US/Pacific')
|
||||
|
||||
df1 = DataFrame(np.ones(5), index=l0)
|
||||
df1 = getattr(df1, fn)('US/Pacific')
|
||||
assert_index_equal(df1.index, l0_expected)
|
||||
|
||||
# MultiIndex
|
||||
# GH7846
|
||||
df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1]))
|
||||
|
||||
df3 = getattr(df2, fn)('US/Pacific', level=0)
|
||||
assert not df3.index.levels[0].equals(l0)
|
||||
assert_index_equal(df3.index.levels[0], l0_expected)
|
||||
assert_index_equal(df3.index.levels[1], l1)
|
||||
assert not df3.index.levels[1].equals(l1_expected)
|
||||
|
||||
df3 = getattr(df2, fn)('US/Pacific', level=1)
|
||||
assert_index_equal(df3.index.levels[0], l0)
|
||||
assert not df3.index.levels[0].equals(l0_expected)
|
||||
assert_index_equal(df3.index.levels[1], l1_expected)
|
||||
assert not df3.index.levels[1].equals(l1)
|
||||
|
||||
df4 = DataFrame(np.ones(5),
|
||||
MultiIndex.from_arrays([int_idx, l0]))
|
||||
|
||||
# TODO: untested
|
||||
df5 = getattr(df4, fn)('US/Pacific', level=1) # noqa
|
||||
|
||||
assert_index_equal(df3.index.levels[0], l0)
|
||||
assert not df3.index.levels[0].equals(l0_expected)
|
||||
assert_index_equal(df3.index.levels[1], l1_expected)
|
||||
assert not df3.index.levels[1].equals(l1)
|
||||
|
||||
# Bad Inputs
|
||||
|
||||
# Not DatetimeIndex / PeriodIndex
|
||||
with assert_raises_regex(TypeError, 'DatetimeIndex'):
|
||||
df = DataFrame(index=int_idx)
|
||||
df = getattr(df, fn)('US/Pacific')
|
||||
|
||||
# Not DatetimeIndex / PeriodIndex
|
||||
with assert_raises_regex(TypeError, 'DatetimeIndex'):
|
||||
df = DataFrame(np.ones(5),
|
||||
MultiIndex.from_arrays([int_idx, l0]))
|
||||
df = getattr(df, fn)('US/Pacific', level=0)
|
||||
|
||||
# Invalid level
|
||||
with assert_raises_regex(ValueError, 'not valid'):
|
||||
df = DataFrame(index=l0)
|
||||
df = getattr(df, fn)('US/Pacific', level=1)
|
||||
@@ -1,145 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for DataFrame timezone-related methods
|
||||
"""
|
||||
from datetime import datetime
|
||||
|
||||
import pytest
|
||||
import pytz
|
||||
import numpy as np
|
||||
|
||||
import pandas.util.testing as tm
|
||||
from pandas.compat import lrange
|
||||
from pandas.core.indexes.datetimes import date_range
|
||||
from pandas.core.dtypes.dtypes import DatetimeTZDtype
|
||||
from pandas import Series, DataFrame
|
||||
|
||||
|
||||
class TestDataFrameTimezones(object):
|
||||
def test_frame_from_records_utc(self):
|
||||
rec = {'datum': 1.5,
|
||||
'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)}
|
||||
|
||||
# it works
|
||||
DataFrame.from_records([rec], index='begin_time')
|
||||
|
||||
def test_frame_tz_localize(self):
|
||||
rng = date_range('1/1/2011', periods=100, freq='H')
|
||||
|
||||
df = DataFrame({'a': 1}, index=rng)
|
||||
result = df.tz_localize('utc')
|
||||
expected = DataFrame({'a': 1}, rng.tz_localize('UTC'))
|
||||
assert result.index.tz.zone == 'UTC'
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = df.T
|
||||
result = df.tz_localize('utc', axis=1)
|
||||
assert result.columns.tz.zone == 'UTC'
|
||||
tm.assert_frame_equal(result, expected.T)
|
||||
|
||||
def test_frame_tz_convert(self):
|
||||
rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern')
|
||||
|
||||
df = DataFrame({'a': 1}, index=rng)
|
||||
result = df.tz_convert('Europe/Berlin')
|
||||
expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin'))
|
||||
assert result.index.tz.zone == 'Europe/Berlin'
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = df.T
|
||||
result = df.tz_convert('Europe/Berlin', axis=1)
|
||||
assert result.columns.tz.zone == 'Europe/Berlin'
|
||||
tm.assert_frame_equal(result, expected.T)
|
||||
|
||||
def test_frame_join_tzaware(self):
|
||||
test1 = DataFrame(np.zeros((6, 3)),
|
||||
index=date_range("2012-11-15 00:00:00", periods=6,
|
||||
freq="100L", tz="US/Central"))
|
||||
test2 = DataFrame(np.zeros((3, 3)),
|
||||
index=date_range("2012-11-15 00:00:00", periods=3,
|
||||
freq="250L", tz="US/Central"),
|
||||
columns=lrange(3, 6))
|
||||
|
||||
result = test1.join(test2, how='outer')
|
||||
ex_index = test1.index.union(test2.index)
|
||||
|
||||
tm.assert_index_equal(result.index, ex_index)
|
||||
assert result.index.tz.zone == 'US/Central'
|
||||
|
||||
def test_frame_add_tz_mismatch_converts_to_utc(self):
|
||||
rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern')
|
||||
df = DataFrame(np.random.randn(len(rng)), index=rng, columns=['a'])
|
||||
|
||||
df_moscow = df.tz_convert('Europe/Moscow')
|
||||
result = df + df_moscow
|
||||
assert result.index.tz is pytz.utc
|
||||
|
||||
result = df_moscow + df
|
||||
assert result.index.tz is pytz.utc
|
||||
|
||||
def test_frame_align_aware(self):
    """align() keeps a shared tz, and converts to UTC when the tzs differ.

    Covers frame/frame, frame/Series, and Series/frame alignment.
    """
    idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern')
    idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern')
    df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
    df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
    new1, new2 = df1.align(df2)
    assert df1.index.tz == new1.index.tz
    assert df2.index.tz == new2.index.tz

    # different timezones convert to UTC

    # frame with frame
    df1_central = df1.tz_convert('US/Central')
    new1, new2 = df1.align(df1_central)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC

    # frame with Series
    new1, new2 = df1.align(df1_central[0], axis=0)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC

    # Series with frame
    # BUG FIX: the align() result was previously discarded, so the two
    # asserts below re-checked the stale new1/new2 from the case above;
    # binding the result makes this case actually tested
    new1, new2 = df1[0].align(df1_central, axis=0)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC
@pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
def test_frame_no_datetime64_dtype(self, tz):
    """Tz-aware columns keep a tz dtype instead of plain datetime64[ns]."""
    # after GH#7822: timezones are retained on dict construction
    naive_idx = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
    aware_idx = naive_idx.tz_localize(tz)
    frame = DataFrame({'A': 'foo', 'B': aware_idx}, index=naive_idx)
    tz_dtype = DatetimeTZDtype('ns', aware_idx.tzinfo)
    assert frame['B'].dtype == tz_dtype

    # GH#2810 (with timezones): the same split holds for python datetimes
    pydatetimes_naive = [ts.to_pydatetime() for ts in naive_idx]
    pydatetimes_aware = [ts.to_pydatetime() for ts in aware_idx]
    columns = {'dr': naive_idx,
               'dr_tz': aware_idx,
               'datetimes_naive': pydatetimes_naive,
               'datetimes_with_tz': pydatetimes_aware}
    frame = DataFrame(columns)
    result = frame.get_dtype_counts().sort_index()
    expected = Series({'datetime64[ns]': 2,
                       str(tz_dtype): 2}).sort_index()
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
def test_frame_reset_index(self, tz):
    """A reset_index/set_index round trip keeps the tz on the index."""
    idx = date_range('2012-06-02', periods=10, tz=tz)
    frame = DataFrame(np.random.randn(len(idx)), idx)
    roundtripped = frame.reset_index().set_index('index')
    assert roundtripped.index.tz == frame.index.tz
@pytest.mark.parametrize('tz', [None, 'America/New_York'])
def test_boolean_compare_transpose_tzindex_with_dst(self, tz):
    """GH 19970: comparing a transposed tz-aware frame with itself is all-True."""
    idx = date_range('20161101', '20161130', freq='4H', tz=tz)
    values = range(len(idx))
    frame = DataFrame({'a': values, 'b': values}, index=idx)

    transposed = frame.T
    result = transposed == transposed
    expected = DataFrame(True, index=list('ab'), columns=idx)
    tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large
Load Diff
@@ -1,33 +0,0 @@
|
||||
from pandas.core.frame import DataFrame
|
||||
|
||||
import pytest
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def dataframe():
    """Two-column integer frame shared by the validation tests below."""
    data = {'a': [1, 2], 'b': [3, 4]}
    return DataFrame(data)
class TestDataFrameValidate(object):
    """Tests for error handling related to data types of method arguments."""

    # extra required arguments for the methods that cannot be called bare
    _extra_kwargs = {
        "query": {"expr": "a > b"},
        "eval": {"expr": "a + b"},
        "set_index": {"keys": ["a"]},
        "sort_values": {"by": ["a"]},
    }

    @pytest.mark.parametrize("func", ["query", "eval", "set_index",
                                      "reset_index", "dropna",
                                      "drop_duplicates", "sort_values"])
    @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0])
    def test_validate_bool_args(self, dataframe, func, inplace):
        """A non-bool ``inplace`` must raise ValueError for every method."""
        msg = "For argument \"inplace\" expected type bool"
        kwargs = {"inplace": inplace}
        kwargs.update(self._extra_kwargs.get(func, {}))

        with tm.assert_raises_regex(ValueError, msg):
            getattr(dataframe, func)(**kwargs)
Reference in New Issue
Block a user