demo + utils venv

This commit is contained in:
d3m1g0d
2019-02-03 13:40:10 +01:00
parent 5fa112490b
commit cfa9c8ea23
5994 changed files with 1353819 additions and 0 deletions
@@ -0,0 +1,141 @@
import numpy as np
from pandas.util._decorators import cache_readonly
import pandas as pd
from pandas import compat
import pandas.util.testing as tm
_seriesd = tm.getSeriesData()
_tsd = tm.getTimeSeriesData()
_frame = pd.DataFrame(_seriesd)
_frame2 = pd.DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])
_intframe = pd.DataFrame({k: v.astype(int)
for k, v in compat.iteritems(_seriesd)})
_tsframe = pd.DataFrame(_tsd)
_mixed_frame = _frame.copy()
_mixed_frame['foo'] = 'bar'
class TestData(object):
@cache_readonly
def frame(self):
return _frame.copy()
@cache_readonly
def frame2(self):
return _frame2.copy()
@cache_readonly
def intframe(self):
# force these all to int64 to avoid platform testing issues
return pd.DataFrame({c: s for c, s in compat.iteritems(_intframe)},
dtype=np.int64)
@cache_readonly
def tsframe(self):
return _tsframe.copy()
@cache_readonly
def mixed_frame(self):
return _mixed_frame.copy()
@cache_readonly
def mixed_float(self):
return pd.DataFrame({'A': _frame['A'].copy().astype('float32'),
'B': _frame['B'].copy().astype('float32'),
'C': _frame['C'].copy().astype('float16'),
'D': _frame['D'].copy().astype('float64')})
@cache_readonly
def mixed_float2(self):
return pd.DataFrame({'A': _frame2['A'].copy().astype('float32'),
'B': _frame2['B'].copy().astype('float32'),
'C': _frame2['C'].copy().astype('float16'),
'D': _frame2['D'].copy().astype('float64')})
@cache_readonly
def mixed_int(self):
return pd.DataFrame({'A': _intframe['A'].copy().astype('int32'),
'B': np.ones(len(_intframe['B']), dtype='uint64'),
'C': _intframe['C'].copy().astype('uint8'),
'D': _intframe['D'].copy().astype('int64')})
@cache_readonly
def all_mixed(self):
return pd.DataFrame({'a': 1., 'b': 2, 'c': 'foo',
'float32': np.array([1.] * 10, dtype='float32'),
'int32': np.array([1] * 10, dtype='int32')},
index=np.arange(10))
@cache_readonly
def tzframe(self):
result = pd.DataFrame({'A': pd.date_range('20130101', periods=3),
'B': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'C': pd.date_range('20130101', periods=3,
tz='CET')})
result.iloc[1, 1] = pd.NaT
result.iloc[1, 2] = pd.NaT
return result
@cache_readonly
def empty(self):
return pd.DataFrame({})
@cache_readonly
def ts1(self):
return tm.makeTimeSeries(nper=30)
@cache_readonly
def ts2(self):
return tm.makeTimeSeries(nper=30)[5:]
@cache_readonly
def simple(self):
arr = np.array([[1., 2., 3.],
[4., 5., 6.],
[7., 8., 9.]])
return pd.DataFrame(arr, columns=['one', 'two', 'three'],
index=['a', 'b', 'c'])
# self.ts3 = tm.makeTimeSeries()[-5:]
# self.ts4 = tm.makeTimeSeries()[1:-1]
def _check_mixed_float(df, dtype=None):
# float16 are most likely to be upcasted to float32
dtypes = dict(A='float32', B='float32', C='float16', D='float64')
if isinstance(dtype, compat.string_types):
dtypes = {k: dtype for k, v in dtypes.items()}
elif isinstance(dtype, dict):
dtypes.update(dtype)
if dtypes.get('A'):
assert(df.dtypes['A'] == dtypes['A'])
if dtypes.get('B'):
assert(df.dtypes['B'] == dtypes['B'])
if dtypes.get('C'):
assert(df.dtypes['C'] == dtypes['C'])
if dtypes.get('D'):
assert(df.dtypes['D'] == dtypes['D'])
def _check_mixed_int(df, dtype=None):
dtypes = dict(A='int32', B='uint64', C='uint8', D='int64')
if isinstance(dtype, compat.string_types):
dtypes = {k: dtype for k, v in dtypes.items()}
elif isinstance(dtype, dict):
dtypes.update(dtype)
if dtypes.get('A'):
assert(df.dtypes['A'] == dtypes['A'])
if dtypes.get('B'):
assert(df.dtypes['B'] == dtypes['B'])
if dtypes.get('C'):
assert(df.dtypes['C'] == dtypes['C'])
if dtypes.get('D'):
assert(df.dtypes['D'] == dtypes['D'])
@@ -0,0 +1,221 @@
import numpy as np
import pytest
from pandas import DataFrame, NaT, compat, date_range
import pandas.util.testing as tm
@pytest.fixture
def float_frame():
"""
Fixture for DataFrame of floats with index of unique strings
Columns are ['A', 'B', 'C', 'D'].
"""
return DataFrame(tm.getSeriesData())
@pytest.fixture
def float_frame_with_na():
"""
Fixture for DataFrame of floats with index of unique strings
Columns are ['A', 'B', 'C', 'D']; some entries are missing
"""
df = DataFrame(tm.getSeriesData())
# set some NAs
df.loc[5:10] = np.nan
df.loc[15:20, -2:] = np.nan
return df
@pytest.fixture
def float_frame2():
"""
Fixture for DataFrame of floats with index of unique strings
Columns are ['D', 'C', 'B', 'A']
"""
return DataFrame(tm.getSeriesData(), columns=['D', 'C', 'B', 'A'])
@pytest.fixture
def bool_frame_with_na():
"""
Fixture for DataFrame of booleans with index of unique strings
Columns are ['A', 'B', 'C', 'D']; some entries are missing
"""
df = DataFrame(tm.getSeriesData()) > 0
df = df.astype(object)
# set some NAs
df.loc[5:10] = np.nan
df.loc[15:20, -2:] = np.nan
return df
@pytest.fixture
def int_frame():
"""
Fixture for DataFrame of ints with index of unique strings
Columns are ['A', 'B', 'C', 'D']
"""
df = DataFrame({k: v.astype(int)
for k, v in compat.iteritems(tm.getSeriesData())})
# force these all to int64 to avoid platform testing issues
return DataFrame({c: s for c, s in compat.iteritems(df)}, dtype=np.int64)
@pytest.fixture
def datetime_frame():
"""
Fixture for DataFrame of floats with DatetimeIndex
Columns are ['A', 'B', 'C', 'D']
"""
return DataFrame(tm.getTimeSeriesData())
@pytest.fixture
def float_string_frame():
"""
Fixture for DataFrame of floats and strings with index of unique strings
Columns are ['A', 'B', 'C', 'D', 'foo'].
"""
df = DataFrame(tm.getSeriesData())
df['foo'] = 'bar'
return df
@pytest.fixture
def mixed_float_frame():
"""
Fixture for DataFrame of different float types with index of unique strings
Columns are ['A', 'B', 'C', 'D'].
"""
df = DataFrame(tm.getSeriesData())
df.A = df.A.astype('float32')
df.B = df.B.astype('float32')
df.C = df.C.astype('float16')
df.D = df.D.astype('float64')
return df
@pytest.fixture
def mixed_float_frame2():
"""
Fixture for DataFrame of different float types with index of unique strings
Columns are ['A', 'B', 'C', 'D'].
"""
df = DataFrame(tm.getSeriesData())
df.D = df.D.astype('float32')
df.C = df.C.astype('float32')
df.B = df.B.astype('float16')
df.D = df.D.astype('float64')
return df
@pytest.fixture
def mixed_int_frame():
"""
Fixture for DataFrame of different int types with index of unique strings
Columns are ['A', 'B', 'C', 'D'].
"""
df = DataFrame({k: v.astype(int)
for k, v in compat.iteritems(tm.getSeriesData())})
df.A = df.A.astype('int32')
df.B = np.ones(len(df.B), dtype='uint64')
df.C = df.C.astype('uint8')
df.D = df.C.astype('int64')
return df
@pytest.fixture
def mixed_type_frame():
"""
Fixture for DataFrame of float/int/string columns with RangeIndex
Columns are ['a', 'b', 'c', 'float32', 'int32'].
"""
return DataFrame({'a': 1., 'b': 2, 'c': 'foo',
'float32': np.array([1.] * 10, dtype='float32'),
'int32': np.array([1] * 10, dtype='int32')},
index=np.arange(10))
@pytest.fixture
def timezone_frame():
"""
Fixture for DataFrame of date_range Series with different time zones
Columns are ['A', 'B', 'C']; some entries are missing
"""
df = DataFrame({'A': date_range('20130101', periods=3),
'B': date_range('20130101', periods=3,
tz='US/Eastern'),
'C': date_range('20130101', periods=3,
tz='CET')})
df.iloc[1, 1] = NaT
df.iloc[1, 2] = NaT
return df
@pytest.fixture
def empty_frame():
"""
Fixture for empty DataFrame
"""
return DataFrame({})
@pytest.fixture
def datetime_series():
"""
Fixture for Series of floats with DatetimeIndex
"""
return tm.makeTimeSeries(nper=30)
@pytest.fixture
def datetime_series_short():
"""
Fixture for Series of floats with DatetimeIndex
"""
return tm.makeTimeSeries(nper=30)[5:]
@pytest.fixture
def simple_frame():
"""
Fixture for simple 3x3 DataFrame
Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c'].
"""
arr = np.array([[1., 2., 3.],
[4., 5., 6.],
[7., 8., 9.]])
return DataFrame(arr, columns=['one', 'two', 'three'],
index=['a', 'b', 'c'])
@pytest.fixture
def frame_of_index_cols():
"""
Fixture for DataFrame of columns that can be used for indexing
Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
"""
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
'B': ['one', 'two', 'three', 'one', 'two'],
'C': ['a', 'b', 'c', 'd', 'e'],
'D': np.random.randn(5),
'E': np.random.randn(5),
('tuple', 'as', 'label'): np.random.randn(5)})
return df
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,534 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
# pylint: disable-msg=W0612,E1101
from copy import deepcopy
import pydoc
import numpy as np
import pytest
from pandas.compat import long, lrange, range
import pandas as pd
from pandas import (
Categorical, DataFrame, Series, SparseDataFrame, compat, date_range,
timedelta_range)
import pandas.util.testing as tm
from pandas.util.testing import (
assert_almost_equal, assert_frame_equal, assert_series_equal)
class SharedWithSparse(object):
"""
A collection of tests DataFrame and SparseDataFrame can share.
In generic tests on this class, use ``self._assert_frame_equal()`` and
``self._assert_series_equal()`` which are implemented in sub-classes
and dispatch correctly.
"""
def _assert_frame_equal(self, left, right):
"""Dispatch to frame class dependent assertion"""
raise NotImplementedError
def _assert_series_equal(self, left, right):
"""Dispatch to series class dependent assertion"""
raise NotImplementedError
def test_copy_index_name_checking(self, float_frame):
# don't want to be able to modify the index stored elsewhere after
# making a copy
for attr in ('index', 'columns'):
ind = getattr(float_frame, attr)
ind.name = None
cp = float_frame.copy()
getattr(cp, attr).name = 'foo'
assert getattr(float_frame, attr).name is None
def test_getitem_pop_assign_name(self, float_frame):
s = float_frame['A']
assert s.name == 'A'
s = float_frame.pop('A')
assert s.name == 'A'
s = float_frame.loc[:, 'B']
assert s.name == 'B'
s2 = s.loc[:]
assert s2.name == 'B'
def test_get_value(self, float_frame):
for idx in float_frame.index:
for col in float_frame.columns:
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = float_frame.get_value(idx, col)
expected = float_frame[col][idx]
tm.assert_almost_equal(result, expected)
def test_add_prefix_suffix(self, float_frame):
with_prefix = float_frame.add_prefix('foo#')
expected = pd.Index(['foo#%s' % c for c in float_frame.columns])
tm.assert_index_equal(with_prefix.columns, expected)
with_suffix = float_frame.add_suffix('#foo')
expected = pd.Index(['%s#foo' % c for c in float_frame.columns])
tm.assert_index_equal(with_suffix.columns, expected)
with_pct_prefix = float_frame.add_prefix('%')
expected = pd.Index(['%{}'.format(c) for c in float_frame.columns])
tm.assert_index_equal(with_pct_prefix.columns, expected)
with_pct_suffix = float_frame.add_suffix('%')
expected = pd.Index(['{}%'.format(c) for c in float_frame.columns])
tm.assert_index_equal(with_pct_suffix.columns, expected)
def test_get_axis(self, float_frame):
f = float_frame
assert f._get_axis_number(0) == 0
assert f._get_axis_number(1) == 1
assert f._get_axis_number('index') == 0
assert f._get_axis_number('rows') == 0
assert f._get_axis_number('columns') == 1
assert f._get_axis_name(0) == 'index'
assert f._get_axis_name(1) == 'columns'
assert f._get_axis_name('index') == 'index'
assert f._get_axis_name('rows') == 'index'
assert f._get_axis_name('columns') == 'columns'
assert f._get_axis(0) is f.index
assert f._get_axis(1) is f.columns
with pytest.raises(ValueError, match='No axis named'):
f._get_axis_number(2)
with pytest.raises(ValueError, match='No axis.*foo'):
f._get_axis_name('foo')
with pytest.raises(ValueError, match='No axis.*None'):
f._get_axis_name(None)
with pytest.raises(ValueError, match='No axis named'):
f._get_axis_number(None)
def test_keys(self, float_frame):
getkeys = float_frame.keys
assert getkeys() is float_frame.columns
def test_column_contains_typeerror(self, float_frame):
try:
float_frame.columns in float_frame
except TypeError:
pass
def test_tab_completion(self):
# DataFrame whose columns are identifiers shall have them in __dir__.
df = pd.DataFrame([list('abcd'), list('efgh')], columns=list('ABCD'))
for key in list('ABCD'):
assert key in dir(df)
assert isinstance(df.__getitem__('A'), pd.Series)
# DataFrame whose first-level columns are identifiers shall have
# them in __dir__.
df = pd.DataFrame(
[list('abcd'), list('efgh')],
columns=pd.MultiIndex.from_tuples(list(zip('ABCD', 'EFGH'))))
for key in list('ABCD'):
assert key in dir(df)
for key in list('EFGH'):
assert key not in dir(df)
assert isinstance(df.__getitem__('A'), pd.DataFrame)
def test_not_hashable(self, empty_frame):
df = self.klass([1])
pytest.raises(TypeError, hash, df)
pytest.raises(TypeError, hash, empty_frame)
def test_new_empty_index(self):
df1 = self.klass(np.random.randn(0, 3))
df2 = self.klass(np.random.randn(0, 3))
df1.index.name = 'foo'
assert df2.index.name is None
def test_array_interface(self, float_frame):
with np.errstate(all='ignore'):
result = np.sqrt(float_frame)
assert isinstance(result, type(float_frame))
assert result.index is float_frame.index
assert result.columns is float_frame.columns
self._assert_frame_equal(result, float_frame.apply(np.sqrt))
def test_get_agg_axis(self, float_frame):
cols = float_frame._get_agg_axis(0)
assert cols is float_frame.columns
idx = float_frame._get_agg_axis(1)
assert idx is float_frame.index
pytest.raises(ValueError, float_frame._get_agg_axis, 2)
def test_nonzero(self, float_frame, float_string_frame, empty_frame):
assert empty_frame.empty
assert not float_frame.empty
assert not float_string_frame.empty
# corner case
df = DataFrame({'A': [1., 2., 3.],
'B': ['a', 'b', 'c']},
index=np.arange(3))
del df['A']
assert not df.empty
def test_iteritems(self):
df = self.klass([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
for k, v in compat.iteritems(df):
assert isinstance(v, self.klass._constructor_sliced)
def test_items(self):
# GH 17213, GH 13918
cols = ['a', 'b', 'c']
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
for c, (k, v) in zip(cols, df.items()):
assert c == k
assert isinstance(v, Series)
assert (df[k] == v).all()
def test_iter(self, float_frame):
assert tm.equalContents(list(float_frame), float_frame.columns)
def test_iterrows(self, float_frame, float_string_frame):
for k, v in float_frame.iterrows():
exp = float_frame.loc[k]
self._assert_series_equal(v, exp)
for k, v in float_string_frame.iterrows():
exp = float_string_frame.loc[k]
self._assert_series_equal(v, exp)
def test_iterrows_iso8601(self):
# GH 19671
if self.klass == SparseDataFrame:
pytest.xfail(reason='SparseBlock datetime type not implemented.')
s = self.klass(
{'non_iso8601': ['M1701', 'M1802', 'M1903', 'M2004'],
'iso8601': date_range('2000-01-01', periods=4, freq='M')})
for k, v in s.iterrows():
exp = s.loc[k]
self._assert_series_equal(v, exp)
def test_itertuples(self, float_frame):
for i, tup in enumerate(float_frame.itertuples()):
s = self.klass._constructor_sliced(tup[1:])
s.name = tup[0]
expected = float_frame.iloc[i, :].reset_index(drop=True)
self._assert_series_equal(s, expected)
df = self.klass({'floats': np.random.randn(5),
'ints': lrange(5)}, columns=['floats', 'ints'])
for tup in df.itertuples(index=False):
assert isinstance(tup[1], (int, long))
df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
dfaa = df[['a', 'a']]
assert (list(dfaa.itertuples()) ==
[(0, 1, 1), (1, 2, 2), (2, 3, 3)])
# repr with be int/long on 32-bit/windows
if not (compat.is_platform_windows() or compat.is_platform_32bit()):
assert (repr(list(df.itertuples(name=None))) ==
'[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')
tup = next(df.itertuples(name='TestName'))
assert tup._fields == ('Index', 'a', 'b')
assert (tup.Index, tup.a, tup.b) == tup
assert type(tup).__name__ == 'TestName'
df.columns = ['def', 'return']
tup2 = next(df.itertuples(name='TestName'))
assert tup2 == (0, 1, 4)
assert tup2._fields == ('Index', '_1', '_2')
df3 = DataFrame({'f' + str(i): [i] for i in range(1024)})
# will raise SyntaxError if trying to create namedtuple
tup3 = next(df3.itertuples())
assert not hasattr(tup3, '_fields')
assert isinstance(tup3, tuple)
def test_sequence_like_with_categorical(self):
# GH 7839
# make sure can iterate
df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
"raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
df['grade'] = Categorical(df['raw_grade'])
# basic sequencing testing
result = list(df.grade.values)
expected = np.array(df.grade.values).tolist()
tm.assert_almost_equal(result, expected)
# iteration
for t in df.itertuples(index=False):
str(t)
for row, s in df.iterrows():
str(s)
for c, col in df.iteritems():
str(s)
def test_len(self, float_frame):
assert len(float_frame) == len(float_frame.index)
def test_values(self, float_frame, float_string_frame):
frame = float_frame
arr = frame.values
frame_cols = frame.columns
for i, row in enumerate(arr):
for j, value in enumerate(row):
col = frame_cols[j]
if np.isnan(value):
assert np.isnan(frame[col][i])
else:
assert value == frame[col][i]
# mixed type
arr = float_string_frame[['foo', 'A']].values
assert arr[0, 0] == 'bar'
df = self.klass({'complex': [1j, 2j, 3j], 'real': [1, 2, 3]})
arr = df.values
assert arr[0, 0] == 1j
# single block corner case
arr = float_frame[['A', 'B']].values
expected = float_frame.reindex(columns=['A', 'B']).values
assert_almost_equal(arr, expected)
def test_to_numpy(self):
df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
expected = np.array([[1, 3], [2, 4.5]])
result = df.to_numpy()
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_dtype(self):
df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
expected = np.array([[1, 3], [2, 4]], dtype="int64")
result = df.to_numpy(dtype="int64")
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_copy(self):
arr = np.random.randn(4, 3)
df = pd.DataFrame(arr)
assert df.values.base is arr
assert df.to_numpy(copy=False).base is arr
assert df.to_numpy(copy=True).base is None
def test_transpose(self, float_frame):
frame = float_frame
dft = frame.T
for idx, series in compat.iteritems(dft):
for col, value in compat.iteritems(series):
if np.isnan(value):
assert np.isnan(frame[col][idx])
else:
assert value == frame[col][idx]
# mixed type
index, data = tm.getMixedTypeDict()
mixed = self.klass(data, index=index)
mixed_T = mixed.T
for col, s in compat.iteritems(mixed_T):
assert s.dtype == np.object_
def test_swapaxes(self):
df = self.klass(np.random.randn(10, 5))
self._assert_frame_equal(df.T, df.swapaxes(0, 1))
self._assert_frame_equal(df.T, df.swapaxes(1, 0))
self._assert_frame_equal(df, df.swapaxes(0, 0))
pytest.raises(ValueError, df.swapaxes, 2, 5)
def test_axis_aliases(self, float_frame):
f = float_frame
# reg name
expected = f.sum(axis=0)
result = f.sum(axis='index')
assert_series_equal(result, expected)
expected = f.sum(axis=1)
result = f.sum(axis='columns')
assert_series_equal(result, expected)
def test_class_axis(self):
# GH 18147
# no exception and no empty docstring
assert pydoc.getdoc(DataFrame.index)
assert pydoc.getdoc(DataFrame.columns)
def test_more_values(self, float_string_frame):
values = float_string_frame.values
assert values.shape[1] == len(float_string_frame.columns)
def test_repr_with_mi_nat(self, float_string_frame):
df = self.klass({'X': [1, 2]},
index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']])
result = repr(df)
expected = ' X\nNaT a 1\n2013-01-01 b 2'
assert result == expected
def test_iteritems_names(self, float_string_frame):
for k, v in compat.iteritems(float_string_frame):
assert v.name == k
def test_series_put_names(self, float_string_frame):
series = float_string_frame._series
for k, v in compat.iteritems(series):
assert v.name == k
def test_empty_nonzero(self):
df = self.klass([1, 2, 3])
assert not df.empty
df = self.klass(index=[1], columns=[1])
assert not df.empty
df = self.klass(index=['a', 'b'], columns=['c', 'd']).dropna()
assert df.empty
assert df.T.empty
empty_frames = [self.klass(),
self.klass(index=[1]),
self.klass(columns=[1]),
self.klass({1: []})]
for df in empty_frames:
assert df.empty
assert df.T.empty
def test_with_datetimelikes(self):
df = self.klass({'A': date_range('20130101', periods=10),
'B': timedelta_range('1 day', periods=10)})
t = df.T
result = t.get_dtype_counts()
if self.klass is DataFrame:
expected = Series({'object': 10})
else:
expected = Series({'Sparse[object, nan]': 10})
tm.assert_series_equal(result, expected)
class TestDataFrameMisc(SharedWithSparse):
klass = DataFrame
# SharedWithSparse tests use generic, klass-agnostic assertion
_assert_frame_equal = staticmethod(assert_frame_equal)
_assert_series_equal = staticmethod(assert_series_equal)
def test_values(self, float_frame):
float_frame.values[:, 0] = 5.
assert (float_frame.values[:, 0] == 5).all()
def test_as_matrix_deprecated(self, float_frame):
# GH 18458
with tm.assert_produces_warning(FutureWarning):
cols = float_frame.columns.tolist()
result = float_frame.as_matrix(columns=cols)
expected = float_frame.values
tm.assert_numpy_array_equal(result, expected)
def test_deepcopy(self, float_frame):
cp = deepcopy(float_frame)
series = cp['A']
series[:] = 10
for idx, value in compat.iteritems(series):
assert float_frame['A'][idx] != value
def test_transpose_get_view(self, float_frame):
dft = float_frame.T
dft.values[:, 5:10] = 5
assert (float_frame.values[5:10] == 5).all()
def test_inplace_return_self(self):
# GH 1893
data = DataFrame({'a': ['foo', 'bar', 'baz', 'qux'],
'b': [0, 0, 1, 1],
'c': [1, 2, 3, 4]})
def _check_f(base, f):
result = f(base)
assert result is None
# -----DataFrame-----
# set_index
f = lambda x: x.set_index('a', inplace=True)
_check_f(data.copy(), f)
# reset_index
f = lambda x: x.reset_index(inplace=True)
_check_f(data.set_index('a'), f)
# drop_duplicates
f = lambda x: x.drop_duplicates(inplace=True)
_check_f(data.copy(), f)
# sort
f = lambda x: x.sort_values('b', inplace=True)
_check_f(data.copy(), f)
# sort_index
f = lambda x: x.sort_index(inplace=True)
_check_f(data.copy(), f)
# fillna
f = lambda x: x.fillna(0, inplace=True)
_check_f(data.copy(), f)
# replace
f = lambda x: x.replace(1, 0, inplace=True)
_check_f(data.copy(), f)
# rename
f = lambda x: x.rename({1: 'foo'}, inplace=True)
_check_f(data.copy(), f)
# -----Series-----
d = data.copy()['c']
# reset_index
f = lambda x: x.reset_index(inplace=True, drop=True)
_check_f(data.set_index('a')['c'], f)
# fillna
f = lambda x: x.fillna(0, inplace=True)
_check_f(d.copy(), f)
# replace
f = lambda x: x.replace(1, 0, inplace=True)
_check_f(d.copy(), f)
# rename
f = lambda x: x.rename({1: 'foo'}, inplace=True)
_check_f(d.copy(), f)
def test_tab_complete_warning(self, ip):
# GH 16409
pytest.importorskip('IPython', minversion="6.0.0")
from IPython.core.completer import provisionalcompleter
code = "import pandas as pd; df = pd.DataFrame()"
ip.run_code(code)
with tm.assert_produces_warning(None):
with provisionalcompleter('ignore'):
list(ip.Completer.completions('df.', 1))
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,636 @@
# -*- coding: utf-8 -*-
from collections import deque
from datetime import datetime
import operator
import numpy as np
import pytest
from pandas.compat import range
import pandas as pd
from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int
import pandas.util.testing as tm
# -------------------------------------------------------------------
# Comparisons
class TestFrameComparisons(object):
# Specifically _not_ flex-comparisons
def test_comparison_invalid(self):
def check(df, df2):
for (x, y) in [(df, df2), (df2, df)]:
# we expect the result to match Series comparisons for
# == and !=, inequalities should raise
result = x == y
expected = pd.DataFrame({col: x[col] == y[col]
for col in x.columns},
index=x.index, columns=x.columns)
tm.assert_frame_equal(result, expected)
result = x != y
expected = pd.DataFrame({col: x[col] != y[col]
for col in x.columns},
index=x.index, columns=x.columns)
tm.assert_frame_equal(result, expected)
with pytest.raises(TypeError):
x >= y
with pytest.raises(TypeError):
x > y
with pytest.raises(TypeError):
x < y
with pytest.raises(TypeError):
x <= y
# GH4968
# invalid date/int comparisons
df = pd.DataFrame(np.random.randint(10, size=(10, 1)), columns=['a'])
df['dates'] = pd.date_range('20010101', periods=len(df))
df2 = df.copy()
df2['dates'] = df['a']
check(df, df2)
df = pd.DataFrame(np.random.randint(10, size=(10, 2)),
columns=['a', 'b'])
df2 = pd.DataFrame({'a': pd.date_range('20010101', periods=len(df)),
'b': pd.date_range('20100101', periods=len(df))})
check(df, df2)
def test_timestamp_compare(self):
# make sure we can compare Timestamps on the right AND left hand side
# GH#4982
df = pd. DataFrame({'dates1': pd.date_range('20010101', periods=10),
'dates2': pd.date_range('20010102', periods=10),
'intcol': np.random.randint(1000000000, size=10),
'floatcol': np.random.randn(10),
'stringcol': list(tm.rands(10))})
df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT
ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq',
'ne': 'ne'}
for left, right in ops.items():
left_f = getattr(operator, left)
right_f = getattr(operator, right)
# no nats
if left in ['eq', 'ne']:
expected = left_f(df, pd.Timestamp('20010109'))
result = right_f(pd.Timestamp('20010109'), df)
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(TypeError):
left_f(df, pd.Timestamp('20010109'))
with pytest.raises(TypeError):
right_f(pd.Timestamp('20010109'), df)
# nats
expected = left_f(df, pd.Timestamp('nat'))
result = right_f(pd.Timestamp('nat'), df)
tm.assert_frame_equal(result, expected)
def test_mixed_comparison(self):
# GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
# not raise TypeError
# (this appears to be fixed before GH#22163, not sure when)
df = pd.DataFrame([['1989-08-01', 1], ['1989-08-01', 2]])
other = pd.DataFrame([['a', 'b'], ['c', 'd']])
result = df == other
assert not result.any().any()
result = df != other
assert result.all().all()
def test_df_boolean_comparison_error(self):
# GH#4576, GH#22880
# comparing DataFrame against list/tuple with len(obj) matching
# len(df.columns) is supported as of GH#22800
df = pd.DataFrame(np.arange(6).reshape((3, 2)))
expected = pd.DataFrame([[False, False],
[True, False],
[False, False]])
result = df == (2, 2)
tm.assert_frame_equal(result, expected)
result = df == [2, 2]
tm.assert_frame_equal(result, expected)
def test_df_float_none_comparison(self):
df = pd.DataFrame(np.random.randn(8, 3), index=range(8),
columns=['A', 'B', 'C'])
result = df.__eq__(None)
assert not result.any().any()
def test_df_string_comparison(self):
df = pd.DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}])
mask_a = df.a > 1
tm.assert_frame_equal(df[mask_a], df.loc[1:1, :])
tm.assert_frame_equal(df[-mask_a], df.loc[0:0, :])
mask_b = df.b == "foo"
tm.assert_frame_equal(df[mask_b], df.loc[0:0, :])
tm.assert_frame_equal(df[-mask_b], df.loc[1:1, :])
class TestFrameFlexComparisons(object):
# TODO: test_bool_flex_frame needs a better name
def test_bool_flex_frame(self):
data = np.random.randn(5, 3)
other_data = np.random.randn(5, 3)
df = pd.DataFrame(data)
other = pd.DataFrame(other_data)
ndim_5 = np.ones(df.shape + (1, 3))
# Unaligned
def _check_unaligned_frame(meth, op, df, other):
part_o = other.loc[3:, 1:].copy()
rs = meth(part_o)
xp = op(df, part_o.reindex(index=df.index, columns=df.columns))
tm.assert_frame_equal(rs, xp)
# DataFrame
assert df.eq(df).values.all()
assert not df.ne(df).values.any()
for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']:
f = getattr(df, op)
o = getattr(operator, op)
# No NAs
tm.assert_frame_equal(f(other), o(df, other))
_check_unaligned_frame(f, o, df, other)
# ndarray
tm.assert_frame_equal(f(other.values), o(df, other.values))
# scalar
tm.assert_frame_equal(f(0), o(df, 0))
# NAs
msg = "Unable to coerce to Series/DataFrame"
tm.assert_frame_equal(f(np.nan), o(df, np.nan))
with pytest.raises(ValueError, match=msg):
f(ndim_5)
# Series
def _test_seq(df, idx_ser, col_ser):
idx_eq = df.eq(idx_ser, axis=0)
col_eq = df.eq(col_ser)
idx_ne = df.ne(idx_ser, axis=0)
col_ne = df.ne(col_ser)
tm.assert_frame_equal(col_eq, df == pd.Series(col_ser))
tm.assert_frame_equal(col_eq, -col_ne)
tm.assert_frame_equal(idx_eq, -idx_ne)
tm.assert_frame_equal(idx_eq, df.T.eq(idx_ser).T)
tm.assert_frame_equal(col_eq, df.eq(list(col_ser)))
tm.assert_frame_equal(idx_eq, df.eq(pd.Series(idx_ser), axis=0))
tm.assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0))
idx_gt = df.gt(idx_ser, axis=0)
col_gt = df.gt(col_ser)
idx_le = df.le(idx_ser, axis=0)
col_le = df.le(col_ser)
tm.assert_frame_equal(col_gt, df > pd.Series(col_ser))
tm.assert_frame_equal(col_gt, -col_le)
tm.assert_frame_equal(idx_gt, -idx_le)
tm.assert_frame_equal(idx_gt, df.T.gt(idx_ser).T)
idx_ge = df.ge(idx_ser, axis=0)
col_ge = df.ge(col_ser)
idx_lt = df.lt(idx_ser, axis=0)
col_lt = df.lt(col_ser)
tm.assert_frame_equal(col_ge, df >= pd.Series(col_ser))
tm.assert_frame_equal(col_ge, -col_lt)
tm.assert_frame_equal(idx_ge, -idx_lt)
tm.assert_frame_equal(idx_ge, df.T.ge(idx_ser).T)
idx_ser = pd.Series(np.random.randn(5))
col_ser = pd.Series(np.random.randn(3))
_test_seq(df, idx_ser, col_ser)
# list/tuple
_test_seq(df, idx_ser.values, col_ser.values)
# NA
df.loc[0, 0] = np.nan
rs = df.eq(df)
assert not rs.loc[0, 0]
rs = df.ne(df)
assert rs.loc[0, 0]
rs = df.gt(df)
assert not rs.loc[0, 0]
rs = df.lt(df)
assert not rs.loc[0, 0]
rs = df.ge(df)
assert not rs.loc[0, 0]
rs = df.le(df)
assert not rs.loc[0, 0]
# complex
arr = np.array([np.nan, 1, 6, np.nan])
arr2 = np.array([2j, np.nan, 7, None])
df = pd.DataFrame({'a': arr})
df2 = pd.DataFrame({'a': arr2})
rs = df.gt(df2)
assert not rs.values.any()
rs = df.ne(df2)
assert rs.values.all()
arr3 = np.array([2j, np.nan, None])
df3 = pd.DataFrame({'a': arr3})
rs = df3.gt(2j)
assert not rs.values.any()
# corner, dtype=object
df1 = pd.DataFrame({'col': ['foo', np.nan, 'bar']})
df2 = pd.DataFrame({'col': ['foo', datetime.now(), 'bar']})
result = df1.ne(df2)
exp = pd.DataFrame({'col': [False, True, False]})
tm.assert_frame_equal(result, exp)
def test_flex_comparison_nat(self):
# GH 15697, GH 22163 df.eq(pd.NaT) should behave like df == pd.NaT,
# and _definitely_ not be NaN
df = pd.DataFrame([pd.NaT])
result = df == pd.NaT
# result.iloc[0, 0] is a np.bool_ object
assert result.iloc[0, 0].item() is False
result = df.eq(pd.NaT)
assert result.iloc[0, 0].item() is False
result = df != pd.NaT
assert result.iloc[0, 0].item() is True
result = df.ne(pd.NaT)
assert result.iloc[0, 0].item() is True
@pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
def test_df_flex_cmp_constant_return_types(self, opname):
# GH 15077, non-empty DataFrame
df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
const = 2
result = getattr(df, opname)(const).get_dtype_counts()
tm.assert_series_equal(result, pd.Series([2], ['bool']))
@pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
def test_df_flex_cmp_constant_return_types_empty(self, opname):
# GH 15077 empty DataFrame
df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
const = 2
empty = df.iloc[:0]
result = getattr(empty, opname)(const).get_dtype_counts()
tm.assert_series_equal(result, pd.Series([2], ['bool']))
# -------------------------------------------------------------------
# Arithmetic
class TestFrameFlexArithmetic(object):
def test_df_add_td64_columnwise(self):
# GH 22534 Check that column-wise addition broadcasts correctly
dti = pd.date_range('2016-01-01', periods=10)
tdi = pd.timedelta_range('1', periods=10)
tser = pd.Series(tdi)
df = pd.DataFrame({0: dti, 1: tdi})
result = df.add(tser, axis=0)
expected = pd.DataFrame({0: dti + tdi,
1: tdi + tdi})
tm.assert_frame_equal(result, expected)
def test_df_add_flex_filled_mixed_dtypes(self):
# GH 19611
dti = pd.date_range('2016-01-01', periods=3)
ser = pd.Series(['1 Day', 'NaT', '2 Days'], dtype='timedelta64[ns]')
df = pd.DataFrame({'A': dti, 'B': ser})
other = pd.DataFrame({'A': ser, 'B': ser})
fill = pd.Timedelta(days=1).to_timedelta64()
result = df.add(other, fill_value=fill)
expected = pd.DataFrame(
{'A': pd.Series(['2016-01-02', '2016-01-03', '2016-01-05'],
dtype='datetime64[ns]'),
'B': ser * 2})
tm.assert_frame_equal(result, expected)
def test_arith_flex_frame(self, all_arithmetic_operators, float_frame,
mixed_float_frame):
# one instance of parametrized fixture
op = all_arithmetic_operators
def f(x, y):
# r-versions not in operator-stdlib; get op without "r" and invert
if op.startswith('__r'):
return getattr(operator, op.replace('__r', '__'))(y, x)
return getattr(operator, op)(x, y)
result = getattr(float_frame, op)(2 * float_frame)
expected = f(float_frame, 2 * float_frame)
tm.assert_frame_equal(result, expected)
# vs mix float
result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
expected = f(mixed_float_frame, 2 * mixed_float_frame)
tm.assert_frame_equal(result, expected)
_check_mixed_float(result, dtype=dict(C=None))
@pytest.mark.parametrize('op', ['__add__', '__sub__', '__mul__'])
def test_arith_flex_frame_mixed(self, op, int_frame, mixed_int_frame,
mixed_float_frame):
f = getattr(operator, op)
# vs mix int
result = getattr(mixed_int_frame, op)(2 + mixed_int_frame)
expected = f(mixed_int_frame, 2 + mixed_int_frame)
# no overflow in the uint
dtype = None
if op in ['__sub__']:
dtype = dict(B='uint64', C=None)
elif op in ['__add__', '__mul__']:
dtype = dict(C=None)
tm.assert_frame_equal(result, expected)
_check_mixed_int(result, dtype=dtype)
# vs mix float
result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
expected = f(mixed_float_frame, 2 * mixed_float_frame)
tm.assert_frame_equal(result, expected)
_check_mixed_float(result, dtype=dict(C=None))
# vs plain int
result = getattr(int_frame, op)(2 * int_frame)
expected = f(int_frame, 2 * int_frame)
tm.assert_frame_equal(result, expected)
def test_arith_flex_frame_raise(self, all_arithmetic_operators,
float_frame):
# one instance of parametrized fixture
op = all_arithmetic_operators
# Check that arrays with dim >= 3 raise
for dim in range(3, 6):
arr = np.ones((1,) * dim)
msg = "Unable to coerce to Series/DataFrame"
with pytest.raises(ValueError, match=msg):
getattr(float_frame, op)(arr)
def test_arith_flex_frame_corner(self, float_frame):
const_add = float_frame.add(1)
tm.assert_frame_equal(const_add, float_frame + 1)
# corner cases
result = float_frame.add(float_frame[:0])
tm.assert_frame_equal(result, float_frame * np.nan)
result = float_frame[:0].add(float_frame)
tm.assert_frame_equal(result, float_frame * np.nan)
with pytest.raises(NotImplementedError, match='fill_value'):
float_frame.add(float_frame.iloc[0], fill_value=3)
with pytest.raises(NotImplementedError, match='fill_value'):
float_frame.add(float_frame.iloc[0], axis='index', fill_value=3)
def test_arith_flex_series(self, simple_frame):
df = simple_frame
row = df.xs('a')
col = df['two']
# after arithmetic refactor, add truediv here
ops = ['add', 'sub', 'mul', 'mod']
for op in ops:
f = getattr(df, op)
op = getattr(operator, op)
tm.assert_frame_equal(f(row), op(df, row))
tm.assert_frame_equal(f(col, axis=0), op(df.T, col).T)
# special case for some reason
tm.assert_frame_equal(df.add(row, axis=None), df + row)
# cases which will be refactored after big arithmetic refactor
tm.assert_frame_equal(df.div(row), df / row)
tm.assert_frame_equal(df.div(col, axis=0), (df.T / col).T)
# broadcasting issue in GH 7325
df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='int64')
expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
result = df.div(df[0], axis='index')
tm.assert_frame_equal(result, expected)
df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='float64')
expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
result = df.div(df[0], axis='index')
tm.assert_frame_equal(result, expected)
def test_arith_flex_zero_len_raises(self):
# GH 19522 passing fill_value to frame flex arith methods should
# raise even in the zero-length special cases
ser_len0 = pd.Series([])
df_len0 = pd.DataFrame([], columns=['A', 'B'])
df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
with pytest.raises(NotImplementedError, match='fill_value'):
df.add(ser_len0, fill_value='E')
with pytest.raises(NotImplementedError, match='fill_value'):
df_len0.sub(df['A'], axis=None, fill_value=3)
class TestFrameArithmetic(object):
def test_df_add_2d_array_rowlike_broadcasts(self):
# GH#23000
arr = np.arange(6).reshape(3, 2)
df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
rowlike = arr[[1], :] # shape --> (1, ncols)
assert rowlike.shape == (1, df.shape[1])
expected = pd.DataFrame([[2, 4],
[4, 6],
[6, 8]],
columns=df.columns, index=df.index,
# specify dtype explicitly to avoid failing
# on 32bit builds
dtype=arr.dtype)
result = df + rowlike
tm.assert_frame_equal(result, expected)
result = rowlike + df
tm.assert_frame_equal(result, expected)
def test_df_add_2d_array_collike_broadcasts(self):
# GH#23000
arr = np.arange(6).reshape(3, 2)
df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
collike = arr[:, [1]] # shape --> (nrows, 1)
assert collike.shape == (df.shape[0], 1)
expected = pd.DataFrame([[1, 2],
[5, 6],
[9, 10]],
columns=df.columns, index=df.index,
# specify dtype explicitly to avoid failing
# on 32bit builds
dtype=arr.dtype)
result = df + collike
tm.assert_frame_equal(result, expected)
result = collike + df
tm.assert_frame_equal(result, expected)
def test_df_arith_2d_array_rowlike_broadcasts(self,
all_arithmetic_operators):
# GH#23000
opname = all_arithmetic_operators
arr = np.arange(6).reshape(3, 2)
df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
rowlike = arr[[1], :] # shape --> (1, ncols)
assert rowlike.shape == (1, df.shape[1])
exvals = [getattr(df.loc['A'], opname)(rowlike.squeeze()),
getattr(df.loc['B'], opname)(rowlike.squeeze()),
getattr(df.loc['C'], opname)(rowlike.squeeze())]
expected = pd.DataFrame(exvals, columns=df.columns, index=df.index)
if opname in ['__rmod__', '__rfloordiv__']:
# exvals will have dtypes [f8, i8, i8] so expected will be
# all-f8, but the DataFrame operation will return mixed dtypes
# use exvals[-1].dtype instead of "i8" for compat with 32-bit
# systems/pythons
expected[False] = expected[False].astype(exvals[-1].dtype)
result = getattr(df, opname)(rowlike)
tm.assert_frame_equal(result, expected)
def test_df_arith_2d_array_collike_broadcasts(self,
all_arithmetic_operators):
# GH#23000
opname = all_arithmetic_operators
arr = np.arange(6).reshape(3, 2)
df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
collike = arr[:, [1]] # shape --> (nrows, 1)
assert collike.shape == (df.shape[0], 1)
exvals = {True: getattr(df[True], opname)(collike.squeeze()),
False: getattr(df[False], opname)(collike.squeeze())}
dtype = None
if opname in ['__rmod__', '__rfloordiv__']:
# Series ops may return mixed int/float dtypes in cases where
# DataFrame op will return all-float. So we upcast `expected`
dtype = np.common_type(*[x.values for x in exvals.values()])
expected = pd.DataFrame(exvals, columns=df.columns, index=df.index,
dtype=dtype)
result = getattr(df, opname)(collike)
tm.assert_frame_equal(result, expected)
def test_df_bool_mul_int(self):
# GH 22047, GH 22163 multiplication by 1 should result in int dtype,
# not object dtype
df = pd.DataFrame([[False, True], [False, False]])
result = df * 1
# On appveyor this comes back as np.int32 instead of np.int64,
# so we check dtype.kind instead of just dtype
kinds = result.dtypes.apply(lambda x: x.kind)
assert (kinds == 'i').all()
result = 1 * df
kinds = result.dtypes.apply(lambda x: x.kind)
assert (kinds == 'i').all()
def test_arith_mixed(self):
left = pd.DataFrame({'A': ['a', 'b', 'c'],
'B': [1, 2, 3]})
result = left + left
expected = pd.DataFrame({'A': ['aa', 'bb', 'cc'],
'B': [2, 4, 6]})
tm.assert_frame_equal(result, expected)
def test_arith_getitem_commute(self):
df = pd.DataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]})
def _test_op(df, op):
result = op(df, 1)
if not df.columns.is_unique:
raise ValueError("Only unique columns supported by this test")
for col in result.columns:
tm.assert_series_equal(result[col], op(df[col], 1))
_test_op(df, operator.add)
_test_op(df, operator.sub)
_test_op(df, operator.mul)
_test_op(df, operator.truediv)
_test_op(df, operator.floordiv)
_test_op(df, operator.pow)
_test_op(df, lambda x, y: y + x)
_test_op(df, lambda x, y: y - x)
_test_op(df, lambda x, y: y * x)
_test_op(df, lambda x, y: y / x)
_test_op(df, lambda x, y: y ** x)
_test_op(df, lambda x, y: x + y)
_test_op(df, lambda x, y: x - y)
_test_op(df, lambda x, y: x * y)
_test_op(df, lambda x, y: x / y)
_test_op(df, lambda x, y: x ** y)
@pytest.mark.parametrize('values', [[1, 2], (1, 2), np.array([1, 2]),
range(1, 3), deque([1, 2])])
def test_arith_alignment_non_pandas_object(self, values):
# GH#17901
df = pd.DataFrame({'A': [1, 1], 'B': [1, 1]})
expected = pd.DataFrame({'A': [2, 2], 'B': [3, 3]})
result = df + values
tm.assert_frame_equal(result, expected)
def test_arith_non_pandas_object(self):
df = pd.DataFrame(np.arange(1, 10, dtype='f8').reshape(3, 3),
columns=['one', 'two', 'three'],
index=['a', 'b', 'c'])
val1 = df.xs('a').values
added = pd.DataFrame(df.values + val1,
index=df.index, columns=df.columns)
tm.assert_frame_equal(df + val1, added)
added = pd.DataFrame((df.values.T + val1).T,
index=df.index, columns=df.columns)
tm.assert_frame_equal(df.add(val1, axis=0), added)
val2 = list(df['two'])
added = pd.DataFrame(df.values + val2,
index=df.index, columns=df.columns)
tm.assert_frame_equal(df + val2, added)
added = pd.DataFrame((df.values.T + val2).T, index=df.index,
columns=df.columns)
tm.assert_frame_equal(df.add(val2, axis='index'), added)
val3 = np.random.rand(*df.shape)
added = pd.DataFrame(df.values + val3,
index=df.index, columns=df.columns)
tm.assert_frame_equal(df.add(val3), added)
@@ -0,0 +1,126 @@
# coding=utf-8
import numpy as np
import pytest
from pandas import DataFrame, Series, Timestamp, date_range, to_datetime
import pandas.util.testing as tm
from .common import TestData
class TestFrameAsof(TestData):
def setup_method(self, method):
self.N = N = 50
self.rng = date_range('1/1/1990', periods=N, freq='53s')
self.df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
index=self.rng)
def test_basic(self):
df = self.df.copy()
df.loc[15:30, 'A'] = np.nan
dates = date_range('1/1/1990', periods=self.N * 3,
freq='25s')
result = df.asof(dates)
assert result.notna().all(1).all()
lb = df.index[14]
ub = df.index[30]
dates = list(dates)
result = df.asof(dates)
assert result.notna().all(1).all()
mask = (result.index >= lb) & (result.index < ub)
rs = result[mask]
assert (rs == 14).all(1).all()
def test_subset(self):
N = 10
rng = date_range('1/1/1990', periods=N, freq='53s')
df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
index=rng)
df.loc[4:8, 'A'] = np.nan
dates = date_range('1/1/1990', periods=N * 3,
freq='25s')
# with a subset of A should be the same
result = df.asof(dates, subset='A')
expected = df.asof(dates)
tm.assert_frame_equal(result, expected)
# same with A/B
result = df.asof(dates, subset=['A', 'B'])
expected = df.asof(dates)
tm.assert_frame_equal(result, expected)
# B gives self.df.asof
result = df.asof(dates, subset='B')
expected = df.resample('25s', closed='right').ffill().reindex(dates)
expected.iloc[20:] = 9
tm.assert_frame_equal(result, expected)
def test_missing(self):
# GH 15118
# no match found - `where` value before earliest date in index
N = 10
rng = date_range('1/1/1990', periods=N, freq='53s')
df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
index=rng)
result = df.asof('1989-12-31')
expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31'))
tm.assert_series_equal(result, expected)
result = df.asof(to_datetime(['1989-12-31']))
expected = DataFrame(index=to_datetime(['1989-12-31']),
columns=['A', 'B'], dtype='float64')
tm.assert_frame_equal(result, expected)
def test_all_nans(self):
# GH 15713
# DataFrame is all nans
result = DataFrame([np.nan]).asof([0])
expected = DataFrame([np.nan])
tm.assert_frame_equal(result, expected)
# testing non-default indexes, multiple inputs
dates = date_range('1/1/1990', periods=self.N * 3, freq='25s')
result = DataFrame(np.nan, index=self.rng, columns=['A']).asof(dates)
expected = DataFrame(np.nan, index=dates, columns=['A'])
tm.assert_frame_equal(result, expected)
# testing multiple columns
dates = date_range('1/1/1990', periods=self.N * 3, freq='25s')
result = DataFrame(np.nan, index=self.rng,
columns=['A', 'B', 'C']).asof(dates)
expected = DataFrame(np.nan, index=dates, columns=['A', 'B', 'C'])
tm.assert_frame_equal(result, expected)
# testing scalar input
result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof([3])
expected = DataFrame(np.nan, index=[3], columns=['A', 'B'])
tm.assert_frame_equal(result, expected)
result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof(3)
expected = Series(np.nan, index=['A', 'B'], name=3)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"stamp,expected",
[(Timestamp('2018-01-01 23:22:43.325+00:00'),
Series(2.0, name=Timestamp('2018-01-01 23:22:43.325+00:00'))),
(Timestamp('2018-01-01 22:33:20.682+01:00'),
Series(1.0, name=Timestamp('2018-01-01 22:33:20.682+01:00'))),
]
)
def test_time_zone_aware_index(self, stamp, expected):
# GH21194
# Testing awareness of DataFrame index considering different
# UTC and timezone
df = DataFrame(data=[1, 2],
index=[Timestamp('2018-01-01 21:00:05.001+00:00'),
Timestamp('2018-01-01 22:35:10.550+00:00')])
result = df.asof(stamp)
tm.assert_series_equal(result, expected)
@@ -0,0 +1,587 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import datetime, timedelta
import itertools
import numpy as np
import pytest
from pandas.compat import StringIO
import pandas as pd
from pandas import (
Categorical, DataFrame, Series, Timestamp, compat, date_range,
option_context)
from pandas.core.arrays import IntervalArray, integer_array
from pandas.core.internals.blocks import IntBlock
import pandas.util.testing as tm
from pandas.util.testing import (
assert_almost_equal, assert_frame_equal, assert_series_equal)
# Segregated collection of methods that require the BlockManager internal data
# structure
class TestDataFrameBlockInternals():
def test_setitem_invalidates_datetime_index_freq(self):
# GH#24096 altering a datetime64tz column inplace invalidates the
# `freq` attribute on the underlying DatetimeIndex
dti = date_range('20130101', periods=3, tz='US/Eastern')
ts = dti[1]
df = DataFrame({'B': dti})
assert df['B']._values.freq == 'D'
df.iloc[1, 0] = pd.NaT
assert df['B']._values.freq is None
# check that the DatetimeIndex was not altered in place
assert dti.freq == 'D'
assert dti[1] == ts
def test_cast_internals(self, float_frame):
casted = DataFrame(float_frame._data, dtype=int)
expected = DataFrame(float_frame._series, dtype=int)
assert_frame_equal(casted, expected)
casted = DataFrame(float_frame._data, dtype=np.int32)
expected = DataFrame(float_frame._series, dtype=np.int32)
assert_frame_equal(casted, expected)
def test_consolidate(self, float_frame):
float_frame['E'] = 7.
consolidated = float_frame._consolidate()
assert len(consolidated._data.blocks) == 1
# Ensure copy, do I want this?
recons = consolidated._consolidate()
assert recons is not consolidated
tm.assert_frame_equal(recons, consolidated)
float_frame['F'] = 8.
assert len(float_frame._data.blocks) == 3
float_frame._consolidate(inplace=True)
assert len(float_frame._data.blocks) == 1
def test_consolidate_inplace(self, float_frame):
frame = float_frame.copy() # noqa
# triggers in-place consolidation
for letter in range(ord('A'), ord('Z')):
float_frame[chr(letter)] = chr(letter)
def test_values_consolidate(self, float_frame):
float_frame['E'] = 7.
assert not float_frame._data.is_consolidated()
_ = float_frame.values # noqa
assert float_frame._data.is_consolidated()
def test_modify_values(self, float_frame):
float_frame.values[5] = 5
assert (float_frame.values[5] == 5).all()
# unconsolidated
float_frame['E'] = 7.
float_frame.values[6] = 6
assert (float_frame.values[6] == 6).all()
def test_boolean_set_uncons(self, float_frame):
float_frame['E'] = 7.
expected = float_frame.values.copy()
expected[expected > 1] = 2
float_frame[float_frame > 1] = 2
assert_almost_equal(expected, float_frame.values)
def test_values_numeric_cols(self, float_frame):
float_frame['foo'] = 'bar'
values = float_frame[['A', 'B', 'C', 'D']].values
assert values.dtype == np.float64
def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
# mixed lcd
values = mixed_float_frame[['A', 'B', 'C', 'D']].values
assert values.dtype == np.float64
values = mixed_float_frame[['A', 'B', 'C']].values
assert values.dtype == np.float32
values = mixed_float_frame[['C']].values
assert values.dtype == np.float16
# GH 10364
# B uint64 forces float because there are other signed int types
values = mixed_int_frame[['A', 'B', 'C', 'D']].values
assert values.dtype == np.float64
values = mixed_int_frame[['A', 'D']].values
assert values.dtype == np.int64
# B uint64 forces float because there are other signed int types
values = mixed_int_frame[['A', 'B', 'C']].values
assert values.dtype == np.float64
# as B and C are both unsigned, no forcing to float is needed
values = mixed_int_frame[['B', 'C']].values
assert values.dtype == np.uint64
values = mixed_int_frame[['A', 'C']].values
assert values.dtype == np.int32
values = mixed_int_frame[['C', 'D']].values
assert values.dtype == np.int64
values = mixed_int_frame[['A']].values
assert values.dtype == np.int32
values = mixed_int_frame[['C']].values
assert values.dtype == np.uint8
def test_constructor_with_convert(self):
# this is actually mostly a test of lib.maybe_convert_objects
# #2845
df = DataFrame({'A': [2 ** 63 - 1]})
result = df['A']
expected = Series(np.asarray([2 ** 63 - 1], np.int64), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [2 ** 63]})
result = df['A']
expected = Series(np.asarray([2 ** 63], np.uint64), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [datetime(2005, 1, 1), True]})
result = df['A']
expected = Series(np.asarray([datetime(2005, 1, 1), True], np.object_),
name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [None, 1]})
result = df['A']
expected = Series(np.asarray([np.nan, 1], np.float_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0, 2]})
result = df['A']
expected = Series(np.asarray([1.0, 2], np.float_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0 + 2.0j, 3]})
result = df['A']
expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0 + 2.0j, 3.0]})
result = df['A']
expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0 + 2.0j, True]})
result = df['A']
expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0, None]})
result = df['A']
expected = Series(np.asarray([1.0, np.nan], np.float_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0 + 2.0j, None]})
result = df['A']
expected = Series(np.asarray(
[1.0 + 2.0j, np.nan], np.complex_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [2.0, 1, True, None]})
result = df['A']
expected = Series(np.asarray(
[2.0, 1, True, None], np.object_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [2.0, 1, datetime(2006, 1, 1), None]})
result = df['A']
expected = Series(np.asarray([2.0, 1, datetime(2006, 1, 1),
None], np.object_), name='A')
assert_series_equal(result, expected)
def test_construction_with_mixed(self, float_string_frame):
# test construction edge cases with mixed types
# f7u12, this does not work without extensive workaround
data = [[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
[datetime(2000, 1, 2), datetime(2000, 1, 3),
datetime(2000, 1, 1)]]
df = DataFrame(data)
# check dtypes
result = df.get_dtype_counts().sort_values()
expected = Series({'datetime64[ns]': 3})
# mixed-type frames
float_string_frame['datetime'] = datetime.now()
float_string_frame['timedelta'] = timedelta(days=1, seconds=1)
assert float_string_frame['datetime'].dtype == 'M8[ns]'
assert float_string_frame['timedelta'].dtype == 'm8[ns]'
result = float_string_frame.get_dtype_counts().sort_values()
expected = Series({'float64': 4,
'object': 1,
'datetime64[ns]': 1,
'timedelta64[ns]': 1}).sort_values()
assert_series_equal(result, expected)
def test_construction_with_conversions(self):
# convert from a numpy array of non-ns timedelta64
arr = np.array([1, 2, 3], dtype='timedelta64[s]')
df = DataFrame(index=range(3))
df['A'] = arr
expected = DataFrame({'A': pd.timedelta_range('00:00:01', periods=3,
freq='s')},
index=range(3))
assert_frame_equal(df, expected)
expected = DataFrame({
'dt1': Timestamp('20130101'),
'dt2': date_range('20130101', periods=3),
# 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
}, index=range(3))
df = DataFrame(index=range(3))
df['dt1'] = np.datetime64('2013-01-01')
df['dt2'] = np.array(['2013-01-01', '2013-01-02', '2013-01-03'],
dtype='datetime64[D]')
# df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01
# 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')
assert_frame_equal(df, expected)
def test_constructor_compound_dtypes(self):
# GH 5191
# compound dtypes should raise not-implementederror
def f(dtype):
data = list(itertools.repeat((datetime(2001, 1, 1),
"aa", 20), 9))
return DataFrame(data=data,
columns=["A", "B", "C"],
dtype=dtype)
pytest.raises(NotImplementedError, f,
[("A", "datetime64[h]"),
("B", "str"),
("C", "int32")])
# these work (though results may be unexpected)
f('int64')
f('float64')
# 10822
# invalid error message on dt inference
if not compat.is_platform_windows():
f('M8[ns]')
def test_equals_different_blocks(self):
# GH 9330
df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2],
"C": ["w", "z"]})
df1 = df0.reset_index()[["A", "B", "C"]]
# this assert verifies that the above operations have
# induced a block rearrangement
assert (df0._data.blocks[0].dtype != df1._data.blocks[0].dtype)
# do the real tests
assert_frame_equal(df0, df1)
assert df0.equals(df1)
assert df1.equals(df0)
def test_copy_blocks(self, float_frame):
# API/ENH 9607
df = DataFrame(float_frame, copy=True)
column = df.columns[0]
# use the default copy=True, change a column
# deprecated 0.21.0
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
blocks = df.as_blocks()
for dtype, _df in blocks.items():
if column in _df:
_df.loc[:, column] = _df[column] + 1
# make sure we did not change the original DataFrame
assert not _df[column].equals(df[column])
def test_no_copy_blocks(self, float_frame):
# API/ENH 9607
df = DataFrame(float_frame, copy=True)
column = df.columns[0]
# use the copy=False, change a column
# deprecated 0.21.0
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
blocks = df.as_blocks(copy=False)
for dtype, _df in blocks.items():
if column in _df:
_df.loc[:, column] = _df[column] + 1
# make sure we did change the original DataFrame
assert _df[column].equals(df[column])
def test_copy(self, float_frame, float_string_frame):
cop = float_frame.copy()
cop['E'] = cop['A']
assert 'E' not in float_frame
# copy objects
copy = float_string_frame.copy()
assert copy._data is not float_string_frame._data
def test_pickle(self, float_string_frame, empty_frame, timezone_frame):
unpickled = tm.round_trip_pickle(float_string_frame)
assert_frame_equal(float_string_frame, unpickled)
# buglet
float_string_frame._data.ndim
# empty
unpickled = tm.round_trip_pickle(empty_frame)
repr(unpickled)
# tz frame
unpickled = tm.round_trip_pickle(timezone_frame)
assert_frame_equal(timezone_frame, unpickled)
def test_consolidate_datetime64(self):
# numpy vstack bug
data = """\
starting,ending,measure
2012-06-21 00:00,2012-06-23 07:00,77
2012-06-23 07:00,2012-06-23 16:30,65
2012-06-23 16:30,2012-06-25 08:00,77
2012-06-25 08:00,2012-06-26 12:00,0
2012-06-26 12:00,2012-06-27 08:00,77
"""
df = pd.read_csv(StringIO(data), parse_dates=[0, 1])
ser_starting = df.starting
ser_starting.index = ser_starting.values
ser_starting = ser_starting.tz_localize('US/Eastern')
ser_starting = ser_starting.tz_convert('UTC')
ser_starting.index.name = 'starting'
ser_ending = df.ending
ser_ending.index = ser_ending.values
ser_ending = ser_ending.tz_localize('US/Eastern')
ser_ending = ser_ending.tz_convert('UTC')
ser_ending.index.name = 'ending'
df.starting = ser_starting.index
df.ending = ser_ending.index
tm.assert_index_equal(pd.DatetimeIndex(
df.starting), ser_starting.index)
tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index)
def test_is_mixed_type(self, float_frame, float_string_frame):
assert not float_frame._is_mixed_type
assert float_string_frame._is_mixed_type
def test_get_numeric_data(self):
# TODO(wesm): unused?
intname = np.dtype(np.int_).name # noqa
floatname = np.dtype(np.float_).name # noqa
datetime64name = np.dtype('M8[ns]').name
objectname = np.dtype(np.object_).name
df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
'f': Timestamp('20010102')},
index=np.arange(10))
result = df.get_dtype_counts()
expected = Series({'int64': 1, 'float64': 1,
datetime64name: 1, objectname: 1})
result = result.sort_index()
expected = expected.sort_index()
assert_series_equal(result, expected)
df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
'd': np.array([1.] * 10, dtype='float32'),
'e': np.array([1] * 10, dtype='int32'),
'f': np.array([1] * 10, dtype='int16'),
'g': Timestamp('20010102')},
index=np.arange(10))
result = df._get_numeric_data()
expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
assert_frame_equal(result, expected)
only_obj = df.loc[:, ['c', 'g']]
result = only_obj._get_numeric_data()
expected = df.loc[:, []]
assert_frame_equal(result, expected)
df = DataFrame.from_dict(
{'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]})
result = df._get_numeric_data()
expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
assert_frame_equal(result, expected)
df = result.copy()
result = df._get_numeric_data()
expected = df
assert_frame_equal(result, expected)
def test_get_numeric_data_extension_dtype(self):
# GH 22290
df = DataFrame({
'A': integer_array([-10, np.nan, 0, 10, 20, 30], dtype='Int64'),
'B': Categorical(list('abcabc')),
'C': integer_array([0, 1, 2, 3, np.nan, 5], dtype='UInt8'),
'D': IntervalArray.from_breaks(range(7))})
result = df._get_numeric_data()
expected = df.loc[:, ['A', 'C']]
assert_frame_equal(result, expected)
def test_convert_objects(self, float_string_frame):
oops = float_string_frame.T.T
converted = oops._convert(datetime=True)
assert_frame_equal(converted, float_string_frame)
assert converted['A'].dtype == np.float64
# force numeric conversion
float_string_frame['H'] = '1.'
float_string_frame['I'] = '1'
# add in some items that will be nan
length = len(float_string_frame)
float_string_frame['J'] = '1.'
float_string_frame['K'] = '1'
float_string_frame.loc[0:5, ['J', 'K']] = 'garbled'
converted = float_string_frame._convert(datetime=True, numeric=True)
assert converted['H'].dtype == 'float64'
assert converted['I'].dtype == 'int64'
assert converted['J'].dtype == 'float64'
assert converted['K'].dtype == 'float64'
assert len(converted['J'].dropna()) == length - 5
assert len(converted['K'].dropna()) == length - 5
# via astype
converted = float_string_frame.copy()
converted['H'] = converted['H'].astype('float64')
converted['I'] = converted['I'].astype('int64')
assert converted['H'].dtype == 'float64'
assert converted['I'].dtype == 'int64'
# via astype, but errors
converted = float_string_frame.copy()
with pytest.raises(ValueError, match='invalid literal'):
converted['H'].astype('int32')
# mixed in a single column
df = DataFrame(dict(s=Series([1, 'na', 3, 4])))
result = df._convert(datetime=True, numeric=True)
expected = DataFrame(dict(s=Series([1, np.nan, 3, 4])))
assert_frame_equal(result, expected)
def test_convert_objects_no_conversion(self):
mixed1 = DataFrame(
{'a': [1, 2, 3], 'b': [4.0, 5, 6], 'c': ['x', 'y', 'z']})
mixed2 = mixed1._convert(datetime=True)
assert_frame_equal(mixed1, mixed2)
def test_infer_objects(self):
# GH 11221
df = DataFrame({'a': ['a', 1, 2, 3],
'b': ['b', 2.0, 3.0, 4.1],
'c': ['c', datetime(2016, 1, 1),
datetime(2016, 1, 2),
datetime(2016, 1, 3)],
'd': [1, 2, 3, 'd']},
columns=['a', 'b', 'c', 'd'])
df = df.iloc[1:].infer_objects()
assert df['a'].dtype == 'int64'
assert df['b'].dtype == 'float64'
assert df['c'].dtype == 'M8[ns]'
assert df['d'].dtype == 'object'
expected = DataFrame({'a': [1, 2, 3],
'b': [2.0, 3.0, 4.1],
'c': [datetime(2016, 1, 1),
datetime(2016, 1, 2),
datetime(2016, 1, 3)],
'd': [2, 3, 'd']},
columns=['a', 'b', 'c', 'd'])
# reconstruct frame to verify inference is same
tm.assert_frame_equal(df.reset_index(drop=True), expected)
def test_stale_cached_series_bug_473(self):
# this is chained, but ok
with option_context('chained_assignment', None):
Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'),
columns=('e', 'f', 'g', 'h'))
repr(Y)
Y['e'] = Y['e'].astype('object')
Y['g']['c'] = np.NaN
repr(Y)
result = Y.sum() # noqa
exp = Y['g'].sum() # noqa
assert pd.isna(Y['g']['c'])
def test_get_X_columns(self):
# numeric and object columns
df = DataFrame({'a': [1, 2, 3],
'b': [True, False, True],
'c': ['foo', 'bar', 'baz'],
'd': [None, None, None],
'e': [3.14, 0.577, 2.773]})
tm.assert_index_equal(df._get_numeric_data().columns,
pd.Index(['a', 'b', 'e']))
def test_strange_column_corruption_issue(self):
# (wesm) Unclear how exactly this is related to internal matters
df = DataFrame(index=[0, 1])
df[0] = np.nan
wasCol = {}
# uncommenting these makes the results match
# for col in xrange(100, 200):
# wasCol[col] = 1
# df[col] = np.nan
for i, dt in enumerate(df.index):
for col in range(100, 200):
if col not in wasCol:
wasCol[col] = 1
df[col] = np.nan
df[col][dt] = i
myid = 100
first = len(df.loc[pd.isna(df[myid]), [myid]])
second = len(df.loc[pd.isna(df[myid]), [myid]])
assert first == second == 0
def test_constructor_no_pandas_array(self):
# Ensure that PandasArray isn't allowed inside Series
# See https://github.com/pandas-dev/pandas/issues/23995 for more.
arr = pd.Series([1, 2, 3]).array
result = pd.DataFrame({"A": arr})
expected = pd.DataFrame({"A": [1, 2, 3]})
tm.assert_frame_equal(result, expected)
assert isinstance(result._data.blocks[0], IntBlock)
@@ -0,0 +1,863 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import datetime
import numpy as np
import pytest
from pandas.compat import lrange
import pandas as pd
from pandas import DataFrame, Index, Series, Timestamp, date_range
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestDataFrameConcatCommon(TestData):
def test_concat_multiple_frames_dtypes(self):
# GH 2759
A = DataFrame(data=np.ones((10, 2)), columns=[
'foo', 'bar'], dtype=np.float64)
B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
results = pd.concat((A, B), axis=1).get_dtype_counts()
expected = Series(dict(float64=2, float32=2))
assert_series_equal(results, expected)
@pytest.mark.parametrize('data', [
pd.date_range('2000', periods=4),
pd.date_range('2000', periods=4, tz="US/Central"),
pd.period_range('2000', periods=4),
pd.timedelta_range(0, periods=4),
])
def test_combine_datetlike_udf(self, data):
# https://github.com/pandas-dev/pandas/issues/23079
df = pd.DataFrame({"A": data})
other = df.copy()
df.iloc[1, 0] = None
def combiner(a, b):
return b
result = df.combine(other, combiner)
tm.assert_frame_equal(result, other)
def test_concat_multiple_tzs(self):
# GH 12467
# combining datetime tz-aware and naive DataFrames
ts1 = Timestamp('2015-01-01', tz=None)
ts2 = Timestamp('2015-01-01', tz='UTC')
ts3 = Timestamp('2015-01-01', tz='EST')
df1 = DataFrame(dict(time=[ts1]))
df2 = DataFrame(dict(time=[ts2]))
df3 = DataFrame(dict(time=[ts3]))
results = pd.concat([df1, df2]).reset_index(drop=True)
expected = DataFrame(dict(time=[ts1, ts2]), dtype=object)
assert_frame_equal(results, expected)
results = pd.concat([df1, df3]).reset_index(drop=True)
expected = DataFrame(dict(time=[ts1, ts3]), dtype=object)
assert_frame_equal(results, expected)
results = pd.concat([df2, df3]).reset_index(drop=True)
expected = DataFrame(dict(time=[ts2, ts3]))
assert_frame_equal(results, expected)
@pytest.mark.parametrize(
't1',
[
'2015-01-01',
pytest.param(pd.NaT, marks=pytest.mark.xfail(
reason='GH23037 incorrect dtype when concatenating'))])
def test_concat_tz_NaT(self, t1):
# GH 22796
# Concating tz-aware multicolumn DataFrames
ts1 = Timestamp(t1, tz='UTC')
ts2 = Timestamp('2015-01-01', tz='UTC')
ts3 = Timestamp('2015-01-01', tz='UTC')
df1 = DataFrame([[ts1, ts2]])
df2 = DataFrame([[ts3]])
result = pd.concat([df1, df2])
expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
assert_frame_equal(result, expected)
def test_concat_tz_not_aligned(self):
# GH 22796
ts = pd.to_datetime([1, 2]).tz_localize("UTC")
a = pd.DataFrame({"A": ts})
b = pd.DataFrame({"A": ts, "B": ts})
result = pd.concat([a, b], sort=True, ignore_index=True)
expected = pd.DataFrame({"A": list(ts) + list(ts),
"B": [pd.NaT, pd.NaT] + list(ts)})
assert_frame_equal(result, expected)
def test_concat_tuple_keys(self):
# GH 14438
df1 = pd.DataFrame(np.ones((2, 2)), columns=list('AB'))
df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list('AB'))
results = pd.concat((df1, df2), keys=[('bee', 'bah'), ('bee', 'boo')])
expected = pd.DataFrame(
{'A': {('bee', 'bah', 0): 1.0,
('bee', 'bah', 1): 1.0,
('bee', 'boo', 0): 2.0,
('bee', 'boo', 1): 2.0,
('bee', 'boo', 2): 2.0},
'B': {('bee', 'bah', 0): 1.0,
('bee', 'bah', 1): 1.0,
('bee', 'boo', 0): 2.0,
('bee', 'boo', 1): 2.0,
('bee', 'boo', 2): 2.0}})
assert_frame_equal(results, expected)
def test_append_series_dict(self):
df = DataFrame(np.random.randn(5, 4),
columns=['foo', 'bar', 'baz', 'qux'])
series = df.loc[4]
msg = 'Indexes have overlapping values'
with pytest.raises(ValueError, match=msg):
df.append(series, verify_integrity=True)
series.name = None
msg = 'Can only append a Series if ignore_index=True'
with pytest.raises(TypeError, match=msg):
df.append(series, verify_integrity=True)
result = df.append(series[::-1], ignore_index=True)
expected = df.append(DataFrame({0: series[::-1]}, index=df.columns).T,
ignore_index=True)
assert_frame_equal(result, expected)
# dict
result = df.append(series.to_dict(), ignore_index=True)
assert_frame_equal(result, expected)
result = df.append(series[::-1][:3], ignore_index=True)
expected = df.append(DataFrame({0: series[::-1][:3]}).T,
ignore_index=True, sort=True)
assert_frame_equal(result, expected.loc[:, result.columns])
# can append when name set
row = df.loc[4]
row.name = 5
result = df.append(row)
expected = df.append(df[-1:], ignore_index=True)
assert_frame_equal(result, expected)
def test_append_list_of_series_dicts(self):
df = DataFrame(np.random.randn(5, 4),
columns=['foo', 'bar', 'baz', 'qux'])
dicts = [x.to_dict() for idx, x in df.iterrows()]
result = df.append(dicts, ignore_index=True)
expected = df.append(df, ignore_index=True)
assert_frame_equal(result, expected)
# different columns
dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
{'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
result = df.append(dicts, ignore_index=True, sort=True)
expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
assert_frame_equal(result, expected)
def test_append_empty_dataframe(self):
# Empty df append empty df
df1 = DataFrame([])
df2 = DataFrame([])
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
# Non-empty df append empty df
df1 = DataFrame(np.random.randn(5, 2))
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
# Empty df with columns append empty df
df1 = DataFrame(columns=['bar', 'foo'])
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
# Non-Empty df with columns append empty df
df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo'])
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
def test_append_dtypes(self):
# GH 5754
# row appends of different dtypes (so need to do by-item)
# can sometimes infer the correct type
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(5))
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
df2 = DataFrame({'bar': 'foo'}, index=lrange(1, 2))
result = df1.append(df2)
expected = DataFrame({'bar': [Timestamp('20130101'), 'foo']})
assert_frame_equal(result, expected)
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2))
result = df1.append(df2)
expected = DataFrame(
{'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
assert_frame_equal(result, expected)
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2), dtype=object)
result = df1.append(df2)
expected = DataFrame(
{'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
assert_frame_equal(result, expected)
df1 = DataFrame({'bar': np.nan}, index=lrange(1))
df2 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1, 2))
result = df1.append(df2)
expected = DataFrame(
{'bar': Series([np.nan, Timestamp('20130101')], dtype='M8[ns]')})
assert_frame_equal(result, expected)
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
df2 = DataFrame({'bar': 1}, index=lrange(1, 2), dtype=object)
result = df1.append(df2)
expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])})
assert_frame_equal(result, expected)
def test_update(self):
df = DataFrame([[1.5, np.nan, 3.],
[1.5, np.nan, 3.],
[1.5, np.nan, 3],
[1.5, np.nan, 3]])
other = DataFrame([[3.6, 2., np.nan],
[np.nan, np.nan, 7]], index=[1, 3])
df.update(other)
expected = DataFrame([[1.5, np.nan, 3],
[3.6, 2, 3],
[1.5, np.nan, 3],
[1.5, np.nan, 7.]])
assert_frame_equal(df, expected)
def test_update_dtypes(self):
# gh 3016
df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
columns=['A', 'B', 'bool1', 'bool2'])
other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
df.update(other)
expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
columns=['A', 'B', 'bool1', 'bool2'])
assert_frame_equal(df, expected)
def test_update_nooverwrite(self):
df = DataFrame([[1.5, np.nan, 3.],
[1.5, np.nan, 3.],
[1.5, np.nan, 3],
[1.5, np.nan, 3]])
other = DataFrame([[3.6, 2., np.nan],
[np.nan, np.nan, 7]], index=[1, 3])
df.update(other, overwrite=False)
expected = DataFrame([[1.5, np.nan, 3],
[1.5, 2, 3],
[1.5, np.nan, 3],
[1.5, np.nan, 3.]])
assert_frame_equal(df, expected)
def test_update_filtered(self):
df = DataFrame([[1.5, np.nan, 3.],
[1.5, np.nan, 3.],
[1.5, np.nan, 3],
[1.5, np.nan, 3]])
other = DataFrame([[3.6, 2., np.nan],
[np.nan, np.nan, 7]], index=[1, 3])
df.update(other, filter_func=lambda x: x > 2)
expected = DataFrame([[1.5, np.nan, 3],
[1.5, np.nan, 3],
[1.5, np.nan, 3],
[1.5, np.nan, 7.]])
assert_frame_equal(df, expected)
@pytest.mark.parametrize('bad_kwarg, exception, msg', [
# errors must be 'ignore' or 'raise'
({'errors': 'something'}, ValueError, 'The parameter errors must.*'),
({'join': 'inner'}, NotImplementedError, 'Only left join is supported')
])
def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
df = DataFrame([[1.5, 1, 3.]])
with pytest.raises(exception, match=msg):
df.update(df, **bad_kwarg)
def test_update_raise_on_overlap(self):
df = DataFrame([[1.5, 1, 3.],
[1.5, np.nan, 3.],
[1.5, np.nan, 3],
[1.5, np.nan, 3]])
other = DataFrame([[2., np.nan],
[np.nan, 7]], index=[1, 3], columns=[1, 2])
with pytest.raises(ValueError, match="Data overlaps"):
df.update(other, errors='raise')
@pytest.mark.parametrize('raise_conflict', [True, False])
def test_update_deprecation(self, raise_conflict):
df = DataFrame([[1.5, 1, 3.]])
other = DataFrame()
with tm.assert_produces_warning(FutureWarning):
df.update(other, raise_conflict=raise_conflict)
def test_update_from_non_df(self):
d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])}
df = DataFrame(d)
d['a'] = Series([5, 6, 7, 8])
df.update(d)
expected = DataFrame(d)
assert_frame_equal(df, expected)
d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}
df = DataFrame(d)
d['a'] = [5, 6, 7, 8]
df.update(d)
expected = DataFrame(d)
assert_frame_equal(df, expected)
def test_join_str_datetime(self):
str_dates = ['20120209', '20120222']
dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
A = DataFrame(str_dates, index=lrange(2), columns=['aa'])
C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)
tst = A.join(C, on='aa')
assert len(tst.columns) == 3
def test_join_multiindex_leftright(self):
# GH 10741
df1 = (pd.DataFrame([['a', 'x', 0.471780], ['a', 'y', 0.774908],
['a', 'z', 0.563634], ['b', 'x', -0.353756],
['b', 'y', 0.368062], ['b', 'z', -1.721840],
['c', 'x', 1], ['c', 'y', 2], ['c', 'z', 3]],
columns=['first', 'second', 'value1'])
.set_index(['first', 'second']))
df2 = (pd.DataFrame([['a', 10], ['b', 20]],
columns=['first', 'value2'])
.set_index(['first']))
exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
[-0.353756, 20], [0.368062, 20],
[-1.721840, 20],
[1.000000, np.nan], [2.000000, np.nan],
[3.000000, np.nan]],
index=df1.index, columns=['value1', 'value2'])
# these must be the same results (but columns are flipped)
assert_frame_equal(df1.join(df2, how='left'), exp)
assert_frame_equal(df2.join(df1, how='right'),
exp[['value2', 'value1']])
exp_idx = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y', 'z']],
names=['first', 'second'])
exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
[-0.353756, 20], [0.368062, 20], [-1.721840, 20]],
index=exp_idx, columns=['value1', 'value2'])
assert_frame_equal(df1.join(df2, how='right'), exp)
assert_frame_equal(df2.join(df1, how='left'),
exp[['value2', 'value1']])
def test_concat_named_keys(self):
# GH 14252
df = pd.DataFrame({'foo': [1, 2], 'bar': [0.1, 0.2]})
index = Index(['a', 'b'], name='baz')
concatted_named_from_keys = pd.concat([df, df], keys=index)
expected_named = pd.DataFrame(
{'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
names=['baz', None]))
assert_frame_equal(concatted_named_from_keys, expected_named)
index_no_name = Index(['a', 'b'], name=None)
concatted_named_from_names = pd.concat(
[df, df], keys=index_no_name, names=['baz'])
assert_frame_equal(concatted_named_from_names, expected_named)
concatted_unnamed = pd.concat([df, df], keys=index_no_name)
expected_unnamed = pd.DataFrame(
{'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
names=[None, None]))
assert_frame_equal(concatted_unnamed, expected_unnamed)
def test_concat_axis_parameter(self):
# GH 14369
df1 = pd.DataFrame({'A': [0.1, 0.2]}, index=range(2))
df2 = pd.DataFrame({'A': [0.3, 0.4]}, index=range(2))
# Index/row/0 DataFrame
expected_index = pd.DataFrame(
{'A': [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
concatted_index = pd.concat([df1, df2], axis='index')
assert_frame_equal(concatted_index, expected_index)
concatted_row = pd.concat([df1, df2], axis='rows')
assert_frame_equal(concatted_row, expected_index)
concatted_0 = pd.concat([df1, df2], axis=0)
assert_frame_equal(concatted_0, expected_index)
# Columns/1 DataFrame
expected_columns = pd.DataFrame(
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=['A', 'A'])
concatted_columns = pd.concat([df1, df2], axis='columns')
assert_frame_equal(concatted_columns, expected_columns)
concatted_1 = pd.concat([df1, df2], axis=1)
assert_frame_equal(concatted_1, expected_columns)
series1 = pd.Series([0.1, 0.2])
series2 = pd.Series([0.3, 0.4])
# Index/row/0 Series
expected_index_series = pd.Series(
[0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
concatted_index_series = pd.concat([series1, series2], axis='index')
assert_series_equal(concatted_index_series, expected_index_series)
concatted_row_series = pd.concat([series1, series2], axis='rows')
assert_series_equal(concatted_row_series, expected_index_series)
concatted_0_series = pd.concat([series1, series2], axis=0)
assert_series_equal(concatted_0_series, expected_index_series)
# Columns/1 Series
expected_columns_series = pd.DataFrame(
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1])
concatted_columns_series = pd.concat(
[series1, series2], axis='columns')
assert_frame_equal(concatted_columns_series, expected_columns_series)
concatted_1_series = pd.concat([series1, series2], axis=1)
assert_frame_equal(concatted_1_series, expected_columns_series)
# Testing ValueError
with pytest.raises(ValueError, match='No axis named'):
pd.concat([series1, series2], axis='something')
def test_concat_numerical_names(self):
# #15262 # #12223
df = pd.DataFrame({'col': range(9)},
dtype='int32',
index=(pd.MultiIndex
.from_product([['A0', 'A1', 'A2'],
['B0', 'B1', 'B2']],
names=[1, 2])))
result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
expected = pd.DataFrame({'col': [0, 1, 7, 8]},
dtype='int32',
index=pd.MultiIndex.from_tuples([('A0', 'B0'),
('A0', 'B1'),
('A2', 'B1'),
('A2', 'B2')],
names=[1, 2]))
tm.assert_frame_equal(result, expected)
class TestDataFrameCombineFirst(TestData):
def test_combine_first_mixed(self):
a = Series(['a', 'b'], index=lrange(2))
b = Series(lrange(2), index=lrange(2))
f = DataFrame({'A': a, 'B': b})
a = Series(['a', 'b'], index=lrange(5, 7))
b = Series(lrange(2), index=lrange(5, 7))
g = DataFrame({'A': a, 'B': b})
exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]},
index=[0, 1, 5, 6])
combined = f.combine_first(g)
tm.assert_frame_equal(combined, exp)
def test_combine_first(self):
# disjoint
head, tail = self.frame[:5], self.frame[5:]
combined = head.combine_first(tail)
reordered_frame = self.frame.reindex(combined.index)
assert_frame_equal(combined, reordered_frame)
assert tm.equalContents(combined.columns, self.frame.columns)
assert_series_equal(combined['A'], reordered_frame['A'])
# same index
fcopy = self.frame.copy()
fcopy['A'] = 1
del fcopy['C']
fcopy2 = self.frame.copy()
fcopy2['B'] = 0
del fcopy2['D']
combined = fcopy.combine_first(fcopy2)
assert (combined['A'] == 1).all()
assert_series_equal(combined['B'], fcopy['B'])
assert_series_equal(combined['C'], fcopy2['C'])
assert_series_equal(combined['D'], fcopy['D'])
# overlap
head, tail = reordered_frame[:10].copy(), reordered_frame
head['A'] = 1
combined = head.combine_first(tail)
assert (combined['A'][:10] == 1).all()
# reverse overlap
tail['A'][:10] = 0
combined = tail.combine_first(head)
assert (combined['A'][:10] == 0).all()
# no overlap
f = self.frame[:10]
g = self.frame[10:]
combined = f.combine_first(g)
assert_series_equal(combined['A'].reindex(f.index), f['A'])
assert_series_equal(combined['A'].reindex(g.index), g['A'])
# corner cases
comb = self.frame.combine_first(self.empty)
assert_frame_equal(comb, self.frame)
comb = self.empty.combine_first(self.frame)
assert_frame_equal(comb, self.frame)
comb = self.frame.combine_first(DataFrame(index=["faz", "boo"]))
assert "faz" in comb.index
# #2525
df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)])
df2 = DataFrame({}, columns=['b'])
result = df.combine_first(df2)
assert 'b' in result
def test_combine_first_mixed_bug(self):
idx = Index(['a', 'b', 'c', 'e'])
ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
ser3 = Series([12, 4, 5, 97], index=idx)
frame1 = DataFrame({"col0": ser1,
"col2": ser2,
"col3": ser3})
idx = Index(['a', 'b', 'c', 'f'])
ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
ser3 = Series([12, 4, 5, 97], index=idx)
frame2 = DataFrame({"col1": ser1,
"col2": ser2,
"col5": ser3})
combined = frame1.combine_first(frame2)
assert len(combined.columns) == 5
# gh 3016 (same as in update)
df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
columns=['A', 'B', 'bool1', 'bool2'])
other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
result = df.combine_first(other)
assert_frame_equal(result, df)
df.loc[0, 'A'] = np.nan
result = df.combine_first(other)
df.loc[0, 'A'] = 45
assert_frame_equal(result, df)
# doc example
df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
'B': [np.nan, 2., 3., np.nan, 6.]})
df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
'B': [np.nan, np.nan, 3., 4., 6., 8.]})
result = df1.combine_first(df2)
expected = DataFrame(
{'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]})
assert_frame_equal(result, expected)
# GH3552, return object dtype with bools
df1 = DataFrame(
[[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
df2 = DataFrame(
[[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2])
result = df1.combine_first(df2)[2]
expected = Series([True, True, False], name=2)
assert_series_equal(result, expected)
# GH 3593, converting datetime64[ns] incorrecly
df0 = DataFrame({"a": [datetime(2000, 1, 1),
datetime(2000, 1, 2),
datetime(2000, 1, 3)]})
df1 = DataFrame({"a": [None, None, None]})
df2 = df1.combine_first(df0)
assert_frame_equal(df2, df0)
df2 = df0.combine_first(df1)
assert_frame_equal(df2, df0)
df0 = DataFrame({"a": [datetime(2000, 1, 1),
datetime(2000, 1, 2),
datetime(2000, 1, 3)]})
df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
df2 = df1.combine_first(df0)
result = df0.copy()
result.iloc[0, :] = df1.iloc[0, :]
assert_frame_equal(df2, result)
df2 = df0.combine_first(df1)
assert_frame_equal(df2, df0)
def test_combine_first_align_nan(self):
# GH 7509 (not fixed)
dfa = pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]],
columns=['a', 'b'])
dfb = pd.DataFrame([[4], [5]], columns=['b'])
assert dfa['a'].dtype == 'datetime64[ns]'
assert dfa['b'].dtype == 'int64'
res = dfa.combine_first(dfb)
exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT],
'b': [2., 5.]}, columns=['a', 'b'])
tm.assert_frame_equal(res, exp)
assert res['a'].dtype == 'datetime64[ns]'
# ToDo: this must be int64
assert res['b'].dtype == 'float64'
res = dfa.iloc[:0].combine_first(dfb)
exp = pd.DataFrame({'a': [np.nan, np.nan],
'b': [4, 5]}, columns=['a', 'b'])
tm.assert_frame_equal(res, exp)
# ToDo: this must be datetime64
assert res['a'].dtype == 'float64'
# ToDo: this must be int64
assert res['b'].dtype == 'int64'
def test_combine_first_timezone(self):
# see gh-7630
data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC')
df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'],
data=data1,
index=pd.date_range('20140627', periods=1))
data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC')
df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'],
data=data2,
index=pd.date_range('20140628', periods=1))
res = df2[['UTCdatetime']].combine_first(df1)
exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01',
tz='UTC'),
pd.Timestamp('2012-12-12 12:12',
tz='UTC')],
'abc': [pd.Timestamp('2010-01-01 01:01:00',
tz='UTC'), pd.NaT]},
columns=['UTCdatetime', 'abc'],
index=pd.date_range('20140627', periods=2,
freq='D'))
tm.assert_frame_equal(res, exp)
assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]'
assert res['abc'].dtype == 'datetime64[ns, UTC]'
# see gh-10567
dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC')
df1 = pd.DataFrame({'DATE': dts1})
dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC')
df2 = pd.DataFrame({'DATE': dts2})
res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res['DATE'].dtype == 'datetime64[ns, UTC]'
dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03',
'2011-01-04'], tz='US/Eastern')
df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7])
dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02',
'2012-01-03'], tz='US/Eastern')
df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT',
'2012-01-02', '2011-01-03', '2011-01-04'],
tz='US/Eastern')
exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
# different tz
dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern')
df1 = pd.DataFrame({'DATE': dts1})
dts2 = pd.date_range('2015-01-03', '2015-01-05')
df2 = pd.DataFrame({'DATE': dts2})
# if df1 doesn't have NaN, keep its dtype
res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]'
dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern')
df1 = pd.DataFrame({'DATE': dts1})
dts2 = pd.date_range('2015-01-01', '2015-01-03')
df2 = pd.DataFrame({'DATE': dts2})
res = df1.combine_first(df2)
exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'),
pd.Timestamp('2015-01-02', tz='US/Eastern'),
pd.Timestamp('2015-01-03')]
exp = pd.DataFrame({'DATE': exp_dts})
tm.assert_frame_equal(res, exp)
assert res['DATE'].dtype == 'object'
def test_combine_first_timedelta(self):
data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day'])
df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7])
data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day'])
df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT',
'11 day', '3 day', '4 day'])
exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res['TD'].dtype == 'timedelta64[ns]'
def test_combine_first_period(self):
data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03',
'2011-04'], freq='M')
df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7])
data2 = pd.PeriodIndex(['2012-01-01', '2012-02',
'2012-03'], freq='M')
df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT',
'2012-02', '2011-03', '2011-04'],
freq='M')
exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res['P'].dtype == data1.dtype
# different freq
dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02',
'2012-01-03'], freq='D')
df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = [pd.Period('2011-01', freq='M'),
pd.Period('2012-01-01', freq='D'),
pd.NaT,
pd.Period('2012-01-02', freq='D'),
pd.Period('2011-03', freq='M'),
pd.Period('2011-04', freq='M')]
exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res['P'].dtype == 'object'
def test_combine_first_int(self):
# GH14687 - integer series that do no align exactly
df1 = pd.DataFrame({'a': [0, 1, 3, 5]}, dtype='int64')
df2 = pd.DataFrame({'a': [1, 4]}, dtype='int64')
res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res['a'].dtype == 'int64'
@pytest.mark.parametrize("val", [1, 1.0])
def test_combine_first_with_asymmetric_other(self, val):
# see gh-20699
df1 = pd.DataFrame({'isNum': [val]})
df2 = pd.DataFrame({'isBool': [True]})
res = df1.combine_first(df2)
exp = pd.DataFrame({'isBool': [True], 'isNum': [val]})
tm.assert_frame_equal(res, exp)
def test_concat_datetime_datetime64_frame(self):
# #2624
rows = []
rows.append([datetime(2010, 1, 1), 1])
rows.append([datetime(2010, 1, 2), 'hi'])
df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
ind = date_range(start="2000/1/1", freq="D", periods=10)
df1 = DataFrame({'date': ind, 'test': lrange(10)})
# it works!
pd.concat([df1, df2_obj])
class TestDataFrameUpdate(TestData):
def test_update_nan(self):
# #15593 #15617
# test 1
df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)})
df2 = DataFrame({'A': [None, 2, 3]})
expected = df1.copy()
df1.update(df2, overwrite=False)
tm.assert_frame_equal(df1, expected)
# test 2
df1 = DataFrame({'A': [1.0, None, 3],
'B': date_range('2000', periods=3)})
df2 = DataFrame({'A': [None, 2, 3]})
expected = DataFrame({'A': [1.0, 2, 3],
'B': date_range('2000', periods=3)})
df1.update(df2, overwrite=False)
tm.assert_frame_equal(df1, expected)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,490 @@
# -*- coding: utf-8 -*-
import collections
from collections import OrderedDict, defaultdict
from datetime import datetime
import numpy as np
import pytest
import pytz
from pandas.compat import long
from pandas import DataFrame, MultiIndex, Series, Timestamp, compat, date_range
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
class TestDataFrameConvertTo(TestData):
def test_to_dict_timestamp(self):
# GH11247
# split/records producing np.datetime64 rather than Timestamps
# on datetime64[ns] dtypes only
tsmp = Timestamp('20130101')
test_data = DataFrame({'A': [tsmp, tsmp], 'B': [tsmp, tsmp]})
test_data_mixed = DataFrame({'A': [tsmp, tsmp], 'B': [1, 2]})
expected_records = [{'A': tsmp, 'B': tsmp},
{'A': tsmp, 'B': tsmp}]
expected_records_mixed = [{'A': tsmp, 'B': 1},
{'A': tsmp, 'B': 2}]
assert (test_data.to_dict(orient='records') ==
expected_records)
assert (test_data_mixed.to_dict(orient='records') ==
expected_records_mixed)
expected_series = {
'A': Series([tsmp, tsmp], name='A'),
'B': Series([tsmp, tsmp], name='B'),
}
expected_series_mixed = {
'A': Series([tsmp, tsmp], name='A'),
'B': Series([1, 2], name='B'),
}
tm.assert_dict_equal(test_data.to_dict(orient='series'),
expected_series)
tm.assert_dict_equal(test_data_mixed.to_dict(orient='series'),
expected_series_mixed)
expected_split = {
'index': [0, 1],
'data': [[tsmp, tsmp],
[tsmp, tsmp]],
'columns': ['A', 'B']
}
expected_split_mixed = {
'index': [0, 1],
'data': [[tsmp, 1],
[tsmp, 2]],
'columns': ['A', 'B']
}
tm.assert_dict_equal(test_data.to_dict(orient='split'),
expected_split)
tm.assert_dict_equal(test_data_mixed.to_dict(orient='split'),
expected_split_mixed)
def test_to_dict_index_not_unique_with_index_orient(self):
# GH22801
# Data loss when indexes are not unique. Raise ValueError.
df = DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
pytest.raises(ValueError, df.to_dict, orient='index')
def test_to_dict_invalid_orient(self):
df = DataFrame({'A': [0, 1]})
pytest.raises(ValueError, df.to_dict, orient='xinvalid')
def test_to_records_dt64(self):
df = DataFrame([["one", "two", "three"],
["four", "five", "six"]],
index=date_range("2012-01-01", "2012-01-02"))
# convert_datetime64 defaults to None
expected = df.index.values[0]
result = df.to_records()['index'][0]
assert expected == result
# check for FutureWarning if convert_datetime64=False is passed
with tm.assert_produces_warning(FutureWarning):
expected = df.index.values[0]
result = df.to_records(convert_datetime64=False)['index'][0]
assert expected == result
# check for FutureWarning if convert_datetime64=True is passed
with tm.assert_produces_warning(FutureWarning):
expected = df.index[0]
result = df.to_records(convert_datetime64=True)['index'][0]
assert expected == result
def test_to_records_with_multindex(self):
# GH3189
index = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
data = np.zeros((8, 4))
df = DataFrame(data, index=index)
r = df.to_records(index=True)['level_0']
assert 'bar' in r
assert 'one' not in r
def test_to_records_with_Mapping_type(self):
import email
from email.parser import Parser
compat.Mapping.register(email.message.Message)
headers = Parser().parsestr('From: <user@example.com>\n'
'To: <someone_else@example.com>\n'
'Subject: Test message\n'
'\n'
'Body would go here\n')
frame = DataFrame.from_records([headers])
all(x in frame for x in ['Type', 'Subject', 'From'])
def test_to_records_floats(self):
df = DataFrame(np.random.rand(10, 10))
df.to_records()
def test_to_records_index_name(self):
df = DataFrame(np.random.randn(3, 3))
df.index.name = 'X'
rs = df.to_records()
assert 'X' in rs.dtype.fields
df = DataFrame(np.random.randn(3, 3))
rs = df.to_records()
assert 'index' in rs.dtype.fields
df.index = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')])
df.index.names = ['A', None]
rs = df.to_records()
assert 'level_0' in rs.dtype.fields
def test_to_records_with_unicode_index(self):
# GH13172
# unicode_literals conflict with to_records
result = DataFrame([{u'a': u'x', u'b': 'y'}]).set_index(u'a') \
.to_records()
expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')])
tm.assert_almost_equal(result, expected)
def test_to_records_with_unicode_column_names(self):
# xref issue: https://github.com/numpy/numpy/issues/2407
# Issue #11879. to_records used to raise an exception when used
# with column names containing non-ascii characters in Python 2
result = DataFrame(data={u"accented_name_é": [1.0]}).to_records()
# Note that numpy allows for unicode field names but dtypes need
# to be specified using dictionary instead of list of tuples.
expected = np.rec.array(
[(0, 1.0)],
dtype={"names": ["index", u"accented_name_é"],
"formats": ['=i8', '=f8']}
)
tm.assert_almost_equal(result, expected)
def test_to_records_with_categorical(self):
# GH8626
# dict creation
df = DataFrame({'A': list('abc')}, dtype='category')
expected = Series(list('abc'), dtype='category', name='A')
tm.assert_series_equal(df['A'], expected)
# list-like creation
df = DataFrame(list('abc'), dtype='category')
expected = Series(list('abc'), dtype='category', name=0)
tm.assert_series_equal(df[0], expected)
# to record array
# this coerces
result = df.to_records()
expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
dtype=[('index', '=i8'), ('0', 'O')])
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize("kwargs,expected", [
# No dtypes --> default to array dtypes.
(dict(),
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
dtype=[("index", "<i8"), ("A", "<i8"),
("B", "<f8"), ("C", "O")])),
# Should have no effect in this case.
(dict(index=True),
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
dtype=[("index", "<i8"), ("A", "<i8"),
("B", "<f8"), ("C", "O")])),
# Column dtype applied across the board. Index unaffected.
(dict(column_dtypes="<U4"),
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<i8"), ("A", "<U4"),
("B", "<U4"), ("C", "<U4")])),
# Index dtype applied across the board. Columns unaffected.
(dict(index_dtypes="<U1"),
np.rec.array([("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
dtype=[("index", "<U1"), ("A", "<i8"),
("B", "<f8"), ("C", "O")])),
# Pass in a type instance.
(dict(column_dtypes=np.unicode),
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<i8"), ("A", "<U"),
("B", "<U"), ("C", "<U")])),
# Pass in a dictionary (name-only).
(dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<i8"), ("A", "i1"),
("B", "<f4"), ("C", "<U2")])),
# Pass in a dictionary (indices-only).
(dict(index_dtypes={0: "int16"}),
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
dtype=[("index", "i2"), ("A", "<i8"),
("B", "<f8"), ("C", "O")])),
# Ignore index mappings if index is not True.
(dict(index=False, index_dtypes="<U2"),
np.rec.array([(1, 0.2, "a"), (2, 1.5, "bc")],
dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")])),
# Non-existent names / indices in mapping should not error.
(dict(index_dtypes={0: "int16", "not-there": "float32"}),
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
dtype=[("index", "i2"), ("A", "<i8"),
("B", "<f8"), ("C", "O")])),
# Names / indices not in mapping default to array dtype.
(dict(column_dtypes={"A": np.int8, "B": np.float32}),
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<i8"), ("A", "i1"),
("B", "<f4"), ("C", "O")])),
# Mixture of everything.
(dict(column_dtypes={"A": np.int8, "B": np.float32},
index_dtypes="<U2"),
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<U2"), ("A", "i1"),
("B", "<f4"), ("C", "O")])),
# Invalid dype values.
(dict(index=False, column_dtypes=list()),
"Invalid dtype \\[\\] specified for column A"),
(dict(index=False, column_dtypes={"A": "int32", "B": 5}),
"Invalid dtype 5 specified for column B"),
])
def test_to_records_dtype(self, kwargs, expected):
# see gh-18146
df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
if isinstance(expected, str):
with pytest.raises(ValueError, match=expected):
df.to_records(**kwargs)
else:
result = df.to_records(**kwargs)
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize("df,kwargs,expected", [
# MultiIndex in the index.
(DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=list("abc")).set_index(["a", "b"]),
dict(column_dtypes="float64", index_dtypes={0: "int32", 1: "int8"}),
np.rec.array([(1, 2, 3.), (4, 5, 6.), (7, 8, 9.)],
dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")])),
# MultiIndex in the columns.
(DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
("c", "f")])),
dict(column_dtypes={0: "<U1", 2: "float32"}, index_dtypes="float32"),
np.rec.array([(0., u"1", 2, 3.), (1., u"4", 5, 6.),
(2., u"7", 8, 9.)],
dtype=[("index", "<f4"),
("('a', 'd')", "<U1"),
("('b', 'e')", "<i8"),
("('c', 'f')", "<f4")])),
# MultiIndex in both the columns and index.
(DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=MultiIndex.from_tuples([
("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")),
index=MultiIndex.from_tuples([
("d", -4), ("d", -5), ("f", -6)], names=list("cd"))),
dict(column_dtypes="float64", index_dtypes={0: "<U2", 1: "int8"}),
np.rec.array([("d", -4, 1., 2., 3.), ("d", -5, 4., 5., 6.),
("f", -6, 7, 8, 9.)],
dtype=[("c", "<U2"), ("d", "i1"),
("('a', 'd')", "<f8"), ("('b', 'e')", "<f8"),
("('c', 'f')", "<f8")]))
])
def test_to_records_dtype_mi(self, df, kwargs, expected):
# see gh-18146
result = df.to_records(**kwargs)
tm.assert_almost_equal(result, expected)
def test_to_records_dict_like(self):
# see gh-18146
class DictLike(object):
def __init__(self, **kwargs):
self.d = kwargs.copy()
def __getitem__(self, key):
return self.d.__getitem__(key)
def __contains__(self, key):
return key in self.d
def keys(self):
return self.d.keys()
df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
dtype_mappings = dict(column_dtypes=DictLike(**{"A": np.int8,
"B": np.float32}),
index_dtypes="<U2")
result = df.to_records(**dtype_mappings)
expected = np.rec.array([("0", "1", "0.2", "a"),
("1", "2", "1.5", "bc")],
dtype=[("index", "<U2"), ("A", "i1"),
("B", "<f4"), ("C", "O")])
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize('mapping', [
dict,
collections.defaultdict(list),
collections.OrderedDict])
def test_to_dict(self, mapping):
test_data = {
'A': {'1': 1, '2': 2},
'B': {'1': '1', '2': '2', '3': '3'},
}
# GH16122
recons_data = DataFrame(test_data).to_dict(into=mapping)
for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k][k2])
recons_data = DataFrame(test_data).to_dict("l", mapping)
for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k][int(k2) - 1])
recons_data = DataFrame(test_data).to_dict("s", mapping)
for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k][k2])
recons_data = DataFrame(test_data).to_dict("sp", mapping)
expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
tm.assert_dict_equal(recons_data, expected_split)
recons_data = DataFrame(test_data).to_dict("r", mapping)
expected_records = [{'A': 1.0, 'B': '1'},
{'A': 2.0, 'B': '2'},
{'A': np.nan, 'B': '3'}]
assert isinstance(recons_data, list)
assert (len(recons_data) == 3)
for l, r in zip(recons_data, expected_records):
tm.assert_dict_equal(l, r)
# GH10844
recons_data = DataFrame(test_data).to_dict("i")
for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k2][k])
df = DataFrame(test_data)
df['duped'] = df[df.columns[0]]
recons_data = df.to_dict("i")
comp_data = test_data.copy()
comp_data['duped'] = comp_data[df.columns[0]]
for k, v in compat.iteritems(comp_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k2][k])
@pytest.mark.parametrize('mapping', [
list,
collections.defaultdict,
[]])
def test_to_dict_errors(self, mapping):
# GH16122
df = DataFrame(np.random.randn(3, 3))
with pytest.raises(TypeError):
df.to_dict(into=mapping)
def test_to_dict_not_unique_warning(self):
# GH16927: When converting to a dict, if a column has a non-unique name
# it will be dropped, throwing a warning.
df = DataFrame([[1, 2, 3]], columns=['a', 'a', 'b'])
with tm.assert_produces_warning(UserWarning):
df.to_dict()
@pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
def test_to_records_datetimeindex_with_tz(self, tz):
# GH13937
dr = date_range('2016-01-01', periods=10,
freq='S', tz=tz)
df = DataFrame({'datetime': dr}, index=dr)
expected = df.to_records()
result = df.tz_convert("UTC").to_records()
# both converted to UTC, so they are equal
tm.assert_numpy_array_equal(result, expected)
# orient - orient argument to to_dict function
# item_getter - function for extracting value from
# the resulting dict using column name and index
@pytest.mark.parametrize('orient,item_getter', [
('dict', lambda d, col, idx: d[col][idx]),
('records', lambda d, col, idx: d[idx][col]),
('list', lambda d, col, idx: d[col][idx]),
('split', lambda d, col, idx: d['data'][idx][d['columns'].index(col)]),
('index', lambda d, col, idx: d[idx][col])
])
def test_to_dict_box_scalars(self, orient, item_getter):
# 14216, 23753
# make sure that we are boxing properly
df = DataFrame({'a': [1, 2], 'b': [.1, .2]})
result = df.to_dict(orient=orient)
assert isinstance(item_getter(result, 'a', 0), (int, long))
assert isinstance(item_getter(result, 'b', 0), float)
def test_frame_to_dict_tz(self):
# GH18372 When converting to dict with orient='records' columns of
# datetime that are tz-aware were not converted to required arrays
data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
(datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)]
df = DataFrame(list(data), columns=["d", ])
result = df.to_dict(orient='records')
expected = [
{'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)},
{'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)},
]
tm.assert_dict_equal(result[0], expected[0])
tm.assert_dict_equal(result[1], expected[1])
@pytest.mark.parametrize('into, expected', [
(dict, {0: {'int_col': 1, 'float_col': 1.0},
1: {'int_col': 2, 'float_col': 2.0},
2: {'int_col': 3, 'float_col': 3.0}}),
(OrderedDict, OrderedDict([(0, {'int_col': 1, 'float_col': 1.0}),
(1, {'int_col': 2, 'float_col': 2.0}),
(2, {'int_col': 3, 'float_col': 3.0})])),
(defaultdict(list), defaultdict(list,
{0: {'int_col': 1, 'float_col': 1.0},
1: {'int_col': 2, 'float_col': 2.0},
2: {'int_col': 3, 'float_col': 3.0}}))
])
def test_to_dict_index_dtypes(self, into, expected):
# GH 18580
# When using to_dict(orient='index') on a dataframe with int
# and float columns only the int columns were cast to float
df = DataFrame({'int_col': [1, 2, 3],
'float_col': [1.0, 2.0, 3.0]})
result = df.to_dict(orient='index', into=into)
cols = ['int_col', 'float_col']
result = DataFrame.from_dict(result, orient='index')[cols]
expected = DataFrame.from_dict(expected, orient='index')[cols]
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,989 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import timedelta
import numpy as np
import pytest
from pandas.compat import u
from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype
import pandas as pd
from pandas import (
Categorical, DataFrame, Series, Timedelta, Timestamp,
_np_version_under1p14, compat, concat, date_range, option_context)
from pandas.core.arrays import integer_array
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import (
assert_frame_equal, assert_series_equal, makeCustomDataframe as mkdf)
@pytest.fixture(params=[str, compat.text_type])
def text_dtype(request):
return request.param
class TestDataFrameDataTypes(TestData):
def test_concat_empty_dataframe_dtypes(self):
df = DataFrame(columns=list("abc"))
df['a'] = df['a'].astype(np.bool_)
df['b'] = df['b'].astype(np.int32)
df['c'] = df['c'].astype(np.float64)
result = pd.concat([df, df])
assert result['a'].dtype == np.bool_
assert result['b'].dtype == np.int32
assert result['c'].dtype == np.float64
result = pd.concat([df, df.astype(np.float64)])
assert result['a'].dtype == np.object_
assert result['b'].dtype == np.float64
assert result['c'].dtype == np.float64
def test_empty_frame_dtypes_ftypes(self):
empty_df = pd.DataFrame()
assert_series_equal(empty_df.dtypes, pd.Series(dtype=np.object))
assert_series_equal(empty_df.ftypes, pd.Series(dtype=np.object))
nocols_df = pd.DataFrame(index=[1, 2, 3])
assert_series_equal(nocols_df.dtypes, pd.Series(dtype=np.object))
assert_series_equal(nocols_df.ftypes, pd.Series(dtype=np.object))
norows_df = pd.DataFrame(columns=list("abc"))
assert_series_equal(norows_df.dtypes, pd.Series(
np.object, index=list("abc")))
assert_series_equal(norows_df.ftypes, pd.Series(
'object:dense', index=list("abc")))
norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32)
assert_series_equal(norows_int_df.dtypes, pd.Series(
np.dtype('int32'), index=list("abc")))
assert_series_equal(norows_int_df.ftypes, pd.Series(
'int32:dense', index=list("abc")))
odict = compat.OrderedDict
df = pd.DataFrame(odict([('a', 1), ('b', True), ('c', 1.0)]),
index=[1, 2, 3])
ex_dtypes = pd.Series(odict([('a', np.int64),
('b', np.bool),
('c', np.float64)]))
ex_ftypes = pd.Series(odict([('a', 'int64:dense'),
('b', 'bool:dense'),
('c', 'float64:dense')]))
assert_series_equal(df.dtypes, ex_dtypes)
assert_series_equal(df.ftypes, ex_ftypes)
# same but for empty slice of df
assert_series_equal(df[:0].dtypes, ex_dtypes)
assert_series_equal(df[:0].ftypes, ex_ftypes)
def test_datetime_with_tz_dtypes(self):
tzframe = DataFrame({'A': date_range('20130101', periods=3),
'B': date_range('20130101', periods=3,
tz='US/Eastern'),
'C': date_range('20130101', periods=3, tz='CET')})
tzframe.iloc[1, 1] = pd.NaT
tzframe.iloc[1, 2] = pd.NaT
result = tzframe.dtypes.sort_index()
expected = Series([np.dtype('datetime64[ns]'),
DatetimeTZDtype('ns', 'US/Eastern'),
DatetimeTZDtype('ns', 'CET')],
['A', 'B', 'C'])
assert_series_equal(result, expected)
def test_dtypes_are_correct_after_column_slice(self):
# GH6525
df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_)
odict = compat.OrderedDict
assert_series_equal(df.dtypes,
pd.Series(odict([('a', np.float_),
('b', np.float_),
('c', np.float_)])))
assert_series_equal(df.iloc[:, 2:].dtypes,
pd.Series(odict([('c', np.float_)])))
assert_series_equal(df.dtypes,
pd.Series(odict([('a', np.float_),
('b', np.float_),
('c', np.float_)])))
def test_select_dtypes_include_using_list_like(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3,
tz='CET'),
'j': pd.period_range('2013-01', periods=3,
freq='M'),
'k': pd.timedelta_range('1 day', periods=3)})
ri = df.select_dtypes(include=[np.number])
ei = df[['b', 'c', 'd', 'k']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=[np.number], exclude=['timedelta'])
ei = df[['b', 'c', 'd']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=[np.number, 'category'],
exclude=['timedelta'])
ei = df[['b', 'c', 'd', 'f']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=['datetime'])
ei = df[['g']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=['datetime64'])
ei = df[['g']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=['datetimetz'])
ei = df[['h', 'i']]
assert_frame_equal(ri, ei)
pytest.raises(NotImplementedError,
lambda: df.select_dtypes(include=['period']))
def test_select_dtypes_exclude_using_list_like(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True]})
re = df.select_dtypes(exclude=[np.number])
ee = df[['a', 'e']]
assert_frame_equal(re, ee)
def test_select_dtypes_exclude_include_using_list_like(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.date_range('now', periods=3).values})
exclude = np.datetime64,
include = np.bool_, 'integer'
r = df.select_dtypes(include=include, exclude=exclude)
e = df[['b', 'c', 'e']]
assert_frame_equal(r, e)
exclude = 'datetime',
include = 'bool', 'int64', 'int32'
r = df.select_dtypes(include=include, exclude=exclude)
e = df[['b', 'e']]
assert_frame_equal(r, e)
def test_select_dtypes_include_using_scalars(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3,
tz='CET'),
'j': pd.period_range('2013-01', periods=3,
freq='M'),
'k': pd.timedelta_range('1 day', periods=3)})
ri = df.select_dtypes(include=np.number)
ei = df[['b', 'c', 'd', 'k']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include='datetime')
ei = df[['g']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include='datetime64')
ei = df[['g']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include='category')
ei = df[['f']]
assert_frame_equal(ri, ei)
pytest.raises(NotImplementedError,
lambda: df.select_dtypes(include='period'))
def test_select_dtypes_exclude_using_scalars(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3,
tz='CET'),
'j': pd.period_range('2013-01', periods=3,
freq='M'),
'k': pd.timedelta_range('1 day', periods=3)})
ri = df.select_dtypes(exclude=np.number)
ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(exclude='category')
ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']]
assert_frame_equal(ri, ei)
pytest.raises(NotImplementedError,
lambda: df.select_dtypes(exclude='period'))
def test_select_dtypes_include_exclude_using_scalars(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3,
tz='CET'),
'j': pd.period_range('2013-01', periods=3,
freq='M'),
'k': pd.timedelta_range('1 day', periods=3)})
ri = df.select_dtypes(include=np.number, exclude='floating')
ei = df[['b', 'c', 'k']]
assert_frame_equal(ri, ei)
def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3,
tz='CET'),
'j': pd.period_range('2013-01', periods=3,
freq='M'),
'k': pd.timedelta_range('1 day', periods=3)})
ri = df.select_dtypes(include=np.number,
exclude=['floating', 'timedelta'])
ei = df[['b', 'c']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=[np.number, 'category'],
exclude='floating')
ei = df[['b', 'c', 'f', 'k']]
assert_frame_equal(ri, ei)
def test_select_dtypes_duplicate_columns(self):
# GH20839
odict = compat.OrderedDict
df = DataFrame(odict([('a', list('abc')),
('b', list(range(1, 4))),
('c', np.arange(3, 6).astype('u1')),
('d', np.arange(4.0, 7.0, dtype='float64')),
('e', [True, False, True]),
('f', pd.date_range('now', periods=3).values)]))
df.columns = ['a', 'a', 'b', 'b', 'b', 'c']
expected = DataFrame({'a': list(range(1, 4)),
'b': np.arange(3, 6).astype('u1')})
result = df.select_dtypes(include=[np.number], exclude=['floating'])
assert_frame_equal(result, expected)
def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.date_range('now', periods=3).values})
df['g'] = df.f.diff()
assert not hasattr(np, 'u8')
r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta'])
e = df[['a', 'b']]
assert_frame_equal(r, e)
r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]'])
e = df[['a', 'b', 'g']]
assert_frame_equal(r, e)
def test_select_dtypes_empty(self):
df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
msg = 'at least one of include or exclude must be nonempty'
with pytest.raises(ValueError, match=msg):
df.select_dtypes()
def test_select_dtypes_bad_datetime64(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.date_range('now', periods=3).values})
with pytest.raises(ValueError, match='.+ is too specific'):
df.select_dtypes(include=['datetime64[D]'])
with pytest.raises(ValueError, match='.+ is too specific'):
df.select_dtypes(exclude=['datetime64[as]'])
def test_select_dtypes_datetime_with_tz(self):
df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
B=Timestamp('20130603', tz='CET')),
index=range(5))
df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
result = df3.select_dtypes(include=['datetime64[ns]'])
expected = df3.reindex(columns=[])
assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", [
str, "str", np.string_, "S1", "unicode", np.unicode_, "U1",
compat.text_type
])
@pytest.mark.parametrize("arg", ["include", "exclude"])
def test_select_dtypes_str_raises(self, dtype, arg):
df = DataFrame({"a": list("abc"),
"g": list(u("abc")),
"b": list(range(1, 4)),
"c": np.arange(3, 6).astype("u1"),
"d": np.arange(4.0, 7.0, dtype="float64"),
"e": [True, False, True],
"f": pd.date_range("now", periods=3).values})
msg = "string dtypes are not allowed"
kwargs = {arg: [dtype]}
with pytest.raises(TypeError, match=msg):
df.select_dtypes(**kwargs)
def test_select_dtypes_bad_arg_raises(self):
df = DataFrame({'a': list('abc'),
'g': list(u('abc')),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.date_range('now', periods=3).values})
msg = 'data type.*not understood'
with pytest.raises(TypeError, match=msg):
df.select_dtypes(['blargy, blarg, blarg'])
def test_select_dtypes_typecodes(self):
# GH 11990
df = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
expected = df
FLOAT_TYPES = list(np.typecodes['AllFloat'])
assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected)
def test_dtypes_gh8722(self):
self.mixed_frame['bool'] = self.mixed_frame['A'] > 0
result = self.mixed_frame.dtypes
expected = Series({k: v.dtype
for k, v in compat.iteritems(self.mixed_frame)},
index=result.index)
assert_series_equal(result, expected)
# compat, GH 8722
with option_context('use_inf_as_na', True):
df = DataFrame([[1]])
result = df.dtypes
assert_series_equal(result, Series({0: np.dtype('int64')}))
def test_ftypes(self):
frame = self.mixed_float
expected = Series(dict(A='float32:dense',
B='float32:dense',
C='float16:dense',
D='float64:dense')).sort_values()
result = frame.ftypes.sort_values()
assert_series_equal(result, expected)
def test_astype(self):
casted = self.frame.astype(int)
expected = DataFrame(self.frame.values.astype(int),
index=self.frame.index,
columns=self.frame.columns)
assert_frame_equal(casted, expected)
casted = self.frame.astype(np.int32)
expected = DataFrame(self.frame.values.astype(np.int32),
index=self.frame.index,
columns=self.frame.columns)
assert_frame_equal(casted, expected)
self.frame['foo'] = '5'
casted = self.frame.astype(int)
expected = DataFrame(self.frame.values.astype(int),
index=self.frame.index,
columns=self.frame.columns)
assert_frame_equal(casted, expected)
# mixed casting
def _check_cast(df, v):
assert (list({s.dtype.name for
_, s in compat.iteritems(df)})[0] == v)
mn = self.all_mixed._get_numeric_data().copy()
mn['little_float'] = np.array(12345., dtype='float16')
mn['big_float'] = np.array(123456789101112., dtype='float64')
casted = mn.astype('float64')
_check_cast(casted, 'float64')
casted = mn.astype('int64')
_check_cast(casted, 'int64')
casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float32')
_check_cast(casted, 'float32')
casted = mn.reindex(columns=['little_float']).astype('float16')
_check_cast(casted, 'float16')
casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float16')
_check_cast(casted, 'float16')
casted = mn.astype('float32')
_check_cast(casted, 'float32')
casted = mn.astype('int32')
_check_cast(casted, 'int32')
# to object
casted = mn.astype('O')
_check_cast(casted, 'object')
def test_astype_with_exclude_string(self):
df = self.frame.copy()
expected = self.frame.astype(int)
df['string'] = 'foo'
casted = df.astype(int, errors='ignore')
expected['string'] = 'foo'
assert_frame_equal(casted, expected)
df = self.frame.copy()
expected = self.frame.astype(np.int32)
df['string'] = 'foo'
casted = df.astype(np.int32, errors='ignore')
expected['string'] = 'foo'
assert_frame_equal(casted, expected)
def test_astype_with_view(self):
tf = self.mixed_float.reindex(columns=['A', 'B', 'C'])
casted = tf.astype(np.int64)
casted = tf.astype(np.float32)
# this is the only real reason to do it this way
tf = np.round(self.frame).astype(np.int32)
casted = tf.astype(np.float32, copy=False)
# TODO(wesm): verification?
tf = self.frame.astype(np.float64)
casted = tf.astype(np.int64, copy=False) # noqa
@pytest.mark.parametrize("dtype", [np.int32, np.int64])
@pytest.mark.parametrize("val", [np.nan, np.inf])
def test_astype_cast_nan_inf_int(self, val, dtype):
# see gh-14265
#
# Check NaN and inf --> raise error when converting to int.
msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
df = DataFrame([val])
with pytest.raises(ValueError, match=msg):
df.astype(dtype)
def test_astype_str(self, text_dtype):
# see gh-9757
a = Series(date_range("2010-01-04", periods=5))
b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
c = Series([Timedelta(x, unit="d") for x in range(5)])
d = Series(range(5))
e = Series([0.0, 0.2, 0.4, 0.6, 0.8])
df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})
# Datetime-like
# Test str and unicode on Python 2.x and just str on Python 3.x
result = df.astype(text_dtype)
expected = DataFrame({
"a": list(map(text_dtype,
map(lambda x: Timestamp(x)._date_repr, a._values))),
"b": list(map(text_dtype, map(Timestamp, b._values))),
"c": list(map(text_dtype,
map(lambda x: Timedelta(x)._repr_base(format="all"),
c._values))),
"d": list(map(text_dtype, d._values)),
"e": list(map(text_dtype, e._values)),
})
assert_frame_equal(result, expected)
def test_astype_str_float(self, text_dtype):
# see gh-11302
result = DataFrame([np.NaN]).astype(text_dtype)
expected = DataFrame(["nan"])
assert_frame_equal(result, expected)
result = DataFrame([1.12345678901234567890]).astype(text_dtype)
# < 1.14 truncates
# >= 1.14 preserves the full repr
val = ("1.12345678901" if _np_version_under1p14
else "1.1234567890123457")
expected = DataFrame([val])
assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype_class", [dict, Series])
def test_astype_dict_like(self, dtype_class):
# GH7271 & GH16717
a = Series(date_range('2010-01-04', periods=5))
b = Series(range(5))
c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
d = Series(['1.0', '2', '3.14', '4', '5.4'])
df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
original = df.copy(deep=True)
# change type of a subset of columns
dt1 = dtype_class({'b': 'str', 'd': 'float32'})
result = df.astype(dt1)
expected = DataFrame({
'a': a,
'b': Series(['0', '1', '2', '3', '4']),
'c': c,
'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
assert_frame_equal(result, expected)
assert_frame_equal(df, original)
dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64})
result = df.astype(dt2)
expected = DataFrame({
'a': a,
'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
assert_frame_equal(result, expected)
assert_frame_equal(df, original)
# change all columns
dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str})
assert_frame_equal(df.astype(dt3),
df.astype(str))
assert_frame_equal(df, original)
# error should be raised when using something other than column labels
# in the keys of the dtype dict
dt4 = dtype_class({'b': str, 2: str})
dt5 = dtype_class({'e': str})
pytest.raises(KeyError, df.astype, dt4)
pytest.raises(KeyError, df.astype, dt5)
assert_frame_equal(df, original)
# if the dtypes provided are the same as the original dtypes, the
# resulting DataFrame should be the same as the original DataFrame
dt6 = dtype_class({col: df[col].dtype for col in df.columns})
equiv = df.astype(dt6)
assert_frame_equal(df, equiv)
assert_frame_equal(df, original)
# GH 16717
# if dtypes provided is empty, the resulting DataFrame
# should be the same as the original DataFrame
dt7 = dtype_class({})
result = df.astype(dt7)
assert_frame_equal(df, equiv)
assert_frame_equal(df, original)
def test_astype_duplicate_col(self):
a1 = Series([1, 2, 3, 4, 5], name='a')
b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b')
a2 = Series([0, 1, 2, 3, 4], name='a')
df = concat([a1, b, a2], axis=1)
result = df.astype(str)
a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a')
b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str,
name='b')
a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a')
expected = concat([a1_str, b_str, a2_str], axis=1)
assert_frame_equal(result, expected)
result = df.astype({'a': 'str'})
expected = concat([a1_str, b, a2_str], axis=1)
assert_frame_equal(result, expected)
@pytest.mark.parametrize('dtype', [
'category',
CategoricalDtype(),
CategoricalDtype(ordered=True),
CategoricalDtype(ordered=False),
CategoricalDtype(categories=list('abcdef')),
CategoricalDtype(categories=list('edba'), ordered=False),
CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr)
def test_astype_categorical(self, dtype):
# GH 18099
d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')}
df = DataFrame(d)
result = df.astype(dtype)
expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("cls", [
pd.api.types.CategoricalDtype,
pd.api.types.DatetimeTZDtype,
pd.api.types.IntervalDtype
])
def test_astype_categoricaldtype_class_raises(self, cls):
df = DataFrame({"A": ['a', 'a', 'b', 'c']})
xpr = "Expected an instance of {}".format(cls.__name__)
with pytest.raises(TypeError, match=xpr):
df.astype({"A": cls})
with pytest.raises(TypeError, match=xpr):
df['A'].astype(cls)
@pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16'])
def test_astype_extension_dtypes(self, dtype):
# GH 22578
df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b'])
expected1 = pd.DataFrame({'a': integer_array([1, 3, 5],
dtype=dtype),
'b': integer_array([2, 4, 6],
dtype=dtype)})
tm.assert_frame_equal(df.astype(dtype), expected1)
tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
tm.assert_frame_equal(df.astype(dtype).astype('float64'), df)
df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b'])
df['b'] = df['b'].astype(dtype)
expected2 = pd.DataFrame({'a': [1., 3., 5.],
'b': integer_array([2, 4, 6],
dtype=dtype)})
tm.assert_frame_equal(df, expected2)
tm.assert_frame_equal(df.astype(dtype), expected1)
tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
@pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16'])
def test_astype_extension_dtypes_1d(self, dtype):
# GH 22578
df = pd.DataFrame({'a': [1., 2., 3.]})
expected1 = pd.DataFrame({'a': integer_array([1, 2, 3],
dtype=dtype)})
tm.assert_frame_equal(df.astype(dtype), expected1)
tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
df = pd.DataFrame({'a': [1., 2., 3.]})
df['a'] = df['a'].astype(dtype)
expected2 = pd.DataFrame({'a': integer_array([1, 2, 3],
dtype=dtype)})
tm.assert_frame_equal(df, expected2)
tm.assert_frame_equal(df.astype(dtype), expected1)
tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
@pytest.mark.parametrize("dtype", ['category', 'Int64'])
def test_astype_extension_dtypes_duplicate_col(self, dtype):
# GH 24704
a1 = Series([0, np.nan, 4], name='a')
a2 = Series([np.nan, 3, 5], name='a')
df = concat([a1, a2], axis=1)
result = df.astype(dtype)
expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
assert_frame_equal(result, expected)
@pytest.mark.parametrize('dtype', [
{100: 'float64', 200: 'uint64'}, 'category', 'float64'])
def test_astype_column_metadata(self, dtype):
# GH 19920
columns = pd.UInt64Index([100, 200, 300], name='foo')
df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
df = df.astype(dtype)
tm.assert_index_equal(df.columns, columns)
@pytest.mark.parametrize("dtype", ["M8", "m8"])
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
def test_astype_from_datetimelike_to_objectt(self, dtype, unit):
# tests astype to object dtype
# gh-19223 / gh-12425
dtype = "{}[{}]".format(dtype, unit)
arr = np.array([[1, 2, 3]], dtype=dtype)
df = DataFrame(arr)
result = df.astype(object)
assert (result.dtypes == object).all()
if dtype.startswith('M8'):
assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit)
else:
assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit)
@pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
@pytest.mark.parametrize("dtype", ["M8", "m8"])
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
# tests all units from numeric origination
# gh-19223 / gh-12425
dtype = "{}[{}]".format(dtype, unit)
arr = np.array([[1, 2, 3]], dtype=arr_dtype)
df = DataFrame(arr)
result = df.astype(dtype)
expected = DataFrame(arr.astype(dtype))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
def test_astype_to_datetime_unit(self, unit):
# tests all units from datetime origination
# gh-19223
dtype = "M8[{}]".format(unit)
arr = np.array([[1, 2, 3]], dtype=dtype)
df = DataFrame(arr)
result = df.astype(dtype)
expected = DataFrame(arr.astype(dtype))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("unit", ['ns'])
def test_astype_to_timedelta_unit_ns(self, unit):
# preserver the timedelta conversion
# gh-19223
dtype = "m8[{}]".format(unit)
arr = np.array([[1, 2, 3]], dtype=dtype)
df = DataFrame(arr)
result = df.astype(dtype)
expected = DataFrame(arr.astype(dtype))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D'])
def test_astype_to_timedelta_unit(self, unit):
# coerce to float
# gh-19223
dtype = "m8[{}]".format(unit)
arr = np.array([[1, 2, 3]], dtype=dtype)
df = DataFrame(arr)
result = df.astype(dtype)
expected = DataFrame(df.values.astype(dtype).astype(float))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
def test_astype_to_incorrect_datetimelike(self, unit):
# trying to astype a m to a M, or vice-versa
# gh-19224
dtype = "M8[{}]".format(unit)
other = "m8[{}]".format(unit)
df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
with pytest.raises(TypeError):
df.astype(other)
df = DataFrame(np.array([[1, 2, 3]], dtype=other))
with pytest.raises(TypeError):
df.astype(dtype)
def test_timedeltas(self):
df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3,
freq='D')),
B=Series([timedelta(days=i) for i in range(3)])))
result = df.get_dtype_counts().sort_index()
expected = Series(
{'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_index()
assert_series_equal(result, expected)
df['C'] = df['A'] + df['B']
expected = Series(
{'datetime64[ns]': 2, 'timedelta64[ns]': 1}).sort_values()
result = df.get_dtype_counts().sort_values()
assert_series_equal(result, expected)
# mixed int types
df['D'] = 1
expected = Series({'datetime64[ns]': 2,
'timedelta64[ns]': 1,
'int64': 1}).sort_values()
result = df.get_dtype_counts().sort_values()
assert_series_equal(result, expected)
def test_arg_for_errors_in_astype(self):
# issue #14878
df = DataFrame([1, 2, 3])
with pytest.raises(ValueError):
df.astype(np.float64, errors=True)
df.astype(np.int8, errors='ignore')
@pytest.mark.parametrize('input_vals', [
([1, 2]),
(['1', '2']),
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
(list(pd.date_range('1/1/2011', periods=2, freq='H',
tz='US/Eastern'))),
([pd.Interval(left=0, right=5)]),
])
def test_constructor_list_str(self, input_vals, string_dtype):
# GH 16605
# Ensure that data elements are converted to strings when
# dtype is str, 'str', or 'U'
result = DataFrame({'A': input_vals}, dtype=string_dtype)
expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
assert_frame_equal(result, expected)
def test_constructor_list_str_na(self, string_dtype):
result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
assert_frame_equal(result, expected)
@pytest.mark.parametrize("data, expected", [
# empty
(DataFrame(), True),
# multi-same
(DataFrame({"A": [1, 2], "B": [1, 2]}), True),
# multi-object
(DataFrame({"A": np.array([1, 2], dtype=object),
"B": np.array(["a", "b"], dtype=object)}), True),
# multi-extension
(DataFrame({"A": pd.Categorical(['a', 'b']),
"B": pd.Categorical(['a', 'b'])}), True),
# differ types
(DataFrame({"A": [1, 2], "B": [1., 2.]}), False),
# differ sizes
(DataFrame({"A": np.array([1, 2], dtype=np.int32),
"B": np.array([1, 2], dtype=np.int64)}), False),
# multi-extension differ
(DataFrame({"A": pd.Categorical(['a', 'b']),
"B": pd.Categorical(['b', 'c'])}), False),
])
def test_is_homogeneous_type(self, data, expected):
assert data._is_homogeneous_type is expected
def test_asarray_homogenous(self):
df = pd.DataFrame({"A": pd.Categorical([1, 2]),
"B": pd.Categorical([1, 2])})
result = np.asarray(df)
# may change from object in the future
expected = np.array([[1, 1], [2, 2]], dtype='object')
tm.assert_numpy_array_equal(result, expected)
class TestDataFrameDatetimeWithTZ(TestData):
def test_interleave(self):
# interleave with object
result = self.tzframe.assign(D='foo').values
expected = np.array([[Timestamp('2013-01-01 00:00:00'),
Timestamp('2013-01-02 00:00:00'),
Timestamp('2013-01-03 00:00:00')],
[Timestamp('2013-01-01 00:00:00-0500',
tz='US/Eastern'),
pd.NaT,
Timestamp('2013-01-03 00:00:00-0500',
tz='US/Eastern')],
[Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
pd.NaT,
Timestamp('2013-01-03 00:00:00+0100', tz='CET')],
['foo', 'foo', 'foo']], dtype=object).T
tm.assert_numpy_array_equal(result, expected)
# interleave with only datetime64[ns]
result = self.tzframe.values
expected = np.array([[Timestamp('2013-01-01 00:00:00'),
Timestamp('2013-01-02 00:00:00'),
Timestamp('2013-01-03 00:00:00')],
[Timestamp('2013-01-01 00:00:00-0500',
tz='US/Eastern'),
pd.NaT,
Timestamp('2013-01-03 00:00:00-0500',
tz='US/Eastern')],
[Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
pd.NaT,
Timestamp('2013-01-03 00:00:00+0100',
tz='CET')]], dtype=object).T
tm.assert_numpy_array_equal(result, expected)
def test_astype(self):
# astype
expected = np.array([[Timestamp('2013-01-01 00:00:00'),
Timestamp('2013-01-02 00:00:00'),
Timestamp('2013-01-03 00:00:00')],
[Timestamp('2013-01-01 00:00:00-0500',
tz='US/Eastern'),
pd.NaT,
Timestamp('2013-01-03 00:00:00-0500',
tz='US/Eastern')],
[Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
pd.NaT,
Timestamp('2013-01-03 00:00:00+0100',
tz='CET')]],
dtype=object).T
result = self.tzframe.astype(object)
assert_frame_equal(result, DataFrame(
expected, index=self.tzframe.index, columns=self.tzframe.columns))
result = self.tzframe.astype('datetime64[ns]')
expected = DataFrame({'A': date_range('20130101', periods=3),
'B': (date_range('20130101', periods=3,
tz='US/Eastern')
.tz_convert('UTC')
.tz_localize(None)),
'C': (date_range('20130101', periods=3,
tz='CET')
.tz_convert('UTC')
.tz_localize(None))})
expected.iloc[1, 1] = pd.NaT
expected.iloc[1, 2] = pd.NaT
assert_frame_equal(result, expected)
def test_astype_str(self):
# str formatting
result = self.tzframe.astype(str)
expected = DataFrame([['2013-01-01', '2013-01-01 00:00:00-05:00',
'2013-01-01 00:00:00+01:00'],
['2013-01-02', 'NaT', 'NaT'],
['2013-01-03', '2013-01-03 00:00:00-05:00',
'2013-01-03 00:00:00+01:00']],
columns=self.tzframe.columns)
tm.assert_frame_equal(result, expected)
with option_context('display.max_columns', 20):
result = str(self.tzframe)
assert ('0 2013-01-01 2013-01-01 00:00:00-05:00 '
'2013-01-01 00:00:00+01:00') in result
assert ('1 2013-01-02 '
'NaT NaT') in result
assert ('2 2013-01-03 2013-01-03 00:00:00-05:00 '
'2013-01-03 00:00:00+01:00') in result
@@ -0,0 +1,455 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import numpy as np
import pytest
from pandas.compat import lrange, string_types
from pandas import DataFrame, Series
import pandas.util.testing as tm
@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(subset):
# GH 19730
df = DataFrame({'A': [0, 0, 1],
'B': [0, 0, 1],
'C': [0, 0, 1]})
with pytest.raises(KeyError):
df.duplicated(subset)
with pytest.raises(KeyError):
df.drop_duplicates(subset)
@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes():
# gh-21524
# Given the wide dataframe with a lot of columns
# with different (important!) values
data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
for i in range(100)}
df = DataFrame(data).T
result = df.duplicated()
# Then duplicates produce the bool Series as a result and don't fail during
# calculation. Actual values doesn't matter here, though usually it's all
# False in this case
assert isinstance(result, Series)
assert result.dtype == np.bool
@pytest.mark.parametrize('keep, expected', [
('first', Series([False, False, True, False, True])),
('last', Series([True, True, False, False, False])),
(False, Series([True, True, True, False, True]))
])
def test_duplicated_keep(keep, expected):
df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
result = df.duplicated(keep=keep)
tm.assert_series_equal(result, expected)
@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
@pytest.mark.parametrize('keep, expected', [
('first', Series([False, False, True, False, True])),
('last', Series([True, True, False, False, False])),
(False, Series([True, True, True, False, True]))
])
def test_duplicated_nan_none(keep, expected):
df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object)
result = df.duplicated(keep=keep)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('keep', ['first', 'last', False])
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
def test_duplicated_subset(subset, keep):
df = DataFrame({'A': [0, 1, 1, 2, 0],
'B': ['a', 'b', 'b', 'c', 'a'],
'C': [np.nan, 3, 3, None, np.nan]})
if subset is None:
subset = list(df.columns)
elif isinstance(subset, string_types):
# need to have a DataFrame, not a Series
# -> select columns with singleton list, not string
subset = [subset]
expected = df[subset].duplicated(keep=keep)
result = df.duplicated(keep=keep, subset=subset)
tm.assert_series_equal(result, expected)
def test_drop_duplicates():
df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'bar', 'foo'],
'B': ['one', 'one', 'two', 'two',
'two', 'two', 'one', 'two'],
'C': [1, 1, 2, 2, 2, 2, 1, 2],
'D': lrange(8)})
# single column
result = df.drop_duplicates('AAA')
expected = df[:2]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('AAA', keep='last')
expected = df.loc[[6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('AAA', keep=False)
expected = df.loc[[]]
tm.assert_frame_equal(result, expected)
assert len(result) == 0
# multi column
expected = df.loc[[0, 1, 2, 3]]
result = df.drop_duplicates(np.array(['AAA', 'B']))
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(['AAA', 'B'])
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(('AAA', 'B'), keep='last')
expected = df.loc[[0, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(('AAA', 'B'), keep=False)
expected = df.loc[[0]]
tm.assert_frame_equal(result, expected)
# consider everything
df2 = df.loc[:, ['AAA', 'B', 'C']]
result = df2.drop_duplicates()
# in this case only
expected = df2.drop_duplicates(['AAA', 'B'])
tm.assert_frame_equal(result, expected)
result = df2.drop_duplicates(keep='last')
expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
tm.assert_frame_equal(result, expected)
result = df2.drop_duplicates(keep=False)
expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
tm.assert_frame_equal(result, expected)
# integers
result = df.drop_duplicates('C')
expected = df.iloc[[0, 2]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('C', keep='last')
expected = df.iloc[[-2, -1]]
tm.assert_frame_equal(result, expected)
df['E'] = df['C'].astype('int8')
result = df.drop_duplicates('E')
expected = df.iloc[[0, 2]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('E', keep='last')
expected = df.iloc[[-2, -1]]
tm.assert_frame_equal(result, expected)
# GH 11376
df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
'y': [0, 6, 5, 5, 9, 1, 2]})
expected = df.loc[df.index != 3]
tm.assert_frame_equal(df.drop_duplicates(), expected)
df = DataFrame([[1, 0], [0, 2]])
tm.assert_frame_equal(df.drop_duplicates(), df)
df = DataFrame([[-2, 0], [0, -4]])
tm.assert_frame_equal(df.drop_duplicates(), df)
x = np.iinfo(np.int64).max / 3 * 2
df = DataFrame([[-x, x], [0, x + 4]])
tm.assert_frame_equal(df.drop_duplicates(), df)
df = DataFrame([[-x, x], [x, x + 4]])
tm.assert_frame_equal(df.drop_duplicates(), df)
# GH 11864
df = DataFrame([i] * 9 for i in range(16))
df = df.append([[1] + [0] * 8], ignore_index=True)
for keep in ['first', 'last', False]:
assert df.duplicated(keep=keep).sum() == 0
def test_drop_duplicates_with_duplicate_column_names():
# GH17836
df = DataFrame([
[1, 2, 5],
[3, 4, 6],
[3, 4, 7]
], columns=['a', 'a', 'b'])
result0 = df.drop_duplicates()
tm.assert_frame_equal(result0, df)
result1 = df.drop_duplicates('a')
expected1 = df[:2]
tm.assert_frame_equal(result1, expected1)
def test_drop_duplicates_for_take_all():
df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
'foo', 'bar', 'qux', 'foo'],
'B': ['one', 'one', 'two', 'two',
'two', 'two', 'one', 'two'],
'C': [1, 1, 2, 2, 2, 2, 1, 2],
'D': lrange(8)})
# single column
result = df.drop_duplicates('AAA')
expected = df.iloc[[0, 1, 2, 6]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('AAA', keep='last')
expected = df.iloc[[2, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('AAA', keep=False)
expected = df.iloc[[2, 6]]
tm.assert_frame_equal(result, expected)
# multiple columns
result = df.drop_duplicates(['AAA', 'B'])
expected = df.iloc[[0, 1, 2, 3, 4, 6]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(['AAA', 'B'], keep='last')
expected = df.iloc[[0, 1, 2, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(['AAA', 'B'], keep=False)
expected = df.iloc[[0, 1, 2, 6]]
tm.assert_frame_equal(result, expected)
def test_drop_duplicates_tuple():
df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'bar', 'foo'],
'B': ['one', 'one', 'two', 'two',
'two', 'two', 'one', 'two'],
'C': [1, 1, 2, 2, 2, 2, 1, 2],
'D': lrange(8)})
# single column
result = df.drop_duplicates(('AA', 'AB'))
expected = df[:2]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(('AA', 'AB'), keep='last')
expected = df.loc[[6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(('AA', 'AB'), keep=False)
expected = df.loc[[]] # empty df
assert len(result) == 0
tm.assert_frame_equal(result, expected)
# multi column
expected = df.loc[[0, 1, 2, 3]]
result = df.drop_duplicates((('AA', 'AB'), 'B'))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize('df', [
DataFrame(),
DataFrame(columns=[]),
DataFrame(columns=['A', 'B', 'C']),
DataFrame(index=[]),
DataFrame(index=['A', 'B', 'C'])
])
def test_drop_duplicates_empty(df):
# GH 20516
result = df.drop_duplicates()
tm.assert_frame_equal(result, df)
result = df.copy()
result.drop_duplicates(inplace=True)
tm.assert_frame_equal(result, df)
def test_drop_duplicates_NA():
# none
df = DataFrame({'A': [None, None, 'foo', 'bar',
'foo', 'bar', 'bar', 'foo'],
'B': ['one', 'one', 'two', 'two',
'two', 'two', 'one', 'two'],
'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
'D': lrange(8)})
# single column
result = df.drop_duplicates('A')
expected = df.loc[[0, 2, 3]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('A', keep='last')
expected = df.loc[[1, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('A', keep=False)
expected = df.loc[[]] # empty df
tm.assert_frame_equal(result, expected)
assert len(result) == 0
# multi column
result = df.drop_duplicates(['A', 'B'])
expected = df.loc[[0, 2, 3, 6]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(['A', 'B'], keep='last')
expected = df.loc[[1, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(['A', 'B'], keep=False)
expected = df.loc[[6]]
tm.assert_frame_equal(result, expected)
# nan
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'bar', 'foo'],
'B': ['one', 'one', 'two', 'two',
'two', 'two', 'one', 'two'],
'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
'D': lrange(8)})
# single column
result = df.drop_duplicates('C')
expected = df[:2]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('C', keep='last')
expected = df.loc[[3, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('C', keep=False)
expected = df.loc[[]] # empty df
tm.assert_frame_equal(result, expected)
assert len(result) == 0
# multi column
result = df.drop_duplicates(['C', 'B'])
expected = df.loc[[0, 1, 2, 4]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(['C', 'B'], keep='last')
expected = df.loc[[1, 3, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(['C', 'B'], keep=False)
expected = df.loc[[1]]
tm.assert_frame_equal(result, expected)
def test_drop_duplicates_NA_for_take_all():
# none
df = DataFrame({'A': [None, None, 'foo', 'bar',
'foo', 'baz', 'bar', 'qux'],
'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]})
# single column
result = df.drop_duplicates('A')
expected = df.iloc[[0, 2, 3, 5, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('A', keep='last')
expected = df.iloc[[1, 4, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('A', keep=False)
expected = df.iloc[[5, 7]]
tm.assert_frame_equal(result, expected)
# nan
# single column
result = df.drop_duplicates('C')
expected = df.iloc[[0, 1, 5, 6]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('C', keep='last')
expected = df.iloc[[3, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates('C', keep=False)
expected = df.iloc[[5, 6]]
tm.assert_frame_equal(result, expected)
def test_drop_duplicates_inplace():
orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'bar', 'foo'],
'B': ['one', 'one', 'two', 'two',
'two', 'two', 'one', 'two'],
'C': [1, 1, 2, 2, 2, 2, 1, 2],
'D': lrange(8)})
# single column
df = orig.copy()
df.drop_duplicates('A', inplace=True)
expected = orig[:2]
result = df
tm.assert_frame_equal(result, expected)
df = orig.copy()
df.drop_duplicates('A', keep='last', inplace=True)
expected = orig.loc[[6, 7]]
result = df
tm.assert_frame_equal(result, expected)
df = orig.copy()
df.drop_duplicates('A', keep=False, inplace=True)
expected = orig.loc[[]]
result = df
tm.assert_frame_equal(result, expected)
assert len(df) == 0
# multi column
df = orig.copy()
df.drop_duplicates(['A', 'B'], inplace=True)
expected = orig.loc[[0, 1, 2, 3]]
result = df
tm.assert_frame_equal(result, expected)
df = orig.copy()
df.drop_duplicates(['A', 'B'], keep='last', inplace=True)
expected = orig.loc[[0, 5, 6, 7]]
result = df
tm.assert_frame_equal(result, expected)
df = orig.copy()
df.drop_duplicates(['A', 'B'], keep=False, inplace=True)
expected = orig.loc[[0]]
result = df
tm.assert_frame_equal(result, expected)
# consider everything
orig2 = orig.loc[:, ['A', 'B', 'C']].copy()
df2 = orig2.copy()
df2.drop_duplicates(inplace=True)
# in this case only
expected = orig2.drop_duplicates(['A', 'B'])
result = df2
tm.assert_frame_equal(result, expected)
df2 = orig2.copy()
df2.drop_duplicates(keep='last', inplace=True)
expected = orig2.drop_duplicates(['A', 'B'], keep='last')
result = df2
tm.assert_frame_equal(result, expected)
df2 = orig2.copy()
df2.drop_duplicates(keep=False, inplace=True)
expected = orig2.drop_duplicates(['A', 'B'], keep=False)
result = df2
tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
import numpy as np
import pytest
from pandas import DataFrame, Index, period_range
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
@pytest.fixture
def frame_with_period_index():
return DataFrame(
data=np.arange(20).reshape(4, 5),
columns=list('abcde'),
index=period_range(start='2000', freq='A', periods=4))
@pytest.fixture
def frame():
return TestData().frame
@pytest.fixture
def left():
return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
@pytest.fixture
def right():
return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2])
@pytest.mark.parametrize(
"how, sort, expected",
[('inner', False, DataFrame({'a': [20, 10],
'b': [200, 100]},
index=[2, 1])),
('inner', True, DataFrame({'a': [10, 20],
'b': [100, 200]},
index=[1, 2])),
('left', False, DataFrame({'a': [20, 10, 0],
'b': [200, 100, np.nan]},
index=[2, 1, 0])),
('left', True, DataFrame({'a': [0, 10, 20],
'b': [np.nan, 100, 200]},
index=[0, 1, 2])),
('right', False, DataFrame({'a': [np.nan, 10, 20],
'b': [300, 100, 200]},
index=[3, 1, 2])),
('right', True, DataFrame({'a': [10, 20, np.nan],
'b': [100, 200, 300]},
index=[1, 2, 3])),
('outer', False, DataFrame({'a': [0, 10, 20, np.nan],
'b': [np.nan, 100, 200, 300]},
index=[0, 1, 2, 3])),
('outer', True, DataFrame({'a': [0, 10, 20, np.nan],
'b': [np.nan, 100, 200, 300]},
index=[0, 1, 2, 3]))])
def test_join(left, right, how, sort, expected):
result = left.join(right, how=how, sort=sort)
tm.assert_frame_equal(result, expected)
def test_join_index(frame):
# left / right
f = frame.loc[frame.index[:10], ['A', 'B']]
f2 = frame.loc[frame.index[5:], ['C', 'D']].iloc[::-1]
joined = f.join(f2)
tm.assert_index_equal(f.index, joined.index)
expected_columns = Index(['A', 'B', 'C', 'D'])
tm.assert_index_equal(joined.columns, expected_columns)
joined = f.join(f2, how='left')
tm.assert_index_equal(joined.index, f.index)
tm.assert_index_equal(joined.columns, expected_columns)
joined = f.join(f2, how='right')
tm.assert_index_equal(joined.index, f2.index)
tm.assert_index_equal(joined.columns, expected_columns)
# inner
joined = f.join(f2, how='inner')
tm.assert_index_equal(joined.index, f.index[5:10])
tm.assert_index_equal(joined.columns, expected_columns)
# outer
joined = f.join(f2, how='outer')
tm.assert_index_equal(joined.index, frame.index.sort_values())
tm.assert_index_equal(joined.columns, expected_columns)
with pytest.raises(ValueError, match='join method'):
f.join(f2, how='foo')
# corner case - overlapping columns
msg = 'columns overlap but no suffix'
for how in ('outer', 'left', 'inner'):
with pytest.raises(ValueError, match=msg):
frame.join(frame, how=how)
def test_join_index_more(frame):
af = frame.loc[:, ['A', 'B']]
bf = frame.loc[::2, ['C', 'D']]
expected = af.copy()
expected['C'] = frame['C'][::2]
expected['D'] = frame['D'][::2]
result = af.join(bf)
tm.assert_frame_equal(result, expected)
result = af.join(bf, how='right')
tm.assert_frame_equal(result, expected[::2])
result = bf.join(af, how='right')
tm.assert_frame_equal(result, expected.loc[:, result.columns])
def test_join_index_series(frame):
df = frame.copy()
s = df.pop(frame.columns[-1])
joined = df.join(s)
# TODO should this check_names ?
tm.assert_frame_equal(joined, frame, check_names=False)
s.name = None
with pytest.raises(ValueError, match='must have a name'):
df.join(s)
def test_join_overlap(frame):
df1 = frame.loc[:, ['A', 'B', 'C']]
df2 = frame.loc[:, ['B', 'C', 'D']]
joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2')
df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1')
df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2')
no_overlap = frame.loc[:, ['A', 'D']]
expected = df1_suf.join(df2_suf).join(no_overlap)
# column order not necessarily sorted
tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
def test_join_period_index(frame_with_period_index):
other = frame_with_period_index.rename(
columns=lambda x: '{key}{key}'.format(key=x))
joined_values = np.concatenate(
[frame_with_period_index.values] * 2, axis=1)
joined_cols = frame_with_period_index.columns.append(other.columns)
joined = frame_with_period_index.join(other)
expected = DataFrame(
data=joined_values,
columns=joined_cols,
index=frame_with_period_index.index)
tm.assert_frame_equal(joined, expected)
def test_join_left_sequence_non_unique_index():
# https://github.com/pandas-dev/pandas/issues/19607
df1 = DataFrame({'a': [0, 10, 20]}, index=[1, 2, 3])
df2 = DataFrame({'b': [100, 200, 300]}, index=[4, 3, 2])
df3 = DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4])
joined = df1.join([df2, df3], how='left')
expected = DataFrame({
'a': [0, 10, 10, 20],
'b': [np.nan, 300, 300, 200],
'c': [np.nan, 400, 500, np.nan]
}, index=[1, 2, 2, 3])
tm.assert_frame_equal(joined, expected)
@@ -0,0 +1,863 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import datetime
from distutils.version import LooseVersion
import dateutil
import numpy as np
import pytest
from pandas.compat import lrange
import pandas.util._test_decorators as td
import pandas as pd
from pandas import Categorical, DataFrame, Series, Timestamp, date_range
from pandas.tests.frame.common import TestData, _check_mixed_float
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
try:
import scipy
_is_scipy_ge_0190 = (LooseVersion(scipy.__version__) >=
LooseVersion('0.19.0'))
except ImportError:
_is_scipy_ge_0190 = False
def _skip_if_no_pchip():
try:
from scipy.interpolate import pchip_interpolate # noqa
except ImportError:
import pytest
pytest.skip('scipy.interpolate.pchip missing')
class TestDataFrameMissingData(TestData):
def test_dropEmptyRows(self):
N = len(self.frame.index)
mat = np.random.randn(N)
mat[:5] = np.nan
frame = DataFrame({'foo': mat}, index=self.frame.index)
original = Series(mat, index=self.frame.index, name='foo')
expected = original.dropna()
inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
smaller_frame = frame.dropna(how='all')
# check that original was preserved
assert_series_equal(frame['foo'], original)
inplace_frame1.dropna(how='all', inplace=True)
assert_series_equal(smaller_frame['foo'], expected)
assert_series_equal(inplace_frame1['foo'], expected)
smaller_frame = frame.dropna(how='all', subset=['foo'])
inplace_frame2.dropna(how='all', subset=['foo'], inplace=True)
assert_series_equal(smaller_frame['foo'], expected)
assert_series_equal(inplace_frame2['foo'], expected)
def test_dropIncompleteRows(self):
N = len(self.frame.index)
mat = np.random.randn(N)
mat[:5] = np.nan
frame = DataFrame({'foo': mat}, index=self.frame.index)
frame['bar'] = 5
original = Series(mat, index=self.frame.index, name='foo')
inp_frame1, inp_frame2 = frame.copy(), frame.copy()
smaller_frame = frame.dropna()
assert_series_equal(frame['foo'], original)
inp_frame1.dropna(inplace=True)
exp = Series(mat[5:], index=self.frame.index[5:], name='foo')
tm.assert_series_equal(smaller_frame['foo'], exp)
tm.assert_series_equal(inp_frame1['foo'], exp)
samesize_frame = frame.dropna(subset=['bar'])
assert_series_equal(frame['foo'], original)
assert (frame['bar'] == 5).all()
inp_frame2.dropna(subset=['bar'], inplace=True)
tm.assert_index_equal(samesize_frame.index, self.frame.index)
tm.assert_index_equal(inp_frame2.index, self.frame.index)
def test_dropna(self):
df = DataFrame(np.random.randn(6, 4))
df[2][:2] = np.nan
dropped = df.dropna(axis=1)
expected = df.loc[:, [0, 1, 3]]
inp = df.copy()
inp.dropna(axis=1, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
dropped = df.dropna(axis=0)
expected = df.loc[lrange(2, 6)]
inp = df.copy()
inp.dropna(axis=0, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
# threshold
dropped = df.dropna(axis=1, thresh=5)
expected = df.loc[:, [0, 1, 3]]
inp = df.copy()
inp.dropna(axis=1, thresh=5, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
dropped = df.dropna(axis=0, thresh=4)
expected = df.loc[lrange(2, 6)]
inp = df.copy()
inp.dropna(axis=0, thresh=4, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
dropped = df.dropna(axis=1, thresh=4)
assert_frame_equal(dropped, df)
dropped = df.dropna(axis=1, thresh=3)
assert_frame_equal(dropped, df)
# subset
dropped = df.dropna(axis=0, subset=[0, 1, 3])
inp = df.copy()
inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
assert_frame_equal(dropped, df)
assert_frame_equal(inp, df)
# all
dropped = df.dropna(axis=1, how='all')
assert_frame_equal(dropped, df)
df[2] = np.nan
dropped = df.dropna(axis=1, how='all')
expected = df.loc[:, [0, 1, 3]]
assert_frame_equal(dropped, expected)
# bad input
pytest.raises(ValueError, df.dropna, axis=3)
def test_drop_and_dropna_caching(self):
# tst that cacher updates
original = Series([1, 2, np.nan], name='A')
expected = Series([1, 2], dtype=original.dtype, name='A')
df = pd.DataFrame({'A': original.values.copy()})
df2 = df.copy()
df['A'].dropna()
assert_series_equal(df['A'], original)
df['A'].dropna(inplace=True)
assert_series_equal(df['A'], expected)
df2['A'].drop([1])
assert_series_equal(df2['A'], original)
df2['A'].drop([1], inplace=True)
assert_series_equal(df2['A'], original.drop([1]))
def test_dropna_corner(self):
# bad input
pytest.raises(ValueError, self.frame.dropna, how='foo')
pytest.raises(TypeError, self.frame.dropna, how=None)
# non-existent column - 8303
pytest.raises(KeyError, self.frame.dropna, subset=['A', 'X'])
def test_dropna_multiple_axes(self):
df = DataFrame([[1, np.nan, 2, 3],
[4, np.nan, 5, 6],
[np.nan, np.nan, np.nan, np.nan],
[7, np.nan, 8, 9]])
cp = df.copy()
# GH20987
with tm.assert_produces_warning(FutureWarning):
result = df.dropna(how='all', axis=[0, 1])
with tm.assert_produces_warning(FutureWarning):
result2 = df.dropna(how='all', axis=(0, 1))
expected = df.dropna(how='all').dropna(how='all', axis=1)
assert_frame_equal(result, expected)
assert_frame_equal(result2, expected)
assert_frame_equal(df, cp)
inp = df.copy()
with tm.assert_produces_warning(FutureWarning):
inp.dropna(how='all', axis=(0, 1), inplace=True)
assert_frame_equal(inp, expected)
def test_dropna_tz_aware_datetime(self):
# GH13407
df = DataFrame()
dt1 = datetime.datetime(2015, 1, 1,
tzinfo=dateutil.tz.tzutc())
dt2 = datetime.datetime(2015, 2, 2,
tzinfo=dateutil.tz.tzutc())
df['Time'] = [dt1]
result = df.dropna(axis=0)
expected = DataFrame({'Time': [dt1]})
assert_frame_equal(result, expected)
# Ex2
df = DataFrame({'Time': [dt1, None, np.nan, dt2]})
result = df.dropna(axis=0)
expected = DataFrame([dt1, dt2],
columns=['Time'],
index=[0, 3])
assert_frame_equal(result, expected)
def test_fillna(self):
tf = self.tsframe
tf.loc[tf.index[:5], 'A'] = np.nan
tf.loc[tf.index[-5:], 'A'] = np.nan
zero_filled = self.tsframe.fillna(0)
assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all()
padded = self.tsframe.fillna(method='pad')
assert np.isnan(padded.loc[padded.index[:5], 'A']).all()
assert (padded.loc[padded.index[-5:], 'A'] ==
padded.loc[padded.index[-5], 'A']).all()
# mixed type
mf = self.mixed_frame
mf.loc[mf.index[5:20], 'foo'] = np.nan
mf.loc[mf.index[-10:], 'A'] = np.nan
result = self.mixed_frame.fillna(value=0)
result = self.mixed_frame.fillna(method='pad')
pytest.raises(ValueError, self.tsframe.fillna)
pytest.raises(ValueError, self.tsframe.fillna, 5, method='ffill')
# mixed numeric (but no float16)
mf = self.mixed_float.reindex(columns=['A', 'B', 'D'])
mf.loc[mf.index[-10:], 'A'] = np.nan
result = mf.fillna(value=0)
_check_mixed_float(result, dtype=dict(C=None))
result = mf.fillna(method='pad')
_check_mixed_float(result, dtype=dict(C=None))
# empty frame (GH #2778)
df = DataFrame(columns=['x'])
for m in ['pad', 'backfill']:
df.x.fillna(method=m, inplace=True)
df.x.fillna(method=m)
# with different dtype (GH3386)
df = DataFrame([['a', 'a', np.nan, 'a'], [
'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']])
result = df.fillna({2: 'foo'})
expected = DataFrame([['a', 'a', 'foo', 'a'],
['b', 'b', 'foo', 'b'],
['c', 'c', 'foo', 'c']])
assert_frame_equal(result, expected)
df.fillna({2: 'foo'}, inplace=True)
assert_frame_equal(df, expected)
# limit and value
df = DataFrame(np.random.randn(10, 3))
df.iloc[2:7, 0] = np.nan
df.iloc[3:5, 2] = np.nan
expected = df.copy()
expected.iloc[2, 0] = 999
expected.iloc[3, 2] = 999
result = df.fillna(999, limit=1)
assert_frame_equal(result, expected)
# with datelike
# GH 6344
df = DataFrame({
'Date': [pd.NaT, Timestamp("2014-1-1")],
'Date2': [Timestamp("2013-1-1"), pd.NaT]
})
expected = df.copy()
expected['Date'] = expected['Date'].fillna(
df.loc[df.index[0], 'Date2'])
result = df.fillna(value={'Date': df['Date2']})
assert_frame_equal(result, expected)
# with timezone
# GH 15855
df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
pd.NaT]})
exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
assert_frame_equal(df.fillna(method='pad'), exp)
df = pd.DataFrame({'A': [pd.NaT,
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
assert_frame_equal(df.fillna(method='bfill'), exp)
# with timezone in another column
# GH 15522
df = pd.DataFrame({'A': pd.date_range('20130101', periods=4,
tz='US/Eastern'),
'B': [1, 2, np.nan, np.nan]})
result = df.fillna(method='pad')
expected = pd.DataFrame({'A': pd.date_range('20130101', periods=4,
tz='US/Eastern'),
'B': [1., 2., 2., 2.]})
assert_frame_equal(result, expected)
def test_na_actions_categorical(self):
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
vals = ["a", "b", np.nan, "d"]
df = DataFrame({"cats": cat, "vals": vals})
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
vals2 = ["a", "b", "b", "d"]
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
vals3 = ["a", "b", np.nan]
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
cat4 = Categorical([1, 2], categories=[1, 2, 3])
vals4 = ["a", "b"]
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
# fillna
res = df.fillna(value={"cats": 3, "vals": "b"})
tm.assert_frame_equal(res, df_exp_fill)
with pytest.raises(ValueError, match=("fill value must "
"be in categories")):
df.fillna(value={"cats": 4, "vals": "c"})
res = df.fillna(method='pad')
tm.assert_frame_equal(res, df_exp_fill)
# dropna
res = df.dropna(subset=["cats"])
tm.assert_frame_equal(res, df_exp_drop_cats)
res = df.dropna()
tm.assert_frame_equal(res, df_exp_drop_all)
# make sure that fillna takes missing values into account
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
res = df.fillna("a")
tm.assert_frame_equal(res, df_exp)
def test_fillna_categorical_nan(self):
# GH 14021
# np.nan should always be a valid filler
cat = Categorical([np.nan, 2, np.nan])
val = Categorical([np.nan, np.nan, np.nan])
df = DataFrame({"cats": cat, "vals": val})
res = df.fillna(df.median())
v_exp = [np.nan, np.nan, np.nan]
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
dtype='category')
tm.assert_frame_equal(res, df_exp)
result = df.cats.fillna(np.nan)
tm.assert_series_equal(result, df.cats)
result = df.vals.fillna(np.nan)
tm.assert_series_equal(result, df.vals)
idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
'2011-01-01 09:00', pd.NaT, pd.NaT])
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
pd.NaT, pd.NaT], freq='M')
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
idx = pd.TimedeltaIndex(['1 days', '2 days',
'1 days', pd.NaT, pd.NaT])
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
def test_fillna_downcast(self):
# GH 15277
# infer int64 from float64
df = pd.DataFrame({'a': [1., np.nan]})
result = df.fillna(0, downcast='infer')
expected = pd.DataFrame({'a': [1, 0]})
assert_frame_equal(result, expected)
# infer int64 from float64 when fillna value is a dict
df = pd.DataFrame({'a': [1., np.nan]})
result = df.fillna({'a': 0}, downcast='infer')
expected = pd.DataFrame({'a': [1, 0]})
assert_frame_equal(result, expected)
def test_fillna_dtype_conversion(self):
# make sure that fillna on an empty frame works
df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
result = df.get_dtype_counts().sort_values()
expected = Series({'object': 5})
assert_series_equal(result, expected)
result = df.fillna(1)
expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
result = result.get_dtype_counts().sort_values()
expected = Series({'int64': 5})
assert_series_equal(result, expected)
# empty block
df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
result = df.fillna('nan')
expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
assert_frame_equal(result, expected)
# equiv of replace
df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
for v in ['', 1, np.nan, 1.0]:
expected = df.replace(np.nan, v)
result = df.fillna(v)
assert_frame_equal(result, expected)
def test_fillna_datetime_columns(self):
# GH 7095
df = pd.DataFrame({'A': [-1, -2, np.nan],
'B': date_range('20130101', periods=3),
'C': ['foo', 'bar', None],
'D': ['foo2', 'bar2', None]},
index=date_range('20130110', periods=3))
result = df.fillna('?')
expected = pd.DataFrame({'A': [-1, -2, '?'],
'B': date_range('20130101', periods=3),
'C': ['foo', 'bar', '?'],
'D': ['foo2', 'bar2', '?']},
index=date_range('20130110', periods=3))
tm.assert_frame_equal(result, expected)
df = pd.DataFrame({'A': [-1, -2, np.nan],
'B': [pd.Timestamp('2013-01-01'),
pd.Timestamp('2013-01-02'), pd.NaT],
'C': ['foo', 'bar', None],
'D': ['foo2', 'bar2', None]},
index=date_range('20130110', periods=3))
result = df.fillna('?')
expected = pd.DataFrame({'A': [-1, -2, '?'],
'B': [pd.Timestamp('2013-01-01'),
pd.Timestamp('2013-01-02'), '?'],
'C': ['foo', 'bar', '?'],
'D': ['foo2', 'bar2', '?']},
index=pd.date_range('20130110', periods=3))
tm.assert_frame_equal(result, expected)
def test_ffill(self):
self.tsframe['A'][:5] = np.nan
self.tsframe['A'][-5:] = np.nan
assert_frame_equal(self.tsframe.ffill(),
self.tsframe.fillna(method='ffill'))
def test_bfill(self):
self.tsframe['A'][:5] = np.nan
self.tsframe['A'][-5:] = np.nan
assert_frame_equal(self.tsframe.bfill(),
self.tsframe.fillna(method='bfill'))
def test_frame_pad_backfill_limit(self):
index = np.arange(10)
df = DataFrame(np.random.randn(10, 4), index=index)
result = df[:2].reindex(index, method='pad', limit=5)
expected = df[:2].reindex(index).fillna(method='pad')
expected.values[-3:] = np.nan
tm.assert_frame_equal(result, expected)
result = df[-2:].reindex(index, method='backfill', limit=5)
expected = df[-2:].reindex(index).fillna(method='backfill')
expected.values[:3] = np.nan
tm.assert_frame_equal(result, expected)
def test_frame_fillna_limit(self):
index = np.arange(10)
df = DataFrame(np.random.randn(10, 4), index=index)
result = df[:2].reindex(index)
result = result.fillna(method='pad', limit=5)
expected = df[:2].reindex(index).fillna(method='pad')
expected.values[-3:] = np.nan
tm.assert_frame_equal(result, expected)
result = df[-2:].reindex(index)
result = result.fillna(method='backfill', limit=5)
expected = df[-2:].reindex(index).fillna(method='backfill')
expected.values[:3] = np.nan
tm.assert_frame_equal(result, expected)
def test_fillna_skip_certain_blocks(self):
# don't try to fill boolean, int blocks
df = DataFrame(np.random.randn(10, 4).astype(int))
# it works!
df.fillna(np.nan)
def test_fillna_inplace(self):
df = DataFrame(np.random.randn(10, 4))
df[1][:4] = np.nan
df[3][-4:] = np.nan
expected = df.fillna(value=0)
assert expected is not df
df.fillna(value=0, inplace=True)
tm.assert_frame_equal(df, expected)
expected = df.fillna(value={0: 0}, inplace=True)
assert expected is None
df[1][:4] = np.nan
df[3][-4:] = np.nan
expected = df.fillna(method='ffill')
assert expected is not df
df.fillna(method='ffill', inplace=True)
tm.assert_frame_equal(df, expected)
def test_fillna_dict_series(self):
df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
'b': [1, 2, 3, np.nan, np.nan],
'c': [np.nan, 1, 2, 3, 4]})
result = df.fillna({'a': 0, 'b': 5})
expected = df.copy()
expected['a'] = expected['a'].fillna(0)
expected['b'] = expected['b'].fillna(5)
assert_frame_equal(result, expected)
# it works
result = df.fillna({'a': 0, 'b': 5, 'd': 7})
# Series treated same as dict
result = df.fillna(df.max())
expected = df.fillna(df.max().to_dict())
assert_frame_equal(result, expected)
# disable this for now
with pytest.raises(NotImplementedError, match='column by column'):
df.fillna(df.max(1), axis=1)
def test_fillna_dataframe(self):
# GH 8377
df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
'b': [1, 2, 3, np.nan, np.nan],
'c': [np.nan, 1, 2, 3, 4]},
index=list('VWXYZ'))
# df2 may have different index and columns
df2 = DataFrame({'a': [np.nan, 10, 20, 30, 40],
'b': [50, 60, 70, 80, 90],
'foo': ['bar'] * 5},
index=list('VWXuZ'))
result = df.fillna(df2)
# only those columns and indices which are shared get filled
expected = DataFrame({'a': [np.nan, 1, 2, np.nan, 40],
'b': [1, 2, 3, np.nan, 90],
'c': [np.nan, 1, 2, 3, 4]},
index=list('VWXYZ'))
assert_frame_equal(result, expected)
def test_fillna_columns(self):
df = DataFrame(np.random.randn(10, 10))
df.values[:, ::2] = np.nan
result = df.fillna(method='ffill', axis=1)
expected = df.T.fillna(method='pad').T
assert_frame_equal(result, expected)
df.insert(6, 'foo', 5)
result = df.fillna(method='ffill', axis=1)
expected = df.astype(float).fillna(method='ffill', axis=1)
assert_frame_equal(result, expected)
def test_fillna_invalid_method(self):
with pytest.raises(ValueError, match='ffil'):
self.frame.fillna(method='ffil')
def test_fillna_invalid_value(self):
# list
pytest.raises(TypeError, self.frame.fillna, [1, 2])
# tuple
pytest.raises(TypeError, self.frame.fillna, (1, 2))
# frame with series
pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame)
def test_fillna_col_reordering(self):
cols = ["COL." + str(i) for i in range(5, 0, -1)]
data = np.random.rand(20, 5)
df = DataFrame(index=lrange(20), columns=cols, data=data)
filled = df.fillna(method='ffill')
assert df.columns.tolist() == filled.columns.tolist()
def test_fill_corner(self):
mf = self.mixed_frame
mf.loc[mf.index[5:20], 'foo'] = np.nan
mf.loc[mf.index[-10:], 'A'] = np.nan
filled = self.mixed_frame.fillna(value=0)
assert (filled.loc[filled.index[5:20], 'foo'] == 0).all()
del self.mixed_frame['foo']
empty_float = self.frame.reindex(columns=[])
# TODO(wesm): unused?
result = empty_float.fillna(value=0) # noqa
def test_fill_value_when_combine_const(self):
# GH12723
dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
df = DataFrame({'foo': dat}, index=range(6))
exp = df.fillna(0).add(2)
res = df.add(2, fill_value=0)
assert_frame_equal(res, exp)
class TestDataFrameInterpolate(TestData):
def test_interp_basic(self):
df = DataFrame({'A': [1, 2, np.nan, 4],
'B': [1, 4, 9, np.nan],
'C': [1, 2, 3, 5],
'D': list('abcd')})
expected = DataFrame({'A': [1., 2., 3., 4.],
'B': [1., 4., 9., 9.],
'C': [1, 2, 3, 5],
'D': list('abcd')})
result = df.interpolate()
assert_frame_equal(result, expected)
result = df.set_index('C').interpolate()
expected = df.set_index('C')
expected.loc[3, 'A'] = 3
expected.loc[5, 'B'] = 9
assert_frame_equal(result, expected)
def test_interp_bad_method(self):
df = DataFrame({'A': [1, 2, np.nan, 4],
'B': [1, 4, 9, np.nan],
'C': [1, 2, 3, 5],
'D': list('abcd')})
with pytest.raises(ValueError):
df.interpolate(method='not_a_method')
def test_interp_combo(self):
df = DataFrame({'A': [1., 2., np.nan, 4.],
'B': [1, 4, 9, np.nan],
'C': [1, 2, 3, 5],
'D': list('abcd')})
result = df['A'].interpolate()
expected = Series([1., 2., 3., 4.], name='A')
assert_series_equal(result, expected)
result = df['A'].interpolate(downcast='infer')
expected = Series([1, 2, 3, 4], name='A')
assert_series_equal(result, expected)
def test_interp_nan_idx(self):
df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
df = df.set_index('A')
with pytest.raises(NotImplementedError):
df.interpolate(method='values')
@td.skip_if_no_scipy
def test_interp_various(self):
df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
'C': [1, 2, 3, 5, 8, 13, 21]})
df = df.set_index('C')
expected = df.copy()
result = df.interpolate(method='polynomial', order=1)
expected.A.loc[3] = 2.66666667
expected.A.loc[13] = 5.76923076
assert_frame_equal(result, expected)
result = df.interpolate(method='cubic')
# GH #15662.
# new cubic and quadratic interpolation algorithms from scipy 0.19.0.
# previously `splmake` was used. See scipy/scipy#6710
if _is_scipy_ge_0190:
expected.A.loc[3] = 2.81547781
expected.A.loc[13] = 5.52964175
else:
expected.A.loc[3] = 2.81621174
expected.A.loc[13] = 5.64146581
assert_frame_equal(result, expected)
result = df.interpolate(method='nearest')
expected.A.loc[3] = 2
expected.A.loc[13] = 5
assert_frame_equal(result, expected, check_dtype=False)
result = df.interpolate(method='quadratic')
if _is_scipy_ge_0190:
expected.A.loc[3] = 2.82150771
expected.A.loc[13] = 6.12648668
else:
expected.A.loc[3] = 2.82533638
expected.A.loc[13] = 6.02817974
assert_frame_equal(result, expected)
result = df.interpolate(method='slinear')
expected.A.loc[3] = 2.66666667
expected.A.loc[13] = 5.76923077
assert_frame_equal(result, expected)
result = df.interpolate(method='zero')
expected.A.loc[3] = 2.
expected.A.loc[13] = 5
assert_frame_equal(result, expected, check_dtype=False)
@td.skip_if_no_scipy
def test_interp_alt_scipy(self):
df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
'C': [1, 2, 3, 5, 8, 13, 21]})
result = df.interpolate(method='barycentric')
expected = df.copy()
expected.loc[2, 'A'] = 3
expected.loc[5, 'A'] = 6
assert_frame_equal(result, expected)
result = df.interpolate(method='barycentric', downcast='infer')
assert_frame_equal(result, expected.astype(np.int64))
result = df.interpolate(method='krogh')
expectedk = df.copy()
expectedk['A'] = expected['A']
assert_frame_equal(result, expectedk)
_skip_if_no_pchip()
import scipy
result = df.interpolate(method='pchip')
expected.loc[2, 'A'] = 3
if LooseVersion(scipy.__version__) >= LooseVersion('0.17.0'):
expected.loc[5, 'A'] = 6.0
else:
expected.loc[5, 'A'] = 6.125
assert_frame_equal(result, expected)
def test_interp_rowwise(self):
df = DataFrame({0: [1, 2, np.nan, 4],
1: [2, 3, 4, np.nan],
2: [np.nan, 4, 5, 6],
3: [4, np.nan, 6, 7],
4: [1, 2, 3, 4]})
result = df.interpolate(axis=1)
expected = df.copy()
expected.loc[3, 1] = 5
expected.loc[0, 2] = 3
expected.loc[1, 3] = 3
expected[4] = expected[4].astype(np.float64)
assert_frame_equal(result, expected)
result = df.interpolate(axis=1, method='values')
assert_frame_equal(result, expected)
result = df.interpolate(axis=0)
expected = df.interpolate()
assert_frame_equal(result, expected)
def test_rowwise_alt(self):
df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64],
1: [1, 2, 3, 4, 3, 2, 1, 0, -1]})
df.interpolate(axis=0)
@pytest.mark.parametrize("check_scipy", [
False, pytest.param(True, marks=td.skip_if_no_scipy)
])
def test_interp_leading_nans(self, check_scipy):
df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
"B": [np.nan, -3, -3.5, np.nan, -4]})
result = df.interpolate()
expected = df.copy()
expected['B'].loc[3] = -3.75
assert_frame_equal(result, expected)
if check_scipy:
result = df.interpolate(method='polynomial', order=1)
assert_frame_equal(result, expected)
def test_interp_raise_on_only_mixed(self):
df = DataFrame({'A': [1, 2, np.nan, 4],
'B': ['a', 'b', 'c', 'd'],
'C': [np.nan, 2, 5, 7],
'D': [np.nan, np.nan, 9, 9],
'E': [1, 2, 3, 4]})
with pytest.raises(TypeError):
df.interpolate(axis=1)
def test_interp_raise_on_all_object_dtype(self):
# GH 22985
df = DataFrame({
'A': [1, 2, 3],
'B': [4, 5, 6]},
dtype='object')
msg = ("Cannot interpolate with all object-dtype columns "
"in the DataFrame. Try setting at least one "
"column to a numeric dtype.")
with pytest.raises(TypeError, match=msg):
df.interpolate()
def test_interp_inplace(self):
df = DataFrame({'a': [1., 2., np.nan, 4.]})
expected = DataFrame({'a': [1., 2., 3., 4.]})
result = df.copy()
result['a'].interpolate(inplace=True)
assert_frame_equal(result, expected)
result = df.copy()
result['a'].interpolate(inplace=True, downcast='infer')
assert_frame_equal(result, expected.astype('int64'))
def test_interp_inplace_row(self):
# GH 10395
result = DataFrame({'a': [1., 2., 3., 4.],
'b': [np.nan, 2., 3., 4.],
'c': [3, 2, 2, 2]})
expected = result.interpolate(method='linear', axis=1, inplace=False)
result.interpolate(method='linear', axis=1, inplace=True)
assert_frame_equal(result, expected)
def test_interp_ignore_all_good(self):
# GH
df = DataFrame({'A': [1, 2, np.nan, 4],
'B': [1, 2, 3, 4],
'C': [1., 2., np.nan, 4.],
'D': [1., 2., 3., 4.]})
expected = DataFrame({'A': np.array(
[1, 2, 3, 4], dtype='float64'),
'B': np.array(
[1, 2, 3, 4], dtype='int64'),
'C': np.array(
[1., 2., 3, 4.], dtype='float64'),
'D': np.array(
[1., 2., 3., 4.], dtype='float64')})
result = df.interpolate(downcast=None)
assert_frame_equal(result, expected)
# all good
result = df[['B', 'D']].interpolate(downcast=None)
assert_frame_equal(result, df[['B', 'D']])
@@ -0,0 +1,280 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import numpy as np
import pytest
from pandas.compat import PY36, lrange, range
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal
# Column add, remove, delete.
class TestDataFrameMutateColumns(TestData):
def test_assign(self):
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
original = df.copy()
result = df.assign(C=df.B / df.A)
expected = df.copy()
expected['C'] = [4, 2.5, 2]
assert_frame_equal(result, expected)
# lambda syntax
result = df.assign(C=lambda x: x.B / x.A)
assert_frame_equal(result, expected)
# original is unmodified
assert_frame_equal(df, original)
# Non-Series array-like
result = df.assign(C=[4, 2.5, 2])
assert_frame_equal(result, expected)
# original is unmodified
assert_frame_equal(df, original)
result = df.assign(B=df.B / df.A)
expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
assert_frame_equal(result, expected)
# overwrite
result = df.assign(A=df.A + df.B)
expected = df.copy()
expected['A'] = [5, 7, 9]
assert_frame_equal(result, expected)
# lambda
result = df.assign(A=lambda x: x.A + x.B)
assert_frame_equal(result, expected)
def test_assign_multiple(self):
df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B'])
result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5],
[3, 6, 9, 3, 6]], columns=list('ABCDE'))
assert_frame_equal(result, expected)
def test_assign_order(self):
# GH 9818
df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
result = df.assign(D=df.A + df.B, C=df.A - df.B)
if PY36:
expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]],
columns=list('ABDC'))
else:
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
columns=list('ABCD'))
assert_frame_equal(result, expected)
result = df.assign(C=df.A - df.B, D=df.A + df.B)
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
columns=list('ABCD'))
assert_frame_equal(result, expected)
def test_assign_bad(self):
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
# non-keyword argument
with pytest.raises(TypeError):
df.assign(lambda x: x.A)
with pytest.raises(AttributeError):
df.assign(C=df.A, D=df.A + df.C)
@pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python
3.6 and above""")
def test_assign_dependent_old_python(self):
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
# Key C does not exist at definition time of df
with pytest.raises(KeyError):
df.assign(C=lambda df: df.A,
D=lambda df: df['A'] + df['C'])
with pytest.raises(KeyError):
df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
@pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for
python 3.5 and below""")
def test_assign_dependent(self):
df = DataFrame({'A': [1, 2], 'B': [3, 4]})
result = df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
columns=list('ABCD'))
assert_frame_equal(result, expected)
result = df.assign(C=lambda df: df.A,
D=lambda df: df['A'] + df['C'])
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
columns=list('ABCD'))
assert_frame_equal(result, expected)
def test_insert_error_msmgs(self):
# GH 7432
df = DataFrame({'foo': ['a', 'b', 'c'], 'bar': [
1, 2, 3], 'baz': ['d', 'e', 'f']}).set_index('foo')
s = DataFrame({'foo': ['a', 'b', 'c', 'a'], 'fiz': [
'g', 'h', 'i', 'j']}).set_index('foo')
msg = 'cannot reindex from a duplicate axis'
with pytest.raises(ValueError, match=msg):
df['newcol'] = s
# GH 4107, more descriptive error message
df = DataFrame(np.random.randint(0, 2, (4, 4)),
columns=['a', 'b', 'c', 'd'])
msg = 'incompatible index of inserted column with frame index'
with pytest.raises(TypeError, match=msg):
df['gr'] = df.groupby(['b', 'c']).count()
def test_insert_benchmark(self):
# from the vb_suite/frame_methods/frame_insert_columns
N = 10
K = 5
df = DataFrame(index=lrange(N))
new_col = np.random.randn(N)
for i in range(K):
df[i] = new_col
expected = DataFrame(np.repeat(new_col, K).reshape(N, K),
index=lrange(N))
assert_frame_equal(df, expected)
def test_insert(self):
df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
columns=['c', 'b', 'a'])
df.insert(0, 'foo', df['a'])
tm.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a']))
tm.assert_series_equal(df['a'], df['foo'], check_names=False)
df.insert(2, 'bar', df['c'])
tm.assert_index_equal(df.columns,
Index(['foo', 'c', 'bar', 'b', 'a']))
tm.assert_almost_equal(df['c'], df['bar'], check_names=False)
# diff dtype
# new item
df['x'] = df['a'].astype('float32')
result = Series(dict(float32=1, float64=5))
assert (df.get_dtype_counts().sort_index() == result).all()
# replacing current (in different block)
df['a'] = df['a'].astype('float32')
result = Series(dict(float32=2, float64=4))
assert (df.get_dtype_counts().sort_index() == result).all()
df['y'] = df['a'].astype('int32')
result = Series(dict(float32=2, float64=4, int32=1))
assert (df.get_dtype_counts().sort_index() == result).all()
with pytest.raises(ValueError, match='already exists'):
df.insert(1, 'a', df['b'])
pytest.raises(ValueError, df.insert, 1, 'c', df['b'])
df.columns.name = 'some_name'
# preserve columns name field
df.insert(0, 'baz', df['c'])
assert df.columns.name == 'some_name'
# GH 13522
df = DataFrame(index=['A', 'B', 'C'])
df['X'] = df.index
df['X'] = ['x', 'y', 'z']
exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
assert_frame_equal(df, exp)
def test_delitem(self):
del self.frame['A']
assert 'A' not in self.frame
def test_delitem_multiindex(self):
midx = MultiIndex.from_product([['A', 'B'], [1, 2]])
df = DataFrame(np.random.randn(4, 4), columns=midx)
assert len(df.columns) == 4
assert ('A', ) in df.columns
assert 'A' in df.columns
result = df['A']
assert isinstance(result, DataFrame)
del df['A']
assert len(df.columns) == 2
# A still in the levels, BUT get a KeyError if trying
# to delete
assert ('A', ) not in df.columns
with pytest.raises(KeyError):
del df[('A',)]
# behavior of dropped/deleted MultiIndex levels changed from
# GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
# levels which are dropped/deleted
assert 'A' not in df.columns
with pytest.raises(KeyError):
del df['A']
def test_pop(self):
self.frame.columns.name = 'baz'
self.frame.pop('A')
assert 'A' not in self.frame
self.frame['foo'] = 'bar'
self.frame.pop('foo')
assert 'foo' not in self.frame
assert self.frame.columns.name == 'baz'
# gh-10912: inplace ops cause caching issue
a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[
'A', 'B', 'C'], index=['X', 'Y'])
b = a.pop('B')
b += 1
# original frame
expected = DataFrame([[1, 3], [4, 6]], columns=[
'A', 'C'], index=['X', 'Y'])
tm.assert_frame_equal(a, expected)
# result
expected = Series([2, 5], index=['X', 'Y'], name='B') + 1
tm.assert_series_equal(b, expected)
def test_pop_non_unique_cols(self):
df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
df.columns = ["a", "b", "a"]
res = df.pop("a")
assert type(res) == DataFrame
assert len(res) == 2
assert len(df.columns) == 1
assert "b" in df.columns
assert "a" not in df.columns
assert len(df.index) == 2
def test_insert_column_bug_4032(self):
# GH4032, inserting a column and renaming causing errors
df = DataFrame({'b': [1.1, 2.2]})
df = df.rename(columns={})
df.insert(0, 'a', [1, 2])
result = df.rename(columns={})
str(result)
expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
assert_frame_equal(result, expected)
df.insert(0, 'c', [1.3, 2.3])
result = df.rename(columns={})
str(result)
expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
columns=['c', 'a', 'b'])
assert_frame_equal(result, expected)
@@ -0,0 +1,477 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import numpy as np
import pytest
from pandas.compat import lrange, u
import pandas as pd
from pandas import DataFrame, MultiIndex, Series, date_range
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestDataFrameNonuniqueIndexes(TestData):
def test_column_dups_operations(self):
def check(result, expected=None):
if expected is not None:
assert_frame_equal(result, expected)
result.dtypes
str(result)
# assignment
# GH 3687
arr = np.random.randn(3, 2)
idx = lrange(2)
df = DataFrame(arr, columns=['A', 'A'])
df.columns = idx
expected = DataFrame(arr, columns=idx)
check(df, expected)
idx = date_range('20130101', periods=4, freq='Q-NOV')
df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
columns=['a', 'a', 'a', 'a'])
df.columns = idx
expected = DataFrame(
[[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
check(df, expected)
# insert
df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
columns=['foo', 'bar', 'foo', 'hello'])
df['string'] = 'bah'
expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
[2, 1, 3, 5, 'bah']],
columns=['foo', 'bar', 'foo', 'hello', 'string'])
check(df, expected)
with pytest.raises(ValueError, match='Length of value'):
df.insert(0, 'AnotherColumn', range(len(df.index) - 1))
# insert same dtype
df['foo2'] = 3
expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
[2, 1, 3, 5, 'bah', 3]],
columns=['foo', 'bar', 'foo', 'hello',
'string', 'foo2'])
check(df, expected)
# set (non-dup)
df['foo2'] = 4
expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
[2, 1, 3, 5, 'bah', 4]],
columns=['foo', 'bar', 'foo', 'hello',
'string', 'foo2'])
check(df, expected)
df['foo2'] = 3
# delete (non dup)
del df['bar']
expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
[2, 3, 5, 'bah', 3]],
columns=['foo', 'foo', 'hello', 'string', 'foo2'])
check(df, expected)
# try to delete again (its not consolidated)
del df['hello']
expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
[2, 3, 'bah', 3]],
columns=['foo', 'foo', 'string', 'foo2'])
check(df, expected)
# consolidate
df = df._consolidate()
expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
[2, 3, 'bah', 3]],
columns=['foo', 'foo', 'string', 'foo2'])
check(df, expected)
# insert
df.insert(2, 'new_col', 5.)
expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
[2, 3, 5., 'bah', 3]],
columns=['foo', 'foo', 'new_col', 'string',
'foo2'])
check(df, expected)
# insert a dup
with pytest.raises(ValueError, match='cannot insert'):
df.insert(2, 'new_col', 4.)
df.insert(2, 'new_col', 4., allow_duplicates=True)
expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
[1, 2, 4., 5., 'bah', 3],
[2, 3, 4., 5., 'bah', 3]],
columns=['foo', 'foo', 'new_col',
'new_col', 'string', 'foo2'])
check(df, expected)
# delete (dup)
del df['foo']
expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
[4., 5., 'bah', 3]],
columns=['new_col', 'new_col', 'string', 'foo2'])
assert_frame_equal(df, expected)
# dup across dtypes
df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
columns=['foo', 'bar', 'foo', 'hello'])
check(df)
df['foo2'] = 7.
expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
[2, 1, 3., 5, 7.]],
columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
check(df, expected)
result = df['foo']
expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
columns=['foo', 'foo'])
check(result, expected)
# multiple replacements
df['foo'] = 'string'
expected = DataFrame([['string', 1, 'string', 5, 7.],
['string', 1, 'string', 5, 7.],
['string', 1, 'string', 5, 7.]],
columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
check(df, expected)
del df['foo']
expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
'bar', 'hello', 'foo2'])
check(df, expected)
# values
df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
result = df.values
expected = np.array([[1, 2.5], [3, 4.5]])
assert (result == expected).all().all()
# rename, GH 4403
df4 = DataFrame(
{'RT': [0.0454],
'TClose': [22.02],
'TExg': [0.0422]},
index=MultiIndex.from_tuples([(600809, 20130331)],
names=['STK_ID', 'RPT_Date']))
df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331],
'STK_ID': [600809] * 3,
'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
'TClose': [38.05, 41.66, 30.01]},
index=MultiIndex.from_tuples(
[(600809, 20120930),
(600809, 20121231),
(600809, 20130331)],
names=['STK_ID', 'RPT_Date']))
k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
result = k.rename(
columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
str(result)
result.dtypes
expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
u('饡驦'), 30.01]],
columns=['RT', 'TClose', 'TExg',
'RPT_Date', 'STK_ID', 'STK_Name',
'QT_Close'])
.set_index(['STK_ID', 'RPT_Date'], drop=False))
assert_frame_equal(result, expected)
# reindex is invalid!
df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
columns=['bar', 'a', 'a'])
pytest.raises(ValueError, df.reindex, columns=['bar'])
pytest.raises(ValueError, df.reindex, columns=['bar', 'foo'])
# drop
df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
columns=['bar', 'a', 'a'])
result = df.drop(['a'], axis=1)
expected = DataFrame([[1], [1], [1]], columns=['bar'])
check(result, expected)
result = df.drop('a', axis=1)
check(result, expected)
# describe
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
columns=['bar', 'a', 'a'], dtype='float64')
result = df.describe()
s = df.iloc[:, 0].describe()
expected = pd.concat([s, s, s], keys=df.columns, axis=1)
check(result, expected)
# check column dups with index equal and not equal to df's index
df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
columns=['A', 'B', 'A'])
for index in [df.index, pd.Index(list('edcba'))]:
this_df = df.copy()
expected_ser = pd.Series(index.values, index=this_df.index)
expected_df = DataFrame({'A': expected_ser,
'B': this_df['B'],
'A': expected_ser},
columns=['A', 'B', 'A'])
this_df['A'] = index
check(this_df, expected_df)
# operations
for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
expected = getattr(df, op)(df)
expected.columns = ['A', 'A']
df.columns = ['A', 'A']
result = getattr(df, op)(df)
check(result, expected)
# multiple assignments that change dtypes
# the location indexer is a slice
# GH 6120
df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])
df['that'] = 1.0
check(df, expected)
df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
expected = DataFrame(1, index=range(5), columns=['that', 'that'])
df['that'] = 1
check(df, expected)
def test_column_dups2(self):
# drop buggy GH 6240
df = DataFrame({'A': np.random.randn(5),
'B': np.random.randn(5),
'C': np.random.randn(5),
'D': ['a', 'b', 'c', 'd', 'e']})
expected = df.take([0, 1, 1], axis=1)
df2 = df.take([2, 0, 1, 2, 1], axis=1)
result = df2.drop('C', axis=1)
assert_frame_equal(result, expected)
# dropna
df = DataFrame({'A': np.random.randn(5),
'B': np.random.randn(5),
'C': np.random.randn(5),
'D': ['a', 'b', 'c', 'd', 'e']})
df.iloc[2, [0, 1, 2]] = np.nan
df.iloc[0, 0] = np.nan
df.iloc[1, 1] = np.nan
df.iloc[:, 3] = np.nan
expected = df.dropna(subset=['A', 'B', 'C'], how='all')
expected.columns = ['A', 'A', 'B', 'C']
df.columns = ['A', 'A', 'B', 'C']
result = df.dropna(subset=['A', 'C'], how='all')
assert_frame_equal(result, expected)
def test_column_dups_indexing(self):
def check(result, expected=None):
if expected is not None:
assert_frame_equal(result, expected)
result.dtypes
str(result)
# boolean indexing
# GH 4879
dups = ['A', 'A', 'C', 'D']
df = DataFrame(np.arange(12).reshape(3, 4), columns=[
'A', 'B', 'C', 'D'], dtype='float64')
expected = df[df.C > 6]
expected.columns = dups
df = DataFrame(np.arange(12).reshape(3, 4),
columns=dups, dtype='float64')
result = df[df.C > 6]
check(result, expected)
# where
df = DataFrame(np.arange(12).reshape(3, 4), columns=[
'A', 'B', 'C', 'D'], dtype='float64')
expected = df[df > 6]
expected.columns = dups
df = DataFrame(np.arange(12).reshape(3, 4),
columns=dups, dtype='float64')
result = df[df > 6]
check(result, expected)
# boolean with the duplicate raises
df = DataFrame(np.arange(12).reshape(3, 4),
columns=dups, dtype='float64')
pytest.raises(ValueError, lambda: df[df.A > 6])
# dup aligining operations should work
# GH 5185
df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
result = df1.sub(df2)
assert_frame_equal(result, expected)
# equality
df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
columns=['A', 'B'])
df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
columns=['A', 'A'])
# not-comparing like-labelled
pytest.raises(ValueError, lambda: df1 == df2)
df1r = df1.reindex_like(df2)
result = df1r == df2
expected = DataFrame([[False, True], [True, False], [False, False], [
True, False]], columns=['A', 'A'])
assert_frame_equal(result, expected)
# mixed column selection
# GH 5639
dfbool = DataFrame({'one': Series([True, True, False],
index=['a', 'b', 'c']),
'two': Series([False, False, True, False],
index=['a', 'b', 'c', 'd']),
'three': Series([False, True, True, True],
index=['a', 'b', 'c', 'd'])})
expected = pd.concat(
[dfbool['one'], dfbool['three'], dfbool['one']], axis=1)
result = dfbool[['one', 'three', 'one']]
check(result, expected)
# multi-axis dups
# GH 6121
df = DataFrame(np.arange(25.).reshape(5, 5),
index=['a', 'b', 'c', 'd', 'e'],
columns=['A', 'B', 'C', 'D', 'E'])
z = df[['A', 'C', 'A']].copy()
expected = z.loc[['a', 'c', 'a']]
df = DataFrame(np.arange(25.).reshape(5, 5),
index=['a', 'b', 'c', 'd', 'e'],
columns=['A', 'B', 'C', 'D', 'E'])
z = df[['A', 'C', 'A']]
result = z.loc[['a', 'c', 'a']]
check(result, expected)
def test_column_dups_indexing2(self):
# GH 8363
# datetime ops with a non-unique index
df = DataFrame({'A': np.arange(5, dtype='int64'),
'B': np.arange(1, 6, dtype='int64')},
index=[2, 2, 3, 3, 4])
result = df.B - df.A
expected = Series(1, index=[2, 2, 3, 3, 4])
assert_series_equal(result, expected)
df = DataFrame({'A': date_range('20130101', periods=5),
'B': date_range('20130101 09:00:00', periods=5)},
index=[2, 2, 3, 3, 4])
result = df.B - df.A
expected = Series(pd.Timedelta('9 hours'), index=[2, 2, 3, 3, 4])
assert_series_equal(result, expected)
def test_columns_with_dups(self):
# GH 3468 related
# basic
df = DataFrame([[1, 2]], columns=['a', 'a'])
df.columns = ['a', 'a.1']
str(df)
expected = DataFrame([[1, 2]], columns=['a', 'a.1'])
assert_frame_equal(df, expected)
df = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a'])
df.columns = ['b', 'a', 'a.1']
str(df)
expected = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a.1'])
assert_frame_equal(df, expected)
# with a dup index
df = DataFrame([[1, 2]], columns=['a', 'a'])
df.columns = ['b', 'b']
str(df)
expected = DataFrame([[1, 2]], columns=['b', 'b'])
assert_frame_equal(df, expected)
# multi-dtype
df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
columns=['a', 'a', 'b', 'b', 'd', 'c', 'c'])
df.columns = list('ABCDEFG')
str(df)
expected = DataFrame(
[[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('ABCDEFG'))
assert_frame_equal(df, expected)
# this is an error because we cannot disambiguate the dup columns
pytest.raises(Exception, lambda x: DataFrame(
[[1, 2, 'foo', 'bar']], columns=['a', 'a', 'a', 'a']))
# dups across blocks
df_float = DataFrame(np.random.randn(10, 3), dtype='float64')
df_int = DataFrame(np.random.randn(10, 3), dtype='int64')
df_bool = DataFrame(True, index=df_float.index,
columns=df_float.columns)
df_object = DataFrame('foo', index=df_float.index,
columns=df_float.columns)
df_dt = DataFrame(pd.Timestamp('20010101'),
index=df_float.index,
columns=df_float.columns)
df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
assert len(df._data._blknos) == len(df.columns)
assert len(df._data._blklocs) == len(df.columns)
# testing iloc
for i in range(len(df.columns)):
df.iloc[:, i]
# dup columns across dtype GH 2079/2194
vals = [[1, -1, 2.], [2, -2, 3.]]
rs = DataFrame(vals, columns=['A', 'A', 'B'])
xp = DataFrame(vals)
xp.columns = ['A', 'A', 'B']
assert_frame_equal(rs, xp)
def test_values_duplicates(self):
df = DataFrame([[1, 2, 'a', 'b'],
[1, 2, 'a', 'b']],
columns=['one', 'one', 'two', 'two'])
result = df.values
expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']],
dtype=object)
tm.assert_numpy_array_equal(result, expected)
def test_set_value_by_index(self):
# See gh-12344
df = DataFrame(np.arange(9).reshape(3, 3).T)
df.columns = list('AAA')
expected = df.iloc[:, 2]
df.iloc[:, 0] = 3
assert_series_equal(df.iloc[:, 2], expected)
df = DataFrame(np.arange(9).reshape(3, 3).T)
df.columns = [2, float(2), str(2)]
expected = df.iloc[:, 1]
df.iloc[:, 0] = 3
assert_series_equal(df.iloc[:, 1], expected)
def test_insert_with_columns_dups(self):
# GH 14291
df = pd.DataFrame()
df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True)
df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True)
df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True)
exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'],
['c', 'f', 'i']], columns=['A', 'A', 'A'])
assert_frame_equal(df, exp)
@@ -0,0 +1,802 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from decimal import Decimal
import operator
import numpy as np
import pytest
from pandas.compat import range
import pandas as pd
from pandas import DataFrame, MultiIndex, Series, compat
import pandas.core.common as com
from pandas.tests.frame.common import TestData, _check_mixed_float
import pandas.util.testing as tm
from pandas.util.testing import (
assert_frame_equal, assert_numpy_array_equal, assert_series_equal)
class TestDataFrameUnaryOperators(object):
# __pos__, __neg__, __inv__
@pytest.mark.parametrize('df,expected', [
(pd.DataFrame({'a': [-1, 1]}), pd.DataFrame({'a': [1, -1]})),
(pd.DataFrame({'a': [False, True]}),
pd.DataFrame({'a': [True, False]})),
(pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}),
pd.DataFrame({'a': pd.Series(pd.to_timedelta([1, -1]))}))
])
def test_neg_numeric(self, df, expected):
assert_frame_equal(-df, expected)
assert_series_equal(-df['a'], expected['a'])
@pytest.mark.parametrize('df, expected', [
(np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)),
([Decimal('1.0'), Decimal('2.0')], [Decimal('-1.0'), Decimal('-2.0')]),
])
def test_neg_object(self, df, expected):
# GH#21380
df = pd.DataFrame({'a': df})
expected = pd.DataFrame({'a': expected})
assert_frame_equal(-df, expected)
assert_series_equal(-df['a'], expected['a'])
@pytest.mark.parametrize('df', [
pd.DataFrame({'a': ['a', 'b']}),
pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}),
])
def test_neg_raises(self, df):
with pytest.raises(TypeError):
(- df)
with pytest.raises(TypeError):
(- df['a'])
def test_invert(self):
_seriesd = tm.getSeriesData()
df = pd.DataFrame(_seriesd)
assert_frame_equal(-(df < 0), ~(df < 0))
@pytest.mark.parametrize('df', [
pd.DataFrame({'a': [-1, 1]}),
pd.DataFrame({'a': [False, True]}),
pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}),
])
def test_pos_numeric(self, df):
# GH#16073
assert_frame_equal(+df, df)
assert_series_equal(+df['a'], df['a'])
@pytest.mark.parametrize('df', [
# numpy changing behavior in the future
pytest.param(pd.DataFrame({'a': ['a', 'b']}),
marks=[pytest.mark.filterwarnings("ignore")]),
pd.DataFrame({'a': np.array([-1, 2], dtype=object)}),
pd.DataFrame({'a': [Decimal('-1.0'), Decimal('2.0')]}),
])
def test_pos_object(self, df):
# GH#21380
assert_frame_equal(+df, df)
assert_series_equal(+df['a'], df['a'])
@pytest.mark.parametrize('df', [
pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}),
])
def test_pos_raises(self, df):
with pytest.raises(TypeError):
(+ df)
with pytest.raises(TypeError):
(+ df['a'])
class TestDataFrameLogicalOperators(object):
# &, |, ^
def test_logical_ops_empty_frame(self):
# GH#5808
# empty frames, non-mixed dtype
df = DataFrame(index=[1])
result = df & df
assert_frame_equal(result, df)
result = df | df
assert_frame_equal(result, df)
df2 = DataFrame(index=[1, 2])
result = df & df2
assert_frame_equal(result, df2)
dfa = DataFrame(index=[1], columns=['A'])
result = dfa & dfa
assert_frame_equal(result, dfa)
def test_logical_ops_bool_frame(self):
# GH#5808
df1a_bool = DataFrame(True, index=[1], columns=['A'])
result = df1a_bool & df1a_bool
assert_frame_equal(result, df1a_bool)
result = df1a_bool | df1a_bool
assert_frame_equal(result, df1a_bool)
def test_logical_ops_int_frame(self):
# GH#5808
df1a_int = DataFrame(1, index=[1], columns=['A'])
df1a_bool = DataFrame(True, index=[1], columns=['A'])
result = df1a_int | df1a_bool
assert_frame_equal(result, df1a_int)
def test_logical_ops_invalid(self):
# GH#5808
df1 = DataFrame(1.0, index=[1], columns=['A'])
df2 = DataFrame(True, index=[1], columns=['A'])
with pytest.raises(TypeError):
df1 | df2
df1 = DataFrame('foo', index=[1], columns=['A'])
df2 = DataFrame(True, index=[1], columns=['A'])
with pytest.raises(TypeError):
df1 | df2
def test_logical_operators(self):
def _check_bin_op(op):
result = op(df1, df2)
expected = DataFrame(op(df1.values, df2.values), index=df1.index,
columns=df1.columns)
assert result.values.dtype == np.bool_
assert_frame_equal(result, expected)
def _check_unary_op(op):
result = op(df1)
expected = DataFrame(op(df1.values), index=df1.index,
columns=df1.columns)
assert result.values.dtype == np.bool_
assert_frame_equal(result, expected)
df1 = {'a': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True},
'b': {'a': False, 'b': True, 'c': False,
'd': False, 'e': False},
'c': {'a': False, 'b': False, 'c': True,
'd': False, 'e': False},
'd': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True},
'e': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}}
df2 = {'a': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False},
'b': {'a': False, 'b': True, 'c': False,
'd': False, 'e': False},
'c': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False},
'd': {'a': False, 'b': False, 'c': False,
'd': True, 'e': False},
'e': {'a': False, 'b': False, 'c': False,
'd': False, 'e': True}}
df1 = DataFrame(df1)
df2 = DataFrame(df2)
_check_bin_op(operator.and_)
_check_bin_op(operator.or_)
_check_bin_op(operator.xor)
_check_unary_op(operator.inv) # TODO: belongs elsewhere
def test_logical_with_nas(self):
d = DataFrame({'a': [np.nan, False], 'b': [True, True]})
# GH4947
# bool comparisons should return bool
result = d['a'] | d['b']
expected = Series([False, True])
assert_series_equal(result, expected)
# GH4604, automatic casting here
result = d['a'].fillna(False) | d['b']
expected = Series([True, True])
assert_series_equal(result, expected)
result = d['a'].fillna(False, downcast=False) | d['b']
expected = Series([True, True])
assert_series_equal(result, expected)
class TestDataFrameOperators(TestData):
@pytest.mark.parametrize('op', [operator.add, operator.sub,
operator.mul, operator.truediv])
def test_operators_none_as_na(self, op):
df = DataFrame({"col1": [2, 5.0, 123, None],
"col2": [1, 2, 3, 4]}, dtype=object)
# since filling converts dtypes from object, changed expected to be
# object
filled = df.fillna(np.nan)
result = op(df, 3)
expected = op(filled, 3).astype(object)
expected[com.isna(expected)] = None
assert_frame_equal(result, expected)
result = op(df, df)
expected = op(filled, filled).astype(object)
expected[com.isna(expected)] = None
assert_frame_equal(result, expected)
result = op(df, df.fillna(7))
assert_frame_equal(result, expected)
result = op(df.fillna(7), df)
assert_frame_equal(result, expected, check_dtype=False)
@pytest.mark.parametrize('op,res', [('__eq__', False),
('__ne__', True)])
# TODO: not sure what's correct here.
@pytest.mark.filterwarnings("ignore:elementwise:FutureWarning")
def test_logical_typeerror_with_non_valid(self, op, res):
# we are comparing floats vs a string
result = getattr(self.frame, op)('foo')
assert bool(result.all().all()) is res
def test_binary_ops_align(self):
# test aligning binary ops
# GH 6681
index = MultiIndex.from_product([list('abc'),
['one', 'two', 'three'],
[1, 2, 3]],
names=['first', 'second', 'third'])
df = DataFrame(np.arange(27 * 3).reshape(27, 3),
index=index,
columns=['value1', 'value2', 'value3']).sort_index()
idx = pd.IndexSlice
for op in ['add', 'sub', 'mul', 'div', 'truediv']:
opa = getattr(operator, op, None)
if opa is None:
continue
x = Series([1.0, 10.0, 100.0], [1, 2, 3])
result = getattr(df, op)(x, level='third', axis=0)
expected = pd.concat([opa(df.loc[idx[:, :, i], :], v)
for i, v in x.iteritems()]).sort_index()
assert_frame_equal(result, expected)
x = Series([1.0, 10.0], ['two', 'three'])
result = getattr(df, op)(x, level='second', axis=0)
expected = (pd.concat([opa(df.loc[idx[:, i], :], v)
for i, v in x.iteritems()])
.reindex_like(df).sort_index())
assert_frame_equal(result, expected)
# GH9463 (alignment level of dataframe with series)
midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']])
df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx)
s = pd.Series({'a': 1, 'b': 2})
df2 = df.copy()
df2.columns.names = ['lvl0', 'lvl1']
s2 = s.copy()
s2.index.name = 'lvl1'
# different cases of integer/string level names:
res1 = df.mul(s, axis=1, level=1)
res2 = df.mul(s2, axis=1, level=1)
res3 = df2.mul(s, axis=1, level=1)
res4 = df2.mul(s2, axis=1, level=1)
res5 = df2.mul(s, axis=1, level='lvl1')
res6 = df2.mul(s2, axis=1, level='lvl1')
exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'),
columns=midx)
for res in [res1, res2]:
assert_frame_equal(res, exp)
exp.columns.names = ['lvl0', 'lvl1']
for res in [res3, res4, res5, res6]:
assert_frame_equal(res, exp)
def test_dti_tz_convert_to_utc(self):
base = pd.DatetimeIndex(['2011-01-01', '2011-01-02',
'2011-01-03'], tz='UTC')
idx1 = base.tz_convert('Asia/Tokyo')[:2]
idx2 = base.tz_convert('US/Eastern')[1:]
df1 = DataFrame({'A': [1, 2]}, index=idx1)
df2 = DataFrame({'A': [1, 1]}, index=idx2)
exp = DataFrame({'A': [np.nan, 3, np.nan]}, index=base)
assert_frame_equal(df1 + df2, exp)
def test_combineFrame(self):
frame_copy = self.frame.reindex(self.frame.index[::2])
del frame_copy['D']
frame_copy['C'][:5] = np.nan
added = self.frame + frame_copy
indexer = added['A'].dropna().index
exp = (self.frame['A'] * 2).copy()
tm.assert_series_equal(added['A'].dropna(), exp.loc[indexer])
exp.loc[~exp.index.isin(indexer)] = np.nan
tm.assert_series_equal(added['A'], exp.loc[added['A'].index])
assert np.isnan(added['C'].reindex(frame_copy.index)[:5]).all()
# assert(False)
assert np.isnan(added['D']).all()
self_added = self.frame + self.frame
tm.assert_index_equal(self_added.index, self.frame.index)
added_rev = frame_copy + self.frame
assert np.isnan(added['D']).all()
assert np.isnan(added_rev['D']).all()
# corner cases
# empty
plus_empty = self.frame + self.empty
assert np.isnan(plus_empty.values).all()
empty_plus = self.empty + self.frame
assert np.isnan(empty_plus.values).all()
empty_empty = self.empty + self.empty
assert empty_empty.empty
# out of order
reverse = self.frame.reindex(columns=self.frame.columns[::-1])
assert_frame_equal(reverse + self.frame, self.frame * 2)
# mix vs float64, upcast
added = self.frame + self.mixed_float
_check_mixed_float(added, dtype='float64')
added = self.mixed_float + self.frame
_check_mixed_float(added, dtype='float64')
# mix vs mix
added = self.mixed_float + self.mixed_float2
_check_mixed_float(added, dtype=dict(C=None))
added = self.mixed_float2 + self.mixed_float
_check_mixed_float(added, dtype=dict(C=None))
# with int
added = self.frame + self.mixed_int
_check_mixed_float(added, dtype='float64')
def test_combineSeries(self):
# Series
series = self.frame.xs(self.frame.index[0])
added = self.frame + series
for key, s in compat.iteritems(added):
assert_series_equal(s, self.frame[key] + series[key])
larger_series = series.to_dict()
larger_series['E'] = 1
larger_series = Series(larger_series)
larger_added = self.frame + larger_series
for key, s in compat.iteritems(self.frame):
assert_series_equal(larger_added[key], s + series[key])
assert 'E' in larger_added
assert np.isnan(larger_added['E']).all()
# no upcast needed
added = self.mixed_float + series
_check_mixed_float(added)
# vs mix (upcast) as needed
added = self.mixed_float + series.astype('float32')
_check_mixed_float(added, dtype=dict(C=None))
added = self.mixed_float + series.astype('float16')
_check_mixed_float(added, dtype=dict(C=None))
# these raise with numexpr.....as we are adding an int64 to an
# uint64....weird vs int
# added = self.mixed_int + (100*series).astype('int64')
# _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C =
# 'int64', D = 'int64'))
# added = self.mixed_int + (100*series).astype('int32')
# _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C =
# 'int32', D = 'int64'))
# TimeSeries
ts = self.tsframe['A']
# 10890
# we no longer allow auto timeseries broadcasting
# and require explicit broadcasting
added = self.tsframe.add(ts, axis='index')
for key, col in compat.iteritems(self.tsframe):
result = col + ts
assert_series_equal(added[key], result, check_names=False)
assert added[key].name == key
if col.name == ts.name:
assert result.name == 'A'
else:
assert result.name is None
smaller_frame = self.tsframe[:-5]
smaller_added = smaller_frame.add(ts, axis='index')
tm.assert_index_equal(smaller_added.index, self.tsframe.index)
smaller_ts = ts[:-5]
smaller_added2 = self.tsframe.add(smaller_ts, axis='index')
assert_frame_equal(smaller_added, smaller_added2)
# length 0, result is all-nan
result = self.tsframe.add(ts[:0], axis='index')
expected = DataFrame(np.nan, index=self.tsframe.index,
columns=self.tsframe.columns)
assert_frame_equal(result, expected)
# Frame is all-nan
result = self.tsframe[:0].add(ts, axis='index')
expected = DataFrame(np.nan, index=self.tsframe.index,
columns=self.tsframe.columns)
assert_frame_equal(result, expected)
# empty but with non-empty index
frame = self.tsframe[:1].reindex(columns=[])
result = frame.mul(ts, axis='index')
assert len(result) == len(ts)
def test_combineFunc(self):
result = self.frame * 2
tm.assert_numpy_array_equal(result.values, self.frame.values * 2)
# vs mix
result = self.mixed_float * 2
for c, s in compat.iteritems(result):
tm.assert_numpy_array_equal(
s.values, self.mixed_float[c].values * 2)
_check_mixed_float(result, dtype=dict(C=None))
result = self.empty * 2
assert result.index is self.empty.index
assert len(result.columns) == 0
def test_comparisons(self):
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame()
row = self.simple.xs('a')
ndim_5 = np.ones(df1.shape + (1, 1, 1))
def test_comp(func):
result = func(df1, df2)
tm.assert_numpy_array_equal(result.values,
func(df1.values, df2.values))
with pytest.raises(ValueError, match='dim must be <= 2'):
func(df1, ndim_5)
result2 = func(self.simple, row)
tm.assert_numpy_array_equal(result2.values,
func(self.simple.values, row.values))
result3 = func(self.frame, 0)
tm.assert_numpy_array_equal(result3.values,
func(self.frame.values, 0))
msg = 'Can only compare identically-labeled DataFrame'
with pytest.raises(ValueError, match=msg):
func(self.simple, self.simple[:2])
test_comp(operator.eq)
test_comp(operator.ne)
test_comp(operator.lt)
test_comp(operator.gt)
test_comp(operator.ge)
test_comp(operator.le)
def test_comparison_protected_from_errstate(self):
missing_df = tm.makeDataFrame()
missing_df.iloc[0]['A'] = np.nan
with np.errstate(invalid='ignore'):
expected = missing_df.values < 0
with np.errstate(invalid='raise'):
result = (missing_df < 0).values
tm.assert_numpy_array_equal(result, expected)
def test_boolean_comparison(self):
# GH 4576
# boolean comparisons with a tuple/list give unexpected results
df = DataFrame(np.arange(6).reshape((3, 2)))
b = np.array([2, 2])
b_r = np.atleast_2d([2, 2])
b_c = b_r.T
lst = [2, 2, 2]
tup = tuple(lst)
# gt
expected = DataFrame([[False, False], [False, True], [True, True]])
result = df > b
assert_frame_equal(result, expected)
result = df.values > b
assert_numpy_array_equal(result, expected.values)
msg1d = 'Unable to coerce to Series, length must be 2: given 3'
msg2d = 'Unable to coerce to DataFrame, shape must be'
msg2db = 'operands could not be broadcast together with shapes'
with pytest.raises(ValueError, match=msg1d):
# wrong shape
df > lst
with pytest.raises(ValueError, match=msg1d):
# wrong shape
result = df > tup
# broadcasts like ndarray (GH#23000)
result = df > b_r
assert_frame_equal(result, expected)
result = df.values > b_r
assert_numpy_array_equal(result, expected.values)
with pytest.raises(ValueError, match=msg2d):
df > b_c
with pytest.raises(ValueError, match=msg2db):
df.values > b_c
# ==
expected = DataFrame([[False, False], [True, False], [False, False]])
result = df == b
assert_frame_equal(result, expected)
with pytest.raises(ValueError, match=msg1d):
result = df == lst
with pytest.raises(ValueError, match=msg1d):
result = df == tup
# broadcasts like ndarray (GH#23000)
result = df == b_r
assert_frame_equal(result, expected)
result = df.values == b_r
assert_numpy_array_equal(result, expected.values)
with pytest.raises(ValueError, match=msg2d):
df == b_c
assert df.values.shape != b_c.shape
# with alignment
df = DataFrame(np.arange(6).reshape((3, 2)),
columns=list('AB'), index=list('abc'))
expected.index = df.index
expected.columns = df.columns
with pytest.raises(ValueError, match=msg1d):
result = df == lst
with pytest.raises(ValueError, match=msg1d):
result = df == tup
def test_combine_generic(self):
df1 = self.frame
df2 = self.frame.loc[self.frame.index[:-5], ['A', 'B', 'C']]
combined = df1.combine(df2, np.add)
combined2 = df2.combine(df1, np.add)
assert combined['D'].isna().all()
assert combined2['D'].isna().all()
chunk = combined.loc[combined.index[:-5], ['A', 'B', 'C']]
chunk2 = combined2.loc[combined2.index[:-5], ['A', 'B', 'C']]
exp = self.frame.loc[self.frame.index[:-5],
['A', 'B', 'C']].reindex_like(chunk) * 2
assert_frame_equal(chunk, exp)
assert_frame_equal(chunk2, exp)
def test_inplace_ops_alignment(self):
# inplace ops / ops alignment
# GH 8511
columns = list('abcdefg')
X_orig = DataFrame(np.arange(10 * len(columns))
.reshape(-1, len(columns)),
columns=columns, index=range(10))
Z = 100 * X_orig.iloc[:, 1:-1].copy()
block1 = list('bedcf')
subs = list('bcdef')
# add
X = X_orig.copy()
result1 = (X[block1] + Z).reindex(columns=subs)
X[block1] += Z
result2 = X.reindex(columns=subs)
X = X_orig.copy()
result3 = (X[block1] + Z[block1]).reindex(columns=subs)
X[block1] += Z[block1]
result4 = X.reindex(columns=subs)
assert_frame_equal(result1, result2)
assert_frame_equal(result1, result3)
assert_frame_equal(result1, result4)
# sub
X = X_orig.copy()
result1 = (X[block1] - Z).reindex(columns=subs)
X[block1] -= Z
result2 = X.reindex(columns=subs)
X = X_orig.copy()
result3 = (X[block1] - Z[block1]).reindex(columns=subs)
X[block1] -= Z[block1]
result4 = X.reindex(columns=subs)
assert_frame_equal(result1, result2)
assert_frame_equal(result1, result3)
assert_frame_equal(result1, result4)
def test_inplace_ops_identity(self):
# GH 5104
# make sure that we are actually changing the object
s_orig = Series([1, 2, 3])
df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))
# no dtype change
s = s_orig.copy()
s2 = s
s += 1
assert_series_equal(s, s2)
assert_series_equal(s_orig + 1, s)
assert s is s2
assert s._data is s2._data
df = df_orig.copy()
df2 = df
df += 1
assert_frame_equal(df, df2)
assert_frame_equal(df_orig + 1, df)
assert df is df2
assert df._data is df2._data
# dtype change
s = s_orig.copy()
s2 = s
s += 1.5
assert_series_equal(s, s2)
assert_series_equal(s_orig + 1.5, s)
df = df_orig.copy()
df2 = df
df += 1.5
assert_frame_equal(df, df2)
assert_frame_equal(df_orig + 1.5, df)
assert df is df2
assert df._data is df2._data
# mixed dtype
arr = np.random.randint(0, 10, size=5)
df_orig = DataFrame({'A': arr.copy(), 'B': 'foo'})
df = df_orig.copy()
df2 = df
df['A'] += 1
expected = DataFrame({'A': arr.copy() + 1, 'B': 'foo'})
assert_frame_equal(df, expected)
assert_frame_equal(df2, expected)
assert df._data is df2._data
df = df_orig.copy()
df2 = df
df['A'] += 1.5
expected = DataFrame({'A': arr.copy() + 1.5, 'B': 'foo'})
assert_frame_equal(df, expected)
assert_frame_equal(df2, expected)
assert df._data is df2._data
@pytest.mark.parametrize('op', ['add', 'and', 'div', 'floordiv', 'mod',
'mul', 'or', 'pow', 'sub', 'truediv',
'xor'])
def test_inplace_ops_identity2(self, op):
if compat.PY3 and op == 'div':
return
df = DataFrame({'a': [1., 2., 3.],
'b': [1, 2, 3]})
operand = 2
if op in ('and', 'or', 'xor'):
# cannot use floats for boolean ops
df['a'] = [True, False, True]
df_copy = df.copy()
iop = '__i{}__'.format(op)
op = '__{}__'.format(op)
# no id change and value is correct
getattr(df, iop)(operand)
expected = getattr(df_copy, op)(operand)
assert_frame_equal(df, expected)
expected = id(df)
assert id(df) == expected
def test_alignment_non_pandas(self):
index = ['A', 'B', 'C']
columns = ['X', 'Y', 'Z']
df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns)
align = pd.core.ops._align_method_FRAME
for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64),
range(1, 4)]:
tm.assert_series_equal(align(df, val, 'index'),
Series([1, 2, 3], index=df.index))
tm.assert_series_equal(align(df, val, 'columns'),
Series([1, 2, 3], index=df.columns))
# length mismatch
msg = 'Unable to coerce to Series, length must be 3: given 2'
for val in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]:
with pytest.raises(ValueError, match=msg):
align(df, val, 'index')
with pytest.raises(ValueError, match=msg):
align(df, val, 'columns')
val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
tm.assert_frame_equal(align(df, val, 'index'),
DataFrame(val, index=df.index,
columns=df.columns))
tm.assert_frame_equal(align(df, val, 'columns'),
DataFrame(val, index=df.index,
columns=df.columns))
# shape mismatch
msg = 'Unable to coerce to DataFrame, shape must be'
val = np.array([[1, 2, 3], [4, 5, 6]])
with pytest.raises(ValueError, match=msg):
align(df, val, 'index')
with pytest.raises(ValueError, match=msg):
align(df, val, 'columns')
val = np.zeros((3, 3, 3))
with pytest.raises(ValueError):
align(df, val, 'index')
with pytest.raises(ValueError):
align(df, val, 'columns')
def test_no_warning(self, all_arithmetic_operators):
df = pd.DataFrame({"A": [0., 0.], "B": [0., None]})
b = df['B']
with tm.assert_produces_warning(None):
getattr(df, all_arithmetic_operators)(b, 0)
@@ -0,0 +1,147 @@
from datetime import timedelta
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame, DatetimeIndex, Index, PeriodIndex, Timedelta, date_range,
period_range, to_datetime)
import pandas.util.testing as tm
def _permute(obj):
return obj.take(np.random.permutation(len(obj)))
class TestPeriodIndex(object):
def test_as_frame_columns(self):
rng = period_range('1/1/2000', periods=5)
df = DataFrame(np.random.randn(10, 5), columns=rng)
ts = df[rng[0]]
tm.assert_series_equal(ts, df.iloc[:, 0])
# GH # 1211
repr(df)
ts = df['1/1/2000']
tm.assert_series_equal(ts, df.iloc[:, 0])
def test_frame_setitem(self):
rng = period_range('1/1/2000', periods=5, name='index')
df = DataFrame(np.random.randn(5, 3), index=rng)
df['Index'] = rng
rs = Index(df['Index'])
tm.assert_index_equal(rs, rng, check_names=False)
assert rs.name == 'Index'
assert rng.name == 'index'
rs = df.reset_index().set_index('index')
assert isinstance(rs.index, PeriodIndex)
tm.assert_index_equal(rs.index, rng)
def test_frame_to_time_stamp(self):
K = 5
index = period_range(freq='A', start='1/1/2001', end='12/1/2009')
df = DataFrame(np.random.randn(len(index), K), index=index)
df['mix'] = 'a'
exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC')
exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns')
result = df.to_timestamp('D', 'end')
tm.assert_index_equal(result.index, exp_index)
tm.assert_numpy_array_equal(result.values, df.values)
exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN')
result = df.to_timestamp('D', 'start')
tm.assert_index_equal(result.index, exp_index)
def _get_with_delta(delta, freq='A-DEC'):
return date_range(to_datetime('1/1/2001') + delta,
to_datetime('12/31/2009') + delta, freq=freq)
delta = timedelta(hours=23)
result = df.to_timestamp('H', 'end')
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns')
tm.assert_index_equal(result.index, exp_index)
delta = timedelta(hours=23, minutes=59)
result = df.to_timestamp('T', 'end')
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns')
tm.assert_index_equal(result.index, exp_index)
result = df.to_timestamp('S', 'end')
delta = timedelta(hours=23, minutes=59, seconds=59)
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns')
tm.assert_index_equal(result.index, exp_index)
# columns
df = df.T
exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC')
exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns')
result = df.to_timestamp('D', 'end', axis=1)
tm.assert_index_equal(result.columns, exp_index)
tm.assert_numpy_array_equal(result.values, df.values)
exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN')
result = df.to_timestamp('D', 'start', axis=1)
tm.assert_index_equal(result.columns, exp_index)
delta = timedelta(hours=23)
result = df.to_timestamp('H', 'end', axis=1)
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns')
tm.assert_index_equal(result.columns, exp_index)
delta = timedelta(hours=23, minutes=59)
result = df.to_timestamp('T', 'end', axis=1)
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns')
tm.assert_index_equal(result.columns, exp_index)
result = df.to_timestamp('S', 'end', axis=1)
delta = timedelta(hours=23, minutes=59, seconds=59)
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns')
tm.assert_index_equal(result.columns, exp_index)
# invalid axis
with pytest.raises(ValueError, match='axis'):
df.to_timestamp(axis=2)
result1 = df.to_timestamp('5t', axis=1)
result2 = df.to_timestamp('t', axis=1)
expected = pd.date_range('2001-01-01', '2009-01-01', freq='AS')
assert isinstance(result1.columns, DatetimeIndex)
assert isinstance(result2.columns, DatetimeIndex)
tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8)
tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8)
# PeriodIndex.to_timestamp always use 'infer'
assert result1.columns.freqstr == 'AS-JAN'
assert result2.columns.freqstr == 'AS-JAN'
def test_frame_index_to_string(self):
index = PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M')
frame = DataFrame(np.random.randn(3, 4), index=index)
# it works!
frame.to_string()
def test_align_frame(self):
rng = period_range('1/1/2000', '1/1/2010', freq='A')
ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
result = ts + ts[::2]
expected = ts + ts
expected.values[1::2] = np.nan
tm.assert_frame_equal(result, expected)
result = ts + _permute(ts[::2])
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,384 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Series, Timestamp
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestDataFrameQuantile(TestData):
def test_quantile(self):
from numpy import percentile
q = self.tsframe.quantile(0.1, axis=0)
assert q['A'] == percentile(self.tsframe['A'], 10)
tm.assert_index_equal(q.index, self.tsframe.columns)
q = self.tsframe.quantile(0.9, axis=1)
assert (q['2000-01-17'] ==
percentile(self.tsframe.loc['2000-01-17'], 90))
tm.assert_index_equal(q.index, self.tsframe.index)
# test degenerate case
q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
assert(np.isnan(q['x']) and np.isnan(q['y']))
# non-numeric exclusion
df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
rs = df.quantile(0.5)
xp = df.median().rename(0.5)
assert_series_equal(rs, xp)
# axis
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(.5, axis=1)
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
assert_series_equal(result, expected)
result = df.quantile([.5, .75], axis=1)
expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75],
3: [3.5, 3.75]}, index=[0.5, 0.75])
assert_frame_equal(result, expected, check_index_type=True)
# We may want to break API in the future to change this
# so that we exclude non-numeric along the same axis
# See GH #7312
df = DataFrame([[1, 2, 3],
['a', 'b', 4]])
result = df.quantile(.5, axis=1)
expected = Series([3., 4.], index=[0, 1], name=0.5)
assert_series_equal(result, expected)
def test_quantile_axis_mixed(self):
# mixed on axis=1
df = DataFrame({"A": [1, 2, 3],
"B": [2., 3., 4.],
"C": pd.date_range('20130101', periods=3),
"D": ['foo', 'bar', 'baz']})
result = df.quantile(.5, axis=1)
expected = Series([1.5, 2.5, 3.5], name=0.5)
assert_series_equal(result, expected)
# must raise
with pytest.raises(TypeError):
df.quantile(.5, axis=1, numeric_only=False)
def test_quantile_axis_parameter(self):
# GH 9543/9544
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(.5, axis=0)
expected = Series([2., 3.], index=["A", "B"], name=0.5)
assert_series_equal(result, expected)
expected = df.quantile(.5, axis="index")
assert_series_equal(result, expected)
result = df.quantile(.5, axis=1)
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
assert_series_equal(result, expected)
result = df.quantile(.5, axis="columns")
assert_series_equal(result, expected)
pytest.raises(ValueError, df.quantile, 0.1, axis=-1)
pytest.raises(ValueError, df.quantile, 0.1, axis="column")
def test_quantile_interpolation(self):
# see gh-10174
from numpy import percentile
# interpolation = linear (default case)
q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
assert q['A'] == percentile(self.tsframe['A'], 10)
q = self.intframe.quantile(0.1)
assert q['A'] == percentile(self.intframe['A'], 10)
# test with and without interpolation keyword
q1 = self.intframe.quantile(0.1)
assert q1['A'] == np.percentile(self.intframe['A'], 10)
tm.assert_series_equal(q, q1)
# interpolation method other than default linear
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(.5, axis=1, interpolation='nearest')
expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
tm.assert_series_equal(result, expected)
# cross-check interpolation=nearest results in original dtype
exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5,
axis=0, interpolation='nearest')
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64')
tm.assert_series_equal(result, expected)
# float
df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3])
result = df.quantile(.5, axis=1, interpolation='nearest')
expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5)
tm.assert_series_equal(result, expected)
exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5,
axis=0, interpolation='nearest')
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64')
assert_series_equal(result, expected)
# axis
result = df.quantile([.5, .75], axis=1, interpolation='lower')
expected = DataFrame({1: [1., 1.], 2: [2., 2.],
3: [3., 3.]}, index=[0.5, 0.75])
assert_frame_equal(result, expected)
# test degenerate case
df = DataFrame({'x': [], 'y': []})
q = df.quantile(0.1, axis=0, interpolation='higher')
assert(np.isnan(q['x']) and np.isnan(q['y']))
# multi
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
columns=['a', 'b', 'c'])
result = df.quantile([.25, .5], interpolation='midpoint')
# https://github.com/numpy/numpy/issues/7163
expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
index=[.25, .5], columns=['a', 'b', 'c'])
assert_frame_equal(result, expected)
def test_quantile_multi(self):
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
columns=['a', 'b', 'c'])
result = df.quantile([.25, .5])
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
index=[.25, .5], columns=['a', 'b', 'c'])
assert_frame_equal(result, expected)
# axis = 1
result = df.quantile([.25, .5], axis=1)
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
index=[.25, .5], columns=[0, 1, 2])
# empty
result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]},
index=[.1, .9])
assert_frame_equal(result, expected)
def test_quantile_datetime(self):
df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]})
# exclude datetime
result = df.quantile(.5)
expected = Series([2.5], index=['b'])
# datetime
result = df.quantile(.5, numeric_only=False)
expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5],
index=['a', 'b'],
name=0.5)
assert_series_equal(result, expected)
# datetime w/ multi
result = df.quantile([.5], numeric_only=False)
expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]],
index=[.5], columns=['a', 'b'])
assert_frame_equal(result, expected)
# axis = 1
df['c'] = pd.to_datetime(['2011', '2012'])
result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False)
expected = Series([Timestamp('2010-07-02 12:00:00'),
Timestamp('2011-07-02 12:00:00')],
index=[0, 1],
name=0.5)
assert_series_equal(result, expected)
result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False)
expected = DataFrame([[Timestamp('2010-07-02 12:00:00'),
Timestamp('2011-07-02 12:00:00')]],
index=[0.5], columns=[0, 1])
assert_frame_equal(result, expected)
# empty when numeric_only=True
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# result = df[['a', 'c']].quantile(.5)
# result = df[['a', 'c']].quantile([.5])
def test_quantile_invalid(self):
msg = 'percentiles should all be in the interval \\[0, 1\\]'
for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
with pytest.raises(ValueError, match=msg):
self.tsframe.quantile(invalid)
def test_quantile_box(self):
df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03')],
'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-03', tz='US/Eastern')],
'C': [pd.Timedelta('1 days'),
pd.Timedelta('2 days'),
pd.Timedelta('3 days')]})
res = df.quantile(0.5, numeric_only=False)
exp = pd.Series([pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timedelta('2 days')],
name=0.5, index=['A', 'B', 'C'])
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timedelta('2 days')]],
index=[0.5], columns=['A', 'B', 'C'])
tm.assert_frame_equal(res, exp)
# DatetimeBlock may be consolidated and contain NaT in different loc
df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
pd.NaT,
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03')],
'a': [pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.NaT,
pd.Timestamp('2011-01-03')],
'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
pd.NaT,
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-03', tz='US/Eastern')],
'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.NaT,
pd.Timestamp('2011-01-03', tz='US/Eastern')],
'C': [pd.Timedelta('1 days'),
pd.Timedelta('2 days'),
pd.Timedelta('3 days'),
pd.NaT],
'c': [pd.NaT,
pd.Timedelta('1 days'),
pd.Timedelta('2 days'),
pd.Timedelta('3 days')]},
columns=list('AaBbCc'))
res = df.quantile(0.5, numeric_only=False)
exp = pd.Series([pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timedelta('2 days'),
pd.Timedelta('2 days')],
name=0.5, index=list('AaBbCc'))
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timedelta('2 days'),
pd.Timedelta('2 days')]],
index=[0.5], columns=list('AaBbCc'))
tm.assert_frame_equal(res, exp)
def test_quantile_nan(self):
# GH 14357 - float block where some cols have missing values
df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
df.iloc[-1, 1] = np.nan
res = df.quantile(0.5)
exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5, 0.75])
exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)
res = df.quantile(0.5, axis=1)
exp = Series(np.arange(1.0, 6.0), name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5, 0.75], axis=1)
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)
# full-nan column
df['b'] = np.nan
res = df.quantile(0.5)
exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5, 0.75])
exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]},
index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)
def test_quantile_nat(self):
# full NaT column
df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]})
res = df.quantile(0.5, numeric_only=False)
exp = Series([pd.NaT], index=['a'], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = DataFrame({'a': [pd.NaT]}, index=[0.5])
tm.assert_frame_equal(res, exp)
# mixed non-null / full null column
df = DataFrame({'a': [pd.Timestamp('2012-01-01'),
pd.Timestamp('2012-01-02'),
pd.Timestamp('2012-01-03')],
'b': [pd.NaT, pd.NaT, pd.NaT]})
res = df.quantile(0.5, numeric_only=False)
exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'],
name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5],
columns=['a', 'b'])
tm.assert_frame_equal(res, exp)
def test_quantile_empty(self):
# floats
df = DataFrame(columns=['a', 'b'], dtype='float64')
res = df.quantile(0.5)
exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5])
exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5])
tm.assert_frame_equal(res, exp)
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# res = df.quantile(0.5, axis=1)
# res = df.quantile([0.5], axis=1)
# ints
df = DataFrame(columns=['a', 'b'], dtype='int64')
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# res = df.quantile(0.5)
# datetimes
df = DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
# FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
# res = df.quantile(0.5, numeric_only=False)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,318 @@
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
from distutils.version import LooseVersion
import numpy as np
import pytest
from pandas import DataFrame, Series
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal
class TestRank(TestData):
s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
df = DataFrame({'A': s, 'B': s})
results = {
'average': np.array([1.5, 5.5, 7.0, 3.5, np.nan,
3.5, 1.5, 8.0, np.nan, 5.5]),
'min': np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
'max': np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
'first': np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
'dense': np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
}
@pytest.fixture(params=['average', 'min', 'max', 'first', 'dense'])
def method(self, request):
"""
Fixture for trying all rank methods
"""
return request.param
def test_rank(self):
rankdata = pytest.importorskip('scipy.stats.rankdata')
self.frame['A'][::2] = np.nan
self.frame['B'][::3] = np.nan
self.frame['C'][::4] = np.nan
self.frame['D'][::5] = np.nan
ranks0 = self.frame.rank()
ranks1 = self.frame.rank(1)
mask = np.isnan(self.frame.values)
fvals = self.frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fvals)
exp0[mask] = np.nan
exp1 = np.apply_along_axis(rankdata, 1, fvals)
exp1[mask] = np.nan
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# integers
df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
result = df.rank()
exp = df.astype(float).rank()
tm.assert_frame_equal(result, exp)
result = df.rank(1)
exp = df.astype(float).rank(1)
tm.assert_frame_equal(result, exp)
def test_rank2(self):
df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
result = df.rank(1, pct=True)
tm.assert_frame_equal(result, expected)
df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = df.rank(0) / 2.0
result = df.rank(0, pct=True)
tm.assert_frame_equal(result, expected)
df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
# f7u12, this does not work without extensive workaround
data = [[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
[datetime(2000, 1, 2), datetime(2000, 1, 3),
datetime(2000, 1, 1)]]
df = DataFrame(data)
# check the rank
expected = DataFrame([[2., np.nan, 1.],
[2., 3., 1.]])
result = df.rank(1, numeric_only=False, ascending=True)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[1., np.nan, 2.],
[2., 1., 3.]])
result = df.rank(1, numeric_only=False, ascending=False)
tm.assert_frame_equal(result, expected)
# mixed-type frames
self.mixed_frame['datetime'] = datetime.now()
self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)
result = self.mixed_frame.rank(1)
expected = self.mixed_frame.rank(1, numeric_only=True)
tm.assert_frame_equal(result, expected)
df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
1e60, 1e80, 1e-30]})
exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]})
tm.assert_frame_equal(df.rank(), exp)
def test_rank_na_option(self):
rankdata = pytest.importorskip('scipy.stats.rankdata')
self.frame['A'][::2] = np.nan
self.frame['B'][::3] = np.nan
self.frame['C'][::4] = np.nan
self.frame['D'][::5] = np.nan
# bottom
ranks0 = self.frame.rank(na_option='bottom')
ranks1 = self.frame.rank(1, na_option='bottom')
fvals = self.frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fvals)
exp1 = np.apply_along_axis(rankdata, 1, fvals)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# top
ranks0 = self.frame.rank(na_option='top')
ranks1 = self.frame.rank(1, na_option='top')
fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
fval1 = self.frame.T
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
fval1 = fval1.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fval0)
exp1 = np.apply_along_axis(rankdata, 1, fval1)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# descending
# bottom
ranks0 = self.frame.rank(na_option='top', ascending=False)
ranks1 = self.frame.rank(1, na_option='top', ascending=False)
fvals = self.frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, -fvals)
exp1 = np.apply_along_axis(rankdata, 1, -fvals)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# descending
# top
ranks0 = self.frame.rank(na_option='bottom', ascending=False)
ranks1 = self.frame.rank(1, na_option='bottom', ascending=False)
fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
fval1 = self.frame.T
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
fval1 = fval1.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, -fval0)
exp1 = np.apply_along_axis(rankdata, 1, -fval1)
tm.assert_numpy_array_equal(ranks0.values, exp0)
tm.assert_numpy_array_equal(ranks1.values, exp1)
# bad values throw error
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
with pytest.raises(ValueError, match=msg):
self.frame.rank(na_option='bad', ascending=False)
# invalid type
with pytest.raises(ValueError, match=msg):
self.frame.rank(na_option=True, ascending=False)
def test_rank_axis(self):
# check if using axes' names gives the same result
df = DataFrame([[2, 1], [4, 3]])
tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index'))
tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns'))
def test_rank_methods_frame(self):
pytest.importorskip('scipy.stats.special')
rankdata = pytest.importorskip('scipy.stats.rankdata')
import scipy
xs = np.random.randint(0, 21, (100, 26))
xs = (xs - 10.0) / 10.0
cols = [chr(ord('z') - i) for i in range(xs.shape[1])]
for vals in [xs, xs + 1e6, xs * 1e-6]:
df = DataFrame(vals, columns=cols)
for ax in [0, 1]:
for m in ['average', 'min', 'max', 'first', 'dense']:
result = df.rank(axis=ax, method=m)
sprank = np.apply_along_axis(
rankdata, ax, vals,
m if m != 'first' else 'ordinal')
sprank = sprank.astype(np.float64)
expected = DataFrame(sprank, columns=cols)
if (LooseVersion(scipy.__version__) >=
LooseVersion('0.17.0')):
expected = expected.astype('float64')
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
def test_rank_descending(self, method, dtype):
if 'i' in dtype:
df = self.df.dropna()
else:
df = self.df.astype(dtype)
res = df.rank(ascending=False)
expected = (df.max() - df).rank()
assert_frame_equal(res, expected)
if method == 'first' and dtype == 'O':
return
expected = (df.max() - df).rank(method=method)
if dtype != 'O':
res2 = df.rank(method=method, ascending=False,
numeric_only=True)
assert_frame_equal(res2, expected)
res3 = df.rank(method=method, ascending=False,
numeric_only=False)
assert_frame_equal(res3, expected)
@pytest.mark.parametrize('axis', [0, 1])
@pytest.mark.parametrize('dtype', [None, object])
def test_rank_2d_tie_methods(self, method, axis, dtype):
df = self.df
def _check2d(df, expected, method='average', axis=0):
exp_df = DataFrame({'A': expected, 'B': expected})
if axis == 1:
df = df.T
exp_df = exp_df.T
result = df.rank(method=method, axis=axis)
assert_frame_equal(result, exp_df)
disabled = {(object, 'first')}
if (dtype, method) in disabled:
return
frame = df if dtype is None else df.astype(dtype)
_check2d(frame, self.results[method], method=method, axis=axis)
@pytest.mark.parametrize(
"method,exp", [("dense",
[[1., 1., 1.],
[1., 0.5, 2. / 3],
[1., 0.5, 1. / 3]]),
("min",
[[1. / 3, 1., 1.],
[1. / 3, 1. / 3, 2. / 3],
[1. / 3, 1. / 3, 1. / 3]]),
("max",
[[1., 1., 1.],
[1., 2. / 3, 2. / 3],
[1., 2. / 3, 1. / 3]]),
("average",
[[2. / 3, 1., 1.],
[2. / 3, 0.5, 2. / 3],
[2. / 3, 0.5, 1. / 3]]),
("first",
[[1. / 3, 1., 1.],
[2. / 3, 1. / 3, 2. / 3],
[3. / 3, 2. / 3, 1. / 3]])])
def test_rank_pct_true(self, method, exp):
# see gh-15630.
df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
result = df.rank(method=method, pct=True)
expected = DataFrame(exp)
tm.assert_frame_equal(result, expected)
@pytest.mark.single
def test_pct_max_many_rows(self):
# GH 18271
df = DataFrame({'A': np.arange(2**24 + 1),
'B': np.arange(2**24 + 1, 0, -1)})
result = df.rank(pct=True).max()
assert (result == 1).all()
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,523 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import datetime, timedelta
import re
import sys
import textwrap
import numpy as np
import pytest
from pandas.compat import PYPY, StringIO, lrange, u
import pandas as pd
from pandas import (
Categorical, DataFrame, Series, compat, date_range, option_context,
period_range)
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
import pandas.io.formats.format as fmt
# Segregated collection of methods that require the BlockManager internal data
# structure
class TestDataFrameReprInfoEtc(TestData):
def test_repr_empty(self):
# empty
foo = repr(self.empty) # noqa
# empty with index
frame = DataFrame(index=np.arange(1000))
foo = repr(frame) # noqa
def test_repr_mixed(self):
buf = StringIO()
# mixed
foo = repr(self.mixed_frame) # noqa
self.mixed_frame.info(verbose=False, buf=buf)
@pytest.mark.slow
def test_repr_mixed_big(self):
# big mixed
biggie = DataFrame({'A': np.random.randn(200),
'B': tm.makeStringIndex(200)},
index=lrange(200))
biggie.loc[:20, 'A'] = np.nan
biggie.loc[:20, 'B'] = np.nan
foo = repr(biggie) # noqa
def test_repr(self):
buf = StringIO()
# small one
foo = repr(self.frame)
self.frame.info(verbose=False, buf=buf)
# even smaller
self.frame.reindex(columns=['A']).info(verbose=False, buf=buf)
self.frame.reindex(columns=['A', 'B']).info(verbose=False, buf=buf)
# exhausting cases in DataFrame.info
# columns but no index
no_index = DataFrame(columns=[0, 1, 3])
foo = repr(no_index) # noqa
# no columns or index
self.empty.info(buf=buf)
df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
assert "\t" not in repr(df)
assert "\r" not in repr(df)
assert "a\n" not in repr(df)
def test_repr_dimensions(self):
df = DataFrame([[1, 2, ], [3, 4]])
with option_context('display.show_dimensions', True):
assert "2 rows x 2 columns" in repr(df)
with option_context('display.show_dimensions', False):
assert "2 rows x 2 columns" not in repr(df)
with option_context('display.show_dimensions', 'truncate'):
assert "2 rows x 2 columns" not in repr(df)
@pytest.mark.slow
def test_repr_big(self):
# big one
biggie = DataFrame(np.zeros((200, 4)), columns=lrange(4),
index=lrange(200))
repr(biggie)
def test_repr_unsortable(self):
# columns are not sortable
import warnings
warn_filters = warnings.filters
warnings.filterwarnings('ignore',
category=FutureWarning,
module=".*format")
unsortable = DataFrame({'foo': [1] * 50,
datetime.today(): [1] * 50,
'bar': ['bar'] * 50,
datetime.today() + timedelta(1): ['bar'] * 50},
index=np.arange(50))
repr(unsortable)
fmt.set_option('display.precision', 3, 'display.column_space', 10)
repr(self.frame)
fmt.set_option('display.max_rows', 10, 'display.max_columns', 2)
repr(self.frame)
fmt.set_option('display.max_rows', 1000, 'display.max_columns', 1000)
repr(self.frame)
tm.reset_display_options()
warnings.filters = warn_filters
def test_repr_unicode(self):
uval = u('\u03c3\u03c3\u03c3\u03c3')
# TODO(wesm): is this supposed to be used?
bval = uval.encode('utf-8') # noqa
df = DataFrame({'A': [uval, uval]})
result = repr(df)
ex_top = ' A'
assert result.split('\n')[0].rstrip() == ex_top
df = DataFrame({'A': [uval, uval]})
result = repr(df)
assert result.split('\n')[0].rstrip() == ex_top
def test_unicode_string_with_unicode(self):
df = DataFrame({'A': [u("\u05d0")]})
if compat.PY3:
str(df)
else:
compat.text_type(df)
def test_bytestring_with_unicode(self):
df = DataFrame({'A': [u("\u05d0")]})
if compat.PY3:
bytes(df)
else:
str(df)
def test_very_wide_info_repr(self):
df = DataFrame(np.random.randn(10, 20),
columns=tm.rands_array(10, 20))
repr(df)
def test_repr_column_name_unicode_truncation_bug(self):
# #1906
df = DataFrame({'Id': [7117434],
'StringCol': ('Is it possible to modify drop plot code'
' so that the output graph is displayed '
'in iphone simulator, Is it possible to '
'modify drop plot code so that the '
'output graph is \xe2\x80\xa8displayed '
'in iphone simulator.Now we are adding '
'the CSV file externally. I want to Call'
' the File through the code..')})
with option_context('display.max_columns', 20):
assert 'StringCol' in repr(df)
def test_latex_repr(self):
result = r"""\begin{tabular}{llll}
\toprule
{} & 0 & 1 & 2 \\
\midrule
0 & $\alpha$ & b & c \\
1 & 1 & 2 & 3 \\
\bottomrule
\end{tabular}
"""
with option_context("display.latex.escape", False,
'display.latex.repr', True):
df = DataFrame([[r'$\alpha$', 'b', 'c'], [1, 2, 3]])
assert result == df._repr_latex_()
# GH 12182
assert df._repr_latex_() is None
def test_info(self):
io = StringIO()
self.frame.info(buf=io)
self.tsframe.info(buf=io)
frame = DataFrame(np.random.randn(5, 3))
frame.info()
frame.info(verbose=False)
def test_info_memory(self):
# https://github.com/pandas-dev/pandas/issues/21056
df = pd.DataFrame({'a': pd.Series([1, 2], dtype='i8')})
buf = StringIO()
df.info(buf=buf)
result = buf.getvalue()
bytes = float(df.memory_usage().sum())
expected = textwrap.dedent("""\
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
a 2 non-null int64
dtypes: int64(1)
memory usage: {} bytes
""".format(bytes))
assert result == expected
def test_info_wide(self):
from pandas import set_option, reset_option
io = StringIO()
df = DataFrame(np.random.randn(5, 101))
df.info(buf=io)
io = StringIO()
df.info(buf=io, max_cols=101)
rs = io.getvalue()
assert len(rs.splitlines()) > 100
xp = rs
set_option('display.max_info_columns', 101)
io = StringIO()
df.info(buf=io)
assert rs == xp
reset_option('display.max_info_columns')
def test_info_duplicate_columns(self):
io = StringIO()
# it works!
frame = DataFrame(np.random.randn(1500, 4),
columns=['a', 'a', 'b', 'b'])
frame.info(buf=io)
def test_info_duplicate_columns_shows_correct_dtypes(self):
# GH11761
io = StringIO()
frame = DataFrame([[1, 2.0]],
columns=['a', 'a'])
frame.info(buf=io)
io.seek(0)
lines = io.readlines()
assert 'a 1 non-null int64\n' == lines[3]
assert 'a 1 non-null float64\n' == lines[4]
def test_info_shows_column_dtypes(self):
dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
'complex128', 'object', 'bool']
data = {}
n = 10
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
buf = StringIO()
df.info(buf=buf)
res = buf.getvalue()
for i, dtype in enumerate(dtypes):
name = '%d %d non-null %s' % (i, n, dtype)
assert name in res
def test_info_max_cols(self):
df = DataFrame(np.random.randn(10, 5))
for len_, verbose in [(5, None), (5, False), (10, True)]:
# For verbose always ^ setting ^ summarize ^ full output
with option_context('max_info_columns', 4):
buf = StringIO()
df.info(buf=buf, verbose=verbose)
res = buf.getvalue()
assert len(res.strip().split('\n')) == len_
for len_, verbose in [(10, None), (5, False), (10, True)]:
# max_cols no exceeded
with option_context('max_info_columns', 5):
buf = StringIO()
df.info(buf=buf, verbose=verbose)
res = buf.getvalue()
assert len(res.strip().split('\n')) == len_
for len_, max_cols in [(10, 5), (5, 4)]:
# setting truncates
with option_context('max_info_columns', 4):
buf = StringIO()
df.info(buf=buf, max_cols=max_cols)
res = buf.getvalue()
assert len(res.strip().split('\n')) == len_
# setting wouldn't truncate
with option_context('max_info_columns', 5):
buf = StringIO()
df.info(buf=buf, max_cols=max_cols)
res = buf.getvalue()
assert len(res.strip().split('\n')) == len_
def test_info_memory_usage(self):
# Ensure memory usage is displayed, when asserted, on the last line
dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
'complex128', 'object', 'bool']
data = {}
n = 10
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
buf = StringIO()
# display memory usage case
df.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert "memory usage: " in res[-1]
# do not display memory usage case
df.info(buf=buf, memory_usage=False)
res = buf.getvalue().splitlines()
assert "memory usage: " not in res[-1]
df.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
# memory usage is a lower bound, so print it as XYZ+ MB
assert re.match(r"memory usage: [^+]+\+", res[-1])
df.iloc[:, :5].info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
# excluded column with object dtype, so estimate is accurate
assert not re.match(r"memory usage: [^+]+\+", res[-1])
# Test a DataFrame with duplicate columns
dtypes = ['int64', 'int64', 'int64', 'float64']
data = {}
n = 100
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
df.columns = dtypes
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
df_with_object_index.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+\+", res[-1])
df_with_object_index.info(buf=buf, memory_usage='deep')
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+$", res[-1])
# Ensure df size is as expected
# (cols * rows * bytes) + index size
df_size = df.memory_usage().sum()
exp_size = len(dtypes) * n * 8 + df.index.nbytes
assert df_size == exp_size
# Ensure number of cols in memory_usage is the same as df
size_df = np.size(df.columns.values) + 1 # index=True; default
assert size_df == np.size(df.memory_usage())
# assert deep works only on object
assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()
# test for validity
DataFrame(1, index=['a'], columns=['A']
).memory_usage(index=True)
DataFrame(1, index=['a'], columns=['A']
).index.nbytes
df = DataFrame(
data=1,
index=pd.MultiIndex.from_product(
[['a'], range(1000)]),
columns=['A']
)
df.index.nbytes
df.memory_usage(index=True)
df.index.values.nbytes
mem = df.memory_usage(deep=True).sum()
assert mem > 0
@pytest.mark.skipif(PYPY,
reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy(self):
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
assert (df_with_object_index.memory_usage(
index=True, deep=True).sum() >
df_with_object_index.memory_usage(
index=True).sum())
df_object = pd.DataFrame({'a': ['a']})
assert (df_object.memory_usage(deep=True).sum() >
df_object.memory_usage().sum())
@pytest.mark.skipif(not PYPY,
reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy(self):
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
assert (df_with_object_index.memory_usage(
index=True, deep=True).sum() ==
df_with_object_index.memory_usage(
index=True).sum())
df_object = pd.DataFrame({'a': ['a']})
assert (df_object.memory_usage(deep=True).sum() ==
df_object.memory_usage().sum())
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof(self):
df = DataFrame(
data=1,
index=pd.MultiIndex.from_product(
[['a'], range(1000)]),
columns=['A']
)
mem = df.memory_usage(deep=True).sum()
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = mem - sys.getsizeof(df)
assert abs(diff) < 100
def test_info_memory_usage_qualified(self):
buf = StringIO()
df = DataFrame(1, columns=list('ab'),
index=[1, 2, 3])
df.info(buf=buf)
assert '+' not in buf.getvalue()
buf = StringIO()
df = DataFrame(1, columns=list('ab'),
index=list('ABC'))
df.info(buf=buf)
assert '+' in buf.getvalue()
buf = StringIO()
df = DataFrame(1, columns=list('ab'),
index=pd.MultiIndex.from_product(
[range(3), range(3)]))
df.info(buf=buf)
assert '+' not in buf.getvalue()
buf = StringIO()
df = DataFrame(1, columns=list('ab'),
index=pd.MultiIndex.from_product(
[range(3), ['foo', 'bar']]))
df.info(buf=buf)
assert '+' in buf.getvalue()
def test_info_memory_usage_bug_on_multiindex(self):
# GH 14308
# memory usage introspection should not materialize .values
from string import ascii_uppercase as uppercase
def memory_usage(f):
return f.memory_usage(deep=True).sum()
N = 100
M = len(uppercase)
index = pd.MultiIndex.from_product([list(uppercase),
pd.date_range('20160101',
periods=N)],
names=['id', 'date'])
df = DataFrame({'value': np.random.randn(N * M)}, index=index)
unstacked = df.unstack('id')
assert df.values.nbytes == unstacked.values.nbytes
assert memory_usage(df) > memory_usage(unstacked)
# high upper bound
assert memory_usage(unstacked) - memory_usage(df) < 2000
def test_info_categorical(self):
# GH14298
idx = pd.CategoricalIndex(['a', 'b'])
df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
buf = StringIO()
df.info(buf=buf)
def test_info_categorical_column(self):
# make sure it works
n = 2500
df = DataFrame({'int64': np.random.randint(100, size=n)})
df['category'] = Series(np.array(list('abcdefghij')).take(
np.random.randint(0, 10, size=n))).astype('category')
df.isna()
buf = StringIO()
df.info(buf=buf)
df2 = df[df['category'] == 'd']
buf = compat.StringIO()
df2.info(buf=buf)
def test_repr_categorical_dates_periods(self):
# normal DataFrame
dt = date_range('2011-01-01 09:00', freq='H', periods=5,
tz='US/Eastern')
p = period_range('2011-01', freq='M', periods=5)
df = DataFrame({'dt': dt, 'p': p})
exp = """ dt p
0 2011-01-01 09:00:00-05:00 2011-01
1 2011-01-01 10:00:00-05:00 2011-02
2 2011-01-01 11:00:00-05:00 2011-03
3 2011-01-01 12:00:00-05:00 2011-04
4 2011-01-01 13:00:00-05:00 2011-05"""
df = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)})
assert repr(df) == exp
@@ -0,0 +1,968 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import datetime
import itertools
from warnings import catch_warnings, simplefilter
import numpy as np
import pytest
from pandas.compat import u
import pandas as pd
from pandas import (
DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range)
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestDataFrameReshape(TestData):
def test_pivot(self):
data = {
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
'values': [1., 2., 3., 3., 2., 1.]
}
frame = DataFrame(data)
pivoted = frame.pivot(
index='index', columns='columns', values='values')
expected = DataFrame({
'One': {'A': 1., 'B': 2., 'C': 3.},
'Two': {'A': 1., 'B': 2., 'C': 3.}
})
expected.index.name, expected.columns.name = 'index', 'columns'
tm.assert_frame_equal(pivoted, expected)
# name tracking
assert pivoted.index.name == 'index'
assert pivoted.columns.name == 'columns'
# don't specify values
pivoted = frame.pivot(index='index', columns='columns')
assert pivoted.index.name == 'index'
assert pivoted.columns.names == (None, 'columns')
with catch_warnings(record=True):
# pivot multiple columns
simplefilter("ignore", FutureWarning)
wp = tm.makePanel()
lp = wp.to_frame()
df = lp.reset_index()
tm.assert_frame_equal(df.pivot('major', 'minor'), lp.unstack())
def test_pivot_duplicates(self):
data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'],
'b': ['one', 'two', 'one', 'one', 'two'],
'c': [1., 2., 3., 3., 4.]})
with pytest.raises(ValueError, match='duplicate entries'):
data.pivot('a', 'b', 'c')
def test_pivot_empty(self):
df = DataFrame({}, columns=['a', 'b', 'c'])
result = df.pivot('a', 'b', 'c')
expected = DataFrame({})
tm.assert_frame_equal(result, expected, check_names=False)
def test_pivot_integer_bug(self):
df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")])
result = df.pivot(index=1, columns=0, values=2)
repr(result)
tm.assert_index_equal(result.columns, Index(['A', 'B'], name=0))
def test_pivot_index_none(self):
# gh-3962
data = {
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
'values': [1., 2., 3., 3., 2., 1.]
}
frame = DataFrame(data).set_index('index')
result = frame.pivot(columns='columns', values='values')
expected = DataFrame({
'One': {'A': 1., 'B': 2., 'C': 3.},
'Two': {'A': 1., 'B': 2., 'C': 3.}
})
expected.index.name, expected.columns.name = 'index', 'columns'
assert_frame_equal(result, expected)
# omit values
result = frame.pivot(columns='columns')
expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
('values', 'Two')],
names=[None, 'columns'])
expected.index.name = 'index'
tm.assert_frame_equal(result, expected, check_names=False)
assert result.index.name == 'index'
assert result.columns.names == (None, 'columns')
expected.columns = expected.columns.droplevel(0)
result = frame.pivot(columns='columns', values='values')
expected.columns.name = 'columns'
tm.assert_frame_equal(result, expected)
def test_stack_unstack(self):
df = self.frame.copy()
df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)
stacked = df.stack()
stacked_df = DataFrame({'foo': stacked, 'bar': stacked})
unstacked = stacked.unstack()
unstacked_df = stacked_df.unstack()
assert_frame_equal(unstacked, df)
assert_frame_equal(unstacked_df['bar'], df)
unstacked_cols = stacked.unstack(0)
unstacked_cols_df = stacked_df.unstack(0)
assert_frame_equal(unstacked_cols.T, df)
assert_frame_equal(unstacked_cols_df['bar'].T, df)
def test_stack_mixed_level(self):
# GH 18310
levels = [range(3), [3, 'a', 'b'], [1, 2]]
# flat columns:
df = DataFrame(1, index=levels[0], columns=levels[1])
result = df.stack()
expected = Series(1, index=MultiIndex.from_product(levels[:2]))
assert_series_equal(result, expected)
# MultiIndex columns:
df = DataFrame(1, index=levels[0],
columns=MultiIndex.from_product(levels[1:]))
result = df.stack(1)
expected = DataFrame(1, index=MultiIndex.from_product([levels[0],
levels[2]]),
columns=levels[1])
assert_frame_equal(result, expected)
# as above, but used labels in level are actually of homogeneous type
result = df[['a', 'b']].stack(1)
expected = expected[['a', 'b']]
assert_frame_equal(result, expected)
def test_unstack_fill(self):
# GH #9746: fill_value keyword argument for Series
# and DataFrame unstack
# From a series
data = Series([1, 2, 4, 5], dtype=np.int16)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack(fill_value=-1)
expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
index=['x', 'y', 'z'], dtype=np.int16)
assert_frame_equal(result, expected)
# From a series with incorrect data type for fill_value
result = data.unstack(fill_value=0.5)
expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
index=['x', 'y', 'z'], dtype=np.float)
assert_frame_equal(result, expected)
# GH #13971: fill_value when unstacking multiple levels:
df = DataFrame({'x': ['a', 'a', 'b'],
'y': ['j', 'k', 'j'],
'z': [0, 1, 2],
'w': [0, 1, 2]}).set_index(['x', 'y', 'z'])
unstacked = df.unstack(['x', 'y'], fill_value=0)
key = ('w', 'b', 'j')
expected = unstacked[key]
result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
assert_series_equal(result, expected)
stacked = unstacked.stack(['x', 'y'])
stacked.index = stacked.index.reorder_levels(df.index.names)
# Workaround for GH #17886 (unnecessarily casts to float):
stacked = stacked.astype(np.int64)
result = stacked.loc[df.index]
assert_frame_equal(result, df)
# From a series
s = df['w']
result = s.unstack(['x', 'y'], fill_value=0)
expected = unstacked['w']
assert_frame_equal(result, expected)
def test_unstack_fill_frame(self):
# From a dataframe
rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
df.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = df.unstack(fill_value=-1)
rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
expected.columns = MultiIndex.from_tuples(
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
assert_frame_equal(result, expected)
# From a mixed type dataframe
df['A'] = df['A'].astype(np.int16)
df['B'] = df['B'].astype(np.float64)
result = df.unstack(fill_value=-1)
expected['A'] = expected['A'].astype(np.int16)
expected['B'] = expected['B'].astype(np.float64)
assert_frame_equal(result, expected)
# From a dataframe with incorrect data type for fill_value
result = df.unstack(fill_value=0.5)
rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
expected.columns = MultiIndex.from_tuples(
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_datetime(self):
# Test unstacking with date times
dv = pd.date_range('2012-01-01', periods=4).values
data = Series(dv)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack()
expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
'b': [dv[1], dv[2], pd.NaT]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
result = data.unstack(fill_value=dv[0])
expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
'b': [dv[1], dv[2], dv[0]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_timedelta(self):
# Test unstacking with time deltas
td = [Timedelta(days=i) for i in range(4)]
data = Series(td)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack()
expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
'b': [td[1], td[2], pd.NaT]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
result = data.unstack(fill_value=td[1])
expected = DataFrame({'a': [td[0], td[1], td[3]],
'b': [td[1], td[2], td[1]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_period(self):
# Test unstacking with period
periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
Period('2012-04')]
data = Series(periods)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack()
expected = DataFrame({'a': [periods[0], None, periods[3]],
'b': [periods[1], periods[2], None]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
result = data.unstack(fill_value=periods[1])
expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
'b': [periods[1], periods[2], periods[1]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_categorical(self):
# Test unstacking with categorical
data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
data.index = pd.MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')],
)
# By default missing values will be NaN
result = data.unstack()
expected = DataFrame({'a': pd.Categorical(list('axa'),
categories=list('abc')),
'b': pd.Categorical(list('bcx'),
categories=list('abc'))},
index=list('xyz'))
assert_frame_equal(result, expected)
# Fill with non-category results in a TypeError
msg = r"'fill_value' \('d'\) is not in"
with pytest.raises(TypeError, match=msg):
data.unstack(fill_value='d')
# Fill with category value replaces missing values as expected
result = data.unstack(fill_value='c')
expected = DataFrame({'a': pd.Categorical(list('aca'),
categories=list('abc')),
'b': pd.Categorical(list('bcc'),
categories=list('abc'))},
index=list('xyz'))
assert_frame_equal(result, expected)
def test_unstack_preserve_dtypes(self):
# Checks fix for #11847
df = pd.DataFrame(dict(state=['IL', 'MI', 'NC'],
index=['a', 'b', 'c'],
some_categories=pd.Series(['a', 'b', 'c']
).astype('category'),
A=np.random.rand(3),
B=1,
C='foo',
D=pd.Timestamp('20010102'),
E=pd.Series([1.0, 50.0, 100.0]
).astype('float32'),
F=pd.Series([3.0, 4.0, 5.0]).astype('float64'),
G=False,
H=pd.Series([1, 200, 923442], dtype='int8')))
def unstack_and_compare(df, column_name):
unstacked1 = df.unstack([column_name])
unstacked2 = df.unstack(column_name)
assert_frame_equal(unstacked1, unstacked2)
df1 = df.set_index(['state', 'index'])
unstack_and_compare(df1, 'index')
df1 = df.set_index(['state', 'some_categories'])
unstack_and_compare(df1, 'some_categories')
df1 = df.set_index(['F', 'C'])
unstack_and_compare(df1, 'F')
df1 = df.set_index(['G', 'B', 'state'])
unstack_and_compare(df1, 'B')
df1 = df.set_index(['E', 'A'])
unstack_and_compare(df1, 'E')
df1 = df.set_index(['state', 'index'])
s = df1['A']
unstack_and_compare(s, 'index')
def test_stack_ints(self):
columns = MultiIndex.from_tuples(list(itertools.product(range(3),
repeat=3)))
df = DataFrame(np.random.randn(30, 27), columns=columns)
assert_frame_equal(df.stack(level=[1, 2]),
df.stack(level=1).stack(level=1))
assert_frame_equal(df.stack(level=[-2, -1]),
df.stack(level=1).stack(level=1))
df_named = df.copy()
df_named.columns.set_names(range(3), inplace=True)
assert_frame_equal(df_named.stack(level=[1, 2]),
df_named.stack(level=1).stack(level=1))
def test_stack_mixed_levels(self):
columns = MultiIndex.from_tuples(
[('A', 'cat', 'long'), ('B', 'cat', 'long'),
('A', 'dog', 'short'), ('B', 'dog', 'short')],
names=['exp', 'animal', 'hair_length']
)
df = DataFrame(np.random.randn(4, 4), columns=columns)
animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
# GH #8584: Need to check that stacking works when a number
# is passed that is both a level name and in the range of
# the level numbers
df2 = df.copy()
df2.columns.names = ['exp', 'animal', 1]
assert_frame_equal(df2.stack(level=['animal', 1]),
animal_hair_stacked, check_names=False)
assert_frame_equal(df2.stack(level=['exp', 1]),
exp_hair_stacked, check_names=False)
# When mixed types are passed and the ints are not level
# names, raise
pytest.raises(ValueError, df2.stack, level=['animal', 0])
# GH #8584: Having 0 in the level names could raise a
# strange error about lexsort depth
df3 = df.copy()
df3.columns.names = ['exp', 'animal', 0]
assert_frame_equal(df3.stack(level=['animal', 0]),
animal_hair_stacked, check_names=False)
def test_stack_int_level_names(self):
columns = MultiIndex.from_tuples(
[('A', 'cat', 'long'), ('B', 'cat', 'long'),
('A', 'dog', 'short'), ('B', 'dog', 'short')],
names=['exp', 'animal', 'hair_length']
)
df = DataFrame(np.random.randn(4, 4), columns=columns)
exp_animal_stacked = df.stack(level=['exp', 'animal'])
animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
df2 = df.copy()
df2.columns.names = [0, 1, 2]
assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked,
check_names=False)
assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked,
check_names=False)
assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked,
check_names=False)
# Out-of-order int column names
df3 = df.copy()
df3.columns.names = [2, 0, 1]
assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked,
check_names=False)
assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked,
check_names=False)
assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked,
check_names=False)
def test_unstack_bool(self):
df = DataFrame([False, False],
index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),
columns=['col'])
rs = df.unstack()
xp = DataFrame(np.array([[False, np.nan], [np.nan, False]],
dtype=object),
index=['a', 'b'],
columns=MultiIndex.from_arrays([['col', 'col'],
['c', 'l']]))
assert_frame_equal(rs, xp)
def test_unstack_level_binding(self):
# GH9856
mi = pd.MultiIndex(
levels=[[u('foo'), u('bar')], [u('one'), u('two')],
[u('a'), u('b')]],
codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
names=[u('first'), u('second'), u('third')])
s = pd.Series(0, index=mi)
result = s.unstack([1, 2]).stack(0)
expected_mi = pd.MultiIndex(
levels=[['foo', 'bar'], ['one', 'two']],
codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
names=['first', 'second'])
expected = pd.DataFrame(np.array([[np.nan, 0],
[0, np.nan],
[np.nan, 0],
[0, np.nan]],
dtype=np.float64),
index=expected_mi,
columns=pd.Index(['a', 'b'], name='third'))
assert_frame_equal(result, expected)
def test_unstack_to_series(self):
# check reversibility
data = self.frame.unstack()
assert isinstance(data, Series)
undo = data.unstack().T
assert_frame_equal(undo, self.frame)
# check NA handling
data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]})
data.index = Index(['a', 'b', 'c'])
result = data.unstack()
midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']],
codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)
assert_series_equal(result, expected)
# check composability of unstack
old_data = data.copy()
for _ in range(4):
data = data.unstack()
assert_frame_equal(old_data, data)
def test_unstack_dtypes(self):
# GH 2929
rows = [[1, 1, 3, 4],
[1, 2, 3, 4],
[2, 1, 3, 4],
[2, 2, 3, 4]]
df = DataFrame(rows, columns=list('ABCD'))
result = df.get_dtype_counts()
expected = Series({'int64': 4})
assert_series_equal(result, expected)
# single dtype
df2 = df.set_index(['A', 'B'])
df3 = df2.unstack('B')
result = df3.get_dtype_counts()
expected = Series({'int64': 4})
assert_series_equal(result, expected)
# mixed
df2 = df.set_index(['A', 'B'])
df2['C'] = 3.
df3 = df2.unstack('B')
result = df3.get_dtype_counts()
expected = Series({'int64': 2, 'float64': 2})
assert_series_equal(result, expected)
df2['D'] = 'foo'
df3 = df2.unstack('B')
result = df3.get_dtype_counts()
expected = Series({'float64': 2, 'object': 2})
assert_series_equal(result, expected)
# GH7405
for c, d in (np.zeros(5), np.zeros(5)), \
(np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')):
df = DataFrame({'A': ['a'] * 5, 'C': c, 'D': d,
'B': pd.date_range('2012-01-01', periods=5)})
right = df.iloc[:3].copy(deep=True)
df = df.set_index(['A', 'B'])
df['D'] = df['D'].astype('int64')
left = df.iloc[:3].unstack(0)
right = right.set_index(['A', 'B']).unstack(0)
right[('D', 'a')] = right[('D', 'a')].astype('int64')
assert left.shape == (3, 2)
tm.assert_frame_equal(left, right)
def test_unstack_non_unique_index_names(self):
idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
names=['c1', 'c1'])
df = DataFrame([1, 2], index=idx)
with pytest.raises(ValueError):
df.unstack('c1')
with pytest.raises(ValueError):
df.T.stack('c1')
def test_unstack_unused_levels(self):
# GH 17845: unused codes in index make unstack() cast int to float
idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]
df = pd.DataFrame([[1, 0]] * 3, index=idx)
result = df.unstack()
exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']])
expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'],
columns=exp_col)
tm.assert_frame_equal(result, expected)
assert((result.columns.levels[1] == idx.levels[1]).all())
# Unused items on both levels
levels = [[0, 1, 7], [0, 1, 2, 3]]
codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
idx = pd.MultiIndex(levels, codes)
block = np.arange(4).reshape(2, 2)
df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx)
result = df.unstack()
expected = pd.DataFrame(np.concatenate([block * 2, block * 2 + 1],
axis=1),
columns=idx)
tm.assert_frame_equal(result, expected)
assert((result.columns.levels[1] == idx.levels[1]).all())
# With mixed dtype and NaN
levels = [['a', 2, 'c'], [1, 3, 5, 7]]
codes = [[0, -1, 1, 1], [0, 2, -1, 2]]
idx = pd.MultiIndex(levels, codes)
data = np.arange(8)
df = pd.DataFrame(data.reshape(4, 2), index=idx)
cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11],
[np.nan, 'a', 2], [np.nan, 5, 1]),
(1, [8, 11, 1, 4, 12, 15, 13, 16],
[np.nan, 5, 1], [np.nan, 'a', 2]))
for level, idces, col_level, idx_level in cases:
result = df.unstack(level=level)
exp_data = np.zeros(18) * np.nan
exp_data[idces] = data
cols = pd.MultiIndex.from_product([[0, 1], col_level])
expected = pd.DataFrame(exp_data.reshape(3, 6),
index=idx_level, columns=cols)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("cols", [['A', 'C'], slice(None)])
def test_unstack_unused_level(self, cols):
# GH 18562 : unused codes on the unstacked level
df = pd.DataFrame([[2010, 'a', 'I'],
[2011, 'b', 'II']],
columns=['A', 'B', 'C'])
ind = df.set_index(['A', 'B', 'C'], drop=False)
selection = ind.loc[(slice(None), slice(None), 'I'), cols]
result = selection.unstack()
expected = ind.iloc[[0]][cols]
expected.columns = MultiIndex.from_product([expected.columns, ['I']],
names=[None, 'C'])
expected.index = expected.index.droplevel('C')
tm.assert_frame_equal(result, expected)
def test_unstack_nan_index(self): # GH7466
cast = lambda val: '{0:1}'.format('' if val != val else val)
def verify(df):
mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
rows, cols = df.notna().values.nonzero()
for i, j in zip(rows, cols):
left = sorted(df.iloc[i, j].split('.'))
right = mk_list(df.index[i]) + mk_list(df.columns[j])
right = sorted(list(map(cast, right)))
assert left == right
df = DataFrame({'jim': ['a', 'b', np.nan, 'd'],
'joe': ['w', 'x', 'y', 'z'],
'jolie': ['a.w', 'b.x', ' .y', 'd.z']})
left = df.set_index(['jim', 'joe']).unstack()['jolie']
right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
assert_frame_equal(left, right)
for idx in itertools.permutations(df.columns[:2]):
mi = df.set_index(list(idx))
for lev in range(2):
udf = mi.unstack(level=lev)
assert udf.notna().values.sum() == len(df)
verify(udf['jolie'])
df = DataFrame({'1st': ['d'] * 3 + [np.nan] * 5 + ['a'] * 2 +
['c'] * 3 + ['e'] * 2 + ['b'] * 5,
'2nd': ['y'] * 2 + ['w'] * 3 + [np.nan] * 3 +
['z'] * 4 + [np.nan] * 3 + ['x'] * 3 + [np.nan] * 2,
'3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59,
50, 62, 59, 76, 52, 14, 53, 60, 51]})
df['4th'], df['5th'] = \
df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \
df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1)
for idx in itertools.permutations(['1st', '2nd', '3rd']):
mi = df.set_index(list(idx))
for lev in range(3):
udf = mi.unstack(level=lev)
assert udf.notna().values.sum() == 2 * len(df)
for col in ['4th', '5th']:
verify(udf[col])
# GH7403
df = pd.DataFrame(
{'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)})
df.iloc[3, 1] = np.NaN
left = df.set_index(['A', 'B']).unstack(0)
vals = [[3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan],
[np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7]]
vals = list(map(list, zip(*vals)))
idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name='B')
cols = MultiIndex(levels=[['C'], ['a', 'b']],
codes=[[0, 0], [0, 1]],
names=[None, 'A'])
right = DataFrame(vals, columns=cols, index=idx)
assert_frame_equal(left, right)
df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
'C': range(8)})
df.iloc[2, 1] = np.NaN
left = df.set_index(['A', 'B']).unstack(0)
vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]]
cols = MultiIndex(levels=[['C'], ['a', 'b']],
codes=[[0, 0], [0, 1]],
names=[None, 'A'])
idx = Index([np.nan, 0, 1, 2, 3], name='B')
right = DataFrame(vals, columns=cols, index=idx)
assert_frame_equal(left, right)
df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
'C': range(8)})
df.iloc[3, 1] = np.NaN
left = df.set_index(['A', 'B']).unstack(0)
vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]]
cols = MultiIndex(levels=[['C'], ['a', 'b']],
codes=[[0, 0], [0, 1]],
names=[None, 'A'])
idx = Index([np.nan, 0, 1, 2, 3], name='B')
right = DataFrame(vals, columns=cols, index=idx)
assert_frame_equal(left, right)
# GH7401
df = pd.DataFrame({'A': list('aaaaabbbbb'),
'B': (date_range('2012-01-01', periods=5)
.tolist() * 2),
'C': np.arange(10)})
df.iloc[3, 1] = np.NaN
left = df.set_index(['A', 'B']).unstack()
vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]])
idx = Index(['a', 'b'], name='A')
cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)],
codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
names=[None, 'B'])
right = DataFrame(vals, columns=cols, index=idx)
assert_frame_equal(left, right)
# GH4862
vals = [['Hg', np.nan, np.nan, 680585148],
['U', 0.0, np.nan, 680585148],
['Pb', 7.07e-06, np.nan, 680585148],
['Sn', 2.3614e-05, 0.0133, 680607017],
['Ag', 0.0, 0.0133, 680607017],
['Hg', -0.00015, 0.0133, 680607017]]
df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'],
index=[17263, 17264, 17265, 17266, 17267, 17268])
left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack()
vals = [[np.nan, np.nan, 7.07e-06, np.nan, 0.0],
[0.0, -0.00015, np.nan, 2.3614e-05, np.nan]]
idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]],
codes=[[0, 1], [-1, 0]],
names=['s_id', 'dosage'])
cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']],
codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
names=[None, 'agent'])
right = DataFrame(vals, columns=cols, index=idx)
assert_frame_equal(left, right)
left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent'])
assert_frame_equal(left.unstack(), right)
# GH9497 - multiple unstack with nulls
df = DataFrame({'1st': [1, 2, 1, 2, 1, 2],
'2nd': pd.date_range('2014-02-01', periods=6,
freq='D'),
'jim': 100 + np.arange(6),
'joe': (np.random.randn(6) * 10).round(2)})
df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
df.loc[1, '2nd'] = df.loc[3, '2nd'] = np.nan
df.loc[1, '3rd'] = df.loc[4, '3rd'] = np.nan
left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
assert left.notna().values.sum() == 2 * len(df)
for col in ['jim', 'joe']:
for _, r in df.iterrows():
key = r['1st'], (col, r['2nd'], r['3rd'])
assert r[col] == left.loc[key]
def test_stack_datetime_column_multiIndex(self):
# GH 8039
t = datetime(2014, 1, 1)
df = DataFrame(
[1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, 'A', 'B')]))
result = df.stack()
eidx = MultiIndex.from_product([(0, 1, 2, 3), ('B',)])
ecols = MultiIndex.from_tuples([(t, 'A')])
expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
assert_frame_equal(result, expected)
def test_stack_partial_multiIndex(self):
# GH 8844
def _test_stack_with_multiindex(multiindex):
df = DataFrame(np.arange(3 * len(multiindex))
.reshape(3, len(multiindex)),
columns=multiindex)
for level in (-1, 0, 1, [0, 1], [1, 0]):
result = df.stack(level=level, dropna=False)
if isinstance(level, int):
# Stacking a single level should not make any all-NaN rows,
# so df.stack(level=level, dropna=False) should be the same
# as df.stack(level=level, dropna=True).
expected = df.stack(level=level, dropna=True)
if isinstance(expected, Series):
assert_series_equal(result, expected)
else:
assert_frame_equal(result, expected)
df.columns = MultiIndex.from_tuples(df.columns.get_values(),
names=df.columns.names)
expected = df.stack(level=level, dropna=False)
if isinstance(expected, Series):
assert_series_equal(result, expected)
else:
assert_frame_equal(result, expected)
full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'),
('A', 'y'),
('C', 'x'), ('C', 'u')],
names=['Upper', 'Lower'])
for multiindex_columns in ([0, 1, 2, 3, 4],
[0, 1, 2, 3], [0, 1, 2, 4],
[0, 1, 2], [1, 2, 3], [2, 3, 4],
[0, 1], [0, 2], [0, 3],
[0], [2], [4]):
_test_stack_with_multiindex(full_multiindex[multiindex_columns])
if len(multiindex_columns) > 1:
multiindex_columns.reverse()
_test_stack_with_multiindex(
full_multiindex[multiindex_columns])
df = DataFrame(np.arange(6).reshape(2, 3),
columns=full_multiindex[[0, 1, 3]])
result = df.stack(dropna=False)
expected = DataFrame([[0, 2], [1, np.nan], [3, 5], [4, np.nan]],
index=MultiIndex(
levels=[[0, 1], ['u', 'x', 'y', 'z']],
codes=[[0, 0, 1, 1],
[1, 3, 1, 3]],
names=[None, 'Lower']),
columns=Index(['B', 'C'], name='Upper'),
dtype=df.dtypes[0])
assert_frame_equal(result, expected)
@pytest.mark.parametrize('ordered', [False, True])
@pytest.mark.parametrize('labels', [list("yxz"), list("yxy")])
def test_stack_preserve_categorical_dtype(self, ordered, labels):
# GH13854
cidx = pd.CategoricalIndex(labels, categories=list("xyz"),
ordered=ordered)
df = DataFrame([[10, 11, 12]], columns=cidx)
result = df.stack()
# `MutliIndex.from_product` preserves categorical dtype -
# it's tested elsewhere.
midx = pd.MultiIndex.from_product([df.index, cidx])
expected = Series([10, 11, 12], index=midx)
tm.assert_series_equal(result, expected)
def test_stack_preserve_categorical_dtype_values(self):
# GH-23077
cat = pd.Categorical(['a', 'a', 'b', 'c'])
df = pd.DataFrame({"A": cat, "B": cat})
result = df.stack()
index = pd.MultiIndex.from_product([[0, 1, 2, 3], ['A', 'B']])
expected = pd.Series(pd.Categorical(['a', 'a', 'a', 'a',
'b', 'b', 'c', 'c']),
index=index)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('level', [0, 1])
def test_unstack_mixed_extension_types(self, level):
index = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 1)],
names=['a', 'b'])
df = pd.DataFrame({"A": pd.core.arrays.integer_array([0, 1, None]),
"B": pd.Categorical(['a', 'a', 'b'])}, index=index)
result = df.unstack(level=level)
expected = df.astype(object).unstack(level=level)
expected_dtypes = pd.Series([df.A.dtype] * 2 + [df.B.dtype] * 2,
index=result.columns)
tm.assert_series_equal(result.dtypes, expected_dtypes)
tm.assert_frame_equal(result.astype(object), expected)
@pytest.mark.parametrize("level", [0, 'baz'])
def test_unstack_swaplevel_sortlevel(self, level):
# GH 20994
mi = pd.MultiIndex.from_product([[0], ['d', 'c']],
names=['bar', 'baz'])
df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A'])
df.columns.name = 'foo'
expected = pd.DataFrame([
[3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([
('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[
'baz', 'foo']))
expected.index.name = 'bar'
result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
tm.assert_frame_equal(result, expected)
def test_unstack_fill_frame_object():
# GH12815 Test unstacking with object.
data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')
data.index = pd.MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
# By default missing values will be NaN
result = data.unstack()
expected = pd.DataFrame(
{'a': ['a', np.nan, 'a'], 'b': ['b', 'c', np.nan]},
index=list('xyz')
)
assert_frame_equal(result, expected)
# Fill with any value replaces missing values as expected
result = data.unstack(fill_value='d')
expected = pd.DataFrame(
{'a': ['a', 'd', 'a'], 'b': ['b', 'c', 'd']},
index=list('xyz')
)
assert_frame_equal(result, expected)
def test_unstack_timezone_aware_values():
# GH 18338
df = pd.DataFrame({
'timestamp': [
pd.Timestamp('2017-08-27 01:00:00.709949+0000', tz='UTC')],
'a': ['a'],
'b': ['b'],
'c': ['c'],
}, columns=['timestamp', 'a', 'b', 'c'])
result = df.set_index(['a', 'b']).unstack()
expected = pd.DataFrame([[pd.Timestamp('2017-08-27 01:00:00.709949+0000',
tz='UTC'),
'c']],
index=pd.Index(['a'], name='a'),
columns=pd.MultiIndex(
levels=[['timestamp', 'c'], ['b']],
codes=[[0, 1], [0, 0]],
names=[None, 'b']))
assert_frame_equal(result, expected)
def test_stack_timezone_aware_values():
# GH 19420
ts = pd.date_range(freq="D", start="20180101", end="20180103",
tz="America/New_York")
df = pd.DataFrame({"A": ts}, index=["a", "b", "c"])
result = df.stack()
expected = pd.Series(ts,
index=pd.MultiIndex(levels=[['a', 'b', 'c'], ['A']],
codes=[[0, 1, 2], [0, 0, 0]]))
assert_series_equal(result, expected)
@@ -0,0 +1,96 @@
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
from pandas import DataFrame
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal
@pytest.fixture
def df_none():
return DataFrame({
'outer': ['a', 'a', 'a', 'b', 'b', 'b'],
'inner': [1, 2, 2, 2, 1, 1],
'A': np.arange(6, 0, -1),
('B', 5): ['one', 'one', 'two', 'two', 'one', 'one']})
@pytest.fixture(params=[
['outer'],
['outer', 'inner']
])
def df_idx(request, df_none):
levels = request.param
return df_none.set_index(levels)
@pytest.fixture(params=[
'inner', # index level
['outer'], # list of index level
'A', # column
[('B', 5)], # list of column
['inner', 'outer'], # two index levels
[('B', 5), 'outer'], # index level and column
['A', ('B', 5)], # Two columns
['inner', 'outer'] # two index levels and column
])
def sort_names(request):
return request.param
@pytest.fixture(params=[True, False])
def ascending(request):
return request.param
def test_sort_index_level_and_column_label(
df_none, df_idx, sort_names, ascending):
# GH 14353
# Get index levels from df_idx
levels = df_idx.index.names
# Compute expected by sorting on columns and the setting index
expected = df_none.sort_values(by=sort_names,
ascending=ascending,
axis=0).set_index(levels)
# Compute result sorting on mix on columns and index levels
result = df_idx.sort_values(by=sort_names,
ascending=ascending,
axis=0)
assert_frame_equal(result, expected)
def test_sort_column_level_and_index_label(
df_none, df_idx, sort_names, ascending):
# GH 14353
# Get levels from df_idx
levels = df_idx.index.names
# Compute expected by sorting on axis=0, setting index levels, and then
# transposing. For some cases this will result in a frame with
# multiple column levels
expected = df_none.sort_values(by=sort_names,
ascending=ascending,
axis=0).set_index(levels).T
# Compute result by transposing and sorting on axis=1.
result = df_idx.T.sort_values(by=sort_names,
ascending=ascending,
axis=1)
if len(levels) > 1:
# Accessing multi-level columns that are not lexsorted raises a
# performance warning
with tm.assert_produces_warning(PerformanceWarning,
check_stacklevel=False):
assert_frame_equal(result, expected)
else:
assert_frame_equal(result, expected)
@@ -0,0 +1,670 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import random
import numpy as np
import pytest
from pandas.compat import lrange
import pandas as pd
from pandas import (
Categorical, DataFrame, IntervalIndex, MultiIndex, NaT, Series, Timestamp,
date_range)
from pandas.api.types import CategoricalDtype
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestDataFrameSorting(TestData):
def test_sort_values(self):
frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]],
index=[1, 2, 3], columns=list('ABC'))
# by column (axis=0)
sorted_df = frame.sort_values(by='A')
indexer = frame['A'].argsort().values
expected = frame.loc[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by='A', ascending=False)
indexer = indexer[::-1]
expected = frame.loc[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by='A', ascending=False)
assert_frame_equal(sorted_df, expected)
# GH4839
sorted_df = frame.sort_values(by=['A'], ascending=[False])
assert_frame_equal(sorted_df, expected)
# multiple bys
sorted_df = frame.sort_values(by=['B', 'C'])
expected = frame.loc[[2, 1, 3]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=['B', 'C'], ascending=False)
assert_frame_equal(sorted_df, expected[::-1])
sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False])
assert_frame_equal(sorted_df, expected)
pytest.raises(ValueError, lambda: frame.sort_values(
by=['A', 'B'], axis=2, inplace=True))
# by row (axis=1): GH 10806
sorted_df = frame.sort_values(by=3, axis=1)
expected = frame
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
expected = frame.reindex(columns=['C', 'B', 'A'])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 2], axis='columns')
expected = frame.reindex(columns=['B', 'A', 'C'])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 3], axis=1,
ascending=[True, False])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
expected = frame.reindex(columns=['C', 'B', 'A'])
assert_frame_equal(sorted_df, expected)
msg = r'Length of ascending \(5\) != length of by \(2\)'
with pytest.raises(ValueError, match=msg):
frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5)
def test_sort_values_inplace(self):
frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
sorted_df = frame.copy()
sorted_df.sort_values(by='A', inplace=True)
expected = frame.sort_values(by='A')
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by=1, axis=1, inplace=True)
expected = frame.sort_values(by=1, axis=1)
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by='A', ascending=False, inplace=True)
expected = frame.sort_values(by='A', ascending=False)
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True)
expected = frame.sort_values(by=['A', 'B'], ascending=False)
assert_frame_equal(sorted_df, expected)
def test_sort_nan(self):
# GH3917
nan = np.nan
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]})
# sort one column only
expected = DataFrame(
{'A': [nan, 1, 1, 2, 4, 6, 8],
'B': [5, 9, 2, nan, 5, 5, 4]},
index=[2, 0, 3, 1, 6, 4, 5])
sorted_df = df.sort_values(['A'], na_position='first')
assert_frame_equal(sorted_df, expected)
expected = DataFrame(
{'A': [nan, 8, 6, 4, 2, 1, 1],
'B': [5, 4, 5, 5, nan, 9, 2]},
index=[2, 5, 4, 6, 1, 0, 3])
sorted_df = df.sort_values(['A'], na_position='first', ascending=False)
assert_frame_equal(sorted_df, expected)
expected = df.reindex(columns=['B', 'A'])
sorted_df = df.sort_values(by=1, axis=1, na_position='first')
assert_frame_equal(sorted_df, expected)
# na_position='last', order
expected = DataFrame(
{'A': [1, 1, 2, 4, 6, 8, nan],
'B': [2, 9, nan, 5, 5, 4, 5]},
index=[3, 0, 1, 6, 4, 5, 2])
sorted_df = df.sort_values(['A', 'B'])
assert_frame_equal(sorted_df, expected)
# na_position='first', order
expected = DataFrame(
{'A': [nan, 1, 1, 2, 4, 6, 8],
'B': [5, 2, 9, nan, 5, 5, 4]},
index=[2, 3, 0, 1, 6, 4, 5])
sorted_df = df.sort_values(['A', 'B'], na_position='first')
assert_frame_equal(sorted_df, expected)
# na_position='first', not order
expected = DataFrame(
{'A': [nan, 1, 1, 2, 4, 6, 8],
'B': [5, 9, 2, nan, 5, 5, 4]},
index=[2, 0, 3, 1, 6, 4, 5])
sorted_df = df.sort_values(['A', 'B'], ascending=[
1, 0], na_position='first')
assert_frame_equal(sorted_df, expected)
# na_position='last', not order
expected = DataFrame(
{'A': [8, 6, 4, 2, 1, 1, nan],
'B': [4, 5, 5, nan, 2, 9, 5]},
index=[5, 4, 6, 1, 3, 0, 2])
sorted_df = df.sort_values(['A', 'B'], ascending=[
0, 1], na_position='last')
assert_frame_equal(sorted_df, expected)
# Test DataFrame with nan label
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]},
index=[1, 2, 3, 4, 5, 6, nan])
# NaN label, ascending=True, na_position='last'
sorted_df = df.sort_index(
kind='quicksort', ascending=True, na_position='last')
expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]},
index=[1, 2, 3, 4, 5, 6, nan])
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=True, na_position='first'
sorted_df = df.sort_index(na_position='first')
expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8],
'B': [5, 9, nan, 5, 2, 5, 4]},
index=[nan, 1, 2, 3, 4, 5, 6])
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=False, na_position='last'
sorted_df = df.sort_index(kind='quicksort', ascending=False)
expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4],
'B': [4, 5, 2, 5, nan, 9, 5]},
index=[6, 5, 4, 3, 2, 1, nan])
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=False, na_position='first'
sorted_df = df.sort_index(
kind='quicksort', ascending=False, na_position='first')
expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1],
'B': [5, 4, 5, 2, 5, nan, 9]},
index=[nan, 6, 5, 4, 3, 2, 1])
assert_frame_equal(sorted_df, expected)
def test_stable_descending_sort(self):
# GH #6399
df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']],
columns=['sort_col', 'order'])
sorted_df = df.sort_values(by='sort_col', kind='mergesort',
ascending=False)
assert_frame_equal(df, sorted_df)
def test_stable_descending_multicolumn_sort(self):
nan = np.nan
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]})
# test stable mergesort
expected = DataFrame(
{'A': [nan, 8, 6, 4, 2, 1, 1],
'B': [5, 4, 5, 5, nan, 2, 9]},
index=[2, 5, 4, 6, 1, 3, 0])
sorted_df = df.sort_values(['A', 'B'], ascending=[0, 1],
na_position='first',
kind='mergesort')
assert_frame_equal(sorted_df, expected)
expected = DataFrame(
{'A': [nan, 8, 6, 4, 2, 1, 1],
'B': [5, 4, 5, 5, nan, 9, 2]},
index=[2, 5, 4, 6, 1, 0, 3])
sorted_df = df.sort_values(['A', 'B'], ascending=[0, 0],
na_position='first',
kind='mergesort')
assert_frame_equal(sorted_df, expected)
def test_stable_categorial(self):
# GH 16793
df = DataFrame({
'x': pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)
})
expected = df.copy()
sorted_df = df.sort_values('x', kind='mergesort')
assert_frame_equal(sorted_df, expected)
def test_sort_datetimes(self):
# GH 3461, argsort / lexsort differences for a datetime column
df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],
columns=['A'],
index=date_range('20130101', periods=9))
dts = [Timestamp(x)
for x in ['2004-02-11', '2004-01-21', '2004-01-26',
'2005-09-20', '2010-10-04', '2009-05-12',
'2008-11-12', '2010-09-28', '2010-09-28']]
df['B'] = dts[::2] + dts[1::2]
df['C'] = 2.
df['A1'] = 3.
df1 = df.sort_values(by='A')
df2 = df.sort_values(by=['A'])
assert_frame_equal(df1, df2)
df1 = df.sort_values(by='B')
df2 = df.sort_values(by=['B'])
assert_frame_equal(df1, df2)
df1 = df.sort_values(by='B')
df2 = df.sort_values(by=['C', 'B'])
assert_frame_equal(df1, df2)
def test_frame_column_inplace_sort_exception(self):
s = self.frame['A']
with pytest.raises(ValueError, match="This Series is a view"):
s.sort_values(inplace=True)
cp = s.copy()
cp.sort_values() # it works!
def test_sort_nat_values_in_int_column(self):
# GH 14922: "sorting with large float and multiple columns incorrect"
# cause was that the int64 value NaT was considered as "na". Which is
# only correct for datetime64 columns.
int_values = (2, int(NaT))
float_values = (2.0, -1.797693e308)
df = DataFrame(dict(int=int_values, float=float_values),
columns=["int", "float"])
df_reversed = DataFrame(dict(int=int_values[::-1],
float=float_values[::-1]),
columns=["int", "float"],
index=[1, 0])
# NaT is not a "na" for int64 columns, so na_position must not
# influence the result:
df_sorted = df.sort_values(["int", "float"], na_position="last")
assert_frame_equal(df_sorted, df_reversed)
df_sorted = df.sort_values(["int", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)
# reverse sorting order
df_sorted = df.sort_values(["int", "float"], ascending=False)
assert_frame_equal(df_sorted, df)
# and now check if NaT is still considered as "na" for datetime64
# columns:
df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT],
float=float_values), columns=["datetime", "float"])
df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")],
float=float_values[::-1]),
columns=["datetime", "float"],
index=[1, 0])
df_sorted = df.sort_values(["datetime", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)
df_sorted = df.sort_values(["datetime", "float"], na_position="last")
assert_frame_equal(df_sorted, df)
# Ascending should not affect the results.
df_sorted = df.sort_values(["datetime", "float"], ascending=False)
assert_frame_equal(df_sorted, df)
def test_sort_nat(self):
# GH 16836
d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01',
np.nan, '2016-01-01']]
d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01',
'2016-01-01', '2015-01-01']]
df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3])
d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01',
'2016-01-01', np.nan]]
d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01',
'2017-01-01', '2016-01-01']]
expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2])
sorted_df = df.sort_values(by=['a', 'b'], )
tm.assert_frame_equal(sorted_df, expected)
class TestDataFrameSortIndexKinds(TestData):
def test_sort_index_multicolumn(self):
A = np.arange(5).repeat(20)
B = np.tile(np.arange(5), 20)
random.shuffle(A)
random.shuffle(B)
frame = DataFrame({'A': A, 'B': B,
'C': np.random.randn(100)})
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=['A', 'B'])
result = frame.sort_values(by=['A', 'B'])
indexer = np.lexsort((frame['B'], frame['A']))
expected = frame.take(indexer)
assert_frame_equal(result, expected)
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=['A', 'B'], ascending=False)
result = frame.sort_values(by=['A', 'B'], ascending=False)
indexer = np.lexsort((frame['B'].rank(ascending=False),
frame['A'].rank(ascending=False)))
expected = frame.take(indexer)
assert_frame_equal(result, expected)
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=['B', 'A'])
result = frame.sort_values(by=['B', 'A'])
indexer = np.lexsort((frame['A'], frame['B']))
expected = frame.take(indexer)
assert_frame_equal(result, expected)
def test_sort_index_inplace(self):
frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
# axis=0
unordered = frame.loc[[3, 2, 4, 1]]
a_id = id(unordered['A'])
df = unordered.copy()
df.sort_index(inplace=True)
expected = frame
assert_frame_equal(df, expected)
assert a_id != id(df['A'])
df = unordered.copy()
df.sort_index(ascending=False, inplace=True)
expected = frame[::-1]
assert_frame_equal(df, expected)
# axis=1
unordered = frame.loc[:, ['D', 'B', 'C', 'A']]
df = unordered.copy()
df.sort_index(axis=1, inplace=True)
expected = frame
assert_frame_equal(df, expected)
df = unordered.copy()
df.sort_index(axis=1, ascending=False, inplace=True)
expected = frame.iloc[:, ::-1]
assert_frame_equal(df, expected)
def test_sort_index_different_sortorder(self):
A = np.arange(20).repeat(5)
B = np.tile(np.arange(5), 20)
indexer = np.random.permutation(100)
A = A.take(indexer)
B = B.take(indexer)
df = DataFrame({'A': A, 'B': B,
'C': np.random.randn(100)})
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=['A', 'B'], ascending=[1, 0])
result = df.sort_values(by=['A', 'B'], ascending=[1, 0])
ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
expected = df.take(ex_indexer)
assert_frame_equal(result, expected)
# test with multiindex, too
idf = df.set_index(['A', 'B'])
result = idf.sort_index(ascending=[1, 0])
expected = idf.take(ex_indexer)
assert_frame_equal(result, expected)
# also, Series!
result = idf['C'].sort_index(ascending=[1, 0])
assert_series_equal(result, expected['C'])
def test_sort_index_duplicates(self):
# with 9816, these are all translated to .sort_values
df = DataFrame([lrange(5, 9), lrange(4)],
columns=['a', 'a', 'b', 'b'])
with pytest.raises(ValueError, match='not unique'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by='a')
with pytest.raises(ValueError, match='not unique'):
df.sort_values(by='a')
with pytest.raises(ValueError, match='not unique'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=['a'])
with pytest.raises(ValueError, match='not unique'):
df.sort_values(by=['a'])
with pytest.raises(ValueError, match='not unique'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
# multi-column 'by' is separate codepath
df.sort_index(by=['a', 'b'])
with pytest.raises(ValueError, match='not unique'):
# multi-column 'by' is separate codepath
df.sort_values(by=['a', 'b'])
# with multi-index
# GH4370
df = DataFrame(np.random.randn(4, 2),
columns=MultiIndex.from_tuples([('a', 0), ('a', 1)]))
with pytest.raises(ValueError, match='level'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by='a')
with pytest.raises(ValueError, match='level'):
df.sort_values(by='a')
# convert tuples to a list of tuples
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=[('a', 1)])
expected = df.sort_values(by=[('a', 1)])
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=('a', 1))
result = df.sort_values(by=('a', 1))
assert_frame_equal(result, expected)
def test_sort_index_level(self):
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
df = DataFrame([[1, 2], [3, 4]], mi)
res = df.sort_index(level='A', sort_remaining=False)
assert_frame_equal(df, res)
res = df.sort_index(level=['A', 'B'], sort_remaining=False)
assert_frame_equal(df, res)
def test_sort_index_categorical_index(self):
df = (DataFrame({'A': np.arange(6, dtype='int64'),
'B': Series(list('aabbca'))
.astype(CategoricalDtype(list('cab')))})
.set_index('B'))
result = df.sort_index()
expected = df.iloc[[4, 0, 1, 5, 2, 3]]
assert_frame_equal(result, expected)
result = df.sort_index(ascending=False)
expected = df.iloc[[3, 2, 5, 1, 0, 4]]
assert_frame_equal(result, expected)
def test_sort_index(self):
# GH13496
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
# axis=0 : sort rows by index labels
unordered = frame.loc[[3, 2, 4, 1]]
result = unordered.sort_index(axis=0)
expected = frame
assert_frame_equal(result, expected)
result = unordered.sort_index(ascending=False)
expected = frame[::-1]
assert_frame_equal(result, expected)
# axis=1 : sort columns by column names
unordered = frame.iloc[:, [2, 1, 3, 0]]
result = unordered.sort_index(axis=1)
assert_frame_equal(result, frame)
result = unordered.sort_index(axis=1, ascending=False)
expected = frame.iloc[:, ::-1]
assert_frame_equal(result, expected)
@pytest.mark.parametrize("level", ['A', 0]) # GH 21052
def test_sort_index_multiindex(self, level):
# GH13496
# sort rows by specified level of multi-index
mi = MultiIndex.from_tuples([
[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC'))
df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)
expected_mi = MultiIndex.from_tuples([
[1, 1, 1],
[2, 1, 2],
[2, 1, 3]], names=list('ABC'))
expected = pd.DataFrame([
[5, 6],
[3, 4],
[1, 2]], index=expected_mi)
result = df.sort_index(level=level)
assert_frame_equal(result, expected)
# sort_remaining=False
expected_mi = MultiIndex.from_tuples([
[1, 1, 1],
[2, 1, 3],
[2, 1, 2]], names=list('ABC'))
expected = pd.DataFrame([
[5, 6],
[1, 2],
[3, 4]], index=expected_mi)
result = df.sort_index(level=level, sort_remaining=False)
assert_frame_equal(result, expected)
def test_sort_index_intervalindex(self):
# this is a de-facto sort via unstack
# confirming that we sort in the order of the bins
y = Series(np.random.randn(100))
x1 = Series(np.sign(np.random.randn(100)))
x2 = pd.cut(Series(np.random.randn(100)),
bins=[-3, -0.5, 0, 0.5, 3])
model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])
result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
expected = IntervalIndex.from_tuples(
[(-3.0, -0.5), (-0.5, 0.0),
(0.0, 0.5), (0.5, 3.0)],
closed='right')
result = result.columns.levels[1].categories
tm.assert_index_equal(result, expected)
def test_sort_index_na_position_with_categories(self):
# GH 22556
# Positioning missing value properly when column is Categorical.
categories = ['A', 'B', 'C']
category_indices = [0, 2, 4]
list_of_nans = [np.nan, np.nan]
na_indices = [1, 3]
na_position_first = 'first'
na_position_last = 'last'
column_name = 'c'
reversed_categories = sorted(categories, reverse=True)
reversed_category_indices = sorted(category_indices, reverse=True)
reversed_na_indices = sorted(na_indices, reverse=True)
df = pd.DataFrame({
column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
categories=categories,
ordered=True)})
# sort ascending with na first
result = df.sort_values(by=column_name,
ascending=True,
na_position=na_position_first)
expected = DataFrame({
column_name: Categorical(list_of_nans + categories,
categories=categories,
ordered=True)
}, index=na_indices + category_indices)
assert_frame_equal(result, expected)
# sort ascending with na last
result = df.sort_values(by=column_name,
ascending=True,
na_position=na_position_last)
expected = DataFrame({
column_name: Categorical(categories + list_of_nans,
categories=categories,
ordered=True)
}, index=category_indices + na_indices)
assert_frame_equal(result, expected)
# sort descending with na first
result = df.sort_values(by=column_name,
ascending=False,
na_position=na_position_first)
expected = DataFrame({
column_name: Categorical(list_of_nans + reversed_categories,
categories=categories,
ordered=True)
}, index=reversed_na_indices + reversed_category_indices)
assert_frame_equal(result, expected)
# sort descending with na last
result = df.sort_values(by=column_name,
ascending=False,
na_position=na_position_last)
expected = DataFrame({
column_name: Categorical(reversed_categories + list_of_nans,
categories=categories,
ordered=True)
}, index=reversed_category_indices + reversed_na_indices)
assert_frame_equal(result, expected)
def test_sort_index_na_position_with_categories_raises(self):
df = pd.DataFrame({
'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
categories=['A', 'B', 'C'],
ordered=True)})
with pytest.raises(ValueError):
df.sort_values(by='c',
ascending=False,
na_position='bad_position')
@@ -0,0 +1,573 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Panel, Series
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
class TestDataFrameSubclassing(TestData):
def test_frame_subclassing_and_slicing(self):
# Subclass frame and ensure it returns the right class on slicing it
# In reference to PR 9632
class CustomSeries(Series):
@property
def _constructor(self):
return CustomSeries
def custom_series_function(self):
return 'OK'
class CustomDataFrame(DataFrame):
"""
Subclasses pandas DF, fills DF with simulation results, adds some
custom plotting functions.
"""
def __init__(self, *args, **kw):
super(CustomDataFrame, self).__init__(*args, **kw)
@property
def _constructor(self):
return CustomDataFrame
_constructor_sliced = CustomSeries
def custom_frame_function(self):
return 'OK'
data = {'col1': range(10),
'col2': range(10)}
cdf = CustomDataFrame(data)
# Did we get back our own DF class?
assert isinstance(cdf, CustomDataFrame)
# Do we get back our own Series class after selecting a column?
cdf_series = cdf.col1
assert isinstance(cdf_series, CustomSeries)
assert cdf_series.custom_series_function() == 'OK'
# Do we get back our own DF class after slicing row-wise?
cdf_rows = cdf[1:5]
assert isinstance(cdf_rows, CustomDataFrame)
assert cdf_rows.custom_frame_function() == 'OK'
# Make sure sliced part of multi-index frame is custom class
mcol = pd.MultiIndex.from_tuples([('A', 'A'), ('A', 'B')])
cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
assert isinstance(cdf_multi['A'], CustomDataFrame)
mcol = pd.MultiIndex.from_tuples([('A', ''), ('B', '')])
cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
assert isinstance(cdf_multi2['A'], CustomSeries)
def test_dataframe_metadata(self):
df = tm.SubclassedDataFrame({'X': [1, 2, 3], 'Y': [1, 2, 3]},
index=['a', 'b', 'c'])
df.testattr = 'XXX'
assert df.testattr == 'XXX'
assert df[['X']].testattr == 'XXX'
assert df.loc[['a', 'b'], :].testattr == 'XXX'
assert df.iloc[[0, 1], :].testattr == 'XXX'
# see gh-9776
assert df.iloc[0:1, :].testattr == 'XXX'
# see gh-10553
unpickled = tm.round_trip_pickle(df)
tm.assert_frame_equal(df, unpickled)
assert df._metadata == unpickled._metadata
assert df.testattr == unpickled.testattr
def test_indexing_sliced(self):
# GH 11559
df = tm.SubclassedDataFrame({'X': [1, 2, 3],
'Y': [4, 5, 6],
'Z': [7, 8, 9]},
index=['a', 'b', 'c'])
res = df.loc[:, 'X']
exp = tm.SubclassedSeries([1, 2, 3], index=list('abc'), name='X')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.iloc[:, 1]
exp = tm.SubclassedSeries([4, 5, 6], index=list('abc'), name='Y')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.loc[:, 'Z']
exp = tm.SubclassedSeries([7, 8, 9], index=list('abc'), name='Z')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.loc['a', :]
exp = tm.SubclassedSeries([1, 4, 7], index=list('XYZ'), name='a')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.iloc[1, :]
exp = tm.SubclassedSeries([2, 5, 8], index=list('XYZ'), name='b')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.loc['c', :]
exp = tm.SubclassedSeries([3, 6, 9], index=list('XYZ'), name='c')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
def test_to_panel_expanddim(self):
# GH 9762
class SubclassedFrame(DataFrame):
@property
def _constructor_expanddim(self):
return SubclassedPanel
class SubclassedPanel(Panel):
pass
index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)])
df = SubclassedFrame({'X': [1, 2, 3], 'Y': [4, 5, 6]}, index=index)
result = df.to_panel()
assert isinstance(result, SubclassedPanel)
expected = SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]],
items=['X', 'Y'], major_axis=[0],
minor_axis=[0, 1, 2],
dtype='int64')
tm.assert_panel_equal(result, expected)
def test_subclass_attr_err_propagation(self):
# GH 11808
class A(DataFrame):
@property
def bar(self):
return self.i_dont_exist
with pytest.raises(AttributeError, match='.*i_dont_exist.*'):
A().bar
def test_subclass_align(self):
# GH 12983
df1 = tm.SubclassedDataFrame({'a': [1, 3, 5],
'b': [1, 3, 5]}, index=list('ACE'))
df2 = tm.SubclassedDataFrame({'c': [1, 2, 4],
'd': [1, 2, 4]}, index=list('ABD'))
res1, res2 = df1.align(df2, axis=0)
exp1 = tm.SubclassedDataFrame({'a': [1, np.nan, 3, np.nan, 5],
'b': [1, np.nan, 3, np.nan, 5]},
index=list('ABCDE'))
exp2 = tm.SubclassedDataFrame({'c': [1, 2, np.nan, 4, np.nan],
'd': [1, 2, np.nan, 4, np.nan]},
index=list('ABCDE'))
assert isinstance(res1, tm.SubclassedDataFrame)
tm.assert_frame_equal(res1, exp1)
assert isinstance(res2, tm.SubclassedDataFrame)
tm.assert_frame_equal(res2, exp2)
res1, res2 = df1.a.align(df2.c)
assert isinstance(res1, tm.SubclassedSeries)
tm.assert_series_equal(res1, exp1.a)
assert isinstance(res2, tm.SubclassedSeries)
tm.assert_series_equal(res2, exp2.c)
def test_subclass_align_combinations(self):
# GH 12983
df = tm.SubclassedDataFrame({'a': [1, 3, 5],
'b': [1, 3, 5]}, index=list('ACE'))
s = tm.SubclassedSeries([1, 2, 4], index=list('ABD'), name='x')
# frame + series
res1, res2 = df.align(s, axis=0)
exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5],
'b': [1, np.nan, 3, np.nan, 5]},
index=list('ABCDE'))
# name is lost when
exp2 = pd.Series([1, 2, np.nan, 4, np.nan],
index=list('ABCDE'), name='x')
assert isinstance(res1, tm.SubclassedDataFrame)
tm.assert_frame_equal(res1, exp1)
assert isinstance(res2, tm.SubclassedSeries)
tm.assert_series_equal(res2, exp2)
# series + frame
res1, res2 = s.align(df)
assert isinstance(res1, tm.SubclassedSeries)
tm.assert_series_equal(res1, exp2)
assert isinstance(res2, tm.SubclassedDataFrame)
tm.assert_frame_equal(res2, exp1)
def test_subclass_iterrows(self):
# GH 13977
df = tm.SubclassedDataFrame({'a': [1]})
for i, row in df.iterrows():
assert isinstance(row, tm.SubclassedSeries)
tm.assert_series_equal(row, df.loc[i])
def test_subclass_sparse_slice(self):
rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
ssdf = tm.SubclassedSparseDataFrame(rows)
ssdf.testattr = "testattr"
tm.assert_sp_frame_equal(ssdf.loc[:2],
tm.SubclassedSparseDataFrame(rows[:3]))
tm.assert_sp_frame_equal(ssdf.iloc[:2],
tm.SubclassedSparseDataFrame(rows[:2]))
tm.assert_sp_frame_equal(ssdf[:2],
tm.SubclassedSparseDataFrame(rows[:2]))
assert ssdf.loc[:2].testattr == "testattr"
assert ssdf.iloc[:2].testattr == "testattr"
assert ssdf[:2].testattr == "testattr"
tm.assert_sp_series_equal(ssdf.loc[1],
tm.SubclassedSparseSeries(rows[1]),
check_names=False,
check_kind=False)
tm.assert_sp_series_equal(ssdf.iloc[1],
tm.SubclassedSparseSeries(rows[1]),
check_names=False,
check_kind=False)
def test_subclass_sparse_transpose(self):
ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3],
[4, 5, 6]])
essdf = tm.SubclassedSparseDataFrame([[1, 4],
[2, 5],
[3, 6]])
tm.assert_sp_frame_equal(ossdf.T, essdf)
def test_subclass_stack(self):
# GH 15564
df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=['a', 'b', 'c'],
columns=['X', 'Y', 'Z'])
res = df.stack()
exp = tm.SubclassedSeries(
[1, 2, 3, 4, 5, 6, 7, 8, 9],
index=[list('aaabbbccc'), list('XYZXYZXYZ')])
tm.assert_series_equal(res, exp)
def test_subclass_stack_multi(self):
# GH 15564
df = tm.SubclassedDataFrame([
[10, 11, 12, 13],
[20, 21, 22, 23],
[30, 31, 32, 33],
[40, 41, 42, 43]],
index=MultiIndex.from_tuples(
list(zip(list('AABB'), list('cdcd'))),
names=['aaa', 'ccc']),
columns=MultiIndex.from_tuples(
list(zip(list('WWXX'), list('yzyz'))),
names=['www', 'yyy']))
exp = tm.SubclassedDataFrame([
[10, 12],
[11, 13],
[20, 22],
[21, 23],
[30, 32],
[31, 33],
[40, 42],
[41, 43]],
index=MultiIndex.from_tuples(list(zip(
list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
names=['aaa', 'ccc', 'yyy']),
columns=Index(['W', 'X'], name='www'))
res = df.stack()
tm.assert_frame_equal(res, exp)
res = df.stack('yyy')
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame([
[10, 11],
[12, 13],
[20, 21],
[22, 23],
[30, 31],
[32, 33],
[40, 41],
[42, 43]],
index=MultiIndex.from_tuples(list(zip(
list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
names=['aaa', 'ccc', 'www']),
columns=Index(['y', 'z'], name='yyy'))
res = df.stack('www')
tm.assert_frame_equal(res, exp)
def test_subclass_stack_multi_mixed(self):
# GH 15564
df = tm.SubclassedDataFrame([
[10, 11, 12.0, 13.0],
[20, 21, 22.0, 23.0],
[30, 31, 32.0, 33.0],
[40, 41, 42.0, 43.0]],
index=MultiIndex.from_tuples(
list(zip(list('AABB'), list('cdcd'))),
names=['aaa', 'ccc']),
columns=MultiIndex.from_tuples(
list(zip(list('WWXX'), list('yzyz'))),
names=['www', 'yyy']))
exp = tm.SubclassedDataFrame([
[10, 12.0],
[11, 13.0],
[20, 22.0],
[21, 23.0],
[30, 32.0],
[31, 33.0],
[40, 42.0],
[41, 43.0]],
index=MultiIndex.from_tuples(list(zip(
list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
names=['aaa', 'ccc', 'yyy']),
columns=Index(['W', 'X'], name='www'))
res = df.stack()
tm.assert_frame_equal(res, exp)
res = df.stack('yyy')
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame([
[10.0, 11.0],
[12.0, 13.0],
[20.0, 21.0],
[22.0, 23.0],
[30.0, 31.0],
[32.0, 33.0],
[40.0, 41.0],
[42.0, 43.0]],
index=MultiIndex.from_tuples(list(zip(
list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
names=['aaa', 'ccc', 'www']),
columns=Index(['y', 'z'], name='yyy'))
res = df.stack('www')
tm.assert_frame_equal(res, exp)
def test_subclass_unstack(self):
# GH 15564
df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=['a', 'b', 'c'],
columns=['X', 'Y', 'Z'])
res = df.unstack()
exp = tm.SubclassedSeries(
[1, 4, 7, 2, 5, 8, 3, 6, 9],
index=[list('XXXYYYZZZ'), list('abcabcabc')])
tm.assert_series_equal(res, exp)
def test_subclass_unstack_multi(self):
# GH 15564
df = tm.SubclassedDataFrame([
[10, 11, 12, 13],
[20, 21, 22, 23],
[30, 31, 32, 33],
[40, 41, 42, 43]],
index=MultiIndex.from_tuples(
list(zip(list('AABB'), list('cdcd'))),
names=['aaa', 'ccc']),
columns=MultiIndex.from_tuples(
list(zip(list('WWXX'), list('yzyz'))),
names=['www', 'yyy']))
exp = tm.SubclassedDataFrame([
[10, 20, 11, 21, 12, 22, 13, 23],
[30, 40, 31, 41, 32, 42, 33, 43]],
index=Index(['A', 'B'], name='aaa'),
columns=MultiIndex.from_tuples(list(zip(
list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))),
names=['www', 'yyy', 'ccc']))
res = df.unstack()
tm.assert_frame_equal(res, exp)
res = df.unstack('ccc')
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame([
[10, 30, 11, 31, 12, 32, 13, 33],
[20, 40, 21, 41, 22, 42, 23, 43]],
index=Index(['c', 'd'], name='ccc'),
columns=MultiIndex.from_tuples(list(zip(
list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))),
names=['www', 'yyy', 'aaa']))
res = df.unstack('aaa')
tm.assert_frame_equal(res, exp)
def test_subclass_unstack_multi_mixed(self):
# GH 15564
df = tm.SubclassedDataFrame([
[10, 11, 12.0, 13.0],
[20, 21, 22.0, 23.0],
[30, 31, 32.0, 33.0],
[40, 41, 42.0, 43.0]],
index=MultiIndex.from_tuples(
list(zip(list('AABB'), list('cdcd'))),
names=['aaa', 'ccc']),
columns=MultiIndex.from_tuples(
list(zip(list('WWXX'), list('yzyz'))),
names=['www', 'yyy']))
exp = tm.SubclassedDataFrame([
[10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0],
[30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0]],
index=Index(['A', 'B'], name='aaa'),
columns=MultiIndex.from_tuples(list(zip(
list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))),
names=['www', 'yyy', 'ccc']))
res = df.unstack()
tm.assert_frame_equal(res, exp)
res = df.unstack('ccc')
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame([
[10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0],
[20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0]],
index=Index(['c', 'd'], name='ccc'),
columns=MultiIndex.from_tuples(list(zip(
list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))),
names=['www', 'yyy', 'aaa']))
res = df.unstack('aaa')
tm.assert_frame_equal(res, exp)
def test_subclass_pivot(self):
# GH 15564
df = tm.SubclassedDataFrame({
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
'values': [1., 2., 3., 3., 2., 1.]})
pivoted = df.pivot(
index='index', columns='columns', values='values')
expected = tm.SubclassedDataFrame({
'One': {'A': 1., 'B': 2., 'C': 3.},
'Two': {'A': 1., 'B': 2., 'C': 3.}})
expected.index.name, expected.columns.name = 'index', 'columns'
tm.assert_frame_equal(pivoted, expected)
def test_subclassed_melt(self):
# GH 15564
cheese = tm.SubclassedDataFrame({
'first': ['John', 'Mary'],
'last': ['Doe', 'Bo'],
'height': [5.5, 6.0],
'weight': [130, 150]})
melted = pd.melt(cheese, id_vars=['first', 'last'])
expected = tm.SubclassedDataFrame([
['John', 'Doe', 'height', 5.5],
['Mary', 'Bo', 'height', 6.0],
['John', 'Doe', 'weight', 130],
['Mary', 'Bo', 'weight', 150]],
columns=['first', 'last', 'variable', 'value'])
tm.assert_frame_equal(melted, expected)
def test_subclassed_wide_to_long(self):
# GH 9762
np.random.seed(123)
x = np.random.randn(3)
df = tm.SubclassedDataFrame({
"A1970": {0: "a", 1: "b", 2: "c"},
"A1980": {0: "d", 1: "e", 2: "f"},
"B1970": {0: 2.5, 1: 1.2, 2: .7},
"B1980": {0: 3.2, 1: 1.3, 2: .1},
"X": dict(zip(range(3), x))})
df["id"] = df.index
exp_data = {"X": x.tolist() + x.tolist(),
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2]}
expected = tm.SubclassedDataFrame(exp_data)
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year")
tm.assert_frame_equal(long_frame, expected)
def test_subclassed_apply(self):
# GH 19822
def check_row_subclass(row):
assert isinstance(row, tm.SubclassedSeries)
def strech(row):
if row["variable"] == "height":
row["value"] += 0.5
return row
df = tm.SubclassedDataFrame([
['John', 'Doe', 'height', 5.5],
['Mary', 'Bo', 'height', 6.0],
['John', 'Doe', 'weight', 130],
['Mary', 'Bo', 'weight', 150]],
columns=['first', 'last', 'variable', 'value'])
df.apply(lambda x: check_row_subclass(x))
df.apply(lambda x: check_row_subclass(x), axis=1)
expected = tm.SubclassedDataFrame([
['John', 'Doe', 'height', 6.0],
['Mary', 'Bo', 'height', 6.5],
['John', 'Doe', 'weight', 130],
['Mary', 'Bo', 'weight', 150]],
columns=['first', 'last', 'variable', 'value'])
result = df.apply(lambda x: strech(x), axis=1)
assert isinstance(result, tm.SubclassedDataFrame)
tm.assert_frame_equal(result, expected)
expected = tm.SubclassedDataFrame([
[1, 2, 3],
[1, 2, 3],
[1, 2, 3],
[1, 2, 3]])
result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1)
assert isinstance(result, tm.SubclassedDataFrame)
tm.assert_frame_equal(result, expected)
result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
assert isinstance(result, tm.SubclassedDataFrame)
tm.assert_frame_equal(result, expected)
expected = tm.SubclassedSeries([
[1, 2, 3],
[1, 2, 3],
[1, 2, 3],
[1, 2, 3]])
result = df.apply(lambda x: [1, 2, 3], axis=1)
assert not isinstance(result, tm.SubclassedDataFrame)
tm.assert_series_equal(result, expected)
@@ -0,0 +1,899 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import datetime, time
import numpy as np
import pytest
from pandas.compat import product
import pandas as pd
from pandas import (
DataFrame, DatetimeIndex, Index, MultiIndex, Series, Timestamp, date_range,
period_range, to_datetime)
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import (
assert_frame_equal, assert_index_equal, assert_series_equal)
import pandas.tseries.offsets as offsets
@pytest.fixture(params=product([True, False], [True, False]))
def close_open_fixture(request):
return request.param
class TestDataFrameTimeSeriesMethods(TestData):
def test_diff(self):
the_diff = self.tsframe.diff(1)
assert_series_equal(the_diff['A'],
self.tsframe['A'] - self.tsframe['A'].shift(1))
# int dtype
a = 10000000000000000
b = a + 1
s = Series([a, b])
rs = DataFrame({'s': s}).diff()
assert rs.s[1] == 1
# mixed numeric
tf = self.tsframe.astype('float32')
the_diff = tf.diff(1)
assert_series_equal(the_diff['A'],
tf['A'] - tf['A'].shift(1))
# issue 10907
df = pd.DataFrame({'y': pd.Series([2]), 'z': pd.Series([3])})
df.insert(0, 'x', 1)
result = df.diff(axis=1)
expected = pd.DataFrame({'x': np.nan, 'y': pd.Series(
1), 'z': pd.Series(1)}).astype('float64')
assert_frame_equal(result, expected)
@pytest.mark.parametrize('tz', [None, 'UTC'])
def test_diff_datetime_axis0(self, tz):
# GH 18578
df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
1: date_range('2010', freq='D', periods=2, tz=tz)})
result = df.diff(axis=0)
expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']),
1: pd.TimedeltaIndex(['NaT', '1 days'])})
assert_frame_equal(result, expected)
@pytest.mark.parametrize('tz', [None, 'UTC'])
def test_diff_datetime_axis1(self, tz):
# GH 18578
df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
1: date_range('2010', freq='D', periods=2, tz=tz)})
if tz is None:
result = df.diff(axis=1)
expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']),
1: pd.TimedeltaIndex(['0 days',
'0 days'])})
assert_frame_equal(result, expected)
else:
with pytest.raises(NotImplementedError):
result = df.diff(axis=1)
def test_diff_timedelta(self):
# GH 4533
df = DataFrame(dict(time=[Timestamp('20130101 9:01'),
Timestamp('20130101 9:02')],
value=[1.0, 2.0]))
res = df.diff()
exp = DataFrame([[pd.NaT, np.nan],
[pd.Timedelta('00:01:00'), 1]],
columns=['time', 'value'])
assert_frame_equal(res, exp)
def test_diff_mixed_dtype(self):
df = DataFrame(np.random.randn(5, 3))
df['A'] = np.array([1, 2, 3, 4, 5], dtype=object)
result = df.diff()
assert result[0].dtype == np.float64
def test_diff_neg_n(self):
rs = self.tsframe.diff(-1)
xp = self.tsframe - self.tsframe.shift(-1)
assert_frame_equal(rs, xp)
def test_diff_float_n(self):
rs = self.tsframe.diff(1.)
xp = self.tsframe.diff(1)
assert_frame_equal(rs, xp)
def test_diff_axis(self):
# GH 9727
df = DataFrame([[1., 2.], [3., 4.]])
assert_frame_equal(df.diff(axis=1), DataFrame(
[[np.nan, 1.], [np.nan, 1.]]))
assert_frame_equal(df.diff(axis=0), DataFrame(
[[np.nan, np.nan], [2., 2.]]))
def test_pct_change(self):
rs = self.tsframe.pct_change(fill_method=None)
assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1)
rs = self.tsframe.pct_change(2)
filled = self.tsframe.fillna(method='pad')
assert_frame_equal(rs, filled / filled.shift(2) - 1)
rs = self.tsframe.pct_change(fill_method='bfill', limit=1)
filled = self.tsframe.fillna(method='bfill', limit=1)
assert_frame_equal(rs, filled / filled.shift(1) - 1)
rs = self.tsframe.pct_change(freq='5D')
filled = self.tsframe.fillna(method='pad')
assert_frame_equal(rs,
(filled / filled.shift(freq='5D') - 1)
.reindex_like(filled))
def test_pct_change_shift_over_nas(self):
s = Series([1., 1.5, np.nan, 2.5, 3.])
df = DataFrame({'a': s, 'b': s})
chg = df.pct_change()
expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2])
edf = DataFrame({'a': expected, 'b': expected})
assert_frame_equal(chg, edf)
@pytest.mark.parametrize("freq, periods, fill_method, limit",
[('5B', 5, None, None),
('3B', 3, None, None),
('3B', 3, 'bfill', None),
('7B', 7, 'pad', 1),
('7B', 7, 'bfill', 3),
('14B', 14, None, None)])
def test_pct_change_periods_freq(self, freq, periods, fill_method, limit):
# GH 7292
rs_freq = self.tsframe.pct_change(freq=freq,
fill_method=fill_method,
limit=limit)
rs_periods = self.tsframe.pct_change(periods,
fill_method=fill_method,
limit=limit)
assert_frame_equal(rs_freq, rs_periods)
empty_ts = DataFrame(index=self.tsframe.index,
columns=self.tsframe.columns)
rs_freq = empty_ts.pct_change(freq=freq,
fill_method=fill_method,
limit=limit)
rs_periods = empty_ts.pct_change(periods,
fill_method=fill_method,
limit=limit)
assert_frame_equal(rs_freq, rs_periods)
def test_frame_ctor_datetime64_column(self):
rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
dates = np.asarray(rng)
df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates})
assert np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))
def test_frame_append_datetime64_column(self):
rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
df = DataFrame(index=np.arange(len(rng)))
df['A'] = rng
assert np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))
def test_frame_datetime64_pre1900_repr(self):
df = DataFrame({'year': date_range('1/1/1700', periods=50,
freq='A-DEC')})
# it works!
repr(df)
def test_frame_append_datetime64_col_other_units(self):
n = 100
units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y']
ns_dtype = np.dtype('M8[ns]')
for unit in units:
dtype = np.dtype('M8[%s]' % unit)
vals = np.arange(n, dtype=np.int64).view(dtype)
df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
df[unit] = vals
ex_vals = to_datetime(vals.astype('O')).values
assert df[unit].dtype == ns_dtype
assert (df[unit].values == ex_vals).all()
# Test insertion into existing datetime64 column
df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype)
for unit in units:
dtype = np.dtype('M8[%s]' % unit)
vals = np.arange(n, dtype=np.int64).view(dtype)
tmp = df.copy()
tmp['dates'] = vals
ex_vals = to_datetime(vals.astype('O')).values
assert (tmp['dates'].values == ex_vals).all()
def test_shift(self):
# naive shift
shiftedFrame = self.tsframe.shift(5)
tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
shiftedSeries = self.tsframe['A'].shift(5)
assert_series_equal(shiftedFrame['A'], shiftedSeries)
shiftedFrame = self.tsframe.shift(-5)
tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
shiftedSeries = self.tsframe['A'].shift(-5)
assert_series_equal(shiftedFrame['A'], shiftedSeries)
# shift by 0
unshifted = self.tsframe.shift(0)
assert_frame_equal(unshifted, self.tsframe)
# shift by DateOffset
shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay())
assert len(shiftedFrame) == len(self.tsframe)
shiftedFrame2 = self.tsframe.shift(5, freq='B')
assert_frame_equal(shiftedFrame, shiftedFrame2)
d = self.tsframe.index[0]
shifted_d = d + offsets.BDay(5)
assert_series_equal(self.tsframe.xs(d),
shiftedFrame.xs(shifted_d), check_names=False)
# shift int frame
int_shifted = self.intframe.shift(1) # noqa
# Shifting with PeriodIndex
ps = tm.makePeriodFrame()
shifted = ps.shift(1)
unshifted = shifted.shift(-1)
tm.assert_index_equal(shifted.index, ps.index)
tm.assert_index_equal(unshifted.index, ps.index)
tm.assert_numpy_array_equal(unshifted.iloc[:, 0].dropna().values,
ps.iloc[:-1, 0].values)
shifted2 = ps.shift(1, 'B')
shifted3 = ps.shift(1, offsets.BDay())
assert_frame_equal(shifted2, shifted3)
assert_frame_equal(ps, shifted2.shift(-1, 'B'))
msg = 'does not match PeriodIndex freq'
with pytest.raises(ValueError, match=msg):
ps.shift(freq='D')
# shift other axis
# GH 6371
df = DataFrame(np.random.rand(10, 5))
expected = pd.concat([DataFrame(np.nan, index=df.index,
columns=[0]),
df.iloc[:, 0:-1]],
ignore_index=True, axis=1)
result = df.shift(1, axis=1)
assert_frame_equal(result, expected)
# shift named axis
df = DataFrame(np.random.rand(10, 5))
expected = pd.concat([DataFrame(np.nan, index=df.index,
columns=[0]),
df.iloc[:, 0:-1]],
ignore_index=True, axis=1)
result = df.shift(1, axis='columns')
assert_frame_equal(result, expected)
def test_shift_bool(self):
df = DataFrame({'high': [True, False],
'low': [False, False]})
rs = df.shift(1)
xp = DataFrame(np.array([[np.nan, np.nan],
[True, False]], dtype=object),
columns=['high', 'low'])
assert_frame_equal(rs, xp)
def test_shift_categorical(self):
# GH 9416
s1 = pd.Series(['a', 'b', 'c'], dtype='category')
s2 = pd.Series(['A', 'B', 'C'], dtype='category')
df = DataFrame({'one': s1, 'two': s2})
rs = df.shift(1)
xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)})
assert_frame_equal(rs, xp)
def test_shift_fill_value(self):
# GH #24128
df = DataFrame([1, 2, 3, 4, 5],
index=date_range('1/1/2000', periods=5, freq='H'))
exp = DataFrame([0, 1, 2, 3, 4],
index=date_range('1/1/2000', periods=5, freq='H'))
result = df.shift(1, fill_value=0)
assert_frame_equal(result, exp)
exp = DataFrame([0, 0, 1, 2, 3],
index=date_range('1/1/2000', periods=5, freq='H'))
result = df.shift(2, fill_value=0)
assert_frame_equal(result, exp)
def test_shift_empty(self):
# Regression test for #8019
df = DataFrame({'foo': []})
rs = df.shift(-1)
assert_frame_equal(df, rs)
def test_shift_duplicate_columns(self):
# GH 9092; verify that position-based shifting works
# in the presence of duplicate columns
column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]]
data = np.random.randn(20, 5)
shifted = []
for columns in column_lists:
df = pd.DataFrame(data.copy(), columns=columns)
for s in range(5):
df.iloc[:, s] = df.iloc[:, s].shift(s + 1)
df.columns = range(5)
shifted.append(df)
# sanity check the base case
nulls = shifted[0].isna().sum()
assert_series_equal(nulls, Series(range(1, 6), dtype='int64'))
# check all answers are the same
assert_frame_equal(shifted[0], shifted[1])
assert_frame_equal(shifted[0], shifted[2])
def test_tshift(self):
# PeriodIndex
ps = tm.makePeriodFrame()
shifted = ps.tshift(1)
unshifted = shifted.tshift(-1)
assert_frame_equal(unshifted, ps)
shifted2 = ps.tshift(freq='B')
assert_frame_equal(shifted, shifted2)
shifted3 = ps.tshift(freq=offsets.BDay())
assert_frame_equal(shifted, shifted3)
with pytest.raises(ValueError, match='does not match'):
ps.tshift(freq='M')
# DatetimeIndex
shifted = self.tsframe.tshift(1)
unshifted = shifted.tshift(-1)
assert_frame_equal(self.tsframe, unshifted)
shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq)
assert_frame_equal(shifted, shifted2)
inferred_ts = DataFrame(self.tsframe.values,
Index(np.asarray(self.tsframe.index)),
columns=self.tsframe.columns)
shifted = inferred_ts.tshift(1)
unshifted = shifted.tshift(-1)
assert_frame_equal(shifted, self.tsframe.tshift(1))
assert_frame_equal(unshifted, inferred_ts)
no_freq = self.tsframe.iloc[[0, 5, 7], :]
pytest.raises(ValueError, no_freq.tshift)
def test_truncate(self):
ts = self.tsframe[::3]
start, end = self.tsframe.index[3], self.tsframe.index[6]
start_missing = self.tsframe.index[2]
end_missing = self.tsframe.index[7]
# neither specified
truncated = ts.truncate()
assert_frame_equal(truncated, ts)
# both specified
expected = ts[1:3]
truncated = ts.truncate(start, end)
assert_frame_equal(truncated, expected)
truncated = ts.truncate(start_missing, end_missing)
assert_frame_equal(truncated, expected)
# start specified
expected = ts[1:]
truncated = ts.truncate(before=start)
assert_frame_equal(truncated, expected)
truncated = ts.truncate(before=start_missing)
assert_frame_equal(truncated, expected)
# end specified
expected = ts[:3]
truncated = ts.truncate(after=end)
assert_frame_equal(truncated, expected)
truncated = ts.truncate(after=end_missing)
assert_frame_equal(truncated, expected)
pytest.raises(ValueError, ts.truncate,
before=ts.index[-1] - ts.index.freq,
after=ts.index[0] + ts.index.freq)
def test_truncate_copy(self):
index = self.tsframe.index
truncated = self.tsframe.truncate(index[5], index[10])
truncated.values[:] = 5.
assert not (self.tsframe.values[5:11] == 5).any()
def test_truncate_nonsortedindex(self):
# GH 17935
df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e']},
index=[5, 3, 2, 9, 0])
msg = 'truncate requires a sorted index'
with pytest.raises(ValueError, match=msg):
df.truncate(before=3, after=9)
rng = pd.date_range('2011-01-01', '2012-01-01', freq='W')
ts = pd.DataFrame({'A': np.random.randn(len(rng)),
'B': np.random.randn(len(rng))},
index=rng)
msg = 'truncate requires a sorted index'
with pytest.raises(ValueError, match=msg):
ts.sort_values('A', ascending=False).truncate(before='2011-11',
after='2011-12')
df = pd.DataFrame({3: np.random.randn(5),
20: np.random.randn(5),
2: np.random.randn(5),
0: np.random.randn(5)},
columns=[3, 20, 2, 0])
msg = 'truncate requires a sorted index'
with pytest.raises(ValueError, match=msg):
df.truncate(before=2, after=20, axis=1)
def test_asfreq(self):
offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd())
rule_monthly = self.tsframe.asfreq('BM')
tm.assert_almost_equal(offset_monthly['A'], rule_monthly['A'])
filled = rule_monthly.asfreq('B', method='pad') # noqa
# TODO: actually check that this worked.
# don't forget!
filled_dep = rule_monthly.asfreq('B', method='pad') # noqa
# test does not blow up on length-0 DataFrame
zero_length = self.tsframe.reindex([])
result = zero_length.asfreq('BM')
assert result is not zero_length
def test_asfreq_datetimeindex(self):
df = DataFrame({'A': [1, 2, 3]},
index=[datetime(2011, 11, 1), datetime(2011, 11, 2),
datetime(2011, 11, 3)])
df = df.asfreq('B')
assert isinstance(df.index, DatetimeIndex)
ts = df['A'].asfreq('B')
assert isinstance(ts.index, DatetimeIndex)
def test_asfreq_fillvalue(self):
# test for fill value during upsampling, related to issue 3715
# setup
rng = pd.date_range('1/1/2016', periods=10, freq='2S')
ts = pd.Series(np.arange(len(rng)), index=rng)
df = pd.DataFrame({'one': ts})
# insert pre-existing missing value
df.loc['2016-01-01 00:00:08', 'one'] = None
actual_df = df.asfreq(freq='1S', fill_value=9.0)
expected_df = df.asfreq(freq='1S').fillna(9.0)
expected_df.loc['2016-01-01 00:00:08', 'one'] = None
assert_frame_equal(expected_df, actual_df)
expected_series = ts.asfreq(freq='1S').fillna(9.0)
actual_series = ts.asfreq(freq='1S', fill_value=9.0)
assert_series_equal(expected_series, actual_series)
@pytest.mark.parametrize("data,idx,expected_first,expected_last", [
({'A': [1, 2, 3]}, [1, 1, 2], 1, 2),
({'A': [1, 2, 3]}, [1, 2, 2], 1, 2),
({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'),
({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2),
({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2),
({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)])
def test_first_last_valid(self, data, idx,
expected_first, expected_last):
N = len(self.frame.index)
mat = np.random.randn(N)
mat[:5] = np.nan
mat[-5:] = np.nan
frame = DataFrame({'foo': mat}, index=self.frame.index)
index = frame.first_valid_index()
assert index == frame.index[5]
index = frame.last_valid_index()
assert index == frame.index[-6]
# GH12800
empty = DataFrame()
assert empty.last_valid_index() is None
assert empty.first_valid_index() is None
# GH17400: no valid entries
frame[:] = np.nan
assert frame.last_valid_index() is None
assert frame.first_valid_index() is None
# GH20499: its preserves freq with holes
frame.index = date_range("20110101", periods=N, freq="B")
frame.iloc[1] = 1
frame.iloc[-2] = 1
assert frame.first_valid_index() == frame.index[1]
assert frame.last_valid_index() == frame.index[-2]
assert frame.first_valid_index().freq == frame.index.freq
assert frame.last_valid_index().freq == frame.index.freq
# GH 21441
df = DataFrame(data, index=idx)
assert expected_first == df.first_valid_index()
assert expected_last == df.last_valid_index()
def test_first_subset(self):
ts = tm.makeTimeDataFrame(freq='12h')
result = ts.first('10d')
assert len(result) == 20
ts = tm.makeTimeDataFrame(freq='D')
result = ts.first('10d')
assert len(result) == 10
result = ts.first('3M')
expected = ts[:'3/31/2000']
assert_frame_equal(result, expected)
result = ts.first('21D')
expected = ts[:21]
assert_frame_equal(result, expected)
result = ts[:0].first('3M')
assert_frame_equal(result, ts[:0])
def test_first_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.first('1D')
def test_last_subset(self):
ts = tm.makeTimeDataFrame(freq='12h')
result = ts.last('10d')
assert len(result) == 20
ts = tm.makeTimeDataFrame(nper=30, freq='D')
result = ts.last('10d')
assert len(result) == 10
result = ts.last('21D')
expected = ts['2000-01-10':]
assert_frame_equal(result, expected)
result = ts.last('21D')
expected = ts[-21:]
assert_frame_equal(result, expected)
result = ts[:0].last('3M')
assert_frame_equal(result, ts[:0])
def test_last_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.last('1D')
def test_at_time(self):
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
rs = ts.at_time(rng[1])
assert (rs.index.hour == rng[1].hour).all()
assert (rs.index.minute == rng[1].minute).all()
assert (rs.index.second == rng[1].second).all()
result = ts.at_time('9:30')
expected = ts.at_time(time(9, 30))
assert_frame_equal(result, expected)
result = ts.loc[time(9, 30)]
expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)]
assert_frame_equal(result, expected)
# midnight, everything
rng = date_range('1/1/2000', '1/31/2000')
ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
result = ts.at_time(time(0, 0))
assert_frame_equal(result, ts)
# time doesn't exist
rng = date_range('1/1/2012', freq='23Min', periods=384)
ts = DataFrame(np.random.randn(len(rng), 2), rng)
rs = ts.at_time('16:00')
assert len(rs) == 0
def test_at_time_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.at_time('00:00')
@pytest.mark.parametrize('axis', ['index', 'columns', 0, 1])
def test_at_time_axis(self, axis):
# issue 8839
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
ts = DataFrame(np.random.randn(len(rng), len(rng)))
ts.index, ts.columns = rng, rng
indices = rng[(rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)]
if axis in ['index', 0]:
expected = ts.loc[indices, :]
elif axis in ['columns', 1]:
expected = ts.loc[:, indices]
result = ts.at_time('9:30', axis=axis)
assert_frame_equal(result, expected)
def test_between_time(self, close_open_fixture):
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
stime = time(0, 0)
etime = time(1, 0)
inc_start, inc_end = close_open_fixture
filtered = ts.between_time(stime, etime, inc_start, inc_end)
exp_len = 13 * 4 + 1
if not inc_start:
exp_len -= 5
if not inc_end:
exp_len -= 4
assert len(filtered) == exp_len
for rs in filtered.index:
t = rs.time()
if inc_start:
assert t >= stime
else:
assert t > stime
if inc_end:
assert t <= etime
else:
assert t < etime
result = ts.between_time('00:00', '01:00')
expected = ts.between_time(stime, etime)
assert_frame_equal(result, expected)
# across midnight
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
stime = time(22, 0)
etime = time(9, 0)
filtered = ts.between_time(stime, etime, inc_start, inc_end)
exp_len = (12 * 11 + 1) * 4 + 1
if not inc_start:
exp_len -= 4
if not inc_end:
exp_len -= 4
assert len(filtered) == exp_len
for rs in filtered.index:
t = rs.time()
if inc_start:
assert (t >= stime) or (t <= etime)
else:
assert (t > stime) or (t <= etime)
if inc_end:
assert (t <= etime) or (t >= stime)
else:
assert (t < etime) or (t >= stime)
def test_between_time_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.between_time(start_time='00:00', end_time='12:00')
def test_between_time_axis(self, axis):
# issue 8839
rng = date_range('1/1/2000', periods=100, freq='10min')
ts = DataFrame(np.random.randn(len(rng), len(rng)))
stime, etime = ('08:00:00', '09:00:00')
exp_len = 7
if axis in ['index', 0]:
ts.index = rng
assert len(ts.between_time(stime, etime)) == exp_len
assert len(ts.between_time(stime, etime, axis=0)) == exp_len
if axis in ['columns', 1]:
ts.columns = rng
selected = ts.between_time(stime, etime, axis=1).columns
assert len(selected) == exp_len
def test_between_time_axis_raises(self, axis):
# issue 8839
rng = date_range('1/1/2000', periods=100, freq='10min')
mask = np.arange(0, len(rng))
rand_data = np.random.randn(len(rng), len(rng))
ts = DataFrame(rand_data, index=rng, columns=rng)
stime, etime = ('08:00:00', '09:00:00')
if axis in ['columns', 1]:
ts.index = mask
pytest.raises(TypeError, ts.between_time, stime, etime)
pytest.raises(TypeError, ts.between_time, stime, etime, axis=0)
if axis in ['index', 0]:
ts.columns = mask
pytest.raises(TypeError, ts.between_time, stime, etime, axis=1)
def test_operation_on_NaT(self):
# Both NaT and Timestamp are in DataFrame.
df = pd.DataFrame({'foo': [pd.NaT, pd.NaT,
pd.Timestamp('2012-05-01')]})
res = df.min()
exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
tm.assert_series_equal(res, exp)
res = df.max()
exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
tm.assert_series_equal(res, exp)
# GH12941, only NaTs are in DataFrame.
df = pd.DataFrame({'foo': [pd.NaT, pd.NaT]})
res = df.min()
exp = pd.Series([pd.NaT], index=["foo"])
tm.assert_series_equal(res, exp)
res = df.max()
exp = pd.Series([pd.NaT], index=["foo"])
tm.assert_series_equal(res, exp)
def test_datetime_assignment_with_NaT_and_diff_time_units(self):
# GH 7492
data_ns = np.array([1, 'nat'], dtype='datetime64[ns]')
result = pd.Series(data_ns).to_frame()
result['new'] = data_ns
expected = pd.DataFrame({0: [1, None],
'new': [1, None]}, dtype='datetime64[ns]')
tm.assert_frame_equal(result, expected)
# OutOfBoundsDatetime error shouldn't occur
data_s = np.array([1, 'nat'], dtype='datetime64[s]')
result['new'] = data_s
expected = pd.DataFrame({0: [1, None],
'new': [1e9, None]}, dtype='datetime64[ns]')
tm.assert_frame_equal(result, expected)
def test_frame_to_period(self):
K = 5
dr = date_range('1/1/2000', '1/1/2001')
pr = period_range('1/1/2000', '1/1/2001')
df = DataFrame(np.random.randn(len(dr), K), index=dr)
df['mix'] = 'a'
pts = df.to_period()
exp = df.copy()
exp.index = pr
assert_frame_equal(pts, exp)
pts = df.to_period('M')
tm.assert_index_equal(pts.index, exp.index.asfreq('M'))
df = df.T
pts = df.to_period(axis=1)
exp = df.copy()
exp.columns = pr
assert_frame_equal(pts, exp)
pts = df.to_period('M', axis=1)
tm.assert_index_equal(pts.columns, exp.columns.asfreq('M'))
pytest.raises(ValueError, df.to_period, axis=2)
@pytest.mark.parametrize("fn", ['tz_localize', 'tz_convert'])
def test_tz_convert_and_localize(self, fn):
l0 = date_range('20140701', periods=5, freq='D')
l1 = date_range('20140701', periods=5, freq='D')
int_idx = Index(range(5))
if fn == 'tz_convert':
l0 = l0.tz_localize('UTC')
l1 = l1.tz_localize('UTC')
for idx in [l0, l1]:
l0_expected = getattr(idx, fn)('US/Pacific')
l1_expected = getattr(idx, fn)('US/Pacific')
df1 = DataFrame(np.ones(5), index=l0)
df1 = getattr(df1, fn)('US/Pacific')
assert_index_equal(df1.index, l0_expected)
# MultiIndex
# GH7846
df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1]))
df3 = getattr(df2, fn)('US/Pacific', level=0)
assert not df3.index.levels[0].equals(l0)
assert_index_equal(df3.index.levels[0], l0_expected)
assert_index_equal(df3.index.levels[1], l1)
assert not df3.index.levels[1].equals(l1_expected)
df3 = getattr(df2, fn)('US/Pacific', level=1)
assert_index_equal(df3.index.levels[0], l0)
assert not df3.index.levels[0].equals(l0_expected)
assert_index_equal(df3.index.levels[1], l1_expected)
assert not df3.index.levels[1].equals(l1)
df4 = DataFrame(np.ones(5),
MultiIndex.from_arrays([int_idx, l0]))
# TODO: untested
df5 = getattr(df4, fn)('US/Pacific', level=1) # noqa
assert_index_equal(df3.index.levels[0], l0)
assert not df3.index.levels[0].equals(l0_expected)
assert_index_equal(df3.index.levels[1], l1_expected)
assert not df3.index.levels[1].equals(l1)
# Bad Inputs
# Not DatetimeIndex / PeriodIndex
with pytest.raises(TypeError, match='DatetimeIndex'):
df = DataFrame(index=int_idx)
df = getattr(df, fn)('US/Pacific')
# Not DatetimeIndex / PeriodIndex
with pytest.raises(TypeError, match='DatetimeIndex'):
df = DataFrame(np.ones(5),
MultiIndex.from_arrays([int_idx, l0]))
df = getattr(df, fn)('US/Pacific', level=0)
# Invalid level
with pytest.raises(ValueError, match='not valid'):
df = DataFrame(index=l0)
df = getattr(df, fn)('US/Pacific', level=1)
@@ -0,0 +1,198 @@
# -*- coding: utf-8 -*-
"""
Tests for DataFrame timezone-related methods
"""
from datetime import datetime
import numpy as np
import pytest
import pytz
from pandas.compat import lrange
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
from pandas import DataFrame, Series
from pandas.core.indexes.datetimes import date_range
import pandas.util.testing as tm
class TestDataFrameTimezones(object):
def test_frame_values_with_tz(self):
tz = "US/Central"
df = DataFrame({"A": date_range('2000', periods=4, tz=tz)})
result = df.values
expected = np.array([
[pd.Timestamp('2000-01-01', tz=tz)],
[pd.Timestamp('2000-01-02', tz=tz)],
[pd.Timestamp('2000-01-03', tz=tz)],
[pd.Timestamp('2000-01-04', tz=tz)],
])
tm.assert_numpy_array_equal(result, expected)
# two columns, homogenous
df = df.assign(B=df.A)
result = df.values
expected = np.concatenate([expected, expected], axis=1)
tm.assert_numpy_array_equal(result, expected)
# three columns, heterogenous
est = "US/Eastern"
df = df.assign(C=df.A.dt.tz_convert(est))
new = np.array([
[pd.Timestamp('2000-01-01T01:00:00', tz=est)],
[pd.Timestamp('2000-01-02T01:00:00', tz=est)],
[pd.Timestamp('2000-01-03T01:00:00', tz=est)],
[pd.Timestamp('2000-01-04T01:00:00', tz=est)],
])
expected = np.concatenate([expected, new], axis=1)
result = df.values
tm.assert_numpy_array_equal(result, expected)
def test_frame_from_records_utc(self):
rec = {'datum': 1.5,
'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)}
# it works
DataFrame.from_records([rec], index='begin_time')
def test_frame_tz_localize(self):
rng = date_range('1/1/2011', periods=100, freq='H')
df = DataFrame({'a': 1}, index=rng)
result = df.tz_localize('utc')
expected = DataFrame({'a': 1}, rng.tz_localize('UTC'))
assert result.index.tz.zone == 'UTC'
tm.assert_frame_equal(result, expected)
df = df.T
result = df.tz_localize('utc', axis=1)
assert result.columns.tz.zone == 'UTC'
tm.assert_frame_equal(result, expected.T)
def test_frame_tz_convert(self):
rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern')
df = DataFrame({'a': 1}, index=rng)
result = df.tz_convert('Europe/Berlin')
expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin'))
assert result.index.tz.zone == 'Europe/Berlin'
tm.assert_frame_equal(result, expected)
df = df.T
result = df.tz_convert('Europe/Berlin', axis=1)
assert result.columns.tz.zone == 'Europe/Berlin'
tm.assert_frame_equal(result, expected.T)
def test_frame_join_tzaware(self):
test1 = DataFrame(np.zeros((6, 3)),
index=date_range("2012-11-15 00:00:00", periods=6,
freq="100L", tz="US/Central"))
test2 = DataFrame(np.zeros((3, 3)),
index=date_range("2012-11-15 00:00:00", periods=3,
freq="250L", tz="US/Central"),
columns=lrange(3, 6))
result = test1.join(test2, how='outer')
ex_index = test1.index.union(test2.index)
tm.assert_index_equal(result.index, ex_index)
assert result.index.tz.zone == 'US/Central'
def test_frame_add_tz_mismatch_converts_to_utc(self):
rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern')
df = DataFrame(np.random.randn(len(rng)), index=rng, columns=['a'])
df_moscow = df.tz_convert('Europe/Moscow')
result = df + df_moscow
assert result.index.tz is pytz.utc
result = df_moscow + df
assert result.index.tz is pytz.utc
def test_frame_align_aware(self):
idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern')
idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern')
df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
new1, new2 = df1.align(df2)
assert df1.index.tz == new1.index.tz
assert df2.index.tz == new2.index.tz
# different timezones convert to UTC
# frame with frame
df1_central = df1.tz_convert('US/Central')
new1, new2 = df1.align(df1_central)
assert new1.index.tz == pytz.UTC
assert new2.index.tz == pytz.UTC
# frame with Series
new1, new2 = df1.align(df1_central[0], axis=0)
assert new1.index.tz == pytz.UTC
assert new2.index.tz == pytz.UTC
df1[0].align(df1_central, axis=0)
assert new1.index.tz == pytz.UTC
assert new2.index.tz == pytz.UTC
@pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
def test_frame_no_datetime64_dtype(self, tz):
# after GH#7822
# these retain the timezones on dict construction
dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
dr_tz = dr.tz_localize(tz)
df = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr)
tz_expected = DatetimeTZDtype('ns', dr_tz.tzinfo)
assert df['B'].dtype == tz_expected
# GH#2810 (with timezones)
datetimes_naive = [ts.to_pydatetime() for ts in dr]
datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
df = DataFrame({'dr': dr,
'dr_tz': dr_tz,
'datetimes_naive': datetimes_naive,
'datetimes_with_tz': datetimes_with_tz})
result = df.get_dtype_counts().sort_index()
expected = Series({'datetime64[ns]': 2,
str(tz_expected): 2}).sort_index()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
def test_frame_reset_index(self, tz):
dr = date_range('2012-06-02', periods=10, tz=tz)
df = DataFrame(np.random.randn(len(dr)), dr)
roundtripped = df.reset_index().set_index('index')
xp = df.index.tz
rs = roundtripped.index.tz
assert xp == rs
@pytest.mark.parametrize('tz', [None, 'America/New_York'])
def test_boolean_compare_transpose_tzindex_with_dst(self, tz):
# GH 19970
idx = date_range('20161101', '20161130', freq='4H', tz=tz)
df = DataFrame({'a': range(len(idx)), 'b': range(len(idx))},
index=idx)
result = df.T == df.T
expected = DataFrame(True, index=list('ab'), columns=idx)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize('copy', [True, False])
@pytest.mark.parametrize('method, tz', [
['tz_localize', None],
['tz_convert', 'Europe/Berlin']
])
def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz):
# GH 6326
result = DataFrame(np.arange(0, 5),
index=date_range('20131027', periods=5,
freq='1H', tz=tz))
getattr(result, method)('UTC', copy=copy)
expected = DataFrame(np.arange(0, 5),
index=date_range('20131027', periods=5,
freq='1H', tz=tz))
tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,32 @@
import pytest
from pandas.core.frame import DataFrame
@pytest.fixture
def dataframe():
return DataFrame({'a': [1, 2], 'b': [3, 4]})
class TestDataFrameValidate(object):
"""Tests for error handling related to data types of method arguments."""
@pytest.mark.parametrize("func", ["query", "eval", "set_index",
"reset_index", "dropna",
"drop_duplicates", "sort_values"])
@pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0])
def test_validate_bool_args(self, dataframe, func, inplace):
msg = "For argument \"inplace\" expected type bool"
kwargs = dict(inplace=inplace)
if func == "query":
kwargs["expr"] = "a > b"
elif func == "eval":
kwargs["expr"] = "a + b"
elif func == "set_index":
kwargs["keys"] = ["a"]
elif func == "sort_values":
kwargs["by"] = ["a"]
with pytest.raises(ValueError, match=msg):
getattr(dataframe, func)(**kwargs)