demo + utils venv
This commit is contained in:
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -0,0 +1,141 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas.util._decorators import cache_readonly
|
||||
|
||||
import pandas as pd
|
||||
from pandas import compat
|
||||
import pandas.util.testing as tm
|
||||
|
||||
_seriesd = tm.getSeriesData()
|
||||
_tsd = tm.getTimeSeriesData()
|
||||
|
||||
_frame = pd.DataFrame(_seriesd)
|
||||
_frame2 = pd.DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])
|
||||
_intframe = pd.DataFrame({k: v.astype(int)
|
||||
for k, v in compat.iteritems(_seriesd)})
|
||||
|
||||
_tsframe = pd.DataFrame(_tsd)
|
||||
|
||||
_mixed_frame = _frame.copy()
|
||||
_mixed_frame['foo'] = 'bar'
|
||||
|
||||
|
||||
class TestData(object):
    """Shared fixture data for legacy class-based DataFrame tests.

    Each property hands back a fresh ``.copy()`` of a module-level frame so
    a test can mutate its frame without leaking state into other tests;
    ``cache_readonly`` memoizes the copy once per test-class instance.
    """

    @cache_readonly
    def frame(self):
        # DataFrame of floats, columns ['A', 'B', 'C', 'D']
        return _frame.copy()

    @cache_readonly
    def frame2(self):
        # same data as `frame`, columns in reverse order ['D', 'C', 'B', 'A']
        return _frame2.copy()

    @cache_readonly
    def intframe(self):
        # force these all to int64 to avoid platform testing issues
        return pd.DataFrame({c: s for c, s in compat.iteritems(_intframe)},
                            dtype=np.int64)

    @cache_readonly
    def tsframe(self):
        # DataFrame of floats with a DatetimeIndex
        return _tsframe.copy()

    @cache_readonly
    def mixed_frame(self):
        # float columns plus a constant string column 'foo'
        return _mixed_frame.copy()

    @cache_readonly
    def mixed_float(self):
        # one column per float width: float32 x2, float16, float64
        return pd.DataFrame({'A': _frame['A'].copy().astype('float32'),
                             'B': _frame['B'].copy().astype('float32'),
                             'C': _frame['C'].copy().astype('float16'),
                             'D': _frame['D'].copy().astype('float64')})

    @cache_readonly
    def mixed_float2(self):
        # same per-column dtypes as `mixed_float`, built from the
        # reverse-ordered `_frame2`
        return pd.DataFrame({'A': _frame2['A'].copy().astype('float32'),
                             'B': _frame2['B'].copy().astype('float32'),
                             'C': _frame2['C'].copy().astype('float16'),
                             'D': _frame2['D'].copy().astype('float64')})

    @cache_readonly
    def mixed_int(self):
        # one column per int flavor, incl. unsigned; 'B' is all-ones uint64
        return pd.DataFrame({'A': _intframe['A'].copy().astype('int32'),
                             'B': np.ones(len(_intframe['B']), dtype='uint64'),
                             'C': _intframe['C'].copy().astype('uint8'),
                             'D': _intframe['D'].copy().astype('int64')})

    @cache_readonly
    def all_mixed(self):
        # scalar-broadcast float/int/str columns plus explicit
        # float32/int32 array columns over a RangeIndex of 10
        return pd.DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                             'float32': np.array([1.] * 10, dtype='float32'),
                             'int32': np.array([1] * 10, dtype='int32')},
                            index=np.arange(10))

    @cache_readonly
    def tzframe(self):
        # datetime columns: naive / US-Eastern / CET, with NaT holes
        # punched into row 1 of the two tz-aware columns
        result = pd.DataFrame({'A': pd.date_range('20130101', periods=3),
                               'B': pd.date_range('20130101', periods=3,
                                                  tz='US/Eastern'),
                               'C': pd.date_range('20130101', periods=3,
                                                  tz='CET')})
        result.iloc[1, 1] = pd.NaT
        result.iloc[1, 2] = pd.NaT
        return result

    @cache_readonly
    def empty(self):
        # completely empty DataFrame (no rows, no columns)
        return pd.DataFrame({})

    @cache_readonly
    def ts1(self):
        # 30-period float time series
        return tm.makeTimeSeries(nper=30)

    @cache_readonly
    def ts2(self):
        # a second 30-period series with the first 5 periods sliced off
        # (note: generated independently of `ts1`)
        return tm.makeTimeSeries(nper=30)[5:]

    @cache_readonly
    def simple(self):
        # simple 3x3 float frame with string row and column labels
        arr = np.array([[1., 2., 3.],
                        [4., 5., 6.],
                        [7., 8., 9.]])

        return pd.DataFrame(arr, columns=['one', 'two', 'three'],
                            index=['a', 'b', 'c'])

    # self.ts3 = tm.makeTimeSeries()[-5:]
    # self.ts4 = tm.makeTimeSeries()[1:-1]
|
||||
|
||||
|
||||
def _check_mixed_float(df, dtype=None):
    """Assert that a mixed-float frame's columns have the expected dtypes.

    Parameters
    ----------
    df : DataFrame
        Frame whose 'A'/'B'/'C'/'D' column dtypes are checked.
    dtype : str or dict, optional
        A single dtype expected for every column, or a dict of per-column
        overrides merged over the defaults.  A falsy entry (e.g. ``None``)
        skips the check for that column.
    """
    # float16 are most likely to be upcasted to float32
    dtypes = dict(A='float32', B='float32', C='float16', D='float64')
    if isinstance(dtype, compat.string_types):
        dtypes = {k: dtype for k in dtypes}
    elif isinstance(dtype, dict):
        dtypes.update(dtype)
    # one loop instead of four copy-pasted if-blocks; a missing or falsy
    # entry means "don't check this column", matching the original logic
    for col in ['A', 'B', 'C', 'D']:
        expected = dtypes.get(col)
        if expected:
            assert df.dtypes[col] == expected
|
||||
|
||||
|
||||
def _check_mixed_int(df, dtype=None):
    """Assert that a mixed-int frame's columns have the expected dtypes.

    Parameters
    ----------
    df : DataFrame
        Frame whose 'A'/'B'/'C'/'D' column dtypes are checked.
    dtype : str or dict, optional
        A single dtype expected for every column, or a dict of per-column
        overrides merged over the defaults.  A falsy entry (e.g. ``None``)
        skips the check for that column.
    """
    dtypes = dict(A='int32', B='uint64', C='uint8', D='int64')
    if isinstance(dtype, compat.string_types):
        dtypes = {k: dtype for k in dtypes}
    elif isinstance(dtype, dict):
        dtypes.update(dtype)
    # one loop instead of four copy-pasted if-blocks; a missing or falsy
    # entry means "don't check this column", matching the original logic
    for col in ['A', 'B', 'C', 'D']:
        expected = dtypes.get(col)
        if expected:
            assert df.dtypes[col] == expected
|
||||
@@ -0,0 +1,221 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, NaT, compat, date_range
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def float_frame():
|
||||
"""
|
||||
Fixture for DataFrame of floats with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D'].
|
||||
"""
|
||||
return DataFrame(tm.getSeriesData())
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def float_frame_with_na():
|
||||
"""
|
||||
Fixture for DataFrame of floats with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D']; some entries are missing
|
||||
"""
|
||||
df = DataFrame(tm.getSeriesData())
|
||||
# set some NAs
|
||||
df.loc[5:10] = np.nan
|
||||
df.loc[15:20, -2:] = np.nan
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def float_frame2():
|
||||
"""
|
||||
Fixture for DataFrame of floats with index of unique strings
|
||||
|
||||
Columns are ['D', 'C', 'B', 'A']
|
||||
"""
|
||||
return DataFrame(tm.getSeriesData(), columns=['D', 'C', 'B', 'A'])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def bool_frame_with_na():
|
||||
"""
|
||||
Fixture for DataFrame of booleans with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D']; some entries are missing
|
||||
"""
|
||||
df = DataFrame(tm.getSeriesData()) > 0
|
||||
df = df.astype(object)
|
||||
# set some NAs
|
||||
df.loc[5:10] = np.nan
|
||||
df.loc[15:20, -2:] = np.nan
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def int_frame():
|
||||
"""
|
||||
Fixture for DataFrame of ints with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D']
|
||||
"""
|
||||
df = DataFrame({k: v.astype(int)
|
||||
for k, v in compat.iteritems(tm.getSeriesData())})
|
||||
# force these all to int64 to avoid platform testing issues
|
||||
return DataFrame({c: s for c, s in compat.iteritems(df)}, dtype=np.int64)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def datetime_frame():
|
||||
"""
|
||||
Fixture for DataFrame of floats with DatetimeIndex
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D']
|
||||
"""
|
||||
return DataFrame(tm.getTimeSeriesData())
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def float_string_frame():
|
||||
"""
|
||||
Fixture for DataFrame of floats and strings with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D', 'foo'].
|
||||
"""
|
||||
df = DataFrame(tm.getSeriesData())
|
||||
df['foo'] = 'bar'
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mixed_float_frame():
|
||||
"""
|
||||
Fixture for DataFrame of different float types with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D'].
|
||||
"""
|
||||
df = DataFrame(tm.getSeriesData())
|
||||
df.A = df.A.astype('float32')
|
||||
df.B = df.B.astype('float32')
|
||||
df.C = df.C.astype('float16')
|
||||
df.D = df.D.astype('float64')
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
def mixed_float_frame2():
    """
    Fixture for DataFrame of different float types with index of unique strings

    Columns are ['D', 'C', 'B', 'A']: float64, float16, float32, float32
    (the same per-column dtypes as `mixed_float_frame`, reverse-ordered
    columns as in `float_frame2` and the class-based TestData.mixed_float2).
    """
    # BUG fix: the previous body cast D twice (float32 then float64) and
    # never touched A or used the reversed column order, so A and B stayed
    # plain float64 and the frame did not match its docstring.
    df = DataFrame(tm.getSeriesData(), columns=['D', 'C', 'B', 'A'])
    df.A = df.A.astype('float32')
    df.B = df.B.astype('float32')
    df.C = df.C.astype('float16')
    df.D = df.D.astype('float64')
    return df
|
||||
|
||||
|
||||
@pytest.fixture
def mixed_int_frame():
    """
    Fixture for DataFrame of different int types with index of unique strings

    Columns are ['A', 'B', 'C', 'D']: int32, uint64 (all ones), uint8, int64.
    """
    df = DataFrame({k: v.astype(int)
                    for k, v in compat.iteritems(tm.getSeriesData())})
    df.A = df.A.astype('int32')
    df.B = np.ones(len(df.B), dtype='uint64')
    df.C = df.C.astype('uint8')
    # BUG fix: was `df.C.astype('int64')`, which silently replaced column D
    # with column C's data; the class-based TestData.mixed_int derives D
    # from column D.
    df.D = df.D.astype('int64')
    return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mixed_type_frame():
|
||||
"""
|
||||
Fixture for DataFrame of float/int/string columns with RangeIndex
|
||||
|
||||
Columns are ['a', 'b', 'c', 'float32', 'int32'].
|
||||
"""
|
||||
return DataFrame({'a': 1., 'b': 2, 'c': 'foo',
|
||||
'float32': np.array([1.] * 10, dtype='float32'),
|
||||
'int32': np.array([1] * 10, dtype='int32')},
|
||||
index=np.arange(10))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def timezone_frame():
|
||||
"""
|
||||
Fixture for DataFrame of date_range Series with different time zones
|
||||
|
||||
Columns are ['A', 'B', 'C']; some entries are missing
|
||||
"""
|
||||
df = DataFrame({'A': date_range('20130101', periods=3),
|
||||
'B': date_range('20130101', periods=3,
|
||||
tz='US/Eastern'),
|
||||
'C': date_range('20130101', periods=3,
|
||||
tz='CET')})
|
||||
df.iloc[1, 1] = NaT
|
||||
df.iloc[1, 2] = NaT
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def empty_frame():
|
||||
"""
|
||||
Fixture for empty DataFrame
|
||||
"""
|
||||
return DataFrame({})
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def datetime_series():
|
||||
"""
|
||||
Fixture for Series of floats with DatetimeIndex
|
||||
"""
|
||||
return tm.makeTimeSeries(nper=30)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def datetime_series_short():
|
||||
"""
|
||||
Fixture for Series of floats with DatetimeIndex
|
||||
"""
|
||||
return tm.makeTimeSeries(nper=30)[5:]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def simple_frame():
|
||||
"""
|
||||
Fixture for simple 3x3 DataFrame
|
||||
|
||||
Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c'].
|
||||
"""
|
||||
arr = np.array([[1., 2., 3.],
|
||||
[4., 5., 6.],
|
||||
[7., 8., 9.]])
|
||||
|
||||
return DataFrame(arr, columns=['one', 'two', 'three'],
|
||||
index=['a', 'b', 'c'])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def frame_of_index_cols():
|
||||
"""
|
||||
Fixture for DataFrame of columns that can be used for indexing
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
|
||||
'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
|
||||
"""
|
||||
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
|
||||
'B': ['one', 'two', 'three', 'one', 'two'],
|
||||
'C': ['a', 'b', 'c', 'd', 'e'],
|
||||
'D': np.random.randn(5),
|
||||
'E': np.random.randn(5),
|
||||
('tuple', 'as', 'label'): np.random.randn(5)})
|
||||
return df
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,534 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
# pylint: disable-msg=W0612,E1101
|
||||
from copy import deepcopy
|
||||
import pydoc
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import long, lrange, range
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical, DataFrame, Series, SparseDataFrame, compat, date_range,
|
||||
timedelta_range)
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (
|
||||
assert_almost_equal, assert_frame_equal, assert_series_equal)
|
||||
|
||||
|
||||
class SharedWithSparse(object):
|
||||
"""
|
||||
A collection of tests DataFrame and SparseDataFrame can share.
|
||||
|
||||
In generic tests on this class, use ``self._assert_frame_equal()`` and
|
||||
``self._assert_series_equal()`` which are implemented in sub-classes
|
||||
and dispatch correctly.
|
||||
"""
|
||||
def _assert_frame_equal(self, left, right):
|
||||
"""Dispatch to frame class dependent assertion"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _assert_series_equal(self, left, right):
|
||||
"""Dispatch to series class dependent assertion"""
|
||||
raise NotImplementedError
|
||||
|
||||
def test_copy_index_name_checking(self, float_frame):
|
||||
# don't want to be able to modify the index stored elsewhere after
|
||||
# making a copy
|
||||
for attr in ('index', 'columns'):
|
||||
ind = getattr(float_frame, attr)
|
||||
ind.name = None
|
||||
cp = float_frame.copy()
|
||||
getattr(cp, attr).name = 'foo'
|
||||
assert getattr(float_frame, attr).name is None
|
||||
|
||||
def test_getitem_pop_assign_name(self, float_frame):
|
||||
s = float_frame['A']
|
||||
assert s.name == 'A'
|
||||
|
||||
s = float_frame.pop('A')
|
||||
assert s.name == 'A'
|
||||
|
||||
s = float_frame.loc[:, 'B']
|
||||
assert s.name == 'B'
|
||||
|
||||
s2 = s.loc[:]
|
||||
assert s2.name == 'B'
|
||||
|
||||
def test_get_value(self, float_frame):
|
||||
for idx in float_frame.index:
|
||||
for col in float_frame.columns:
|
||||
with tm.assert_produces_warning(FutureWarning,
|
||||
check_stacklevel=False):
|
||||
result = float_frame.get_value(idx, col)
|
||||
expected = float_frame[col][idx]
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_add_prefix_suffix(self, float_frame):
|
||||
with_prefix = float_frame.add_prefix('foo#')
|
||||
expected = pd.Index(['foo#%s' % c for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_prefix.columns, expected)
|
||||
|
||||
with_suffix = float_frame.add_suffix('#foo')
|
||||
expected = pd.Index(['%s#foo' % c for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_suffix.columns, expected)
|
||||
|
||||
with_pct_prefix = float_frame.add_prefix('%')
|
||||
expected = pd.Index(['%{}'.format(c) for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_pct_prefix.columns, expected)
|
||||
|
||||
with_pct_suffix = float_frame.add_suffix('%')
|
||||
expected = pd.Index(['{}%'.format(c) for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_pct_suffix.columns, expected)
|
||||
|
||||
def test_get_axis(self, float_frame):
|
||||
f = float_frame
|
||||
assert f._get_axis_number(0) == 0
|
||||
assert f._get_axis_number(1) == 1
|
||||
assert f._get_axis_number('index') == 0
|
||||
assert f._get_axis_number('rows') == 0
|
||||
assert f._get_axis_number('columns') == 1
|
||||
|
||||
assert f._get_axis_name(0) == 'index'
|
||||
assert f._get_axis_name(1) == 'columns'
|
||||
assert f._get_axis_name('index') == 'index'
|
||||
assert f._get_axis_name('rows') == 'index'
|
||||
assert f._get_axis_name('columns') == 'columns'
|
||||
|
||||
assert f._get_axis(0) is f.index
|
||||
assert f._get_axis(1) is f.columns
|
||||
|
||||
with pytest.raises(ValueError, match='No axis named'):
|
||||
f._get_axis_number(2)
|
||||
|
||||
with pytest.raises(ValueError, match='No axis.*foo'):
|
||||
f._get_axis_name('foo')
|
||||
|
||||
with pytest.raises(ValueError, match='No axis.*None'):
|
||||
f._get_axis_name(None)
|
||||
|
||||
with pytest.raises(ValueError, match='No axis named'):
|
||||
f._get_axis_number(None)
|
||||
|
||||
def test_keys(self, float_frame):
|
||||
getkeys = float_frame.keys
|
||||
assert getkeys() is float_frame.columns
|
||||
|
||||
def test_column_contains_typeerror(self, float_frame):
|
||||
try:
|
||||
float_frame.columns in float_frame
|
||||
except TypeError:
|
||||
pass
|
||||
|
||||
def test_tab_completion(self):
|
||||
# DataFrame whose columns are identifiers shall have them in __dir__.
|
||||
df = pd.DataFrame([list('abcd'), list('efgh')], columns=list('ABCD'))
|
||||
for key in list('ABCD'):
|
||||
assert key in dir(df)
|
||||
assert isinstance(df.__getitem__('A'), pd.Series)
|
||||
|
||||
# DataFrame whose first-level columns are identifiers shall have
|
||||
# them in __dir__.
|
||||
df = pd.DataFrame(
|
||||
[list('abcd'), list('efgh')],
|
||||
columns=pd.MultiIndex.from_tuples(list(zip('ABCD', 'EFGH'))))
|
||||
for key in list('ABCD'):
|
||||
assert key in dir(df)
|
||||
for key in list('EFGH'):
|
||||
assert key not in dir(df)
|
||||
assert isinstance(df.__getitem__('A'), pd.DataFrame)
|
||||
|
||||
def test_not_hashable(self, empty_frame):
|
||||
df = self.klass([1])
|
||||
pytest.raises(TypeError, hash, df)
|
||||
pytest.raises(TypeError, hash, empty_frame)
|
||||
|
||||
def test_new_empty_index(self):
|
||||
df1 = self.klass(np.random.randn(0, 3))
|
||||
df2 = self.klass(np.random.randn(0, 3))
|
||||
df1.index.name = 'foo'
|
||||
assert df2.index.name is None
|
||||
|
||||
def test_array_interface(self, float_frame):
|
||||
with np.errstate(all='ignore'):
|
||||
result = np.sqrt(float_frame)
|
||||
assert isinstance(result, type(float_frame))
|
||||
assert result.index is float_frame.index
|
||||
assert result.columns is float_frame.columns
|
||||
|
||||
self._assert_frame_equal(result, float_frame.apply(np.sqrt))
|
||||
|
||||
def test_get_agg_axis(self, float_frame):
|
||||
cols = float_frame._get_agg_axis(0)
|
||||
assert cols is float_frame.columns
|
||||
|
||||
idx = float_frame._get_agg_axis(1)
|
||||
assert idx is float_frame.index
|
||||
|
||||
pytest.raises(ValueError, float_frame._get_agg_axis, 2)
|
||||
|
||||
def test_nonzero(self, float_frame, float_string_frame, empty_frame):
|
||||
assert empty_frame.empty
|
||||
|
||||
assert not float_frame.empty
|
||||
assert not float_string_frame.empty
|
||||
|
||||
# corner case
|
||||
df = DataFrame({'A': [1., 2., 3.],
|
||||
'B': ['a', 'b', 'c']},
|
||||
index=np.arange(3))
|
||||
del df['A']
|
||||
assert not df.empty
|
||||
|
||||
def test_iteritems(self):
|
||||
df = self.klass([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
|
||||
for k, v in compat.iteritems(df):
|
||||
assert isinstance(v, self.klass._constructor_sliced)
|
||||
|
||||
def test_items(self):
|
||||
# GH 17213, GH 13918
|
||||
cols = ['a', 'b', 'c']
|
||||
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
|
||||
for c, (k, v) in zip(cols, df.items()):
|
||||
assert c == k
|
||||
assert isinstance(v, Series)
|
||||
assert (df[k] == v).all()
|
||||
|
||||
def test_iter(self, float_frame):
|
||||
assert tm.equalContents(list(float_frame), float_frame.columns)
|
||||
|
||||
def test_iterrows(self, float_frame, float_string_frame):
|
||||
for k, v in float_frame.iterrows():
|
||||
exp = float_frame.loc[k]
|
||||
self._assert_series_equal(v, exp)
|
||||
|
||||
for k, v in float_string_frame.iterrows():
|
||||
exp = float_string_frame.loc[k]
|
||||
self._assert_series_equal(v, exp)
|
||||
|
||||
def test_iterrows_iso8601(self):
|
||||
# GH 19671
|
||||
if self.klass == SparseDataFrame:
|
||||
pytest.xfail(reason='SparseBlock datetime type not implemented.')
|
||||
|
||||
s = self.klass(
|
||||
{'non_iso8601': ['M1701', 'M1802', 'M1903', 'M2004'],
|
||||
'iso8601': date_range('2000-01-01', periods=4, freq='M')})
|
||||
for k, v in s.iterrows():
|
||||
exp = s.loc[k]
|
||||
self._assert_series_equal(v, exp)
|
||||
|
||||
def test_itertuples(self, float_frame):
|
||||
for i, tup in enumerate(float_frame.itertuples()):
|
||||
s = self.klass._constructor_sliced(tup[1:])
|
||||
s.name = tup[0]
|
||||
expected = float_frame.iloc[i, :].reset_index(drop=True)
|
||||
self._assert_series_equal(s, expected)
|
||||
|
||||
df = self.klass({'floats': np.random.randn(5),
|
||||
'ints': lrange(5)}, columns=['floats', 'ints'])
|
||||
|
||||
for tup in df.itertuples(index=False):
|
||||
assert isinstance(tup[1], (int, long))
|
||||
|
||||
df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
|
||||
dfaa = df[['a', 'a']]
|
||||
|
||||
assert (list(dfaa.itertuples()) ==
|
||||
[(0, 1, 1), (1, 2, 2), (2, 3, 3)])
|
||||
|
||||
# repr with be int/long on 32-bit/windows
|
||||
if not (compat.is_platform_windows() or compat.is_platform_32bit()):
|
||||
assert (repr(list(df.itertuples(name=None))) ==
|
||||
'[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')
|
||||
|
||||
tup = next(df.itertuples(name='TestName'))
|
||||
assert tup._fields == ('Index', 'a', 'b')
|
||||
assert (tup.Index, tup.a, tup.b) == tup
|
||||
assert type(tup).__name__ == 'TestName'
|
||||
|
||||
df.columns = ['def', 'return']
|
||||
tup2 = next(df.itertuples(name='TestName'))
|
||||
assert tup2 == (0, 1, 4)
|
||||
assert tup2._fields == ('Index', '_1', '_2')
|
||||
|
||||
df3 = DataFrame({'f' + str(i): [i] for i in range(1024)})
|
||||
# will raise SyntaxError if trying to create namedtuple
|
||||
tup3 = next(df3.itertuples())
|
||||
assert not hasattr(tup3, '_fields')
|
||||
assert isinstance(tup3, tuple)
|
||||
|
||||
def test_sequence_like_with_categorical(self):
|
||||
|
||||
# GH 7839
|
||||
# make sure can iterate
|
||||
df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
|
||||
"raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
|
||||
df['grade'] = Categorical(df['raw_grade'])
|
||||
|
||||
# basic sequencing testing
|
||||
result = list(df.grade.values)
|
||||
expected = np.array(df.grade.values).tolist()
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
# iteration
|
||||
for t in df.itertuples(index=False):
|
||||
str(t)
|
||||
|
||||
for row, s in df.iterrows():
|
||||
str(s)
|
||||
|
||||
for c, col in df.iteritems():
|
||||
str(s)
|
||||
|
||||
def test_len(self, float_frame):
|
||||
assert len(float_frame) == len(float_frame.index)
|
||||
|
||||
def test_values(self, float_frame, float_string_frame):
|
||||
frame = float_frame
|
||||
arr = frame.values
|
||||
|
||||
frame_cols = frame.columns
|
||||
for i, row in enumerate(arr):
|
||||
for j, value in enumerate(row):
|
||||
col = frame_cols[j]
|
||||
if np.isnan(value):
|
||||
assert np.isnan(frame[col][i])
|
||||
else:
|
||||
assert value == frame[col][i]
|
||||
|
||||
# mixed type
|
||||
arr = float_string_frame[['foo', 'A']].values
|
||||
assert arr[0, 0] == 'bar'
|
||||
|
||||
df = self.klass({'complex': [1j, 2j, 3j], 'real': [1, 2, 3]})
|
||||
arr = df.values
|
||||
assert arr[0, 0] == 1j
|
||||
|
||||
# single block corner case
|
||||
arr = float_frame[['A', 'B']].values
|
||||
expected = float_frame.reindex(columns=['A', 'B']).values
|
||||
assert_almost_equal(arr, expected)
|
||||
|
||||
def test_to_numpy(self):
|
||||
df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
|
||||
expected = np.array([[1, 3], [2, 4.5]])
|
||||
result = df.to_numpy()
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_to_numpy_dtype(self):
|
||||
df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
|
||||
expected = np.array([[1, 3], [2, 4]], dtype="int64")
|
||||
result = df.to_numpy(dtype="int64")
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_to_numpy_copy(self):
|
||||
arr = np.random.randn(4, 3)
|
||||
df = pd.DataFrame(arr)
|
||||
assert df.values.base is arr
|
||||
assert df.to_numpy(copy=False).base is arr
|
||||
assert df.to_numpy(copy=True).base is None
|
||||
|
||||
def test_transpose(self, float_frame):
|
||||
frame = float_frame
|
||||
dft = frame.T
|
||||
for idx, series in compat.iteritems(dft):
|
||||
for col, value in compat.iteritems(series):
|
||||
if np.isnan(value):
|
||||
assert np.isnan(frame[col][idx])
|
||||
else:
|
||||
assert value == frame[col][idx]
|
||||
|
||||
# mixed type
|
||||
index, data = tm.getMixedTypeDict()
|
||||
mixed = self.klass(data, index=index)
|
||||
|
||||
mixed_T = mixed.T
|
||||
for col, s in compat.iteritems(mixed_T):
|
||||
assert s.dtype == np.object_
|
||||
|
||||
def test_swapaxes(self):
|
||||
df = self.klass(np.random.randn(10, 5))
|
||||
self._assert_frame_equal(df.T, df.swapaxes(0, 1))
|
||||
self._assert_frame_equal(df.T, df.swapaxes(1, 0))
|
||||
self._assert_frame_equal(df, df.swapaxes(0, 0))
|
||||
pytest.raises(ValueError, df.swapaxes, 2, 5)
|
||||
|
||||
def test_axis_aliases(self, float_frame):
|
||||
f = float_frame
|
||||
|
||||
# reg name
|
||||
expected = f.sum(axis=0)
|
||||
result = f.sum(axis='index')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
expected = f.sum(axis=1)
|
||||
result = f.sum(axis='columns')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_class_axis(self):
|
||||
# GH 18147
|
||||
# no exception and no empty docstring
|
||||
assert pydoc.getdoc(DataFrame.index)
|
||||
assert pydoc.getdoc(DataFrame.columns)
|
||||
|
||||
def test_more_values(self, float_string_frame):
|
||||
values = float_string_frame.values
|
||||
assert values.shape[1] == len(float_string_frame.columns)
|
||||
|
||||
def test_repr_with_mi_nat(self, float_string_frame):
|
||||
df = self.klass({'X': [1, 2]},
|
||||
index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']])
|
||||
result = repr(df)
|
||||
expected = ' X\nNaT a 1\n2013-01-01 b 2'
|
||||
assert result == expected
|
||||
|
||||
def test_iteritems_names(self, float_string_frame):
|
||||
for k, v in compat.iteritems(float_string_frame):
|
||||
assert v.name == k
|
||||
|
||||
def test_series_put_names(self, float_string_frame):
|
||||
series = float_string_frame._series
|
||||
for k, v in compat.iteritems(series):
|
||||
assert v.name == k
|
||||
|
||||
def test_empty_nonzero(self):
|
||||
df = self.klass([1, 2, 3])
|
||||
assert not df.empty
|
||||
df = self.klass(index=[1], columns=[1])
|
||||
assert not df.empty
|
||||
df = self.klass(index=['a', 'b'], columns=['c', 'd']).dropna()
|
||||
assert df.empty
|
||||
assert df.T.empty
|
||||
empty_frames = [self.klass(),
|
||||
self.klass(index=[1]),
|
||||
self.klass(columns=[1]),
|
||||
self.klass({1: []})]
|
||||
for df in empty_frames:
|
||||
assert df.empty
|
||||
assert df.T.empty
|
||||
|
||||
def test_with_datetimelikes(self):
|
||||
|
||||
df = self.klass({'A': date_range('20130101', periods=10),
|
||||
'B': timedelta_range('1 day', periods=10)})
|
||||
t = df.T
|
||||
|
||||
result = t.get_dtype_counts()
|
||||
if self.klass is DataFrame:
|
||||
expected = Series({'object': 10})
|
||||
else:
|
||||
expected = Series({'Sparse[object, nan]': 10})
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameMisc(SharedWithSparse):
|
||||
|
||||
klass = DataFrame
|
||||
# SharedWithSparse tests use generic, klass-agnostic assertion
|
||||
_assert_frame_equal = staticmethod(assert_frame_equal)
|
||||
_assert_series_equal = staticmethod(assert_series_equal)
|
||||
|
||||
def test_values(self, float_frame):
|
||||
float_frame.values[:, 0] = 5.
|
||||
assert (float_frame.values[:, 0] == 5).all()
|
||||
|
||||
def test_as_matrix_deprecated(self, float_frame):
|
||||
# GH 18458
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
cols = float_frame.columns.tolist()
|
||||
result = float_frame.as_matrix(columns=cols)
|
||||
expected = float_frame.values
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_deepcopy(self, float_frame):
|
||||
cp = deepcopy(float_frame)
|
||||
series = cp['A']
|
||||
series[:] = 10
|
||||
for idx, value in compat.iteritems(series):
|
||||
assert float_frame['A'][idx] != value
|
||||
|
||||
def test_transpose_get_view(self, float_frame):
|
||||
dft = float_frame.T
|
||||
dft.values[:, 5:10] = 5
|
||||
|
||||
assert (float_frame.values[5:10] == 5).all()
|
||||
|
||||
def test_inplace_return_self(self):
|
||||
# GH 1893
|
||||
|
||||
data = DataFrame({'a': ['foo', 'bar', 'baz', 'qux'],
|
||||
'b': [0, 0, 1, 1],
|
||||
'c': [1, 2, 3, 4]})
|
||||
|
||||
def _check_f(base, f):
|
||||
result = f(base)
|
||||
assert result is None
|
||||
|
||||
# -----DataFrame-----
|
||||
|
||||
# set_index
|
||||
f = lambda x: x.set_index('a', inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# reset_index
|
||||
f = lambda x: x.reset_index(inplace=True)
|
||||
_check_f(data.set_index('a'), f)
|
||||
|
||||
# drop_duplicates
|
||||
f = lambda x: x.drop_duplicates(inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# sort
|
||||
f = lambda x: x.sort_values('b', inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# sort_index
|
||||
f = lambda x: x.sort_index(inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# fillna
|
||||
f = lambda x: x.fillna(0, inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# replace
|
||||
f = lambda x: x.replace(1, 0, inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# rename
|
||||
f = lambda x: x.rename({1: 'foo'}, inplace=True)
|
||||
_check_f(data.copy(), f)
|
||||
|
||||
# -----Series-----
|
||||
d = data.copy()['c']
|
||||
|
||||
# reset_index
|
||||
f = lambda x: x.reset_index(inplace=True, drop=True)
|
||||
_check_f(data.set_index('a')['c'], f)
|
||||
|
||||
# fillna
|
||||
f = lambda x: x.fillna(0, inplace=True)
|
||||
_check_f(d.copy(), f)
|
||||
|
||||
# replace
|
||||
f = lambda x: x.replace(1, 0, inplace=True)
|
||||
_check_f(d.copy(), f)
|
||||
|
||||
# rename
|
||||
f = lambda x: x.rename({1: 'foo'}, inplace=True)
|
||||
_check_f(d.copy(), f)
|
||||
|
||||
def test_tab_complete_warning(self, ip):
|
||||
# GH 16409
|
||||
pytest.importorskip('IPython', minversion="6.0.0")
|
||||
from IPython.core.completer import provisionalcompleter
|
||||
|
||||
code = "import pandas as pd; df = pd.DataFrame()"
|
||||
ip.run_code(code)
|
||||
with tm.assert_produces_warning(None):
|
||||
with provisionalcompleter('ignore'):
|
||||
list(ip.Completer.completions('df.', 1))
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,636 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from collections import deque
|
||||
from datetime import datetime
|
||||
import operator
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import range
|
||||
|
||||
import pandas as pd
|
||||
from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int
|
||||
import pandas.util.testing as tm
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Comparisons
|
||||
|
||||
|
||||
class TestFrameComparisons(object):
|
||||
# Specifically _not_ flex-comparisons
|
||||
|
||||
def test_comparison_invalid(self):
|
||||
|
||||
def check(df, df2):
|
||||
|
||||
for (x, y) in [(df, df2), (df2, df)]:
|
||||
# we expect the result to match Series comparisons for
|
||||
# == and !=, inequalities should raise
|
||||
result = x == y
|
||||
expected = pd.DataFrame({col: x[col] == y[col]
|
||||
for col in x.columns},
|
||||
index=x.index, columns=x.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = x != y
|
||||
expected = pd.DataFrame({col: x[col] != y[col]
|
||||
for col in x.columns},
|
||||
index=x.index, columns=x.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
x >= y
|
||||
with pytest.raises(TypeError):
|
||||
x > y
|
||||
with pytest.raises(TypeError):
|
||||
x < y
|
||||
with pytest.raises(TypeError):
|
||||
x <= y
|
||||
|
||||
# GH4968
|
||||
# invalid date/int comparisons
|
||||
df = pd.DataFrame(np.random.randint(10, size=(10, 1)), columns=['a'])
|
||||
df['dates'] = pd.date_range('20010101', periods=len(df))
|
||||
|
||||
df2 = df.copy()
|
||||
df2['dates'] = df['a']
|
||||
check(df, df2)
|
||||
|
||||
df = pd.DataFrame(np.random.randint(10, size=(10, 2)),
|
||||
columns=['a', 'b'])
|
||||
df2 = pd.DataFrame({'a': pd.date_range('20010101', periods=len(df)),
|
||||
'b': pd.date_range('20100101', periods=len(df))})
|
||||
check(df, df2)
|
||||
|
||||
def test_timestamp_compare(self):
|
||||
# make sure we can compare Timestamps on the right AND left hand side
|
||||
# GH#4982
|
||||
df = pd. DataFrame({'dates1': pd.date_range('20010101', periods=10),
|
||||
'dates2': pd.date_range('20010102', periods=10),
|
||||
'intcol': np.random.randint(1000000000, size=10),
|
||||
'floatcol': np.random.randn(10),
|
||||
'stringcol': list(tm.rands(10))})
|
||||
df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT
|
||||
ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq',
|
||||
'ne': 'ne'}
|
||||
|
||||
for left, right in ops.items():
|
||||
left_f = getattr(operator, left)
|
||||
right_f = getattr(operator, right)
|
||||
|
||||
# no nats
|
||||
if left in ['eq', 'ne']:
|
||||
expected = left_f(df, pd.Timestamp('20010109'))
|
||||
result = right_f(pd.Timestamp('20010109'), df)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
with pytest.raises(TypeError):
|
||||
left_f(df, pd.Timestamp('20010109'))
|
||||
with pytest.raises(TypeError):
|
||||
right_f(pd.Timestamp('20010109'), df)
|
||||
# nats
|
||||
expected = left_f(df, pd.Timestamp('nat'))
|
||||
result = right_f(pd.Timestamp('nat'), df)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_mixed_comparison(self):
|
||||
# GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
|
||||
# not raise TypeError
|
||||
# (this appears to be fixed before GH#22163, not sure when)
|
||||
df = pd.DataFrame([['1989-08-01', 1], ['1989-08-01', 2]])
|
||||
other = pd.DataFrame([['a', 'b'], ['c', 'd']])
|
||||
|
||||
result = df == other
|
||||
assert not result.any().any()
|
||||
|
||||
result = df != other
|
||||
assert result.all().all()
|
||||
|
||||
def test_df_boolean_comparison_error(self):
|
||||
# GH#4576, GH#22880
|
||||
# comparing DataFrame against list/tuple with len(obj) matching
|
||||
# len(df.columns) is supported as of GH#22800
|
||||
df = pd.DataFrame(np.arange(6).reshape((3, 2)))
|
||||
|
||||
expected = pd.DataFrame([[False, False],
|
||||
[True, False],
|
||||
[False, False]])
|
||||
|
||||
result = df == (2, 2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df == [2, 2]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_df_float_none_comparison(self):
|
||||
df = pd.DataFrame(np.random.randn(8, 3), index=range(8),
|
||||
columns=['A', 'B', 'C'])
|
||||
|
||||
result = df.__eq__(None)
|
||||
assert not result.any().any()
|
||||
|
||||
def test_df_string_comparison(self):
|
||||
df = pd.DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}])
|
||||
mask_a = df.a > 1
|
||||
tm.assert_frame_equal(df[mask_a], df.loc[1:1, :])
|
||||
tm.assert_frame_equal(df[-mask_a], df.loc[0:0, :])
|
||||
|
||||
mask_b = df.b == "foo"
|
||||
tm.assert_frame_equal(df[mask_b], df.loc[0:0, :])
|
||||
tm.assert_frame_equal(df[-mask_b], df.loc[1:1, :])
|
||||
|
||||
|
||||
class TestFrameFlexComparisons(object):
|
||||
# TODO: test_bool_flex_frame needs a better name
|
||||
def test_bool_flex_frame(self):
|
||||
data = np.random.randn(5, 3)
|
||||
other_data = np.random.randn(5, 3)
|
||||
df = pd.DataFrame(data)
|
||||
other = pd.DataFrame(other_data)
|
||||
ndim_5 = np.ones(df.shape + (1, 3))
|
||||
|
||||
# Unaligned
|
||||
def _check_unaligned_frame(meth, op, df, other):
|
||||
part_o = other.loc[3:, 1:].copy()
|
||||
rs = meth(part_o)
|
||||
xp = op(df, part_o.reindex(index=df.index, columns=df.columns))
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
# DataFrame
|
||||
assert df.eq(df).values.all()
|
||||
assert not df.ne(df).values.any()
|
||||
for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']:
|
||||
f = getattr(df, op)
|
||||
o = getattr(operator, op)
|
||||
# No NAs
|
||||
tm.assert_frame_equal(f(other), o(df, other))
|
||||
_check_unaligned_frame(f, o, df, other)
|
||||
# ndarray
|
||||
tm.assert_frame_equal(f(other.values), o(df, other.values))
|
||||
# scalar
|
||||
tm.assert_frame_equal(f(0), o(df, 0))
|
||||
# NAs
|
||||
msg = "Unable to coerce to Series/DataFrame"
|
||||
tm.assert_frame_equal(f(np.nan), o(df, np.nan))
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
f(ndim_5)
|
||||
|
||||
# Series
|
||||
def _test_seq(df, idx_ser, col_ser):
|
||||
idx_eq = df.eq(idx_ser, axis=0)
|
||||
col_eq = df.eq(col_ser)
|
||||
idx_ne = df.ne(idx_ser, axis=0)
|
||||
col_ne = df.ne(col_ser)
|
||||
tm.assert_frame_equal(col_eq, df == pd.Series(col_ser))
|
||||
tm.assert_frame_equal(col_eq, -col_ne)
|
||||
tm.assert_frame_equal(idx_eq, -idx_ne)
|
||||
tm.assert_frame_equal(idx_eq, df.T.eq(idx_ser).T)
|
||||
tm.assert_frame_equal(col_eq, df.eq(list(col_ser)))
|
||||
tm.assert_frame_equal(idx_eq, df.eq(pd.Series(idx_ser), axis=0))
|
||||
tm.assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0))
|
||||
|
||||
idx_gt = df.gt(idx_ser, axis=0)
|
||||
col_gt = df.gt(col_ser)
|
||||
idx_le = df.le(idx_ser, axis=0)
|
||||
col_le = df.le(col_ser)
|
||||
|
||||
tm.assert_frame_equal(col_gt, df > pd.Series(col_ser))
|
||||
tm.assert_frame_equal(col_gt, -col_le)
|
||||
tm.assert_frame_equal(idx_gt, -idx_le)
|
||||
tm.assert_frame_equal(idx_gt, df.T.gt(idx_ser).T)
|
||||
|
||||
idx_ge = df.ge(idx_ser, axis=0)
|
||||
col_ge = df.ge(col_ser)
|
||||
idx_lt = df.lt(idx_ser, axis=0)
|
||||
col_lt = df.lt(col_ser)
|
||||
tm.assert_frame_equal(col_ge, df >= pd.Series(col_ser))
|
||||
tm.assert_frame_equal(col_ge, -col_lt)
|
||||
tm.assert_frame_equal(idx_ge, -idx_lt)
|
||||
tm.assert_frame_equal(idx_ge, df.T.ge(idx_ser).T)
|
||||
|
||||
idx_ser = pd.Series(np.random.randn(5))
|
||||
col_ser = pd.Series(np.random.randn(3))
|
||||
_test_seq(df, idx_ser, col_ser)
|
||||
|
||||
# list/tuple
|
||||
_test_seq(df, idx_ser.values, col_ser.values)
|
||||
|
||||
# NA
|
||||
df.loc[0, 0] = np.nan
|
||||
rs = df.eq(df)
|
||||
assert not rs.loc[0, 0]
|
||||
rs = df.ne(df)
|
||||
assert rs.loc[0, 0]
|
||||
rs = df.gt(df)
|
||||
assert not rs.loc[0, 0]
|
||||
rs = df.lt(df)
|
||||
assert not rs.loc[0, 0]
|
||||
rs = df.ge(df)
|
||||
assert not rs.loc[0, 0]
|
||||
rs = df.le(df)
|
||||
assert not rs.loc[0, 0]
|
||||
|
||||
# complex
|
||||
arr = np.array([np.nan, 1, 6, np.nan])
|
||||
arr2 = np.array([2j, np.nan, 7, None])
|
||||
df = pd.DataFrame({'a': arr})
|
||||
df2 = pd.DataFrame({'a': arr2})
|
||||
rs = df.gt(df2)
|
||||
assert not rs.values.any()
|
||||
rs = df.ne(df2)
|
||||
assert rs.values.all()
|
||||
|
||||
arr3 = np.array([2j, np.nan, None])
|
||||
df3 = pd.DataFrame({'a': arr3})
|
||||
rs = df3.gt(2j)
|
||||
assert not rs.values.any()
|
||||
|
||||
# corner, dtype=object
|
||||
df1 = pd.DataFrame({'col': ['foo', np.nan, 'bar']})
|
||||
df2 = pd.DataFrame({'col': ['foo', datetime.now(), 'bar']})
|
||||
result = df1.ne(df2)
|
||||
exp = pd.DataFrame({'col': [False, True, False]})
|
||||
tm.assert_frame_equal(result, exp)
|
||||
|
||||
def test_flex_comparison_nat(self):
|
||||
# GH 15697, GH 22163 df.eq(pd.NaT) should behave like df == pd.NaT,
|
||||
# and _definitely_ not be NaN
|
||||
df = pd.DataFrame([pd.NaT])
|
||||
|
||||
result = df == pd.NaT
|
||||
# result.iloc[0, 0] is a np.bool_ object
|
||||
assert result.iloc[0, 0].item() is False
|
||||
|
||||
result = df.eq(pd.NaT)
|
||||
assert result.iloc[0, 0].item() is False
|
||||
|
||||
result = df != pd.NaT
|
||||
assert result.iloc[0, 0].item() is True
|
||||
|
||||
result = df.ne(pd.NaT)
|
||||
assert result.iloc[0, 0].item() is True
|
||||
|
||||
@pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
|
||||
def test_df_flex_cmp_constant_return_types(self, opname):
|
||||
# GH 15077, non-empty DataFrame
|
||||
df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
|
||||
const = 2
|
||||
|
||||
result = getattr(df, opname)(const).get_dtype_counts()
|
||||
tm.assert_series_equal(result, pd.Series([2], ['bool']))
|
||||
|
||||
@pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
|
||||
def test_df_flex_cmp_constant_return_types_empty(self, opname):
|
||||
# GH 15077 empty DataFrame
|
||||
df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
|
||||
const = 2
|
||||
|
||||
empty = df.iloc[:0]
|
||||
result = getattr(empty, opname)(const).get_dtype_counts()
|
||||
tm.assert_series_equal(result, pd.Series([2], ['bool']))
|
||||
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Arithmetic
|
||||
|
||||
class TestFrameFlexArithmetic(object):
|
||||
|
||||
def test_df_add_td64_columnwise(self):
|
||||
# GH 22534 Check that column-wise addition broadcasts correctly
|
||||
dti = pd.date_range('2016-01-01', periods=10)
|
||||
tdi = pd.timedelta_range('1', periods=10)
|
||||
tser = pd.Series(tdi)
|
||||
df = pd.DataFrame({0: dti, 1: tdi})
|
||||
|
||||
result = df.add(tser, axis=0)
|
||||
expected = pd.DataFrame({0: dti + tdi,
|
||||
1: tdi + tdi})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_df_add_flex_filled_mixed_dtypes(self):
|
||||
# GH 19611
|
||||
dti = pd.date_range('2016-01-01', periods=3)
|
||||
ser = pd.Series(['1 Day', 'NaT', '2 Days'], dtype='timedelta64[ns]')
|
||||
df = pd.DataFrame({'A': dti, 'B': ser})
|
||||
other = pd.DataFrame({'A': ser, 'B': ser})
|
||||
fill = pd.Timedelta(days=1).to_timedelta64()
|
||||
result = df.add(other, fill_value=fill)
|
||||
|
||||
expected = pd.DataFrame(
|
||||
{'A': pd.Series(['2016-01-02', '2016-01-03', '2016-01-05'],
|
||||
dtype='datetime64[ns]'),
|
||||
'B': ser * 2})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_arith_flex_frame(self, all_arithmetic_operators, float_frame,
|
||||
mixed_float_frame):
|
||||
# one instance of parametrized fixture
|
||||
op = all_arithmetic_operators
|
||||
|
||||
def f(x, y):
|
||||
# r-versions not in operator-stdlib; get op without "r" and invert
|
||||
if op.startswith('__r'):
|
||||
return getattr(operator, op.replace('__r', '__'))(y, x)
|
||||
return getattr(operator, op)(x, y)
|
||||
|
||||
result = getattr(float_frame, op)(2 * float_frame)
|
||||
expected = f(float_frame, 2 * float_frame)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# vs mix float
|
||||
result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
|
||||
expected = f(mixed_float_frame, 2 * mixed_float_frame)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
_check_mixed_float(result, dtype=dict(C=None))
|
||||
|
||||
@pytest.mark.parametrize('op', ['__add__', '__sub__', '__mul__'])
|
||||
def test_arith_flex_frame_mixed(self, op, int_frame, mixed_int_frame,
|
||||
mixed_float_frame):
|
||||
f = getattr(operator, op)
|
||||
|
||||
# vs mix int
|
||||
result = getattr(mixed_int_frame, op)(2 + mixed_int_frame)
|
||||
expected = f(mixed_int_frame, 2 + mixed_int_frame)
|
||||
|
||||
# no overflow in the uint
|
||||
dtype = None
|
||||
if op in ['__sub__']:
|
||||
dtype = dict(B='uint64', C=None)
|
||||
elif op in ['__add__', '__mul__']:
|
||||
dtype = dict(C=None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
_check_mixed_int(result, dtype=dtype)
|
||||
|
||||
# vs mix float
|
||||
result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
|
||||
expected = f(mixed_float_frame, 2 * mixed_float_frame)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
_check_mixed_float(result, dtype=dict(C=None))
|
||||
|
||||
# vs plain int
|
||||
result = getattr(int_frame, op)(2 * int_frame)
|
||||
expected = f(int_frame, 2 * int_frame)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_arith_flex_frame_raise(self, all_arithmetic_operators,
|
||||
float_frame):
|
||||
# one instance of parametrized fixture
|
||||
op = all_arithmetic_operators
|
||||
|
||||
# Check that arrays with dim >= 3 raise
|
||||
for dim in range(3, 6):
|
||||
arr = np.ones((1,) * dim)
|
||||
msg = "Unable to coerce to Series/DataFrame"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
getattr(float_frame, op)(arr)
|
||||
|
||||
def test_arith_flex_frame_corner(self, float_frame):
|
||||
|
||||
const_add = float_frame.add(1)
|
||||
tm.assert_frame_equal(const_add, float_frame + 1)
|
||||
|
||||
# corner cases
|
||||
result = float_frame.add(float_frame[:0])
|
||||
tm.assert_frame_equal(result, float_frame * np.nan)
|
||||
|
||||
result = float_frame[:0].add(float_frame)
|
||||
tm.assert_frame_equal(result, float_frame * np.nan)
|
||||
|
||||
with pytest.raises(NotImplementedError, match='fill_value'):
|
||||
float_frame.add(float_frame.iloc[0], fill_value=3)
|
||||
|
||||
with pytest.raises(NotImplementedError, match='fill_value'):
|
||||
float_frame.add(float_frame.iloc[0], axis='index', fill_value=3)
|
||||
|
||||
def test_arith_flex_series(self, simple_frame):
|
||||
df = simple_frame
|
||||
|
||||
row = df.xs('a')
|
||||
col = df['two']
|
||||
# after arithmetic refactor, add truediv here
|
||||
ops = ['add', 'sub', 'mul', 'mod']
|
||||
for op in ops:
|
||||
f = getattr(df, op)
|
||||
op = getattr(operator, op)
|
||||
tm.assert_frame_equal(f(row), op(df, row))
|
||||
tm.assert_frame_equal(f(col, axis=0), op(df.T, col).T)
|
||||
|
||||
# special case for some reason
|
||||
tm.assert_frame_equal(df.add(row, axis=None), df + row)
|
||||
|
||||
# cases which will be refactored after big arithmetic refactor
|
||||
tm.assert_frame_equal(df.div(row), df / row)
|
||||
tm.assert_frame_equal(df.div(col, axis=0), (df.T / col).T)
|
||||
|
||||
# broadcasting issue in GH 7325
|
||||
df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='int64')
|
||||
expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
|
||||
result = df.div(df[0], axis='index')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='float64')
|
||||
expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
|
||||
result = df.div(df[0], axis='index')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_arith_flex_zero_len_raises(self):
|
||||
# GH 19522 passing fill_value to frame flex arith methods should
|
||||
# raise even in the zero-length special cases
|
||||
ser_len0 = pd.Series([])
|
||||
df_len0 = pd.DataFrame([], columns=['A', 'B'])
|
||||
df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
|
||||
|
||||
with pytest.raises(NotImplementedError, match='fill_value'):
|
||||
df.add(ser_len0, fill_value='E')
|
||||
|
||||
with pytest.raises(NotImplementedError, match='fill_value'):
|
||||
df_len0.sub(df['A'], axis=None, fill_value=3)
|
||||
|
||||
|
||||
class TestFrameArithmetic(object):
|
||||
def test_df_add_2d_array_rowlike_broadcasts(self):
|
||||
# GH#23000
|
||||
arr = np.arange(6).reshape(3, 2)
|
||||
df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
|
||||
|
||||
rowlike = arr[[1], :] # shape --> (1, ncols)
|
||||
assert rowlike.shape == (1, df.shape[1])
|
||||
|
||||
expected = pd.DataFrame([[2, 4],
|
||||
[4, 6],
|
||||
[6, 8]],
|
||||
columns=df.columns, index=df.index,
|
||||
# specify dtype explicitly to avoid failing
|
||||
# on 32bit builds
|
||||
dtype=arr.dtype)
|
||||
result = df + rowlike
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = rowlike + df
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_df_add_2d_array_collike_broadcasts(self):
|
||||
# GH#23000
|
||||
arr = np.arange(6).reshape(3, 2)
|
||||
df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
|
||||
|
||||
collike = arr[:, [1]] # shape --> (nrows, 1)
|
||||
assert collike.shape == (df.shape[0], 1)
|
||||
|
||||
expected = pd.DataFrame([[1, 2],
|
||||
[5, 6],
|
||||
[9, 10]],
|
||||
columns=df.columns, index=df.index,
|
||||
# specify dtype explicitly to avoid failing
|
||||
# on 32bit builds
|
||||
dtype=arr.dtype)
|
||||
result = df + collike
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = collike + df
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_df_arith_2d_array_rowlike_broadcasts(self,
|
||||
all_arithmetic_operators):
|
||||
# GH#23000
|
||||
opname = all_arithmetic_operators
|
||||
|
||||
arr = np.arange(6).reshape(3, 2)
|
||||
df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
|
||||
|
||||
rowlike = arr[[1], :] # shape --> (1, ncols)
|
||||
assert rowlike.shape == (1, df.shape[1])
|
||||
|
||||
exvals = [getattr(df.loc['A'], opname)(rowlike.squeeze()),
|
||||
getattr(df.loc['B'], opname)(rowlike.squeeze()),
|
||||
getattr(df.loc['C'], opname)(rowlike.squeeze())]
|
||||
|
||||
expected = pd.DataFrame(exvals, columns=df.columns, index=df.index)
|
||||
|
||||
if opname in ['__rmod__', '__rfloordiv__']:
|
||||
# exvals will have dtypes [f8, i8, i8] so expected will be
|
||||
# all-f8, but the DataFrame operation will return mixed dtypes
|
||||
# use exvals[-1].dtype instead of "i8" for compat with 32-bit
|
||||
# systems/pythons
|
||||
expected[False] = expected[False].astype(exvals[-1].dtype)
|
||||
|
||||
result = getattr(df, opname)(rowlike)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_df_arith_2d_array_collike_broadcasts(self,
|
||||
all_arithmetic_operators):
|
||||
# GH#23000
|
||||
opname = all_arithmetic_operators
|
||||
|
||||
arr = np.arange(6).reshape(3, 2)
|
||||
df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C'])
|
||||
|
||||
collike = arr[:, [1]] # shape --> (nrows, 1)
|
||||
assert collike.shape == (df.shape[0], 1)
|
||||
|
||||
exvals = {True: getattr(df[True], opname)(collike.squeeze()),
|
||||
False: getattr(df[False], opname)(collike.squeeze())}
|
||||
|
||||
dtype = None
|
||||
if opname in ['__rmod__', '__rfloordiv__']:
|
||||
# Series ops may return mixed int/float dtypes in cases where
|
||||
# DataFrame op will return all-float. So we upcast `expected`
|
||||
dtype = np.common_type(*[x.values for x in exvals.values()])
|
||||
|
||||
expected = pd.DataFrame(exvals, columns=df.columns, index=df.index,
|
||||
dtype=dtype)
|
||||
|
||||
result = getattr(df, opname)(collike)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_df_bool_mul_int(self):
|
||||
# GH 22047, GH 22163 multiplication by 1 should result in int dtype,
|
||||
# not object dtype
|
||||
df = pd.DataFrame([[False, True], [False, False]])
|
||||
result = df * 1
|
||||
|
||||
# On appveyor this comes back as np.int32 instead of np.int64,
|
||||
# so we check dtype.kind instead of just dtype
|
||||
kinds = result.dtypes.apply(lambda x: x.kind)
|
||||
assert (kinds == 'i').all()
|
||||
|
||||
result = 1 * df
|
||||
kinds = result.dtypes.apply(lambda x: x.kind)
|
||||
assert (kinds == 'i').all()
|
||||
|
||||
def test_arith_mixed(self):
|
||||
|
||||
left = pd.DataFrame({'A': ['a', 'b', 'c'],
|
||||
'B': [1, 2, 3]})
|
||||
|
||||
result = left + left
|
||||
expected = pd.DataFrame({'A': ['aa', 'bb', 'cc'],
|
||||
'B': [2, 4, 6]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_arith_getitem_commute(self):
|
||||
df = pd.DataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]})
|
||||
|
||||
def _test_op(df, op):
|
||||
result = op(df, 1)
|
||||
|
||||
if not df.columns.is_unique:
|
||||
raise ValueError("Only unique columns supported by this test")
|
||||
|
||||
for col in result.columns:
|
||||
tm.assert_series_equal(result[col], op(df[col], 1))
|
||||
|
||||
_test_op(df, operator.add)
|
||||
_test_op(df, operator.sub)
|
||||
_test_op(df, operator.mul)
|
||||
_test_op(df, operator.truediv)
|
||||
_test_op(df, operator.floordiv)
|
||||
_test_op(df, operator.pow)
|
||||
|
||||
_test_op(df, lambda x, y: y + x)
|
||||
_test_op(df, lambda x, y: y - x)
|
||||
_test_op(df, lambda x, y: y * x)
|
||||
_test_op(df, lambda x, y: y / x)
|
||||
_test_op(df, lambda x, y: y ** x)
|
||||
|
||||
_test_op(df, lambda x, y: x + y)
|
||||
_test_op(df, lambda x, y: x - y)
|
||||
_test_op(df, lambda x, y: x * y)
|
||||
_test_op(df, lambda x, y: x / y)
|
||||
_test_op(df, lambda x, y: x ** y)
|
||||
|
||||
@pytest.mark.parametrize('values', [[1, 2], (1, 2), np.array([1, 2]),
|
||||
range(1, 3), deque([1, 2])])
|
||||
def test_arith_alignment_non_pandas_object(self, values):
|
||||
# GH#17901
|
||||
df = pd.DataFrame({'A': [1, 1], 'B': [1, 1]})
|
||||
expected = pd.DataFrame({'A': [2, 2], 'B': [3, 3]})
|
||||
result = df + values
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_arith_non_pandas_object(self):
|
||||
df = pd.DataFrame(np.arange(1, 10, dtype='f8').reshape(3, 3),
|
||||
columns=['one', 'two', 'three'],
|
||||
index=['a', 'b', 'c'])
|
||||
|
||||
val1 = df.xs('a').values
|
||||
added = pd.DataFrame(df.values + val1,
|
||||
index=df.index, columns=df.columns)
|
||||
tm.assert_frame_equal(df + val1, added)
|
||||
|
||||
added = pd.DataFrame((df.values.T + val1).T,
|
||||
index=df.index, columns=df.columns)
|
||||
tm.assert_frame_equal(df.add(val1, axis=0), added)
|
||||
|
||||
val2 = list(df['two'])
|
||||
|
||||
added = pd.DataFrame(df.values + val2,
|
||||
index=df.index, columns=df.columns)
|
||||
tm.assert_frame_equal(df + val2, added)
|
||||
|
||||
added = pd.DataFrame((df.values.T + val2).T, index=df.index,
|
||||
columns=df.columns)
|
||||
tm.assert_frame_equal(df.add(val2, axis='index'), added)
|
||||
|
||||
val3 = np.random.rand(*df.shape)
|
||||
added = pd.DataFrame(df.values + val3,
|
||||
index=df.index, columns=df.columns)
|
||||
tm.assert_frame_equal(df.add(val3), added)
|
||||
@@ -0,0 +1,126 @@
|
||||
# coding=utf-8
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Series, Timestamp, date_range, to_datetime
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from .common import TestData
|
||||
|
||||
|
||||
class TestFrameAsof(TestData):
|
||||
def setup_method(self, method):
|
||||
self.N = N = 50
|
||||
self.rng = date_range('1/1/1990', periods=N, freq='53s')
|
||||
self.df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
|
||||
index=self.rng)
|
||||
|
||||
def test_basic(self):
|
||||
df = self.df.copy()
|
||||
df.loc[15:30, 'A'] = np.nan
|
||||
dates = date_range('1/1/1990', periods=self.N * 3,
|
||||
freq='25s')
|
||||
|
||||
result = df.asof(dates)
|
||||
assert result.notna().all(1).all()
|
||||
lb = df.index[14]
|
||||
ub = df.index[30]
|
||||
|
||||
dates = list(dates)
|
||||
result = df.asof(dates)
|
||||
assert result.notna().all(1).all()
|
||||
|
||||
mask = (result.index >= lb) & (result.index < ub)
|
||||
rs = result[mask]
|
||||
assert (rs == 14).all(1).all()
|
||||
|
||||
def test_subset(self):
|
||||
N = 10
|
||||
rng = date_range('1/1/1990', periods=N, freq='53s')
|
||||
df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
|
||||
index=rng)
|
||||
df.loc[4:8, 'A'] = np.nan
|
||||
dates = date_range('1/1/1990', periods=N * 3,
|
||||
freq='25s')
|
||||
|
||||
# with a subset of A should be the same
|
||||
result = df.asof(dates, subset='A')
|
||||
expected = df.asof(dates)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# same with A/B
|
||||
result = df.asof(dates, subset=['A', 'B'])
|
||||
expected = df.asof(dates)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# B gives self.df.asof
|
||||
result = df.asof(dates, subset='B')
|
||||
expected = df.resample('25s', closed='right').ffill().reindex(dates)
|
||||
expected.iloc[20:] = 9
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_missing(self):
|
||||
# GH 15118
|
||||
# no match found - `where` value before earliest date in index
|
||||
N = 10
|
||||
rng = date_range('1/1/1990', periods=N, freq='53s')
|
||||
df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
|
||||
index=rng)
|
||||
result = df.asof('1989-12-31')
|
||||
|
||||
expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31'))
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = df.asof(to_datetime(['1989-12-31']))
|
||||
expected = DataFrame(index=to_datetime(['1989-12-31']),
|
||||
columns=['A', 'B'], dtype='float64')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_all_nans(self):
|
||||
# GH 15713
|
||||
# DataFrame is all nans
|
||||
result = DataFrame([np.nan]).asof([0])
|
||||
expected = DataFrame([np.nan])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# testing non-default indexes, multiple inputs
|
||||
dates = date_range('1/1/1990', periods=self.N * 3, freq='25s')
|
||||
result = DataFrame(np.nan, index=self.rng, columns=['A']).asof(dates)
|
||||
expected = DataFrame(np.nan, index=dates, columns=['A'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# testing multiple columns
|
||||
dates = date_range('1/1/1990', periods=self.N * 3, freq='25s')
|
||||
result = DataFrame(np.nan, index=self.rng,
|
||||
columns=['A', 'B', 'C']).asof(dates)
|
||||
expected = DataFrame(np.nan, index=dates, columns=['A', 'B', 'C'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# testing scalar input
|
||||
result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof([3])
|
||||
expected = DataFrame(np.nan, index=[3], columns=['A', 'B'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof(3)
|
||||
expected = Series(np.nan, index=['A', 'B'], name=3)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"stamp,expected",
|
||||
[(Timestamp('2018-01-01 23:22:43.325+00:00'),
|
||||
Series(2.0, name=Timestamp('2018-01-01 23:22:43.325+00:00'))),
|
||||
(Timestamp('2018-01-01 22:33:20.682+01:00'),
|
||||
Series(1.0, name=Timestamp('2018-01-01 22:33:20.682+01:00'))),
|
||||
]
|
||||
)
|
||||
def test_time_zone_aware_index(self, stamp, expected):
|
||||
# GH21194
|
||||
# Testing awareness of DataFrame index considering different
|
||||
# UTC and timezone
|
||||
df = DataFrame(data=[1, 2],
|
||||
index=[Timestamp('2018-01-01 21:00:05.001+00:00'),
|
||||
Timestamp('2018-01-01 22:35:10.550+00:00')])
|
||||
result = df.asof(stamp)
|
||||
tm.assert_series_equal(result, expected)
|
||||
+1159
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,587 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical, DataFrame, Series, Timestamp, compat, date_range,
|
||||
option_context)
|
||||
from pandas.core.arrays import IntervalArray, integer_array
|
||||
from pandas.core.internals.blocks import IntBlock
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (
|
||||
assert_almost_equal, assert_frame_equal, assert_series_equal)
|
||||
|
||||
# Segregated collection of methods that require the BlockManager internal data
|
||||
# structure
|
||||
|
||||
|
||||
class TestDataFrameBlockInternals():
|
||||
def test_setitem_invalidates_datetime_index_freq(self):
|
||||
# GH#24096 altering a datetime64tz column inplace invalidates the
|
||||
# `freq` attribute on the underlying DatetimeIndex
|
||||
|
||||
dti = date_range('20130101', periods=3, tz='US/Eastern')
|
||||
ts = dti[1]
|
||||
|
||||
df = DataFrame({'B': dti})
|
||||
assert df['B']._values.freq == 'D'
|
||||
|
||||
df.iloc[1, 0] = pd.NaT
|
||||
assert df['B']._values.freq is None
|
||||
|
||||
# check that the DatetimeIndex was not altered in place
|
||||
assert dti.freq == 'D'
|
||||
assert dti[1] == ts
|
||||
|
||||
def test_cast_internals(self, float_frame):
|
||||
casted = DataFrame(float_frame._data, dtype=int)
|
||||
expected = DataFrame(float_frame._series, dtype=int)
|
||||
assert_frame_equal(casted, expected)
|
||||
|
||||
casted = DataFrame(float_frame._data, dtype=np.int32)
|
||||
expected = DataFrame(float_frame._series, dtype=np.int32)
|
||||
assert_frame_equal(casted, expected)
|
||||
|
||||
def test_consolidate(self, float_frame):
|
||||
float_frame['E'] = 7.
|
||||
consolidated = float_frame._consolidate()
|
||||
assert len(consolidated._data.blocks) == 1
|
||||
|
||||
# Ensure copy, do I want this?
|
||||
recons = consolidated._consolidate()
|
||||
assert recons is not consolidated
|
||||
tm.assert_frame_equal(recons, consolidated)
|
||||
|
||||
float_frame['F'] = 8.
|
||||
assert len(float_frame._data.blocks) == 3
|
||||
|
||||
float_frame._consolidate(inplace=True)
|
||||
assert len(float_frame._data.blocks) == 1
|
||||
|
||||
def test_consolidate_inplace(self, float_frame):
|
||||
frame = float_frame.copy() # noqa
|
||||
|
||||
# triggers in-place consolidation
|
||||
for letter in range(ord('A'), ord('Z')):
|
||||
float_frame[chr(letter)] = chr(letter)
|
||||
|
||||
def test_values_consolidate(self, float_frame):
|
||||
float_frame['E'] = 7.
|
||||
assert not float_frame._data.is_consolidated()
|
||||
_ = float_frame.values # noqa
|
||||
assert float_frame._data.is_consolidated()
|
||||
|
||||
def test_modify_values(self, float_frame):
|
||||
float_frame.values[5] = 5
|
||||
assert (float_frame.values[5] == 5).all()
|
||||
|
||||
# unconsolidated
|
||||
float_frame['E'] = 7.
|
||||
float_frame.values[6] = 6
|
||||
assert (float_frame.values[6] == 6).all()
|
||||
|
||||
def test_boolean_set_uncons(self, float_frame):
|
||||
float_frame['E'] = 7.
|
||||
|
||||
expected = float_frame.values.copy()
|
||||
expected[expected > 1] = 2
|
||||
|
||||
float_frame[float_frame > 1] = 2
|
||||
assert_almost_equal(expected, float_frame.values)
|
||||
|
||||
def test_values_numeric_cols(self, float_frame):
|
||||
float_frame['foo'] = 'bar'
|
||||
|
||||
values = float_frame[['A', 'B', 'C', 'D']].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
|
||||
|
||||
# mixed lcd
|
||||
values = mixed_float_frame[['A', 'B', 'C', 'D']].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
values = mixed_float_frame[['A', 'B', 'C']].values
|
||||
assert values.dtype == np.float32
|
||||
|
||||
values = mixed_float_frame[['C']].values
|
||||
assert values.dtype == np.float16
|
||||
|
||||
# GH 10364
|
||||
# B uint64 forces float because there are other signed int types
|
||||
values = mixed_int_frame[['A', 'B', 'C', 'D']].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
values = mixed_int_frame[['A', 'D']].values
|
||||
assert values.dtype == np.int64
|
||||
|
||||
# B uint64 forces float because there are other signed int types
|
||||
values = mixed_int_frame[['A', 'B', 'C']].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
# as B and C are both unsigned, no forcing to float is needed
|
||||
values = mixed_int_frame[['B', 'C']].values
|
||||
assert values.dtype == np.uint64
|
||||
|
||||
values = mixed_int_frame[['A', 'C']].values
|
||||
assert values.dtype == np.int32
|
||||
|
||||
values = mixed_int_frame[['C', 'D']].values
|
||||
assert values.dtype == np.int64
|
||||
|
||||
values = mixed_int_frame[['A']].values
|
||||
assert values.dtype == np.int32
|
||||
|
||||
values = mixed_int_frame[['C']].values
|
||||
assert values.dtype == np.uint8
|
||||
|
||||
def test_constructor_with_convert(self):
|
||||
# this is actually mostly a test of lib.maybe_convert_objects
|
||||
# #2845
|
||||
df = DataFrame({'A': [2 ** 63 - 1]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([2 ** 63 - 1], np.int64), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [2 ** 63]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([2 ** 63], np.uint64), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [datetime(2005, 1, 1), True]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([datetime(2005, 1, 1), True], np.object_),
|
||||
name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [None, 1]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([np.nan, 1], np.float_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0, 2]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([1.0, 2], np.float_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0 + 2.0j, 3]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0 + 2.0j, 3.0]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0 + 2.0j, True]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0, None]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([1.0, np.nan], np.float_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [1.0 + 2.0j, None]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray(
|
||||
[1.0 + 2.0j, np.nan], np.complex_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [2.0, 1, True, None]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray(
|
||||
[2.0, 1, True, None], np.object_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': [2.0, 1, datetime(2006, 1, 1), None]})
|
||||
result = df['A']
|
||||
expected = Series(np.asarray([2.0, 1, datetime(2006, 1, 1),
|
||||
None], np.object_), name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_construction_with_mixed(self, float_string_frame):
    # test construction edge cases with mixed types

    # f7u12, this does not work without extensive workaround
    data = [[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
            [datetime(2000, 1, 2), datetime(2000, 1, 3),
             datetime(2000, 1, 1)]]
    df = DataFrame(data)

    # check dtypes: all three columns should infer as datetime64[ns].
    # The original computed result/expected here but never compared
    # them -- actually assert it.
    result = df.get_dtype_counts().sort_values()
    expected = Series({'datetime64[ns]': 3}).sort_values()
    assert_series_equal(result, expected)

    # mixed-type frames
    float_string_frame['datetime'] = datetime.now()
    float_string_frame['timedelta'] = timedelta(days=1, seconds=1)
    assert float_string_frame['datetime'].dtype == 'M8[ns]'
    assert float_string_frame['timedelta'].dtype == 'm8[ns]'
    result = float_string_frame.get_dtype_counts().sort_values()
    expected = Series({'float64': 4,
                       'object': 1,
                       'datetime64[ns]': 1,
                       'timedelta64[ns]': 1}).sort_values()
    assert_series_equal(result, expected)
|
||||
|
||||
def test_construction_with_conversions(self):
    # assigning a non-ns timedelta64 array converts it to [ns]
    arr = np.array([1, 2, 3], dtype='timedelta64[s]')
    df = DataFrame(index=range(3))
    df['A'] = arr
    expected = DataFrame(
        {'A': pd.timedelta_range('00:00:01', periods=3, freq='s')},
        index=range(3))
    assert_frame_equal(df, expected)

    # same for datetime64 scalars and non-ns datetime64 arrays
    expected = DataFrame({
        'dt1': Timestamp('20130101'),
        'dt2': date_range('20130101', periods=3),
        # 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
    }, index=range(3))

    df = DataFrame(index=range(3))
    df['dt1'] = np.datetime64('2013-01-01')
    df['dt2'] = np.array(['2013-01-01', '2013-01-02', '2013-01-03'],
                         dtype='datetime64[D]')

    assert_frame_equal(df, expected)
|
||||
|
||||
def test_constructor_compound_dtypes(self):
    # GH 5191
    # compound dtypes should raise NotImplementedError

    def f(dtype):
        data = list(itertools.repeat((datetime(2001, 1, 1),
                                      "aa", 20), 9))
        return DataFrame(data=data,
                         columns=["A", "B", "C"],
                         dtype=dtype)

    # use the context-manager form of pytest.raises for consistency
    # with the rest of the file (the call form is legacy style)
    with pytest.raises(NotImplementedError):
        f([("A", "datetime64[h]"),
           ("B", "str"),
           ("C", "int32")])

    # these work (though results may be unexpected)
    f('int64')
    f('float64')

    # 10822
    # invalid error message on dt inference
    if not compat.is_platform_windows():
        f('M8[ns]')
|
||||
|
||||
def test_equals_different_blocks(self):
    # GH 9330: equality must not depend on the internal block layout
    df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2],
                        "C": ["w", "z"]})
    df1 = df0.reset_index()[["A", "B", "C"]]

    # the round trip above induces a block rearrangement; verify that
    # before testing equality
    assert (df0._data.blocks[0].dtype != df1._data.blocks[0].dtype)

    # do the real tests: symmetric equality despite different layout
    assert_frame_equal(df0, df1)
    assert df0.equals(df1)
    assert df1.equals(df0)
|
||||
|
||||
def test_copy_blocks(self, float_frame):
    # API/ENH 9607: as_blocks() with the default copy=True returns
    # copies, so mutating a block must not touch the source frame
    df = DataFrame(float_frame, copy=True)
    column = df.columns[0]

    # as_blocks is deprecated since 0.21.0
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        blocks = df.as_blocks()

    # bump the target column in whichever block carries it
    for _dtype, _df in blocks.items():
        if column not in _df:
            continue
        _df.loc[:, column] = _df[column] + 1

    # make sure we did not change the original DataFrame
    assert not _df[column].equals(df[column])
|
||||
|
||||
def test_no_copy_blocks(self, float_frame):
    # API/ENH 9607: as_blocks(copy=False) returns views, so mutating
    # a block writes through to the source frame
    df = DataFrame(float_frame, copy=True)
    column = df.columns[0]

    # as_blocks is deprecated since 0.21.0
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        blocks = df.as_blocks(copy=False)

    # bump the target column in whichever block carries it
    for _dtype, _df in blocks.items():
        if column not in _df:
            continue
        _df.loc[:, column] = _df[column] + 1

    # make sure we DID change the original DataFrame
    assert _df[column].equals(df[column])
|
||||
|
||||
def test_copy(self, float_frame, float_string_frame):
|
||||
cop = float_frame.copy()
|
||||
cop['E'] = cop['A']
|
||||
assert 'E' not in float_frame
|
||||
|
||||
# copy objects
|
||||
copy = float_string_frame.copy()
|
||||
assert copy._data is not float_string_frame._data
|
||||
|
||||
def test_pickle(self, float_string_frame, empty_frame, timezone_frame):
    # mixed-dtype frame survives a pickle round trip
    unpickled = tm.round_trip_pickle(float_string_frame)
    assert_frame_equal(float_string_frame, unpickled)

    # buglet: touching ndim on the internal manager used to blow up
    float_string_frame._data.ndim

    # empty frame: just make sure the round trip is repr-able
    repr(tm.round_trip_pickle(empty_frame))

    # tz-aware frame
    unpickled = tm.round_trip_pickle(timezone_frame)
    assert_frame_equal(timezone_frame, unpickled)
|
||||
|
||||
def test_consolidate_datetime64(self):
|
||||
# numpy vstack bug
|
||||
|
||||
data = """\
|
||||
starting,ending,measure
|
||||
2012-06-21 00:00,2012-06-23 07:00,77
|
||||
2012-06-23 07:00,2012-06-23 16:30,65
|
||||
2012-06-23 16:30,2012-06-25 08:00,77
|
||||
2012-06-25 08:00,2012-06-26 12:00,0
|
||||
2012-06-26 12:00,2012-06-27 08:00,77
|
||||
"""
|
||||
df = pd.read_csv(StringIO(data), parse_dates=[0, 1])
|
||||
|
||||
ser_starting = df.starting
|
||||
ser_starting.index = ser_starting.values
|
||||
ser_starting = ser_starting.tz_localize('US/Eastern')
|
||||
ser_starting = ser_starting.tz_convert('UTC')
|
||||
ser_starting.index.name = 'starting'
|
||||
|
||||
ser_ending = df.ending
|
||||
ser_ending.index = ser_ending.values
|
||||
ser_ending = ser_ending.tz_localize('US/Eastern')
|
||||
ser_ending = ser_ending.tz_convert('UTC')
|
||||
ser_ending.index.name = 'ending'
|
||||
|
||||
df.starting = ser_starting.index
|
||||
df.ending = ser_ending.index
|
||||
|
||||
tm.assert_index_equal(pd.DatetimeIndex(
|
||||
df.starting), ser_starting.index)
|
||||
tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index)
|
||||
|
||||
def test_is_mixed_type(self, float_frame, float_string_frame):
|
||||
assert not float_frame._is_mixed_type
|
||||
assert float_string_frame._is_mixed_type
|
||||
|
||||
def test_get_numeric_data(self):
    # TODO(wesm): unused?
    intname = np.dtype(np.int_).name  # noqa
    floatname = np.dtype(np.float_).name  # noqa

    datetime64name = np.dtype('M8[ns]').name
    objectname = np.dtype(np.object_).name

    idx = np.arange(10)
    df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                    'f': Timestamp('20010102')},
                   index=idx)
    result = df.get_dtype_counts().sort_index()
    expected = Series({'int64': 1, 'float64': 1,
                       datetime64name: 1, objectname: 1}).sort_index()
    assert_series_equal(result, expected)

    df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                    'd': np.array([1.] * 10, dtype='float32'),
                    'e': np.array([1] * 10, dtype='int32'),
                    'f': np.array([1] * 10, dtype='int16'),
                    'g': Timestamp('20010102')},
                   index=idx)

    # everything but the object and datetime columns is numeric
    assert_frame_equal(df._get_numeric_data(),
                       df.loc[:, ['a', 'b', 'd', 'e', 'f']])

    # no numeric columns -> empty selection
    only_obj = df.loc[:, ['c', 'g']]
    assert_frame_equal(only_obj._get_numeric_data(), df.loc[:, []])

    df = DataFrame.from_dict(
        {'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]})
    result = df._get_numeric_data()
    expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
    assert_frame_equal(result, expected)

    # an already-numeric frame selects as itself
    df = result.copy()
    assert_frame_equal(df._get_numeric_data(), df)
|
||||
|
||||
def test_get_numeric_data_extension_dtype(self):
    # GH 22290: nullable-integer columns are numeric; categorical and
    # interval columns are not
    df = DataFrame({
        'A': integer_array([-10, np.nan, 0, 10, 20, 30], dtype='Int64'),
        'B': Categorical(list('abcabc')),
        'C': integer_array([0, 1, 2, 3, np.nan, 5], dtype='UInt8'),
        'D': IntervalArray.from_breaks(range(7))})
    assert_frame_equal(df._get_numeric_data(), df.loc[:, ['A', 'C']])
|
||||
|
||||
def test_convert_objects(self, float_string_frame):
    # a double transpose turns everything into objects; _convert
    # should restore the original dtypes
    oops = float_string_frame.T.T
    converted = oops._convert(datetime=True)
    assert_frame_equal(converted, float_string_frame)
    assert converted['A'].dtype == np.float64

    # force numeric conversion
    float_string_frame['H'] = '1.'
    float_string_frame['I'] = '1'

    # add in some items that will be nan
    length = len(float_string_frame)
    float_string_frame['J'] = '1.'
    float_string_frame['K'] = '1'
    float_string_frame.loc[0:5, ['J', 'K']] = 'garbled'
    converted = float_string_frame._convert(datetime=True, numeric=True)
    for col, exp_dtype in (('H', 'float64'), ('I', 'int64'),
                           ('J', 'float64'), ('K', 'float64')):
        assert converted[col].dtype == exp_dtype
    # the garbled entries become NaN
    assert len(converted['J'].dropna()) == length - 5
    assert len(converted['K'].dropna()) == length - 5

    # via astype
    converted = float_string_frame.copy()
    converted['H'] = converted['H'].astype('float64')
    converted['I'] = converted['I'].astype('int64')
    assert converted['H'].dtype == 'float64'
    assert converted['I'].dtype == 'int64'

    # via astype, but errors: '1.' is not a valid int literal
    converted = float_string_frame.copy()
    with pytest.raises(ValueError, match='invalid literal'):
        converted['H'].astype('int32')

    # mixed in a single column
    df = DataFrame(dict(s=Series([1, 'na', 3, 4])))
    result = df._convert(datetime=True, numeric=True)
    expected = DataFrame(dict(s=Series([1, np.nan, 3, 4])))
    assert_frame_equal(result, expected)
|
||||
|
||||
def test_convert_objects_no_conversion(self):
    # frames that need no conversion come back unchanged
    mixed1 = DataFrame(
        {'a': [1, 2, 3], 'b': [4.0, 5, 6], 'c': ['x', 'y', 'z']})
    assert_frame_equal(mixed1, mixed1._convert(datetime=True))
|
||||
|
||||
def test_infer_objects(self):
|
||||
# GH 11221
|
||||
df = DataFrame({'a': ['a', 1, 2, 3],
|
||||
'b': ['b', 2.0, 3.0, 4.1],
|
||||
'c': ['c', datetime(2016, 1, 1),
|
||||
datetime(2016, 1, 2),
|
||||
datetime(2016, 1, 3)],
|
||||
'd': [1, 2, 3, 'd']},
|
||||
columns=['a', 'b', 'c', 'd'])
|
||||
df = df.iloc[1:].infer_objects()
|
||||
|
||||
assert df['a'].dtype == 'int64'
|
||||
assert df['b'].dtype == 'float64'
|
||||
assert df['c'].dtype == 'M8[ns]'
|
||||
assert df['d'].dtype == 'object'
|
||||
|
||||
expected = DataFrame({'a': [1, 2, 3],
|
||||
'b': [2.0, 3.0, 4.1],
|
||||
'c': [datetime(2016, 1, 1),
|
||||
datetime(2016, 1, 2),
|
||||
datetime(2016, 1, 3)],
|
||||
'd': [2, 3, 'd']},
|
||||
columns=['a', 'b', 'c', 'd'])
|
||||
# reconstruct frame to verify inference is same
|
||||
tm.assert_frame_equal(df.reset_index(drop=True), expected)
|
||||
|
||||
def test_stale_cached_series_bug_473(self):
    # GH 473: a cached column series must see writes made through
    # chained assignment on the parent frame

    # this is chained, but ok
    with option_context('chained_assignment', None):
        Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'),
                      columns=('e', 'f', 'g', 'h'))
        repr(Y)
        Y['e'] = Y['e'].astype('object')
        # np.nan, not np.NaN: the NaN alias was removed in numpy 2.0
        # and the rest of the file already uses np.nan
        Y['g']['c'] = np.nan
        repr(Y)
        result = Y.sum()  # noqa
        exp = Y['g'].sum()  # noqa
        assert pd.isna(Y['g']['c'])
|
||||
|
||||
def test_get_X_columns(self):
|
||||
# numeric and object columns
|
||||
|
||||
df = DataFrame({'a': [1, 2, 3],
|
||||
'b': [True, False, True],
|
||||
'c': ['foo', 'bar', 'baz'],
|
||||
'd': [None, None, None],
|
||||
'e': [3.14, 0.577, 2.773]})
|
||||
|
||||
tm.assert_index_equal(df._get_numeric_data().columns,
|
||||
pd.Index(['a', 'b', 'e']))
|
||||
|
||||
def test_strange_column_corruption_issue(self):
    # (wesm) Unclear how exactly this is related to internal matters
    df = DataFrame(index=[0, 1])
    df[0] = np.nan
    seen = {}

    # lazily create columns 100..199 and fill them row by row via
    # chained assignment
    for i, dt in enumerate(df.index):
        for col in range(100, 200):
            if col not in seen:
                seen[col] = 1
                df[col] = np.nan
            df[col][dt] = i

    myid = 100

    # two identical lookups must agree and find no missing values
    first = len(df.loc[pd.isna(df[myid]), [myid]])
    second = len(df.loc[pd.isna(df[myid]), [myid]])
    assert first == second == 0
|
||||
|
||||
def test_constructor_no_pandas_array(self):
    # https://github.com/pandas-dev/pandas/issues/23995
    # a PandasArray given to the constructor must be unboxed so the
    # frame ends up backed by a plain integer block
    arr = pd.Series([1, 2, 3]).array
    result = pd.DataFrame({"A": arr})
    tm.assert_frame_equal(result, pd.DataFrame({"A": [1, 2, 3]}))
    assert isinstance(result._data.blocks[0], IntBlock)
|
||||
@@ -0,0 +1,863 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import lrange
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, Series, Timestamp, date_range
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestDataFrameConcatCommon(TestData):
|
||||
|
||||
def test_concat_multiple_frames_dtypes(self):
    # GH 2759: concat along columns keeps each input frame's dtype
    A = DataFrame(data=np.ones((10, 2)), columns=[
        'foo', 'bar'], dtype=np.float64)
    B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)

    counts = pd.concat((A, B), axis=1).get_dtype_counts()
    assert_series_equal(counts, Series(dict(float64=2, float32=2)))
|
||||
|
||||
@pytest.mark.parametrize('data', [
    pd.date_range('2000', periods=4),
    pd.date_range('2000', periods=4, tz="US/Central"),
    pd.period_range('2000', periods=4),
    pd.timedelta_range(0, periods=4),
])
def test_combine_datetlike_udf(self, data):
    # https://github.com/pandas-dev/pandas/issues/23079
    df = pd.DataFrame({"A": data})
    other = df.copy()
    df.iloc[1, 0] = None

    # the combiner always takes the right-hand side, so the result
    # must equal `other` regardless of the NaT planted in df
    result = df.combine(other, lambda a, b: b)
    tm.assert_frame_equal(result, other)
|
||||
|
||||
def test_concat_multiple_tzs(self):
    # GH 12467: combining tz-naive with tz-aware (or two different
    # tzs) falls back to object dtype; same-tz concat keeps its dtype
    ts1 = Timestamp('2015-01-01', tz=None)
    ts2 = Timestamp('2015-01-01', tz='UTC')
    ts3 = Timestamp('2015-01-01', tz='EST')

    df1 = DataFrame(dict(time=[ts1]))
    df2 = DataFrame(dict(time=[ts2]))
    df3 = DataFrame(dict(time=[ts3]))

    cases = [(df1, df2, [ts1, ts2], object),
             (df1, df3, [ts1, ts3], object),
             (df2, df3, [ts2, ts3], None)]
    for left, right, values, exp_dtype in cases:
        results = pd.concat([left, right]).reset_index(drop=True)
        expected = DataFrame(dict(time=values), dtype=exp_dtype)
        assert_frame_equal(results, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    't1',
    [
        '2015-01-01',
        pytest.param(pd.NaT, marks=pytest.mark.xfail(
            reason='GH23037 incorrect dtype when concatenating'))])
def test_concat_tz_NaT(self, t1):
    # GH 22796: concatenating tz-aware multi-column frames fills the
    # missing cell with NaT
    ts1 = Timestamp(t1, tz='UTC')
    ts2 = Timestamp('2015-01-01', tz='UTC')
    ts3 = Timestamp('2015-01-01', tz='UTC')

    result = pd.concat([DataFrame([[ts1, ts2]]), DataFrame([[ts3]])])
    expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
    assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_tz_not_aligned(self):
|
||||
# GH 22796
|
||||
ts = pd.to_datetime([1, 2]).tz_localize("UTC")
|
||||
a = pd.DataFrame({"A": ts})
|
||||
b = pd.DataFrame({"A": ts, "B": ts})
|
||||
result = pd.concat([a, b], sort=True, ignore_index=True)
|
||||
expected = pd.DataFrame({"A": list(ts) + list(ts),
|
||||
"B": [pd.NaT, pd.NaT] + list(ts)})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_tuple_keys(self):
|
||||
# GH 14438
|
||||
df1 = pd.DataFrame(np.ones((2, 2)), columns=list('AB'))
|
||||
df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list('AB'))
|
||||
results = pd.concat((df1, df2), keys=[('bee', 'bah'), ('bee', 'boo')])
|
||||
expected = pd.DataFrame(
|
||||
{'A': {('bee', 'bah', 0): 1.0,
|
||||
('bee', 'bah', 1): 1.0,
|
||||
('bee', 'boo', 0): 2.0,
|
||||
('bee', 'boo', 1): 2.0,
|
||||
('bee', 'boo', 2): 2.0},
|
||||
'B': {('bee', 'bah', 0): 1.0,
|
||||
('bee', 'bah', 1): 1.0,
|
||||
('bee', 'boo', 0): 2.0,
|
||||
('bee', 'boo', 1): 2.0,
|
||||
('bee', 'boo', 2): 2.0}})
|
||||
assert_frame_equal(results, expected)
|
||||
|
||||
def test_append_series_dict(self):
    df = DataFrame(np.random.randn(5, 4),
                   columns=['foo', 'bar', 'baz', 'qux'])

    # appending a named row whose name already exists in the index
    series = df.loc[4]
    with pytest.raises(ValueError,
                       match='Indexes have overlapping values'):
        df.append(series, verify_integrity=True)

    # an unnamed Series needs ignore_index=True
    series.name = None
    with pytest.raises(TypeError,
                       match='Can only append a Series if ignore_index=True'):
        df.append(series, verify_integrity=True)

    result = df.append(series[::-1], ignore_index=True)
    expected = df.append(DataFrame({0: series[::-1]}, index=df.columns).T,
                         ignore_index=True)
    assert_frame_equal(result, expected)

    # a dict behaves the same as an unnamed Series
    result = df.append(series.to_dict(), ignore_index=True)
    assert_frame_equal(result, expected)

    # a partial row adds NaN for the missing columns
    result = df.append(series[::-1][:3], ignore_index=True)
    expected = df.append(DataFrame({0: series[::-1][:3]}).T,
                         ignore_index=True, sort=True)
    assert_frame_equal(result, expected.loc[:, result.columns])

    # can append when name set
    row = df.loc[4]
    row.name = 5
    assert_frame_equal(df.append(row),
                       df.append(df[-1:], ignore_index=True))
|
||||
|
||||
def test_append_list_of_series_dicts(self):
    df = DataFrame(np.random.randn(5, 4),
                   columns=['foo', 'bar', 'baz', 'qux'])

    # a list of row dicts appends like the frame they came from
    dicts = [row.to_dict() for _, row in df.iterrows()]
    assert_frame_equal(df.append(dicts, ignore_index=True),
                       df.append(df, ignore_index=True))

    # dicts with a different column set
    dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
             {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
    assert_frame_equal(df.append(dicts, ignore_index=True, sort=True),
                       df.append(DataFrame(dicts), ignore_index=True,
                                 sort=True))
|
||||
|
||||
def test_append_empty_dataframe(self):
    # appending an empty frame is always a no-op, whatever the shape
    # of the left-hand side
    cases = [
        # empty onto empty
        (DataFrame([]), DataFrame([])),
        # non-empty onto empty
        (DataFrame(np.random.randn(5, 2)), DataFrame()),
        # empty-with-columns onto empty
        (DataFrame(columns=['bar', 'foo']), DataFrame()),
        # non-empty-with-columns onto empty
        (DataFrame(np.random.randn(5, 2), columns=['bar', 'foo']),
         DataFrame()),
    ]
    for df1, df2 in cases:
        assert_frame_equal(df1.append(df2), df1.copy())
|
||||
|
||||
def test_append_dtypes(self):
    # GH 5754
    # row appends of different dtypes (so need to do by-item)
    # can sometimes infer the correct type

    # appending an empty frame keeps the datetime dtype
    df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(5))
    assert_frame_equal(df1.append(DataFrame()), df1.copy())

    base = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))

    # datetime + string falls back to object
    df2 = DataFrame({'bar': 'foo'}, index=lrange(1, 2))
    assert_frame_equal(base.append(df2),
                       DataFrame({'bar': [Timestamp('20130101'), 'foo']}))

    # datetime + nan keeps M8[ns], whether the nan frame is float or
    # object dtyped
    for df2 in (DataFrame({'bar': np.nan}, index=lrange(1, 2)),
                DataFrame({'bar': np.nan}, index=lrange(1, 2),
                          dtype=object)):
        assert_frame_equal(
            base.append(df2),
            DataFrame({'bar': Series([Timestamp('20130101'), np.nan],
                                     dtype='M8[ns]')}))

    # nan + datetime also infers M8[ns]
    df1 = DataFrame({'bar': np.nan}, index=lrange(1))
    df2 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1, 2))
    assert_frame_equal(
        df1.append(df2),
        DataFrame({'bar': Series([np.nan, Timestamp('20130101')],
                                 dtype='M8[ns]')}))

    # datetime + object int stays mixed
    df2 = DataFrame({'bar': 1}, index=lrange(1, 2), dtype=object)
    assert_frame_equal(base.append(df2),
                       DataFrame({'bar': Series([Timestamp('20130101'),
                                                 1])}))
|
||||
|
||||
def test_update(self):
|
||||
df = DataFrame([[1.5, np.nan, 3.],
|
||||
[1.5, np.nan, 3.],
|
||||
[1.5, np.nan, 3],
|
||||
[1.5, np.nan, 3]])
|
||||
|
||||
other = DataFrame([[3.6, 2., np.nan],
|
||||
[np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other)
|
||||
|
||||
expected = DataFrame([[1.5, np.nan, 3],
|
||||
[3.6, 2, 3],
|
||||
[1.5, np.nan, 3],
|
||||
[1.5, np.nan, 7.]])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_dtypes(self):
|
||||
|
||||
# gh 3016
|
||||
df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
|
||||
columns=['A', 'B', 'bool1', 'bool2'])
|
||||
|
||||
other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
|
||||
df.update(other)
|
||||
|
||||
expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
|
||||
columns=['A', 'B', 'bool1', 'bool2'])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_nooverwrite(self):
|
||||
df = DataFrame([[1.5, np.nan, 3.],
|
||||
[1.5, np.nan, 3.],
|
||||
[1.5, np.nan, 3],
|
||||
[1.5, np.nan, 3]])
|
||||
|
||||
other = DataFrame([[3.6, 2., np.nan],
|
||||
[np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other, overwrite=False)
|
||||
|
||||
expected = DataFrame([[1.5, np.nan, 3],
|
||||
[1.5, 2, 3],
|
||||
[1.5, np.nan, 3],
|
||||
[1.5, np.nan, 3.]])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_filtered(self):
|
||||
df = DataFrame([[1.5, np.nan, 3.],
|
||||
[1.5, np.nan, 3.],
|
||||
[1.5, np.nan, 3],
|
||||
[1.5, np.nan, 3]])
|
||||
|
||||
other = DataFrame([[3.6, 2., np.nan],
|
||||
[np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other, filter_func=lambda x: x > 2)
|
||||
|
||||
expected = DataFrame([[1.5, np.nan, 3],
|
||||
[1.5, np.nan, 3],
|
||||
[1.5, np.nan, 3],
|
||||
[1.5, np.nan, 7.]])
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
@pytest.mark.parametrize('bad_kwarg, exception, msg', [
    # errors must be 'ignore' or 'raise'
    ({'errors': 'something'}, ValueError, 'The parameter errors must.*'),
    ({'join': 'inner'}, NotImplementedError, 'Only left join is supported')
])
def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
    # invalid keyword values must surface as clear, typed errors
    df = DataFrame([[1.5, 1, 3.]])
    with pytest.raises(exception, match=msg):
        df.update(df, **bad_kwarg)
|
||||
|
||||
def test_update_raise_on_overlap(self):
|
||||
df = DataFrame([[1.5, 1, 3.],
|
||||
[1.5, np.nan, 3.],
|
||||
[1.5, np.nan, 3],
|
||||
[1.5, np.nan, 3]])
|
||||
|
||||
other = DataFrame([[2., np.nan],
|
||||
[np.nan, 7]], index=[1, 3], columns=[1, 2])
|
||||
with pytest.raises(ValueError, match="Data overlaps"):
|
||||
df.update(other, errors='raise')
|
||||
|
||||
@pytest.mark.parametrize('raise_conflict', [True, False])
def test_update_deprecation(self, raise_conflict):
    # raise_conflict is deprecated in favour of errors= and must warn
    df = DataFrame([[1.5, 1, 3.]])
    with tm.assert_produces_warning(FutureWarning):
        df.update(DataFrame(), raise_conflict=raise_conflict)
|
||||
|
||||
def test_update_from_non_df(self):
|
||||
d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])}
|
||||
df = DataFrame(d)
|
||||
|
||||
d['a'] = Series([5, 6, 7, 8])
|
||||
df.update(d)
|
||||
|
||||
expected = DataFrame(d)
|
||||
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}
|
||||
df = DataFrame(d)
|
||||
|
||||
d['a'] = [5, 6, 7, 8]
|
||||
df.update(d)
|
||||
|
||||
expected = DataFrame(d)
|
||||
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_join_str_datetime(self):
    # joining on a string column against a frame with datetime column
    # labels must keep all of the right-hand columns
    str_dates = ['20120209', '20120222']
    dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]

    A = DataFrame(str_dates, index=lrange(2), columns=['aa'])
    C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)

    joined = A.join(C, on='aa')
    assert len(joined.columns) == 3
|
||||
|
||||
def test_join_multiindex_leftright(self):
|
||||
# GH 10741
|
||||
df1 = (pd.DataFrame([['a', 'x', 0.471780], ['a', 'y', 0.774908],
|
||||
['a', 'z', 0.563634], ['b', 'x', -0.353756],
|
||||
['b', 'y', 0.368062], ['b', 'z', -1.721840],
|
||||
['c', 'x', 1], ['c', 'y', 2], ['c', 'z', 3]],
|
||||
columns=['first', 'second', 'value1'])
|
||||
.set_index(['first', 'second']))
|
||||
|
||||
df2 = (pd.DataFrame([['a', 10], ['b', 20]],
|
||||
columns=['first', 'value2'])
|
||||
.set_index(['first']))
|
||||
|
||||
exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
|
||||
[-0.353756, 20], [0.368062, 20],
|
||||
[-1.721840, 20],
|
||||
[1.000000, np.nan], [2.000000, np.nan],
|
||||
[3.000000, np.nan]],
|
||||
index=df1.index, columns=['value1', 'value2'])
|
||||
|
||||
# these must be the same results (but columns are flipped)
|
||||
assert_frame_equal(df1.join(df2, how='left'), exp)
|
||||
assert_frame_equal(df2.join(df1, how='right'),
|
||||
exp[['value2', 'value1']])
|
||||
|
||||
exp_idx = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y', 'z']],
|
||||
names=['first', 'second'])
|
||||
exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
|
||||
[-0.353756, 20], [0.368062, 20], [-1.721840, 20]],
|
||||
index=exp_idx, columns=['value1', 'value2'])
|
||||
|
||||
assert_frame_equal(df1.join(df2, how='right'), exp)
|
||||
assert_frame_equal(df2.join(df1, how='left'),
|
||||
exp[['value2', 'value1']])
|
||||
|
||||
def test_concat_named_keys(self):
|
||||
# GH 14252
|
||||
df = pd.DataFrame({'foo': [1, 2], 'bar': [0.1, 0.2]})
|
||||
index = Index(['a', 'b'], name='baz')
|
||||
concatted_named_from_keys = pd.concat([df, df], keys=index)
|
||||
expected_named = pd.DataFrame(
|
||||
{'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
|
||||
index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
|
||||
names=['baz', None]))
|
||||
assert_frame_equal(concatted_named_from_keys, expected_named)
|
||||
|
||||
index_no_name = Index(['a', 'b'], name=None)
|
||||
concatted_named_from_names = pd.concat(
|
||||
[df, df], keys=index_no_name, names=['baz'])
|
||||
assert_frame_equal(concatted_named_from_names, expected_named)
|
||||
|
||||
concatted_unnamed = pd.concat([df, df], keys=index_no_name)
|
||||
expected_unnamed = pd.DataFrame(
|
||||
{'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
|
||||
index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
|
||||
names=[None, None]))
|
||||
assert_frame_equal(concatted_unnamed, expected_unnamed)
|
||||
|
||||
    def test_concat_axis_parameter(self):
        # GH 14369
        # ``axis`` may be given as a string alias ('index' / 'rows' /
        # 'columns') and must behave exactly like the numeric 0/1 forms,
        # for both DataFrame and Series inputs.
        df1 = pd.DataFrame({'A': [0.1, 0.2]}, index=range(2))
        df2 = pd.DataFrame({'A': [0.3, 0.4]}, index=range(2))

        # Index/row/0 DataFrame
        expected_index = pd.DataFrame(
            {'A': [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])

        concatted_index = pd.concat([df1, df2], axis='index')
        assert_frame_equal(concatted_index, expected_index)

        concatted_row = pd.concat([df1, df2], axis='rows')
        assert_frame_equal(concatted_row, expected_index)

        concatted_0 = pd.concat([df1, df2], axis=0)
        assert_frame_equal(concatted_0, expected_index)

        # Columns/1 DataFrame
        expected_columns = pd.DataFrame(
            [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=['A', 'A'])

        concatted_columns = pd.concat([df1, df2], axis='columns')
        assert_frame_equal(concatted_columns, expected_columns)

        concatted_1 = pd.concat([df1, df2], axis=1)
        assert_frame_equal(concatted_1, expected_columns)

        series1 = pd.Series([0.1, 0.2])
        series2 = pd.Series([0.3, 0.4])

        # Index/row/0 Series
        expected_index_series = pd.Series(
            [0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])

        concatted_index_series = pd.concat([series1, series2], axis='index')
        assert_series_equal(concatted_index_series, expected_index_series)

        concatted_row_series = pd.concat([series1, series2], axis='rows')
        assert_series_equal(concatted_row_series, expected_index_series)

        concatted_0_series = pd.concat([series1, series2], axis=0)
        assert_series_equal(concatted_0_series, expected_index_series)

        # Columns/1 Series: concatenating Series column-wise produces a
        # DataFrame with default integer column labels
        expected_columns_series = pd.DataFrame(
            [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1])

        concatted_columns_series = pd.concat(
            [series1, series2], axis='columns')
        assert_frame_equal(concatted_columns_series, expected_columns_series)

        concatted_1_series = pd.concat([series1, series2], axis=1)
        assert_frame_equal(concatted_1_series, expected_columns_series)

        # an unrecognized string alias must raise ValueError
        with pytest.raises(ValueError, match='No axis named'):
            pd.concat([series1, series2], axis='something')
def test_concat_numerical_names(self):
|
||||
# #15262 # #12223
|
||||
df = pd.DataFrame({'col': range(9)},
|
||||
dtype='int32',
|
||||
index=(pd.MultiIndex
|
||||
.from_product([['A0', 'A1', 'A2'],
|
||||
['B0', 'B1', 'B2']],
|
||||
names=[1, 2])))
|
||||
result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
|
||||
expected = pd.DataFrame({'col': [0, 1, 7, 8]},
|
||||
dtype='int32',
|
||||
index=pd.MultiIndex.from_tuples([('A0', 'B0'),
|
||||
('A0', 'B1'),
|
||||
('A2', 'B1'),
|
||||
('A2', 'B2')],
|
||||
names=[1, 2]))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameCombineFirst(TestData):
|
||||
|
||||
def test_combine_first_mixed(self):
|
||||
a = Series(['a', 'b'], index=lrange(2))
|
||||
b = Series(lrange(2), index=lrange(2))
|
||||
f = DataFrame({'A': a, 'B': b})
|
||||
|
||||
a = Series(['a', 'b'], index=lrange(5, 7))
|
||||
b = Series(lrange(2), index=lrange(5, 7))
|
||||
g = DataFrame({'A': a, 'B': b})
|
||||
|
||||
exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]},
|
||||
index=[0, 1, 5, 6])
|
||||
combined = f.combine_first(g)
|
||||
tm.assert_frame_equal(combined, exp)
|
||||
|
||||
def test_combine_first(self):
|
||||
# disjoint
|
||||
head, tail = self.frame[:5], self.frame[5:]
|
||||
|
||||
combined = head.combine_first(tail)
|
||||
reordered_frame = self.frame.reindex(combined.index)
|
||||
assert_frame_equal(combined, reordered_frame)
|
||||
assert tm.equalContents(combined.columns, self.frame.columns)
|
||||
assert_series_equal(combined['A'], reordered_frame['A'])
|
||||
|
||||
# same index
|
||||
fcopy = self.frame.copy()
|
||||
fcopy['A'] = 1
|
||||
del fcopy['C']
|
||||
|
||||
fcopy2 = self.frame.copy()
|
||||
fcopy2['B'] = 0
|
||||
del fcopy2['D']
|
||||
|
||||
combined = fcopy.combine_first(fcopy2)
|
||||
|
||||
assert (combined['A'] == 1).all()
|
||||
assert_series_equal(combined['B'], fcopy['B'])
|
||||
assert_series_equal(combined['C'], fcopy2['C'])
|
||||
assert_series_equal(combined['D'], fcopy['D'])
|
||||
|
||||
# overlap
|
||||
head, tail = reordered_frame[:10].copy(), reordered_frame
|
||||
head['A'] = 1
|
||||
|
||||
combined = head.combine_first(tail)
|
||||
assert (combined['A'][:10] == 1).all()
|
||||
|
||||
# reverse overlap
|
||||
tail['A'][:10] = 0
|
||||
combined = tail.combine_first(head)
|
||||
assert (combined['A'][:10] == 0).all()
|
||||
|
||||
# no overlap
|
||||
f = self.frame[:10]
|
||||
g = self.frame[10:]
|
||||
combined = f.combine_first(g)
|
||||
assert_series_equal(combined['A'].reindex(f.index), f['A'])
|
||||
assert_series_equal(combined['A'].reindex(g.index), g['A'])
|
||||
|
||||
# corner cases
|
||||
comb = self.frame.combine_first(self.empty)
|
||||
assert_frame_equal(comb, self.frame)
|
||||
|
||||
comb = self.empty.combine_first(self.frame)
|
||||
assert_frame_equal(comb, self.frame)
|
||||
|
||||
comb = self.frame.combine_first(DataFrame(index=["faz", "boo"]))
|
||||
assert "faz" in comb.index
|
||||
|
||||
# #2525
|
||||
df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)])
|
||||
df2 = DataFrame({}, columns=['b'])
|
||||
result = df.combine_first(df2)
|
||||
assert 'b' in result
|
||||
|
||||
def test_combine_first_mixed_bug(self):
|
||||
idx = Index(['a', 'b', 'c', 'e'])
|
||||
ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
|
||||
ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
|
||||
ser3 = Series([12, 4, 5, 97], index=idx)
|
||||
|
||||
frame1 = DataFrame({"col0": ser1,
|
||||
"col2": ser2,
|
||||
"col3": ser3})
|
||||
|
||||
idx = Index(['a', 'b', 'c', 'f'])
|
||||
ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
|
||||
ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
|
||||
ser3 = Series([12, 4, 5, 97], index=idx)
|
||||
|
||||
frame2 = DataFrame({"col1": ser1,
|
||||
"col2": ser2,
|
||||
"col5": ser3})
|
||||
|
||||
combined = frame1.combine_first(frame2)
|
||||
assert len(combined.columns) == 5
|
||||
|
||||
# gh 3016 (same as in update)
|
||||
df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
|
||||
columns=['A', 'B', 'bool1', 'bool2'])
|
||||
|
||||
other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
|
||||
result = df.combine_first(other)
|
||||
assert_frame_equal(result, df)
|
||||
|
||||
df.loc[0, 'A'] = np.nan
|
||||
result = df.combine_first(other)
|
||||
df.loc[0, 'A'] = 45
|
||||
assert_frame_equal(result, df)
|
||||
|
||||
# doc example
|
||||
df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
|
||||
'B': [np.nan, 2., 3., np.nan, 6.]})
|
||||
|
||||
df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
|
||||
'B': [np.nan, np.nan, 3., 4., 6., 8.]})
|
||||
|
||||
result = df1.combine_first(df2)
|
||||
expected = DataFrame(
|
||||
{'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# GH3552, return object dtype with bools
|
||||
df1 = DataFrame(
|
||||
[[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
|
||||
df2 = DataFrame(
|
||||
[[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2])
|
||||
|
||||
result = df1.combine_first(df2)[2]
|
||||
expected = Series([True, True, False], name=2)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# GH 3593, converting datetime64[ns] incorrecly
|
||||
df0 = DataFrame({"a": [datetime(2000, 1, 1),
|
||||
datetime(2000, 1, 2),
|
||||
datetime(2000, 1, 3)]})
|
||||
df1 = DataFrame({"a": [None, None, None]})
|
||||
df2 = df1.combine_first(df0)
|
||||
assert_frame_equal(df2, df0)
|
||||
|
||||
df2 = df0.combine_first(df1)
|
||||
assert_frame_equal(df2, df0)
|
||||
|
||||
df0 = DataFrame({"a": [datetime(2000, 1, 1),
|
||||
datetime(2000, 1, 2),
|
||||
datetime(2000, 1, 3)]})
|
||||
df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
|
||||
df2 = df1.combine_first(df0)
|
||||
result = df0.copy()
|
||||
result.iloc[0, :] = df1.iloc[0, :]
|
||||
assert_frame_equal(df2, result)
|
||||
|
||||
df2 = df0.combine_first(df1)
|
||||
assert_frame_equal(df2, df0)
|
||||
|
||||
    def test_combine_first_align_nan(self):
        # GH 7509 (not fixed)
        # aligning a datetime64/int64 frame against a frame that lacks
        # the 'a' column introduces missing values; the asserts below pin
        # the current (upcast-heavy) behavior, with ToDo markers where the
        # dtype should ideally be preserved
        dfa = pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]],
                           columns=['a', 'b'])
        dfb = pd.DataFrame([[4], [5]], columns=['b'])
        assert dfa['a'].dtype == 'datetime64[ns]'
        assert dfa['b'].dtype == 'int64'

        res = dfa.combine_first(dfb)
        exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT],
                            'b': [2., 5.]}, columns=['a', 'b'])
        tm.assert_frame_equal(res, exp)
        assert res['a'].dtype == 'datetime64[ns]'
        # ToDo: this must be int64
        assert res['b'].dtype == 'float64'

        # empty left side: every value comes from dfb
        res = dfa.iloc[:0].combine_first(dfb)
        exp = pd.DataFrame({'a': [np.nan, np.nan],
                            'b': [4, 5]}, columns=['a', 'b'])
        tm.assert_frame_equal(res, exp)
        # ToDo: this must be datetime64
        assert res['a'].dtype == 'float64'
        # ToDo: this must be int64
        assert res['b'].dtype == 'int64'
||||
def test_combine_first_timezone(self):
|
||||
# see gh-7630
|
||||
data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC')
|
||||
df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'],
|
||||
data=data1,
|
||||
index=pd.date_range('20140627', periods=1))
|
||||
data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC')
|
||||
df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'],
|
||||
data=data2,
|
||||
index=pd.date_range('20140628', periods=1))
|
||||
res = df2[['UTCdatetime']].combine_first(df1)
|
||||
exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01',
|
||||
tz='UTC'),
|
||||
pd.Timestamp('2012-12-12 12:12',
|
||||
tz='UTC')],
|
||||
'abc': [pd.Timestamp('2010-01-01 01:01:00',
|
||||
tz='UTC'), pd.NaT]},
|
||||
columns=['UTCdatetime', 'abc'],
|
||||
index=pd.date_range('20140627', periods=2,
|
||||
freq='D'))
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]'
|
||||
assert res['abc'].dtype == 'datetime64[ns, UTC]'
|
||||
|
||||
# see gh-10567
|
||||
dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC')
|
||||
df1 = pd.DataFrame({'DATE': dts1})
|
||||
dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC')
|
||||
df2 = pd.DataFrame({'DATE': dts2})
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(res, df1)
|
||||
assert res['DATE'].dtype == 'datetime64[ns, UTC]'
|
||||
|
||||
dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03',
|
||||
'2011-01-04'], tz='US/Eastern')
|
||||
df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7])
|
||||
dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02',
|
||||
'2012-01-03'], tz='US/Eastern')
|
||||
df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT',
|
||||
'2012-01-02', '2011-01-03', '2011-01-04'],
|
||||
tz='US/Eastern')
|
||||
exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# different tz
|
||||
dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern')
|
||||
df1 = pd.DataFrame({'DATE': dts1})
|
||||
dts2 = pd.date_range('2015-01-03', '2015-01-05')
|
||||
df2 = pd.DataFrame({'DATE': dts2})
|
||||
|
||||
# if df1 doesn't have NaN, keep its dtype
|
||||
res = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(res, df1)
|
||||
assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]'
|
||||
|
||||
dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern')
|
||||
df1 = pd.DataFrame({'DATE': dts1})
|
||||
dts2 = pd.date_range('2015-01-01', '2015-01-03')
|
||||
df2 = pd.DataFrame({'DATE': dts2})
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'),
|
||||
pd.Timestamp('2015-01-02', tz='US/Eastern'),
|
||||
pd.Timestamp('2015-01-03')]
|
||||
exp = pd.DataFrame({'DATE': exp_dts})
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res['DATE'].dtype == 'object'
|
||||
|
||||
def test_combine_first_timedelta(self):
|
||||
data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day'])
|
||||
df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7])
|
||||
data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day'])
|
||||
df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT',
|
||||
'11 day', '3 day', '4 day'])
|
||||
exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res['TD'].dtype == 'timedelta64[ns]'
|
||||
|
||||
    def test_combine_first_period(self):
        # Period columns sharing a freq keep their Period dtype through
        # combine_first
        data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03',
                                '2011-04'], freq='M')
        df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7])
        data2 = pd.PeriodIndex(['2012-01-01', '2012-02',
                                '2012-03'], freq='M')
        df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT',
                                  '2012-02', '2011-03', '2011-04'],
                                 freq='M')
        exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)
        assert res['P'].dtype == data1.dtype

        # different freq: mixed Period freqs fall back to object dtype
        dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02',
                               '2012-01-03'], freq='D')
        df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_dts = [pd.Period('2011-01', freq='M'),
                   pd.Period('2012-01-01', freq='D'),
                   pd.NaT,
                   pd.Period('2012-01-02', freq='D'),
                   pd.Period('2011-03', freq='M'),
                   pd.Period('2011-04', freq='M')]
        exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)
        assert res['P'].dtype == 'object'
||||
    def test_combine_first_int(self):
        # GH14687 - integer series that do not align exactly

        df1 = pd.DataFrame({'a': [0, 1, 3, 5]}, dtype='int64')
        df2 = pd.DataFrame({'a': [1, 4]}, dtype='int64')

        # df1 has no missing values, so the result equals df1 and the
        # int64 dtype must survive the alignment with df2
        res = df1.combine_first(df2)
        tm.assert_frame_equal(res, df1)
        assert res['a'].dtype == 'int64'
||||
@pytest.mark.parametrize("val", [1, 1.0])
|
||||
def test_combine_first_with_asymmetric_other(self, val):
|
||||
# see gh-20699
|
||||
df1 = pd.DataFrame({'isNum': [val]})
|
||||
df2 = pd.DataFrame({'isBool': [True]})
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp = pd.DataFrame({'isBool': [True], 'isNum': [val]})
|
||||
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
    def test_concat_datetime_datetime64_frame(self):
        # #2624
        # concatenating an object-dtype frame (datetime mixed with str)
        # with a datetime64[ns] frame used to blow up; smoke-test only
        rows = []
        rows.append([datetime(2010, 1, 1), 1])
        rows.append([datetime(2010, 1, 2), 'hi'])

        df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])

        ind = date_range(start="2000/1/1", freq="D", periods=10)
        df1 = DataFrame({'date': ind, 'test': lrange(10)})

        # it works!
        pd.concat([df1, df2_obj])
|
||||
class TestDataFrameUpdate(TestData):
|
||||
|
||||
    def test_update_nan(self):
        # #15593 #15617
        # test 1: with overwrite=False, NaN in ``other`` must never
        # overwrite existing values in ``self``
        df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)})
        df2 = DataFrame({'A': [None, 2, 3]})
        expected = df1.copy()
        df1.update(df2, overwrite=False)

        tm.assert_frame_equal(df1, expected)

        # test 2: NaN holes in ``self`` are still filled from ``other``
        df1 = DataFrame({'A': [1.0, None, 3],
                         'B': date_range('2000', periods=3)})
        df2 = DataFrame({'A': [None, 2, 3]})
        expected = DataFrame({'A': [1.0, 2, 3],
                              'B': date_range('2000', periods=3)})
        df1.update(df2, overwrite=False)

        tm.assert_frame_equal(df1, expected)
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,490 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import collections
|
||||
from collections import OrderedDict, defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
from pandas.compat import long
|
||||
|
||||
from pandas import DataFrame, MultiIndex, Series, Timestamp, compat, date_range
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
class TestDataFrameConvertTo(TestData):
|
||||
|
||||
def test_to_dict_timestamp(self):
|
||||
|
||||
# GH11247
|
||||
# split/records producing np.datetime64 rather than Timestamps
|
||||
# on datetime64[ns] dtypes only
|
||||
|
||||
tsmp = Timestamp('20130101')
|
||||
test_data = DataFrame({'A': [tsmp, tsmp], 'B': [tsmp, tsmp]})
|
||||
test_data_mixed = DataFrame({'A': [tsmp, tsmp], 'B': [1, 2]})
|
||||
|
||||
expected_records = [{'A': tsmp, 'B': tsmp},
|
||||
{'A': tsmp, 'B': tsmp}]
|
||||
expected_records_mixed = [{'A': tsmp, 'B': 1},
|
||||
{'A': tsmp, 'B': 2}]
|
||||
|
||||
assert (test_data.to_dict(orient='records') ==
|
||||
expected_records)
|
||||
assert (test_data_mixed.to_dict(orient='records') ==
|
||||
expected_records_mixed)
|
||||
|
||||
expected_series = {
|
||||
'A': Series([tsmp, tsmp], name='A'),
|
||||
'B': Series([tsmp, tsmp], name='B'),
|
||||
}
|
||||
expected_series_mixed = {
|
||||
'A': Series([tsmp, tsmp], name='A'),
|
||||
'B': Series([1, 2], name='B'),
|
||||
}
|
||||
|
||||
tm.assert_dict_equal(test_data.to_dict(orient='series'),
|
||||
expected_series)
|
||||
tm.assert_dict_equal(test_data_mixed.to_dict(orient='series'),
|
||||
expected_series_mixed)
|
||||
|
||||
expected_split = {
|
||||
'index': [0, 1],
|
||||
'data': [[tsmp, tsmp],
|
||||
[tsmp, tsmp]],
|
||||
'columns': ['A', 'B']
|
||||
}
|
||||
expected_split_mixed = {
|
||||
'index': [0, 1],
|
||||
'data': [[tsmp, 1],
|
||||
[tsmp, 2]],
|
||||
'columns': ['A', 'B']
|
||||
}
|
||||
|
||||
tm.assert_dict_equal(test_data.to_dict(orient='split'),
|
||||
expected_split)
|
||||
tm.assert_dict_equal(test_data_mixed.to_dict(orient='split'),
|
||||
expected_split_mixed)
|
||||
|
||||
def test_to_dict_index_not_unique_with_index_orient(self):
|
||||
# GH22801
|
||||
# Data loss when indexes are not unique. Raise ValueError.
|
||||
df = DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
|
||||
pytest.raises(ValueError, df.to_dict, orient='index')
|
||||
|
||||
def test_to_dict_invalid_orient(self):
|
||||
df = DataFrame({'A': [0, 1]})
|
||||
pytest.raises(ValueError, df.to_dict, orient='xinvalid')
|
||||
|
||||
    def test_to_records_dt64(self):
        df = DataFrame([["one", "two", "three"],
                        ["four", "five", "six"]],
                       index=date_range("2012-01-01", "2012-01-02"))

        # convert_datetime64 defaults to None: the index round-trips as
        # raw np.datetime64 values
        expected = df.index.values[0]
        result = df.to_records()['index'][0]
        assert expected == result

        # check for FutureWarning if convert_datetime64=False is passed
        with tm.assert_produces_warning(FutureWarning):
            expected = df.index.values[0]
            result = df.to_records(convert_datetime64=False)['index'][0]
            assert expected == result

        # check for FutureWarning if convert_datetime64=True is passed
        with tm.assert_produces_warning(FutureWarning):
            expected = df.index[0]
            result = df.to_records(convert_datetime64=True)['index'][0]
            assert expected == result
||||
def test_to_records_with_multindex(self):
|
||||
# GH3189
|
||||
index = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
|
||||
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
|
||||
data = np.zeros((8, 4))
|
||||
df = DataFrame(data, index=index)
|
||||
r = df.to_records(index=True)['level_0']
|
||||
assert 'bar' in r
|
||||
assert 'one' not in r
|
||||
|
||||
def test_to_records_with_Mapping_type(self):
|
||||
import email
|
||||
from email.parser import Parser
|
||||
|
||||
compat.Mapping.register(email.message.Message)
|
||||
|
||||
headers = Parser().parsestr('From: <user@example.com>\n'
|
||||
'To: <someone_else@example.com>\n'
|
||||
'Subject: Test message\n'
|
||||
'\n'
|
||||
'Body would go here\n')
|
||||
|
||||
frame = DataFrame.from_records([headers])
|
||||
all(x in frame for x in ['Type', 'Subject', 'From'])
|
||||
|
||||
def test_to_records_floats(self):
|
||||
df = DataFrame(np.random.rand(10, 10))
|
||||
df.to_records()
|
||||
|
||||
    def test_to_records_index_name(self):
        # a named index contributes its name as a record field
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = 'X'
        rs = df.to_records()
        assert 'X' in rs.dtype.fields

        # an unnamed index falls back to the default field name 'index'
        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert 'index' in rs.dtype.fields

        # NOTE(review): with a partially-named MultiIndex a 'level_0'
        # fallback field is still emitted — confirm which level it maps to
        df.index = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')])
        df.index.names = ['A', None]
        rs = df.to_records()
        assert 'level_0' in rs.dtype.fields
||||
    def test_to_records_with_unicode_index(self):
        # GH13172
        # unicode_literals conflict with to_records; a unicode index name
        # must still produce object-dtype record fields
        result = DataFrame([{u'a': u'x', u'b': 'y'}]).set_index(u'a') \
            .to_records()
        expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')])
        tm.assert_almost_equal(result, expected)
|
||||
def test_to_records_with_unicode_column_names(self):
|
||||
# xref issue: https://github.com/numpy/numpy/issues/2407
|
||||
# Issue #11879. to_records used to raise an exception when used
|
||||
# with column names containing non-ascii characters in Python 2
|
||||
result = DataFrame(data={u"accented_name_é": [1.0]}).to_records()
|
||||
|
||||
# Note that numpy allows for unicode field names but dtypes need
|
||||
# to be specified using dictionary instead of list of tuples.
|
||||
expected = np.rec.array(
|
||||
[(0, 1.0)],
|
||||
dtype={"names": ["index", u"accented_name_é"],
|
||||
"formats": ['=i8', '=f8']}
|
||||
)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
    def test_to_records_with_categorical(self):

        # GH8626

        # dict creation
        df = DataFrame({'A': list('abc')}, dtype='category')
        expected = Series(list('abc'), dtype='category', name='A')
        tm.assert_series_equal(df['A'], expected)

        # list-like creation
        df = DataFrame(list('abc'), dtype='category')
        expected = Series(list('abc'), dtype='category', name=0)
        tm.assert_series_equal(df[0], expected)

        # to record array
        # this coerces the categorical column to object dtype ('O')
        result = df.to_records()
        expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
                                dtype=[('index', '=i8'), ('0', 'O')])
        tm.assert_almost_equal(result, expected)
|
||||
@pytest.mark.parametrize("kwargs,expected", [
|
||||
# No dtypes --> default to array dtypes.
|
||||
(dict(),
|
||||
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
|
||||
dtype=[("index", "<i8"), ("A", "<i8"),
|
||||
("B", "<f8"), ("C", "O")])),
|
||||
|
||||
# Should have no effect in this case.
|
||||
(dict(index=True),
|
||||
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
|
||||
dtype=[("index", "<i8"), ("A", "<i8"),
|
||||
("B", "<f8"), ("C", "O")])),
|
||||
|
||||
# Column dtype applied across the board. Index unaffected.
|
||||
(dict(column_dtypes="<U4"),
|
||||
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[("index", "<i8"), ("A", "<U4"),
|
||||
("B", "<U4"), ("C", "<U4")])),
|
||||
|
||||
# Index dtype applied across the board. Columns unaffected.
|
||||
(dict(index_dtypes="<U1"),
|
||||
np.rec.array([("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
|
||||
dtype=[("index", "<U1"), ("A", "<i8"),
|
||||
("B", "<f8"), ("C", "O")])),
|
||||
|
||||
# Pass in a type instance.
|
||||
(dict(column_dtypes=np.unicode),
|
||||
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[("index", "<i8"), ("A", "<U"),
|
||||
("B", "<U"), ("C", "<U")])),
|
||||
|
||||
# Pass in a dictionary (name-only).
|
||||
(dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
|
||||
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[("index", "<i8"), ("A", "i1"),
|
||||
("B", "<f4"), ("C", "<U2")])),
|
||||
|
||||
# Pass in a dictionary (indices-only).
|
||||
(dict(index_dtypes={0: "int16"}),
|
||||
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
|
||||
dtype=[("index", "i2"), ("A", "<i8"),
|
||||
("B", "<f8"), ("C", "O")])),
|
||||
|
||||
# Ignore index mappings if index is not True.
|
||||
(dict(index=False, index_dtypes="<U2"),
|
||||
np.rec.array([(1, 0.2, "a"), (2, 1.5, "bc")],
|
||||
dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")])),
|
||||
|
||||
# Non-existent names / indices in mapping should not error.
|
||||
(dict(index_dtypes={0: "int16", "not-there": "float32"}),
|
||||
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
|
||||
dtype=[("index", "i2"), ("A", "<i8"),
|
||||
("B", "<f8"), ("C", "O")])),
|
||||
|
||||
# Names / indices not in mapping default to array dtype.
|
||||
(dict(column_dtypes={"A": np.int8, "B": np.float32}),
|
||||
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[("index", "<i8"), ("A", "i1"),
|
||||
("B", "<f4"), ("C", "O")])),
|
||||
|
||||
# Mixture of everything.
|
||||
(dict(column_dtypes={"A": np.int8, "B": np.float32},
|
||||
index_dtypes="<U2"),
|
||||
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
|
||||
dtype=[("index", "<U2"), ("A", "i1"),
|
||||
("B", "<f4"), ("C", "O")])),
|
||||
|
||||
# Invalid dype values.
|
||||
(dict(index=False, column_dtypes=list()),
|
||||
"Invalid dtype \\[\\] specified for column A"),
|
||||
|
||||
(dict(index=False, column_dtypes={"A": "int32", "B": 5}),
|
||||
"Invalid dtype 5 specified for column B"),
|
||||
])
|
||||
def test_to_records_dtype(self, kwargs, expected):
|
||||
# see gh-18146
|
||||
df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
|
||||
|
||||
if isinstance(expected, str):
|
||||
with pytest.raises(ValueError, match=expected):
|
||||
df.to_records(**kwargs)
|
||||
else:
|
||||
result = df.to_records(**kwargs)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("df,kwargs,expected", [
|
||||
# MultiIndex in the index.
|
||||
(DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=list("abc")).set_index(["a", "b"]),
|
||||
dict(column_dtypes="float64", index_dtypes={0: "int32", 1: "int8"}),
|
||||
np.rec.array([(1, 2, 3.), (4, 5, 6.), (7, 8, 9.)],
|
||||
dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")])),
|
||||
|
||||
# MultiIndex in the columns.
|
||||
(DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
|
||||
("c", "f")])),
|
||||
dict(column_dtypes={0: "<U1", 2: "float32"}, index_dtypes="float32"),
|
||||
np.rec.array([(0., u"1", 2, 3.), (1., u"4", 5, 6.),
|
||||
(2., u"7", 8, 9.)],
|
||||
dtype=[("index", "<f4"),
|
||||
("('a', 'd')", "<U1"),
|
||||
("('b', 'e')", "<i8"),
|
||||
("('c', 'f')", "<f4")])),
|
||||
|
||||
# MultiIndex in both the columns and index.
|
||||
(DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=MultiIndex.from_tuples([
|
||||
("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")),
|
||||
index=MultiIndex.from_tuples([
|
||||
("d", -4), ("d", -5), ("f", -6)], names=list("cd"))),
|
||||
dict(column_dtypes="float64", index_dtypes={0: "<U2", 1: "int8"}),
|
||||
np.rec.array([("d", -4, 1., 2., 3.), ("d", -5, 4., 5., 6.),
|
||||
("f", -6, 7, 8, 9.)],
|
||||
dtype=[("c", "<U2"), ("d", "i1"),
|
||||
("('a', 'd')", "<f8"), ("('b', 'e')", "<f8"),
|
||||
("('c', 'f')", "<f8")]))
|
||||
])
|
||||
def test_to_records_dtype_mi(self, df, kwargs, expected):
|
||||
# see gh-18146
|
||||
result = df.to_records(**kwargs)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
    def test_to_records_dict_like(self):
        # see gh-18146
        # any dict-like object (supporting __getitem__, __contains__ and
        # keys) must be accepted for column_dtypes / index_dtypes
        class DictLike(object):
            def __init__(self, **kwargs):
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d.__getitem__(key)

            def __contains__(self, key):
                return key in self.d

            def keys(self):
                return self.d.keys()

        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        dtype_mappings = dict(column_dtypes=DictLike(**{"A": np.int8,
                                                        "B": np.float32}),
                              index_dtypes="<U2")

        result = df.to_records(**dtype_mappings)
        expected = np.rec.array([("0", "1", "0.2", "a"),
                                 ("1", "2", "1.5", "bc")],
                                dtype=[("index", "<U2"), ("A", "i1"),
                                       ("B", "<f4"), ("C", "O")])
        tm.assert_almost_equal(result, expected)
|
||||
@pytest.mark.parametrize('mapping', [
|
||||
dict,
|
||||
collections.defaultdict(list),
|
||||
collections.OrderedDict])
|
||||
def test_to_dict(self, mapping):
|
||||
test_data = {
|
||||
'A': {'1': 1, '2': 2},
|
||||
'B': {'1': '1', '2': '2', '3': '3'},
|
||||
}
|
||||
|
||||
# GH16122
|
||||
recons_data = DataFrame(test_data).to_dict(into=mapping)
|
||||
|
||||
for k, v in compat.iteritems(test_data):
|
||||
for k2, v2 in compat.iteritems(v):
|
||||
assert (v2 == recons_data[k][k2])
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("l", mapping)
|
||||
|
||||
for k, v in compat.iteritems(test_data):
|
||||
for k2, v2 in compat.iteritems(v):
|
||||
assert (v2 == recons_data[k][int(k2) - 1])
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("s", mapping)
|
||||
|
||||
for k, v in compat.iteritems(test_data):
|
||||
for k2, v2 in compat.iteritems(v):
|
||||
assert (v2 == recons_data[k][k2])
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("sp", mapping)
|
||||
expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
|
||||
'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
|
||||
tm.assert_dict_equal(recons_data, expected_split)
|
||||
|
||||
recons_data = DataFrame(test_data).to_dict("r", mapping)
|
||||
expected_records = [{'A': 1.0, 'B': '1'},
|
||||
{'A': 2.0, 'B': '2'},
|
||||
{'A': np.nan, 'B': '3'}]
|
||||
assert isinstance(recons_data, list)
|
||||
assert (len(recons_data) == 3)
|
||||
for l, r in zip(recons_data, expected_records):
|
||||
tm.assert_dict_equal(l, r)
|
||||
|
||||
# GH10844
|
||||
recons_data = DataFrame(test_data).to_dict("i")
|
||||
|
||||
for k, v in compat.iteritems(test_data):
|
||||
for k2, v2 in compat.iteritems(v):
|
||||
assert (v2 == recons_data[k2][k])
|
||||
|
||||
df = DataFrame(test_data)
|
||||
df['duped'] = df[df.columns[0]]
|
||||
recons_data = df.to_dict("i")
|
||||
comp_data = test_data.copy()
|
||||
comp_data['duped'] = comp_data[df.columns[0]]
|
||||
for k, v in compat.iteritems(comp_data):
|
||||
for k2, v2 in compat.iteritems(v):
|
||||
assert (v2 == recons_data[k2][k])
|
||||
|
||||
    @pytest.mark.parametrize('mapping', [
        list,                      # not a mapping type
        collections.defaultdict,   # bare class — presumably needs an instance
        []])                       # instance that is not a mapping
    def test_to_dict_errors(self, mapping):
        # GH16122
        # invalid ``into`` targets must raise TypeError
        df = DataFrame(np.random.randn(3, 3))
        with pytest.raises(TypeError):
            df.to_dict(into=mapping)
|
||||
def test_to_dict_not_unique_warning(self):
|
||||
# GH16927: When converting to a dict, if a column has a non-unique name
|
||||
# it will be dropped, throwing a warning.
|
||||
df = DataFrame([[1, 2, 3]], columns=['a', 'a', 'b'])
|
||||
with tm.assert_produces_warning(UserWarning):
|
||||
df.to_dict()
|
||||
|
||||
    @pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
    def test_to_records_datetimeindex_with_tz(self, tz):
        # GH13937
        # to_records normalizes tz-aware datetimes, so converting the
        # frame to UTC beforehand must not change the record array
        dr = date_range('2016-01-01', periods=10,
                        freq='S', tz=tz)

        df = DataFrame({'datetime': dr}, index=dr)

        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)
|
||||
    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize('orient,item_getter', [
        ('dict', lambda d, col, idx: d[col][idx]),
        ('records', lambda d, col, idx: d[idx][col]),
        ('list', lambda d, col, idx: d[col][idx]),
        ('split', lambda d, col, idx: d['data'][idx][d['columns'].index(col)]),
        ('index', lambda d, col, idx: d[idx][col])
    ])
    def test_to_dict_box_scalars(self, orient, item_getter):
        # 14216, 23753
        # make sure that we are boxing properly: to_dict must return
        # native Python int/float scalars, not numpy scalar types
        df = DataFrame({'a': [1, 2], 'b': [.1, .2]})
        result = df.to_dict(orient=orient)
        assert isinstance(item_getter(result, 'a', 0), (int, long))
        assert isinstance(item_getter(result, 'b', 0), float)
|
||||
    def test_frame_to_dict_tz(self):
        # GH18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
                (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)]
        df = DataFrame(list(data), columns=["d", ])

        result = df.to_dict(orient='records')
        # values must come back boxed as tz-aware Timestamps, not naive values
        expected = [
            {'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)},
            {'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)},
        ]
        tm.assert_dict_equal(result[0], expected[0])
        tm.assert_dict_equal(result[1], expected[1])
|
||||
|
||||
    @pytest.mark.parametrize('into, expected', [
        (dict, {0: {'int_col': 1, 'float_col': 1.0},
                1: {'int_col': 2, 'float_col': 2.0},
                2: {'int_col': 3, 'float_col': 3.0}}),
        (OrderedDict, OrderedDict([(0, {'int_col': 1, 'float_col': 1.0}),
                                   (1, {'int_col': 2, 'float_col': 2.0}),
                                   (2, {'int_col': 3, 'float_col': 3.0})])),
        (defaultdict(list), defaultdict(list,
                                        {0: {'int_col': 1, 'float_col': 1.0},
                                         1: {'int_col': 2, 'float_col': 2.0},
                                         2: {'int_col': 3, 'float_col': 3.0}}))
    ])
    def test_to_dict_index_dtypes(self, into, expected):
        # GH 18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float

        df = DataFrame({'int_col': [1, 2, 3],
                        'float_col': [1.0, 2.0, 3.0]})

        result = df.to_dict(orient='index', into=into)
        # round-trip both through from_dict so dtypes can be compared
        # frame-to-frame rather than dict-to-dict
        cols = ['int_col', 'float_col']
        result = DataFrame.from_dict(result, orient='index')[cols]
        expected = DataFrame.from_dict(expected, orient='index')[cols]
        tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,989 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import timedelta
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import u
|
||||
|
||||
from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical, DataFrame, Series, Timedelta, Timestamp,
|
||||
_np_version_under1p14, compat, concat, date_range, option_context)
|
||||
from pandas.core.arrays import integer_array
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (
|
||||
assert_frame_equal, assert_series_equal, makeCustomDataframe as mkdf)
|
||||
|
||||
|
||||
@pytest.fixture(params=[str, compat.text_type])
def text_dtype(request):
    # Parametrized text type: bytes-str and unicode on Python 2,
    # just str on Python 3 (where they are the same object).
    return request.param
|
||||
|
||||
|
||||
class TestDataFrameDataTypes(TestData):
|
||||
|
||||
    def test_concat_empty_dataframe_dtypes(self):
        # Concatenating empty frames must preserve column dtypes when they
        # agree, and upcast per-column when they differ.
        df = DataFrame(columns=list("abc"))
        df['a'] = df['a'].astype(np.bool_)
        df['b'] = df['b'].astype(np.int32)
        df['c'] = df['c'].astype(np.float64)

        result = pd.concat([df, df])
        assert result['a'].dtype == np.bool_
        assert result['b'].dtype == np.int32
        assert result['c'].dtype == np.float64

        # bool + float64 -> object; int32 + float64 -> float64
        result = pd.concat([df, df.astype(np.float64)])
        assert result['a'].dtype == np.object_
        assert result['b'].dtype == np.float64
        assert result['c'].dtype == np.float64
|
||||
|
||||
    def test_empty_frame_dtypes_ftypes(self):
        # .dtypes / .ftypes on frames with no rows and/or no columns:
        # empty results are object-dtyped Series keyed by column.
        empty_df = pd.DataFrame()
        assert_series_equal(empty_df.dtypes, pd.Series(dtype=np.object))
        assert_series_equal(empty_df.ftypes, pd.Series(dtype=np.object))

        nocols_df = pd.DataFrame(index=[1, 2, 3])
        assert_series_equal(nocols_df.dtypes, pd.Series(dtype=np.object))
        assert_series_equal(nocols_df.ftypes, pd.Series(dtype=np.object))

        norows_df = pd.DataFrame(columns=list("abc"))
        assert_series_equal(norows_df.dtypes, pd.Series(
            np.object, index=list("abc")))
        assert_series_equal(norows_df.ftypes, pd.Series(
            'object:dense', index=list("abc")))

        # astype must stick even with zero rows
        norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32)
        assert_series_equal(norows_int_df.dtypes, pd.Series(
            np.dtype('int32'), index=list("abc")))
        assert_series_equal(norows_int_df.ftypes, pd.Series(
            'int32:dense', index=list("abc")))

        odict = compat.OrderedDict
        df = pd.DataFrame(odict([('a', 1), ('b', True), ('c', 1.0)]),
                          index=[1, 2, 3])
        ex_dtypes = pd.Series(odict([('a', np.int64),
                                     ('b', np.bool),
                                     ('c', np.float64)]))
        ex_ftypes = pd.Series(odict([('a', 'int64:dense'),
                                     ('b', 'bool:dense'),
                                     ('c', 'float64:dense')]))
        assert_series_equal(df.dtypes, ex_dtypes)
        assert_series_equal(df.ftypes, ex_ftypes)

        # same but for empty slice of df
        assert_series_equal(df[:0].dtypes, ex_dtypes)
        assert_series_equal(df[:0].ftypes, ex_ftypes)
|
||||
|
||||
    def test_datetime_with_tz_dtypes(self):
        # tz-aware datetime columns report DatetimeTZDtype (with their tz)
        # while naive ones report plain datetime64[ns]; NaT insertion must
        # not change the dtype.
        tzframe = DataFrame({'A': date_range('20130101', periods=3),
                             'B': date_range('20130101', periods=3,
                                             tz='US/Eastern'),
                             'C': date_range('20130101', periods=3, tz='CET')})
        tzframe.iloc[1, 1] = pd.NaT
        tzframe.iloc[1, 2] = pd.NaT
        result = tzframe.dtypes.sort_index()
        expected = Series([np.dtype('datetime64[ns]'),
                           DatetimeTZDtype('ns', 'US/Eastern'),
                           DatetimeTZDtype('ns', 'CET')],
                          ['A', 'B', 'C'])

        assert_series_equal(result, expected)
|
||||
|
||||
    def test_dtypes_are_correct_after_column_slice(self):
        # GH6525
        # taking a column slice (iloc) must not corrupt the dtypes of either
        # the slice or the original frame
        df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_)
        odict = compat.OrderedDict
        assert_series_equal(df.dtypes,
                            pd.Series(odict([('a', np.float_),
                                             ('b', np.float_),
                                             ('c', np.float_)])))
        assert_series_equal(df.iloc[:, 2:].dtypes,
                            pd.Series(odict([('c', np.float_)])))
        # original frame unchanged after slicing
        assert_series_equal(df.dtypes,
                            pd.Series(odict([('a', np.float_),
                                             ('b', np.float_),
                                             ('c', np.float_)])))
|
||||
|
||||
    def test_select_dtypes_include_using_list_like(self):
        # select_dtypes with list-like include/exclude values across the
        # full range of dtypes (numeric, bool, categorical, datetime,
        # tz-aware datetime, period, timedelta).
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                                           tz='US/Eastern'),
                        'i': pd.date_range('20130101', periods=3,
                                           tz='CET'),
                        'j': pd.period_range('2013-01', periods=3,
                                             freq='M'),
                        'k': pd.timedelta_range('1 day', periods=3)})

        # np.number includes timedelta here
        ri = df.select_dtypes(include=[np.number])
        ei = df[['b', 'c', 'd', 'k']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number], exclude=['timedelta'])
        ei = df[['b', 'c', 'd']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, 'category'],
                              exclude=['timedelta'])
        ei = df[['b', 'c', 'd', 'f']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetime'])
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetime64'])
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetimetz'])
        ei = df[['h', 'i']]
        assert_frame_equal(ri, ei)

        # period selection is not implemented
        pytest.raises(NotImplementedError,
                      lambda: df.select_dtypes(include=['period']))
|
||||
|
||||
    def test_select_dtypes_exclude_using_list_like(self):
        # excluding np.number leaves only the object and bool columns
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True]})
        re = df.select_dtypes(exclude=[np.number])
        ee = df[['a', 'e']]
        assert_frame_equal(re, ee)
|
||||
|
||||
    def test_select_dtypes_exclude_include_using_list_like(self):
        # include and exclude given together as tuples of dtype-likes
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})
        exclude = np.datetime64,
        include = np.bool_, 'integer'
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[['b', 'c', 'e']]
        assert_frame_equal(r, e)

        # string-name variants of the same selection
        exclude = 'datetime',
        include = 'bool', 'int64', 'int32'
        r = df.select_dtypes(include=include, exclude=exclude)
        # 'c' is uint8, excluded by the explicit int64/int32 include list
        e = df[['b', 'e']]
        assert_frame_equal(r, e)
|
||||
|
||||
    def test_select_dtypes_include_using_scalars(self):
        # scalar (non-list) include values must work like one-element lists
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                                           tz='US/Eastern'),
                        'i': pd.date_range('20130101', periods=3,
                                           tz='CET'),
                        'j': pd.period_range('2013-01', periods=3,
                                             freq='M'),
                        'k': pd.timedelta_range('1 day', periods=3)})

        ri = df.select_dtypes(include=np.number)
        ei = df[['b', 'c', 'd', 'k']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include='datetime')
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include='datetime64')
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include='category')
        ei = df[['f']]
        assert_frame_equal(ri, ei)

        # period selection is not implemented
        pytest.raises(NotImplementedError,
                      lambda: df.select_dtypes(include='period'))
|
||||
|
||||
    def test_select_dtypes_exclude_using_scalars(self):
        # scalar (non-list) exclude values must work like one-element lists
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                                           tz='US/Eastern'),
                        'i': pd.date_range('20130101', periods=3,
                                           tz='CET'),
                        'j': pd.period_range('2013-01', periods=3,
                                             freq='M'),
                        'k': pd.timedelta_range('1 day', periods=3)})

        ri = df.select_dtypes(exclude=np.number)
        ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(exclude='category')
        ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']]
        assert_frame_equal(ri, ei)

        # period is rejected for exclude as well
        pytest.raises(NotImplementedError,
                      lambda: df.select_dtypes(exclude='period'))
|
||||
|
||||
    def test_select_dtypes_include_exclude_using_scalars(self):
        # scalar include combined with scalar exclude
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                                           tz='US/Eastern'),
                        'i': pd.date_range('20130101', periods=3,
                                           tz='CET'),
                        'j': pd.period_range('2013-01', periods=3,
                                             freq='M'),
                        'k': pd.timedelta_range('1 day', periods=3)})

        # number minus floating leaves ints and timedelta
        ri = df.select_dtypes(include=np.number, exclude='floating')
        ei = df[['b', 'c', 'k']]
        assert_frame_equal(ri, ei)
|
||||
|
||||
    def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
        # mixing a scalar on one side with a list on the other
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                                           tz='US/Eastern'),
                        'i': pd.date_range('20130101', periods=3,
                                           tz='CET'),
                        'j': pd.period_range('2013-01', periods=3,
                                             freq='M'),
                        'k': pd.timedelta_range('1 day', periods=3)})

        ri = df.select_dtypes(include=np.number,
                              exclude=['floating', 'timedelta'])
        ei = df[['b', 'c']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, 'category'],
                              exclude='floating')
        ei = df[['b', 'c', 'f', 'k']]
        assert_frame_equal(ri, ei)
|
||||
|
||||
    def test_select_dtypes_duplicate_columns(self):
        # GH20839
        # selection must work even when column labels are duplicated;
        # labels are reassigned below so 'a'/'b' each cover multiple dtypes
        odict = compat.OrderedDict
        df = DataFrame(odict([('a', list('abc')),
                              ('b', list(range(1, 4))),
                              ('c', np.arange(3, 6).astype('u1')),
                              ('d', np.arange(4.0, 7.0, dtype='float64')),
                              ('e', [True, False, True]),
                              ('f', pd.date_range('now', periods=3).values)]))
        df.columns = ['a', 'a', 'b', 'b', 'b', 'c']

        # expect the int64 and uint8 columns (now labelled 'a' and 'b')
        expected = DataFrame({'a': list(range(1, 4)),
                              'b': np.arange(3, 6).astype('u1')})

        result = df.select_dtypes(include=[np.number], exclude=['floating'])
        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
        # dtype strings like 'i8' / 'O' that are not numpy module attributes
        # must still be accepted by select_dtypes
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})
        # 'g' is timedelta64[ns] (diff of datetimes)
        df['g'] = df.f.diff()
        assert not hasattr(np, 'u8')
        r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta'])
        e = df[['a', 'b']]
        assert_frame_equal(r, e)

        r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]'])
        e = df[['a', 'b', 'g']]
        assert_frame_equal(r, e)
|
||||
|
||||
def test_select_dtypes_empty(self):
|
||||
df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
|
||||
msg = 'at least one of include or exclude must be nonempty'
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.select_dtypes()
|
||||
|
||||
    def test_select_dtypes_bad_datetime64(self):
        # datetime64 selections with an explicit unit are too specific
        # and must raise ValueError
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})
        with pytest.raises(ValueError, match='.+ is too specific'):
            df.select_dtypes(include=['datetime64[D]'])

        with pytest.raises(ValueError, match='.+ is too specific'):
            df.select_dtypes(exclude=['datetime64[as]'])
|
||||
|
||||
    def test_select_dtypes_datetime_with_tz(self):
        # tz-aware columns must NOT be picked up by 'datetime64[ns]'
        # (naive) selection — the result is empty

        df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
                             B=Timestamp('20130603', tz='CET')),
                        index=range(5))
        df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
        result = df3.select_dtypes(include=['datetime64[ns]'])
        expected = df3.reindex(columns=[])
        assert_frame_equal(result, expected)
|
||||
|
||||
    @pytest.mark.parametrize("dtype", [
        str, "str", np.string_, "S1", "unicode", np.unicode_, "U1",
        compat.text_type
    ])
    @pytest.mark.parametrize("arg", ["include", "exclude"])
    def test_select_dtypes_str_raises(self, dtype, arg):
        # string dtypes (in any spelling) are rejected for both include
        # and exclude
        df = DataFrame({"a": list("abc"),
                        "g": list(u("abc")),
                        "b": list(range(1, 4)),
                        "c": np.arange(3, 6).astype("u1"),
                        "d": np.arange(4.0, 7.0, dtype="float64"),
                        "e": [True, False, True],
                        "f": pd.date_range("now", periods=3).values})
        msg = "string dtypes are not allowed"
        kwargs = {arg: [dtype]}

        with pytest.raises(TypeError, match=msg):
            df.select_dtypes(**kwargs)
|
||||
|
||||
    def test_select_dtypes_bad_arg_raises(self):
        # an unparseable dtype string raises TypeError
        df = DataFrame({'a': list('abc'),
                        'g': list(u('abc')),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})

        msg = 'data type.*not understood'
        with pytest.raises(TypeError, match=msg):
            df.select_dtypes(['blargy, blarg, blarg'])
|
||||
|
||||
    def test_select_dtypes_typecodes(self):
        # GH 11990
        # numpy typecode characters (np.typecodes['AllFloat']) are valid
        # include values; an all-float frame is returned unchanged
        df = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
        expected = df
        FLOAT_TYPES = list(np.typecodes['AllFloat'])
        assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected)
|
||||
|
||||
    def test_dtypes_gh8722(self):
        # .dtypes must match the per-column Series dtypes
        self.mixed_frame['bool'] = self.mixed_frame['A'] > 0
        result = self.mixed_frame.dtypes
        expected = Series({k: v.dtype
                           for k, v in compat.iteritems(self.mixed_frame)},
                          index=result.index)
        assert_series_equal(result, expected)

        # compat, GH 8722
        # dtypes must still work under use_inf_as_na
        with option_context('use_inf_as_na', True):
            df = DataFrame([[1]])
            result = df.dtypes
            assert_series_equal(result, Series({0: np.dtype('int64')}))
|
||||
|
||||
    def test_ftypes(self):
        # .ftypes reports '<dtype>:dense' strings for the mixed-float fixture
        frame = self.mixed_float
        expected = Series(dict(A='float32:dense',
                               B='float32:dense',
                               C='float16:dense',
                               D='float64:dense')).sort_values()
        result = frame.ftypes.sort_values()
        assert_series_equal(result, expected)
|
||||
|
||||
    def test_astype(self):
        # frame-wide astype must match astyping the raw ndarray values
        casted = self.frame.astype(int)
        expected = DataFrame(self.frame.values.astype(int),
                             index=self.frame.index,
                             columns=self.frame.columns)
        assert_frame_equal(casted, expected)

        casted = self.frame.astype(np.int32)
        expected = DataFrame(self.frame.values.astype(np.int32),
                             index=self.frame.index,
                             columns=self.frame.columns)
        assert_frame_equal(casted, expected)

        # NOTE: mutates the shared fixture frame by adding a string column;
        # numeric-string values must still cast to int
        self.frame['foo'] = '5'
        casted = self.frame.astype(int)
        expected = DataFrame(self.frame.values.astype(int),
                             index=self.frame.index,
                             columns=self.frame.columns)
        assert_frame_equal(casted, expected)

        # mixed casting
        def _check_cast(df, v):
            # every column of df must have ended up with dtype name v
            assert (list({s.dtype.name for
                          _, s in compat.iteritems(df)})[0] == v)

        mn = self.all_mixed._get_numeric_data().copy()
        mn['little_float'] = np.array(12345., dtype='float16')
        mn['big_float'] = np.array(123456789101112., dtype='float64')

        casted = mn.astype('float64')
        _check_cast(casted, 'float64')

        casted = mn.astype('int64')
        _check_cast(casted, 'int64')

        casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float32')
        _check_cast(casted, 'float32')

        casted = mn.reindex(columns=['little_float']).astype('float16')
        _check_cast(casted, 'float16')

        casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float16')
        _check_cast(casted, 'float16')

        casted = mn.astype('float32')
        _check_cast(casted, 'float32')

        casted = mn.astype('int32')
        _check_cast(casted, 'int32')

        # to object
        casted = mn.astype('O')
        _check_cast(casted, 'object')
|
||||
|
||||
    def test_astype_with_exclude_string(self):
        # errors='ignore' must leave the uncastable string column untouched
        # while converting the numeric columns
        df = self.frame.copy()
        expected = self.frame.astype(int)
        df['string'] = 'foo'
        casted = df.astype(int, errors='ignore')

        expected['string'] = 'foo'
        assert_frame_equal(casted, expected)

        df = self.frame.copy()
        expected = self.frame.astype(np.int32)
        df['string'] = 'foo'
        casted = df.astype(np.int32, errors='ignore')

        expected['string'] = 'foo'
        assert_frame_equal(casted, expected)
|
||||
|
||||
    def test_astype_with_view(self):
        # astype with copy=False — smoke tests only; results are not
        # asserted (see TODO below)

        tf = self.mixed_float.reindex(columns=['A', 'B', 'C'])

        casted = tf.astype(np.int64)

        casted = tf.astype(np.float32)

        # this is the only real reason to do it this way
        tf = np.round(self.frame).astype(np.int32)
        casted = tf.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        tf = self.frame.astype(np.float64)
        casted = tf.astype(np.int64, copy=False)  # noqa
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.int32, np.int64])
|
||||
@pytest.mark.parametrize("val", [np.nan, np.inf])
|
||||
def test_astype_cast_nan_inf_int(self, val, dtype):
|
||||
# see gh-14265
|
||||
#
|
||||
# Check NaN and inf --> raise error when converting to int.
|
||||
msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
|
||||
df = DataFrame([val])
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.astype(dtype)
|
||||
|
||||
    def test_astype_str(self, text_dtype):
        # see gh-9757
        # astype(str/unicode) must stringify datetimes, tz-aware datetimes,
        # timedeltas, ints and floats consistently
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        # Test str and unicode on Python 2.x and just str on Python 3.x
        result = df.astype(text_dtype)

        expected = DataFrame({
            "a": list(map(text_dtype,
                          map(lambda x: Timestamp(x)._date_repr, a._values))),
            "b": list(map(text_dtype, map(Timestamp, b._values))),
            "c": list(map(text_dtype,
                          map(lambda x: Timedelta(x)._repr_base(format="all"),
                              c._values))),
            "d": list(map(text_dtype, d._values)),
            "e": list(map(text_dtype, e._values)),
        })

        assert_frame_equal(result, expected)
|
||||
|
||||
    def test_astype_str_float(self, text_dtype):
        # see gh-11302
        # NaN stringifies to "nan"
        result = DataFrame([np.NaN]).astype(text_dtype)
        expected = DataFrame(["nan"])

        assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(text_dtype)

        # < 1.14 truncates
        # >= 1.14 preserves the full repr
        val = ("1.12345678901" if _np_version_under1p14
               else "1.1234567890123457")
        expected = DataFrame([val])
        assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype_class", [dict, Series])
|
||||
def test_astype_dict_like(self, dtype_class):
|
||||
# GH7271 & GH16717
|
||||
a = Series(date_range('2010-01-04', periods=5))
|
||||
b = Series(range(5))
|
||||
c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
|
||||
d = Series(['1.0', '2', '3.14', '4', '5.4'])
|
||||
df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
|
||||
original = df.copy(deep=True)
|
||||
|
||||
# change type of a subset of columns
|
||||
dt1 = dtype_class({'b': 'str', 'd': 'float32'})
|
||||
result = df.astype(dt1)
|
||||
expected = DataFrame({
|
||||
'a': a,
|
||||
'b': Series(['0', '1', '2', '3', '4']),
|
||||
'c': c,
|
||||
'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
|
||||
assert_frame_equal(result, expected)
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64})
|
||||
result = df.astype(dt2)
|
||||
expected = DataFrame({
|
||||
'a': a,
|
||||
'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
|
||||
'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
|
||||
'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
|
||||
assert_frame_equal(result, expected)
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
# change all columns
|
||||
dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str})
|
||||
assert_frame_equal(df.astype(dt3),
|
||||
df.astype(str))
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
# error should be raised when using something other than column labels
|
||||
# in the keys of the dtype dict
|
||||
dt4 = dtype_class({'b': str, 2: str})
|
||||
dt5 = dtype_class({'e': str})
|
||||
pytest.raises(KeyError, df.astype, dt4)
|
||||
pytest.raises(KeyError, df.astype, dt5)
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
# if the dtypes provided are the same as the original dtypes, the
|
||||
# resulting DataFrame should be the same as the original DataFrame
|
||||
dt6 = dtype_class({col: df[col].dtype for col in df.columns})
|
||||
equiv = df.astype(dt6)
|
||||
assert_frame_equal(df, equiv)
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
# GH 16717
|
||||
# if dtypes provided is empty, the resulting DataFrame
|
||||
# should be the same as the original DataFrame
|
||||
dt7 = dtype_class({})
|
||||
result = df.astype(dt7)
|
||||
assert_frame_equal(df, equiv)
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
    def test_astype_duplicate_col(self):
        # astype must handle duplicate column labels, both frame-wide
        # and when selecting a duplicated label via a dtype dict
        a1 = Series([1, 2, 3, 4, 5], name='a')
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b')
        a2 = Series([0, 1, 2, 3, 4], name='a')
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a')
        b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str,
                       name='b')
        a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a')
        expected = concat([a1_str, b_str, a2_str], axis=1)
        assert_frame_equal(result, expected)

        # dict keyed on the duplicated label converts BOTH 'a' columns
        result = df.astype({'a': 'str'})
        expected = concat([a1_str, b, a2_str], axis=1)
        assert_frame_equal(result, expected)
|
||||
|
||||
    @pytest.mark.parametrize('dtype', [
        'category',
        CategoricalDtype(),
        CategoricalDtype(ordered=True),
        CategoricalDtype(ordered=False),
        CategoricalDtype(categories=list('abcdef')),
        CategoricalDtype(categories=list('edba'), ordered=False),
        CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr)
    def test_astype_categorical(self, dtype):
        # GH 18099
        # astyping a whole frame to a categorical dtype must equal
        # building each column as a Categorical with that dtype
        d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
    @pytest.mark.parametrize("cls", [
        pd.api.types.CategoricalDtype,
        pd.api.types.DatetimeTZDtype,
        pd.api.types.IntervalDtype
    ])
    def test_astype_categoricaldtype_class_raises(self, cls):
        # passing the dtype *class* (not an instance) must raise TypeError,
        # for both frame-level and column-level astype
        df = DataFrame({"A": ['a', 'a', 'b', 'c']})
        xpr = "Expected an instance of {}".format(cls.__name__)
        with pytest.raises(TypeError, match=xpr):
            df.astype({"A": cls})

        with pytest.raises(TypeError, match=xpr):
            df['A'].astype(cls)
|
||||
|
||||
    @pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16'])
    def test_astype_extension_dtypes(self, dtype):
        # GH 22578
        # round-tripping float -> nullable integer -> float, frame-wide
        # and on a single column
        df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b'])

        expected1 = pd.DataFrame({'a': integer_array([1, 3, 5],
                                                     dtype=dtype),
                                  'b': integer_array([2, 4, 6],
                                                     dtype=dtype)})
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
        tm.assert_frame_equal(df.astype(dtype).astype('float64'), df)

        # single-column assignment keeps the other column float
        df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b'])
        df['b'] = df['b'].astype(dtype)
        expected2 = pd.DataFrame({'a': [1., 3., 5.],
                                  'b': integer_array([2, 4, 6],
                                                     dtype=dtype)})
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
|
||||
    @pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16'])
    def test_astype_extension_dtypes_1d(self, dtype):
        # GH 22578
        # same as test_astype_extension_dtypes but for a single-column frame
        df = pd.DataFrame({'a': [1., 2., 3.]})

        expected1 = pd.DataFrame({'a': integer_array([1, 2, 3],
                                                     dtype=dtype)})
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)

        df = pd.DataFrame({'a': [1., 2., 3.]})
        df['a'] = df['a'].astype(dtype)
        expected2 = pd.DataFrame({'a': integer_array([1, 2, 3],
                                                     dtype=dtype)})
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
|
||||
|
||||
    @pytest.mark.parametrize("dtype", ['category', 'Int64'])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        # GH 24704
        # astype to an extension dtype must handle duplicate column labels
        # (each duplicate cast independently)
        a1 = Series([0, np.nan, 4], name='a')
        a2 = Series([np.nan, 3, 5], name='a')
        df = concat([a1, a2], axis=1)

        result = df.astype(dtype)
        expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
        assert_frame_equal(result, expected)
|
||||
|
||||
    @pytest.mark.parametrize('dtype', [
        {100: 'float64', 200: 'uint64'}, 'category', 'float64'])
    def test_astype_column_metadata(self, dtype):
        # GH 19920
        # astype must preserve column index metadata (UInt64Index, name)
        columns = pd.UInt64Index([100, 200, 300], name='foo')
        df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
        df = df.astype(dtype)
        tm.assert_index_equal(df.columns, columns)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["M8", "m8"])
|
||||
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
|
||||
def test_astype_from_datetimelike_to_objectt(self, dtype, unit):
|
||||
# tests astype to object dtype
|
||||
# gh-19223 / gh-12425
|
||||
dtype = "{}[{}]".format(dtype, unit)
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(object)
|
||||
assert (result.dtypes == object).all()
|
||||
|
||||
if dtype.startswith('M8'):
|
||||
assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit)
|
||||
else:
|
||||
assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit)
|
||||
|
||||
    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        # tests all units from numeric origination
        # gh-19223 / gh-12425
        # numeric -> datetime64/timedelta64 must match numpy's astype
        dtype = "{}[{}]".format(dtype, unit)
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)
|
||||
|
||||
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_datetime_unit(self, unit):
        # tests all units from datetime origination
        # gh-19223
        dtype = "M8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)
|
||||
|
||||
    @pytest.mark.parametrize("unit", ['ns'])
    def test_astype_to_timedelta_unit_ns(self, unit):
        # preserver the timedelta conversion
        # gh-19223
        # ns-unit timedelta round-trips exactly (unlike coarser units,
        # see test_astype_to_timedelta_unit)
        dtype = "m8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D'])
|
||||
def test_astype_to_timedelta_unit(self, unit):
|
||||
# coerce to float
|
||||
# gh-19223
|
||||
dtype = "m8[{}]".format(unit)
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame(df.values.astype(dtype).astype(float))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
|
||||
def test_astype_to_incorrect_datetimelike(self, unit):
|
||||
# trying to astype a m to a M, or vice-versa
|
||||
# gh-19224
|
||||
dtype = "M8[{}]".format(unit)
|
||||
other = "m8[{}]".format(unit)
|
||||
|
||||
df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
|
||||
with pytest.raises(TypeError):
|
||||
df.astype(other)
|
||||
|
||||
df = DataFrame(np.array([[1, 2, 3]], dtype=other))
|
||||
with pytest.raises(TypeError):
|
||||
df.astype(dtype)
|
||||
|
||||
    def test_timedeltas(self):
        # get_dtype_counts should track datetime64/timedelta64/int columns
        # as they are added to a frame.
        df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3,
                                                freq='D')),
                            B=Series([timedelta(days=i) for i in range(3)])))
        result = df.get_dtype_counts().sort_index()
        expected = Series(
            {'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_index()
        assert_series_equal(result, expected)

        # datetime + timedelta arithmetic yields a second datetime column
        df['C'] = df['A'] + df['B']
        expected = Series(
            {'datetime64[ns]': 2, 'timedelta64[ns]': 1}).sort_values()
        result = df.get_dtype_counts().sort_values()
        assert_series_equal(result, expected)

        # mixed int types
        df['D'] = 1
        expected = Series({'datetime64[ns]': 2,
                           'timedelta64[ns]': 1,
                           'int64': 1}).sort_values()
        result = df.get_dtype_counts().sort_values()
        assert_series_equal(result, expected)
|
||||
|
||||
def test_arg_for_errors_in_astype(self):
|
||||
# issue #14878
|
||||
|
||||
df = DataFrame([1, 2, 3])
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
df.astype(np.float64, errors=True)
|
||||
|
||||
df.astype(np.int8, errors='ignore')
|
||||
|
||||
@pytest.mark.parametrize('input_vals', [
|
||||
([1, 2]),
|
||||
(['1', '2']),
|
||||
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
|
||||
(list(pd.date_range('1/1/2011', periods=2, freq='H',
|
||||
tz='US/Eastern'))),
|
||||
([pd.Interval(left=0, right=5)]),
|
||||
])
|
||||
def test_constructor_list_str(self, input_vals, string_dtype):
|
||||
# GH 16605
|
||||
# Ensure that data elements are converted to strings when
|
||||
# dtype is str, 'str', or 'U'
|
||||
|
||||
result = DataFrame({'A': input_vals}, dtype=string_dtype)
|
||||
expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_constructor_list_str_na(self, string_dtype):
|
||||
|
||||
result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
|
||||
expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
    @pytest.mark.parametrize("data, expected", [
        # empty
        (DataFrame(), True),
        # multi-same
        (DataFrame({"A": [1, 2], "B": [1, 2]}), True),
        # multi-object
        (DataFrame({"A": np.array([1, 2], dtype=object),
                    "B": np.array(["a", "b"], dtype=object)}), True),
        # multi-extension
        (DataFrame({"A": pd.Categorical(['a', 'b']),
                    "B": pd.Categorical(['a', 'b'])}), True),
        # differ types
        (DataFrame({"A": [1, 2], "B": [1., 2.]}), False),
        # differ sizes
        (DataFrame({"A": np.array([1, 2], dtype=np.int32),
                    "B": np.array([1, 2], dtype=np.int64)}), False),
        # multi-extension differ
        (DataFrame({"A": pd.Categorical(['a', 'b']),
                    "B": pd.Categorical(['b', 'c'])}), False),

    ])
    def test_is_homogeneous_type(self, data, expected):
        # _is_homogeneous_type is True only when every column shares a single
        # dtype (identical categoricals included); empty frames count as
        # homogeneous.
        assert data._is_homogeneous_type is expected
|
||||
|
||||
def test_asarray_homogenous(self):
|
||||
df = pd.DataFrame({"A": pd.Categorical([1, 2]),
|
||||
"B": pd.Categorical([1, 2])})
|
||||
result = np.asarray(df)
|
||||
# may change from object in the future
|
||||
expected = np.array([[1, 1], [2, 2]], dtype='object')
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameDatetimeWithTZ(TestData):
    # Interleave/astype/str-formatting behaviour for frames that mix a
    # tz-naive datetime column with tz-aware ones (``self.tzframe`` is
    # provided by TestData: columns A naive, B US/Eastern, C CET, with a
    # NaT in the middle row of B and C).

    def test_interleave(self):

        # interleave with object
        result = self.tzframe.assign(D='foo').values
        expected = np.array([[Timestamp('2013-01-01 00:00:00'),
                              Timestamp('2013-01-02 00:00:00'),
                              Timestamp('2013-01-03 00:00:00')],
                             [Timestamp('2013-01-01 00:00:00-0500',
                                        tz='US/Eastern'),
                              pd.NaT,
                              Timestamp('2013-01-03 00:00:00-0500',
                                        tz='US/Eastern')],
                             [Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
                              pd.NaT,
                              Timestamp('2013-01-03 00:00:00+0100', tz='CET')],
                             ['foo', 'foo', 'foo']], dtype=object).T
        tm.assert_numpy_array_equal(result, expected)

        # interleave with only datetime64[ns]
        result = self.tzframe.values
        expected = np.array([[Timestamp('2013-01-01 00:00:00'),
                              Timestamp('2013-01-02 00:00:00'),
                              Timestamp('2013-01-03 00:00:00')],
                             [Timestamp('2013-01-01 00:00:00-0500',
                                        tz='US/Eastern'),
                              pd.NaT,
                              Timestamp('2013-01-03 00:00:00-0500',
                                        tz='US/Eastern')],
                             [Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
                              pd.NaT,
                              Timestamp('2013-01-03 00:00:00+0100',
                                        tz='CET')]], dtype=object).T
        tm.assert_numpy_array_equal(result, expected)

    def test_astype(self):
        # astype
        # casting to object keeps the tz-aware Timestamps (and NaT) intact
        expected = np.array([[Timestamp('2013-01-01 00:00:00'),
                              Timestamp('2013-01-02 00:00:00'),
                              Timestamp('2013-01-03 00:00:00')],
                             [Timestamp('2013-01-01 00:00:00-0500',
                                        tz='US/Eastern'),
                              pd.NaT,
                              Timestamp('2013-01-03 00:00:00-0500',
                                        tz='US/Eastern')],
                             [Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
                              pd.NaT,
                              Timestamp('2013-01-03 00:00:00+0100',
                                        tz='CET')]],
                            dtype=object).T
        result = self.tzframe.astype(object)
        assert_frame_equal(result, DataFrame(
            expected, index=self.tzframe.index, columns=self.tzframe.columns))

        # casting tz-aware columns to naive datetime64[ns] converts through
        # UTC and then drops the tz information
        result = self.tzframe.astype('datetime64[ns]')
        expected = DataFrame({'A': date_range('20130101', periods=3),
                              'B': (date_range('20130101', periods=3,
                                               tz='US/Eastern')
                                    .tz_convert('UTC')
                                    .tz_localize(None)),
                              'C': (date_range('20130101', periods=3,
                                               tz='CET')
                                    .tz_convert('UTC')
                                    .tz_localize(None))})
        # the NaT in the middle row of B and C survives the cast
        expected.iloc[1, 1] = pd.NaT
        expected.iloc[1, 2] = pd.NaT
        assert_frame_equal(result, expected)

    def test_astype_str(self):
        # str formatting
        # tz-aware values stringify with their UTC offset; NaT as 'NaT'
        result = self.tzframe.astype(str)
        expected = DataFrame([['2013-01-01', '2013-01-01 00:00:00-05:00',
                               '2013-01-01 00:00:00+01:00'],
                              ['2013-01-02', 'NaT', 'NaT'],
                              ['2013-01-03', '2013-01-03 00:00:00-05:00',
                               '2013-01-03 00:00:00+01:00']],
                             columns=self.tzframe.columns)
        tm.assert_frame_equal(result, expected)

        # repr of the frame shows the same formatting
        with option_context('display.max_columns', 20):
            result = str(self.tzframe)
            assert ('0 2013-01-01 2013-01-01 00:00:00-05:00 '
                    '2013-01-01 00:00:00+01:00') in result
            assert ('1 2013-01-02 '
                    'NaT NaT') in result
            assert ('2 2013-01-03 2013-01-03 00:00:00-05:00 '
                    '2013-01-03 00:00:00+01:00') in result
|
||||
@@ -0,0 +1,455 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import lrange, string_types
|
||||
|
||||
from pandas import DataFrame, Series
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(subset):
    # GH 19730: an unknown column label in ``subset`` must raise KeyError
    # for both duplicated and drop_duplicates.
    df = DataFrame({'A': [0, 0, 1],
                    'B': [0, 0, 1],
                    'C': [0, 0, 1]})

    for method in (df.duplicated, df.drop_duplicates):
        with pytest.raises(KeyError):
            method(subset)
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes():
    # gh-21524
    # Given the wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = DataFrame(data).T
    result = df.duplicated()

    # Then duplicates produce the bool Series as a result and don't fail
    # during calculation. Actual values doesn't matter here, though usually
    # it's all False in this case
    assert isinstance(result, Series)
    # use ``np.bool_`` instead of the deprecated ``np.bool`` builtin alias
    # (removed in numpy >= 1.24)
    assert result.dtype == np.bool_
|
||||
|
||||
|
||||
@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, False, True])),
    ('last', Series([True, True, False, False, False])),
    (False, Series([True, True, True, False, True]))
])
def test_duplicated_keep(keep, expected):
    # Each ``keep`` policy over a frame containing one repeated row.
    frame = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
    tm.assert_series_equal(frame.duplicated(keep=keep), expected)
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, False, True])),
    ('last', Series([True, True, False, False, False])),
    (False, Series([True, True, True, False, True]))
])
def test_duplicated_nan_none(keep, expected):
    # np.nan and None should count as distinct values in an object column,
    # but ``duplicated`` currently treats them as equal (hence the xfail).
    frame = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object)
    tm.assert_series_equal(frame.duplicated(keep=keep), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('keep', ['first', 'last', False])
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
def test_duplicated_subset(subset, keep):
    # ``duplicated`` restricted to a subset must agree with running it on
    # the projection of those columns.
    df = DataFrame({'A': [0, 1, 1, 2, 0],
                    'B': ['a', 'b', 'b', 'c', 'a'],
                    'C': [np.nan, 3, 3, None, np.nan]})

    if subset is None:
        subset = list(df.columns)
    elif isinstance(subset, str):
        # need to have a DataFrame, not a Series
        # -> select columns with singleton list, not string
        subset = [subset]

    expected = df[subset].duplicated(keep=keep)
    result = df.duplicated(keep=keep, subset=subset)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_drop_duplicates():
    # drop_duplicates over single labels, label lists/arrays/tuples, the
    # full frame, and down-cast integer columns; plus large-int and
    # hash-collision regression cases.
    df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
                            'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    'D': lrange(8)})

    # single column
    result = df.drop_duplicates('AAA')
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep='last')
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    # keep=False drops every member of a duplicated group
    result = df.drop_duplicates('AAA', keep=False)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates(np.array(['AAA', 'B']))
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep='last')
    expected = df.loc[[0, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep=False)
    expected = df.loc[[0]]
    tm.assert_frame_equal(result, expected)

    # consider everything
    df2 = df.loc[:, ['AAA', 'B', 'C']]

    result = df2.drop_duplicates()
    # in this case only
    expected = df2.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep='last')
    expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep=False)
    expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
    tm.assert_frame_equal(result, expected)

    # integers
    result = df.drop_duplicates('C')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('C', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # a down-cast (int8) copy of the column deduplicates identically
    df['E'] = df['C'].astype('int8')
    result = df.drop_duplicates('E')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('E', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # GH 11376
    df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
                    'y': [0, 6, 5, 5, 9, 1, 2]})
    expected = df.loc[df.index != 3]
    tm.assert_frame_equal(df.drop_duplicates(), expected)

    df = DataFrame([[1, 0], [0, 2]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-2, 0], [0, -4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # values near the int64 boundary must not be conflated
    x = np.iinfo(np.int64).max / 3 * 2
    df = DataFrame([[-x, x], [0, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-x, x], [x, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # GH 11864
    df = DataFrame([i] * 9 for i in range(16))
    df = df.append([[1] + [0] * 8], ignore_index=True)

    for keep in ['first', 'last', False]:
        assert df.duplicated(keep=keep).sum() == 0
|
||||
|
||||
|
||||
def test_drop_duplicates_with_duplicate_column_names():
    # GH17836: duplicated column labels must not break drop_duplicates.
    frame = DataFrame([
        [1, 2, 5],
        [3, 4, 6],
        [3, 4, 7]
    ], columns=['a', 'a', 'b'])

    # considering every column keeps all three distinct rows
    tm.assert_frame_equal(frame.drop_duplicates(), frame)

    # a duplicated label selects both matching columns as the subset
    tm.assert_frame_equal(frame.drop_duplicates('a'), frame[:2])
|
||||
|
||||
|
||||
def test_drop_duplicates_for_take_all():
    # Each ``keep`` policy on a column mixing unique and repeated labels,
    # then on a pair of columns.
    df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
                            'foo', 'bar', 'qux', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    'D': list(range(8))})

    # single column
    for keep, positions in [('first', [0, 1, 2, 6]),
                            ('last', [2, 5, 6, 7]),
                            (False, [2, 6])]:
        result = df.drop_duplicates('AAA', keep=keep)
        tm.assert_frame_equal(result, df.iloc[positions])

    # multiple columns
    for keep, positions in [('first', [0, 1, 2, 3, 4, 6]),
                            ('last', [0, 1, 2, 5, 6, 7]),
                            (False, [0, 1, 2, 6])]:
        result = df.drop_duplicates(['AAA', 'B'], keep=keep)
        tm.assert_frame_equal(result, df.iloc[positions])
|
||||
|
||||
|
||||
def test_drop_duplicates_tuple():
    # A tuple column label is treated as a single key, and may also appear
    # inside a multi-label subset.
    df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar',
                                   'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    'D': list(range(8))})

    # single (tuple-labelled) column
    tm.assert_frame_equal(df.drop_duplicates(('AA', 'AB')), df[:2])

    tm.assert_frame_equal(df.drop_duplicates(('AA', 'AB'), keep='last'),
                          df.loc[[6, 7]])

    dropped_all = df.drop_duplicates(('AA', 'AB'), keep=False)
    assert len(dropped_all) == 0
    tm.assert_frame_equal(dropped_all, df.loc[[]])  # empty df

    # multi column
    tm.assert_frame_equal(df.drop_duplicates((('AA', 'AB'), 'B')),
                          df.loc[[0, 1, 2, 3]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize('df', [
    DataFrame(),
    DataFrame(columns=[]),
    DataFrame(columns=['A', 'B', 'C']),
    DataFrame(index=[]),
    DataFrame(index=['A', 'B', 'C'])
])
def test_drop_duplicates_empty(df):
    # GH 20516: dropping duplicates from an empty frame is a no-op, both
    # out-of-place and in-place.
    tm.assert_frame_equal(df.drop_duplicates(), df)

    inplace_copy = df.copy()
    inplace_copy.drop_duplicates(inplace=True)
    tm.assert_frame_equal(inplace_copy, df)
|
||||
|
||||
|
||||
def test_drop_duplicates_NA():
    # Missing values take part in duplicate detection like any other value,
    # both as None in an object column and as NaN in a float column.

    # none
    df = DataFrame({'A': [None, None, 'foo', 'bar',
                          'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
                    'D': list(range(8))})

    # single column
    for keep, labels in [('first', [0, 2, 3]),
                         ('last', [1, 6, 7]),
                         (False, [])]:
        result = df.drop_duplicates('A', keep=keep)
        tm.assert_frame_equal(result, df.loc[labels])
    assert len(df.drop_duplicates('A', keep=False)) == 0

    # multi column
    for keep, labels in [('first', [0, 2, 3, 6]),
                         ('last', [1, 5, 6, 7]),
                         (False, [6])]:
        result = df.drop_duplicates(['A', 'B'], keep=keep)
        tm.assert_frame_equal(result, df.loc[labels])

    # nan
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
                    'D': list(range(8))})

    # single column
    for keep, labels in [('first', [0, 1]),
                         ('last', [3, 7]),
                         (False, [])]:
        result = df.drop_duplicates('C', keep=keep)
        tm.assert_frame_equal(result, df.loc[labels])
    assert len(df.drop_duplicates('C', keep=False)) == 0

    # multi column
    for keep, labels in [('first', [0, 1, 2, 4]),
                         ('last', [1, 3, 6, 7]),
                         (False, [1])]:
        result = df.drop_duplicates(['C', 'B'], keep=keep)
        tm.assert_frame_equal(result, df.loc[labels])
|
||||
|
||||
|
||||
def test_drop_duplicates_NA_for_take_all():
    # Columns where some NA-containing groups repeat and some values are
    # unique, for every ``keep`` policy.
    df = DataFrame({'A': [None, None, 'foo', 'bar',
                          'foo', 'baz', 'bar', 'qux'],
                    'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]})

    # object column containing None
    for keep, positions in [('first', [0, 2, 3, 5, 7]),
                            ('last', [1, 4, 5, 6, 7]),
                            (False, [5, 7])]:
        tm.assert_frame_equal(df.drop_duplicates('A', keep=keep),
                              df.iloc[positions])

    # float column containing nan
    for keep, positions in [('first', [0, 1, 5, 6]),
                            ('last', [3, 5, 6, 7]),
                            (False, [5, 6])]:
        tm.assert_frame_equal(df.drop_duplicates('C', keep=keep),
                              df.iloc[positions])
|
||||
|
||||
|
||||
def test_drop_duplicates_inplace():
    # ``inplace=True`` must mutate the frame into exactly what the
    # out-of-place call would have returned.
    orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                            'foo', 'bar', 'bar', 'foo'],
                      'B': ['one', 'one', 'two', 'two',
                            'two', 'two', 'one', 'two'],
                      'C': [1, 1, 2, 2, 2, 2, 1, 2],
                      'D': list(range(8))})

    # single column
    for keep, labels in [('first', [0, 1]),
                         ('last', [6, 7]),
                         (False, [])]:
        working = orig.copy()
        working.drop_duplicates('A', keep=keep, inplace=True)
        tm.assert_frame_equal(working, orig.loc[labels])
        assert len(working) == len(labels)

    # multi column
    for keep, labels in [('first', [0, 1, 2, 3]),
                         ('last', [0, 5, 6, 7]),
                         (False, [0])]:
        working = orig.copy()
        working.drop_duplicates(['A', 'B'], keep=keep, inplace=True)
        tm.assert_frame_equal(working, orig.loc[labels])

    # consider everything: with D removed this is equivalent to
    # deduplicating on ['A', 'B'] only
    orig2 = orig.loc[:, ['A', 'B', 'C']].copy()
    for keep in ['first', 'last', False]:
        working = orig2.copy()
        working.drop_duplicates(keep=keep, inplace=True)
        tm.assert_frame_equal(working,
                              orig2.drop_duplicates(['A', 'B'], keep=keep))
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,185 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Index, period_range
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def frame_with_period_index():
    # 4x5 frame of consecutive ints indexed by yearly periods 2000-2003.
    return DataFrame(
        np.arange(20).reshape(4, 5),
        index=period_range(start='2000', freq='A', periods=4),
        columns=list('abcde'))
|
||||
|
||||
|
||||
@pytest.fixture
def frame():
    # Standard float frame (columns A-D) shared by the join tests below.
    return TestData().frame
|
||||
|
||||
|
||||
@pytest.fixture
def left():
    # Left operand for the parametrized join tests: index in reverse order.
    return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
|
||||
|
||||
|
||||
@pytest.fixture
def right():
    # Right operand: partially overlapping index (1, 2 shared; 3 extra).
    return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "how, sort, expected",
    [('inner', False, DataFrame({'a': [20, 10],
                                 'b': [200, 100]},
                                index=[2, 1])),
     ('inner', True, DataFrame({'a': [10, 20],
                                'b': [100, 200]},
                               index=[1, 2])),
     ('left', False, DataFrame({'a': [20, 10, 0],
                                'b': [200, 100, np.nan]},
                               index=[2, 1, 0])),
     ('left', True, DataFrame({'a': [0, 10, 20],
                               'b': [np.nan, 100, 200]},
                              index=[0, 1, 2])),
     ('right', False, DataFrame({'a': [np.nan, 10, 20],
                                 'b': [300, 100, 200]},
                                index=[3, 1, 2])),
     ('right', True, DataFrame({'a': [10, 20, np.nan],
                                'b': [100, 200, 300]},
                               index=[1, 2, 3])),
     ('outer', False, DataFrame({'a': [0, 10, 20, np.nan],
                                 'b': [np.nan, 100, 200, 300]},
                                index=[0, 1, 2, 3])),
     ('outer', True, DataFrame({'a': [0, 10, 20, np.nan],
                                'b': [np.nan, 100, 200, 300]},
                               index=[0, 1, 2, 3]))])
def test_join(left, right, how, sort, expected):
    # ``left``/``right`` come from the fixtures above; every join type is
    # exercised with and without sorting of the result index.
    result = left.join(right, how=how, sort=sort)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_join_index(frame):
    # Index alignment and column union for every join type on two frames
    # with partially overlapping indexes and disjoint columns.
    # left / right

    f = frame.loc[frame.index[:10], ['A', 'B']]
    f2 = frame.loc[frame.index[5:], ['C', 'D']].iloc[::-1]

    joined = f.join(f2)
    tm.assert_index_equal(f.index, joined.index)
    expected_columns = Index(['A', 'B', 'C', 'D'])
    tm.assert_index_equal(joined.columns, expected_columns)

    joined = f.join(f2, how='left')
    tm.assert_index_equal(joined.index, f.index)
    tm.assert_index_equal(joined.columns, expected_columns)

    joined = f.join(f2, how='right')
    tm.assert_index_equal(joined.index, f2.index)
    tm.assert_index_equal(joined.columns, expected_columns)

    # inner

    joined = f.join(f2, how='inner')
    tm.assert_index_equal(joined.index, f.index[5:10])
    tm.assert_index_equal(joined.columns, expected_columns)

    # outer

    joined = f.join(f2, how='outer')
    tm.assert_index_equal(joined.index, frame.index.sort_values())
    tm.assert_index_equal(joined.columns, expected_columns)

    # an unknown join method is rejected up front
    with pytest.raises(ValueError, match='join method'):
        f.join(f2, how='foo')

    # corner case - overlapping columns
    msg = 'columns overlap but no suffix'
    for how in ('outer', 'left', 'inner'):
        with pytest.raises(ValueError, match=msg):
            frame.join(frame, how=how)
|
||||
|
||||
|
||||
def test_join_index_more(frame):
|
||||
af = frame.loc[:, ['A', 'B']]
|
||||
bf = frame.loc[::2, ['C', 'D']]
|
||||
|
||||
expected = af.copy()
|
||||
expected['C'] = frame['C'][::2]
|
||||
expected['D'] = frame['D'][::2]
|
||||
|
||||
result = af.join(bf)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = af.join(bf, how='right')
|
||||
tm.assert_frame_equal(result, expected[::2])
|
||||
|
||||
result = bf.join(af, how='right')
|
||||
tm.assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
|
||||
def test_join_index_series(frame):
|
||||
df = frame.copy()
|
||||
s = df.pop(frame.columns[-1])
|
||||
joined = df.join(s)
|
||||
|
||||
# TODO should this check_names ?
|
||||
tm.assert_frame_equal(joined, frame, check_names=False)
|
||||
|
||||
s.name = None
|
||||
with pytest.raises(ValueError, match='must have a name'):
|
||||
df.join(s)
|
||||
|
||||
|
||||
def test_join_overlap(frame):
|
||||
df1 = frame.loc[:, ['A', 'B', 'C']]
|
||||
df2 = frame.loc[:, ['B', 'C', 'D']]
|
||||
|
||||
joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2')
|
||||
df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1')
|
||||
df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2')
|
||||
|
||||
no_overlap = frame.loc[:, ['A', 'D']]
|
||||
expected = df1_suf.join(df2_suf).join(no_overlap)
|
||||
|
||||
# column order not necessarily sorted
|
||||
tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
|
||||
|
||||
|
||||
def test_join_period_index(frame_with_period_index):
|
||||
other = frame_with_period_index.rename(
|
||||
columns=lambda x: '{key}{key}'.format(key=x))
|
||||
|
||||
joined_values = np.concatenate(
|
||||
[frame_with_period_index.values] * 2, axis=1)
|
||||
|
||||
joined_cols = frame_with_period_index.columns.append(other.columns)
|
||||
|
||||
joined = frame_with_period_index.join(other)
|
||||
expected = DataFrame(
|
||||
data=joined_values,
|
||||
columns=joined_cols,
|
||||
index=frame_with_period_index.index)
|
||||
|
||||
tm.assert_frame_equal(joined, expected)
|
||||
|
||||
|
||||
def test_join_left_sequence_non_unique_index():
    # https://github.com/pandas-dev/pandas/issues/19607
    # A left join against a list of frames, one of which repeats an index
    # label, duplicates the matching rows instead of failing.
    base = DataFrame({'a': [0, 10, 20]}, index=[1, 2, 3])
    other_one = DataFrame({'b': [100, 200, 300]}, index=[4, 3, 2])
    other_two = DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4])

    joined = base.join([other_one, other_two], how='left')

    expected = DataFrame({
        'a': [0, 10, 10, 20],
        'b': [np.nan, 300, 300, 200],
        'c': [np.nan, 400, 500, np.nan]
    }, index=[1, 2, 2, 3])

    tm.assert_frame_equal(joined, expected)
|
||||
@@ -0,0 +1,863 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import datetime
|
||||
from distutils.version import LooseVersion
|
||||
|
||||
import dateutil
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import lrange
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import Categorical, DataFrame, Series, Timestamp, date_range
|
||||
from pandas.tests.frame.common import TestData, _check_mixed_float
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
try:
    import scipy
    # flag consumed by interpolation tests that need scipy >= 0.19.0
    _is_scipy_ge_0190 = (LooseVersion(scipy.__version__) >=
                         LooseVersion('0.19.0'))
except ImportError:
    # scipy not installed: treat as "too old" so dependent tests skip
    _is_scipy_ge_0190 = False
|
||||
|
||||
|
||||
def _skip_if_no_pchip():
    # Skip the calling test when scipy's pchip interpolator is unavailable.
    try:
        from scipy.interpolate import pchip_interpolate  # noqa
    except ImportError:
        import pytest
        pytest.skip('scipy.interpolate.pchip missing')
|
||||
|
||||
|
||||
class TestDataFrameMissingData(TestData):
|
||||
|
||||
    def test_dropEmptyRows(self):
        # dropna(how='all') on a single-column frame drops exactly the
        # all-NaN rows, leaves the source frame untouched, and matches the
        # inplace variant; ``subset`` restricted to that column agrees.
        N = len(self.frame.index)
        mat = np.random.randn(N)
        mat[:5] = np.nan

        frame = DataFrame({'foo': mat}, index=self.frame.index)
        original = Series(mat, index=self.frame.index, name='foo')
        expected = original.dropna()
        inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()

        smaller_frame = frame.dropna(how='all')
        # check that original was preserved
        assert_series_equal(frame['foo'], original)
        inplace_frame1.dropna(how='all', inplace=True)
        assert_series_equal(smaller_frame['foo'], expected)
        assert_series_equal(inplace_frame1['foo'], expected)

        # same result when the subset is the only column
        smaller_frame = frame.dropna(how='all', subset=['foo'])
        inplace_frame2.dropna(how='all', subset=['foo'], inplace=True)
        assert_series_equal(smaller_frame['foo'], expected)
        assert_series_equal(inplace_frame2['foo'], expected)
|
||||
|
||||
def test_dropIncompleteRows(self):
|
||||
N = len(self.frame.index)
|
||||
mat = np.random.randn(N)
|
||||
mat[:5] = np.nan
|
||||
|
||||
frame = DataFrame({'foo': mat}, index=self.frame.index)
|
||||
frame['bar'] = 5
|
||||
original = Series(mat, index=self.frame.index, name='foo')
|
||||
inp_frame1, inp_frame2 = frame.copy(), frame.copy()
|
||||
|
||||
smaller_frame = frame.dropna()
|
||||
assert_series_equal(frame['foo'], original)
|
||||
inp_frame1.dropna(inplace=True)
|
||||
|
||||
exp = Series(mat[5:], index=self.frame.index[5:], name='foo')
|
||||
tm.assert_series_equal(smaller_frame['foo'], exp)
|
||||
tm.assert_series_equal(inp_frame1['foo'], exp)
|
||||
|
||||
samesize_frame = frame.dropna(subset=['bar'])
|
||||
assert_series_equal(frame['foo'], original)
|
||||
assert (frame['bar'] == 5).all()
|
||||
inp_frame2.dropna(subset=['bar'], inplace=True)
|
||||
tm.assert_index_equal(samesize_frame.index, self.frame.index)
|
||||
tm.assert_index_equal(inp_frame2.index, self.frame.index)
|
||||
|
||||
def test_dropna(self):
|
||||
df = DataFrame(np.random.randn(6, 4))
|
||||
df[2][:2] = np.nan
|
||||
|
||||
dropped = df.dropna(axis=1)
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=1, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
dropped = df.dropna(axis=0)
|
||||
expected = df.loc[lrange(2, 6)]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=0, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
# threshold
|
||||
dropped = df.dropna(axis=1, thresh=5)
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=1, thresh=5, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
dropped = df.dropna(axis=0, thresh=4)
|
||||
expected = df.loc[lrange(2, 6)]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=0, thresh=4, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
dropped = df.dropna(axis=1, thresh=4)
|
||||
assert_frame_equal(dropped, df)
|
||||
|
||||
dropped = df.dropna(axis=1, thresh=3)
|
||||
assert_frame_equal(dropped, df)
|
||||
|
||||
# subset
|
||||
dropped = df.dropna(axis=0, subset=[0, 1, 3])
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
|
||||
assert_frame_equal(dropped, df)
|
||||
assert_frame_equal(inp, df)
|
||||
|
||||
# all
|
||||
dropped = df.dropna(axis=1, how='all')
|
||||
assert_frame_equal(dropped, df)
|
||||
|
||||
df[2] = np.nan
|
||||
dropped = df.dropna(axis=1, how='all')
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
assert_frame_equal(dropped, expected)
|
||||
|
||||
# bad input
|
||||
pytest.raises(ValueError, df.dropna, axis=3)
|
||||
|
||||
def test_drop_and_dropna_caching(self):
|
||||
# tst that cacher updates
|
||||
original = Series([1, 2, np.nan], name='A')
|
||||
expected = Series([1, 2], dtype=original.dtype, name='A')
|
||||
df = pd.DataFrame({'A': original.values.copy()})
|
||||
df2 = df.copy()
|
||||
df['A'].dropna()
|
||||
assert_series_equal(df['A'], original)
|
||||
df['A'].dropna(inplace=True)
|
||||
assert_series_equal(df['A'], expected)
|
||||
df2['A'].drop([1])
|
||||
assert_series_equal(df2['A'], original)
|
||||
df2['A'].drop([1], inplace=True)
|
||||
assert_series_equal(df2['A'], original.drop([1]))
|
||||
|
||||
def test_dropna_corner(self):
|
||||
# bad input
|
||||
pytest.raises(ValueError, self.frame.dropna, how='foo')
|
||||
pytest.raises(TypeError, self.frame.dropna, how=None)
|
||||
# non-existent column - 8303
|
||||
pytest.raises(KeyError, self.frame.dropna, subset=['A', 'X'])
|
||||
|
||||
def test_dropna_multiple_axes(self):
|
||||
df = DataFrame([[1, np.nan, 2, 3],
|
||||
[4, np.nan, 5, 6],
|
||||
[np.nan, np.nan, np.nan, np.nan],
|
||||
[7, np.nan, 8, 9]])
|
||||
cp = df.copy()
|
||||
|
||||
# GH20987
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
result = df.dropna(how='all', axis=[0, 1])
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
result2 = df.dropna(how='all', axis=(0, 1))
|
||||
expected = df.dropna(how='all').dropna(how='all', axis=1)
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
assert_frame_equal(result2, expected)
|
||||
assert_frame_equal(df, cp)
|
||||
|
||||
inp = df.copy()
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
inp.dropna(how='all', axis=(0, 1), inplace=True)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
def test_dropna_tz_aware_datetime(self):
|
||||
# GH13407
|
||||
df = DataFrame()
|
||||
dt1 = datetime.datetime(2015, 1, 1,
|
||||
tzinfo=dateutil.tz.tzutc())
|
||||
dt2 = datetime.datetime(2015, 2, 2,
|
||||
tzinfo=dateutil.tz.tzutc())
|
||||
df['Time'] = [dt1]
|
||||
result = df.dropna(axis=0)
|
||||
expected = DataFrame({'Time': [dt1]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Ex2
|
||||
df = DataFrame({'Time': [dt1, None, np.nan, dt2]})
|
||||
result = df.dropna(axis=0)
|
||||
expected = DataFrame([dt1, dt2],
|
||||
columns=['Time'],
|
||||
index=[0, 3])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna(self):
|
||||
tf = self.tsframe
|
||||
tf.loc[tf.index[:5], 'A'] = np.nan
|
||||
tf.loc[tf.index[-5:], 'A'] = np.nan
|
||||
|
||||
zero_filled = self.tsframe.fillna(0)
|
||||
assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all()
|
||||
|
||||
padded = self.tsframe.fillna(method='pad')
|
||||
assert np.isnan(padded.loc[padded.index[:5], 'A']).all()
|
||||
assert (padded.loc[padded.index[-5:], 'A'] ==
|
||||
padded.loc[padded.index[-5], 'A']).all()
|
||||
|
||||
# mixed type
|
||||
mf = self.mixed_frame
|
||||
mf.loc[mf.index[5:20], 'foo'] = np.nan
|
||||
mf.loc[mf.index[-10:], 'A'] = np.nan
|
||||
result = self.mixed_frame.fillna(value=0)
|
||||
result = self.mixed_frame.fillna(method='pad')
|
||||
|
||||
pytest.raises(ValueError, self.tsframe.fillna)
|
||||
pytest.raises(ValueError, self.tsframe.fillna, 5, method='ffill')
|
||||
|
||||
# mixed numeric (but no float16)
|
||||
mf = self.mixed_float.reindex(columns=['A', 'B', 'D'])
|
||||
mf.loc[mf.index[-10:], 'A'] = np.nan
|
||||
result = mf.fillna(value=0)
|
||||
_check_mixed_float(result, dtype=dict(C=None))
|
||||
|
||||
result = mf.fillna(method='pad')
|
||||
_check_mixed_float(result, dtype=dict(C=None))
|
||||
|
||||
# empty frame (GH #2778)
|
||||
df = DataFrame(columns=['x'])
|
||||
for m in ['pad', 'backfill']:
|
||||
df.x.fillna(method=m, inplace=True)
|
||||
df.x.fillna(method=m)
|
||||
|
||||
# with different dtype (GH3386)
|
||||
df = DataFrame([['a', 'a', np.nan, 'a'], [
|
||||
'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']])
|
||||
|
||||
result = df.fillna({2: 'foo'})
|
||||
expected = DataFrame([['a', 'a', 'foo', 'a'],
|
||||
['b', 'b', 'foo', 'b'],
|
||||
['c', 'c', 'foo', 'c']])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df.fillna({2: 'foo'}, inplace=True)
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
# limit and value
|
||||
df = DataFrame(np.random.randn(10, 3))
|
||||
df.iloc[2:7, 0] = np.nan
|
||||
df.iloc[3:5, 2] = np.nan
|
||||
|
||||
expected = df.copy()
|
||||
expected.iloc[2, 0] = 999
|
||||
expected.iloc[3, 2] = 999
|
||||
result = df.fillna(999, limit=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# with datelike
|
||||
# GH 6344
|
||||
df = DataFrame({
|
||||
'Date': [pd.NaT, Timestamp("2014-1-1")],
|
||||
'Date2': [Timestamp("2013-1-1"), pd.NaT]
|
||||
})
|
||||
|
||||
expected = df.copy()
|
||||
expected['Date'] = expected['Date'].fillna(
|
||||
df.loc[df.index[0], 'Date2'])
|
||||
result = df.fillna(value={'Date': df['Date2']})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# with timezone
|
||||
# GH 15855
|
||||
df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
|
||||
pd.NaT]})
|
||||
exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
|
||||
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
|
||||
assert_frame_equal(df.fillna(method='pad'), exp)
|
||||
|
||||
df = pd.DataFrame({'A': [pd.NaT,
|
||||
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
|
||||
exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
|
||||
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
|
||||
assert_frame_equal(df.fillna(method='bfill'), exp)
|
||||
|
||||
# with timezone in another column
|
||||
# GH 15522
|
||||
df = pd.DataFrame({'A': pd.date_range('20130101', periods=4,
|
||||
tz='US/Eastern'),
|
||||
'B': [1, 2, np.nan, np.nan]})
|
||||
result = df.fillna(method='pad')
|
||||
expected = pd.DataFrame({'A': pd.date_range('20130101', periods=4,
|
||||
tz='US/Eastern'),
|
||||
'B': [1., 2., 2., 2.]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_na_actions_categorical(self):
|
||||
|
||||
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
|
||||
vals = ["a", "b", np.nan, "d"]
|
||||
df = DataFrame({"cats": cat, "vals": vals})
|
||||
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
|
||||
vals2 = ["a", "b", "b", "d"]
|
||||
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
|
||||
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
|
||||
vals3 = ["a", "b", np.nan]
|
||||
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
|
||||
cat4 = Categorical([1, 2], categories=[1, 2, 3])
|
||||
vals4 = ["a", "b"]
|
||||
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
|
||||
|
||||
# fillna
|
||||
res = df.fillna(value={"cats": 3, "vals": "b"})
|
||||
tm.assert_frame_equal(res, df_exp_fill)
|
||||
|
||||
with pytest.raises(ValueError, match=("fill value must "
|
||||
"be in categories")):
|
||||
df.fillna(value={"cats": 4, "vals": "c"})
|
||||
|
||||
res = df.fillna(method='pad')
|
||||
tm.assert_frame_equal(res, df_exp_fill)
|
||||
|
||||
# dropna
|
||||
res = df.dropna(subset=["cats"])
|
||||
tm.assert_frame_equal(res, df_exp_drop_cats)
|
||||
|
||||
res = df.dropna()
|
||||
tm.assert_frame_equal(res, df_exp_drop_all)
|
||||
|
||||
# make sure that fillna takes missing values into account
|
||||
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
|
||||
df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
|
||||
|
||||
cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
|
||||
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
|
||||
|
||||
res = df.fillna("a")
|
||||
tm.assert_frame_equal(res, df_exp)
|
||||
|
||||
def test_fillna_categorical_nan(self):
|
||||
# GH 14021
|
||||
# np.nan should always be a valid filler
|
||||
cat = Categorical([np.nan, 2, np.nan])
|
||||
val = Categorical([np.nan, np.nan, np.nan])
|
||||
df = DataFrame({"cats": cat, "vals": val})
|
||||
res = df.fillna(df.median())
|
||||
v_exp = [np.nan, np.nan, np.nan]
|
||||
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
|
||||
dtype='category')
|
||||
tm.assert_frame_equal(res, df_exp)
|
||||
|
||||
result = df.cats.fillna(np.nan)
|
||||
tm.assert_series_equal(result, df.cats)
|
||||
result = df.vals.fillna(np.nan)
|
||||
tm.assert_series_equal(result, df.vals)
|
||||
|
||||
idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
|
||||
'2011-01-01 09:00', pd.NaT, pd.NaT])
|
||||
df = DataFrame({'a': Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||||
|
||||
idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
|
||||
pd.NaT, pd.NaT], freq='M')
|
||||
df = DataFrame({'a': Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||||
|
||||
idx = pd.TimedeltaIndex(['1 days', '2 days',
|
||||
'1 days', pd.NaT, pd.NaT])
|
||||
df = DataFrame({'a': Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||||
|
||||
def test_fillna_downcast(self):
|
||||
# GH 15277
|
||||
# infer int64 from float64
|
||||
df = pd.DataFrame({'a': [1., np.nan]})
|
||||
result = df.fillna(0, downcast='infer')
|
||||
expected = pd.DataFrame({'a': [1, 0]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# infer int64 from float64 when fillna value is a dict
|
||||
df = pd.DataFrame({'a': [1., np.nan]})
|
||||
result = df.fillna({'a': 0}, downcast='infer')
|
||||
expected = pd.DataFrame({'a': [1, 0]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_dtype_conversion(self):
|
||||
# make sure that fillna on an empty frame works
|
||||
df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
|
||||
result = df.get_dtype_counts().sort_values()
|
||||
expected = Series({'object': 5})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df.fillna(1)
|
||||
expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
|
||||
result = result.get_dtype_counts().sort_values()
|
||||
expected = Series({'int64': 5})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# empty block
|
||||
df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
|
||||
result = df.fillna('nan')
|
||||
expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# equiv of replace
|
||||
df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
|
||||
for v in ['', 1, np.nan, 1.0]:
|
||||
expected = df.replace(np.nan, v)
|
||||
result = df.fillna(v)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_datetime_columns(self):
|
||||
# GH 7095
|
||||
df = pd.DataFrame({'A': [-1, -2, np.nan],
|
||||
'B': date_range('20130101', periods=3),
|
||||
'C': ['foo', 'bar', None],
|
||||
'D': ['foo2', 'bar2', None]},
|
||||
index=date_range('20130110', periods=3))
|
||||
result = df.fillna('?')
|
||||
expected = pd.DataFrame({'A': [-1, -2, '?'],
|
||||
'B': date_range('20130101', periods=3),
|
||||
'C': ['foo', 'bar', '?'],
|
||||
'D': ['foo2', 'bar2', '?']},
|
||||
index=date_range('20130110', periods=3))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = pd.DataFrame({'A': [-1, -2, np.nan],
|
||||
'B': [pd.Timestamp('2013-01-01'),
|
||||
pd.Timestamp('2013-01-02'), pd.NaT],
|
||||
'C': ['foo', 'bar', None],
|
||||
'D': ['foo2', 'bar2', None]},
|
||||
index=date_range('20130110', periods=3))
|
||||
result = df.fillna('?')
|
||||
expected = pd.DataFrame({'A': [-1, -2, '?'],
|
||||
'B': [pd.Timestamp('2013-01-01'),
|
||||
pd.Timestamp('2013-01-02'), '?'],
|
||||
'C': ['foo', 'bar', '?'],
|
||||
'D': ['foo2', 'bar2', '?']},
|
||||
index=pd.date_range('20130110', periods=3))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_ffill(self):
|
||||
self.tsframe['A'][:5] = np.nan
|
||||
self.tsframe['A'][-5:] = np.nan
|
||||
|
||||
assert_frame_equal(self.tsframe.ffill(),
|
||||
self.tsframe.fillna(method='ffill'))
|
||||
|
||||
def test_bfill(self):
|
||||
self.tsframe['A'][:5] = np.nan
|
||||
self.tsframe['A'][-5:] = np.nan
|
||||
|
||||
assert_frame_equal(self.tsframe.bfill(),
|
||||
self.tsframe.fillna(method='bfill'))
|
||||
|
||||
def test_frame_pad_backfill_limit(self):
|
||||
index = np.arange(10)
|
||||
df = DataFrame(np.random.randn(10, 4), index=index)
|
||||
|
||||
result = df[:2].reindex(index, method='pad', limit=5)
|
||||
|
||||
expected = df[:2].reindex(index).fillna(method='pad')
|
||||
expected.values[-3:] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df[-2:].reindex(index, method='backfill', limit=5)
|
||||
|
||||
expected = df[-2:].reindex(index).fillna(method='backfill')
|
||||
expected.values[:3] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_frame_fillna_limit(self):
|
||||
index = np.arange(10)
|
||||
df = DataFrame(np.random.randn(10, 4), index=index)
|
||||
|
||||
result = df[:2].reindex(index)
|
||||
result = result.fillna(method='pad', limit=5)
|
||||
|
||||
expected = df[:2].reindex(index).fillna(method='pad')
|
||||
expected.values[-3:] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df[-2:].reindex(index)
|
||||
result = result.fillna(method='backfill', limit=5)
|
||||
|
||||
expected = df[-2:].reindex(index).fillna(method='backfill')
|
||||
expected.values[:3] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_skip_certain_blocks(self):
|
||||
# don't try to fill boolean, int blocks
|
||||
|
||||
df = DataFrame(np.random.randn(10, 4).astype(int))
|
||||
|
||||
# it works!
|
||||
df.fillna(np.nan)
|
||||
|
||||
def test_fillna_inplace(self):
|
||||
df = DataFrame(np.random.randn(10, 4))
|
||||
df[1][:4] = np.nan
|
||||
df[3][-4:] = np.nan
|
||||
|
||||
expected = df.fillna(value=0)
|
||||
assert expected is not df
|
||||
|
||||
df.fillna(value=0, inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
expected = df.fillna(value={0: 0}, inplace=True)
|
||||
assert expected is None
|
||||
|
||||
df[1][:4] = np.nan
|
||||
df[3][-4:] = np.nan
|
||||
expected = df.fillna(method='ffill')
|
||||
assert expected is not df
|
||||
|
||||
df.fillna(method='ffill', inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_fillna_dict_series(self):
|
||||
df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
|
||||
'b': [1, 2, 3, np.nan, np.nan],
|
||||
'c': [np.nan, 1, 2, 3, 4]})
|
||||
|
||||
result = df.fillna({'a': 0, 'b': 5})
|
||||
|
||||
expected = df.copy()
|
||||
expected['a'] = expected['a'].fillna(0)
|
||||
expected['b'] = expected['b'].fillna(5)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# it works
|
||||
result = df.fillna({'a': 0, 'b': 5, 'd': 7})
|
||||
|
||||
# Series treated same as dict
|
||||
result = df.fillna(df.max())
|
||||
expected = df.fillna(df.max().to_dict())
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# disable this for now
|
||||
with pytest.raises(NotImplementedError, match='column by column'):
|
||||
df.fillna(df.max(1), axis=1)
|
||||
|
||||
def test_fillna_dataframe(self):
|
||||
# GH 8377
|
||||
df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
|
||||
'b': [1, 2, 3, np.nan, np.nan],
|
||||
'c': [np.nan, 1, 2, 3, 4]},
|
||||
index=list('VWXYZ'))
|
||||
|
||||
# df2 may have different index and columns
|
||||
df2 = DataFrame({'a': [np.nan, 10, 20, 30, 40],
|
||||
'b': [50, 60, 70, 80, 90],
|
||||
'foo': ['bar'] * 5},
|
||||
index=list('VWXuZ'))
|
||||
|
||||
result = df.fillna(df2)
|
||||
|
||||
# only those columns and indices which are shared get filled
|
||||
expected = DataFrame({'a': [np.nan, 1, 2, np.nan, 40],
|
||||
'b': [1, 2, 3, np.nan, 90],
|
||||
'c': [np.nan, 1, 2, 3, 4]},
|
||||
index=list('VWXYZ'))
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_columns(self):
|
||||
df = DataFrame(np.random.randn(10, 10))
|
||||
df.values[:, ::2] = np.nan
|
||||
|
||||
result = df.fillna(method='ffill', axis=1)
|
||||
expected = df.T.fillna(method='pad').T
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df.insert(6, 'foo', 5)
|
||||
result = df.fillna(method='ffill', axis=1)
|
||||
expected = df.astype(float).fillna(method='ffill', axis=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_invalid_method(self):
|
||||
with pytest.raises(ValueError, match='ffil'):
|
||||
self.frame.fillna(method='ffil')
|
||||
|
||||
def test_fillna_invalid_value(self):
|
||||
# list
|
||||
pytest.raises(TypeError, self.frame.fillna, [1, 2])
|
||||
# tuple
|
||||
pytest.raises(TypeError, self.frame.fillna, (1, 2))
|
||||
# frame with series
|
||||
pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame)
|
||||
|
||||
def test_fillna_col_reordering(self):
|
||||
cols = ["COL." + str(i) for i in range(5, 0, -1)]
|
||||
data = np.random.rand(20, 5)
|
||||
df = DataFrame(index=lrange(20), columns=cols, data=data)
|
||||
filled = df.fillna(method='ffill')
|
||||
assert df.columns.tolist() == filled.columns.tolist()
|
||||
|
||||
def test_fill_corner(self):
|
||||
mf = self.mixed_frame
|
||||
mf.loc[mf.index[5:20], 'foo'] = np.nan
|
||||
mf.loc[mf.index[-10:], 'A'] = np.nan
|
||||
|
||||
filled = self.mixed_frame.fillna(value=0)
|
||||
assert (filled.loc[filled.index[5:20], 'foo'] == 0).all()
|
||||
del self.mixed_frame['foo']
|
||||
|
||||
empty_float = self.frame.reindex(columns=[])
|
||||
|
||||
# TODO(wesm): unused?
|
||||
result = empty_float.fillna(value=0) # noqa
|
||||
|
||||
def test_fill_value_when_combine_const(self):
|
||||
# GH12723
|
||||
dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
|
||||
df = DataFrame({'foo': dat}, index=range(6))
|
||||
|
||||
exp = df.fillna(0).add(2)
|
||||
res = df.add(2, fill_value=0)
|
||||
assert_frame_equal(res, exp)
|
||||
|
||||
|
||||
class TestDataFrameInterpolate(TestData):
|
||||
|
||||
def test_interp_basic(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||||
'B': [1, 4, 9, np.nan],
|
||||
'C': [1, 2, 3, 5],
|
||||
'D': list('abcd')})
|
||||
expected = DataFrame({'A': [1., 2., 3., 4.],
|
||||
'B': [1., 4., 9., 9.],
|
||||
'C': [1, 2, 3, 5],
|
||||
'D': list('abcd')})
|
||||
result = df.interpolate()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.set_index('C').interpolate()
|
||||
expected = df.set_index('C')
|
||||
expected.loc[3, 'A'] = 3
|
||||
expected.loc[5, 'B'] = 9
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_bad_method(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||||
'B': [1, 4, 9, np.nan],
|
||||
'C': [1, 2, 3, 5],
|
||||
'D': list('abcd')})
|
||||
with pytest.raises(ValueError):
|
||||
df.interpolate(method='not_a_method')
|
||||
|
||||
def test_interp_combo(self):
|
||||
df = DataFrame({'A': [1., 2., np.nan, 4.],
|
||||
'B': [1, 4, 9, np.nan],
|
||||
'C': [1, 2, 3, 5],
|
||||
'D': list('abcd')})
|
||||
|
||||
result = df['A'].interpolate()
|
||||
expected = Series([1., 2., 3., 4.], name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df['A'].interpolate(downcast='infer')
|
||||
expected = Series([1, 2, 3, 4], name='A')
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_interp_nan_idx(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
|
||||
df = df.set_index('A')
|
||||
with pytest.raises(NotImplementedError):
|
||||
df.interpolate(method='values')
|
||||
|
||||
@td.skip_if_no_scipy
|
||||
def test_interp_various(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
|
||||
'C': [1, 2, 3, 5, 8, 13, 21]})
|
||||
df = df.set_index('C')
|
||||
expected = df.copy()
|
||||
result = df.interpolate(method='polynomial', order=1)
|
||||
|
||||
expected.A.loc[3] = 2.66666667
|
||||
expected.A.loc[13] = 5.76923076
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method='cubic')
|
||||
# GH #15662.
|
||||
# new cubic and quadratic interpolation algorithms from scipy 0.19.0.
|
||||
# previously `splmake` was used. See scipy/scipy#6710
|
||||
if _is_scipy_ge_0190:
|
||||
expected.A.loc[3] = 2.81547781
|
||||
expected.A.loc[13] = 5.52964175
|
||||
else:
|
||||
expected.A.loc[3] = 2.81621174
|
||||
expected.A.loc[13] = 5.64146581
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method='nearest')
|
||||
expected.A.loc[3] = 2
|
||||
expected.A.loc[13] = 5
|
||||
assert_frame_equal(result, expected, check_dtype=False)
|
||||
|
||||
result = df.interpolate(method='quadratic')
|
||||
if _is_scipy_ge_0190:
|
||||
expected.A.loc[3] = 2.82150771
|
||||
expected.A.loc[13] = 6.12648668
|
||||
else:
|
||||
expected.A.loc[3] = 2.82533638
|
||||
expected.A.loc[13] = 6.02817974
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method='slinear')
|
||||
expected.A.loc[3] = 2.66666667
|
||||
expected.A.loc[13] = 5.76923077
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method='zero')
|
||||
expected.A.loc[3] = 2.
|
||||
expected.A.loc[13] = 5
|
||||
assert_frame_equal(result, expected, check_dtype=False)
|
||||
|
||||
@td.skip_if_no_scipy
|
||||
def test_interp_alt_scipy(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
|
||||
'C': [1, 2, 3, 5, 8, 13, 21]})
|
||||
result = df.interpolate(method='barycentric')
|
||||
expected = df.copy()
|
||||
expected.loc[2, 'A'] = 3
|
||||
expected.loc[5, 'A'] = 6
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(method='barycentric', downcast='infer')
|
||||
assert_frame_equal(result, expected.astype(np.int64))
|
||||
|
||||
result = df.interpolate(method='krogh')
|
||||
expectedk = df.copy()
|
||||
expectedk['A'] = expected['A']
|
||||
assert_frame_equal(result, expectedk)
|
||||
|
||||
_skip_if_no_pchip()
|
||||
import scipy
|
||||
result = df.interpolate(method='pchip')
|
||||
expected.loc[2, 'A'] = 3
|
||||
|
||||
if LooseVersion(scipy.__version__) >= LooseVersion('0.17.0'):
|
||||
expected.loc[5, 'A'] = 6.0
|
||||
else:
|
||||
expected.loc[5, 'A'] = 6.125
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_rowwise(self):
|
||||
df = DataFrame({0: [1, 2, np.nan, 4],
|
||||
1: [2, 3, 4, np.nan],
|
||||
2: [np.nan, 4, 5, 6],
|
||||
3: [4, np.nan, 6, 7],
|
||||
4: [1, 2, 3, 4]})
|
||||
result = df.interpolate(axis=1)
|
||||
expected = df.copy()
|
||||
expected.loc[3, 1] = 5
|
||||
expected.loc[0, 2] = 3
|
||||
expected.loc[1, 3] = 3
|
||||
expected[4] = expected[4].astype(np.float64)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(axis=1, method='values')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.interpolate(axis=0)
|
||||
expected = df.interpolate()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_rowwise_alt(self):
|
||||
df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64],
|
||||
1: [1, 2, 3, 4, 3, 2, 1, 0, -1]})
|
||||
df.interpolate(axis=0)
|
||||
|
||||
@pytest.mark.parametrize("check_scipy", [
|
||||
False, pytest.param(True, marks=td.skip_if_no_scipy)
|
||||
])
|
||||
def test_interp_leading_nans(self, check_scipy):
|
||||
df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
|
||||
"B": [np.nan, -3, -3.5, np.nan, -4]})
|
||||
result = df.interpolate()
|
||||
expected = df.copy()
|
||||
expected['B'].loc[3] = -3.75
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
if check_scipy:
|
||||
result = df.interpolate(method='polynomial', order=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_raise_on_only_mixed(self):
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||||
'B': ['a', 'b', 'c', 'd'],
|
||||
'C': [np.nan, 2, 5, 7],
|
||||
'D': [np.nan, np.nan, 9, 9],
|
||||
'E': [1, 2, 3, 4]})
|
||||
with pytest.raises(TypeError):
|
||||
df.interpolate(axis=1)
|
||||
|
||||
def test_interp_raise_on_all_object_dtype(self):
|
||||
# GH 22985
|
||||
df = DataFrame({
|
||||
'A': [1, 2, 3],
|
||||
'B': [4, 5, 6]},
|
||||
dtype='object')
|
||||
msg = ("Cannot interpolate with all object-dtype columns "
|
||||
"in the DataFrame. Try setting at least one "
|
||||
"column to a numeric dtype.")
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.interpolate()
|
||||
|
||||
def test_interp_inplace(self):
|
||||
df = DataFrame({'a': [1., 2., np.nan, 4.]})
|
||||
expected = DataFrame({'a': [1., 2., 3., 4.]})
|
||||
result = df.copy()
|
||||
result['a'].interpolate(inplace=True)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.copy()
|
||||
result['a'].interpolate(inplace=True, downcast='infer')
|
||||
assert_frame_equal(result, expected.astype('int64'))
|
||||
|
||||
def test_interp_inplace_row(self):
|
||||
# GH 10395
|
||||
result = DataFrame({'a': [1., 2., 3., 4.],
|
||||
'b': [np.nan, 2., 3., 4.],
|
||||
'c': [3, 2, 2, 2]})
|
||||
expected = result.interpolate(method='linear', axis=1, inplace=False)
|
||||
result.interpolate(method='linear', axis=1, inplace=True)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_interp_ignore_all_good(self):
|
||||
# GH
|
||||
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||||
'B': [1, 2, 3, 4],
|
||||
'C': [1., 2., np.nan, 4.],
|
||||
'D': [1., 2., 3., 4.]})
|
||||
expected = DataFrame({'A': np.array(
|
||||
[1, 2, 3, 4], dtype='float64'),
|
||||
'B': np.array(
|
||||
[1, 2, 3, 4], dtype='int64'),
|
||||
'C': np.array(
|
||||
[1., 2., 3, 4.], dtype='float64'),
|
||||
'D': np.array(
|
||||
[1., 2., 3., 4.], dtype='float64')})
|
||||
|
||||
result = df.interpolate(downcast=None)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# all good
|
||||
result = df[['B', 'D']].interpolate(downcast=None)
|
||||
assert_frame_equal(result, df[['B', 'D']])
|
||||
@@ -0,0 +1,280 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import PY36, lrange, range
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex, Series
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
# Column add, remove, delete.
|
||||
|
||||
|
||||
class TestDataFrameMutateColumns(TestData):
|
||||
|
||||
def test_assign(self):
|
||||
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
||||
original = df.copy()
|
||||
result = df.assign(C=df.B / df.A)
|
||||
expected = df.copy()
|
||||
expected['C'] = [4, 2.5, 2]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# lambda syntax
|
||||
result = df.assign(C=lambda x: x.B / x.A)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# original is unmodified
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
# Non-Series array-like
|
||||
result = df.assign(C=[4, 2.5, 2])
|
||||
assert_frame_equal(result, expected)
|
||||
# original is unmodified
|
||||
assert_frame_equal(df, original)
|
||||
|
||||
result = df.assign(B=df.B / df.A)
|
||||
expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# overwrite
|
||||
result = df.assign(A=df.A + df.B)
|
||||
expected = df.copy()
|
||||
expected['A'] = [5, 7, 9]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# lambda
|
||||
result = df.assign(A=lambda x: x.A + x.B)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_multiple(self):
|
||||
df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B'])
|
||||
result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
|
||||
expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5],
|
||||
[3, 6, 9, 3, 6]], columns=list('ABCDE'))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_order(self):
|
||||
# GH 9818
|
||||
df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
|
||||
result = df.assign(D=df.A + df.B, C=df.A - df.B)
|
||||
|
||||
if PY36:
|
||||
expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]],
|
||||
columns=list('ABDC'))
|
||||
else:
|
||||
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
|
||||
columns=list('ABCD'))
|
||||
assert_frame_equal(result, expected)
|
||||
result = df.assign(C=df.A - df.B, D=df.A + df.B)
|
||||
|
||||
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
|
||||
columns=list('ABCD'))
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_bad(self):
|
||||
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
||||
|
||||
# non-keyword argument
|
||||
with pytest.raises(TypeError):
|
||||
df.assign(lambda x: x.A)
|
||||
with pytest.raises(AttributeError):
|
||||
df.assign(C=df.A, D=df.A + df.C)
|
||||
|
||||
@pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python
|
||||
3.6 and above""")
|
||||
def test_assign_dependent_old_python(self):
|
||||
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
||||
|
||||
# Key C does not exist at definition time of df
|
||||
with pytest.raises(KeyError):
|
||||
df.assign(C=lambda df: df.A,
|
||||
D=lambda df: df['A'] + df['C'])
|
||||
with pytest.raises(KeyError):
|
||||
df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
|
||||
|
||||
@pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for
python 3.5 and below""")
def test_assign_dependent(self):
    """On 3.6+, a later kwarg may reference a column assigned earlier."""
    frame = DataFrame({'A': [1, 2], 'B': [3, 4]})
    # both call styles must produce the same dependent result
    expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
                         columns=list('ABCD'))

    result = frame.assign(C=frame.A, D=lambda x: x['A'] + x['C'])
    assert_frame_equal(result, expected)

    result = frame.assign(C=lambda d: d.A,
                          D=lambda d: d['A'] + d['C'])
    assert_frame_equal(result, expected)
|
||||
|
||||
def test_insert_error_msmgs(self):
    """Descriptive error messages for invalid column insertion."""
    # GH 7432: assigning from a duplicate-labelled index must raise
    df = DataFrame({'foo': ['a', 'b', 'c'],
                    'bar': [1, 2, 3],
                    'baz': ['d', 'e', 'f']}).set_index('foo')
    other = DataFrame({'foo': ['a', 'b', 'c', 'a'],
                       'fiz': ['g', 'h', 'i', 'j']}).set_index('foo')
    with pytest.raises(ValueError,
                       match='cannot reindex from a duplicate axis'):
        df['newcol'] = other

    # GH 4107: a groupby result whose index does not line up with the
    # frame gives a descriptive TypeError
    df = DataFrame(np.random.randint(0, 2, (4, 4)),
                   columns=['a', 'b', 'c', 'd'])
    msg = 'incompatible index of inserted column with frame index'
    with pytest.raises(TypeError, match=msg):
        df['gr'] = df.groupby(['b', 'c']).count()
|
||||
|
||||
def test_insert_benchmark(self):
    """Repeated insertion of the same column stays correct.

    Adapted from vb_suite/frame_methods/frame_insert_columns.
    """
    nrows, ncols = 10, 5
    df = DataFrame(index=lrange(nrows))
    new_col = np.random.randn(nrows)
    for col in range(ncols):
        df[col] = new_col
    # every inserted column must hold an identical copy of new_col
    expected = DataFrame(np.repeat(new_col, ncols).reshape(nrows, ncols),
                         index=lrange(nrows))
    assert_frame_equal(df, expected)
|
||||
|
||||
def test_insert(self):
    """df.insert places a column at a given position; setitem appends.

    Covers: order of inserted columns, dtype bookkeeping across blocks,
    refusing to overwrite an existing label, preservation of
    ``columns.name``, and overwriting a column seeded from the index
    (GH 13522).
    """
    df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
                   columns=['c', 'b', 'a'])

    # insert at the front; data is a copy of column 'a'
    df.insert(0, 'foo', df['a'])
    tm.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a']))
    tm.assert_series_equal(df['a'], df['foo'], check_names=False)

    # insert in the middle
    df.insert(2, 'bar', df['c'])
    tm.assert_index_equal(df.columns,
                          Index(['foo', 'c', 'bar', 'b', 'a']))
    tm.assert_almost_equal(df['c'], df['bar'], check_names=False)

    # diff dtype

    # new item
    df['x'] = df['a'].astype('float32')
    result = Series(dict(float32=1, float64=5))
    assert (df.get_dtype_counts().sort_index() == result).all()

    # replacing current (in different block)
    df['a'] = df['a'].astype('float32')
    result = Series(dict(float32=2, float64=4))
    assert (df.get_dtype_counts().sort_index() == result).all()

    df['y'] = df['a'].astype('int32')
    result = Series(dict(float32=2, float64=4, int32=1))
    assert (df.get_dtype_counts().sort_index() == result).all()

    # inserting an existing label without allow_duplicates must raise
    with pytest.raises(ValueError, match='already exists'):
        df.insert(1, 'a', df['b'])
    pytest.raises(ValueError, df.insert, 1, 'c', df['b'])

    df.columns.name = 'some_name'
    # preserve columns name field
    df.insert(0, 'baz', df['c'])
    assert df.columns.name == 'some_name'

    # GH 13522: overwriting a column that was seeded from the index
    df = DataFrame(index=['A', 'B', 'C'])
    df['X'] = df.index
    df['X'] = ['x', 'y', 'z']
    exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
    assert_frame_equal(df, exp)
|
||||
|
||||
def test_delitem(self):
    """``del df[col]`` removes the column from the frame in place."""
    del self.frame['A']
    assert 'A' not in self.frame
|
||||
|
||||
def test_delitem_multiindex(self):
    """Deleting a top level from MultiIndex columns; repeat delete raises."""
    midx = MultiIndex.from_product([['A', 'B'], [1, 2]])
    df = DataFrame(np.random.randn(4, 4), columns=midx)
    assert len(df.columns) == 4
    # both the 1-tuple and the scalar forms resolve the top level
    assert ('A', ) in df.columns
    assert 'A' in df.columns

    # selecting a top level with dup sub-levels yields a frame
    result = df['A']
    assert isinstance(result, DataFrame)
    del df['A']

    assert len(df.columns) == 2

    # A still in the levels, BUT get a KeyError if trying
    # to delete
    assert ('A', ) not in df.columns
    with pytest.raises(KeyError):
        del df[('A',)]

    # behavior of dropped/deleted MultiIndex levels changed from
    # GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
    # levels which are dropped/deleted
    assert 'A' not in df.columns
    with pytest.raises(KeyError):
        del df['A']
|
||||
|
||||
def test_pop(self):
    """df.pop removes a column in place and returns it as a Series."""
    self.frame.columns.name = 'baz'

    self.frame.pop('A')
    assert 'A' not in self.frame

    self.frame['foo'] = 'bar'
    self.frame.pop('foo')
    assert 'foo' not in self.frame
    # popping must not clobber the columns' name attribute
    assert self.frame.columns.name == 'baz'

    # gh-10912: inplace ops cause caching issue
    a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[
        'A', 'B', 'C'], index=['X', 'Y'])
    b = a.pop('B')
    # mutating the popped Series must not write back into the frame
    b += 1

    # original frame
    expected = DataFrame([[1, 3], [4, 6]], columns=[
        'A', 'C'], index=['X', 'Y'])
    tm.assert_frame_equal(a, expected)

    # result
    expected = Series([2, 5], index=['X', 'Y'], name='B') + 1
    tm.assert_series_equal(b, expected)
|
||||
|
||||
def test_pop_non_unique_cols(self):
|
||||
df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
|
||||
df.columns = ["a", "b", "a"]
|
||||
|
||||
res = df.pop("a")
|
||||
assert type(res) == DataFrame
|
||||
assert len(res) == 2
|
||||
assert len(df.columns) == 1
|
||||
assert "b" in df.columns
|
||||
assert "a" not in df.columns
|
||||
assert len(df.index) == 2
|
||||
|
||||
def test_insert_column_bug_4032(self):
|
||||
|
||||
# GH4032, inserting a column and renaming causing errors
|
||||
df = DataFrame({'b': [1.1, 2.2]})
|
||||
df = df.rename(columns={})
|
||||
df.insert(0, 'a', [1, 2])
|
||||
|
||||
result = df.rename(columns={})
|
||||
str(result)
|
||||
expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
|
||||
assert_frame_equal(result, expected)
|
||||
df.insert(0, 'c', [1.3, 2.3])
|
||||
|
||||
result = df.rename(columns={})
|
||||
str(result)
|
||||
|
||||
expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
|
||||
columns=['c', 'a', 'b'])
|
||||
assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,477 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import lrange, u
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, MultiIndex, Series, date_range
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestDataFrameNonuniqueIndexes(TestData):
|
||||
|
||||
def test_column_dups_operations(self):
    """End-to-end operations on frames with duplicated column labels.

    Sequentially exercises: relabelling, setitem/insert/delete,
    consolidation, dup-aware selection, rename after merge, reindex
    rejection, drop, describe, dup-column assignment and arithmetic.
    The statements mutate ``df`` in place, so ordering matters.
    """

    def check(result, expected=None):
        # optional equality check plus smoke-tests of dtypes/repr
        if expected is not None:
            assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    # assignment
    # GH 3687
    arr = np.random.randn(3, 2)
    idx = lrange(2)
    df = DataFrame(arr, columns=['A', 'A'])
    df.columns = idx
    expected = DataFrame(arr, columns=idx)
    check(df, expected)

    idx = date_range('20130101', periods=4, freq='Q-NOV')
    df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                   columns=['a', 'a', 'a', 'a'])
    df.columns = idx
    expected = DataFrame(
        [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
    check(df, expected)

    # insert
    df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                   columns=['foo', 'bar', 'foo', 'hello'])
    df['string'] = 'bah'
    expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                          [2, 1, 3, 5, 'bah']],
                         columns=['foo', 'bar', 'foo', 'hello', 'string'])
    check(df, expected)
    # wrong-length insert must raise
    with pytest.raises(ValueError, match='Length of value'):
        df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

    # insert same dtype
    df['foo2'] = 3
    expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                          [2, 1, 3, 5, 'bah', 3]],
                         columns=['foo', 'bar', 'foo', 'hello',
                                  'string', 'foo2'])
    check(df, expected)

    # set (non-dup)
    df['foo2'] = 4
    expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                          [2, 1, 3, 5, 'bah', 4]],
                         columns=['foo', 'bar', 'foo', 'hello',
                                  'string', 'foo2'])
    check(df, expected)
    df['foo2'] = 3

    # delete (non dup)
    del df['bar']
    expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                          [2, 3, 5, 'bah', 3]],
                         columns=['foo', 'foo', 'hello', 'string', 'foo2'])
    check(df, expected)

    # try to delete again (its not consolidated)
    del df['hello']
    expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                          [2, 3, 'bah', 3]],
                         columns=['foo', 'foo', 'string', 'foo2'])
    check(df, expected)

    # consolidate
    df = df._consolidate()
    expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                          [2, 3, 'bah', 3]],
                         columns=['foo', 'foo', 'string', 'foo2'])
    check(df, expected)

    # insert
    df.insert(2, 'new_col', 5.)
    expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                          [2, 3, 5., 'bah', 3]],
                         columns=['foo', 'foo', 'new_col', 'string',
                                  'foo2'])
    check(df, expected)

    # insert a dup
    with pytest.raises(ValueError, match='cannot insert'):
        df.insert(2, 'new_col', 4.)

    df.insert(2, 'new_col', 4., allow_duplicates=True)
    expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                          [1, 2, 4., 5., 'bah', 3],
                          [2, 3, 4., 5., 'bah', 3]],
                         columns=['foo', 'foo', 'new_col',
                                  'new_col', 'string', 'foo2'])
    check(df, expected)

    # delete (dup)
    del df['foo']
    expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                          [4., 5., 'bah', 3]],
                         columns=['new_col', 'new_col', 'string', 'foo2'])
    assert_frame_equal(df, expected)

    # dup across dtypes
    df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                   columns=['foo', 'bar', 'foo', 'hello'])
    check(df)

    df['foo2'] = 7.
    expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                          [2, 1, 3., 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    # selecting the dup label returns both columns (int and float)
    result = df['foo']
    expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                         columns=['foo', 'foo'])
    check(result, expected)

    # multiple replacements
    df['foo'] = 'string'
    expected = DataFrame([['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    del df['foo']
    expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
        'bar', 'hello', 'foo2'])
    check(df, expected)

    # values
    df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
    result = df.values
    expected = np.array([[1, 2.5], [3, 4.5]])
    assert (result == expected).all().all()

    # rename, GH 4403
    df4 = DataFrame(
        {'RT': [0.0454],
         'TClose': [22.02],
         'TExg': [0.0422]},
        index=MultiIndex.from_tuples([(600809, 20130331)],
                                     names=['STK_ID', 'RPT_Date']))

    df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331],
                     'STK_ID': [600809] * 3,
                     'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                     'TClose': [38.05, 41.66, 30.01]},
                    index=MultiIndex.from_tuples(
                        [(600809, 20120930),
                         (600809, 20121231),
                         (600809, 20130331)],
                        names=['STK_ID', 'RPT_Date']))

    # merge suffixes TClose -> TClose_x/_y; rename one copy back
    k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
    result = k.rename(
        columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
    str(result)
    result.dtypes

    expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                            u('饡驦'), 30.01]],
                          columns=['RT', 'TClose', 'TExg',
                                   'RPT_Date', 'STK_ID', 'STK_Name',
                                   'QT_Close'])
                .set_index(['STK_ID', 'RPT_Date'], drop=False))
    assert_frame_equal(result, expected)

    # reindex is invalid!
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    pytest.raises(ValueError, df.reindex, columns=['bar'])
    pytest.raises(ValueError, df.reindex, columns=['bar', 'foo'])

    # drop
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    result = df.drop(['a'], axis=1)
    expected = DataFrame([[1], [1], [1]], columns=['bar'])
    check(result, expected)
    result = df.drop('a', axis=1)
    check(result, expected)

    # describe
    df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                   columns=['bar', 'a', 'a'], dtype='float64')
    result = df.describe()
    s = df.iloc[:, 0].describe()
    expected = pd.concat([s, s, s], keys=df.columns, axis=1)
    check(result, expected)

    # check column dups with index equal and not equal to df's index
    df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'A'])
    for index in [df.index, pd.Index(list('edcba'))]:
        this_df = df.copy()
        expected_ser = pd.Series(index.values, index=this_df.index)
        expected_df = DataFrame({'A': expected_ser,
                                 'B': this_df['B'],
                                 'A': expected_ser},
                                columns=['A', 'B', 'A'])
        this_df['A'] = index
        check(this_df, expected_df)

    # operations
    for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
        df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
        expected = getattr(df, op)(df)
        expected.columns = ['A', 'A']
        df.columns = ['A', 'A']
        result = getattr(df, op)(df)
        check(result, expected)

    # multiple assignments that change dtypes
    # the location indexer is a slice
    # GH 6120
    df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
    expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

    df['that'] = 1.0
    check(df, expected)

    df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
    expected = DataFrame(1, index=range(5), columns=['that', 'that'])

    df['that'] = 1
    check(df, expected)
|
||||
|
||||
def test_column_dups2(self):
|
||||
|
||||
# drop buggy GH 6240
|
||||
df = DataFrame({'A': np.random.randn(5),
|
||||
'B': np.random.randn(5),
|
||||
'C': np.random.randn(5),
|
||||
'D': ['a', 'b', 'c', 'd', 'e']})
|
||||
|
||||
expected = df.take([0, 1, 1], axis=1)
|
||||
df2 = df.take([2, 0, 1, 2, 1], axis=1)
|
||||
result = df2.drop('C', axis=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# dropna
|
||||
df = DataFrame({'A': np.random.randn(5),
|
||||
'B': np.random.randn(5),
|
||||
'C': np.random.randn(5),
|
||||
'D': ['a', 'b', 'c', 'd', 'e']})
|
||||
df.iloc[2, [0, 1, 2]] = np.nan
|
||||
df.iloc[0, 0] = np.nan
|
||||
df.iloc[1, 1] = np.nan
|
||||
df.iloc[:, 3] = np.nan
|
||||
expected = df.dropna(subset=['A', 'B', 'C'], how='all')
|
||||
expected.columns = ['A', 'A', 'B', 'C']
|
||||
|
||||
df.columns = ['A', 'A', 'B', 'C']
|
||||
|
||||
result = df.dropna(subset=['A', 'C'], how='all')
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_column_dups_indexing(self):
    """Indexing, alignment and comparison with duplicated column labels."""

    def check(result, expected=None):
        # optional equality check plus smoke-tests of dtypes/repr
        if expected is not None:
            assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    # boolean indexing
    # GH 4879
    dups = ['A', 'A', 'C', 'D']
    df = DataFrame(np.arange(12).reshape(3, 4), columns=[
        'A', 'B', 'C', 'D'], dtype='float64')
    expected = df[df.C > 6]
    expected.columns = dups
    df = DataFrame(np.arange(12).reshape(3, 4),
                   columns=dups, dtype='float64')
    result = df[df.C > 6]
    check(result, expected)

    # where
    df = DataFrame(np.arange(12).reshape(3, 4), columns=[
        'A', 'B', 'C', 'D'], dtype='float64')
    expected = df[df > 6]
    expected.columns = dups
    df = DataFrame(np.arange(12).reshape(3, 4),
                   columns=dups, dtype='float64')
    result = df[df > 6]
    check(result, expected)

    # boolean with the duplicate raises
    df = DataFrame(np.arange(12).reshape(3, 4),
                   columns=dups, dtype='float64')
    pytest.raises(ValueError, lambda: df[df.A > 6])

    # dup aligining operations should work
    # GH 5185
    df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
    df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
    expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
    result = df1.sub(df2)
    assert_frame_equal(result, expected)

    # equality
    df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
                    columns=['A', 'B'])
    df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
                    columns=['A', 'A'])

    # not-comparing like-labelled
    pytest.raises(ValueError, lambda: df1 == df2)

    # after reindexing to matching (dup) labels, comparison works
    df1r = df1.reindex_like(df2)
    result = df1r == df2
    expected = DataFrame([[False, True], [True, False], [False, False], [
        True, False]], columns=['A', 'A'])
    assert_frame_equal(result, expected)

    # mixed column selection
    # GH 5639
    dfbool = DataFrame({'one': Series([True, True, False],
                                      index=['a', 'b', 'c']),
                        'two': Series([False, False, True, False],
                                      index=['a', 'b', 'c', 'd']),
                        'three': Series([False, True, True, True],
                                        index=['a', 'b', 'c', 'd'])})
    expected = pd.concat(
        [dfbool['one'], dfbool['three'], dfbool['one']], axis=1)
    result = dfbool[['one', 'three', 'one']]
    check(result, expected)

    # multi-axis dups
    # GH 6121
    df = DataFrame(np.arange(25.).reshape(5, 5),
                   index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'C', 'D', 'E'])
    z = df[['A', 'C', 'A']].copy()
    expected = z.loc[['a', 'c', 'a']]

    df = DataFrame(np.arange(25.).reshape(5, 5),
                   index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'C', 'D', 'E'])
    z = df[['A', 'C', 'A']]
    result = z.loc[['a', 'c', 'a']]
    check(result, expected)
|
||||
|
||||
def test_column_dups_indexing2(self):
|
||||
|
||||
# GH 8363
|
||||
# datetime ops with a non-unique index
|
||||
df = DataFrame({'A': np.arange(5, dtype='int64'),
|
||||
'B': np.arange(1, 6, dtype='int64')},
|
||||
index=[2, 2, 3, 3, 4])
|
||||
result = df.B - df.A
|
||||
expected = Series(1, index=[2, 2, 3, 3, 4])
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({'A': date_range('20130101', periods=5),
|
||||
'B': date_range('20130101 09:00:00', periods=5)},
|
||||
index=[2, 2, 3, 3, 4])
|
||||
result = df.B - df.A
|
||||
expected = Series(pd.Timedelta('9 hours'), index=[2, 2, 3, 3, 4])
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_columns_with_dups(self):
    """Relabelling and constructing frames with duplicate column labels.

    GH 3468 related; also covers dup columns spread across dtype blocks
    and GH 2079/2194 (dup columns across dtypes).
    """
    # basic: rename dup labels to unique ones
    df = DataFrame([[1, 2]], columns=['a', 'a'])
    df.columns = ['a', 'a.1']
    str(df)
    expected = DataFrame([[1, 2]], columns=['a', 'a.1'])
    assert_frame_equal(df, expected)

    df = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a'])
    df.columns = ['b', 'a', 'a.1']
    str(df)
    expected = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a.1'])
    assert_frame_equal(df, expected)

    # with a dup index
    df = DataFrame([[1, 2]], columns=['a', 'a'])
    df.columns = ['b', 'b']
    str(df)
    expected = DataFrame([[1, 2]], columns=['b', 'b'])
    assert_frame_equal(df, expected)

    # multi-dtype
    df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                   columns=['a', 'a', 'b', 'b', 'd', 'c', 'c'])
    df.columns = list('ABCDEFG')
    str(df)
    expected = DataFrame(
        [[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('ABCDEFG'))
    assert_frame_equal(df, expected)

    # this is an error because we cannot disambiguate the dup columns
    # BUG FIX: the callable handed to pytest.raises must take NO
    # arguments.  The previous ``lambda x: ...`` raised TypeError when
    # invoked with zero args, so the check passed vacuously without ever
    # running the DataFrame constructor.
    pytest.raises(Exception, lambda: DataFrame(
        [[1, 2, 'foo', 'bar']], columns=['a', 'a', 'a', 'a']))

    # dups across blocks
    df_float = DataFrame(np.random.randn(10, 3), dtype='float64')
    df_int = DataFrame(np.random.randn(10, 3), dtype='int64')
    df_bool = DataFrame(True, index=df_float.index,
                        columns=df_float.columns)
    df_object = DataFrame('foo', index=df_float.index,
                          columns=df_float.columns)
    df_dt = DataFrame(pd.Timestamp('20010101'),
                      index=df_float.index,
                      columns=df_float.columns)
    df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

    # internal block maps must stay in sync with the duplicated columns
    assert len(df._data._blknos) == len(df.columns)
    assert len(df._data._blklocs) == len(df.columns)

    # testing iloc: every positional column is reachable
    for i in range(len(df.columns)):
        df.iloc[:, i]

    # dup columns across dtype GH 2079/2194
    vals = [[1, -1, 2.], [2, -2, 3.]]
    rs = DataFrame(vals, columns=['A', 'A', 'B'])
    xp = DataFrame(vals)
    xp.columns = ['A', 'A', 'B']
    assert_frame_equal(rs, xp)
|
||||
|
||||
def test_values_duplicates(self):
    """.values on a dup-labelled, mixed-dtype frame gives object array."""
    df = DataFrame([[1, 2, 'a', 'b'],
                    [1, 2, 'a', 'b']],
                   columns=['one', 'one', 'two', 'two'])
    # mixed int/str columns collapse to a single object-dtype ndarray
    expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']],
                        dtype=object)
    tm.assert_numpy_array_equal(df.values, expected)
|
||||
|
||||
def test_set_value_by_index(self):
|
||||
# See gh-12344
|
||||
df = DataFrame(np.arange(9).reshape(3, 3).T)
|
||||
df.columns = list('AAA')
|
||||
expected = df.iloc[:, 2]
|
||||
|
||||
df.iloc[:, 0] = 3
|
||||
assert_series_equal(df.iloc[:, 2], expected)
|
||||
|
||||
df = DataFrame(np.arange(9).reshape(3, 3).T)
|
||||
df.columns = [2, float(2), str(2)]
|
||||
expected = df.iloc[:, 1]
|
||||
|
||||
df.iloc[:, 0] = 3
|
||||
assert_series_equal(df.iloc[:, 1], expected)
|
||||
|
||||
def test_insert_with_columns_dups(self):
|
||||
# GH 14291
|
||||
df = pd.DataFrame()
|
||||
df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True)
|
||||
df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True)
|
||||
df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True)
|
||||
exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'],
|
||||
['c', 'f', 'i']], columns=['A', 'A', 'A'])
|
||||
assert_frame_equal(df, exp)
|
||||
@@ -0,0 +1,802 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from decimal import Decimal
|
||||
import operator
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import range
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, MultiIndex, Series, compat
|
||||
import pandas.core.common as com
|
||||
from pandas.tests.frame.common import TestData, _check_mixed_float
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (
|
||||
assert_frame_equal, assert_numpy_array_equal, assert_series_equal)
|
||||
|
||||
|
||||
class TestDataFrameUnaryOperators(object):
|
||||
# __pos__, __neg__, __inv__
|
||||
|
||||
@pytest.mark.parametrize('df,expected', [
    (pd.DataFrame({'a': [-1, 1]}), pd.DataFrame({'a': [1, -1]})),
    (pd.DataFrame({'a': [False, True]}),
     pd.DataFrame({'a': [True, False]})),
    (pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}),
     pd.DataFrame({'a': pd.Series(pd.to_timedelta([1, -1]))}))
])
def test_neg_numeric(self, df, expected):
    """Unary minus negates numeric/bool/timedelta frames and columns."""
    negated = -df
    assert_frame_equal(negated, expected)
    assert_series_equal(negated['a'], expected['a'])
|
||||
|
||||
@pytest.mark.parametrize('df, expected', [
    (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)),
    ([Decimal('1.0'), Decimal('2.0')], [Decimal('-1.0'), Decimal('-2.0')]),
])
def test_neg_object(self, df, expected):
    """GH#21380: unary minus works element-wise on object dtype."""
    frame = pd.DataFrame({'a': df})
    expected_frame = pd.DataFrame({'a': expected})
    assert_frame_equal(-frame, expected_frame)
    assert_series_equal(-frame['a'], expected_frame['a'])
|
||||
|
||||
@pytest.mark.parametrize('df', [
    pd.DataFrame({'a': ['a', 'b']}),
    pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}),
])
def test_neg_raises(self, df):
    """Unary minus is undefined for string and datetime data."""
    with pytest.raises(TypeError):
        df.__neg__()
    with pytest.raises(TypeError):
        df['a'].__neg__()
|
||||
|
||||
def test_invert(self):
    """``~`` and unary minus agree on boolean frames."""
    df = pd.DataFrame(tm.getSeriesData())
    mask = df < 0
    assert_frame_equal(-mask, ~mask)
|
||||
|
||||
@pytest.mark.parametrize('df', [
    pd.DataFrame({'a': [-1, 1]}),
    pd.DataFrame({'a': [False, True]}),
    pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}),
])
def test_pos_numeric(self, df):
    """GH#16073: unary plus is the identity for numeric-like frames."""
    assert_frame_equal(df.__pos__(), df)
    assert_series_equal(df['a'].__pos__(), df['a'])
|
||||
|
||||
@pytest.mark.parametrize('df', [
    # numpy changing behavior in the future
    pytest.param(pd.DataFrame({'a': ['a', 'b']}),
                 marks=[pytest.mark.filterwarnings("ignore")]),
    pd.DataFrame({'a': np.array([-1, 2], dtype=object)}),
    pd.DataFrame({'a': [Decimal('-1.0'), Decimal('2.0')]}),
])
def test_pos_object(self, df):
    """GH#21380: unary plus is an identity on object-dtype data."""
    assert_frame_equal(df.__pos__(), df)
    assert_series_equal(df['a'].__pos__(), df['a'])
|
||||
|
||||
@pytest.mark.parametrize('df', [
    pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}),
])
def test_pos_raises(self, df):
    """Unary plus is not defined for datetimes."""
    with pytest.raises(TypeError):
        df.__pos__()
    with pytest.raises(TypeError):
        df['a'].__pos__()
|
||||
|
||||
|
||||
class TestDataFrameLogicalOperators(object):
|
||||
# &, |, ^
|
||||
|
||||
def test_logical_ops_empty_frame(self):
    """GH#5808: logical ops on empty / all-NA frames align and no-op."""
    # empty frame, no columns
    empty = DataFrame(index=[1])
    assert_frame_equal(empty & empty, empty)
    assert_frame_equal(empty | empty, empty)

    # alignment against a longer empty frame gives the union shape
    longer = DataFrame(index=[1, 2])
    assert_frame_equal(empty & longer, longer)

    # single all-NA column
    dfa = DataFrame(index=[1], columns=['A'])
    assert_frame_equal(dfa & dfa, dfa)
|
||||
|
||||
def test_logical_ops_bool_frame(self):
|
||||
# GH#5808
|
||||
df1a_bool = DataFrame(True, index=[1], columns=['A'])
|
||||
|
||||
result = df1a_bool & df1a_bool
|
||||
assert_frame_equal(result, df1a_bool)
|
||||
|
||||
result = df1a_bool | df1a_bool
|
||||
assert_frame_equal(result, df1a_bool)
|
||||
|
||||
def test_logical_ops_int_frame(self):
|
||||
# GH#5808
|
||||
df1a_int = DataFrame(1, index=[1], columns=['A'])
|
||||
df1a_bool = DataFrame(True, index=[1], columns=['A'])
|
||||
|
||||
result = df1a_int | df1a_bool
|
||||
assert_frame_equal(result, df1a_int)
|
||||
|
||||
def test_logical_ops_invalid(self):
    """GH#5808: | between bool and float/str frames raises TypeError."""
    bool_frame = DataFrame(True, index=[1], columns=['A'])

    float_frame = DataFrame(1.0, index=[1], columns=['A'])
    with pytest.raises(TypeError):
        float_frame | bool_frame

    str_frame = DataFrame('foo', index=[1], columns=['A'])
    with pytest.raises(TypeError):
        str_frame | bool_frame
|
||||
|
||||
def test_logical_operators(self):
|
||||
|
||||
def _check_bin_op(op):
|
||||
result = op(df1, df2)
|
||||
expected = DataFrame(op(df1.values, df2.values), index=df1.index,
|
||||
columns=df1.columns)
|
||||
assert result.values.dtype == np.bool_
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def _check_unary_op(op):
|
||||
result = op(df1)
|
||||
expected = DataFrame(op(df1.values), index=df1.index,
|
||||
columns=df1.columns)
|
||||
assert result.values.dtype == np.bool_
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = {'a': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True},
|
||||
'b': {'a': False, 'b': True, 'c': False,
|
||||
'd': False, 'e': False},
|
||||
'c': {'a': False, 'b': False, 'c': True,
|
||||
'd': False, 'e': False},
|
||||
'd': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True},
|
||||
'e': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}}
|
||||
|
||||
df2 = {'a': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False},
|
||||
'b': {'a': False, 'b': True, 'c': False,
|
||||
'd': False, 'e': False},
|
||||
'c': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False},
|
||||
'd': {'a': False, 'b': False, 'c': False,
|
||||
'd': True, 'e': False},
|
||||
'e': {'a': False, 'b': False, 'c': False,
|
||||
'd': False, 'e': True}}
|
||||
|
||||
df1 = DataFrame(df1)
|
||||
df2 = DataFrame(df2)
|
||||
|
||||
_check_bin_op(operator.and_)
|
||||
_check_bin_op(operator.or_)
|
||||
_check_bin_op(operator.xor)
|
||||
|
||||
_check_unary_op(operator.inv) # TODO: belongs elsewhere
|
||||
|
||||
def test_logical_with_nas(self):
    """Logical ops involving NaN yield bool results (GH4947 / GH4604)."""
    frame = DataFrame({'a': [np.nan, False], 'b': [True, True]})

    # GH4947: bool comparisons should return bool, with NaN -> False
    result = frame['a'] | frame['b']
    assert_series_equal(result, Series([False, True]))

    # GH4604: filling first lets the OR see real booleans
    result = frame['a'].fillna(False) | frame['b']
    assert_series_equal(result, Series([True, True]))

    result = frame['a'].fillna(False, downcast=False) | frame['b']
    assert_series_equal(result, Series([True, True]))
|
||||
|
||||
|
||||
class TestDataFrameOperators(TestData):
|
||||
|
||||
@pytest.mark.parametrize('op', [operator.add, operator.sub,
                                operator.mul, operator.truediv])
def test_operators_none_as_na(self, op):
    """Arithmetic on object frames treats None like NaN."""
    df = DataFrame({"col1": [2, 5.0, 123, None],
                    "col2": [1, 2, 3, 4]}, dtype=object)

    def as_object_with_none(frame):
        # cast back to object and restore None in the NA slots, since
        # filling converts dtypes away from object
        out = frame.astype(object)
        out[com.isna(out)] = None
        return out

    filled = df.fillna(np.nan)

    result = op(df, 3)
    expected = as_object_with_none(op(filled, 3))
    assert_frame_equal(result, expected)

    result = op(df, df)
    expected = as_object_with_none(op(filled, filled))
    assert_frame_equal(result, expected)

    # operating against a fully-filled copy aligns the same way
    result = op(df, df.fillna(7))
    assert_frame_equal(result, expected)

    result = op(df.fillna(7), df)
    assert_frame_equal(result, expected, check_dtype=False)
|
||||
|
||||
@pytest.mark.parametrize('op,res', [('__eq__', False),
                                    ('__ne__', True)])
# TODO: not sure what's correct here.
@pytest.mark.filterwarnings("ignore:elementwise:FutureWarning")
def test_logical_typeerror_with_non_valid(self, op, res):
    """Comparing a float frame against a string compares unequal everywhere."""
    # we are comparing floats vs a string
    result = getattr(self.frame, op)('foo')
    assert bool(result.all().all()) is res
|
||||
|
||||
    def test_binary_ops_align(self):
        """Binary ops with ``level=`` align a Series on one MultiIndex level.

        GH 6681: the result must equal concatenating op(sub-frame, scalar)
        per level value.  GH 9463: alignment by level also works on the
        columns axis, matching levels by position or by name.
        """

        # test aligning binary ops

        # GH 6681
        index = MultiIndex.from_product([list('abc'),
                                         ['one', 'two', 'three'],
                                         [1, 2, 3]],
                                        names=['first', 'second', 'third'])

        df = DataFrame(np.arange(27 * 3).reshape(27, 3),
                       index=index,
                       columns=['value1', 'value2', 'value3']).sort_index()

        idx = pd.IndexSlice
        for op in ['add', 'sub', 'mul', 'div', 'truediv']:
            opa = getattr(operator, op, None)
            if opa is None:
                continue

            x = Series([1.0, 10.0, 100.0], [1, 2, 3])
            result = getattr(df, op)(x, level='third', axis=0)

            # expected: apply op slice-by-slice for each level value of x
            expected = pd.concat([opa(df.loc[idx[:, :, i], :], v)
                                  for i, v in x.iteritems()]).sort_index()
            assert_frame_equal(result, expected)

            x = Series([1.0, 10.0], ['two', 'three'])
            result = getattr(df, op)(x, level='second', axis=0)

            # x covers only part of the level; missing rows become NaN
            expected = (pd.concat([opa(df.loc[idx[:, i], :], v)
                                   for i, v in x.iteritems()])
                        .reindex_like(df).sort_index())
            assert_frame_equal(result, expected)

        # GH9463 (alignment level of dataframe with series)

        midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']])
        df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx)
        s = pd.Series({'a': 1, 'b': 2})

        df2 = df.copy()
        df2.columns.names = ['lvl0', 'lvl1']
        s2 = s.copy()
        s2.index.name = 'lvl1'

        # different cases of integer/string level names:
        res1 = df.mul(s, axis=1, level=1)
        res2 = df.mul(s2, axis=1, level=1)
        res3 = df2.mul(s, axis=1, level=1)
        res4 = df2.mul(s2, axis=1, level=1)
        res5 = df2.mul(s, axis=1, level='lvl1')
        res6 = df2.mul(s2, axis=1, level='lvl1')

        exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'),
                        columns=midx)

        # unnamed column levels on the frame
        for res in [res1, res2]:
            assert_frame_equal(res, exp)

        # named column levels on the frame
        exp.columns.names = ['lvl0', 'lvl1']
        for res in [res3, res4, res5, res6]:
            assert_frame_equal(res, exp)
|
||||
|
||||
def test_dti_tz_convert_to_utc(self):
|
||||
base = pd.DatetimeIndex(['2011-01-01', '2011-01-02',
|
||||
'2011-01-03'], tz='UTC')
|
||||
idx1 = base.tz_convert('Asia/Tokyo')[:2]
|
||||
idx2 = base.tz_convert('US/Eastern')[1:]
|
||||
|
||||
df1 = DataFrame({'A': [1, 2]}, index=idx1)
|
||||
df2 = DataFrame({'A': [1, 1]}, index=idx2)
|
||||
exp = DataFrame({'A': [np.nan, 3, np.nan]}, index=base)
|
||||
assert_frame_equal(df1 + df2, exp)
|
||||
|
||||
    def test_combineFrame(self):
        """Frame + frame aligns on index/columns; missing labels become NaN.

        Also covers empty operands and mixed-dtype upcasting behaviour.
        """
        # every second row, without column 'D', and NaN in the head of 'C'
        frame_copy = self.frame.reindex(self.frame.index[::2])

        del frame_copy['D']
        frame_copy['C'][:5] = np.nan

        added = self.frame + frame_copy

        # rows present in both frames: values double
        indexer = added['A'].dropna().index
        exp = (self.frame['A'] * 2).copy()

        tm.assert_series_equal(added['A'].dropna(), exp.loc[indexer])

        # rows present in only one frame: NaN
        exp.loc[~exp.index.isin(indexer)] = np.nan
        tm.assert_series_equal(added['A'], exp.loc[added['A'].index])

        assert np.isnan(added['C'].reindex(frame_copy.index)[:5]).all()

        # assert(False)

        # column 'D' exists only on the left, so it is all-NaN
        assert np.isnan(added['D']).all()

        self_added = self.frame + self.frame
        tm.assert_index_equal(self_added.index, self.frame.index)

        # alignment is symmetric in which labels end up NaN
        added_rev = frame_copy + self.frame
        assert np.isnan(added['D']).all()
        assert np.isnan(added_rev['D']).all()

        # corner cases

        # empty
        plus_empty = self.frame + self.empty
        assert np.isnan(plus_empty.values).all()

        empty_plus = self.empty + self.frame
        assert np.isnan(empty_plus.values).all()

        empty_empty = self.empty + self.empty
        assert empty_empty.empty

        # out of order
        reverse = self.frame.reindex(columns=self.frame.columns[::-1])

        assert_frame_equal(reverse + self.frame, self.frame * 2)

        # mix vs float64, upcast
        added = self.frame + self.mixed_float
        _check_mixed_float(added, dtype='float64')
        added = self.mixed_float + self.frame
        _check_mixed_float(added, dtype='float64')

        # mix vs mix
        added = self.mixed_float + self.mixed_float2
        _check_mixed_float(added, dtype=dict(C=None))
        added = self.mixed_float2 + self.mixed_float
        _check_mixed_float(added, dtype=dict(C=None))

        # with int
        added = self.frame + self.mixed_int
        _check_mixed_float(added, dtype='float64')
|
||||
|
||||
    def test_combineSeries(self):
        """Frame + Series broadcasts over columns; row-wise needs axis='index'."""

        # Series
        series = self.frame.xs(self.frame.index[0])

        added = self.frame + series

        # default broadcasting matches series entries to frame columns
        for key, s in compat.iteritems(added):
            assert_series_equal(s, self.frame[key] + series[key])

        # a series with an extra label yields an all-NaN extra column
        larger_series = series.to_dict()
        larger_series['E'] = 1
        larger_series = Series(larger_series)
        larger_added = self.frame + larger_series

        for key, s in compat.iteritems(self.frame):
            assert_series_equal(larger_added[key], s + series[key])
        assert 'E' in larger_added
        assert np.isnan(larger_added['E']).all()

        # no upcast needed
        added = self.mixed_float + series
        _check_mixed_float(added)

        # vs mix (upcast) as needed
        added = self.mixed_float + series.astype('float32')
        _check_mixed_float(added, dtype=dict(C=None))
        added = self.mixed_float + series.astype('float16')
        _check_mixed_float(added, dtype=dict(C=None))

        # these raise with numexpr.....as we are adding an int64 to an
        # uint64....weird vs int

        # added = self.mixed_int + (100*series).astype('int64')
        # _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C =
        # 'int64', D = 'int64'))
        # added = self.mixed_int + (100*series).astype('int32')
        # _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C =
        # 'int32', D = 'int64'))

        # TimeSeries
        ts = self.tsframe['A']

        # 10890
        # we no longer allow auto timeseries broadcasting
        # and require explicit broadcasting
        added = self.tsframe.add(ts, axis='index')

        for key, col in compat.iteritems(self.tsframe):
            result = col + ts
            assert_series_equal(added[key], result, check_names=False)
            assert added[key].name == key
            # name survives only when both operands share it
            if col.name == ts.name:
                assert result.name == 'A'
            else:
                assert result.name is None

        smaller_frame = self.tsframe[:-5]
        smaller_added = smaller_frame.add(ts, axis='index')

        # result index is the union, i.e. the full ts index
        tm.assert_index_equal(smaller_added.index, self.tsframe.index)

        smaller_ts = ts[:-5]
        smaller_added2 = self.tsframe.add(smaller_ts, axis='index')
        assert_frame_equal(smaller_added, smaller_added2)

        # length 0, result is all-nan
        result = self.tsframe.add(ts[:0], axis='index')
        expected = DataFrame(np.nan, index=self.tsframe.index,
                             columns=self.tsframe.columns)
        assert_frame_equal(result, expected)

        # Frame is all-nan
        result = self.tsframe[:0].add(ts, axis='index')
        expected = DataFrame(np.nan, index=self.tsframe.index,
                             columns=self.tsframe.columns)
        assert_frame_equal(result, expected)

        # empty but with non-empty index
        frame = self.tsframe[:1].reindex(columns=[])
        result = frame.mul(ts, axis='index')
        assert len(result) == len(ts)
|
||||
|
||||
    def test_combineFunc(self):
        """Frame * scalar applies elementwise and preserves the index."""
        result = self.frame * 2
        tm.assert_numpy_array_equal(result.values, self.frame.values * 2)

        # vs mix
        result = self.mixed_float * 2
        for c, s in compat.iteritems(result):
            tm.assert_numpy_array_equal(
                s.values, self.mixed_float[c].values * 2)
        _check_mixed_float(result, dtype=dict(C=None))

        # scalar op on an empty frame keeps the (empty) shape and identity of
        # the index
        result = self.empty * 2
        assert result.index is self.empty.index
        assert len(result.columns) == 0
|
||||
|
||||
    def test_comparisons(self):
        """All six rich comparisons behave like the ndarray comparison.

        Also checks that >2-dimensional operands and differently-labeled
        frames raise ValueError.
        """
        df1 = tm.makeTimeDataFrame()
        df2 = tm.makeTimeDataFrame()

        row = self.simple.xs('a')
        ndim_5 = np.ones(df1.shape + (1, 1, 1))

        def test_comp(func):
            # frame vs frame matches elementwise ndarray comparison
            result = func(df1, df2)
            tm.assert_numpy_array_equal(result.values,
                                        func(df1.values, df2.values))

            with pytest.raises(ValueError, match='dim must be <= 2'):
                func(df1, ndim_5)

            # frame vs row series broadcasts across rows
            result2 = func(self.simple, row)
            tm.assert_numpy_array_equal(result2.values,
                                        func(self.simple.values, row.values))

            # frame vs scalar
            result3 = func(self.frame, 0)
            tm.assert_numpy_array_equal(result3.values,
                                        func(self.frame.values, 0))

            msg = 'Can only compare identically-labeled DataFrame'
            with pytest.raises(ValueError, match=msg):
                func(self.simple, self.simple[:2])

        test_comp(operator.eq)
        test_comp(operator.ne)
        test_comp(operator.lt)
        test_comp(operator.gt)
        test_comp(operator.ge)
        test_comp(operator.le)
|
||||
|
||||
def test_comparison_protected_from_errstate(self):
|
||||
missing_df = tm.makeDataFrame()
|
||||
missing_df.iloc[0]['A'] = np.nan
|
||||
with np.errstate(invalid='ignore'):
|
||||
expected = missing_df.values < 0
|
||||
with np.errstate(invalid='raise'):
|
||||
result = (missing_df < 0).values
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
    def test_boolean_comparison(self):
        """Comparisons with array/list/tuple operands of various shapes.

        GH 4576: a 1-D operand must match the column count; GH 23000: a
        (1, n) row broadcasts like an ndarray; a column vector raises.
        """

        # GH 4576
        # boolean comparisons with a tuple/list give unexpected results
        df = DataFrame(np.arange(6).reshape((3, 2)))
        b = np.array([2, 2])
        b_r = np.atleast_2d([2, 2])   # shape (1, 2): row, broadcasts
        b_c = b_r.T                   # shape (2, 1): column, must raise
        lst = [2, 2, 2]               # wrong length (3 vs 2 columns)
        tup = tuple(lst)

        # gt
        expected = DataFrame([[False, False], [False, True], [True, True]])
        result = df > b
        assert_frame_equal(result, expected)

        result = df.values > b
        assert_numpy_array_equal(result, expected.values)

        msg1d = 'Unable to coerce to Series, length must be 2: given 3'
        msg2d = 'Unable to coerce to DataFrame, shape must be'
        msg2db = 'operands could not be broadcast together with shapes'
        with pytest.raises(ValueError, match=msg1d):
            # wrong shape
            df > lst

        with pytest.raises(ValueError, match=msg1d):
            # wrong shape
            result = df > tup

        # broadcasts like ndarray (GH#23000)
        result = df > b_r
        assert_frame_equal(result, expected)

        result = df.values > b_r
        assert_numpy_array_equal(result, expected.values)

        with pytest.raises(ValueError, match=msg2d):
            df > b_c

        with pytest.raises(ValueError, match=msg2db):
            df.values > b_c

        # ==
        expected = DataFrame([[False, False], [True, False], [False, False]])
        result = df == b
        assert_frame_equal(result, expected)

        with pytest.raises(ValueError, match=msg1d):
            result = df == lst

        with pytest.raises(ValueError, match=msg1d):
            result = df == tup

        # broadcasts like ndarray (GH#23000)
        result = df == b_r
        assert_frame_equal(result, expected)

        result = df.values == b_r
        assert_numpy_array_equal(result, expected.values)

        with pytest.raises(ValueError, match=msg2d):
            df == b_c

        assert df.values.shape != b_c.shape

        # with alignment
        df = DataFrame(np.arange(6).reshape((3, 2)),
                       columns=list('AB'), index=list('abc'))
        expected.index = df.index
        expected.columns = df.columns

        # length mismatch still raises on a labeled frame
        with pytest.raises(ValueError, match=msg1d):
            result = df == lst

        with pytest.raises(ValueError, match=msg1d):
            result = df == tup
|
||||
|
||||
    def test_combine_generic(self):
        """DataFrame.combine aligns both operands; unmatched cells are NaN."""
        df1 = self.frame
        # subset of rows and columns of df1
        df2 = self.frame.loc[self.frame.index[:-5], ['A', 'B', 'C']]

        combined = df1.combine(df2, np.add)
        combined2 = df2.combine(df1, np.add)
        # column 'D' exists only in df1, so it is all-NA either way round
        assert combined['D'].isna().all()
        assert combined2['D'].isna().all()

        chunk = combined.loc[combined.index[:-5], ['A', 'B', 'C']]
        chunk2 = combined2.loc[combined2.index[:-5], ['A', 'B', 'C']]

        # on the overlapping region the values simply double
        exp = self.frame.loc[self.frame.index[:-5],
                             ['A', 'B', 'C']].reindex_like(chunk) * 2
        assert_frame_equal(chunk, exp)
        assert_frame_equal(chunk2, exp)
|
||||
|
||||
    def test_inplace_ops_alignment(self):
        """In-place += / -= on a column subset aligns like the binary op.

        GH 8511: ``X[block1] += Z`` must equal ``X[block1] + Z`` regardless
        of column order or whether Z carries extra columns.
        """

        # inplace ops / ops alignment
        # GH 8511

        columns = list('abcdefg')
        X_orig = DataFrame(np.arange(10 * len(columns))
                           .reshape(-1, len(columns)),
                           columns=columns, index=range(10))
        Z = 100 * X_orig.iloc[:, 1:-1].copy()
        block1 = list('bedcf')   # deliberately out of order
        subs = list('bcdef')

        # add
        X = X_orig.copy()
        result1 = (X[block1] + Z).reindex(columns=subs)

        X[block1] += Z
        result2 = X.reindex(columns=subs)

        X = X_orig.copy()
        result3 = (X[block1] + Z[block1]).reindex(columns=subs)

        X[block1] += Z[block1]
        result4 = X.reindex(columns=subs)

        assert_frame_equal(result1, result2)
        assert_frame_equal(result1, result3)
        assert_frame_equal(result1, result4)

        # sub
        X = X_orig.copy()
        result1 = (X[block1] - Z).reindex(columns=subs)

        X[block1] -= Z
        result2 = X.reindex(columns=subs)

        X = X_orig.copy()
        result3 = (X[block1] - Z[block1]).reindex(columns=subs)

        X[block1] -= Z[block1]
        result4 = X.reindex(columns=subs)

        assert_frame_equal(result1, result2)
        assert_frame_equal(result1, result3)
        assert_frame_equal(result1, result4)
|
||||
|
||||
    def test_inplace_ops_identity(self):
        """In-place ops mutate the existing object, not rebind a new one.

        GH 5104: after ``s += 1`` any alias of s sees the change and, when no
        dtype change is required, the underlying block manager is shared.
        """

        # GH 5104
        # make sure that we are actually changing the object
        s_orig = Series([1, 2, 3])
        df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))

        # no dtype change
        s = s_orig.copy()
        s2 = s
        s += 1
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1, s)
        assert s is s2
        assert s._data is s2._data

        df = df_orig.copy()
        df2 = df
        df += 1
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1, df)
        assert df is df2
        assert df._data is df2._data

        # dtype change (int -> float): object identity still holds for the
        # frame; the series identity check is intentionally skipped here
        s = s_orig.copy()
        s2 = s
        s += 1.5
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1.5, s)

        df = df_orig.copy()
        df2 = df
        df += 1.5
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1.5, df)
        assert df is df2
        assert df._data is df2._data

        # mixed dtype
        arr = np.random.randint(0, 10, size=5)
        df_orig = DataFrame({'A': arr.copy(), 'B': 'foo'})
        df = df_orig.copy()
        df2 = df
        df['A'] += 1
        expected = DataFrame({'A': arr.copy() + 1, 'B': 'foo'})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        assert df._data is df2._data

        df = df_orig.copy()
        df2 = df
        df['A'] += 1.5
        expected = DataFrame({'A': arr.copy() + 1.5, 'B': 'foo'})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        assert df._data is df2._data
|
||||
|
||||
    @pytest.mark.parametrize('op', ['add', 'and', 'div', 'floordiv', 'mod',
                                    'mul', 'or', 'pow', 'sub', 'truediv',
                                    'xor'])
    def test_inplace_ops_identity2(self, op):
        """Each __iop__ equals the corresponding __op__ and keeps df's id."""

        # __idiv__ does not exist on Python 3
        if compat.PY3 and op == 'div':
            return

        df = DataFrame({'a': [1., 2., 3.],
                        'b': [1, 2, 3]})

        operand = 2
        if op in ('and', 'or', 'xor'):
            # cannot use floats for boolean ops
            df['a'] = [True, False, True]

        df_copy = df.copy()
        iop = '__i{}__'.format(op)
        op = '__{}__'.format(op)

        # no id change and value is correct
        getattr(df, iop)(operand)
        expected = getattr(df_copy, op)(operand)
        assert_frame_equal(df, expected)
        expected = id(df)
        assert id(df) == expected
|
||||
|
||||
    def test_alignment_non_pandas(self):
        """_align_method_FRAME coerces list/tuple/array/range operands.

        1-D input of matching length becomes a Series on the chosen axis;
        matching 2-D input becomes a DataFrame; any other shape raises
        ValueError.
        """
        index = ['A', 'B', 'C']
        columns = ['X', 'Y', 'Z']
        df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns)

        # internal alignment helper used by the frame arithmetic ops
        align = pd.core.ops._align_method_FRAME
        for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64),
                    range(1, 4)]:

            tm.assert_series_equal(align(df, val, 'index'),
                                   Series([1, 2, 3], index=df.index))
            tm.assert_series_equal(align(df, val, 'columns'),
                                   Series([1, 2, 3], index=df.columns))

        # length mismatch
        msg = 'Unable to coerce to Series, length must be 3: given 2'
        for val in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]:

            with pytest.raises(ValueError, match=msg):
                align(df, val, 'index')

            with pytest.raises(ValueError, match=msg):
                align(df, val, 'columns')

        val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        tm.assert_frame_equal(align(df, val, 'index'),
                              DataFrame(val, index=df.index,
                                        columns=df.columns))
        tm.assert_frame_equal(align(df, val, 'columns'),
                              DataFrame(val, index=df.index,
                                        columns=df.columns))

        # shape mismatch
        msg = 'Unable to coerce to DataFrame, shape must be'
        val = np.array([[1, 2, 3], [4, 5, 6]])
        with pytest.raises(ValueError, match=msg):
            align(df, val, 'index')

        with pytest.raises(ValueError, match=msg):
            align(df, val, 'columns')

        # >2-D input is rejected outright
        val = np.zeros((3, 3, 3))
        with pytest.raises(ValueError):
            align(df, val, 'index')
        with pytest.raises(ValueError):
            align(df, val, 'columns')
|
||||
|
||||
    def test_no_warning(self, all_arithmetic_operators):
        """Arithmetic with a fill_value on NaN data must emit no warning."""
        df = pd.DataFrame({"A": [0., 0.], "B": [0., None]})
        b = df['B']
        with tm.assert_produces_warning(None):
            # e.g. df.add(b, 0) for every arithmetic op in the fixture
            getattr(df, all_arithmetic_operators)(b, 0)
|
||||
@@ -0,0 +1,147 @@
|
||||
from datetime import timedelta
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame, DatetimeIndex, Index, PeriodIndex, Timedelta, date_range,
|
||||
period_range, to_datetime)
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def _permute(obj):
|
||||
return obj.take(np.random.permutation(len(obj)))
|
||||
|
||||
|
||||
class TestPeriodIndex(object):

    def test_as_frame_columns(self):
        """A PeriodIndex can serve as frame columns, selectable by Period
        or by its string form."""
        rng = period_range('1/1/2000', periods=5)
        df = DataFrame(np.random.randn(10, 5), columns=rng)

        ts = df[rng[0]]
        tm.assert_series_equal(ts, df.iloc[:, 0])

        # GH # 1211
        repr(df)

        # string lookup resolves to the same period column
        ts = df['1/1/2000']
        tm.assert_series_equal(ts, df.iloc[:, 0])
|
||||
|
||||
    def test_frame_setitem(self):
        """A PeriodIndex assigned as a column round-trips via set_index."""
        rng = period_range('1/1/2000', periods=5, name='index')
        df = DataFrame(np.random.randn(5, 3), index=rng)

        df['Index'] = rng
        rs = Index(df['Index'])
        tm.assert_index_equal(rs, rng, check_names=False)
        # the column keeps its own name; the original index name is intact
        assert rs.name == 'Index'
        assert rng.name == 'index'

        rs = df.reset_index().set_index('index')
        assert isinstance(rs.index, PeriodIndex)
        tm.assert_index_equal(rs.index, rng)
|
||||
|
||||
    def test_frame_to_time_stamp(self):
        """DataFrame.to_timestamp on a PeriodIndex, both axes.

        'start' maps each period to its first instant; 'end' to its last
        (one unit past the start minus 1ns).  Frequency strings with a
        multiple (e.g. '5t') are treated like the base frequency.
        """
        K = 5
        index = period_range(freq='A', start='1/1/2001', end='12/1/2009')
        df = DataFrame(np.random.randn(len(index), K), index=index)
        df['mix'] = 'a'

        exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC')
        # 'end' is the last nanosecond of the period
        exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns')
        result = df.to_timestamp('D', 'end')
        tm.assert_index_equal(result.index, exp_index)
        tm.assert_numpy_array_equal(result.values, df.values)

        exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN')
        result = df.to_timestamp('D', 'start')
        tm.assert_index_equal(result.index, exp_index)

        def _get_with_delta(delta, freq='A-DEC'):
            # expected annual range shifted by *delta* into each year
            return date_range(to_datetime('1/1/2001') + delta,
                              to_datetime('12/31/2009') + delta, freq=freq)

        delta = timedelta(hours=23)
        result = df.to_timestamp('H', 'end')
        exp_index = _get_with_delta(delta)
        exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns')
        tm.assert_index_equal(result.index, exp_index)

        delta = timedelta(hours=23, minutes=59)
        result = df.to_timestamp('T', 'end')
        exp_index = _get_with_delta(delta)
        exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns')
        tm.assert_index_equal(result.index, exp_index)

        result = df.to_timestamp('S', 'end')
        delta = timedelta(hours=23, minutes=59, seconds=59)
        exp_index = _get_with_delta(delta)
        exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns')
        tm.assert_index_equal(result.index, exp_index)

        # columns: same checks with the PeriodIndex on axis=1
        df = df.T

        exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC')
        exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns')
        result = df.to_timestamp('D', 'end', axis=1)
        tm.assert_index_equal(result.columns, exp_index)
        tm.assert_numpy_array_equal(result.values, df.values)

        exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN')
        result = df.to_timestamp('D', 'start', axis=1)
        tm.assert_index_equal(result.columns, exp_index)

        delta = timedelta(hours=23)
        result = df.to_timestamp('H', 'end', axis=1)
        exp_index = _get_with_delta(delta)
        exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns')
        tm.assert_index_equal(result.columns, exp_index)

        delta = timedelta(hours=23, minutes=59)
        result = df.to_timestamp('T', 'end', axis=1)
        exp_index = _get_with_delta(delta)
        exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns')
        tm.assert_index_equal(result.columns, exp_index)

        result = df.to_timestamp('S', 'end', axis=1)
        delta = timedelta(hours=23, minutes=59, seconds=59)
        exp_index = _get_with_delta(delta)
        exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns')
        tm.assert_index_equal(result.columns, exp_index)

        # invalid axis
        with pytest.raises(ValueError, match='axis'):
            df.to_timestamp(axis=2)

        result1 = df.to_timestamp('5t', axis=1)
        result2 = df.to_timestamp('t', axis=1)
        expected = pd.date_range('2001-01-01', '2009-01-01', freq='AS')
        assert isinstance(result1.columns, DatetimeIndex)
        assert isinstance(result2.columns, DatetimeIndex)
        tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8)
        tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8)
        # PeriodIndex.to_timestamp always use 'infer'
        assert result1.columns.freqstr == 'AS-JAN'
        assert result2.columns.freqstr == 'AS-JAN'
|
||||
|
||||
def test_frame_index_to_string(self):
|
||||
index = PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M')
|
||||
frame = DataFrame(np.random.randn(3, 4), index=index)
|
||||
|
||||
# it works!
|
||||
frame.to_string()
|
||||
|
||||
    def test_align_frame(self):
        """Frames on PeriodIndex align by period label, not by position."""
        rng = period_range('1/1/2000', '1/1/2010', freq='A')
        ts = DataFrame(np.random.randn(len(rng), 3), index=rng)

        result = ts + ts[::2]
        expected = ts + ts
        # rows absent from ts[::2] become NaN after alignment
        expected.values[1::2] = np.nan
        tm.assert_frame_equal(result, expected)

        # order of the right-hand frame must not matter
        result = ts + _permute(ts[::2])
        tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,384 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Series, Timestamp
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestDataFrameQuantile(TestData):

    def test_quantile(self):
        """DataFrame.quantile matches numpy.percentile on both axes and
        excludes non-numeric columns."""
        from numpy import percentile

        q = self.tsframe.quantile(0.1, axis=0)
        assert q['A'] == percentile(self.tsframe['A'], 10)
        tm.assert_index_equal(q.index, self.tsframe.columns)

        q = self.tsframe.quantile(0.9, axis=1)
        assert (q['2000-01-17'] ==
                percentile(self.tsframe.loc['2000-01-17'], 90))
        tm.assert_index_equal(q.index, self.tsframe.index)

        # test degenerate case
        q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
        assert(np.isnan(q['x']) and np.isnan(q['y']))

        # non-numeric exclusion
        df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
        rs = df.quantile(0.5)
        xp = df.median().rename(0.5)
        assert_series_equal(rs, xp)

        # axis
        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        result = df.quantile(.5, axis=1)
        expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        assert_series_equal(result, expected)

        # list of quantiles on axis=1 returns a frame indexed by q
        result = df.quantile([.5, .75], axis=1)
        expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75],
                              3: [3.5, 3.75]}, index=[0.5, 0.75])
        assert_frame_equal(result, expected, check_index_type=True)

        # We may want to break API in the future to change this
        # so that we exclude non-numeric along the same axis
        # See GH #7312
        df = DataFrame([[1, 2, 3],
                        ['a', 'b', 4]])
        result = df.quantile(.5, axis=1)
        expected = Series([3., 4.], index=[0, 1], name=0.5)
        assert_series_equal(result, expected)
|
||||
|
||||
    def test_quantile_axis_mixed(self):
        """On axis=1, quantile silently drops non-numeric columns; asking to
        include them (numeric_only=False) raises TypeError."""

        # mixed on axis=1
        df = DataFrame({"A": [1, 2, 3],
                        "B": [2., 3., 4.],
                        "C": pd.date_range('20130101', periods=3),
                        "D": ['foo', 'bar', 'baz']})
        result = df.quantile(.5, axis=1)
        expected = Series([1.5, 2.5, 3.5], name=0.5)
        assert_series_equal(result, expected)

        # must raise
        with pytest.raises(TypeError):
            df.quantile(.5, axis=1, numeric_only=False)
|
||||
|
||||
    def test_quantile_axis_parameter(self):
        """axis accepts 0/1 and their names 'index'/'columns'; anything else
        raises ValueError (GH 9543/9544)."""
        # GH 9543/9544

        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])

        result = df.quantile(.5, axis=0)

        expected = Series([2., 3.], index=["A", "B"], name=0.5)
        assert_series_equal(result, expected)

        expected = df.quantile(.5, axis="index")
        assert_series_equal(result, expected)

        result = df.quantile(.5, axis=1)

        expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        assert_series_equal(result, expected)

        result = df.quantile(.5, axis="columns")
        assert_series_equal(result, expected)

        # invalid axis values
        pytest.raises(ValueError, df.quantile, 0.1, axis=-1)
        pytest.raises(ValueError, df.quantile, 0.1, axis="column")
|
||||
|
||||
    def test_quantile_interpolation(self):
        """The interpolation keyword matches numpy.percentile's semantics.

        gh-10174: default is 'linear'; 'nearest' preserves the input dtype;
        'lower'/'higher'/'midpoint' are forwarded to numpy.
        """
        # see gh-10174
        from numpy import percentile

        # interpolation = linear (default case)
        q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
        assert q['A'] == percentile(self.tsframe['A'], 10)
        q = self.intframe.quantile(0.1)
        assert q['A'] == percentile(self.intframe['A'], 10)

        # test with and without interpolation keyword
        q1 = self.intframe.quantile(0.1)
        assert q1['A'] == np.percentile(self.intframe['A'], 10)
        tm.assert_series_equal(q, q1)

        # interpolation method other than default linear
        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        result = df.quantile(.5, axis=1, interpolation='nearest')
        expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
        tm.assert_series_equal(result, expected)

        # cross-check interpolation=nearest results in original dtype
        exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5,
                            axis=0, interpolation='nearest')
        expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64')
        tm.assert_series_equal(result, expected)

        # float
        df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3])
        result = df.quantile(.5, axis=1, interpolation='nearest')
        expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5)
        tm.assert_series_equal(result, expected)
        exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5,
                            axis=0, interpolation='nearest')
        expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64')
        assert_series_equal(result, expected)

        # axis
        result = df.quantile([.5, .75], axis=1, interpolation='lower')
        expected = DataFrame({1: [1., 1.], 2: [2., 2.],
                              3: [3., 3.]}, index=[0.5, 0.75])
        assert_frame_equal(result, expected)

        # test degenerate case
        df = DataFrame({'x': [], 'y': []})
        q = df.quantile(0.1, axis=0, interpolation='higher')
        assert(np.isnan(q['x']) and np.isnan(q['y']))

        # multi
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['a', 'b', 'c'])
        result = df.quantile([.25, .5], interpolation='midpoint')

        # https://github.com/numpy/numpy/issues/7163
        expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
                             index=[.25, .5], columns=['a', 'b', 'c'])
        assert_frame_equal(result, expected)
|
||||
|
||||
def test_quantile_multi(self):
|
||||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
|
||||
columns=['a', 'b', 'c'])
|
||||
result = df.quantile([.25, .5])
|
||||
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
|
||||
index=[.25, .5], columns=['a', 'b', 'c'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# axis = 1
|
||||
result = df.quantile([.25, .5], axis=1)
|
||||
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
|
||||
index=[.25, .5], columns=[0, 1, 2])
|
||||
|
||||
# empty
|
||||
result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
|
||||
expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]},
|
||||
index=[.1, .9])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_quantile_datetime(self):
|
||||
df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]})
|
||||
|
||||
# exclude datetime
|
||||
result = df.quantile(.5)
|
||||
expected = Series([2.5], index=['b'])
|
||||
|
||||
# datetime
|
||||
result = df.quantile(.5, numeric_only=False)
|
||||
expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5],
|
||||
index=['a', 'b'],
|
||||
name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# datetime w/ multi
|
||||
result = df.quantile([.5], numeric_only=False)
|
||||
expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]],
|
||||
index=[.5], columns=['a', 'b'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# axis = 1
|
||||
df['c'] = pd.to_datetime(['2011', '2012'])
|
||||
result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False)
|
||||
expected = Series([Timestamp('2010-07-02 12:00:00'),
|
||||
Timestamp('2011-07-02 12:00:00')],
|
||||
index=[0, 1],
|
||||
name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False)
|
||||
expected = DataFrame([[Timestamp('2010-07-02 12:00:00'),
|
||||
Timestamp('2011-07-02 12:00:00')]],
|
||||
index=[0.5], columns=[0, 1])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# empty when numeric_only=True
|
||||
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
||||
# result = df[['a', 'c']].quantile(.5)
|
||||
# result = df[['a', 'c']].quantile([.5])
|
||||
|
||||
def test_quantile_invalid(self):
|
||||
msg = 'percentiles should all be in the interval \\[0, 1\\]'
|
||||
for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
self.tsframe.quantile(invalid)
|
||||
|
||||
def test_quantile_box(self):
|
||||
df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
|
||||
pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-03')],
|
||||
'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-03', tz='US/Eastern')],
|
||||
'C': [pd.Timedelta('1 days'),
|
||||
pd.Timedelta('2 days'),
|
||||
pd.Timedelta('3 days')]})
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
|
||||
exp = pd.Series([pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timedelta('2 days')],
|
||||
name=0.5, index=['A', 'B', 'C'])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timedelta('2 days')]],
|
||||
index=[0.5], columns=['A', 'B', 'C'])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# DatetimeBlock may be consolidated and contain NaT in different loc
|
||||
df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
|
||||
pd.NaT,
|
||||
pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-03')],
|
||||
'a': [pd.Timestamp('2011-01-01'),
|
||||
pd.Timestamp('2011-01-02'),
|
||||
pd.NaT,
|
||||
pd.Timestamp('2011-01-03')],
|
||||
'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
|
||||
pd.NaT,
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-03', tz='US/Eastern')],
|
||||
'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.NaT,
|
||||
pd.Timestamp('2011-01-03', tz='US/Eastern')],
|
||||
'C': [pd.Timedelta('1 days'),
|
||||
pd.Timedelta('2 days'),
|
||||
pd.Timedelta('3 days'),
|
||||
pd.NaT],
|
||||
'c': [pd.NaT,
|
||||
pd.Timedelta('1 days'),
|
||||
pd.Timedelta('2 days'),
|
||||
pd.Timedelta('3 days')]},
|
||||
columns=list('AaBbCc'))
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
exp = pd.Series([pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timedelta('2 days'),
|
||||
pd.Timedelta('2 days')],
|
||||
name=0.5, index=list('AaBbCc'))
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
||||
pd.Timedelta('2 days'),
|
||||
pd.Timedelta('2 days')]],
|
||||
index=[0.5], columns=list('AaBbCc'))
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_nan(self):
|
||||
|
||||
# GH 14357 - float block where some cols have missing values
|
||||
df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
|
||||
df.iloc[-1, 1] = np.nan
|
||||
|
||||
res = df.quantile(0.5)
|
||||
exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5, 0.75])
|
||||
exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.quantile(0.5, axis=1)
|
||||
exp = Series(np.arange(1.0, 6.0), name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5, 0.75], axis=1)
|
||||
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# full-nan column
|
||||
df['b'] = np.nan
|
||||
|
||||
res = df.quantile(0.5)
|
||||
exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5, 0.75])
|
||||
exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]},
|
||||
index=[0.5, 0.75])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_nat(self):
|
||||
|
||||
# full NaT column
|
||||
df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]})
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
exp = Series([pd.NaT], index=['a'], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = DataFrame({'a': [pd.NaT]}, index=[0.5])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# mixed non-null / full null column
|
||||
df = DataFrame({'a': [pd.Timestamp('2012-01-01'),
|
||||
pd.Timestamp('2012-01-02'),
|
||||
pd.Timestamp('2012-01-03')],
|
||||
'b': [pd.NaT, pd.NaT, pd.NaT]})
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'],
|
||||
name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5],
|
||||
columns=['a', 'b'])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_empty(self):
|
||||
|
||||
# floats
|
||||
df = DataFrame(columns=['a', 'b'], dtype='float64')
|
||||
|
||||
res = df.quantile(0.5)
|
||||
exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5])
|
||||
exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
||||
# res = df.quantile(0.5, axis=1)
|
||||
# res = df.quantile([0.5], axis=1)
|
||||
|
||||
# ints
|
||||
df = DataFrame(columns=['a', 'b'], dtype='int64')
|
||||
|
||||
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
||||
# res = df.quantile(0.5)
|
||||
|
||||
# datetimes
|
||||
df = DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
|
||||
|
||||
# FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
|
||||
# res = df.quantile(0.5, numeric_only=False)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,318 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from datetime import datetime, timedelta
|
||||
from distutils.version import LooseVersion
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Series
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
|
||||
class TestRank(TestData):
|
||||
s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
|
||||
df = DataFrame({'A': s, 'B': s})
|
||||
|
||||
results = {
|
||||
'average': np.array([1.5, 5.5, 7.0, 3.5, np.nan,
|
||||
3.5, 1.5, 8.0, np.nan, 5.5]),
|
||||
'min': np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
|
||||
'max': np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
|
||||
'first': np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
|
||||
'dense': np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
|
||||
}
|
||||
|
||||
@pytest.fixture(params=['average', 'min', 'max', 'first', 'dense'])
|
||||
def method(self, request):
|
||||
"""
|
||||
Fixture for trying all rank methods
|
||||
"""
|
||||
return request.param
|
||||
|
||||
def test_rank(self):
|
||||
rankdata = pytest.importorskip('scipy.stats.rankdata')
|
||||
|
||||
self.frame['A'][::2] = np.nan
|
||||
self.frame['B'][::3] = np.nan
|
||||
self.frame['C'][::4] = np.nan
|
||||
self.frame['D'][::5] = np.nan
|
||||
|
||||
ranks0 = self.frame.rank()
|
||||
ranks1 = self.frame.rank(1)
|
||||
mask = np.isnan(self.frame.values)
|
||||
|
||||
fvals = self.frame.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, fvals)
|
||||
exp0[mask] = np.nan
|
||||
|
||||
exp1 = np.apply_along_axis(rankdata, 1, fvals)
|
||||
exp1[mask] = np.nan
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# integers
|
||||
df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
|
||||
|
||||
result = df.rank()
|
||||
exp = df.astype(float).rank()
|
||||
tm.assert_frame_equal(result, exp)
|
||||
|
||||
result = df.rank(1)
|
||||
exp = df.astype(float).rank(1)
|
||||
tm.assert_frame_equal(result, exp)
|
||||
|
||||
def test_rank2(self):
|
||||
df = DataFrame([[1, 3, 2], [1, 2, 3]])
|
||||
expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
|
||||
result = df.rank(1, pct=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([[1, 3, 2], [1, 2, 3]])
|
||||
expected = df.rank(0) / 2.0
|
||||
result = df.rank(0, pct=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
|
||||
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
|
||||
result = df.rank(1, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
|
||||
result = df.rank(0, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
|
||||
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
|
||||
result = df.rank(1, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
|
||||
result = df.rank(0, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# f7u12, this does not work without extensive workaround
|
||||
data = [[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
|
||||
[datetime(2000, 1, 2), datetime(2000, 1, 3),
|
||||
datetime(2000, 1, 1)]]
|
||||
df = DataFrame(data)
|
||||
|
||||
# check the rank
|
||||
expected = DataFrame([[2., np.nan, 1.],
|
||||
[2., 3., 1.]])
|
||||
result = df.rank(1, numeric_only=False, ascending=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[1., np.nan, 2.],
|
||||
[2., 1., 3.]])
|
||||
result = df.rank(1, numeric_only=False, ascending=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# mixed-type frames
|
||||
self.mixed_frame['datetime'] = datetime.now()
|
||||
self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)
|
||||
|
||||
result = self.mixed_frame.rank(1)
|
||||
expected = self.mixed_frame.rank(1, numeric_only=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
|
||||
1e60, 1e80, 1e-30]})
|
||||
exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]})
|
||||
tm.assert_frame_equal(df.rank(), exp)
|
||||
|
||||
def test_rank_na_option(self):
|
||||
rankdata = pytest.importorskip('scipy.stats.rankdata')
|
||||
|
||||
self.frame['A'][::2] = np.nan
|
||||
self.frame['B'][::3] = np.nan
|
||||
self.frame['C'][::4] = np.nan
|
||||
self.frame['D'][::5] = np.nan
|
||||
|
||||
# bottom
|
||||
ranks0 = self.frame.rank(na_option='bottom')
|
||||
ranks1 = self.frame.rank(1, na_option='bottom')
|
||||
|
||||
fvals = self.frame.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, fvals)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, fvals)
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# top
|
||||
ranks0 = self.frame.rank(na_option='top')
|
||||
ranks1 = self.frame.rank(1, na_option='top')
|
||||
|
||||
fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
|
||||
fval1 = self.frame.T
|
||||
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
|
||||
fval1 = fval1.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, fval0)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, fval1)
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# descending
|
||||
|
||||
# bottom
|
||||
ranks0 = self.frame.rank(na_option='top', ascending=False)
|
||||
ranks1 = self.frame.rank(1, na_option='top', ascending=False)
|
||||
|
||||
fvals = self.frame.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, -fvals)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, -fvals)
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# descending
|
||||
|
||||
# top
|
||||
ranks0 = self.frame.rank(na_option='bottom', ascending=False)
|
||||
ranks1 = self.frame.rank(1, na_option='bottom', ascending=False)
|
||||
|
||||
fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
|
||||
fval1 = self.frame.T
|
||||
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
|
||||
fval1 = fval1.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, -fval0)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, -fval1)
|
||||
|
||||
tm.assert_numpy_array_equal(ranks0.values, exp0)
|
||||
tm.assert_numpy_array_equal(ranks1.values, exp1)
|
||||
|
||||
# bad values throw error
|
||||
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
self.frame.rank(na_option='bad', ascending=False)
|
||||
|
||||
# invalid type
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
self.frame.rank(na_option=True, ascending=False)
|
||||
|
||||
def test_rank_axis(self):
|
||||
# check if using axes' names gives the same result
|
||||
df = DataFrame([[2, 1], [4, 3]])
|
||||
tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index'))
|
||||
tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns'))
|
||||
|
||||
def test_rank_methods_frame(self):
|
||||
pytest.importorskip('scipy.stats.special')
|
||||
rankdata = pytest.importorskip('scipy.stats.rankdata')
|
||||
import scipy
|
||||
|
||||
xs = np.random.randint(0, 21, (100, 26))
|
||||
xs = (xs - 10.0) / 10.0
|
||||
cols = [chr(ord('z') - i) for i in range(xs.shape[1])]
|
||||
|
||||
for vals in [xs, xs + 1e6, xs * 1e-6]:
|
||||
df = DataFrame(vals, columns=cols)
|
||||
|
||||
for ax in [0, 1]:
|
||||
for m in ['average', 'min', 'max', 'first', 'dense']:
|
||||
result = df.rank(axis=ax, method=m)
|
||||
sprank = np.apply_along_axis(
|
||||
rankdata, ax, vals,
|
||||
m if m != 'first' else 'ordinal')
|
||||
sprank = sprank.astype(np.float64)
|
||||
expected = DataFrame(sprank, columns=cols)
|
||||
|
||||
if (LooseVersion(scipy.__version__) >=
|
||||
LooseVersion('0.17.0')):
|
||||
expected = expected.astype('float64')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
|
||||
def test_rank_descending(self, method, dtype):
|
||||
|
||||
if 'i' in dtype:
|
||||
df = self.df.dropna()
|
||||
else:
|
||||
df = self.df.astype(dtype)
|
||||
|
||||
res = df.rank(ascending=False)
|
||||
expected = (df.max() - df).rank()
|
||||
assert_frame_equal(res, expected)
|
||||
|
||||
if method == 'first' and dtype == 'O':
|
||||
return
|
||||
|
||||
expected = (df.max() - df).rank(method=method)
|
||||
|
||||
if dtype != 'O':
|
||||
res2 = df.rank(method=method, ascending=False,
|
||||
numeric_only=True)
|
||||
assert_frame_equal(res2, expected)
|
||||
|
||||
res3 = df.rank(method=method, ascending=False,
|
||||
numeric_only=False)
|
||||
assert_frame_equal(res3, expected)
|
||||
|
||||
@pytest.mark.parametrize('axis', [0, 1])
|
||||
@pytest.mark.parametrize('dtype', [None, object])
|
||||
def test_rank_2d_tie_methods(self, method, axis, dtype):
|
||||
df = self.df
|
||||
|
||||
def _check2d(df, expected, method='average', axis=0):
|
||||
exp_df = DataFrame({'A': expected, 'B': expected})
|
||||
|
||||
if axis == 1:
|
||||
df = df.T
|
||||
exp_df = exp_df.T
|
||||
|
||||
result = df.rank(method=method, axis=axis)
|
||||
assert_frame_equal(result, exp_df)
|
||||
|
||||
disabled = {(object, 'first')}
|
||||
if (dtype, method) in disabled:
|
||||
return
|
||||
frame = df if dtype is None else df.astype(dtype)
|
||||
_check2d(frame, self.results[method], method=method, axis=axis)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"method,exp", [("dense",
|
||||
[[1., 1., 1.],
|
||||
[1., 0.5, 2. / 3],
|
||||
[1., 0.5, 1. / 3]]),
|
||||
("min",
|
||||
[[1. / 3, 1., 1.],
|
||||
[1. / 3, 1. / 3, 2. / 3],
|
||||
[1. / 3, 1. / 3, 1. / 3]]),
|
||||
("max",
|
||||
[[1., 1., 1.],
|
||||
[1., 2. / 3, 2. / 3],
|
||||
[1., 2. / 3, 1. / 3]]),
|
||||
("average",
|
||||
[[2. / 3, 1., 1.],
|
||||
[2. / 3, 0.5, 2. / 3],
|
||||
[2. / 3, 0.5, 1. / 3]]),
|
||||
("first",
|
||||
[[1. / 3, 1., 1.],
|
||||
[2. / 3, 1. / 3, 2. / 3],
|
||||
[3. / 3, 2. / 3, 1. / 3]])])
|
||||
def test_rank_pct_true(self, method, exp):
|
||||
# see gh-15630.
|
||||
|
||||
df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
|
||||
result = df.rank(method=method, pct=True)
|
||||
|
||||
expected = DataFrame(exp)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.single
|
||||
def test_pct_max_many_rows(self):
|
||||
# GH 18271
|
||||
df = DataFrame({'A': np.arange(2**24 + 1),
|
||||
'B': np.arange(2**24 + 1, 0, -1)})
|
||||
result = df.rank(pct=True).max()
|
||||
assert (result == 1).all()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,523 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
import re
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import PYPY, StringIO, lrange, u
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical, DataFrame, Series, compat, date_range, option_context,
|
||||
period_range)
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
|
||||
import pandas.io.formats.format as fmt
|
||||
|
||||
# Segregated collection of methods that require the BlockManager internal data
|
||||
# structure
|
||||
|
||||
|
||||
class TestDataFrameReprInfoEtc(TestData):
|
||||
|
||||
def test_repr_empty(self):
|
||||
# empty
|
||||
foo = repr(self.empty) # noqa
|
||||
|
||||
# empty with index
|
||||
frame = DataFrame(index=np.arange(1000))
|
||||
foo = repr(frame) # noqa
|
||||
|
||||
def test_repr_mixed(self):
|
||||
buf = StringIO()
|
||||
|
||||
# mixed
|
||||
foo = repr(self.mixed_frame) # noqa
|
||||
self.mixed_frame.info(verbose=False, buf=buf)
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_repr_mixed_big(self):
|
||||
# big mixed
|
||||
biggie = DataFrame({'A': np.random.randn(200),
|
||||
'B': tm.makeStringIndex(200)},
|
||||
index=lrange(200))
|
||||
biggie.loc[:20, 'A'] = np.nan
|
||||
biggie.loc[:20, 'B'] = np.nan
|
||||
|
||||
foo = repr(biggie) # noqa
|
||||
|
||||
def test_repr(self):
|
||||
buf = StringIO()
|
||||
|
||||
# small one
|
||||
foo = repr(self.frame)
|
||||
self.frame.info(verbose=False, buf=buf)
|
||||
|
||||
# even smaller
|
||||
self.frame.reindex(columns=['A']).info(verbose=False, buf=buf)
|
||||
self.frame.reindex(columns=['A', 'B']).info(verbose=False, buf=buf)
|
||||
|
||||
# exhausting cases in DataFrame.info
|
||||
|
||||
# columns but no index
|
||||
no_index = DataFrame(columns=[0, 1, 3])
|
||||
foo = repr(no_index) # noqa
|
||||
|
||||
# no columns or index
|
||||
self.empty.info(buf=buf)
|
||||
|
||||
df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
|
||||
assert "\t" not in repr(df)
|
||||
assert "\r" not in repr(df)
|
||||
assert "a\n" not in repr(df)
|
||||
|
||||
def test_repr_dimensions(self):
|
||||
df = DataFrame([[1, 2, ], [3, 4]])
|
||||
with option_context('display.show_dimensions', True):
|
||||
assert "2 rows x 2 columns" in repr(df)
|
||||
|
||||
with option_context('display.show_dimensions', False):
|
||||
assert "2 rows x 2 columns" not in repr(df)
|
||||
|
||||
with option_context('display.show_dimensions', 'truncate'):
|
||||
assert "2 rows x 2 columns" not in repr(df)
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_repr_big(self):
|
||||
# big one
|
||||
biggie = DataFrame(np.zeros((200, 4)), columns=lrange(4),
|
||||
index=lrange(200))
|
||||
repr(biggie)
|
||||
|
||||
def test_repr_unsortable(self):
|
||||
# columns are not sortable
|
||||
import warnings
|
||||
warn_filters = warnings.filters
|
||||
warnings.filterwarnings('ignore',
|
||||
category=FutureWarning,
|
||||
module=".*format")
|
||||
|
||||
unsortable = DataFrame({'foo': [1] * 50,
|
||||
datetime.today(): [1] * 50,
|
||||
'bar': ['bar'] * 50,
|
||||
datetime.today() + timedelta(1): ['bar'] * 50},
|
||||
index=np.arange(50))
|
||||
repr(unsortable)
|
||||
|
||||
fmt.set_option('display.precision', 3, 'display.column_space', 10)
|
||||
repr(self.frame)
|
||||
|
||||
fmt.set_option('display.max_rows', 10, 'display.max_columns', 2)
|
||||
repr(self.frame)
|
||||
|
||||
fmt.set_option('display.max_rows', 1000, 'display.max_columns', 1000)
|
||||
repr(self.frame)
|
||||
|
||||
tm.reset_display_options()
|
||||
|
||||
warnings.filters = warn_filters
|
||||
|
||||
def test_repr_unicode(self):
|
||||
uval = u('\u03c3\u03c3\u03c3\u03c3')
|
||||
|
||||
# TODO(wesm): is this supposed to be used?
|
||||
bval = uval.encode('utf-8') # noqa
|
||||
|
||||
df = DataFrame({'A': [uval, uval]})
|
||||
|
||||
result = repr(df)
|
||||
ex_top = ' A'
|
||||
assert result.split('\n')[0].rstrip() == ex_top
|
||||
|
||||
df = DataFrame({'A': [uval, uval]})
|
||||
result = repr(df)
|
||||
assert result.split('\n')[0].rstrip() == ex_top
|
||||
|
||||
def test_unicode_string_with_unicode(self):
|
||||
df = DataFrame({'A': [u("\u05d0")]})
|
||||
|
||||
if compat.PY3:
|
||||
str(df)
|
||||
else:
|
||||
compat.text_type(df)
|
||||
|
||||
def test_bytestring_with_unicode(self):
|
||||
df = DataFrame({'A': [u("\u05d0")]})
|
||||
if compat.PY3:
|
||||
bytes(df)
|
||||
else:
|
||||
str(df)
|
||||
|
||||
def test_very_wide_info_repr(self):
|
||||
df = DataFrame(np.random.randn(10, 20),
|
||||
columns=tm.rands_array(10, 20))
|
||||
repr(df)
|
||||
|
||||
def test_repr_column_name_unicode_truncation_bug(self):
|
||||
# #1906
|
||||
df = DataFrame({'Id': [7117434],
|
||||
'StringCol': ('Is it possible to modify drop plot code'
|
||||
' so that the output graph is displayed '
|
||||
'in iphone simulator, Is it possible to '
|
||||
'modify drop plot code so that the '
|
||||
'output graph is \xe2\x80\xa8displayed '
|
||||
'in iphone simulator.Now we are adding '
|
||||
'the CSV file externally. I want to Call'
|
||||
' the File through the code..')})
|
||||
|
||||
with option_context('display.max_columns', 20):
|
||||
assert 'StringCol' in repr(df)
|
||||
|
||||
def test_latex_repr(self):
|
||||
result = r"""\begin{tabular}{llll}
|
||||
\toprule
|
||||
{} & 0 & 1 & 2 \\
|
||||
\midrule
|
||||
0 & $\alpha$ & b & c \\
|
||||
1 & 1 & 2 & 3 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
"""
|
||||
with option_context("display.latex.escape", False,
|
||||
'display.latex.repr', True):
|
||||
df = DataFrame([[r'$\alpha$', 'b', 'c'], [1, 2, 3]])
|
||||
assert result == df._repr_latex_()
|
||||
|
||||
# GH 12182
|
||||
assert df._repr_latex_() is None
|
||||
|
||||
def test_info(self):
|
||||
io = StringIO()
|
||||
self.frame.info(buf=io)
|
||||
self.tsframe.info(buf=io)
|
||||
|
||||
frame = DataFrame(np.random.randn(5, 3))
|
||||
|
||||
frame.info()
|
||||
frame.info(verbose=False)
|
||||
|
||||
def test_info_memory(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/21056
|
||||
df = pd.DataFrame({'a': pd.Series([1, 2], dtype='i8')})
|
||||
buf = StringIO()
|
||||
df.info(buf=buf)
|
||||
result = buf.getvalue()
|
||||
bytes = float(df.memory_usage().sum())
|
||||
|
||||
expected = textwrap.dedent("""\
|
||||
<class 'pandas.core.frame.DataFrame'>
|
||||
RangeIndex: 2 entries, 0 to 1
|
||||
Data columns (total 1 columns):
|
||||
a 2 non-null int64
|
||||
dtypes: int64(1)
|
||||
memory usage: {} bytes
|
||||
""".format(bytes))
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_info_wide(self):
|
||||
from pandas import set_option, reset_option
|
||||
io = StringIO()
|
||||
df = DataFrame(np.random.randn(5, 101))
|
||||
df.info(buf=io)
|
||||
|
||||
io = StringIO()
|
||||
df.info(buf=io, max_cols=101)
|
||||
rs = io.getvalue()
|
||||
assert len(rs.splitlines()) > 100
|
||||
xp = rs
|
||||
|
||||
set_option('display.max_info_columns', 101)
|
||||
io = StringIO()
|
||||
df.info(buf=io)
|
||||
assert rs == xp
|
||||
reset_option('display.max_info_columns')
|
||||
|
||||
def test_info_duplicate_columns(self):
|
||||
io = StringIO()
|
||||
|
||||
# it works!
|
||||
frame = DataFrame(np.random.randn(1500, 4),
|
||||
columns=['a', 'a', 'b', 'b'])
|
||||
frame.info(buf=io)
|
||||
|
||||
def test_info_duplicate_columns_shows_correct_dtypes(self):
|
||||
# GH11761
|
||||
io = StringIO()
|
||||
|
||||
frame = DataFrame([[1, 2.0]],
|
||||
columns=['a', 'a'])
|
||||
frame.info(buf=io)
|
||||
io.seek(0)
|
||||
lines = io.readlines()
|
||||
assert 'a 1 non-null int64\n' == lines[3]
|
||||
assert 'a 1 non-null float64\n' == lines[4]
|
||||
|
||||
def test_info_shows_column_dtypes(self):
|
||||
dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
|
||||
'complex128', 'object', 'bool']
|
||||
data = {}
|
||||
n = 10
|
||||
for i, dtype in enumerate(dtypes):
|
||||
data[i] = np.random.randint(2, size=n).astype(dtype)
|
||||
df = DataFrame(data)
|
||||
buf = StringIO()
|
||||
df.info(buf=buf)
|
||||
res = buf.getvalue()
|
||||
for i, dtype in enumerate(dtypes):
|
||||
name = '%d %d non-null %s' % (i, n, dtype)
|
||||
assert name in res
|
||||
|
||||
def test_info_max_cols(self):
|
||||
df = DataFrame(np.random.randn(10, 5))
|
||||
for len_, verbose in [(5, None), (5, False), (10, True)]:
|
||||
# For verbose always ^ setting ^ summarize ^ full output
|
||||
with option_context('max_info_columns', 4):
|
||||
buf = StringIO()
|
||||
df.info(buf=buf, verbose=verbose)
|
||||
res = buf.getvalue()
|
||||
assert len(res.strip().split('\n')) == len_
|
||||
|
||||
for len_, verbose in [(10, None), (5, False), (10, True)]:
|
||||
|
||||
# max_cols no exceeded
|
||||
with option_context('max_info_columns', 5):
|
||||
buf = StringIO()
|
||||
df.info(buf=buf, verbose=verbose)
|
||||
res = buf.getvalue()
|
||||
assert len(res.strip().split('\n')) == len_
|
||||
|
||||
for len_, max_cols in [(10, 5), (5, 4)]:
|
||||
# setting truncates
|
||||
with option_context('max_info_columns', 4):
|
||||
buf = StringIO()
|
||||
df.info(buf=buf, max_cols=max_cols)
|
||||
res = buf.getvalue()
|
||||
assert len(res.strip().split('\n')) == len_
|
||||
|
||||
# setting wouldn't truncate
|
||||
with option_context('max_info_columns', 5):
|
||||
buf = StringIO()
|
||||
df.info(buf=buf, max_cols=max_cols)
|
||||
res = buf.getvalue()
|
||||
assert len(res.strip().split('\n')) == len_
|
||||
|
||||
def test_info_memory_usage(self):
|
||||
# Ensure memory usage is displayed, when asserted, on the last line
|
||||
dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
|
||||
'complex128', 'object', 'bool']
|
||||
data = {}
|
||||
n = 10
|
||||
for i, dtype in enumerate(dtypes):
|
||||
data[i] = np.random.randint(2, size=n).astype(dtype)
|
||||
df = DataFrame(data)
|
||||
buf = StringIO()
|
||||
|
||||
# display memory usage case
|
||||
df.info(buf=buf, memory_usage=True)
|
||||
res = buf.getvalue().splitlines()
|
||||
assert "memory usage: " in res[-1]
|
||||
|
||||
# do not display memory usage case
|
||||
df.info(buf=buf, memory_usage=False)
|
||||
res = buf.getvalue().splitlines()
|
||||
assert "memory usage: " not in res[-1]
|
||||
|
||||
df.info(buf=buf, memory_usage=True)
|
||||
res = buf.getvalue().splitlines()
|
||||
|
||||
# memory usage is a lower bound, so print it as XYZ+ MB
|
||||
assert re.match(r"memory usage: [^+]+\+", res[-1])
|
||||
|
||||
df.iloc[:, :5].info(buf=buf, memory_usage=True)
|
||||
res = buf.getvalue().splitlines()
|
||||
|
||||
# excluded column with object dtype, so estimate is accurate
|
||||
assert not re.match(r"memory usage: [^+]+\+", res[-1])
|
||||
|
||||
# Test a DataFrame with duplicate columns
|
||||
dtypes = ['int64', 'int64', 'int64', 'float64']
|
||||
data = {}
|
||||
n = 100
|
||||
for i, dtype in enumerate(dtypes):
|
||||
data[i] = np.random.randint(2, size=n).astype(dtype)
|
||||
df = DataFrame(data)
|
||||
df.columns = dtypes
|
||||
|
||||
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
|
||||
df_with_object_index.info(buf=buf, memory_usage=True)
|
||||
res = buf.getvalue().splitlines()
|
||||
assert re.match(r"memory usage: [^+]+\+", res[-1])
|
||||
|
||||
df_with_object_index.info(buf=buf, memory_usage='deep')
|
||||
res = buf.getvalue().splitlines()
|
||||
assert re.match(r"memory usage: [^+]+$", res[-1])
|
||||
|
||||
# Ensure df size is as expected
|
||||
# (cols * rows * bytes) + index size
|
||||
df_size = df.memory_usage().sum()
|
||||
exp_size = len(dtypes) * n * 8 + df.index.nbytes
|
||||
assert df_size == exp_size
|
||||
|
||||
# Ensure number of cols in memory_usage is the same as df
|
||||
size_df = np.size(df.columns.values) + 1 # index=True; default
|
||||
assert size_df == np.size(df.memory_usage())
|
||||
|
||||
# assert deep works only on object
|
||||
assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()
|
||||
|
||||
# test for validity
|
||||
DataFrame(1, index=['a'], columns=['A']
|
||||
).memory_usage(index=True)
|
||||
DataFrame(1, index=['a'], columns=['A']
|
||||
).index.nbytes
|
||||
df = DataFrame(
|
||||
data=1,
|
||||
index=pd.MultiIndex.from_product(
|
||||
[['a'], range(1000)]),
|
||||
columns=['A']
|
||||
)
|
||||
df.index.nbytes
|
||||
df.memory_usage(index=True)
|
||||
df.index.values.nbytes
|
||||
|
||||
mem = df.memory_usage(deep=True).sum()
|
||||
assert mem > 0
|
||||
|
||||
@pytest.mark.skipif(PYPY,
|
||||
reason="on PyPy deep=True doesn't change result")
|
||||
def test_info_memory_usage_deep_not_pypy(self):
|
||||
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
|
||||
assert (df_with_object_index.memory_usage(
|
||||
index=True, deep=True).sum() >
|
||||
df_with_object_index.memory_usage(
|
||||
index=True).sum())
|
||||
|
||||
df_object = pd.DataFrame({'a': ['a']})
|
||||
assert (df_object.memory_usage(deep=True).sum() >
|
||||
df_object.memory_usage().sum())
|
||||
|
||||
@pytest.mark.skipif(not PYPY,
|
||||
reason="on PyPy deep=True does not change result")
|
||||
def test_info_memory_usage_deep_pypy(self):
|
||||
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
|
||||
assert (df_with_object_index.memory_usage(
|
||||
index=True, deep=True).sum() ==
|
||||
df_with_object_index.memory_usage(
|
||||
index=True).sum())
|
||||
|
||||
df_object = pd.DataFrame({'a': ['a']})
|
||||
assert (df_object.memory_usage(deep=True).sum() ==
|
||||
df_object.memory_usage().sum())
|
||||
|
||||
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
|
||||
def test_usage_via_getsizeof(self):
|
||||
df = DataFrame(
|
||||
data=1,
|
||||
index=pd.MultiIndex.from_product(
|
||||
[['a'], range(1000)]),
|
||||
columns=['A']
|
||||
)
|
||||
mem = df.memory_usage(deep=True).sum()
|
||||
# sys.getsizeof will call the .memory_usage with
|
||||
# deep=True, and add on some GC overhead
|
||||
diff = mem - sys.getsizeof(df)
|
||||
assert abs(diff) < 100
|
||||
|
||||
def test_info_memory_usage_qualified(self):
|
||||
|
||||
buf = StringIO()
|
||||
df = DataFrame(1, columns=list('ab'),
|
||||
index=[1, 2, 3])
|
||||
df.info(buf=buf)
|
||||
assert '+' not in buf.getvalue()
|
||||
|
||||
buf = StringIO()
|
||||
df = DataFrame(1, columns=list('ab'),
|
||||
index=list('ABC'))
|
||||
df.info(buf=buf)
|
||||
assert '+' in buf.getvalue()
|
||||
|
||||
buf = StringIO()
|
||||
df = DataFrame(1, columns=list('ab'),
|
||||
index=pd.MultiIndex.from_product(
|
||||
[range(3), range(3)]))
|
||||
df.info(buf=buf)
|
||||
assert '+' not in buf.getvalue()
|
||||
|
||||
buf = StringIO()
|
||||
df = DataFrame(1, columns=list('ab'),
|
||||
index=pd.MultiIndex.from_product(
|
||||
[range(3), ['foo', 'bar']]))
|
||||
df.info(buf=buf)
|
||||
assert '+' in buf.getvalue()
|
||||
|
||||
def test_info_memory_usage_bug_on_multiindex(self):
|
||||
# GH 14308
|
||||
# memory usage introspection should not materialize .values
|
||||
|
||||
from string import ascii_uppercase as uppercase
|
||||
|
||||
def memory_usage(f):
|
||||
return f.memory_usage(deep=True).sum()
|
||||
|
||||
N = 100
|
||||
M = len(uppercase)
|
||||
index = pd.MultiIndex.from_product([list(uppercase),
|
||||
pd.date_range('20160101',
|
||||
periods=N)],
|
||||
names=['id', 'date'])
|
||||
df = DataFrame({'value': np.random.randn(N * M)}, index=index)
|
||||
|
||||
unstacked = df.unstack('id')
|
||||
assert df.values.nbytes == unstacked.values.nbytes
|
||||
assert memory_usage(df) > memory_usage(unstacked)
|
||||
|
||||
# high upper bound
|
||||
assert memory_usage(unstacked) - memory_usage(df) < 2000
|
||||
|
||||
def test_info_categorical(self):
|
||||
# GH14298
|
||||
idx = pd.CategoricalIndex(['a', 'b'])
|
||||
df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
|
||||
|
||||
buf = StringIO()
|
||||
df.info(buf=buf)
|
||||
|
||||
def test_info_categorical_column(self):
|
||||
|
||||
# make sure it works
|
||||
n = 2500
|
||||
df = DataFrame({'int64': np.random.randint(100, size=n)})
|
||||
df['category'] = Series(np.array(list('abcdefghij')).take(
|
||||
np.random.randint(0, 10, size=n))).astype('category')
|
||||
df.isna()
|
||||
buf = StringIO()
|
||||
df.info(buf=buf)
|
||||
|
||||
df2 = df[df['category'] == 'd']
|
||||
buf = compat.StringIO()
|
||||
df2.info(buf=buf)
|
||||
|
||||
def test_repr_categorical_dates_periods(self):
|
||||
# normal DataFrame
|
||||
dt = date_range('2011-01-01 09:00', freq='H', periods=5,
|
||||
tz='US/Eastern')
|
||||
p = period_range('2011-01', freq='M', periods=5)
|
||||
df = DataFrame({'dt': dt, 'p': p})
|
||||
exp = """ dt p
|
||||
0 2011-01-01 09:00:00-05:00 2011-01
|
||||
1 2011-01-01 10:00:00-05:00 2011-02
|
||||
2 2011-01-01 11:00:00-05:00 2011-03
|
||||
3 2011-01-01 12:00:00-05:00 2011-04
|
||||
4 2011-01-01 13:00:00-05:00 2011-05"""
|
||||
|
||||
df = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)})
|
||||
assert repr(df) == exp
|
||||
@@ -0,0 +1,968 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import datetime
|
||||
import itertools
|
||||
from warnings import catch_warnings, simplefilter
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import u
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range)
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestDataFrameReshape(TestData):
|
||||
|
||||
def test_pivot(self):
|
||||
data = {
|
||||
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
|
||||
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
|
||||
'values': [1., 2., 3., 3., 2., 1.]
|
||||
}
|
||||
|
||||
frame = DataFrame(data)
|
||||
pivoted = frame.pivot(
|
||||
index='index', columns='columns', values='values')
|
||||
|
||||
expected = DataFrame({
|
||||
'One': {'A': 1., 'B': 2., 'C': 3.},
|
||||
'Two': {'A': 1., 'B': 2., 'C': 3.}
|
||||
})
|
||||
|
||||
expected.index.name, expected.columns.name = 'index', 'columns'
|
||||
tm.assert_frame_equal(pivoted, expected)
|
||||
|
||||
# name tracking
|
||||
assert pivoted.index.name == 'index'
|
||||
assert pivoted.columns.name == 'columns'
|
||||
|
||||
# don't specify values
|
||||
pivoted = frame.pivot(index='index', columns='columns')
|
||||
assert pivoted.index.name == 'index'
|
||||
assert pivoted.columns.names == (None, 'columns')
|
||||
|
||||
with catch_warnings(record=True):
|
||||
# pivot multiple columns
|
||||
simplefilter("ignore", FutureWarning)
|
||||
wp = tm.makePanel()
|
||||
lp = wp.to_frame()
|
||||
df = lp.reset_index()
|
||||
tm.assert_frame_equal(df.pivot('major', 'minor'), lp.unstack())
|
||||
|
||||
def test_pivot_duplicates(self):
|
||||
data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'],
|
||||
'b': ['one', 'two', 'one', 'one', 'two'],
|
||||
'c': [1., 2., 3., 3., 4.]})
|
||||
with pytest.raises(ValueError, match='duplicate entries'):
|
||||
data.pivot('a', 'b', 'c')
|
||||
|
||||
def test_pivot_empty(self):
|
||||
df = DataFrame({}, columns=['a', 'b', 'c'])
|
||||
result = df.pivot('a', 'b', 'c')
|
||||
expected = DataFrame({})
|
||||
tm.assert_frame_equal(result, expected, check_names=False)
|
||||
|
||||
def test_pivot_integer_bug(self):
|
||||
df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")])
|
||||
|
||||
result = df.pivot(index=1, columns=0, values=2)
|
||||
repr(result)
|
||||
tm.assert_index_equal(result.columns, Index(['A', 'B'], name=0))
|
||||
|
||||
def test_pivot_index_none(self):
|
||||
# gh-3962
|
||||
data = {
|
||||
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
|
||||
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
|
||||
'values': [1., 2., 3., 3., 2., 1.]
|
||||
}
|
||||
|
||||
frame = DataFrame(data).set_index('index')
|
||||
result = frame.pivot(columns='columns', values='values')
|
||||
expected = DataFrame({
|
||||
'One': {'A': 1., 'B': 2., 'C': 3.},
|
||||
'Two': {'A': 1., 'B': 2., 'C': 3.}
|
||||
})
|
||||
|
||||
expected.index.name, expected.columns.name = 'index', 'columns'
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# omit values
|
||||
result = frame.pivot(columns='columns')
|
||||
|
||||
expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
|
||||
('values', 'Two')],
|
||||
names=[None, 'columns'])
|
||||
expected.index.name = 'index'
|
||||
tm.assert_frame_equal(result, expected, check_names=False)
|
||||
assert result.index.name == 'index'
|
||||
assert result.columns.names == (None, 'columns')
|
||||
expected.columns = expected.columns.droplevel(0)
|
||||
result = frame.pivot(columns='columns', values='values')
|
||||
|
||||
expected.columns.name = 'columns'
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_stack_unstack(self):
|
||||
df = self.frame.copy()
|
||||
df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)
|
||||
|
||||
stacked = df.stack()
|
||||
stacked_df = DataFrame({'foo': stacked, 'bar': stacked})
|
||||
|
||||
unstacked = stacked.unstack()
|
||||
unstacked_df = stacked_df.unstack()
|
||||
|
||||
assert_frame_equal(unstacked, df)
|
||||
assert_frame_equal(unstacked_df['bar'], df)
|
||||
|
||||
unstacked_cols = stacked.unstack(0)
|
||||
unstacked_cols_df = stacked_df.unstack(0)
|
||||
assert_frame_equal(unstacked_cols.T, df)
|
||||
assert_frame_equal(unstacked_cols_df['bar'].T, df)
|
||||
|
||||
def test_stack_mixed_level(self):
|
||||
# GH 18310
|
||||
levels = [range(3), [3, 'a', 'b'], [1, 2]]
|
||||
|
||||
# flat columns:
|
||||
df = DataFrame(1, index=levels[0], columns=levels[1])
|
||||
result = df.stack()
|
||||
expected = Series(1, index=MultiIndex.from_product(levels[:2]))
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# MultiIndex columns:
|
||||
df = DataFrame(1, index=levels[0],
|
||||
columns=MultiIndex.from_product(levels[1:]))
|
||||
result = df.stack(1)
|
||||
expected = DataFrame(1, index=MultiIndex.from_product([levels[0],
|
||||
levels[2]]),
|
||||
columns=levels[1])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# as above, but used labels in level are actually of homogeneous type
|
||||
result = df[['a', 'b']].stack(1)
|
||||
expected = expected[['a', 'b']]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_unstack_fill(self):
|
||||
|
||||
# GH #9746: fill_value keyword argument for Series
|
||||
# and DataFrame unstack
|
||||
|
||||
# From a series
|
||||
data = Series([1, 2, 4, 5], dtype=np.int16)
|
||||
data.index = MultiIndex.from_tuples(
|
||||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
||||
|
||||
result = data.unstack(fill_value=-1)
|
||||
expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
|
||||
index=['x', 'y', 'z'], dtype=np.int16)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# From a series with incorrect data type for fill_value
|
||||
result = data.unstack(fill_value=0.5)
|
||||
expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
|
||||
index=['x', 'y', 'z'], dtype=np.float)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# GH #13971: fill_value when unstacking multiple levels:
|
||||
df = DataFrame({'x': ['a', 'a', 'b'],
|
||||
'y': ['j', 'k', 'j'],
|
||||
'z': [0, 1, 2],
|
||||
'w': [0, 1, 2]}).set_index(['x', 'y', 'z'])
|
||||
unstacked = df.unstack(['x', 'y'], fill_value=0)
|
||||
key = ('w', 'b', 'j')
|
||||
expected = unstacked[key]
|
||||
result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
stacked = unstacked.stack(['x', 'y'])
|
||||
stacked.index = stacked.index.reorder_levels(df.index.names)
|
||||
# Workaround for GH #17886 (unnecessarily casts to float):
|
||||
stacked = stacked.astype(np.int64)
|
||||
result = stacked.loc[df.index]
|
||||
assert_frame_equal(result, df)
|
||||
|
||||
# From a series
|
||||
s = df['w']
|
||||
result = s.unstack(['x', 'y'], fill_value=0)
|
||||
expected = unstacked['w']
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_unstack_fill_frame(self):
|
||||
|
||||
# From a dataframe
|
||||
rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
|
||||
df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
|
||||
df.index = MultiIndex.from_tuples(
|
||||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
||||
|
||||
result = df.unstack(fill_value=-1)
|
||||
|
||||
rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
|
||||
expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
|
||||
expected.columns = MultiIndex.from_tuples(
|
||||
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# From a mixed type dataframe
|
||||
df['A'] = df['A'].astype(np.int16)
|
||||
df['B'] = df['B'].astype(np.float64)
|
||||
|
||||
result = df.unstack(fill_value=-1)
|
||||
expected['A'] = expected['A'].astype(np.int16)
|
||||
expected['B'] = expected['B'].astype(np.float64)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# From a dataframe with incorrect data type for fill_value
|
||||
result = df.unstack(fill_value=0.5)
|
||||
|
||||
rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
|
||||
expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
|
||||
expected.columns = MultiIndex.from_tuples(
|
||||
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_unstack_fill_frame_datetime(self):
|
||||
|
||||
# Test unstacking with date times
|
||||
dv = pd.date_range('2012-01-01', periods=4).values
|
||||
data = Series(dv)
|
||||
data.index = MultiIndex.from_tuples(
|
||||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
||||
|
||||
result = data.unstack()
|
||||
expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
|
||||
'b': [dv[1], dv[2], pd.NaT]},
|
||||
index=['x', 'y', 'z'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = data.unstack(fill_value=dv[0])
|
||||
expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
|
||||
'b': [dv[1], dv[2], dv[0]]},
|
||||
index=['x', 'y', 'z'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_unstack_fill_frame_timedelta(self):
|
||||
|
||||
# Test unstacking with time deltas
|
||||
td = [Timedelta(days=i) for i in range(4)]
|
||||
data = Series(td)
|
||||
data.index = MultiIndex.from_tuples(
|
||||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
||||
|
||||
result = data.unstack()
|
||||
expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
|
||||
'b': [td[1], td[2], pd.NaT]},
|
||||
index=['x', 'y', 'z'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = data.unstack(fill_value=td[1])
|
||||
expected = DataFrame({'a': [td[0], td[1], td[3]],
|
||||
'b': [td[1], td[2], td[1]]},
|
||||
index=['x', 'y', 'z'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_unstack_fill_frame_period(self):
|
||||
|
||||
# Test unstacking with period
|
||||
periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
|
||||
Period('2012-04')]
|
||||
data = Series(periods)
|
||||
data.index = MultiIndex.from_tuples(
|
||||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
||||
|
||||
result = data.unstack()
|
||||
expected = DataFrame({'a': [periods[0], None, periods[3]],
|
||||
'b': [periods[1], periods[2], None]},
|
||||
index=['x', 'y', 'z'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = data.unstack(fill_value=periods[1])
|
||||
expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
|
||||
'b': [periods[1], periods[2], periods[1]]},
|
||||
index=['x', 'y', 'z'])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_unstack_fill_frame_categorical(self):
|
||||
|
||||
# Test unstacking with categorical
|
||||
data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
|
||||
data.index = pd.MultiIndex.from_tuples(
|
||||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')],
|
||||
)
|
||||
|
||||
# By default missing values will be NaN
|
||||
result = data.unstack()
|
||||
expected = DataFrame({'a': pd.Categorical(list('axa'),
|
||||
categories=list('abc')),
|
||||
'b': pd.Categorical(list('bcx'),
|
||||
categories=list('abc'))},
|
||||
index=list('xyz'))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Fill with non-category results in a TypeError
|
||||
msg = r"'fill_value' \('d'\) is not in"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
data.unstack(fill_value='d')
|
||||
|
||||
# Fill with category value replaces missing values as expected
|
||||
result = data.unstack(fill_value='c')
|
||||
expected = DataFrame({'a': pd.Categorical(list('aca'),
|
||||
categories=list('abc')),
|
||||
'b': pd.Categorical(list('bcc'),
|
||||
categories=list('abc'))},
|
||||
index=list('xyz'))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_unstack_preserve_dtypes(self):
|
||||
# Checks fix for #11847
|
||||
df = pd.DataFrame(dict(state=['IL', 'MI', 'NC'],
|
||||
index=['a', 'b', 'c'],
|
||||
some_categories=pd.Series(['a', 'b', 'c']
|
||||
).astype('category'),
|
||||
A=np.random.rand(3),
|
||||
B=1,
|
||||
C='foo',
|
||||
D=pd.Timestamp('20010102'),
|
||||
E=pd.Series([1.0, 50.0, 100.0]
|
||||
).astype('float32'),
|
||||
F=pd.Series([3.0, 4.0, 5.0]).astype('float64'),
|
||||
G=False,
|
||||
H=pd.Series([1, 200, 923442], dtype='int8')))
|
||||
|
||||
def unstack_and_compare(df, column_name):
|
||||
unstacked1 = df.unstack([column_name])
|
||||
unstacked2 = df.unstack(column_name)
|
||||
assert_frame_equal(unstacked1, unstacked2)
|
||||
|
||||
df1 = df.set_index(['state', 'index'])
|
||||
unstack_and_compare(df1, 'index')
|
||||
|
||||
df1 = df.set_index(['state', 'some_categories'])
|
||||
unstack_and_compare(df1, 'some_categories')
|
||||
|
||||
df1 = df.set_index(['F', 'C'])
|
||||
unstack_and_compare(df1, 'F')
|
||||
|
||||
df1 = df.set_index(['G', 'B', 'state'])
|
||||
unstack_and_compare(df1, 'B')
|
||||
|
||||
df1 = df.set_index(['E', 'A'])
|
||||
unstack_and_compare(df1, 'E')
|
||||
|
||||
df1 = df.set_index(['state', 'index'])
|
||||
s = df1['A']
|
||||
unstack_and_compare(s, 'index')
|
||||
|
||||
def test_stack_ints(self):
|
||||
columns = MultiIndex.from_tuples(list(itertools.product(range(3),
|
||||
repeat=3)))
|
||||
df = DataFrame(np.random.randn(30, 27), columns=columns)
|
||||
|
||||
assert_frame_equal(df.stack(level=[1, 2]),
|
||||
df.stack(level=1).stack(level=1))
|
||||
assert_frame_equal(df.stack(level=[-2, -1]),
|
||||
df.stack(level=1).stack(level=1))
|
||||
|
||||
df_named = df.copy()
|
||||
df_named.columns.set_names(range(3), inplace=True)
|
||||
|
||||
assert_frame_equal(df_named.stack(level=[1, 2]),
|
||||
df_named.stack(level=1).stack(level=1))
|
||||
|
||||
def test_stack_mixed_levels(self):
|
||||
columns = MultiIndex.from_tuples(
|
||||
[('A', 'cat', 'long'), ('B', 'cat', 'long'),
|
||||
('A', 'dog', 'short'), ('B', 'dog', 'short')],
|
||||
names=['exp', 'animal', 'hair_length']
|
||||
)
|
||||
df = DataFrame(np.random.randn(4, 4), columns=columns)
|
||||
|
||||
animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
|
||||
exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
|
||||
|
||||
# GH #8584: Need to check that stacking works when a number
|
||||
# is passed that is both a level name and in the range of
|
||||
# the level numbers
|
||||
df2 = df.copy()
|
||||
df2.columns.names = ['exp', 'animal', 1]
|
||||
assert_frame_equal(df2.stack(level=['animal', 1]),
|
||||
animal_hair_stacked, check_names=False)
|
||||
assert_frame_equal(df2.stack(level=['exp', 1]),
|
||||
exp_hair_stacked, check_names=False)
|
||||
|
||||
# When mixed types are passed and the ints are not level
|
||||
# names, raise
|
||||
pytest.raises(ValueError, df2.stack, level=['animal', 0])
|
||||
|
||||
# GH #8584: Having 0 in the level names could raise a
|
||||
# strange error about lexsort depth
|
||||
df3 = df.copy()
|
||||
df3.columns.names = ['exp', 'animal', 0]
|
||||
assert_frame_equal(df3.stack(level=['animal', 0]),
|
||||
animal_hair_stacked, check_names=False)
|
||||
|
||||
def test_stack_int_level_names(self):
|
||||
columns = MultiIndex.from_tuples(
|
||||
[('A', 'cat', 'long'), ('B', 'cat', 'long'),
|
||||
('A', 'dog', 'short'), ('B', 'dog', 'short')],
|
||||
names=['exp', 'animal', 'hair_length']
|
||||
)
|
||||
df = DataFrame(np.random.randn(4, 4), columns=columns)
|
||||
|
||||
exp_animal_stacked = df.stack(level=['exp', 'animal'])
|
||||
animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
|
||||
exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
|
||||
|
||||
df2 = df.copy()
|
||||
df2.columns.names = [0, 1, 2]
|
||||
assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked,
|
||||
check_names=False)
|
||||
assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked,
|
||||
check_names=False)
|
||||
assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked,
|
||||
check_names=False)
|
||||
|
||||
# Out-of-order int column names
|
||||
df3 = df.copy()
|
||||
df3.columns.names = [2, 0, 1]
|
||||
assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked,
|
||||
check_names=False)
|
||||
assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked,
|
||||
check_names=False)
|
||||
assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked,
|
||||
check_names=False)
|
||||
|
||||
def test_unstack_bool(self):
|
||||
df = DataFrame([False, False],
|
||||
index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),
|
||||
columns=['col'])
|
||||
rs = df.unstack()
|
||||
xp = DataFrame(np.array([[False, np.nan], [np.nan, False]],
|
||||
dtype=object),
|
||||
index=['a', 'b'],
|
||||
columns=MultiIndex.from_arrays([['col', 'col'],
|
||||
['c', 'l']]))
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_unstack_level_binding(self):
|
||||
# GH9856
|
||||
mi = pd.MultiIndex(
|
||||
levels=[[u('foo'), u('bar')], [u('one'), u('two')],
|
||||
[u('a'), u('b')]],
|
||||
codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
|
||||
names=[u('first'), u('second'), u('third')])
|
||||
s = pd.Series(0, index=mi)
|
||||
result = s.unstack([1, 2]).stack(0)
|
||||
|
||||
expected_mi = pd.MultiIndex(
|
||||
levels=[['foo', 'bar'], ['one', 'two']],
|
||||
codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
|
||||
names=['first', 'second'])
|
||||
|
||||
expected = pd.DataFrame(np.array([[np.nan, 0],
|
||||
[0, np.nan],
|
||||
[np.nan, 0],
|
||||
[0, np.nan]],
|
||||
dtype=np.float64),
|
||||
index=expected_mi,
|
||||
columns=pd.Index(['a', 'b'], name='third'))
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_unstack_to_series(self):
|
||||
# check reversibility
|
||||
data = self.frame.unstack()
|
||||
|
||||
assert isinstance(data, Series)
|
||||
undo = data.unstack().T
|
||||
assert_frame_equal(undo, self.frame)
|
||||
|
||||
# check NA handling
|
||||
data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]})
|
||||
data.index = Index(['a', 'b', 'c'])
|
||||
result = data.unstack()
|
||||
|
||||
midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']],
|
||||
codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
|
||||
expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)
|
||||
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# check composability of unstack
|
||||
old_data = data.copy()
|
||||
for _ in range(4):
|
||||
data = data.unstack()
|
||||
assert_frame_equal(old_data, data)
|
||||
|
||||
def test_unstack_dtypes(self):
|
||||
|
||||
# GH 2929
|
||||
rows = [[1, 1, 3, 4],
|
||||
[1, 2, 3, 4],
|
||||
[2, 1, 3, 4],
|
||||
[2, 2, 3, 4]]
|
||||
|
||||
df = DataFrame(rows, columns=list('ABCD'))
|
||||
result = df.get_dtype_counts()
|
||||
expected = Series({'int64': 4})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# single dtype
|
||||
df2 = df.set_index(['A', 'B'])
|
||||
df3 = df2.unstack('B')
|
||||
result = df3.get_dtype_counts()
|
||||
expected = Series({'int64': 4})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# mixed
|
||||
df2 = df.set_index(['A', 'B'])
|
||||
df2['C'] = 3.
|
||||
df3 = df2.unstack('B')
|
||||
result = df3.get_dtype_counts()
|
||||
expected = Series({'int64': 2, 'float64': 2})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df2['D'] = 'foo'
|
||||
df3 = df2.unstack('B')
|
||||
result = df3.get_dtype_counts()
|
||||
expected = Series({'float64': 2, 'object': 2})
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# GH7405
|
||||
for c, d in (np.zeros(5), np.zeros(5)), \
|
||||
(np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')):
|
||||
|
||||
df = DataFrame({'A': ['a'] * 5, 'C': c, 'D': d,
|
||||
'B': pd.date_range('2012-01-01', periods=5)})
|
||||
|
||||
right = df.iloc[:3].copy(deep=True)
|
||||
|
||||
df = df.set_index(['A', 'B'])
|
||||
df['D'] = df['D'].astype('int64')
|
||||
|
||||
left = df.iloc[:3].unstack(0)
|
||||
right = right.set_index(['A', 'B']).unstack(0)
|
||||
right[('D', 'a')] = right[('D', 'a')].astype('int64')
|
||||
|
||||
assert left.shape == (3, 2)
|
||||
tm.assert_frame_equal(left, right)
|
||||
|
||||
def test_unstack_non_unique_index_names(self):
|
||||
idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
|
||||
names=['c1', 'c1'])
|
||||
df = DataFrame([1, 2], index=idx)
|
||||
with pytest.raises(ValueError):
|
||||
df.unstack('c1')
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
df.T.stack('c1')
|
||||
|
||||
def test_unstack_unused_levels(self):
|
||||
# GH 17845: unused codes in index make unstack() cast int to float
|
||||
idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]
|
||||
df = pd.DataFrame([[1, 0]] * 3, index=idx)
|
||||
|
||||
result = df.unstack()
|
||||
exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']])
|
||||
expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'],
|
||||
columns=exp_col)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
assert((result.columns.levels[1] == idx.levels[1]).all())
|
||||
|
||||
# Unused items on both levels
|
||||
levels = [[0, 1, 7], [0, 1, 2, 3]]
|
||||
codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
|
||||
idx = pd.MultiIndex(levels, codes)
|
||||
block = np.arange(4).reshape(2, 2)
|
||||
df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx)
|
||||
result = df.unstack()
|
||||
expected = pd.DataFrame(np.concatenate([block * 2, block * 2 + 1],
|
||||
axis=1),
|
||||
columns=idx)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
assert((result.columns.levels[1] == idx.levels[1]).all())
|
||||
|
||||
# With mixed dtype and NaN
|
||||
levels = [['a', 2, 'c'], [1, 3, 5, 7]]
|
||||
codes = [[0, -1, 1, 1], [0, 2, -1, 2]]
|
||||
idx = pd.MultiIndex(levels, codes)
|
||||
data = np.arange(8)
|
||||
df = pd.DataFrame(data.reshape(4, 2), index=idx)
|
||||
|
||||
cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11],
|
||||
[np.nan, 'a', 2], [np.nan, 5, 1]),
|
||||
(1, [8, 11, 1, 4, 12, 15, 13, 16],
|
||||
[np.nan, 5, 1], [np.nan, 'a', 2]))
|
||||
for level, idces, col_level, idx_level in cases:
|
||||
result = df.unstack(level=level)
|
||||
exp_data = np.zeros(18) * np.nan
|
||||
exp_data[idces] = data
|
||||
cols = pd.MultiIndex.from_product([[0, 1], col_level])
|
||||
expected = pd.DataFrame(exp_data.reshape(3, 6),
|
||||
index=idx_level, columns=cols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("cols", [['A', 'C'], slice(None)])
|
||||
def test_unstack_unused_level(self, cols):
|
||||
# GH 18562 : unused codes on the unstacked level
|
||||
df = pd.DataFrame([[2010, 'a', 'I'],
|
||||
[2011, 'b', 'II']],
|
||||
columns=['A', 'B', 'C'])
|
||||
|
||||
ind = df.set_index(['A', 'B', 'C'], drop=False)
|
||||
selection = ind.loc[(slice(None), slice(None), 'I'), cols]
|
||||
result = selection.unstack()
|
||||
|
||||
expected = ind.iloc[[0]][cols]
|
||||
expected.columns = MultiIndex.from_product([expected.columns, ['I']],
|
||||
names=[None, 'C'])
|
||||
expected.index = expected.index.droplevel('C')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_unstack_nan_index(self): # GH7466
|
||||
cast = lambda val: '{0:1}'.format('' if val != val else val)
|
||||
|
||||
def verify(df):
|
||||
mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
|
||||
rows, cols = df.notna().values.nonzero()
|
||||
for i, j in zip(rows, cols):
|
||||
left = sorted(df.iloc[i, j].split('.'))
|
||||
right = mk_list(df.index[i]) + mk_list(df.columns[j])
|
||||
right = sorted(list(map(cast, right)))
|
||||
assert left == right
|
||||
|
||||
df = DataFrame({'jim': ['a', 'b', np.nan, 'd'],
|
||||
'joe': ['w', 'x', 'y', 'z'],
|
||||
'jolie': ['a.w', 'b.x', ' .y', 'd.z']})
|
||||
|
||||
left = df.set_index(['jim', 'joe']).unstack()['jolie']
|
||||
right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
for idx in itertools.permutations(df.columns[:2]):
|
||||
mi = df.set_index(list(idx))
|
||||
for lev in range(2):
|
||||
udf = mi.unstack(level=lev)
|
||||
assert udf.notna().values.sum() == len(df)
|
||||
verify(udf['jolie'])
|
||||
|
||||
df = DataFrame({'1st': ['d'] * 3 + [np.nan] * 5 + ['a'] * 2 +
|
||||
['c'] * 3 + ['e'] * 2 + ['b'] * 5,
|
||||
'2nd': ['y'] * 2 + ['w'] * 3 + [np.nan] * 3 +
|
||||
['z'] * 4 + [np.nan] * 3 + ['x'] * 3 + [np.nan] * 2,
|
||||
'3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59,
|
||||
50, 62, 59, 76, 52, 14, 53, 60, 51]})
|
||||
|
||||
df['4th'], df['5th'] = \
|
||||
df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \
|
||||
df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1)
|
||||
|
||||
for idx in itertools.permutations(['1st', '2nd', '3rd']):
|
||||
mi = df.set_index(list(idx))
|
||||
for lev in range(3):
|
||||
udf = mi.unstack(level=lev)
|
||||
assert udf.notna().values.sum() == 2 * len(df)
|
||||
for col in ['4th', '5th']:
|
||||
verify(udf[col])
|
||||
|
||||
# GH7403
|
||||
df = pd.DataFrame(
|
||||
{'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)})
|
||||
df.iloc[3, 1] = np.NaN
|
||||
left = df.set_index(['A', 'B']).unstack(0)
|
||||
|
||||
vals = [[3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan],
|
||||
[np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7]]
|
||||
vals = list(map(list, zip(*vals)))
|
||||
idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name='B')
|
||||
cols = MultiIndex(levels=[['C'], ['a', 'b']],
|
||||
codes=[[0, 0], [0, 1]],
|
||||
names=[None, 'A'])
|
||||
|
||||
right = DataFrame(vals, columns=cols, index=idx)
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
|
||||
'C': range(8)})
|
||||
df.iloc[2, 1] = np.NaN
|
||||
left = df.set_index(['A', 'B']).unstack(0)
|
||||
|
||||
vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]]
|
||||
cols = MultiIndex(levels=[['C'], ['a', 'b']],
|
||||
codes=[[0, 0], [0, 1]],
|
||||
names=[None, 'A'])
|
||||
idx = Index([np.nan, 0, 1, 2, 3], name='B')
|
||||
right = DataFrame(vals, columns=cols, index=idx)
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
|
||||
'C': range(8)})
|
||||
df.iloc[3, 1] = np.NaN
|
||||
left = df.set_index(['A', 'B']).unstack(0)
|
||||
|
||||
vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]]
|
||||
cols = MultiIndex(levels=[['C'], ['a', 'b']],
|
||||
codes=[[0, 0], [0, 1]],
|
||||
names=[None, 'A'])
|
||||
idx = Index([np.nan, 0, 1, 2, 3], name='B')
|
||||
right = DataFrame(vals, columns=cols, index=idx)
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
# GH7401
|
||||
df = pd.DataFrame({'A': list('aaaaabbbbb'),
|
||||
'B': (date_range('2012-01-01', periods=5)
|
||||
.tolist() * 2),
|
||||
'C': np.arange(10)})
|
||||
|
||||
df.iloc[3, 1] = np.NaN
|
||||
left = df.set_index(['A', 'B']).unstack()
|
||||
|
||||
vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]])
|
||||
idx = Index(['a', 'b'], name='A')
|
||||
cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)],
|
||||
codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
|
||||
names=[None, 'B'])
|
||||
|
||||
right = DataFrame(vals, columns=cols, index=idx)
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
# GH4862
|
||||
vals = [['Hg', np.nan, np.nan, 680585148],
|
||||
['U', 0.0, np.nan, 680585148],
|
||||
['Pb', 7.07e-06, np.nan, 680585148],
|
||||
['Sn', 2.3614e-05, 0.0133, 680607017],
|
||||
['Ag', 0.0, 0.0133, 680607017],
|
||||
['Hg', -0.00015, 0.0133, 680607017]]
|
||||
df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'],
|
||||
index=[17263, 17264, 17265, 17266, 17267, 17268])
|
||||
|
||||
left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack()
|
||||
|
||||
vals = [[np.nan, np.nan, 7.07e-06, np.nan, 0.0],
|
||||
[0.0, -0.00015, np.nan, 2.3614e-05, np.nan]]
|
||||
|
||||
idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]],
|
||||
codes=[[0, 1], [-1, 0]],
|
||||
names=['s_id', 'dosage'])
|
||||
|
||||
cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']],
|
||||
codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
|
||||
names=[None, 'agent'])
|
||||
|
||||
right = DataFrame(vals, columns=cols, index=idx)
|
||||
assert_frame_equal(left, right)
|
||||
|
||||
left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent'])
|
||||
assert_frame_equal(left.unstack(), right)
|
||||
|
||||
# GH9497 - multiple unstack with nulls
|
||||
df = DataFrame({'1st': [1, 2, 1, 2, 1, 2],
|
||||
'2nd': pd.date_range('2014-02-01', periods=6,
|
||||
freq='D'),
|
||||
'jim': 100 + np.arange(6),
|
||||
'joe': (np.random.randn(6) * 10).round(2)})
|
||||
|
||||
df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
|
||||
df.loc[1, '2nd'] = df.loc[3, '2nd'] = np.nan
|
||||
df.loc[1, '3rd'] = df.loc[4, '3rd'] = np.nan
|
||||
|
||||
left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
|
||||
assert left.notna().values.sum() == 2 * len(df)
|
||||
|
||||
for col in ['jim', 'joe']:
|
||||
for _, r in df.iterrows():
|
||||
key = r['1st'], (col, r['2nd'], r['3rd'])
|
||||
assert r[col] == left.loc[key]
|
||||
|
||||
def test_stack_datetime_column_multiIndex(self):
|
||||
# GH 8039
|
||||
t = datetime(2014, 1, 1)
|
||||
df = DataFrame(
|
||||
[1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, 'A', 'B')]))
|
||||
result = df.stack()
|
||||
|
||||
eidx = MultiIndex.from_product([(0, 1, 2, 3), ('B',)])
|
||||
ecols = MultiIndex.from_tuples([(t, 'A')])
|
||||
expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_stack_partial_multiIndex(self):
|
||||
# GH 8844
|
||||
def _test_stack_with_multiindex(multiindex):
|
||||
df = DataFrame(np.arange(3 * len(multiindex))
|
||||
.reshape(3, len(multiindex)),
|
||||
columns=multiindex)
|
||||
for level in (-1, 0, 1, [0, 1], [1, 0]):
|
||||
result = df.stack(level=level, dropna=False)
|
||||
|
||||
if isinstance(level, int):
|
||||
# Stacking a single level should not make any all-NaN rows,
|
||||
# so df.stack(level=level, dropna=False) should be the same
|
||||
# as df.stack(level=level, dropna=True).
|
||||
expected = df.stack(level=level, dropna=True)
|
||||
if isinstance(expected, Series):
|
||||
assert_series_equal(result, expected)
|
||||
else:
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df.columns = MultiIndex.from_tuples(df.columns.get_values(),
|
||||
names=df.columns.names)
|
||||
expected = df.stack(level=level, dropna=False)
|
||||
if isinstance(expected, Series):
|
||||
assert_series_equal(result, expected)
|
||||
else:
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'),
|
||||
('A', 'y'),
|
||||
('C', 'x'), ('C', 'u')],
|
||||
names=['Upper', 'Lower'])
|
||||
for multiindex_columns in ([0, 1, 2, 3, 4],
|
||||
[0, 1, 2, 3], [0, 1, 2, 4],
|
||||
[0, 1, 2], [1, 2, 3], [2, 3, 4],
|
||||
[0, 1], [0, 2], [0, 3],
|
||||
[0], [2], [4]):
|
||||
_test_stack_with_multiindex(full_multiindex[multiindex_columns])
|
||||
if len(multiindex_columns) > 1:
|
||||
multiindex_columns.reverse()
|
||||
_test_stack_with_multiindex(
|
||||
full_multiindex[multiindex_columns])
|
||||
|
||||
df = DataFrame(np.arange(6).reshape(2, 3),
|
||||
columns=full_multiindex[[0, 1, 3]])
|
||||
result = df.stack(dropna=False)
|
||||
expected = DataFrame([[0, 2], [1, np.nan], [3, 5], [4, np.nan]],
|
||||
index=MultiIndex(
|
||||
levels=[[0, 1], ['u', 'x', 'y', 'z']],
|
||||
codes=[[0, 0, 1, 1],
|
||||
[1, 3, 1, 3]],
|
||||
names=[None, 'Lower']),
|
||||
columns=Index(['B', 'C'], name='Upper'),
|
||||
dtype=df.dtypes[0])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('ordered', [False, True])
|
||||
@pytest.mark.parametrize('labels', [list("yxz"), list("yxy")])
|
||||
def test_stack_preserve_categorical_dtype(self, ordered, labels):
|
||||
# GH13854
|
||||
cidx = pd.CategoricalIndex(labels, categories=list("xyz"),
|
||||
ordered=ordered)
|
||||
df = DataFrame([[10, 11, 12]], columns=cidx)
|
||||
result = df.stack()
|
||||
|
||||
# `MutliIndex.from_product` preserves categorical dtype -
|
||||
# it's tested elsewhere.
|
||||
midx = pd.MultiIndex.from_product([df.index, cidx])
|
||||
expected = Series([10, 11, 12], index=midx)
|
||||
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_stack_preserve_categorical_dtype_values(self):
|
||||
# GH-23077
|
||||
cat = pd.Categorical(['a', 'a', 'b', 'c'])
|
||||
df = pd.DataFrame({"A": cat, "B": cat})
|
||||
result = df.stack()
|
||||
index = pd.MultiIndex.from_product([[0, 1, 2, 3], ['A', 'B']])
|
||||
expected = pd.Series(pd.Categorical(['a', 'a', 'a', 'a',
|
||||
'b', 'b', 'c', 'c']),
|
||||
index=index)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('level', [0, 1])
|
||||
def test_unstack_mixed_extension_types(self, level):
|
||||
index = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 1)],
|
||||
names=['a', 'b'])
|
||||
df = pd.DataFrame({"A": pd.core.arrays.integer_array([0, 1, None]),
|
||||
"B": pd.Categorical(['a', 'a', 'b'])}, index=index)
|
||||
|
||||
result = df.unstack(level=level)
|
||||
expected = df.astype(object).unstack(level=level)
|
||||
|
||||
expected_dtypes = pd.Series([df.A.dtype] * 2 + [df.B.dtype] * 2,
|
||||
index=result.columns)
|
||||
tm.assert_series_equal(result.dtypes, expected_dtypes)
|
||||
tm.assert_frame_equal(result.astype(object), expected)
|
||||
|
||||
@pytest.mark.parametrize("level", [0, 'baz'])
|
||||
def test_unstack_swaplevel_sortlevel(self, level):
|
||||
# GH 20994
|
||||
mi = pd.MultiIndex.from_product([[0], ['d', 'c']],
|
||||
names=['bar', 'baz'])
|
||||
df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A'])
|
||||
df.columns.name = 'foo'
|
||||
|
||||
expected = pd.DataFrame([
|
||||
[3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([
|
||||
('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[
|
||||
'baz', 'foo']))
|
||||
expected.index.name = 'bar'
|
||||
|
||||
result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_unstack_fill_frame_object():
|
||||
# GH12815 Test unstacking with object.
|
||||
data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')
|
||||
data.index = pd.MultiIndex.from_tuples(
|
||||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
||||
|
||||
# By default missing values will be NaN
|
||||
result = data.unstack()
|
||||
expected = pd.DataFrame(
|
||||
{'a': ['a', np.nan, 'a'], 'b': ['b', 'c', np.nan]},
|
||||
index=list('xyz')
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Fill with any value replaces missing values as expected
|
||||
result = data.unstack(fill_value='d')
|
||||
expected = pd.DataFrame(
|
||||
{'a': ['a', 'd', 'a'], 'b': ['b', 'c', 'd']},
|
||||
index=list('xyz')
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_unstack_timezone_aware_values():
|
||||
# GH 18338
|
||||
df = pd.DataFrame({
|
||||
'timestamp': [
|
||||
pd.Timestamp('2017-08-27 01:00:00.709949+0000', tz='UTC')],
|
||||
'a': ['a'],
|
||||
'b': ['b'],
|
||||
'c': ['c'],
|
||||
}, columns=['timestamp', 'a', 'b', 'c'])
|
||||
result = df.set_index(['a', 'b']).unstack()
|
||||
expected = pd.DataFrame([[pd.Timestamp('2017-08-27 01:00:00.709949+0000',
|
||||
tz='UTC'),
|
||||
'c']],
|
||||
index=pd.Index(['a'], name='a'),
|
||||
columns=pd.MultiIndex(
|
||||
levels=[['timestamp', 'c'], ['b']],
|
||||
codes=[[0, 1], [0, 0]],
|
||||
names=[None, 'b']))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_stack_timezone_aware_values():
|
||||
# GH 19420
|
||||
ts = pd.date_range(freq="D", start="20180101", end="20180103",
|
||||
tz="America/New_York")
|
||||
df = pd.DataFrame({"A": ts}, index=["a", "b", "c"])
|
||||
result = df.stack()
|
||||
expected = pd.Series(ts,
|
||||
index=pd.MultiIndex(levels=[['a', 'b', 'c'], ['A']],
|
||||
codes=[[0, 1, 2], [0, 0, 0]]))
|
||||
assert_series_equal(result, expected)
|
||||
+96
@@ -0,0 +1,96 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import PerformanceWarning
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.util import testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_none():
|
||||
return DataFrame({
|
||||
'outer': ['a', 'a', 'a', 'b', 'b', 'b'],
|
||||
'inner': [1, 2, 2, 2, 1, 1],
|
||||
'A': np.arange(6, 0, -1),
|
||||
('B', 5): ['one', 'one', 'two', 'two', 'one', 'one']})
|
||||
|
||||
|
||||
@pytest.fixture(params=[
|
||||
['outer'],
|
||||
['outer', 'inner']
|
||||
])
|
||||
def df_idx(request, df_none):
|
||||
levels = request.param
|
||||
return df_none.set_index(levels)
|
||||
|
||||
|
||||
@pytest.fixture(params=[
|
||||
'inner', # index level
|
||||
['outer'], # list of index level
|
||||
'A', # column
|
||||
[('B', 5)], # list of column
|
||||
['inner', 'outer'], # two index levels
|
||||
[('B', 5), 'outer'], # index level and column
|
||||
['A', ('B', 5)], # Two columns
|
||||
['inner', 'outer'] # two index levels and column
|
||||
])
|
||||
def sort_names(request):
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
|
||||
def ascending(request):
|
||||
return request.param
|
||||
|
||||
|
||||
def test_sort_index_level_and_column_label(
|
||||
df_none, df_idx, sort_names, ascending):
|
||||
|
||||
# GH 14353
|
||||
|
||||
# Get index levels from df_idx
|
||||
levels = df_idx.index.names
|
||||
|
||||
# Compute expected by sorting on columns and the setting index
|
||||
expected = df_none.sort_values(by=sort_names,
|
||||
ascending=ascending,
|
||||
axis=0).set_index(levels)
|
||||
|
||||
# Compute result sorting on mix on columns and index levels
|
||||
result = df_idx.sort_values(by=sort_names,
|
||||
ascending=ascending,
|
||||
axis=0)
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_sort_column_level_and_index_label(
|
||||
df_none, df_idx, sort_names, ascending):
|
||||
|
||||
# GH 14353
|
||||
|
||||
# Get levels from df_idx
|
||||
levels = df_idx.index.names
|
||||
|
||||
# Compute expected by sorting on axis=0, setting index levels, and then
|
||||
# transposing. For some cases this will result in a frame with
|
||||
# multiple column levels
|
||||
expected = df_none.sort_values(by=sort_names,
|
||||
ascending=ascending,
|
||||
axis=0).set_index(levels).T
|
||||
|
||||
# Compute result by transposing and sorting on axis=1.
|
||||
result = df_idx.T.sort_values(by=sort_names,
|
||||
ascending=ascending,
|
||||
axis=1)
|
||||
|
||||
if len(levels) > 1:
|
||||
# Accessing multi-level columns that are not lexsorted raises a
|
||||
# performance warning
|
||||
with tm.assert_produces_warning(PerformanceWarning,
|
||||
check_stacklevel=False):
|
||||
assert_frame_equal(result, expected)
|
||||
else:
|
||||
assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,670 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import lrange
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical, DataFrame, IntervalIndex, MultiIndex, NaT, Series, Timestamp,
|
||||
date_range)
|
||||
from pandas.api.types import CategoricalDtype
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestDataFrameSorting(TestData):
|
||||
|
||||
def test_sort_values(self):
|
||||
frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]],
|
||||
index=[1, 2, 3], columns=list('ABC'))
|
||||
|
||||
# by column (axis=0)
|
||||
sorted_df = frame.sort_values(by='A')
|
||||
indexer = frame['A'].argsort().values
|
||||
expected = frame.loc[frame.index[indexer]]
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by='A', ascending=False)
|
||||
indexer = indexer[::-1]
|
||||
expected = frame.loc[frame.index[indexer]]
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by='A', ascending=False)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# GH4839
|
||||
sorted_df = frame.sort_values(by=['A'], ascending=[False])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# multiple bys
|
||||
sorted_df = frame.sort_values(by=['B', 'C'])
|
||||
expected = frame.loc[[2, 1, 3]]
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by=['B', 'C'], ascending=False)
|
||||
assert_frame_equal(sorted_df, expected[::-1])
|
||||
|
||||
sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
pytest.raises(ValueError, lambda: frame.sort_values(
|
||||
by=['A', 'B'], axis=2, inplace=True))
|
||||
|
||||
# by row (axis=1): GH 10806
|
||||
sorted_df = frame.sort_values(by=3, axis=1)
|
||||
expected = frame
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
|
||||
expected = frame.reindex(columns=['C', 'B', 'A'])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by=[1, 2], axis='columns')
|
||||
expected = frame.reindex(columns=['B', 'A', 'C'])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by=[1, 3], axis=1,
|
||||
ascending=[True, False])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
|
||||
expected = frame.reindex(columns=['C', 'B', 'A'])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
msg = r'Length of ascending \(5\) != length of by \(2\)'
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5)
|
||||
|
||||
def test_sort_values_inplace(self):
|
||||
frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
|
||||
columns=['A', 'B', 'C', 'D'])
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by='A', inplace=True)
|
||||
expected = frame.sort_values(by='A')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by=1, axis=1, inplace=True)
|
||||
expected = frame.sort_values(by=1, axis=1)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by='A', ascending=False, inplace=True)
|
||||
expected = frame.sort_values(by='A', ascending=False)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True)
|
||||
expected = frame.sort_values(by=['A', 'B'], ascending=False)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_nan(self):
|
||||
# GH3917
|
||||
nan = np.nan
|
||||
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
|
||||
'B': [9, nan, 5, 2, 5, 4, 5]})
|
||||
|
||||
# sort one column only
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 1, 1, 2, 4, 6, 8],
|
||||
'B': [5, 9, 2, nan, 5, 5, 4]},
|
||||
index=[2, 0, 3, 1, 6, 4, 5])
|
||||
sorted_df = df.sort_values(['A'], na_position='first')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 8, 6, 4, 2, 1, 1],
|
||||
'B': [5, 4, 5, 5, nan, 9, 2]},
|
||||
index=[2, 5, 4, 6, 1, 0, 3])
|
||||
sorted_df = df.sort_values(['A'], na_position='first', ascending=False)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
expected = df.reindex(columns=['B', 'A'])
|
||||
sorted_df = df.sort_values(by=1, axis=1, na_position='first')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='last', order
|
||||
expected = DataFrame(
|
||||
{'A': [1, 1, 2, 4, 6, 8, nan],
|
||||
'B': [2, 9, nan, 5, 5, 4, 5]},
|
||||
index=[3, 0, 1, 6, 4, 5, 2])
|
||||
sorted_df = df.sort_values(['A', 'B'])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='first', order
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 1, 1, 2, 4, 6, 8],
|
||||
'B': [5, 2, 9, nan, 5, 5, 4]},
|
||||
index=[2, 3, 0, 1, 6, 4, 5])
|
||||
sorted_df = df.sort_values(['A', 'B'], na_position='first')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='first', not order
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 1, 1, 2, 4, 6, 8],
|
||||
'B': [5, 9, 2, nan, 5, 5, 4]},
|
||||
index=[2, 0, 3, 1, 6, 4, 5])
|
||||
sorted_df = df.sort_values(['A', 'B'], ascending=[
|
||||
1, 0], na_position='first')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# na_position='last', not order
|
||||
expected = DataFrame(
|
||||
{'A': [8, 6, 4, 2, 1, 1, nan],
|
||||
'B': [4, 5, 5, nan, 2, 9, 5]},
|
||||
index=[5, 4, 6, 1, 3, 0, 2])
|
||||
sorted_df = df.sort_values(['A', 'B'], ascending=[
|
||||
0, 1], na_position='last')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# Test DataFrame with nan label
|
||||
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
|
||||
'B': [9, nan, 5, 2, 5, 4, 5]},
|
||||
index=[1, 2, 3, 4, 5, 6, nan])
|
||||
|
||||
# NaN label, ascending=True, na_position='last'
|
||||
sorted_df = df.sort_index(
|
||||
kind='quicksort', ascending=True, na_position='last')
|
||||
expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
|
||||
'B': [9, nan, 5, 2, 5, 4, 5]},
|
||||
index=[1, 2, 3, 4, 5, 6, nan])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# NaN label, ascending=True, na_position='first'
|
||||
sorted_df = df.sort_index(na_position='first')
|
||||
expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8],
|
||||
'B': [5, 9, nan, 5, 2, 5, 4]},
|
||||
index=[nan, 1, 2, 3, 4, 5, 6])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# NaN label, ascending=False, na_position='last'
|
||||
sorted_df = df.sort_index(kind='quicksort', ascending=False)
|
||||
expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4],
|
||||
'B': [4, 5, 2, 5, nan, 9, 5]},
|
||||
index=[6, 5, 4, 3, 2, 1, nan])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
# NaN label, ascending=False, na_position='first'
|
||||
sorted_df = df.sort_index(
|
||||
kind='quicksort', ascending=False, na_position='first')
|
||||
expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1],
|
||||
'B': [5, 4, 5, 2, 5, nan, 9]},
|
||||
index=[nan, 6, 5, 4, 3, 2, 1])
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_stable_descending_sort(self):
|
||||
# GH #6399
|
||||
df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']],
|
||||
columns=['sort_col', 'order'])
|
||||
sorted_df = df.sort_values(by='sort_col', kind='mergesort',
|
||||
ascending=False)
|
||||
assert_frame_equal(df, sorted_df)
|
||||
|
||||
def test_stable_descending_multicolumn_sort(self):
|
||||
nan = np.nan
|
||||
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
|
||||
'B': [9, nan, 5, 2, 5, 4, 5]})
|
||||
# test stable mergesort
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 8, 6, 4, 2, 1, 1],
|
||||
'B': [5, 4, 5, 5, nan, 2, 9]},
|
||||
index=[2, 5, 4, 6, 1, 3, 0])
|
||||
sorted_df = df.sort_values(['A', 'B'], ascending=[0, 1],
|
||||
na_position='first',
|
||||
kind='mergesort')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
{'A': [nan, 8, 6, 4, 2, 1, 1],
|
||||
'B': [5, 4, 5, 5, nan, 9, 2]},
|
||||
index=[2, 5, 4, 6, 1, 0, 3])
|
||||
sorted_df = df.sort_values(['A', 'B'], ascending=[0, 0],
|
||||
na_position='first',
|
||||
kind='mergesort')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_stable_categorial(self):
|
||||
# GH 16793
|
||||
df = DataFrame({
|
||||
'x': pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)
|
||||
})
|
||||
expected = df.copy()
|
||||
sorted_df = df.sort_values('x', kind='mergesort')
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_datetimes(self):
|
||||
|
||||
# GH 3461, argsort / lexsort differences for a datetime column
|
||||
df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],
|
||||
columns=['A'],
|
||||
index=date_range('20130101', periods=9))
|
||||
dts = [Timestamp(x)
|
||||
for x in ['2004-02-11', '2004-01-21', '2004-01-26',
|
||||
'2005-09-20', '2010-10-04', '2009-05-12',
|
||||
'2008-11-12', '2010-09-28', '2010-09-28']]
|
||||
df['B'] = dts[::2] + dts[1::2]
|
||||
df['C'] = 2.
|
||||
df['A1'] = 3.
|
||||
|
||||
df1 = df.sort_values(by='A')
|
||||
df2 = df.sort_values(by=['A'])
|
||||
assert_frame_equal(df1, df2)
|
||||
|
||||
df1 = df.sort_values(by='B')
|
||||
df2 = df.sort_values(by=['B'])
|
||||
assert_frame_equal(df1, df2)
|
||||
|
||||
df1 = df.sort_values(by='B')
|
||||
|
||||
df2 = df.sort_values(by=['C', 'B'])
|
||||
assert_frame_equal(df1, df2)
|
||||
|
||||
def test_frame_column_inplace_sort_exception(self):
|
||||
s = self.frame['A']
|
||||
with pytest.raises(ValueError, match="This Series is a view"):
|
||||
s.sort_values(inplace=True)
|
||||
|
||||
cp = s.copy()
|
||||
cp.sort_values() # it works!
|
||||
|
||||
def test_sort_nat_values_in_int_column(self):
|
||||
|
||||
# GH 14922: "sorting with large float and multiple columns incorrect"
|
||||
|
||||
# cause was that the int64 value NaT was considered as "na". Which is
|
||||
# only correct for datetime64 columns.
|
||||
|
||||
int_values = (2, int(NaT))
|
||||
float_values = (2.0, -1.797693e308)
|
||||
|
||||
df = DataFrame(dict(int=int_values, float=float_values),
|
||||
columns=["int", "float"])
|
||||
|
||||
df_reversed = DataFrame(dict(int=int_values[::-1],
|
||||
float=float_values[::-1]),
|
||||
columns=["int", "float"],
|
||||
index=[1, 0])
|
||||
|
||||
# NaT is not a "na" for int64 columns, so na_position must not
|
||||
# influence the result:
|
||||
df_sorted = df.sort_values(["int", "float"], na_position="last")
|
||||
assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
df_sorted = df.sort_values(["int", "float"], na_position="first")
|
||||
assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
# reverse sorting order
|
||||
df_sorted = df.sort_values(["int", "float"], ascending=False)
|
||||
assert_frame_equal(df_sorted, df)
|
||||
|
||||
# and now check if NaT is still considered as "na" for datetime64
|
||||
# columns:
|
||||
df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT],
|
||||
float=float_values), columns=["datetime", "float"])
|
||||
|
||||
df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")],
|
||||
float=float_values[::-1]),
|
||||
columns=["datetime", "float"],
|
||||
index=[1, 0])
|
||||
|
||||
df_sorted = df.sort_values(["datetime", "float"], na_position="first")
|
||||
assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
df_sorted = df.sort_values(["datetime", "float"], na_position="last")
|
||||
assert_frame_equal(df_sorted, df)
|
||||
|
||||
# Ascending should not affect the results.
|
||||
df_sorted = df.sort_values(["datetime", "float"], ascending=False)
|
||||
assert_frame_equal(df_sorted, df)
|
||||
|
||||
def test_sort_nat(self):
|
||||
|
||||
# GH 16836
|
||||
|
||||
d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01',
|
||||
np.nan, '2016-01-01']]
|
||||
d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01',
|
||||
'2016-01-01', '2015-01-01']]
|
||||
df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3])
|
||||
|
||||
d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01',
|
||||
'2016-01-01', np.nan]]
|
||||
d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01',
|
||||
'2017-01-01', '2016-01-01']]
|
||||
expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2])
|
||||
sorted_df = df.sort_values(by=['a', 'b'], )
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
|
||||
class TestDataFrameSortIndexKinds(TestData):
|
||||
|
||||
def test_sort_index_multicolumn(self):
|
||||
A = np.arange(5).repeat(20)
|
||||
B = np.tile(np.arange(5), 20)
|
||||
random.shuffle(A)
|
||||
random.shuffle(B)
|
||||
frame = DataFrame({'A': A, 'B': B,
|
||||
'C': np.random.randn(100)})
|
||||
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
frame.sort_index(by=['A', 'B'])
|
||||
result = frame.sort_values(by=['A', 'B'])
|
||||
indexer = np.lexsort((frame['B'], frame['A']))
|
||||
expected = frame.take(indexer)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
frame.sort_index(by=['A', 'B'], ascending=False)
|
||||
result = frame.sort_values(by=['A', 'B'], ascending=False)
|
||||
indexer = np.lexsort((frame['B'].rank(ascending=False),
|
||||
frame['A'].rank(ascending=False)))
|
||||
expected = frame.take(indexer)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
frame.sort_index(by=['B', 'A'])
|
||||
result = frame.sort_values(by=['B', 'A'])
|
||||
indexer = np.lexsort((frame['A'], frame['B']))
|
||||
expected = frame.take(indexer)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_index_inplace(self):
|
||||
frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
|
||||
columns=['A', 'B', 'C', 'D'])
|
||||
|
||||
# axis=0
|
||||
unordered = frame.loc[[3, 2, 4, 1]]
|
||||
a_id = id(unordered['A'])
|
||||
df = unordered.copy()
|
||||
df.sort_index(inplace=True)
|
||||
expected = frame
|
||||
assert_frame_equal(df, expected)
|
||||
assert a_id != id(df['A'])
|
||||
|
||||
df = unordered.copy()
|
||||
df.sort_index(ascending=False, inplace=True)
|
||||
expected = frame[::-1]
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
# axis=1
|
||||
unordered = frame.loc[:, ['D', 'B', 'C', 'A']]
|
||||
df = unordered.copy()
|
||||
df.sort_index(axis=1, inplace=True)
|
||||
expected = frame
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
df = unordered.copy()
|
||||
df.sort_index(axis=1, ascending=False, inplace=True)
|
||||
expected = frame.iloc[:, ::-1]
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_sort_index_different_sortorder(self):
|
||||
A = np.arange(20).repeat(5)
|
||||
B = np.tile(np.arange(5), 20)
|
||||
|
||||
indexer = np.random.permutation(100)
|
||||
A = A.take(indexer)
|
||||
B = B.take(indexer)
|
||||
|
||||
df = DataFrame({'A': A, 'B': B,
|
||||
'C': np.random.randn(100)})
|
||||
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by=['A', 'B'], ascending=[1, 0])
|
||||
result = df.sort_values(by=['A', 'B'], ascending=[1, 0])
|
||||
|
||||
ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
|
||||
expected = df.take(ex_indexer)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# test with multiindex, too
|
||||
idf = df.set_index(['A', 'B'])
|
||||
|
||||
result = idf.sort_index(ascending=[1, 0])
|
||||
expected = idf.take(ex_indexer)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# also, Series!
|
||||
result = idf['C'].sort_index(ascending=[1, 0])
|
||||
assert_series_equal(result, expected['C'])
|
||||
|
||||
def test_sort_index_duplicates(self):
|
||||
|
||||
# with 9816, these are all translated to .sort_values
|
||||
|
||||
df = DataFrame([lrange(5, 9), lrange(4)],
|
||||
columns=['a', 'a', 'b', 'b'])
|
||||
|
||||
with pytest.raises(ValueError, match='not unique'):
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by='a')
|
||||
with pytest.raises(ValueError, match='not unique'):
|
||||
df.sort_values(by='a')
|
||||
|
||||
with pytest.raises(ValueError, match='not unique'):
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by=['a'])
|
||||
with pytest.raises(ValueError, match='not unique'):
|
||||
df.sort_values(by=['a'])
|
||||
|
||||
with pytest.raises(ValueError, match='not unique'):
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
# multi-column 'by' is separate codepath
|
||||
df.sort_index(by=['a', 'b'])
|
||||
with pytest.raises(ValueError, match='not unique'):
|
||||
# multi-column 'by' is separate codepath
|
||||
df.sort_values(by=['a', 'b'])
|
||||
|
||||
# with multi-index
|
||||
# GH4370
|
||||
df = DataFrame(np.random.randn(4, 2),
|
||||
columns=MultiIndex.from_tuples([('a', 0), ('a', 1)]))
|
||||
with pytest.raises(ValueError, match='level'):
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by='a')
|
||||
with pytest.raises(ValueError, match='level'):
|
||||
df.sort_values(by='a')
|
||||
|
||||
# convert tuples to a list of tuples
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by=[('a', 1)])
|
||||
expected = df.sort_values(by=[('a', 1)])
|
||||
|
||||
# use .sort_values #9816
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.sort_index(by=('a', 1))
|
||||
result = df.sort_values(by=('a', 1))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_index_level(self):
|
||||
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
|
||||
df = DataFrame([[1, 2], [3, 4]], mi)
|
||||
res = df.sort_index(level='A', sort_remaining=False)
|
||||
assert_frame_equal(df, res)
|
||||
|
||||
res = df.sort_index(level=['A', 'B'], sort_remaining=False)
|
||||
assert_frame_equal(df, res)
|
||||
|
||||
def test_sort_index_categorical_index(self):
|
||||
|
||||
df = (DataFrame({'A': np.arange(6, dtype='int64'),
|
||||
'B': Series(list('aabbca'))
|
||||
.astype(CategoricalDtype(list('cab')))})
|
||||
.set_index('B'))
|
||||
|
||||
result = df.sort_index()
|
||||
expected = df.iloc[[4, 0, 1, 5, 2, 3]]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.sort_index(ascending=False)
|
||||
expected = df.iloc[[3, 2, 5, 1, 0, 4]]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_index(self):
|
||||
# GH13496
|
||||
|
||||
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
|
||||
columns=['A', 'B', 'C', 'D'])
|
||||
|
||||
# axis=0 : sort rows by index labels
|
||||
unordered = frame.loc[[3, 2, 4, 1]]
|
||||
result = unordered.sort_index(axis=0)
|
||||
expected = frame
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = unordered.sort_index(ascending=False)
|
||||
expected = frame[::-1]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# axis=1 : sort columns by column names
|
||||
unordered = frame.iloc[:, [2, 1, 3, 0]]
|
||||
result = unordered.sort_index(axis=1)
|
||||
assert_frame_equal(result, frame)
|
||||
|
||||
result = unordered.sort_index(axis=1, ascending=False)
|
||||
expected = frame.iloc[:, ::-1]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("level", ['A', 0]) # GH 21052
|
||||
def test_sort_index_multiindex(self, level):
|
||||
# GH13496
|
||||
|
||||
# sort rows by specified level of multi-index
|
||||
mi = MultiIndex.from_tuples([
|
||||
[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC'))
|
||||
df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)
|
||||
|
||||
expected_mi = MultiIndex.from_tuples([
|
||||
[1, 1, 1],
|
||||
[2, 1, 2],
|
||||
[2, 1, 3]], names=list('ABC'))
|
||||
expected = pd.DataFrame([
|
||||
[5, 6],
|
||||
[3, 4],
|
||||
[1, 2]], index=expected_mi)
|
||||
result = df.sort_index(level=level)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# sort_remaining=False
|
||||
expected_mi = MultiIndex.from_tuples([
|
||||
[1, 1, 1],
|
||||
[2, 1, 3],
|
||||
[2, 1, 2]], names=list('ABC'))
|
||||
expected = pd.DataFrame([
|
||||
[5, 6],
|
||||
[1, 2],
|
||||
[3, 4]], index=expected_mi)
|
||||
result = df.sort_index(level=level, sort_remaining=False)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_index_intervalindex(self):
|
||||
# this is a de-facto sort via unstack
|
||||
# confirming that we sort in the order of the bins
|
||||
y = Series(np.random.randn(100))
|
||||
x1 = Series(np.sign(np.random.randn(100)))
|
||||
x2 = pd.cut(Series(np.random.randn(100)),
|
||||
bins=[-3, -0.5, 0, 0.5, 3])
|
||||
model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])
|
||||
|
||||
result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
|
||||
expected = IntervalIndex.from_tuples(
|
||||
[(-3.0, -0.5), (-0.5, 0.0),
|
||||
(0.0, 0.5), (0.5, 3.0)],
|
||||
closed='right')
|
||||
result = result.columns.levels[1].categories
|
||||
tm.assert_index_equal(result, expected)
|
||||
|
||||
def test_sort_index_na_position_with_categories(self):
    """sort_values places NaN per na_position for ordered Categoricals (GH 22556)."""
    # GH 22556
    # Positioning missing value properly when column is Categorical.
    categories = ['A', 'B', 'C']
    category_indices = [0, 2, 4]       # row positions of the non-NaN values
    list_of_nans = [np.nan, np.nan]
    na_indices = [1, 3]                # row positions of the NaN values
    na_position_first = 'first'
    na_position_last = 'last'
    column_name = 'c'

    reversed_categories = sorted(categories, reverse=True)
    reversed_category_indices = sorted(category_indices, reverse=True)
    reversed_na_indices = sorted(na_indices, reverse=True)

    df = pd.DataFrame({
        column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
                                    categories=categories,
                                    ordered=True)})
    # sort ascending with na first
    result = df.sort_values(by=column_name,
                            ascending=True,
                            na_position=na_position_first)
    expected = DataFrame({
        column_name: Categorical(list_of_nans + categories,
                                 categories=categories,
                                 ordered=True)
    }, index=na_indices + category_indices)

    assert_frame_equal(result, expected)

    # sort ascending with na last
    result = df.sort_values(by=column_name,
                            ascending=True,
                            na_position=na_position_last)
    expected = DataFrame({
        column_name: Categorical(categories + list_of_nans,
                                 categories=categories,
                                 ordered=True)
    }, index=category_indices + na_indices)

    assert_frame_equal(result, expected)

    # sort descending with na first
    result = df.sort_values(by=column_name,
                            ascending=False,
                            na_position=na_position_first)
    expected = DataFrame({
        column_name: Categorical(list_of_nans + reversed_categories,
                                 categories=categories,
                                 ordered=True)
    }, index=reversed_na_indices + reversed_category_indices)

    assert_frame_equal(result, expected)

    # sort descending with na last
    result = df.sort_values(by=column_name,
                            ascending=False,
                            na_position=na_position_last)
    expected = DataFrame({
        column_name: Categorical(reversed_categories + list_of_nans,
                                 categories=categories,
                                 ordered=True)
    }, index=reversed_category_indices + reversed_na_indices)

    assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_index_na_position_with_categories_raises(self):
    """An unknown ``na_position`` must raise ValueError on a Categorical sort."""
    df = pd.DataFrame({
        'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
                            categories=['A', 'B', 'C'],
                            ordered=True)})

    # pin the message so an unrelated ValueError cannot make this test pass
    with pytest.raises(ValueError, match='na_position'):
        df.sort_values(by='c',
                       ascending=False,
                       na_position='bad_position')
|
||||
@@ -0,0 +1,573 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, MultiIndex, Panel, Series
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
class TestDataFrameSubclassing(TestData):
|
||||
|
||||
def test_frame_subclassing_and_slicing(self):
    """Slicing a DataFrame subclass returns the subclass types (PR 9632)."""
    # Subclass frame and ensure it returns the right class on slicing it
    # In reference to PR 9632

    class CustomSeries(Series):

        @property
        def _constructor(self):
            # pandas uses _constructor to build results of Series ops
            return CustomSeries

        def custom_series_function(self):
            return 'OK'

    class CustomDataFrame(DataFrame):
        """
        Subclasses pandas DF, fills DF with simulation results, adds some
        custom plotting functions.
        """

        def __init__(self, *args, **kw):
            super(CustomDataFrame, self).__init__(*args, **kw)

        @property
        def _constructor(self):
            return CustomDataFrame

        # class used when a frame operation reduces to one dimension
        _constructor_sliced = CustomSeries

        def custom_frame_function(self):
            return 'OK'

    data = {'col1': range(10),
            'col2': range(10)}
    cdf = CustomDataFrame(data)

    # Did we get back our own DF class?
    assert isinstance(cdf, CustomDataFrame)

    # Do we get back our own Series class after selecting a column?
    cdf_series = cdf.col1
    assert isinstance(cdf_series, CustomSeries)
    assert cdf_series.custom_series_function() == 'OK'

    # Do we get back our own DF class after slicing row-wise?
    cdf_rows = cdf[1:5]
    assert isinstance(cdf_rows, CustomDataFrame)
    assert cdf_rows.custom_frame_function() == 'OK'

    # Make sure sliced part of multi-index frame is custom class
    mcol = pd.MultiIndex.from_tuples([('A', 'A'), ('A', 'B')])
    cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
    assert isinstance(cdf_multi['A'], CustomDataFrame)

    # selecting a top level that leaves a single column yields the Series type
    mcol = pd.MultiIndex.from_tuples([('A', ''), ('B', '')])
    cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
    assert isinstance(cdf_multi2['A'], CustomSeries)
|
||||
|
||||
def test_dataframe_metadata(self):
    """Custom attributes survive slicing and pickle round-trips on subclasses."""
    df = tm.SubclassedDataFrame({'X': [1, 2, 3], 'Y': [1, 2, 3]},
                                index=['a', 'b', 'c'])
    df.testattr = 'XXX'

    assert df.testattr == 'XXX'
    assert df[['X']].testattr == 'XXX'
    assert df.loc[['a', 'b'], :].testattr == 'XXX'
    assert df.iloc[[0, 1], :].testattr == 'XXX'

    # see gh-9776
    assert df.iloc[0:1, :].testattr == 'XXX'

    # see gh-10553
    unpickled = tm.round_trip_pickle(df)
    tm.assert_frame_equal(df, unpickled)
    assert df._metadata == unpickled._metadata
    assert df.testattr == unpickled.testattr
|
||||
|
||||
def test_indexing_sliced(self):
    """loc/iloc slicing on a subclassed DataFrame yields subclassed Series (GH 11559)."""
    df = tm.SubclassedDataFrame({'X': [1, 2, 3],
                                 'Y': [4, 5, 6],
                                 'Z': [7, 8, 9]},
                                index=['a', 'b', 'c'])

    # (selector, expected values, expected index, expected name),
    # covering column selection by label/position and row selection by
    # label/position — the same checks as six hand-written stanzas
    cases = [
        (lambda d: d.loc[:, 'X'], [1, 2, 3], list('abc'), 'X'),
        (lambda d: d.iloc[:, 1], [4, 5, 6], list('abc'), 'Y'),
        (lambda d: d.loc[:, 'Z'], [7, 8, 9], list('abc'), 'Z'),
        (lambda d: d.loc['a', :], [1, 4, 7], list('XYZ'), 'a'),
        (lambda d: d.iloc[1, :], [2, 5, 8], list('XYZ'), 'b'),
        (lambda d: d.loc['c', :], [3, 6, 9], list('XYZ'), 'c'),
    ]

    for select, values, index, name in cases:
        res = select(df)
        exp = tm.SubclassedSeries(values, index=index, name=name)
        tm.assert_series_equal(res, exp)
        assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
def test_to_panel_expanddim(self):
    """to_panel honors _constructor_expanddim on a DataFrame subclass (GH 9762)."""
    # GH 9762

    class SubclassedFrame(DataFrame):

        @property
        def _constructor_expanddim(self):
            # class used when expanding a frame to a higher dimension
            return SubclassedPanel

    class SubclassedPanel(Panel):
        pass

    index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)])
    df = SubclassedFrame({'X': [1, 2, 3], 'Y': [4, 5, 6]}, index=index)
    result = df.to_panel()
    assert isinstance(result, SubclassedPanel)
    expected = SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]],
                               items=['X', 'Y'], major_axis=[0],
                               minor_axis=[0, 1, 2],
                               dtype='int64')
    tm.assert_panel_equal(result, expected)
|
||||
|
||||
def test_subclass_attr_err_propagation(self):
    """AttributeError raised inside a subclass property must surface (GH 11808)."""
    # GH 11808
    class A(DataFrame):

        @property
        def bar(self):
            # accesses a missing attribute on purpose
            return self.i_dont_exist
    with pytest.raises(AttributeError, match='.*i_dont_exist.*'):
        A().bar
|
||||
|
||||
def test_subclass_align(self):
    """align() on subclassed frames/series returns subclassed results (GH 12983)."""
    # GH 12983
    df1 = tm.SubclassedDataFrame({'a': [1, 3, 5],
                                  'b': [1, 3, 5]}, index=list('ACE'))
    df2 = tm.SubclassedDataFrame({'c': [1, 2, 4],
                                  'd': [1, 2, 4]}, index=list('ABD'))

    res1, res2 = df1.align(df2, axis=0)
    exp1 = tm.SubclassedDataFrame({'a': [1, np.nan, 3, np.nan, 5],
                                   'b': [1, np.nan, 3, np.nan, 5]},
                                  index=list('ABCDE'))
    exp2 = tm.SubclassedDataFrame({'c': [1, 2, np.nan, 4, np.nan],
                                   'd': [1, 2, np.nan, 4, np.nan]},
                                  index=list('ABCDE'))
    assert isinstance(res1, tm.SubclassedDataFrame)
    tm.assert_frame_equal(res1, exp1)
    assert isinstance(res2, tm.SubclassedDataFrame)
    tm.assert_frame_equal(res2, exp2)

    # aligning two subclassed Series keeps the Series subclass as well
    res1, res2 = df1.a.align(df2.c)
    assert isinstance(res1, tm.SubclassedSeries)
    tm.assert_series_equal(res1, exp1.a)
    assert isinstance(res2, tm.SubclassedSeries)
    tm.assert_series_equal(res2, exp2.c)
|
||||
|
||||
def test_subclass_align_combinations(self):
    """align() between a subclassed frame and series, in both orders (GH 12983)."""
    # GH 12983
    df = tm.SubclassedDataFrame({'a': [1, 3, 5],
                                 'b': [1, 3, 5]}, index=list('ACE'))
    s = tm.SubclassedSeries([1, 2, 4], index=list('ABD'), name='x')

    # frame + series
    res1, res2 = df.align(s, axis=0)
    exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5],
                         'b': [1, np.nan, 3, np.nan, 5]},
                        index=list('ABCDE'))
    # NOTE(review): the original comment here was truncated ("name is lost
    # when"); the expected series below does keep name='x' — confirm upstream
    exp2 = pd.Series([1, 2, np.nan, 4, np.nan],
                     index=list('ABCDE'), name='x')

    assert isinstance(res1, tm.SubclassedDataFrame)
    tm.assert_frame_equal(res1, exp1)
    assert isinstance(res2, tm.SubclassedSeries)
    tm.assert_series_equal(res2, exp2)

    # series + frame
    res1, res2 = s.align(df)
    assert isinstance(res1, tm.SubclassedSeries)
    tm.assert_series_equal(res1, exp2)
    assert isinstance(res2, tm.SubclassedDataFrame)
    tm.assert_frame_equal(res2, exp1)
|
||||
|
||||
def test_subclass_iterrows(self):
    """iterrows on a subclassed frame yields subclassed Series rows (GH 13977)."""
    # GH 13977
    df = tm.SubclassedDataFrame({'a': [1]})
    for i, row in df.iterrows():
        assert isinstance(row, tm.SubclassedSeries)
        tm.assert_series_equal(row, df.loc[i])
|
||||
|
||||
def test_subclass_sparse_slice(self):
    """Slicing a subclassed SparseDataFrame keeps subclass and attributes."""
    rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
    ssdf = tm.SubclassedSparseDataFrame(rows)
    ssdf.testattr = "testattr"

    # .loc[:2] is label-based and inclusive, hence three rows;
    # .iloc[:2] and [:2] are positional, hence two rows
    tm.assert_sp_frame_equal(ssdf.loc[:2],
                             tm.SubclassedSparseDataFrame(rows[:3]))
    tm.assert_sp_frame_equal(ssdf.iloc[:2],
                             tm.SubclassedSparseDataFrame(rows[:2]))
    tm.assert_sp_frame_equal(ssdf[:2],
                             tm.SubclassedSparseDataFrame(rows[:2]))
    assert ssdf.loc[:2].testattr == "testattr"
    assert ssdf.iloc[:2].testattr == "testattr"
    assert ssdf[:2].testattr == "testattr"

    # selecting a single row reduces to the subclassed sparse Series
    tm.assert_sp_series_equal(ssdf.loc[1],
                              tm.SubclassedSparseSeries(rows[1]),
                              check_names=False,
                              check_kind=False)
    tm.assert_sp_series_equal(ssdf.iloc[1],
                              tm.SubclassedSparseSeries(rows[1]),
                              check_names=False,
                              check_kind=False)
|
||||
|
||||
def test_subclass_sparse_transpose(self):
    """Transposing a subclassed SparseDataFrame keeps the subclass."""
    ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3],
                                          [4, 5, 6]])
    essdf = tm.SubclassedSparseDataFrame([[1, 4],
                                          [2, 5],
                                          [3, 6]])
    tm.assert_sp_frame_equal(ossdf.T, essdf)
|
||||
|
||||
def test_subclass_stack(self):
    """stack() on a subclassed frame returns a subclassed Series (GH 15564)."""
    # GH 15564
    df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                                index=['a', 'b', 'c'],
                                columns=['X', 'Y', 'Z'])

    res = df.stack()
    exp = tm.SubclassedSeries(
        [1, 2, 3, 4, 5, 6, 7, 8, 9],
        index=[list('aaabbbccc'), list('XYZXYZXYZ')])

    tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_subclass_stack_multi(self):
    """stack() of a MultiIndex-column subclassed frame, by default and by name (GH 15564)."""
    # GH 15564
    df = tm.SubclassedDataFrame([
        [10, 11, 12, 13],
        [20, 21, 22, 23],
        [30, 31, 32, 33],
        [40, 41, 42, 43]],
        index=MultiIndex.from_tuples(
            list(zip(list('AABB'), list('cdcd'))),
            names=['aaa', 'ccc']),
        columns=MultiIndex.from_tuples(
            list(zip(list('WWXX'), list('yzyz'))),
            names=['www', 'yyy']))

    # default stack() moves the innermost column level ('yyy') to the index
    exp = tm.SubclassedDataFrame([
        [10, 12],
        [11, 13],
        [20, 22],
        [21, 23],
        [30, 32],
        [31, 33],
        [40, 42],
        [41, 43]],
        index=MultiIndex.from_tuples(list(zip(
            list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
            names=['aaa', 'ccc', 'yyy']),
        columns=Index(['W', 'X'], name='www'))

    res = df.stack()
    tm.assert_frame_equal(res, exp)

    # naming the innermost level explicitly gives the same result
    res = df.stack('yyy')
    tm.assert_frame_equal(res, exp)

    # stacking the outer column level ('www') instead
    exp = tm.SubclassedDataFrame([
        [10, 11],
        [12, 13],
        [20, 21],
        [22, 23],
        [30, 31],
        [32, 33],
        [40, 41],
        [42, 43]],
        index=MultiIndex.from_tuples(list(zip(
            list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
            names=['aaa', 'ccc', 'www']),
        columns=Index(['y', 'z'], name='yyy'))

    res = df.stack('www')
    tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_stack_multi_mixed(self):
    """stack() on a subclassed frame with mixed int/float columns (GH 15564)."""
    # GH 15564
    df = tm.SubclassedDataFrame([
        [10, 11, 12.0, 13.0],
        [20, 21, 22.0, 23.0],
        [30, 31, 32.0, 33.0],
        [40, 41, 42.0, 43.0]],
        index=MultiIndex.from_tuples(
            list(zip(list('AABB'), list('cdcd'))),
            names=['aaa', 'ccc']),
        columns=MultiIndex.from_tuples(
            list(zip(list('WWXX'), list('yzyz'))),
            names=['www', 'yyy']))

    # stacking 'yyy' keeps the int ('W') and float ('X') columns separate,
    # so dtypes are preserved per column
    exp = tm.SubclassedDataFrame([
        [10, 12.0],
        [11, 13.0],
        [20, 22.0],
        [21, 23.0],
        [30, 32.0],
        [31, 33.0],
        [40, 42.0],
        [41, 43.0]],
        index=MultiIndex.from_tuples(list(zip(
            list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
            names=['aaa', 'ccc', 'yyy']),
        columns=Index(['W', 'X'], name='www'))

    res = df.stack()
    tm.assert_frame_equal(res, exp)

    res = df.stack('yyy')
    tm.assert_frame_equal(res, exp)

    # stacking 'www' mixes int and float values into each column,
    # so everything is upcast to float
    exp = tm.SubclassedDataFrame([
        [10.0, 11.0],
        [12.0, 13.0],
        [20.0, 21.0],
        [22.0, 23.0],
        [30.0, 31.0],
        [32.0, 33.0],
        [40.0, 41.0],
        [42.0, 43.0]],
        index=MultiIndex.from_tuples(list(zip(
            list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
            names=['aaa', 'ccc', 'www']),
        columns=Index(['y', 'z'], name='yyy'))

    res = df.stack('www')
    tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_unstack(self):
    """unstack() on a subclassed frame returns a subclassed Series (GH 15564)."""
    # GH 15564
    df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                                index=['a', 'b', 'c'],
                                columns=['X', 'Y', 'Z'])

    res = df.unstack()
    exp = tm.SubclassedSeries(
        [1, 4, 7, 2, 5, 8, 3, 6, 9],
        index=[list('XXXYYYZZZ'), list('abcabcabc')])

    tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_subclass_unstack_multi(self):
    """unstack() of a MultiIndex-row subclassed frame, by default and by name (GH 15564)."""
    # GH 15564
    df = tm.SubclassedDataFrame([
        [10, 11, 12, 13],
        [20, 21, 22, 23],
        [30, 31, 32, 33],
        [40, 41, 42, 43]],
        index=MultiIndex.from_tuples(
            list(zip(list('AABB'), list('cdcd'))),
            names=['aaa', 'ccc']),
        columns=MultiIndex.from_tuples(
            list(zip(list('WWXX'), list('yzyz'))),
            names=['www', 'yyy']))

    # default unstack() moves the innermost index level ('ccc') to columns
    exp = tm.SubclassedDataFrame([
        [10, 20, 11, 21, 12, 22, 13, 23],
        [30, 40, 31, 41, 32, 42, 33, 43]],
        index=Index(['A', 'B'], name='aaa'),
        columns=MultiIndex.from_tuples(list(zip(
            list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))),
            names=['www', 'yyy', 'ccc']))

    res = df.unstack()
    tm.assert_frame_equal(res, exp)

    res = df.unstack('ccc')
    tm.assert_frame_equal(res, exp)

    # unstacking the outer index level ('aaa') instead
    exp = tm.SubclassedDataFrame([
        [10, 30, 11, 31, 12, 32, 13, 33],
        [20, 40, 21, 41, 22, 42, 23, 43]],
        index=Index(['c', 'd'], name='ccc'),
        columns=MultiIndex.from_tuples(list(zip(
            list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))),
            names=['www', 'yyy', 'aaa']))

    res = df.unstack('aaa')
    tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_unstack_multi_mixed(self):
    """unstack() on a subclassed frame with mixed int/float columns (GH 15564)."""
    # GH 15564
    df = tm.SubclassedDataFrame([
        [10, 11, 12.0, 13.0],
        [20, 21, 22.0, 23.0],
        [30, 31, 32.0, 33.0],
        [40, 41, 42.0, 43.0]],
        index=MultiIndex.from_tuples(
            list(zip(list('AABB'), list('cdcd'))),
            names=['aaa', 'ccc']),
        columns=MultiIndex.from_tuples(
            list(zip(list('WWXX'), list('yzyz'))),
            names=['www', 'yyy']))

    # per-column dtypes (int under 'W', float under 'X') are preserved
    exp = tm.SubclassedDataFrame([
        [10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0],
        [30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0]],
        index=Index(['A', 'B'], name='aaa'),
        columns=MultiIndex.from_tuples(list(zip(
            list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))),
            names=['www', 'yyy', 'ccc']))

    res = df.unstack()
    tm.assert_frame_equal(res, exp)

    res = df.unstack('ccc')
    tm.assert_frame_equal(res, exp)

    # unstacking the outer index level ('aaa') instead
    exp = tm.SubclassedDataFrame([
        [10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0],
        [20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0]],
        index=Index(['c', 'd'], name='ccc'),
        columns=MultiIndex.from_tuples(list(zip(
            list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))),
            names=['www', 'yyy', 'aaa']))

    res = df.unstack('aaa')
    tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_pivot(self):
    """pivot() on a subclassed frame returns the subclass (GH 15564)."""
    # GH 15564
    df = tm.SubclassedDataFrame({
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.]})

    pivoted = df.pivot(
        index='index', columns='columns', values='values')

    expected = tm.SubclassedDataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.}})

    # pivot carries the source column labels over as axis names
    expected.index.name, expected.columns.name = 'index', 'columns'

    tm.assert_frame_equal(pivoted, expected)
|
||||
|
||||
def test_subclassed_melt(self):
    """pd.melt on a subclassed frame returns the subclass (GH 15564)."""
    # GH 15564
    cheese = tm.SubclassedDataFrame({
        'first': ['John', 'Mary'],
        'last': ['Doe', 'Bo'],
        'height': [5.5, 6.0],
        'weight': [130, 150]})

    melted = pd.melt(cheese, id_vars=['first', 'last'])

    expected = tm.SubclassedDataFrame([
        ['John', 'Doe', 'height', 5.5],
        ['Mary', 'Bo', 'height', 6.0],
        ['John', 'Doe', 'weight', 130],
        ['Mary', 'Bo', 'weight', 150]],
        columns=['first', 'last', 'variable', 'value'])

    tm.assert_frame_equal(melted, expected)
|
||||
|
||||
def test_subclassed_wide_to_long(self):
    """pd.wide_to_long on a subclassed frame returns the subclass (GH 9762)."""
    # GH 9762

    np.random.seed(123)
    x = np.random.randn(3)
    df = tm.SubclassedDataFrame({
        "A1970": {0: "a", 1: "b", 2: "c"},
        "A1980": {0: "d", 1: "e", 2: "f"},
        "B1970": {0: 2.5, 1: 1.2, 2: .7},
        "B1980": {0: 3.2, 1: 1.3, 2: .1},
        "X": dict(zip(range(3), x))})

    df["id"] = df.index
    # stubs A/B are split on the 'year' suffix; X and id are carried through
    exp_data = {"X": x.tolist() + x.tolist(),
                "A": ['a', 'b', 'c', 'd', 'e', 'f'],
                "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
                "year": [1970, 1970, 1970, 1980, 1980, 1980],
                "id": [0, 1, 2, 0, 1, 2]}
    expected = tm.SubclassedDataFrame(exp_data)
    expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
    long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year")

    tm.assert_frame_equal(long_frame, expected)
|
||||
|
||||
def test_subclassed_apply(self):
    """apply() on a subclassed frame passes and returns subclassed objects (GH 19822)."""
    # GH 19822

    def check_row_subclass(row):
        # rows handed to apply must already be the subclassed Series
        assert isinstance(row, tm.SubclassedSeries)

    def stretch(row):  # renamed from 'strech' (typo); local helper only
        # increase 'height' values by 0.5, leave other variables alone
        if row["variable"] == "height":
            row["value"] += 0.5
        return row

    df = tm.SubclassedDataFrame([
        ['John', 'Doe', 'height', 5.5],
        ['Mary', 'Bo', 'height', 6.0],
        ['John', 'Doe', 'weight', 130],
        ['Mary', 'Bo', 'weight', 150]],
        columns=['first', 'last', 'variable', 'value'])

    df.apply(lambda x: check_row_subclass(x))
    df.apply(lambda x: check_row_subclass(x), axis=1)

    expected = tm.SubclassedDataFrame([
        ['John', 'Doe', 'height', 6.0],
        ['Mary', 'Bo', 'height', 6.5],
        ['John', 'Doe', 'weight', 130],
        ['Mary', 'Bo', 'weight', 150]],
        columns=['first', 'last', 'variable', 'value'])

    result = df.apply(lambda x: stretch(x), axis=1)
    assert isinstance(result, tm.SubclassedDataFrame)
    tm.assert_frame_equal(result, expected)

    # a Series returned from the function expands into a subclassed frame
    expected = tm.SubclassedDataFrame([
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3]])

    result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1)
    assert isinstance(result, tm.SubclassedDataFrame)
    tm.assert_frame_equal(result, expected)

    # result_type="expand" turns list results into frame columns
    result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
    assert isinstance(result, tm.SubclassedDataFrame)
    tm.assert_frame_equal(result, expected)

    # without expansion, list results stay a Series of lists
    expected = tm.SubclassedSeries([
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3]])

    result = df.apply(lambda x: [1, 2, 3], axis=1)
    assert not isinstance(result, tm.SubclassedDataFrame)
    tm.assert_series_equal(result, expected)
|
||||
@@ -0,0 +1,899 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import datetime, time
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import product
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame, DatetimeIndex, Index, MultiIndex, Series, Timestamp, date_range,
|
||||
period_range, to_datetime)
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (
|
||||
assert_frame_equal, assert_index_equal, assert_series_equal)
|
||||
|
||||
import pandas.tseries.offsets as offsets
|
||||
|
||||
|
||||
@pytest.fixture(params=product([True, False], [True, False]))
def close_open_fixture(request):
    """Parametrize over every (bool, bool) pair."""
    return request.param
|
||||
|
||||
|
||||
class TestDataFrameTimeSeriesMethods(TestData):
|
||||
|
||||
def test_diff(self):
    """DataFrame.diff basics: float frame, large ints, float32, and axis=1."""
    the_diff = self.tsframe.diff(1)

    assert_series_equal(the_diff['A'],
                        self.tsframe['A'] - self.tsframe['A'].shift(1))

    # int dtype
    # values beyond float53 precision must still diff exactly as ints
    a = 10000000000000000
    b = a + 1
    s = Series([a, b])

    rs = DataFrame({'s': s}).diff()
    assert rs.s[1] == 1

    # mixed numeric
    tf = self.tsframe.astype('float32')
    the_diff = tf.diff(1)
    assert_series_equal(the_diff['A'],
                        tf['A'] - tf['A'].shift(1))

    # issue 10907
    df = pd.DataFrame({'y': pd.Series([2]), 'z': pd.Series([3])})
    df.insert(0, 'x', 1)
    result = df.diff(axis=1)
    expected = pd.DataFrame({'x': np.nan, 'y': pd.Series(
        1), 'z': pd.Series(1)}).astype('float64')
    assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('tz', [None, 'UTC'])
def test_diff_datetime_axis0(self, tz):
    """Row-wise diff of datetime columns yields timedeltas (GH 18578)."""
    # GH 18578
    df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
                    1: date_range('2010', freq='D', periods=2, tz=tz)})

    result = df.diff(axis=0)
    expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']),
                          1: pd.TimedeltaIndex(['NaT', '1 days'])})
    assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('tz', [None, 'UTC'])
def test_diff_datetime_axis1(self, tz):
    """Column-wise diff of datetimes: works tz-naive, raises tz-aware (GH 18578)."""
    # GH 18578
    df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
                    1: date_range('2010', freq='D', periods=2, tz=tz)})
    if tz is None:
        result = df.diff(axis=1)
        expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']),
                              1: pd.TimedeltaIndex(['0 days',
                                                    '0 days'])})
        assert_frame_equal(result, expected)
    else:
        # axis=1 diff of tz-aware datetimes is not implemented
        with pytest.raises(NotImplementedError):
            result = df.diff(axis=1)
|
||||
|
||||
def test_diff_timedelta(self):
|
||||
# GH 4533
|
||||
df = DataFrame(dict(time=[Timestamp('20130101 9:01'),
|
||||
Timestamp('20130101 9:02')],
|
||||
value=[1.0, 2.0]))
|
||||
|
||||
res = df.diff()
|
||||
exp = DataFrame([[pd.NaT, np.nan],
|
||||
[pd.Timedelta('00:01:00'), 1]],
|
||||
columns=['time', 'value'])
|
||||
assert_frame_equal(res, exp)
|
||||
|
||||
def test_diff_mixed_dtype(self):
|
||||
df = DataFrame(np.random.randn(5, 3))
|
||||
df['A'] = np.array([1, 2, 3, 4, 5], dtype=object)
|
||||
|
||||
result = df.diff()
|
||||
assert result[0].dtype == np.float64
|
||||
|
||||
def test_diff_neg_n(self):
    """A negative period diffs each row against the following row."""
    rs = self.tsframe.diff(-1)
    xp = self.tsframe - self.tsframe.shift(-1)
    assert_frame_equal(rs, xp)
|
||||
|
||||
def test_diff_float_n(self):
    """An integral float period behaves like the equivalent int period."""
    rs = self.tsframe.diff(1.)
    xp = self.tsframe.diff(1)
    assert_frame_equal(rs, xp)
|
||||
|
||||
def test_diff_axis(self):
|
||||
# GH 9727
|
||||
df = DataFrame([[1., 2.], [3., 4.]])
|
||||
assert_frame_equal(df.diff(axis=1), DataFrame(
|
||||
[[np.nan, 1.], [np.nan, 1.]]))
|
||||
assert_frame_equal(df.diff(axis=0), DataFrame(
|
||||
[[np.nan, np.nan], [2., 2.]]))
|
||||
|
||||
def test_pct_change(self):
    """pct_change variants: fill_method, periods, limit, and freq."""
    rs = self.tsframe.pct_change(fill_method=None)
    assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1)

    # periods=2 with the default pad fill
    rs = self.tsframe.pct_change(2)
    filled = self.tsframe.fillna(method='pad')
    assert_frame_equal(rs, filled / filled.shift(2) - 1)

    rs = self.tsframe.pct_change(fill_method='bfill', limit=1)
    filled = self.tsframe.fillna(method='bfill', limit=1)
    assert_frame_equal(rs, filled / filled.shift(1) - 1)

    # freq-based change compares against the value 5 calendar days earlier
    rs = self.tsframe.pct_change(freq='5D')
    filled = self.tsframe.fillna(method='pad')
    assert_frame_equal(rs,
                       (filled / filled.shift(freq='5D') - 1)
                       .reindex_like(filled))
|
||||
|
||||
def test_pct_change_shift_over_nas(self):
    """pct_change forward-fills NaNs before computing changes."""
    s = Series([1., 1.5, np.nan, 2.5, 3.])

    df = DataFrame({'a': s, 'b': s})

    chg = df.pct_change()
    # the NaN is padded with 1.5, so its pct change is 0. and the next
    # value is measured against 1.5
    expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2])
    edf = DataFrame({'a': expected, 'b': expected})
    assert_frame_equal(chg, edf)
|
||||
|
||||
@pytest.mark.parametrize("freq, periods, fill_method, limit",
                         [('5B', 5, None, None),
                          ('3B', 3, None, None),
                          ('3B', 3, 'bfill', None),
                          ('7B', 7, 'pad', 1),
                          ('7B', 7, 'bfill', 3),
                          ('14B', 14, None, None)])
def test_pct_change_periods_freq(self, freq, periods, fill_method, limit):
    """freq-based and equivalent periods-based pct_change must agree (GH 7292)."""
    # GH 7292
    # a business-day freq of 'NB' equals periods=N on the business-day frame
    rs_freq = self.tsframe.pct_change(freq=freq,
                                      fill_method=fill_method,
                                      limit=limit)
    rs_periods = self.tsframe.pct_change(periods,
                                         fill_method=fill_method,
                                         limit=limit)
    assert_frame_equal(rs_freq, rs_periods)

    # same equivalence on an all-NaN frame with the same index/columns
    empty_ts = DataFrame(index=self.tsframe.index,
                         columns=self.tsframe.columns)
    rs_freq = empty_ts.pct_change(freq=freq,
                                  fill_method=fill_method,
                                  limit=limit)
    rs_periods = empty_ts.pct_change(periods,
                                     fill_method=fill_method,
                                     limit=limit)
    assert_frame_equal(rs_freq, rs_periods)
|
||||
|
||||
def test_frame_ctor_datetime64_column(self):
|
||||
rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
|
||||
dates = np.asarray(rng)
|
||||
|
||||
df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates})
|
||||
assert np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))
|
||||
|
||||
def test_frame_append_datetime64_column(self):
|
||||
rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
|
||||
df = DataFrame(index=np.arange(len(rng)))
|
||||
|
||||
df['A'] = rng
|
||||
assert np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))
|
||||
|
||||
def test_frame_datetime64_pre1900_repr(self):
    """repr of a frame holding pre-1900 datetimes must not raise."""
    # NOTE(review): 'A-DEC' is the legacy year-end frequency alias — confirm
    # it is still accepted by the pandas version in use
    df = DataFrame({'year': date_range('1/1/1700', periods=50,
                                       freq='A-DEC')})
    # it works!
    repr(df)
|
||||
|
||||
def test_frame_append_datetime64_col_other_units(self):
    """Non-ns datetime64 columns are converted to M8[ns] on assignment."""
    n = 100

    units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y']

    ns_dtype = np.dtype('M8[ns]')

    for unit in units:
        # reinterpret 0..n-1 as datetime64 values of the given unit
        dtype = np.dtype('M8[%s]' % unit)
        vals = np.arange(n, dtype=np.int64).view(dtype)

        df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
        df[unit] = vals

        ex_vals = to_datetime(vals.astype('O')).values

        assert df[unit].dtype == ns_dtype
        assert (df[unit].values == ex_vals).all()

    # Test insertion into existing datetime64 column
    df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
    df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype)

    for unit in units:
        dtype = np.dtype('M8[%s]' % unit)
        vals = np.arange(n, dtype=np.int64).view(dtype)

        tmp = df.copy()

        # overwriting an existing M8[ns] column must also convert units
        tmp['dates'] = vals
        ex_vals = to_datetime(vals.astype('O')).values

        assert (tmp['dates'].values == ex_vals).all()
|
||||
|
||||
def test_shift(self):
    """DataFrame.shift: naive, zero, by offset/freq, PeriodIndex, and axis=1."""
    # naive shift
    shiftedFrame = self.tsframe.shift(5)
    tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)

    shiftedSeries = self.tsframe['A'].shift(5)
    assert_series_equal(shiftedFrame['A'], shiftedSeries)

    shiftedFrame = self.tsframe.shift(-5)
    tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)

    shiftedSeries = self.tsframe['A'].shift(-5)
    assert_series_equal(shiftedFrame['A'], shiftedSeries)

    # shift by 0 is a no-op
    unshifted = self.tsframe.shift(0)
    assert_frame_equal(unshifted, self.tsframe)

    # shift by DateOffset: moves the index, keeps the length
    shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay())
    assert len(shiftedFrame) == len(self.tsframe)

    shiftedFrame2 = self.tsframe.shift(5, freq='B')
    assert_frame_equal(shiftedFrame, shiftedFrame2)

    # row formerly at d is now found 5 business days later
    d = self.tsframe.index[0]
    shifted_d = d + offsets.BDay(5)
    assert_series_equal(self.tsframe.xs(d),
                        shiftedFrame.xs(shifted_d), check_names=False)

    # shift int frame (smoke test only)
    int_shifted = self.intframe.shift(1)  # noqa

    # Shifting with PeriodIndex
    ps = tm.makePeriodFrame()
    shifted = ps.shift(1)
    unshifted = shifted.shift(-1)
    tm.assert_index_equal(shifted.index, ps.index)
    tm.assert_index_equal(unshifted.index, ps.index)
    tm.assert_numpy_array_equal(unshifted.iloc[:, 0].dropna().values,
                                ps.iloc[:-1, 0].values)

    shifted2 = ps.shift(1, 'B')
    shifted3 = ps.shift(1, offsets.BDay())
    assert_frame_equal(shifted2, shifted3)
    assert_frame_equal(ps, shifted2.shift(-1, 'B'))

    # a freq incompatible with the PeriodIndex freq must raise
    msg = 'does not match PeriodIndex freq'
    with pytest.raises(ValueError, match=msg):
        ps.shift(freq='D')

    # shift other axis
    # GH 6371
    df = DataFrame(np.random.rand(10, 5))
    expected = pd.concat([DataFrame(np.nan, index=df.index,
                                    columns=[0]),
                          df.iloc[:, 0:-1]],
                         ignore_index=True, axis=1)
    result = df.shift(1, axis=1)
    assert_frame_equal(result, expected)

    # shift named axis
    df = DataFrame(np.random.rand(10, 5))
    expected = pd.concat([DataFrame(np.nan, index=df.index,
                                    columns=[0]),
                          df.iloc[:, 0:-1]],
                         ignore_index=True, axis=1)
    result = df.shift(1, axis='columns')
    assert_frame_equal(result, expected)
|
||||
|
||||
def test_shift_bool(self):
|
||||
df = DataFrame({'high': [True, False],
|
||||
'low': [False, False]})
|
||||
rs = df.shift(1)
|
||||
xp = DataFrame(np.array([[np.nan, np.nan],
|
||||
[True, False]], dtype=object),
|
||||
columns=['high', 'low'])
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_shift_categorical(self):
|
||||
# GH 9416
|
||||
s1 = pd.Series(['a', 'b', 'c'], dtype='category')
|
||||
s2 = pd.Series(['A', 'B', 'C'], dtype='category')
|
||||
df = DataFrame({'one': s1, 'two': s2})
|
||||
rs = df.shift(1)
|
||||
xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)})
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_shift_fill_value(self):
|
||||
# GH #24128
|
||||
df = DataFrame([1, 2, 3, 4, 5],
|
||||
index=date_range('1/1/2000', periods=5, freq='H'))
|
||||
exp = DataFrame([0, 1, 2, 3, 4],
|
||||
index=date_range('1/1/2000', periods=5, freq='H'))
|
||||
result = df.shift(1, fill_value=0)
|
||||
assert_frame_equal(result, exp)
|
||||
|
||||
exp = DataFrame([0, 0, 1, 2, 3],
|
||||
index=date_range('1/1/2000', periods=5, freq='H'))
|
||||
result = df.shift(2, fill_value=0)
|
||||
assert_frame_equal(result, exp)
|
||||
|
||||
def test_shift_empty(self):
|
||||
# Regression test for #8019
|
||||
df = DataFrame({'foo': []})
|
||||
rs = df.shift(-1)
|
||||
|
||||
assert_frame_equal(df, rs)
|
||||
|
||||
def test_shift_duplicate_columns(self):
|
||||
# GH 9092; verify that position-based shifting works
|
||||
# in the presence of duplicate columns
|
||||
column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]]
|
||||
data = np.random.randn(20, 5)
|
||||
|
||||
shifted = []
|
||||
for columns in column_lists:
|
||||
df = pd.DataFrame(data.copy(), columns=columns)
|
||||
for s in range(5):
|
||||
df.iloc[:, s] = df.iloc[:, s].shift(s + 1)
|
||||
df.columns = range(5)
|
||||
shifted.append(df)
|
||||
|
||||
# sanity check the base case
|
||||
nulls = shifted[0].isna().sum()
|
||||
assert_series_equal(nulls, Series(range(1, 6), dtype='int64'))
|
||||
|
||||
# check all answers are the same
|
||||
assert_frame_equal(shifted[0], shifted[1])
|
||||
assert_frame_equal(shifted[0], shifted[2])
|
||||
|
||||
def test_tshift(self):
|
||||
# PeriodIndex
|
||||
ps = tm.makePeriodFrame()
|
||||
shifted = ps.tshift(1)
|
||||
unshifted = shifted.tshift(-1)
|
||||
|
||||
assert_frame_equal(unshifted, ps)
|
||||
|
||||
shifted2 = ps.tshift(freq='B')
|
||||
assert_frame_equal(shifted, shifted2)
|
||||
|
||||
shifted3 = ps.tshift(freq=offsets.BDay())
|
||||
assert_frame_equal(shifted, shifted3)
|
||||
|
||||
with pytest.raises(ValueError, match='does not match'):
|
||||
ps.tshift(freq='M')
|
||||
|
||||
# DatetimeIndex
|
||||
shifted = self.tsframe.tshift(1)
|
||||
unshifted = shifted.tshift(-1)
|
||||
|
||||
assert_frame_equal(self.tsframe, unshifted)
|
||||
|
||||
shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq)
|
||||
assert_frame_equal(shifted, shifted2)
|
||||
|
||||
inferred_ts = DataFrame(self.tsframe.values,
|
||||
Index(np.asarray(self.tsframe.index)),
|
||||
columns=self.tsframe.columns)
|
||||
shifted = inferred_ts.tshift(1)
|
||||
unshifted = shifted.tshift(-1)
|
||||
assert_frame_equal(shifted, self.tsframe.tshift(1))
|
||||
assert_frame_equal(unshifted, inferred_ts)
|
||||
|
||||
no_freq = self.tsframe.iloc[[0, 5, 7], :]
|
||||
pytest.raises(ValueError, no_freq.tshift)
|
||||
|
||||
def test_truncate(self):
|
||||
ts = self.tsframe[::3]
|
||||
|
||||
start, end = self.tsframe.index[3], self.tsframe.index[6]
|
||||
|
||||
start_missing = self.tsframe.index[2]
|
||||
end_missing = self.tsframe.index[7]
|
||||
|
||||
# neither specified
|
||||
truncated = ts.truncate()
|
||||
assert_frame_equal(truncated, ts)
|
||||
|
||||
# both specified
|
||||
expected = ts[1:3]
|
||||
|
||||
truncated = ts.truncate(start, end)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(start_missing, end_missing)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
# start specified
|
||||
expected = ts[1:]
|
||||
|
||||
truncated = ts.truncate(before=start)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(before=start_missing)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
# end specified
|
||||
expected = ts[:3]
|
||||
|
||||
truncated = ts.truncate(after=end)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(after=end_missing)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
pytest.raises(ValueError, ts.truncate,
|
||||
before=ts.index[-1] - ts.index.freq,
|
||||
after=ts.index[0] + ts.index.freq)
|
||||
|
||||
def test_truncate_copy(self):
|
||||
index = self.tsframe.index
|
||||
truncated = self.tsframe.truncate(index[5], index[10])
|
||||
truncated.values[:] = 5.
|
||||
assert not (self.tsframe.values[5:11] == 5).any()
|
||||
|
||||
def test_truncate_nonsortedindex(self):
|
||||
# GH 17935
|
||||
|
||||
df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e']},
|
||||
index=[5, 3, 2, 9, 0])
|
||||
msg = 'truncate requires a sorted index'
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.truncate(before=3, after=9)
|
||||
|
||||
rng = pd.date_range('2011-01-01', '2012-01-01', freq='W')
|
||||
ts = pd.DataFrame({'A': np.random.randn(len(rng)),
|
||||
'B': np.random.randn(len(rng))},
|
||||
index=rng)
|
||||
msg = 'truncate requires a sorted index'
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ts.sort_values('A', ascending=False).truncate(before='2011-11',
|
||||
after='2011-12')
|
||||
|
||||
df = pd.DataFrame({3: np.random.randn(5),
|
||||
20: np.random.randn(5),
|
||||
2: np.random.randn(5),
|
||||
0: np.random.randn(5)},
|
||||
columns=[3, 20, 2, 0])
|
||||
msg = 'truncate requires a sorted index'
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.truncate(before=2, after=20, axis=1)
|
||||
|
||||
def test_asfreq(self):
|
||||
offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd())
|
||||
rule_monthly = self.tsframe.asfreq('BM')
|
||||
|
||||
tm.assert_almost_equal(offset_monthly['A'], rule_monthly['A'])
|
||||
|
||||
filled = rule_monthly.asfreq('B', method='pad') # noqa
|
||||
# TODO: actually check that this worked.
|
||||
|
||||
# don't forget!
|
||||
filled_dep = rule_monthly.asfreq('B', method='pad') # noqa
|
||||
|
||||
# test does not blow up on length-0 DataFrame
|
||||
zero_length = self.tsframe.reindex([])
|
||||
result = zero_length.asfreq('BM')
|
||||
assert result is not zero_length
|
||||
|
||||
def test_asfreq_datetimeindex(self):
|
||||
df = DataFrame({'A': [1, 2, 3]},
|
||||
index=[datetime(2011, 11, 1), datetime(2011, 11, 2),
|
||||
datetime(2011, 11, 3)])
|
||||
df = df.asfreq('B')
|
||||
assert isinstance(df.index, DatetimeIndex)
|
||||
|
||||
ts = df['A'].asfreq('B')
|
||||
assert isinstance(ts.index, DatetimeIndex)
|
||||
|
||||
def test_asfreq_fillvalue(self):
|
||||
# test for fill value during upsampling, related to issue 3715
|
||||
|
||||
# setup
|
||||
rng = pd.date_range('1/1/2016', periods=10, freq='2S')
|
||||
ts = pd.Series(np.arange(len(rng)), index=rng)
|
||||
df = pd.DataFrame({'one': ts})
|
||||
|
||||
# insert pre-existing missing value
|
||||
df.loc['2016-01-01 00:00:08', 'one'] = None
|
||||
|
||||
actual_df = df.asfreq(freq='1S', fill_value=9.0)
|
||||
expected_df = df.asfreq(freq='1S').fillna(9.0)
|
||||
expected_df.loc['2016-01-01 00:00:08', 'one'] = None
|
||||
assert_frame_equal(expected_df, actual_df)
|
||||
|
||||
expected_series = ts.asfreq(freq='1S').fillna(9.0)
|
||||
actual_series = ts.asfreq(freq='1S', fill_value=9.0)
|
||||
assert_series_equal(expected_series, actual_series)
|
||||
|
||||
@pytest.mark.parametrize("data,idx,expected_first,expected_last", [
|
||||
({'A': [1, 2, 3]}, [1, 1, 2], 1, 2),
|
||||
({'A': [1, 2, 3]}, [1, 2, 2], 1, 2),
|
||||
({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'),
|
||||
({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2),
|
||||
({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2),
|
||||
({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)])
|
||||
def test_first_last_valid(self, data, idx,
|
||||
expected_first, expected_last):
|
||||
N = len(self.frame.index)
|
||||
mat = np.random.randn(N)
|
||||
mat[:5] = np.nan
|
||||
mat[-5:] = np.nan
|
||||
|
||||
frame = DataFrame({'foo': mat}, index=self.frame.index)
|
||||
index = frame.first_valid_index()
|
||||
|
||||
assert index == frame.index[5]
|
||||
|
||||
index = frame.last_valid_index()
|
||||
assert index == frame.index[-6]
|
||||
|
||||
# GH12800
|
||||
empty = DataFrame()
|
||||
assert empty.last_valid_index() is None
|
||||
assert empty.first_valid_index() is None
|
||||
|
||||
# GH17400: no valid entries
|
||||
frame[:] = np.nan
|
||||
assert frame.last_valid_index() is None
|
||||
assert frame.first_valid_index() is None
|
||||
|
||||
# GH20499: its preserves freq with holes
|
||||
frame.index = date_range("20110101", periods=N, freq="B")
|
||||
frame.iloc[1] = 1
|
||||
frame.iloc[-2] = 1
|
||||
assert frame.first_valid_index() == frame.index[1]
|
||||
assert frame.last_valid_index() == frame.index[-2]
|
||||
assert frame.first_valid_index().freq == frame.index.freq
|
||||
assert frame.last_valid_index().freq == frame.index.freq
|
||||
|
||||
# GH 21441
|
||||
df = DataFrame(data, index=idx)
|
||||
assert expected_first == df.first_valid_index()
|
||||
assert expected_last == df.last_valid_index()
|
||||
|
||||
def test_first_subset(self):
|
||||
ts = tm.makeTimeDataFrame(freq='12h')
|
||||
result = ts.first('10d')
|
||||
assert len(result) == 20
|
||||
|
||||
ts = tm.makeTimeDataFrame(freq='D')
|
||||
result = ts.first('10d')
|
||||
assert len(result) == 10
|
||||
|
||||
result = ts.first('3M')
|
||||
expected = ts[:'3/31/2000']
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts.first('21D')
|
||||
expected = ts[:21]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts[:0].first('3M')
|
||||
assert_frame_equal(result, ts[:0])
|
||||
|
||||
def test_first_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.first('1D')
|
||||
|
||||
def test_last_subset(self):
|
||||
ts = tm.makeTimeDataFrame(freq='12h')
|
||||
result = ts.last('10d')
|
||||
assert len(result) == 20
|
||||
|
||||
ts = tm.makeTimeDataFrame(nper=30, freq='D')
|
||||
result = ts.last('10d')
|
||||
assert len(result) == 10
|
||||
|
||||
result = ts.last('21D')
|
||||
expected = ts['2000-01-10':]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts.last('21D')
|
||||
expected = ts[-21:]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts[:0].last('3M')
|
||||
assert_frame_equal(result, ts[:0])
|
||||
|
||||
def test_last_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.last('1D')
|
||||
|
||||
def test_at_time(self):
|
||||
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
|
||||
rs = ts.at_time(rng[1])
|
||||
assert (rs.index.hour == rng[1].hour).all()
|
||||
assert (rs.index.minute == rng[1].minute).all()
|
||||
assert (rs.index.second == rng[1].second).all()
|
||||
|
||||
result = ts.at_time('9:30')
|
||||
expected = ts.at_time(time(9, 30))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts.loc[time(9, 30)]
|
||||
expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)]
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# midnight, everything
|
||||
rng = date_range('1/1/2000', '1/31/2000')
|
||||
ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
|
||||
|
||||
result = ts.at_time(time(0, 0))
|
||||
assert_frame_equal(result, ts)
|
||||
|
||||
# time doesn't exist
|
||||
rng = date_range('1/1/2012', freq='23Min', periods=384)
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), rng)
|
||||
rs = ts.at_time('16:00')
|
||||
assert len(rs) == 0
|
||||
|
||||
def test_at_time_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.at_time('00:00')
|
||||
|
||||
@pytest.mark.parametrize('axis', ['index', 'columns', 0, 1])
|
||||
def test_at_time_axis(self, axis):
|
||||
# issue 8839
|
||||
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
|
||||
ts = DataFrame(np.random.randn(len(rng), len(rng)))
|
||||
ts.index, ts.columns = rng, rng
|
||||
|
||||
indices = rng[(rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)]
|
||||
|
||||
if axis in ['index', 0]:
|
||||
expected = ts.loc[indices, :]
|
||||
elif axis in ['columns', 1]:
|
||||
expected = ts.loc[:, indices]
|
||||
|
||||
result = ts.at_time('9:30', axis=axis)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_between_time(self, close_open_fixture):
|
||||
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
|
||||
stime = time(0, 0)
|
||||
etime = time(1, 0)
|
||||
inc_start, inc_end = close_open_fixture
|
||||
|
||||
filtered = ts.between_time(stime, etime, inc_start, inc_end)
|
||||
exp_len = 13 * 4 + 1
|
||||
if not inc_start:
|
||||
exp_len -= 5
|
||||
if not inc_end:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inc_start:
|
||||
assert t >= stime
|
||||
else:
|
||||
assert t > stime
|
||||
|
||||
if inc_end:
|
||||
assert t <= etime
|
||||
else:
|
||||
assert t < etime
|
||||
|
||||
result = ts.between_time('00:00', '01:00')
|
||||
expected = ts.between_time(stime, etime)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# across midnight
|
||||
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
|
||||
stime = time(22, 0)
|
||||
etime = time(9, 0)
|
||||
|
||||
filtered = ts.between_time(stime, etime, inc_start, inc_end)
|
||||
exp_len = (12 * 11 + 1) * 4 + 1
|
||||
if not inc_start:
|
||||
exp_len -= 4
|
||||
if not inc_end:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inc_start:
|
||||
assert (t >= stime) or (t <= etime)
|
||||
else:
|
||||
assert (t > stime) or (t <= etime)
|
||||
|
||||
if inc_end:
|
||||
assert (t <= etime) or (t >= stime)
|
||||
else:
|
||||
assert (t < etime) or (t >= stime)
|
||||
|
||||
def test_between_time_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.between_time(start_time='00:00', end_time='12:00')
|
||||
|
||||
def test_between_time_axis(self, axis):
|
||||
# issue 8839
|
||||
rng = date_range('1/1/2000', periods=100, freq='10min')
|
||||
ts = DataFrame(np.random.randn(len(rng), len(rng)))
|
||||
stime, etime = ('08:00:00', '09:00:00')
|
||||
exp_len = 7
|
||||
|
||||
if axis in ['index', 0]:
|
||||
ts.index = rng
|
||||
assert len(ts.between_time(stime, etime)) == exp_len
|
||||
assert len(ts.between_time(stime, etime, axis=0)) == exp_len
|
||||
|
||||
if axis in ['columns', 1]:
|
||||
ts.columns = rng
|
||||
selected = ts.between_time(stime, etime, axis=1).columns
|
||||
assert len(selected) == exp_len
|
||||
|
||||
def test_between_time_axis_raises(self, axis):
|
||||
# issue 8839
|
||||
rng = date_range('1/1/2000', periods=100, freq='10min')
|
||||
mask = np.arange(0, len(rng))
|
||||
rand_data = np.random.randn(len(rng), len(rng))
|
||||
ts = DataFrame(rand_data, index=rng, columns=rng)
|
||||
stime, etime = ('08:00:00', '09:00:00')
|
||||
|
||||
if axis in ['columns', 1]:
|
||||
ts.index = mask
|
||||
pytest.raises(TypeError, ts.between_time, stime, etime)
|
||||
pytest.raises(TypeError, ts.between_time, stime, etime, axis=0)
|
||||
|
||||
if axis in ['index', 0]:
|
||||
ts.columns = mask
|
||||
pytest.raises(TypeError, ts.between_time, stime, etime, axis=1)
|
||||
|
||||
def test_operation_on_NaT(self):
|
||||
# Both NaT and Timestamp are in DataFrame.
|
||||
df = pd.DataFrame({'foo': [pd.NaT, pd.NaT,
|
||||
pd.Timestamp('2012-05-01')]})
|
||||
|
||||
res = df.min()
|
||||
exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.max()
|
||||
exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
# GH12941, only NaTs are in DataFrame.
|
||||
df = pd.DataFrame({'foo': [pd.NaT, pd.NaT]})
|
||||
|
||||
res = df.min()
|
||||
exp = pd.Series([pd.NaT], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.max()
|
||||
exp = pd.Series([pd.NaT], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_datetime_assignment_with_NaT_and_diff_time_units(self):
|
||||
# GH 7492
|
||||
data_ns = np.array([1, 'nat'], dtype='datetime64[ns]')
|
||||
result = pd.Series(data_ns).to_frame()
|
||||
result['new'] = data_ns
|
||||
expected = pd.DataFrame({0: [1, None],
|
||||
'new': [1, None]}, dtype='datetime64[ns]')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
# OutOfBoundsDatetime error shouldn't occur
|
||||
data_s = np.array([1, 'nat'], dtype='datetime64[s]')
|
||||
result['new'] = data_s
|
||||
expected = pd.DataFrame({0: [1, None],
|
||||
'new': [1e9, None]}, dtype='datetime64[ns]')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_frame_to_period(self):
|
||||
K = 5
|
||||
|
||||
dr = date_range('1/1/2000', '1/1/2001')
|
||||
pr = period_range('1/1/2000', '1/1/2001')
|
||||
df = DataFrame(np.random.randn(len(dr), K), index=dr)
|
||||
df['mix'] = 'a'
|
||||
|
||||
pts = df.to_period()
|
||||
exp = df.copy()
|
||||
exp.index = pr
|
||||
assert_frame_equal(pts, exp)
|
||||
|
||||
pts = df.to_period('M')
|
||||
tm.assert_index_equal(pts.index, exp.index.asfreq('M'))
|
||||
|
||||
df = df.T
|
||||
pts = df.to_period(axis=1)
|
||||
exp = df.copy()
|
||||
exp.columns = pr
|
||||
assert_frame_equal(pts, exp)
|
||||
|
||||
pts = df.to_period('M', axis=1)
|
||||
tm.assert_index_equal(pts.columns, exp.columns.asfreq('M'))
|
||||
|
||||
pytest.raises(ValueError, df.to_period, axis=2)
|
||||
|
||||
@pytest.mark.parametrize("fn", ['tz_localize', 'tz_convert'])
|
||||
def test_tz_convert_and_localize(self, fn):
|
||||
l0 = date_range('20140701', periods=5, freq='D')
|
||||
l1 = date_range('20140701', periods=5, freq='D')
|
||||
|
||||
int_idx = Index(range(5))
|
||||
|
||||
if fn == 'tz_convert':
|
||||
l0 = l0.tz_localize('UTC')
|
||||
l1 = l1.tz_localize('UTC')
|
||||
|
||||
for idx in [l0, l1]:
|
||||
|
||||
l0_expected = getattr(idx, fn)('US/Pacific')
|
||||
l1_expected = getattr(idx, fn)('US/Pacific')
|
||||
|
||||
df1 = DataFrame(np.ones(5), index=l0)
|
||||
df1 = getattr(df1, fn)('US/Pacific')
|
||||
assert_index_equal(df1.index, l0_expected)
|
||||
|
||||
# MultiIndex
|
||||
# GH7846
|
||||
df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1]))
|
||||
|
||||
df3 = getattr(df2, fn)('US/Pacific', level=0)
|
||||
assert not df3.index.levels[0].equals(l0)
|
||||
assert_index_equal(df3.index.levels[0], l0_expected)
|
||||
assert_index_equal(df3.index.levels[1], l1)
|
||||
assert not df3.index.levels[1].equals(l1_expected)
|
||||
|
||||
df3 = getattr(df2, fn)('US/Pacific', level=1)
|
||||
assert_index_equal(df3.index.levels[0], l0)
|
||||
assert not df3.index.levels[0].equals(l0_expected)
|
||||
assert_index_equal(df3.index.levels[1], l1_expected)
|
||||
assert not df3.index.levels[1].equals(l1)
|
||||
|
||||
df4 = DataFrame(np.ones(5),
|
||||
MultiIndex.from_arrays([int_idx, l0]))
|
||||
|
||||
# TODO: untested
|
||||
df5 = getattr(df4, fn)('US/Pacific', level=1) # noqa
|
||||
|
||||
assert_index_equal(df3.index.levels[0], l0)
|
||||
assert not df3.index.levels[0].equals(l0_expected)
|
||||
assert_index_equal(df3.index.levels[1], l1_expected)
|
||||
assert not df3.index.levels[1].equals(l1)
|
||||
|
||||
# Bad Inputs
|
||||
|
||||
# Not DatetimeIndex / PeriodIndex
|
||||
with pytest.raises(TypeError, match='DatetimeIndex'):
|
||||
df = DataFrame(index=int_idx)
|
||||
df = getattr(df, fn)('US/Pacific')
|
||||
|
||||
# Not DatetimeIndex / PeriodIndex
|
||||
with pytest.raises(TypeError, match='DatetimeIndex'):
|
||||
df = DataFrame(np.ones(5),
|
||||
MultiIndex.from_arrays([int_idx, l0]))
|
||||
df = getattr(df, fn)('US/Pacific', level=0)
|
||||
|
||||
# Invalid level
|
||||
with pytest.raises(ValueError, match='not valid'):
|
||||
df = DataFrame(index=l0)
|
||||
df = getattr(df, fn)('US/Pacific', level=1)
|
||||
@@ -0,0 +1,198 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for DataFrame timezone-related methods
|
||||
"""
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
from pandas.compat import lrange
|
||||
|
||||
from pandas.core.dtypes.dtypes import DatetimeTZDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Series
|
||||
from pandas.core.indexes.datetimes import date_range
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
class TestDataFrameTimezones(object):
|
||||
|
||||
def test_frame_values_with_tz(self):
|
||||
tz = "US/Central"
|
||||
df = DataFrame({"A": date_range('2000', periods=4, tz=tz)})
|
||||
result = df.values
|
||||
expected = np.array([
|
||||
[pd.Timestamp('2000-01-01', tz=tz)],
|
||||
[pd.Timestamp('2000-01-02', tz=tz)],
|
||||
[pd.Timestamp('2000-01-03', tz=tz)],
|
||||
[pd.Timestamp('2000-01-04', tz=tz)],
|
||||
])
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
# two columns, homogenous
|
||||
|
||||
df = df.assign(B=df.A)
|
||||
result = df.values
|
||||
expected = np.concatenate([expected, expected], axis=1)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
# three columns, heterogenous
|
||||
est = "US/Eastern"
|
||||
df = df.assign(C=df.A.dt.tz_convert(est))
|
||||
|
||||
new = np.array([
|
||||
[pd.Timestamp('2000-01-01T01:00:00', tz=est)],
|
||||
[pd.Timestamp('2000-01-02T01:00:00', tz=est)],
|
||||
[pd.Timestamp('2000-01-03T01:00:00', tz=est)],
|
||||
[pd.Timestamp('2000-01-04T01:00:00', tz=est)],
|
||||
])
|
||||
expected = np.concatenate([expected, new], axis=1)
|
||||
result = df.values
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_frame_from_records_utc(self):
|
||||
rec = {'datum': 1.5,
|
||||
'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)}
|
||||
|
||||
# it works
|
||||
DataFrame.from_records([rec], index='begin_time')
|
||||
|
||||
def test_frame_tz_localize(self):
|
||||
rng = date_range('1/1/2011', periods=100, freq='H')
|
||||
|
||||
df = DataFrame({'a': 1}, index=rng)
|
||||
result = df.tz_localize('utc')
|
||||
expected = DataFrame({'a': 1}, rng.tz_localize('UTC'))
|
||||
assert result.index.tz.zone == 'UTC'
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = df.T
|
||||
result = df.tz_localize('utc', axis=1)
|
||||
assert result.columns.tz.zone == 'UTC'
|
||||
tm.assert_frame_equal(result, expected.T)
|
||||
|
||||
def test_frame_tz_convert(self):
|
||||
rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern')
|
||||
|
||||
df = DataFrame({'a': 1}, index=rng)
|
||||
result = df.tz_convert('Europe/Berlin')
|
||||
expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin'))
|
||||
assert result.index.tz.zone == 'Europe/Berlin'
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = df.T
|
||||
result = df.tz_convert('Europe/Berlin', axis=1)
|
||||
assert result.columns.tz.zone == 'Europe/Berlin'
|
||||
tm.assert_frame_equal(result, expected.T)
|
||||
|
||||
def test_frame_join_tzaware(self):
|
||||
test1 = DataFrame(np.zeros((6, 3)),
|
||||
index=date_range("2012-11-15 00:00:00", periods=6,
|
||||
freq="100L", tz="US/Central"))
|
||||
test2 = DataFrame(np.zeros((3, 3)),
|
||||
index=date_range("2012-11-15 00:00:00", periods=3,
|
||||
freq="250L", tz="US/Central"),
|
||||
columns=lrange(3, 6))
|
||||
|
||||
result = test1.join(test2, how='outer')
|
||||
ex_index = test1.index.union(test2.index)
|
||||
|
||||
tm.assert_index_equal(result.index, ex_index)
|
||||
assert result.index.tz.zone == 'US/Central'
|
||||
|
||||
def test_frame_add_tz_mismatch_converts_to_utc(self):
|
||||
rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern')
|
||||
df = DataFrame(np.random.randn(len(rng)), index=rng, columns=['a'])
|
||||
|
||||
df_moscow = df.tz_convert('Europe/Moscow')
|
||||
result = df + df_moscow
|
||||
assert result.index.tz is pytz.utc
|
||||
|
||||
result = df_moscow + df
|
||||
assert result.index.tz is pytz.utc
|
||||
|
||||
def test_frame_align_aware(self):
|
||||
idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern')
|
||||
idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern')
|
||||
df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
|
||||
df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
|
||||
new1, new2 = df1.align(df2)
|
||||
assert df1.index.tz == new1.index.tz
|
||||
assert df2.index.tz == new2.index.tz
|
||||
|
||||
# different timezones convert to UTC
|
||||
|
||||
# frame with frame
|
||||
df1_central = df1.tz_convert('US/Central')
|
||||
new1, new2 = df1.align(df1_central)
|
||||
assert new1.index.tz == pytz.UTC
|
||||
assert new2.index.tz == pytz.UTC
|
||||
|
||||
# frame with Series
|
||||
new1, new2 = df1.align(df1_central[0], axis=0)
|
||||
assert new1.index.tz == pytz.UTC
|
||||
assert new2.index.tz == pytz.UTC
|
||||
|
||||
df1[0].align(df1_central, axis=0)
|
||||
assert new1.index.tz == pytz.UTC
|
||||
assert new2.index.tz == pytz.UTC
|
||||
|
||||
@pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
|
||||
def test_frame_no_datetime64_dtype(self, tz):
|
||||
# after GH#7822
|
||||
# these retain the timezones on dict construction
|
||||
dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
|
||||
dr_tz = dr.tz_localize(tz)
|
||||
df = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr)
|
||||
tz_expected = DatetimeTZDtype('ns', dr_tz.tzinfo)
|
||||
assert df['B'].dtype == tz_expected
|
||||
|
||||
# GH#2810 (with timezones)
|
||||
datetimes_naive = [ts.to_pydatetime() for ts in dr]
|
||||
datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
|
||||
df = DataFrame({'dr': dr,
|
||||
'dr_tz': dr_tz,
|
||||
'datetimes_naive': datetimes_naive,
|
||||
'datetimes_with_tz': datetimes_with_tz})
|
||||
result = df.get_dtype_counts().sort_index()
|
||||
expected = Series({'datetime64[ns]': 2,
|
||||
str(tz_expected): 2}).sort_index()
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
|
||||
def test_frame_reset_index(self, tz):
|
||||
dr = date_range('2012-06-02', periods=10, tz=tz)
|
||||
df = DataFrame(np.random.randn(len(dr)), dr)
|
||||
roundtripped = df.reset_index().set_index('index')
|
||||
xp = df.index.tz
|
||||
rs = roundtripped.index.tz
|
||||
assert xp == rs
|
||||
|
||||
@pytest.mark.parametrize('tz', [None, 'America/New_York'])
|
||||
def test_boolean_compare_transpose_tzindex_with_dst(self, tz):
|
||||
# GH 19970
|
||||
idx = date_range('20161101', '20161130', freq='4H', tz=tz)
|
||||
df = DataFrame({'a': range(len(idx)), 'b': range(len(idx))},
|
||||
index=idx)
|
||||
result = df.T == df.T
|
||||
expected = DataFrame(True, index=list('ab'), columns=idx)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize('copy', [True, False])
|
||||
@pytest.mark.parametrize('method, tz', [
|
||||
['tz_localize', None],
|
||||
['tz_convert', 'Europe/Berlin']
|
||||
])
|
||||
def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz):
|
||||
# GH 6326
|
||||
result = DataFrame(np.arange(0, 5),
|
||||
index=date_range('20131027', periods=5,
|
||||
freq='1H', tz=tz))
|
||||
getattr(result, method)('UTC', copy=copy)
|
||||
expected = DataFrame(np.arange(0, 5),
|
||||
index=date_range('20131027', periods=5,
|
||||
freq='1H', tz=tz))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,32 @@
|
||||
import pytest
|
||||
|
||||
from pandas.core.frame import DataFrame
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dataframe():
|
||||
return DataFrame({'a': [1, 2], 'b': [3, 4]})
|
||||
|
||||
|
||||
class TestDataFrameValidate(object):
|
||||
"""Tests for error handling related to data types of method arguments."""
|
||||
|
||||
@pytest.mark.parametrize("func", ["query", "eval", "set_index",
|
||||
"reset_index", "dropna",
|
||||
"drop_duplicates", "sort_values"])
|
||||
@pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0])
|
||||
def test_validate_bool_args(self, dataframe, func, inplace):
|
||||
msg = "For argument \"inplace\" expected type bool"
|
||||
kwargs = dict(inplace=inplace)
|
||||
|
||||
if func == "query":
|
||||
kwargs["expr"] = "a > b"
|
||||
elif func == "eval":
|
||||
kwargs["expr"] = "a + b"
|
||||
elif func == "set_index":
|
||||
kwargs["keys"] = ["a"]
|
||||
elif func == "sort_values":
|
||||
kwargs["by"] = ["a"]
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
getattr(dataframe, func)(**kwargs)
|
||||
Reference in New Issue
Block a user