pruned venvs

This commit is contained in:
d3m1g0d
2019-03-12 21:56:25 +01:00
parent 8ee094481c
commit 33f0511081
4095 changed files with 0 additions and 748399 deletions
@@ -1,140 +0,0 @@
import numpy as np
from pandas import compat
from pandas.util._decorators import cache_readonly
import pandas.util.testing as tm
import pandas as pd
_seriesd = tm.getSeriesData()
_tsd = tm.getTimeSeriesData()
_frame = pd.DataFrame(_seriesd)
_frame2 = pd.DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])
_intframe = pd.DataFrame({k: v.astype(int)
for k, v in compat.iteritems(_seriesd)})
_tsframe = pd.DataFrame(_tsd)
_mixed_frame = _frame.copy()
_mixed_frame['foo'] = 'bar'
class TestData(object):
@cache_readonly
def frame(self):
return _frame.copy()
@cache_readonly
def frame2(self):
return _frame2.copy()
@cache_readonly
def intframe(self):
# force these all to int64 to avoid platform testing issues
return pd.DataFrame({c: s for c, s in compat.iteritems(_intframe)},
dtype=np.int64)
@cache_readonly
def tsframe(self):
return _tsframe.copy()
@cache_readonly
def mixed_frame(self):
return _mixed_frame.copy()
@cache_readonly
def mixed_float(self):
return pd.DataFrame({'A': _frame['A'].copy().astype('float32'),
'B': _frame['B'].copy().astype('float32'),
'C': _frame['C'].copy().astype('float16'),
'D': _frame['D'].copy().astype('float64')})
@cache_readonly
def mixed_float2(self):
return pd.DataFrame({'A': _frame2['A'].copy().astype('float32'),
'B': _frame2['B'].copy().astype('float32'),
'C': _frame2['C'].copy().astype('float16'),
'D': _frame2['D'].copy().astype('float64')})
@cache_readonly
def mixed_int(self):
return pd.DataFrame({'A': _intframe['A'].copy().astype('int32'),
'B': np.ones(len(_intframe['B']), dtype='uint64'),
'C': _intframe['C'].copy().astype('uint8'),
'D': _intframe['D'].copy().astype('int64')})
@cache_readonly
def all_mixed(self):
return pd.DataFrame({'a': 1., 'b': 2, 'c': 'foo',
'float32': np.array([1.] * 10, dtype='float32'),
'int32': np.array([1] * 10, dtype='int32')},
index=np.arange(10))
@cache_readonly
def tzframe(self):
result = pd.DataFrame({'A': pd.date_range('20130101', periods=3),
'B': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'C': pd.date_range('20130101', periods=3,
tz='CET')})
result.iloc[1, 1] = pd.NaT
result.iloc[1, 2] = pd.NaT
return result
@cache_readonly
def empty(self):
return pd.DataFrame({})
@cache_readonly
def ts1(self):
return tm.makeTimeSeries(nper=30)
@cache_readonly
def ts2(self):
return tm.makeTimeSeries(nper=30)[5:]
@cache_readonly
def simple(self):
arr = np.array([[1., 2., 3.],
[4., 5., 6.],
[7., 8., 9.]])
return pd.DataFrame(arr, columns=['one', 'two', 'three'],
index=['a', 'b', 'c'])
# self.ts3 = tm.makeTimeSeries()[-5:]
# self.ts4 = tm.makeTimeSeries()[1:-1]
def _check_mixed_float(df, dtype=None):
# float16 are most likely to be upcasted to float32
dtypes = dict(A='float32', B='float32', C='float16', D='float64')
if isinstance(dtype, compat.string_types):
dtypes = {k: dtype for k, v in dtypes.items()}
elif isinstance(dtype, dict):
dtypes.update(dtype)
if dtypes.get('A'):
assert(df.dtypes['A'] == dtypes['A'])
if dtypes.get('B'):
assert(df.dtypes['B'] == dtypes['B'])
if dtypes.get('C'):
assert(df.dtypes['C'] == dtypes['C'])
if dtypes.get('D'):
assert(df.dtypes['D'] == dtypes['D'])
def _check_mixed_int(df, dtype=None):
dtypes = dict(A='int32', B='uint64', C='uint8', D='int64')
if isinstance(dtype, compat.string_types):
dtypes = {k: dtype for k, v in dtypes.items()}
elif isinstance(dtype, dict):
dtypes.update(dtype)
if dtypes.get('A'):
assert(df.dtypes['A'] == dtypes['A'])
if dtypes.get('B'):
assert(df.dtypes['B'] == dtypes['B'])
if dtypes.get('C'):
assert(df.dtypes['C'] == dtypes['C'])
if dtypes.get('D'):
assert(df.dtypes['D'] == dtypes['D'])
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -1,515 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import pytest
# pylint: disable-msg=W0612,E1101
from copy import deepcopy
import pydoc
from pandas.compat import range, lrange, long
from pandas import compat
from numpy.random import randn
import numpy as np
from pandas import (DataFrame, Series, date_range, timedelta_range,
Categorical, SparseDataFrame)
import pandas as pd
from pandas.util.testing import (assert_almost_equal,
assert_series_equal,
assert_frame_equal)
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
class SharedWithSparse(object):
"""
A collection of tests DataFrame and SparseDataFrame can share.
In generic tests on this class, use ``self._assert_frame_equal()`` and
``self._assert_series_equal()`` which are implemented in sub-classes
and dispatch correctly.
"""
def _assert_frame_equal(self, left, right):
"""Dispatch to frame class dependent assertion"""
raise NotImplementedError
def _assert_series_equal(self, left, right):
"""Dispatch to series class dependent assertion"""
raise NotImplementedError
def test_copy_index_name_checking(self):
# don't want to be able to modify the index stored elsewhere after
# making a copy
for attr in ('index', 'columns'):
ind = getattr(self.frame, attr)
ind.name = None
cp = self.frame.copy()
getattr(cp, attr).name = 'foo'
assert getattr(self.frame, attr).name is None
def test_getitem_pop_assign_name(self):
s = self.frame['A']
assert s.name == 'A'
s = self.frame.pop('A')
assert s.name == 'A'
s = self.frame.loc[:, 'B']
assert s.name == 'B'
s2 = s.loc[:]
assert s2.name == 'B'
def test_get_value(self):
for idx in self.frame.index:
for col in self.frame.columns:
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = self.frame.get_value(idx, col)
expected = self.frame[col][idx]
tm.assert_almost_equal(result, expected)
def test_add_prefix_suffix(self):
with_prefix = self.frame.add_prefix('foo#')
expected = pd.Index(['foo#%s' % c for c in self.frame.columns])
tm.assert_index_equal(with_prefix.columns, expected)
with_suffix = self.frame.add_suffix('#foo')
expected = pd.Index(['%s#foo' % c for c in self.frame.columns])
tm.assert_index_equal(with_suffix.columns, expected)
with_pct_prefix = self.frame.add_prefix('%')
expected = pd.Index(['%{}'.format(c) for c in self.frame.columns])
tm.assert_index_equal(with_pct_prefix.columns, expected)
with_pct_suffix = self.frame.add_suffix('%')
expected = pd.Index(['{}%'.format(c) for c in self.frame.columns])
tm.assert_index_equal(with_pct_suffix.columns, expected)
def test_get_axis(self):
f = self.frame
assert f._get_axis_number(0) == 0
assert f._get_axis_number(1) == 1
assert f._get_axis_number('index') == 0
assert f._get_axis_number('rows') == 0
assert f._get_axis_number('columns') == 1
assert f._get_axis_name(0) == 'index'
assert f._get_axis_name(1) == 'columns'
assert f._get_axis_name('index') == 'index'
assert f._get_axis_name('rows') == 'index'
assert f._get_axis_name('columns') == 'columns'
assert f._get_axis(0) is f.index
assert f._get_axis(1) is f.columns
tm.assert_raises_regex(
ValueError, 'No axis named', f._get_axis_number, 2)
tm.assert_raises_regex(
ValueError, 'No axis.*foo', f._get_axis_name, 'foo')
tm.assert_raises_regex(
ValueError, 'No axis.*None', f._get_axis_name, None)
tm.assert_raises_regex(ValueError, 'No axis named',
f._get_axis_number, None)
def test_keys(self):
getkeys = self.frame.keys
assert getkeys() is self.frame.columns
def test_column_contains_typeerror(self):
try:
self.frame.columns in self.frame
except TypeError:
pass
def test_tab_completion(self):
# DataFrame whose columns are identifiers shall have them in __dir__.
df = pd.DataFrame([list('abcd'), list('efgh')], columns=list('ABCD'))
for key in list('ABCD'):
assert key in dir(df)
assert isinstance(df.__getitem__('A'), pd.Series)
# DataFrame whose first-level columns are identifiers shall have
# them in __dir__.
df = pd.DataFrame(
[list('abcd'), list('efgh')],
columns=pd.MultiIndex.from_tuples(list(zip('ABCD', 'EFGH'))))
for key in list('ABCD'):
assert key in dir(df)
for key in list('EFGH'):
assert key not in dir(df)
assert isinstance(df.__getitem__('A'), pd.DataFrame)
def test_not_hashable(self):
df = self.klass([1])
pytest.raises(TypeError, hash, df)
pytest.raises(TypeError, hash, self.empty)
def test_new_empty_index(self):
df1 = self.klass(randn(0, 3))
df2 = self.klass(randn(0, 3))
df1.index.name = 'foo'
assert df2.index.name is None
def test_array_interface(self):
with np.errstate(all='ignore'):
result = np.sqrt(self.frame)
assert isinstance(result, type(self.frame))
assert result.index is self.frame.index
assert result.columns is self.frame.columns
self._assert_frame_equal(result, self.frame.apply(np.sqrt))
def test_get_agg_axis(self):
cols = self.frame._get_agg_axis(0)
assert cols is self.frame.columns
idx = self.frame._get_agg_axis(1)
assert idx is self.frame.index
pytest.raises(ValueError, self.frame._get_agg_axis, 2)
def test_nonzero(self):
assert self.empty.empty
assert not self.frame.empty
assert not self.mixed_frame.empty
# corner case
df = DataFrame({'A': [1., 2., 3.],
'B': ['a', 'b', 'c']},
index=np.arange(3))
del df['A']
assert not df.empty
def test_iteritems(self):
df = self.klass([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
for k, v in compat.iteritems(df):
assert isinstance(v, self.klass._constructor_sliced)
def test_items(self):
# issue #17213, #13918
cols = ['a', 'b', 'c']
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
for c, (k, v) in zip(cols, df.items()):
assert c == k
assert isinstance(v, Series)
assert (df[k] == v).all()
def test_iter(self):
assert tm.equalContents(list(self.frame), self.frame.columns)
def test_iterrows(self):
for k, v in self.frame.iterrows():
exp = self.frame.loc[k]
self._assert_series_equal(v, exp)
for k, v in self.mixed_frame.iterrows():
exp = self.mixed_frame.loc[k]
self._assert_series_equal(v, exp)
def test_iterrows_iso8601(self):
# GH19671
if self.klass == SparseDataFrame:
pytest.xfail(reason='SparseBlock datetime type not implemented.')
s = self.klass(
{'non_iso8601': ['M1701', 'M1802', 'M1903', 'M2004'],
'iso8601': date_range('2000-01-01', periods=4, freq='M')})
for k, v in s.iterrows():
exp = s.loc[k]
self._assert_series_equal(v, exp)
def test_itertuples(self):
for i, tup in enumerate(self.frame.itertuples()):
s = self.klass._constructor_sliced(tup[1:])
s.name = tup[0]
expected = self.frame.iloc[i, :].reset_index(drop=True)
self._assert_series_equal(s, expected)
df = self.klass({'floats': np.random.randn(5),
'ints': lrange(5)}, columns=['floats', 'ints'])
for tup in df.itertuples(index=False):
assert isinstance(tup[1], (int, long))
df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
dfaa = df[['a', 'a']]
assert (list(dfaa.itertuples()) ==
[(0, 1, 1), (1, 2, 2), (2, 3, 3)])
# repr with be int/long on 32-bit/windows
if not (compat.is_platform_windows() or compat.is_platform_32bit()):
assert (repr(list(df.itertuples(name=None))) ==
'[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')
tup = next(df.itertuples(name='TestName'))
assert tup._fields == ('Index', 'a', 'b')
assert (tup.Index, tup.a, tup.b) == tup
assert type(tup).__name__ == 'TestName'
df.columns = ['def', 'return']
tup2 = next(df.itertuples(name='TestName'))
assert tup2 == (0, 1, 4)
assert tup2._fields == ('Index', '_1', '_2')
df3 = DataFrame({'f' + str(i): [i] for i in range(1024)})
# will raise SyntaxError if trying to create namedtuple
tup3 = next(df3.itertuples())
assert not hasattr(tup3, '_fields')
assert isinstance(tup3, tuple)
def test_sequence_like_with_categorical(self):
# GH 7839
# make sure can iterate
df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
"raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
df['grade'] = Categorical(df['raw_grade'])
# basic sequencing testing
result = list(df.grade.values)
expected = np.array(df.grade.values).tolist()
tm.assert_almost_equal(result, expected)
# iteration
for t in df.itertuples(index=False):
str(t)
for row, s in df.iterrows():
str(s)
for c, col in df.iteritems():
str(s)
def test_len(self):
assert len(self.frame) == len(self.frame.index)
def test_values(self):
frame = self.frame
arr = frame.values
frame_cols = frame.columns
for i, row in enumerate(arr):
for j, value in enumerate(row):
col = frame_cols[j]
if np.isnan(value):
assert np.isnan(frame[col][i])
else:
assert value == frame[col][i]
# mixed type
arr = self.mixed_frame[['foo', 'A']].values
assert arr[0, 0] == 'bar'
df = self.klass({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]})
arr = df.values
assert arr[0, 0] == 1j
# single block corner case
arr = self.frame[['A', 'B']].values
expected = self.frame.reindex(columns=['A', 'B']).values
assert_almost_equal(arr, expected)
def test_transpose(self):
frame = self.frame
dft = frame.T
for idx, series in compat.iteritems(dft):
for col, value in compat.iteritems(series):
if np.isnan(value):
assert np.isnan(frame[col][idx])
else:
assert value == frame[col][idx]
# mixed type
index, data = tm.getMixedTypeDict()
mixed = self.klass(data, index=index)
mixed_T = mixed.T
for col, s in compat.iteritems(mixed_T):
assert s.dtype == np.object_
def test_swapaxes(self):
df = self.klass(np.random.randn(10, 5))
self._assert_frame_equal(df.T, df.swapaxes(0, 1))
self._assert_frame_equal(df.T, df.swapaxes(1, 0))
self._assert_frame_equal(df, df.swapaxes(0, 0))
pytest.raises(ValueError, df.swapaxes, 2, 5)
def test_axis_aliases(self):
f = self.frame
# reg name
expected = f.sum(axis=0)
result = f.sum(axis='index')
assert_series_equal(result, expected)
expected = f.sum(axis=1)
result = f.sum(axis='columns')
assert_series_equal(result, expected)
def test_class_axis(self):
# https://github.com/pandas-dev/pandas/issues/18147
# no exception and no empty docstring
assert pydoc.getdoc(DataFrame.index)
assert pydoc.getdoc(DataFrame.columns)
def test_more_values(self):
values = self.mixed_frame.values
assert values.shape[1] == len(self.mixed_frame.columns)
def test_repr_with_mi_nat(self):
df = self.klass({'X': [1, 2]},
index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']])
res = repr(df)
exp = ' X\nNaT a 1\n2013-01-01 b 2'
assert res == exp
def test_iteritems_names(self):
for k, v in compat.iteritems(self.mixed_frame):
assert v.name == k
def test_series_put_names(self):
series = self.mixed_frame._series
for k, v in compat.iteritems(series):
assert v.name == k
def test_empty_nonzero(self):
df = self.klass([1, 2, 3])
assert not df.empty
df = self.klass(index=[1], columns=[1])
assert not df.empty
df = self.klass(index=['a', 'b'], columns=['c', 'd']).dropna()
assert df.empty
assert df.T.empty
empty_frames = [self.klass(),
self.klass(index=[1]),
self.klass(columns=[1]),
self.klass({1: []})]
for df in empty_frames:
assert df.empty
assert df.T.empty
def test_with_datetimelikes(self):
df = self.klass({'A': date_range('20130101', periods=10),
'B': timedelta_range('1 day', periods=10)})
t = df.T
result = t.get_dtype_counts()
expected = Series({'object': 10})
tm.assert_series_equal(result, expected)
class TestDataFrameMisc(SharedWithSparse, TestData):
klass = DataFrame
# SharedWithSparse tests use generic, klass-agnostic assertion
_assert_frame_equal = staticmethod(assert_frame_equal)
_assert_series_equal = staticmethod(assert_series_equal)
def test_values(self):
self.frame.values[:, 0] = 5.
assert (self.frame.values[:, 0] == 5).all()
def test_as_matrix_deprecated(self):
# GH18458
with tm.assert_produces_warning(FutureWarning):
result = self.frame.as_matrix(columns=self.frame.columns.tolist())
expected = self.frame.values
tm.assert_numpy_array_equal(result, expected)
def test_deepcopy(self):
cp = deepcopy(self.frame)
series = cp['A']
series[:] = 10
for idx, value in compat.iteritems(series):
assert self.frame['A'][idx] != value
def test_transpose_get_view(self):
dft = self.frame.T
dft.values[:, 5:10] = 5
assert (self.frame.values[5:10] == 5).all()
def test_inplace_return_self(self):
# re #1893
data = DataFrame({'a': ['foo', 'bar', 'baz', 'qux'],
'b': [0, 0, 1, 1],
'c': [1, 2, 3, 4]})
def _check_f(base, f):
result = f(base)
assert result is None
# -----DataFrame-----
# set_index
f = lambda x: x.set_index('a', inplace=True)
_check_f(data.copy(), f)
# reset_index
f = lambda x: x.reset_index(inplace=True)
_check_f(data.set_index('a'), f)
# drop_duplicates
f = lambda x: x.drop_duplicates(inplace=True)
_check_f(data.copy(), f)
# sort
f = lambda x: x.sort_values('b', inplace=True)
_check_f(data.copy(), f)
# sort_index
f = lambda x: x.sort_index(inplace=True)
_check_f(data.copy(), f)
# fillna
f = lambda x: x.fillna(0, inplace=True)
_check_f(data.copy(), f)
# replace
f = lambda x: x.replace(1, 0, inplace=True)
_check_f(data.copy(), f)
# rename
f = lambda x: x.rename({1: 'foo'}, inplace=True)
_check_f(data.copy(), f)
# -----Series-----
d = data.copy()['c']
# reset_index
f = lambda x: x.reset_index(inplace=True, drop=True)
_check_f(data.set_index('a')['c'], f)
# fillna
f = lambda x: x.fillna(0, inplace=True)
_check_f(d.copy(), f)
# replace
f = lambda x: x.replace(1, 0, inplace=True)
_check_f(d.copy(), f)
# rename
f = lambda x: x.rename({1: 'foo'}, inplace=True)
_check_f(d.copy(), f)
def test_tab_complete_warning(self, ip):
# https://github.com/pandas-dev/pandas/issues/16409
pytest.importorskip('IPython', minversion="6.0.0")
from IPython.core.completer import provisionalcompleter
code = "import pandas as pd; df = pd.DataFrame()"
ip.run_code(code)
with tm.assert_produces_warning(None):
with provisionalcompleter('ignore'):
list(ip.Completer.completions('df.', 1))
File diff suppressed because it is too large Load Diff
@@ -1,277 +0,0 @@
# -*- coding: utf-8 -*-
import pytest
import numpy as np
from pandas.compat import range
import pandas as pd
import pandas.util.testing as tm
# -------------------------------------------------------------------
# Comparisons
class TestFrameComparisons(object):
def test_df_boolean_comparison_error(self):
# GH#4576
# boolean comparisons with a tuple/list give unexpected results
df = pd.DataFrame(np.arange(6).reshape((3, 2)))
# not shape compatible
with pytest.raises(ValueError):
df == (2, 2)
with pytest.raises(ValueError):
df == [2, 2]
def test_df_float_none_comparison(self):
df = pd.DataFrame(np.random.randn(8, 3), index=range(8),
columns=['A', 'B', 'C'])
with pytest.raises(TypeError):
df.__eq__(None)
def test_df_string_comparison(self):
df = pd.DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}])
mask_a = df.a > 1
tm.assert_frame_equal(df[mask_a], df.loc[1:1, :])
tm.assert_frame_equal(df[-mask_a], df.loc[0:0, :])
mask_b = df.b == "foo"
tm.assert_frame_equal(df[mask_b], df.loc[0:0, :])
tm.assert_frame_equal(df[-mask_b], df.loc[1:1, :])
@pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
def test_df_flex_cmp_constant_return_types(self, opname):
# GH#15077, non-empty DataFrame
df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
const = 2
result = getattr(df, opname)(const).get_dtype_counts()
tm.assert_series_equal(result, pd.Series([2], ['bool']))
@pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le'])
def test_df_flex_cmp_constant_return_types_empty(self, opname):
# GH#15077 empty DataFrame
df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
const = 2
empty = df.iloc[:0]
result = getattr(empty, opname)(const).get_dtype_counts()
tm.assert_series_equal(result, pd.Series([2], ['bool']))
@pytest.mark.parametrize('timestamps', [
[pd.Timestamp('2012-01-01 13:00:00+00:00')] * 2,
[pd.Timestamp('2012-01-01 13:00:00')] * 2])
def test_tz_aware_scalar_comparison(self, timestamps):
# Test for issue #15966
df = pd.DataFrame({'test': timestamps})
expected = pd.DataFrame({'test': [False, False]})
tm.assert_frame_equal(df == -1, expected)
# -------------------------------------------------------------------
# Arithmetic
class TestFrameFlexArithmetic(object):
def test_df_add_flex_filled_mixed_dtypes(self):
# GH#19611
dti = pd.date_range('2016-01-01', periods=3)
ser = pd.Series(['1 Day', 'NaT', '2 Days'], dtype='timedelta64[ns]')
df = pd.DataFrame({'A': dti, 'B': ser})
other = pd.DataFrame({'A': ser, 'B': ser})
fill = pd.Timedelta(days=1).to_timedelta64()
result = df.add(other, fill_value=fill)
expected = pd.DataFrame(
{'A': pd.Series(['2016-01-02', '2016-01-03', '2016-01-05'],
dtype='datetime64[ns]'),
'B': ser * 2})
tm.assert_frame_equal(result, expected)
class TestFrameMulDiv(object):
"""Tests for DataFrame multiplication and division"""
# ------------------------------------------------------------------
# Mod By Zero
def test_df_mod_zero_df(self):
# GH#3590, modulo as ints
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
# this is technically wrong, as the integer portion is coerced to float
# ###
first = pd.Series([0, 0, 0, 0], dtype='float64')
second = pd.Series([np.nan, np.nan, np.nan, 0])
expected = pd.DataFrame({'first': first, 'second': second})
result = df % df
tm.assert_frame_equal(result, expected)
def test_df_mod_zero_array(self):
# GH#3590, modulo as ints
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
# this is technically wrong, as the integer portion is coerced to float
# ###
first = pd.Series([0, 0, 0, 0], dtype='float64')
second = pd.Series([np.nan, np.nan, np.nan, 0])
expected = pd.DataFrame({'first': first, 'second': second})
# numpy has a slightly different (wrong) treatment
with np.errstate(all='ignore'):
arr = df.values % df.values
result2 = pd.DataFrame(arr, index=df.index,
columns=df.columns, dtype='float64')
result2.iloc[0:3, 1] = np.nan
tm.assert_frame_equal(result2, expected)
def test_df_mod_zero_int(self):
# GH#3590, modulo as ints
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
result = df % 0
expected = pd.DataFrame(np.nan, index=df.index, columns=df.columns)
tm.assert_frame_equal(result, expected)
# numpy has a slightly different (wrong) treatment
with np.errstate(all='ignore'):
arr = df.values.astype('float64') % 0
result2 = pd.DataFrame(arr, index=df.index, columns=df.columns)
tm.assert_frame_equal(result2, expected)
def test_df_mod_zero_series_does_not_commute(self):
# GH#3590, modulo as ints
# not commutative with series
df = pd.DataFrame(np.random.randn(10, 5))
ser = df[0]
res = ser % df
res2 = df % ser
assert not res.fillna(0).equals(res2.fillna(0))
# ------------------------------------------------------------------
# Division By Zero
def test_df_div_zero_df(self):
# integer div, but deal with the 0's (GH#9144)
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
result = df / df
first = pd.Series([1.0, 1.0, 1.0, 1.0])
second = pd.Series([np.nan, np.nan, np.nan, 1])
expected = pd.DataFrame({'first': first, 'second': second})
tm.assert_frame_equal(result, expected)
def test_df_div_zero_array(self):
# integer div, but deal with the 0's (GH#9144)
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
first = pd.Series([1.0, 1.0, 1.0, 1.0])
second = pd.Series([np.nan, np.nan, np.nan, 1])
expected = pd.DataFrame({'first': first, 'second': second})
with np.errstate(all='ignore'):
arr = df.values.astype('float') / df.values
result = pd.DataFrame(arr, index=df.index,
columns=df.columns)
tm.assert_frame_equal(result, expected)
def test_df_div_zero_int(self):
# integer div, but deal with the 0's (GH#9144)
df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
result = df / 0
expected = pd.DataFrame(np.inf, index=df.index, columns=df.columns)
expected.iloc[0:3, 1] = np.nan
tm.assert_frame_equal(result, expected)
# numpy has a slightly different (wrong) treatment
with np.errstate(all='ignore'):
arr = df.values.astype('float64') / 0
result2 = pd.DataFrame(arr, index=df.index,
columns=df.columns)
tm.assert_frame_equal(result2, expected)
def test_df_div_zero_series_does_not_commute(self):
# integer div, but deal with the 0's (GH#9144)
df = pd.DataFrame(np.random.randn(10, 5))
ser = df[0]
res = ser / df
res2 = df / ser
assert not res.fillna(0).equals(res2.fillna(0))
class TestFrameArithmetic(object):
@pytest.mark.xfail(reason='GH#7996 datetime64 units not converted to nano')
def test_df_sub_datetime64_not_ns(self):
df = pd.DataFrame(pd.date_range('20130101', periods=3))
dt64 = np.datetime64('2013-01-01')
assert dt64.dtype == 'datetime64[D]'
res = df - dt64
expected = pd.DataFrame([pd.Timedelta(days=0), pd.Timedelta(days=1),
pd.Timedelta(days=2)])
tm.assert_frame_equal(res, expected)
@pytest.mark.parametrize('data', [
[1, 2, 3],
[1.1, 2.2, 3.3],
[pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), pd.NaT],
['x', 'y', 1]])
@pytest.mark.parametrize('dtype', [None, object])
def test_df_radd_str_invalid(self, dtype, data):
df = pd.DataFrame(data, dtype=dtype)
with pytest.raises(TypeError):
'foo_' + df
@pytest.mark.parametrize('dtype', [None, object])
def test_df_with_dtype_radd_int(self, dtype):
df = pd.DataFrame([1, 2, 3], dtype=dtype)
expected = pd.DataFrame([2, 3, 4], dtype=dtype)
result = 1 + df
tm.assert_frame_equal(result, expected)
result = df + 1
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize('dtype', [None, object])
def test_df_with_dtype_radd_nan(self, dtype):
df = pd.DataFrame([1, 2, 3], dtype=dtype)
expected = pd.DataFrame([np.nan, np.nan, np.nan], dtype=dtype)
result = np.nan + df
tm.assert_frame_equal(result, expected)
result = df + np.nan
tm.assert_frame_equal(result, expected)
def test_df_radd_str(self):
df = pd.DataFrame(['x', np.nan, 'x'])
tm.assert_frame_equal('a' + df, pd.DataFrame(['ax', np.nan, 'ax']))
tm.assert_frame_equal(df + 'a', pd.DataFrame(['xa', np.nan, 'xa']))
class TestPeriodFrameArithmetic(object):
def test_ops_frame_period(self):
# GH 13043
df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'),
pd.Period('2015-02', freq='M')],
'B': [pd.Period('2014-01', freq='M'),
pd.Period('2014-02', freq='M')]})
assert df['A'].dtype == object
assert df['B'].dtype == object
p = pd.Period('2015-03', freq='M')
# dtype will be object because of original dtype
exp = pd.DataFrame({'A': np.array([2, 1], dtype=object),
'B': np.array([14, 13], dtype=object)})
tm.assert_frame_equal(p - df, exp)
tm.assert_frame_equal(df - p, -1 * exp)
df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'),
pd.Period('2015-06', freq='M')],
'B': [pd.Period('2015-05', freq='M'),
pd.Period('2015-06', freq='M')]})
assert df2['A'].dtype == object
assert df2['B'].dtype == object
exp = pd.DataFrame({'A': np.array([4, 4], dtype=object),
'B': np.array([16, 16], dtype=object)})
tm.assert_frame_equal(df2 - df, exp)
tm.assert_frame_equal(df - df2, -1 * exp)
@@ -1,108 +0,0 @@
# coding=utf-8
import numpy as np
from pandas import (DataFrame, date_range, Timestamp, Series,
to_datetime)
import pandas.util.testing as tm
from .common import TestData
class TestFrameAsof(TestData):
def setup_method(self, method):
self.N = N = 50
self.rng = date_range('1/1/1990', periods=N, freq='53s')
self.df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
index=self.rng)
def test_basic(self):
df = self.df.copy()
df.loc[15:30, 'A'] = np.nan
dates = date_range('1/1/1990', periods=self.N * 3,
freq='25s')
result = df.asof(dates)
assert result.notna().all(1).all()
lb = df.index[14]
ub = df.index[30]
dates = list(dates)
result = df.asof(dates)
assert result.notna().all(1).all()
mask = (result.index >= lb) & (result.index < ub)
rs = result[mask]
assert (rs == 14).all(1).all()
def test_subset(self):
N = 10
rng = date_range('1/1/1990', periods=N, freq='53s')
df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
index=rng)
df.loc[4:8, 'A'] = np.nan
dates = date_range('1/1/1990', periods=N * 3,
freq='25s')
# with a subset of A should be the same
result = df.asof(dates, subset='A')
expected = df.asof(dates)
tm.assert_frame_equal(result, expected)
# same with A/B
result = df.asof(dates, subset=['A', 'B'])
expected = df.asof(dates)
tm.assert_frame_equal(result, expected)
# B gives self.df.asof
result = df.asof(dates, subset='B')
expected = df.resample('25s', closed='right').ffill().reindex(dates)
expected.iloc[20:] = 9
tm.assert_frame_equal(result, expected)
def test_missing(self):
# GH 15118
# no match found - `where` value before earliest date in index
N = 10
rng = date_range('1/1/1990', periods=N, freq='53s')
df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
index=rng)
result = df.asof('1989-12-31')
expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31'))
tm.assert_series_equal(result, expected)
result = df.asof(to_datetime(['1989-12-31']))
expected = DataFrame(index=to_datetime(['1989-12-31']),
columns=['A', 'B'], dtype='float64')
tm.assert_frame_equal(result, expected)
def test_all_nans(self):
# GH 15713
# DataFrame is all nans
result = DataFrame([np.nan]).asof([0])
expected = DataFrame([np.nan])
tm.assert_frame_equal(result, expected)
# testing non-default indexes, multiple inputs
dates = date_range('1/1/1990', periods=self.N * 3, freq='25s')
result = DataFrame(np.nan, index=self.rng, columns=['A']).asof(dates)
expected = DataFrame(np.nan, index=dates, columns=['A'])
tm.assert_frame_equal(result, expected)
# testing multiple columns
dates = date_range('1/1/1990', periods=self.N * 3, freq='25s')
result = DataFrame(np.nan, index=self.rng,
columns=['A', 'B', 'C']).asof(dates)
expected = DataFrame(np.nan, index=dates, columns=['A', 'B', 'C'])
tm.assert_frame_equal(result, expected)
# testing scalar input
result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof([3])
expected = DataFrame(np.nan, index=[3], columns=['A', 'B'])
tm.assert_frame_equal(result, expected)
result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof(3)
expected = Series(np.nan, index=['A', 'B'], name=3)
tm.assert_series_equal(result, expected)
@@ -1,560 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import pytest
from datetime import datetime, timedelta
import itertools
from numpy import nan
import numpy as np
from pandas import (DataFrame, Series, Timestamp, date_range, compat,
option_context)
from pandas.compat import StringIO
import pandas as pd
from pandas.util.testing import (assert_almost_equal,
assert_series_equal,
assert_frame_equal)
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
# Segregated collection of methods that require the BlockManager internal data
# structure
class TestDataFrameBlockInternals(TestData):
def test_cast_internals(self):
casted = DataFrame(self.frame._data, dtype=int)
expected = DataFrame(self.frame._series, dtype=int)
assert_frame_equal(casted, expected)
casted = DataFrame(self.frame._data, dtype=np.int32)
expected = DataFrame(self.frame._series, dtype=np.int32)
assert_frame_equal(casted, expected)
def test_consolidate(self):
self.frame['E'] = 7.
consolidated = self.frame._consolidate()
assert len(consolidated._data.blocks) == 1
# Ensure copy, do I want this?
recons = consolidated._consolidate()
assert recons is not consolidated
tm.assert_frame_equal(recons, consolidated)
self.frame['F'] = 8.
assert len(self.frame._data.blocks) == 3
self.frame._consolidate(inplace=True)
assert len(self.frame._data.blocks) == 1
def test_consolidate_deprecation(self):
self.frame['E'] = 7
with tm.assert_produces_warning(FutureWarning):
self.frame.consolidate()
def test_consolidate_inplace(self):
frame = self.frame.copy() # noqa
# triggers in-place consolidation
for letter in range(ord('A'), ord('Z')):
self.frame[chr(letter)] = chr(letter)
def test_values_consolidate(self):
self.frame['E'] = 7.
assert not self.frame._data.is_consolidated()
_ = self.frame.values # noqa
assert self.frame._data.is_consolidated()
def test_modify_values(self):
self.frame.values[5] = 5
assert (self.frame.values[5] == 5).all()
# unconsolidated
self.frame['E'] = 7.
self.frame.values[6] = 6
assert (self.frame.values[6] == 6).all()
def test_boolean_set_uncons(self):
self.frame['E'] = 7.
expected = self.frame.values.copy()
expected[expected > 1] = 2
self.frame[self.frame > 1] = 2
assert_almost_equal(expected, self.frame.values)
def test_values_numeric_cols(self):
self.frame['foo'] = 'bar'
values = self.frame[['A', 'B', 'C', 'D']].values
assert values.dtype == np.float64
def test_values_lcd(self):
# mixed lcd
values = self.mixed_float[['A', 'B', 'C', 'D']].values
assert values.dtype == np.float64
values = self.mixed_float[['A', 'B', 'C']].values
assert values.dtype == np.float32
values = self.mixed_float[['C']].values
assert values.dtype == np.float16
# GH 10364
# B uint64 forces float because there are other signed int types
values = self.mixed_int[['A', 'B', 'C', 'D']].values
assert values.dtype == np.float64
values = self.mixed_int[['A', 'D']].values
assert values.dtype == np.int64
# B uint64 forces float because there are other signed int types
values = self.mixed_int[['A', 'B', 'C']].values
assert values.dtype == np.float64
# as B and C are both unsigned, no forcing to float is needed
values = self.mixed_int[['B', 'C']].values
assert values.dtype == np.uint64
values = self.mixed_int[['A', 'C']].values
assert values.dtype == np.int32
values = self.mixed_int[['C', 'D']].values
assert values.dtype == np.int64
values = self.mixed_int[['A']].values
assert values.dtype == np.int32
values = self.mixed_int[['C']].values
assert values.dtype == np.uint8
def test_constructor_with_convert(self):
# this is actually mostly a test of lib.maybe_convert_objects
# #2845
df = DataFrame({'A': [2 ** 63 - 1]})
result = df['A']
expected = Series(np.asarray([2 ** 63 - 1], np.int64), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [2 ** 63]})
result = df['A']
expected = Series(np.asarray([2 ** 63], np.uint64), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [datetime(2005, 1, 1), True]})
result = df['A']
expected = Series(np.asarray([datetime(2005, 1, 1), True], np.object_),
name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [None, 1]})
result = df['A']
expected = Series(np.asarray([np.nan, 1], np.float_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0, 2]})
result = df['A']
expected = Series(np.asarray([1.0, 2], np.float_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0 + 2.0j, 3]})
result = df['A']
expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0 + 2.0j, 3.0]})
result = df['A']
expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0 + 2.0j, True]})
result = df['A']
expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0, None]})
result = df['A']
expected = Series(np.asarray([1.0, np.nan], np.float_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [1.0 + 2.0j, None]})
result = df['A']
expected = Series(np.asarray(
[1.0 + 2.0j, np.nan], np.complex_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [2.0, 1, True, None]})
result = df['A']
expected = Series(np.asarray(
[2.0, 1, True, None], np.object_), name='A')
assert_series_equal(result, expected)
df = DataFrame({'A': [2.0, 1, datetime(2006, 1, 1), None]})
result = df['A']
expected = Series(np.asarray([2.0, 1, datetime(2006, 1, 1),
None], np.object_), name='A')
assert_series_equal(result, expected)
def test_construction_with_mixed(self):
# test construction edge cases with mixed types
# f7u12, this does not work without extensive workaround
data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
[datetime(2000, 1, 2), datetime(2000, 1, 3),
datetime(2000, 1, 1)]]
df = DataFrame(data)
# check dtypes
result = df.get_dtype_counts().sort_values()
expected = Series({'datetime64[ns]': 3})
# mixed-type frames
self.mixed_frame['datetime'] = datetime.now()
self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)
assert self.mixed_frame['datetime'].dtype == 'M8[ns]'
assert self.mixed_frame['timedelta'].dtype == 'm8[ns]'
result = self.mixed_frame.get_dtype_counts().sort_values()
expected = Series({'float64': 4,
'object': 1,
'datetime64[ns]': 1,
'timedelta64[ns]': 1}).sort_values()
assert_series_equal(result, expected)
def test_construction_with_conversions(self):
# convert from a numpy array of non-ns timedelta64
arr = np.array([1, 2, 3], dtype='timedelta64[s]')
df = DataFrame(index=range(3))
df['A'] = arr
expected = DataFrame({'A': pd.timedelta_range('00:00:01', periods=3,
freq='s')},
index=range(3))
assert_frame_equal(df, expected)
expected = DataFrame({
'dt1': Timestamp('20130101'),
'dt2': date_range('20130101', periods=3),
# 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
}, index=range(3))
df = DataFrame(index=range(3))
df['dt1'] = np.datetime64('2013-01-01')
df['dt2'] = np.array(['2013-01-01', '2013-01-02', '2013-01-03'],
dtype='datetime64[D]')
# df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01
# 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')
assert_frame_equal(df, expected)
def test_constructor_compound_dtypes(self):
# GH 5191
# compound dtypes should raise not-implementederror
def f(dtype):
data = list(itertools.repeat((datetime(2001, 1, 1),
"aa", 20), 9))
return DataFrame(data=data,
columns=["A", "B", "C"],
dtype=dtype)
pytest.raises(NotImplementedError, f,
[("A", "datetime64[h]"),
("B", "str"),
("C", "int32")])
# these work (though results may be unexpected)
f('int64')
f('float64')
# 10822
# invalid error message on dt inference
if not compat.is_platform_windows():
f('M8[ns]')
def test_equals_different_blocks(self):
# GH 9330
df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2],
"C": ["w", "z"]})
df1 = df0.reset_index()[["A", "B", "C"]]
# this assert verifies that the above operations have
# induced a block rearrangement
assert (df0._data.blocks[0].dtype != df1._data.blocks[0].dtype)
# do the real tests
assert_frame_equal(df0, df1)
assert df0.equals(df1)
assert df1.equals(df0)
def test_copy_blocks(self):
# API/ENH 9607
df = DataFrame(self.frame, copy=True)
column = df.columns[0]
# use the default copy=True, change a column
# deprecated 0.21.0
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
blocks = df.as_blocks()
for dtype, _df in blocks.items():
if column in _df:
_df.loc[:, column] = _df[column] + 1
# make sure we did not change the original DataFrame
assert not _df[column].equals(df[column])
def test_no_copy_blocks(self):
# API/ENH 9607
df = DataFrame(self.frame, copy=True)
column = df.columns[0]
# use the copy=False, change a column
# deprecated 0.21.0
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
blocks = df.as_blocks(copy=False)
for dtype, _df in blocks.items():
if column in _df:
_df.loc[:, column] = _df[column] + 1
# make sure we did change the original DataFrame
assert _df[column].equals(df[column])
def test_copy(self):
cop = self.frame.copy()
cop['E'] = cop['A']
assert 'E' not in self.frame
# copy objects
copy = self.mixed_frame.copy()
assert copy._data is not self.mixed_frame._data
def test_pickle(self):
unpickled = tm.round_trip_pickle(self.mixed_frame)
assert_frame_equal(self.mixed_frame, unpickled)
# buglet
self.mixed_frame._data.ndim
# empty
unpickled = tm.round_trip_pickle(self.empty)
repr(unpickled)
# tz frame
unpickled = tm.round_trip_pickle(self.tzframe)
assert_frame_equal(self.tzframe, unpickled)
def test_consolidate_datetime64(self):
# numpy vstack bug
data = """\
starting,ending,measure
2012-06-21 00:00,2012-06-23 07:00,77
2012-06-23 07:00,2012-06-23 16:30,65
2012-06-23 16:30,2012-06-25 08:00,77
2012-06-25 08:00,2012-06-26 12:00,0
2012-06-26 12:00,2012-06-27 08:00,77
"""
df = pd.read_csv(StringIO(data), parse_dates=[0, 1])
ser_starting = df.starting
ser_starting.index = ser_starting.values
ser_starting = ser_starting.tz_localize('US/Eastern')
ser_starting = ser_starting.tz_convert('UTC')
ser_starting.index.name = 'starting'
ser_ending = df.ending
ser_ending.index = ser_ending.values
ser_ending = ser_ending.tz_localize('US/Eastern')
ser_ending = ser_ending.tz_convert('UTC')
ser_ending.index.name = 'ending'
df.starting = ser_starting.index
df.ending = ser_ending.index
tm.assert_index_equal(pd.DatetimeIndex(
df.starting), ser_starting.index)
tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index)
def test_is_mixed_type(self):
assert not self.frame._is_mixed_type
assert self.mixed_frame._is_mixed_type
def test_get_numeric_data(self):
# TODO(wesm): unused?
intname = np.dtype(np.int_).name # noqa
floatname = np.dtype(np.float_).name # noqa
datetime64name = np.dtype('M8[ns]').name
objectname = np.dtype(np.object_).name
df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
'f': Timestamp('20010102')},
index=np.arange(10))
result = df.get_dtype_counts()
expected = Series({'int64': 1, 'float64': 1,
datetime64name: 1, objectname: 1})
result = result.sort_index()
expected = expected.sort_index()
assert_series_equal(result, expected)
df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
'd': np.array([1.] * 10, dtype='float32'),
'e': np.array([1] * 10, dtype='int32'),
'f': np.array([1] * 10, dtype='int16'),
'g': Timestamp('20010102')},
index=np.arange(10))
result = df._get_numeric_data()
expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
assert_frame_equal(result, expected)
only_obj = df.loc[:, ['c', 'g']]
result = only_obj._get_numeric_data()
expected = df.loc[:, []]
assert_frame_equal(result, expected)
df = DataFrame.from_dict(
{'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]})
result = df._get_numeric_data()
expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
assert_frame_equal(result, expected)
df = result.copy()
result = df._get_numeric_data()
expected = df
assert_frame_equal(result, expected)
def test_convert_objects(self):
oops = self.mixed_frame.T.T
converted = oops._convert(datetime=True)
assert_frame_equal(converted, self.mixed_frame)
assert converted['A'].dtype == np.float64
# force numeric conversion
self.mixed_frame['H'] = '1.'
self.mixed_frame['I'] = '1'
# add in some items that will be nan
length = len(self.mixed_frame)
self.mixed_frame['J'] = '1.'
self.mixed_frame['K'] = '1'
self.mixed_frame.loc[0:5, ['J', 'K']] = 'garbled'
converted = self.mixed_frame._convert(datetime=True, numeric=True)
assert converted['H'].dtype == 'float64'
assert converted['I'].dtype == 'int64'
assert converted['J'].dtype == 'float64'
assert converted['K'].dtype == 'float64'
assert len(converted['J'].dropna()) == length - 5
assert len(converted['K'].dropna()) == length - 5
# via astype
converted = self.mixed_frame.copy()
converted['H'] = converted['H'].astype('float64')
converted['I'] = converted['I'].astype('int64')
assert converted['H'].dtype == 'float64'
assert converted['I'].dtype == 'int64'
# via astype, but errors
converted = self.mixed_frame.copy()
with tm.assert_raises_regex(ValueError, 'invalid literal'):
converted['H'].astype('int32')
# mixed in a single column
df = DataFrame(dict(s=Series([1, 'na', 3, 4])))
result = df._convert(datetime=True, numeric=True)
expected = DataFrame(dict(s=Series([1, np.nan, 3, 4])))
assert_frame_equal(result, expected)
def test_convert_objects_no_conversion(self):
mixed1 = DataFrame(
{'a': [1, 2, 3], 'b': [4.0, 5, 6], 'c': ['x', 'y', 'z']})
mixed2 = mixed1._convert(datetime=True)
assert_frame_equal(mixed1, mixed2)
def test_infer_objects(self):
# GH 11221
df = DataFrame({'a': ['a', 1, 2, 3],
'b': ['b', 2.0, 3.0, 4.1],
'c': ['c', datetime(2016, 1, 1),
datetime(2016, 1, 2),
datetime(2016, 1, 3)],
'd': [1, 2, 3, 'd']},
columns=['a', 'b', 'c', 'd'])
df = df.iloc[1:].infer_objects()
assert df['a'].dtype == 'int64'
assert df['b'].dtype == 'float64'
assert df['c'].dtype == 'M8[ns]'
assert df['d'].dtype == 'object'
expected = DataFrame({'a': [1, 2, 3],
'b': [2.0, 3.0, 4.1],
'c': [datetime(2016, 1, 1),
datetime(2016, 1, 2),
datetime(2016, 1, 3)],
'd': [2, 3, 'd']},
columns=['a', 'b', 'c', 'd'])
# reconstruct frame to verify inference is same
tm.assert_frame_equal(df.reset_index(drop=True), expected)
def test_stale_cached_series_bug_473(self):
# this is chained, but ok
with option_context('chained_assignment', None):
Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'),
columns=('e', 'f', 'g', 'h'))
repr(Y)
Y['e'] = Y['e'].astype('object')
Y['g']['c'] = np.NaN
repr(Y)
result = Y.sum() # noqa
exp = Y['g'].sum() # noqa
assert pd.isna(Y['g']['c'])
def test_get_X_columns(self):
# numeric and object columns
df = DataFrame({'a': [1, 2, 3],
'b': [True, False, True],
'c': ['foo', 'bar', 'baz'],
'd': [None, None, None],
'e': [3.14, 0.577, 2.773]})
tm.assert_index_equal(df._get_numeric_data().columns,
pd.Index(['a', 'b', 'e']))
def test_strange_column_corruption_issue(self):
# (wesm) Unclear how exactly this is related to internal matters
df = DataFrame(index=[0, 1])
df[0] = nan
wasCol = {}
# uncommenting these makes the results match
# for col in xrange(100, 200):
# wasCol[col] = 1
# df[col] = nan
for i, dt in enumerate(df.index):
for col in range(100, 200):
if col not in wasCol:
wasCol[col] = 1
df[col] = nan
df[col][dt] = i
myid = 100
first = len(df.loc[pd.isna(df[myid]), [myid]])
second = len(df.loc[pd.isna(df[myid]), [myid]])
assert first == second == 0
@@ -1,788 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import datetime
import numpy as np
from numpy import nan
import pandas as pd
from pandas import DataFrame, Index, Series, Timestamp, date_range
from pandas.compat import lrange
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestDataFrameConcatCommon(TestData):
def test_concat_multiple_frames_dtypes(self):
# GH 2759
A = DataFrame(data=np.ones((10, 2)), columns=[
'foo', 'bar'], dtype=np.float64)
B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
results = pd.concat((A, B), axis=1).get_dtype_counts()
expected = Series(dict(float64=2, float32=2))
assert_series_equal(results, expected)
def test_concat_multiple_tzs(self):
# GH 12467
# combining datetime tz-aware and naive DataFrames
ts1 = Timestamp('2015-01-01', tz=None)
ts2 = Timestamp('2015-01-01', tz='UTC')
ts3 = Timestamp('2015-01-01', tz='EST')
df1 = DataFrame(dict(time=[ts1]))
df2 = DataFrame(dict(time=[ts2]))
df3 = DataFrame(dict(time=[ts3]))
results = pd.concat([df1, df2]).reset_index(drop=True)
expected = DataFrame(dict(time=[ts1, ts2]), dtype=object)
assert_frame_equal(results, expected)
results = pd.concat([df1, df3]).reset_index(drop=True)
expected = DataFrame(dict(time=[ts1, ts3]), dtype=object)
assert_frame_equal(results, expected)
results = pd.concat([df2, df3]).reset_index(drop=True)
expected = DataFrame(dict(time=[ts2, ts3]))
assert_frame_equal(results, expected)
def test_concat_tuple_keys(self):
# GH 14438
df1 = pd.DataFrame(np.ones((2, 2)), columns=list('AB'))
df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list('AB'))
results = pd.concat((df1, df2), keys=[('bee', 'bah'), ('bee', 'boo')])
expected = pd.DataFrame(
{'A': {('bee', 'bah', 0): 1.0,
('bee', 'bah', 1): 1.0,
('bee', 'boo', 0): 2.0,
('bee', 'boo', 1): 2.0,
('bee', 'boo', 2): 2.0},
'B': {('bee', 'bah', 0): 1.0,
('bee', 'bah', 1): 1.0,
('bee', 'boo', 0): 2.0,
('bee', 'boo', 1): 2.0,
('bee', 'boo', 2): 2.0}})
assert_frame_equal(results, expected)
def test_append_series_dict(self):
df = DataFrame(np.random.randn(5, 4),
columns=['foo', 'bar', 'baz', 'qux'])
series = df.loc[4]
with tm.assert_raises_regex(ValueError,
'Indexes have overlapping values'):
df.append(series, verify_integrity=True)
series.name = None
with tm.assert_raises_regex(TypeError,
'Can only append a Series if '
'ignore_index=True'):
df.append(series, verify_integrity=True)
result = df.append(series[::-1], ignore_index=True)
expected = df.append(DataFrame({0: series[::-1]}, index=df.columns).T,
ignore_index=True)
assert_frame_equal(result, expected)
# dict
result = df.append(series.to_dict(), ignore_index=True)
assert_frame_equal(result, expected)
result = df.append(series[::-1][:3], ignore_index=True)
expected = df.append(DataFrame({0: series[::-1][:3]}).T,
ignore_index=True, sort=True)
assert_frame_equal(result, expected.loc[:, result.columns])
# can append when name set
row = df.loc[4]
row.name = 5
result = df.append(row)
expected = df.append(df[-1:], ignore_index=True)
assert_frame_equal(result, expected)
def test_append_list_of_series_dicts(self):
df = DataFrame(np.random.randn(5, 4),
columns=['foo', 'bar', 'baz', 'qux'])
dicts = [x.to_dict() for idx, x in df.iterrows()]
result = df.append(dicts, ignore_index=True)
expected = df.append(df, ignore_index=True)
assert_frame_equal(result, expected)
# different columns
dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
{'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
result = df.append(dicts, ignore_index=True, sort=True)
expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
assert_frame_equal(result, expected)
def test_append_empty_dataframe(self):
# Empty df append empty df
df1 = DataFrame([])
df2 = DataFrame([])
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
# Non-empty df append empty df
df1 = DataFrame(np.random.randn(5, 2))
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
# Empty df with columns append empty df
df1 = DataFrame(columns=['bar', 'foo'])
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
# Non-Empty df with columns append empty df
df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo'])
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
def test_append_dtypes(self):
# GH 5754
# row appends of different dtypes (so need to do by-item)
# can sometimes infer the correct type
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(5))
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
df2 = DataFrame({'bar': 'foo'}, index=lrange(1, 2))
result = df1.append(df2)
expected = DataFrame({'bar': [Timestamp('20130101'), 'foo']})
assert_frame_equal(result, expected)
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2))
result = df1.append(df2)
expected = DataFrame(
{'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
assert_frame_equal(result, expected)
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2), dtype=object)
result = df1.append(df2)
expected = DataFrame(
{'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
assert_frame_equal(result, expected)
df1 = DataFrame({'bar': np.nan}, index=lrange(1))
df2 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1, 2))
result = df1.append(df2)
expected = DataFrame(
{'bar': Series([np.nan, Timestamp('20130101')], dtype='M8[ns]')})
assert_frame_equal(result, expected)
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
df2 = DataFrame({'bar': 1}, index=lrange(1, 2), dtype=object)
result = df1.append(df2)
expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])})
assert_frame_equal(result, expected)
def test_update(self):
df = DataFrame([[1.5, nan, 3.],
[1.5, nan, 3.],
[1.5, nan, 3],
[1.5, nan, 3]])
other = DataFrame([[3.6, 2., np.nan],
[np.nan, np.nan, 7]], index=[1, 3])
df.update(other)
expected = DataFrame([[1.5, nan, 3],
[3.6, 2, 3],
[1.5, nan, 3],
[1.5, nan, 7.]])
assert_frame_equal(df, expected)
def test_update_dtypes(self):
# gh 3016
df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
columns=['A', 'B', 'bool1', 'bool2'])
other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
df.update(other)
expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
columns=['A', 'B', 'bool1', 'bool2'])
assert_frame_equal(df, expected)
def test_update_nooverwrite(self):
df = DataFrame([[1.5, nan, 3.],
[1.5, nan, 3.],
[1.5, nan, 3],
[1.5, nan, 3]])
other = DataFrame([[3.6, 2., np.nan],
[np.nan, np.nan, 7]], index=[1, 3])
df.update(other, overwrite=False)
expected = DataFrame([[1.5, nan, 3],
[1.5, 2, 3],
[1.5, nan, 3],
[1.5, nan, 3.]])
assert_frame_equal(df, expected)
def test_update_filtered(self):
df = DataFrame([[1.5, nan, 3.],
[1.5, nan, 3.],
[1.5, nan, 3],
[1.5, nan, 3]])
other = DataFrame([[3.6, 2., np.nan],
[np.nan, np.nan, 7]], index=[1, 3])
df.update(other, filter_func=lambda x: x > 2)
expected = DataFrame([[1.5, nan, 3],
[1.5, nan, 3],
[1.5, nan, 3],
[1.5, nan, 7.]])
assert_frame_equal(df, expected)
def test_update_raise(self):
df = DataFrame([[1.5, 1, 3.],
[1.5, nan, 3.],
[1.5, nan, 3],
[1.5, nan, 3]])
other = DataFrame([[2., nan],
[nan, 7]], index=[1, 3], columns=[1, 2])
with tm.assert_raises_regex(ValueError, "Data overlaps"):
df.update(other, raise_conflict=True)
def test_update_from_non_df(self):
d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])}
df = DataFrame(d)
d['a'] = Series([5, 6, 7, 8])
df.update(d)
expected = DataFrame(d)
assert_frame_equal(df, expected)
d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}
df = DataFrame(d)
d['a'] = [5, 6, 7, 8]
df.update(d)
expected = DataFrame(d)
assert_frame_equal(df, expected)
def test_join_str_datetime(self):
str_dates = ['20120209', '20120222']
dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
A = DataFrame(str_dates, index=lrange(2), columns=['aa'])
C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)
tst = A.join(C, on='aa')
assert len(tst.columns) == 3
def test_join_multiindex_leftright(self):
# GH 10741
df1 = (pd.DataFrame([['a', 'x', 0.471780], ['a', 'y', 0.774908],
['a', 'z', 0.563634], ['b', 'x', -0.353756],
['b', 'y', 0.368062], ['b', 'z', -1.721840],
['c', 'x', 1], ['c', 'y', 2], ['c', 'z', 3]],
columns=['first', 'second', 'value1'])
.set_index(['first', 'second']))
df2 = (pd.DataFrame([['a', 10], ['b', 20]],
columns=['first', 'value2'])
.set_index(['first']))
exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
[-0.353756, 20], [0.368062, 20],
[-1.721840, 20],
[1.000000, np.nan], [2.000000, np.nan],
[3.000000, np.nan]],
index=df1.index, columns=['value1', 'value2'])
# these must be the same results (but columns are flipped)
assert_frame_equal(df1.join(df2, how='left'), exp)
assert_frame_equal(df2.join(df1, how='right'),
exp[['value2', 'value1']])
exp_idx = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y', 'z']],
names=['first', 'second'])
exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
[-0.353756, 20], [0.368062, 20], [-1.721840, 20]],
index=exp_idx, columns=['value1', 'value2'])
assert_frame_equal(df1.join(df2, how='right'), exp)
assert_frame_equal(df2.join(df1, how='left'),
exp[['value2', 'value1']])
def test_concat_named_keys(self):
# GH 14252
df = pd.DataFrame({'foo': [1, 2], 'bar': [0.1, 0.2]})
index = Index(['a', 'b'], name='baz')
concatted_named_from_keys = pd.concat([df, df], keys=index)
expected_named = pd.DataFrame(
{'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
names=['baz', None]))
assert_frame_equal(concatted_named_from_keys, expected_named)
index_no_name = Index(['a', 'b'], name=None)
concatted_named_from_names = pd.concat(
[df, df], keys=index_no_name, names=['baz'])
assert_frame_equal(concatted_named_from_names, expected_named)
concatted_unnamed = pd.concat([df, df], keys=index_no_name)
expected_unnamed = pd.DataFrame(
{'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
names=[None, None]))
assert_frame_equal(concatted_unnamed, expected_unnamed)
def test_concat_axis_parameter(self):
# GH 14369
df1 = pd.DataFrame({'A': [0.1, 0.2]}, index=range(2))
df2 = pd.DataFrame({'A': [0.3, 0.4]}, index=range(2))
# Index/row/0 DataFrame
expected_index = pd.DataFrame(
{'A': [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
concatted_index = pd.concat([df1, df2], axis='index')
assert_frame_equal(concatted_index, expected_index)
concatted_row = pd.concat([df1, df2], axis='rows')
assert_frame_equal(concatted_row, expected_index)
concatted_0 = pd.concat([df1, df2], axis=0)
assert_frame_equal(concatted_0, expected_index)
# Columns/1 DataFrame
expected_columns = pd.DataFrame(
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=['A', 'A'])
concatted_columns = pd.concat([df1, df2], axis='columns')
assert_frame_equal(concatted_columns, expected_columns)
concatted_1 = pd.concat([df1, df2], axis=1)
assert_frame_equal(concatted_1, expected_columns)
series1 = pd.Series([0.1, 0.2])
series2 = pd.Series([0.3, 0.4])
# Index/row/0 Series
expected_index_series = pd.Series(
[0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
concatted_index_series = pd.concat([series1, series2], axis='index')
assert_series_equal(concatted_index_series, expected_index_series)
concatted_row_series = pd.concat([series1, series2], axis='rows')
assert_series_equal(concatted_row_series, expected_index_series)
concatted_0_series = pd.concat([series1, series2], axis=0)
assert_series_equal(concatted_0_series, expected_index_series)
# Columns/1 Series
expected_columns_series = pd.DataFrame(
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1])
concatted_columns_series = pd.concat(
[series1, series2], axis='columns')
assert_frame_equal(concatted_columns_series, expected_columns_series)
concatted_1_series = pd.concat([series1, series2], axis=1)
assert_frame_equal(concatted_1_series, expected_columns_series)
# Testing ValueError
with tm.assert_raises_regex(ValueError, 'No axis named'):
pd.concat([series1, series2], axis='something')
def test_concat_numerical_names(self):
# #15262 # #12223
df = pd.DataFrame({'col': range(9)},
dtype='int32',
index=(pd.MultiIndex
.from_product([['A0', 'A1', 'A2'],
['B0', 'B1', 'B2']],
names=[1, 2])))
result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
expected = pd.DataFrame({'col': [0, 1, 7, 8]},
dtype='int32',
index=pd.MultiIndex.from_tuples([('A0', 'B0'),
('A0', 'B1'),
('A2', 'B1'),
('A2', 'B2')],
names=[1, 2]))
tm.assert_frame_equal(result, expected)
class TestDataFrameCombineFirst(TestData):
def test_combine_first_mixed(self):
a = Series(['a', 'b'], index=lrange(2))
b = Series(lrange(2), index=lrange(2))
f = DataFrame({'A': a, 'B': b})
a = Series(['a', 'b'], index=lrange(5, 7))
b = Series(lrange(2), index=lrange(5, 7))
g = DataFrame({'A': a, 'B': b})
exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]},
index=[0, 1, 5, 6])
combined = f.combine_first(g)
tm.assert_frame_equal(combined, exp)
def test_combine_first(self):
# disjoint
head, tail = self.frame[:5], self.frame[5:]
combined = head.combine_first(tail)
reordered_frame = self.frame.reindex(combined.index)
assert_frame_equal(combined, reordered_frame)
assert tm.equalContents(combined.columns, self.frame.columns)
assert_series_equal(combined['A'], reordered_frame['A'])
# same index
fcopy = self.frame.copy()
fcopy['A'] = 1
del fcopy['C']
fcopy2 = self.frame.copy()
fcopy2['B'] = 0
del fcopy2['D']
combined = fcopy.combine_first(fcopy2)
assert (combined['A'] == 1).all()
assert_series_equal(combined['B'], fcopy['B'])
assert_series_equal(combined['C'], fcopy2['C'])
assert_series_equal(combined['D'], fcopy['D'])
# overlap
head, tail = reordered_frame[:10].copy(), reordered_frame
head['A'] = 1
combined = head.combine_first(tail)
assert (combined['A'][:10] == 1).all()
# reverse overlap
tail['A'][:10] = 0
combined = tail.combine_first(head)
assert (combined['A'][:10] == 0).all()
# no overlap
f = self.frame[:10]
g = self.frame[10:]
combined = f.combine_first(g)
assert_series_equal(combined['A'].reindex(f.index), f['A'])
assert_series_equal(combined['A'].reindex(g.index), g['A'])
# corner cases
comb = self.frame.combine_first(self.empty)
assert_frame_equal(comb, self.frame)
comb = self.empty.combine_first(self.frame)
assert_frame_equal(comb, self.frame)
comb = self.frame.combine_first(DataFrame(index=["faz", "boo"]))
assert "faz" in comb.index
# #2525
df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)])
df2 = DataFrame({}, columns=['b'])
result = df.combine_first(df2)
assert 'b' in result
def test_combine_first_mixed_bug(self):
idx = Index(['a', 'b', 'c', 'e'])
ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
ser3 = Series([12, 4, 5, 97], index=idx)
frame1 = DataFrame({"col0": ser1,
"col2": ser2,
"col3": ser3})
idx = Index(['a', 'b', 'c', 'f'])
ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
ser3 = Series([12, 4, 5, 97], index=idx)
frame2 = DataFrame({"col1": ser1,
"col2": ser2,
"col5": ser3})
combined = frame1.combine_first(frame2)
assert len(combined.columns) == 5
# gh 3016 (same as in update)
df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
columns=['A', 'B', 'bool1', 'bool2'])
other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
result = df.combine_first(other)
assert_frame_equal(result, df)
df.loc[0, 'A'] = np.nan
result = df.combine_first(other)
df.loc[0, 'A'] = 45
assert_frame_equal(result, df)
# doc example
df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
'B': [np.nan, 2., 3., np.nan, 6.]})
df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
'B': [np.nan, np.nan, 3., 4., 6., 8.]})
result = df1.combine_first(df2)
expected = DataFrame(
{'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]})
assert_frame_equal(result, expected)
# GH3552, return object dtype with bools
df1 = DataFrame(
[[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
df2 = DataFrame(
[[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2])
result = df1.combine_first(df2)[2]
expected = Series([True, True, False], name=2)
assert_series_equal(result, expected)
# GH 3593, converting datetime64[ns] incorrecly
df0 = DataFrame({"a": [datetime(2000, 1, 1),
datetime(2000, 1, 2),
datetime(2000, 1, 3)]})
df1 = DataFrame({"a": [None, None, None]})
df2 = df1.combine_first(df0)
assert_frame_equal(df2, df0)
df2 = df0.combine_first(df1)
assert_frame_equal(df2, df0)
df0 = DataFrame({"a": [datetime(2000, 1, 1),
datetime(2000, 1, 2),
datetime(2000, 1, 3)]})
df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
df2 = df1.combine_first(df0)
result = df0.copy()
result.iloc[0, :] = df1.iloc[0, :]
assert_frame_equal(df2, result)
df2 = df0.combine_first(df1)
assert_frame_equal(df2, df0)
def test_combine_first_align_nan(self):
# GH 7509 (not fixed)
dfa = pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]],
columns=['a', 'b'])
dfb = pd.DataFrame([[4], [5]], columns=['b'])
assert dfa['a'].dtype == 'datetime64[ns]'
assert dfa['b'].dtype == 'int64'
res = dfa.combine_first(dfb)
exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT],
'b': [2., 5.]}, columns=['a', 'b'])
tm.assert_frame_equal(res, exp)
assert res['a'].dtype == 'datetime64[ns]'
# ToDo: this must be int64
assert res['b'].dtype == 'float64'
res = dfa.iloc[:0].combine_first(dfb)
exp = pd.DataFrame({'a': [np.nan, np.nan],
'b': [4, 5]}, columns=['a', 'b'])
tm.assert_frame_equal(res, exp)
# ToDo: this must be datetime64
assert res['a'].dtype == 'float64'
# ToDo: this must be int64
assert res['b'].dtype == 'int64'
def test_combine_first_timezone(self):
# see gh-7630
data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC')
df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'],
data=data1,
index=pd.date_range('20140627', periods=1))
data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC')
df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'],
data=data2,
index=pd.date_range('20140628', periods=1))
res = df2[['UTCdatetime']].combine_first(df1)
exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01',
tz='UTC'),
pd.Timestamp('2012-12-12 12:12',
tz='UTC')],
'abc': [pd.Timestamp('2010-01-01 01:01:00',
tz='UTC'), pd.NaT]},
columns=['UTCdatetime', 'abc'],
index=pd.date_range('20140627', periods=2,
freq='D'))
tm.assert_frame_equal(res, exp)
assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]'
assert res['abc'].dtype == 'datetime64[ns, UTC]'
# see gh-10567
dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC')
df1 = pd.DataFrame({'DATE': dts1})
dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC')
df2 = pd.DataFrame({'DATE': dts2})
res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res['DATE'].dtype == 'datetime64[ns, UTC]'
dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03',
'2011-01-04'], tz='US/Eastern')
df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7])
dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02',
'2012-01-03'], tz='US/Eastern')
df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT',
'2012-01-02', '2011-01-03', '2011-01-04'],
tz='US/Eastern')
exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
# different tz
dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern')
df1 = pd.DataFrame({'DATE': dts1})
dts2 = pd.date_range('2015-01-03', '2015-01-05')
df2 = pd.DataFrame({'DATE': dts2})
# if df1 doesn't have NaN, keep its dtype
res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]'
dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern')
df1 = pd.DataFrame({'DATE': dts1})
dts2 = pd.date_range('2015-01-01', '2015-01-03')
df2 = pd.DataFrame({'DATE': dts2})
res = df1.combine_first(df2)
exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'),
pd.Timestamp('2015-01-02', tz='US/Eastern'),
pd.Timestamp('2015-01-03')]
exp = pd.DataFrame({'DATE': exp_dts})
tm.assert_frame_equal(res, exp)
assert res['DATE'].dtype == 'object'
def test_combine_first_timedelta(self):
data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day'])
df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7])
data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day'])
df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT',
'11 day', '3 day', '4 day'])
exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res['TD'].dtype == 'timedelta64[ns]'
def test_combine_first_period(self):
data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03',
'2011-04'], freq='M')
df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7])
data2 = pd.PeriodIndex(['2012-01-01', '2012-02',
'2012-03'], freq='M')
df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT',
'2012-02', '2011-03', '2011-04'],
freq='M')
exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res['P'].dtype == 'object'
# different freq
dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02',
'2012-01-03'], freq='D')
df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = [pd.Period('2011-01', freq='M'),
pd.Period('2012-01-01', freq='D'),
pd.NaT,
pd.Period('2012-01-02', freq='D'),
pd.Period('2011-03', freq='M'),
pd.Period('2011-04', freq='M')]
exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res['P'].dtype == 'object'
def test_combine_first_int(self):
# GH14687 - integer series that do no align exactly
df1 = pd.DataFrame({'a': [0, 1, 3, 5]}, dtype='int64')
df2 = pd.DataFrame({'a': [1, 4]}, dtype='int64')
res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res['a'].dtype == 'int64'
def test_concat_datetime_datetime64_frame(self):
# #2624
rows = []
rows.append([datetime(2010, 1, 1), 1])
rows.append([datetime(2010, 1, 2), 'hi'])
df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
ind = date_range(start="2000/1/1", freq="D", periods=10)
df1 = DataFrame({'date': ind, 'test': lrange(10)})
# it works!
pd.concat([df1, df2_obj])
class TestDataFrameUpdate(TestData):
def test_update_nan(self):
# #15593 #15617
# test 1
df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)})
df2 = DataFrame({'A': [None, 2, 3]})
expected = df1.copy()
df1.update(df2, overwrite=False)
tm.assert_frame_equal(df1, expected)
# test 2
df1 = DataFrame({'A': [1.0, None, 3],
'B': date_range('2000', periods=3)})
df2 = DataFrame({'A': [None, 2, 3]})
expected = DataFrame({'A': [1.0, 2, 3],
'B': date_range('2000', periods=3)})
df1.update(df2, overwrite=False)
tm.assert_frame_equal(df1, expected)
File diff suppressed because it is too large Load Diff
@@ -1,330 +0,0 @@
# -*- coding: utf-8 -*-
from datetime import datetime
import pytest
import pytz
import collections
from collections import OrderedDict, defaultdict
import numpy as np
from pandas import compat
from pandas.compat import long
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
date_range)
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
class TestDataFrameConvertTo(TestData):
def test_to_dict_timestamp(self):
# GH11247
# split/records producing np.datetime64 rather than Timestamps
# on datetime64[ns] dtypes only
tsmp = Timestamp('20130101')
test_data = DataFrame({'A': [tsmp, tsmp], 'B': [tsmp, tsmp]})
test_data_mixed = DataFrame({'A': [tsmp, tsmp], 'B': [1, 2]})
expected_records = [{'A': tsmp, 'B': tsmp},
{'A': tsmp, 'B': tsmp}]
expected_records_mixed = [{'A': tsmp, 'B': 1},
{'A': tsmp, 'B': 2}]
assert (test_data.to_dict(orient='records') ==
expected_records)
assert (test_data_mixed.to_dict(orient='records') ==
expected_records_mixed)
expected_series = {
'A': Series([tsmp, tsmp], name='A'),
'B': Series([tsmp, tsmp], name='B'),
}
expected_series_mixed = {
'A': Series([tsmp, tsmp], name='A'),
'B': Series([1, 2], name='B'),
}
tm.assert_dict_equal(test_data.to_dict(orient='series'),
expected_series)
tm.assert_dict_equal(test_data_mixed.to_dict(orient='series'),
expected_series_mixed)
expected_split = {
'index': [0, 1],
'data': [[tsmp, tsmp],
[tsmp, tsmp]],
'columns': ['A', 'B']
}
expected_split_mixed = {
'index': [0, 1],
'data': [[tsmp, 1],
[tsmp, 2]],
'columns': ['A', 'B']
}
tm.assert_dict_equal(test_data.to_dict(orient='split'),
expected_split)
tm.assert_dict_equal(test_data_mixed.to_dict(orient='split'),
expected_split_mixed)
def test_to_dict_invalid_orient(self):
df = DataFrame({'A': [0, 1]})
pytest.raises(ValueError, df.to_dict, orient='xinvalid')
def test_to_records_dt64(self):
df = DataFrame([["one", "two", "three"],
["four", "five", "six"]],
index=date_range("2012-01-01", "2012-01-02"))
# convert_datetime64 defaults to None
expected = df.index.values[0]
result = df.to_records()['index'][0]
assert expected == result
# check for FutureWarning if convert_datetime64=False is passed
with tm.assert_produces_warning(FutureWarning):
expected = df.index.values[0]
result = df.to_records(convert_datetime64=False)['index'][0]
assert expected == result
# check for FutureWarning if convert_datetime64=True is passed
with tm.assert_produces_warning(FutureWarning):
expected = df.index[0]
result = df.to_records(convert_datetime64=True)['index'][0]
assert expected == result
def test_to_records_with_multindex(self):
# GH3189
index = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
data = np.zeros((8, 4))
df = DataFrame(data, index=index)
r = df.to_records(index=True)['level_0']
assert 'bar' in r
assert 'one' not in r
def test_to_records_with_Mapping_type(self):
import email
from email.parser import Parser
import collections
collections.Mapping.register(email.message.Message)
headers = Parser().parsestr('From: <user@example.com>\n'
'To: <someone_else@example.com>\n'
'Subject: Test message\n'
'\n'
'Body would go here\n')
frame = DataFrame.from_records([headers])
all(x in frame for x in ['Type', 'Subject', 'From'])
def test_to_records_floats(self):
df = DataFrame(np.random.rand(10, 10))
df.to_records()
def test_to_records_index_name(self):
df = DataFrame(np.random.randn(3, 3))
df.index.name = 'X'
rs = df.to_records()
assert 'X' in rs.dtype.fields
df = DataFrame(np.random.randn(3, 3))
rs = df.to_records()
assert 'index' in rs.dtype.fields
df.index = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')])
df.index.names = ['A', None]
rs = df.to_records()
assert 'level_0' in rs.dtype.fields
def test_to_records_with_unicode_index(self):
# GH13172
# unicode_literals conflict with to_records
result = DataFrame([{u'a': u'x', u'b': 'y'}]).set_index(u'a')\
.to_records()
expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')])
tm.assert_almost_equal(result, expected)
def test_to_records_with_unicode_column_names(self):
# xref issue: https://github.com/numpy/numpy/issues/2407
# Issue #11879. to_records used to raise an exception when used
# with column names containing non-ascii characters in Python 2
result = DataFrame(data={u"accented_name_é": [1.0]}).to_records()
# Note that numpy allows for unicode field names but dtypes need
# to be specified using dictionary instead of list of tuples.
expected = np.rec.array(
[(0, 1.0)],
dtype={"names": ["index", u"accented_name_é"],
"formats": ['=i8', '=f8']}
)
tm.assert_almost_equal(result, expected)
def test_to_records_with_categorical(self):
# GH8626
# dict creation
df = DataFrame({'A': list('abc')}, dtype='category')
expected = Series(list('abc'), dtype='category', name='A')
tm.assert_series_equal(df['A'], expected)
# list-like creation
df = DataFrame(list('abc'), dtype='category')
expected = Series(list('abc'), dtype='category', name=0)
tm.assert_series_equal(df[0], expected)
# to record array
# this coerces
result = df.to_records()
expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
dtype=[('index', '=i8'), ('0', 'O')])
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize('mapping', [
dict,
collections.defaultdict(list),
collections.OrderedDict])
def test_to_dict(self, mapping):
test_data = {
'A': {'1': 1, '2': 2},
'B': {'1': '1', '2': '2', '3': '3'},
}
# GH16122
recons_data = DataFrame(test_data).to_dict(into=mapping)
for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k][k2])
recons_data = DataFrame(test_data).to_dict("l", mapping)
for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k][int(k2) - 1])
recons_data = DataFrame(test_data).to_dict("s", mapping)
for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k][k2])
recons_data = DataFrame(test_data).to_dict("sp", mapping)
expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
tm.assert_dict_equal(recons_data, expected_split)
recons_data = DataFrame(test_data).to_dict("r", mapping)
expected_records = [{'A': 1.0, 'B': '1'},
{'A': 2.0, 'B': '2'},
{'A': np.nan, 'B': '3'}]
assert isinstance(recons_data, list)
assert (len(recons_data) == 3)
for l, r in zip(recons_data, expected_records):
tm.assert_dict_equal(l, r)
# GH10844
recons_data = DataFrame(test_data).to_dict("i")
for k, v in compat.iteritems(test_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k2][k])
df = DataFrame(test_data)
df['duped'] = df[df.columns[0]]
recons_data = df.to_dict("i")
comp_data = test_data.copy()
comp_data['duped'] = comp_data[df.columns[0]]
for k, v in compat.iteritems(comp_data):
for k2, v2 in compat.iteritems(v):
assert (v2 == recons_data[k2][k])
@pytest.mark.parametrize('mapping', [
list,
collections.defaultdict,
[]])
def test_to_dict_errors(self, mapping):
# GH16122
df = DataFrame(np.random.randn(3, 3))
with pytest.raises(TypeError):
df.to_dict(into=mapping)
def test_to_dict_not_unique_warning(self):
# GH16927: When converting to a dict, if a column has a non-unique name
# it will be dropped, throwing a warning.
df = DataFrame([[1, 2, 3]], columns=['a', 'a', 'b'])
with tm.assert_produces_warning(UserWarning):
df.to_dict()
@pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
def test_to_records_datetimeindex_with_tz(self, tz):
# GH13937
dr = date_range('2016-01-01', periods=10,
freq='S', tz=tz)
df = DataFrame({'datetime': dr}, index=dr)
expected = df.to_records()
result = df.tz_convert("UTC").to_records()
# both converted to UTC, so they are equal
tm.assert_numpy_array_equal(result, expected)
def test_to_dict_box_scalars(self):
# 14216
# make sure that we are boxing properly
d = {'a': [1], 'b': ['b']}
result = DataFrame(d).to_dict()
assert isinstance(list(result['a'])[0], (int, long))
assert isinstance(list(result['b'])[0], (int, long))
result = DataFrame(d).to_dict(orient='records')
assert isinstance(result[0]['a'], (int, long))
def test_frame_to_dict_tz(self):
# GH18372 When converting to dict with orient='records' columns of
# datetime that are tz-aware were not converted to required arrays
data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
(datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)]
df = DataFrame(list(data), columns=["d", ])
result = df.to_dict(orient='records')
expected = [
{'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)},
{'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)},
]
tm.assert_dict_equal(result[0], expected[0])
tm.assert_dict_equal(result[1], expected[1])
@pytest.mark.parametrize('into, expected', [
(dict, {0: {'int_col': 1, 'float_col': 1.0},
1: {'int_col': 2, 'float_col': 2.0},
2: {'int_col': 3, 'float_col': 3.0}}),
(OrderedDict, OrderedDict([(0, {'int_col': 1, 'float_col': 1.0}),
(1, {'int_col': 2, 'float_col': 2.0}),
(2, {'int_col': 3, 'float_col': 3.0})])),
(defaultdict(list), defaultdict(list,
{0: {'int_col': 1, 'float_col': 1.0},
1: {'int_col': 2, 'float_col': 2.0},
2: {'int_col': 3, 'float_col': 3.0}}))
])
def test_to_dict_index_dtypes(self, into, expected):
# GH 18580
# When using to_dict(orient='index') on a dataframe with int
# and float columns only the int columns were cast to float
df = DataFrame({'int_col': [1, 2, 3],
'float_col': [1.0, 2.0, 3.0]})
result = df.to_dict(orient='index', into=into)
cols = ['int_col', 'float_col']
result = DataFrame.from_dict(result, orient='index')[cols]
expected = DataFrame.from_dict(expected, orient='index')[cols]
tm.assert_frame_equal(result, expected)
@@ -1,906 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import pytest
from datetime import timedelta
import numpy as np
from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp,
Categorical, compat, concat, option_context)
from pandas.compat import u
from pandas import _np_version_under1p14
from pandas.core.dtypes.dtypes import DatetimeTZDtype, CategoricalDtype
from pandas.tests.frame.common import TestData
from pandas.util.testing import (assert_series_equal,
assert_frame_equal,
makeCustomDataframe as mkdf)
import pandas.util.testing as tm
import pandas as pd
class TestDataFrameDataTypes(TestData):
def test_concat_empty_dataframe_dtypes(self):
df = DataFrame(columns=list("abc"))
df['a'] = df['a'].astype(np.bool_)
df['b'] = df['b'].astype(np.int32)
df['c'] = df['c'].astype(np.float64)
result = pd.concat([df, df])
assert result['a'].dtype == np.bool_
assert result['b'].dtype == np.int32
assert result['c'].dtype == np.float64
result = pd.concat([df, df.astype(np.float64)])
assert result['a'].dtype == np.object_
assert result['b'].dtype == np.float64
assert result['c'].dtype == np.float64
def test_empty_frame_dtypes_ftypes(self):
empty_df = pd.DataFrame()
assert_series_equal(empty_df.dtypes, pd.Series(dtype=np.object))
assert_series_equal(empty_df.ftypes, pd.Series(dtype=np.object))
nocols_df = pd.DataFrame(index=[1, 2, 3])
assert_series_equal(nocols_df.dtypes, pd.Series(dtype=np.object))
assert_series_equal(nocols_df.ftypes, pd.Series(dtype=np.object))
norows_df = pd.DataFrame(columns=list("abc"))
assert_series_equal(norows_df.dtypes, pd.Series(
np.object, index=list("abc")))
assert_series_equal(norows_df.ftypes, pd.Series(
'object:dense', index=list("abc")))
norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32)
assert_series_equal(norows_int_df.dtypes, pd.Series(
np.dtype('int32'), index=list("abc")))
assert_series_equal(norows_int_df.ftypes, pd.Series(
'int32:dense', index=list("abc")))
odict = compat.OrderedDict
df = pd.DataFrame(odict([('a', 1), ('b', True), ('c', 1.0)]),
index=[1, 2, 3])
ex_dtypes = pd.Series(odict([('a', np.int64),
('b', np.bool),
('c', np.float64)]))
ex_ftypes = pd.Series(odict([('a', 'int64:dense'),
('b', 'bool:dense'),
('c', 'float64:dense')]))
assert_series_equal(df.dtypes, ex_dtypes)
assert_series_equal(df.ftypes, ex_ftypes)
# same but for empty slice of df
assert_series_equal(df[:0].dtypes, ex_dtypes)
assert_series_equal(df[:0].ftypes, ex_ftypes)
def test_datetime_with_tz_dtypes(self):
tzframe = DataFrame({'A': date_range('20130101', periods=3),
'B': date_range('20130101', periods=3,
tz='US/Eastern'),
'C': date_range('20130101', periods=3, tz='CET')})
tzframe.iloc[1, 1] = pd.NaT
tzframe.iloc[1, 2] = pd.NaT
result = tzframe.dtypes.sort_index()
expected = Series([np.dtype('datetime64[ns]'),
DatetimeTZDtype('datetime64[ns, US/Eastern]'),
DatetimeTZDtype('datetime64[ns, CET]')],
['A', 'B', 'C'])
assert_series_equal(result, expected)
def test_dtypes_are_correct_after_column_slice(self):
# GH6525
df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_)
odict = compat.OrderedDict
assert_series_equal(df.dtypes,
pd.Series(odict([('a', np.float_),
('b', np.float_),
('c', np.float_)])))
assert_series_equal(df.iloc[:, 2:].dtypes,
pd.Series(odict([('c', np.float_)])))
assert_series_equal(df.dtypes,
pd.Series(odict([('a', np.float_),
('b', np.float_),
('c', np.float_)])))
def test_select_dtypes_include_using_list_like(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3,
tz='CET'),
'j': pd.period_range('2013-01', periods=3,
freq='M'),
'k': pd.timedelta_range('1 day', periods=3)})
ri = df.select_dtypes(include=[np.number])
ei = df[['b', 'c', 'd', 'k']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=[np.number], exclude=['timedelta'])
ei = df[['b', 'c', 'd']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=[np.number, 'category'],
exclude=['timedelta'])
ei = df[['b', 'c', 'd', 'f']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=['datetime'])
ei = df[['g']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=['datetime64'])
ei = df[['g']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=['datetimetz'])
ei = df[['h', 'i']]
assert_frame_equal(ri, ei)
pytest.raises(NotImplementedError,
lambda: df.select_dtypes(include=['period']))
def test_select_dtypes_exclude_using_list_like(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True]})
re = df.select_dtypes(exclude=[np.number])
ee = df[['a', 'e']]
assert_frame_equal(re, ee)
def test_select_dtypes_exclude_include_using_list_like(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.date_range('now', periods=3).values})
exclude = np.datetime64,
include = np.bool_, 'integer'
r = df.select_dtypes(include=include, exclude=exclude)
e = df[['b', 'c', 'e']]
assert_frame_equal(r, e)
exclude = 'datetime',
include = 'bool', 'int64', 'int32'
r = df.select_dtypes(include=include, exclude=exclude)
e = df[['b', 'e']]
assert_frame_equal(r, e)
def test_select_dtypes_include_using_scalars(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3,
tz='CET'),
'j': pd.period_range('2013-01', periods=3,
freq='M'),
'k': pd.timedelta_range('1 day', periods=3)})
ri = df.select_dtypes(include=np.number)
ei = df[['b', 'c', 'd', 'k']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include='datetime')
ei = df[['g']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include='datetime64')
ei = df[['g']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include='category')
ei = df[['f']]
assert_frame_equal(ri, ei)
pytest.raises(NotImplementedError,
lambda: df.select_dtypes(include='period'))
def test_select_dtypes_exclude_using_scalars(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3,
tz='CET'),
'j': pd.period_range('2013-01', periods=3,
freq='M'),
'k': pd.timedelta_range('1 day', periods=3)})
ri = df.select_dtypes(exclude=np.number)
ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(exclude='category')
ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']]
assert_frame_equal(ri, ei)
pytest.raises(NotImplementedError,
lambda: df.select_dtypes(exclude='period'))
def test_select_dtypes_include_exclude_using_scalars(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3,
tz='CET'),
'j': pd.period_range('2013-01', periods=3,
freq='M'),
'k': pd.timedelta_range('1 day', periods=3)})
ri = df.select_dtypes(include=np.number, exclude='floating')
ei = df[['b', 'c', 'k']]
assert_frame_equal(ri, ei)
def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.Categorical(list('abc')),
'g': pd.date_range('20130101', periods=3),
'h': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'i': pd.date_range('20130101', periods=3,
tz='CET'),
'j': pd.period_range('2013-01', periods=3,
freq='M'),
'k': pd.timedelta_range('1 day', periods=3)})
ri = df.select_dtypes(include=np.number,
exclude=['floating', 'timedelta'])
ei = df[['b', 'c']]
assert_frame_equal(ri, ei)
ri = df.select_dtypes(include=[np.number, 'category'],
exclude='floating')
ei = df[['b', 'c', 'f', 'k']]
assert_frame_equal(ri, ei)
def test_select_dtypes_duplicate_columns(self):
# GH20839
odict = compat.OrderedDict
df = DataFrame(odict([('a', list('abc')),
('b', list(range(1, 4))),
('c', np.arange(3, 6).astype('u1')),
('d', np.arange(4.0, 7.0, dtype='float64')),
('e', [True, False, True]),
('f', pd.date_range('now', periods=3).values)]))
df.columns = ['a', 'a', 'b', 'b', 'b', 'c']
expected = DataFrame({'a': list(range(1, 4)),
'b': np.arange(3, 6).astype('u1')})
result = df.select_dtypes(include=[np.number], exclude=['floating'])
assert_frame_equal(result, expected)
def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.date_range('now', periods=3).values})
df['g'] = df.f.diff()
assert not hasattr(np, 'u8')
r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta'])
e = df[['a', 'b']]
assert_frame_equal(r, e)
r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]'])
e = df[['a', 'b', 'g']]
assert_frame_equal(r, e)
def test_select_dtypes_empty(self):
df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
with tm.assert_raises_regex(ValueError, 'at least one of '
'include or exclude '
'must be nonempty'):
df.select_dtypes()
def test_select_dtypes_bad_datetime64(self):
df = DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.date_range('now', periods=3).values})
with tm.assert_raises_regex(ValueError, '.+ is too specific'):
df.select_dtypes(include=['datetime64[D]'])
with tm.assert_raises_regex(ValueError, '.+ is too specific'):
df.select_dtypes(exclude=['datetime64[as]'])
def test_select_dtypes_datetime_with_tz(self):
df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
B=Timestamp('20130603', tz='CET')),
index=range(5))
df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
result = df3.select_dtypes(include=['datetime64[ns]'])
expected = df3.reindex(columns=[])
assert_frame_equal(result, expected)
def test_select_dtypes_str_raises(self):
df = DataFrame({'a': list('abc'),
'g': list(u('abc')),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.date_range('now', periods=3).values})
string_dtypes = set((str, 'str', np.string_, 'S1',
'unicode', np.unicode_, 'U1'))
try:
string_dtypes.add(unicode)
except NameError:
pass
for dt in string_dtypes:
with tm.assert_raises_regex(TypeError,
'string dtypes are not allowed'):
df.select_dtypes(include=[dt])
with tm.assert_raises_regex(TypeError,
'string dtypes are not allowed'):
df.select_dtypes(exclude=[dt])
def test_select_dtypes_bad_arg_raises(self):
df = DataFrame({'a': list('abc'),
'g': list(u('abc')),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.date_range('now', periods=3).values})
with tm.assert_raises_regex(TypeError, 'data type.'
'*not understood'):
df.select_dtypes(['blargy, blarg, blarg'])
def test_select_dtypes_typecodes(self):
# GH 11990
df = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
expected = df
FLOAT_TYPES = list(np.typecodes['AllFloat'])
assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected)
def test_dtypes_gh8722(self):
self.mixed_frame['bool'] = self.mixed_frame['A'] > 0
result = self.mixed_frame.dtypes
expected = Series(dict((k, v.dtype)
for k, v in compat.iteritems(self.mixed_frame)),
index=result.index)
assert_series_equal(result, expected)
# compat, GH 8722
with option_context('use_inf_as_na', True):
df = DataFrame([[1]])
result = df.dtypes
assert_series_equal(result, Series({0: np.dtype('int64')}))
def test_ftypes(self):
frame = self.mixed_float
expected = Series(dict(A='float32:dense',
B='float32:dense',
C='float16:dense',
D='float64:dense')).sort_values()
result = frame.ftypes.sort_values()
assert_series_equal(result, expected)
def test_astype(self):
casted = self.frame.astype(int)
expected = DataFrame(self.frame.values.astype(int),
index=self.frame.index,
columns=self.frame.columns)
assert_frame_equal(casted, expected)
casted = self.frame.astype(np.int32)
expected = DataFrame(self.frame.values.astype(np.int32),
index=self.frame.index,
columns=self.frame.columns)
assert_frame_equal(casted, expected)
self.frame['foo'] = '5'
casted = self.frame.astype(int)
expected = DataFrame(self.frame.values.astype(int),
index=self.frame.index,
columns=self.frame.columns)
assert_frame_equal(casted, expected)
# mixed casting
def _check_cast(df, v):
assert (list(set(s.dtype.name for
_, s in compat.iteritems(df)))[0] == v)
mn = self.all_mixed._get_numeric_data().copy()
mn['little_float'] = np.array(12345., dtype='float16')
mn['big_float'] = np.array(123456789101112., dtype='float64')
casted = mn.astype('float64')
_check_cast(casted, 'float64')
casted = mn.astype('int64')
_check_cast(casted, 'int64')
casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float32')
_check_cast(casted, 'float32')
casted = mn.reindex(columns=['little_float']).astype('float16')
_check_cast(casted, 'float16')
casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float16')
_check_cast(casted, 'float16')
casted = mn.astype('float32')
_check_cast(casted, 'float32')
casted = mn.astype('int32')
_check_cast(casted, 'int32')
# to object
casted = mn.astype('O')
_check_cast(casted, 'object')
def test_astype_with_exclude_string(self):
df = self.frame.copy()
expected = self.frame.astype(int)
df['string'] = 'foo'
casted = df.astype(int, errors='ignore')
expected['string'] = 'foo'
assert_frame_equal(casted, expected)
df = self.frame.copy()
expected = self.frame.astype(np.int32)
df['string'] = 'foo'
casted = df.astype(np.int32, errors='ignore')
expected['string'] = 'foo'
assert_frame_equal(casted, expected)
def test_astype_with_view(self):
tf = self.mixed_float.reindex(columns=['A', 'B', 'C'])
casted = tf.astype(np.int64)
casted = tf.astype(np.float32)
# this is the only real reason to do it this way
tf = np.round(self.frame).astype(np.int32)
casted = tf.astype(np.float32, copy=False)
# TODO(wesm): verification?
tf = self.frame.astype(np.float64)
casted = tf.astype(np.int64, copy=False) # noqa
def test_astype_cast_nan_inf_int(self):
# GH14265, check nan and inf raise error when converting to int
types = [np.int32, np.int64]
values = [np.nan, np.inf]
msg = 'Cannot convert non-finite values \\(NA or inf\\) to integer'
for this_type in types:
for this_val in values:
df = DataFrame([this_val])
with tm.assert_raises_regex(ValueError, msg):
df.astype(this_type)
def test_astype_str(self):
# GH9757
a = Series(date_range('2010-01-04', periods=5))
b = Series(date_range('3/6/2012 00:00', periods=5, tz='US/Eastern'))
c = Series([Timedelta(x, unit='d') for x in range(5)])
d = Series(range(5))
e = Series([0.0, 0.2, 0.4, 0.6, 0.8])
df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d, 'e': e})
# datetimelike
# Test str and unicode on python 2.x and just str on python 3.x
for tt in set([str, compat.text_type]):
result = df.astype(tt)
expected = DataFrame({
'a': list(map(tt, map(lambda x: Timestamp(x)._date_repr,
a._values))),
'b': list(map(tt, map(Timestamp, b._values))),
'c': list(map(tt, map(lambda x: Timedelta(x)
._repr_base(format='all'), c._values))),
'd': list(map(tt, d._values)),
'e': list(map(tt, e._values)),
})
assert_frame_equal(result, expected)
# float/nan
# 11302
# consistency in astype(str)
for tt in set([str, compat.text_type]):
result = DataFrame([np.NaN]).astype(tt)
expected = DataFrame(['nan'])
assert_frame_equal(result, expected)
result = DataFrame([1.12345678901234567890]).astype(tt)
if _np_version_under1p14:
# < 1.14 truncates
expected = DataFrame(['1.12345678901'])
else:
# >= 1.14 preserves the full repr
expected = DataFrame(['1.1234567890123457'])
assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype_class", [dict, Series])
def test_astype_dict_like(self, dtype_class):
# GH7271 & GH16717
a = Series(date_range('2010-01-04', periods=5))
b = Series(range(5))
c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
d = Series(['1.0', '2', '3.14', '4', '5.4'])
df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
original = df.copy(deep=True)
# change type of a subset of columns
dt1 = dtype_class({'b': 'str', 'd': 'float32'})
result = df.astype(dt1)
expected = DataFrame({
'a': a,
'b': Series(['0', '1', '2', '3', '4']),
'c': c,
'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
assert_frame_equal(result, expected)
assert_frame_equal(df, original)
dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64})
result = df.astype(dt2)
expected = DataFrame({
'a': a,
'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
assert_frame_equal(result, expected)
assert_frame_equal(df, original)
# change all columns
dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str})
assert_frame_equal(df.astype(dt3),
df.astype(str))
assert_frame_equal(df, original)
# error should be raised when using something other than column labels
# in the keys of the dtype dict
dt4 = dtype_class({'b': str, 2: str})
dt5 = dtype_class({'e': str})
pytest.raises(KeyError, df.astype, dt4)
pytest.raises(KeyError, df.astype, dt5)
assert_frame_equal(df, original)
# if the dtypes provided are the same as the original dtypes, the
# resulting DataFrame should be the same as the original DataFrame
dt6 = dtype_class({col: df[col].dtype for col in df.columns})
equiv = df.astype(dt6)
assert_frame_equal(df, equiv)
assert_frame_equal(df, original)
# GH 16717
# if dtypes provided is empty, the resulting DataFrame
# should be the same as the original DataFrame
dt7 = dtype_class({})
result = df.astype(dt7)
assert_frame_equal(df, equiv)
assert_frame_equal(df, original)
def test_astype_duplicate_col(self):
a1 = Series([1, 2, 3, 4, 5], name='a')
b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b')
a2 = Series([0, 1, 2, 3, 4], name='a')
df = concat([a1, b, a2], axis=1)
result = df.astype(str)
a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a')
b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str,
name='b')
a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a')
expected = concat([a1_str, b_str, a2_str], axis=1)
assert_frame_equal(result, expected)
result = df.astype({'a': 'str'})
expected = concat([a1_str, b, a2_str], axis=1)
assert_frame_equal(result, expected)
@pytest.mark.parametrize('dtype', [
'category',
CategoricalDtype(),
CategoricalDtype(ordered=True),
CategoricalDtype(ordered=False),
CategoricalDtype(categories=list('abcdef')),
CategoricalDtype(categories=list('edba'), ordered=False),
CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr)
def test_astype_categorical(self, dtype):
# GH 18099
d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')}
df = DataFrame(d)
result = df.astype(dtype)
expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("cls", [
pd.api.types.CategoricalDtype,
pd.api.types.DatetimeTZDtype,
pd.api.types.IntervalDtype
])
def test_astype_categoricaldtype_class_raises(self, cls):
df = DataFrame({"A": ['a', 'a', 'b', 'c']})
xpr = "Expected an instance of {}".format(cls.__name__)
with tm.assert_raises_regex(TypeError, xpr):
df.astype({"A": cls})
with tm.assert_raises_regex(TypeError, xpr):
df['A'].astype(cls)
@pytest.mark.parametrize('dtype', [
{100: 'float64', 200: 'uint64'}, 'category', 'float64'])
def test_astype_column_metadata(self, dtype):
# GH 19920
columns = pd.UInt64Index([100, 200, 300], name='foo')
df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
df = df.astype(dtype)
tm.assert_index_equal(df.columns, columns)
@pytest.mark.parametrize("dtype", ["M8", "m8"])
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
def test_astype_from_datetimelike_to_objectt(self, dtype, unit):
# tests astype to object dtype
# gh-19223 / gh-12425
dtype = "{}[{}]".format(dtype, unit)
arr = np.array([[1, 2, 3]], dtype=dtype)
df = DataFrame(arr)
result = df.astype(object)
assert (result.dtypes == object).all()
if dtype.startswith('M8'):
assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit)
else:
assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit)
@pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
@pytest.mark.parametrize("dtype", ["M8", "m8"])
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
# tests all units from numeric origination
# gh-19223 / gh-12425
dtype = "{}[{}]".format(dtype, unit)
arr = np.array([[1, 2, 3]], dtype=arr_dtype)
df = DataFrame(arr)
result = df.astype(dtype)
expected = DataFrame(arr.astype(dtype))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
def test_astype_to_datetime_unit(self, unit):
# tests all units from datetime origination
# gh-19223
dtype = "M8[{}]".format(unit)
arr = np.array([[1, 2, 3]], dtype=dtype)
df = DataFrame(arr)
result = df.astype(dtype)
expected = DataFrame(arr.astype(dtype))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("unit", ['ns'])
def test_astype_to_timedelta_unit_ns(self, unit):
# preserver the timedelta conversion
# gh-19223
dtype = "m8[{}]".format(unit)
arr = np.array([[1, 2, 3]], dtype=dtype)
df = DataFrame(arr)
result = df.astype(dtype)
expected = DataFrame(arr.astype(dtype))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D'])
def test_astype_to_timedelta_unit(self, unit):
# coerce to float
# gh-19223
dtype = "m8[{}]".format(unit)
arr = np.array([[1, 2, 3]], dtype=dtype)
df = DataFrame(arr)
result = df.astype(dtype)
expected = DataFrame(df.values.astype(dtype).astype(float))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
def test_astype_to_incorrect_datetimelike(self, unit):
# trying to astype a m to a M, or vice-versa
# gh-19224
dtype = "M8[{}]".format(unit)
other = "m8[{}]".format(unit)
df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
with pytest.raises(TypeError):
df.astype(other)
df = DataFrame(np.array([[1, 2, 3]], dtype=other))
with pytest.raises(TypeError):
df.astype(dtype)
def test_timedeltas(self):
df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3,
freq='D')),
B=Series([timedelta(days=i) for i in range(3)])))
result = df.get_dtype_counts().sort_index()
expected = Series(
{'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_index()
assert_series_equal(result, expected)
df['C'] = df['A'] + df['B']
expected = Series(
{'datetime64[ns]': 2, 'timedelta64[ns]': 1}).sort_values()
result = df.get_dtype_counts().sort_values()
assert_series_equal(result, expected)
# mixed int types
df['D'] = 1
expected = Series({'datetime64[ns]': 2,
'timedelta64[ns]': 1,
'int64': 1}).sort_values()
result = df.get_dtype_counts().sort_values()
assert_series_equal(result, expected)
def test_arg_for_errors_in_astype(self):
# issue #14878
df = DataFrame([1, 2, 3])
with pytest.raises(ValueError):
df.astype(np.float64, errors=True)
with tm.assert_produces_warning(FutureWarning):
df.astype(np.int8, raise_on_error=False)
df.astype(np.int8, errors='ignore')
@pytest.mark.parametrize('input_vals', [
([1, 2]),
(['1', '2']),
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
(list(pd.date_range('1/1/2011', periods=2, freq='H',
tz='US/Eastern'))),
([pd.Interval(left=0, right=5)]),
])
def test_constructor_list_str(self, input_vals, string_dtype):
# GH 16605
# Ensure that data elements are converted to strings when
# dtype is str, 'str', or 'U'
result = DataFrame({'A': input_vals}, dtype=string_dtype)
expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
assert_frame_equal(result, expected)
def test_constructor_list_str_na(self, string_dtype):
result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
assert_frame_equal(result, expected)
class TestDataFrameDatetimeWithTZ(TestData):
def test_interleave(self):
# interleave with object
result = self.tzframe.assign(D='foo').values
expected = np.array([[Timestamp('2013-01-01 00:00:00'),
Timestamp('2013-01-02 00:00:00'),
Timestamp('2013-01-03 00:00:00')],
[Timestamp('2013-01-01 00:00:00-0500',
tz='US/Eastern'),
pd.NaT,
Timestamp('2013-01-03 00:00:00-0500',
tz='US/Eastern')],
[Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
pd.NaT,
Timestamp('2013-01-03 00:00:00+0100', tz='CET')],
['foo', 'foo', 'foo']], dtype=object).T
tm.assert_numpy_array_equal(result, expected)
# interleave with only datetime64[ns]
result = self.tzframe.values
expected = np.array([[Timestamp('2013-01-01 00:00:00'),
Timestamp('2013-01-02 00:00:00'),
Timestamp('2013-01-03 00:00:00')],
[Timestamp('2013-01-01 00:00:00-0500',
tz='US/Eastern'),
pd.NaT,
Timestamp('2013-01-03 00:00:00-0500',
tz='US/Eastern')],
[Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
pd.NaT,
Timestamp('2013-01-03 00:00:00+0100',
tz='CET')]], dtype=object).T
tm.assert_numpy_array_equal(result, expected)
def test_astype(self):
# astype
expected = np.array([[Timestamp('2013-01-01 00:00:00'),
Timestamp('2013-01-02 00:00:00'),
Timestamp('2013-01-03 00:00:00')],
[Timestamp('2013-01-01 00:00:00-0500',
tz='US/Eastern'),
pd.NaT,
Timestamp('2013-01-03 00:00:00-0500',
tz='US/Eastern')],
[Timestamp('2013-01-01 00:00:00+0100', tz='CET'),
pd.NaT,
Timestamp('2013-01-03 00:00:00+0100',
tz='CET')]],
dtype=object).T
result = self.tzframe.astype(object)
assert_frame_equal(result, DataFrame(
expected, index=self.tzframe.index, columns=self.tzframe.columns))
result = self.tzframe.astype('datetime64[ns]')
expected = DataFrame({'A': date_range('20130101', periods=3),
'B': (date_range('20130101', periods=3,
tz='US/Eastern')
.tz_convert('UTC')
.tz_localize(None)),
'C': (date_range('20130101', periods=3,
tz='CET')
.tz_convert('UTC')
.tz_localize(None))})
expected.iloc[1, 1] = pd.NaT
expected.iloc[1, 2] = pd.NaT
assert_frame_equal(result, expected)
def test_astype_str(self):
# str formatting
result = self.tzframe.astype(str)
expected = DataFrame([['2013-01-01', '2013-01-01 00:00:00-05:00',
'2013-01-01 00:00:00+01:00'],
['2013-01-02', 'NaT', 'NaT'],
['2013-01-03', '2013-01-03 00:00:00-05:00',
'2013-01-03 00:00:00+01:00']],
columns=self.tzframe.columns)
tm.assert_frame_equal(result, expected)
with option_context('display.max_columns', 20):
result = str(self.tzframe)
assert ('0 2013-01-01 2013-01-01 00:00:00-05:00 '
'2013-01-01 00:00:00+01:00') in result
assert ('1 2013-01-02 '
'NaT NaT') in result
assert ('2 2013-01-03 2013-01-03 00:00:00-05:00 '
'2013-01-03 00:00:00+01:00') in result
File diff suppressed because it is too large Load Diff
@@ -1,184 +0,0 @@
# -*- coding: utf-8 -*-
import pytest
import numpy as np
from pandas import DataFrame, Index, PeriodIndex
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
@pytest.fixture
def frame_with_period_index():
return DataFrame(
data=np.arange(20).reshape(4, 5),
columns=list('abcde'),
index=PeriodIndex(start='2000', freq='A', periods=4))
@pytest.fixture
def frame():
return TestData().frame
@pytest.fixture
def left():
return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
@pytest.fixture
def right():
return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2])
@pytest.mark.parametrize(
"how, sort, expected",
[('inner', False, DataFrame({'a': [20, 10],
'b': [200, 100]},
index=[2, 1])),
('inner', True, DataFrame({'a': [10, 20],
'b': [100, 200]},
index=[1, 2])),
('left', False, DataFrame({'a': [20, 10, 0],
'b': [200, 100, np.nan]},
index=[2, 1, 0])),
('left', True, DataFrame({'a': [0, 10, 20],
'b': [np.nan, 100, 200]},
index=[0, 1, 2])),
('right', False, DataFrame({'a': [np.nan, 10, 20],
'b': [300, 100, 200]},
index=[3, 1, 2])),
('right', True, DataFrame({'a': [10, 20, np.nan],
'b': [100, 200, 300]},
index=[1, 2, 3])),
('outer', False, DataFrame({'a': [0, 10, 20, np.nan],
'b': [np.nan, 100, 200, 300]},
index=[0, 1, 2, 3])),
('outer', True, DataFrame({'a': [0, 10, 20, np.nan],
'b': [np.nan, 100, 200, 300]},
index=[0, 1, 2, 3]))])
def test_join(left, right, how, sort, expected):
result = left.join(right, how=how, sort=sort)
tm.assert_frame_equal(result, expected)
def test_join_index(frame):
# left / right
f = frame.loc[frame.index[:10], ['A', 'B']]
f2 = frame.loc[frame.index[5:], ['C', 'D']].iloc[::-1]
joined = f.join(f2)
tm.assert_index_equal(f.index, joined.index)
expected_columns = Index(['A', 'B', 'C', 'D'])
tm.assert_index_equal(joined.columns, expected_columns)
joined = f.join(f2, how='left')
tm.assert_index_equal(joined.index, f.index)
tm.assert_index_equal(joined.columns, expected_columns)
joined = f.join(f2, how='right')
tm.assert_index_equal(joined.index, f2.index)
tm.assert_index_equal(joined.columns, expected_columns)
# inner
joined = f.join(f2, how='inner')
tm.assert_index_equal(joined.index, f.index[5:10])
tm.assert_index_equal(joined.columns, expected_columns)
# outer
joined = f.join(f2, how='outer')
tm.assert_index_equal(joined.index, frame.index.sort_values())
tm.assert_index_equal(joined.columns, expected_columns)
tm.assert_raises_regex(
ValueError, 'join method', f.join, f2, how='foo')
# corner case - overlapping columns
for how in ('outer', 'left', 'inner'):
with tm.assert_raises_regex(ValueError, 'columns overlap but '
'no suffix'):
frame.join(frame, how=how)
def test_join_index_more(frame):
af = frame.loc[:, ['A', 'B']]
bf = frame.loc[::2, ['C', 'D']]
expected = af.copy()
expected['C'] = frame['C'][::2]
expected['D'] = frame['D'][::2]
result = af.join(bf)
tm.assert_frame_equal(result, expected)
result = af.join(bf, how='right')
tm.assert_frame_equal(result, expected[::2])
result = bf.join(af, how='right')
tm.assert_frame_equal(result, expected.loc[:, result.columns])
def test_join_index_series(frame):
df = frame.copy()
s = df.pop(frame.columns[-1])
joined = df.join(s)
# TODO should this check_names ?
tm.assert_frame_equal(joined, frame, check_names=False)
s.name = None
tm.assert_raises_regex(ValueError, 'must have a name', df.join, s)
def test_join_overlap(frame):
df1 = frame.loc[:, ['A', 'B', 'C']]
df2 = frame.loc[:, ['B', 'C', 'D']]
joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2')
df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1')
df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2')
no_overlap = frame.loc[:, ['A', 'D']]
expected = df1_suf.join(df2_suf).join(no_overlap)
# column order not necessarily sorted
tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
def test_join_period_index(frame_with_period_index):
other = frame_with_period_index.rename(
columns=lambda x: '{key}{key}'.format(key=x))
joined_values = np.concatenate(
[frame_with_period_index.values] * 2, axis=1)
joined_cols = frame_with_period_index.columns.append(other.columns)
joined = frame_with_period_index.join(other)
expected = DataFrame(
data=joined_values,
columns=joined_cols,
index=frame_with_period_index.index)
tm.assert_frame_equal(joined, expected)
def test_join_left_sequence_non_unique_index():
# https://github.com/pandas-dev/pandas/issues/19607
df1 = DataFrame({'a': [0, 10, 20]}, index=[1, 2, 3])
df2 = DataFrame({'b': [100, 200, 300]}, index=[4, 3, 2])
df3 = DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4])
joined = df1.join([df2, df3], how='left')
expected = DataFrame({
'a': [0, 10, 10, 20],
'b': [np.nan, 300, 300, 200],
'c': [np.nan, 400, 500, np.nan]
}, index=[1, 2, 2, 3])
tm.assert_frame_equal(joined, expected)
@@ -1,846 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import pytest
from distutils.version import LooseVersion
from numpy import nan, random
import numpy as np
import datetime
import dateutil
from pandas.compat import lrange
from pandas import (DataFrame, Series, Timestamp,
date_range, Categorical)
import pandas as pd
from pandas.util.testing import assert_series_equal, assert_frame_equal
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas.tests.frame.common import TestData, _check_mixed_float
try:
import scipy
_is_scipy_ge_0190 = (LooseVersion(scipy.__version__) >=
LooseVersion('0.19.0'))
except:
_is_scipy_ge_0190 = False
def _skip_if_no_pchip():
try:
from scipy.interpolate import pchip_interpolate # noqa
except ImportError:
import pytest
pytest.skip('scipy.interpolate.pchip missing')
class TestDataFrameMissingData(TestData):
def test_dropEmptyRows(self):
N = len(self.frame.index)
mat = random.randn(N)
mat[:5] = nan
frame = DataFrame({'foo': mat}, index=self.frame.index)
original = Series(mat, index=self.frame.index, name='foo')
expected = original.dropna()
inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
smaller_frame = frame.dropna(how='all')
# check that original was preserved
assert_series_equal(frame['foo'], original)
inplace_frame1.dropna(how='all', inplace=True)
assert_series_equal(smaller_frame['foo'], expected)
assert_series_equal(inplace_frame1['foo'], expected)
smaller_frame = frame.dropna(how='all', subset=['foo'])
inplace_frame2.dropna(how='all', subset=['foo'], inplace=True)
assert_series_equal(smaller_frame['foo'], expected)
assert_series_equal(inplace_frame2['foo'], expected)
def test_dropIncompleteRows(self):
N = len(self.frame.index)
mat = random.randn(N)
mat[:5] = nan
frame = DataFrame({'foo': mat}, index=self.frame.index)
frame['bar'] = 5
original = Series(mat, index=self.frame.index, name='foo')
inp_frame1, inp_frame2 = frame.copy(), frame.copy()
smaller_frame = frame.dropna()
assert_series_equal(frame['foo'], original)
inp_frame1.dropna(inplace=True)
exp = Series(mat[5:], index=self.frame.index[5:], name='foo')
tm.assert_series_equal(smaller_frame['foo'], exp)
tm.assert_series_equal(inp_frame1['foo'], exp)
samesize_frame = frame.dropna(subset=['bar'])
assert_series_equal(frame['foo'], original)
assert (frame['bar'] == 5).all()
inp_frame2.dropna(subset=['bar'], inplace=True)
tm.assert_index_equal(samesize_frame.index, self.frame.index)
tm.assert_index_equal(inp_frame2.index, self.frame.index)
def test_dropna(self):
df = DataFrame(np.random.randn(6, 4))
df[2][:2] = nan
dropped = df.dropna(axis=1)
expected = df.loc[:, [0, 1, 3]]
inp = df.copy()
inp.dropna(axis=1, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
dropped = df.dropna(axis=0)
expected = df.loc[lrange(2, 6)]
inp = df.copy()
inp.dropna(axis=0, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
# threshold
dropped = df.dropna(axis=1, thresh=5)
expected = df.loc[:, [0, 1, 3]]
inp = df.copy()
inp.dropna(axis=1, thresh=5, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
dropped = df.dropna(axis=0, thresh=4)
expected = df.loc[lrange(2, 6)]
inp = df.copy()
inp.dropna(axis=0, thresh=4, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
dropped = df.dropna(axis=1, thresh=4)
assert_frame_equal(dropped, df)
dropped = df.dropna(axis=1, thresh=3)
assert_frame_equal(dropped, df)
# subset
dropped = df.dropna(axis=0, subset=[0, 1, 3])
inp = df.copy()
inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
assert_frame_equal(dropped, df)
assert_frame_equal(inp, df)
# all
dropped = df.dropna(axis=1, how='all')
assert_frame_equal(dropped, df)
df[2] = nan
dropped = df.dropna(axis=1, how='all')
expected = df.loc[:, [0, 1, 3]]
assert_frame_equal(dropped, expected)
# bad input
pytest.raises(ValueError, df.dropna, axis=3)
def test_drop_and_dropna_caching(self):
# tst that cacher updates
original = Series([1, 2, np.nan], name='A')
expected = Series([1, 2], dtype=original.dtype, name='A')
df = pd.DataFrame({'A': original.values.copy()})
df2 = df.copy()
df['A'].dropna()
assert_series_equal(df['A'], original)
df['A'].dropna(inplace=True)
assert_series_equal(df['A'], expected)
df2['A'].drop([1])
assert_series_equal(df2['A'], original)
df2['A'].drop([1], inplace=True)
assert_series_equal(df2['A'], original.drop([1]))
def test_dropna_corner(self):
# bad input
pytest.raises(ValueError, self.frame.dropna, how='foo')
pytest.raises(TypeError, self.frame.dropna, how=None)
# non-existent column - 8303
pytest.raises(KeyError, self.frame.dropna, subset=['A', 'X'])
def test_dropna_multiple_axes(self):
df = DataFrame([[1, np.nan, 2, 3],
[4, np.nan, 5, 6],
[np.nan, np.nan, np.nan, np.nan],
[7, np.nan, 8, 9]])
cp = df.copy()
# GH20987
with tm.assert_produces_warning(FutureWarning):
result = df.dropna(how='all', axis=[0, 1])
with tm.assert_produces_warning(FutureWarning):
result2 = df.dropna(how='all', axis=(0, 1))
expected = df.dropna(how='all').dropna(how='all', axis=1)
assert_frame_equal(result, expected)
assert_frame_equal(result2, expected)
assert_frame_equal(df, cp)
inp = df.copy()
with tm.assert_produces_warning(FutureWarning):
inp.dropna(how='all', axis=(0, 1), inplace=True)
assert_frame_equal(inp, expected)
def test_dropna_tz_aware_datetime(self):
# GH13407
df = DataFrame()
dt1 = datetime.datetime(2015, 1, 1,
tzinfo=dateutil.tz.tzutc())
dt2 = datetime.datetime(2015, 2, 2,
tzinfo=dateutil.tz.tzutc())
df['Time'] = [dt1]
result = df.dropna(axis=0)
expected = DataFrame({'Time': [dt1]})
assert_frame_equal(result, expected)
# Ex2
df = DataFrame({'Time': [dt1, None, np.nan, dt2]})
result = df.dropna(axis=0)
expected = DataFrame([dt1, dt2],
columns=['Time'],
index=[0, 3])
assert_frame_equal(result, expected)
def test_fillna(self):
tf = self.tsframe
tf.loc[tf.index[:5], 'A'] = nan
tf.loc[tf.index[-5:], 'A'] = nan
zero_filled = self.tsframe.fillna(0)
assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all()
padded = self.tsframe.fillna(method='pad')
assert np.isnan(padded.loc[padded.index[:5], 'A']).all()
assert (padded.loc[padded.index[-5:], 'A'] ==
padded.loc[padded.index[-5], 'A']).all()
# mixed type
mf = self.mixed_frame
mf.loc[mf.index[5:20], 'foo'] = nan
mf.loc[mf.index[-10:], 'A'] = nan
result = self.mixed_frame.fillna(value=0)
result = self.mixed_frame.fillna(method='pad')
pytest.raises(ValueError, self.tsframe.fillna)
pytest.raises(ValueError, self.tsframe.fillna, 5, method='ffill')
# mixed numeric (but no float16)
mf = self.mixed_float.reindex(columns=['A', 'B', 'D'])
mf.loc[mf.index[-10:], 'A'] = nan
result = mf.fillna(value=0)
_check_mixed_float(result, dtype=dict(C=None))
result = mf.fillna(method='pad')
_check_mixed_float(result, dtype=dict(C=None))
# empty frame (GH #2778)
df = DataFrame(columns=['x'])
for m in ['pad', 'backfill']:
df.x.fillna(method=m, inplace=True)
df.x.fillna(method=m)
# with different dtype (GH3386)
df = DataFrame([['a', 'a', np.nan, 'a'], [
'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']])
result = df.fillna({2: 'foo'})
expected = DataFrame([['a', 'a', 'foo', 'a'],
['b', 'b', 'foo', 'b'],
['c', 'c', 'foo', 'c']])
assert_frame_equal(result, expected)
df.fillna({2: 'foo'}, inplace=True)
assert_frame_equal(df, expected)
# limit and value
df = DataFrame(np.random.randn(10, 3))
df.iloc[2:7, 0] = np.nan
df.iloc[3:5, 2] = np.nan
expected = df.copy()
expected.iloc[2, 0] = 999
expected.iloc[3, 2] = 999
result = df.fillna(999, limit=1)
assert_frame_equal(result, expected)
# with datelike
# GH 6344
df = DataFrame({
'Date': [pd.NaT, Timestamp("2014-1-1")],
'Date2': [Timestamp("2013-1-1"), pd.NaT]
})
expected = df.copy()
expected['Date'] = expected['Date'].fillna(
df.loc[df.index[0], 'Date2'])
result = df.fillna(value={'Date': df['Date2']})
assert_frame_equal(result, expected)
# with timezone
# GH 15855
df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
pd.NaT]})
exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
assert_frame_equal(df.fillna(method='pad'), exp)
df = pd.DataFrame({'A': [pd.NaT,
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
assert_frame_equal(df.fillna(method='bfill'), exp)
def test_na_actions_categorical(self):
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
vals = ["a", "b", np.nan, "d"]
df = DataFrame({"cats": cat, "vals": vals})
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
vals2 = ["a", "b", "b", "d"]
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
vals3 = ["a", "b", np.nan]
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
cat4 = Categorical([1, 2], categories=[1, 2, 3])
vals4 = ["a", "b"]
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
# fillna
res = df.fillna(value={"cats": 3, "vals": "b"})
tm.assert_frame_equal(res, df_exp_fill)
with tm.assert_raises_regex(ValueError, "fill value must be "
"in categories"):
df.fillna(value={"cats": 4, "vals": "c"})
res = df.fillna(method='pad')
tm.assert_frame_equal(res, df_exp_fill)
# dropna
res = df.dropna(subset=["cats"])
tm.assert_frame_equal(res, df_exp_drop_cats)
res = df.dropna()
tm.assert_frame_equal(res, df_exp_drop_all)
# make sure that fillna takes missing values into account
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
res = df.fillna("a")
tm.assert_frame_equal(res, df_exp)
def test_fillna_categorical_nan(self):
# GH 14021
# np.nan should always be a valid filler
cat = Categorical([np.nan, 2, np.nan])
val = Categorical([np.nan, np.nan, np.nan])
df = DataFrame({"cats": cat, "vals": val})
res = df.fillna(df.median())
v_exp = [np.nan, np.nan, np.nan]
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
dtype='category')
tm.assert_frame_equal(res, df_exp)
result = df.cats.fillna(np.nan)
tm.assert_series_equal(result, df.cats)
result = df.vals.fillna(np.nan)
tm.assert_series_equal(result, df.vals)
idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
'2011-01-01 09:00', pd.NaT, pd.NaT])
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
pd.NaT, pd.NaT], freq='M')
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
idx = pd.TimedeltaIndex(['1 days', '2 days',
'1 days', pd.NaT, pd.NaT])
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
def test_fillna_downcast(self):
# GH 15277
# infer int64 from float64
df = pd.DataFrame({'a': [1., np.nan]})
result = df.fillna(0, downcast='infer')
expected = pd.DataFrame({'a': [1, 0]})
assert_frame_equal(result, expected)
# infer int64 from float64 when fillna value is a dict
df = pd.DataFrame({'a': [1., np.nan]})
result = df.fillna({'a': 0}, downcast='infer')
expected = pd.DataFrame({'a': [1, 0]})
assert_frame_equal(result, expected)
def test_fillna_dtype_conversion(self):
# make sure that fillna on an empty frame works
df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
result = df.get_dtype_counts().sort_values()
expected = Series({'object': 5})
assert_series_equal(result, expected)
result = df.fillna(1)
expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
result = result.get_dtype_counts().sort_values()
expected = Series({'int64': 5})
assert_series_equal(result, expected)
# empty block
df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
result = df.fillna('nan')
expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
assert_frame_equal(result, expected)
# equiv of replace
df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
for v in ['', 1, np.nan, 1.0]:
expected = df.replace(np.nan, v)
result = df.fillna(v)
assert_frame_equal(result, expected)
def test_fillna_datetime_columns(self):
# GH 7095
df = pd.DataFrame({'A': [-1, -2, np.nan],
'B': date_range('20130101', periods=3),
'C': ['foo', 'bar', None],
'D': ['foo2', 'bar2', None]},
index=date_range('20130110', periods=3))
result = df.fillna('?')
expected = pd.DataFrame({'A': [-1, -2, '?'],
'B': date_range('20130101', periods=3),
'C': ['foo', 'bar', '?'],
'D': ['foo2', 'bar2', '?']},
index=date_range('20130110', periods=3))
tm.assert_frame_equal(result, expected)
df = pd.DataFrame({'A': [-1, -2, np.nan],
'B': [pd.Timestamp('2013-01-01'),
pd.Timestamp('2013-01-02'), pd.NaT],
'C': ['foo', 'bar', None],
'D': ['foo2', 'bar2', None]},
index=date_range('20130110', periods=3))
result = df.fillna('?')
expected = pd.DataFrame({'A': [-1, -2, '?'],
'B': [pd.Timestamp('2013-01-01'),
pd.Timestamp('2013-01-02'), '?'],
'C': ['foo', 'bar', '?'],
'D': ['foo2', 'bar2', '?']},
index=pd.date_range('20130110', periods=3))
tm.assert_frame_equal(result, expected)
def test_ffill(self):
self.tsframe['A'][:5] = nan
self.tsframe['A'][-5:] = nan
assert_frame_equal(self.tsframe.ffill(),
self.tsframe.fillna(method='ffill'))
def test_bfill(self):
self.tsframe['A'][:5] = nan
self.tsframe['A'][-5:] = nan
assert_frame_equal(self.tsframe.bfill(),
self.tsframe.fillna(method='bfill'))
def test_frame_pad_backfill_limit(self):
index = np.arange(10)
df = DataFrame(np.random.randn(10, 4), index=index)
result = df[:2].reindex(index, method='pad', limit=5)
expected = df[:2].reindex(index).fillna(method='pad')
expected.values[-3:] = np.nan
tm.assert_frame_equal(result, expected)
result = df[-2:].reindex(index, method='backfill', limit=5)
expected = df[-2:].reindex(index).fillna(method='backfill')
expected.values[:3] = np.nan
tm.assert_frame_equal(result, expected)
def test_frame_fillna_limit(self):
index = np.arange(10)
df = DataFrame(np.random.randn(10, 4), index=index)
result = df[:2].reindex(index)
result = result.fillna(method='pad', limit=5)
expected = df[:2].reindex(index).fillna(method='pad')
expected.values[-3:] = np.nan
tm.assert_frame_equal(result, expected)
result = df[-2:].reindex(index)
result = result.fillna(method='backfill', limit=5)
expected = df[-2:].reindex(index).fillna(method='backfill')
expected.values[:3] = np.nan
tm.assert_frame_equal(result, expected)
def test_fillna_skip_certain_blocks(self):
# don't try to fill boolean, int blocks
df = DataFrame(np.random.randn(10, 4).astype(int))
# it works!
df.fillna(np.nan)
def test_fillna_inplace(self):
df = DataFrame(np.random.randn(10, 4))
df[1][:4] = np.nan
df[3][-4:] = np.nan
expected = df.fillna(value=0)
assert expected is not df
df.fillna(value=0, inplace=True)
tm.assert_frame_equal(df, expected)
expected = df.fillna(value={0: 0}, inplace=True)
assert expected is None
df[1][:4] = np.nan
df[3][-4:] = np.nan
expected = df.fillna(method='ffill')
assert expected is not df
df.fillna(method='ffill', inplace=True)
tm.assert_frame_equal(df, expected)
def test_fillna_dict_series(self):
df = DataFrame({'a': [nan, 1, 2, nan, nan],
'b': [1, 2, 3, nan, nan],
'c': [nan, 1, 2, 3, 4]})
result = df.fillna({'a': 0, 'b': 5})
expected = df.copy()
expected['a'] = expected['a'].fillna(0)
expected['b'] = expected['b'].fillna(5)
assert_frame_equal(result, expected)
# it works
result = df.fillna({'a': 0, 'b': 5, 'd': 7})
# Series treated same as dict
result = df.fillna(df.max())
expected = df.fillna(df.max().to_dict())
assert_frame_equal(result, expected)
# disable this for now
with tm.assert_raises_regex(NotImplementedError,
'column by column'):
df.fillna(df.max(1), axis=1)
def test_fillna_dataframe(self):
# GH 8377
df = DataFrame({'a': [nan, 1, 2, nan, nan],
'b': [1, 2, 3, nan, nan],
'c': [nan, 1, 2, 3, 4]},
index=list('VWXYZ'))
# df2 may have different index and columns
df2 = DataFrame({'a': [nan, 10, 20, 30, 40],
'b': [50, 60, 70, 80, 90],
'foo': ['bar'] * 5},
index=list('VWXuZ'))
result = df.fillna(df2)
# only those columns and indices which are shared get filled
expected = DataFrame({'a': [nan, 1, 2, nan, 40],
'b': [1, 2, 3, nan, 90],
'c': [nan, 1, 2, 3, 4]},
index=list('VWXYZ'))
assert_frame_equal(result, expected)
def test_fillna_columns(self):
df = DataFrame(np.random.randn(10, 10))
df.values[:, ::2] = np.nan
result = df.fillna(method='ffill', axis=1)
expected = df.T.fillna(method='pad').T
assert_frame_equal(result, expected)
df.insert(6, 'foo', 5)
result = df.fillna(method='ffill', axis=1)
expected = df.astype(float).fillna(method='ffill', axis=1)
assert_frame_equal(result, expected)
def test_fillna_invalid_method(self):
with tm.assert_raises_regex(ValueError, 'ffil'):
self.frame.fillna(method='ffil')
def test_fillna_invalid_value(self):
# list
pytest.raises(TypeError, self.frame.fillna, [1, 2])
# tuple
pytest.raises(TypeError, self.frame.fillna, (1, 2))
# frame with series
pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame)
def test_fillna_col_reordering(self):
cols = ["COL." + str(i) for i in range(5, 0, -1)]
data = np.random.rand(20, 5)
df = DataFrame(index=lrange(20), columns=cols, data=data)
filled = df.fillna(method='ffill')
assert df.columns.tolist() == filled.columns.tolist()
def test_fill_corner(self):
mf = self.mixed_frame
mf.loc[mf.index[5:20], 'foo'] = nan
mf.loc[mf.index[-10:], 'A'] = nan
filled = self.mixed_frame.fillna(value=0)
assert (filled.loc[filled.index[5:20], 'foo'] == 0).all()
del self.mixed_frame['foo']
empty_float = self.frame.reindex(columns=[])
# TODO(wesm): unused?
result = empty_float.fillna(value=0) # noqa
def test_fill_value_when_combine_const(self):
# GH12723
dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
df = DataFrame({'foo': dat}, index=range(6))
exp = df.fillna(0).add(2)
res = df.add(2, fill_value=0)
assert_frame_equal(res, exp)
class TestDataFrameInterpolate(TestData):
def test_interp_basic(self):
df = DataFrame({'A': [1, 2, np.nan, 4],
'B': [1, 4, 9, np.nan],
'C': [1, 2, 3, 5],
'D': list('abcd')})
expected = DataFrame({'A': [1., 2., 3., 4.],
'B': [1., 4., 9., 9.],
'C': [1, 2, 3, 5],
'D': list('abcd')})
result = df.interpolate()
assert_frame_equal(result, expected)
result = df.set_index('C').interpolate()
expected = df.set_index('C')
expected.loc[3, 'A'] = 3
expected.loc[5, 'B'] = 9
assert_frame_equal(result, expected)
def test_interp_bad_method(self):
df = DataFrame({'A': [1, 2, np.nan, 4],
'B': [1, 4, 9, np.nan],
'C': [1, 2, 3, 5],
'D': list('abcd')})
with pytest.raises(ValueError):
df.interpolate(method='not_a_method')
def test_interp_combo(self):
df = DataFrame({'A': [1., 2., np.nan, 4.],
'B': [1, 4, 9, np.nan],
'C': [1, 2, 3, 5],
'D': list('abcd')})
result = df['A'].interpolate()
expected = Series([1., 2., 3., 4.], name='A')
assert_series_equal(result, expected)
result = df['A'].interpolate(downcast='infer')
expected = Series([1, 2, 3, 4], name='A')
assert_series_equal(result, expected)
def test_interp_nan_idx(self):
df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
df = df.set_index('A')
with pytest.raises(NotImplementedError):
df.interpolate(method='values')
@td.skip_if_no_scipy
def test_interp_various(self):
df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
'C': [1, 2, 3, 5, 8, 13, 21]})
df = df.set_index('C')
expected = df.copy()
result = df.interpolate(method='polynomial', order=1)
expected.A.loc[3] = 2.66666667
expected.A.loc[13] = 5.76923076
assert_frame_equal(result, expected)
result = df.interpolate(method='cubic')
# GH #15662.
# new cubic and quadratic interpolation algorithms from scipy 0.19.0.
# previously `splmake` was used. See scipy/scipy#6710
if _is_scipy_ge_0190:
expected.A.loc[3] = 2.81547781
expected.A.loc[13] = 5.52964175
else:
expected.A.loc[3] = 2.81621174
expected.A.loc[13] = 5.64146581
assert_frame_equal(result, expected)
result = df.interpolate(method='nearest')
expected.A.loc[3] = 2
expected.A.loc[13] = 5
assert_frame_equal(result, expected, check_dtype=False)
result = df.interpolate(method='quadratic')
if _is_scipy_ge_0190:
expected.A.loc[3] = 2.82150771
expected.A.loc[13] = 6.12648668
else:
expected.A.loc[3] = 2.82533638
expected.A.loc[13] = 6.02817974
assert_frame_equal(result, expected)
result = df.interpolate(method='slinear')
expected.A.loc[3] = 2.66666667
expected.A.loc[13] = 5.76923077
assert_frame_equal(result, expected)
result = df.interpolate(method='zero')
expected.A.loc[3] = 2.
expected.A.loc[13] = 5
assert_frame_equal(result, expected, check_dtype=False)
@td.skip_if_no_scipy
def test_interp_alt_scipy(self):
df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
'C': [1, 2, 3, 5, 8, 13, 21]})
result = df.interpolate(method='barycentric')
expected = df.copy()
expected.loc[2, 'A'] = 3
expected.loc[5, 'A'] = 6
assert_frame_equal(result, expected)
result = df.interpolate(method='barycentric', downcast='infer')
assert_frame_equal(result, expected.astype(np.int64))
result = df.interpolate(method='krogh')
expectedk = df.copy()
expectedk['A'] = expected['A']
assert_frame_equal(result, expectedk)
_skip_if_no_pchip()
import scipy
result = df.interpolate(method='pchip')
expected.loc[2, 'A'] = 3
if LooseVersion(scipy.__version__) >= LooseVersion('0.17.0'):
expected.loc[5, 'A'] = 6.0
else:
expected.loc[5, 'A'] = 6.125
assert_frame_equal(result, expected)
def test_interp_rowwise(self):
df = DataFrame({0: [1, 2, np.nan, 4],
1: [2, 3, 4, np.nan],
2: [np.nan, 4, 5, 6],
3: [4, np.nan, 6, 7],
4: [1, 2, 3, 4]})
result = df.interpolate(axis=1)
expected = df.copy()
expected.loc[3, 1] = 5
expected.loc[0, 2] = 3
expected.loc[1, 3] = 3
expected[4] = expected[4].astype(np.float64)
assert_frame_equal(result, expected)
result = df.interpolate(axis=1, method='values')
assert_frame_equal(result, expected)
result = df.interpolate(axis=0)
expected = df.interpolate()
assert_frame_equal(result, expected)
def test_rowwise_alt(self):
df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64],
1: [1, 2, 3, 4, 3, 2, 1, 0, -1]})
df.interpolate(axis=0)
@pytest.mark.parametrize("check_scipy", [
False, pytest.param(True, marks=td.skip_if_no_scipy)
])
def test_interp_leading_nans(self, check_scipy):
df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
"B": [np.nan, -3, -3.5, np.nan, -4]})
result = df.interpolate()
expected = df.copy()
expected['B'].loc[3] = -3.75
assert_frame_equal(result, expected)
if check_scipy:
result = df.interpolate(method='polynomial', order=1)
assert_frame_equal(result, expected)
def test_interp_raise_on_only_mixed(self):
df = DataFrame({'A': [1, 2, np.nan, 4],
'B': ['a', 'b', 'c', 'd'],
'C': [np.nan, 2, 5, 7],
'D': [np.nan, np.nan, 9, 9],
'E': [1, 2, 3, 4]})
with pytest.raises(TypeError):
df.interpolate(axis=1)
def test_interp_inplace(self):
df = DataFrame({'a': [1., 2., np.nan, 4.]})
expected = DataFrame({'a': [1., 2., 3., 4.]})
result = df.copy()
result['a'].interpolate(inplace=True)
assert_frame_equal(result, expected)
result = df.copy()
result['a'].interpolate(inplace=True, downcast='infer')
assert_frame_equal(result, expected.astype('int64'))
def test_interp_inplace_row(self):
# GH 10395
result = DataFrame({'a': [1., 2., 3., 4.],
'b': [np.nan, 2., 3., 4.],
'c': [3, 2, 2, 2]})
expected = result.interpolate(method='linear', axis=1, inplace=False)
result.interpolate(method='linear', axis=1, inplace=True)
assert_frame_equal(result, expected)
def test_interp_ignore_all_good(self):
# GH
df = DataFrame({'A': [1, 2, np.nan, 4],
'B': [1, 2, 3, 4],
'C': [1., 2., np.nan, 4.],
'D': [1., 2., 3., 4.]})
expected = DataFrame({'A': np.array(
[1, 2, 3, 4], dtype='float64'),
'B': np.array(
[1, 2, 3, 4], dtype='int64'),
'C': np.array(
[1., 2., 3, 4.], dtype='float64'),
'D': np.array(
[1., 2., 3., 4.], dtype='float64')})
result = df.interpolate(downcast=None)
assert_frame_equal(result, expected)
# all good
result = df[['B', 'D']].interpolate(downcast=None)
assert_frame_equal(result, df[['B', 'D']])
@@ -1,283 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import pytest
from pandas.compat import range, lrange
import numpy as np
from pandas.compat import PY36
from pandas import DataFrame, Series, Index, MultiIndex
from pandas.util.testing import assert_frame_equal
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
# Column add, remove, delete.
class TestDataFrameMutateColumns(TestData):
def test_assign(self):
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
original = df.copy()
result = df.assign(C=df.B / df.A)
expected = df.copy()
expected['C'] = [4, 2.5, 2]
assert_frame_equal(result, expected)
# lambda syntax
result = df.assign(C=lambda x: x.B / x.A)
assert_frame_equal(result, expected)
# original is unmodified
assert_frame_equal(df, original)
# Non-Series array-like
result = df.assign(C=[4, 2.5, 2])
assert_frame_equal(result, expected)
# original is unmodified
assert_frame_equal(df, original)
result = df.assign(B=df.B / df.A)
expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
assert_frame_equal(result, expected)
# overwrite
result = df.assign(A=df.A + df.B)
expected = df.copy()
expected['A'] = [5, 7, 9]
assert_frame_equal(result, expected)
# lambda
result = df.assign(A=lambda x: x.A + x.B)
assert_frame_equal(result, expected)
def test_assign_multiple(self):
df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B'])
result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5],
[3, 6, 9, 3, 6]], columns=list('ABCDE'))
assert_frame_equal(result, expected)
def test_assign_order(self):
# GH 9818
df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
result = df.assign(D=df.A + df.B, C=df.A - df.B)
if PY36:
expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]],
columns=list('ABDC'))
else:
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
columns=list('ABCD'))
assert_frame_equal(result, expected)
result = df.assign(C=df.A - df.B, D=df.A + df.B)
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
columns=list('ABCD'))
assert_frame_equal(result, expected)
def test_assign_bad(self):
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
# non-keyword argument
with pytest.raises(TypeError):
df.assign(lambda x: x.A)
with pytest.raises(AttributeError):
df.assign(C=df.A, D=df.A + df.C)
@pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python
3.6 and above""")
def test_assign_dependent_old_python(self):
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
# Key C does not exist at definition time of df
with pytest.raises(KeyError):
df.assign(C=lambda df: df.A,
D=lambda df: df['A'] + df['C'])
with pytest.raises(KeyError):
df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
@pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for
python 3.5 and below""")
def test_assign_dependent(self):
df = DataFrame({'A': [1, 2], 'B': [3, 4]})
result = df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
columns=list('ABCD'))
assert_frame_equal(result, expected)
result = df.assign(C=lambda df: df.A,
D=lambda df: df['A'] + df['C'])
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
columns=list('ABCD'))
assert_frame_equal(result, expected)
def test_insert_error_msmgs(self):
# GH 7432
df = DataFrame({'foo': ['a', 'b', 'c'], 'bar': [
1, 2, 3], 'baz': ['d', 'e', 'f']}).set_index('foo')
s = DataFrame({'foo': ['a', 'b', 'c', 'a'], 'fiz': [
'g', 'h', 'i', 'j']}).set_index('foo')
msg = 'cannot reindex from a duplicate axis'
with tm.assert_raises_regex(ValueError, msg):
df['newcol'] = s
# GH 4107, more descriptive error message
df = DataFrame(np.random.randint(0, 2, (4, 4)),
columns=['a', 'b', 'c', 'd'])
msg = 'incompatible index of inserted column with frame index'
with tm.assert_raises_regex(TypeError, msg):
df['gr'] = df.groupby(['b', 'c']).count()
def test_insert_benchmark(self):
# from the vb_suite/frame_methods/frame_insert_columns
N = 10
K = 5
df = DataFrame(index=lrange(N))
new_col = np.random.randn(N)
for i in range(K):
df[i] = new_col
expected = DataFrame(np.repeat(new_col, K).reshape(N, K),
index=lrange(N))
assert_frame_equal(df, expected)
def test_insert(self):
df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
columns=['c', 'b', 'a'])
df.insert(0, 'foo', df['a'])
tm.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a']))
tm.assert_series_equal(df['a'], df['foo'], check_names=False)
df.insert(2, 'bar', df['c'])
tm.assert_index_equal(df.columns,
Index(['foo', 'c', 'bar', 'b', 'a']))
tm.assert_almost_equal(df['c'], df['bar'], check_names=False)
# diff dtype
# new item
df['x'] = df['a'].astype('float32')
result = Series(dict(float32=1, float64=5))
assert (df.get_dtype_counts().sort_index() == result).all()
# replacing current (in different block)
df['a'] = df['a'].astype('float32')
result = Series(dict(float32=2, float64=4))
assert (df.get_dtype_counts().sort_index() == result).all()
df['y'] = df['a'].astype('int32')
result = Series(dict(float32=2, float64=4, int32=1))
assert (df.get_dtype_counts().sort_index() == result).all()
with tm.assert_raises_regex(ValueError, 'already exists'):
df.insert(1, 'a', df['b'])
pytest.raises(ValueError, df.insert, 1, 'c', df['b'])
df.columns.name = 'some_name'
# preserve columns name field
df.insert(0, 'baz', df['c'])
assert df.columns.name == 'some_name'
# GH 13522
df = DataFrame(index=['A', 'B', 'C'])
df['X'] = df.index
df['X'] = ['x', 'y', 'z']
exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
assert_frame_equal(df, exp)
def test_delitem(self):
del self.frame['A']
assert 'A' not in self.frame
def test_delitem_multiindex(self):
midx = MultiIndex.from_product([['A', 'B'], [1, 2]])
df = DataFrame(np.random.randn(4, 4), columns=midx)
assert len(df.columns) == 4
assert ('A', ) in df.columns
assert 'A' in df.columns
result = df['A']
assert isinstance(result, DataFrame)
del df['A']
assert len(df.columns) == 2
# A still in the levels, BUT get a KeyError if trying
# to delete
assert ('A', ) not in df.columns
with pytest.raises(KeyError):
del df[('A',)]
# behavior of dropped/deleted MultiIndex levels changed from
# GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
# levels which are dropped/deleted
assert 'A' not in df.columns
with pytest.raises(KeyError):
del df['A']
def test_pop(self):
self.frame.columns.name = 'baz'
self.frame.pop('A')
assert 'A' not in self.frame
self.frame['foo'] = 'bar'
self.frame.pop('foo')
assert 'foo' not in self.frame
# TODO assert self.frame.columns.name == 'baz'
# gh-10912: inplace ops cause caching issue
a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[
'A', 'B', 'C'], index=['X', 'Y'])
b = a.pop('B')
b += 1
# original frame
expected = DataFrame([[1, 3], [4, 6]], columns=[
'A', 'C'], index=['X', 'Y'])
tm.assert_frame_equal(a, expected)
# result
expected = Series([2, 5], index=['X', 'Y'], name='B') + 1
tm.assert_series_equal(b, expected)
def test_pop_non_unique_cols(self):
df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
df.columns = ["a", "b", "a"]
res = df.pop("a")
assert type(res) == DataFrame
assert len(res) == 2
assert len(df.columns) == 1
assert "b" in df.columns
assert "a" not in df.columns
assert len(df.index) == 2
def test_insert_column_bug_4032(self):
# GH4032, inserting a column and renaming causing errors
df = DataFrame({'b': [1.1, 2.2]})
df = df.rename(columns={})
df.insert(0, 'a', [1, 2])
result = df.rename(columns={})
str(result)
expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
assert_frame_equal(result, expected)
df.insert(0, 'c', [1.3, 2.3])
result = df.rename(columns={})
str(result)
expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
columns=['c', 'a', 'b'])
assert_frame_equal(result, expected)
@@ -1,478 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import pytest
import numpy as np
from pandas.compat import lrange, u
from pandas import DataFrame, Series, MultiIndex, date_range
import pandas as pd
from pandas.util.testing import assert_series_equal, assert_frame_equal
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
class TestDataFrameNonuniqueIndexes(TestData):
def test_column_dups_operations(self):
def check(result, expected=None):
if expected is not None:
assert_frame_equal(result, expected)
result.dtypes
str(result)
# assignment
# GH 3687
arr = np.random.randn(3, 2)
idx = lrange(2)
df = DataFrame(arr, columns=['A', 'A'])
df.columns = idx
expected = DataFrame(arr, columns=idx)
check(df, expected)
idx = date_range('20130101', periods=4, freq='Q-NOV')
df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
columns=['a', 'a', 'a', 'a'])
df.columns = idx
expected = DataFrame(
[[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
check(df, expected)
# insert
df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
columns=['foo', 'bar', 'foo', 'hello'])
df['string'] = 'bah'
expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
[2, 1, 3, 5, 'bah']],
columns=['foo', 'bar', 'foo', 'hello', 'string'])
check(df, expected)
with tm.assert_raises_regex(ValueError, 'Length of value'):
df.insert(0, 'AnotherColumn', range(len(df.index) - 1))
# insert same dtype
df['foo2'] = 3
expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
[2, 1, 3, 5, 'bah', 3]],
columns=['foo', 'bar', 'foo', 'hello',
'string', 'foo2'])
check(df, expected)
# set (non-dup)
df['foo2'] = 4
expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
[2, 1, 3, 5, 'bah', 4]],
columns=['foo', 'bar', 'foo', 'hello',
'string', 'foo2'])
check(df, expected)
df['foo2'] = 3
# delete (non dup)
del df['bar']
expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
[2, 3, 5, 'bah', 3]],
columns=['foo', 'foo', 'hello', 'string', 'foo2'])
check(df, expected)
# try to delete again (its not consolidated)
del df['hello']
expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
[2, 3, 'bah', 3]],
columns=['foo', 'foo', 'string', 'foo2'])
check(df, expected)
# consolidate
df = df._consolidate()
expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
[2, 3, 'bah', 3]],
columns=['foo', 'foo', 'string', 'foo2'])
check(df, expected)
# insert
df.insert(2, 'new_col', 5.)
expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
[2, 3, 5., 'bah', 3]],
columns=['foo', 'foo', 'new_col', 'string',
'foo2'])
check(df, expected)
# insert a dup
tm.assert_raises_regex(ValueError, 'cannot insert',
df.insert, 2, 'new_col', 4.)
df.insert(2, 'new_col', 4., allow_duplicates=True)
expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
[1, 2, 4., 5., 'bah', 3],
[2, 3, 4., 5., 'bah', 3]],
columns=['foo', 'foo', 'new_col',
'new_col', 'string', 'foo2'])
check(df, expected)
# delete (dup)
del df['foo']
expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
[4., 5., 'bah', 3]],
columns=['new_col', 'new_col', 'string', 'foo2'])
assert_frame_equal(df, expected)
# dup across dtypes
df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
columns=['foo', 'bar', 'foo', 'hello'])
check(df)
df['foo2'] = 7.
expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
[2, 1, 3., 5, 7.]],
columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
check(df, expected)
result = df['foo']
expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
columns=['foo', 'foo'])
check(result, expected)
# multiple replacements
df['foo'] = 'string'
expected = DataFrame([['string', 1, 'string', 5, 7.],
['string', 1, 'string', 5, 7.],
['string', 1, 'string', 5, 7.]],
columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
check(df, expected)
del df['foo']
expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
'bar', 'hello', 'foo2'])
check(df, expected)
# values
df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
result = df.values
expected = np.array([[1, 2.5], [3, 4.5]])
assert (result == expected).all().all()
# rename, GH 4403
df4 = DataFrame(
{'RT': [0.0454],
'TClose': [22.02],
'TExg': [0.0422]},
index=MultiIndex.from_tuples([(600809, 20130331)],
names=['STK_ID', 'RPT_Date']))
df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331],
'STK_ID': [600809] * 3,
'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
'TClose': [38.05, 41.66, 30.01]},
index=MultiIndex.from_tuples(
[(600809, 20120930),
(600809, 20121231),
(600809, 20130331)],
names=['STK_ID', 'RPT_Date']))
k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
result = k.rename(
columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
str(result)
result.dtypes
expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
u('饡驦'), 30.01]],
columns=['RT', 'TClose', 'TExg',
'RPT_Date', 'STK_ID', 'STK_Name',
'QT_Close'])
.set_index(['STK_ID', 'RPT_Date'], drop=False))
assert_frame_equal(result, expected)
# reindex is invalid!
df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
columns=['bar', 'a', 'a'])
pytest.raises(ValueError, df.reindex, columns=['bar'])
pytest.raises(ValueError, df.reindex, columns=['bar', 'foo'])
# drop
df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
columns=['bar', 'a', 'a'])
result = df.drop(['a'], axis=1)
expected = DataFrame([[1], [1], [1]], columns=['bar'])
check(result, expected)
result = df.drop('a', axis=1)
check(result, expected)
# describe
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
columns=['bar', 'a', 'a'], dtype='float64')
result = df.describe()
s = df.iloc[:, 0].describe()
expected = pd.concat([s, s, s], keys=df.columns, axis=1)
check(result, expected)
# check column dups with index equal and not equal to df's index
df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
columns=['A', 'B', 'A'])
for index in [df.index, pd.Index(list('edcba'))]:
this_df = df.copy()
expected_ser = pd.Series(index.values, index=this_df.index)
expected_df = DataFrame({'A': expected_ser,
'B': this_df['B'],
'A': expected_ser},
columns=['A', 'B', 'A'])
this_df['A'] = index
check(this_df, expected_df)
# operations
for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
expected = getattr(df, op)(df)
expected.columns = ['A', 'A']
df.columns = ['A', 'A']
result = getattr(df, op)(df)
check(result, expected)
# multiple assignments that change dtypes
# the location indexer is a slice
# GH 6120
df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])
df['that'] = 1.0
check(df, expected)
df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
expected = DataFrame(1, index=range(5), columns=['that', 'that'])
df['that'] = 1
check(df, expected)
def test_column_dups2(self):
# drop buggy GH 6240
df = DataFrame({'A': np.random.randn(5),
'B': np.random.randn(5),
'C': np.random.randn(5),
'D': ['a', 'b', 'c', 'd', 'e']})
expected = df.take([0, 1, 1], axis=1)
df2 = df.take([2, 0, 1, 2, 1], axis=1)
result = df2.drop('C', axis=1)
assert_frame_equal(result, expected)
# dropna
df = DataFrame({'A': np.random.randn(5),
'B': np.random.randn(5),
'C': np.random.randn(5),
'D': ['a', 'b', 'c', 'd', 'e']})
df.iloc[2, [0, 1, 2]] = np.nan
df.iloc[0, 0] = np.nan
df.iloc[1, 1] = np.nan
df.iloc[:, 3] = np.nan
expected = df.dropna(subset=['A', 'B', 'C'], how='all')
expected.columns = ['A', 'A', 'B', 'C']
df.columns = ['A', 'A', 'B', 'C']
result = df.dropna(subset=['A', 'C'], how='all')
assert_frame_equal(result, expected)
def test_column_dups_indexing(self):
def check(result, expected=None):
if expected is not None:
assert_frame_equal(result, expected)
result.dtypes
str(result)
# boolean indexing
# GH 4879
dups = ['A', 'A', 'C', 'D']
df = DataFrame(np.arange(12).reshape(3, 4), columns=[
'A', 'B', 'C', 'D'], dtype='float64')
expected = df[df.C > 6]
expected.columns = dups
df = DataFrame(np.arange(12).reshape(3, 4),
columns=dups, dtype='float64')
result = df[df.C > 6]
check(result, expected)
# where
df = DataFrame(np.arange(12).reshape(3, 4), columns=[
'A', 'B', 'C', 'D'], dtype='float64')
expected = df[df > 6]
expected.columns = dups
df = DataFrame(np.arange(12).reshape(3, 4),
columns=dups, dtype='float64')
result = df[df > 6]
check(result, expected)
# boolean with the duplicate raises
df = DataFrame(np.arange(12).reshape(3, 4),
columns=dups, dtype='float64')
pytest.raises(ValueError, lambda: df[df.A > 6])
# dup aligining operations should work
# GH 5185
df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
result = df1.sub(df2)
assert_frame_equal(result, expected)
# equality
df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
columns=['A', 'B'])
df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
columns=['A', 'A'])
# not-comparing like-labelled
pytest.raises(ValueError, lambda: df1 == df2)
df1r = df1.reindex_like(df2)
result = df1r == df2
expected = DataFrame([[False, True], [True, False], [False, False], [
True, False]], columns=['A', 'A'])
assert_frame_equal(result, expected)
# mixed column selection
# GH 5639
dfbool = DataFrame({'one': Series([True, True, False],
index=['a', 'b', 'c']),
'two': Series([False, False, True, False],
index=['a', 'b', 'c', 'd']),
'three': Series([False, True, True, True],
index=['a', 'b', 'c', 'd'])})
expected = pd.concat(
[dfbool['one'], dfbool['three'], dfbool['one']], axis=1)
result = dfbool[['one', 'three', 'one']]
check(result, expected)
# multi-axis dups
# GH 6121
df = DataFrame(np.arange(25.).reshape(5, 5),
index=['a', 'b', 'c', 'd', 'e'],
columns=['A', 'B', 'C', 'D', 'E'])
z = df[['A', 'C', 'A']].copy()
expected = z.loc[['a', 'c', 'a']]
df = DataFrame(np.arange(25.).reshape(5, 5),
index=['a', 'b', 'c', 'd', 'e'],
columns=['A', 'B', 'C', 'D', 'E'])
z = df[['A', 'C', 'A']]
result = z.loc[['a', 'c', 'a']]
check(result, expected)
def test_column_dups_indexing2(self):
# GH 8363
# datetime ops with a non-unique index
df = DataFrame({'A': np.arange(5, dtype='int64'),
'B': np.arange(1, 6, dtype='int64')},
index=[2, 2, 3, 3, 4])
result = df.B - df.A
expected = Series(1, index=[2, 2, 3, 3, 4])
assert_series_equal(result, expected)
df = DataFrame({'A': date_range('20130101', periods=5),
'B': date_range('20130101 09:00:00', periods=5)},
index=[2, 2, 3, 3, 4])
result = df.B - df.A
expected = Series(pd.Timedelta('9 hours'), index=[2, 2, 3, 3, 4])
assert_series_equal(result, expected)
def test_columns_with_dups(self):
# GH 3468 related
# basic
df = DataFrame([[1, 2]], columns=['a', 'a'])
df.columns = ['a', 'a.1']
str(df)
expected = DataFrame([[1, 2]], columns=['a', 'a.1'])
assert_frame_equal(df, expected)
df = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a'])
df.columns = ['b', 'a', 'a.1']
str(df)
expected = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a.1'])
assert_frame_equal(df, expected)
# with a dup index
df = DataFrame([[1, 2]], columns=['a', 'a'])
df.columns = ['b', 'b']
str(df)
expected = DataFrame([[1, 2]], columns=['b', 'b'])
assert_frame_equal(df, expected)
# multi-dtype
df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
columns=['a', 'a', 'b', 'b', 'd', 'c', 'c'])
df.columns = list('ABCDEFG')
str(df)
expected = DataFrame(
[[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('ABCDEFG'))
assert_frame_equal(df, expected)
# this is an error because we cannot disambiguate the dup columns
pytest.raises(Exception, lambda x: DataFrame(
[[1, 2, 'foo', 'bar']], columns=['a', 'a', 'a', 'a']))
# dups across blocks
df_float = DataFrame(np.random.randn(10, 3), dtype='float64')
df_int = DataFrame(np.random.randn(10, 3), dtype='int64')
df_bool = DataFrame(True, index=df_float.index,
columns=df_float.columns)
df_object = DataFrame('foo', index=df_float.index,
columns=df_float.columns)
df_dt = DataFrame(pd.Timestamp('20010101'),
index=df_float.index,
columns=df_float.columns)
df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
assert len(df._data._blknos) == len(df.columns)
assert len(df._data._blklocs) == len(df.columns)
# testing iloc
for i in range(len(df.columns)):
df.iloc[:, i]
# dup columns across dtype GH 2079/2194
vals = [[1, -1, 2.], [2, -2, 3.]]
rs = DataFrame(vals, columns=['A', 'A', 'B'])
xp = DataFrame(vals)
xp.columns = ['A', 'A', 'B']
assert_frame_equal(rs, xp)
def test_values_duplicates(self):
df = DataFrame([[1, 2, 'a', 'b'],
[1, 2, 'a', 'b']],
columns=['one', 'one', 'two', 'two'])
result = df.values
expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']],
dtype=object)
tm.assert_numpy_array_equal(result, expected)
def test_set_value_by_index(self):
# See gh-12344
df = DataFrame(np.arange(9).reshape(3, 3).T)
df.columns = list('AAA')
expected = df.iloc[:, 2]
df.iloc[:, 0] = 3
assert_series_equal(df.iloc[:, 2], expected)
df = DataFrame(np.arange(9).reshape(3, 3).T)
df.columns = [2, float(2), str(2)]
expected = df.iloc[:, 1]
df.iloc[:, 0] = 3
assert_series_equal(df.iloc[:, 1], expected)
def test_insert_with_columns_dups(self):
# GH 14291
df = pd.DataFrame()
df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True)
df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True)
df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True)
exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'],
['c', 'f', 'i']], columns=['A', 'A', 'A'])
assert_frame_equal(df, exp)
File diff suppressed because it is too large Load Diff
@@ -1,140 +0,0 @@
import numpy as np
from numpy.random import randn
from datetime import timedelta
import pandas as pd
import pandas.util.testing as tm
from pandas import (PeriodIndex, period_range, DataFrame, date_range,
Index, to_datetime, DatetimeIndex)
def _permute(obj):
return obj.take(np.random.permutation(len(obj)))
class TestPeriodIndex(object):
def setup_method(self, method):
pass
def test_as_frame_columns(self):
rng = period_range('1/1/2000', periods=5)
df = DataFrame(randn(10, 5), columns=rng)
ts = df[rng[0]]
tm.assert_series_equal(ts, df.iloc[:, 0])
# GH # 1211
repr(df)
ts = df['1/1/2000']
tm.assert_series_equal(ts, df.iloc[:, 0])
def test_frame_setitem(self):
rng = period_range('1/1/2000', periods=5, name='index')
df = DataFrame(randn(5, 3), index=rng)
df['Index'] = rng
rs = Index(df['Index'])
tm.assert_index_equal(rs, rng, check_names=False)
assert rs.name == 'Index'
assert rng.name == 'index'
rs = df.reset_index().set_index('index')
assert isinstance(rs.index, PeriodIndex)
tm.assert_index_equal(rs.index, rng)
def test_frame_to_time_stamp(self):
K = 5
index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009')
df = DataFrame(randn(len(index), K), index=index)
df['mix'] = 'a'
exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC')
result = df.to_timestamp('D', 'end')
tm.assert_index_equal(result.index, exp_index)
tm.assert_numpy_array_equal(result.values, df.values)
exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN')
result = df.to_timestamp('D', 'start')
tm.assert_index_equal(result.index, exp_index)
def _get_with_delta(delta, freq='A-DEC'):
return date_range(to_datetime('1/1/2001') + delta,
to_datetime('12/31/2009') + delta, freq=freq)
delta = timedelta(hours=23)
result = df.to_timestamp('H', 'end')
exp_index = _get_with_delta(delta)
tm.assert_index_equal(result.index, exp_index)
delta = timedelta(hours=23, minutes=59)
result = df.to_timestamp('T', 'end')
exp_index = _get_with_delta(delta)
tm.assert_index_equal(result.index, exp_index)
result = df.to_timestamp('S', 'end')
delta = timedelta(hours=23, minutes=59, seconds=59)
exp_index = _get_with_delta(delta)
tm.assert_index_equal(result.index, exp_index)
# columns
df = df.T
exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC')
result = df.to_timestamp('D', 'end', axis=1)
tm.assert_index_equal(result.columns, exp_index)
tm.assert_numpy_array_equal(result.values, df.values)
exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN')
result = df.to_timestamp('D', 'start', axis=1)
tm.assert_index_equal(result.columns, exp_index)
delta = timedelta(hours=23)
result = df.to_timestamp('H', 'end', axis=1)
exp_index = _get_with_delta(delta)
tm.assert_index_equal(result.columns, exp_index)
delta = timedelta(hours=23, minutes=59)
result = df.to_timestamp('T', 'end', axis=1)
exp_index = _get_with_delta(delta)
tm.assert_index_equal(result.columns, exp_index)
result = df.to_timestamp('S', 'end', axis=1)
delta = timedelta(hours=23, minutes=59, seconds=59)
exp_index = _get_with_delta(delta)
tm.assert_index_equal(result.columns, exp_index)
# invalid axis
tm.assert_raises_regex(
ValueError, 'axis', df.to_timestamp, axis=2)
result1 = df.to_timestamp('5t', axis=1)
result2 = df.to_timestamp('t', axis=1)
expected = pd.date_range('2001-01-01', '2009-01-01', freq='AS')
assert isinstance(result1.columns, DatetimeIndex)
assert isinstance(result2.columns, DatetimeIndex)
tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8)
tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8)
# PeriodIndex.to_timestamp always use 'infer'
assert result1.columns.freqstr == 'AS-JAN'
assert result2.columns.freqstr == 'AS-JAN'
def test_frame_index_to_string(self):
index = PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M')
frame = DataFrame(np.random.randn(3, 4), index=index)
# it works!
frame.to_string()
def test_align_frame(self):
rng = period_range('1/1/2000', '1/1/2010', freq='A')
ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
result = ts + ts[::2]
expected = ts + ts
expected.values[1::2] = np.nan
tm.assert_frame_equal(result, expected)
result = ts + _permute(ts[::2])
tm.assert_frame_equal(result, expected)
@@ -1,393 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import pytest
import numpy as np
from pandas import (DataFrame, Series, Timestamp, _np_version_under1p11)
import pandas as pd
from pandas.util.testing import assert_series_equal, assert_frame_equal
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
class TestDataFrameQuantile(TestData):
def test_quantile(self):
from numpy import percentile
q = self.tsframe.quantile(0.1, axis=0)
assert q['A'] == percentile(self.tsframe['A'], 10)
tm.assert_index_equal(q.index, self.tsframe.columns)
q = self.tsframe.quantile(0.9, axis=1)
assert (q['2000-01-17'] ==
percentile(self.tsframe.loc['2000-01-17'], 90))
tm.assert_index_equal(q.index, self.tsframe.index)
# test degenerate case
q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
assert(np.isnan(q['x']) and np.isnan(q['y']))
# non-numeric exclusion
df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
rs = df.quantile(0.5)
xp = df.median().rename(0.5)
assert_series_equal(rs, xp)
# axis
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(.5, axis=1)
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
assert_series_equal(result, expected)
result = df.quantile([.5, .75], axis=1)
expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75],
3: [3.5, 3.75]}, index=[0.5, 0.75])
assert_frame_equal(result, expected, check_index_type=True)
# We may want to break API in the future to change this
# so that we exclude non-numeric along the same axis
# See GH #7312
df = DataFrame([[1, 2, 3],
['a', 'b', 4]])
result = df.quantile(.5, axis=1)
expected = Series([3., 4.], index=[0, 1], name=0.5)
assert_series_equal(result, expected)
def test_quantile_axis_mixed(self):
# mixed on axis=1
df = DataFrame({"A": [1, 2, 3],
"B": [2., 3., 4.],
"C": pd.date_range('20130101', periods=3),
"D": ['foo', 'bar', 'baz']})
result = df.quantile(.5, axis=1)
expected = Series([1.5, 2.5, 3.5], name=0.5)
assert_series_equal(result, expected)
# must raise
def f():
df.quantile(.5, axis=1, numeric_only=False)
pytest.raises(TypeError, f)
def test_quantile_axis_parameter(self):
# GH 9543/9544
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(.5, axis=0)
expected = Series([2., 3.], index=["A", "B"], name=0.5)
assert_series_equal(result, expected)
expected = df.quantile(.5, axis="index")
assert_series_equal(result, expected)
result = df.quantile(.5, axis=1)
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
assert_series_equal(result, expected)
result = df.quantile(.5, axis="columns")
assert_series_equal(result, expected)
pytest.raises(ValueError, df.quantile, 0.1, axis=-1)
pytest.raises(ValueError, df.quantile, 0.1, axis="column")
def test_quantile_interpolation(self):
# see gh-10174
from numpy import percentile
# interpolation = linear (default case)
q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
assert q['A'] == percentile(self.tsframe['A'], 10)
q = self.intframe.quantile(0.1)
assert q['A'] == percentile(self.intframe['A'], 10)
# test with and without interpolation keyword
q1 = self.intframe.quantile(0.1)
assert q1['A'] == np.percentile(self.intframe['A'], 10)
tm.assert_series_equal(q, q1)
# interpolation method other than default linear
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(.5, axis=1, interpolation='nearest')
expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
tm.assert_series_equal(result, expected)
# cross-check interpolation=nearest results in original dtype
exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5,
axis=0, interpolation='nearest')
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64')
tm.assert_series_equal(result, expected)
# float
df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3])
result = df.quantile(.5, axis=1, interpolation='nearest')
expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5)
tm.assert_series_equal(result, expected)
exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5,
axis=0, interpolation='nearest')
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64')
assert_series_equal(result, expected)
# axis
result = df.quantile([.5, .75], axis=1, interpolation='lower')
expected = DataFrame({1: [1., 1.], 2: [2., 2.],
3: [3., 3.]}, index=[0.5, 0.75])
assert_frame_equal(result, expected)
# test degenerate case
df = DataFrame({'x': [], 'y': []})
q = df.quantile(0.1, axis=0, interpolation='higher')
assert(np.isnan(q['x']) and np.isnan(q['y']))
# multi
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
columns=['a', 'b', 'c'])
result = df.quantile([.25, .5], interpolation='midpoint')
# https://github.com/numpy/numpy/issues/7163
if _np_version_under1p11:
expected = DataFrame([[1.5, 1.5, 1.5], [2.5, 2.5, 2.5]],
index=[.25, .5], columns=['a', 'b', 'c'])
else:
expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
index=[.25, .5], columns=['a', 'b', 'c'])
assert_frame_equal(result, expected)
def test_quantile_multi(self):
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
columns=['a', 'b', 'c'])
result = df.quantile([.25, .5])
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
index=[.25, .5], columns=['a', 'b', 'c'])
assert_frame_equal(result, expected)
# axis = 1
result = df.quantile([.25, .5], axis=1)
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
index=[.25, .5], columns=[0, 1, 2])
# empty
result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]},
index=[.1, .9])
assert_frame_equal(result, expected)
def test_quantile_datetime(self):
df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]})
# exclude datetime
result = df.quantile(.5)
expected = Series([2.5], index=['b'])
# datetime
result = df.quantile(.5, numeric_only=False)
expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5],
index=['a', 'b'],
name=0.5)
assert_series_equal(result, expected)
# datetime w/ multi
result = df.quantile([.5], numeric_only=False)
expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]],
index=[.5], columns=['a', 'b'])
assert_frame_equal(result, expected)
# axis = 1
df['c'] = pd.to_datetime(['2011', '2012'])
result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False)
expected = Series([Timestamp('2010-07-02 12:00:00'),
Timestamp('2011-07-02 12:00:00')],
index=[0, 1],
name=0.5)
assert_series_equal(result, expected)
result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False)
expected = DataFrame([[Timestamp('2010-07-02 12:00:00'),
Timestamp('2011-07-02 12:00:00')]],
index=[0.5], columns=[0, 1])
assert_frame_equal(result, expected)
# empty when numeric_only=True
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# result = df[['a', 'c']].quantile(.5)
# result = df[['a', 'c']].quantile([.5])
def test_quantile_invalid(self):
msg = 'percentiles should all be in the interval \\[0, 1\\]'
for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
with tm.assert_raises_regex(ValueError, msg):
self.tsframe.quantile(invalid)
def test_quantile_box(self):
df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03')],
'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-03', tz='US/Eastern')],
'C': [pd.Timedelta('1 days'),
pd.Timedelta('2 days'),
pd.Timedelta('3 days')]})
res = df.quantile(0.5, numeric_only=False)
exp = pd.Series([pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timedelta('2 days')],
name=0.5, index=['A', 'B', 'C'])
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timedelta('2 days')]],
index=[0.5], columns=['A', 'B', 'C'])
tm.assert_frame_equal(res, exp)
# DatetimeBlock may be consolidated and contain NaT in different loc
df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
pd.NaT,
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03')],
'a': [pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.NaT,
pd.Timestamp('2011-01-03')],
'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
pd.NaT,
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-03', tz='US/Eastern')],
'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.NaT,
pd.Timestamp('2011-01-03', tz='US/Eastern')],
'C': [pd.Timedelta('1 days'),
pd.Timedelta('2 days'),
pd.Timedelta('3 days'),
pd.NaT],
'c': [pd.NaT,
pd.Timedelta('1 days'),
pd.Timedelta('2 days'),
pd.Timedelta('3 days')]},
columns=list('AaBbCc'))
res = df.quantile(0.5, numeric_only=False)
exp = pd.Series([pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timedelta('2 days'),
pd.Timedelta('2 days')],
name=0.5, index=list('AaBbCc'))
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-02', tz='US/Eastern'),
pd.Timedelta('2 days'),
pd.Timedelta('2 days')]],
index=[0.5], columns=list('AaBbCc'))
tm.assert_frame_equal(res, exp)
def test_quantile_nan(self):
# GH 14357 - float block where some cols have missing values
df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
df.iloc[-1, 1] = np.nan
res = df.quantile(0.5)
exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5, 0.75])
exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)
res = df.quantile(0.5, axis=1)
exp = Series(np.arange(1.0, 6.0), name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5, 0.75], axis=1)
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)
# full-nan column
df['b'] = np.nan
res = df.quantile(0.5)
exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5, 0.75])
exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]},
index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)
def test_quantile_nat(self):
# full NaT column
df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]})
res = df.quantile(0.5, numeric_only=False)
exp = Series([pd.NaT], index=['a'], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = DataFrame({'a': [pd.NaT]}, index=[0.5])
tm.assert_frame_equal(res, exp)
# mixed non-null / full null column
df = DataFrame({'a': [pd.Timestamp('2012-01-01'),
pd.Timestamp('2012-01-02'),
pd.Timestamp('2012-01-03')],
'b': [pd.NaT, pd.NaT, pd.NaT]})
res = df.quantile(0.5, numeric_only=False)
exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'],
name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5],
columns=['a', 'b'])
tm.assert_frame_equal(res, exp)
def test_quantile_empty(self):
# floats
df = DataFrame(columns=['a', 'b'], dtype='float64')
res = df.quantile(0.5)
exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5])
exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5])
tm.assert_frame_equal(res, exp)
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# res = df.quantile(0.5, axis=1)
# res = df.quantile([0.5], axis=1)
# ints
df = DataFrame(columns=['a', 'b'], dtype='int64')
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# res = df.quantile(0.5)
# datetimes
df = DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
# FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
# res = df.quantile(0.5, numeric_only=False)
File diff suppressed because it is too large Load Diff
@@ -1,299 +0,0 @@
# -*- coding: utf-8 -*-
import pytest
import numpy as np
import pandas.util.testing as tm
from distutils.version import LooseVersion
from datetime import timedelta, datetime
from numpy import nan
from pandas.util.testing import assert_frame_equal
from pandas.tests.frame.common import TestData
from pandas import Series, DataFrame
from pandas.compat import product
class TestRank(TestData):
s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])
df = DataFrame({'A': s, 'B': s})
results = {
'average': np.array([1.5, 5.5, 7.0, 3.5, nan,
3.5, 1.5, 8.0, nan, 5.5]),
'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]),
'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]),
}
def test_rank(self):
rankdata = pytest.importorskip('scipy.stats.rankdata')
self.frame['A'][::2] = np.nan
self.frame['B'][::3] = np.nan
self.frame['C'][::4] = np.nan
self.frame['D'][::5] = np.nan
ranks0 = self.frame.rank()
ranks1 = self.frame.rank(1)
mask = np.isnan(self.frame.values)
fvals = self.frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fvals)
exp0[mask] = np.nan
exp1 = np.apply_along_axis(rankdata, 1, fvals)
exp1[mask] = np.nan
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# integers
df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
result = df.rank()
exp = df.astype(float).rank()
tm.assert_frame_equal(result, exp)
result = df.rank(1)
exp = df.astype(float).rank(1)
tm.assert_frame_equal(result, exp)
def test_rank2(self):
df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
result = df.rank(1, pct=True)
tm.assert_frame_equal(result, expected)
df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = df.rank(0) / 2.0
result = df.rank(0, pct=True)
tm.assert_frame_equal(result, expected)
df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
# f7u12, this does not work without extensive workaround
data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
[datetime(2000, 1, 2), datetime(2000, 1, 3),
datetime(2000, 1, 1)]]
df = DataFrame(data)
# check the rank
expected = DataFrame([[2., nan, 1.],
[2., 3., 1.]])
result = df.rank(1, numeric_only=False, ascending=True)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[1., nan, 2.],
[2., 1., 3.]])
result = df.rank(1, numeric_only=False, ascending=False)
tm.assert_frame_equal(result, expected)
# mixed-type frames
self.mixed_frame['datetime'] = datetime.now()
self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)
result = self.mixed_frame.rank(1)
expected = self.mixed_frame.rank(1, numeric_only=True)
tm.assert_frame_equal(result, expected)
df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
1e60, 1e80, 1e-30]})
exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]})
tm.assert_frame_equal(df.rank(), exp)
def test_rank_na_option(self):
rankdata = pytest.importorskip('scipy.stats.rankdata')
self.frame['A'][::2] = np.nan
self.frame['B'][::3] = np.nan
self.frame['C'][::4] = np.nan
self.frame['D'][::5] = np.nan
# bottom
ranks0 = self.frame.rank(na_option='bottom')
ranks1 = self.frame.rank(1, na_option='bottom')
fvals = self.frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fvals)
exp1 = np.apply_along_axis(rankdata, 1, fvals)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# top
ranks0 = self.frame.rank(na_option='top')
ranks1 = self.frame.rank(1, na_option='top')
fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
fval1 = self.frame.T
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
fval1 = fval1.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fval0)
exp1 = np.apply_along_axis(rankdata, 1, fval1)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# descending
# bottom
ranks0 = self.frame.rank(na_option='top', ascending=False)
ranks1 = self.frame.rank(1, na_option='top', ascending=False)
fvals = self.frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, -fvals)
exp1 = np.apply_along_axis(rankdata, 1, -fvals)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# descending
# top
ranks0 = self.frame.rank(na_option='bottom', ascending=False)
ranks1 = self.frame.rank(1, na_option='bottom', ascending=False)
fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
fval1 = self.frame.T
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
fval1 = fval1.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, -fval0)
exp1 = np.apply_along_axis(rankdata, 1, -fval1)
tm.assert_numpy_array_equal(ranks0.values, exp0)
tm.assert_numpy_array_equal(ranks1.values, exp1)
def test_rank_axis(self):
# check if using axes' names gives the same result
df = DataFrame([[2, 1], [4, 3]])
tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index'))
tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns'))
def test_rank_methods_frame(self):
pytest.importorskip('scipy.stats.special')
rankdata = pytest.importorskip('scipy.stats.rankdata')
import scipy
xs = np.random.randint(0, 21, (100, 26))
xs = (xs - 10.0) / 10.0
cols = [chr(ord('z') - i) for i in range(xs.shape[1])]
for vals in [xs, xs + 1e6, xs * 1e-6]:
df = DataFrame(vals, columns=cols)
for ax in [0, 1]:
for m in ['average', 'min', 'max', 'first', 'dense']:
result = df.rank(axis=ax, method=m)
sprank = np.apply_along_axis(
rankdata, ax, vals,
m if m != 'first' else 'ordinal')
sprank = sprank.astype(np.float64)
expected = DataFrame(sprank, columns=cols)
if (LooseVersion(scipy.__version__) >=
LooseVersion('0.17.0')):
expected = expected.astype('float64')
tm.assert_frame_equal(result, expected)
def test_rank_descending(self):
dtypes = ['O', 'f8', 'i8']
for dtype, method in product(dtypes, self.results):
if 'i' in dtype:
df = self.df.dropna()
else:
df = self.df.astype(dtype)
res = df.rank(ascending=False)
expected = (df.max() - df).rank()
assert_frame_equal(res, expected)
if method == 'first' and dtype == 'O':
continue
expected = (df.max() - df).rank(method=method)
if dtype != 'O':
res2 = df.rank(method=method, ascending=False,
numeric_only=True)
assert_frame_equal(res2, expected)
res3 = df.rank(method=method, ascending=False,
numeric_only=False)
assert_frame_equal(res3, expected)
def test_rank_2d_tie_methods(self):
df = self.df
def _check2d(df, expected, method='average', axis=0):
exp_df = DataFrame({'A': expected, 'B': expected})
if axis == 1:
df = df.T
exp_df = exp_df.T
result = df.rank(method=method, axis=axis)
assert_frame_equal(result, exp_df)
dtypes = [None, object]
disabled = set([(object, 'first')])
results = self.results
for method, axis, dtype in product(results, [0, 1], dtypes):
if (dtype, method) in disabled:
continue
frame = df if dtype is None else df.astype(dtype)
_check2d(frame, results[method], method=method, axis=axis)
@pytest.mark.parametrize(
"method,exp", [("dense",
[[1., 1., 1.],
[1., 0.5, 2. / 3],
[1., 0.5, 1. / 3]]),
("min",
[[1. / 3, 1., 1.],
[1. / 3, 1. / 3, 2. / 3],
[1. / 3, 1. / 3, 1. / 3]]),
("max",
[[1., 1., 1.],
[1., 2. / 3, 2. / 3],
[1., 2. / 3, 1. / 3]]),
("average",
[[2. / 3, 1., 1.],
[2. / 3, 0.5, 2. / 3],
[2. / 3, 0.5, 1. / 3]]),
("first",
[[1. / 3, 1., 1.],
[2. / 3, 1. / 3, 2. / 3],
[3. / 3, 2. / 3, 1. / 3]])])
def test_rank_pct_true(method, exp):
# see gh-15630.
df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
result = df.rank(method=method, pct=True)
expected = DataFrame(exp)
tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large Load Diff
@@ -1,525 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import datetime, timedelta
import re
import sys
import textwrap
from numpy import nan
import numpy as np
import pytest
from pandas import (DataFrame, Series, compat, option_context,
date_range, period_range, Categorical)
from pandas.compat import StringIO, lrange, u, PYPY
import pandas.io.formats.format as fmt
import pandas as pd
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
# Segregated collection of methods that require the BlockManager internal data
# structure
class TestDataFrameReprInfoEtc(TestData):
def test_repr_empty(self):
# empty
foo = repr(self.empty) # noqa
# empty with index
frame = DataFrame(index=np.arange(1000))
foo = repr(frame) # noqa
def test_repr_mixed(self):
buf = StringIO()
# mixed
foo = repr(self.mixed_frame) # noqa
self.mixed_frame.info(verbose=False, buf=buf)
@pytest.mark.slow
def test_repr_mixed_big(self):
# big mixed
biggie = DataFrame({'A': np.random.randn(200),
'B': tm.makeStringIndex(200)},
index=lrange(200))
biggie.loc[:20, 'A'] = nan
biggie.loc[:20, 'B'] = nan
foo = repr(biggie) # noqa
def test_repr(self):
buf = StringIO()
# small one
foo = repr(self.frame)
self.frame.info(verbose=False, buf=buf)
# even smaller
self.frame.reindex(columns=['A']).info(verbose=False, buf=buf)
self.frame.reindex(columns=['A', 'B']).info(verbose=False, buf=buf)
# exhausting cases in DataFrame.info
# columns but no index
no_index = DataFrame(columns=[0, 1, 3])
foo = repr(no_index) # noqa
# no columns or index
self.empty.info(buf=buf)
df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
assert "\t" not in repr(df)
assert "\r" not in repr(df)
assert "a\n" not in repr(df)
def test_repr_dimensions(self):
df = DataFrame([[1, 2, ], [3, 4]])
with option_context('display.show_dimensions', True):
assert "2 rows x 2 columns" in repr(df)
with option_context('display.show_dimensions', False):
assert "2 rows x 2 columns" not in repr(df)
with option_context('display.show_dimensions', 'truncate'):
assert "2 rows x 2 columns" not in repr(df)
@pytest.mark.slow
def test_repr_big(self):
# big one
biggie = DataFrame(np.zeros((200, 4)), columns=lrange(4),
index=lrange(200))
repr(biggie)
def test_repr_unsortable(self):
# columns are not sortable
import warnings
warn_filters = warnings.filters
warnings.filterwarnings('ignore',
category=FutureWarning,
module=".*format")
unsortable = DataFrame({'foo': [1] * 50,
datetime.today(): [1] * 50,
'bar': ['bar'] * 50,
datetime.today() + timedelta(1): ['bar'] * 50},
index=np.arange(50))
repr(unsortable)
fmt.set_option('display.precision', 3, 'display.column_space', 10)
repr(self.frame)
fmt.set_option('display.max_rows', 10, 'display.max_columns', 2)
repr(self.frame)
fmt.set_option('display.max_rows', 1000, 'display.max_columns', 1000)
repr(self.frame)
tm.reset_display_options()
warnings.filters = warn_filters
def test_repr_unicode(self):
uval = u('\u03c3\u03c3\u03c3\u03c3')
# TODO(wesm): is this supposed to be used?
bval = uval.encode('utf-8') # noqa
df = DataFrame({'A': [uval, uval]})
result = repr(df)
ex_top = ' A'
assert result.split('\n')[0].rstrip() == ex_top
df = DataFrame({'A': [uval, uval]})
result = repr(df)
assert result.split('\n')[0].rstrip() == ex_top
def test_unicode_string_with_unicode(self):
df = DataFrame({'A': [u("\u05d0")]})
if compat.PY3:
str(df)
else:
compat.text_type(df)
def test_bytestring_with_unicode(self):
df = DataFrame({'A': [u("\u05d0")]})
if compat.PY3:
bytes(df)
else:
str(df)
def test_very_wide_info_repr(self):
df = DataFrame(np.random.randn(10, 20),
columns=tm.rands_array(10, 20))
repr(df)
def test_repr_column_name_unicode_truncation_bug(self):
# #1906
df = DataFrame({'Id': [7117434],
'StringCol': ('Is it possible to modify drop plot code'
' so that the output graph is displayed '
'in iphone simulator, Is it possible to '
'modify drop plot code so that the '
'output graph is \xe2\x80\xa8displayed '
'in iphone simulator.Now we are adding '
'the CSV file externally. I want to Call'
' the File through the code..')})
with option_context('display.max_columns', 20):
assert 'StringCol' in repr(df)
def test_latex_repr(self):
result = r"""\begin{tabular}{llll}
\toprule
{} & 0 & 1 & 2 \\
\midrule
0 & $\alpha$ & b & c \\
1 & 1 & 2 & 3 \\
\bottomrule
\end{tabular}
"""
with option_context("display.latex.escape", False,
'display.latex.repr', True):
df = DataFrame([[r'$\alpha$', 'b', 'c'], [1, 2, 3]])
assert result == df._repr_latex_()
# GH 12182
assert df._repr_latex_() is None
@tm.capture_stdout
def test_info(self):
io = StringIO()
self.frame.info(buf=io)
self.tsframe.info(buf=io)
frame = DataFrame(np.random.randn(5, 3))
frame.info()
frame.info(verbose=False)
def test_info_memory(self):
# https://github.com/pandas-dev/pandas/issues/21056
df = pd.DataFrame({'a': pd.Series([1, 2], dtype='i8')})
buf = StringIO()
df.info(buf=buf)
result = buf.getvalue()
bytes = float(df.memory_usage().sum())
expected = textwrap.dedent("""\
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
a 2 non-null int64
dtypes: int64(1)
memory usage: {} bytes
""".format(bytes))
assert result == expected
def test_info_wide(self):
from pandas import set_option, reset_option
io = StringIO()
df = DataFrame(np.random.randn(5, 101))
df.info(buf=io)
io = StringIO()
df.info(buf=io, max_cols=101)
rs = io.getvalue()
assert len(rs.splitlines()) > 100
xp = rs
set_option('display.max_info_columns', 101)
io = StringIO()
df.info(buf=io)
assert rs == xp
reset_option('display.max_info_columns')
def test_info_duplicate_columns(self):
io = StringIO()
# it works!
frame = DataFrame(np.random.randn(1500, 4),
columns=['a', 'a', 'b', 'b'])
frame.info(buf=io)
def test_info_duplicate_columns_shows_correct_dtypes(self):
# GH11761
io = StringIO()
frame = DataFrame([[1, 2.0]],
columns=['a', 'a'])
frame.info(buf=io)
io.seek(0)
lines = io.readlines()
assert 'a 1 non-null int64\n' == lines[3]
assert 'a 1 non-null float64\n' == lines[4]
def test_info_shows_column_dtypes(self):
dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
'complex128', 'object', 'bool']
data = {}
n = 10
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
buf = StringIO()
df.info(buf=buf)
res = buf.getvalue()
for i, dtype in enumerate(dtypes):
name = '%d %d non-null %s' % (i, n, dtype)
assert name in res
def test_info_max_cols(self):
df = DataFrame(np.random.randn(10, 5))
for len_, verbose in [(5, None), (5, False), (10, True)]:
# For verbose always ^ setting ^ summarize ^ full output
with option_context('max_info_columns', 4):
buf = StringIO()
df.info(buf=buf, verbose=verbose)
res = buf.getvalue()
assert len(res.strip().split('\n')) == len_
for len_, verbose in [(10, None), (5, False), (10, True)]:
# max_cols no exceeded
with option_context('max_info_columns', 5):
buf = StringIO()
df.info(buf=buf, verbose=verbose)
res = buf.getvalue()
assert len(res.strip().split('\n')) == len_
for len_, max_cols in [(10, 5), (5, 4)]:
# setting truncates
with option_context('max_info_columns', 4):
buf = StringIO()
df.info(buf=buf, max_cols=max_cols)
res = buf.getvalue()
assert len(res.strip().split('\n')) == len_
# setting wouldn't truncate
with option_context('max_info_columns', 5):
buf = StringIO()
df.info(buf=buf, max_cols=max_cols)
res = buf.getvalue()
assert len(res.strip().split('\n')) == len_
def test_info_memory_usage(self):
# Ensure memory usage is displayed, when asserted, on the last line
dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
'complex128', 'object', 'bool']
data = {}
n = 10
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
buf = StringIO()
# display memory usage case
df.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert "memory usage: " in res[-1]
# do not display memory usage case
df.info(buf=buf, memory_usage=False)
res = buf.getvalue().splitlines()
assert "memory usage: " not in res[-1]
df.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
# memory usage is a lower bound, so print it as XYZ+ MB
assert re.match(r"memory usage: [^+]+\+", res[-1])
df.iloc[:, :5].info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
# excluded column with object dtype, so estimate is accurate
assert not re.match(r"memory usage: [^+]+\+", res[-1])
# Test a DataFrame with duplicate columns
dtypes = ['int64', 'int64', 'int64', 'float64']
data = {}
n = 100
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
df.columns = dtypes
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
df_with_object_index.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+\+", res[-1])
df_with_object_index.info(buf=buf, memory_usage='deep')
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+$", res[-1])
# Ensure df size is as expected
# (cols * rows * bytes) + index size
df_size = df.memory_usage().sum()
exp_size = len(dtypes) * n * 8 + df.index.nbytes
assert df_size == exp_size
# Ensure number of cols in memory_usage is the same as df
size_df = np.size(df.columns.values) + 1 # index=True; default
assert size_df == np.size(df.memory_usage())
# assert deep works only on object
assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()
# test for validity
DataFrame(1, index=['a'], columns=['A']
).memory_usage(index=True)
DataFrame(1, index=['a'], columns=['A']
).index.nbytes
df = DataFrame(
data=1,
index=pd.MultiIndex.from_product(
[['a'], range(1000)]),
columns=['A']
)
df.index.nbytes
df.memory_usage(index=True)
df.index.values.nbytes
mem = df.memory_usage(deep=True).sum()
assert mem > 0
@pytest.mark.skipif(PYPY,
reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy(self):
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
assert (df_with_object_index.memory_usage(
index=True, deep=True).sum() >
df_with_object_index.memory_usage(
index=True).sum())
df_object = pd.DataFrame({'a': ['a']})
assert (df_object.memory_usage(deep=True).sum() >
df_object.memory_usage().sum())
@pytest.mark.skipif(not PYPY,
reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy(self):
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
assert (df_with_object_index.memory_usage(
index=True, deep=True).sum() ==
df_with_object_index.memory_usage(
index=True).sum())
df_object = pd.DataFrame({'a': ['a']})
assert (df_object.memory_usage(deep=True).sum() ==
df_object.memory_usage().sum())
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof(self):
df = DataFrame(
data=1,
index=pd.MultiIndex.from_product(
[['a'], range(1000)]),
columns=['A']
)
mem = df.memory_usage(deep=True).sum()
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = mem - sys.getsizeof(df)
assert abs(diff) < 100
def test_info_memory_usage_qualified(self):
buf = StringIO()
df = DataFrame(1, columns=list('ab'),
index=[1, 2, 3])
df.info(buf=buf)
assert '+' not in buf.getvalue()
buf = StringIO()
df = DataFrame(1, columns=list('ab'),
index=list('ABC'))
df.info(buf=buf)
assert '+' in buf.getvalue()
buf = StringIO()
df = DataFrame(1, columns=list('ab'),
index=pd.MultiIndex.from_product(
[range(3), range(3)]))
df.info(buf=buf)
assert '+' not in buf.getvalue()
buf = StringIO()
df = DataFrame(1, columns=list('ab'),
index=pd.MultiIndex.from_product(
[range(3), ['foo', 'bar']]))
df.info(buf=buf)
assert '+' in buf.getvalue()
def test_info_memory_usage_bug_on_multiindex(self):
# GH 14308
# memory usage introspection should not materialize .values
from string import ascii_uppercase as uppercase
def memory_usage(f):
return f.memory_usage(deep=True).sum()
N = 100
M = len(uppercase)
index = pd.MultiIndex.from_product([list(uppercase),
pd.date_range('20160101',
periods=N)],
names=['id', 'date'])
df = DataFrame({'value': np.random.randn(N * M)}, index=index)
unstacked = df.unstack('id')
assert df.values.nbytes == unstacked.values.nbytes
assert memory_usage(df) > memory_usage(unstacked)
# high upper bound
assert memory_usage(unstacked) - memory_usage(df) < 2000
def test_info_categorical(self):
# GH14298
idx = pd.CategoricalIndex(['a', 'b'])
df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
buf = StringIO()
df.info(buf=buf)
def test_info_categorical_column(self):
# make sure it works
n = 2500
df = DataFrame({'int64': np.random.randint(100, size=n)})
df['category'] = Series(np.array(list('abcdefghij')).take(
np.random.randint(0, 10, size=n))).astype('category')
df.isna()
buf = StringIO()
df.info(buf=buf)
df2 = df[df['category'] == 'd']
buf = compat.StringIO()
df2.info(buf=buf)
def test_repr_categorical_dates_periods(self):
# normal DataFrame
dt = date_range('2011-01-01 09:00', freq='H', periods=5,
tz='US/Eastern')
p = period_range('2011-01', freq='M', periods=5)
df = DataFrame({'dt': dt, 'p': p})
exp = """ dt p
0 2011-01-01 09:00:00-05:00 2011-01
1 2011-01-01 10:00:00-05:00 2011-02
2 2011-01-01 11:00:00-05:00 2011-03
3 2011-01-01 12:00:00-05:00 2011-04
4 2011-01-01 13:00:00-05:00 2011-05"""
df = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)})
assert repr(df) == exp
@@ -1,912 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from warnings import catch_warnings
from datetime import datetime
import itertools
import pytest
from numpy.random import randn
from numpy import nan
import numpy as np
from pandas.compat import u
from pandas import (DataFrame, Index, Series, MultiIndex, date_range,
Timedelta, Period)
import pandas as pd
from pandas.util.testing import assert_series_equal, assert_frame_equal
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
class TestDataFrameReshape(TestData):
def test_pivot(self):
data = {
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
'values': [1., 2., 3., 3., 2., 1.]
}
frame = DataFrame(data)
pivoted = frame.pivot(
index='index', columns='columns', values='values')
expected = DataFrame({
'One': {'A': 1., 'B': 2., 'C': 3.},
'Two': {'A': 1., 'B': 2., 'C': 3.}
})
expected.index.name, expected.columns.name = 'index', 'columns'
tm.assert_frame_equal(pivoted, expected)
# name tracking
assert pivoted.index.name == 'index'
assert pivoted.columns.name == 'columns'
# don't specify values
pivoted = frame.pivot(index='index', columns='columns')
assert pivoted.index.name == 'index'
assert pivoted.columns.names == (None, 'columns')
with catch_warnings(record=True):
# pivot multiple columns
wp = tm.makePanel()
lp = wp.to_frame()
df = lp.reset_index()
tm.assert_frame_equal(df.pivot('major', 'minor'), lp.unstack())
def test_pivot_duplicates(self):
data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'],
'b': ['one', 'two', 'one', 'one', 'two'],
'c': [1., 2., 3., 3., 4.]})
with tm.assert_raises_regex(ValueError, 'duplicate entries'):
data.pivot('a', 'b', 'c')
def test_pivot_empty(self):
df = DataFrame({}, columns=['a', 'b', 'c'])
result = df.pivot('a', 'b', 'c')
expected = DataFrame({})
tm.assert_frame_equal(result, expected, check_names=False)
def test_pivot_integer_bug(self):
df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")])
result = df.pivot(index=1, columns=0, values=2)
repr(result)
tm.assert_index_equal(result.columns, Index(['A', 'B'], name=0))
def test_pivot_index_none(self):
# gh-3962
data = {
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
'values': [1., 2., 3., 3., 2., 1.]
}
frame = DataFrame(data).set_index('index')
result = frame.pivot(columns='columns', values='values')
expected = DataFrame({
'One': {'A': 1., 'B': 2., 'C': 3.},
'Two': {'A': 1., 'B': 2., 'C': 3.}
})
expected.index.name, expected.columns.name = 'index', 'columns'
assert_frame_equal(result, expected)
# omit values
result = frame.pivot(columns='columns')
expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
('values', 'Two')],
names=[None, 'columns'])
expected.index.name = 'index'
tm.assert_frame_equal(result, expected, check_names=False)
assert result.index.name == 'index'
assert result.columns.names == (None, 'columns')
expected.columns = expected.columns.droplevel(0)
result = frame.pivot(columns='columns', values='values')
expected.columns.name = 'columns'
tm.assert_frame_equal(result, expected)
def test_stack_unstack(self):
df = self.frame.copy()
df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)
stacked = df.stack()
stacked_df = DataFrame({'foo': stacked, 'bar': stacked})
unstacked = stacked.unstack()
unstacked_df = stacked_df.unstack()
assert_frame_equal(unstacked, df)
assert_frame_equal(unstacked_df['bar'], df)
unstacked_cols = stacked.unstack(0)
unstacked_cols_df = stacked_df.unstack(0)
assert_frame_equal(unstacked_cols.T, df)
assert_frame_equal(unstacked_cols_df['bar'].T, df)
def test_stack_mixed_level(self):
# GH 18310
levels = [range(3), [3, 'a', 'b'], [1, 2]]
# flat columns:
df = DataFrame(1, index=levels[0], columns=levels[1])
result = df.stack()
expected = Series(1, index=MultiIndex.from_product(levels[:2]))
assert_series_equal(result, expected)
# MultiIndex columns:
df = DataFrame(1, index=levels[0],
columns=MultiIndex.from_product(levels[1:]))
result = df.stack(1)
expected = DataFrame(1, index=MultiIndex.from_product([levels[0],
levels[2]]),
columns=levels[1])
assert_frame_equal(result, expected)
# as above, but used labels in level are actually of homogeneous type
result = df[['a', 'b']].stack(1)
expected = expected[['a', 'b']]
assert_frame_equal(result, expected)
def test_unstack_fill(self):
# GH #9746: fill_value keyword argument for Series
# and DataFrame unstack
# From a series
data = Series([1, 2, 4, 5], dtype=np.int16)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack(fill_value=-1)
expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
index=['x', 'y', 'z'], dtype=np.int16)
assert_frame_equal(result, expected)
# From a series with incorrect data type for fill_value
result = data.unstack(fill_value=0.5)
expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
index=['x', 'y', 'z'], dtype=np.float)
assert_frame_equal(result, expected)
# GH #13971: fill_value when unstacking multiple levels:
df = DataFrame({'x': ['a', 'a', 'b'],
'y': ['j', 'k', 'j'],
'z': [0, 1, 2],
'w': [0, 1, 2]}).set_index(['x', 'y', 'z'])
unstacked = df.unstack(['x', 'y'], fill_value=0)
key = ('w', 'b', 'j')
expected = unstacked[key]
result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
assert_series_equal(result, expected)
stacked = unstacked.stack(['x', 'y'])
stacked.index = stacked.index.reorder_levels(df.index.names)
# Workaround for GH #17886 (unnecessarily casts to float):
stacked = stacked.astype(np.int64)
result = stacked.loc[df.index]
assert_frame_equal(result, df)
# From a series
s = df['w']
result = s.unstack(['x', 'y'], fill_value=0)
expected = unstacked['w']
assert_frame_equal(result, expected)
def test_unstack_fill_frame(self):
# From a dataframe
rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
df.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = df.unstack(fill_value=-1)
rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
expected.columns = MultiIndex.from_tuples(
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
assert_frame_equal(result, expected)
# From a mixed type dataframe
df['A'] = df['A'].astype(np.int16)
df['B'] = df['B'].astype(np.float64)
result = df.unstack(fill_value=-1)
expected['A'] = expected['A'].astype(np.int16)
expected['B'] = expected['B'].astype(np.float64)
assert_frame_equal(result, expected)
# From a dataframe with incorrect data type for fill_value
result = df.unstack(fill_value=0.5)
rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
expected.columns = MultiIndex.from_tuples(
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_datetime(self):
# Test unstacking with date times
dv = pd.date_range('2012-01-01', periods=4).values
data = Series(dv)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack()
expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
'b': [dv[1], dv[2], pd.NaT]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
result = data.unstack(fill_value=dv[0])
expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
'b': [dv[1], dv[2], dv[0]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_timedelta(self):
# Test unstacking with time deltas
td = [Timedelta(days=i) for i in range(4)]
data = Series(td)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack()
expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
'b': [td[1], td[2], pd.NaT]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
result = data.unstack(fill_value=td[1])
expected = DataFrame({'a': [td[0], td[1], td[3]],
'b': [td[1], td[2], td[1]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_period(self):
# Test unstacking with period
periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
Period('2012-04')]
data = Series(periods)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack()
expected = DataFrame({'a': [periods[0], None, periods[3]],
'b': [periods[1], periods[2], None]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
result = data.unstack(fill_value=periods[1])
expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
'b': [periods[1], periods[2], periods[1]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_categorical(self):
# Test unstacking with categorical
data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
data.index = pd.MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
# By default missing values will be NaN
result = data.unstack()
expected = DataFrame({'a': pd.Categorical(list('axa'),
categories=list('abc')),
'b': pd.Categorical(list('bcx'),
categories=list('abc'))},
index=list('xyz'))
assert_frame_equal(result, expected)
# Fill with non-category results in NaN entries similar to above
result = data.unstack(fill_value='d')
assert_frame_equal(result, expected)
# Fill with category value replaces missing values as expected
result = data.unstack(fill_value='c')
expected = DataFrame({'a': pd.Categorical(list('aca'),
categories=list('abc')),
'b': pd.Categorical(list('bcc'),
categories=list('abc'))},
index=list('xyz'))
assert_frame_equal(result, expected)
def test_unstack_preserve_dtypes(self):
# Checks fix for #11847
df = pd.DataFrame(dict(state=['IL', 'MI', 'NC'],
index=['a', 'b', 'c'],
some_categories=pd.Series(['a', 'b', 'c']
).astype('category'),
A=np.random.rand(3),
B=1,
C='foo',
D=pd.Timestamp('20010102'),
E=pd.Series([1.0, 50.0, 100.0]
).astype('float32'),
F=pd.Series([3.0, 4.0, 5.0]).astype('float64'),
G=False,
H=pd.Series([1, 200, 923442], dtype='int8')))
def unstack_and_compare(df, column_name):
unstacked1 = df.unstack([column_name])
unstacked2 = df.unstack(column_name)
assert_frame_equal(unstacked1, unstacked2)
df1 = df.set_index(['state', 'index'])
unstack_and_compare(df1, 'index')
df1 = df.set_index(['state', 'some_categories'])
unstack_and_compare(df1, 'some_categories')
df1 = df.set_index(['F', 'C'])
unstack_and_compare(df1, 'F')
df1 = df.set_index(['G', 'B', 'state'])
unstack_and_compare(df1, 'B')
df1 = df.set_index(['E', 'A'])
unstack_and_compare(df1, 'E')
df1 = df.set_index(['state', 'index'])
s = df1['A']
unstack_and_compare(s, 'index')
def test_stack_ints(self):
columns = MultiIndex.from_tuples(list(itertools.product(range(3),
repeat=3)))
df = DataFrame(np.random.randn(30, 27), columns=columns)
assert_frame_equal(df.stack(level=[1, 2]),
df.stack(level=1).stack(level=1))
assert_frame_equal(df.stack(level=[-2, -1]),
df.stack(level=1).stack(level=1))
df_named = df.copy()
df_named.columns.set_names(range(3), inplace=True)
assert_frame_equal(df_named.stack(level=[1, 2]),
df_named.stack(level=1).stack(level=1))
def test_stack_mixed_levels(self):
columns = MultiIndex.from_tuples(
[('A', 'cat', 'long'), ('B', 'cat', 'long'),
('A', 'dog', 'short'), ('B', 'dog', 'short')],
names=['exp', 'animal', 'hair_length']
)
df = DataFrame(randn(4, 4), columns=columns)
animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
# GH #8584: Need to check that stacking works when a number
# is passed that is both a level name and in the range of
# the level numbers
df2 = df.copy()
df2.columns.names = ['exp', 'animal', 1]
assert_frame_equal(df2.stack(level=['animal', 1]),
animal_hair_stacked, check_names=False)
assert_frame_equal(df2.stack(level=['exp', 1]),
exp_hair_stacked, check_names=False)
# When mixed types are passed and the ints are not level
# names, raise
pytest.raises(ValueError, df2.stack, level=['animal', 0])
# GH #8584: Having 0 in the level names could raise a
# strange error about lexsort depth
df3 = df.copy()
df3.columns.names = ['exp', 'animal', 0]
assert_frame_equal(df3.stack(level=['animal', 0]),
animal_hair_stacked, check_names=False)
def test_stack_int_level_names(self):
columns = MultiIndex.from_tuples(
[('A', 'cat', 'long'), ('B', 'cat', 'long'),
('A', 'dog', 'short'), ('B', 'dog', 'short')],
names=['exp', 'animal', 'hair_length']
)
df = DataFrame(randn(4, 4), columns=columns)
exp_animal_stacked = df.stack(level=['exp', 'animal'])
animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
df2 = df.copy()
df2.columns.names = [0, 1, 2]
assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked,
check_names=False)
assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked,
check_names=False)
assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked,
check_names=False)
# Out-of-order int column names
df3 = df.copy()
df3.columns.names = [2, 0, 1]
assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked,
check_names=False)
assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked,
check_names=False)
assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked,
check_names=False)
def test_unstack_bool(self):
df = DataFrame([False, False],
index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),
columns=['col'])
rs = df.unstack()
xp = DataFrame(np.array([[False, np.nan], [np.nan, False]],
dtype=object),
index=['a', 'b'],
columns=MultiIndex.from_arrays([['col', 'col'],
['c', 'l']]))
assert_frame_equal(rs, xp)
def test_unstack_level_binding(self):
# GH9856
mi = pd.MultiIndex(
levels=[[u('foo'), u('bar')], [u('one'), u('two')],
[u('a'), u('b')]],
labels=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
names=[u('first'), u('second'), u('third')])
s = pd.Series(0, index=mi)
result = s.unstack([1, 2]).stack(0)
expected_mi = pd.MultiIndex(
levels=[['foo', 'bar'], ['one', 'two']],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
names=['first', 'second'])
expected = pd.DataFrame(np.array([[np.nan, 0],
[0, np.nan],
[np.nan, 0],
[0, np.nan]],
dtype=np.float64),
index=expected_mi,
columns=pd.Index(['a', 'b'], name='third'))
assert_frame_equal(result, expected)
def test_unstack_to_series(self):
# check reversibility
data = self.frame.unstack()
assert isinstance(data, Series)
undo = data.unstack().T
assert_frame_equal(undo, self.frame)
# check NA handling
data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]})
data.index = Index(['a', 'b', 'c'])
result = data.unstack()
midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']],
labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)
assert_series_equal(result, expected)
# check composability of unstack
old_data = data.copy()
for _ in range(4):
data = data.unstack()
assert_frame_equal(old_data, data)
def test_unstack_dtypes(self):
# GH 2929
rows = [[1, 1, 3, 4],
[1, 2, 3, 4],
[2, 1, 3, 4],
[2, 2, 3, 4]]
df = DataFrame(rows, columns=list('ABCD'))
result = df.get_dtype_counts()
expected = Series({'int64': 4})
assert_series_equal(result, expected)
# single dtype
df2 = df.set_index(['A', 'B'])
df3 = df2.unstack('B')
result = df3.get_dtype_counts()
expected = Series({'int64': 4})
assert_series_equal(result, expected)
# mixed
df2 = df.set_index(['A', 'B'])
df2['C'] = 3.
df3 = df2.unstack('B')
result = df3.get_dtype_counts()
expected = Series({'int64': 2, 'float64': 2})
assert_series_equal(result, expected)
df2['D'] = 'foo'
df3 = df2.unstack('B')
result = df3.get_dtype_counts()
expected = Series({'float64': 2, 'object': 2})
assert_series_equal(result, expected)
# GH7405
for c, d in (np.zeros(5), np.zeros(5)), \
(np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')):
df = DataFrame({'A': ['a'] * 5, 'C': c, 'D': d,
'B': pd.date_range('2012-01-01', periods=5)})
right = df.iloc[:3].copy(deep=True)
df = df.set_index(['A', 'B'])
df['D'] = df['D'].astype('int64')
left = df.iloc[:3].unstack(0)
right = right.set_index(['A', 'B']).unstack(0)
right[('D', 'a')] = right[('D', 'a')].astype('int64')
assert left.shape == (3, 2)
tm.assert_frame_equal(left, right)
def test_unstack_non_unique_index_names(self):
idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
names=['c1', 'c1'])
df = DataFrame([1, 2], index=idx)
with pytest.raises(ValueError):
df.unstack('c1')
with pytest.raises(ValueError):
df.T.stack('c1')
def test_unstack_unused_levels(self):
# GH 17845: unused labels in index make unstack() cast int to float
idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]
df = pd.DataFrame([[1, 0]] * 3, index=idx)
result = df.unstack()
exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']])
expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'],
columns=exp_col)
tm.assert_frame_equal(result, expected)
assert((result.columns.levels[1] == idx.levels[1]).all())
# Unused items on both levels
levels = [[0, 1, 7], [0, 1, 2, 3]]
labels = [[0, 0, 1, 1], [0, 2, 0, 2]]
idx = pd.MultiIndex(levels, labels)
block = np.arange(4).reshape(2, 2)
df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx)
result = df.unstack()
expected = pd.DataFrame(np.concatenate([block * 2, block * 2 + 1],
axis=1),
columns=idx)
tm.assert_frame_equal(result, expected)
assert((result.columns.levels[1] == idx.levels[1]).all())
# With mixed dtype and NaN
levels = [['a', 2, 'c'], [1, 3, 5, 7]]
labels = [[0, -1, 1, 1], [0, 2, -1, 2]]
idx = pd.MultiIndex(levels, labels)
data = np.arange(8)
df = pd.DataFrame(data.reshape(4, 2), index=idx)
cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11],
[np.nan, 'a', 2], [np.nan, 5, 1]),
(1, [8, 11, 1, 4, 12, 15, 13, 16],
[np.nan, 5, 1], [np.nan, 'a', 2]))
for level, idces, col_level, idx_level in cases:
result = df.unstack(level=level)
exp_data = np.zeros(18) * np.nan
exp_data[idces] = data
cols = pd.MultiIndex.from_product([[0, 1], col_level])
expected = pd.DataFrame(exp_data.reshape(3, 6),
index=idx_level, columns=cols)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("cols", [['A', 'C'], slice(None)])
def test_unstack_unused_level(self, cols):
# GH 18562 : unused labels on the unstacked level
df = pd.DataFrame([[2010, 'a', 'I'],
[2011, 'b', 'II']],
columns=['A', 'B', 'C'])
ind = df.set_index(['A', 'B', 'C'], drop=False)
selection = ind.loc[(slice(None), slice(None), 'I'), cols]
result = selection.unstack()
expected = ind.iloc[[0]][cols]
expected.columns = MultiIndex.from_product([expected.columns, ['I']],
names=[None, 'C'])
expected.index = expected.index.droplevel('C')
tm.assert_frame_equal(result, expected)
def test_unstack_nan_index(self): # GH7466
cast = lambda val: '{0:1}'.format('' if val != val else val)
nan = np.nan
def verify(df):
mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
rows, cols = df.notna().values.nonzero()
for i, j in zip(rows, cols):
left = sorted(df.iloc[i, j].split('.'))
right = mk_list(df.index[i]) + mk_list(df.columns[j])
right = sorted(list(map(cast, right)))
assert left == right
df = DataFrame({'jim': ['a', 'b', nan, 'd'],
'joe': ['w', 'x', 'y', 'z'],
'jolie': ['a.w', 'b.x', ' .y', 'd.z']})
left = df.set_index(['jim', 'joe']).unstack()['jolie']
right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
assert_frame_equal(left, right)
for idx in itertools.permutations(df.columns[:2]):
mi = df.set_index(list(idx))
for lev in range(2):
udf = mi.unstack(level=lev)
assert udf.notna().values.sum() == len(df)
verify(udf['jolie'])
df = DataFrame({'1st': ['d'] * 3 + [nan] * 5 + ['a'] * 2 +
['c'] * 3 + ['e'] * 2 + ['b'] * 5,
'2nd': ['y'] * 2 + ['w'] * 3 + [nan] * 3 +
['z'] * 4 + [nan] * 3 + ['x'] * 3 + [nan] * 2,
'3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59,
50, 62, 59, 76, 52, 14, 53, 60, 51]})
df['4th'], df['5th'] = \
df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \
df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1)
for idx in itertools.permutations(['1st', '2nd', '3rd']):
mi = df.set_index(list(idx))
for lev in range(3):
udf = mi.unstack(level=lev)
assert udf.notna().values.sum() == 2 * len(df)
for col in ['4th', '5th']:
verify(udf[col])
# GH7403
df = pd.DataFrame(
{'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)})
df.iloc[3, 1] = np.NaN
left = df.set_index(['A', 'B']).unstack(0)
vals = [[3, 0, 1, 2, nan, nan, nan, nan],
[nan, nan, nan, nan, 4, 5, 6, 7]]
vals = list(map(list, zip(*vals)))
idx = Index([nan, 0, 1, 2, 4, 5, 6, 7], name='B')
cols = MultiIndex(levels=[['C'], ['a', 'b']],
labels=[[0, 0], [0, 1]],
names=[None, 'A'])
right = DataFrame(vals, columns=cols, index=idx)
assert_frame_equal(left, right)
df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
'C': range(8)})
df.iloc[2, 1] = np.NaN
left = df.set_index(['A', 'B']).unstack(0)
vals = [[2, nan], [0, 4], [1, 5], [nan, 6], [3, 7]]
cols = MultiIndex(levels=[['C'], ['a', 'b']],
labels=[[0, 0], [0, 1]],
names=[None, 'A'])
idx = Index([nan, 0, 1, 2, 3], name='B')
right = DataFrame(vals, columns=cols, index=idx)
assert_frame_equal(left, right)
df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
'C': range(8)})
df.iloc[3, 1] = np.NaN
left = df.set_index(['A', 'B']).unstack(0)
vals = [[3, nan], [0, 4], [1, 5], [2, 6], [nan, 7]]
cols = MultiIndex(levels=[['C'], ['a', 'b']],
labels=[[0, 0], [0, 1]],
names=[None, 'A'])
idx = Index([nan, 0, 1, 2, 3], name='B')
right = DataFrame(vals, columns=cols, index=idx)
assert_frame_equal(left, right)
# GH7401
df = pd.DataFrame({'A': list('aaaaabbbbb'),
'B': (date_range('2012-01-01', periods=5)
.tolist() * 2),
'C': np.arange(10)})
df.iloc[3, 1] = np.NaN
left = df.set_index(['A', 'B']).unstack()
vals = np.array([[3, 0, 1, 2, nan, 4], [nan, 5, 6, 7, 8, 9]])
idx = Index(['a', 'b'], name='A')
cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)],
labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
names=[None, 'B'])
right = DataFrame(vals, columns=cols, index=idx)
assert_frame_equal(left, right)
# GH4862
vals = [['Hg', nan, nan, 680585148],
['U', 0.0, nan, 680585148],
['Pb', 7.07e-06, nan, 680585148],
['Sn', 2.3614e-05, 0.0133, 680607017],
['Ag', 0.0, 0.0133, 680607017],
['Hg', -0.00015, 0.0133, 680607017]]
df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'],
index=[17263, 17264, 17265, 17266, 17267, 17268])
left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack()
vals = [[nan, nan, 7.07e-06, nan, 0.0],
[0.0, -0.00015, nan, 2.3614e-05, nan]]
idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]],
labels=[[0, 1], [-1, 0]],
names=['s_id', 'dosage'])
cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']],
labels=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
names=[None, 'agent'])
right = DataFrame(vals, columns=cols, index=idx)
assert_frame_equal(left, right)
left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent'])
assert_frame_equal(left.unstack(), right)
# GH9497 - multiple unstack with nulls
df = DataFrame({'1st': [1, 2, 1, 2, 1, 2],
'2nd': pd.date_range('2014-02-01', periods=6,
freq='D'),
'jim': 100 + np.arange(6),
'joe': (np.random.randn(6) * 10).round(2)})
df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan
df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan
left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
assert left.notna().values.sum() == 2 * len(df)
for col in ['jim', 'joe']:
for _, r in df.iterrows():
key = r['1st'], (col, r['2nd'], r['3rd'])
assert r[col] == left.loc[key]
def test_stack_datetime_column_multiIndex(self):
# GH 8039
t = datetime(2014, 1, 1)
df = DataFrame(
[1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, 'A', 'B')]))
result = df.stack()
eidx = MultiIndex.from_product([(0, 1, 2, 3), ('B',)])
ecols = MultiIndex.from_tuples([(t, 'A')])
expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
assert_frame_equal(result, expected)
def test_stack_partial_multiIndex(self):
# GH 8844
def _test_stack_with_multiindex(multiindex):
df = DataFrame(np.arange(3 * len(multiindex))
.reshape(3, len(multiindex)),
columns=multiindex)
for level in (-1, 0, 1, [0, 1], [1, 0]):
result = df.stack(level=level, dropna=False)
if isinstance(level, int):
# Stacking a single level should not make any all-NaN rows,
# so df.stack(level=level, dropna=False) should be the same
# as df.stack(level=level, dropna=True).
expected = df.stack(level=level, dropna=True)
if isinstance(expected, Series):
assert_series_equal(result, expected)
else:
assert_frame_equal(result, expected)
df.columns = MultiIndex.from_tuples(df.columns.get_values(),
names=df.columns.names)
expected = df.stack(level=level, dropna=False)
if isinstance(expected, Series):
assert_series_equal(result, expected)
else:
assert_frame_equal(result, expected)
full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'),
('A', 'y'),
('C', 'x'), ('C', 'u')],
names=['Upper', 'Lower'])
for multiindex_columns in ([0, 1, 2, 3, 4],
[0, 1, 2, 3], [0, 1, 2, 4],
[0, 1, 2], [1, 2, 3], [2, 3, 4],
[0, 1], [0, 2], [0, 3],
[0], [2], [4]):
_test_stack_with_multiindex(full_multiindex[multiindex_columns])
if len(multiindex_columns) > 1:
multiindex_columns.reverse()
_test_stack_with_multiindex(
full_multiindex[multiindex_columns])
df = DataFrame(np.arange(6).reshape(2, 3),
columns=full_multiindex[[0, 1, 3]])
result = df.stack(dropna=False)
expected = DataFrame([[0, 2], [1, nan], [3, 5], [4, nan]],
index=MultiIndex(
levels=[[0, 1], ['u', 'x', 'y', 'z']],
labels=[[0, 0, 1, 1],
[1, 3, 1, 3]],
names=[None, 'Lower']),
columns=Index(['B', 'C'], name='Upper'),
dtype=df.dtypes[0])
assert_frame_equal(result, expected)
def test_stack_preserve_categorical_dtype(self):
# GH13854
for ordered in [False, True]:
for labels in [list("yxz"), list("yxy")]:
cidx = pd.CategoricalIndex(labels, categories=list("xyz"),
ordered=ordered)
df = DataFrame([[10, 11, 12]], columns=cidx)
result = df.stack()
# `MutliIndex.from_product` preserves categorical dtype -
# it's tested elsewhere.
midx = pd.MultiIndex.from_product([df.index, cidx])
expected = Series([10, 11, 12], index=midx)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("level", [0, 'baz'])
def test_unstack_swaplevel_sortlevel(self, level):
# GH 20994
mi = pd.MultiIndex.from_product([[0], ['d', 'c']],
names=['bar', 'baz'])
df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A'])
df.columns.name = 'foo'
expected = pd.DataFrame([
[3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([
('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[
'baz', 'foo']))
expected.index.name = 'bar'
result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
tm.assert_frame_equal(result, expected)
def test_unstack_fill_frame_object():
# GH12815 Test unstacking with object.
data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')
data.index = pd.MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
# By default missing values will be NaN
result = data.unstack()
expected = pd.DataFrame(
{'a': ['a', np.nan, 'a'], 'b': ['b', 'c', np.nan]},
index=list('xyz')
)
assert_frame_equal(result, expected)
# Fill with any value replaces missing values as expected
result = data.unstack(fill_value='d')
expected = pd.DataFrame(
{'a': ['a', 'd', 'a'], 'b': ['b', 'c', 'd']},
index=list('xyz')
)
assert_frame_equal(result, expected)
@@ -1,126 +0,0 @@
import numpy as np
import pytest
from pandas import DataFrame, Index
from pandas.errors import PerformanceWarning
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal
@pytest.fixture
def df_none():
return DataFrame({
'outer': ['a', 'a', 'a', 'b', 'b', 'b'],
'inner': [1, 2, 2, 2, 1, 1],
'A': np.arange(6, 0, -1),
('B', 5): ['one', 'one', 'two', 'two', 'one', 'one']})
@pytest.fixture(params=[
['outer'],
['outer', 'inner']
])
def df_idx(request, df_none):
levels = request.param
return df_none.set_index(levels)
@pytest.fixture(params=[
'inner', # index level
['outer'], # list of index level
'A', # column
[('B', 5)], # list of column
['inner', 'outer'], # two index levels
[('B', 5), 'outer'], # index level and column
['A', ('B', 5)], # Two columns
['inner', 'outer'] # two index levels and column
])
def sort_names(request):
return request.param
@pytest.fixture(params=[True, False])
def ascending(request):
return request.param
def test_sort_index_level_and_column_label(
df_none, df_idx, sort_names, ascending):
# GH 14353
# Get index levels from df_idx
levels = df_idx.index.names
# Compute expected by sorting on columns and the setting index
expected = df_none.sort_values(by=sort_names,
ascending=ascending,
axis=0).set_index(levels)
# Compute result sorting on mix on columns and index levels
result = df_idx.sort_values(by=sort_names,
ascending=ascending,
axis=0)
assert_frame_equal(result, expected)
def test_sort_column_level_and_index_label(
df_none, df_idx, sort_names, ascending):
# GH 14353
# Get levels from df_idx
levels = df_idx.index.names
# Compute expected by sorting on axis=0, setting index levels, and then
# transposing. For some cases this will result in a frame with
# multiple column levels
expected = df_none.sort_values(by=sort_names,
ascending=ascending,
axis=0).set_index(levels).T
# Compute result by transposing and sorting on axis=1.
result = df_idx.T.sort_values(by=sort_names,
ascending=ascending,
axis=1)
if len(levels) > 1:
# Accessing multi-level columns that are not lexsorted raises a
# performance warning
with tm.assert_produces_warning(PerformanceWarning,
check_stacklevel=False):
assert_frame_equal(result, expected)
else:
assert_frame_equal(result, expected)
def test_sort_values_column_index_level_precedence():
# GH 14353, when a string passed as the `by` parameter
# matches a column and an index level the column takes
# precedence
# Construct DataFrame with index and column named 'idx'
idx = Index(np.arange(1, 7), name='idx')
df = DataFrame({'A': np.arange(11, 17),
'idx': np.arange(6, 0, -1)},
index=idx)
# Sorting by 'idx' should sort by the idx column and raise a
# FutureWarning
with tm.assert_produces_warning(FutureWarning):
result = df.sort_values(by='idx')
# This should be equivalent to sorting by the 'idx' index level in
# descending order
expected = df.sort_index(level='idx', ascending=False)
assert_frame_equal(result, expected)
# Perform same test with MultiIndex
df_multi = df.set_index('A', append=True)
with tm.assert_produces_warning(FutureWarning):
result = df_multi.sort_values(by='idx')
expected = df_multi.sort_index(level='idx', ascending=False)
assert_frame_equal(result, expected)
@@ -1,600 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import pytest
import random
import numpy as np
import pandas as pd
from pandas.compat import lrange
from pandas.api.types import CategoricalDtype
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
date_range, NaT, IntervalIndex)
from pandas.util.testing import assert_series_equal, assert_frame_equal
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
class TestDataFrameSorting(TestData):
def test_sort(self):
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
# see gh-9816
with tm.assert_produces_warning(FutureWarning):
frame.sortlevel()
def test_sort_values(self):
frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]],
index=[1, 2, 3], columns=list('ABC'))
# by column (axis=0)
sorted_df = frame.sort_values(by='A')
indexer = frame['A'].argsort().values
expected = frame.loc[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by='A', ascending=False)
indexer = indexer[::-1]
expected = frame.loc[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by='A', ascending=False)
assert_frame_equal(sorted_df, expected)
# GH4839
sorted_df = frame.sort_values(by=['A'], ascending=[False])
assert_frame_equal(sorted_df, expected)
# multiple bys
sorted_df = frame.sort_values(by=['B', 'C'])
expected = frame.loc[[2, 1, 3]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=['B', 'C'], ascending=False)
assert_frame_equal(sorted_df, expected[::-1])
sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False])
assert_frame_equal(sorted_df, expected)
pytest.raises(ValueError, lambda: frame.sort_values(
by=['A', 'B'], axis=2, inplace=True))
# by row (axis=1): GH 10806
sorted_df = frame.sort_values(by=3, axis=1)
expected = frame
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
expected = frame.reindex(columns=['C', 'B', 'A'])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 2], axis='columns')
expected = frame.reindex(columns=['B', 'A', 'C'])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 3], axis=1,
ascending=[True, False])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
expected = frame.reindex(columns=['C', 'B', 'A'])
assert_frame_equal(sorted_df, expected)
msg = r'Length of ascending \(5\) != length of by \(2\)'
with tm.assert_raises_regex(ValueError, msg):
frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5)
def test_sort_values_inplace(self):
frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
sorted_df = frame.copy()
sorted_df.sort_values(by='A', inplace=True)
expected = frame.sort_values(by='A')
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by=1, axis=1, inplace=True)
expected = frame.sort_values(by=1, axis=1)
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by='A', ascending=False, inplace=True)
expected = frame.sort_values(by='A', ascending=False)
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True)
expected = frame.sort_values(by=['A', 'B'], ascending=False)
assert_frame_equal(sorted_df, expected)
def test_sort_nan(self):
# GH3917
nan = np.nan
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]})
# sort one column only
expected = DataFrame(
{'A': [nan, 1, 1, 2, 4, 6, 8],
'B': [5, 9, 2, nan, 5, 5, 4]},
index=[2, 0, 3, 1, 6, 4, 5])
sorted_df = df.sort_values(['A'], na_position='first')
assert_frame_equal(sorted_df, expected)
expected = DataFrame(
{'A': [nan, 8, 6, 4, 2, 1, 1],
'B': [5, 4, 5, 5, nan, 9, 2]},
index=[2, 5, 4, 6, 1, 0, 3])
sorted_df = df.sort_values(['A'], na_position='first', ascending=False)
assert_frame_equal(sorted_df, expected)
expected = df.reindex(columns=['B', 'A'])
sorted_df = df.sort_values(by=1, axis=1, na_position='first')
assert_frame_equal(sorted_df, expected)
# na_position='last', order
expected = DataFrame(
{'A': [1, 1, 2, 4, 6, 8, nan],
'B': [2, 9, nan, 5, 5, 4, 5]},
index=[3, 0, 1, 6, 4, 5, 2])
sorted_df = df.sort_values(['A', 'B'])
assert_frame_equal(sorted_df, expected)
# na_position='first', order
expected = DataFrame(
{'A': [nan, 1, 1, 2, 4, 6, 8],
'B': [5, 2, 9, nan, 5, 5, 4]},
index=[2, 3, 0, 1, 6, 4, 5])
sorted_df = df.sort_values(['A', 'B'], na_position='first')
assert_frame_equal(sorted_df, expected)
# na_position='first', not order
expected = DataFrame(
{'A': [nan, 1, 1, 2, 4, 6, 8],
'B': [5, 9, 2, nan, 5, 5, 4]},
index=[2, 0, 3, 1, 6, 4, 5])
sorted_df = df.sort_values(['A', 'B'], ascending=[
1, 0], na_position='first')
assert_frame_equal(sorted_df, expected)
# na_position='last', not order
expected = DataFrame(
{'A': [8, 6, 4, 2, 1, 1, nan],
'B': [4, 5, 5, nan, 2, 9, 5]},
index=[5, 4, 6, 1, 3, 0, 2])
sorted_df = df.sort_values(['A', 'B'], ascending=[
0, 1], na_position='last')
assert_frame_equal(sorted_df, expected)
# Test DataFrame with nan label
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]},
index=[1, 2, 3, 4, 5, 6, nan])
# NaN label, ascending=True, na_position='last'
sorted_df = df.sort_index(
kind='quicksort', ascending=True, na_position='last')
expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]},
index=[1, 2, 3, 4, 5, 6, nan])
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=True, na_position='first'
sorted_df = df.sort_index(na_position='first')
expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8],
'B': [5, 9, nan, 5, 2, 5, 4]},
index=[nan, 1, 2, 3, 4, 5, 6])
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=False, na_position='last'
sorted_df = df.sort_index(kind='quicksort', ascending=False)
expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4],
'B': [4, 5, 2, 5, nan, 9, 5]},
index=[6, 5, 4, 3, 2, 1, nan])
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=False, na_position='first'
sorted_df = df.sort_index(
kind='quicksort', ascending=False, na_position='first')
expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1],
'B': [5, 4, 5, 2, 5, nan, 9]},
index=[nan, 6, 5, 4, 3, 2, 1])
assert_frame_equal(sorted_df, expected)
def test_stable_descending_sort(self):
# GH #6399
df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']],
columns=['sort_col', 'order'])
sorted_df = df.sort_values(by='sort_col', kind='mergesort',
ascending=False)
assert_frame_equal(df, sorted_df)
def test_stable_descending_multicolumn_sort(self):
nan = np.nan
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]})
# test stable mergesort
expected = DataFrame(
{'A': [nan, 8, 6, 4, 2, 1, 1],
'B': [5, 4, 5, 5, nan, 2, 9]},
index=[2, 5, 4, 6, 1, 3, 0])
sorted_df = df.sort_values(['A', 'B'], ascending=[0, 1],
na_position='first',
kind='mergesort')
assert_frame_equal(sorted_df, expected)
expected = DataFrame(
{'A': [nan, 8, 6, 4, 2, 1, 1],
'B': [5, 4, 5, 5, nan, 9, 2]},
index=[2, 5, 4, 6, 1, 0, 3])
sorted_df = df.sort_values(['A', 'B'], ascending=[0, 0],
na_position='first',
kind='mergesort')
assert_frame_equal(sorted_df, expected)
def test_stable_categorial(self):
# GH 16793
df = DataFrame({
'x': pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)
})
expected = df.copy()
sorted_df = df.sort_values('x', kind='mergesort')
assert_frame_equal(sorted_df, expected)
def test_sort_datetimes(self):
# GH 3461, argsort / lexsort differences for a datetime column
df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],
columns=['A'],
index=date_range('20130101', periods=9))
dts = [Timestamp(x)
for x in ['2004-02-11', '2004-01-21', '2004-01-26',
'2005-09-20', '2010-10-04', '2009-05-12',
'2008-11-12', '2010-09-28', '2010-09-28']]
df['B'] = dts[::2] + dts[1::2]
df['C'] = 2.
df['A1'] = 3.
df1 = df.sort_values(by='A')
df2 = df.sort_values(by=['A'])
assert_frame_equal(df1, df2)
df1 = df.sort_values(by='B')
df2 = df.sort_values(by=['B'])
assert_frame_equal(df1, df2)
df1 = df.sort_values(by='B')
df2 = df.sort_values(by=['C', 'B'])
assert_frame_equal(df1, df2)
def test_frame_column_inplace_sort_exception(self):
s = self.frame['A']
with tm.assert_raises_regex(ValueError, "This Series is a view"):
s.sort_values(inplace=True)
cp = s.copy()
cp.sort_values() # it works!
def test_sort_nat_values_in_int_column(self):
# GH 14922: "sorting with large float and multiple columns incorrect"
# cause was that the int64 value NaT was considered as "na". Which is
# only correct for datetime64 columns.
int_values = (2, int(NaT))
float_values = (2.0, -1.797693e308)
df = DataFrame(dict(int=int_values, float=float_values),
columns=["int", "float"])
df_reversed = DataFrame(dict(int=int_values[::-1],
float=float_values[::-1]),
columns=["int", "float"],
index=[1, 0])
# NaT is not a "na" for int64 columns, so na_position must not
# influence the result:
df_sorted = df.sort_values(["int", "float"], na_position="last")
assert_frame_equal(df_sorted, df_reversed)
df_sorted = df.sort_values(["int", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)
# reverse sorting order
df_sorted = df.sort_values(["int", "float"], ascending=False)
assert_frame_equal(df_sorted, df)
# and now check if NaT is still considered as "na" for datetime64
# columns:
df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT],
float=float_values), columns=["datetime", "float"])
df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")],
float=float_values[::-1]),
columns=["datetime", "float"],
index=[1, 0])
df_sorted = df.sort_values(["datetime", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)
df_sorted = df.sort_values(["datetime", "float"], na_position="last")
assert_frame_equal(df_sorted, df)
# Ascending should not affect the results.
df_sorted = df.sort_values(["datetime", "float"], ascending=False)
assert_frame_equal(df_sorted, df)
def test_sort_nat(self):
# GH 16836
d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01',
np.nan, '2016-01-01']]
d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01',
'2016-01-01', '2015-01-01']]
df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3])
d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01',
'2016-01-01', np.nan]]
d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01',
'2017-01-01', '2016-01-01']]
expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2])
sorted_df = df.sort_values(by=['a', 'b'], )
tm.assert_frame_equal(sorted_df, expected)
class TestDataFrameSortIndexKinds(TestData):
def test_sort_index_multicolumn(self):
A = np.arange(5).repeat(20)
B = np.tile(np.arange(5), 20)
random.shuffle(A)
random.shuffle(B)
frame = DataFrame({'A': A, 'B': B,
'C': np.random.randn(100)})
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=['A', 'B'])
result = frame.sort_values(by=['A', 'B'])
indexer = np.lexsort((frame['B'], frame['A']))
expected = frame.take(indexer)
assert_frame_equal(result, expected)
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=['A', 'B'], ascending=False)
result = frame.sort_values(by=['A', 'B'], ascending=False)
indexer = np.lexsort((frame['B'].rank(ascending=False),
frame['A'].rank(ascending=False)))
expected = frame.take(indexer)
assert_frame_equal(result, expected)
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=['B', 'A'])
result = frame.sort_values(by=['B', 'A'])
indexer = np.lexsort((frame['A'], frame['B']))
expected = frame.take(indexer)
assert_frame_equal(result, expected)
def test_sort_index_inplace(self):
frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
# axis=0
unordered = frame.loc[[3, 2, 4, 1]]
a_id = id(unordered['A'])
df = unordered.copy()
df.sort_index(inplace=True)
expected = frame
assert_frame_equal(df, expected)
assert a_id != id(df['A'])
df = unordered.copy()
df.sort_index(ascending=False, inplace=True)
expected = frame[::-1]
assert_frame_equal(df, expected)
# axis=1
unordered = frame.loc[:, ['D', 'B', 'C', 'A']]
df = unordered.copy()
df.sort_index(axis=1, inplace=True)
expected = frame
assert_frame_equal(df, expected)
df = unordered.copy()
df.sort_index(axis=1, ascending=False, inplace=True)
expected = frame.iloc[:, ::-1]
assert_frame_equal(df, expected)
def test_sort_index_different_sortorder(self):
A = np.arange(20).repeat(5)
B = np.tile(np.arange(5), 20)
indexer = np.random.permutation(100)
A = A.take(indexer)
B = B.take(indexer)
df = DataFrame({'A': A, 'B': B,
'C': np.random.randn(100)})
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=['A', 'B'], ascending=[1, 0])
result = df.sort_values(by=['A', 'B'], ascending=[1, 0])
ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
expected = df.take(ex_indexer)
assert_frame_equal(result, expected)
# test with multiindex, too
idf = df.set_index(['A', 'B'])
result = idf.sort_index(ascending=[1, 0])
expected = idf.take(ex_indexer)
assert_frame_equal(result, expected)
# also, Series!
result = idf['C'].sort_index(ascending=[1, 0])
assert_series_equal(result, expected['C'])
def test_sort_index_duplicates(self):
# with 9816, these are all translated to .sort_values
df = DataFrame([lrange(5, 9), lrange(4)],
columns=['a', 'a', 'b', 'b'])
with tm.assert_raises_regex(ValueError, 'not unique'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by='a')
with tm.assert_raises_regex(ValueError, 'not unique'):
df.sort_values(by='a')
with tm.assert_raises_regex(ValueError, 'not unique'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=['a'])
with tm.assert_raises_regex(ValueError, 'not unique'):
df.sort_values(by=['a'])
with tm.assert_raises_regex(ValueError, 'not unique'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
# multi-column 'by' is separate codepath
df.sort_index(by=['a', 'b'])
with tm.assert_raises_regex(ValueError, 'not unique'):
# multi-column 'by' is separate codepath
df.sort_values(by=['a', 'b'])
# with multi-index
# GH4370
df = DataFrame(np.random.randn(4, 2),
columns=MultiIndex.from_tuples([('a', 0), ('a', 1)]))
with tm.assert_raises_regex(ValueError, 'level'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by='a')
with tm.assert_raises_regex(ValueError, 'level'):
df.sort_values(by='a')
# convert tuples to a list of tuples
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=[('a', 1)])
expected = df.sort_values(by=[('a', 1)])
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=('a', 1))
result = df.sort_values(by=('a', 1))
assert_frame_equal(result, expected)
def test_sort_index_level(self):
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
df = DataFrame([[1, 2], [3, 4]], mi)
res = df.sort_index(level='A', sort_remaining=False)
assert_frame_equal(df, res)
res = df.sort_index(level=['A', 'B'], sort_remaining=False)
assert_frame_equal(df, res)
def test_sort_index_categorical_index(self):
df = (DataFrame({'A': np.arange(6, dtype='int64'),
'B': Series(list('aabbca'))
.astype(CategoricalDtype(list('cab')))})
.set_index('B'))
result = df.sort_index()
expected = df.iloc[[4, 0, 1, 5, 2, 3]]
assert_frame_equal(result, expected)
result = df.sort_index(ascending=False)
expected = df.iloc[[3, 2, 5, 1, 0, 4]]
assert_frame_equal(result, expected)
def test_sort_index(self):
# GH13496
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
# axis=0 : sort rows by index labels
unordered = frame.loc[[3, 2, 4, 1]]
result = unordered.sort_index(axis=0)
expected = frame
assert_frame_equal(result, expected)
result = unordered.sort_index(ascending=False)
expected = frame[::-1]
assert_frame_equal(result, expected)
# axis=1 : sort columns by column names
unordered = frame.iloc[:, [2, 1, 3, 0]]
result = unordered.sort_index(axis=1)
assert_frame_equal(result, frame)
result = unordered.sort_index(axis=1, ascending=False)
expected = frame.iloc[:, ::-1]
assert_frame_equal(result, expected)
@pytest.mark.parametrize("level", ['A', 0]) # GH 21052
def test_sort_index_multiindex(self, level):
# GH13496
# sort rows by specified level of multi-index
mi = MultiIndex.from_tuples([
[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC'))
df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)
expected_mi = MultiIndex.from_tuples([
[1, 1, 1],
[2, 1, 2],
[2, 1, 3]], names=list('ABC'))
expected = pd.DataFrame([
[5, 6],
[3, 4],
[1, 2]], index=expected_mi)
result = df.sort_index(level=level)
assert_frame_equal(result, expected)
# sort_remaining=False
expected_mi = MultiIndex.from_tuples([
[1, 1, 1],
[2, 1, 3],
[2, 1, 2]], names=list('ABC'))
expected = pd.DataFrame([
[5, 6],
[1, 2],
[3, 4]], index=expected_mi)
result = df.sort_index(level=level, sort_remaining=False)
assert_frame_equal(result, expected)
def test_sort_index_intervalindex(self):
# this is a de-facto sort via unstack
# confirming that we sort in the order of the bins
y = Series(np.random.randn(100))
x1 = Series(np.sign(np.random.randn(100)))
x2 = pd.cut(Series(np.random.randn(100)),
bins=[-3, -0.5, 0, 0.5, 3])
model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])
result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
expected = IntervalIndex.from_tuples(
[(-3.0, -0.5), (-0.5, 0.0),
(0.0, 0.5), (0.5, 3.0)],
closed='right')
result = result.columns.levels[1].categories
tm.assert_index_equal(result, expected)
@@ -1,572 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from warnings import catch_warnings
import numpy as np
from pandas import DataFrame, Series, MultiIndex, Panel, Index
import pandas as pd
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
class TestDataFrameSubclassing(TestData):
def test_frame_subclassing_and_slicing(self):
# Subclass frame and ensure it returns the right class on slicing it
# In reference to PR 9632
class CustomSeries(Series):
@property
def _constructor(self):
return CustomSeries
def custom_series_function(self):
return 'OK'
class CustomDataFrame(DataFrame):
"""
Subclasses pandas DF, fills DF with simulation results, adds some
custom plotting functions.
"""
def __init__(self, *args, **kw):
super(CustomDataFrame, self).__init__(*args, **kw)
@property
def _constructor(self):
return CustomDataFrame
_constructor_sliced = CustomSeries
def custom_frame_function(self):
return 'OK'
data = {'col1': range(10),
'col2': range(10)}
cdf = CustomDataFrame(data)
# Did we get back our own DF class?
assert isinstance(cdf, CustomDataFrame)
# Do we get back our own Series class after selecting a column?
cdf_series = cdf.col1
assert isinstance(cdf_series, CustomSeries)
assert cdf_series.custom_series_function() == 'OK'
# Do we get back our own DF class after slicing row-wise?
cdf_rows = cdf[1:5]
assert isinstance(cdf_rows, CustomDataFrame)
assert cdf_rows.custom_frame_function() == 'OK'
# Make sure sliced part of multi-index frame is custom class
mcol = pd.MultiIndex.from_tuples([('A', 'A'), ('A', 'B')])
cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
assert isinstance(cdf_multi['A'], CustomDataFrame)
mcol = pd.MultiIndex.from_tuples([('A', ''), ('B', '')])
cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
assert isinstance(cdf_multi2['A'], CustomSeries)
def test_dataframe_metadata(self):
df = tm.SubclassedDataFrame({'X': [1, 2, 3], 'Y': [1, 2, 3]},
index=['a', 'b', 'c'])
df.testattr = 'XXX'
assert df.testattr == 'XXX'
assert df[['X']].testattr == 'XXX'
assert df.loc[['a', 'b'], :].testattr == 'XXX'
assert df.iloc[[0, 1], :].testattr == 'XXX'
# see gh-9776
assert df.iloc[0:1, :].testattr == 'XXX'
# see gh-10553
unpickled = tm.round_trip_pickle(df)
tm.assert_frame_equal(df, unpickled)
assert df._metadata == unpickled._metadata
assert df.testattr == unpickled.testattr
def test_indexing_sliced(self):
# GH 11559
df = tm.SubclassedDataFrame({'X': [1, 2, 3],
'Y': [4, 5, 6],
'Z': [7, 8, 9]},
index=['a', 'b', 'c'])
res = df.loc[:, 'X']
exp = tm.SubclassedSeries([1, 2, 3], index=list('abc'), name='X')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.iloc[:, 1]
exp = tm.SubclassedSeries([4, 5, 6], index=list('abc'), name='Y')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.loc[:, 'Z']
exp = tm.SubclassedSeries([7, 8, 9], index=list('abc'), name='Z')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.loc['a', :]
exp = tm.SubclassedSeries([1, 4, 7], index=list('XYZ'), name='a')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.iloc[1, :]
exp = tm.SubclassedSeries([2, 5, 8], index=list('XYZ'), name='b')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.loc['c', :]
exp = tm.SubclassedSeries([3, 6, 9], index=list('XYZ'), name='c')
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
def test_to_panel_expanddim(self):
# GH 9762
with catch_warnings(record=True):
class SubclassedFrame(DataFrame):
@property
def _constructor_expanddim(self):
return SubclassedPanel
class SubclassedPanel(Panel):
pass
index = MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2)])
df = SubclassedFrame({'X': [1, 2, 3], 'Y': [4, 5, 6]}, index=index)
result = df.to_panel()
assert isinstance(result, SubclassedPanel)
expected = SubclassedPanel([[[1, 2, 3]], [[4, 5, 6]]],
items=['X', 'Y'], major_axis=[0],
minor_axis=[0, 1, 2],
dtype='int64')
tm.assert_panel_equal(result, expected)
def test_subclass_attr_err_propagation(self):
# GH 11808
class A(DataFrame):
@property
def bar(self):
return self.i_dont_exist
with tm.assert_raises_regex(AttributeError, '.*i_dont_exist.*'):
A().bar
def test_subclass_align(self):
# GH 12983
df1 = tm.SubclassedDataFrame({'a': [1, 3, 5],
'b': [1, 3, 5]}, index=list('ACE'))
df2 = tm.SubclassedDataFrame({'c': [1, 2, 4],
'd': [1, 2, 4]}, index=list('ABD'))
res1, res2 = df1.align(df2, axis=0)
exp1 = tm.SubclassedDataFrame({'a': [1, np.nan, 3, np.nan, 5],
'b': [1, np.nan, 3, np.nan, 5]},
index=list('ABCDE'))
exp2 = tm.SubclassedDataFrame({'c': [1, 2, np.nan, 4, np.nan],
'd': [1, 2, np.nan, 4, np.nan]},
index=list('ABCDE'))
assert isinstance(res1, tm.SubclassedDataFrame)
tm.assert_frame_equal(res1, exp1)
assert isinstance(res2, tm.SubclassedDataFrame)
tm.assert_frame_equal(res2, exp2)
res1, res2 = df1.a.align(df2.c)
assert isinstance(res1, tm.SubclassedSeries)
tm.assert_series_equal(res1, exp1.a)
assert isinstance(res2, tm.SubclassedSeries)
tm.assert_series_equal(res2, exp2.c)
def test_subclass_align_combinations(self):
# GH 12983
df = tm.SubclassedDataFrame({'a': [1, 3, 5],
'b': [1, 3, 5]}, index=list('ACE'))
s = tm.SubclassedSeries([1, 2, 4], index=list('ABD'), name='x')
# frame + series
res1, res2 = df.align(s, axis=0)
exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5],
'b': [1, np.nan, 3, np.nan, 5]},
index=list('ABCDE'))
# name is lost when
exp2 = pd.Series([1, 2, np.nan, 4, np.nan],
index=list('ABCDE'), name='x')
assert isinstance(res1, tm.SubclassedDataFrame)
tm.assert_frame_equal(res1, exp1)
assert isinstance(res2, tm.SubclassedSeries)
tm.assert_series_equal(res2, exp2)
# series + frame
res1, res2 = s.align(df)
assert isinstance(res1, tm.SubclassedSeries)
tm.assert_series_equal(res1, exp2)
assert isinstance(res2, tm.SubclassedDataFrame)
tm.assert_frame_equal(res2, exp1)
def test_subclass_iterrows(self):
# GH 13977
df = tm.SubclassedDataFrame({'a': [1]})
for i, row in df.iterrows():
assert isinstance(row, tm.SubclassedSeries)
tm.assert_series_equal(row, df.loc[i])
def test_subclass_sparse_slice(self):
rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
ssdf = tm.SubclassedSparseDataFrame(rows)
ssdf.testattr = "testattr"
tm.assert_sp_frame_equal(ssdf.loc[:2],
tm.SubclassedSparseDataFrame(rows[:3]))
tm.assert_sp_frame_equal(ssdf.iloc[:2],
tm.SubclassedSparseDataFrame(rows[:2]))
tm.assert_sp_frame_equal(ssdf[:2],
tm.SubclassedSparseDataFrame(rows[:2]))
assert ssdf.loc[:2].testattr == "testattr"
assert ssdf.iloc[:2].testattr == "testattr"
assert ssdf[:2].testattr == "testattr"
tm.assert_sp_series_equal(ssdf.loc[1],
tm.SubclassedSparseSeries(rows[1]),
check_names=False)
tm.assert_sp_series_equal(ssdf.iloc[1],
tm.SubclassedSparseSeries(rows[1]),
check_names=False)
def test_subclass_sparse_transpose(self):
ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3],
[4, 5, 6]])
essdf = tm.SubclassedSparseDataFrame([[1, 4],
[2, 5],
[3, 6]])
tm.assert_sp_frame_equal(ossdf.T, essdf)
def test_subclass_stack(self):
# GH 15564
df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=['a', 'b', 'c'],
columns=['X', 'Y', 'Z'])
res = df.stack()
exp = tm.SubclassedSeries(
[1, 2, 3, 4, 5, 6, 7, 8, 9],
index=[list('aaabbbccc'), list('XYZXYZXYZ')])
tm.assert_series_equal(res, exp)
def test_subclass_stack_multi(self):
# GH 15564
df = tm.SubclassedDataFrame([
[10, 11, 12, 13],
[20, 21, 22, 23],
[30, 31, 32, 33],
[40, 41, 42, 43]],
index=MultiIndex.from_tuples(
list(zip(list('AABB'), list('cdcd'))),
names=['aaa', 'ccc']),
columns=MultiIndex.from_tuples(
list(zip(list('WWXX'), list('yzyz'))),
names=['www', 'yyy']))
exp = tm.SubclassedDataFrame([
[10, 12],
[11, 13],
[20, 22],
[21, 23],
[30, 32],
[31, 33],
[40, 42],
[41, 43]],
index=MultiIndex.from_tuples(list(zip(
list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
names=['aaa', 'ccc', 'yyy']),
columns=Index(['W', 'X'], name='www'))
res = df.stack()
tm.assert_frame_equal(res, exp)
res = df.stack('yyy')
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame([
[10, 11],
[12, 13],
[20, 21],
[22, 23],
[30, 31],
[32, 33],
[40, 41],
[42, 43]],
index=MultiIndex.from_tuples(list(zip(
list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
names=['aaa', 'ccc', 'www']),
columns=Index(['y', 'z'], name='yyy'))
res = df.stack('www')
tm.assert_frame_equal(res, exp)
def test_subclass_stack_multi_mixed(self):
# GH 15564
df = tm.SubclassedDataFrame([
[10, 11, 12.0, 13.0],
[20, 21, 22.0, 23.0],
[30, 31, 32.0, 33.0],
[40, 41, 42.0, 43.0]],
index=MultiIndex.from_tuples(
list(zip(list('AABB'), list('cdcd'))),
names=['aaa', 'ccc']),
columns=MultiIndex.from_tuples(
list(zip(list('WWXX'), list('yzyz'))),
names=['www', 'yyy']))
exp = tm.SubclassedDataFrame([
[10, 12.0],
[11, 13.0],
[20, 22.0],
[21, 23.0],
[30, 32.0],
[31, 33.0],
[40, 42.0],
[41, 43.0]],
index=MultiIndex.from_tuples(list(zip(
list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))),
names=['aaa', 'ccc', 'yyy']),
columns=Index(['W', 'X'], name='www'))
res = df.stack()
tm.assert_frame_equal(res, exp)
res = df.stack('yyy')
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame([
[10.0, 11.0],
[12.0, 13.0],
[20.0, 21.0],
[22.0, 23.0],
[30.0, 31.0],
[32.0, 33.0],
[40.0, 41.0],
[42.0, 43.0]],
index=MultiIndex.from_tuples(list(zip(
list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))),
names=['aaa', 'ccc', 'www']),
columns=Index(['y', 'z'], name='yyy'))
res = df.stack('www')
tm.assert_frame_equal(res, exp)
def test_subclass_unstack(self):
# GH 15564
df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=['a', 'b', 'c'],
columns=['X', 'Y', 'Z'])
res = df.unstack()
exp = tm.SubclassedSeries(
[1, 4, 7, 2, 5, 8, 3, 6, 9],
index=[list('XXXYYYZZZ'), list('abcabcabc')])
tm.assert_series_equal(res, exp)
def test_subclass_unstack_multi(self):
# GH 15564
df = tm.SubclassedDataFrame([
[10, 11, 12, 13],
[20, 21, 22, 23],
[30, 31, 32, 33],
[40, 41, 42, 43]],
index=MultiIndex.from_tuples(
list(zip(list('AABB'), list('cdcd'))),
names=['aaa', 'ccc']),
columns=MultiIndex.from_tuples(
list(zip(list('WWXX'), list('yzyz'))),
names=['www', 'yyy']))
exp = tm.SubclassedDataFrame([
[10, 20, 11, 21, 12, 22, 13, 23],
[30, 40, 31, 41, 32, 42, 33, 43]],
index=Index(['A', 'B'], name='aaa'),
columns=MultiIndex.from_tuples(list(zip(
list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))),
names=['www', 'yyy', 'ccc']))
res = df.unstack()
tm.assert_frame_equal(res, exp)
res = df.unstack('ccc')
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame([
[10, 30, 11, 31, 12, 32, 13, 33],
[20, 40, 21, 41, 22, 42, 23, 43]],
index=Index(['c', 'd'], name='ccc'),
columns=MultiIndex.from_tuples(list(zip(
list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))),
names=['www', 'yyy', 'aaa']))
res = df.unstack('aaa')
tm.assert_frame_equal(res, exp)
def test_subclass_unstack_multi_mixed(self):
# GH 15564
df = tm.SubclassedDataFrame([
[10, 11, 12.0, 13.0],
[20, 21, 22.0, 23.0],
[30, 31, 32.0, 33.0],
[40, 41, 42.0, 43.0]],
index=MultiIndex.from_tuples(
list(zip(list('AABB'), list('cdcd'))),
names=['aaa', 'ccc']),
columns=MultiIndex.from_tuples(
list(zip(list('WWXX'), list('yzyz'))),
names=['www', 'yyy']))
exp = tm.SubclassedDataFrame([
[10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0],
[30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0]],
index=Index(['A', 'B'], name='aaa'),
columns=MultiIndex.from_tuples(list(zip(
list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))),
names=['www', 'yyy', 'ccc']))
res = df.unstack()
tm.assert_frame_equal(res, exp)
res = df.unstack('ccc')
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame([
[10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0],
[20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0]],
index=Index(['c', 'd'], name='ccc'),
columns=MultiIndex.from_tuples(list(zip(
list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))),
names=['www', 'yyy', 'aaa']))
res = df.unstack('aaa')
tm.assert_frame_equal(res, exp)
def test_subclass_pivot(self):
# GH 15564
df = tm.SubclassedDataFrame({
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
'values': [1., 2., 3., 3., 2., 1.]})
pivoted = df.pivot(
index='index', columns='columns', values='values')
expected = tm.SubclassedDataFrame({
'One': {'A': 1., 'B': 2., 'C': 3.},
'Two': {'A': 1., 'B': 2., 'C': 3.}})
expected.index.name, expected.columns.name = 'index', 'columns'
tm.assert_frame_equal(pivoted, expected)
def test_subclassed_melt(self):
# GH 15564
cheese = tm.SubclassedDataFrame({
'first': ['John', 'Mary'],
'last': ['Doe', 'Bo'],
'height': [5.5, 6.0],
'weight': [130, 150]})
melted = pd.melt(cheese, id_vars=['first', 'last'])
expected = tm.SubclassedDataFrame([
['John', 'Doe', 'height', 5.5],
['Mary', 'Bo', 'height', 6.0],
['John', 'Doe', 'weight', 130],
['Mary', 'Bo', 'weight', 150]],
columns=['first', 'last', 'variable', 'value'])
tm.assert_frame_equal(melted, expected)
def test_subclassed_wide_to_long(self):
# GH 9762
np.random.seed(123)
x = np.random.randn(3)
df = tm.SubclassedDataFrame({
"A1970": {0: "a", 1: "b", 2: "c"},
"A1980": {0: "d", 1: "e", 2: "f"},
"B1970": {0: 2.5, 1: 1.2, 2: .7},
"B1980": {0: 3.2, 1: 1.3, 2: .1},
"X": dict(zip(range(3), x))})
df["id"] = df.index
exp_data = {"X": x.tolist() + x.tolist(),
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2]}
expected = tm.SubclassedDataFrame(exp_data)
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year")
tm.assert_frame_equal(long_frame, expected)
def test_subclassed_apply(self):
# GH 19822
def check_row_subclass(row):
assert isinstance(row, tm.SubclassedSeries)
def strech(row):
if row["variable"] == "height":
row["value"] += 0.5
return row
df = tm.SubclassedDataFrame([
['John', 'Doe', 'height', 5.5],
['Mary', 'Bo', 'height', 6.0],
['John', 'Doe', 'weight', 130],
['Mary', 'Bo', 'weight', 150]],
columns=['first', 'last', 'variable', 'value'])
df.apply(lambda x: check_row_subclass(x))
df.apply(lambda x: check_row_subclass(x), axis=1)
expected = tm.SubclassedDataFrame([
['John', 'Doe', 'height', 6.0],
['Mary', 'Bo', 'height', 6.5],
['John', 'Doe', 'weight', 130],
['Mary', 'Bo', 'weight', 150]],
columns=['first', 'last', 'variable', 'value'])
result = df.apply(lambda x: strech(x), axis=1)
assert isinstance(result, tm.SubclassedDataFrame)
tm.assert_frame_equal(result, expected)
expected = tm.SubclassedDataFrame([
[1, 2, 3],
[1, 2, 3],
[1, 2, 3],
[1, 2, 3]])
result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1)
assert isinstance(result, tm.SubclassedDataFrame)
tm.assert_frame_equal(result, expected)
result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
assert isinstance(result, tm.SubclassedDataFrame)
tm.assert_frame_equal(result, expected)
expected = tm.SubclassedSeries([
[1, 2, 3],
[1, 2, 3],
[1, 2, 3],
[1, 2, 3]])
result = df.apply(lambda x: [1, 2, 3], axis=1)
assert not isinstance(result, tm.SubclassedDataFrame)
tm.assert_series_equal(result, expected)
@@ -1,847 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import datetime, time
import pytest
from numpy import nan
from numpy.random import randn
import numpy as np
from pandas import (DataFrame, Series, Index,
Timestamp, DatetimeIndex, MultiIndex,
to_datetime, date_range, period_range)
import pandas as pd
import pandas.tseries.offsets as offsets
from pandas.util.testing import (assert_series_equal,
assert_frame_equal,
assert_index_equal,
assert_raises_regex)
import pandas.util.testing as tm
from pandas.compat import product
from pandas.tests.frame.common import TestData
class TestDataFrameTimeSeriesMethods(TestData):
def test_diff(self):
the_diff = self.tsframe.diff(1)
assert_series_equal(the_diff['A'],
self.tsframe['A'] - self.tsframe['A'].shift(1))
# int dtype
a = 10000000000000000
b = a + 1
s = Series([a, b])
rs = DataFrame({'s': s}).diff()
assert rs.s[1] == 1
# mixed numeric
tf = self.tsframe.astype('float32')
the_diff = tf.diff(1)
assert_series_equal(the_diff['A'],
tf['A'] - tf['A'].shift(1))
# issue 10907
df = pd.DataFrame({'y': pd.Series([2]), 'z': pd.Series([3])})
df.insert(0, 'x', 1)
result = df.diff(axis=1)
expected = pd.DataFrame({'x': np.nan, 'y': pd.Series(
1), 'z': pd.Series(1)}).astype('float64')
assert_frame_equal(result, expected)
@pytest.mark.parametrize('tz', [None, 'UTC'])
def test_diff_datetime_axis0(self, tz):
# GH 18578
df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
1: date_range('2010', freq='D', periods=2, tz=tz)})
result = df.diff(axis=0)
expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']),
1: pd.TimedeltaIndex(['NaT', '1 days'])})
assert_frame_equal(result, expected)
@pytest.mark.parametrize('tz', [None, 'UTC'])
def test_diff_datetime_axis1(self, tz):
# GH 18578
df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
1: date_range('2010', freq='D', periods=2, tz=tz)})
if tz is None:
result = df.diff(axis=1)
expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']),
1: pd.TimedeltaIndex(['0 days',
'0 days'])})
assert_frame_equal(result, expected)
else:
with pytest.raises(NotImplementedError):
result = df.diff(axis=1)
def test_diff_timedelta(self):
# GH 4533
df = DataFrame(dict(time=[Timestamp('20130101 9:01'),
Timestamp('20130101 9:02')],
value=[1.0, 2.0]))
res = df.diff()
exp = DataFrame([[pd.NaT, np.nan],
[pd.Timedelta('00:01:00'), 1]],
columns=['time', 'value'])
assert_frame_equal(res, exp)
def test_diff_mixed_dtype(self):
df = DataFrame(np.random.randn(5, 3))
df['A'] = np.array([1, 2, 3, 4, 5], dtype=object)
result = df.diff()
assert result[0].dtype == np.float64
def test_diff_neg_n(self):
rs = self.tsframe.diff(-1)
xp = self.tsframe - self.tsframe.shift(-1)
assert_frame_equal(rs, xp)
def test_diff_float_n(self):
rs = self.tsframe.diff(1.)
xp = self.tsframe.diff(1)
assert_frame_equal(rs, xp)
def test_diff_axis(self):
# GH 9727
df = DataFrame([[1., 2.], [3., 4.]])
assert_frame_equal(df.diff(axis=1), DataFrame(
[[np.nan, 1.], [np.nan, 1.]]))
assert_frame_equal(df.diff(axis=0), DataFrame(
[[np.nan, np.nan], [2., 2.]]))
def test_pct_change(self):
rs = self.tsframe.pct_change(fill_method=None)
assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1)
rs = self.tsframe.pct_change(2)
filled = self.tsframe.fillna(method='pad')
assert_frame_equal(rs, filled / filled.shift(2) - 1)
rs = self.tsframe.pct_change(fill_method='bfill', limit=1)
filled = self.tsframe.fillna(method='bfill', limit=1)
assert_frame_equal(rs, filled / filled.shift(1) - 1)
rs = self.tsframe.pct_change(freq='5D')
filled = self.tsframe.fillna(method='pad')
assert_frame_equal(rs,
(filled / filled.shift(freq='5D') - 1)
.reindex_like(filled))
def test_pct_change_shift_over_nas(self):
s = Series([1., 1.5, np.nan, 2.5, 3.])
df = DataFrame({'a': s, 'b': s})
chg = df.pct_change()
expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2])
edf = DataFrame({'a': expected, 'b': expected})
assert_frame_equal(chg, edf)
@pytest.mark.parametrize("freq, periods, fill_method, limit",
[('5B', 5, None, None),
('3B', 3, None, None),
('3B', 3, 'bfill', None),
('7B', 7, 'pad', 1),
('7B', 7, 'bfill', 3),
('14B', 14, None, None)])
def test_pct_change_periods_freq(self, freq, periods, fill_method, limit):
# GH 7292
rs_freq = self.tsframe.pct_change(freq=freq,
fill_method=fill_method,
limit=limit)
rs_periods = self.tsframe.pct_change(periods,
fill_method=fill_method,
limit=limit)
assert_frame_equal(rs_freq, rs_periods)
empty_ts = DataFrame(index=self.tsframe.index,
columns=self.tsframe.columns)
rs_freq = empty_ts.pct_change(freq=freq,
fill_method=fill_method,
limit=limit)
rs_periods = empty_ts.pct_change(periods,
fill_method=fill_method,
limit=limit)
assert_frame_equal(rs_freq, rs_periods)
def test_frame_ctor_datetime64_column(self):
rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
dates = np.asarray(rng)
df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates})
assert np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))
def test_frame_add_datetime64_column(self):
rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
df = DataFrame(index=np.arange(len(rng)))
df['A'] = rng
assert np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))
def test_frame_datetime64_pre1900_repr(self):
df = DataFrame({'year': date_range('1/1/1700', periods=50,
freq='A-DEC')})
# it works!
repr(df)
def test_frame_add_datetime64_col_other_units(self):
n = 100
units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y']
ns_dtype = np.dtype('M8[ns]')
for unit in units:
dtype = np.dtype('M8[%s]' % unit)
vals = np.arange(n, dtype=np.int64).view(dtype)
df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
df[unit] = vals
ex_vals = to_datetime(vals.astype('O')).values
assert df[unit].dtype == ns_dtype
assert (df[unit].values == ex_vals).all()
# Test insertion into existing datetime64 column
df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype)
for unit in units:
dtype = np.dtype('M8[%s]' % unit)
vals = np.arange(n, dtype=np.int64).view(dtype)
tmp = df.copy()
tmp['dates'] = vals
ex_vals = to_datetime(vals.astype('O')).values
assert (tmp['dates'].values == ex_vals).all()
def test_shift(self):
# naive shift
shiftedFrame = self.tsframe.shift(5)
tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
shiftedSeries = self.tsframe['A'].shift(5)
assert_series_equal(shiftedFrame['A'], shiftedSeries)
shiftedFrame = self.tsframe.shift(-5)
tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
shiftedSeries = self.tsframe['A'].shift(-5)
assert_series_equal(shiftedFrame['A'], shiftedSeries)
# shift by 0
unshifted = self.tsframe.shift(0)
assert_frame_equal(unshifted, self.tsframe)
# shift by DateOffset
shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay())
assert len(shiftedFrame) == len(self.tsframe)
shiftedFrame2 = self.tsframe.shift(5, freq='B')
assert_frame_equal(shiftedFrame, shiftedFrame2)
d = self.tsframe.index[0]
shifted_d = d + offsets.BDay(5)
assert_series_equal(self.tsframe.xs(d),
shiftedFrame.xs(shifted_d), check_names=False)
# shift int frame
int_shifted = self.intframe.shift(1) # noqa
# Shifting with PeriodIndex
ps = tm.makePeriodFrame()
shifted = ps.shift(1)
unshifted = shifted.shift(-1)
tm.assert_index_equal(shifted.index, ps.index)
tm.assert_index_equal(unshifted.index, ps.index)
tm.assert_numpy_array_equal(unshifted.iloc[:, 0].dropna().values,
ps.iloc[:-1, 0].values)
shifted2 = ps.shift(1, 'B')
shifted3 = ps.shift(1, offsets.BDay())
assert_frame_equal(shifted2, shifted3)
assert_frame_equal(ps, shifted2.shift(-1, 'B'))
tm.assert_raises_regex(ValueError,
'does not match PeriodIndex freq',
ps.shift, freq='D')
# shift other axis
# GH 6371
df = DataFrame(np.random.rand(10, 5))
expected = pd.concat([DataFrame(np.nan, index=df.index,
columns=[0]),
df.iloc[:, 0:-1]],
ignore_index=True, axis=1)
result = df.shift(1, axis=1)
assert_frame_equal(result, expected)
# shift named axis
df = DataFrame(np.random.rand(10, 5))
expected = pd.concat([DataFrame(np.nan, index=df.index,
columns=[0]),
df.iloc[:, 0:-1]],
ignore_index=True, axis=1)
result = df.shift(1, axis='columns')
assert_frame_equal(result, expected)
def test_shift_bool(self):
df = DataFrame({'high': [True, False],
'low': [False, False]})
rs = df.shift(1)
xp = DataFrame(np.array([[np.nan, np.nan],
[True, False]], dtype=object),
columns=['high', 'low'])
assert_frame_equal(rs, xp)
def test_shift_categorical(self):
# GH 9416
s1 = pd.Series(['a', 'b', 'c'], dtype='category')
s2 = pd.Series(['A', 'B', 'C'], dtype='category')
df = DataFrame({'one': s1, 'two': s2})
rs = df.shift(1)
xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)})
assert_frame_equal(rs, xp)
def test_shift_empty(self):
# Regression test for #8019
df = DataFrame({'foo': []})
rs = df.shift(-1)
assert_frame_equal(df, rs)
def test_shift_duplicate_columns(self):
# GH 9092; verify that position-based shifting works
# in the presence of duplicate columns
column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]]
data = np.random.randn(20, 5)
shifted = []
for columns in column_lists:
df = pd.DataFrame(data.copy(), columns=columns)
for s in range(5):
df.iloc[:, s] = df.iloc[:, s].shift(s + 1)
df.columns = range(5)
shifted.append(df)
# sanity check the base case
nulls = shifted[0].isna().sum()
assert_series_equal(nulls, Series(range(1, 6), dtype='int64'))
# check all answers are the same
assert_frame_equal(shifted[0], shifted[1])
assert_frame_equal(shifted[0], shifted[2])
def test_tshift(self):
# PeriodIndex
ps = tm.makePeriodFrame()
shifted = ps.tshift(1)
unshifted = shifted.tshift(-1)
assert_frame_equal(unshifted, ps)
shifted2 = ps.tshift(freq='B')
assert_frame_equal(shifted, shifted2)
shifted3 = ps.tshift(freq=offsets.BDay())
assert_frame_equal(shifted, shifted3)
tm.assert_raises_regex(
ValueError, 'does not match', ps.tshift, freq='M')
# DatetimeIndex
shifted = self.tsframe.tshift(1)
unshifted = shifted.tshift(-1)
assert_frame_equal(self.tsframe, unshifted)
shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq)
assert_frame_equal(shifted, shifted2)
inferred_ts = DataFrame(self.tsframe.values,
Index(np.asarray(self.tsframe.index)),
columns=self.tsframe.columns)
shifted = inferred_ts.tshift(1)
unshifted = shifted.tshift(-1)
assert_frame_equal(shifted, self.tsframe.tshift(1))
assert_frame_equal(unshifted, inferred_ts)
no_freq = self.tsframe.iloc[[0, 5, 7], :]
pytest.raises(ValueError, no_freq.tshift)
def test_truncate(self):
ts = self.tsframe[::3]
start, end = self.tsframe.index[3], self.tsframe.index[6]
start_missing = self.tsframe.index[2]
end_missing = self.tsframe.index[7]
# neither specified
truncated = ts.truncate()
assert_frame_equal(truncated, ts)
# both specified
expected = ts[1:3]
truncated = ts.truncate(start, end)
assert_frame_equal(truncated, expected)
truncated = ts.truncate(start_missing, end_missing)
assert_frame_equal(truncated, expected)
# start specified
expected = ts[1:]
truncated = ts.truncate(before=start)
assert_frame_equal(truncated, expected)
truncated = ts.truncate(before=start_missing)
assert_frame_equal(truncated, expected)
# end specified
expected = ts[:3]
truncated = ts.truncate(after=end)
assert_frame_equal(truncated, expected)
truncated = ts.truncate(after=end_missing)
assert_frame_equal(truncated, expected)
pytest.raises(ValueError, ts.truncate,
before=ts.index[-1] - 1,
after=ts.index[0] + 1)
def test_truncate_copy(self):
index = self.tsframe.index
truncated = self.tsframe.truncate(index[5], index[10])
truncated.values[:] = 5.
assert not (self.tsframe.values[5:11] == 5).any()
def test_truncate_nonsortedindex(self):
# GH 17935
df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e']},
index=[5, 3, 2, 9, 0])
with tm.assert_raises_regex(ValueError,
'truncate requires a sorted index'):
df.truncate(before=3, after=9)
rng = pd.date_range('2011-01-01', '2012-01-01', freq='W')
ts = pd.DataFrame({'A': np.random.randn(len(rng)),
'B': np.random.randn(len(rng))},
index=rng)
with tm.assert_raises_regex(ValueError,
'truncate requires a sorted index'):
ts.sort_values('A', ascending=False).truncate(before='2011-11',
after='2011-12')
df = pd.DataFrame({3: np.random.randn(5),
20: np.random.randn(5),
2: np.random.randn(5),
0: np.random.randn(5)},
columns=[3, 20, 2, 0])
with tm.assert_raises_regex(ValueError,
'truncate requires a sorted index'):
df.truncate(before=2, after=20, axis=1)
def test_asfreq(self):
offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd())
rule_monthly = self.tsframe.asfreq('BM')
tm.assert_almost_equal(offset_monthly['A'], rule_monthly['A'])
filled = rule_monthly.asfreq('B', method='pad') # noqa
# TODO: actually check that this worked.
# don't forget!
filled_dep = rule_monthly.asfreq('B', method='pad') # noqa
# test does not blow up on length-0 DataFrame
zero_length = self.tsframe.reindex([])
result = zero_length.asfreq('BM')
assert result is not zero_length
def test_asfreq_datetimeindex(self):
df = DataFrame({'A': [1, 2, 3]},
index=[datetime(2011, 11, 1), datetime(2011, 11, 2),
datetime(2011, 11, 3)])
df = df.asfreq('B')
assert isinstance(df.index, DatetimeIndex)
ts = df['A'].asfreq('B')
assert isinstance(ts.index, DatetimeIndex)
def test_asfreq_fillvalue(self):
# test for fill value during upsampling, related to issue 3715
# setup
rng = pd.date_range('1/1/2016', periods=10, freq='2S')
ts = pd.Series(np.arange(len(rng)), index=rng)
df = pd.DataFrame({'one': ts})
# insert pre-existing missing value
df.loc['2016-01-01 00:00:08', 'one'] = None
actual_df = df.asfreq(freq='1S', fill_value=9.0)
expected_df = df.asfreq(freq='1S').fillna(9.0)
expected_df.loc['2016-01-01 00:00:08', 'one'] = None
assert_frame_equal(expected_df, actual_df)
expected_series = ts.asfreq(freq='1S').fillna(9.0)
actual_series = ts.asfreq(freq='1S', fill_value=9.0)
assert_series_equal(expected_series, actual_series)
@pytest.mark.parametrize("data,idx,expected_first,expected_last", [
({'A': [1, 2, 3]}, [1, 1, 2], 1, 2),
({'A': [1, 2, 3]}, [1, 2, 2], 1, 2),
({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'),
({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2),
({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2),
({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)])
def test_first_last_valid(self, data, idx,
expected_first, expected_last):
N = len(self.frame.index)
mat = randn(N)
mat[:5] = nan
mat[-5:] = nan
frame = DataFrame({'foo': mat}, index=self.frame.index)
index = frame.first_valid_index()
assert index == frame.index[5]
index = frame.last_valid_index()
assert index == frame.index[-6]
# GH12800
empty = DataFrame()
assert empty.last_valid_index() is None
assert empty.first_valid_index() is None
# GH17400: no valid entries
frame[:] = nan
assert frame.last_valid_index() is None
assert frame.first_valid_index() is None
# GH20499: its preserves freq with holes
frame.index = date_range("20110101", periods=N, freq="B")
frame.iloc[1] = 1
frame.iloc[-2] = 1
assert frame.first_valid_index() == frame.index[1]
assert frame.last_valid_index() == frame.index[-2]
assert frame.first_valid_index().freq == frame.index.freq
assert frame.last_valid_index().freq == frame.index.freq
# GH 21441
df = DataFrame(data, index=idx)
assert expected_first == df.first_valid_index()
assert expected_last == df.last_valid_index()
def test_first_subset(self):
ts = tm.makeTimeDataFrame(freq='12h')
result = ts.first('10d')
assert len(result) == 20
ts = tm.makeTimeDataFrame(freq='D')
result = ts.first('10d')
assert len(result) == 10
result = ts.first('3M')
expected = ts[:'3/31/2000']
assert_frame_equal(result, expected)
result = ts.first('21D')
expected = ts[:21]
assert_frame_equal(result, expected)
result = ts[:0].first('3M')
assert_frame_equal(result, ts[:0])
def test_first_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.first('1D')
def test_last_subset(self):
ts = tm.makeTimeDataFrame(freq='12h')
result = ts.last('10d')
assert len(result) == 20
ts = tm.makeTimeDataFrame(nper=30, freq='D')
result = ts.last('10d')
assert len(result) == 10
result = ts.last('21D')
expected = ts['2000-01-10':]
assert_frame_equal(result, expected)
result = ts.last('21D')
expected = ts[-21:]
assert_frame_equal(result, expected)
result = ts[:0].last('3M')
assert_frame_equal(result, ts[:0])
def test_last_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.last('1D')
def test_at_time(self):
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
rs = ts.at_time(rng[1])
assert (rs.index.hour == rng[1].hour).all()
assert (rs.index.minute == rng[1].minute).all()
assert (rs.index.second == rng[1].second).all()
result = ts.at_time('9:30')
expected = ts.at_time(time(9, 30))
assert_frame_equal(result, expected)
result = ts.loc[time(9, 30)]
expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)]
assert_frame_equal(result, expected)
# midnight, everything
rng = date_range('1/1/2000', '1/31/2000')
ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
result = ts.at_time(time(0, 0))
assert_frame_equal(result, ts)
# time doesn't exist
rng = date_range('1/1/2012', freq='23Min', periods=384)
ts = DataFrame(np.random.randn(len(rng), 2), rng)
rs = ts.at_time('16:00')
assert len(rs) == 0
def test_at_time_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.at_time('00:00')
def test_between_time(self):
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
stime = time(0, 0)
etime = time(1, 0)
close_open = product([True, False], [True, False])
for inc_start, inc_end in close_open:
filtered = ts.between_time(stime, etime, inc_start, inc_end)
exp_len = 13 * 4 + 1
if not inc_start:
exp_len -= 5
if not inc_end:
exp_len -= 4
assert len(filtered) == exp_len
for rs in filtered.index:
t = rs.time()
if inc_start:
assert t >= stime
else:
assert t > stime
if inc_end:
assert t <= etime
else:
assert t < etime
result = ts.between_time('00:00', '01:00')
expected = ts.between_time(stime, etime)
assert_frame_equal(result, expected)
# across midnight
rng = date_range('1/1/2000', '1/5/2000', freq='5min')
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
stime = time(22, 0)
etime = time(9, 0)
close_open = product([True, False], [True, False])
for inc_start, inc_end in close_open:
filtered = ts.between_time(stime, etime, inc_start, inc_end)
exp_len = (12 * 11 + 1) * 4 + 1
if not inc_start:
exp_len -= 4
if not inc_end:
exp_len -= 4
assert len(filtered) == exp_len
for rs in filtered.index:
t = rs.time()
if inc_start:
assert (t >= stime) or (t <= etime)
else:
assert (t > stime) or (t <= etime)
if inc_end:
assert (t <= etime) or (t >= stime)
else:
assert (t < etime) or (t >= stime)
def test_between_time_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.between_time(start_time='00:00', end_time='12:00')
def test_operation_on_NaT(self):
# Both NaT and Timestamp are in DataFrame.
df = pd.DataFrame({'foo': [pd.NaT, pd.NaT,
pd.Timestamp('2012-05-01')]})
res = df.min()
exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
tm.assert_series_equal(res, exp)
res = df.max()
exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"])
tm.assert_series_equal(res, exp)
# GH12941, only NaTs are in DataFrame.
df = pd.DataFrame({'foo': [pd.NaT, pd.NaT]})
res = df.min()
exp = pd.Series([pd.NaT], index=["foo"])
tm.assert_series_equal(res, exp)
res = df.max()
exp = pd.Series([pd.NaT], index=["foo"])
tm.assert_series_equal(res, exp)
def test_datetime_assignment_with_NaT_and_diff_time_units(self):
# GH 7492
data_ns = np.array([1, 'nat'], dtype='datetime64[ns]')
result = pd.Series(data_ns).to_frame()
result['new'] = data_ns
expected = pd.DataFrame({0: [1, None],
'new': [1, None]}, dtype='datetime64[ns]')
tm.assert_frame_equal(result, expected)
# OutOfBoundsDatetime error shouldn't occur
data_s = np.array([1, 'nat'], dtype='datetime64[s]')
result['new'] = data_s
expected = pd.DataFrame({0: [1, None],
'new': [1e9, None]}, dtype='datetime64[ns]')
tm.assert_frame_equal(result, expected)
def test_frame_to_period(self):
K = 5
from pandas.core.indexes.period import period_range
dr = date_range('1/1/2000', '1/1/2001')
pr = period_range('1/1/2000', '1/1/2001')
df = DataFrame(randn(len(dr), K), index=dr)
df['mix'] = 'a'
pts = df.to_period()
exp = df.copy()
exp.index = pr
assert_frame_equal(pts, exp)
pts = df.to_period('M')
tm.assert_index_equal(pts.index, exp.index.asfreq('M'))
df = df.T
pts = df.to_period(axis=1)
exp = df.copy()
exp.columns = pr
assert_frame_equal(pts, exp)
pts = df.to_period('M', axis=1)
tm.assert_index_equal(pts.columns, exp.columns.asfreq('M'))
pytest.raises(ValueError, df.to_period, axis=2)
@pytest.mark.parametrize("fn", ['tz_localize', 'tz_convert'])
def test_tz_convert_and_localize(self, fn):
l0 = date_range('20140701', periods=5, freq='D')
# TODO: l1 should be a PeriodIndex for testing
# after GH2106 is addressed
with pytest.raises(NotImplementedError):
period_range('20140701', periods=1).tz_convert('UTC')
with pytest.raises(NotImplementedError):
period_range('20140701', periods=1).tz_localize('UTC')
# l1 = period_range('20140701', periods=5, freq='D')
l1 = date_range('20140701', periods=5, freq='D')
int_idx = Index(range(5))
if fn == 'tz_convert':
l0 = l0.tz_localize('UTC')
l1 = l1.tz_localize('UTC')
for idx in [l0, l1]:
l0_expected = getattr(idx, fn)('US/Pacific')
l1_expected = getattr(idx, fn)('US/Pacific')
df1 = DataFrame(np.ones(5), index=l0)
df1 = getattr(df1, fn)('US/Pacific')
assert_index_equal(df1.index, l0_expected)
# MultiIndex
# GH7846
df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1]))
df3 = getattr(df2, fn)('US/Pacific', level=0)
assert not df3.index.levels[0].equals(l0)
assert_index_equal(df3.index.levels[0], l0_expected)
assert_index_equal(df3.index.levels[1], l1)
assert not df3.index.levels[1].equals(l1_expected)
df3 = getattr(df2, fn)('US/Pacific', level=1)
assert_index_equal(df3.index.levels[0], l0)
assert not df3.index.levels[0].equals(l0_expected)
assert_index_equal(df3.index.levels[1], l1_expected)
assert not df3.index.levels[1].equals(l1)
df4 = DataFrame(np.ones(5),
MultiIndex.from_arrays([int_idx, l0]))
# TODO: untested
df5 = getattr(df4, fn)('US/Pacific', level=1) # noqa
assert_index_equal(df3.index.levels[0], l0)
assert not df3.index.levels[0].equals(l0_expected)
assert_index_equal(df3.index.levels[1], l1_expected)
assert not df3.index.levels[1].equals(l1)
# Bad Inputs
# Not DatetimeIndex / PeriodIndex
with assert_raises_regex(TypeError, 'DatetimeIndex'):
df = DataFrame(index=int_idx)
df = getattr(df, fn)('US/Pacific')
# Not DatetimeIndex / PeriodIndex
with assert_raises_regex(TypeError, 'DatetimeIndex'):
df = DataFrame(np.ones(5),
MultiIndex.from_arrays([int_idx, l0]))
df = getattr(df, fn)('US/Pacific', level=0)
# Invalid level
with assert_raises_regex(ValueError, 'not valid'):
df = DataFrame(index=l0)
df = getattr(df, fn)('US/Pacific', level=1)
@@ -1,145 +0,0 @@
# -*- coding: utf-8 -*-
"""
Tests for DataFrame timezone-related methods
"""
from datetime import datetime
import pytest
import pytz
import numpy as np
import pandas.util.testing as tm
from pandas.compat import lrange
from pandas.core.indexes.datetimes import date_range
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas import Series, DataFrame
class TestDataFrameTimezones(object):
def test_frame_from_records_utc(self):
rec = {'datum': 1.5,
'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)}
# it works
DataFrame.from_records([rec], index='begin_time')
def test_frame_tz_localize(self):
rng = date_range('1/1/2011', periods=100, freq='H')
df = DataFrame({'a': 1}, index=rng)
result = df.tz_localize('utc')
expected = DataFrame({'a': 1}, rng.tz_localize('UTC'))
assert result.index.tz.zone == 'UTC'
tm.assert_frame_equal(result, expected)
df = df.T
result = df.tz_localize('utc', axis=1)
assert result.columns.tz.zone == 'UTC'
tm.assert_frame_equal(result, expected.T)
def test_frame_tz_convert(self):
rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern')
df = DataFrame({'a': 1}, index=rng)
result = df.tz_convert('Europe/Berlin')
expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin'))
assert result.index.tz.zone == 'Europe/Berlin'
tm.assert_frame_equal(result, expected)
df = df.T
result = df.tz_convert('Europe/Berlin', axis=1)
assert result.columns.tz.zone == 'Europe/Berlin'
tm.assert_frame_equal(result, expected.T)
def test_frame_join_tzaware(self):
test1 = DataFrame(np.zeros((6, 3)),
index=date_range("2012-11-15 00:00:00", periods=6,
freq="100L", tz="US/Central"))
test2 = DataFrame(np.zeros((3, 3)),
index=date_range("2012-11-15 00:00:00", periods=3,
freq="250L", tz="US/Central"),
columns=lrange(3, 6))
result = test1.join(test2, how='outer')
ex_index = test1.index.union(test2.index)
tm.assert_index_equal(result.index, ex_index)
assert result.index.tz.zone == 'US/Central'
def test_frame_add_tz_mismatch_converts_to_utc(self):
rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern')
df = DataFrame(np.random.randn(len(rng)), index=rng, columns=['a'])
df_moscow = df.tz_convert('Europe/Moscow')
result = df + df_moscow
assert result.index.tz is pytz.utc
result = df_moscow + df
assert result.index.tz is pytz.utc
def test_frame_align_aware(self):
idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern')
idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern')
df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
new1, new2 = df1.align(df2)
assert df1.index.tz == new1.index.tz
assert df2.index.tz == new2.index.tz
# different timezones convert to UTC
# frame with frame
df1_central = df1.tz_convert('US/Central')
new1, new2 = df1.align(df1_central)
assert new1.index.tz == pytz.UTC
assert new2.index.tz == pytz.UTC
# frame with Series
new1, new2 = df1.align(df1_central[0], axis=0)
assert new1.index.tz == pytz.UTC
assert new2.index.tz == pytz.UTC
df1[0].align(df1_central, axis=0)
assert new1.index.tz == pytz.UTC
assert new2.index.tz == pytz.UTC
@pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
def test_frame_no_datetime64_dtype(self, tz):
# after GH#7822
# these retain the timezones on dict construction
dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
dr_tz = dr.tz_localize(tz)
df = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr)
tz_expected = DatetimeTZDtype('ns', dr_tz.tzinfo)
assert df['B'].dtype == tz_expected
# GH#2810 (with timezones)
datetimes_naive = [ts.to_pydatetime() for ts in dr]
datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
df = DataFrame({'dr': dr,
'dr_tz': dr_tz,
'datetimes_naive': datetimes_naive,
'datetimes_with_tz': datetimes_with_tz})
result = df.get_dtype_counts().sort_index()
expected = Series({'datetime64[ns]': 2,
str(tz_expected): 2}).sort_index()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern'])
def test_frame_reset_index(self, tz):
dr = date_range('2012-06-02', periods=10, tz=tz)
df = DataFrame(np.random.randn(len(dr)), dr)
roundtripped = df.reset_index().set_index('index')
xp = df.index.tz
rs = roundtripped.index.tz
assert xp == rs
@pytest.mark.parametrize('tz', [None, 'America/New_York'])
def test_boolean_compare_transpose_tzindex_with_dst(self, tz):
# GH 19970
idx = date_range('20161101', '20161130', freq='4H', tz=tz)
df = DataFrame({'a': range(len(idx)), 'b': range(len(idx))},
index=idx)
result = df.T == df.T
expected = DataFrame(True, index=list('ab'), columns=idx)
tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large Load Diff
@@ -1,33 +0,0 @@
from pandas.core.frame import DataFrame
import pytest
import pandas.util.testing as tm
@pytest.fixture
def dataframe():
return DataFrame({'a': [1, 2], 'b': [3, 4]})
class TestDataFrameValidate(object):
"""Tests for error handling related to data types of method arguments."""
@pytest.mark.parametrize("func", ["query", "eval", "set_index",
"reset_index", "dropna",
"drop_duplicates", "sort_values"])
@pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0])
def test_validate_bool_args(self, dataframe, func, inplace):
msg = "For argument \"inplace\" expected type bool"
kwargs = dict(inplace=inplace)
if func == "query":
kwargs["expr"] = "a > b"
elif func == "eval":
kwargs["expr"] = "a + b"
elif func == "set_index":
kwargs["keys"] = ["a"]
elif func == "sort_values":
kwargs["by"] = ["a"]
with tm.assert_raises_regex(ValueError, msg):
getattr(dataframe, func)(**kwargs)