started work on backend

This commit is contained in:
d3m1g0d
2019-01-21 17:36:00 +01:00
parent a1a8bca34b
commit 9f9a7e4974
4032 changed files with 745079 additions and 0 deletions
@@ -0,0 +1,74 @@
import pytest
from pandas.io.parsers import read_table
@pytest.fixture
def tips_file(datapath):
    """Path to the tips dataset (``tips.csv``), resolved via ``datapath``."""
    tips_path = datapath('io', 'parser', 'data', 'tips.csv')
    return tips_path
@pytest.fixture
def jsonl_file(datapath):
    """Path to a JSONL dataset (``items.jsonl``), resolved via ``datapath``."""
    jsonl_path = datapath('io', 'parser', 'data', 'items.jsonl')
    return jsonl_path
@pytest.fixture
def salaries_table(datapath):
    """DataFrame with the salaries dataset, loaded with ``read_table``."""
    salaries_path = datapath('io', 'parser', 'data', 'salaries.csv')
    return read_table(salaries_path)
@pytest.fixture
def s3_resource(tips_file, jsonl_file):
    """Fixture for mocking S3 interaction.

    The primary bucket name is "pandas-test". The following datasets
    are loaded.

    - tips.csv
    - tips.csv.gz
    - tips.csv.bz2
    - items.jsonl

    A private bucket "cant_get_it" is also created. The boto3 s3 resource
    is yielded by the fixture.

    The requesting test is skipped when ``s3fs``, ``boto3`` or ``moto``
    cannot be imported, or when the mocked service cannot be created,
    started or populated.
    """
    pytest.importorskip('s3fs')
    boto3 = pytest.importorskip('boto3')
    moto = pytest.importorskip('moto')

    test_s3_files = [
        ('tips.csv', tips_file),
        ('tips.csv.gz', tips_file + '.gz'),
        ('tips.csv.bz2', tips_file + '.bz2'),
        ('items.jsonl', jsonl_file),
    ]

    def add_tips_files(conn, bucket_name):
        # Upload every dataset listed above into the named bucket.
        target = conn.Bucket(bucket_name)
        for s3_key, file_name in test_s3_files:
            with open(file_name, 'rb') as f:
                target.put_object(Key=s3_key, Body=f)

    # Create the mock before entering the try/finally below so that the
    # cleanup never refers to an unbound name when creation itself fails.
    try:
        s3 = moto.mock_s3()
    except Exception:
        pytest.skip("failure to use s3 resource")

    try:
        try:
            s3.start()

            # see gh-16135
            bucket = 'pandas-test'
            conn = boto3.resource("s3", region_name="us-east-1")

            conn.create_bucket(Bucket=bucket)
            add_tips_files(conn, bucket)

            conn.create_bucket(Bucket='cant_get_it', ACL='private')
            add_tips_files(conn, 'cant_get_it')
        except Exception:
            # Only set-up problems are turned into a skip; pytest outcomes,
            # GeneratorExit and KeyboardInterrupt derive from BaseException
            # and therefore propagate untouched.
            pytest.skip("failure to use s3 resource")

        # The yield sits outside the except clause so that anything thrown
        # into the generator is not converted into a skip.
        yield conn
    finally:
        s3.stop()
@@ -0,0 +1,186 @@
import pytest
from pandas.util import testing as tm
from pandas.io.formats.css import CSSResolver, CSSWarning
def assert_resolves(css, props, inherited=None):
    """Assert that resolving ``css`` yields exactly the mapping ``props``.

    ``inherited`` is forwarded unchanged to the resolver call.
    """
    actual = CSSResolver()(css, inherited=inherited)
    assert props == actual
def assert_same_resolution(css1, css2, inherited=None):
    """Assert that ``css1`` and ``css2`` resolve to equal property mappings.

    Both declarations are resolved by the same ``CSSResolver`` instance and
    with the same ``inherited`` argument.
    """
    resolver = CSSResolver()
    first, second = [resolver(css, inherited=inherited)
                     for css in (css1, css2)]
    assert first == second
@pytest.mark.parametrize('name,norm,abnorm', [
    ('whitespace',
     'hello: world; foo: bar',
     ' \t hello \t :\n world \n ; \n foo: \tbar\n\n'),
    ('case',
     'hello: world; foo: bar',
     'Hello: WORLD; foO: bar'),
    ('empty-decl',
     'hello: world; foo: bar',
     '; hello: world;; foo: bar;\n; ;'),
    ('empty-list',
     '',
     ';'),
])
def test_css_parse_normalisation(name, norm, abnorm):
    """A non-normalised declaration resolves like its normalised form.

    ``name`` only labels the case; it is not used by the check itself.
    """
    assert_same_resolution(norm, abnorm)
@pytest.mark.parametrize('invalid_css,remainder', [
    # No colon
    ('hello-world', ''),
    ('border-style: solid; hello-world', 'border-style: solid'),
    ('border-style: solid; hello-world; font-weight: bold',
     'border-style: solid; font-weight: bold'),
    # Unclosed string fail
    # Invalid size
    ('font-size: blah', 'font-size: 1em'),
    ('font-size: 1a2b', 'font-size: 1em'),
    ('font-size: 1e5pt', 'font-size: 1em'),
    ('font-size: 1+6pt', 'font-size: 1em'),
    ('font-size: 1unknownunit', 'font-size: 1em'),
    ('font-size: 10', 'font-size: 1em'),
    ('font-size: 10 pt', 'font-size: 1em'),
])
def test_css_parse_invalid(invalid_css, remainder):
    """Invalid CSS emits ``CSSWarning`` and resolves like ``remainder``."""
    expect_warning = tm.assert_produces_warning(CSSWarning)
    with expect_warning:
        assert_same_resolution(invalid_css, remainder)

    # TODO: we should be checking that in other cases no warnings are raised
@pytest.mark.parametrize('shorthand,expansions', [
    ('margin',
     ['margin-top', 'margin-right', 'margin-bottom', 'margin-left']),
    ('padding',
     ['padding-top', 'padding-right', 'padding-bottom', 'padding-left']),
    ('border-width',
     ['border-top-width', 'border-right-width',
      'border-bottom-width', 'border-left-width']),
    ('border-color',
     ['border-top-color', 'border-right-color',
      'border-bottom-color', 'border-left-color']),
    ('border-style',
     ['border-top-style', 'border-right-style',
      'border-bottom-style', 'border-left-style']),
])
def test_css_side_shorthands(shorthand, expansions):
    """Side shorthands with 1-4 values expand to the four side properties.

    A fifth value is expected to raise ``CSSWarning`` and resolve to nothing.
    """
    top, right, bottom, left = expansions
    sides = (top, right, bottom, left)

    # One value: applied to every side.
    one_value = '{shorthand}: 1pt'.format(shorthand=shorthand)
    assert_resolves(one_value,
                    dict(zip(sides, ('1pt', '1pt', '1pt', '1pt'))))

    # Two values: top/bottom, then right/left.
    two_values = '{shorthand}: 1pt 4pt'.format(shorthand=shorthand)
    assert_resolves(two_values,
                    dict(zip(sides, ('1pt', '4pt', '1pt', '4pt'))))

    # Three values: top, right/left, bottom.
    three_values = '{shorthand}: 1pt 4pt 2pt'.format(shorthand=shorthand)
    assert_resolves(three_values,
                    dict(zip(sides, ('1pt', '4pt', '2pt', '4pt'))))

    # Four values: top, right, bottom, left.
    four_values = '{shorthand}: 1pt 4pt 2pt 0pt'.format(shorthand=shorthand)
    assert_resolves(four_values,
                    dict(zip(sides, ('1pt', '4pt', '2pt', '0pt'))))

    too_many = '{shorthand}: 1pt 1pt 1pt 1pt 1pt'.format(shorthand=shorthand)
    with tm.assert_produces_warning(CSSWarning):
        assert_resolves(too_many, {})
@pytest.mark.parametrize('style,inherited,equiv', [
    ('margin: 1px; margin: 2px',
     '',
     'margin: 2px'),
    ('margin: 1px',
     'margin: 2px',
     'margin: 1px'),
    ('margin: 1px; margin: inherit',
     'margin: 2px',
     'margin: 2px'),
    ('margin: 1px; margin-top: 2px',
     '',
     'margin-left: 1px; margin-right: 1px; '
     'margin-bottom: 1px; margin-top: 2px'),
    ('margin-top: 2px',
     'margin: 1px',
     'margin: 1px; margin-top: 2px'),
    ('margin: 1px',
     'margin-top: 2px',
     'margin: 1px'),
    ('margin: 1px; margin-top: inherit',
     'margin: 2px',
     'margin: 1px; margin-top: 2px'),
])
def test_css_precedence(style, inherited, equiv):
    """``style`` resolved on top of ``inherited`` equals ``equiv`` alone."""
    resolver = CSSResolver()
    parent_props = resolver(inherited)
    actual = resolver(style, inherited=parent_props)
    expected = resolver(equiv)
    assert actual == expected
@pytest.mark.parametrize('style,equiv', [
    ('margin: 1px; margin-top: inherit',
     'margin-bottom: 1px; margin-right: 1px; margin-left: 1px'),
    ('margin-top: inherit',
     ''),
    ('margin-top: initial',
     ''),
])
def test_css_none_absent(style, equiv):
    """With nothing inherited, ``style`` must resolve exactly like ``equiv``."""
    assert_same_resolution(style, equiv)
@pytest.mark.parametrize('size,resolved', [
    ('xx-small', '6pt'),
    ('x-small', '{pt:f}pt'.format(pt=7.5)),
    ('small', '{pt:f}pt'.format(pt=9.6)),
    ('medium', '12pt'),
    ('large', '{pt:f}pt'.format(pt=13.5)),
    ('x-large', '18pt'),
    ('xx-large', '24pt'),

    ('8px', '6pt'),
    ('1.25pc', '15pt'),
    ('.25in', '18pt'),
    ('02.54cm', '72pt'),
    ('25.4mm', '72pt'),
    # This case used to be listed twice, which only ran the same check again.
    ('101.6q', '72pt'),
])
@pytest.mark.parametrize('relative_to',  # invariant to inherited size
                         [None, '16pt'])
def test_css_absolute_font_size(size, relative_to, resolved):
    """Absolute font sizes resolve to ``resolved`` whatever is inherited.

    Each ``size`` is checked both without inherited properties and with an
    inherited ``font-size`` of ``relative_to``.
    """
    inherited = None if relative_to is None else {'font-size': relative_to}
    assert_resolves('font-size: {size}'.format(size=size),
                    {'font-size': resolved}, inherited=inherited)
@pytest.mark.parametrize('size,relative_to,resolved', [
    # em: relative to the inherited size
    ('1em', None, '12pt'),
    ('1.0em', None, '12pt'),
    ('1.25em', None, '15pt'),
    ('1em', '16pt', '16pt'),
    ('1.0em', '16pt', '16pt'),
    ('1.25em', '16pt', '20pt'),
    # rem
    ('1rem', '16pt', '12pt'),
    ('1.0rem', '16pt', '12pt'),
    ('1.25rem', '16pt', '15pt'),
    # percentages
    ('100%', None, '12pt'),
    ('125%', None, '15pt'),
    ('100%', '16pt', '16pt'),
    ('125%', '16pt', '20pt'),
    # ex
    ('2ex', None, '12pt'),
    ('2.0ex', None, '12pt'),
    ('2.50ex', None, '15pt'),
    # keywords
    ('inherit', '16pt', '16pt'),
    ('smaller', None, '10pt'),
    ('smaller', '18pt', '15pt'),
    ('larger', None, '{pt:f}pt'.format(pt=14.4)),
    ('larger', '15pt', '18pt'),
])
def test_css_relative_font_size(size, relative_to, resolved):
    """Relative font sizes resolve to ``resolved`` given ``relative_to``.

    ``relative_to`` of ``None`` means no inherited properties are passed.
    """
    inherited = None
    if relative_to is not None:
        inherited = {'font-size': relative_to}
    declaration = 'font-size: {size}'.format(size=size)
    assert_resolves(declaration, {'font-size': resolved},
                    inherited=inherited)
@@ -0,0 +1,193 @@
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas.compat import u
import pandas.io.formats.format as fmt
from pandas.util import testing as tm
class TestEngFormatter(object):
    """Tests for ``fmt.EngFormatter`` and ``fmt.set_eng_float_format``."""

    def test_eng_float_formatter(self):
        """``set_eng_float_format`` drives how ``DataFrame.to_string`` renders."""
        df = DataFrame({'A': [1.41, 141., 14100, 1410000.]})

        # The float format is a global display option: always restore it,
        # even when one of the assertions below fails.
        try:
            fmt.set_eng_float_format()
            result = df.to_string()
            # Values are right-aligned in the column, hence the padding.
            expected = ('             A\n'
                        '0    1.410E+00\n'
                        '1  141.000E+00\n'
                        '2   14.100E+03\n'
                        '3    1.410E+06')
            assert result == expected

            fmt.set_eng_float_format(use_eng_prefix=True)
            result = df.to_string()
            expected = ('         A\n'
                        '0    1.410\n'
                        '1  141.000\n'
                        '2  14.100k\n'
                        '3   1.410M')
            assert result == expected

            fmt.set_eng_float_format(accuracy=0)
            result = df.to_string()
            expected = ('         A\n'
                        '0    1E+00\n'
                        '1  141E+00\n'
                        '2   14E+03\n'
                        '3    1E+06')
            assert result == expected
        finally:
            tm.reset_display_options()

    def compare(self, formatter, input, output):
        """Assert that ``formatter(input)`` equals ``output``."""
        formatted_input = formatter(input)
        assert formatted_input == output

    def compare_all(self, formatter, in_out):
        """
        Parameters:
        -----------
        formatter: EngFormatter under test
        in_out: list of tuples. Each tuple = (number, expected_formatting)

        It is tested if 'formatter(number) == expected_formatting'.
        *number* should be >= 0 because formatter(-number) == fmt is also
        tested. *fmt* is derived from *expected_formatting*
        """
        for input, output in in_out:
            self.compare(formatter, input, output)
            # The first character of the expected text is the sign slot.
            self.compare(formatter, -input, "-" + output[1:])

    def test_exponents_with_eng_prefix(self):
        """Every exponent from 1e-24 to 1e26 maps to the right SI prefix."""
        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
        f = np.sqrt(2)
        in_out = [
            (f * 10 ** -24, " 1.414y"), (f * 10 ** -23, " 14.142y"),
            (f * 10 ** -22, " 141.421y"), (f * 10 ** -21, " 1.414z"),
            (f * 10 ** -20, " 14.142z"), (f * 10 ** -19, " 141.421z"),
            (f * 10 ** -18, " 1.414a"), (f * 10 ** -17, " 14.142a"),
            (f * 10 ** -16, " 141.421a"), (f * 10 ** -15, " 1.414f"),
            (f * 10 ** -14, " 14.142f"), (f * 10 ** -13, " 141.421f"),
            (f * 10 ** -12, " 1.414p"), (f * 10 ** -11, " 14.142p"),
            (f * 10 ** -10, " 141.421p"), (f * 10 ** -9, " 1.414n"),
            (f * 10 ** -8, " 14.142n"), (f * 10 ** -7, " 141.421n"),
            (f * 10 ** -6, " 1.414u"), (f * 10 ** -5, " 14.142u"),
            (f * 10 ** -4, " 141.421u"), (f * 10 ** -3, " 1.414m"),
            (f * 10 ** -2, " 14.142m"), (f * 10 ** -1, " 141.421m"),
            (f * 10 ** 0, " 1.414"), (f * 10 ** 1, " 14.142"),
            (f * 10 ** 2, " 141.421"), (f * 10 ** 3, " 1.414k"),
            (f * 10 ** 4, " 14.142k"), (f * 10 ** 5, " 141.421k"),
            (f * 10 ** 6, " 1.414M"), (f * 10 ** 7, " 14.142M"),
            (f * 10 ** 8, " 141.421M"), (f * 10 ** 9, " 1.414G"),
            (f * 10 ** 10, " 14.142G"), (f * 10 ** 11, " 141.421G"),
            (f * 10 ** 12, " 1.414T"), (f * 10 ** 13, " 14.142T"),
            (f * 10 ** 14, " 141.421T"), (f * 10 ** 15, " 1.414P"),
            (f * 10 ** 16, " 14.142P"), (f * 10 ** 17, " 141.421P"),
            (f * 10 ** 18, " 1.414E"), (f * 10 ** 19, " 14.142E"),
            (f * 10 ** 20, " 141.421E"), (f * 10 ** 21, " 1.414Z"),
            (f * 10 ** 22, " 14.142Z"), (f * 10 ** 23, " 141.421Z"),
            (f * 10 ** 24, " 1.414Y"), (f * 10 ** 25, " 14.142Y"),
            (f * 10 ** 26, " 141.421Y")]
        self.compare_all(formatter, in_out)

    def test_exponents_without_eng_prefix(self):
        """Without prefixes the exponent is written as ``E+NN`` / ``E-NN``."""
        formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False)
        f = np.pi
        in_out = [
            (f * 10 ** -24, " 3.1416E-24"),
            (f * 10 ** -23, " 31.4159E-24"),
            (f * 10 ** -22, " 314.1593E-24"),
            (f * 10 ** -21, " 3.1416E-21"),
            (f * 10 ** -20, " 31.4159E-21"),
            (f * 10 ** -19, " 314.1593E-21"),
            (f * 10 ** -18, " 3.1416E-18"),
            (f * 10 ** -17, " 31.4159E-18"),
            (f * 10 ** -16, " 314.1593E-18"),
            (f * 10 ** -15, " 3.1416E-15"),
            (f * 10 ** -14, " 31.4159E-15"),
            (f * 10 ** -13, " 314.1593E-15"),
            (f * 10 ** -12, " 3.1416E-12"),
            (f * 10 ** -11, " 31.4159E-12"),
            (f * 10 ** -10, " 314.1593E-12"),
            (f * 10 ** -9, " 3.1416E-09"),
            (f * 10 ** -8, " 31.4159E-09"),
            (f * 10 ** -7, " 314.1593E-09"),
            (f * 10 ** -6, " 3.1416E-06"),
            (f * 10 ** -5, " 31.4159E-06"),
            (f * 10 ** -4, " 314.1593E-06"),
            (f * 10 ** -3, " 3.1416E-03"),
            (f * 10 ** -2, " 31.4159E-03"),
            (f * 10 ** -1, " 314.1593E-03"),
            (f * 10 ** 0, " 3.1416E+00"),
            (f * 10 ** 1, " 31.4159E+00"),
            (f * 10 ** 2, " 314.1593E+00"),
            (f * 10 ** 3, " 3.1416E+03"),
            (f * 10 ** 4, " 31.4159E+03"),
            (f * 10 ** 5, " 314.1593E+03"),
            (f * 10 ** 6, " 3.1416E+06"),
            (f * 10 ** 7, " 31.4159E+06"),
            (f * 10 ** 8, " 314.1593E+06"),
            (f * 10 ** 9, " 3.1416E+09"),
            (f * 10 ** 10, " 31.4159E+09"),
            (f * 10 ** 11, " 314.1593E+09"),
            (f * 10 ** 12, " 3.1416E+12"),
            (f * 10 ** 13, " 31.4159E+12"),
            (f * 10 ** 14, " 314.1593E+12"),
            (f * 10 ** 15, " 3.1416E+15"),
            (f * 10 ** 16, " 31.4159E+15"),
            (f * 10 ** 17, " 314.1593E+15"),
            (f * 10 ** 18, " 3.1416E+18"),
            (f * 10 ** 19, " 31.4159E+18"),
            (f * 10 ** 20, " 314.1593E+18"),
            (f * 10 ** 21, " 3.1416E+21"),
            (f * 10 ** 22, " 31.4159E+21"),
            (f * 10 ** 23, " 314.1593E+21"),
            (f * 10 ** 24, " 3.1416E+24"),
            (f * 10 ** 25, " 31.4159E+24"),
            (f * 10 ** 26, " 314.1593E+24")]
        self.compare_all(formatter, in_out)

    def test_rounding(self):
        """The mantissa is rounded to ``accuracy`` decimals."""
        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
        in_out = [(5.55555, ' 5.556'), (55.5555, ' 55.556'),
                  (555.555, ' 555.555'), (5555.55, ' 5.556k'),
                  (55555.5, ' 55.556k'), (555555, ' 555.555k')]
        self.compare_all(formatter, in_out)

        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
        in_out = [(5.55555, ' 5.6'), (55.5555, ' 55.6'), (555.555, ' 555.6'),
                  (5555.55, ' 5.6k'), (55555.5, ' 55.6k'), (555555, ' 555.6k')]
        self.compare_all(formatter, in_out)

        formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True)
        in_out = [(5.55555, ' 6'), (55.5555, ' 56'), (555.555, ' 556'),
                  (5555.55, ' 6k'), (55555.5, ' 56k'), (555555, ' 556k')]
        self.compare_all(formatter, in_out)

        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
        result = formatter(0)
        assert result == u(' 0.000')

    def test_nan(self):
        """NaN is rendered as ``NaN`` by the formatter and by ``to_string``."""
        # Issue #11981

        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
        result = formatter(np.nan)
        assert result == u('NaN')

        df = pd.DataFrame({'a': [1.5, 10.3, 20.5],
                           'b': [50.3, 60.67, 70.12],
                           'c': [100.2, 101.33, 120.33]})
        pt = df.pivot_table(values='a', index='b', columns='c')
        # Restore the global float format even if the assertion fails.
        try:
            fmt.set_eng_float_format(accuracy=1)
            result = pt.to_string()
            assert 'NaN' in result
        finally:
            tm.reset_display_options()

    def test_inf(self):
        """Infinity is rendered as ``inf``."""
        # Issue #11981

        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
        result = formatter(np.inf)
        assert result == u('inf')
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-
import pytest
import numpy as np
import pandas as pd
from pandas import compat
import pandas.io.formats.printing as printing
import pandas.io.formats.format as fmt
import pandas.core.config as cf
def test_adjoin():
    """``adjoin`` lays lists out as columns separated by ``space`` blanks."""
    data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']]
    # Each non-final column is padded to its widest entry plus 2 blanks.
    expected = 'a  dd  ggg\nb  ee  hhh\nc  ff  iii'

    adjoined = printing.adjoin(2, *data)

    assert adjoined == expected
def test_repr_binary_type():
    """``pprint_thing`` quotes text like ``repr`` only when asked to."""
    import string

    alphabet = string.ascii_letters
    to_binary = compat.binary_type
    try:
        encoded = to_binary(alphabet,
                            encoding=cf.get_option('display.encoding'))
    except TypeError:
        # Fall back to the single-argument call when ``encoding`` is rejected.
        encoded = to_binary(alphabet)

    text = compat.text_type(compat.bytes_to_str(encoded))

    quoted = printing.pprint_thing(text, quote_strings=True)
    assert quoted == repr(text)

    unquoted = printing.pprint_thing(text, quote_strings=False)
    assert unquoted == text
class TestFormattBase(object):
    """Tests for text adjoining/justification, including East Asian widths.

    Half-width katakana literals are written with ``\\uXXXX`` escapes so that
    they cannot be silently normalised to their full-width counterparts.
    """

    def test_adjoin(self):
        """``adjoin`` separates columns with the requested number of blanks."""
        data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']]
        expected = 'a  dd  ggg\nb  ee  hhh\nc  ff  iii'

        adjoined = printing.adjoin(2, *data)

        assert adjoined == expected

    def test_adjoin_unicode(self):
        """Adjoining wide characters with and without East Asian adjustment."""
        data = [[u'あ', 'b', 'c'], ['dd', u'ええ', 'ff'],
                ['ggg', 'hhh', u'いいい']]
        # The plain adjoin pads by character count, not display width.
        expected = u'あ  dd  ggg\nb  ええ  hhh\nc  ff  いいい'
        adjoined = printing.adjoin(2, *data)
        assert adjoined == expected

        adj = fmt.EastAsianTextAdjustment()

        # Here padding is computed from display width (wide chars count 2).
        expected = u"""あ  dd    ggg
b   ええ  hhh
c   ff    いいい"""

        adjoined = adj.adjoin(2, *data)
        assert adjoined == expected
        cols = adjoined.split('\n')
        assert adj.len(cols[0]) == 13
        assert adj.len(cols[1]) == 13
        assert adj.len(cols[2]) == 16

        # Same layout with a gap of 7; spelled out to keep the counts visible.
        expected = u'\n'.join([
            u'あ' + u' ' * 7 + u'dd' + u' ' * 9 + u'ggg',
            u'b' + u' ' * 8 + u'ええ' + u' ' * 7 + u'hhh',
            u'c' + u' ' * 8 + u'ff' + u' ' * 9 + u'いいい',
        ])

        adjoined = adj.adjoin(7, *data)
        assert adjoined == expected
        cols = adjoined.split('\n')
        assert adj.len(cols[0]) == 23
        assert adj.len(cols[1]) == 23
        assert adj.len(cols[2]) == 26

    def test_justify(self):
        """``justify`` pads according to display width in every mode."""
        adj = fmt.EastAsianTextAdjustment()

        def just(x, *args, **kwargs):
            # wrapper to test single str
            return adj.justify([x], *args, **kwargs)[0]

        assert just('abc', 5, mode='left') == 'abc  '
        assert just('abc', 5, mode='center') == ' abc '
        assert just('abc', 5, mode='right') == '  abc'
        assert just(u'abc', 5, mode='left') == 'abc  '
        assert just(u'abc', 5, mode='center') == ' abc '
        assert just(u'abc', 5, mode='right') == '  abc'

        # Display width 6 already exceeds 5: nothing is added.
        assert just(u'パンダ', 5, mode='left') == u'パンダ'
        assert just(u'パンダ', 5, mode='center') == u'パンダ'
        assert just(u'パンダ', 5, mode='right') == u'パンダ'

        # Display width 6 padded up to 10 columns.
        assert just(u'パンダ', 10, mode='left') == u'パンダ    '
        assert just(u'パンダ', 10, mode='center') == u'  パンダ  '
        assert just(u'パンダ', 10, mode='right') == u'    パンダ'

    def test_east_asian_len(self):
        """``len`` counts wide characters as 2 and half-width ones as 1."""
        adj = fmt.EastAsianTextAdjustment()

        assert adj.len('abc') == 3
        assert adj.len(u'abc') == 3

        # Full-width katakana: 3 characters, 2 columns each.
        assert adj.len(u'パンダ') == 6
        # Half-width katakana: 5 code points, 1 column each.
        halfwidth_panda = u'\uff8a\uff9f\uff9d\uff80\uff9e'
        assert adj.len(halfwidth_panda) == 5
        assert adj.len(u'パンダpanda') == 11
        assert adj.len(halfwidth_panda + u'panda') == 10

    def test_ambiguous_width(self):
        """Ambiguous-width characters count as wide only when configured."""
        adj = fmt.EastAsianTextAdjustment()
        assert adj.len(u'¡¡ab') == 4

        with cf.option_context('display.unicode.ambiguous_as_wide', True):
            adj = fmt.EastAsianTextAdjustment()
            assert adj.len(u'¡¡ab') == 6

        data = [[u'あ', 'b', 'c'], ['dd', u'ええ', 'ff'],
                ['ggg', u'¡¡ab', u'いいい']]
        # Note the trailing blank after 'ggg' on the first line.
        expected = u'あ  dd    ggg \nb   ええ  ¡¡ab\nc   ff    いいい'
        adjoined = adj.adjoin(2, *data)
        assert adjoined == expected
class TestTableSchemaRepr(object):
    """Checks of ``display.html.table_schema`` with IPython's formatter."""

    @classmethod
    def setup_class(cls):
        # The whole class is skipped when IPython cannot be imported.
        pytest.importorskip('IPython')

        from IPython.core.interactiveshell import InteractiveShell
        shell = InteractiveShell.instance()
        cls.display_formatter = shell.display_formatter

    def test_publishes(self):
        """Series and DataFrame publish the expected mimetypes."""
        df = pd.DataFrame({"A": [1, 2]})
        objects = [df['A'], df, df]  # dataframe / series
        expected_keys = [
            {'text/plain', 'application/vnd.dataresource+json'},
            {'text/plain', 'text/html', 'application/vnd.dataresource+json'},
        ]

        opt = pd.option_context('display.html.table_schema', True)
        for obj, expected in zip(objects, expected_keys):
            with opt:
                formatted = self.display_formatter.format(obj)
            published = set(formatted[0].keys())
            assert published == expected

        with_latex = pd.option_context('display.latex.repr', True)

        # ``obj`` is deliberately the last object handled by the loop above.
        with opt, with_latex:
            formatted = self.display_formatter.format(obj)

        expected = {'text/plain', 'text/html', 'text/latex',
                    'application/vnd.dataresource+json'}
        published = set(formatted[0].keys())
        assert published == expected

    def test_publishes_not_implemented(self):
        """No table-schema mimetype is published for MultiIndex columns."""
        # column MultiIndex
        # GH 15996
        midx = pd.MultiIndex.from_product([['A', 'B'], ['a', 'b', 'c']])
        values = np.random.randn(5, len(midx))
        df = pd.DataFrame(values, columns=midx)

        with pd.option_context('display.html.table_schema', True):
            formatted = self.display_formatter.format(df)

        expected = {'text/plain', 'text/html'}
        assert set(formatted[0].keys()) == expected

    def test_config_on(self):
        """With the option on, ``_repr_data_resource_`` returns a value."""
        df = pd.DataFrame({"A": [1, 2]})
        with pd.option_context("display.html.table_schema", True):
            result = df._repr_data_resource_()

        assert result is not None

    def test_config_default_off(self):
        """With the option off, ``_repr_data_resource_`` returns ``None``."""
        df = pd.DataFrame({"A": [1, 2]})
        with pd.option_context("display.html.table_schema", False):
            result = df._repr_data_resource_()

        assert result is None

    def test_enable_data_resource_formatter(self):
        """The formatter is enabled and disabled along with the option."""
        # GH 10491
        formatters = self.display_formatter.formatters
        mimetype = 'application/vnd.dataresource+json'

        with pd.option_context('display.html.table_schema', True):
            assert mimetype in formatters
            assert formatters[mimetype].enabled

        # still there, just disabled
        assert mimetype in formatters
        assert not formatters[mimetype].enabled

        # able to re-set
        with pd.option_context('display.html.table_schema', True):
            assert mimetype in formatters
            assert formatters[mimetype].enabled
            # smoke test that it works
            self.display_formatter.format(cf)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,302 @@
# -*- coding: utf-8 -*-
import sys
import numpy as np
import pandas as pd
import pytest
from pandas import DataFrame
from pandas.util import testing as tm
class TestToCSV(object):
    """Tests for ``DataFrame.to_csv`` formatting options."""

    @pytest.mark.xfail((3, 6, 5) > sys.version_info >= (3, 5),
                       reason=("Python csv library bug "
                               "(see https://bugs.python.org/issue32255)"))
    def test_to_csv_with_single_column(self):
        """A missing value in a single column is written as ``""``."""
        # see gh-18676, https://bugs.python.org/issue32255
        #
        # Python's CSV library adds an extraneous '""'
        # before the newline when the NaN-value is in
        # the first row. Otherwise, only the newline
        # character is added. This behavior is inconsistent
        # and was patched in https://bugs.python.org/pull_request4672.
        df1 = DataFrame([None, 1])
        expected1 = """\
""
1.0
"""
        with tm.ensure_clean('test.csv') as path:
            df1.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected1

        df2 = DataFrame([1, None])
        expected2 = """\
1.0
""
"""
        with tm.ensure_clean('test.csv') as path:
            df2.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected2

    def test_to_csv_defualt_encoding(self):
        """Non-ASCII text round-trips with the default encoding on Python 3."""
        # GH17097
        df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]})

        with tm.ensure_clean('test.csv') as path:
            # the default to_csv encoding in Python 2 is ascii, and that in
            # Python 3 is utf-8.
            if pd.compat.PY2:
                # the encoding argument parameter should be utf-8
                with tm.assert_raises_regex(UnicodeEncodeError, 'ascii'):
                    df.to_csv(path)
            else:
                df.to_csv(path)
                tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)

    def test_to_csv_quotechar(self):
        """``quotechar`` replaces the quote and must not be ``None``."""
        df = DataFrame({'col': [1, 2]})
        expected = """\
"","col"
"0","1"
"1","2"
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1)  # 1=QUOTE_ALL
            with open(path, 'r') as f:
                assert f.read() == expected

        expected = """\
$$,$col$
$0$,$1$
$1$,$2$
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1, quotechar="$")
            with open(path, 'r') as f:
                assert f.read() == expected

        with tm.ensure_clean('test.csv') as path:
            with tm.assert_raises_regex(TypeError, 'quotechar'):
                df.to_csv(path, quoting=1, quotechar=None)

    def test_to_csv_doublequote(self):
        """Quotes are doubled, or an error is raised without an escapechar."""
        # Public home of the exception type (same object as ``_csv.Error``).
        import csv

        df = DataFrame({'col': ['a"a', '"bb"']})
        expected = '''\
"","col"
"0","a""a"
"1","""bb"""
'''

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1, doublequote=True)  # QUOTE_ALL
            with open(path, 'r') as f:
                assert f.read() == expected

        with tm.ensure_clean('test.csv') as path:
            with tm.assert_raises_regex(csv.Error, 'escapechar'):
                df.to_csv(path, doublequote=False)  # no escapechar set

    def test_to_csv_escapechar(self):
        """``escapechar`` escapes quotes and, with QUOTE_NONE, separators."""
        df = DataFrame({'col': ['a"a', '"bb"']})
        expected = '''\
"","col"
"0","a\\"a"
"1","\\"bb\\""
'''

        with tm.ensure_clean('test.csv') as path:  # QUOTE_ALL
            df.to_csv(path, quoting=1, doublequote=False, escapechar='\\')
            with open(path, 'r') as f:
                assert f.read() == expected

        df = DataFrame({'col': ['a,a', ',bb,']})
        expected = """\
,col
0,a\\,a
1,\\,bb\\,
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=3, escapechar='\\')  # QUOTE_NONE
            with open(path, 'r') as f:
                assert f.read() == expected

    def test_csv_to_string(self):
        """Without a path, ``to_csv`` returns the CSV text."""
        df = DataFrame({'col': [1, 2]})
        expected = ',col\n0,1\n1,2\n'
        assert df.to_csv() == expected

    def test_to_csv_decimal(self):
        """``decimal`` is honoured for values and for (multi-)index levels."""
        # GH 781
        df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})

        expected_default = ',col1,col2,col3\n0,1,a,10.1\n'
        assert df.to_csv() == expected_default

        expected_european_excel = ';col1;col2;col3\n0;1;a;10,1\n'
        assert df.to_csv(decimal=',', sep=';') == expected_european_excel

        expected_float_format_default = ',col1,col2,col3\n0,1,a,10.10\n'
        assert df.to_csv(float_format='%.2f') == expected_float_format_default

        expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
        assert df.to_csv(decimal=',', sep=';',
                         float_format='%.2f') == expected_float_format

        # GH 11553: testing if decimal is taken into account for '0.0'
        df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
        expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
        assert df.to_csv(index=False, decimal='^') == expected

        # same but for an index
        assert df.set_index('a').to_csv(decimal='^') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected

    def test_to_csv_float_format(self):
        """``float_format`` is honoured when float columns become the index."""
        # testing if float_format is taken into account for the index
        # GH 11553
        df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1})
        expected = 'a,b,c\n0,2.20,1\n1,3.30,1\n'
        assert df.set_index('a').to_csv(float_format='%.2f') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(
            float_format='%.2f') == expected

    def test_to_csv_na_rep(self):
        """``na_rep`` is used for missing values that sit in the index."""
        # testing if NaN values are correctly represented in the index
        # GH 11553
        # ``np.nan`` is used because the ``np.NaN`` alias is gone in NumPy 2.
        df = DataFrame({'a': [0, np.nan], 'b': [0, 1], 'c': [2, 3]})
        expected = "a,b,c\n0.0,0,2\n_,1,3\n"
        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

        # now with an index containing only NaNs
        df = DataFrame({'a': np.nan, 'b': [0, 1], 'c': [2, 3]})
        expected = "a,b,c\n_,0,2\n_,1,3\n"
        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

        # check if na_rep parameter does not break anything when no NaN
        df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]})
        expected = "a,b,c\n0,0,2\n0,1,3\n"
        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

    def test_to_csv_date_format(self):
        """``date_format`` applies to datetime columns and index levels."""
        # GH 10209
        df_sec = DataFrame({'A': pd.date_range('20130101', periods=5, freq='s')
                            })
        df_day = DataFrame({'A': pd.date_range('20130101', periods=5, freq='d')
                            })

        expected_default_sec = (',A\n0,2013-01-01 00:00:00\n1,'
                                '2013-01-01 00:00:01\n2,2013-01-01 00:00:02'
                                '\n3,2013-01-01 00:00:03\n4,'
                                '2013-01-01 00:00:04\n')
        assert df_sec.to_csv() == expected_default_sec

        expected_ymdhms_day = (',A\n0,2013-01-01 00:00:00\n1,'
                               '2013-01-02 00:00:00\n2,2013-01-03 00:00:00'
                               '\n3,2013-01-04 00:00:00\n4,'
                               '2013-01-05 00:00:00\n')
        assert (df_day.to_csv(date_format='%Y-%m-%d %H:%M:%S') ==
                expected_ymdhms_day)

        expected_ymd_sec = (',A\n0,2013-01-01\n1,2013-01-01\n2,'
                            '2013-01-01\n3,2013-01-01\n4,2013-01-01\n')
        assert df_sec.to_csv(date_format='%Y-%m-%d') == expected_ymd_sec

        expected_default_day = (',A\n0,2013-01-01\n1,2013-01-02\n2,'
                                '2013-01-03\n3,2013-01-04\n4,2013-01-05\n')
        assert df_day.to_csv() == expected_default_day
        assert df_day.to_csv(date_format='%Y-%m-%d') == expected_default_day

        # testing if date_format parameter is taken into account for
        # multi-indexed dataframes (GH 7791)
        df_sec['B'] = 0
        df_sec['C'] = 1

        expected_ymd_sec = 'A,B,C\n2013-01-01,0,1\n'

        df_sec_grouped = df_sec.groupby([pd.Grouper(key='A', freq='1h'), 'B'])
        assert (df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d') ==
                expected_ymd_sec)

    def test_to_csv_multi_index(self):
        """MultiIndex columns and rows are written one level per line/field."""
        # GH 6618
        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))

        exp = ",1\n,2\n0,1\n"
        assert df.to_csv() == exp

        exp = "1\n2\n1\n"
        assert df.to_csv(index=False) == exp

        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]),
                       index=pd.MultiIndex.from_arrays([[1], [2]]))

        exp = ",,1\n,,2\n1,2,1\n"
        assert df.to_csv() == exp

        exp = "1\n2\n1\n"
        assert df.to_csv(index=False) == exp

        df = DataFrame(
            [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']]))

        exp = ",foo\n,bar\n0,1\n"
        assert df.to_csv() == exp

        exp = "foo\nbar\n1\n"
        assert df.to_csv(index=False) == exp

    def test_to_csv_string_array_ascii(self):
        """Lists of strings are written via their ``repr`` with ASCII output."""
        # GH 10813
        str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
        df = pd.DataFrame(str_array)
        expected_ascii = '''\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
'''
        with tm.ensure_clean('str_test.csv') as path:
            df.to_csv(path, encoding='ascii')
            with open(path, 'r') as f:
                assert f.read() == expected_ascii

    @pytest.mark.xfail
    def test_to_csv_string_array_utf8(self):
        """Expected-to-fail variant of the string-array test with UTF-8."""
        # GH 10813
        str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
        df = pd.DataFrame(str_array)
        expected_utf8 = '''\
,names
0,"[u'foo', u'bar']"
1,"[u'baz', u'qux']"
'''
        with tm.ensure_clean('unicode_test.csv') as path:
            df.to_csv(path, encoding='utf-8')
            with open(path, 'r') as f:
                assert f.read() == expected_utf8

    @tm.capture_stdout
    def test_to_csv_stdout_file(self):
        """Writing to ``sys.stdout`` must leave the stream open."""
        # GH 21561
        df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']],
                          columns=['name_1', 'name_2'])
        expected_ascii = '''\
,name_1,name_2
0,foo,bar
1,baz,qux
'''
        df.to_csv(sys.stdout, encoding='ascii')
        output = sys.stdout.getvalue()
        assert output == expected_ascii
        assert not sys.stdout.closed
@@ -0,0 +1,274 @@
"""Tests formatting as writer-agnostic ExcelCells
ExcelFormatter is tested implicitly in pandas/tests/io/test_excel.py
"""
import pytest
import pandas.util.testing as tm
from warnings import catch_warnings
from pandas.io.formats.excel import CSSToExcelConverter
@pytest.mark.parametrize('css,expected', [
# FONT
# - name
('font-family: foo,bar', {'font': {'name': 'foo'}}),
('font-family: "foo bar",baz', {'font': {'name': 'foo bar'}}),
('font-family: foo,\nbar', {'font': {'name': 'foo'}}),
('font-family: foo, bar, baz', {'font': {'name': 'foo'}}),
('font-family: bar, foo', {'font': {'name': 'bar'}}),
('font-family: \'foo bar\', baz', {'font': {'name': 'foo bar'}}),
('font-family: \'foo \\\'bar\', baz', {'font': {'name': 'foo \'bar'}}),
('font-family: "foo \\"bar", baz', {'font': {'name': 'foo "bar'}}),
('font-family: "foo ,bar", baz', {'font': {'name': 'foo ,bar'}}),
# - family
('font-family: serif', {'font': {'name': 'serif', 'family': 1}}),
('font-family: Serif', {'font': {'name': 'serif', 'family': 1}}),
('font-family: roman, serif', {'font': {'name': 'roman', 'family': 1}}),
('font-family: roman, sans-serif', {'font': {'name': 'roman',
'family': 2}}),
('font-family: roman, sans serif', {'font': {'name': 'roman'}}),
('font-family: roman, sansserif', {'font': {'name': 'roman'}}),
('font-family: roman, cursive', {'font': {'name': 'roman', 'family': 4}}),
('font-family: roman, fantasy', {'font': {'name': 'roman', 'family': 5}}),
# - size
('font-size: 1em', {'font': {'size': 12}}),
('font-size: xx-small', {'font': {'size': 6}}),
('font-size: x-small', {'font': {'size': 7.5}}),
('font-size: small', {'font': {'size': 9.6}}),
('font-size: medium', {'font': {'size': 12}}),
('font-size: large', {'font': {'size': 13.5}}),
('font-size: x-large', {'font': {'size': 18}}),
('font-size: xx-large', {'font': {'size': 24}}),
('font-size: 50%', {'font': {'size': 6}}),
# - bold
('font-weight: 100', {'font': {'bold': False}}),
('font-weight: 200', {'font': {'bold': False}}),
('font-weight: 300', {'font': {'bold': False}}),
('font-weight: 400', {'font': {'bold': False}}),
('font-weight: normal', {'font': {'bold': False}}),
('font-weight: lighter', {'font': {'bold': False}}),
('font-weight: bold', {'font': {'bold': True}}),
('font-weight: bolder', {'font': {'bold': True}}),
('font-weight: 700', {'font': {'bold': True}}),
('font-weight: 800', {'font': {'bold': True}}),
('font-weight: 900', {'font': {'bold': True}}),
# - italic
('font-style: italic', {'font': {'italic': True}}),
('font-style: oblique', {'font': {'italic': True}}),
# - underline
('text-decoration: underline',
{'font': {'underline': 'single'}}),
('text-decoration: overline',
{}),
('text-decoration: none',
{}),
# - strike
('text-decoration: line-through',
{'font': {'strike': True}}),
('text-decoration: underline line-through',
{'font': {'strike': True, 'underline': 'single'}}),
('text-decoration: underline; text-decoration: line-through',
{'font': {'strike': True}}),
# - color
('color: red', {'font': {'color': 'FF0000'}}),
('color: #ff0000', {'font': {'color': 'FF0000'}}),
('color: #f0a', {'font': {'color': 'FF00AA'}}),
# - shadow
('text-shadow: none', {'font': {'shadow': False}}),
('text-shadow: 0px -0em 0px #CCC', {'font': {'shadow': False}}),
('text-shadow: 0px -0em 0px #999', {'font': {'shadow': False}}),
('text-shadow: 0px -0em 0px', {'font': {'shadow': False}}),
('text-shadow: 2px -0em 0px #CCC', {'font': {'shadow': True}}),
('text-shadow: 0px -2em 0px #CCC', {'font': {'shadow': True}}),
('text-shadow: 0px -0em 2px #CCC', {'font': {'shadow': True}}),
('text-shadow: 0px -0em 2px', {'font': {'shadow': True}}),
('text-shadow: 0px -2em', {'font': {'shadow': True}}),
# FILL
# - color, fillType
('background-color: red', {'fill': {'fgColor': 'FF0000',
'patternType': 'solid'}}),
('background-color: #ff0000', {'fill': {'fgColor': 'FF0000',
'patternType': 'solid'}}),
('background-color: #f0a', {'fill': {'fgColor': 'FF00AA',
'patternType': 'solid'}}),
# BORDER
# - style
('border-style: solid',
{'border': {'top': {'style': 'medium'},
'bottom': {'style': 'medium'},
'left': {'style': 'medium'},
'right': {'style': 'medium'}}}),
('border-style: solid; border-width: thin',
{'border': {'top': {'style': 'thin'},
'bottom': {'style': 'thin'},
'left': {'style': 'thin'},
'right': {'style': 'thin'}}}),
('border-top-style: solid; border-top-width: thin',
{'border': {'top': {'style': 'thin'}}}),
('border-top-style: solid; border-top-width: 1pt',
{'border': {'top': {'style': 'thin'}}}),
('border-top-style: solid',
{'border': {'top': {'style': 'medium'}}}),
('border-top-style: solid; border-top-width: medium',
{'border': {'top': {'style': 'medium'}}}),
('border-top-style: solid; border-top-width: 2pt',
{'border': {'top': {'style': 'medium'}}}),
('border-top-style: solid; border-top-width: thick',
{'border': {'top': {'style': 'thick'}}}),
('border-top-style: solid; border-top-width: 4pt',
{'border': {'top': {'style': 'thick'}}}),
('border-top-style: dotted',
{'border': {'top': {'style': 'mediumDashDotDot'}}}),
('border-top-style: dotted; border-top-width: thin',
{'border': {'top': {'style': 'dotted'}}}),
('border-top-style: dashed',
{'border': {'top': {'style': 'mediumDashed'}}}),
('border-top-style: dashed; border-top-width: thin',
{'border': {'top': {'style': 'dashed'}}}),
('border-top-style: double',
{'border': {'top': {'style': 'double'}}}),
# - color
('border-style: solid; border-color: #0000ff',
{'border': {'top': {'style': 'medium', 'color': '0000FF'},
'right': {'style': 'medium', 'color': '0000FF'},
'bottom': {'style': 'medium', 'color': '0000FF'},
'left': {'style': 'medium', 'color': '0000FF'}}}),
('border-top-style: double; border-top-color: blue',
{'border': {'top': {'style': 'double', 'color': '0000FF'}}}),
('border-top-style: solid; border-top-color: #06c',
{'border': {'top': {'style': 'medium', 'color': '0066CC'}}}),
# ALIGNMENT
# - horizontal
('text-align: center',
{'alignment': {'horizontal': 'center'}}),
('text-align: left',
{'alignment': {'horizontal': 'left'}}),
('text-align: right',
{'alignment': {'horizontal': 'right'}}),
('text-align: justify',
{'alignment': {'horizontal': 'justify'}}),
# - vertical
('vertical-align: top',
{'alignment': {'vertical': 'top'}}),
('vertical-align: text-top',
{'alignment': {'vertical': 'top'}}),
('vertical-align: middle',
{'alignment': {'vertical': 'center'}}),
('vertical-align: bottom',
{'alignment': {'vertical': 'bottom'}}),
('vertical-align: text-bottom',
{'alignment': {'vertical': 'bottom'}}),
# - wrap_text
('white-space: nowrap',
{'alignment': {'wrap_text': False}}),
('white-space: pre',
{'alignment': {'wrap_text': False}}),
('white-space: pre-line',
{'alignment': {'wrap_text': False}}),
('white-space: normal',
{'alignment': {'wrap_text': True}}),
])
def test_css_to_excel(css, expected):
    """Check that one CSS declaration string converts to the expected style.

    ``css`` and ``expected`` come from the parametrization attached to
    this test.
    """
    converter = CSSToExcelConverter()
    actual = converter(css)
    assert expected == actual
def test_css_to_excel_multiple():
    """Convert several declarations at once; unknown properties are dropped."""
    thin_sides = {side: {"style": "thin"}
                  for side in ("top", "right", "bottom", "left")}
    expected = {
        "font": {"bold": True, "underline": "single", "color": "FF0000"},
        "border": thin_sides,
        "alignment": {"horizontal": "center", "vertical": "top"},
    }

    converter = CSSToExcelConverter()
    actual = converter('''
font-weight: bold;
text-decoration: underline;
color: red;
border-width: thin;
text-align: center;
vertical-align: top;
unused: something;
''')
    assert expected == actual
@pytest.mark.parametrize('css,inherited,expected', [
    ('font-weight: bold', '',
     {'font': {'bold': True}}),
    ('', 'font-weight: bold',
     {'font': {'bold': True}}),
    ('font-weight: bold', 'font-style: italic',
     {'font': {'bold': True, 'italic': True}}),
    ('font-style: normal', 'font-style: italic',
     {'font': {'italic': False}}),
    ('font-style: inherit', '', {}),
    ('font-style: normal; font-style: inherit', 'font-style: italic',
     {'font': {'italic': True}}),
])
def test_css_to_excel_inherited(css, inherited, expected):
    """Check conversion when the converter is constructed with inherited CSS."""
    converter = CSSToExcelConverter(inherited)
    actual = converter(css)
    assert expected == actual
@pytest.mark.parametrize("input_color,output_color", (
    list(CSSToExcelConverter.NAMED_COLORS.items()) +
    [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] +
    [("#F0F", "FF00FF"), ("#ABC", "AABBCC")])
)
def test_css_to_excel_good_colors(input_color, output_color):
    """Valid colors convert for fill, font and all borders without warnings."""
    # see gh-18392
    css = ("border-top-color: {color}; "
           "border-right-color: {color}; "
           "border-bottom-color: {color}; "
           "border-left-color: {color}; "
           "background-color: {color}; "
           "color: {color}").format(color=input_color)

    expected = {
        "fill": {"patternType": "solid", "fgColor": output_color},
        "font": {"color": output_color},
        "border": {side: {"color": output_color}
                   for side in ("top", "right", "bottom", "left")},
    }

    # No warning of any kind may be emitted for a recognised color.
    with tm.assert_produces_warning(None):
        converter = CSSToExcelConverter()
        assert expected == converter(css)
@pytest.mark.parametrize("input_color", [None, "not-a-color"])
def test_css_to_excel_bad_colors(input_color):
    """Unrecognised colors are dropped from the converted style."""
    # see gh-18392
    css = ("border-top-color: {color}; "
           "border-right-color: {color}; "
           "border-bottom-color: {color}; "
           "border-left-color: {color}; "
           "background-color: {color}; "
           "color: {color}").format(color=input_color)

    # Only the non-None bad color still yields a (colorless) solid fill.
    if input_color is None:
        expected = {}
    else:
        expected = {"fill": {"patternType": "solid"}}

    # Warnings raised while converting are captured rather than propagated.
    with catch_warnings(record=True):
        converter = CSSToExcelConverter()
        assert expected == converter(css)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,710 @@
from datetime import datetime
import pytest
import pandas as pd
from pandas import DataFrame, compat, Series
from pandas.util import testing as tm
from pandas.compat import u
import codecs
@pytest.fixture
def frame():
    """Provide a DataFrame built from ``tm.getSeriesData()``."""
    series_data = tm.getSeriesData()
    return DataFrame(series_data)
class TestToLatex(object):
def test_to_latex_filename(self, frame):
    """Writing LaTeX to a path must match the string returned in memory."""
    with tm.ensure_clean('test.tex') as tex_path:
        frame.to_latex(tex_path)
        with open(tex_path, 'r') as handle:
            assert frame.to_latex() == handle.read()

    # test with utf-8 and encoding option (GH 7061)
    unicode_df = DataFrame([[u'au\xdfgangen']])
    with tm.ensure_clean('test.tex') as tex_path:
        unicode_df.to_latex(tex_path, encoding='utf-8')
        with codecs.open(tex_path, 'r', encoding='utf-8') as handle:
            assert unicode_df.to_latex() == handle.read()

    # test with utf-8 without encoding option
    if not compat.PY3:
        # python2 default encoding is ascii, so an error should be raised
        with tm.ensure_clean('test.tex') as tex_path:
            with pytest.raises(UnicodeEncodeError):
                unicode_df.to_latex(tex_path)
        return

    # python3: pandas default encoding is utf-8
    with tm.ensure_clean('test.tex') as tex_path:
        unicode_df.to_latex(tex_path)
        with codecs.open(tex_path, 'r', encoding='utf-8') as handle:
            assert unicode_df.to_latex() == handle.read()
def test_to_latex(self, frame):
    """Render a two-column frame with and without its index."""
    # it works!
    frame.to_latex()

    small = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})

    rendered_with_index = small.to_latex()
    expected_with_index = r"""\begin{tabular}{lrl}
\toprule
{} & a & b \\
\midrule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""
    assert rendered_with_index == expected_with_index

    rendered_without_index = small.to_latex(index=False)
    expected_without_index = r"""\begin{tabular}{rl}
\toprule
a & b \\
\midrule
1 & b1 \\
2 & b2 \\
\bottomrule
\end{tabular}
"""
    assert rendered_without_index == expected_without_index
def test_to_latex_format(self, frame):
    """An explicit ``column_format`` is used verbatim in the tabular header."""
    # GH Bug #9402
    frame.to_latex(column_format='ccc')

    small = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
    rendered = small.to_latex(column_format='ccc')
    expected = r"""\begin{tabular}{ccc}
\toprule
{} & a & b \\
\midrule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_empty(self):
    """An empty frame renders its repr inside tabular and longtable."""
    empty = DataFrame()

    tabular = empty.to_latex()
    expected_tabular = r"""\begin{tabular}{l}
\toprule
Empty DataFrame
Columns: Index([], dtype='object')
Index: Index([], dtype='object') \\
\bottomrule
\end{tabular}
"""
    assert tabular == expected_tabular

    longtable = empty.to_latex(longtable=True)
    expected_longtable = r"""\begin{longtable}{l}
\toprule
Empty DataFrame
Columns: Index([], dtype='object')
Index: Index([], dtype='object') \\
\end{longtable}
"""
    assert longtable == expected_longtable
def test_to_latex_with_formatters(self):
    """Per-column formatters (and ``__index__``) shape every rendered cell."""
    source = DataFrame({'datetime64': [datetime(2016, 1, 1),
                                       datetime(2016, 2, 5),
                                       datetime(2016, 3, 3)],
                        'float': [1.0, 2.0, 3.0],
                        'int': [1, 2, 3],
                        'object': [(1, 2), True, False],
                        })

    column_formatters = {
        'datetime64': lambda x: x.strftime('%Y-%m'),
        'float': lambda x: '[{x: 4.1f}]'.format(x=x),
        'int': lambda x: '0x{x:x}'.format(x=x),
        'object': lambda x: '-{x!s}-'.format(x=x),
        '__index__': lambda x: 'index: {x}'.format(x=x),
    }
    # A copy of the mapping is handed to to_latex, as in the original test.
    rendered = source.to_latex(formatters=dict(column_formatters))

    expected = r"""\begin{tabular}{llrrl}
\toprule
{} & datetime64 & float & int & object \\
\midrule
index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\
index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\
index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_multiindex(self):
    """Render frames with MultiIndex rows and/or columns."""
    # Single tuple-labelled column.
    tuple_col = DataFrame({('x', 'y'): ['a']})
    rendered = tuple_col.to_latex()
    expected = r"""\begin{tabular}{ll}
\toprule
{} & x \\
{} & y \\
\midrule
0 & a \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected

    # Same data transposed: the tuple becomes a two-level row index.
    rendered = tuple_col.T.to_latex()
    expected = r"""\begin{tabular}{lll}
\toprule
& & 0 \\
\midrule
x & y & a \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected

    wide = DataFrame.from_dict({
        ('c1', 0): pd.Series({i: i for i in range(4)}),
        ('c1', 1): pd.Series({i: i + 4 for i in range(4)}),
        ('c2', 0): pd.Series({i: i for i in range(4)}),
        ('c2', 1): pd.Series({i: i + 4 for i in range(4)}),
        ('c3', 0): pd.Series({i: i for i in range(4)}),
    }).T
    rendered = wide.to_latex()
    expected = r"""\begin{tabular}{llrrrr}
\toprule
& & 0 & 1 & 2 & 3 \\
\midrule
c1 & 0 & 0 & 1 & 2 & 3 \\
& 1 & 4 & 5 & 6 & 7 \\
c2 & 0 & 0 & 1 & 2 & 3 \\
& 1 & 4 & 5 & 6 & 7 \\
c3 & 0 & 0 & 1 & 2 & 3 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected

    # GH 14184
    wide = wide.T
    wide.columns.names = ['a', 'b']
    rendered = wide.to_latex()
    expected = r"""\begin{tabular}{lrrrrr}
\toprule
a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
b & 0 & 1 & 0 & 1 & 0 \\
\midrule
0 & 0 & 4 & 0 & 4 & 0 \\
1 & 1 & 5 & 1 & 5 & 1 \\
2 & 2 & 6 & 2 & 6 & 2 \\
3 & 3 & 7 & 3 & 7 & 3 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected

    # GH 10660
    keyed = pd.DataFrame({'a': [0, 0, 1, 1],
                          'b': list('abab'),
                          'c': [1, 2, 3, 4]})
    rendered = keyed.set_index(['a', 'b']).to_latex()
    expected = r"""\begin{tabular}{llr}
\toprule
& & c \\
a & b & \\
\midrule
0 & a & 1 \\
& b & 2 \\
1 & a & 3 \\
& b & 4 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected

    rendered = keyed.groupby('a').describe().to_latex()
    expected = r"""\begin{tabular}{lrrrrrrrr}
\toprule
{} & \multicolumn{8}{l}{c} \\
{} & count & mean & std & min & 25\% & 50\% & 75\% & max \\
a & & & & & & & & \\
\midrule
0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\
1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_multiindex_dupe_level(self):
    """A repeated inner label is kept when the outer labels differ."""
    # see gh-14484
    #
    # If an index is repeated in subsequent rows, it should be
    # replaced with a blank in the created table. This should
    # ONLY happen if all higher order indices (to the left) are
    # equal too. In this test, 'c' has to be printed both times
    # because the higher order index 'A' != 'B'.
    row_index = pd.MultiIndex.from_tuples([('A', 'c'), ('B', 'c')])
    table = pd.DataFrame(index=row_index, columns=['col'])
    rendered = table.to_latex()
    expected = r"""\begin{tabular}{lll}
\toprule
& & col \\
\midrule
A & c & NaN \\
B & c & NaN \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_multicolumnrow(self):
    """Exercise the ``multicolumn``/``multirow`` rendering options."""
    grid = pd.DataFrame({
        ('c1', 0): {i: i for i in range(5)},
        ('c1', 1): {i: i + 5 for i in range(5)},
        ('c2', 0): {i: i for i in range(5)},
        ('c2', 1): {i: i + 5 for i in range(5)},
        ('c3', 0): {i: i for i in range(5)}
    })

    # Default options.
    rendered = grid.to_latex()
    expected = r"""\begin{tabular}{lrrrrr}
\toprule
{} & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
{} & 0 & 1 & 0 & 1 & 0 \\
\midrule
0 & 0 & 5 & 0 & 5 & 0 \\
1 & 1 & 6 & 1 & 6 & 1 \\
2 & 2 & 7 & 2 & 7 & 2 \\
3 & 3 & 8 & 3 & 8 & 3 \\
4 & 4 & 9 & 4 & 9 & 4 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected

    # multicolumn switched off.
    rendered = grid.to_latex(multicolumn=False)
    expected = r"""\begin{tabular}{lrrrrr}
\toprule
{} & c1 & & c2 & & c3 \\
{} & 0 & 1 & 0 & 1 & 0 \\
\midrule
0 & 0 & 5 & 0 & 5 & 0 \\
1 & 1 & 6 & 1 & 6 & 1 \\
2 & 2 & 7 & 2 & 7 & 2 \\
3 & 3 & 8 & 3 & 8 & 3 \\
4 & 4 & 9 & 4 & 9 & 4 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected

    # Transposed frame with multirow switched on.
    rendered = grid.T.to_latex(multirow=True)
    expected = r"""\begin{tabular}{llrrrrr}
\toprule
& & 0 & 1 & 2 & 3 & 4 \\
\midrule
\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\
& 1 & 5 & 6 & 7 & 8 & 9 \\
\cline{1-7}
\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\
& 1 & 5 & 6 & 7 & 8 & 9 \\
\cline{1-7}
c3 & 0 & 0 & 1 & 2 & 3 & 4 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected

    # Give the rows the same MultiIndex as the columns, then combine
    # multirow with a centred multicolumn format.
    grid.index = grid.T.index
    rendered = grid.T.to_latex(multirow=True, multicolumn=True,
                               multicolumn_format='c')
    expected = r"""\begin{tabular}{llrrrrr}
\toprule
& & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\
& & 0 & 1 & 0 & 1 & 0 \\
\midrule
\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\
& 1 & 5 & 6 & 7 & 8 & 9 \\
\cline{1-7}
\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\
& 1 & 5 & 6 & 7 & 8 & 9 \\
\cline{1-7}
c3 & 0 & 0 & 1 & 2 & 3 & 4 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_escape(self):
    """Column labels are escaped by default and left alone with escape=False."""
    cells = {u('co$e^x$'): {'a': "a",
                            'b': "b"},
             u('co^l1'): {'a': "a",
                          'b': "b"}}

    raw_rendered = DataFrame(cells).to_latex(escape=False)
    # default: escape=True
    escaped_rendered = DataFrame(cells).to_latex()

    raw_expected = r'''\begin{tabular}{lll}
\toprule
{} & co$e^x$ & co^l1 \\
\midrule
a & a & a \\
b & b & b \\
\bottomrule
\end{tabular}
'''
    escaped_expected = r'''\begin{tabular}{lll}
\toprule
{} & co\$e\textasciicircum x\$ & co\textasciicircum l1 \\
\midrule
a & a & a \\
b & b & b \\
\bottomrule
\end{tabular}
'''
    assert raw_rendered == raw_expected
    assert escaped_rendered == escaped_expected
def test_to_latex_special_escape(self):
    """Backslash, caret and tilde in cell values are replaced by text macros."""
    tricky = DataFrame([r"a\b\c", r"^a^b^c", r"~a~b~c"])
    rendered = tricky.to_latex()
    expected = r"""\begin{tabular}{ll}
\toprule
{} & 0 \\
\midrule
0 & a\textbackslash b\textbackslash c \\
1 & \textasciicircum a\textasciicircum b\textasciicircum c \\
2 & \textasciitilde a\textasciitilde b\textasciitilde c \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_longtable(self, frame):
    """Render ``longtable`` output, including its continuation footer."""
    frame.to_latex(longtable=True)

    small = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})

    rendered_with_index = small.to_latex(longtable=True)
    expected_with_index = r"""\begin{longtable}{lrl}
\toprule
{} & a & b \\
\midrule
\endhead
\midrule
\multicolumn{3}{r}{{Continued on next page}} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
0 & 1 & b1 \\
1 & 2 & b2 \\
\end{longtable}
"""
    assert rendered_with_index == expected_with_index

    rendered_without_index = small.to_latex(index=False, longtable=True)
    expected_without_index = r"""\begin{longtable}{rl}
\toprule
a & b \\
\midrule
\endhead
\midrule
\multicolumn{2}{r}{{Continued on next page}} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
1 & b1 \\
2 & b2 \\
\end{longtable}
"""
    assert rendered_without_index == expected_without_index

    # The footer's \multicolumn width follows the number of columns.
    one_column = DataFrame({'a': [1, 2]})
    one_column_rendered = one_column.to_latex(index=False, longtable=True)
    assert r"\multicolumn{1}" in one_column_rendered

    three_columns = DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
    three_columns_rendered = three_columns.to_latex(index=False,
                                                    longtable=True)
    assert r"\multicolumn{3}" in three_columns_rendered
def test_to_latex_escape_special_chars(self):
    """Each LaTeX special character in a cell is escaped individually."""
    specials = ['&', '%', '$', '#', '_', '{', '}', '~', '^',
                '\\']
    rendered = DataFrame(data=specials).to_latex()
    expected = r"""\begin{tabular}{ll}
\toprule
{} & 0 \\
\midrule
0 & \& \\
1 & \% \\
2 & \$ \\
3 & \# \\
4 & \_ \\
5 & \{ \\
6 & \} \\
7 & \textasciitilde \\
8 & \textasciicircum \\
9 & \textbackslash \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_no_header(self):
    """``header=False`` drops the column-label row and its midrule."""
    # GH 7124
    small = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})

    rendered_with_index = small.to_latex(header=False)
    expected_with_index = r"""\begin{tabular}{lrl}
\toprule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""
    assert rendered_with_index == expected_with_index

    rendered_without_index = small.to_latex(index=False, header=False)
    expected_without_index = r"""\begin{tabular}{rl}
\toprule
1 & b1 \\
2 & b2 \\
\bottomrule
\end{tabular}
"""
    assert rendered_without_index == expected_without_index
def test_to_latex_specified_header(self):
    """A list passed as ``header`` replaces the column labels."""
    # GH 7124
    small = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})

    rendered_with_index = small.to_latex(header=['AA', 'BB'])
    expected_with_index = r"""\begin{tabular}{lrl}
\toprule
{} & AA & BB \\
\midrule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""
    assert rendered_with_index == expected_with_index

    rendered_without_index = small.to_latex(header=['AA', 'BB'],
                                            index=False)
    expected_without_index = r"""\begin{tabular}{rl}
\toprule
AA & BB \\
\midrule
1 & b1 \\
2 & b2 \\
\bottomrule
\end{tabular}
"""
    assert rendered_without_index == expected_without_index

    rendered_unescaped = small.to_latex(header=['$A$', '$B$'],
                                        escape=False)
    expected_unescaped = r"""\begin{tabular}{lrl}
\toprule
{} & $A$ & $B$ \\
\midrule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""
    assert rendered_unescaped == expected_unescaped

    # A header list shorter than the number of columns is rejected.
    with pytest.raises(ValueError):
        small.to_latex(header=['A'])
def test_to_latex_decimal(self, frame):
    """``decimal=','`` is used as the decimal mark for float cells."""
    # GH 12031
    frame.to_latex()

    small = DataFrame({'a': [1.0, 2.1], 'b': ['b1', 'b2']})
    rendered = small.to_latex(decimal=',')
    expected = r"""\begin{tabular}{lrl}
\toprule
{} & a & b \\
\midrule
0 & 1,0 & b1 \\
1 & 2,1 & b2 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_series(self):
    """A Series renders as a one-column table with its index."""
    letters = Series(['a', 'b', 'c'])
    rendered = letters.to_latex()
    expected = r"""\begin{tabular}{ll}
\toprule
{} & 0 \\
\midrule
0 & a \\
1 & b \\
2 & c \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_bold_rows(self):
    """``bold_rows=True`` wraps each row label in ``\\textbf``."""
    # GH 16707
    small = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
    rendered = small.to_latex(bold_rows=True)
    expected = r"""\begin{tabular}{lrl}
\toprule
{} & a & b \\
\midrule
\textbf{0} & 1 & b1 \\
\textbf{1} & 2 & b2 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_no_bold_rows(self):
    """``bold_rows=False`` leaves the row labels unformatted."""
    # GH 16707
    small = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
    rendered = small.to_latex(bold_rows=False)
    expected = r"""\begin{tabular}{lrl}
\toprule
{} & a & b \\
\midrule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
@pytest.mark.parametrize('name0', [None, 'named0'])
@pytest.mark.parametrize('name1', [None, 'named1'])
@pytest.mark.parametrize('axes', [[0], [1], [0, 1]])
def test_to_latex_multiindex_names(self, name0, name1, axes):
    """Level names on rows and/or columns appear in the header rows."""
    # GH 18667
    names = [name0, name1]
    levels = pd.MultiIndex.from_product([[1, 2], [3, 4]])
    table = pd.DataFrame(-1, index=levels.copy(), columns=levels.copy())
    for axis_number in axes:
        table.axes[axis_number].names = names

    rows_named = 0 in axes
    cols_named = 1 in axes

    # Extra header row carrying the row-index names, when there are any.
    index_labels = tuple(label or '{}' for label in names)
    if rows_named and any(names):
        index_names_row = '%s & %s & & & & \\\\\n' % index_labels
    else:
        index_names_row = ''

    # Cells shown at the left of the two column-header rows.
    placeholder = '{}' if any(names) and cols_named else ' '
    column_labels = [label if (bool(label) and cols_named) else placeholder
                     for label in names]

    rendered = table.to_latex()
    expected = r"""\begin{tabular}{llrrrr}
\toprule
& %s & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} \\
& %s & 3 & 4 & 3 & 4 \\
%s\midrule
1 & 3 & -1 & -1 & -1 & -1 \\
& 4 & -1 & -1 & -1 & -1 \\
2 & 3 & -1 & -1 & -1 & -1 \\
& 4 & -1 & -1 & -1 & -1 \\
\bottomrule
\end{tabular}
""" % (tuple(column_labels) + (index_names_row,))
    assert rendered == expected
@pytest.mark.parametrize('one_row', [True, False])
def test_to_latex_multiindex_nans(self, one_row):
    """A NaN label inside a MultiIndex is printed as ``NaN``."""
    # GH 14249
    table = pd.DataFrame({'a': [None, 1], 'b': [2, 3], 'c': [4, 5]})
    if one_row:
        table = table.iloc[[0]]
    rendered = table.set_index(['a', 'b']).to_latex()

    # Assemble the expected text piecewise: header, optional second
    # data row, footer.
    pieces = [r"""\begin{tabular}{llr}
\toprule
& & c \\
a & b & \\
\midrule
NaN & 2 & 4 \\
"""]
    if not one_row:
        pieces.append(r"""1.0 & 3 & 5 \\
""")
    pieces.append(r"""\bottomrule
\end{tabular}
""")
    expected = ''.join(pieces)
    assert rendered == expected
def test_to_latex_non_string_index(self):
    """Integer index names and labels render without error."""
    # GH 19981
    table = pd.DataFrame([[1, 2, 3]] * 2).set_index([0, 1])
    rendered = table.to_latex()
    expected = r"""\begin{tabular}{llr}
\toprule
& & 2 \\
0 & 1 & \\
\midrule
1 & 2 & 3 \\
& 2 & 3 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_midrule_location(self):
    """With ``index_names=False`` the midrule directly follows the header."""
    # GH 18326
    table = pd.DataFrame({'a': [1, 2]})
    table.index.name = 'foo'
    rendered = table.to_latex(index_names=False)
    expected = r"""\begin{tabular}{lr}
\toprule
{} & a \\
\midrule
0 & 1 \\
1 & 2 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
def test_to_latex_multiindex_empty_name(self):
    """An empty-string level name still produces an index-name row."""
    # GH 18669
    row_index = pd.MultiIndex.from_product([[1, 2]], names=[''])
    table = pd.DataFrame(-1, index=row_index, columns=range(4))
    rendered = table.to_latex()
    expected = r"""\begin{tabular}{lrrrr}
\toprule
& 0 & 1 & 2 & 3 \\
{} & & & & \\
\midrule
1 & -1 & -1 & -1 & -1 \\
2 & -1 & -1 & -1 & -1 \\
\bottomrule
\end{tabular}
"""
    assert rendered == expected
@@ -0,0 +1,367 @@
#!/usr/bin/env python
"""
Self-contained script to write legacy storage (pickle/msgpack) files.
To use this script, create an environment for the pandas version you
want to generate pickles for, say 0.18.1, with your pandas clone
in ~/pandas
. activate pandas_0.18.1
cd ~/
$ python pandas/pandas/tests/io/generate_legacy_storage_files.py \
pandas/pandas/tests/io/data/legacy_pickle/0.18.1/ pickle
This script generates a storage file for the current arch, system,
and python version
pandas version: 0.18.1
output dir : pandas/pandas/tests/io/data/legacy_pickle/0.18.1/
storage format: pickle
created pickle file: 0.18.1_x86_64_darwin_3.5.2.pickle
The idea here is you are using the *current* version of the
generate_legacy_storage_files with an *older* version of pandas to
generate a pickle file. We will then check this file into a current
branch, and test using test_pickle.py. This will load the *older*
pickles and test versus the current data that is generated
(with master). These are then compared.
If we have cases where we changed the signature (e.g. we renamed
offset -> freq in Timestamp), then we have to conditionally execute
code in generate_legacy_storage_files.py to make it
run under the older AND the newer version.
"""
from __future__ import print_function
from warnings import catch_warnings
from distutils.version import LooseVersion
from pandas import (Series, DataFrame, Panel,
SparseSeries, SparseDataFrame,
Index, MultiIndex, bdate_range, to_msgpack,
date_range, period_range, timedelta_range,
Timestamp, NaT, Categorical, Period)
from pandas.tseries.offsets import (
DateOffset, Hour, Minute, Day,
MonthBegin, MonthEnd, YearBegin,
YearEnd, Week, WeekOfMonth, LastWeekOfMonth,
BusinessDay, BusinessHour, CustomBusinessDay, FY5253,
Easter,
SemiMonthEnd, SemiMonthBegin,
QuarterBegin, QuarterEnd)
from pandas.compat import u
import os
import sys
import numpy as np
import pandas
import platform as pl
from datetime import timedelta
# Parsed version of the installed pandas; the functions in this script
# compare it against release thresholds to decide what can be generated.
_loose_version = LooseVersion(pandas.__version__)
def _create_sp_series():
    """Return a block-kind SparseSeries of 15 floats named ``bseries``.

    Positions 7-11 and the last position are NaN.
    """
    # nan-based
    values = np.arange(15, dtype=np.float64)
    values[7:12] = np.nan
    values[-1:] = np.nan

    sparse = SparseSeries(values, kind='block')
    sparse.name = u'bseries'
    return sparse
def _create_sp_tsseries():
    """Return a block-kind SparseSeries named ``btsseries`` on business days.

    Same values as the plain sparse series (NaN at positions 7-11 and at
    the end), indexed by business days starting 1/1/2011.
    """
    # nan-based
    values = np.arange(15, dtype=np.float64)
    values[7:12] = np.nan
    values[-1:] = np.nan

    business_days = bdate_range('1/1/2011', periods=len(values))
    sparse = SparseSeries(values, index=business_days, kind='block')
    sparse.name = u'btsseries'
    return sparse
def _create_sp_frame():
    """Return a 10-row SparseDataFrame with columns A-D on business days."""
    na = np.nan
    columns = {u'A': [na, na, na, 0, 1, 2, 3, 4, 5, 6],
               u'B': [0, 1, 2, na, na, na, 3, 4, 5, 6],
               u'C': np.arange(10).astype(np.int64),
               u'D': [0, 1, 2, 3, 4, 5, na, na, na, na]}
    business_days = bdate_range('1/1/2011', periods=10)
    return SparseDataFrame(columns, index=business_days)
def create_data():
    """Create the pickle/msgpack data.

    Builds one sample of each kind of pandas object this script stores and
    returns them in a dict keyed by category: ``series``, ``frame``,
    ``panel``, ``index``, ``scalars``, ``mi``, ``sp_series``, ``sp_frame``,
    ``cat``, ``timestamp`` and ``offsets``. Each value is itself a dict
    (or, for ``offsets``, a dict of offset objects) keyed by a short name.

    Entries that need a newer pandas are only added when ``_loose_version``
    is recent enough.
    """

    # Raw column data shared by several of the Series/DataFrame samples.
    data = {
        u'A': [0., 1., 2., 3., np.nan],
        u'B': [0, 1, 0, 1, 0],
        u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'],
        u'D': date_range('1/1/2009', periods=5),
        u'E': [0., 1, Timestamp('20100101'), u'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'),
                   period=Period('2012', 'M'))

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10),
                 float=Index(np.arange(10, dtype=np.float64)),
                 uint=Index(np.arange(10, dtype=np.uint64)),
                 timedelta=timedelta_range('00:00:00', freq='30T', periods=10))

    # RangeIndex / interval_range are imported lazily because they only
    # exist from the pandas versions checked here.
    if _loose_version >= LooseVersion('0.18'):
        from pandas import RangeIndex
        index['range'] = RangeIndex(10)

    if _loose_version >= LooseVersion('0.21'):
        from pandas import interval_range
        index['interval'] = interval_range(0, periods=10)

    mi = dict(reg2=MultiIndex.from_tuples(
        tuple(zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo',
                     u'foo', u'qux', u'qux'],
                    [u'one', u'two', u'one', u'two', u'one',
                     u'two', u'one', u'two']])),
        names=[u'first', u'second']))

    series = dict(float=Series(data[u'A']),
                  int=Series(data[u'B']),
                  mixed=Series(data[u'E']),
                  ts=Series(np.arange(10).astype(np.int64),
                            index=date_range('20130101', periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(
                                tuple(zip(*[[1, 1, 2, 2, 2],
                                            [3, 4, 3, 4, 5]])),
                                names=[u'one', u'two'])),
                  dup=Series(np.arange(5).astype(np.float64),
                             index=[u'A', u'B', u'C', u'D', u'A']),
                  cat=Series(Categorical([u'foo', u'bar', u'baz'])),
                  dt=Series(date_range('20130101', periods=5)),
                  dt_tz=Series(date_range('20130101', periods=5,
                                          tz='US/Eastern')),
                  period=Series([Period('2000Q1')] * 5))

    # Mixed-dtype frame whose column labels are made non-unique ('A' twice).
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list(u"ABCDA")
    frame = dict(float=DataFrame({u'A': series[u'float'],
                                  u'B': series[u'float'] + 1}),
                 int=DataFrame({u'A': series[u'int'],
                                u'B': series[u'int'] + 1}),
                 mixed=DataFrame({k: data[k]
                                  for k in [u'A', u'B', u'C', u'D']}),
                 mi=DataFrame({u'A': np.arange(5).astype(np.float64),
                               u'B': np.arange(5).astype(np.int64)},
                              index=MultiIndex.from_tuples(
                                  tuple(zip(*[[u'bar', u'bar', u'baz',
                                               u'baz', u'baz'],
                                              [u'one', u'two', u'one',
                                               u'two', u'three']])),
                                  names=[u'first', u'second'])),
                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                               columns=[u'A', u'B', u'A']),
                 cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}),
                 cat_and_float=DataFrame({
                     u'A': Categorical([u'foo', u'bar', u'baz']),
                     u'B': np.arange(3).astype(np.int64)}),
                 mixed_dup=mixed_dup_df,
                 dt_mixed_tzs=DataFrame({
                     u'A': Timestamp('20130102', tz='US/Eastern'),
                     u'B': Timestamp('20130603', tz='CET')}, index=range(5)),
                 dt_mixed2_tzs=DataFrame({
                     u'A': Timestamp('20130102', tz='US/Eastern'),
                     u'B': Timestamp('20130603', tz='CET'),
                     u'C': Timestamp('20130603', tz='UTC')}, index=range(5))
                 )

    # Warnings raised while building the Panel samples are recorded and
    # discarded.
    # NOTE(review): the extent of this ``with`` block was reconstructed
    # from a copy of the file that had lost its indentation -- confirm.
    with catch_warnings(record=True):
        mixed_dup_panel = Panel({u'ItemA': frame[u'float'],
                                 u'ItemB': frame[u'int']})
        mixed_dup_panel.items = [u'ItemA', u'ItemA']
        panel = dict(float=Panel({u'ItemA': frame[u'float'],
                                  u'ItemB': frame[u'float'] + 1}),
                     dup=Panel(
                         np.arange(30).reshape(3, 5, 2).astype(np.float64),
                         items=[u'A', u'B', u'A']),
                     mixed_dup=mixed_dup_panel)

    # Categoricals with 7, 1000 and 10000 categories; the keys suggest
    # these are meant to need int8/int16/int32 codes -- TODO confirm.
    cat = dict(int8=Categorical(list('abcdefg')),
               int16=Categorical(np.arange(1000)),
               int32=Categorical(np.arange(10000)))

    timestamp = dict(normal=Timestamp('2011-01-01'),
                     nat=NaT,
                     tz=Timestamp('2011-01-01', tz='US/Eastern'))

    # Older pandas spells the Timestamp keyword ``offset``; newer pandas
    # spells it ``freq``.
    if _loose_version < LooseVersion('0.19.2'):
        timestamp['freq'] = Timestamp('2011-01-01', offset='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      offset='M')
    else:
        timestamp['freq'] = Timestamp('2011-01-01', freq='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      freq='M')

    # One instance of each date offset class, keyed by a descriptive name.
    off = {'DateOffset': DateOffset(years=1),
           'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
           'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
           'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
           'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
           'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
           'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
           'MonthBegin': MonthBegin(1),
           'MonthEnd': MonthEnd(1),
           'QuarterBegin': QuarterBegin(1),
           'QuarterEnd': QuarterEnd(1),
           'Day': Day(1),
           'YearBegin': YearBegin(1),
           'YearEnd': YearEnd(1),
           'Week': Week(1),
           'Week_Tues': Week(2, normalize=False, weekday=1),
           'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
           'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
           'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
           'Easter': Easter(),
           'Hour': Hour(1),
           'Minute': Minute(1)}

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()),
                cat=cat,
                timestamp=timestamp,
                offsets=off)
def create_pickle_data():
    """Return the sample data, minus entries old pandas cannot pickle."""
    data = create_data()

    # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and
    # panels if their columns/items were non-unique.
    if _loose_version < LooseVersion('0.14.1'):
        for section in ('frame', 'panel'):
            del data[section]['mixed_dup']

    if _loose_version < LooseVersion('0.17.0'):
        for section in ('series', 'scalars'):
            del data[section]['period']

    return data
def _u(x):
return {u(k): _u(x[k]) for k in x} if isinstance(x, dict) else x
def create_msgpack_data():
    """Return the sample data restricted to what msgpack can store.

    Starts from ``create_data()``, removes entries that the running pandas
    version cannot serialise to msgpack, and passes the result through
    ``_u`` to convert the dict keys.

    NOTE(review): the nesting of the statements below was reconstructed
    from a copy of the file that had lost its indentation; the
    "Not supported" deletions and ``del data['offsets']`` are taken to be
    unconditional -- confirm against the upstream file.
    """
    data = create_data()
    if _loose_version < LooseVersion('0.17.0'):
        del data['frame']['mixed_dup']
        del data['panel']['mixed_dup']
        del data['frame']['dup']
        del data['panel']['dup']
    if _loose_version < LooseVersion('0.18.0'):
        del data['series']['dt_tz']
        del data['frame']['dt_mixed_tzs']
    # Not supported
    del data['sp_series']
    del data['sp_frame']
    del data['series']['cat']
    del data['series']['period']
    del data['frame']['cat_onecol']
    del data['frame']['cat_and_float']
    del data['scalars']['period']
    if _loose_version < LooseVersion('0.23.0'):
        # create_data() only adds 'interval' for pandas >= 0.21, so on
        # older versions the key is absent; pop() with a default avoids
        # the KeyError a plain ``del`` would raise there.
        data['index'].pop('interval', None)
    del data['offsets']
    return _u(data)
def platform_name():
    """Return ``<pandas version>_<machine>_<system>_<python version>``.

    The system name is lower-cased; the parts are joined with underscores.
    """
    parts = (
        pandas.__version__,
        pl.machine(),
        pl.system().lower(),
        pl.python_version(),
    )
    return '_'.join(str(part) for part in parts)
def write_legacy_pickles(output_dir):
    """Write the sample data as ``<platform_name()>.pickle`` in ``output_dir``.

    Prints a short report of the pandas version, output directory and
    created file name.

    Parameters
    ----------
    output_dir : str
        Existing directory in which the pickle file is created.
    """
    # make sure we are < 0.13 compat (in py3)
    try:
        from pandas.compat import zip, cPickle as pickle  # noqa
    except ImportError:
        # Only a missing module/name should trigger the fallback; a bare
        # ``except`` would also hide unrelated errors and Ctrl-C.
        import pickle

    version = pandas.__version__

    print("This script generates a storage file for the current arch, system, "
          "and python version")
    print("  pandas version: {0}".format(version))
    print("  output dir    : {0}".format(output_dir))
    print("  storage format: pickle")

    pth = '{0}.pickle'.format(platform_name())

    # Build the data before opening the file so a failure here does not
    # leave an empty pickle behind, and let ``with`` close the handle even
    # if dumping raises.
    payload = create_pickle_data()
    with open(os.path.join(output_dir, pth), 'wb') as fh:
        pickle.dump(payload, fh, pickle.HIGHEST_PROTOCOL)

    print("created pickle file: %s" % pth)
def write_legacy_msgpack(output_dir, compress):
    """Write the sample data as ``<platform_name()>.msgpack`` in ``output_dir``.

    ``compress`` is forwarded unchanged to ``to_msgpack``. A short report
    is printed before and after writing.
    """
    pandas_version = pandas.__version__

    report = (
        "This script generates a storage file for the current arch, "
        "system, and python version",
        "  pandas version: {0}".format(pandas_version),
        "  output dir    : {0}".format(output_dir),
        "  storage format: msgpack",
    )
    for line in report:
        print(line)

    file_name = '{0}.msgpack'.format(platform_name())
    target = os.path.join(output_dir, file_name)
    to_msgpack(target, create_msgpack_data(), compress=compress)

    print("created msgpack file: %s" % file_name)
def write_legacy_file():
    """Command-line entry point: parse ``sys.argv`` and write one file.

    Expected arguments: ``<output_dir> <storage_type>
    [<msgpack_compress_type>]`` where ``storage_type`` is ``pickle`` or
    ``msgpack``.

    Raises
    ------
    SystemExit
        With a usage message when the argument count is wrong or the
        storage type is not recognised.
    """
    # force our cwd to be the first searched
    sys.path.insert(0, '.')

    # sys.exit is used instead of the ``exit`` builtin: the latter is
    # installed by the ``site`` module for interactive use and is not
    # guaranteed to exist (e.g. under ``python -S``).
    if not (3 <= len(sys.argv) <= 4):
        sys.exit("Specify output directory and storage type: generate_legacy_"
                 "storage_files.py <output_dir> <storage_type> "
                 "<msgpack_compress_type>")

    output_dir = str(sys.argv[1])
    storage_type = str(sys.argv[2])
    # The compression type is optional (fourth argv entry).
    compress_type = str(sys.argv[3]) if len(sys.argv) > 3 else None

    if storage_type == 'pickle':
        write_legacy_pickles(output_dir=output_dir)
    elif storage_type == 'msgpack':
        write_legacy_msgpack(output_dir=output_dir, compress=compress_type)
    else:
        sys.exit("storage_type must be one of {'pickle', 'msgpack'}")
# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    write_legacy_file()
@@ -0,0 +1,90 @@
import pytest
import pandas as pd
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_raises_regex
def test_compression_roundtrip(compression):
    """Write compressed JSON, read it back, and check it really was compressed."""
    original = pd.DataFrame([[0.123456, 0.234567, 0.567567],
                             [12.32112, 123123.2, 321321.2]],
                            index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with tm.ensure_clean() as json_path:
        original.to_json(json_path, compression=compression)
        reloaded = pd.read_json(json_path, compression=compression)
        assert_frame_equal(original, reloaded)

        # explicitly ensure file was compressed.
        with tm.decompress_file(json_path, compression) as handle:
            decoded = handle.read().decode('utf8')
        assert_frame_equal(original, pd.read_json(decoded))
def test_read_zipped_json(datapath):
    """The zipped fixture must load to the same frame as the plain one."""
    plain_path = datapath("io", "json", "data", "tsframe_v012.json")
    plain_df = pd.read_json(plain_path)

    zipped_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
    zipped_df = pd.read_json(zipped_path, compression='zip')

    assert_frame_equal(plain_df, zipped_df)
def test_with_s3_url(compression):
    """Round-trip compressed JSON through a moto-mocked S3 bucket."""
    boto3 = pytest.importorskip('boto3')
    pytest.importorskip('s3fs')
    moto = pytest.importorskip('moto')

    original = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')

    with moto.mock_s3():
        s3 = boto3.resource("s3", region_name="us-east-1")
        test_bucket = s3.create_bucket(Bucket="pandas-test")

        # Write locally first, then upload the file's bytes to the bucket.
        with tm.ensure_clean() as local_path:
            original.to_json(local_path, compression=compression)
            with open(local_path, 'rb') as handle:
                test_bucket.put_object(Key='test-1', Body=handle)

        from_s3 = pd.read_json('s3://pandas-test/test-1',
                               compression=compression)
        assert_frame_equal(original, from_s3)
def test_lines_with_compression(compression):
    """Line-delimited JSON survives a compressed write/read round trip."""
    original = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')

    with tm.ensure_clean() as json_path:
        original.to_json(json_path, orient='records', lines=True,
                         compression=compression)
        reloaded = pd.read_json(json_path, lines=True,
                                compression=compression)
        assert_frame_equal(original, reloaded)
def test_chunksize_with_compression(compression):
    """Chunked reading of compressed line-delimited JSON rebuilds the frame."""
    original = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')

    with tm.ensure_clean() as json_path:
        original.to_json(json_path, orient='records', lines=True,
                         compression=compression)

        chunks = pd.read_json(json_path, lines=True, chunksize=1,
                              compression=compression)
        reloaded = pd.concat(chunks)
        assert_frame_equal(original, reloaded)
def test_write_unsupported_compression_type():
    """``to_json`` rejects an unknown compression name with ValueError."""
    frame_to_write = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
    with tm.ensure_clean() as json_path:
        expected_message = "Unrecognized compression type: unsupported"
        with assert_raises_regex(ValueError, expected_message):
            frame_to_write.to_json(json_path, compression="unsupported")
def test_read_unsupported_compression_type():
    """``read_json`` rejects an unknown compression name with ValueError."""
    with tm.ensure_clean() as json_path:
        expected_message = "Unrecognized compression type: unsupported"
        with assert_raises_regex(ValueError, expected_message):
            pd.read_json(json_path, compression="unsupported")
@@ -0,0 +1,575 @@
"""Tests for Table Schema integration."""
import json
from collections import OrderedDict
import numpy as np
import pandas as pd
import pytest
from pandas import DataFrame
from pandas.core.dtypes.dtypes import (
PeriodDtype, CategoricalDtype, DatetimeTZDtype)
from pandas.io.json.table_schema import (
as_json_table_type,
build_table_schema,
convert_pandas_type_to_json_field,
convert_json_field_to_pandas_type,
set_default_names)
import pandas.util.testing as tm
class TestBuildSchema(object):
    """Tests for ``build_table_schema`` on frames and series."""

    def setup_method(self, method):
        """Create the four-column frame, indexed by ``idx``, used by the tests."""
        columns = {
            'A': [1, 2, 3, 4],
            'B': ['a', 'b', 'c', 'c'],
            'C': pd.date_range('2016-01-01', freq='d', periods=4),
            'D': pd.timedelta_range('1H', periods=4, freq='T'),
        }
        self.df = DataFrame(columns, index=pd.Index(range(4), name='idx'))

    def test_build_table_schema(self):
        """A frame yields one field per index/column plus the primary key."""
        schema = build_table_schema(self.df, version=False)
        expected = {
            'fields': [{'name': 'idx', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'datetime'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primaryKey': ['idx']
        }
        assert schema == expected

        # With the default arguments the pandas version is recorded too.
        versioned = build_table_schema(self.df)
        assert "pandas_version" in versioned

    def test_series(self):
        """A named Series contributes its name as the value field."""
        named = pd.Series([1, 2, 3], name='foo')
        schema = build_table_schema(named, version=False)
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'foo', 'type': 'integer'}],
                    'primaryKey': ['index']}
        assert schema == expected

        versioned = build_table_schema(named)
        assert 'pandas_version' in versioned

    def test_series_unnamed(self):
        """An unnamed Series uses ``values`` as the field name."""
        schema = build_table_schema(pd.Series([1, 2, 3]), version=False)
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'values', 'type': 'integer'}],
                    'primaryKey': ['index']}
        assert schema == expected

    def test_multiindex(self):
        """MultiIndex levels become fields; unnamed levels get ``level_N``."""
        table = self.df.copy()
        table.index = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])

        schema = build_table_schema(table, version=False)
        expected = {
            'fields': [{'name': 'level_0', 'type': 'string'},
                       {'name': 'level_1', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'datetime'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primaryKey': ['level_0', 'level_1']
        }
        assert schema == expected

        # Name only the first level; the second keeps its default name.
        table.index.names = ['idx0', None]
        expected['fields'][0]['name'] = 'idx0'
        expected['primaryKey'] = ['idx0', 'level_1']
        schema = build_table_schema(table, version=False)
        assert schema == expected
class TestTableSchemaType(object):
    """Check ``as_json_table_type`` for data containers and for bare dtypes.

    The builtin ``int`` / ``float`` / ``bool`` are used in the
    parametrizations instead of ``np.int`` / ``np.float`` / ``np.bool``:
    those NumPy names were plain aliases of the builtins, were deprecated
    in NumPy 1.20 and removed in NumPy 1.24.
    """

    @pytest.mark.parametrize('int_type', [
        int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_data(self, int_type):
        int_data = [1, 2, 3]
        assert as_json_table_type(np.array(
            int_data, dtype=int_type)) == 'integer'

    @pytest.mark.parametrize('float_type', [
        float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_data(self, float_type):
        float_data = [1., 2., 3.]
        assert as_json_table_type(np.array(
            float_data, dtype=float_type)) == 'number'

    @pytest.mark.parametrize('bool_type', [bool, np.bool_])
    def test_as_json_table_type_bool_data(self, bool_type):
        bool_data = [True, False]
        assert as_json_table_type(np.array(
            bool_data, dtype=bool_type)) == 'boolean'

    @pytest.mark.parametrize('date_data', [
        pd.to_datetime(['2016']),
        pd.to_datetime(['2016'], utc=True),
        pd.Series(pd.to_datetime(['2016'])),
        pd.Series(pd.to_datetime(['2016'], utc=True)),
        pd.period_range('2016', freq='A', periods=3)
    ])
    def test_as_json_table_type_date_data(self, date_data):
        assert as_json_table_type(date_data) == 'datetime'

    @pytest.mark.parametrize('str_data', [
        pd.Series(['a', 'b']), pd.Index(['a', 'b'])])
    def test_as_json_table_type_string_data(self, str_data):
        assert as_json_table_type(str_data) == 'string'

    # The original list contained ``pd.Categorical([1])`` twice; the
    # duplicate added no coverage and has been dropped.
    @pytest.mark.parametrize('cat_data', [
        pd.Categorical(['a']),
        pd.Categorical([1]),
        pd.Series(pd.Categorical([1])),
        pd.CategoricalIndex([1])])
    def test_as_json_table_type_categorical_data(self, cat_data):
        assert as_json_table_type(cat_data) == 'any'

    # ------
    # dtypes
    # ------
    @pytest.mark.parametrize('int_dtype', [
        int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_dtypes(self, int_dtype):
        assert as_json_table_type(int_dtype) == 'integer'

    @pytest.mark.parametrize('float_dtype', [
        float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_dtypes(self, float_dtype):
        assert as_json_table_type(float_dtype) == 'number'

    @pytest.mark.parametrize('bool_dtype', [bool, np.bool_])
    def test_as_json_table_type_bool_dtypes(self, bool_dtype):
        assert as_json_table_type(bool_dtype) == 'boolean'

    @pytest.mark.parametrize('date_dtype', [
        np.datetime64, np.dtype("<M8[ns]"), PeriodDtype(),
        DatetimeTZDtype('ns', 'US/Central')])
    def test_as_json_table_type_date_dtypes(self, date_dtype):
        # TODO: datetime.date? datetime.time?
        assert as_json_table_type(date_dtype) == 'datetime'

    @pytest.mark.parametrize('td_dtype', [
        np.timedelta64, np.dtype("<m8[ns]")])
    def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
        assert as_json_table_type(td_dtype) == 'duration'

    @pytest.mark.parametrize('str_dtype', [object])  # TODO
    def test_as_json_table_type_string_dtypes(self, str_dtype):
        assert as_json_table_type(str_dtype) == 'string'

    def test_as_json_table_type_categorical_dtypes(self):
        # TODO: I think before is_categorical_dtype(Categorical)
        # returned True, but now it's False. Figure out why or
        # if it matters
        assert as_json_table_type(pd.Categorical(['a'])) == 'any'
        assert as_json_table_type(CategoricalDtype()) == 'any'
class TestTableOrient(object):
    """Tests for ``to_json(orient='table')`` output and the schema helpers."""

    @staticmethod
    def _table_json(obj):
        """Serialize ``obj`` as orient='table' with ISO dates and decode it.

        Objects are decoded into ``OrderedDict`` so key order is kept.
        """
        payload = obj.to_json(orient='table', date_format='iso')
        return json.loads(payload, object_pairs_hook=OrderedDict)

    def setup_method(self, method):
        """Create the eight-column frame (index named 'idx') used below."""
        columns = {
            'A': [1, 2, 3, 4],
            'B': ['a', 'b', 'c', 'c'],
            'C': pd.date_range('2016-01-01', freq='d', periods=4),
            'D': pd.timedelta_range('1H', periods=4, freq='T'),
            'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
            'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                          ordered=True)),
            'G': [1., 2., 3, 4.],
            'H': pd.date_range('2016-01-01', freq='d', periods=4,
                               tz='US/Central'),
        }
        self.df = DataFrame(columns, index=pd.Index(range(4), name='idx'))

    def test_build_series(self):
        ser = pd.Series([1, 2], name='a')
        ser.index.name = 'id'
        decoded = self._table_json(ser)

        assert "pandas_version" in decoded['schema']
        decoded['schema'].pop('pandas_version')

        schema = {
            'fields': [{'name': 'id', 'type': 'integer'},
                       {'name': 'a', 'type': 'integer'}],
            'primaryKey': ['id'],
        }
        rows = [OrderedDict([('id', label), ('a', value)])
                for label, value in [(0, 1), (1, 2)]]
        expected = OrderedDict([('schema', schema), ('data', rows)])
        assert decoded == expected

    def test_to_json(self):
        frame = self.df.copy()
        frame.index.name = 'idx'
        decoded = self._table_json(frame)

        assert "pandas_version" in decoded['schema']
        decoded['schema'].pop('pandas_version')

        categories = {'enum': ['a', 'b', 'c']}
        fields = [
            {'name': 'idx', 'type': 'integer'},
            {'name': 'A', 'type': 'integer'},
            {'name': 'B', 'type': 'string'},
            {'name': 'C', 'type': 'datetime'},
            {'name': 'D', 'type': 'duration'},
            {'constraints': dict(categories),
             'name': 'E',
             'ordered': False,
             'type': 'any'},
            {'constraints': dict(categories),
             'name': 'F',
             'ordered': True,
             'type': 'any'},
            {'name': 'G', 'type': 'number'},
            {'name': 'H', 'type': 'datetime', 'tz': 'US/Central'},
        ]
        schema = {'fields': fields, 'primaryKey': ['idx']}

        # Row i holds: A = i + 1, day-of-month = i + 1, a timedelta of
        # 1 hour + i minutes, and the i-th of these letters in B, E and F.
        letters = ['a', 'b', 'c', 'c']
        rows = [
            OrderedDict([
                ('idx', i), ('A', i + 1), ('B', letters[i]),
                ('C', '2016-01-0{}T00:00:00.000Z'.format(i + 1)),
                ('D', 'P0DT1H{}M0S'.format(i)),
                ('E', letters[i]), ('F', letters[i]), ('G', float(i + 1)),
                ('H', '2016-01-0{}T06:00:00.000Z'.format(i + 1)),
            ])
            for i in range(4)
        ]
        expected = OrderedDict([('schema', schema), ('data', rows)])
        assert decoded == expected

    def test_to_json_float_index(self):
        ser = pd.Series(1, index=[1., 2.])
        decoded = self._table_json(ser)
        decoded['schema'].pop('pandas_version')

        schema = {
            'fields': [{'name': 'index', 'type': 'number'},
                       {'name': 'values', 'type': 'integer'}],
            'primaryKey': ['index'],
        }
        rows = [OrderedDict([('index', label), ('values', 1)])
                for label in (1.0, 2.0)]
        expected = OrderedDict([('schema', schema), ('data', rows)])
        assert decoded == expected

    def test_to_json_period_index(self):
        periods = pd.period_range('2016', freq='Q-JAN', periods=2)
        ser = pd.Series(1, periods)
        decoded = self._table_json(ser)
        decoded['schema'].pop('pandas_version')

        schema = {
            'fields': [
                {'freq': 'Q-JAN', 'name': 'index', 'type': 'datetime'},
                {'name': 'values', 'type': 'integer'}],
            'primaryKey': ['index'],
        }
        stamps = ['2015-11-01T00:00:00.000Z', '2016-02-01T00:00:00.000Z']
        rows = [OrderedDict([('index', stamp), ('values', 1)])
                for stamp in stamps]
        expected = OrderedDict([('schema', schema), ('data', rows)])
        assert decoded == expected

    def test_to_json_categorical_index(self):
        ser = pd.Series(1, pd.CategoricalIndex(['a', 'b']))
        decoded = self._table_json(ser)
        decoded['schema'].pop('pandas_version')

        schema = {
            'fields': [{'name': 'index', 'type': 'any',
                        'constraints': {'enum': ['a', 'b']},
                        'ordered': False},
                       {'name': 'values', 'type': 'integer'}],
            'primaryKey': ['index'],
        }
        rows = [OrderedDict([('index', label), ('values', 1)])
                for label in ('a', 'b')]
        expected = OrderedDict([('schema', schema), ('data', rows)])
        assert decoded == expected

    def test_date_format_raises(self):
        with pytest.raises(ValueError):
            self.df.to_json(orient='table', date_format='epoch')

        # others work
        self.df.to_json(orient='table', date_format='iso')
        self.df.to_json(orient='table')

    @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
    def test_convert_pandas_type_to_json_field_int(self, kind):
        container = kind([1, 2, 3], name='name')
        expected = {"name": "name", "type": "integer"}
        assert convert_pandas_type_to_json_field(container) == expected

    @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
    def test_convert_pandas_type_to_json_field_float(self, kind):
        container = kind([1., 2., 3.], name='name')
        expected = {"name": "name", "type": "number"}
        assert convert_pandas_type_to_json_field(container) == expected

    @pytest.mark.parametrize('dt_args,extra_exp', [
        ({}, {}), ({'utc': True}, {'tz': 'UTC'})])
    @pytest.mark.parametrize('wrapper', [None, pd.Series])
    def test_convert_pandas_type_to_json_field_datetime(self, dt_args,
                                                        extra_exp, wrapper):
        stamps = pd.to_datetime([1., 2., 3.], **dt_args)
        if wrapper is pd.Series:
            stamps = pd.Series(stamps, name='values')

        expected = {"name": "values", "type": 'datetime'}
        expected.update(extra_exp)
        assert convert_pandas_type_to_json_field(stamps) == expected

    def test_convert_pandas_type_to_json_period_range(self):
        periods = pd.period_range('2016', freq='A-DEC', periods=4)
        expected = {"name": "values", "type": 'datetime', "freq": "A-DEC"}
        assert convert_pandas_type_to_json_field(periods) == expected

    @pytest.mark.parametrize('kind', [pd.Categorical, pd.CategoricalIndex])
    @pytest.mark.parametrize('ordered', [True, False])
    def test_convert_pandas_type_to_json_field_categorical(self, kind,
                                                           ordered):
        labels = ['a', 'b', 'c']
        if kind is pd.Categorical:
            # A bare Categorical is wrapped in a Series to carry the name.
            arr = pd.Series(kind(labels, ordered=ordered), name='cats')
        elif kind is pd.CategoricalIndex:
            arr = kind(labels, ordered=ordered, name='cats')

        expected = {"name": "cats", "type": "any",
                    "constraints": {"enum": labels},
                    "ordered": ordered}
        assert convert_pandas_type_to_json_field(arr) == expected

    @pytest.mark.parametrize("inp,exp", [
        ({'type': 'integer'}, 'int64'),
        ({'type': 'number'}, 'float64'),
        ({'type': 'boolean'}, 'bool'),
        ({'type': 'duration'}, 'timedelta64'),
        ({'type': 'datetime'}, 'datetime64[ns]'),
        ({'type': 'datetime', 'tz': 'US/Hawaii'}, 'datetime64[ns, US/Hawaii]'),
        ({'type': 'any'}, 'object'),
        ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
          'ordered': False}, CategoricalDtype(categories=['a', 'b', 'c'],
                                              ordered=False)),
        ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
          'ordered': True}, CategoricalDtype(categories=['a', 'b', 'c'],
                                             ordered=True)),
        ({'type': 'string'}, 'object')])
    def test_convert_json_field_to_pandas_type(self, inp, exp):
        field = {'name': 'foo'}
        field.update(inp)
        assert convert_json_field_to_pandas_type(field) == exp

    @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
    def test_convert_json_field_to_pandas_type_raises(self, inp):
        expected_msg = "Unsupported or invalid field type: {}".format(inp)
        with tm.assert_raises_regex(ValueError, expected_msg):
            convert_json_field_to_pandas_type({'type': inp})

    def test_categorical(self):
        ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        ser.index.name = 'idx'
        decoded = self._table_json(ser)
        decoded['schema'].pop('pandas_version')

        fields = [{'name': 'idx', 'type': 'integer'},
                  {'constraints': {'enum': ['a', 'b']},
                   'name': 'values',
                   'ordered': False,
                   'type': 'any'}]
        rows = [OrderedDict([('idx', label), ('values', value)])
                for label, value in enumerate(['a', 'b', 'a'])]
        expected = OrderedDict([
            ('schema', {'fields': fields, 'primaryKey': ['idx']}),
            ('data', rows)])
        assert decoded == expected

    @pytest.mark.parametrize('idx,nm,prop', [
        (pd.Index([1]), 'index', 'name'),
        (pd.Index([1], name='myname'), 'myname', 'name'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')]),
         ['level_0', 'level_1'], 'names'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
                                    names=['n1', 'n2']),
         ['n1', 'n2'], 'names'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
                                    names=['n1', None]),
         ['n1', 'level_1'], 'names')
    ])
    def test_set_names_unset(self, idx, nm, prop):
        named = set_default_names(pd.Series(1, idx))
        assert getattr(named.index, prop) == nm

    @pytest.mark.parametrize("idx", [
        pd.Index([], name='index'),
        pd.MultiIndex.from_arrays([['foo'], ['bar']],
                                  names=('level_0', 'level_1')),
        pd.MultiIndex.from_arrays([['foo'], ['bar']],
                                  names=('foo', 'level_1'))
    ])
    def test_warns_non_roundtrippable_names(self, idx):
        # GH 19130
        frame = pd.DataFrame([[]], index=idx)
        frame.index.name = 'index'
        with tm.assert_produces_warning():
            set_default_names(frame)

    def test_timestamp_in_columns(self):
        column_labels = [pd.Timestamp('2016'), pd.Timedelta(10, unit='s')]
        frame = pd.DataFrame([[1, 2]], columns=column_labels)
        fields = json.loads(frame.to_json(orient="table"))['schema']['fields']
        assert fields[1]['name'] == 1451606400000
        assert fields[2]['name'] == 10000

    @pytest.mark.parametrize('case', [
        pd.Series([1], index=pd.Index([1], name='a'), name='a'),
        pd.DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
        pd.DataFrame({"A": [1]}, index=pd.MultiIndex.from_arrays([
            ['a'], [1]], names=["A", "a"]))
    ])
    def test_overlapping_names(self, case):
        with tm.assert_raises_regex(ValueError, 'Overlapping'):
            case.to_json(orient='table')

    def test_mi_falsey_name(self):
        # GH 16203
        unnamed_mi = pd.MultiIndex.from_product([('A', 'B'), ('a', 'b')])
        frame = pd.DataFrame(np.random.randn(4, 4), index=unnamed_mi)
        field_names = [field['name']
                       for field in build_table_schema(frame)['fields']]
        assert field_names == ['level_0', 'level_1', 0, 1, 2, 3]
class TestTableOrientReader(object):
    """Round-trip tests for ``read_json(..., orient='table')``."""

    @pytest.mark.parametrize("index_nm", [
        None, "idx", pytest.param("index", marks=pytest.mark.xfail),
        'level_0'])
    @pytest.mark.parametrize("vals", [
        {'ints': [1, 2, 3, 4]},
        {'objects': ['a', 'b', 'c', 'd']},
        {'date_ranges': pd.date_range('2016-01-01', freq='d', periods=4)},
        {'categoricals': pd.Series(pd.Categorical(['a', 'b', 'c', 'c']))},
        {'ordered_cats': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                                  ordered=True))},
        pytest.param({'floats': [1., 2., 3., 4.]}, marks=pytest.mark.xfail),
        {'floats': [1.1, 2.2, 3.3, 4.4]},
        {'bools': [True, False, False, True]}])
    def test_read_json_table_orient(self, index_nm, vals, recwarn):
        original = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        roundtripped = pd.read_json(original.to_json(orient="table"),
                                    orient="table")
        tm.assert_frame_equal(original, roundtripped)

    @pytest.mark.parametrize("index_nm", [
        None, "idx", "index"])
    @pytest.mark.parametrize("vals", [
        {'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')},
        {'timezones': pd.date_range('2016-01-01', freq='d', periods=4,
                                    tz='US/Central')}])
    def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
        frame = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        serialized = frame.to_json(orient="table")
        with tm.assert_raises_regex(NotImplementedError, 'can not yet read '):
            pd.read_json(serialized, orient="table")

    def test_comprehensive(self):
        # Columns 'D' (timedeltas) and 'H' (tz-aware datetimes) are left
        # out; test_read_json_table_orient_raises expects reading those
        # kinds of column to raise NotImplementedError.
        columns = {
            'A': [1, 2, 3, 4],
            'B': ['a', 'b', 'c', 'c'],
            'C': pd.date_range('2016-01-01', freq='d', periods=4),
            'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
            'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                          ordered=True)),
            'G': [1.1, 2.2, 3.3, 4.4],
            'I': [True, False, False, True],
        }
        original = DataFrame(columns, index=pd.Index(range(4), name='idx'))

        roundtripped = pd.read_json(original.to_json(orient="table"),
                                    orient="table")
        tm.assert_frame_equal(original, roundtripped)

    @pytest.mark.parametrize("index_names", [
        [None, None], ['foo', 'bar'], ['foo', None], [None, 'foo'],
        ['index', 'foo']])
    def test_multiindex(self, index_names):
        # GH 18912
        original = pd.DataFrame(
            [["Arr", "alpha", [1, 2, 3, 4]],
             ["Bee", "Beta", [10, 20, 30, 40]]],
            index=[["A", "B"], ["Null", "Eins"]],
            columns=["Aussprache", "Griechisch", "Args"]
        )
        original.index.names = index_names

        roundtripped = pd.read_json(original.to_json(orient="table"),
                                    orient="table")
        tm.assert_frame_equal(original, roundtripped)

    @pytest.mark.parametrize("strict_check", [
        pytest.param(True, marks=pytest.mark.xfail), False])
    def test_empty_frame_roundtrip(self, strict_check):
        # GH 21287
        empty = pd.DataFrame([], columns=['a', 'b', 'c'])
        expected = empty.copy()

        roundtripped = pd.read_json(empty.to_json(orient='table'),
                                    orient='table')
        # TODO: When DF coercion issue (#21345) is resolved tighten type checks
        tm.assert_frame_equal(expected, roundtripped,
                              check_dtype=strict_check,
                              check_index_type=strict_check)
@@ -0,0 +1,442 @@
import pytest
import numpy as np
import json
import pandas.util.testing as tm
from pandas import compat, Index, DataFrame
from pandas.io.json import json_normalize
from pandas.io.json.normalize import nested_to_record
@pytest.fixture
def deep_nested():
    """Return country -> states -> cities records (three nesting levels)."""
    california = {'name': 'California',
                  'cities': [{'name': 'San Francisco', 'pop': 12345},
                             {'name': 'Los Angeles', 'pop': 12346}]}
    ohio = {'name': 'Ohio',
            'cities': [{'name': 'Columbus', 'pop': 1234},
                       {'name': 'Cleveland', 'pop': 1236}]}
    bayern = {'name': 'Bayern',
              'cities': [{'name': 'Munich', 'pop': 12347}]}
    nrw = {'name': 'Nordrhein-Westfalen',
           'cities': [{'name': 'Duesseldorf', 'pop': 1238},
                      {'name': 'Koeln', 'pop': 1239}]}

    return [{'country': 'USA', 'states': [california, ohio]},
            {'country': 'Germany', 'states': [bayern, nrw]}]
@pytest.fixture
def state_data():
    """Return two state records, each with counties and a nested 'info'."""
    florida = {
        'counties': [{'name': 'Dade', 'population': 12345},
                     {'name': 'Broward', 'population': 40000},
                     {'name': 'Palm Beach', 'population': 60000}],
        'info': {'governor': 'Rick Scott'},
        'shortname': 'FL',
        'state': 'Florida',
    }
    ohio = {
        'counties': [{'name': 'Summit', 'population': 1234},
                     {'name': 'Cuyahoga', 'population': 1337}],
        'info': {'governor': 'John Kasich'},
        'shortname': 'OH',
        'state': 'Ohio',
    }
    return [florida, ohio]
@pytest.fixture
def author_missing_data():
    """Return two records: one with ``info`` set to None, one fully filled."""
    without_info = {'info': None}
    with_info = {
        'info': {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
        'author_name': {'first': 'Jane', 'last_name': 'Doe'},
    }
    return [without_info, with_info]
class TestJSONNormalize(object):
    """Tests for ``json_normalize``."""

    def test_simple_records(self):
        records = [{'a': 1, 'b': 2, 'c': 3},
                   {'a': 4, 'b': 5, 'c': 6},
                   {'a': 7, 'b': 8, 'c': 9},
                   {'a': 10, 'b': 11, 'c': 12}]
        tm.assert_frame_equal(json_normalize(records), DataFrame(records))

    def test_simple_normalize(self, state_data):
        # A single record.
        first_state = state_data[0]
        result = json_normalize(first_state, 'counties')
        expected = DataFrame(first_state['counties'])
        tm.assert_frame_equal(result, expected)

        # All records: the county lists are concatenated in order.
        result = json_normalize(state_data, 'counties')
        all_counties = [county
                        for rec in state_data
                        for county in rec['counties']]
        expected = DataFrame(all_counties)
        tm.assert_frame_equal(result, expected)

        # ``meta`` adds the owning state's name to every county row.
        result = json_normalize(state_data, 'counties', meta='state')
        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
        tm.assert_frame_equal(result, expected)

    def test_empty_array(self):
        tm.assert_frame_equal(json_normalize([]), DataFrame())

    def test_simple_normalize_with_separator(self, deep_nested):
        # GH 14883
        cases = [
            ({}, ['A.A', 'A.B']),
            ({'sep': '_'}, ['A_A', 'A_B']),
            ({'sep': u'\u03c3'}, [u'A\u03c3A', u'A\u03c3B']),
        ]
        for kwargs, columns in cases:
            result = json_normalize({'A': {'A': 1, 'B': 2}}, **kwargs)
            expected = DataFrame([[1, 2]], columns=columns)
            tm.assert_frame_equal(result.reindex_like(expected), expected)

        result = json_normalize(deep_nested, ['states', 'cities'],
                                meta=['country', ['states', 'name']],
                                sep='_')
        expected_columns = Index(['name', 'pop',
                                  'country', 'states_name']).sort_values()
        assert result.columns.sort_values().equals(expected_columns)

    def test_value_array_record_prefix(self):
        # GH 21536
        result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
        expected = DataFrame([[1], [2]], columns=['Prefix.0'])
        tm.assert_frame_equal(result, expected)

    def test_more_deeply_nested(self, deep_nested):
        result = json_normalize(deep_nested, ['states', 'cities'],
                                meta=['country', ['states', 'name']])

        expected_columns = {
            'country': ['USA'] * 4 + ['Germany'] * 3,
            'states.name': ['California', 'California', 'Ohio', 'Ohio',
                            'Bayern', 'Nordrhein-Westfalen',
                            'Nordrhein-Westfalen'],
            'name': ['San Francisco', 'Los Angeles', 'Columbus',
                     'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'],
            'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239],
        }
        # Column order is taken from the result; only contents are compared.
        expected = DataFrame(expected_columns, columns=result.columns)
        tm.assert_frame_equal(result, expected)

    def test_shallow_nested(self):
        florida = {'state': 'Florida',
                   'shortname': 'FL',
                   'info': {'governor': 'Rick Scott'},
                   'counties': [{'name': 'Dade', 'population': 12345},
                                {'name': 'Broward', 'population': 40000},
                                {'name': 'Palm Beach', 'population': 60000}]}
        ohio = {'state': 'Ohio',
                'shortname': 'OH',
                'info': {'governor': 'John Kasich'},
                'counties': [{'name': 'Summit', 'population': 1234},
                             {'name': 'Cuyahoga', 'population': 1337}]}

        result = json_normalize([florida, ohio], 'counties',
                                ['state', 'shortname',
                                 ['info', 'governor']])

        expected_columns = {
            'name': ['Dade', 'Broward', 'Palm Beach', 'Summit', 'Cuyahoga'],
            'state': ['Florida'] * 3 + ['Ohio'] * 2,
            'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
            'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
            'population': [12345, 40000, 60000, 1234, 1337],
        }
        expected = DataFrame(expected_columns, columns=result.columns)
        tm.assert_frame_equal(result, expected)

    def test_meta_name_conflict(self):
        records = [{'foo': 'hello',
                    'bar': 'there',
                    'data': [{'foo': 'something', 'bar': 'else'},
                             {'foo': 'something2', 'bar': 'else2'}]}]

        # Meta keys that collide with record keys are rejected ...
        with pytest.raises(ValueError):
            json_normalize(records, 'data', meta=['foo', 'bar'])

        # ... unless a prefix keeps the two sets of columns apart.
        result = json_normalize(records, 'data', meta=['foo', 'bar'],
                                meta_prefix='meta')
        for column in ['metafoo', 'metabar', 'foo', 'bar']:
            assert column in result

    def test_meta_parameter_not_modified(self):
        # GH 18610
        records = [{'foo': 'hello',
                    'bar': 'there',
                    'data': [{'foo': 'something', 'bar': 'else'},
                             {'foo': 'something2', 'bar': 'else2'}]}]

        COLUMNS = ['foo', 'bar']
        result = json_normalize(records, 'data', meta=COLUMNS,
                                meta_prefix='meta')

        # The list passed as ``meta`` must come back unchanged.
        assert COLUMNS == ['foo', 'bar']
        for column in ['metafoo', 'metabar', 'foo', 'bar']:
            assert column in result

    def test_record_prefix(self, state_data):
        first_state = state_data[0]
        result = json_normalize(first_state, 'counties')
        expected = DataFrame(first_state['counties'])
        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, 'counties',
                                meta='state',
                                record_prefix='county_')

        all_counties = [county
                        for rec in state_data
                        for county in rec['counties']]
        expected = DataFrame(all_counties)
        expected = expected.rename(columns=lambda col: 'county_' + col)
        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
        tm.assert_frame_equal(result, expected)

    def test_non_ascii_key(self):
        # The key is spelled as UTF-8 bytes; how it becomes text depends
        # on the Python major version.
        if compat.PY3:
            raw_json = (
                b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
                b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
            ).decode('utf8')
        else:
            raw_json = ('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
                        '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]')

        expected_columns = {
            u'sub.A': [1, 3],
            u'sub.B': [2, 4],
            b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1]
        }
        expected = DataFrame(expected_columns)

        result = json_normalize(json.loads(raw_json))
        tm.assert_frame_equal(result, expected)

    def test_missing_field(self, author_missing_data):
        # GH20030:
        result = json_normalize(author_missing_data)
        expected_rows = [
            {'info': np.nan,
             'author_name.first': np.nan,
             'author_name.last_name': np.nan,
             'info.created_at': np.nan,
             'info.last_updated': np.nan},
            {'info': None,
             'author_name.first': 'Jane',
             'author_name.last_name': 'Doe',
             'info.created_at': '11/08/1993',
             'info.last_updated': '26/05/2012'}
        ]
        tm.assert_frame_equal(result, DataFrame(expected_rows))
class TestNestedToRecord(object):
    """Tests for ``nested_to_record`` (plus one ``json_normalize`` case)."""

    def test_flat_stays_flat(self):
        records = [{'flat1': 1, 'flat2': 2},
                   {'flat1': 3, 'flat2': 4}]
        assert nested_to_record(records) == records

    def test_one_level_deep_flattens(self):
        nested = {'flat1': 1, 'dict1': {'c': 1, 'd': 2}}
        expected = {'dict1.c': 1,
                    'dict1.d': 2,
                    'flat1': 1}
        assert nested_to_record(nested) == expected

    def test_nested_flattens(self):
        nested = {'flat1': 1,
                  'dict1': {'c': 1, 'd': 2},
                  'nested': {'e': {'c': 1, 'd': 2}, 'd': 2}}
        expected = {'dict1.c': 1,
                    'dict1.d': 2,
                    'flat1': 1,
                    'nested.d': 2,
                    'nested.e.c': 1,
                    'nested.e.d': 2}
        assert nested_to_record(nested) == expected

    def test_json_normalize_errors(self):
        # GH14583: If meta keys are not always present
        # a new option to set errors='ignore' has been implemented
        def make_stocks():
            # Fresh list per trade so the two records share no objects.
            return [{"symbol": "AAPL", "name": "Apple", "price": "0"},
                    {"symbol": "GOOG", "name": "Google", "price": "0"}]

        # The second trade lacks "trade_version", one of the meta keys.
        trades = [
            {"general": {"tradeid": 100,
                         "trade_version": 1,
                         "stocks": make_stocks()}},
            {"general": {"tradeid": 100,
                         "stocks": make_stocks()}},
        ]

        def normalize(errors):
            return json_normalize(data=trades,
                                  record_path=[['general', 'stocks']],
                                  meta=[['general', 'tradeid'],
                                        ['general', 'trade_version']],
                                  errors=errors)

        expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
                    'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
                    'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
                    'price': {0: '0', 1: '0', 2: '0', 3: '0'},
                    'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
        assert normalize('ignore').fillna('').to_dict() == expected

        with pytest.raises(KeyError):
            normalize('raise')

    def test_donot_drop_nonevalues(self):
        # GH21356
        records = [
            {'info': None,
             'author_name': {'first': 'Smith', 'last_name': 'Appleseed'}},
            {'info': {'created_at': '11/08/1993',
                      'last_updated': '26/05/2012'},
             'author_name': {'first': 'Jane', 'last_name': 'Doe'}},
        ]
        expected = [
            {'info': None,
             'author_name.first': 'Smith',
             'author_name.last_name': 'Appleseed'},
            {'author_name.first': 'Jane',
             'author_name.last_name': 'Doe',
             'info.created_at': '11/08/1993',
             'info.last_updated': '26/05/2012'}]
        assert nested_to_record(records) == expected

    def test_nonetype_top_level_bottom_level(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it doesn't do a new_d.pop twice and raise
        town_info = {"id": None,
                     "region": None,
                     "x": 49.151580810546875,
                     "y": -33.148521423339844,
                     "z": 27.572303771972656}
        state = {"id": None, "town.info": town_info}
        nested = {"id": None,
                  "location": {"country": {"state": state}}}

        expected = {
            'id': None,
            'location.country.state.id': None,
            'location.country.state.town.info.id': None,
            'location.country.state.town.info.region': None,
            'location.country.state.town.info.x': 49.151580810546875,
            'location.country.state.town.info.y': -33.148521423339844,
            'location.country.state.town.info.z': 27.572303771972656}
        assert nested_to_record(nested) == expected

    def test_nonetype_multiple_levels(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it doesn't do a new_d.pop twice and raise
        town_info = {"region": None,
                     "x": 49.151580810546875,
                     "y": -33.148521423339844,
                     "z": 27.572303771972656}
        state = {"id": None, "town.info": town_info}
        country = {"id": None, "state": state}
        nested = {"id": None,
                  "location": {"id": None, "country": country}}

        expected = {
            'id': None,
            'location.id': None,
            'location.country.id': None,
            'location.country.state.id': None,
            'location.country.state.town.info.region': None,
            'location.country.state.town.info.x': 49.151580810546875,
            'location.country.state.town.info.y': -33.148521423339844,
            'location.country.state.town.info.z': 27.572303771972656}
        assert nested_to_record(nested) == expected
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,169 @@
# -*- coding: utf-8 -*-
import pytest
import pandas as pd
from pandas import DataFrame, read_json
from pandas.compat import StringIO
from pandas.io.json.json import JsonReader
import pandas.util.testing as tm
from pandas.util.testing import (assert_frame_equal, assert_series_equal,
ensure_clean)
@pytest.fixture
def lines_json_df():
    """Return a 3x2 frame serialized as line-delimited JSON records."""
    frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    serialized = frame.to_json(lines=True, orient="records")
    return serialized
def test_read_jsonl():
    """Two JSON lines with differing key order parse to the same columns."""
    # GH9180
    payload = '{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'
    expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    assert_frame_equal(read_json(payload, lines=True), expected)
def test_read_jsonl_unicode_chars():
    """Non-ASCII characters survive reading from a handle and from a string."""
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK
    payload = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])

    # simulate file handle
    from_handle = read_json(StringIO(payload), lines=True)
    assert_frame_equal(from_handle, expected)

    # simulate string
    from_string = read_json(payload, lines=True)
    assert_frame_equal(from_string, expected)
def test_to_jsonl():
    """``to_json(lines=True)`` emits one record per line, properly escaped."""
    # GH9180
    frame = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    serialized = frame.to_json(orient="records", lines=True)
    assert serialized == '{"a":1,"b":2}\n{"a":1,"b":2}'

    # Braces and double quotes inside values.
    frame = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
    serialized = frame.to_json(orient="records", lines=True)
    assert serialized == '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
    assert_frame_equal(read_json(serialized, lines=True), frame)

    # GH15096: escaped characters in columns and data
    frame = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
                      columns=["a\\", 'b'])
    serialized = frame.to_json(orient="records", lines=True)
    expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
                '{"a\\\\":"foo\\"","b":"bar"}')
    assert serialized == expected
    assert_frame_equal(read_json(serialized, lines=True), frame)
@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(lines_json_df, chunksize):
    """Chunked and unchunked reads of the same input give equal frames."""
    # GH17048: memory usage when lines=True
    whole = read_json(StringIO(lines_json_df), lines=True)

    chunk_reader = read_json(StringIO(lines_json_df), lines=True,
                             chunksize=chunksize)
    pieced_together = pd.concat(chunk_reader)

    assert_frame_equal(pieced_together, whole)
def test_readjson_chunksize_requires_lines(lines_json_df):
    """Passing ``chunksize`` with ``lines=False`` raises ValueError."""
    expected_msg = "chunksize can only be passed if lines=True"
    buffer = StringIO(lines_json_df)
    with tm.assert_raises_regex(ValueError, expected_msg):
        pd.read_json(buffer, lines=False, chunksize=2)
def test_readjson_chunks_series():
    """Chunked reading with ``typ='Series'`` matches the unchunked read."""
    # Test reading line-format JSON to Series with chunksize param
    payload = pd.Series({'A': 1, 'B': 2}).to_json(lines=True,
                                                  orient="records")

    # Each read gets its own buffer over the same serialized text.
    whole = pd.read_json(StringIO(payload), lines=True, typ='Series')
    chunk_reader = pd.read_json(StringIO(payload), lines=True,
                                typ='Series', chunksize=1)
    pieced_together = pd.concat(chunk_reader)

    assert_series_equal(pieced_together, whole)
def test_readjson_each_chunk(lines_json_df):
    """Check the shapes of the first two chunks read with ``chunksize=2``."""
    # Other tests check that the final result of read_json(chunksize=True)
    # is correct. This checks the intermediate chunks.
    reader = pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
    shapes = [chunk.shape for chunk in reader]
    assert shapes[0] == (2, 2)
    assert shapes[1] == (1, 2)
def test_readjson_chunks_from_file():
    """Chunked and unchunked reads from a file path give the same frame."""
    with ensure_clean('test.json') as path:
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        frame.to_json(path, lines=True, orient="records")

        reader = pd.read_json(path, lines=True, chunksize=1)
        chunked = pd.concat(reader)
        unchunked = pd.read_json(path, lines=True)

        assert_frame_equal(unchunked, chunked)
@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    """After ``JsonReader.read()`` the reader's ``open_stream`` is closed.

    Runs once with ``chunksize=None`` and once with ``chunksize=1``.
    """
    with ensure_clean('test.json') as path:
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        reader = JsonReader(
            path, orient=None, typ="frame", dtype=True, convert_axes=True,
            convert_dates=True, keep_default_dates=True, numpy=False,
            precise_float=False, date_unit=None, encoding=None,
            lines=True, chunksize=chunksize, compression=None)
        reader.read()
        # Adjacent literals rather than a backslash continuation inside the
        # string, so the message text cannot absorb source indentation.
        assert reader.open_stream.closed, (
            "didn't close stream with "
            "chunksize = {chunksize}".format(chunksize=chunksize))
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
    """Each parametrized ``chunksize`` must be rejected with ValueError."""
    expected_msg = r"'chunksize' must be an integer >=1"
    buf = StringIO(lines_json_df)
    with tm.assert_raises_regex(ValueError, expected_msg):
        pd.read_json(buf, lines=True, chunksize=chunksize)
@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    """Read a line-delimited JSON literal, chunked or not, into one frame."""
    # NOTE(review): the test name refers to empty lines, but the literal as
    # it appears here has none between records -- confirm against upstream.
    j = """
{"A":1,"B":4}
{"A":2,"B":5}
{"A":3,"B":6}
"""
    expected = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    parsed = pd.read_json(j, lines=True, chunksize=chunksize)
    # with a chunksize the call returns an iterable of frames to be joined
    result = parsed if chunksize is None else pd.concat(parsed)
    label = "chunksize: {chunksize}".format(chunksize=chunksize)
    tm.assert_frame_equal(expected, result, obj=label)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,10 @@
from pandas.compat import PY3
# array compat
if PY3:
    def frombytes(obj, data):
        """Append the raw bytes ``data`` to ``obj`` via ``obj.frombytes``."""
        return obj.frombytes(data)

    def tobytes(obj):
        """Return the contents of ``obj`` via ``obj.tobytes``."""
        return obj.tobytes()
else:
    def frombytes(obj, data):
        """Append the raw bytes ``data`` to ``obj`` via ``obj.fromstring``."""
        return obj.fromstring(data)

    def tobytes(obj):
        """Return the contents of ``obj`` via ``obj.tostring``."""
        return obj.tostring()
@@ -0,0 +1,20 @@
# coding: utf-8
from pandas.io.msgpack import packb, unpackb
from .common import frombytes
def test_unpack_buffer():
    """``unpackb`` accepts an ``array('b')`` holding packed bytes."""
    from array import array

    packed = packb((b'foo', b'bar'))
    buf = array('b')
    frombytes(buf, packed)

    unpacked = unpackb(buf, use_list=1)
    assert [b'foo', b'bar'] == unpacked
def test_unpack_bytearray():
    """``unpackb`` accepts a ``bytearray`` and yields ``bytes`` items."""
    packed = bytearray(packb(('foo', 'bar')))
    items = unpackb(packed, use_list=1)
    assert [b'foo', b'bar'] == items
    # every unpacked element must be exactly ``bytes``
    for item in items:
        assert type(item) == bytes
@@ -0,0 +1,115 @@
# coding: utf-8
from pandas.io.msgpack import packb, unpackb
def check(length, obj):
    """Assert ``obj`` packs to ``length`` bytes and unpacks back to itself."""
    packed = packb(obj)
    size = len(packed)
    assert size == length, \
        "%r length should be %r but get %r" % (obj, length, size)
    assert unpackb(packed, use_list=0) == obj
def test_1():
    """Values expected to pack into exactly one byte."""
    values = (None, True, False, 0, 1, (1 << 6), (1 << 7) - 1, -1,
              -((1 << 5) - 1), -(1 << 5))
    for value in values:
        check(1, value)
def test_2():
    """Values expected to pack into exactly two bytes."""
    values = (1 << 7, (1 << 8) - 1, -((1 << 5) + 1), -(1 << 7))
    for value in values:
        check(2, value)
def test_3():
    """Values expected to pack into exactly three bytes."""
    values = (1 << 8, (1 << 16) - 1, -((1 << 7) + 1), -(1 << 15))
    for value in values:
        check(3, value)
def test_5():
    """Values expected to pack into exactly five bytes."""
    values = (1 << 16, (1 << 32) - 1, -((1 << 15) + 1), -(1 << 31))
    for value in values:
        check(5, value)
def test_9():
    """Integers and floats expected to pack into exactly nine bytes."""
    integers = (1 << 32, (1 << 64) - 1, -((1 << 31) + 1), -(1 << 63))
    floats = (1.0, 0.1, -0.1, -1.0)
    for value in integers + floats:
        check(9, value)
def check_raw(overhead, num):
    """Run ``check`` on ``num`` space bytes plus ``overhead`` header bytes."""
    payload = b" " * num
    check(num + overhead, payload)
def test_fixraw():
    """Raw payloads of these sizes carry one byte of overhead."""
    for size in (0, (1 << 5) - 1):
        check_raw(1, size)
def test_raw16():
    """Raw payloads of these sizes carry three bytes of overhead."""
    for size in (1 << 5, (1 << 16) - 1):
        check_raw(3, size)
def test_raw32():
    """A raw payload of ``1 << 16`` bytes carries five bytes of overhead."""
    size = 1 << 16
    check_raw(5, size)
def check_array(overhead, num):
    """Run ``check`` on a tuple of ``num`` ``None`` items plus ``overhead``."""
    items = (None, ) * num
    check(num + overhead, items)
def test_fixarray():
    """Arrays of these lengths carry one byte of overhead."""
    for count in (0, (1 << 4) - 1):
        check_array(1, count)
def test_array16():
    """Arrays of these lengths carry three bytes of overhead."""
    for count in (1 << 4, (1 << 16) - 1):
        check_array(3, count)
def test_array32():
    """An array of ``1 << 16`` items carries five bytes of overhead."""
    count = (1 << 16)
    check_array(5, count)
def match(obj, buf):
    """Assert ``obj`` packs to exactly ``buf`` and ``buf`` unpacks to ``obj``."""
    packed = packb(obj)
    assert packed == buf
    unpacked = unpackb(buf, use_list=0)
    assert unpacked == obj
def test_match():
    """Run ``match`` over a table of (object, expected packed bytes) pairs."""
    fifteen = {x: x for x in range(15)}
    sixteen = {x: x for x in range(16)}
    cases = [
        (None, b'\xc0'),
        (False, b'\xc2'),
        (True, b'\xc3'),
        (0, b'\x00'),
        (127, b'\x7f'),
        (128, b'\xcc\x80'),
        (256, b'\xcd\x01\x00'),
        (-1, b'\xff'),
        (-33, b'\xd0\xdf'),
        (-129, b'\xd1\xff\x7f'),
        ({1: 1}, b'\x81\x01\x01'),
        (1.0, b"\xcb\x3f\xf0\x00\x00\x00\x00\x00\x00"),
        ((), b'\x90'),
        (tuple(range(15)), (b"\x9f\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09"
                            b"\x0a\x0b\x0c\x0d\x0e")),
        (tuple(range(16)), (b"\xdc\x00\x10\x00\x01\x02\x03\x04\x05\x06\x07"
                            b"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f")),
        ({}, b'\x80'),
        (fifteen,
         (b'\x8f\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06\x06\x07'
          b'\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e')),
        (sixteen,
         (b'\xde\x00\x10\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06'
          b'\x06\x07\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e'
          b'\x0f\x0f')),
    ]
    for obj, packed in cases:
        match(obj, packed)
def test_unicode():
    """A packed text string unpacks as the equivalent bytes."""
    packed = packb('foobar')
    assert unpackb(packed, use_list=1) == b'foobar'
@@ -0,0 +1,39 @@
# coding: utf-8
from datetime import datetime
from pandas.io.msgpack import packb, unpackb
import pytest
import pandas.util.testing as tm
class DummyException(Exception):
    """Marker exception raised by the hooks in these tests."""
class TestExceptions(object):
    """Errors raised by, or propagated through, ``packb`` and ``unpackb``."""

    def test_raise_on_find_unsupported_value(self):
        expected_msg = "can't serialize datetime"
        with tm.assert_raises_regex(TypeError, expected_msg):
            packb(datetime.now())

    def test_raise_from_object_hook(self):
        def hook(_):
            raise DummyException()

        # (object to pack, keyword under which ``hook`` is given to unpackb)
        cases = (
            ({}, 'object_hook'),
            ({'fizz': 'buzz'}, 'object_hook'),
            ({'fizz': 'buzz'}, 'object_pairs_hook'),
            ({'fizz': {'buzz': 'spam'}}, 'object_hook'),
            ({'fizz': {'buzz': 'spam'}}, 'object_pairs_hook'),
        )
        for obj, hook_keyword in cases:
            packed = packb(obj)
            with pytest.raises(DummyException):
                unpackb(packed, **{hook_keyword: hook})

    def test_invalid_value(self):
        expected_msg = "Unpack failed: error"
        with tm.assert_raises_regex(ValueError, expected_msg):
            unpackb(b"\xd9\x97#DL_")
@@ -0,0 +1,61 @@
from __future__ import print_function
import array
import pandas.io.msgpack as msgpack
from pandas.io.msgpack import ExtType
from .common import frombytes, tobytes
def test_pack_ext_type():
    """``Packer.pack_ext_type`` emits the expected bytes per payload size."""
    def pack_ext(payload):
        packer = msgpack.Packer()
        packer.pack_ext_type(0x42, payload)
        return packer.bytes()

    cases = (
        (b'A', b'\xd4\x42A'),  # fixext 1
        (b'AB', b'\xd5\x42AB'),  # fixext 2
        (b'ABCD', b'\xd6\x42ABCD'),  # fixext 4
        (b'ABCDEFGH', b'\xd7\x42ABCDEFGH'),  # fixext 8
        (b'A' * 16, b'\xd8\x42' + b'A' * 16),  # fixext 16
        (b'ABC', b'\xc7\x03\x42ABC'),  # ext 8
        (b'A' * 0x0123, b'\xc8\x01\x23\x42' + b'A' * 0x0123),  # ext 16
        (b'A' * 0x00012345,
         b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345),  # ext 32
    )
    for payload, expected in cases:
        assert pack_ext(payload) == expected
def test_unpack_ext_type():
    """``unpackb`` turns each ext encoding into the matching ``ExtType``."""
    cases = (
        (b'\xd4\x42A', ExtType(0x42, b'A')),  # fixext 1
        (b'\xd5\x42AB', ExtType(0x42, b'AB')),  # fixext 2
        (b'\xd6\x42ABCD', ExtType(0x42, b'ABCD')),  # fixext 4
        (b'\xd7\x42ABCDEFGH', ExtType(0x42, b'ABCDEFGH')),  # fixext 8
        (b'\xd8\x42' + b'A' * 16, ExtType(0x42, b'A' * 16)),  # fixext 16
        (b'\xc7\x03\x42ABC', ExtType(0x42, b'ABC')),  # ext 8
        (b'\xc8\x01\x23\x42' + b'A' * 0x0123,
         ExtType(0x42, b'A' * 0x0123)),  # ext 16
        (b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345,
         ExtType(0x42, b'A' * 0x00012345)),  # ext 32
    )
    for packed, expected in cases:
        assert msgpack.unpackb(packed) == expected
def test_extension_type():
    """Round-trip an ``array.array`` through ``default`` and ``ext_hook``."""
    def default(obj):
        print('default called', obj)
        if not isinstance(obj, array.array):
            raise TypeError("Unknown type object %r" % (obj, ))
        typecode = 123  # application specific typecode
        return ExtType(typecode, tobytes(obj))

    def ext_hook(code, data):
        print('ext_hook called', code, data)
        assert code == 123
        restored = array.array('d')
        frombytes(restored, data)
        return restored

    original = [42, b'hello', array.array('d', [1.1, 2.2, 3.3])]
    packed = msgpack.packb(original, default=default)
    unpacked = msgpack.unpackb(packed, ext_hook=ext_hook)
    assert original == unpacked
@@ -0,0 +1,91 @@
# coding: utf-8
from pandas.io.msgpack import unpackb
def check(src, should, use_list=0):
    """Assert that unpacking ``src`` yields ``should``."""
    unpacked = unpackb(src, use_list=use_list)
    assert unpacked == should
def testSimpleValue():
    """Unpack an array holding nil, false and true."""
    packed = b"\x93\xc0\xc2\xc3"
    expected = (None, False, True, )
    check(packed, expected)
def testFixnum():
    """Unpack nested arrays of single-byte positive and negative ints."""
    packed = b"\x92\x93\x00\x40\x7f\x93\xe0\xf0\xff"
    positives = (0, 64, 127, )
    negatives = (-32, -16, -1, )
    check(packed, (positives, negatives, ))
def testFixArray():
    """Unpack an empty array next to a nested single-item array."""
    packed = b"\x92\x90\x91\x91\xc0"
    expected = ((), ((None, ), ), )
    check(packed, expected, )
def testFixRaw():
    """Unpack raw values of length zero through three."""
    packed = b"\x94\xa0\xa1a\xa2bc\xa3def"
    expected = (b"", b"a", b"bc", b"def", )
    check(packed, expected, )
def testFixMap():
    """Unpack a two-entry map whose values are themselves maps."""
    packed = b"\x82\xc2\x81\xc0\xc0\xc3\x81\xc0\x80"
    expected = {
        False: {None: None},
        True: {None: {}},
    }
    check(packed, expected, )
def testUnsignedInt():
    """Unpack nine unsigned ints encoded with the 0xcc/0xcd/0xce markers."""
    packed = (b"\x99\xcc\x00\xcc\x80\xcc\xff\xcd\x00\x00\xcd\x80\x00"
              b"\xcd\xff\xff\xce\x00\x00\x00\x00\xce\x80\x00\x00\x00"
              b"\xce\xff\xff\xff\xff")
    expected = (
        0, 128, 255,
        0, 32768, 65535,
        0, 2147483648, 4294967295,
    )
    check(packed, expected, )
def testSignedInt():
    """Unpack nine signed ints encoded with the 0xd0/0xd1/0xd2 markers."""
    packed = (b"\x99\xd0\x00\xd0\x80\xd0\xff\xd1\x00\x00\xd1\x80\x00"
              b"\xd1\xff\xff\xd2\x00\x00\x00\x00\xd2\x80\x00\x00\x00"
              b"\xd2\xff\xff\xff\xff")
    expected = (
        0, -128, -1,
        0, -32768, -1,
        0, -2147483648, -1,
    )
    check(packed, expected)
def testRaw():
    """Unpack raw values encoded with the 0xda and 0xdb markers."""
    packed = (b"\x96\xda\x00\x00\xda\x00\x01a\xda\x00\x02ab\xdb\x00\x00"
              b"\x00\x00\xdb\x00\x00\x00\x01a\xdb\x00\x00\x00\x02ab")
    expected = (b"", b"a", b"ab", b"", b"a", b"ab")
    check(packed, expected)
def testArray():
    """Unpack arrays encoded with the 0xdc and 0xdd markers."""
    packed = (b"\x96\xdc\x00\x00\xdc\x00\x01\xc0\xdc\x00\x02\xc2\xc3\xdd\x00"
              b"\x00\x00\x00\xdd\x00\x00\x00\x01\xc0\xdd\x00\x00\x00\x02"
              b"\xc2\xc3")
    expected = ((), (None, ), (False, True), (), (None, ), (False, True))
    check(packed, expected)
def testMap():
    """Unpack maps encoded with the 0xde and 0xdf markers."""
    packed = (b"\x96"
              b"\xde\x00\x00"
              b"\xde\x00\x01\xc0\xc2"
              b"\xde\x00\x02\xc0\xc2\xc3\xc2"
              b"\xdf\x00\x00\x00\x00"
              b"\xdf\x00\x00\x00\x01\xc0\xc2"
              b"\xdf\x00\x00\x00\x02\xc0\xc2\xc3\xc2")
    expected = (
        {},
        {None: False},
        {True: False, None: False},
        {},
        {None: False},
        {True: False, None: False},
    )
    check(packed, expected)
@@ -0,0 +1,105 @@
# coding: utf-8
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from pandas.io.msgpack import packb, unpackb, Packer, Unpacker, ExtType
import pytest
import pandas.util.testing as tm
class TestLimits(object):
    """Range and length limits enforced by the packer and the unpacker."""

    def test_integer(self):
        # smallest accepted integer, then one below it
        x = -(2 ** 63)
        assert unpackb(packb(x)) == x
        pytest.raises((OverflowError, ValueError), packb, x - 1)

        # largest accepted integer, then one above it
        x = 2 ** 64 - 1
        assert unpackb(packb(x)) == x
        pytest.raises((OverflowError, ValueError), packb, x + 1)

    def test_array_header(self):
        packer = Packer()
        packer.pack_array_header(2 ** 32 - 1)
        pytest.raises((OverflowError, ValueError),
                      packer.pack_array_header, 2 ** 32)

    def test_map_header(self):
        packer = Packer()
        packer.pack_map_header(2 ** 32 - 1)
        # The overflow check must target pack_map_header as well; it used to
        # call pack_array_header, so the map limit was never exercised.
        pytest.raises((OverflowError, ValueError),
                      packer.pack_map_header, 2 ** 32)

    def test_max_str_len(self):
        d = 'x' * 3
        packed = packb(d)

        unpacker = Unpacker(max_str_len=3, encoding='utf-8')
        unpacker.feed(packed)
        assert unpacker.unpack() == d

        unpacker = Unpacker(max_str_len=2, encoding='utf-8')
        unpacker.feed(packed)
        msg = "3 exceeds max_str_len"
        with tm.assert_raises_regex(ValueError, msg):
            unpacker.unpack()

    def test_max_bin_len(self):
        d = b'x' * 3
        packed = packb(d, use_bin_type=True)

        unpacker = Unpacker(max_bin_len=3)
        unpacker.feed(packed)
        assert unpacker.unpack() == d

        unpacker = Unpacker(max_bin_len=2)
        unpacker.feed(packed)
        msg = "3 exceeds max_bin_len"
        with tm.assert_raises_regex(ValueError, msg):
            unpacker.unpack()

    def test_max_array_len(self):
        d = [1, 2, 3]
        packed = packb(d)

        unpacker = Unpacker(max_array_len=3)
        unpacker.feed(packed)
        assert unpacker.unpack() == d

        unpacker = Unpacker(max_array_len=2)
        unpacker.feed(packed)
        msg = "3 exceeds max_array_len"
        with tm.assert_raises_regex(ValueError, msg):
            unpacker.unpack()

    def test_max_map_len(self):
        d = {1: 2, 3: 4, 5: 6}
        packed = packb(d)

        unpacker = Unpacker(max_map_len=3)
        unpacker.feed(packed)
        assert unpacker.unpack() == d

        unpacker = Unpacker(max_map_len=2)
        unpacker.feed(packed)
        msg = "3 exceeds max_map_len"
        with tm.assert_raises_regex(ValueError, msg):
            unpacker.unpack()

    def test_max_ext_len(self):
        d = ExtType(42, b"abc")
        packed = packb(d)

        unpacker = Unpacker(max_ext_len=3)
        unpacker.feed(packed)
        assert unpacker.unpack() == d

        unpacker = Unpacker(max_ext_len=2)
        unpacker.feed(packed)
        msg = "4 exceeds max_ext_len"
        with tm.assert_raises_regex(ValueError, msg):
            unpacker.unpack()
@@ -0,0 +1,92 @@
# coding: utf-8
from pandas.io.msgpack import packb, unpackb, ExtType
def test_str8():
    """Strings of 32 and 255 bytes pack with a 0xd9 header and length byte."""
    header = b'\xd9'
    cases = ((b'x' * 32, b'\x20'), (b'x' * 255, b'\xff'))
    for data, length_byte in cases:
        packed = packb(data.decode(), use_bin_type=True)
        assert len(packed) == len(data) + 2
        assert packed[0:2] == header + length_byte
        assert packed[2:] == data
        assert unpackb(packed) == data
def test_bin8():
    """Bytes of length 0 and 255 pack with a 0xc4 header and length byte."""
    header = b'\xc4'
    cases = ((b'', b'\x00'), (b'x' * 255, b'\xff'))
    for data, length_byte in cases:
        packed = packb(data, use_bin_type=True)
        assert len(packed) == len(data) + 2
        assert packed[0:2] == header + length_byte
        assert packed[2:] == data
        assert unpackb(packed) == data
def test_bin16():
    """Bytes of length 256 and 65535 pack with a 0xc5 header and 2-byte length."""
    header = b'\xc5'
    cases = ((b'x' * 256, b'\x01\x00'), (b'x' * 65535, b'\xff\xff'))
    for data, length_bytes in cases:
        packed = packb(data, use_bin_type=True)
        assert len(packed) == len(data) + 3
        assert packed[0:1] == header
        assert packed[1:3] == length_bytes
        assert packed[3:] == data
        assert unpackb(packed) == data
def test_bin32():
    """Bytes of length 65536 pack with a 0xc6 header and 4-byte length."""
    payload = b'x' * 65536
    packed = packb(payload, use_bin_type=True)
    assert len(packed) == len(payload) + 5
    assert packed[0:1] == b'\xc6'
    assert packed[1:5] == b'\x00\x01\x00\x00'
    assert packed[5:] == payload
    assert unpackb(packed) == payload
def test_ext():
    """Each ``ExtType`` packs to the listed bytes and unpacks back again."""
    cases = (
        (ExtType(0x42, b'Z'), b'\xd4\x42Z'),  # fixext 1
        (ExtType(0x42, b'ZZ'), b'\xd5\x42ZZ'),  # fixext 2
        (ExtType(0x42, b'Z' * 4), b'\xd6\x42' + b'Z' * 4),  # fixext 4
        (ExtType(0x42, b'Z' * 8), b'\xd7\x42' + b'Z' * 8),  # fixext 8
        (ExtType(0x42, b'Z' * 16), b'\xd8\x42' + b'Z' * 16),  # fixext 16
        # ext 8
        (ExtType(0x42, b''), b'\xc7\x00\x42'),
        (ExtType(0x42, b'Z' * 255), b'\xc7\xff\x42' + b'Z' * 255),
        # ext 16
        (ExtType(0x42, b'Z' * 256), b'\xc8\x01\x00\x42' + b'Z' * 256),
        (ExtType(0x42, b'Z' * 0xffff),
         b'\xc8\xff\xff\x42' + b'Z' * 0xffff),
        # ext 32
        (ExtType(0x42, b'Z' * 0x10000),
         b'\xc9\x00\x01\x00\x00\x42' + b'Z' * 0x10000),
    )
    for ext, packed in cases:
        assert packb(ext) == packed
        assert unpackb(packed) == ext
    # needs large memory
    # check(ExtType(0x42, b'Z'*0xffffffff),
    #              b'\xc9\xff\xff\xff\xff\x42' + b'Z'*0xffffffff)
@@ -0,0 +1,77 @@
# coding: utf-8
import pytest
from pandas.io.msgpack import packb, unpackb
class DecodeError(Exception):
    """Raised by ``bad_complex_decoder`` in the hook tests."""
class TestObj(object):
    """Tests for the ``default``, object, pairs and list hooks."""

    def _arr_to_str(self, arr):
        # join the string form of every element
        return ''.join(map(str, arr))

    def bad_complex_decoder(self, o):
        raise DecodeError("Ooops!")

    def _decode_complex(self, obj):
        # only dicts tagged with b'__complex__' are converted
        if b'__complex__' not in obj:
            return obj
        return complex(obj[b'real'], obj[b'imag'])

    def _encode_complex(self, obj):
        if not isinstance(obj, complex):
            return obj
        return {b'__complex__': True, b'real': 1, b'imag': 2}

    def test_encode_hook(self):
        packed = packb([3, 1 + 2j], default=self._encode_complex)
        unpacked = unpackb(packed, use_list=1)
        expected = {b'__complex__': True, b'real': 1, b'imag': 2}
        assert unpacked[1] == expected

    def test_decode_hook(self):
        tagged = {b'__complex__': True, b'real': 1, b'imag': 2}
        packed = packb([3, tagged])
        unpacked = unpackb(packed, object_hook=self._decode_complex,
                           use_list=1)
        assert unpacked[1] == 1 + 2j

    def test_decode_pairs_hook(self):
        def sum_of_products(pairs):
            return sum(k * v for k, v in pairs)

        packed = packb([3, {1: 2, 3: 4}])
        prod_sum = 1 * 2 + 3 * 4
        unpacked = unpackb(packed, object_pairs_hook=sum_of_products,
                           use_list=1)
        assert unpacked[1] == prod_sum

    def test_only_one_obj_hook(self):
        # passing both hooks at once is rejected
        with pytest.raises(TypeError):
            unpackb(b'', object_hook=lambda x: x,
                    object_pairs_hook=lambda x: x)

    def test_bad_hook(self):
        with pytest.raises(TypeError):
            packed = packb([3, 1 + 2j], default=lambda o: o)
            unpackb(packed, use_list=1)

    def test_array_hook(self):
        packed = packb([1, 2, 3])
        unpacked = unpackb(packed, list_hook=self._arr_to_str, use_list=1)
        assert unpacked == '123'

    def test_an_exception_in_objecthook1(self):
        with pytest.raises(DecodeError):
            packed = packb({1: {'__complex__': True, 'real': 1, 'imag': 2}})
            unpackb(packed, object_hook=self.bad_complex_decoder)

    def test_an_exception_in_objecthook2(self):
        with pytest.raises(DecodeError):
            packed = packb({1: [{'__complex__': True, 'real': 1, 'imag': 2}]})
            unpackb(packed, list_hook=self.bad_complex_decoder, use_list=1)
@@ -0,0 +1,153 @@
# coding: utf-8
import pytest
import struct
from pandas import compat
from pandas.compat import u, OrderedDict
from pandas.io.msgpack import packb, unpackb, Unpacker, Packer
class TestPack(object):
    """Round-trip tests for ``packb``/``unpackb`` and ``Packer``/``Unpacker``."""

    def check(self, data, use_list=False):
        """Assert that ``data`` survives a pack/unpack round trip."""
        result = unpackb(packb(data), use_list=use_list)
        assert result == data

    def testPack(self):
        test_data = [
            0, 1, 127, 128, 255, 256, 65535, 65536,
            -1, -32, -33, -128, -129, -32768, -32769,
            1.0,
            b"", b"a", b"a" * 31, b"a" * 32,
            None, True, False,
            (), ((),), ((), None,),
            {None: 0},
            (1 << 23),
        ]
        for td in test_data:
            self.check(td)

    def testPackUnicode(self):
        test_data = [u(""), u("abcd"), [u("defgh")], u("Русский текст"), ]
        for td in test_data:
            # one-shot helpers
            result = unpackb(
                packb(td, encoding='utf-8'), use_list=1, encoding='utf-8')
            assert result == td
            # explicit Packer / Unpacker objects
            packer = Packer(encoding='utf-8')
            data = packer.pack(td)
            result = Unpacker(
                compat.BytesIO(data), encoding='utf-8', use_list=1).unpack()
            assert result == td

    def testPackUTF32(self):
        test_data = [
            compat.u(""),
            compat.u("abcd"),
            [compat.u("defgh")],
            compat.u("Русский текст"),
        ]
        for td in test_data:
            result = unpackb(
                packb(td, encoding='utf-32'), use_list=1, encoding='utf-32')
            assert result == td

    def testPackBytes(self):
        test_data = [b"", b"abcd", (b"defgh", ), ]
        for td in test_data:
            self.check(td)

    def testIgnoreUnicodeErrors(self):
        result = unpackb(
            packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore',
            use_list=1)
        assert result == "abcdef"

    def testStrictUnicodeUnpack(self):
        pytest.raises(UnicodeDecodeError, unpackb, packb(b'abc\xeddef'),
                      encoding='utf-8', use_list=1)

    def testStrictUnicodePack(self):
        pytest.raises(UnicodeEncodeError, packb, compat.u("abc\xeddef"),
                      encoding='ascii', unicode_errors='strict')

    def testIgnoreErrorsPack(self):
        result = unpackb(
            packb(
                compat.u("abcФФФdef"), encoding='ascii',
                unicode_errors='ignore'), encoding='utf-8', use_list=1)
        assert result == compat.u("abcdef")

    def testNoEncoding(self):
        pytest.raises(TypeError, packb, compat.u("abc"), encoding=None)

    def testDecodeBinary(self):
        result = unpackb(packb("abc"), encoding=None, use_list=1)
        assert result == b"abc"

    def testPackFloat(self):
        assert packb(1.0,
                     use_single_float=True) == b'\xca' + struct.pack('>f', 1.0)
        assert packb(
            1.0, use_single_float=False) == b'\xcb' + struct.pack('>d', 1.0)

    # ``sizes`` defaults are tuples rather than lists so that no mutable
    # object is shared between calls; they are only ever iterated.
    def testArraySize(self, sizes=(0, 5, 50, 1000)):
        bio = compat.BytesIO()
        packer = Packer()
        for size in sizes:
            bio.write(packer.pack_array_header(size))
            for i in range(size):
                bio.write(packer.pack(i))

        bio.seek(0)
        unpacker = Unpacker(bio, use_list=1)
        for size in sizes:
            assert unpacker.unpack() == list(range(size))

    def test_manualreset(self, sizes=(0, 5, 50, 1000)):
        packer = Packer(autoreset=False)
        for size in sizes:
            packer.pack_array_header(size)
            for i in range(size):
                packer.pack(i)

        bio = compat.BytesIO(packer.bytes())
        unpacker = Unpacker(bio, use_list=1)
        for size in sizes:
            assert unpacker.unpack() == list(range(size))

        packer.reset()
        assert packer.bytes() == b''

    def testMapSize(self, sizes=(0, 5, 50, 1000)):
        bio = compat.BytesIO()
        packer = Packer()
        for size in sizes:
            bio.write(packer.pack_map_header(size))
            for i in range(size):
                bio.write(packer.pack(i))  # key
                bio.write(packer.pack(i * 2))  # value

        bio.seek(0)
        unpacker = Unpacker(bio)
        for size in sizes:
            assert unpacker.unpack() == {i: i * 2 for i in range(size)}

    def test_odict(self):
        seq = [(b'one', 1), (b'two', 2), (b'three', 3), (b'four', 4)]
        od = OrderedDict(seq)
        assert unpackb(packb(od), use_list=1) == dict(seq)

        def pair_hook(seq):
            return list(seq)

        assert unpackb(
            packb(od), object_pairs_hook=pair_hook, use_list=1) == seq

    def test_pairlist(self):
        pairlist = [(b'a', 1), (2, b'b'), (b'foo', b'bar')]
        packer = Packer()
        packed = packer.pack_map_pairs(pairlist)
        unpacked = unpackb(packed, object_pairs_hook=list)
        assert pairlist == unpacked
@@ -0,0 +1,70 @@
"""Test Unpacker's read_array_header and read_map_header methods"""
from pandas.io.msgpack import packb, Unpacker, OutOfData
UnexpectedTypeException = ValueError
def test_read_array_header():
    """Read an array header, then its items, then run out of data."""
    unpacker = Unpacker()
    unpacker.feed(packb(['a', 'b', 'c']))
    assert unpacker.read_array_header() == 3
    for expected in (b'a', b'b', b'c'):
        assert unpacker.unpack() == expected
    try:
        unpacker.unpack()
    except OutOfData:
        pass  # okay
    else:
        assert 0, 'should raise exception'
def test_read_map_header():
    """Read a map header, then its key and value, then run out of data."""
    unpacker = Unpacker()
    unpacker.feed(packb({'a': 'A'}))
    assert unpacker.read_map_header() == 1
    for expected in (b'a', b'A'):
        assert unpacker.unpack() == expected
    try:
        unpacker.unpack()
    except OutOfData:
        pass  # okay
    else:
        assert 0, 'should raise exception'
def test_incorrect_type_array():
    """``read_array_header`` on a packed integer must raise."""
    unpacker = Unpacker()
    unpacker.feed(packb(1))
    try:
        unpacker.read_array_header()
    except UnexpectedTypeException:
        pass  # okay
    else:
        assert 0, 'should raise exception'
def test_incorrect_type_map():
    """``read_map_header`` on a packed integer must raise."""
    unpacker = Unpacker()
    unpacker.feed(packb(1))
    try:
        unpacker.read_map_header()
    except UnexpectedTypeException:
        pass  # okay
    else:
        assert 0, 'should raise exception'
def test_correct_type_nested_array():
    """``read_array_header`` on a map that contains an array must raise."""
    unpacker = Unpacker()
    unpacker.feed(packb({'a': ['b', 'c', 'd']}))
    try:
        unpacker.read_array_header()
    except UnexpectedTypeException:
        pass  # okay
    else:
        assert 0, 'should raise exception'
def test_incorrect_type_nested_map():
    """``read_map_header`` on an array that contains a map must raise."""
    unpacker = Unpacker()
    unpacker.feed(packb([{'a': 'b'}]))
    try:
        unpacker.read_map_header()
    except UnexpectedTypeException:
        pass  # okay
    else:
        assert 0, 'should raise exception'
@@ -0,0 +1,46 @@
# coding: utf-8
import io
import pandas.io.msgpack as msgpack
# every byte value 0..255, in order
binarydata = bytes(bytearray(range(256)))


def gen_binary_data(idx):
    """Return the first ``idx % 300`` bytes of ``binarydata``."""
    length = idx % 300
    return binarydata[:length]
def test_exceeding_unpacker_read_size():
    """Unpack more packed strings than fit into one ``read_size`` buffer."""
    NUMBER_OF_STRINGS = 6
    read_size = 16

    # 5 ok for read_size=16, while 6 glibc detected *** python: double free or
    # corruption (fasttop):

    # 20 ok for read_size=256, while 25 segfaults / glibc detected *** python:
    # double free or corruption (!prev)

    # 40 ok for read_size=1024, while 50 introduces errors

    # 7000 ok for read_size=1024*1024, while 8000 leads to glibc detected ***
    # python: double free or corruption (!prev):

    packer = msgpack.Packer()
    dumpf = io.BytesIO()
    for idx in range(NUMBER_OF_STRINGS):
        dumpf.write(packer.pack(gen_binary_data(idx)))
    packed_stream = dumpf.getvalue()
    dumpf.close()

    source = io.BytesIO(packed_stream)
    unpacker = msgpack.Unpacker(source, read_size=read_size, use_list=1)

    read_count = 0
    for idx, item in enumerate(unpacker):
        assert type(item) == bytes
        assert item == gen_binary_data(idx)
        read_count += 1

    assert read_count == NUMBER_OF_STRINGS
@@ -0,0 +1,93 @@
# coding: utf-8
from pandas import compat
from pandas.io.msgpack import Unpacker, BufferFull
from pandas.io.msgpack import OutOfData
import pytest
import pandas.util.testing as tm
class TestPack(object):
    """Tests that feed an ``Unpacker`` incrementally and read items back."""

    def test_partial_data(self):
        unpacker = Unpacker()
        expected_msg = "No more data to unpack"

        # header plus the first four payload bytes: never a complete item
        for piece in (b"\xa5", b"h", b"a", b"l", b"l"):
            unpacker.feed(piece)
            with tm.assert_raises_regex(StopIteration, expected_msg):
                next(iter(unpacker))

        # the last byte completes the item
        unpacker.feed(b"o")
        assert next(iter(unpacker)) == b"hallo"

    def test_foobar(self):
        unpacker = Unpacker(read_size=3, use_list=1)
        unpacker.feed(b'foobar')
        for char in 'foobar':
            assert unpacker.unpack() == ord(char)
        pytest.raises(OutOfData, unpacker.unpack)

        unpacker.feed(b'foo')
        unpacker.feed(b'bar')

        seen = 0
        for item, char in zip(unpacker, 'foobarbaz'):
            assert item == ord(char)
            seen += 1
        assert seen == len(b'foobar')

    def test_foobar_skip(self):
        unpacker = Unpacker(read_size=3, use_list=1)
        unpacker.feed(b'foobar')
        # unpack every other item, skipping the one after it
        for char in ('f', 'o', 'a'):
            assert unpacker.unpack() == ord(char)
            unpacker.skip()
        pytest.raises(OutOfData, unpacker.unpack)

    def test_maxbuffersize(self):
        with pytest.raises(ValueError):
            Unpacker(read_size=5, max_buffer_size=3)

        unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1)
        unpacker.feed(b'fo')
        with pytest.raises(BufferFull):
            unpacker.feed(b'ob')
        unpacker.feed(b'o')
        assert ord('f') == next(unpacker)
        unpacker.feed(b'b')
        for char in ('o', 'o', 'b'):
            assert ord(char) == next(unpacker)

    def test_readbytes(self):
        def expect_foobar(unpacker):
            assert unpacker.unpack() == ord(b'f')
            assert unpacker.read_bytes(3) == b'oob'
            assert unpacker.unpack() == ord(b'a')
            assert unpacker.unpack() == ord(b'r')

        fed = Unpacker(read_size=3)
        fed.feed(b'foobar')
        expect_foobar(fed)

        # Test buffer refill
        from_file = Unpacker(compat.BytesIO(b'foobar'), read_size=3)
        expect_foobar(from_file)

    def test_issue124(self):
        unpacker = Unpacker()

        unpacker.feed(b'\xa1?\xa1!')
        assert tuple(unpacker) == (b'?', b'!')
        assert tuple(unpacker) == ()

        # second item is incomplete until the next feed
        unpacker.feed(b"\xa1?\xa1")
        assert tuple(unpacker) == (b'?', )
        assert tuple(unpacker) == ()

        unpacker.feed(b"!")
        assert tuple(unpacker) == (b'!', )
        assert tuple(unpacker) == ()
@@ -0,0 +1,25 @@
# coding: utf-8
from pandas.io.msgpack import packb
from collections import namedtuple
class MyList(list):
    """Plain ``list`` subclass used by ``test_types``."""
class MyDict(dict):
    """Plain ``dict`` subclass used by ``test_types``."""
class MyTuple(tuple):
    """Plain ``tuple`` subclass."""
# two-field namedtuple used by ``test_types``
MyNamedTuple = namedtuple('MyNamedTuple', ['x', 'y'])
def test_types():
    """Subclass instances pack to the same bytes as their builtin values."""
    pairs = (
        (MyDict(), dict()),
        (MyList(), list()),
        (MyNamedTuple(1, 2), (1, 2)),
    )
    for subclassed, builtin in pairs:
        assert packb(subclassed) == packb(builtin)
@@ -0,0 +1,63 @@
from io import BytesIO
import sys
from pandas.io.msgpack import Unpacker, packb, OutOfData, ExtType
import pytest
class TestUnpack(object):
    """Tests for ``Unpacker`` reading from files and calling hooks."""

    def test_unpack_array_header_from_file(self):
        source = BytesIO(packb([1, 2, 3, 4]))
        unpacker = Unpacker(source)
        assert unpacker.read_array_header() == 4
        for expected in (1, 2, 3, 4):
            assert unpacker.unpack() == expected
        pytest.raises(OutOfData, unpacker.unpack)

    def test_unpacker_hook_refcnt(self):
        if not hasattr(sys, 'getrefcount'):
            pytest.skip('no sys.getrefcount()')
        result = []

        def hook(x):
            result.append(x)
            return x

        # Statement order is kept as-is: the assertions below compare
        # reference counts of ``hook`` before and after the Unpacker exists.
        basecnt = sys.getrefcount(hook)

        up = Unpacker(object_hook=hook, list_hook=hook)

        assert sys.getrefcount(hook) >= basecnt + 2

        up.feed(packb([{}]))
        up.feed(packb([{}]))
        assert up.unpack() == [{}]
        assert up.unpack() == [{}]
        assert result == [{}, [{}], {}, [{}]]

        del up

        assert sys.getrefcount(hook) == basecnt

    def test_unpacker_ext_hook(self):
        class MyUnpacker(Unpacker):

            def __init__(self):
                super(MyUnpacker, self).__init__(ext_hook=self._hook,
                                                 encoding='utf-8')

            def _hook(self, code, data):
                # code 1 carries an integer as text; others pass through
                if code != 1:
                    return ExtType(code, data)
                return int(data)

        unpacker = MyUnpacker()
        cases = (
            (1, 1),
            (ExtType(1, b'123'), 123),
            (ExtType(2, b'321'), ExtType(2, b'321')),
        )
        for value, expected in cases:
            unpacker.feed(packb({'a': value}, encoding='utf-8'))
            assert unpacker.unpack() == {'a': expected}
@@ -0,0 +1,29 @@
"""Tests for cases where the user seeks to obtain packed msgpack objects"""
import io
from pandas.io.msgpack import Unpacker, packb
def test_write_bytes():
    """``unpack``/``skip`` pass the consumed bytes to the given callback."""
    unpacker = Unpacker()
    unpacker.feed(b'abc')

    sink = io.BytesIO()
    assert unpacker.unpack(sink.write) == ord('a')
    assert sink.getvalue() == b'a'

    sink = io.BytesIO()
    assert unpacker.skip(sink.write) is None
    assert sink.getvalue() == b'b'

    # no callback given: nothing is written to the fresh buffer
    sink = io.BytesIO()
    assert unpacker.skip() is None
    assert sink.getvalue() == b''
def test_write_bytes_multi_buffer():
    """The write callback receives exactly the packed bytes of the value."""
    # NOTE(review): ``(5) * 100`` is the integer 500, not a 100-item tuple;
    # confirm whether ``(5,) * 100`` was intended given the test name.
    long_val = (5) * 100
    expected = packb(long_val)
    source = io.BytesIO(expected)
    unpacker = Unpacker(source, read_size=3, max_buffer_size=3)

    sink = io.BytesIO()
    unpacked = unpacker.unpack(sink.write)
    assert unpacked == long_val
    assert sink.getvalue() == expected
@@ -0,0 +1,487 @@
# -*- coding: utf-8 -*-
"""
Tests that apply specifically to the CParser. Unless specifically stated
as a CParser-specific issue, the goal is to eventually move as many of
these tests out of this module as soon as the Python parser can accept
further arguments when parsing.
"""
import os
import sys
import tarfile
import pytest
import numpy as np
import pandas as pd
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas import DataFrame
from pandas.compat import StringIO, range, lrange
class CParserTests(object):
def test_buffer_overflow(self):
    """Any error from these malformed inputs must be the overflow message."""
    # see gh-9205: test certain malformed input files that cause
    # buffer overflows in tokenizer.c
    cperr = 'Buffer overflow caught - possible malformed input file.'
    malformed_inputs = (
        "1\r1\r1\r 1\r 1\r",  # buffer overflow in words pointer
        "1\r1\r1\r 1\r 1\r11\r",  # buffer overflow in stream pointer
        "1\r1\r1\r 1\r 1\r11\r1\r",  # buffer overflow in lines pointer
    )
    for malformed in malformed_inputs:
        try:
            self.read_table(StringIO(malformed))
        except Exception as err:
            assert cperr in str(err)
def test_buffer_rd_bytes(self):
    """Repeatedly read a corrupt gzip payload; any exception is ignored."""
    # see gh-12098: src->buffer in the C parser can be freed twice leading
    # to a segfault if a corrupt gzip file is read with 'read_csv' and the
    # buffer is filled more than once before gzip throws an exception
    head = ('\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09'
            '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0'
            '\xA6\x4D')
    filler = '\x55' * 267
    tail = ('\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00'
            '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO')
    data = head + filler + tail
    for _ in range(100):
        try:
            self.read_csv(StringIO(data),
                          compression='gzip',
                          delim_whitespace=True)
        except Exception:
            # only a crash of the interpreter would fail this test
            pass
def test_delim_whitespace_custom_terminator(self):
    """``delim_whitespace`` works together with a custom ``lineterminator``."""
    # See gh-12912
    data = """a b c~1 2 3~4 5 6~7 8 9"""
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=['a', 'b', 'c'])
    result = self.read_csv(StringIO(data), lineterminator='~',
                           delim_whitespace=True)
    tm.assert_frame_equal(result, expected)
def test_dtype_and_names_error(self):
    """Passing both ``dtype`` and ``names`` casts, or fails with ValueError."""
    # see gh-8833: passing both dtype and names
    # resulting in an error reporting issue
    data = """
1.0 1
2.0 2
3.0 3
"""
    rows = [[1.0, 1], [2.0, 2], [3.0, 3]]

    # base cases
    unnamed = self.read_csv(StringIO(data), sep=r'\s+', header=None)
    tm.assert_frame_equal(unnamed, DataFrame(rows))

    named = self.read_csv(StringIO(data), sep=r'\s+',
                          header=None, names=['a', 'b'])
    tm.assert_frame_equal(named, DataFrame(rows, columns=['a', 'b']))

    # fallback casting
    cast = self.read_csv(StringIO(data), sep=r'\s+', header=None,
                         names=['a', 'b'], dtype={'a': np.int32})
    expected = DataFrame([[1, 1], [2, 2], [3, 3]],
                         columns=['a', 'b'])
    expected['a'] = expected['a'].astype(np.int32)
    tm.assert_frame_equal(cast, expected)

    data = """
1.0 1
nan 2
3.0 3
"""
    # fallback casting, but not castable
    with tm.assert_raises_regex(ValueError, 'cannot safely convert'):
        self.read_csv(StringIO(data), sep=r'\s+', header=None,
                      names=['a', 'b'], dtype={'a': np.int32})
def test_unsupported_dtype(self):
    """Each listed ``dtype`` request makes ``read_csv`` raise TypeError."""
    frame = DataFrame(np.random.rand(5, 2), columns=list('AB'),
                      index=['1A', '1B', '1C', '1D', '1E'])

    rejected = (
        # valid but we don't support it (date)
        dict(dtype={'A': 'datetime64', 'B': 'float64'}, index_col=0),
        dict(dtype={'A': 'datetime64', 'B': 'float64'}, index_col=0,
             parse_dates=['B']),
        # valid but we don't support it
        dict(dtype={'A': 'timedelta64', 'B': 'float64'}, index_col=0),
        # valid but unsupported - fixed width unicode string
        dict(dtype={'A': 'U8'}, index_col=0),
    )

    with tm.ensure_clean('__unsupported_dtype__.csv') as path:
        frame.to_csv(path)
        for kwargs in rejected:
            with pytest.raises(TypeError):
                self.read_csv(path, **kwargs)
@td.skip_if_32bit
def test_precise_conversion(self):
    """Compare the default, 'high' and 'round_trip' float parsers."""
    from decimal import Decimal

    def parse(text, **kwargs):
        # first value of column 'a' as a Python float
        return float(self.read_csv(StringIO(text), **kwargs)['a'][0])

    def error(val, actual):
        return abs(Decimal('{0:.100}'.format(val)) - actual)

    normal_errors = []
    precise_errors = []

    # test numbers between 1 and 2
    for num in np.linspace(1., 2., num=500):
        # 25 decimal digits of precision
        text = 'a\n{0:.25}'.format(num)

        normal_val = parse(text)
        precise_val = parse(text, float_precision='high')
        roundtrip_val = parse(text, float_precision='round_trip')
        # text[2:] drops the 'a\n' header, leaving only the number
        actual_val = Decimal(text[2:])

        normal_errors.append(error(normal_val, actual_val))
        precise_errors.append(error(precise_val, actual_val))

        # round-trip should match float()
        assert roundtrip_val == float(text[2:])

    assert sum(precise_errors) <= sum(normal_errors)
    assert max(precise_errors) <= max(normal_errors)
def test_usecols_dtypes(self):
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
result = self.read_csv(StringIO(data), usecols=(0, 1, 2),
names=('a', 'b', 'c'),
header=None,
converters={'a': str},
dtype={'b': int, 'c': float},
)
result2 = self.read_csv(StringIO(data), usecols=(0, 2),
names=('a', 'b', 'c'),
header=None,
converters={'a': str},
dtype={'b': int, 'c': float},
)
assert (result.dtypes == [object, np.int, np.float]).all()
assert (result2.dtypes == [object, np.float]).all()
def test_disable_bool_parsing(self):
# #2090
data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""
result = self.read_csv(StringIO(data), dtype=object)
assert (result.dtypes == object).all()
result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
assert result['B'][2] == ''
def test_custom_lineterminator(self):
data = 'a,b,c~1,2,3~4,5,6'
result = self.read_csv(StringIO(data), lineterminator='~')
expected = self.read_csv(StringIO(data.replace('~', '\n')))
tm.assert_frame_equal(result, expected)
def test_parse_ragged_csv(self):
    """Rows with differing field counts parse like explicitly padded rows.

    A second check passes far more names (50) than the data has columns
    and compares against a three-name read reindexed to 50 columns.
    """
    ragged_text = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""

    padded_text = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
    five_names = ['a', 'b', 'c', 'd', 'e']

    parsed = self.read_csv(StringIO(ragged_text), names=five_names,
                           header=None)
    wanted = self.read_csv(StringIO(padded_text), names=five_names,
                           header=None)
    tm.assert_frame_equal(parsed, wanted)

    # too many columns, cause segfault if not careful
    short_text = "1,2\n3,4,5"
    parsed = self.read_csv(StringIO(short_text), names=lrange(50),
                           header=None)
    three_wide = self.read_csv(StringIO(short_text), names=lrange(3),
                               header=None)
    wanted = three_wide.reindex(columns=lrange(50))
    tm.assert_frame_equal(parsed, wanted)
def test_tokenize_CR_with_quoting(self):
# see gh-3453
data = ' a,b,c\r"a,b","e,d","f,f"'
result = self.read_csv(StringIO(data), header=None)
expected = self.read_csv(StringIO(data.replace('\r', '\n')),
header=None)
tm.assert_frame_equal(result, expected)
result = self.read_csv(StringIO(data))
expected = self.read_csv(StringIO(data.replace('\r', '\n')))
tm.assert_frame_equal(result, expected)
def test_grow_boundary_at_cap(self):
# See gh-12494
#
# Cause of error was that the C parser
# was not increasing the buffer size when
# the desired space would fill the buffer
# to capacity, which would later cause a
# buffer overflow error when checking the
# EOF terminator of the CSV stream
def test_empty_header_read(count):
s = StringIO(',' * count)
expected = DataFrame(columns=[
'Unnamed: {i}'.format(i=i)
for i in range(count + 1)])
df = self.read_csv(s)
tm.assert_frame_equal(df, expected)
for count in range(1, 101):
test_empty_header_read(count)
def test_parse_trim_buffers(self):
    """Read a wide mixed-type CSV in chunks and check nothing is corrupted.

    One long record is repeated ``2 * 128 + 15`` times and read back with
    ``chunksize=128``, so the final chunk is a small residual one. The
    concatenated chunks must equal a frame built by hand from the same
    record. The read is then repeated with an explicit ``'utf_8'``
    encoding.
    """
    # This test is part of a bugfix for issue #13703. It attempts
    # to stress the system memory allocator, to cause it to move the
    # stream buffer and either let the OS reclaim the region, or let
    # other memory requests of parser otherwise modify the contents
    # of memory space, where it was formerly located.
    # This test is designed to cause a `segfault` with unpatched
    # `tokenizer.c`. Sometimes the test fails on `segfault`, other
    # times it fails due to memory corruption, which causes the
    # loaded DataFrame to differ from the expected one.

    # Generate a large mixed-type CSV file on-the-fly (one record is
    # approx 1.5KiB).
    record_ = \
        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""

    # Set the number of lines so that a call to `parser_trim_buffers`
    # is triggered: after a couple of full chunks are consumed a
    # relatively small 'residual' chunk would cause reallocation
    # within the parser.
    chunksize, n_lines = 128, 2 * 128 + 15
    csv_data = "\n".join([record_] * n_lines) + "\n"

    # We will use StringIO to load the CSV from this text buffer.
    # pd.read_csv() will iterate over the file in chunks and will
    # finally read a residual chunk of really small size.

    # Generate the expected output: manually create the dataframe
    # by splitting by comma and repeating the `n_lines` times.
    # Empty fields become NaN, matching how the parser reports them.
    row = tuple(val_ if val_ else np.nan
                for val_ in record_.split(","))
    expected = pd.DataFrame([row for _ in range(n_lines)],
                            dtype=object, columns=None, index=None)

    # Iterate over the CSV file in chunks of `chunksize` lines
    chunks_ = self.read_csv(StringIO(csv_data), header=None,
                            dtype=object, chunksize=chunksize)
    result = pd.concat(chunks_, axis=0, ignore_index=True)

    # Check for data corruption if there was no segfault
    tm.assert_frame_equal(result, expected)

    # This extra test was added to replicate the fault in gh-5291.
    # Force 'utf-8' encoding, so that `_string_convert` would take
    # a different execution branch.
    chunks_ = self.read_csv(StringIO(csv_data), header=None,
                            dtype=object, chunksize=chunksize,
                            encoding='utf_8')
    result = pd.concat(chunks_, axis=0, ignore_index=True)
    tm.assert_frame_equal(result, expected)
def test_internal_null_byte(self):
# see gh-14012
#
# The null byte ('\x00') should not be used as a
# true line terminator, escape character, or comment
# character, only as a placeholder to indicate that
# none was specified.
#
# This test should be moved to common.py ONLY when
# Python's csv class supports parsing '\x00'.
names = ['a', 'b', 'c']
data = "1,2,3\n4,\x00,6\n7,8,9"
expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6],
[7, 8, 9]], columns=names)
result = self.read_csv(StringIO(data), names=names)
tm.assert_frame_equal(result, expected)
def test_read_nrows_large(self):
# gh-7626 - Read only nrows of data in for large inputs (>262144b)
header_narrow = '\t'.join(['COL_HEADER_' + str(i)
for i in range(10)]) + '\n'
data_narrow = '\t'.join(['somedatasomedatasomedata1'
for i in range(10)]) + '\n'
header_wide = '\t'.join(['COL_HEADER_' + str(i)
for i in range(15)]) + '\n'
data_wide = '\t'.join(['somedatasomedatasomedata2'
for i in range(15)]) + '\n'
test_input = (header_narrow + data_narrow * 1050 +
header_wide + data_wide * 2)
df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010)
assert df.size == 1010 * 10
def test_float_precision_round_trip_with_text(self):
# gh-15140 - This should not segfault on Python 2.7+
df = self.read_csv(StringIO('a'),
float_precision='round_trip',
header=None)
tm.assert_frame_equal(df, DataFrame({0: ['a']}))
def test_large_difference_in_columns(self):
# gh-14125
count = 10000
large_row = ('X,' * count)[:-1] + '\n'
normal_row = 'XXXXXX XXXXXX,111111111111111\n'
test_input = (large_row + normal_row * 6)[:-1]
result = self.read_csv(StringIO(test_input), header=None, usecols=[0])
rows = test_input.split('\n')
expected = DataFrame([row.split(',')[0] for row in rows])
tm.assert_frame_equal(result, expected)
def test_data_after_quote(self):
# see gh-15910
data = 'a\n1\n"b"a'
result = self.read_csv(StringIO(data))
expected = DataFrame({'a': ['1', 'ba']})
tm.assert_frame_equal(result, expected)
@tm.capture_stderr
def test_comment_whitespace_delimited(self):
    """Comments are stripped before whitespace-delimited fields are counted.

    Lines left with too many fields are skipped (``error_bad_lines=False``)
    and each skip is reported on stderr.
    """
    csv_text = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
    read_kwargs = dict(comment='#', header=None, delimiter='\\s+',
                       skiprows=0, error_bad_lines=False)
    parsed = self.read_csv(StringIO(csv_text), **read_kwargs)
    stderr_text = sys.stderr.getvalue()

    # skipped lines 2, 3, 4, 9
    for skipped in (2, 3, 4, 9):
        message = 'Skipping line {}'.format(skipped)
        assert message in stderr_text, stderr_text

    kept_rows = [[1, 2],
                 [5, 2],
                 [6, 2],
                 [7, np.nan],
                 [8, np.nan]]
    tm.assert_frame_equal(parsed, DataFrame(kept_rows))
def test_file_like_no_next(self):
# gh-16530: the file-like need not have a "next" or "__next__"
# attribute despite having an "__iter__" attribute.
#
# NOTE: This is only true for the C engine, not Python engine.
class NoNextBuffer(StringIO):
def __next__(self):
raise AttributeError("No next method")
next = __next__
data = "a\n1"
expected = pd.DataFrame({"a": [1]})
result = self.read_csv(NoNextBuffer(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
def test_read_tarfile(self, tar_suffix):
    """A member extracted from a tar archive can be passed to ``read_csv``."""
    # see gh-16530
    #
    # Unfortunately, Python's CSV library can't handle
    # tarfile objects (expects string, not bytes when
    # iterating through a file-like).
    archive_name = "tar_csv" + tar_suffix
    archive_path = os.path.join(self.dirpath, archive_name)

    with tarfile.open(archive_path, "r") as archive:
        member = archive.extractfile("tar_data.csv")
        parsed = self.read_csv(member)

        wanted = pd.DataFrame({"a": [1]})
        tm.assert_frame_equal(parsed, wanted)
@pytest.mark.high_memory
def test_bytes_exceed_2gb(self):
    """Read from a "CSV" that has a column larger than 2GB.

    GH 16798
    """
    if self.low_memory:
        pytest.skip("not a high_memory test")

    # 2100 rows of 1 MiB each push the single column past 2 GiB.
    one_mebibyte_row = 'x' * (1 << 20)
    body = '\n'.join([one_mebibyte_row] * 2100)
    buffer = StringIO('strings\n' + body)

    frame = self.read_csv(buffer, low_memory=False)
    assert not frame.empty
@@ -0,0 +1,118 @@
# -*- coding: utf-8 -*-
"""
Tests that comments are properly handled during parsing
for all of the parsers defined in parsers.py
"""
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame
from pandas.compat import StringIO
class CommentTests(object):
    """Comment-handling checks shared by the parser test classes.

    The methods call ``self.read_csv`` / ``self.read_table`` and read
    ``self.engine``, none of which are defined here; the class using these
    tests must provide them.
    """

    def test_comment(self):
        """Text after the comment character on a data line is dropped."""
        data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
        wanted = np.array([[1., 2., 4.],
                           [5., np.nan, 10.]])

        frame = self.read_csv(StringIO(data), comment='#')
        tm.assert_numpy_array_equal(frame.values, wanted)

        frame = self.read_table(StringIO(data), sep=',', comment='#',
                                na_values=['NaN'])
        tm.assert_numpy_array_equal(frame.values, wanted)

    def test_line_comment(self):
        """Lines that start with the comment character are ignored."""
        data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
        wanted = np.array([[1., 2., 4.],
                           [5., np.nan, 10.]])
        frame = self.read_csv(StringIO(data), comment='#')
        tm.assert_numpy_array_equal(frame.values, wanted)

        # check with delim_whitespace=True
        spaced = data.replace(',', ' ')
        frame = self.read_csv(StringIO(spaced), comment='#',
                              delim_whitespace=True)
        tm.assert_almost_equal(frame.values, wanted)

        # custom line terminator is not supported
        # with the Python parser yet
        if self.engine == 'c':
            wanted = np.array([[1., 2., 4.],
                               [5., np.nan, 10.]])
            starred = data.replace('\n', '*')
            frame = self.read_csv(StringIO(starred),
                                  comment='#', lineterminator='*')
            tm.assert_numpy_array_equal(frame.values, wanted)

    def test_comment_skiprows(self):
        """``skiprows`` counts comment lines among the skipped lines."""
        data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
        # this should ignore the first four lines (including comments)
        wanted = np.array([[1., 2., 4.], [5., np.nan, 10.]])
        frame = self.read_csv(StringIO(data), skiprows=4, comment='#')
        tm.assert_numpy_array_equal(frame.values, wanted)

    def test_comment_header(self):
        """``header`` is counted over non-comment lines only."""
        data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
        # header should begin at the second non-comment line
        wanted = np.array([[1., 2., 4.], [5., np.nan, 10.]])
        frame = self.read_csv(StringIO(data), header=1, comment='#')
        tm.assert_numpy_array_equal(frame.values, wanted)

    def test_comment_skiprows_header(self):
        """``skiprows`` and ``header`` combine in the presence of comments."""
        data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
        # skiprows should skip the first 4 lines (including comments), while
        # header should start from the second non-commented line starting
        # with line 5
        wanted = np.array([[1., 2., 4.], [5., np.nan, 10.]])
        frame = self.read_csv(StringIO(data), header=1, skiprows=4,
                              comment='#')
        tm.assert_numpy_array_equal(frame.values, wanted)

    def test_custom_comment_char(self):
        """Trailing comments are removed from every data line."""
        data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
        wanted = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})

        parsed = self.read_csv(StringIO(data), comment='#')
        tm.assert_frame_equal(parsed, wanted)

    def test_commment_first_line(self):
        """A comment on the very first line does not disturb the header."""
        # see gh-4623
        data = '# notes\na,b,c\n# more notes\n1,2,3'

        wanted = DataFrame([[1, 2, 3]], columns=['a', 'b', 'c'])
        parsed = self.read_csv(StringIO(data), comment='#')
        tm.assert_frame_equal(parsed, wanted)

        wanted = DataFrame({0: ['a', '1'], 1: ['b', '2'], 2: ['c', '3']})
        parsed = self.read_csv(StringIO(data), header=None, comment='#')
        tm.assert_frame_equal(parsed, wanted)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-
"""
Tests compressed data parsing functionality for all
of the parsers defined in parsers.py
"""
import pytest
import pandas as pd
import pandas.compat as compat
import pandas.util.testing as tm
import pandas.util._test_decorators as td
import gzip
import bz2
try:
lzma = compat.import_lzma()
except ImportError:
lzma = None
class CompressionTests(object):
    """Compressed-input checks shared by the parser test classes.

    The methods use ``self.read_csv``, ``self.csv1`` and ``self.engine``,
    none of which are defined here; the class using these tests must
    provide them.
    """

    def test_zip(self):
        """Read zip archives by path and by handle, and check error cases.

        Covers a single-member archive (explicit and inferred compression),
        an archive with two members, an empty archive, and a file that is
        not a zip archive at all.
        """
        import zipfile

        with open(self.csv1, 'rb') as data_file:
            data = data_file.read()
            expected = self.read_csv(self.csv1)

        with tm.ensure_clean('test_file.zip') as path:
            # The context manager closes the archive even if writing fails.
            with zipfile.ZipFile(path, mode='w') as tmp:
                tmp.writestr('test_file', data)

            result = self.read_csv(path, compression='zip')
            tm.assert_frame_equal(result, expected)

            result = self.read_csv(path, compression='infer')
            tm.assert_frame_equal(result, expected)

            # Compare by value: identity comparison against a string
            # literal only works when the interpreter happens to intern
            # both strings.
            if self.engine != 'python':
                with open(path, 'rb') as f:
                    result = self.read_csv(f, compression='zip')
                    tm.assert_frame_equal(result, expected)

        with tm.ensure_clean('combined_zip.zip') as path:
            inner_file_names = ['test_file', 'second_file']
            with zipfile.ZipFile(path, mode='w') as tmp:
                for file_name in inner_file_names:
                    tmp.writestr(file_name, data)

            tm.assert_raises_regex(ValueError, 'Multiple files',
                                   self.read_csv, path, compression='zip')

            tm.assert_raises_regex(ValueError, 'Multiple files',
                                   self.read_csv, path,
                                   compression='infer')

        with tm.ensure_clean() as path:
            # An archive with no members at all.
            with zipfile.ZipFile(path, mode='w'):
                pass

            tm.assert_raises_regex(ValueError, 'Zero files',
                                   self.read_csv, path, compression='zip')

        with tm.ensure_clean() as path:
            with open(path, 'wb') as f:
                pytest.raises(zipfile.BadZipfile, self.read_csv,
                              f, compression='zip')

    @pytest.mark.parametrize('compress_type, compress_method, ext', [
        ('gzip', gzip.GzipFile, 'gz'),
        ('bz2', bz2.BZ2File, 'bz2'),
        pytest.param('xz', getattr(lzma, 'LZMAFile', None), 'xz',
                     marks=td.skip_if_no_lzma)
    ])
    def test_other_compression(self, compress_type, compress_method, ext):
        """Read gzip / bz2 / xz data by path, by handle and by inference.

        Parameters
        ----------
        compress_type : str
            Value passed as ``compression`` to ``read_csv``.
        compress_method : callable
            File class used to write the compressed fixture.
        ext : str
            File extension used for the ``compression='infer'`` check.
        """
        with open(self.csv1, 'rb') as data_file:
            data = data_file.read()
            expected = self.read_csv(self.csv1)

        with tm.ensure_clean() as path:
            # The context manager closes the file even if writing fails.
            with compress_method(path, mode='wb') as tmp:
                tmp.write(data)

            result = self.read_csv(path, compression=compress_type)
            tm.assert_frame_equal(result, expected)

            if compress_type == 'bz2':
                pytest.raises(ValueError, self.read_csv,
                              path, compression='bz3')

            with open(path, 'rb') as fin:
                result = self.read_csv(fin, compression=compress_type)
                tm.assert_frame_equal(result, expected)

        with tm.ensure_clean('test.{}'.format(ext)) as path:
            with compress_method(path, mode='wb') as tmp:
                tmp.write(data)
            result = self.read_csv(path, compression='infer')
            tm.assert_frame_equal(result, expected)

    def test_read_csv_infer_compression(self):
        """``compression='infer'`` handles plain, .gz and .bz2 paths and a handle."""
        # see gh-9770
        expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)

        with open(self.csv1) as f:
            inputs = [self.csv1, self.csv1 + '.gz',
                      self.csv1 + '.bz2', f]

            for inp in inputs:
                df = self.read_csv(inp, index_col=0, parse_dates=True,
                                   compression='infer')

                tm.assert_frame_equal(expected, df)

    def test_read_csv_compressed_utf16_example(self, datapath):
        """A zipped, tab-separated UTF-16 file decodes to the expected frame."""
        # GH18071
        path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip')

        result = self.read_csv(path, encoding='utf-16',
                               compression='zip', sep='\t')
        expected = pd.DataFrame({
            u'Country': [u'Venezuela', u'Venezuela'],
            u'Twitter': [u'Hugo Chávez Frías', u'Henrique Capriles R.']
        })

        tm.assert_frame_equal(result, expected)

    def test_invalid_compression(self):
        """An unknown ``compression`` value raises ``ValueError``."""
        msg = 'Unrecognized compression type: sfark'
        with tm.assert_raises_regex(ValueError, msg):
            self.read_csv('test_file.zip', compression='sfark')
@@ -0,0 +1,153 @@
# -*- coding: utf-8 -*-
"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""
from datetime import datetime
import pytest
import numpy as np
import pandas as pd
import pandas.util.testing as tm
from pandas._libs.tslib import Timestamp
from pandas import DataFrame, Index
from pandas.compat import parse_date, StringIO, lmap
class ConverterTests(object):
    """``converters`` checks shared by the parser test classes.

    The methods call ``self.read_csv``, which is not defined here; the
    class using these tests must provide it.
    """

    def test_converters_type_must_be_dict(self):
        """Passing a non-dict ``converters`` value raises ``TypeError``."""
        data = """index,A,B,C,D
foo,2,3,4,5
"""
        with tm.assert_raises_regex(TypeError, 'Type converters.+'):
            self.read_csv(StringIO(data), converters=0)

    def test_converters(self):
        """Converters keyed by column name or position give the same result."""
        data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
        by_name = self.read_csv(StringIO(data),
                                converters={'D': parse_date})
        by_position = self.read_csv(StringIO(data),
                                    converters={3: parse_date})

        wanted = self.read_csv(StringIO(data))
        wanted['D'] = wanted['D'].map(parse_date)

        first_date = by_name['D'][0]
        assert isinstance(first_date, (datetime, Timestamp))
        tm.assert_frame_equal(by_name, wanted)
        tm.assert_frame_equal(by_position, wanted)

        # produce integer
        def year_of(text):
            return int(text.split('/')[2])

        by_name = self.read_csv(StringIO(data), converters={'D': year_of})
        wanted = self.read_csv(StringIO(data))
        wanted['D'] = wanted['D'].map(year_of)
        tm.assert_frame_equal(by_name, wanted)

    def test_converters_no_implicit_conv(self):
        """A converted column is not cast to a numeric dtype afterwards."""
        # see gh-2184
        data = """000102,1.2,A\n001245,2,B"""

        def strip_text(text):
            return text.strip()

        frame = self.read_csv(StringIO(data), header=None,
                              converters={0: strip_text})
        assert frame[0].dtype == object

    def test_converters_euro_decimal_format(self):
        """Comma-decimal columns become float through a converter."""
        data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

        def comma_to_float(text):
            return float(text.replace(",", "."))

        converters = {'Number1': comma_to_float,
                      'Number2': comma_to_float,
                      'Number3': comma_to_float}
        frame = self.read_csv(StringIO(data), sep=';',
                              converters=converters)

        for column in ('Number1', 'Number2', 'Number3'):
            assert frame[column].dtype == float

    def test_converter_return_string_bug(self):
        """A float-returning converter yields a float column."""
        # see gh-583
        data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

        def comma_to_float(text):
            return float(text.replace(",", "."))

        converters = {'Number1': comma_to_float,
                      'Number2': comma_to_float,
                      'Number3': comma_to_float}
        frame = self.read_csv(StringIO(data), sep=';',
                              converters=converters)
        assert frame['Number1'].dtype == float

    def test_converters_corner_with_nas(self):
        """Converters that return NaN for blank cells give consistent frames."""
        # skip aberration observed on Win64 Python 3.2.2
        if hash(np.int64(-1)) != -2:
            pytest.skip("skipping because of windows hash on Python"
                        " 3.2.2")

        data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

        def convert_days(x):
            # Blank -> NaN; "N+" -> N + 1; otherwise the integer itself.
            text = x.strip()
            if not text:
                return np.nan
            if text.endswith('+'):
                return int(text[:-1]) + 1
            return int(text)

        def convert_days_sentinel(x):
            # Same mapping as ``convert_days``, spelled with a conditional
            # expression.
            text = x.strip()
            if not text:
                return np.nan
            return int(text[:-1]) + 1 if text.endswith('+') else int(text)

        def convert_score(x):
            # Blank -> NaN; "lo-hi" -> midpoint; otherwise the float itself.
            text = x.strip()
            if not text:
                return np.nan
            if text.find('-') > 0:
                low, high = lmap(int, text.split('-'))
                return 0.5 * (low + high)
            return float(text)

        result = self.read_csv(StringIO(data),
                               converters={'score': convert_score,
                                           'days': convert_days},
                               na_values=['', None])
        assert pd.isna(result['days'][1])

        result2 = self.read_csv(StringIO(data),
                                converters={'score': convert_score,
                                            'days': convert_days_sentinel},
                                na_values=['', None])
        tm.assert_frame_equal(result, result2)

    def test_converter_index_col_bug(self):
        """A converter on the index column keeps the index values and name."""
        # see gh-1835
        data = "A;B\n1;2\n3;4"

        parsed = self.read_csv(StringIO(data), sep=';', index_col='A',
                               converters={'A': lambda x: x})

        wanted = DataFrame({'B': [2, 4]}, index=Index([1, 3], name='A'))
        tm.assert_frame_equal(parsed, wanted)
        assert parsed.index.name == wanted.index.name
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""
import csv
from pandas import DataFrame
from pandas.compat import StringIO
from pandas.errors import ParserWarning
import pandas.util.testing as tm
class DialectTests(object):
    """``dialect`` checks shared by the parser test classes.

    The methods call ``self.read_csv``, which is not defined here; the
    class using these tests must provide it.
    """

    def test_dialect(self):
        """A dialect instance with ``QUOTE_NONE`` keeps quote characters."""
        data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

        dia = csv.excel()
        dia.quoting = csv.QUOTE_NONE
        with tm.assert_produces_warning(ParserWarning):
            df = self.read_csv(StringIO(data), dialect=dia)

        data = '''\
label1,label2,label3
index1,a,c,e
index2,b,d,f
'''
        exp = self.read_csv(StringIO(data))
        # With quoting disabled the leading quote stays in the value.
        exp.replace('a', '"a', inplace=True)
        tm.assert_frame_equal(df, exp)

    def test_dialect_str(self):
        """A dialect can be given by its registered name."""
        data = """\
fruit:vegetable
apple:brocolli
pear:tomato
"""
        exp = DataFrame({
            'fruit': ['apple', 'pear'],
            'vegetable': ['brocolli', 'tomato']
        })
        csv.register_dialect('mydialect', delimiter=':')
        try:
            with tm.assert_produces_warning(ParserWarning):
                df = self.read_csv(StringIO(data), dialect='mydialect')

            tm.assert_frame_equal(df, exp)
        finally:
            # The registry is process-global: always remove the dialect so
            # a failure here cannot leak it into other tests.
            csv.unregister_dialect('mydialect')

    def test_invalid_dialect(self):
        """An object lacking the dialect attributes raises ``ValueError``."""
        class InvalidDialect(object):
            pass

        data = 'a\n1'
        msg = 'Invalid dialect'

        with tm.assert_raises_regex(ValueError, msg):
            self.read_csv(StringIO(data), dialect=InvalidDialect)

    def test_dialect_conflict(self):
        """Warn only when ``delimiter`` disagrees with the dialect's value."""
        data = 'a,b\n1,2'
        dialect = 'excel'
        exp = DataFrame({'a': [1], 'b': [2]})

        # Same delimiter as the dialect: no warning expected.
        with tm.assert_produces_warning(None):
            df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect)
            tm.assert_frame_equal(df, exp)

        # Conflicting delimiter: a warning is expected, and the frame still
        # equals ``exp``.
        with tm.assert_produces_warning(ParserWarning):
            df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect)
            tm.assert_frame_equal(df, exp)
@@ -0,0 +1,399 @@
# -*- coding: utf-8 -*-
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
import pytest
import numpy as np
import pandas as pd
import pandas.util.testing as tm
from pandas import DataFrame, Series, Index, MultiIndex, Categorical
from pandas.compat import StringIO
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.errors import ParserWarning
class DtypeTests(object):
def test_passing_dtype(self):
    """``dtype=str`` and ``dtype=object`` both read values back as strings.

    Also checks that an unknown dtype name raises ``TypeError`` and that a
    header-only input gives an empty string-typed frame.
    """
    # see gh-6607
    source = DataFrame(np.random.rand(5, 2).round(4),
                       columns=list('AB'),
                       index=['1A', '1B', '1C', '1D', '1E'])

    with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
        source.to_csv(path)
        as_strings = source.astype(str)

        # see gh-3795: passing 'str' as the dtype
        parsed = self.read_csv(path, dtype=str, index_col=0)
        tm.assert_frame_equal(parsed, as_strings)

        # for parsing, interpret object as str
        parsed = self.read_csv(path, dtype=object, index_col=0)
        tm.assert_frame_equal(parsed, as_strings)

        # we expect all object columns, so need to
        # convert to test for equivalence
        tm.assert_frame_equal(parsed.astype(float), source)

        # invalid dtype
        with pytest.raises(TypeError):
            self.read_csv(path, dtype={'A': 'foo', 'B': 'float64'},
                          index_col=0)

    # see gh-12048: empty frame
    empty_parsed = self.read_csv(StringIO('A,B'), dtype=str)
    empty_wanted = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
    tm.assert_frame_equal(empty_parsed, empty_wanted)
def test_pass_dtype(self):
data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'})
assert result['one'].dtype == 'u1'
assert result['two'].dtype == 'object'
def test_categorical_dtype(self):
# GH 10153
data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
'b': Categorical(['a', 'a', 'b']),
'c': Categorical(['3.4', '3.4', '4.5'])})
actual = self.read_csv(StringIO(data), dtype='category')
tm.assert_frame_equal(actual, expected)
actual = self.read_csv(StringIO(data), dtype=CategoricalDtype())
tm.assert_frame_equal(actual, expected)
actual = self.read_csv(StringIO(data), dtype={'a': 'category',
'b': 'category',
'c': CategoricalDtype()})
tm.assert_frame_equal(actual, expected)
actual = self.read_csv(StringIO(data), dtype={'b': 'category'})
expected = pd.DataFrame({'a': [1, 1, 2],
'b': Categorical(['a', 'a', 'b']),
'c': [3.4, 3.4, 4.5]})
tm.assert_frame_equal(actual, expected)
actual = self.read_csv(StringIO(data), dtype={1: 'category'})
tm.assert_frame_equal(actual, expected)
# unsorted
data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
'b': Categorical(['b', 'b', 'a']),
'c': Categorical(['3.4', '3.4', '4.5'])})
actual = self.read_csv(StringIO(data), dtype='category')
tm.assert_frame_equal(actual, expected)
# missing
data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
'b': Categorical(['b', np.nan, 'a']),
'c': Categorical(['3.4', '3.4', '4.5'])})
actual = self.read_csv(StringIO(data), dtype='category')
tm.assert_frame_equal(actual, expected)
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(self):
    """524289 distinct numeric strings are all kept as categories."""
    # GH 18186
    values = np.sort([str(number) for number in range(524289)])
    wanted = DataFrame({'a': Categorical(values, ordered=True)})

    csv_text = 'a\n' + '\n'.join(values)
    parsed = self.read_csv(StringIO(csv_text), dtype='category')

    # Put the parsed categories into sorted order before comparing.
    sorted_categories = np.sort(parsed.a.cat.categories)
    parsed["a"] = parsed["a"].cat.reorder_categories(sorted_categories,
                                                     ordered=True)
    tm.assert_frame_equal(parsed, wanted)
def test_categorical_dtype_encoding(self, datapath):
    """Categorical parsing matches a plain read converted afterwards.

    Checked on a latin-1 CSV file (one column) and on a UTF-16 table file
    (every column).
    """
    # GH 10153
    latin_path = datapath('io', 'parser', 'data', 'unicode_series.csv')
    wanted = self.read_csv(latin_path, header=None, encoding='latin-1')
    wanted[1] = Categorical(wanted[1])
    parsed = self.read_csv(latin_path, header=None, encoding='latin-1',
                           dtype={1: 'category'})
    tm.assert_frame_equal(parsed, wanted)

    utf16_path = datapath('io', 'parser', 'data', 'utf16_ex.txt')
    plain = self.read_table(utf16_path, encoding='utf-16')
    wanted = plain.apply(Categorical)
    parsed = self.read_table(utf16_path, encoding='utf-16',
                             dtype='category')
    tm.assert_frame_equal(parsed, wanted)
def test_categorical_dtype_chunksize(self):
# GH 10153
data = """a,b
1,a
1,b
1,b
2,c"""
expecteds = [pd.DataFrame({'a': [1, 1],
'b': Categorical(['a', 'b'])}),
pd.DataFrame({'a': [1, 2],
'b': Categorical(['b', 'c'])},
index=[2, 3])]
actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
chunksize=2)
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)
@pytest.mark.parametrize('ordered', [False, True])
@pytest.mark.parametrize('categories', [
    ['a', 'b', 'c'],
    ['a', 'c', 'b'],
    ['a', 'b', 'c', 'd'],
    ['c', 'b', 'a'],
])
def test_categorical_categoricaldtype(self, categories, ordered):
    """An explicit ``CategoricalDtype`` fixes the categories and their order."""
    data = """a,b
1,a
1,b
1,b
2,c"""
    column_b = Categorical(['a', 'b', 'b', 'c'],
                           categories=categories,
                           ordered=ordered)
    wanted = pd.DataFrame({"a": [1, 1, 1, 2], "b": column_b})

    b_dtype = CategoricalDtype(categories=categories, ordered=ordered)
    parsed = self.read_csv(StringIO(data), dtype={"b": b_dtype})

    tm.assert_frame_equal(parsed, wanted)
def test_categorical_categoricaldtype_unsorted(self):
data = """a,b
1,a
1,b
1,b
2,c"""
dtype = CategoricalDtype(['c', 'b', 'a'])
expected = pd.DataFrame({
'a': [1, 1, 1, 2],
'b': Categorical(['a', 'b', 'b', 'c'], categories=['c', 'b', 'a'])
})
result = self.read_csv(StringIO(data), dtype={'b': dtype})
tm.assert_frame_equal(result, expected)
def test_categoricaldtype_coerces_numeric(self):
dtype = {'b': CategoricalDtype([1, 2, 3])}
data = "b\n1\n1\n2\n3"
expected = pd.DataFrame({'b': Categorical([1, 1, 2, 3])})
result = self.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categoricaldtype_coerces_datetime(self):
    """Parsed text is coerced to the datetime categories that were given."""
    yearly = pd.date_range('2017', '2019', freq='AS')
    dtype_spec = {'b': CategoricalDtype(yearly)}
    csv_text = "b\n2017-01-01\n2018-01-01\n2019-01-01"
    wanted = pd.DataFrame({'b': Categorical(dtype_spec['b'].categories)})
    parsed = self.read_csv(StringIO(csv_text), dtype=dtype_spec)
    tm.assert_frame_equal(parsed, wanted)

    # Two different spellings of the same timestamp map to one category.
    stamp_2014 = pd.Timestamp("2014")
    dtype_spec = {'b': CategoricalDtype([stamp_2014])}
    csv_text = "b\n2014-01-01\n2014-01-01T00:00:00"
    wanted = pd.DataFrame({'b': Categorical([pd.Timestamp('2014')] * 2)})
    parsed = self.read_csv(StringIO(csv_text), dtype=dtype_spec)
    tm.assert_frame_equal(parsed, wanted)
def test_categoricaldtype_coerces_timedelta(self):
    """Parsed text is coerced to the timedelta categories that were given."""
    hourly = pd.to_timedelta(['1H', '2H', '3H'])
    dtype_spec = {'b': CategoricalDtype(hourly)}
    csv_text = "b\n1H\n2H\n3H"
    wanted = pd.DataFrame({'b': Categorical(dtype_spec['b'].categories)})

    parsed = self.read_csv(StringIO(csv_text), dtype=dtype_spec)

    tm.assert_frame_equal(parsed, wanted)
def test_categoricaldtype_unexpected_categories(self):
dtype = {'b': CategoricalDtype(['a', 'b', 'd', 'e'])}
data = "b\nd\na\nc\nd" # Unexpected c
expected = pd.DataFrame({"b": Categorical(list('dacd'),
dtype=dtype['b'])})
result = self.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_categoricaldtype_chunksize(self):
# GH 10153
data = """a,b
1,a
1,b
1,b
2,c"""
cats = ['a', 'b', 'c']
expecteds = [pd.DataFrame({'a': [1, 1],
'b': Categorical(['a', 'b'],
categories=cats)}),
pd.DataFrame({'a': [1, 2],
'b': Categorical(['b', 'c'],
categories=cats)},
index=[2, 3])]
dtype = CategoricalDtype(cats)
actuals = self.read_csv(StringIO(data), dtype={'b': dtype},
chunksize=2)
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)
def test_empty_pass_dtype(self):
data = 'one,two'
result = self.read_csv(StringIO(data), dtype={'one': 'u1'})
expected = DataFrame({'one': np.empty(0, dtype='u1'),
'two': np.empty(0, dtype=np.object)})
tm.assert_frame_equal(result, expected, check_index_type=False)
def test_empty_with_index_pass_dtype(self):
data = 'one,two'
result = self.read_csv(StringIO(data), index_col=['one'],
dtype={'one': 'u1', 1: 'f'})
expected = DataFrame({'two': np.empty(0, dtype='f')},
index=Index([], dtype='u1', name='one'))
tm.assert_frame_equal(result, expected, check_index_type=False)
def test_empty_with_multiindex_pass_dtype(self):
data = 'one,two,three'
result = self.read_csv(StringIO(data), index_col=['one', 'two'],
dtype={'one': 'u1', 1: 'f8'})
exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'),
np.empty(0, dtype='O')],
names=['one', 'two'])
expected = DataFrame(
{'three': np.empty(0, dtype=np.object)}, index=exp_idx)
tm.assert_frame_equal(result, expected, check_index_type=False)
def test_empty_with_mangled_column_pass_dtype_by_names(self):
data = 'one,one'
result = self.read_csv(StringIO(data), dtype={
'one': 'u1', 'one.1': 'f'})
expected = DataFrame(
{'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
tm.assert_frame_equal(result, expected, check_index_type=False)
def test_empty_with_mangled_column_pass_dtype_by_indexes(self):
data = 'one,one'
result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
expected = DataFrame(
{'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
tm.assert_frame_equal(result, expected, check_index_type=False)
def test_empty_with_dup_column_pass_dtype_by_indexes(self):
# Positional dtype keys combined with duplicated column labels. The same
# expected frame is compared against two different reads below.
# see gh-9424
expected = pd.concat([Series([], name='one', dtype='u1'),
Series([], name='one.1', dtype='f')], axis=1)
# First read: the duplicated labels come from the header row itself.
data = 'one,one'
result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
tm.assert_frame_equal(result, expected, check_index_type=False)
# Second read: the input is empty and the duplicated labels are passed via
# ``names``; a UserWarning is expected from the guarded statements.
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
data = ''
result = self.read_csv(StringIO(data), names=['one', 'one'],
dtype={0: 'u1', 1: 'f'})
tm.assert_frame_equal(result, expected, check_index_type=False)
def test_raise_on_passed_int_dtype_with_nas(self):
    """An integer dtype on a column with an empty field raises ValueError.

    see gh-2631
    """
    data = """YEAR, DOY, a
2001,106380451,10
2001,,11
2001,106380451,67"""
    source = StringIO(data)
    with pytest.raises(ValueError):
        self.read_csv(source, sep=",", skipinitialspace=True,
                      dtype={'DOY': np.int64})
def test_dtype_with_converter(self):
# Column 'a' is given both a dtype and a converter; the expected frame
# below holds the converter's string output for that column.
data = """a,b
1.1,2.2
1.2,2.3"""
# dtype spec ignored if converter specified
with tm.assert_produces_warning(ParserWarning):
result = self.read_csv(StringIO(data), dtype={'a': 'i8'},
converters={'a': lambda x: str(x)})
expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]})
tm.assert_frame_equal(result, expected)
def test_empty_dtype(self):
    """Check ``dtype`` handling when the input is a header with no rows.

    see gh-14712
    """
    data = 'a,b'

    # One numeric dtype applied to every column.
    wanted = pd.DataFrame(columns=['a', 'b'], dtype=np.float64)
    parsed = self.read_csv(StringIO(data), header=0, dtype=np.float64)
    tm.assert_frame_equal(parsed, wanted)

    # Categorical, requested globally and then per column.
    wanted = pd.DataFrame({'a': pd.Categorical([]),
                           'b': pd.Categorical([])},
                          index=[])
    parsed = self.read_csv(StringIO(data), header=0, dtype='category')
    tm.assert_frame_equal(parsed, wanted)
    parsed = self.read_csv(StringIO(data), header=0,
                           dtype={'a': 'category', 'b': 'category'})
    tm.assert_frame_equal(parsed, wanted)

    # Datetime dtype.
    wanted = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
    parsed = self.read_csv(StringIO(data), header=0,
                           dtype='datetime64[ns]')
    tm.assert_frame_equal(parsed, wanted)

    # Timedelta dtype.
    wanted = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'),
                           'b': pd.Series([], dtype='timedelta64[ns]')},
                          index=[])
    parsed = self.read_csv(StringIO(data), header=0,
                           dtype='timedelta64[ns]')
    tm.assert_frame_equal(parsed, wanted)

    # Only the first column typed: keyed by name, then by position.
    for key in ('a', 0):
        wanted = pd.DataFrame(columns=['a', 'b'])
        wanted['a'] = wanted['a'].astype(np.float64)
        parsed = self.read_csv(StringIO(data), header=0,
                               dtype={key: np.float64})
        tm.assert_frame_equal(parsed, wanted)

    # Name and position keys mixed in one mapping.
    wanted = pd.DataFrame(columns=['a', 'b'])
    wanted['a'] = wanted['a'].astype(np.int32)
    wanted['b'] = wanted['b'].astype(np.float64)
    parsed = self.read_csv(StringIO(data), header=0,
                           dtype={'a': np.int32, 1: np.float64})
    tm.assert_frame_equal(parsed, wanted)
def test_numeric_dtype(self):
    """Read a two-row column once per integer and float typecode."""
    data = '0\n1'
    typecodes = np.typecodes['AllInteger'] + np.typecodes['Float']
    for code in typecodes:
        wanted = pd.DataFrame([0, 1], dtype=code)
        parsed = self.read_csv(StringIO(data), header=None, dtype=code)
        tm.assert_frame_equal(wanted, parsed)
@@ -0,0 +1,312 @@
# -*- coding: utf-8 -*-
"""
Tests that the file header is properly handled or inferred
during parsing for all of the parsers defined in parsers.py
"""
import pytest
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Index, MultiIndex
from pandas.compat import StringIO, lrange, u
class HeaderTests(object):
def test_read_with_bad_header(self):
    """A header row index the input does not reach raises ValueError."""
    pattern = r"but only \d+ lines in file"
    with tm.assert_raises_regex(ValueError, pattern):
        source = StringIO(',,')
        self.read_csv(source, header=[10])
def test_bool_header_arg(self):
    """Boolean ``header`` values are rejected with TypeError.

    see gh-6114
    """
    data = """\
MyColumn
a
b
a
b"""
    for flag in [True, False]:
        # Same check for both reader entry points, read_csv first.
        for reader_name in ('read_csv', 'read_table'):
            with pytest.raises(TypeError):
                getattr(self, reader_name)(StringIO(data), header=flag)
def test_no_header_prefix(self):
    """Headerless input gets column labels built from ``prefix``."""
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    parsed = self.read_table(StringIO(data), sep=',', prefix='Field',
                             header=None)

    wanted_values = np.array([[1, 2, 3, 4, 5],
                              [6, 7, 8, 9, 10],
                              [11, 12, 13, 14, 15]], dtype=np.int64)
    tm.assert_almost_equal(parsed.values, wanted_values)

    wanted_columns = Index(['Field0', 'Field1', 'Field2',
                            'Field3', 'Field4'])
    tm.assert_index_equal(parsed.columns, wanted_columns)
def test_header_with_index_col(self):
    """Pass one name fewer than fields; the first field labels the rows."""
    data = """foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    column_names = ['A', 'B', 'C']
    parsed = self.read_csv(StringIO(data), names=column_names)
    assert list(parsed.columns) == ['A', 'B', 'C']

    wanted = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                       index=['foo', 'bar', 'baz'],
                       columns=['A', 'B', 'C'])
    tm.assert_frame_equal(parsed, wanted)
def test_header_not_first_line(self):
    """``header=2`` gives the same frame as the input without junk lines."""
    with_junk = """got,to,ignore,this,line
got,to,ignore,this,line
index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
    without_junk = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
    parsed = self.read_csv(StringIO(with_junk), header=2, index_col=0)
    wanted = self.read_csv(StringIO(without_junk), header=0, index_col=0)
    tm.assert_frame_equal(parsed, wanted)
def test_header_multi_index(self):
    """Parse a four-row column header together with a two-column index."""
    expected = tm.makeCustomDataframe(
        5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    parsed = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
                           index_col=[0, 1])
    tm.assert_frame_equal(parsed, expected)

    # skipping lines in the header
    # NOTE(review): this call uses the same arguments as the one above;
    # confirm whether a variant that really skips lines was intended.
    parsed = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
                           index_col=[0, 1])
    tm.assert_frame_equal(parsed, expected)

    # INVALID OPTIONS
    invalid_options = [
        # names
        {'index_col': [0, 1], 'names': ['foo', 'bar']},
        # usecols
        {'index_col': [0, 1], 'usecols': ['foo', 'bar']},
        # non-numeric index_col
        {'index_col': ['foo', 'bar']},
    ]
    for options in invalid_options:
        source = StringIO(data)
        with pytest.raises(ValueError):
            self.read_csv(source, header=[0, 1, 2, 3], **options)
def test_header_multiindex_common_format(self):
# Reads several textual layouts of a frame with two-level columns. ``df``
# is the reference frame for the well-formed layouts; the "malformed"
# layouts build their own ``expected`` frames.
df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
index=['one', 'two'],
columns=MultiIndex.from_tuples(
[('a', 'q'), ('a', 'r'), ('a', 's'),
('b', 't'), ('c', 'u'), ('c', 'v')]))
# to_csv
# (layout with a row of empty fields between the header rows and the data)
data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""
result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
tm.assert_frame_equal(df, result)
# common
# (same layout without the row of empty fields)
data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""
result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
tm.assert_frame_equal(df, result)
# common, no index_col
# (no index column in the text, so ``df`` is compared after dropping its
# row labels)
data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = self.read_csv(StringIO(data), header=[0, 1], index_col=None)
tm.assert_frame_equal(df.reset_index(drop=True), result)
# malformed case 1
# (header rows have no leading empty field; with index_col=0 the expected
# column level names are 'a' and 'q')
expected = DataFrame(np.array(
[[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'),
index=Index([1, 7]),
columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
[u('r'), u('s'), u('t'),
u('u'), u('v')]],
labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
names=[u('a'), u('q')]))
data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
tm.assert_frame_equal(expected, result)
# malformed case 2
# (only the first header row starts with an empty field; the expected
# column level names are None and 'q')
expected = DataFrame(np.array(
[[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'),
index=Index([1, 7]),
columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
[u('r'), u('s'), u('t'),
u('u'), u('v')]],
labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
names=[None, u('q')]))
data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
tm.assert_frame_equal(expected, result)
# mi on columns and index (malformed)
# (same text as malformed case 2, read with a two-column index)
expected = DataFrame(np.array(
[[3, 4, 5, 6], [9, 10, 11, 12]], dtype='int64'),
index=MultiIndex(levels=[[1, 7], [2, 8]],
labels=[[0, 1], [0, 1]]),
columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
[u('s'), u('t'), u('u'), u('v')]],
labels=[[0, 1, 2, 2], [0, 1, 2, 3]],
names=[None, u('q')]))
data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
tm.assert_frame_equal(expected, result)
def test_header_names_backward_compat(self):
    """``names`` alone matches ``names`` combined with ``header=None``.

    #2539
    """
    rows_only = '1,2,3\n4,5,6'
    parsed = self.read_csv(StringIO(rows_only), names=['a', 'b', 'c'])
    wanted = self.read_csv(StringIO(rows_only), names=['a', 'b', 'c'],
                           header=None)
    tm.assert_frame_equal(parsed, wanted)

    # With a header line present, ``header=0`` replaces it by ``names``.
    with_header = 'foo,bar,baz\n' + rows_only
    parsed = self.read_csv(StringIO(with_header), names=['a', 'b', 'c'],
                           header=0)
    tm.assert_frame_equal(parsed, wanted)
def test_read_only_header_no_rows(self):
    """A header line alone yields an empty frame with those columns.

    See gh-7773
    """
    wanted = DataFrame(columns=['a', 'b', 'c'])

    # Default arguments first, then an explicit ``index_col=False``.
    for extra in ({}, {'index_col': False}):
        parsed = self.read_csv(StringIO('a,b,c'), **extra)
        tm.assert_frame_equal(parsed, wanted)
def test_no_header(self):
    """Headerless input read with default, prefixed and named columns."""
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    names = ['foo', 'bar', 'baz', 'quux', 'panda']

    plain = self.read_table(StringIO(data), sep=',', header=None)
    prefixed = self.read_table(StringIO(data), sep=',', prefix='X',
                               header=None)
    named = self.read_table(StringIO(data), sep=',', names=names)

    wanted_values = np.array([[1, 2, 3, 4, 5],
                              [6, 7, 8, 9, 10],
                              [11, 12, 13, 14, 15]], dtype=np.int64)
    tm.assert_almost_equal(plain.values, wanted_values)
    tm.assert_almost_equal(plain.values, named.values)

    tm.assert_index_equal(prefixed.columns,
                          Index(['X0', 'X1', 'X2', 'X3', 'X4']))
    tm.assert_index_equal(plain.columns, Index(lrange(5)))
    tm.assert_index_equal(named.columns, Index(names))
def test_non_int_header(self):
    """Non-integer ``header`` values raise ValueError.

    GH 16338
    """
    msg = 'header must be integer or list of integers'
    data = """1,2\n3,4"""

    # A list of strings first, then a bare string.
    for bad_header in (['a', 'b'], 'string_header'):
        with tm.assert_raises_regex(ValueError, msg):
            self.read_csv(StringIO(data), sep=',', header=bad_header)
def test_singleton_header(self):
    """A one-element ``header`` list produces plain column labels.

    See GH #7757
    """
    source = StringIO("""a,b,c\n0,1,2\n1,2,3""")
    parsed = self.read_csv(source, header=[0])

    wanted = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
    tm.assert_frame_equal(parsed, wanted)
def test_mangles_multi_index(self):
    """Repeated labels in a two-row header are de-duplicated.

    See GH 18062
    """
    # Each case: (input text, expected data rows, expected column tuples).
    cases = [
        ("""A,A,A,B\none,one,one,two\n0,40,34,0.1""",
         [[0, 40, 34, 0.1]],
         [('A', 'one'), ('A', 'one.1'),
          ('A', 'one.2'), ('B', 'two')]),
        ("""A,A,A,B\none,one,one.1,two\n0,40,34,0.1""",
         [[0, 40, 34, 0.1]],
         [('A', 'one'), ('A', 'one.1'),
          ('A', 'one.1.1'), ('B', 'two')]),
        ("""A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1""",
         [[0, 40, 34, 0.1, 0.1]],
         [('A', 'one'), ('A', 'one.1'),
          ('A', 'one.1.1'), ('B', 'two'),
          ('B', 'two.1')]),
    ]
    for data, rows, column_tuples in cases:
        parsed = self.read_csv(StringIO(data), header=[0, 1])
        wanted = DataFrame(rows,
                           columns=MultiIndex.from_tuples(column_tuples))
        tm.assert_frame_equal(parsed, wanted)
@@ -0,0 +1,143 @@
# -*- coding: utf-8 -*-
"""
Tests that the specified index column (a.k.a 'index_col')
is properly handled or inferred during parsing for all of
the parsers defined in parsers.py
"""
import pytest
import pandas.util.testing as tm
from pandas import DataFrame, Index, MultiIndex
from pandas.compat import StringIO
class IndexColTests(object):
def test_index_col_named(self):
    """Select the index column by label instead of by position."""
    no_header = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""  # noqa
    header_line = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"  # noqa
    with_header = header_line + no_header

    by_label = self.read_csv(StringIO(with_header), index_col='ID')
    via_set_index = self.read_csv(
        StringIO(with_header), header=0).set_index('ID')
    tm.assert_frame_equal(by_label, via_set_index)

    # Without the header line there is no 'ID' label to resolve.
    headerless = StringIO(no_header)
    with pytest.raises(ValueError):
        self.read_csv(headerless, index_col='ID')

    data = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
    names = ['a', 'b', 'c', 'd', 'message']
    wanted = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10], 'c': [3, 7, 11],
                        'd': [4, 8, 12]},
                       index=Index(['hello', 'world', 'foo'],
                                   name='message'))

    # The label may be given inside a list or as a bare string.
    for index_col in (['message'], 'message'):
        parsed = self.read_csv(StringIO(data), names=names,
                               index_col=index_col)
        tm.assert_frame_equal(wanted, parsed)
        assert wanted.index.name == parsed.index.name
def test_index_col_is_true(self):
    """``index_col=True`` is rejected with ValueError.

    see gh-9798
    """
    source = StringIO(self.ts_data)
    with pytest.raises(ValueError):
        self.read_csv(source, index_col=True)
def test_infer_index_col(self):
    """Rows one field longer than the header use that field as index."""
    text = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    frame = self.read_csv(StringIO(text))
    wanted_index = Index(['foo', 'bar', 'baz'])
    assert frame.index.equals(wanted_index)
def test_empty_index_col_scenarios(self):
    """Header-only input read with each supported kind of ``index_col``."""
    data = 'x,y,z'

    # (index_col, expected) pairs compared with the default index check.
    single_cases = [
        # None, no index
        (None, DataFrame([], columns=list('xyz'))),
        # False, no index
        (False, DataFrame([], columns=list('xyz'))),
        # int, first column
        (0, DataFrame([], columns=['y', 'z'],
                      index=Index([], name='x'))),
        # int, not first column
        (1, DataFrame([], columns=['x', 'z'],
                      index=Index([], name='y'))),
        # str, first column
        ('x', DataFrame([], columns=['y', 'z'],
                        index=Index([], name='x'))),
        # str, not the first column
        ('y', DataFrame([], columns=['x', 'z'],
                        index=Index([], name='y'))),
    ]
    for index_col, expected in single_cases:
        result = self.read_csv(StringIO(data), index_col=index_col)
        tm.assert_frame_equal(result, expected)

    # (index_col, level names) pairs; the index type check is relaxed.
    multi_cases = [
        # list of int
        ([0, 1], ['x', 'y']),
        # list of str
        (['x', 'y'], ['x', 'y']),
        # list of int, reversed sequence
        ([1, 0], ['y', 'x']),
        # list of str, reversed sequence
        (['y', 'x'], ['y', 'x']),
    ]
    for index_col, level_names in multi_cases:
        expected = DataFrame([], columns=['z'],
                             index=MultiIndex.from_arrays(
                                 [[]] * 2, names=level_names))
        result = self.read_csv(StringIO(data), index_col=index_col)
        tm.assert_frame_equal(result, expected, check_index_type=False)
def test_empty_with_index_col_false(self):
    """Header-only input with ``index_col=False`` keeps every column.

    see gh-10413
    """
    parsed = self.read_csv(StringIO('x,y'), index_col=False)
    wanted = DataFrame([], columns=['x', 'y'])
    tm.assert_frame_equal(parsed, wanted)
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. In general, the expected result is that they are either thoroughly
de-duplicated (if mangling requested) or ignored otherwise.
"""
from pandas.compat import StringIO
from pandas import DataFrame
import pandas.util.testing as tm
class DupeColumnTests(object):
def test_basic(self):
    """Duplicate header labels get numeric suffixes in both readers."""
    # TODO: add test for condition "mangle_dupe_cols=False"
    # once it is actually supported (gh-12935)
    data = "a,a,b,b,b\n1,2,3,4,5"
    expected = ["a", "a.1", "b", "b.1", "b.2"]

    for method in ("read_csv", "read_table"):
        # Check default behavior.
        frame = getattr(self, method)(StringIO(data), sep=",")
        assert list(frame.columns) == expected

        # Same result when mangling is requested explicitly.
        frame = getattr(self, method)(StringIO(data), sep=",",
                                      mangle_dupe_cols=True)
        assert list(frame.columns) == expected
def test_basic_names(self):
# Duplicate labels taken from the header row and, in the second read,
# from the ``names`` argument are compared against the same frame.
# See gh-7160
data = "a,b,a\n0,1,2\n3,4,5"
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
columns=["a", "b", "a.1"])
df = self.read_csv(StringIO(data))
tm.assert_frame_equal(df, expected)
# Duplicates passed through ``names`` are expected to emit a UserWarning.
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
data = "0,1,2\n3,4,5"
df = self.read_csv(StringIO(data),
names=["a", "b", "a"])
tm.assert_frame_equal(df, expected)
def test_thorough_mangle_columns(self):
    """Mangled labels must not collide with labels already in the header.

    see gh-17060
    """
    # Each case: (input text, expected column labels).
    cases = [
        ("a,a,a.1\n1,2,3",
         ["a", "a.1", "a.1.1"]),
        ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
         ["a", "a.1", "a.1.1", "a.1.1.1",
          "a.1.1.1.1", "a.1.1.1.1.1"]),
        ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
         ["a", "a.1", "a.3", "a.1.1",
          "a.2", "a.2.1", "a.3.1"]),
    ]
    for data, wanted in cases:
        frame = self.read_csv(StringIO(data), sep=",",
                              mangle_dupe_cols=True)
        assert list(frame.columns) == wanted
def test_thorough_mangle_names(self):
# Same three scenarios as the header-based mangling test, but the
# duplicated labels are supplied through ``names``; each read is expected
# to emit a UserWarning.
# see gh-17095
data = "a,b,b\n1,2,3"
names = ["a.1", "a.1", "a.1.1"]
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
# Chain of labels that each already look like a mangled form.
data = "a,b,c,d,e,f\n1,2,3,4,5,6"
names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
"a.1.1.1.1", "a.1.1.1.1.1"]
# Several duplicates interleaved with suffixed labels.
data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
"a.2", "a.2.1", "a.3.1"]
@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
"""
Tests multithreading behaviour for reading and
parsing files for each parser defined in parsers.py
"""
from __future__ import division
from multiprocessing.pool import ThreadPool
import numpy as np
import pandas as pd
import pandas.util.testing as tm
from pandas import DataFrame
from pandas.compat import BytesIO, range
def _construct_dataframe(num_rows):
    """Build a mixed-dtype frame with ``num_rows`` rows.

    The frame has five random float columns ``a``-``e``, three constant
    string columns, a one-second-step ``date`` column and an ``int``
    column counting up from zero.
    """
    frame = DataFrame(np.random.rand(num_rows, 5), columns=list('abcde'))

    # Constant string columns, each filled with its own label.
    for label in ('foo', 'bar', 'baz'):
        frame[label] = label

    frame['date'] = pd.date_range('20000101 09:00:00',
                                  periods=num_rows,
                                  freq='s')
    frame['int'] = np.arange(num_rows, dtype='int64')
    return frame
class MultithreadTests(object):
    """Parser checks that read CSV data from several threads at once.

    The host class is expected to provide ``read_csv``.
    """

    def _generate_multithread_dataframe(self, path, num_rows, num_tasks):
        """Read ``path`` in ``num_tasks`` row slices on a thread pool.

        Parameters
        ----------
        path : str
            CSV file whose first line is a header and whose first
            column is the index.
        num_rows : int
            Number of data rows to split between the tasks.
        num_tasks : int
            Number of slices, and of worker threads.

        Returns
        -------
        DataFrame
            The slices concatenated in order, all carrying the column
            labels of the first slice.
        """
        def reader(arg):
            start, nrows = arg

            # Only the first slice sees the header line and can refer
            # to the date column by label.
            if not start:
                return self.read_csv(path, index_col=0, header=0,
                                     nrows=nrows, parse_dates=['date'])

            # Later slices skip the header plus the rows before them
            # and address the date column by position.
            return self.read_csv(path,
                                 index_col=0,
                                 header=None,
                                 skiprows=int(start) + 1,
                                 nrows=nrows,
                                 parse_dates=[9])

        tasks = [
            (num_rows * i // num_tasks,
             num_rows // num_tasks) for i in range(num_tasks)
        ]

        pool = ThreadPool(processes=num_tasks)
        try:
            results = pool.map(reader, tasks)
        finally:
            # Release the worker threads even when a reader raises.
            pool.close()
            pool.join()

        header = results[0].columns
        for r in results[1:]:
            r.columns = header

        final_dataframe = pd.concat(results)
        return final_dataframe

    def test_multithread_stringio_read_csv(self):
        """Identical in-memory files parsed concurrently give equal frames.

        see gh-11786
        """
        max_row_range = 10000
        num_files = 100

        # Every file carries the same payload, so build it once.
        payload = '\n'.join(
            ['%d,%d,%d' % (i, i, i) for i in range(max_row_range)]
        ).encode()
        files = [BytesIO(payload) for _ in range(num_files)]

        # read all files in many threads
        pool = ThreadPool(8)
        try:
            results = pool.map(self.read_csv, files)
        finally:
            # Release the worker threads even when a read raises.
            pool.close()
            pool.join()

        first_result = results[0]
        for result in results:
            tm.assert_frame_equal(first_result, result)

    def test_multithread_path_multipart_read_csv(self):
        """A file read back in parallel slices equals the frame written.

        see gh-11786
        """
        num_tasks = 4
        file_name = '__threadpool_reader__.csv'
        num_rows = 100000

        df = _construct_dataframe(num_rows)

        with tm.ensure_clean(file_name) as path:
            df.to_csv(path)

            final_dataframe = self._generate_multithread_dataframe(
                path, num_rows, num_tasks)
            tm.assert_frame_equal(df, final_dataframe)
@@ -0,0 +1,371 @@
# -*- coding: utf-8 -*-
"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""
import numpy as np
from numpy import nan
import pandas.io.common as com
import pandas.util.testing as tm
from pandas import DataFrame, Index, MultiIndex
from pandas.compat import StringIO, range
class NAvaluesTests(object):
def test_string_nas(self):
    """Empty fields in string columns are read as NaN."""
    data = """A,B,C
a,b,c
d,,f
,g,h
"""
    parsed = self.read_csv(StringIO(data))

    wanted_rows = [['a', 'b', 'c'],
                   ['d', np.nan, 'f'],
                   [np.nan, 'g', 'h']]
    wanted = DataFrame(wanted_rows, columns=['A', 'B', 'C'])
    tm.assert_frame_equal(parsed, wanted)
def test_detect_string_na(self):
    """The markers ``NA``, ``NaN`` and ``nan`` are all read as NaN."""
    data = """A,B
foo,bar
NA,baz
NaN,nan
"""
    wanted = np.array([['foo', 'bar'], [nan, 'baz'], [nan, nan]],
                      dtype=np.object_)
    frame = self.read_csv(StringIO(data))
    tm.assert_numpy_array_equal(frame.values, wanted)
def test_non_string_na_values(self):
    """Numeric and string spellings of -999 all act as NA markers."""
    # see gh-3611: with an odd float format, we can't match
    # the string '999.0' exactly but still need float matching
    nice = """A,B
-999,1.2
2,-999
3,4.5
"""
    ugly = """A,B
-999,1.200
2,-999.000
3,4.500
"""
    na_values_param = [['-999.0', '-999'],
                       [-999, -999.0],
                       [-999.0, -999],
                       ['-999.0'], ['-999'],
                       [-999.0], [-999]]
    expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
                          [3.0, 4.5]], columns=['A', 'B'])

    # Every marker list is tried against both number formats.
    combos = [(text, markers)
              for text in (nice, ugly)
              for markers in na_values_param]
    for text, markers in combos:
        out = self.read_csv(StringIO(text), na_values=markers)
        tm.assert_frame_equal(out, expected)
def test_default_na_values(self):
    """Each default NA marker, one per row on the diagonal, reads as NaN."""
    _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
                      '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null',
                      'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', ''])
    assert _NA_VALUES == com._NA_VALUES
    nv = len(_NA_VALUES)

    def f(i, v):
        # Row ``i``: ``i`` empty fields, the marker, then enough empty
        # fields to reach ``nv`` columns.
        leading = ',' * i
        trailing = ',' * (nv - i - 1)
        return "{0}{1}{2}".format(leading, v, trailing)

    lines = [f(i, v) for i, v in enumerate(_NA_VALUES)]
    data = StringIO('\n'.join(lines))
    expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
    df = self.read_csv(data, header=None)
    tm.assert_frame_equal(df, expected)
def test_custom_na_values(self):
    """A user NA marker is honoured alongside the default markers."""
    data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
    expected = np.array([[1., nan, 3],
                         [nan, 5, nan],
                         [7, 8, nan]])

    # (reader, reader-specific options); the marker is given as a list
    # twice and finally as a bare string.
    variants = [
        (self.read_csv, {'na_values': ['baz']}),
        (self.read_table, {'sep': ',', 'na_values': ['baz']}),
        (self.read_table, {'sep': ',', 'na_values': 'baz'}),
    ]
    for reader, options in variants:
        frame = reader(StringIO(data), skiprows=[1], **options)
        tm.assert_numpy_array_equal(frame.values, expected)
def test_bool_na_values(self):
    """Boolean columns that contain NA come back as object columns."""
    data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""
    parsed = self.read_csv(StringIO(data))

    col_a = np.array([True, nan, False], dtype=object)
    col_b = np.array([False, True, nan], dtype=object)
    wanted = DataFrame({'A': col_a,
                        'B': col_b,
                        'C': [True, False, True]})
    tm.assert_frame_equal(parsed, wanted)
def test_na_value_dict(self):
    """``na_values`` given as a dict, including an empty dict."""
    data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
    per_column = {'A': ['foo'], 'B': ['bar']}
    df = self.read_csv(StringIO(data), na_values=per_column)
    expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
                          'B': [np.nan, 'foo', np.nan, 'foo'],
                          'C': [np.nan, 'foo', np.nan, 'foo']})
    tm.assert_frame_equal(df, expected)

    data = """\
a,b,c,d
0,NA,1,5
"""
    # Empty dict with a single index column.
    xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0])
    xp.index.name = 'a'
    df = self.read_csv(StringIO(data), na_values={}, index_col=0)
    tm.assert_frame_equal(df, xp)

    # Empty dict with two index columns, by position and then by label.
    for index_col in ([0, 2], ['a', 'c']):
        xp = DataFrame({'b': [np.nan], 'd': [5]},
                       MultiIndex.from_tuples([(0, 1)]))
        xp.index.names = ['a', 'c']
        df = self.read_csv(StringIO(data), na_values={},
                           index_col=index_col)
        tm.assert_frame_equal(df, xp)
def test_na_values_keep_default(self):
# Reads the same text under several ``na_values`` / ``keep_default_na``
# combinations and compares each result with a hand-built frame.
data = """\
One,Two,Three
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
# Defaults: empty fields and 'nan' are expected as NaN.
df = self.read_csv(StringIO(data))
xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
'Two': [1, 2, 3, 4, 5, 6, 7],
'Three': ['one', 'two', 'three', np.nan, 'five',
np.nan, 'seven']})
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
# Empty per-column lists with keep_default_na=False: '' and 'nan' are
# expected to stay as strings.
df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []},
keep_default_na=False)
xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
'Two': [1, 2, 3, 4, 5, 6, 7],
'Three': ['one', 'two', 'three', 'nan', 'five',
'', 'seven']})
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
# Only the custom marker 'a' is expected as NaN.
df = self.read_csv(
StringIO(data), na_values=['a'], keep_default_na=False)
xp = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'],
'Two': [1, 2, 3, 4, 5, 6, 7],
'Three': ['one', 'two', 'three', 'nan', 'five', '',
'seven']})
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
# Empty per-column lists without keep_default_na: same expectation as the
# default read above.
df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []})
xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
'Two': [1, 2, 3, 4, 5, 6, 7],
'Three': ['one', 'two', 'three', np.nan, 'five',
np.nan, 'seven']})
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
# see gh-4318: passing na_values=None and
# keep_default_na=False yields 'None' as a na_value
# (the expected frame below keeps the text 'None' as a string)
data = """\
One,Two,Three
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
df = self.read_csv(
StringIO(data), keep_default_na=False)
xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
'Two': [1, 2, 3, 4, 5, 6, 7],
'Three': ['None', 'two', 'None', 'nan', 'five', '',
'seven']})
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
def test_no_keep_default_na_dict_na_values(self):
    """Dict ``na_values`` combined with ``keep_default_na=False``.

    see gh-19227
    """
    source = StringIO("a,b\n,2")
    df = self.read_csv(source, na_values={"b": ["2"]},
                       keep_default_na=False)
    expected = DataFrame({"a": [""], "b": [np.nan]})
    tm.assert_frame_equal(df, expected)

    # Scalar values shouldn't cause the parsing to crash or fail.
    source = StringIO("a,b\n1,2")
    df = self.read_csv(source, na_values={"b": 2},
                       keep_default_na=False)
    expected = DataFrame({"a": [1], "b": [np.nan]})
    tm.assert_frame_equal(df, expected)

    data = """\
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
729639,"qwer","",asdfkj,466.681,,252.373
"""
    expected = DataFrame({0: [np.nan, 729639.0],
                          1: [np.nan, "qwer"],
                          2: ["/blaha", np.nan],
                          3: ["kjsdkj", "asdfkj"],
                          4: [412.166, 466.681],
                          5: ["225.874", ""],
                          6: [np.nan, 252.373]})

    # The marker for column 0 is given as an int and then as a string.
    for first_marker in (113125, "113125"):
        df = self.read_csv(StringIO(data), header=None,
                           keep_default_na=False,
                           na_values={2: "", 6: "214.008",
                                      1: "blah", 0: first_marker})
        tm.assert_frame_equal(df, expected)
def test_na_values_na_filter_override(self):
    """``na_filter=False`` turns off both custom and default markers."""
    data = """\
A,B
1,A
nan,B
3,C
"""
    # Each case: (na_filter flag, expected rows).
    cases = [
        (True, [[1, 'A'], [np.nan, np.nan], [3, 'C']]),
        (False, [['1', 'A'], ['nan', 'B'], ['3', 'C']]),
    ]
    for flag, rows in cases:
        expected = DataFrame(rows, columns=['A', 'B'])
        out = self.read_csv(StringIO(data), na_values=['B'],
                            na_filter=flag)
        tm.assert_frame_equal(out, expected)
def test_na_trailing_columns(self):
    """Rows shorter than the header leave the trailing columns as NA."""
    data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax
2012-03-14,USD,AAPL,BUY,1000
2012-05-12,USD,SBUX,SELL,500"""
    frame = self.read_csv(StringIO(data))

    assert frame['Date'][1] == '2012-05-12'
    assert frame['UnitPrice'].isna().all()
def test_na_values_scalar(self):
    """Scalar ``na_values``, given globally and per column.

    see gh-12224
    """
    names = ['a', 'b']
    data = '1,2\n2,1'

    # Each case: (na_values argument, expected rows).
    cases = [
        (1, [[np.nan, 2.0], [2.0, np.nan]]),
        ({'a': 2, 'b': 1}, [[1.0, 2.0], [np.nan, np.nan]]),
    ]
    for markers, rows in cases:
        expected = DataFrame(rows, columns=names)
        out = self.read_csv(StringIO(data), names=names,
                            na_values=markers)
        tm.assert_frame_equal(out, expected)
def test_na_values_dict_aliasing(self):
    """The caller's ``na_values`` dict is unchanged after the read."""
    na_values = {'a': 2, 'b': 1}
    snapshot = na_values.copy()

    names = ['a', 'b']
    source = StringIO('1,2\n2,1')
    expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)

    out = self.read_csv(source, names=names, na_values=na_values)
    tm.assert_frame_equal(out, expected)
    tm.assert_dict_equal(na_values, snapshot)
def test_na_values_dict_col_index(self):
    """Dict ``na_values`` may be keyed by column position.

    see gh-14203
    """
    source = StringIO('a\nfoo\n1')
    parsed = self.read_csv(source, na_values={0: 'foo'})

    wanted = DataFrame({'a': [np.nan, 1]})
    tm.assert_frame_equal(parsed, wanted)
def test_na_values_uint64(self):
    """Values at and above 2**63 next to NA handling stay as strings.

    see gh-14983
    """
    big = 2**63

    # An NA marker equal to one of the large values.
    data = str(big) + '\n' + str(big + 1)
    expected = DataFrame([str(big), str(big + 1)])
    out = self.read_csv(StringIO(data), header=None, na_values=[big])
    tm.assert_frame_equal(out, expected)

    # A large value in a column that also has an empty field.
    data = str(big) + ',1' + '\n,2'
    expected = DataFrame([[str(big), 1], ['', 2]])
    out = self.read_csv(StringIO(data), header=None)
    tm.assert_frame_equal(out, expected)
def test_empty_na_values_no_default_with_index(self):
    """``keep_default_na=False`` combined with an index column.

    see gh-15835
    """
    source = StringIO("a,1\nb,2")
    parsed = self.read_csv(source, keep_default_na=False, index_col=0)

    wanted = DataFrame({'1': [2]}, index=Index(["b"], name="a"))
    tm.assert_frame_equal(parsed, wanted)
def test_no_na_filter_on_index(self):
    """``na_filter`` also governs values that end up in the index.

    see gh-5239
    """
    data = "a,b,c\n1,,3\n4,5,6"

    # Each case: (na_filter flag, expected index values).
    cases = [
        # Don't parse NA-values in index when na_filter=False.
        (False, ["", "5"]),
        # Parse NA-values in index when na_filter=True.
        (True, [np.nan, 5.0]),
    ]
    for flag, index_values in cases:
        out = self.read_csv(StringIO(data), index_col=[1], na_filter=flag)
        expected = DataFrame({"a": [1, 4], "c": [3, 6]},
                             index=Index(index_values, name="b"))
        tm.assert_frame_equal(out, expected)
@@ -0,0 +1,676 @@
# -*- coding: utf-8 -*-
"""
Tests date parsing functionality for all of the
parsers defined in parsers.py
"""
from distutils.version import LooseVersion
from datetime import datetime, date
import pytest
import numpy as np
from pandas._libs.tslibs import parsing
from pandas._libs.tslib import Timestamp
import pandas as pd
import pandas.io.parsers as parsers
import pandas.core.tools.datetimes as tools
import pandas.util.testing as tm
import pandas.io.date_converters as conv
from pandas import DataFrame, Series, Index, DatetimeIndex, MultiIndex
from pandas import compat
from pandas.compat import parse_date, StringIO, lrange
from pandas.compat.numpy import np_array_datetime64_compat
from pandas.core.indexes.datetimes import date_range
class ParseDatesTests(object):
def test_separator_date_conflict(self):
    """A '-' thousands separator must not disturb date parsing."""
    # Regression test for gh-4678: make sure thousands separator and
    # date parsing do not conflict.
    source = StringIO('06-02-2013;13:00;1-000.215')
    parsed = self.read_csv(source, sep=';', thousands='-',
                           parse_dates={'Date': [0, 1]}, header=None)

    wanted = DataFrame(
        [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
        columns=['Date', 2]
    )
    tm.assert_frame_equal(parsed, wanted)
def test_multiple_date_col(self):
    """Exercise multi-column date parsing on header-less data.

    Covers dict- and list-style ``parse_dates`` specs, with and without
    ``prefix`` / ``keep_date_col``, plus a single pre-combined date
    column used as the index.
    """
    raw = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    first_stamp = datetime(1999, 1, 27, 19, 0)

    def combine(*date_cols):
        joined = parsers._concat_date_cols(date_cols)
        return parsing.try_parse_dates(joined)

    # Named (dict) spec with a custom parser and a column prefix: only
    # the combined columns are expected, not the prefixed sources.
    frame = self.read_csv(StringIO(raw), header=None, prefix='X',
                          date_parser=combine,
                          parse_dates={'nominal': [1, 2],
                                       'actual': [1, 3]})
    for present in ('nominal', 'actual'):
        assert present in frame
    for absent in ('X1', 'X2', 'X3'):
        assert absent not in frame
    assert frame.loc[0, 'nominal'] == first_stamp

    # Same spec, but keep the source columns (positional labels).
    frame = self.read_csv(StringIO(raw), header=None,
                          date_parser=combine, keep_date_col=True,
                          parse_dates={'nominal': [1, 2],
                                       'actual': [1, 3]})
    for present in ('nominal', 'actual', 1, 2, 3):
        assert present in frame

    # List-of-lists spec: combined columns get joined names.
    frame = self.read_csv(StringIO(raw), header=None, prefix='X',
                          parse_dates=[[1, 2], [1, 3]])
    for present in ('X1_X2', 'X1_X3'):
        assert present in frame
    for absent in ('X1', 'X2', 'X3'):
        assert absent not in frame
    assert frame.loc[0, 'X1_X2'] == first_stamp

    frame = self.read_csv(StringIO(raw), header=None,
                          keep_date_col=True,
                          parse_dates=[[1, 2], [1, 3]])
    for present in ('1_2', '1_3', 1, 2, 3):
        assert present in frame

    # Date and time already share one field; parse it and index by it.
    combined = """\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
"""
    frame = self.read_csv(StringIO(combined), header=None, sep=',',
                          index_col=1, parse_dates=[1])
    assert frame.index[0] == first_stamp
def test_multiple_date_cols_int_cast(self):
    """Smoke-test ``conv.parse_date_time`` with a two-entry date spec.

    The test only checks that parsing completes and that the combined
    'nominal' column is present in the result.
    """
    data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
            "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
            "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
            "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
            "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
            "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
    date_spec = {'nominal': [1, 2], 'actual': [1, 3]}

    # ``conv`` is the module-level import of pandas.io.date_converters;
    # no function-scope re-import is needed.
    # it works!
    df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec,
                       date_parser=conv.parse_date_time)
    assert 'nominal' in df
def test_multiple_date_col_timestamp_parse(self):
    """Combine a date and a time column using ``Timestamp`` as parser."""
    raw = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""
    parsed = self.read_csv(StringIO(raw), header=None, sep=',',
                           date_parser=Timestamp, parse_dates=[[0, 1]])

    # Columns 0 and 1 end up under the joined name '0_1'.
    first_value = parsed['0_1'][0]
    assert first_value == Timestamp('05/31/2012 15:30:00.029')
def test_multiple_date_cols_with_header(self):
    """Merged date columns must not stay strings when a header exists."""
    raw = """\
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
    frame = self.read_csv(StringIO(raw), parse_dates={'nominal': [1, 2]})

    first_nominal = frame.nominal[0]
    assert not isinstance(first_nominal, compat.string_types)
# Class-level CSV sample (header row plus six KORD records) shared by the
# tests in this class that read ``self.ts_data``.
ts_data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
def test_multiple_date_col_name_collision(self):
    """A combined date column clashing with an existing name must raise."""
    # Explicit target name 'ID' is already a column in the header.
    pytest.raises(ValueError, self.read_csv, StringIO(self.ts_data),
                  parse_dates={'ID': [1, 2]})

    # Here the header already contains 'date_NominalTime', the joined
    # name of the two columns being combined.
    clashing = """\
date_NominalTime,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""  # noqa
    pytest.raises(ValueError, self.read_csv, StringIO(clashing),
                  parse_dates=[[1, 2]])
def test_date_parser_int_bug(self):
    """Parse a POSIX-timestamp index with a custom parser (gh-3071)."""
    def from_posix(posix_string):
        return datetime.utcfromtimestamp(int(posix_string))

    header = ('posix_timestamp,elapsed,sys,user,queries,query_time,rows,'
              'accountid,userid,contactid,level,silo,method\n')
    record = ('1343103150,0.062353,0,4,6,0.01690,3,'
              '12345,1,-1,3,invoice_InvoiceResource,search\n')
    log_file = StringIO(header + record)

    # Smoke test only: the call has to complete without raising.
    self.read_csv(log_file, date_parser=from_posix,
                  parse_dates=[0], index_col=0)
def test_nat_parse(self):
    """Round-trip a frame with NaN-filled rows through CSV (gh-3062)."""
    frame = DataFrame({'A': np.asarray(lrange(10), dtype='float64'),
                       'B': pd.Timestamp('20010101')})
    frame.iloc[3:6, :] = np.nan
    wanted_dtypes = Series(dict(A='float64', B='datetime64[ns]'))

    with tm.ensure_clean('__nat_parse_.csv') as path:
        frame.to_csv(path)
        loaded = self.read_csv(path, parse_dates=['B'], index_col=0)
        tm.assert_frame_equal(loaded, frame)
        tm.assert_series_equal(wanted_dtypes, loaded.dtypes)

        # Second pass for the NaT representation: there is no way to
        # pass a datetime-specific na_rep, so the default ('') is used.
        frame.to_csv(path)
        reloaded = self.read_csv(path, parse_dates=['B'], index_col=0)
        tm.assert_frame_equal(reloaded, frame)
def test_csv_custom_parser(self):
    """A custom ``date_parser`` alone should match ``parse_dates=True``."""
    raw = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""

    def ymd(value):
        return datetime.strptime(value, '%Y%m%d')

    with_parser = self.read_csv(StringIO(raw), date_parser=ymd)
    baseline = self.read_csv(StringIO(raw), parse_dates=True)
    tm.assert_frame_equal(with_parser, baseline)
def test_parse_dates_implicit_first_col(self):
    """``parse_dates=True`` parses the implicit first-column index."""
    raw = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    implicit = self.read_csv(StringIO(raw), parse_dates=True)
    explicit = self.read_csv(StringIO(raw), parse_dates=True, index_col=0)

    date_like = (datetime, np.datetime64, Timestamp)
    assert isinstance(implicit.index[0], date_like)
    tm.assert_frame_equal(implicit, explicit)
def test_parse_dates_string(self):
    """Parse a date column selected by name and use it as the index."""
    raw = """date,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    parsed = self.read_csv(StringIO(raw), parse_dates=['date'],
                           index_col='date')

    wanted_index = date_range('1/1/2009', periods=3)
    wanted_index.name = 'date'
    columns = {'A': ['a', 'b', 'c'],
               'B': [1, 3, 4],
               'C': [2, 4, 5]}
    wanted = DataFrame(columns, index=wanted_index)
    tm.assert_frame_equal(parsed, wanted)
def test_yy_format_with_yearfirst(self):
    """Combine YYMMDD and HHMM columns into a 'date_time' index.

    See gh-217.  The test is skipped when the installed dateutil is
    2.5.0 or newer.
    """
    data = """date,time,B,C
090131,0010,1,2
090228,1020,3,4
090331,0830,5,6
"""

    # See gh-217
    import dateutil
    if LooseVersion(dateutil.__version__) >= LooseVersion('2.5.0'):
        # The adjacent literals previously lacked separating spaces and
        # misspelled "dateutil", producing a garbled skip reason.
        pytest.skip("testing yearfirst=True not-support "
                    "on dateutil < 2.5.0 this works but "
                    "is wrong")

    # Both spellings of the column spec must give the same result, so
    # the expected frame is built once.
    idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                         datetime(2009, 2, 28, 10, 20, 0),
                         datetime(2009, 3, 31, 8, 30, 0)],
                        dtype=object, name='date_time')
    xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)

    rs = self.read_csv(StringIO(data), index_col=0,
                       parse_dates=[['date', 'time']])
    tm.assert_frame_equal(rs, xp)

    rs = self.read_csv(StringIO(data), index_col=0,
                       parse_dates=[[0, 1]])
    tm.assert_frame_equal(rs, xp)
def test_parse_dates_column_list(self):
    """Date columns may be listed by position or by name."""
    raw = 'a,b,c\n01/01/2010,1,15/02/2010'

    wanted = DataFrame({'a': [datetime(2010, 1, 1)],
                        'b': [1],
                        'c': [datetime(2010, 2, 15)]})
    wanted = wanted.set_index(['a', 'b'])

    # Positional spec first, then the equivalent by-name spec.
    for spec in ([0, 2], ['a', 'c']):
        parsed = self.read_csv(StringIO(raw), dayfirst=True,
                               index_col=[0, 1], parse_dates=spec)
        tm.assert_frame_equal(parsed, wanted)
def test_multi_index_parse_dates(self):
    """``parse_dates=True`` parses the date level of a MultiIndex."""
    raw = """index1,index2,A,B,C
20090101,one,a,1,2
20090101,two,b,3,4
20090101,three,c,4,5
20090102,one,a,1,2
20090102,two,b,3,4
20090102,three,c,4,5
20090103,one,a,1,2
20090103,two,b,3,4
20090103,three,c,4,5
"""
    date_like = (datetime, np.datetime64, Timestamp)

    in_order = self.read_csv(StringIO(raw), parse_dates=True,
                             index_col=[0, 1])
    assert isinstance(in_order.index.levels[0][0], date_like)

    # Index columns given out of order: the date level is now level 1.
    swapped = self.read_csv(StringIO(raw), parse_dates=True,
                            index_col=[1, 0])
    assert isinstance(swapped.index.levels[1][0], date_like)
def test_parse_dates_custom_euroformat(self):
    """Use a day-first custom parser; a broken parser must raise."""
    text = """foo,bar,baz
31/01/2010,1,2
01/02/2010,1,NA
02/02/2010,1,2
"""
    col_names = ['time', 'Q', 'NTU']

    def day_first(d):
        return parse_date(d, dayfirst=True)

    parsed = self.read_csv(StringIO(text), header=0, names=col_names,
                           index_col=0, parse_dates=True,
                           date_parser=day_first, na_values=['NA'])

    wanted_index = Index([datetime(2010, 1, 31),
                          datetime(2010, 2, 1),
                          datetime(2010, 2, 2)], name='time')
    wanted = DataFrame({'Q': [1, 1, 1], 'NTU': [2, np.nan, 2]},
                       index=wanted_index, columns=['Q', 'NTU'])
    tm.assert_frame_equal(parsed, wanted)

    # This parser passes 'day_first' rather than 'dayfirst'; the read
    # is expected to fail with TypeError.
    def misspelled(d):
        return parse_date(d, day_first=True)

    with pytest.raises(TypeError):
        self.read_csv(StringIO(text), skiprows=[0], names=col_names,
                      index_col=0, parse_dates=True,
                      date_parser=misspelled, na_values=['NA'])
def test_parse_tz_aware(self):
    """Parse an ISO-8601 timestamp with a 'Z' suffix (gh-1693).

    The parsed index should either be UTC-aware or, once converted to
    UTC, agree with the parsed stamp on minute, hour and day.
    """
    import pytz

    data = StringIO("Date,x\n2012-06-13T01:39:00Z,0.5")

    # it works
    result = self.read_csv(data, index_col=0, parse_dates=True)
    stamp = result.index[0]
    assert stamp.minute == 39

    # Branch explicitly rather than raising and catching AssertionError,
    # so the fallback does not depend on assert statements being active.
    if result.index.tz is not pytz.utc:
        arr = result.index.to_pydatetime()
        as_utc = tools.to_datetime(arr, utc=True)[0]
        assert stamp.minute == as_utc.minute
        assert stamp.hour == as_utc.hour
        assert stamp.day == as_utc.day
def test_multiple_date_cols_index(self):
    """A combined date column can serve as ``index_col``."""
    raw = """
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    unindexed = self.read_csv(StringIO(raw),
                              parse_dates={'nominal': [1, 2]})
    by_name = self.read_csv(StringIO(raw), index_col='nominal',
                            parse_dates={'nominal': [1, 2]})
    tm.assert_frame_equal(unindexed.set_index('nominal'), by_name)

    # Selecting the combined column by position gives the same frame.
    by_position = self.read_csv(StringIO(raw), index_col=0,
                                parse_dates={'nominal': [1, 2]})
    tm.assert_frame_equal(by_position, by_name)

    # List-style spec: same data, index name is not compared.
    from_list = self.read_csv(StringIO(raw), index_col=0,
                              parse_dates=[[1, 2]])
    tm.assert_frame_equal(from_list, by_name, check_names=False)
def test_multiple_date_cols_chunked(self):
    """Chunked reads with combined date columns match a full read."""
    whole = self.read_csv(StringIO(self.ts_data), index_col='nominal',
                          parse_dates={'nominal': [1, 2]})
    reader = self.read_csv(StringIO(self.ts_data), index_col='nominal',
                           parse_dates={'nominal': [1, 2]}, chunksize=2)
    chunks = list(reader)

    assert 'nominalTime' not in whole

    # Chunks of two rows each must line up with slices of the full frame.
    bounds = [(None, 2), (2, 4), (4, None)]
    for pos, (start, stop) in enumerate(bounds):
        tm.assert_frame_equal(chunks[pos], whole[start:stop])
def test_multiple_date_col_named_components(self):
    """Component columns may be given by name instead of by position."""
    by_position = self.read_csv(StringIO(self.ts_data),
                                index_col='nominal',
                                parse_dates={'nominal': [1, 2]})

    named_spec = {'nominal': ['date', 'nominalTime']}
    by_name = self.read_csv(StringIO(self.ts_data), index_col='nominal',
                            parse_dates=named_spec)
    tm.assert_frame_equal(by_name, by_position)
def test_multiple_date_col_multiple_index(self):
    """A combined date column can be one level of a multi-column index."""
    index_cols = ['nominal', 'ID']
    indexed = self.read_csv(StringIO(self.ts_data), index_col=index_cols,
                            parse_dates={'nominal': [1, 2]})
    flat = self.read_csv(StringIO(self.ts_data),
                         parse_dates={'nominal': [1, 2]})
    tm.assert_frame_equal(flat.set_index(['nominal', 'ID']), indexed)
def test_read_with_parse_dates_scalar_non_bool(self):
    """A non-boolean scalar ``parse_dates`` is rejected (gh-5636)."""
    data = """A,B,C
1,2,2003-11-1"""
    errmsg = ("Only booleans, lists, and "
              "dictionaries are accepted "
              "for the 'parse_dates' parameter")

    with tm.assert_raises_regex(TypeError, errmsg):
        self.read_csv(StringIO(data), parse_dates="C")

    # Same failure when the column is also requested as the index.
    with tm.assert_raises_regex(TypeError, errmsg):
        self.read_csv(StringIO(data), parse_dates="C", index_col="C")
def test_read_with_parse_dates_invalid_type(self):
    """Tuples, arrays and sets are rejected as ``parse_dates``."""
    data = """A,B,C
1,2,2003-11-1"""
    errmsg = ("Only booleans, lists, and "
              "dictionaries are accepted "
              "for the 'parse_dates' parameter")

    rejected = [(1,), np.array([4, 5]), set([1, 3, 3])]
    for bad_spec in rejected:
        with tm.assert_raises_regex(TypeError, errmsg):
            self.read_csv(StringIO(data), parse_dates=bad_spec)
def test_parse_dates_empty_string(self):
    """An empty date field is missing even with ``na_filter=False``.

    See gh-2263.
    """
    raw = "Date, test\n2012-01-01, 1\n,2"
    parsed = self.read_csv(StringIO(raw), na_filter=False,
                           parse_dates=["Date"])
    missing = parsed['Date'].isna()
    assert missing[1]
def test_parse_dates_noconvert_thousands(self):
    """A '.' thousands separator must not corrupt date columns.

    See gh-14066.
    """
    # One date column, parsed as a regular column.
    single = 'a\n04.15.2016'
    wanted = DataFrame([datetime(2016, 4, 15)], columns=['a'])
    parsed = self.read_csv(StringIO(single), thousands='.',
                           parse_dates=['a'])
    tm.assert_frame_equal(parsed, wanted)

    # One date column, parsed as the index.
    wanted = DataFrame(index=DatetimeIndex(['2016-04-15'], name='a'))
    parsed = self.read_csv(StringIO(single), thousands='.',
                           parse_dates=True, index_col=0)
    tm.assert_frame_equal(parsed, wanted)

    # Two date columns, parsed as regular columns.
    double = 'a,b\n04.15.2016,09.16.2013'
    both_dates = [[datetime(2016, 4, 15), datetime(2013, 9, 16)]]
    wanted = DataFrame(both_dates, columns=['a', 'b'])
    parsed = self.read_csv(StringIO(double), thousands='.',
                           parse_dates=['a', 'b'])
    tm.assert_frame_equal(parsed, wanted)

    # Two date columns, parsed as a two-level index.
    wanted = DataFrame(both_dates, columns=['a', 'b'])
    wanted = wanted.set_index(['a', 'b'])
    parsed = self.read_csv(StringIO(double), thousands='.',
                           parse_dates=True, index_col=[0, 1])
    tm.assert_frame_equal(parsed, wanted)
def test_parse_date_time_multi_level_column_name(self):
    """Combine date/time columns when the header spans two rows."""
    raw = """\
D,T,A,B
date, time,a,b
2001-01-05, 09:00:00, 0.0, 10.
2001-01-06, 00:00:00, 1.0, 11.
"""
    parsed = self.read_csv(StringIO(raw), sep=',', header=[0, 1],
                           date_parser=conv.parse_date_time,
                           parse_dates={'date_time': [0, 1]})

    # Remaining columns keep their two-level labels as tuples.
    rows = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.],
            [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]]
    labels = ['date_time', ('A', 'a'), ('B', 'b')]
    wanted = DataFrame(rows, columns=labels)
    tm.assert_frame_equal(parsed, wanted)
def test_parse_date_time(self):
    """Check ``conv.parse_date_time`` directly and through ``read_csv``."""
    # Direct call on object arrays of date and time strings.
    dates = np.array(['2007/1/3', '2008/2/4'], dtype=object)
    times = np.array(['05:07:09', '06:08:00'], dtype=object)
    expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
                         datetime(2008, 2, 4, 6, 8, 0)])

    result = conv.parse_date_time(dates, times)
    assert (result == expected).all()

    # As a date_parser on data with a header row.
    data = """\
date, time, a, b
2001-01-05, 10:00:00, 0.0, 10.
2001-01-05, 00:00:00, 1., 11.
"""
    datecols = {'date_time': [0, 1]}
    df = self.read_csv(StringIO(data), sep=',', header=0,
                       parse_dates=datecols,
                       date_parser=conv.parse_date_time)
    assert 'date_time' in df
    assert df.date_time.loc[0] == datetime(2001, 1, 5, 10, 0, 0)

    # As a date_parser on header-less data with two combined columns.
    data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
            "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
            "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
            "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
            "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
            "KORD,19990127, 23:00:00, 22:56:00, -0.5900")

    date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
    df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec,
                       date_parser=conv.parse_date_time)
    # Previously the result was assigned and never inspected; check
    # that both combined columns were actually produced.
    assert 'nominal' in df
    assert 'actual' in df
def test_parse_date_fields(self):
    """Check ``conv.parse_date_fields`` directly and via ``read_csv``."""
    # Direct call with year / month / day arrays.
    combined = conv.parse_date_fields(np.array([2007, 2008]),
                                      np.array([1, 2]),
                                      np.array([3, 4]))
    wanted = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)])
    assert (combined == wanted).all()

    # As a date_parser merging three columns into 'ymd'.
    raw = ("year, month, day, a\n 2001 , 01 , 10 , 10.\n"
           "2001 , 02 , 1 , 11.")
    frame = self.read_csv(StringIO(raw), sep=',', header=0,
                          date_parser=conv.parse_date_fields,
                          parse_dates={'ymd': [0, 1, 2]})
    assert 'ymd' in frame
    assert frame.ymd.loc[0] == datetime(2001, 1, 10)
def test_datetime_six_col(self):
    """Check ``conv.parse_all_fields`` with six date/time components."""
    # Direct call: one array per component.
    components = [np.array([2007, 2008]),   # years
                  np.array([1, 2]),         # months
                  np.array([3, 4]),         # days
                  np.array([5, 6]),         # hours
                  np.array([7, 8]),         # minutes
                  np.array([9, 0])]         # seconds
    wanted = np.array([datetime(2007, 1, 3, 5, 7, 9),
                       datetime(2008, 2, 4, 6, 8, 0)])

    combined = conv.parse_all_fields(*components)
    assert (combined == wanted).all()

    # As a date_parser merging six columns into 'ymdHMS'.
    raw = """\
year, month, day, hour, minute, second, a, b
2001, 01, 05, 10, 00, 0, 0.0, 10.
2001, 01, 5, 10, 0, 00, 1., 11.
"""
    frame = self.read_csv(StringIO(raw), sep=',', header=0,
                          date_parser=conv.parse_all_fields,
                          parse_dates={'ymdHMS': [0, 1, 2, 3, 4, 5]})
    assert 'ymdHMS' in frame
    assert frame.ymdHMS.loc[0] == datetime(2001, 1, 5, 10, 0, 0)
def test_datetime_fractional_seconds(self):
    """Fractional seconds are kept as microseconds by the parser."""
    raw = """\
year, month, day, hour, minute, second, a, b
2001, 01, 05, 10, 00, 0.123456, 0.0, 10.
2001, 01, 5, 10, 0, 0.500000, 1., 11.
"""
    frame = self.read_csv(StringIO(raw), sep=',', header=0,
                          date_parser=conv.parse_all_fields,
                          parse_dates={'ymdHMS': [0, 1, 2, 3, 4, 5]})
    assert 'ymdHMS' in frame

    stamps = frame.ymdHMS
    assert stamps.loc[0] == datetime(2001, 1, 5, 10, 0, 0,
                                     microsecond=123456)
    assert stamps.loc[1] == datetime(2001, 1, 5, 10, 0, 0,
                                     microsecond=500000)
def test_generic(self):
    """An arbitrary two-argument callable can act as ``date_parser``."""
    raw = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11."

    def first_of_month(y, m):
        return date(year=int(y), month=int(m), day=1)

    frame = self.read_csv(StringIO(raw), sep=',', header=0,
                          date_parser=first_of_month,
                          parse_dates={'ym': [0, 1]})
    assert 'ym' in frame
    assert frame.ym.loc[0] == date(2001, 1, 1)
def test_dateparser_resolution_if_not_ns(self):
    """A parser returning datetime64[s] values works as an index level.

    See GH 10245.
    """
    raw = """\
date,time,prn,rxstatus
2013-11-03,19:00:00,126,00E80000
2013-11-03,19:00:00,23,00E80000
2013-11-03,19:00:00,13,00E80000
"""

    def date_parser(date, time):
        # Build ISO strings and convert them at second resolution.
        return np_array_datetime64_compat(date + 'T' + time + 'Z',
                                          dtype='datetime64[s]')

    parsed = self.read_csv(StringIO(raw), date_parser=date_parser,
                           index_col=['datetime', 'prn'],
                           parse_dates={'datetime': ['date', 'time']})

    stamps = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3,
                                        dtype='datetime64[s]')
    index_rows = [(stamps[0], 126), (stamps[1], 23), (stamps[2], 13)]
    wanted_index = MultiIndex.from_tuples(index_rows,
                                          names=['datetime', 'prn'])
    wanted = DataFrame(data={'rxstatus': ['00E80000'] * 3},
                       index=wanted_index)
    tm.assert_frame_equal(parsed, wanted)
def test_parse_date_column_with_empty_string(self):
    """A date column containing a blank entry is left as strings.

    See GH 6428.
    """
    raw = """case,opdate
7,10/18/2006
7,10/18/2008
621, """
    parsed = self.read_csv(StringIO(raw), parse_dates=['opdate'])

    rows = [[7, '10/18/2006'],
            [7, '10/18/2008'],
            [621, ' ']]
    wanted = DataFrame(rows, columns=['case', 'opdate'])
    tm.assert_frame_equal(parsed, wanted)
@pytest.mark.parametrize("data,expected", [
    (
        "a\n135217135789158401\n1352171357E+5",
        DataFrame({"a": [135217135789158401, 135217135700000]},
                  dtype="float64"),
    ),
    (
        "a\n99999999999\n123456789012345\n1234E+0",
        DataFrame({"a": [99999999999, 123456789012345, 1234]},
                  dtype="float64"),
    ),
])
@pytest.mark.parametrize("parse_dates", [True, False])
def test_parse_date_float(self, data, expected, parse_dates):
    """Numeric columns stay float64 whether or not dates are parsed.

    See gh-2697: date parsing is expected to fail on this data, so the
    values (and their float precision) must come back unchanged.
    """
    parsed = self.read_csv(StringIO(data), parse_dates=parse_dates)
    tm.assert_frame_equal(parsed, expected)
@@ -0,0 +1,263 @@
# -*- coding: utf-8 -*-
"""
Tests that apply specifically to the Python parser. Unless specifically
stated as a Python-specific issue, the goal is to eventually move as many of
these tests out of this module as soon as the C parser can accept further
arguments when parsing.
"""
import csv
import pytest
import pandas.util.testing as tm
from pandas import DataFrame, Index
from pandas import compat
from pandas.errors import ParserError
from pandas.compat import StringIO, BytesIO, u
class PythonParserTests(object):
def test_default_separator(self):
    """With ``sep=None`` the letter 'o' is sniffed as the delimiter.

    See GH17333: csv.Sniffer in Python treats 'o' as separator.
    """
    raw = 'aob\n1o2\n3o4'
    sniffed = self.read_csv(StringIO(raw), sep=None)

    wanted = DataFrame({'a': [1, 3], 'b': [2, 4]})
    tm.assert_frame_equal(sniffed, wanted)
def test_invalid_skipfooter(self):
    """Non-integer and negative ``skipfooter`` values are rejected.

    See gh-15925 (comment).
    """
    text = "a\n1\n2"

    # String, float and bool are all "not an integer" here.
    not_integer = "skipfooter must be an integer"
    for bad_value in ("foo", 1.5, True):
        with tm.assert_raises_regex(ValueError, not_integer):
            self.read_csv(StringIO(text), skipfooter=bad_value)

    negative = "skipfooter cannot be negative"
    with tm.assert_raises_regex(ValueError, negative):
        self.read_csv(StringIO(text), skipfooter=-1)
def test_sniff_delimiter(self):
    """``sep=None`` detects a '|' delimiter for text and byte input."""
    plain = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    sniffed = self.read_csv(StringIO(plain), sep=None, index_col=0)
    wanted_index = Index(['foo', 'bar', 'baz'], name='index')
    tm.assert_index_equal(sniffed.index, wanted_index)

    # An explicit delimiter gives the same frame.
    explicit = self.read_csv(StringIO(plain), delimiter='|', index_col=0)
    tm.assert_frame_equal(sniffed, explicit)

    # Two junk lines ahead of the header, skipped via skiprows.
    with_junk = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    skipped = self.read_csv(StringIO(with_junk), sep=None,
                            skiprows=2, index_col=0)
    tm.assert_frame_equal(sniffed, skipped)

    # Same content again, this time supplied as UTF-8 encoded bytes.
    encoded = u("""ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
""").encode('utf-8')
    stream = BytesIO(encoded)
    if compat.PY3:
        # Not quite a bytes test on Python 3: the stream is wrapped,
        # so the parser never sees raw bytes.
        from io import TextIOWrapper
        stream = TextIOWrapper(stream, encoding='utf-8')

    from_bytes = self.read_csv(stream, sep=None, skiprows=2,
                               index_col=0, encoding='utf-8')
    tm.assert_frame_equal(sniffed, from_bytes)
def test_BytesIO_input(self):
    """Read cp1255-encoded bytes that use a multi-char separator."""
    if not compat.PY3:
        pytest.skip(
            "Bytes-related test - only needs to work on Python 3")

    payload = "שלום::1234\n562::123".encode('cp1255')
    parsed = self.read_table(BytesIO(payload), sep="::",
                             encoding='cp1255')

    wanted = DataFrame([[562, 123]], columns=["שלום", "1234"])
    tm.assert_frame_equal(parsed, wanted)
def test_single_line(self):
    """Sniff the separator from a single line of input (gh-6607)."""
    parsed = self.read_csv(StringIO('1,2'), sep=None, header=None,
                           names=['a', 'b'])
    wanted = DataFrame({'a': [1], 'b': [2]})
    tm.assert_frame_equal(wanted, parsed)
def test_skipfooter(self):
    """``skipfooter`` drops trailing junk lines from the input.

    See gh-6607.
    """
    data = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
    # Reference frame: the same text without the two footer lines
    # (the third dropped element is the empty string that follows the
    # final newline).
    no_footer = '\n'.join(data.split('\n')[:-3])
    expected = self.read_csv(StringIO(no_footer))

    result = self.read_csv(StringIO(data), skipfooter=2)
    tm.assert_frame_equal(result, expected)

    # Limiting the read to the three data rows is equivalent.
    result = self.read_csv(StringIO(data), nrows=3)
    tm.assert_frame_equal(result, expected)

    # A former "skipfooter alias" section repeated the first check with
    # identical arguments; it added no coverage and has been dropped.
def test_decompression_regex_sep(self):
    """Read gzip/bz2-compressed data that uses a multi-char separator.

    See gh-6607.  The contents of ``self.csv1`` are rewritten with '::'
    in place of ',' and compressed; reading them back must match a
    plain read of ``self.csv1``.  An unknown compression name must
    raise ValueError.
    """
    try:
        import gzip
        import bz2
    except ImportError:
        pytest.skip('need gzip and bz2 to run')

    with open(self.csv1, 'rb') as f:
        data = f.read()

    data = data.replace(b',', b'::')
    expected = self.read_csv(self.csv1)

    with tm.ensure_clean() as path:
        # Context manager guarantees the handle is closed (and the
        # stream flushed) even if the write fails.
        with gzip.GzipFile(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, sep='::', compression='gzip')
        tm.assert_frame_equal(result, expected)

    with tm.ensure_clean() as path:
        with bz2.BZ2File(path, mode='wb') as tmp:
            tmp.write(data)

        result = self.read_csv(path, sep='::', compression='bz2')
        tm.assert_frame_equal(result, expected)

        pytest.raises(ValueError, self.read_csv,
                      path, compression='bz3')
def test_read_table_buglet_4x_multiindex(self):
    """Whitespace-separated tables with implicit multi-level indexes.

    See gh-6607 and gh-6893.
    """
    four_level = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
    parsed = self.read_table(StringIO(four_level), sep=r'\s+')
    assert parsed.index.names == ('one', 'two', 'three', 'four')

    # see gh-6893
    three_level = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
    records = [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)]
    wanted = DataFrame.from_records(records, columns=list('abcABC'),
                                    index=list('abc'))
    parsed = self.read_table(StringIO(three_level), sep=r'\s+')
    tm.assert_frame_equal(parsed, wanted)
def test_skipfooter_with_decimal(self):
    """A skipped footer must not affect decimal conversion (gh-6971)."""
    body = '1#2\n3#4'
    wanted = DataFrame({'a': [1.2, 3.4]})

    parsed = self.read_csv(StringIO(body), decimal='#', names=['a'])
    tm.assert_frame_equal(parsed, wanted)

    # A stray footer line should not interfere with the casting of
    # the first two lines as long as it is skipped.
    with_footer = body + '\nFooter'
    parsed = self.read_csv(StringIO(with_footer), decimal='#',
                           names=['a'], skipfooter=1)
    tm.assert_frame_equal(parsed, wanted)
def test_encoding_non_utf8_multichar_sep(self):
    """Multi-char separators work across non-UTF-8 encodings (gh-3404)."""
    separators = ['::', '#####', '!!!', '123', '#1!c5',
                  '%!c!d', '@@#4:2', '_!pd#_']
    encodings = ['utf-16', 'utf-16-be', 'utf-16-le',
                 'utf-32', 'cp037']
    wanted = DataFrame({'a': [1], 'b': [2]})

    for sep in separators:
        line = '1' + sep + '2'

        for encoding in encodings:
            payload = BytesIO(line.encode(encoding))
            parsed = self.read_csv(payload, sep=sep, encoding=encoding,
                                   names=['a', 'b'])
            tm.assert_frame_equal(parsed, wanted)
def test_multi_char_sep_quotes(self):
    """Quoted fields holding a multi-char separator raise (gh-13374)."""
    raw = 'a,,b\n1,,a\n2,,"2,,b"'
    pattern = 'ignored when a multi-char delimiter is used'

    with tm.assert_raises_regex(ParserError, pattern):
        self.read_csv(StringIO(raw), sep=',,')

    # With quoting disabled the message above is not expected, so the
    # inner context manager itself should fail with an AssertionError.
    with pytest.raises(AssertionError):
        with tm.assert_raises_regex(ParserError, pattern):
            self.read_csv(StringIO(raw), quoting=csv.QUOTE_NONE,
                          sep=',,')
def test_none_delimiter(self):
    """Bad lines are dropped quietly when sniffing the delimiter.

    See gh-13374 and gh-17465.
    """
    raw = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"

    # The third line has one field too many; it should be skipped
    # without any error being raised.
    parsed = self.read_csv(StringIO(raw), sep=None, header=0,
                           warn_bad_lines=True,
                           error_bad_lines=False)

    wanted = DataFrame({'a': [0, 7],
                        'b': [1, 8],
                        'c': [2, 9]})
    tm.assert_frame_equal(parsed, wanted)
def test_skipfooter_bad_row(self):
    """Malformed footer rows raise a dedicated parsing error.

    See gh-13879 and gh-15910.
    """
    pattern = 'parsing errors in the skipped footer rows'
    samples = ('a\n1\n"b"a',
               'a,b,c\ncat,foo,bar\ndog,foo,"baz')

    for raw in samples:
        with tm.assert_raises_regex(ParserError, pattern):
            self.read_csv(StringIO(raw), skipfooter=1)

        # Without skipfooter that message is not expected, so the
        # inner context manager itself should fail with an
        # AssertionError.
        with pytest.raises(AssertionError):
            with tm.assert_raises_regex(ParserError, pattern):
                self.read_csv(StringIO(raw))
@@ -0,0 +1,153 @@
# -*- coding: utf-8 -*-
"""
Tests that quoting specifications are properly handled
during parsing for all of the parsers defined in parsers.py
"""
import csv
import pandas.util.testing as tm
from pandas import DataFrame
from pandas.compat import PY3, StringIO, u
class QuotingTests(object):
def test_bad_quote_char(self):
    """Invalid ``quotechar`` values raise TypeError."""
    data = '1,2,3'

    # Python 2.x: "...must be an 1-character..."
    # Python 3.x: "...must be a 1-character..."
    too_long = '"quotechar" must be a(n)? 1-character string'
    with tm.assert_raises_regex(TypeError, too_long):
        self.read_csv(StringIO(data), quotechar='foo')

    unset = 'quotechar must be set if quoting enabled'
    with tm.assert_raises_regex(TypeError, unset):
        self.read_csv(StringIO(data), quotechar=None,
                      quoting=csv.QUOTE_MINIMAL)

    wrong_type = '"quotechar" must be string, not int'
    with tm.assert_raises_regex(TypeError, wrong_type):
        self.read_csv(StringIO(data), quotechar=2)
def test_bad_quoting(self):
    """Invalid ``quoting`` values raise TypeError."""
    data = '1,2,3'

    with tm.assert_raises_regex(TypeError, '"quoting" must be an integer'):
        self.read_csv(StringIO(data), quoting='foo')

    # quoting must be in the range [0, 3]
    with tm.assert_raises_regex(TypeError, 'bad "quoting" value'):
        self.read_csv(StringIO(data), quoting=5)
def test_quote_char_basic(self):
    """A double-quoted field is read without its quotes."""
    raw = 'a,b,c\n1,2,"cat"'
    parsed = self.read_csv(StringIO(raw), quotechar='"')

    wanted = DataFrame([[1, 2, 'cat']], columns=['a', 'b', 'c'])
    tm.assert_frame_equal(parsed, wanted)
def test_quote_char_various(self):
    """Several alternative quote characters give the same frame."""
    template = 'a,b,c\n1,2,"cat"'
    wanted = DataFrame([[1, 2, 'cat']], columns=['a', 'b', 'c'])

    for quote_char in ('~', '*', '%', '$', '@', 'P'):
        # Swap the default quote for the candidate character.
        raw = template.replace('"', quote_char)
        parsed = self.read_csv(StringIO(raw), quotechar=quote_char)
        tm.assert_frame_equal(parsed, wanted)
def test_null_quote_char(self):
    """An empty/None ``quotechar`` is only valid with QUOTE_NONE."""
    data = 'a,b,c\n1,2,3'
    null_chars = (None, '')

    # sanity checks: quoting enabled but no quote character given
    msg = 'quotechar must be set if quoting enabled'
    for quotechar in null_chars:
        with tm.assert_raises_regex(TypeError, msg):
            self.read_csv(StringIO(data), quotechar=quotechar,
                          quoting=csv.QUOTE_MINIMAL)

    # no errors should be raised once quoting is switched off
    wanted = DataFrame([[1, 2, 3]], columns=['a', 'b', 'c'])
    for quotechar in null_chars:
        parsed = self.read_csv(StringIO(data), quotechar=quotechar,
                               quoting=csv.QUOTE_NONE)
        tm.assert_frame_equal(parsed, wanted)
def test_quoting_various(self):
    """Check how each ``csv.QUOTE_*`` mode affects reading."""
    data = '1,2,"foo"'
    cols = ['a', 'b', 'c']

    # QUOTE_MINIMAL and QUOTE_ALL only matter to the CSV writer, so
    # for the reader they should behave like the default.
    wanted = DataFrame([[1, 2, 'foo']], columns=cols)

    # default first (every later call spells its arguments out)
    parsed = self.read_csv(StringIO(data), names=cols)
    tm.assert_frame_equal(parsed, wanted)

    for mode in (csv.QUOTE_MINIMAL, csv.QUOTE_ALL):
        parsed = self.read_csv(StringIO(data), names=cols,
                               quotechar='"', quoting=mode)
        tm.assert_frame_equal(parsed, wanted)

    # QUOTE_NONE: quote characters get no special handling and are
    # left in the data.
    wanted = DataFrame([[1, 2, '"foo"']], columns=cols)
    parsed = self.read_csv(StringIO(data), names=cols,
                           quotechar='"', quoting=csv.QUOTE_NONE)
    tm.assert_frame_equal(parsed, wanted)

    # QUOTE_NONNUMERIC: every unquoted field is cast to float.
    wanted = DataFrame([[1.0, 2.0, 'foo']], columns=cols)
    parsed = self.read_csv(StringIO(data), names=cols,
                           quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
    tm.assert_frame_equal(parsed, wanted)
def test_double_quote(self):
    """``doublequote`` decides how a doubled quote char is read."""
    raw = 'a,b\n3,"4 "" 5"'
    cols = ['a', 'b']

    # Enabled: the doubled quote collapses to a single literal quote.
    parsed = self.read_csv(StringIO(raw), doublequote=True,
                           quotechar='"')
    tm.assert_frame_equal(parsed,
                          DataFrame([[3, '4 " 5']], columns=cols))

    # Disabled: the expected value keeps a trailing quote character.
    parsed = self.read_csv(StringIO(raw), doublequote=False,
                           quotechar='"')
    tm.assert_frame_equal(parsed,
                          DataFrame([[3, '4 " 5"']], columns=cols))
def test_quotechar_unicode(self):
    """Unicode ``quotechar`` values are accepted (gh-14477)."""
    raw = 'a\n1'
    wanted = DataFrame({'a': [1]})

    quote_chars = [u('"')]
    # Compared to Python 3.x, Python 2.x does not handle unicode well,
    # so the control-character case is only run on Python 3.
    if PY3:
        quote_chars.append(u('\u0001'))

    for quote_char in quote_chars:
        parsed = self.read_csv(StringIO(raw), quotechar=quote_char)
        tm.assert_frame_equal(parsed, wanted)
@@ -0,0 +1,225 @@
# -*- coding: utf-8 -*-
"""
Tests that skipped rows are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from datetime import datetime
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame
from pandas.errors import EmptyDataError
from pandas.compat import StringIO, range, lrange
class SkipRowsTests(object):
def test_skiprows_bug(self):
    """List and integer ``skiprows`` skip the same leading lines.

    See gh-505.
    """
    text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
    from_list = self.read_csv(StringIO(text), skiprows=lrange(6),
                              header=None, parse_dates=True,
                              index_col=0)
    from_count = self.read_csv(StringIO(text), skiprows=6,
                               header=None, parse_dates=True,
                               index_col=0)

    dates = [datetime(2000, 1, 1),
             datetime(2000, 1, 2),
             datetime(2000, 1, 3)]
    wanted = DataFrame(np.arange(1., 10.).reshape((3, 3)),
                       columns=[1, 2, 3], index=dates)
    wanted.index.name = 0

    tm.assert_frame_equal(from_list, wanted)
    tm.assert_frame_equal(from_list, from_count)
def test_deep_skiprows(self):
    """Skipping rows 6 and 8 equals parsing a file without those rows."""
    # see gh-4382
    def build_csv(row_starts):
        # One "i,i+1,i+2" line per start value, under an "a,b,c" header.
        body = [",".join([str(i), str(i + 1), str(i + 2)])
                for i in row_starts]
        return "a,b,c\n" + "\n".join(body)

    full_text = build_csv(range(10))
    trimmed_text = build_csv([0, 1, 2, 3, 4, 6, 8, 9])

    skipped = self.read_csv(StringIO(full_text), skiprows=[6, 8])
    trimmed = self.read_csv(StringIO(trimmed_text))
    tm.assert_frame_equal(skipped, trimmed)
def test_skiprows_blank(self):
"""Skip a fixed number of leading rows, then parse the dated rows."""
# see gh-9832
# NOTE(review): only four '#foo' rows are visible in this literal, yet
# skiprows=6 is expected to leave all three dated rows; presumably the
# upstream literal also contains blank lines between the '#foo' rows --
# confirm that whitespace was not lost.
text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
data = self.read_csv(StringIO(text), skiprows=6, header=None,
index_col=0, parse_dates=True)
# Expected: values 1..9 in a 3x3 frame indexed by the parsed dates.
expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
columns=[1, 2, 3],
index=[datetime(2000, 1, 1), datetime(2000, 1, 2),
datetime(2000, 1, 3)])
# index_col=0 with header=None names the index after column position 0.
expected.index.name = 0
tm.assert_frame_equal(data, expected)
def test_skiprow_with_newline(self):
"""Skipped rows may contain quoted fields with embedded newlines."""
# see gh-12775 and gh-10911
# Case 1: default quotechar; skipping row 1 must drop the whole first
# record, including the line break inside its quoted text field.
data = """id,text,num_lines
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1"""
expected = [[2, 'line 21\nline 22', 2],
[3, 'line 31', 1]]
expected = DataFrame(expected, columns=[
'id', 'text', 'num_lines'])
df = self.read_csv(StringIO(data), skiprows=[1])
tm.assert_frame_equal(df, expected)
# Case 2: custom quotechar "~"; skipping row 2 leaves only the first
# record, whose three fields each span two physical lines.
data = ('a,b,c\n~a\n b~,~e\n d~,'
'~f\n f~\n1,2,~12\n 13\n 14~')
expected = [['a\n b', 'e\n d', 'f\n f']]
expected = DataFrame(expected, columns=[
'a', 'b', 'c'])
df = self.read_csv(StringIO(data),
quotechar="~",
skiprows=[2])
tm.assert_frame_equal(df, expected)
# Case 3: skipping rows 1 and 3 keeps only the middle ("two") record.
data = ('Text,url\n~example\n '
'sentence\n one~,url1\n~'
'example\n sentence\n two~,url2\n~'
'example\n sentence\n three~,url3')
expected = [['example\n sentence\n two', 'url2']]
expected = DataFrame(expected, columns=[
'Text', 'url'])
df = self.read_csv(StringIO(data),
quotechar="~",
skiprows=[1, 3])
tm.assert_frame_equal(df, expected)
def test_skiprow_with_quote(self):
    """A skipped row may hold single quotes inside a double-quoted field."""
    # see gh-12775 and gh-10911
    data = """id,text,num_lines
1,"line '11' line 12",2
2,"line '21' line 22",2
3,"line '31' line 32",1"""
    remaining_rows = [[2, "line '21' line 22", 2],
                      [3, "line '31' line 32", 1]]
    expected = DataFrame(remaining_rows,
                         columns=['id', 'text', 'num_lines'])

    parsed = self.read_csv(StringIO(data), skiprows=[1])
    tm.assert_frame_equal(parsed, expected)
def test_skiprow_with_newline_and_quote(self):
"""Skipped rows may mix embedded newlines with nested single quotes."""
# see gh-12775 and gh-10911
# Case 1: the newline precedes the single-quoted fragment.
data = """id,text,num_lines
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1"""
expected = [[2, "line \n'21' line 22", 2],
[3, "line \n'31' line 32", 1]]
expected = DataFrame(expected, columns=[
'id', 'text', 'num_lines'])
df = self.read_csv(StringIO(data), skiprows=[1])
tm.assert_frame_equal(df, expected)
# Case 2: the newline sits inside the single-quoted fragment.
data = """id,text,num_lines
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1"""
expected = [[2, "line '21\n' line 22", 2],
[3, "line '31\n' line 32", 1]]
expected = DataFrame(expected, columns=[
'id', 'text', 'num_lines'])
df = self.read_csv(StringIO(data), skiprows=[1])
tm.assert_frame_equal(df, expected)
# Case 3: as case 2, followed by a carriage return and a tab.
data = """id,text,num_lines
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1"""
expected = [[2, "line '21\n' \r\tline 22", 2],
[3, "line '31\n' \r\tline 32", 1]]
expected = DataFrame(expected, columns=[
'id', 'text', 'num_lines'])
df = self.read_csv(StringIO(data), skiprows=[1])
tm.assert_frame_equal(df, expected)
def test_skiprows_lineterminator(self):
    """``skiprows=1`` works with LF, CRLF and (C engine only) CR endings."""
    # see gh-9079
    data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ',
                      '2007/01/01 01:00 0.2140 U M ',
                      '2007/01/01 02:00 0.2141 M O ',
                      '2007/01/01 04:00 0.2142 D M '])
    expected = DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'],
                          ['2007/01/01', '02:00', 0.2141, 'M', 'O'],
                          ['2007/01/01', '04:00', 0.2142, 'D', 'M']],
                         columns=['date', 'time', 'var', 'flag',
                                  'oflag'])

    def check(eol):
        # Re-terminate the text with ``eol`` and parse past the banner row.
        parsed = self.read_csv(StringIO(data.replace('\n', eol)),
                               skiprows=1, delim_whitespace=True,
                               names=['date', 'time', 'var', 'flag',
                                      'oflag'])
        tm.assert_frame_equal(parsed, expected)

    # test with default line terminators "LF" and "CRLF"
    check('\n')
    check('\r\n')

    # "CR" is not respected with the Python parser yet
    if self.engine == 'c':
        check('\r')
def test_skiprows_infield_quote(self):
    """Quote characters inside skipped rows do not affect later rows."""
    # see gh-14459
    raw = 'a"\nb"\na\n1'
    parsed = self.read_csv(StringIO(raw), skiprows=2)
    tm.assert_frame_equal(parsed, DataFrame({'a': [1]}))
def test_skiprows_callable(self):
    """``skiprows`` accepts a callable that is evaluated per row index."""
    data = 'a\n1\n2\n3\n4\n5'

    def skip_even(x):
        return x % 2 == 0

    def skip_everything(x):
        return True

    def broken(x):
        return 1 / 0

    # Even rows (including the header row 0) are dropped, so '1' becomes
    # the header.
    parsed = self.read_csv(StringIO(data), skiprows=skip_even)
    tm.assert_frame_equal(parsed, DataFrame({'1': [3, 5]}))

    parsed = self.read_csv(StringIO(data), skiprows=skip_even,
                           header=0, names=['foo'])
    tm.assert_frame_equal(parsed, DataFrame({'foo': [3, 5]}))

    msg = "No columns to parse from file"
    with tm.assert_raises_regex(EmptyDataError, msg):
        self.read_csv(StringIO(data), skiprows=skip_everything)

    # This is a bad callable and should raise.
    msg = "by zero"
    with tm.assert_raises_regex(ZeroDivisionError, msg):
        self.read_csv(StringIO(data), skiprows=broken)
@@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-
"""
Tests the parsers' ability to read and parse non-local files,
which hence require a network connection to be read.
"""
import logging
import pytest
import numpy as np
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas import DataFrame
from pandas.io.parsers import read_csv, read_table
from pandas.compat import BytesIO, StringIO
# Parametrized over compression type/extension, explicit-vs-inferred
# compression mode, and parser engine; the xz case carries the
# td.skip_if_no_lzma mark.
@pytest.mark.network
@pytest.mark.parametrize(
"compress_type, extension", [
('gzip', '.gz'), ('bz2', '.bz2'), ('zip', '.zip'),
pytest.param('xz', '.xz', marks=td.skip_if_no_lzma)
]
)
@pytest.mark.parametrize('mode', ['explicit', 'infer'])
@pytest.mark.parametrize('engine', ['python', 'c'])
def test_compressed_urls(salaries_table, compress_type, extension, mode,
engine):
"""Delegate each parameter combination to ``check_compressed_urls``."""
check_compressed_urls(salaries_table, compress_type, extension, mode,
engine)
@tm.network
def check_compressed_urls(salaries_table, compression, extension, mode,
                          engine):
    """Read a compressed salaries file by URL and compare to the fixture.

    When ``mode`` is not ``'explicit'`` its value is passed as the
    ``compression`` argument instead of ``compression``.
    """
    # test reading compressed urls with various engines and
    # extension inference
    url = ('https://github.com/pandas-dev/pandas/raw/master/'
           'pandas/tests/io/parser/data/salaries.csv') + extension

    effective = compression if mode == 'explicit' else mode

    fetched = read_table(url, compression=effective, engine=engine)
    tm.assert_frame_equal(fetched, salaries_table)
@pytest.fixture
def tips_df(datapath):
    """DataFrame with the tips dataset."""
    tips_path = datapath('io', 'parser', 'data', 'tips.csv')
    return read_csv(tips_path)
@pytest.mark.usefixtures("s3_resource")
class TestS3(object):
def test_parse_public_s3_bucket(self, tips_df):
    """Read tips.csv from both buckets, in every stored compression."""
    pytest.importorskip('s3fs')

    def verify(frame):
        assert isinstance(frame, DataFrame)
        assert not frame.empty
        tm.assert_frame_equal(frame, tips_df)

    # more of an integration test due to the not-public contents portion
    # can probably mock this though.
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        verify(read_csv('s3://pandas-test/tips.csv' + suffix,
                        compression=codec))

    # Read public file from bucket with not-public contents
    verify(read_csv('s3://cant_get_it/tips.csv'))
def test_parse_public_s3n_bucket(self, tips_df):
    """The first ten rows can be read through an ``s3n://`` URL."""
    # Read from AWS s3 as "s3n" URL
    head = read_csv('s3n://pandas-test/tips.csv', nrows=10)
    assert isinstance(head, DataFrame)
    assert not head.empty
    tm.assert_frame_equal(tips_df.iloc[:10], head)
def test_parse_public_s3a_bucket(self, tips_df):
    """The first ten rows can be read through an ``s3a://`` URL."""
    # Read from AWS s3 as "s3a" URL
    head = read_csv('s3a://pandas-test/tips.csv', nrows=10)
    assert isinstance(head, DataFrame)
    assert not head.empty
    tm.assert_frame_equal(tips_df.iloc[:10], head)
def test_parse_public_s3_bucket_nrows(self, tips_df):
    """``nrows=10`` is honoured for each stored compression."""
    first_ten = tips_df.iloc[:10]
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        url = 's3://pandas-test/tips.csv' + suffix
        df = read_csv(url, nrows=10, compression=codec)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(first_ten, df)
def test_parse_public_s3_bucket_chunked(self, tips_df):
    """Chunked reads return consecutive five-row slices of the dataset."""
    # Read with a chunksize
    chunksize = 5
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        reader = read_csv('s3://pandas-test/tips.csv' + suffix,
                          chunksize=chunksize, compression=codec)
        assert reader.chunksize == chunksize

        # Read a couple of chunks and make sure we see them
        # properly.
        for chunk_no in range(3):
            chunk = reader.get_chunk()
            assert isinstance(chunk, DataFrame)
            assert not chunk.empty

            start = chunksize * chunk_no
            stop = chunksize * (chunk_no + 1)
            tm.assert_frame_equal(tips_df.iloc[start:stop], chunk)
def test_parse_public_s3_bucket_chunked_python(self, tips_df):
    """Chunked reads with the Python engine return consecutive slices."""
    # Read with a chunksize using the Python parser
    chunksize = 5
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        reader = read_csv('s3://pandas-test/tips.csv' + suffix,
                          chunksize=chunksize, compression=codec,
                          engine='python')
        assert reader.chunksize == chunksize

        # Read a couple of chunks and make sure we see them properly.
        for chunk_no in range(3):
            chunk = reader.get_chunk()
            assert isinstance(chunk, DataFrame)
            assert not chunk.empty

            start = chunksize * chunk_no
            stop = chunksize * (chunk_no + 1)
            tm.assert_frame_equal(tips_df.iloc[start:stop], chunk)
def test_parse_public_s3_bucket_python(self, tips_df):
    """The Python engine reads the full file in each compression."""
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        url = 's3://pandas-test/tips.csv' + suffix
        df = read_csv(url, engine='python', compression=codec)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(df, tips_df)
def test_infer_s3_compression(self, tips_df):
    """``compression='infer'`` works for each stored file extension."""
    for suffix in ['', '.gz', '.bz2']:
        url = 's3://pandas-test/tips.csv' + suffix
        df = read_csv(url, engine='python', compression='infer')
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(df, tips_df)
def test_parse_public_s3_bucket_nrows_python(self, tips_df):
    """``nrows=10`` is honoured by the Python engine in each compression."""
    first_ten = tips_df.iloc[:10]
    variants = [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]
    for suffix, codec in variants:
        url = 's3://pandas-test/tips.csv' + suffix
        df = read_csv(url, engine='python', nrows=10, compression=codec)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(first_ten, df)
def test_s3_fails(self):
    """Unknown and private S3 locations raise ``IOError``."""
    missing_url = 's3://nyqpug/asdf.csv'
    with pytest.raises(IOError):
        read_csv(missing_url)

    # Receive a permission error when trying to read a private bucket.
    # It's irrelevant here that this isn't actually a table.
    private_url = 's3://cant_get_it/'
    with pytest.raises(IOError):
        read_csv(private_url)
def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
    """Bytes fetched through the boto client parse like the local file."""
    # see gh-16135
    client = s3_resource.meta.client
    s3_object = client.get_object(Bucket='pandas-test', Key='tips.csv')
    payload = s3_object["Body"].read()

    result = read_csv(BytesIO(payload), encoding='utf8')
    assert isinstance(result, DataFrame)
    assert not result.empty

    tm.assert_frame_equal(result, read_csv(tips_file))
def test_read_csv_chunked_download(self, s3_resource, caplog):
    """Reading five rows of a large S3 object logs a fetch of the first
    byte range ``(0, 5505024)`` on the ``s3fs.core`` logger.
    """
    # 8 MB, S3FS uses 5MB chunks
    df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))

    # Serialise to text first, then encode once into the upload buffer.
    # (The original also created an empty BytesIO that was immediately
    # overwritten; that dead allocation is removed.)
    str_buf = StringIO()
    df.to_csv(str_buf)
    buf = BytesIO(str_buf.getvalue().encode('utf-8'))

    s3_resource.Bucket("pandas-test").put_object(
        Key="large-file.csv",
        Body=buf)

    with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
        read_csv("s3://pandas-test/large-file.csv", nrows=5)
        # log of fetch_range (start, stop)
        assert ((0, 5505024) in set(x.args[-2:] for x in caplog.records))
@@ -0,0 +1,154 @@
# -*- coding: utf-8 -*-
import os
import pytest
import pandas.util.testing as tm
from pandas import read_csv, read_table, DataFrame
import pandas.core.common as com
from pandas._libs.tslib import Timestamp
from pandas.compat import StringIO
from .common import ParserTests
from .header import HeaderTests
from .comment import CommentTests
from .dialect import DialectTests
from .quoting import QuotingTests
from .usecols import UsecolsTests
from .skiprows import SkipRowsTests
from .index_col import IndexColTests
from .na_values import NAvaluesTests
from .converters import ConverterTests
from .c_parser_only import CParserTests
from .parse_dates import ParseDatesTests
from .compression import CompressionTests
from .mangle_dupes import DupeColumnTests
from .multithread import MultithreadTests
from .python_parser_only import PythonParserTests
from .dtypes import DtypeTests
class BaseParser(CommentTests, CompressionTests,
                 ConverterTests, DialectTests,
                 DtypeTests, DupeColumnTests,
                 HeaderTests, IndexColTests,
                 MultithreadTests, NAvaluesTests,
                 ParseDatesTests, ParserTests,
                 SkipRowsTests, UsecolsTests,
                 QuotingTests):
    """Combine the shared parser test mixins behind abstract reader hooks.

    Subclasses supply ``read_csv``, ``read_table`` and
    ``float_precision_choices``.
    """

    def read_csv(self, *args, **kwargs):
        """Abstract hook; raises ``NotImplementedError`` here."""
        raise NotImplementedError

    def read_table(self, *args, **kwargs):
        """Abstract hook; raises ``NotImplementedError`` here."""
        raise NotImplementedError

    def float_precision_choices(self):
        """Abstract hook; raises ``AbstractMethodError`` here."""
        raise com.AbstractMethodError(self)

    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        """Store the parser data directory and common file paths."""
        data_dir = datapath('io', 'parser', 'data')
        self.dirpath = data_dir

        fixture_files = (('csv1', 'test1.csv'),
                         ('csv2', 'test2.csv'),
                         ('xls1', 'test.xls'),
                         ('csv_shiftjs', 'sauron.SHIFT_JIS.csv'))
        for attr, file_name in fixture_files:
            setattr(self, attr, os.path.join(data_dir, file_name))
class TestCParserHighMemory(BaseParser, CParserTests):
    """Run the shared suites on the C engine with ``low_memory=False``."""

    engine = 'c'
    low_memory = False
    float_precision_choices = [None, 'high', 'round_trip']

    def read_csv(self, *args, **kwds):
        """Call ``read_csv`` with this class's engine settings forced."""
        options = dict(kwds, engine=self.engine,
                       low_memory=self.low_memory)
        return read_csv(*args, **options)

    def read_table(self, *args, **kwds):
        """Call ``read_table`` with this class's engine settings forced."""
        options = dict(kwds, engine=self.engine,
                       low_memory=self.low_memory)
        return read_table(*args, **options)
class TestCParserLowMemory(BaseParser, CParserTests):
    """Run the shared suites on the C engine with ``low_memory=True``."""

    engine = 'c'
    low_memory = True
    float_precision_choices = [None, 'high', 'round_trip']

    def read_csv(self, *args, **kwds):
        """Call ``read_csv`` with this class's engine settings forced."""
        kwds = kwds.copy()
        kwds['engine'] = self.engine
        kwds['low_memory'] = self.low_memory
        return read_csv(*args, **kwds)

    def read_table(self, *args, **kwds):
        """Call ``read_table`` with this class's engine settings forced."""
        kwds = kwds.copy()
        kwds['engine'] = self.engine
        # Read the class attribute, as read_csv does, rather than a
        # hard-coded True, so both readers always use the same setting.
        kwds['low_memory'] = self.low_memory
        return read_table(*args, **kwds)
class TestPythonParser(BaseParser, PythonParserTests):
    """Run the shared suites on the Python engine."""

    engine = 'python'
    float_precision_choices = [None]

    def read_csv(self, *args, **kwds):
        """Call ``read_csv`` with ``engine`` forced to this class's."""
        options = dict(kwds, engine=self.engine)
        return read_csv(*args, **options)

    def read_table(self, *args, **kwds):
        """Call ``read_table`` with ``engine`` forced to this class's."""
        options = dict(kwds, engine=self.engine)
        return read_table(*args, **options)
class TestUnsortedUsecols(object):
# Regression test for usecols ordering inside _set_noconvert_columns.
def test_override__set_noconvert_columns(self):
"""Parse dates correctly even when ``usecols`` is in reversed order."""
# GH 17351 - usecols needs to be sorted in _setnoconvert_columns
# based on the test_usecols_with_parse_dates test from usecols.py
from pandas.io.parsers import CParserWrapper, TextFileReader
s = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
# Combine the columns at positions 1 and 2 into one date column.
parse_dates = [[1, 2]]
cols = {
'a': [0, 0],
'c_d': [
Timestamp('2014-01-01 09:00:00'),
Timestamp('2014-01-02 10:00:00')
]
}
expected = DataFrame(cols, columns=['c_d', 'a'])
# Minimal reader: skips TextFileReader.__init__ and sets only the two
# attributes assigned here; options and _engine are attached below.
class MyTextFileReader(TextFileReader):
def __init__(self):
self._currow = 0
self.squeeze = False
# Wrapper that reverses integer usecols before delegating to the real
# implementation.
class MyCParserWrapper(CParserWrapper):
def _set_noconvert_columns(self):
if self.usecols_dtype == 'integer':
# self.usecols is a set, which is documented as unordered
# but in practice, a CPython set of integers is sorted.
# In other implementations this assumption does not hold.
# The following code simulates a different order, which
# before GH 17351 would cause the wrong columns to be
# converted via the parse_dates parameter
self.usecols = list(self.usecols)
self.usecols.reverse()
return CParserWrapper._set_noconvert_columns(self)
parser = MyTextFileReader()
parser.options = {'usecols': [0, 2, 3],
'parse_dates': parse_dates,
'delimiter': ','}
# Install the customised engine by hand, then read through the reader.
parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
df = parser.read()
tm.assert_frame_equal(df, expected)
@@ -0,0 +1,436 @@
# -*- coding: utf-8 -*-
"""
Tests the 'read_fwf' function in parsers.py. This
test suite is independent of the others because the
engine is set to 'python-fwf' internally.
"""
from datetime import datetime
import pytest
import numpy as np
import pandas as pd
import pandas.util.testing as tm
from pandas import DataFrame
from pandas import compat
from pandas.compat import StringIO, BytesIO
from pandas.io.parsers import read_csv, read_fwf, EmptyDataError
class TestFwfParsing(object):
def test_fwf(self):
"""Parse the same table via colspecs, widths and a filler delimiter."""
# Reference frame, built from a comma-separated copy of the data.
data_expected = """\
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
expected = read_csv(StringIO(data_expected),
engine='python', header=None)
# NOTE(review): the colspecs below are absolute character offsets that
# extend past the visible single-spaced fields; presumably the literal
# carries wider padding -- confirm its column alignment.
data1 = """\
201158 360.242940 149.910199 11950.7
201159 444.953632 166.985655 11788.4
201160 364.136849 183.628767 11806.2
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
df = read_fwf(StringIO(data1), colspecs=colspecs, header=None)
tm.assert_frame_equal(df, expected)
# Same table described by field widths instead of explicit offsets.
data2 = """\
2011 58 360.242940 149.910199 11950.7
2011 59 444.953632 166.985655 11788.4
2011 60 364.136849 183.628767 11806.2
2011 61 413.836124 184.375703 11916.8
2011 62 502.953953 173.237159 12468.3
"""
df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None)
tm.assert_frame_equal(df, expected)
# From Thomas Kluyver: apparently some non-space filler characters can
# be seen, this is supported by specifying the 'delimiter' character:
# http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
data3 = """\
201158~~~~360.242940~~~149.910199~~~11950.7
201159~~~~444.953632~~~166.985655~~~11788.4
201160~~~~364.136849~~~183.628767~~~11806.2
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
df = read_fwf(
StringIO(data3), colspecs=colspecs, delimiter='~', header=None)
tm.assert_frame_equal(df, expected)
# colspecs and widths are mutually exclusive ...
with tm.assert_raises_regex(ValueError,
"must specify only one of"):
read_fwf(StringIO(data3), colspecs=colspecs, widths=[6, 10, 10, 7])
# ... and passing None for both is rejected as well.
with tm.assert_raises_regex(ValueError, "Must specify either"):
read_fwf(StringIO(data3), colspecs=None, widths=None)
def test_BytesIO_input(self):
    """``read_fwf`` decodes UTF-8 bytes supplied through a ``BytesIO``."""
    if not compat.PY3:
        pytest.skip(
            "Bytes-related test - only needs to work on Python 3")

    payload = "שלום\nשלום".encode('utf8')
    result = read_fwf(BytesIO(payload), widths=[2, 2], encoding='utf8')

    expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
    tm.assert_frame_equal(result, expected)
def test_fwf_colspecs_is_list_or_tuple(self):
    """A dict passed as colspecs is rejected with ``TypeError``."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    pattern = 'column specifications must be a list or tuple.+'
    bad_colspecs = {'a': 1}

    with tm.assert_raises_regex(TypeError, pattern):
        pd.io.parsers.FixedWidthReader(StringIO(data),
                                       bad_colspecs, ',', '#')
def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self):
    """A colspec entry such as ``('a', 1)`` is rejected with ``TypeError``."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    pattern = 'Each column specification must be.+'
    bad_colspecs = [('a', 1)]

    with tm.assert_raises_regex(TypeError, pattern):
        read_fwf(StringIO(data), bad_colspecs)
def test_fwf_colspecs_None(self):
    """``None`` is accepted as an open start or end bound in colspecs."""
    # GH 7079
    data = """\
123456
456789
"""
    # (colspecs, expected rows), checked in the original order.
    cases = [
        ([(0, 3), (3, None)], [[123, 456], [456, 789]]),
        ([(None, 3), (3, 6)], [[123, 456], [456, 789]]),
        ([(0, None), (3, None)], [[123456, 456], [456789, 789]]),
        ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]),
    ]

    for colspecs, rows in cases:
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame(rows)
        tm.assert_frame_equal(result, expected)
def test_fwf_regression(self):
"""Every column named like 'T060' can be selected with ``.loc``."""
# GH 3594
# turns out 'T060' is parsable as a datetime slice!
tzlist = [1, 10, 20, 30, 60, 80, 100]
ntz = len(tzlist)
# One 16-character field followed by ``ntz`` 8-character fields.
tcolspecs = [16] + [8] * ntz
tcolnames = ['SST'] + ["T%03d" % z for z in tzlist[1:]]
# NOTE(review): the widths above assume fixed-width padding that is not
# visible in this single-spaced literal -- confirm the alignment.
data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192
2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869
2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657
2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379
2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039
"""
df = read_fwf(StringIO(data),
index_col=0,
header=None,
names=tcolnames,
widths=tcolspecs,
parse_dates=True,
date_parser=lambda s: datetime.strptime(s, '%Y%j%H%M%S'))
# Selecting each column by label must return a non-empty result.
for c in df.columns:
res = df.loc[:, c]
assert len(res)
def test_fwf_for_uint8(self):
"""Apply per-column converters to fixed-width fields."""
# NOTE(review): the colspecs below are absolute character offsets;
# confirm that this literal's column alignment is intact.
data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127
1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa
df = read_fwf(StringIO(data),
colspecs=[(0, 17), (25, 26), (33, 37),
(49, 51), (58, 62), (63, 1000)],
names=['time', 'pri', 'pgn', 'dst', 'src', 'data'],
converters={
'pgn': lambda x: int(x, 16),
'src': lambda x: int(x, 16),
'dst': lambda x: int(x, 16),
'data': lambda x: len(x.split(' '))})
# The second row has no DST field, hence None and the object dtype below.
expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8],
[1421302964.226776, 6, 61442, None, 71, 8]],
columns=["time", "pri", "pgn",
"dst", "src", "data"])
expected["dst"] = expected["dst"].astype(object)
tm.assert_frame_equal(df, expected)
def test_fwf_compression(self):
    """Fixed-width data written through gzip and bz2 reads back unchanged.

    The expected frame is parsed from the uncompressed text; each
    compressed copy is written to a temporary path and read with the
    matching ``compression`` argument.
    """
    try:
        import gzip
        import bz2
    except ImportError:
        pytest.skip("Need gzip and bz2 to run this test")

    data = """1111111111
2222222222
3333333333""".strip()
    widths = [5, 5]
    names = ['one', 'two']
    expected = read_fwf(StringIO(data), widths=widths, names=names)

    # The compressed writers are opened in binary mode.
    if compat.PY3:
        data = bytes(data, encoding='utf-8')

    comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)]
    for comp_name, compresser in comps:
        with tm.ensure_clean() as path:
            # The context manager closes the handle even if the write
            # raises, so no open file is left behind.
            with compresser(path, mode='wb') as tmp:
                tmp.write(data)

            result = read_fwf(path, widths=widths, names=names,
                              compression=comp_name)
            tm.assert_frame_equal(result, expected)
def test_comment_fwf(self):
"""Text after the ``comment`` character is ignored by ``read_fwf``."""
# NOTE(review): the colspecs are absolute character offsets; confirm
# that this literal's column alignment is intact.
data = """
1 2. 4 #hello world
5 NaN 10.0
"""
expected = np.array([[1, 2., 4],
[5, np.nan, 10.]])
df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)],
comment='#')
tm.assert_almost_equal(df.values, expected)
def test_1000_fwf(self):
"""The ``thousands`` separator is stripped from numeric fields."""
# NOTE(review): the colspecs are absolute character offsets; confirm
# that this literal's column alignment is intact.
data = """
1 2,334.0 5
10 13 10.
"""
expected = np.array([[1, 2334., 5],
[10, 13, 10]])
df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)],
thousands=',')
tm.assert_almost_equal(df.values, expected)
def test_bool_header_arg(self):
    """A boolean ``header`` argument raises ``TypeError``."""
    # see gh-6114
    data = """\
MyColumn
a
b
a
b"""
    for bad_header in (True, False):
        with pytest.raises(TypeError):
            read_fwf(StringIO(data), header=bad_header)
def test_full_file(self):
"""Inferred column boundaries match explicitly supplied colspecs."""
# File with all values
# NOTE(review): inference relies on column alignment; confirm that the
# spacing in this literal is intact.
test = """index A B C
2000-01-03T00:00:00 0.980268513777 3 foo
2000-01-04T00:00:00 1.04791624281 -4 bar
2000-01-05T00:00:00 0.498580885705 73 baz
2000-01-06T00:00:00 1.12020151869 1 foo
2000-01-07T00:00:00 0.487094399463 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
2000-01-11T00:00:00 0.157160753327 34 foo"""
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
expected = read_fwf(StringIO(test), colspecs=colspecs)
# No colspecs/widths given: read_fwf has to infer the boundaries.
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
def test_full_file_with_missing(self):
"""Column inference matches explicit colspecs with missing values."""
# File with missing values
# NOTE(review): inference relies on column alignment; confirm that the
# spacing in this literal is intact.
test = """index A B C
2000-01-03T00:00:00 0.980268513777 3 foo
2000-01-04T00:00:00 1.04791624281 -4 bar
0.498580885705 73 baz
2000-01-06T00:00:00 1.12020151869 1 foo
2000-01-07T00:00:00 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
34"""
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
expected = read_fwf(StringIO(test), colspecs=colspecs)
# No colspecs/widths given: read_fwf has to infer the boundaries.
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
def test_full_file_with_spaces(self):
"""Column inference handles values that contain spaces."""
# File with spaces in columns
# NOTE(review): inference relies on column alignment; confirm that the
# spacing in this literal is intact.
test = """
Account Name Balance CreditLimit AccountCreated
101 Keanu Reeves 9315.45 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00 8/6/2003
868 Jennifer Love Hewitt 0 17000.00 5/25/1985
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65 5000.00 2/5/2007
""".strip('\r\n')
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
expected = read_fwf(StringIO(test), colspecs=colspecs)
# No colspecs/widths given: read_fwf has to infer the boundaries.
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
def test_full_file_with_spaces_and_missing(self):
"""Column inference handles embedded spaces plus missing values."""
# File with spaces and missing values in columns
# NOTE(review): inference relies on column alignment; confirm that the
# spacing in this literal is intact.
test = """
Account Name Balance CreditLimit AccountCreated
101 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00 8/6/2003
868 5/25/1985
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65
""".strip('\r\n')
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
expected = read_fwf(StringIO(test), colspecs=colspecs)
# No colspecs/widths given: read_fwf has to infer the boundaries.
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
def test_messed_up_data(self):
"""Column inference matches explicit colspecs on a ragged file."""
# Completely messed up file
# NOTE(review): inference relies on column alignment; confirm that the
# spacing in this literal is intact.
test = """
Account Name Balance Credit Limit Account Created
101 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65
""".strip('\r\n')
colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
expected = read_fwf(StringIO(test), colspecs=colspecs)
# No colspecs/widths given: read_fwf has to infer the boundaries.
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
def test_multiple_delimiters(self):
"""Column inference works with several filler characters at once."""
# NOTE(review): inference relies on column alignment; confirm that the
# spacing in this literal is intact.
test = r"""
col1~~~~~col2 col3++++++++++++++++++col4
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
33+++122.33\\\bar.........Gerard Butler
++44~~~~12.01 baz~~Jennifer Love Hewitt
~~55 11+++foo++++Jada Pinkett-Smith
..66++++++.03~~~bar Bill Murray
""".strip('\r\n')
colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
# The delimiter string lists every filler character: space, '+', '~',
# '.' and backslash.
expected = read_fwf(StringIO(test), colspecs=colspecs,
delimiter=' +~.\\')
tm.assert_frame_equal(expected, read_fwf(StringIO(test),
delimiter=' +~.\\'))
def test_variable_width_unicode(self):
"""Column inference works on UTF-8 encoded, non-ASCII input."""
if not compat.PY3:
pytest.skip(
'Bytes-related test - only needs to work on Python 3')
# NOTE(review): inference relies on column alignment; confirm that the
# spacing in this literal is intact.
test = """
שלום שלום
ום שלל
של ום
""".strip('\r\n')
expected = read_fwf(BytesIO(test.encode('utf8')),
colspecs=[(0, 4), (5, 9)],
header=None, encoding='utf8')
# No colspecs/widths given: read_fwf has to infer the boundaries.
tm.assert_frame_equal(expected, read_fwf(
BytesIO(test.encode('utf8')), header=None, encoding='utf8'))
def test_dtype(self):
"""``dtype`` overrides the types ``read_fwf`` would pick per column."""
# NOTE(review): the colspecs are absolute character offsets; confirm
# that this literal's column alignment is intact.
data = """ a b c
1 2 3.2
3 4 5.2
"""
colspecs = [(0, 5), (5, 10), (10, None)]
# First pass: no dtype argument.
result = pd.read_fwf(StringIO(data), colspecs=colspecs)
expected = pd.DataFrame({
'a': [1, 3],
'b': [2, 4],
'c': [3.2, 5.2]}, columns=['a', 'b', 'c'])
tm.assert_frame_equal(result, expected)
# Second pass: cast the expected frame the same way the dtype mapping
# asks the reader to.
expected['a'] = expected['a'].astype('float64')
expected['b'] = expected['b'].astype(str)
expected['c'] = expected['c'].astype('int32')
result = pd.read_fwf(StringIO(data), colspecs=colspecs,
dtype={'a': 'float64', 'b': str, 'c': 'int32'})
tm.assert_frame_equal(result, expected)
def test_skiprows_inference(self):
"""``read_fwf`` with an integer ``skiprows`` matches ``read_csv``."""
# GH11256
# NOTE(review): confirm the spacing and any blank lines in this literal
# are intact; skiprows=2 must land on the 'DataCol' header row.
test = """
Text contained in the file header
DataCol1 DataCol2
0.0 1.0
101.6 956.1
""".strip()
# Whitespace-delimited read_csv provides the reference frame.
expected = read_csv(StringIO(test), skiprows=2,
delim_whitespace=True)
tm.assert_frame_equal(expected, read_fwf(
StringIO(test), skiprows=2))
def test_skiprows_by_index_inference(self):
"""``read_fwf`` with a list-valued ``skiprows`` matches ``read_csv``."""
# NOTE(review): inference relies on column alignment; confirm that the
# spacing in this literal is intact.
test = """
To be skipped
Not To Be Skipped
Once more to be skipped
123 34 8 123
456 78 9 456
""".strip()
# Whitespace-delimited read_csv provides the reference frame.
expected = read_csv(StringIO(test), skiprows=[0, 2],
delim_whitespace=True)
tm.assert_frame_equal(expected, read_fwf(
StringIO(test), skiprows=[0, 2]))
def test_skiprows_inference_empty(self):
"""Skipping every line of the input raises ``EmptyDataError``."""
# After .strip() the text has three lines, all consumed by skiprows=3.
test = """
AA BBB C
12 345 6
78 901 2
""".strip()
with pytest.raises(EmptyDataError):
read_fwf(StringIO(test), skiprows=3)
def test_whitespace_preservation(self):
"""A delimiter without the space character keeps spaces in fields."""
# Addresses Issue #16772
# NOTE(review): this test is whitespace-sensitive; confirm that leading
# and trailing spaces in both literals are intact.
data_expected = """
a ,bbb
cc,dd """
expected = read_csv(StringIO(data_expected), header=None)
test_data = """
a bbb
ccdd """
# delimiter="\n\t" excludes the space character from the filler set.
result = read_fwf(StringIO(test_data), widths=[3, 3],
header=None, skiprows=[0], delimiter="\n\t")
tm.assert_frame_equal(result, expected)
def test_default_delimiter(self):
"""With no ``delimiter`` given, spaces and tabs are stripped."""
# NOTE(review): this test is whitespace-sensitive; confirm that the
# spaces and tabs in both literals are intact.
data_expected = """
a,bbb
cc,dd"""
expected = read_csv(StringIO(data_expected), header=None)
test_data = """
a \tbbb
cc\tdd """
result = read_fwf(StringIO(test_data), widths=[3, 3],
header=None, skiprows=[0])
tm.assert_frame_equal(result, expected)
@@ -0,0 +1,354 @@
# -*- coding: utf-8 -*-
"""
Tests the TextReader class in parsers.pyx, which
is integral to the C engine in parsers.py
"""
import pytest
from pandas.compat import StringIO, BytesIO, map
from pandas import compat
import os
import sys
from numpy import nan
import numpy as np
from pandas import DataFrame
from pandas.io.parsers import (read_csv, TextFileReader)
from pandas.util.testing import assert_frame_equal
import pandas.util.testing as tm
from pandas._libs.parsers import TextReader
import pandas._libs.parsers as parser
class TestTextReader(object):
@pytest.fixture(autouse=True)
def setup_method(self, datapath):
    """Store the parser data directory and common file paths."""
    data_dir = datapath('io', 'parser', 'data')
    self.dirpath = data_dir

    fixture_files = (('csv1', 'test1.csv'),
                     ('csv2', 'test2.csv'),
                     ('xls1', 'test.xls'))
    for attr, file_name in fixture_files:
        setattr(self, attr, os.path.join(data_dir, file_name))
def test_file_handle(self):
    """``TextReader`` can read from an open binary file handle."""
    with open(self.csv1, 'rb') as handle:
        TextReader(handle).read()
def test_string_filename(self):
    """``TextReader`` can read from a path given as a string."""
    TextReader(self.csv1, header=None).read()
def test_file_handle_mmap(self):
    """``TextReader`` can read a file handle with ``memory_map=True``."""
    with open(self.csv1, 'rb') as handle:
        TextReader(handle, memory_map=True, header=None).read()
def test_StringIO(self):
    """``TextReader`` can read file contents wrapped in a ``BytesIO``."""
    with open(self.csv1, 'rb') as handle:
        raw = handle.read()

    TextReader(BytesIO(raw), header=None).read()
def test_string_factorize(self):
    """Repeated strings in a column share the same object."""
    # should this be optional?
    raw = 'a\nb\na\nb\na'
    columns = TextReader(StringIO(raw), header=None).read()

    # Two distinct values, so only two distinct object identities.
    distinct_ids = set(map(id, columns[0]))
    assert len(distinct_ids) == 2
def test_skipinitialspace(self):
    """``skipinitialspace`` drops the space that follows each delimiter."""
    data = '\n'.join(['a, b'] * 4)
    reader = TextReader(StringIO(data), skipinitialspace=True,
                        header=None)
    result = reader.read()

    for col, letter in ((0, 'a'), (1, 'b')):
        wanted = np.array([letter] * 4, dtype=np.object_)
        tm.assert_numpy_array_equal(result[col], wanted)
def test_parse_booleans(self):
    """A column of ``True``/``False`` tokens is parsed to a bool array."""
    raw = 'True\nFalse\nTrue\nTrue'
    columns = TextReader(StringIO(raw), header=None).read()
    assert columns[0].dtype == np.bool_
def test_delimit_whitespace(self):
    """``delim_whitespace`` splits fields on mixed spaces and tabs."""
    data = 'a b\na\t\t "b"\n"a"\t \t b'
    reader = TextReader(StringIO(data), delim_whitespace=True,
                        header=None)
    result = reader.read()

    for col, letter in ((0, 'a'), (1, 'b')):
        wanted = np.array([letter] * 3, dtype=np.object_)
        tm.assert_numpy_array_equal(result[col], wanted)
def test_embedded_newline(self):
    """A newline inside a quoted field is kept as part of the value."""
    raw = 'a\n"hello\nthere"\nthis'
    columns = TextReader(StringIO(raw), header=None).read()

    wanted = np.array(['a', 'hello\nthere', 'this'], dtype=np.object_)
    tm.assert_numpy_array_equal(columns[0], wanted)
def test_euro_decimal(self):
    """A comma ``decimal`` separator yields floating-point values."""
    raw = '12345,67\n345,678'
    reader = TextReader(StringIO(raw), delimiter=':',
                        decimal=',', header=None)
    columns = reader.read()

    tm.assert_almost_equal(columns[0], np.array([12345.67, 345.678]))
def test_integer_thousands(self):
    """A comma ``thousands`` separator is removed from integer fields."""
    raw = '123,456\n12,500'
    reader = TextReader(StringIO(raw), delimiter=':',
                        thousands=',', header=None)
    columns = reader.read()

    wanted = np.array([123456, 12500], dtype=np.int64)
    tm.assert_almost_equal(columns[0], wanted)
def test_integer_thousands_alt(self):
    """``TextFileReader`` honours a dot as the ``thousands`` separator."""
    raw = '123.456\n12.500'
    reader = TextFileReader(StringIO(raw), delimiter=':',
                            thousands='.', header=None)
    frame = reader.read()

    tm.assert_frame_equal(frame, DataFrame([123456, 12500]))
@tm.capture_stderr
def test_skip_bad_lines(self):
    """Over-long rows raise by default, or are skipped (with an optional
    warning on stderr) when ``error_bad_lines=False``.
    """
    # too many lines, see #2430 for why
    data = ('a:b:c\n'
            'd:e:f\n'
            'g:h:i\n'
            'j:k:l:m\n'
            'l:m:n\n'
            'o:p:q:r')

    def make_reader(**extra):
        # Fresh reader over the same text for every scenario.
        return TextReader(StringIO(data), delimiter=':',
                          header=None, **extra)

    # Default behaviour: the bad rows raise.
    with pytest.raises(parser.ParserError):
        make_reader().read()

    # Bad rows dropped silently.
    result = make_reader(error_bad_lines=False,
                         warn_bad_lines=False).read()
    expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object),
                1: np.array(['b', 'e', 'h', 'm'], dtype=object),
                2: np.array(['c', 'f', 'i', 'n'], dtype=object)}
    assert_array_dicts_equal(result, expected)

    # Bad rows dropped with a message per skipped line on stderr.
    make_reader(error_bad_lines=False, warn_bad_lines=True).read()
    captured = sys.stderr.getvalue()

    assert 'Skipping line 4' in captured
    assert 'Skipping line 6' in captured
def test_header_not_enough_lines(self):
    """header=2 takes the third line as column names; data rows follow."""
    data = ('skip this\n'
            'skip this\n'
            'a,b,c\n'
            '1,2,3\n'
            '4,5,6')
    reader = TextReader(StringIO(data), delimiter=',', header=2)

    assert reader.header == [['a', 'b', 'c']]

    parsed = reader.read()
    wanted = {}
    for col, values in enumerate([[1, 4], [2, 5], [3, 6]]):
        wanted[col] = np.array(values, dtype=np.int64)
    assert_array_dicts_equal(parsed, wanted)
def test_escapechar(self):
    """An escaped opening quote is kept as a literal quote character."""
    data = ('\\"hello world\"\n'
            '\\"hello world\"\n'
            '\\"hello world\"')
    parsed = TextReader(StringIO(data), delimiter=',', header=None,
                        escapechar='\\').read()

    wanted = {0: np.array(['"hello world"'] * 3, dtype=object)}
    assert_array_dicts_equal(parsed, wanted)
def test_eof_has_eol(self):
    """Placeholder: no assertions are implemented yet."""
    # handling of new line at EOF
    pass
def test_na_substitution(self):
    """Placeholder: no assertions are implemented yet."""
    pass
def test_numpy_string_dtype(self):
    """Check string dtype specs ('S5,i4' and 'S4') passed via ``dtype``."""
    data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""

    def _make_reader(**kwds):
        # Fresh reader over the same text for every dtype spec.
        return TextReader(StringIO(data), delimiter=',', header=None,
                          **kwds)

    # Comma-separated spec: one dtype per column.
    reader = _make_reader(dtype='S5,i4')
    result = reader.read()

    assert result[0].dtype == 'S5'

    ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], dtype='S5')
    assert (result[0] == ex_values).all()
    assert result[1].dtype == 'i4'

    # Single spec: both columns are expected as 'S4', and the five-character
    # value is expected shortened to 'aaaa'.
    reader = _make_reader(dtype='S4')
    result = reader.read()
    assert result[0].dtype == 'S4'
    ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
    assert (result[0] == ex_values).all()
    assert result[1].dtype == 'S4'
def test_pass_dtype(self):
    """dtype mappings may be keyed by name or position, in several forms."""
    data = """\
one,two
1,a
2,b
3,c
4,d"""

    # (dtype mapping, dtype expected for the second column); the first
    # column is expected to be 'u1' in every case.
    cases = [
        ({'one': 'u1', 1: 'S1'}, 'S1'),
        ({'one': np.uint8, 1: object}, 'O'),
        ({'one': np.dtype('u1'), 1: np.dtype('O')}, 'O'),
    ]

    for spec, second_dtype in cases:
        parsed = TextReader(StringIO(data), delimiter=',',
                            dtype=spec).read()
        assert parsed[0].dtype == 'u1'
        assert parsed[1].dtype == second_dtype
def test_usecols(self):
    """usecols=(1, 2) yields exactly those two columns, values unchanged."""
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

    def _read(**kwds):
        return TextReader(StringIO(data), delimiter=',', **kwds).read()

    subset = _read(usecols=(1, 2))
    everything = _read()

    assert len(subset) == 2
    for col in (1, 2):
        assert (subset[col] == everything[col]).all()
def test_cr_delimited(self):
    """Carriage-return-only line endings parse like CRLF line endings."""
    def _check(text, **kwargs):
        crlf_text = text.replace('\r', '\r\n')
        actual = TextReader(StringIO(text), **kwargs).read()
        wanted = TextReader(StringIO(crlf_text), **kwargs).read()
        assert_array_dicts_equal(actual, wanted)

    sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r'
              'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r'
              ',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0')

    # Checked in this order; each entry is (text, reader keyword arguments).
    cases = [
        ('a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12', {'delimiter': ','}),
        ('a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12', {'delim_whitespace': True}),
        ('a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12', {'delimiter': ','}),
        (sample, {'delimiter': ','}),
        ('A B C\r 2 3\r4 5 6', {'delim_whitespace': True}),
        ('A B C\r2 3\r4 5 6', {'delim_whitespace': True}),
    ]
    for text, reader_kwargs in cases:
        _check(text, **reader_kwargs)
def test_empty_field_eof(self):
    """Empty trailing fields on the last line parse as empty values."""
    data = 'a,b,c\n1,2,3\n4,,'

    result = TextReader(StringIO(data), delimiter=',').read()

    expected = {0: np.array([1, 4], dtype=np.int64),
                1: np.array(['2', ''], dtype=object),
                2: np.array(['3', ''], dtype=object)}
    assert_array_dicts_equal(result, expected)

    # GH5664
    # Expected frames for the three short/ragged inputs read below.
    a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
    b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
                  columns=list('abcd'),
                  index=[1, 1])
    c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
                   [8, 9, 10, 11], [13, 14, nan, nan]],
                  columns=list('abcd'),
                  index=[0, 5, 7, 12])

    # NOTE(review): the 100 repetitions presumably aim to expose an
    # intermittent failure -- confirm against GH5664.
    for _ in range(100):
        df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
                      names=['a'], engine='c')
        assert_frame_equal(df, a)

        df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2),
                      names=list("abcd"), engine='c')
        assert_frame_equal(df, b)

        df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
                      names=list('abcd'), engine='c')
        assert_frame_equal(df, c)
def test_empty_csv_input(self):
    """An empty buffer with a chunksize still yields a TextFileReader."""
    # GH14867
    empty_source = StringIO()
    reader = read_csv(empty_source, header=None, names=['a', 'b', 'c'],
                      chunksize=20)

    assert isinstance(reader, TextFileReader)
def assert_array_dicts_equal(left, right):
    """Assert that every array in ``left`` equals its counterpart in ``right``.

    Parameters
    ----------
    left, right : dict
        Mappings from key to array-like. Each value is passed through
        ``np.asarray`` before comparison.

    Raises
    ------
    AssertionError
        Raised by ``tm.assert_numpy_array_equal`` when a pair differs.
    KeyError
        If a key of ``left`` is missing from ``right``.

    Notes
    -----
    Only the keys of ``left`` are visited; keys present solely in ``right``
    are not checked.
    """
    for key, values in left.items():
        # The helper reports a mismatch by raising, so its return value is
        # deliberately not asserted on: doing so would fail on equal arrays
        # whenever the helper returns None.
        tm.assert_numpy_array_equal(np.asarray(values),
                                    np.asarray(right[key]))
@@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-
"""
Tests that features that are currently unsupported in
either the Python or C parser are actually enforced
and are clearly communicated to the user.
Ultimately, the goal is to remove test cases from this
test suite as new feature support is added to the parsers.
"""
import pandas.io.parsers as parsers
import pandas.util.testing as tm
from pandas.compat import StringIO
from pandas.errors import ParserError
from pandas.io.parsers import read_csv, read_table
import pytest
@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
    """Yield each Python-based engine name ("python", "python-fwf") in turn.

    The parameter value doubles as the test id.
    """
    return request.param
class TestUnsupportedFeatures(object):
    """Options an engine cannot honour must raise (or warn) with a clear message."""

    def test_mangle_dupe_cols_false(self):
        """mangle_dupe_cols=False is rejected by both engines."""
        # see gh-12935
        csv_text = 'a b c\n1 2 3'
        pattern = 'is not supported'

        for parser_engine in ('c', 'python'):
            with tm.assert_raises_regex(ValueError, pattern):
                read_csv(StringIO(csv_text), engine=parser_engine,
                         mangle_dupe_cols=False)

    def test_c_engine(self):
        """Options the C engine lacks raise when forced, warn otherwise."""
        # see gh-6607
        data = 'a b c\n1 2 3'
        msg = 'does not support'

        # specify C engine with unsupported options (raise)
        forced_c_cases = [
            dict(sep=None, delim_whitespace=False),
            dict(sep=r'\s'),
            dict(quotechar=chr(128)),
            dict(skipfooter=1),
        ]
        for options in forced_c_cases:
            with tm.assert_raises_regex(ValueError, msg):
                read_table(StringIO(data), engine='c', **options)

        # specify C-unsupported options without python-unsupported options
        default_engine_cases = [
            dict(sep=None, delim_whitespace=False),
            dict(quotechar=chr(128)),
            dict(sep=r'\s'),
            dict(skipfooter=1),
        ]
        for options in default_engine_cases:
            with tm.assert_produces_warning(parsers.ParserWarning):
                read_table(StringIO(data), **options)

        text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
        msg = 'Error tokenizing data'

        # First with the default engine, then with the C engine forced.
        for engine_kwargs in ({}, {'engine': 'c'}):
            with tm.assert_raises_regex(ParserError, msg):
                read_table(StringIO(text), sep='\\s+', **engine_kwargs)

        msg = "Only length-1 thousands markers supported"
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        for marker in (',,', ''):
            with tm.assert_raises_regex(ValueError, msg):
                read_csv(StringIO(data), thousands=marker)

        msg = "Only length-1 line terminators supported"
        data = 'a,b,c~~1,2,3~~4,5,6'
        with tm.assert_raises_regex(ValueError, msg):
            read_csv(StringIO(data), lineterminator='~~')

    def test_python_engine(self, python_engine):
        """Every option listed in _python_unsupported raises for the engine."""
        from pandas.io.parsers import _python_unsupported as py_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for option in py_unsupported:
            expected_msg = ('The %r option is not supported '
                            'with the %r engine' % (option, python_engine))

            # Any non-default value will do, hence a bare object().
            with tm.assert_raises_regex(ValueError, expected_msg):
                read_csv(StringIO(data), engine=python_engine,
                         **{option: object()})

    def test_python_engine_file_no_next(self, python_engine):
        """A buffer that defines __iter__ but no next method is rejected."""
        # see gh-16530
        class NoNextBuffer(object):
            def __init__(self, csv_data):
                self.data = csv_data

            def __iter__(self):
                return self

            def read(self):
                return self.data

        buffer_without_next = NoNextBuffer("a\n1")
        expected_msg = "The 'python' engine cannot iterate"

        with tm.assert_raises_regex(ValueError, expected_msg):
            read_csv(buffer_without_next, engine=python_engine)
class TestDeprecatedFeatures(object):
    """Deprecated read_csv keywords must emit a FutureWarning."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    @pytest.mark.parametrize("kwargs", [{"tupleize_cols": True},
                                        {"tupleize_cols": False}])
    def test_deprecated_args(self, engine, kwargs):
        """Passing a deprecated keyword warns for every engine and value.

        Parameters
        ----------
        engine : str
            Parser engine handed to ``read_csv``.
        kwargs : dict
            Single-entry mapping holding the deprecated keyword and its value.
        """
        data = "1,2,3"

        # check_stacklevel=False: only the warning category is verified.
        with tm.assert_produces_warning(
                FutureWarning, check_stacklevel=False):
            read_csv(StringIO(data), engine=engine, **kwargs)
@@ -0,0 +1,549 @@
# -*- coding: utf-8 -*-
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
import pytest
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Index
from pandas._libs.tslib import Timestamp
from pandas.compat import StringIO
class UsecolsTests(object):
    """Tests for the ``usecols`` argument of ``read_csv``.

    The tests call ``self.read_csv``, which is not defined in this class
    (presumably supplied by the concrete parser test class -- confirm).
    """

    # Regex matched against the error for an invalid ``usecols`` argument.
    msg_validate_usecols_arg = ("'usecols' must either be list-like of all "
                                "strings, all unicode, all integers or a "
                                "callable.")
    # Regex template for missing columns; ``{0}`` receives the (escaped)
    # list of names that were requested but not found.
    msg_validate_usecols_names = ("Usecols do not match columns, columns "
                                  "expected but not found: {0}")
def test_raise_on_mixed_dtype_usecols(self):
    """Mixing integers and strings in usecols is rejected."""
    # See gh-12678
    data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
    mixed_selection = [0, 'b', 2]

    with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
        self.read_csv(StringIO(data), usecols=mixed_selection)
def test_usecols(self):
    """Select columns by position or name, with and without ``names``."""
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

    # Positional and by-name selection must agree.
    result = self.read_csv(StringIO(data), usecols=(1, 2))
    result2 = self.read_csv(StringIO(data), usecols=('b', 'c'))
    exp = self.read_csv(StringIO(data))

    assert len(result.columns) == 2
    assert (result['b'] == exp['b']).all()
    assert (result['c'] == exp['c']).all()

    tm.assert_frame_equal(result, result2)

    # ``names`` together with header=0 renames the selected columns.
    result = self.read_csv(StringIO(data), usecols=[1, 2], header=0,
                           names=['foo', 'bar'])
    expected = self.read_csv(StringIO(data), usecols=[1, 2])
    expected.columns = ['foo', 'bar']
    tm.assert_frame_equal(result, expected)

    # Same data without a header row.
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    result = self.read_csv(StringIO(data), names=['b', 'c'],
                           header=None, usecols=[1, 2])

    expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                             header=None)
    expected = expected[['b', 'c']]
    tm.assert_frame_equal(result, expected)

    result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                            header=None, usecols=['b', 'c'])
    tm.assert_frame_equal(result2, result)

    # see gh-5766
    result = self.read_csv(StringIO(data), names=['a', 'b'],
                           header=None, usecols=[0, 1])

    expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                             header=None)
    expected = expected[['a', 'b']]
    tm.assert_frame_equal(result, expected)

    # length conflict, passed names and usecols disagree
    pytest.raises(ValueError, self.read_csv, StringIO(data),
                  names=['a', 'b'], usecols=[1], header=None)
def test_usecols_single_string(self):
    """A bare string (not a list of strings) is rejected as usecols."""
    # GH 20558
    data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000
"""
    bare_string = 'foo'

    with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
        self.read_csv(StringIO(data), usecols=bare_string)
def test_usecols_index_col_False(self):
    """index_col=False works with usecols, even with trailing delimiters."""
    # see gh-9082
    well_formed = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    trailing_commas = "a,b,c,d\n1,2,3,4,\n5,6,7,8,"
    cols = ['a', 'c', 'd']
    expected = DataFrame({'a': [1, 5], 'c': [3, 7], 'd': [4, 8]})

    for text in (well_formed, trailing_commas):
        df = self.read_csv(StringIO(text), usecols=cols, index_col=False)
        tm.assert_frame_equal(expected, df)
def test_usecols_index_col_conflict(self):
    """index_col is resolved relative to the columns kept by usecols."""
    # see gh-4201: test that index_col as integer reflects usecols
    data = 'a,b,c,d\nA,a,1,one\nB,b,2,two'
    expected = DataFrame({'c': [1, 2]},
                         index=Index(['a', 'b'], name='b'))

    # Every (usecols, index_col) pairing must produce the same frame.
    pairings = [
        (['b', 'c'], 0),
        (['b', 'c'], 'b'),
        ([1, 2], 'b'),
        ([1, 2], 0),
    ]
    for selection, index_col in pairings:
        df = self.read_csv(StringIO(data), usecols=selection,
                           index_col=index_col)
        tm.assert_frame_equal(expected, df)

    # Multi-column index built from two of the selected columns.
    expected = DataFrame(
        {'b': ['a', 'b'], 'c': [1, 2], 'd': ('one', 'two')})
    expected = expected.set_index(['b', 'c'])
    df = self.read_csv(StringIO(data), usecols=['b', 'c', 'd'],
                       index_col=['b', 'c'])
    tm.assert_frame_equal(expected, df)
def test_usecols_implicit_index_col(self):
    """Rows with one extra leading field supply an implicit index."""
    # see gh-2654
    data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'

    expected = DataFrame({'a': ['apple', 'orange'],
                          'b': ['bat', 'cow']}, index=[4, 8])
    result = self.read_csv(StringIO(data), usecols=['a', 'b'])

    tm.assert_frame_equal(result, expected)
def test_usecols_regex_sep(self):
    """usecols works together with a regular-expression separator."""
    # see gh-2733
    data = 'a b c\n4 apple bat 5.7\n8 orange cow 10'

    expected = DataFrame({'a': ['apple', 'orange'],
                          'b': ['bat', 'cow']}, index=[4, 8])
    result = self.read_csv(StringIO(data), sep=r'\s+', usecols=('a', 'b'))

    tm.assert_frame_equal(result, expected)
def test_usecols_with_whitespace(self):
    """usecols works together with delim_whitespace=True."""
    data = 'a b c\n4 apple bat 5.7\n8 orange cow 10'

    expected = DataFrame({'a': ['apple', 'orange'],
                          'b': ['bat', 'cow']}, index=[4, 8])
    actual = self.read_csv(StringIO(data), delim_whitespace=True,
                           usecols=('a', 'b'))

    tm.assert_frame_equal(actual, expected)
def test_usecols_with_integer_like_header(self):
    """Integers select by position, digit strings select by header name."""
    data = """2,0,1
1000,2000,3000
4000,5000,6000
"""
    cases = [
        # column selection by index
        ([0, 1], [[1000, 2000], [4000, 5000]], ['2', '0']),
        # column selection by name
        (['0', '1'], [[2000, 3000], [5000, 6000]], ['0', '1']),
    ]

    for selection, rows, labels in cases:
        expected = DataFrame(data=rows, columns=labels)
        df = self.read_csv(StringIO(data), usecols=selection)
        tm.assert_frame_equal(df, expected)
def test_usecols_with_parse_dates(self):
    """Combine ``usecols`` with ``parse_dates`` in several layouts."""
    # See gh-9755
    s = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
    parse_dates = [[1, 2]]

    cols = {
        'a': [0, 0],
        'c_d': [
            Timestamp('2014-01-01 09:00:00'),
            Timestamp('2014-01-02 10:00:00')
        ]
    }
    expected = DataFrame(cols, columns=['c_d', 'a'])

    df = self.read_csv(StringIO(s), usecols=[0, 2, 3],
                       parse_dates=parse_dates)
    tm.assert_frame_equal(df, expected)

    # The order in which usecols is listed must not change the result.
    df = self.read_csv(StringIO(s), usecols=[3, 0, 2],
                       parse_dates=parse_dates)
    tm.assert_frame_equal(df, expected)

    # See gh-13604
    s = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65
"""
    parse_dates = [0]
    names = ['date', 'values']
    usecols = names[:]

    index = Index([Timestamp('2008-02-07 09:40'),
                   Timestamp('2008-02-07 09:50'),
                   Timestamp('2008-02-07 10:00')],
                  name='date')
    cols = {'values': [1032.43, 1042.54, 1051.65]}
    expected = DataFrame(cols, index=index)

    df = self.read_csv(StringIO(s), parse_dates=parse_dates, index_col=0,
                       usecols=usecols, header=None, names=names)
    tm.assert_frame_equal(df, expected)

    # See gh-14792
    s = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""
    parse_dates = [0]
    usecols = list('abcdefghij')
    cols = {'a': Timestamp('2016-09-21'),
            'b': [1], 'c': [1], 'd': [2],
            'e': [3], 'f': [4], 'g': [5],
            'h': [6], 'i': [7], 'j': [8]}
    expected = DataFrame(cols, columns=usecols)
    df = self.read_csv(StringIO(s), usecols=usecols,
                       parse_dates=parse_dates)
    tm.assert_frame_equal(df, expected)

    # Same data, but columns 0 and 1 are combined into one 'a_b' column;
    # the expected value for it is the joined string, not a Timestamp.
    s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
    parse_dates = [[0, 1]]
    usecols = list('abcdefghij')

    cols = {'a_b': '2016/09/21 1',
            'c': [1], 'd': [2], 'e': [3], 'f': [4],
            'g': [5], 'h': [6], 'i': [7], 'j': [8]}
    expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
    df = self.read_csv(StringIO(s), usecols=usecols,
                       parse_dates=parse_dates)
    tm.assert_frame_equal(df, expected)
def test_usecols_with_parse_dates_and_full_names(self):
    """parse_dates + usecols when names covers every column of the file."""
    # See gh-9755
    s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
    parse_dates = [[1, 2]]
    names = list('abcde')

    stamps = [Timestamp('2014-01-01 09:00:00'),
              Timestamp('2014-01-02 10:00:00')]
    expected = DataFrame({'a': [0, 0], 'c_d': stamps},
                         columns=['c_d', 'a'])

    # The listing order of usecols must not matter.
    for selection in ([0, 2, 3], [3, 0, 2]):
        df = self.read_csv(StringIO(s), names=names,
                           usecols=selection,
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)
def test_usecols_with_parse_dates_and_usecol_names(self):
    """parse_dates + usecols when names covers only the selected columns."""
    # See gh-9755
    s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
    parse_dates = [[1, 2]]
    names = list('acd')

    stamps = [Timestamp('2014-01-01 09:00:00'),
              Timestamp('2014-01-02 10:00:00')]
    expected = DataFrame({'a': [0, 0], 'c_d': stamps},
                         columns=['c_d', 'a'])

    # The listing order of usecols must not matter.
    for selection in ([0, 2, 3], [3, 0, 2]):
        df = self.read_csv(StringIO(s), names=names,
                           usecols=selection,
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)
def test_usecols_with_unicode_strings(self):
    """Column names given as unicode literals select the right columns."""
    # see gh-13219
    csv_text = '''AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''
    floats = {0: 0.056674972999999997,
              1: 2.6132309819999997,
              2: 3.5689350380000002}
    ints = {0: 8, 1: 2, 2: 7}
    expected = DataFrame({'AAA': floats, 'BBB': ints})

    df = self.read_csv(StringIO(csv_text), usecols=[u'AAA', u'BBB'])
    tm.assert_frame_equal(df, expected)
def test_usecols_with_single_byte_unicode_strings(self):
    """One-character unicode column names select the right columns."""
    # see gh-13219
    csv_text = '''A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''
    floats = {0: 0.056674972999999997,
              1: 2.6132309819999997,
              2: 3.5689350380000002}
    ints = {0: 8, 1: 2, 2: 7}
    expected = DataFrame({'A': floats, 'B': ints})

    df = self.read_csv(StringIO(csv_text), usecols=[u'A', u'B'])
    tm.assert_frame_equal(df, expected)
def test_usecols_with_mixed_encoding_strings(self):
    """Mixing unicode and bytes names in usecols is rejected."""
    csv_text = '''AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''
    # Both orderings of the unicode/bytes mix must fail the same way.
    for mixed in ([u'AAA', b'BBB'], [b'AAA', u'BBB']):
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_arg):
            self.read_csv(StringIO(csv_text), usecols=mixed)
def test_usecols_with_multibyte_characters(self):
    """Column names made of multibyte characters can be used in usecols."""
    s = '''あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''
    data = {
        'あああ': {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002
        },
        'いい': {0: 8, 1: 2, 2: 7}
    }
    expected = DataFrame(data)

    # Names are plain (un-prefixed) string literals here.
    df = self.read_csv(StringIO(s), usecols=['あああ', 'いい'])
    tm.assert_frame_equal(df, expected)
def test_usecols_with_multibyte_unicode_characters(self):
    """Multibyte names passed as ``u''`` literals (currently skipped)."""
    # The skip below runs first, so nothing after it is executed.
    pytest.skip('TODO: see gh-13253')

    s = '''あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''
    data = {
        'あああ': {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002
        },
        'いい': {0: 8, 1: 2, 2: 7}
    }
    expected = DataFrame(data)

    df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい'])
    tm.assert_frame_equal(df, expected)
def test_empty_usecols(self):
    """An empty usecols set gives an empty DataFrame instead of raising."""
    # should not raise
    data = 'a,b,c\n1,2,3\n4,5,6'
    nothing_selected = set()

    result = self.read_csv(StringIO(data), usecols=nothing_selected)
    tm.assert_frame_equal(result, DataFrame())
def test_np_array_usecols(self):
    """usecols may be given as a NumPy array of column names."""
    # See gh-12546
    data = 'a,b,c\n1,2,3'
    selection = np.array(['a', 'b'])

    expected = DataFrame([[1, 2]], columns=selection)
    result = self.read_csv(StringIO(data), usecols=selection)

    tm.assert_frame_equal(result, expected)
def test_callable_usecols(self):
    """usecols may be a callable evaluated against each column name."""
    # See gh-14154
    s = '''AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

    data = {
        'AaA': {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002
        },
        'bBb': {0: 8, 1: 2, 2: 7},
        'ddd': {0: 'a', 1: 'b', 2: 'a'}
    }
    expected = DataFrame(data)
    # Case-insensitive match: 'CCC' is the only column left out.
    df = self.read_csv(StringIO(s), usecols=lambda x:
                       x.upper() in ['AAA', 'BBB', 'DDD'])
    tm.assert_frame_equal(df, expected)

    # Check that a callable returning only False returns
    # an empty DataFrame
    expected = DataFrame()
    df = self.read_csv(StringIO(s), usecols=lambda x: False)
    tm.assert_frame_equal(df, expected)
def test_incomplete_first_row(self):
    """A short first row yields NaN for a selected column it lacks."""
    # see gh-6710
    data = '1,2\n1,2,3'
    names = ['a', 'b', 'c']
    expected = DataFrame({'a': [1, 1],
                          'c': [np.nan, 3]})

    # The same selection expressed as a list and as a callable.
    selections = (['a', 'c'], lambda x: x in ['a', 'c'])
    for selection in selections:
        df = self.read_csv(StringIO(data), names=names, usecols=selection)
        tm.assert_frame_equal(df, expected)
def test_uneven_length_cols(self):
    """Extra trailing fields on some rows are ignored under usecols."""
    # see gh-8985
    positional = [0, 1, 2]
    data = '19,29,39\n' * 2 + '10,20,30,40'
    expected = DataFrame([[19, 29, 39],
                          [19, 29, 39],
                          [10, 20, 30]])
    result = self.read_csv(StringIO(data), header=None, usecols=positional)
    tm.assert_frame_equal(result, expected)

    # see gh-9549
    by_name = ['A', 'B', 'C']
    data = ('A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n'
            '1,2,3,,,1,\n1,2,3\n5,6,7')
    expected = DataFrame({'A': [1, 3, 1, 1, 1, 5],
                          'B': [2, 4, 2, 2, 2, 6],
                          'C': [3, 5, 4, 3, 3, 7]})
    result = self.read_csv(StringIO(data), usecols=by_name)
    tm.assert_frame_equal(result, expected)
def test_raise_on_usecols_names_mismatch(self):
    """Names in usecols that match no column raise a ValueError.

    The expected message lists the names that were not found.
    """
    # GH 14671
    data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'

    # Baseline: every requested name exists.
    usecols = ['a', 'b', 'c', 'd']
    df = self.read_csv(StringIO(data), usecols=usecols)
    expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
                          'd': [4, 8]})
    tm.assert_frame_equal(df, expected)

    usecols = ['a', 'b', 'c', 'f']
    with tm.assert_raises_regex(ValueError,
                                self.msg_validate_usecols_names.format(
                                    r"\['f'\]")):
        self.read_csv(StringIO(data), usecols=usecols)

    usecols = ['a', 'b', 'f']
    with tm.assert_raises_regex(ValueError,
                                self.msg_validate_usecols_names.format(
                                    r"\['f'\]")):
        self.read_csv(StringIO(data), usecols=usecols)

    # Two missing names: the pattern accepts either listing order.
    usecols = ['a', 'b', 'f', 'g']
    with tm.assert_raises_regex(ValueError,
                                self.msg_validate_usecols_names.format(
                                    r"\[('f', 'g'|'g', 'f')\]")):
        self.read_csv(StringIO(data), usecols=usecols)

    # From here on the header row is replaced by explicit ``names``.
    names = ['A', 'B', 'C', 'D']

    df = self.read_csv(StringIO(data), header=0, names=names)
    expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7],
                          'D': [4, 8]})
    tm.assert_frame_equal(df, expected)

    # TODO: https://github.com/pandas-dev/pandas/issues/16469
    # usecols = ['A','C']
    # df = self.read_csv(StringIO(data), header=0, names=names,
    # usecols=usecols)
    # expected = DataFrame({'A': [1,5], 'C': [3,7]})
    # tm.assert_frame_equal(df, expected)
    #
    # usecols = [0,2]
    # df = self.read_csv(StringIO(data), header=0, names=names,
    # usecols=usecols)
    # expected = DataFrame({'A': [1,5], 'C': [3,7]})
    # tm.assert_frame_equal(df, expected)

    usecols = ['A', 'B', 'C', 'f']
    with tm.assert_raises_regex(ValueError,
                                self.msg_validate_usecols_names.format(
                                    r"\['f'\]")):
        self.read_csv(StringIO(data), header=0, names=names,
                      usecols=usecols)

    # Same check without header=0.
    usecols = ['A', 'B', 'f']
    with tm.assert_raises_regex(ValueError,
                                self.msg_validate_usecols_names.format(
                                    r"\['f'\]")):
        self.read_csv(StringIO(data), names=names, usecols=usecols)
@@ -0,0 +1,16 @@
from pandas.compat import StringIO
from pandas import read_sas
import pandas.util.testing as tm
class TestSas(object):
    """read_sas checks that do not depend on a particular file format."""

    def test_sas_buffer_format(self):
        """A buffer passed without ``format`` is rejected with a clear error."""
        # see gh-14947
        empty_buffer = StringIO("")
        expected_msg = ("If this is a buffer object rather than a string "
                        "name, you must specify a format string")

        with tm.assert_raises_regex(ValueError, expected_msg):
            read_sas(empty_buffer)
@@ -0,0 +1,190 @@
import pandas as pd
from pandas.compat import PY2
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas.errors import EmptyDataError
import os
import io
import numpy as np
import pytest
class TestSAS7BDAT(object):
    """Read the test*.sas7bdat files and compare them with CSV references."""

    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        """Load both CSV reference frames before each test.

        ``self.data[j]`` is the reference for the files numbered in
        ``self.test_ix[j]`` (test1..test15 and test16 respectively).
        """
        self.dirpath = datapath("io", "sas", "data")
        self.data = []
        self.test_ix = [list(range(1, 16)), [16]]
        for j in 1, 2:
            fname = os.path.join(
                self.dirpath, "test_sas7bdat_{j}.csv".format(j=j))
            df = pd.read_csv(fname)
            # Column4 and Column12 hold day counts, converted here to
            # datetimes relative to 1960-01-01.
            epoch = pd.datetime(1960, 1, 1)
            t1 = pd.to_timedelta(df["Column4"], unit='d')
            df["Column4"] = epoch + t1
            t2 = pd.to_timedelta(df["Column12"], unit='d')
            df["Column12"] = epoch + t2
            for k in range(df.shape[1]):
                col = df.iloc[:, k]
                if col.dtype == np.int64:
                    # Integer columns are compared as float64.
                    df.iloc[:, k] = df.iloc[:, k].astype(np.float64)
                elif col.dtype == np.dtype('O'):
                    if PY2:
                        # On Python 2, decode byte strings to unicode.
                        f = lambda x: (x.decode('utf-8') if
                                       isinstance(x, str) else x)
                        df.iloc[:, k] = df.iloc[:, k].apply(f)
            self.data.append(df)

    def test_from_file(self):
        """Reading from a path string matches the reference frame."""
        for j in 0, 1:
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = os.path.join(
                    self.dirpath, "test{k}.sas7bdat".format(k=k))
                df = pd.read_sas(fname, encoding='utf-8')
                tm.assert_frame_equal(df, df0)

    def test_from_buffer(self):
        """Reading from an in-memory bytes buffer matches the reference."""
        for j in 0, 1:
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = os.path.join(
                    self.dirpath, "test{k}.sas7bdat".format(k=k))
                with open(fname, 'rb') as f:
                    byts = f.read()
                buf = io.BytesIO(byts)
                # A buffer has no file name, so the format is given.
                rdr = pd.read_sas(buf, format="sas7bdat",
                                  iterator=True, encoding='utf-8')
                df = rdr.read()
                tm.assert_frame_equal(df, df0, check_exact=False)
                rdr.close()

    def test_from_iterator(self):
        """Successive read(n) calls return consecutive row slices."""
        for j in 0, 1:
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = os.path.join(
                    self.dirpath, "test{k}.sas7bdat".format(k=k))
                rdr = pd.read_sas(fname, iterator=True, encoding='utf-8')
                df = rdr.read(2)
                tm.assert_frame_equal(df, df0.iloc[0:2, :])
                df = rdr.read(3)
                tm.assert_frame_equal(df, df0.iloc[2:5, :])
                rdr.close()

    @td.skip_if_no('pathlib')
    def test_path_pathlib(self):
        """A pathlib.Path is accepted as the file argument."""
        from pathlib import Path
        for j in 0, 1:
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = Path(os.path.join(
                    self.dirpath, "test{k}.sas7bdat".format(k=k)))
                df = pd.read_sas(fname, encoding='utf-8')
                tm.assert_frame_equal(df, df0)

    @td.skip_if_no('py.path')
    def test_path_localpath(self):
        """A py.path.local object is accepted as the file argument."""
        from py.path import local as LocalPath
        for j in 0, 1:
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = LocalPath(os.path.join(
                    self.dirpath, "test{k}.sas7bdat".format(k=k)))
                df = pd.read_sas(fname, encoding='utf-8')
                tm.assert_frame_equal(df, df0)

    def test_iterator_loop(self):
        """Chunked iteration visits every row for several chunk sizes."""
        # github #13654
        for j in 0, 1:
            for k in self.test_ix[j]:
                for chunksize in 3, 5, 10, 11:
                    fname = os.path.join(
                        self.dirpath, "test{k}.sas7bdat".format(k=k))
                    # Pass the loop variable through; a fixed value here
                    # would leave the other chunk sizes untested.
                    rdr = pd.read_sas(fname, chunksize=chunksize,
                                      encoding='utf-8')
                    y = 0
                    for x in rdr:
                        y += x.shape[0]
                    assert y == rdr.row_count
                    rdr.close()

    def test_iterator_read_too_much(self):
        """Requesting more rows than the file holds gives consistent frames."""
        # github #14734
        k = self.test_ix[0][0]
        fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))
        rdr = pd.read_sas(fname, format="sas7bdat",
                          iterator=True, encoding='utf-8')
        d1 = rdr.read(rdr.row_count + 20)
        rdr.close()

        # Second pass without an explicit format.
        rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
        d2 = rdr.read(rdr.row_count + 20)
        tm.assert_frame_equal(d1, d2)
        rdr.close()
def test_encoding_options(datapath):
    """Decoding after the read matches passing encoding='utf-8' up front."""
    path = datapath("io", "sas", "data", "test1.sas7bdat")
    raw = pd.read_sas(path)
    decoded = pd.read_sas(path, encoding='utf-8')

    for name in raw.columns:
        try:
            raw[name] = raw[name].str.decode('utf-8')
        except AttributeError:
            # Column has no .str accessor; leave it untouched.
            pass
    tm.assert_frame_equal(raw, decoded)

    from pandas.io.sas.sas7bdat import SAS7BDATReader
    reader = SAS7BDATReader(path, convert_header_text=False)
    undecoded_header = reader.read()
    reader.close()

    # With convert_header_text=False the column labels need decoding.
    for text_label, byte_label in zip(raw.columns, undecoded_header.columns):
        assert text_label == byte_label.decode()
def test_productsales(datapath):
    """productsales.sas7bdat matches its CSV export."""
    sas_path = datapath("io", "sas", "data", "productsales.sas7bdat")
    actual = pd.read_sas(sas_path, encoding='utf-8')

    csv_path = datapath("io", "sas", "data", "productsales.csv")
    expected = pd.read_csv(csv_path, parse_dates=['MONTH'])
    # These columns are cast to float64 before the comparison.
    numeric_cols = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
    expected[numeric_cols] = expected[numeric_cols].astype(np.float64)

    tm.assert_frame_equal(actual, expected)
def test_12659(datapath):
    """test_12659.sas7bdat matches its CSV export cast to float64."""
    sas_path = datapath("io", "sas", "data", "test_12659.sas7bdat")
    actual = pd.read_sas(sas_path)

    csv_path = datapath("io", "sas", "data", "test_12659.csv")
    expected = pd.read_csv(csv_path).astype(np.float64)

    tm.assert_frame_equal(actual, expected)
def test_airline(datapath):
    """airline.sas7bdat matches its CSV export (inexact comparison)."""
    sas_path = datapath("io", "sas", "data", "airline.sas7bdat")
    actual = pd.read_sas(sas_path)

    csv_path = datapath("io", "sas", "data", "airline.csv")
    expected = pd.read_csv(csv_path).astype(np.float64)

    tm.assert_frame_equal(actual, expected, check_exact=False)
def test_date_time(datapath):
    """SAS date/datetime columns match the parsed CSV reference."""
    # Support of different SAS date/datetime formats (PR #15871)
    sas_path = datapath("io", "sas", "data", "datetime.sas7bdat")
    actual = pd.read_sas(sas_path)

    csv_path = datapath("io", "sas", "data", "datetime.csv")
    date_columns = ['Date1', 'Date2', 'DateTime', 'DateTimeHi', 'Taiw']
    expected = pd.read_csv(csv_path, parse_dates=date_columns)

    # GH 19732: Timestamps imported from sas will incur floating point errors
    actual.iloc[:, 3] = actual.iloc[:, 3].dt.round('us')
    tm.assert_frame_equal(actual, expected)
def test_zero_variables(datapath):
    """A SAS file with zero variables raises EmptyDataError."""
    # Check if the SAS file has zero variables (PR #18184)
    empty_file = datapath("io", "sas", "data", "zero_variables.sas7bdat")

    with pytest.raises(EmptyDataError):
        pd.read_sas(empty_file)
@@ -0,0 +1,143 @@
import pytest
import pandas as pd
import pandas.util.testing as tm
from pandas.io.sas.sasreader import read_sas
import numpy as np
import os
# CSV versions of test xpt files were obtained using the R foreign library
# Numbers in a SAS xport file are always float64, so need to convert
# before making comparisons.
def numeric_as_float(data):
    """Convert every int64 column of ``data`` to float64, in place.

    Parameters
    ----------
    data : DataFrame
        Frame to modify. Columns of any other dtype are left untouched.

    Returns
    -------
    None
    """
    for v in data.columns:
        # Compare dtypes by equality, not identity: ``is`` only works as
        # long as both sides happen to be the very same dtype object.
        if data[v].dtype == np.dtype('int64'):
            data[v] = data[v].astype(np.float64)
class TestXport(object):
@pytest.fixture(autouse=True)
def setup_method(self, datapath):
self.dirpath = datapath("io", "sas", "data")
self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt")
self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt")
self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt")
self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt")
def test1_basic(self):
# Tests with DEMO_G.xpt (all numeric file)
# Compare to this
data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
numeric_as_float(data_csv)
# Read full file
data = read_sas(self.file01, format="xport")
tm.assert_frame_equal(data, data_csv)
num_rows = data.shape[0]
# Test reading beyond end of file
reader = read_sas(self.file01, format="xport", iterator=True)
data = reader.read(num_rows + 100)
assert data.shape[0] == num_rows
reader.close()
# Test incremental read with `read` method.
reader = read_sas(self.file01, format="xport", iterator=True)
data = reader.read(10)
reader.close()
tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
# Test incremental read with `get_chunk` method.
reader = read_sas(self.file01, format="xport", chunksize=10)
data = reader.get_chunk()
reader.close()
tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
# Test read in loop
m = 0
reader = read_sas(self.file01, format="xport", chunksize=100)
for x in reader:
m += x.shape[0]
reader.close()
assert m == num_rows
# Read full file with `read_sas` method
data = read_sas(self.file01)
tm.assert_frame_equal(data, data_csv)
def test1_index(self):
# Tests with DEMO_G.xpt using index (all numeric file)
# Compare to this
data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
data_csv = data_csv.set_index("SEQN")
numeric_as_float(data_csv)
# Read full file
data = read_sas(self.file01, index="SEQN", format="xport")
tm.assert_frame_equal(data, data_csv, check_index_type=False)
# Test incremental read with `read` method.
reader = read_sas(self.file01, index="SEQN", format="xport",
iterator=True)
data = reader.read(10)
reader.close()
tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
check_index_type=False)
# Test incremental read with `get_chunk` method.
reader = read_sas(self.file01, index="SEQN", format="xport",
chunksize=10)
data = reader.get_chunk()
reader.close()
tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
check_index_type=False)
def test1_incremental(self):
# Test with DEMO_G.xpt, reading full file incrementally
data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
data_csv = data_csv.set_index("SEQN")
numeric_as_float(data_csv)
reader = read_sas(self.file01, index="SEQN", chunksize=1000)
all_data = [x for x in reader]
data = pd.concat(all_data, axis=0)
tm.assert_frame_equal(data, data_csv, check_index_type=False)
def test2(self):
    """Read SSHSV1_A.xpt in one call and compare with its CSV copy."""
    expected = pd.read_csv(self.file02.replace(".xpt", ".csv"))
    numeric_as_float(expected)

    actual = read_sas(self.file02)
    tm.assert_frame_equal(actual, expected)
def test_multiple_types(self):
    """Read DRXFCD_G.xpt (text and numeric variables) as UTF-8."""
    csv_path = self.file03.replace(".xpt", ".csv")
    expected = pd.read_csv(csv_path)

    actual = read_sas(self.file03, encoding="utf-8")
    tm.assert_frame_equal(actual, expected)
def test_truncated_float_support(self):
    """Read an xport file whose floats are truncated (GH 11713).

    paxraw_d_short.xpt is a shortened version of
    http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP and stores
    its floats in 5 bytes.
    """
    csv_path = self.file04.replace(".xpt", ".csv")
    expected = pd.read_csv(csv_path)

    actual = read_sas(self.file04, format="xport")
    # The values are compared after casting the result to int64.
    tm.assert_frame_equal(actual.astype('int64'), expected)
@@ -0,0 +1,184 @@
# -*- coding: utf-8 -*-
import numpy as np
from numpy.random import randint
from textwrap import dedent
import pytest
import pandas as pd
from pandas import DataFrame
from pandas import read_clipboard
from pandas import get_option
from pandas.compat import PY2
from pandas.util import testing as tm
from pandas.util.testing import makeCustomDataframe as mkdf
from pandas.io.clipboard.exceptions import PyperclipException
from pandas.io.clipboard import clipboard_set, clipboard_get
# Probe the clipboard once at import time: if copying a tiny frame fails
# with one of the expected errors, the clipboard tests below are skipped.
try:
    DataFrame({'A': [1, 2]}).to_clipboard()
except (PyperclipException, RuntimeError):
    _DEPS_INSTALLED = 0
else:
    _DEPS_INSTALLED = 1
def build_kwargs(sep, excel):
    """Collect keyword arguments, dropping any given as ``'default'``.

    Parameters
    ----------
    sep, excel : object
        Candidate values; the literal string ``'default'`` means "do not
        pass this argument at all".

    Returns
    -------
    dict
        Maps ``'excel'`` and/or ``'sep'`` to the supplied values.
    """
    candidates = (('excel', excel), ('sep', sep))
    return {name: value for name, value in candidates
            if value != 'default'}
@pytest.fixture(params=['delims', 'utf8', 'string', 'long', 'nonascii',
                        'colwidth', 'mixed', 'float', 'int'])
def df(request):
    """Return the DataFrame selected by ``request.param``.

    Raises
    ------
    ValueError
        If the parameter is not one of the names handled below.
    """
    kind = request.param
    # Index options shared by every ``mkdf`` call below.
    idx_opts = dict(c_idx_type='s', r_idx_type='i',
                    c_idx_names=[None], r_idx_names=[None])

    if kind == 'delims':
        return pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'],
                             'b': ['hi\'j', 'k\'\'lm']})
    if kind == 'utf8':
        return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'],
                             'b': ['øπ∆˚¬', 'œ∑´®']})
    if kind == 'string':
        return mkdf(5, 3, **idx_opts)
    if kind == 'long':
        # One row more than the ``display.max_rows`` option.
        n_rows = get_option('display.max_rows') + 1
        return mkdf(n_rows, 3,
                    data_gen_f=lambda *args: randint(2),
                    **idx_opts)
    if kind == 'nonascii':
        return pd.DataFrame({'en': 'in English'.split(),
                             'es': 'en español'.split()})
    if kind == 'colwidth':
        # Cells one character wider than ``display.max_colwidth``.
        _cw = get_option('display.max_colwidth') + 1
        return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw,
                    **idx_opts)
    if kind == 'mixed':
        return DataFrame({'a': np.arange(1.0, 6.0) + 0.01,
                          'b': np.arange(1, 6),
                          'c': list('abcde')})
    if kind == 'float':
        return mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01,
                    **idx_opts)
    if kind == 'int':
        return mkdf(5, 3, data_gen_f=lambda *args: randint(2),
                    **idx_opts)
    raise ValueError
@pytest.mark.single
@pytest.mark.skipif(not _DEPS_INSTALLED,
                    reason="clipboard primitives not installed")
class TestClipboard(object):
    """Tests for ``to_clipboard`` / ``read_clipboard`` round trips."""

    def check_round_trip_frame(self, data, excel=None, sep=None,
                               encoding=None):
        """Copy ``data`` to the clipboard, read it back and compare.

        Reading falls back to a tab separator when ``sep`` is falsy;
        dtypes are not compared.
        """
        data.to_clipboard(excel=excel, sep=sep, encoding=encoding)
        read_sep = sep or '\t'
        round_tripped = read_clipboard(sep=read_sep, index_col=0,
                                       encoding=encoding)
        tm.assert_frame_equal(data, round_tripped, check_dtype=False)

    def test_round_trip_frame(self, df):
        """Default arguments copy as tab delimited."""
        self.check_round_trip_frame(df)

    @pytest.mark.parametrize('sep', ['\t', ',', '|'])
    def test_round_trip_frame_sep(self, df, sep):
        """Explicit delimiters are respected."""
        self.check_round_trip_frame(df, sep=sep)

    def test_round_trip_frame_string(self, df):
        """White space separated copy reads back with the same rendering."""
        df.to_clipboard(excel=False, sep=None)
        read_back = read_clipboard()
        assert df.to_string() == read_back.to_string()
        assert df.shape == read_back.shape

    def test_excel_sep_warning(self, df):
        """A two character separator must warn, not pass silently.

        Two character separators are not supported in ``to_clipboard``.
        """
        with tm.assert_produces_warning():
            df.to_clipboard(excel=True, sep=r'\t')

    def test_copy_delim_warning(self, df):
        """A separator combined with ``excel=False`` produces a warning."""
        with tm.assert_produces_warning():
            df.to_clipboard(excel=False, sep='\t')

    @pytest.mark.parametrize('sep', ['\t', None, 'default'])
    @pytest.mark.parametrize('excel', [True, None, 'default'])
    def test_clipboard_copy_tabs_default(self, sep, excel, df):
        """The default behavior of ``to_clipboard`` is tab delimited."""
        df.to_clipboard(**build_kwargs(sep, excel))
        copied = clipboard_get()
        if PY2:
            # to_clipboard copies unicode, to_csv produces bytes. This is
            # expected behavior
            copied = copied.encode('utf-8')
        assert copied == df.to_csv(sep='\t')

    @pytest.mark.parametrize('sep', [None, 'default'])
    @pytest.mark.parametrize('excel', [False])
    def test_clipboard_copy_strings(self, sep, excel, df):
        """White space separated tables can be read back."""
        df.to_clipboard(**build_kwargs(sep, excel))
        read_back = read_clipboard(sep=r'\s+')
        assert read_back.to_string() == df.to_string()
        assert df.shape == read_back.shape

    def test_read_clipboard_infer_excel(self):
        """Excel-style clipboard text is parsed into the expected cells."""
        # gh-19010: avoid warnings
        clip_kwargs = dict(engine="python")

        excel_text = dedent("""
John James Charlie Mingus
1 2
4 Harry Carney
""".strip())
        clipboard_set(excel_text)
        parsed = pd.read_clipboard(**clip_kwargs)

        # excel data is parsed correctly
        assert parsed.iloc[1][1] == 'Harry Carney'

        # having diff tab counts doesn't trigger it
        uneven_text = dedent("""
a\t b
1 2
3 4
""".strip())
        clipboard_set(uneven_text)
        res = pd.read_clipboard(**clip_kwargs)

        plain_text = dedent("""
a b
1 2
3 4
""".strip())
        clipboard_set(plain_text)
        exp = pd.read_clipboard(**clip_kwargs)

        tm.assert_frame_equal(res, exp)

    def test_invalid_encoding(self, df):
        """An 'ascii' encoding is rejected on both write and read."""
        with pytest.raises(ValueError):
            df.to_clipboard(encoding='ascii')
        with pytest.raises(NotImplementedError):
            pd.read_clipboard(encoding='ascii')

    @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8'])
    def test_round_trip_valid_encodings(self, enc, df):
        """Each accepted spelling of UTF-8 round-trips."""
        self.check_round_trip_frame(df, encoding=enc)
@@ -0,0 +1,288 @@
"""
Tests for the pandas.io.common functionalities
"""
import mmap
import pytest
import os
from os.path import isabs
import pandas as pd
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas.io import common
from pandas.compat import is_platform_windows, StringIO, FileNotFoundError
from pandas import read_csv, concat
class CustomFSPath(object):
    """Minimal path-like object for testing fspath on unknown objects."""

    def __init__(self, path):
        # The string handed back by ``__fspath__``.
        self.path = path

    def __fspath__(self):
        """Return the wrapped path unchanged."""
        return self.path
# Callables that take a string path and return a string or path-like
# object; the optional ones are appended only when they can be imported.
path_types = [str, CustomFSPath]

try:
    from pathlib import Path
except ImportError:
    pass
else:
    path_types.append(Path)

try:
    from py.path import local as LocalPath
except ImportError:
    pass
else:
    path_types.append(LocalPath)

# Absolute directory containing this module.
HERE = os.path.abspath(os.path.dirname(__file__))
class TestCommonIOCapabilities(object):
    """Tests for path handling in ``pandas.io.common`` and the readers/writers."""

    # Small CSV payload used by ``test_iterator``.
    data1 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    def test_expand_user(self):
        """A ``~`` path is expanded to the absolute ``expanduser`` result."""
        filename = '~/sometest'
        expanded = common._expand_user(filename)

        assert expanded != filename
        assert isabs(expanded)
        assert os.path.expanduser(filename) == expanded

    def test_expand_user_normal_path(self):
        """A path without ``~`` comes back unchanged."""
        filename = '/somefolder/sometest'
        expanded = common._expand_user(filename)

        assert expanded == filename
        assert os.path.expanduser(filename) == expanded

    @td.skip_if_no('pathlib')
    def test_stringify_path_pathlib(self):
        """``pathlib.Path`` objects are turned into their string form."""
        current_dir = common._stringify_path(Path('.'))
        assert current_dir == '.'

        collapsed = common._stringify_path(Path('foo//bar'))
        assert collapsed == os.path.join('foo', 'bar')

    @td.skip_if_no('py.path')
    def test_stringify_path_localpath(self):
        """A ``py.path.local`` object stringifies to the absolute path."""
        relative = os.path.join('foo', 'bar')
        absolute = os.path.abspath(relative)
        assert common._stringify_path(LocalPath(relative)) == absolute

    def test_stringify_path_fspath(self):
        """Objects exposing ``__fspath__`` stringify to its return value."""
        custom = CustomFSPath('foo/bar.csv')
        assert common._stringify_path(custom) == 'foo/bar.csv'

    @pytest.mark.parametrize('extension,expected', [
        ('', None),
        ('.gz', 'gzip'),
        ('.bz2', 'bz2'),
        ('.zip', 'zip'),
        ('.xz', 'xz'),
    ])
    @pytest.mark.parametrize('path_type', path_types)
    def test_infer_compression_from_path(self, extension, expected, path_type):
        """Compression is inferred from the extension for every path type."""
        target = path_type('foo/bar.csv' + extension)
        inferred = common._infer_compression(target, compression='infer')
        assert inferred == expected

    def test_get_filepath_or_buffer_with_path(self):
        """A ``~`` path is expanded and is not flagged for closing."""
        filename = '~/sometest'
        result = common.get_filepath_or_buffer(filename)
        filepath_or_buffer, _, _, should_close = result

        assert filepath_or_buffer != filename
        assert isabs(filepath_or_buffer)
        assert os.path.expanduser(filename) == filepath_or_buffer
        assert not should_close

    def test_get_filepath_or_buffer_with_buffer(self):
        """A buffer is returned as-is and is not flagged for closing."""
        input_buffer = StringIO()
        result = common.get_filepath_or_buffer(input_buffer)
        filepath_or_buffer, _, _, should_close = result

        assert filepath_or_buffer == input_buffer
        assert not should_close

    def test_iterator(self):
        """Chunked reading yields the same rows as a single read."""
        expected = read_csv(StringIO(self.data1))

        chunks = read_csv(StringIO(self.data1), chunksize=1)
        combined = concat(chunks, ignore_index=True)
        tm.assert_frame_equal(combined, expected)

        # GH12153
        it = read_csv(StringIO(self.data1), chunksize=1)
        first = next(it)
        tm.assert_frame_equal(first, expected.iloc[[0]])
        tm.assert_frame_equal(concat(it), expected.iloc[1:])

    @pytest.mark.parametrize('reader, module, error_class, fn_ext', [
        (pd.read_csv, 'os', FileNotFoundError, 'csv'),
        (pd.read_table, 'os', FileNotFoundError, 'csv'),
        (pd.read_fwf, 'os', FileNotFoundError, 'txt'),
        (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'),
        (pd.read_feather, 'feather', Exception, 'feather'),
        (pd.read_hdf, 'tables', FileNotFoundError, 'h5'),
        (pd.read_stata, 'os', FileNotFoundError, 'dta'),
        (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'),
        (pd.read_json, 'os', ValueError, 'json'),
        (pd.read_msgpack, 'os', ValueError, 'mp'),
        (pd.read_pickle, 'os', FileNotFoundError, 'pickle'),
    ])
    def test_read_non_existant(self, reader, module, error_class, fn_ext):
        """Each reader raises its expected error for a missing file."""
        pytest.importorskip(module)

        missing = os.path.join(HERE, 'data', 'does_not_exist.' + fn_ext)
        with pytest.raises(error_class):
            reader(missing)

    @pytest.mark.parametrize('reader, module, path', [
        (pd.read_csv, 'os', ('io', 'data', 'iris.csv')),
        (pd.read_table, 'os', ('io', 'data', 'iris.csv')),
        (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')),
        (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')),
        (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')),
        (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf',
                                 'datetimetz_object.h5')),
        (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')),
        (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')),
        (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')),
        (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')),
        (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')),
    ])
    def test_read_fspath_all(self, reader, module, path, datapath):
        """Readers give the same result for a string and an fspath object."""
        pytest.importorskip(module)
        path = datapath(*path)

        result = reader(CustomFSPath(path))
        expected = reader(path)

        # The pickle fixture holds a categorical rather than a frame.
        if path.endswith('.pickle'):
            compare = tm.assert_categorical_equal
        else:
            compare = tm.assert_frame_equal
        compare(result, expected)

    @pytest.mark.parametrize('writer_name, writer_kwargs, module', [
        ('to_csv', {}, 'os'),
        ('to_excel', {'engine': 'xlwt'}, 'xlwt'),
        ('to_feather', {}, 'feather'),
        ('to_html', {}, 'os'),
        ('to_json', {}, 'os'),
        ('to_latex', {}, 'os'),
        ('to_msgpack', {}, 'os'),
        ('to_pickle', {}, 'os'),
        ('to_stata', {}, 'os'),
    ])
    def test_write_fspath_all(self, writer_name, writer_kwargs, module):
        """Writers produce identical bytes for a string and an fspath object."""
        string_cm = tm.ensure_clean('string')
        fspath_cm = tm.ensure_clean('fspath')
        df = pd.DataFrame({"A": [1, 2]})

        with string_cm as string, fspath_cm as fspath:
            pytest.importorskip(module)
            writer = getattr(df, writer_name)

            # Reference output written through the plain string path.
            writer(string, **writer_kwargs)
            with open(string, 'rb') as handle:
                expected = handle.read()

            # Same writer, this time through the fspath object.
            writer(CustomFSPath(fspath), **writer_kwargs)
            with open(fspath, 'rb') as handle:
                result = handle.read()

            assert result == expected

    def test_write_fspath_hdf5(self):
        """``to_hdf`` accepts an fspath object.

        Same test as write_fspath_all, except HDF5 files aren't
        necessarily byte-for-byte identical for a given dataframe, so
        the files are read back and compared for equality.
        """
        pytest.importorskip('tables')

        df = pd.DataFrame({"A": [1, 2]})
        string_cm = tm.ensure_clean('string')
        fspath_cm = tm.ensure_clean('fspath')

        with string_cm as string, fspath_cm as fspath:
            df.to_hdf(CustomFSPath(fspath), key='bar')
            df.to_hdf(string, key='bar')

            result = pd.read_hdf(fspath, key='bar')
            expected = pd.read_hdf(string, key='bar')

        tm.assert_frame_equal(result, expected)
@pytest.fixture
def mmap_file(datapath):
    """Path to the ``test_mmap.csv`` data file."""
    csv_path = datapath('io', 'data', 'test_mmap.csv')
    return csv_path
class TestMMapWrapper(object):
    """Tests for ``common.MMapWrapper`` (plus one ``read_csv`` engine check)."""

    def test_constructor_bad_file(self, mmap_file):
        """Construction fails for a fake file object and for a closed file."""
        fake_file = StringIO('I am not a file')
        fake_file.fileno = lambda: -1

        # the error raised is different on Windows
        if is_platform_windows():
            err, msg = OSError, "The parameter is incorrect"
        else:
            err, msg = mmap.error, "[Errno 22]"

        tm.assert_raises_regex(err, msg, common.MMapWrapper, fake_file)

        # Open and immediately close a real file to get a closed handle.
        with open(mmap_file, 'r') as target:
            pass

        msg = "I/O operation on closed file"
        tm.assert_raises_regex(
            ValueError, msg, common.MMapWrapper, target)

    def test_get_attr(self, mmap_file):
        """The wrapper exposes the non-dunder attributes of its mmap."""
        with open(mmap_file, 'r') as target:
            wrapper = common.MMapWrapper(target)

        exposed = [name for name in dir(wrapper.mmap)
                   if not name.startswith('__')]
        exposed.append('__next__')

        for name in exposed:
            assert hasattr(wrapper, name)

        assert not hasattr(wrapper, 'foo')

    def test_next(self, mmap_file):
        """Iterating the wrapper yields the file's lines, then stops."""
        with open(mmap_file, 'r') as target:
            wrapper = common.MMapWrapper(target)
            lines = target.readlines()

        for expected_line in lines:
            assert next(wrapper).strip() == expected_line.strip()

        pytest.raises(StopIteration, next, wrapper)

    def test_unknown_engine(self):
        """``read_csv`` rejects an unrecognized ``engine`` value."""
        with tm.ensure_clean() as path:
            tm.makeDataFrame().to_csv(path)
            with tm.assert_raises_regex(ValueError, 'Unknown engine'):
                read_csv(path, engine='pyt')
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,145 @@
""" test feather-format compat """
from distutils.version import LooseVersion
from warnings import catch_warnings
import numpy as np
import pandas as pd
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, ensure_clean
import pytest
feather = pytest.importorskip('feather')
from feather import FeatherError # noqa:E402
from pandas.io.feather_format import to_feather, read_feather # noqa:E402
fv = LooseVersion(feather.__version__)
@pytest.mark.single
class TestFeather(object):
    """Round-trip and error tests for ``to_feather`` / ``read_feather``."""

    def check_error_on_write(self, df, exc):
        """Assert that writing ``df`` to a temporary file raises ``exc``."""
        with pytest.raises(exc):
            with ensure_clean() as path:
                to_feather(df, path)

    def check_round_trip(self, df, **kwargs):
        """Write ``df``, read it back and assert the frames are equal.

        Extra keyword arguments are forwarded to ``read_feather``.
        """
        with ensure_clean() as path:
            to_feather(df, path)

            with catch_warnings(record=True):
                result = read_feather(path, **kwargs)
            assert_frame_equal(result, df)

    def test_error(self):
        """Objects that are not DataFrames are rejected with ValueError."""
        non_frames = [pd.Series([1, 2, 3]), 1, 'foo',
                      pd.Timestamp('20130101'), np.array([1, 2, 3])]
        for obj in non_frames:
            self.check_error_on_write(obj, ValueError)

    def test_basic(self):
        """A frame mixing the common dtypes round-trips unchanged."""
        df = pd.DataFrame({'string': list('abc'),
                           'int': list(range(1, 4)),
                           'uint': np.arange(3, 6).astype('u1'),
                           'float': np.arange(4.0, 7.0, dtype='float64'),
                           'float_with_null': [1., np.nan, 3],
                           'bool': [True, False, True],
                           'bool_with_null': [True, np.nan, False],
                           'cat': pd.Categorical(list('abc')),
                           'dt': pd.date_range('20130101', periods=3),
                           'dttz': pd.date_range('20130101', periods=3,
                                                 tz='US/Eastern'),
                           'dt_with_null': [pd.Timestamp('20130101'), pd.NaT,
                                            pd.Timestamp('20130103')],
                           'dtns': pd.date_range('20130101', periods=3,
                                                 freq='ns')})

        assert df.dttz.dtype.tz.zone == 'US/Eastern'
        self.check_round_trip(df)

    @pytest.mark.skipif(fv >= LooseVersion('0.4.0'), reason='fixed in 0.4.0')
    def test_strided_data_issues(self):
        """Strided data fails to write on feather older than 0.4.0."""
        # strided data issue: https://github.com/wesm/feather/issues/97
        df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list('abc'))
        self.check_error_on_write(df, FeatherError)

    def test_duplicate_columns(self):
        """Duplicate column names are rejected."""
        # https://github.com/wesm/feather/issues/53
        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3),
                          columns=list('aaa')).copy()
        self.check_error_on_write(df, ValueError)

    def test_stringify_columns(self):
        """Default integer column labels are rejected."""
        df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy()
        self.check_error_on_write(df, ValueError)

    @pytest.mark.skipif(fv >= LooseVersion('0.4.0'), reason='fixed in 0.4.0')
    def test_unsupported(self):
        """Timedelta and mixed-object columns fail on feather < 0.4.0."""
        # timedelta
        df = pd.DataFrame({'a': pd.timedelta_range('1 day', periods=3)})
        self.check_error_on_write(df, FeatherError)

        # non-strings
        df = pd.DataFrame({'a': ['a', 1, 2.0]})
        self.check_error_on_write(df, ValueError)

    def test_unsupported_other(self):
        """Period columns are rejected."""
        # period
        df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
        self.check_error_on_write(df, ValueError)

    @pytest.mark.skipif(fv < LooseVersion('0.4.0'), reason='new in 0.4.0')
    def test_rw_nthreads(self):
        """Reading with ``nthreads=2`` round-trips a 100000-row frame."""
        df = pd.DataFrame({'A': np.arange(100000)})
        self.check_round_trip(df, nthreads=2)

    def test_write_with_index(self):
        """Only a default, unnamed index with flat columns can be written."""
        df = pd.DataFrame({'A': [1, 2, 3]})
        self.check_round_trip(df)

        # non-default index
        bad_indexes = [
            [2, 3, 4],
            pd.date_range('20130101', periods=3),
            list('abc'),
            [1, 3, 4],
            pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
        ]
        for index in bad_indexes:
            df.index = index
            self.check_error_on_write(df, ValueError)

        # index with meta-data
        df.index = [0, 1, 2]
        df.index.name = 'foo'
        self.check_error_on_write(df, ValueError)

        # column multi-index
        # The frame has a single column, so the MultiIndex needs exactly
        # one entry. The previous trailing comma assigned a 1-tuple that
        # wrapped a 3-entry MultiIndex, so no MultiIndex was ever tested.
        df.index = [0, 1, 2]
        df.columns = pd.MultiIndex.from_tuples([('a', 1)])
        self.check_error_on_write(df, ValueError)

    def test_path_pathlib(self):
        """``to_feather`` / ``read_feather`` round-trip via pathlib paths."""
        df = tm.makeDataFrame().reset_index()
        result = tm.round_trip_pathlib(df.to_feather, pd.read_feather)
        tm.assert_frame_equal(df, result)

    def test_path_localpath(self):
        """``to_feather`` / ``read_feather`` round-trip via py.path paths."""
        df = tm.makeDataFrame().reset_index()
        result = tm.round_trip_localpath(df.to_feather, pd.read_feather)
        tm.assert_frame_equal(df, result)
@@ -0,0 +1,135 @@
import pytest
from datetime import datetime
import pytz
import platform
from time import sleep
import os
import numpy as np
import pandas as pd
from pandas import compat, DataFrame
from pandas.compat import range
pandas_gbq = pytest.importorskip('pandas_gbq')
# Fallback settings, used when the Travis environment variables are not
# present; left as ``None`` the integration tests are skipped.
PROJECT_ID = None
PRIVATE_KEY_JSON_PATH = None
PRIVATE_KEY_JSON_CONTENTS = None

# Python 2 and Python 3 runs use different dataset names.
DATASET_ID = ('pydata_pandas_bq_testing_py3' if compat.PY3
              else 'pydata_pandas_bq_testing_py2')

TABLE_ID = 'new_test'
# "<dataset>1.<table>"
DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID)

VERSION = platform.python_version()
def _skip_if_no_project_id():
    """Skip the calling test when no project id is configured."""
    if _get_project_id():
        return
    pytest.skip(
        "Cannot run integration tests without a project id")
def _skip_if_no_private_key_path():
    """Skip the calling test when no private key path is configured."""
    if _get_private_key_path():
        return
    pytest.skip("Cannot run integration tests without a "
                "private key json file path")
def _in_travis_environment():
return 'TRAVIS_BUILD_DIR' in os.environ and \
'GBQ_PROJECT_ID' in os.environ
def _get_project_id():
    """Return the project id from Travis, else the module-level fallback."""
    if not _in_travis_environment():
        return PROJECT_ID
    return os.environ.get('GBQ_PROJECT_ID')
def _get_private_key_path():
    """Return the key file path from Travis, else the module-level fallback.

    On Travis the key is ``ci/travis_gbq.json`` under the build directory.
    """
    if not _in_travis_environment():
        return PRIVATE_KEY_JSON_PATH
    build_dir = os.environ.get('TRAVIS_BUILD_DIR')
    return os.path.join(build_dir, 'ci', 'travis_gbq.json')
def clean_gbq_environment(private_key=None):
    """Delete the numbered test datasets and the numbered tables in them.

    Looks for datasets ``DATASET_ID + "1"`` .. ``"9"``; in each one found,
    deletes tables ``TABLE_ID + "1"`` .. ``"19"`` that exist and then the
    dataset itself.
    """
    dataset = pandas_gbq.gbq._Dataset(_get_project_id(),
                                      private_key=private_key)

    for i in range(1, 10):
        dataset_id = DATASET_ID + str(i)
        if dataset_id not in dataset.datasets():
            continue

        table = pandas_gbq.gbq._Table(_get_project_id(), dataset_id,
                                      private_key=private_key)
        for j in range(1, 20):
            table_id = TABLE_ID + str(j)
            if table_id in dataset.tables(dataset_id):
                table.delete(table_id)

        dataset.delete(dataset_id)
def make_mixed_dataframe_v2(test_size):
    """Build a ``test_size``-row frame of bool, float, int, str and time columns.

    Intended to cover all BQ datatypes except RECORD.
    """
    shape = (1, test_size)
    bools = np.random.randint(2, size=shape).astype(bool)
    flts = np.random.randn(1, test_size)
    ints = np.random.randint(1, 10, size=shape)
    strs = np.random.randint(1, 10, size=shape).astype(str)
    arizona = pytz.timezone('US/Arizona')
    times = [datetime.now(arizona) for t in range(test_size)]

    columns = {
        'bools': bools[0],
        'flts': flts[0],
        'ints': ints[0],
        'strs': strs[0],
        # NOTE(review): only the first timestamp is used, so every row
        # gets the same value - confirm whether ``times`` was intended.
        'times': times[0],
    }
    return DataFrame(columns, index=range(test_size))
@pytest.mark.single
class TestToGBQIntegrationWithServiceAccountKeyPath(object):
    """``to_gbq`` / ``read_gbq`` round trip using a private key file path."""

    @classmethod
    def setup_class(cls):
        """Run once before all tests: skip if unconfigured, then reset.

        Cleans up leftover test datasets and creates ``DATASET_ID + "1"``.
        """
        _skip_if_no_project_id()
        _skip_if_no_private_key_path()

        key_path = _get_private_key_path()
        clean_gbq_environment(key_path)
        dataset = pandas_gbq.gbq._Dataset(_get_project_id(),
                                          private_key=key_path)
        dataset.create(DATASET_ID + "1")

    @classmethod
    def teardown_class(cls):
        """Run once after all tests: remove the test datasets."""
        clean_gbq_environment(_get_private_key_path())

    def test_roundtrip(self):
        """Upload 20001 rows in chunks and check the row count read back."""
        destination_table = DESTINATION_TABLE + "1"
        test_size = 20001

        frame = make_mixed_dataframe_v2(test_size)
        frame.to_gbq(destination_table, _get_project_id(), chunksize=10000,
                     private_key=_get_private_key_path())

        # Fixed wait before querying - presumably to let the upload
        # become visible; confirm whether it is still needed.
        sleep(30)

        query = ("SELECT COUNT(*) AS num_rows FROM {0}"
                 .format(destination_table))
        result = pd.read_gbq(query,
                             project_id=_get_project_id(),
                             private_key=_get_private_key_path())
        assert result['num_rows'][0] == test_size
@@ -0,0 +1,947 @@
from __future__ import print_function
import os
import re
import threading
from functools import partial
import pytest
import numpy as np
from numpy.random import rand
from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
date_range, Series)
from pandas.compat import (map, zip, StringIO, BytesIO,
is_platform_windows, PY3, reload)
from pandas.io.common import URLError, file_path_to_url
import pandas.io.html
from pandas.io.html import read_html
from pandas._libs.parsers import ParserError
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas.util.testing import makeCustomDataframe as mkdf, network
HERE = os.path.dirname(__file__)
@pytest.fixture(params=[
    'chinese_utf-16.html',
    'chinese_utf-32.html',
    'chinese_utf-8.html',
    'letz_latin1.html',
])
def html_encoding_file(request, datapath):
    """Parametrized fixture for HTML encoding test filenames."""
    file_name = request.param
    return datapath('io', 'data', 'html_encoding', file_name)
def assert_framelist_equal(list1, list2, *args, **kwargs):
    """Assert two lists hold pairwise-equal, non-empty DataFrames.

    Extra positional and keyword arguments are forwarded to
    ``tm.assert_frame_equal`` for every pair.
    """
    size1, size2 = len(list1), len(list2)
    assert size1 == size2, ('lists are not of equal size '
                            'len(list1) == {0}, '
                            'len(list2) == {1}'.format(size1, size2))

    msg = 'not all list elements are DataFrames'
    both_frames = all(isinstance(left, DataFrame) and
                      isinstance(right, DataFrame)
                      for left, right in zip(list1, list2))
    assert both_frames, msg

    for left, right in zip(list1, list2):
        tm.assert_frame_equal(left, right, *args, **kwargs)
        assert not left.empty, 'frames are both empty'
@td.skip_if_no('bs4')
def test_bs4_version_fails(monkeypatch, datapath):
    """A bs4 reporting version 4.2 is rejected with a "minimum version" error."""
    import bs4
    monkeypatch.setattr(bs4, '__version__', '4.2')

    spam_path = datapath("io", "data", "spam.html")
    with tm.assert_raises_regex(ValueError, "minimum version"):
        read_html(spam_path, flavor='bs4')
def test_invalid_flavor():
    """An unrecognized ``flavor`` raises ValueError."""
    url = 'google.com'
    bad_flavor = 'not a* valid**++ flaver'
    with pytest.raises(ValueError):
        read_html(url, 'google', flavor=bad_flavor)
@td.skip_if_no('bs4')
@td.skip_if_no('lxml')
def test_same_ordering(datapath):
    """The lxml and bs4 flavors return the same tables in the same order."""
    filename = datapath('io', 'data', 'valid_markup.html')
    by_flavor = {flavor: read_html(filename, index_col=0, flavor=[flavor])
                 for flavor in ('lxml', 'bs4')}
    assert_framelist_equal(by_flavor['lxml'], by_flavor['bs4'])
@pytest.mark.parametrize("flavor", [
pytest.param('bs4', marks=pytest.mark.skipif(
not td.safe_import('lxml'), reason='No bs4')),
pytest.param('lxml', marks=pytest.mark.skipif(
not td.safe_import('lxml'), reason='No lxml'))], scope="class")
class TestReadHtml(object):
@pytest.fixture(autouse=True)
def set_files(self, datapath):
    """Store the data file paths and ``open`` kwargs used by the tests."""
    self.spam_data = datapath('io', 'data', 'spam.html')
    # On Python 3 the spam file is opened as UTF-8.
    self.spam_data_kwargs = {'encoding': 'UTF-8'} if PY3 else {}
    self.banklist_data = datapath("io", "data", "banklist.html")
@pytest.fixture(autouse=True, scope="function")
def set_defaults(self, flavor, request):
    """Bind ``self.read_html`` to ``read_html`` with the current flavor."""
    bound_reader = partial(read_html, flavor=flavor)
    self.read_html = bound_reader
    yield
def test_to_html_compat(self):
    """``to_html`` output parsed by ``read_html`` equals the source frame."""
    frame = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False,
                 r_idx_names=False)
    # Round values to three decimals via their string form.
    frame = frame.applymap('{0:.3f}'.format).astype(float)

    markup = frame.to_html()
    parsed = self.read_html(markup, attrs={'class': 'dataframe'},
                            index_col=0)
    tm.assert_frame_equal(parsed[0], frame)
@network
def test_banklist_url(self):
    """Two bank names select the same table from the live FDIC page."""
    url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
    by_first_federal = self.read_html(url, 'First Federal Bank of Florida',
                                      attrs={"id": 'table'})
    by_metcalf = self.read_html(url, 'Metcalf Bank', attrs={'id': 'table'})

    assert_framelist_equal(by_first_federal, by_metcalf)
@network
def test_spam_url(self):
    """Both match patterns select the same tables from the live USDA page."""
    url = ('http://ndb.nal.usda.gov/ndb/foods/show/300772?fg=&man=&'
           'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam')
    by_water = self.read_html(url, '.*Water.*')
    by_unit = self.read_html(url, 'Unit')

    assert_framelist_equal(by_water, by_unit)
@pytest.mark.slow
def test_banklist(self):
    """A regex and a literal match select the same local banklist table."""
    table_attrs = {'id': 'table'}
    by_florida = self.read_html(self.banklist_data, '.*Florida.*',
                                attrs=table_attrs)
    by_metcalf = self.read_html(self.banklist_data, 'Metcalf Bank',
                                attrs={'id': 'table'})

    assert_framelist_equal(by_florida, by_metcalf)
def test_spam_no_types(self):
    """Both match patterns give the same spam tables with known labels."""
    # infer_types removed in #10892
    by_water = self.read_html(self.spam_data, '.*Water.*')
    by_unit = self.read_html(self.spam_data, 'Unit')
    assert_framelist_equal(by_water, by_unit)

    first_table = by_water[0]
    assert first_table.iloc[0, 0] == 'Proximates'
    assert first_table.columns[0] == 'Nutrient'
def test_spam_with_types(self):
    """Same checks as ``test_spam_no_types``: matching tables, known labels."""
    by_water = self.read_html(self.spam_data, '.*Water.*')
    by_unit = self.read_html(self.spam_data, 'Unit')
    assert_framelist_equal(by_water, by_unit)

    first_table = by_water[0]
    assert first_table.iloc[0, 0] == 'Proximates'
    assert first_table.columns[0] == 'Nutrient'
def test_spam_no_match(self):
    """Without a match pattern every result is a DataFrame."""
    for table in self.read_html(self.spam_data):
        assert isinstance(table, DataFrame)
def test_banklist_no_match(self):
    """Selecting by ``attrs`` alone yields only DataFrames."""
    tables = self.read_html(self.banklist_data, attrs={'id': 'table'})
    for table in tables:
        assert isinstance(table, DataFrame)
def test_spam_header(self):
    """``header=1`` makes 'Proximates' the first column label."""
    tables = self.read_html(self.spam_data, '.*Water.*', header=1)
    table = tables[0]
    assert table.columns[0] == 'Proximates'
    assert not table.empty
def test_skiprows_int(self):
    """An integer ``skiprows`` gives matching tables for both patterns."""
    by_water = self.read_html(self.spam_data, '.*Water.*', skiprows=1)
    by_unit = self.read_html(self.spam_data, 'Unit', skiprows=1)

    assert_framelist_equal(by_water, by_unit)
def test_skiprows_xrange(self):
    """A ``range`` object is accepted for ``skiprows``."""
    rows = range(2)
    by_water = self.read_html(self.spam_data, '.*Water.*', skiprows=rows)[0]
    by_unit = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0]
    tm.assert_frame_equal(by_water, by_unit)
def test_skiprows_list(self):
    """A list ``skiprows`` gives the same result in either order."""
    by_water = self.read_html(self.spam_data, '.*Water.*', skiprows=[1, 2])
    by_unit = self.read_html(self.spam_data, 'Unit', skiprows=[2, 1])

    assert_framelist_equal(by_water, by_unit)
def test_skiprows_set(self):
    """A set is accepted for ``skiprows``."""
    by_water = self.read_html(self.spam_data, '.*Water.*',
                              skiprows=set([1, 2]))
    by_unit = self.read_html(self.spam_data, 'Unit', skiprows=set([2, 1]))

    assert_framelist_equal(by_water, by_unit)
def test_skiprows_slice(self):
    """Both patterns give matching tables with ``skiprows=1``."""
    # NOTE(review): despite the name this passes an int, not a slice,
    # making it the same as test_skiprows_int - confirm the intent.
    by_water = self.read_html(self.spam_data, '.*Water.*', skiprows=1)
    by_unit = self.read_html(self.spam_data, 'Unit', skiprows=1)

    assert_framelist_equal(by_water, by_unit)
def test_skiprows_slice_short(self):
    """A stop-only slice is accepted for ``skiprows``."""
    by_water = self.read_html(self.spam_data, '.*Water.*',
                              skiprows=slice(2))
    by_unit = self.read_html(self.spam_data, 'Unit', skiprows=slice(2))

    assert_framelist_equal(by_water, by_unit)
def test_skiprows_slice_long(self):
    """A forward slice and its reversed counterpart skip the same rows."""
    forward = slice(2, 5)
    backward = slice(4, 1, -1)
    by_water = self.read_html(self.spam_data, '.*Water.*', skiprows=forward)
    by_unit = self.read_html(self.spam_data, 'Unit', skiprows=backward)

    assert_framelist_equal(by_water, by_unit)
def test_skiprows_ndarray(self):
    """A NumPy array is accepted for ``skiprows``."""
    by_water = self.read_html(self.spam_data, '.*Water.*',
                              skiprows=np.arange(2))
    by_unit = self.read_html(self.spam_data, 'Unit',
                             skiprows=np.arange(2))

    assert_framelist_equal(by_water, by_unit)
def test_skiprows_invalid(self):
    """A string ``skiprows`` raises TypeError."""
    expected_msg = 'is not a valid type for skipping rows'
    with tm.assert_raises_regex(TypeError, expected_msg):
        self.read_html(self.spam_data, '.*Water.*', skiprows='asdf')
def test_index(self):
    """``index_col=0`` gives matching tables for both patterns."""
    by_water = self.read_html(self.spam_data, '.*Water.*', index_col=0)
    by_unit = self.read_html(self.spam_data, 'Unit', index_col=0)

    assert_framelist_equal(by_water, by_unit)
def test_header_and_index_no_types(self):
    """``header=1`` with ``index_col=0`` matches for both patterns."""
    options = dict(header=1, index_col=0)
    by_water = self.read_html(self.spam_data, '.*Water.*', **options)
    by_unit = self.read_html(self.spam_data, 'Unit', **options)

    assert_framelist_equal(by_water, by_unit)
def test_header_and_index_with_types(self):
    """Same check as the ``no_types`` variant: header row 1, index column 0."""
    options = dict(header=1, index_col=0)
    by_water = self.read_html(self.spam_data, '.*Water.*', **options)
    by_unit = self.read_html(self.spam_data, 'Unit', **options)

    assert_framelist_equal(by_water, by_unit)
def test_infer_types(self):
    """Both patterns give matching tables with ``index_col=0``."""
    # 10892 infer_types removed
    by_water = self.read_html(self.spam_data, '.*Water.*', index_col=0)
    by_unit = self.read_html(self.spam_data, 'Unit', index_col=0)

    assert_framelist_equal(by_water, by_unit)
def test_string_io(self):
    """``read_html`` accepts ``StringIO`` buffers."""
    buffers = []
    # Read the file twice so each call gets its own fresh buffer.
    for _ in range(2):
        with open(self.spam_data, **self.spam_data_kwargs) as f:
            buffers.append(StringIO(f.read()))

    by_water = self.read_html(buffers[0], '.*Water.*')
    by_unit = self.read_html(buffers[1], 'Unit')
    assert_framelist_equal(by_water, by_unit)
def test_string(self):
    """``read_html`` accepts the markup as a plain string."""
    with open(self.spam_data, **self.spam_data_kwargs) as f:
        markup = f.read()

    by_water = self.read_html(markup, '.*Water.*')
    by_unit = self.read_html(markup, 'Unit')

    assert_framelist_equal(by_water, by_unit)
def test_file_like(self):
    """``read_html`` accepts an open file object."""
    with open(self.spam_data, **self.spam_data_kwargs) as handle:
        by_water = self.read_html(handle, '.*Water.*')

    with open(self.spam_data, **self.spam_data_kwargs) as handle:
        by_unit = self.read_html(handle, 'Unit')

    assert_framelist_equal(by_water, by_unit)
@network
def test_bad_url_protocol(self):
    """A ``git://`` URL raises URLError."""
    bad_url = 'git://github.com'
    with pytest.raises(URLError):
        self.read_html(bad_url, match='.*Water.*')
@network
def test_invalid_url(self):
    """An unresolvable host raises URLError, or ValueError 'No tables found'."""
    bogus_url = 'http://www.a23950sdfa908sd.com'
    try:
        with pytest.raises(URLError):
            self.read_html(bogus_url, match='.*Water.*')
    except ValueError as err:
        # A ValueError is accepted only with this exact message.
        assert str(err) == 'No tables found'
@pytest.mark.slow
def test_file_url(self):
    """A ``file://`` URL for the banklist returns a list of DataFrames."""
    file_url = file_path_to_url(os.path.abspath(self.banklist_data))
    tables = self.read_html(file_url, 'First', attrs={'id': 'table'})

    assert isinstance(tables, list)
    for table in tables:
        assert isinstance(table, DataFrame)
@pytest.mark.slow
def test_invalid_table_attrs(self):
    """An ``id`` that matches no table raises 'No tables found'."""
    source = self.banklist_data
    missing_attrs = {'id': 'tasdfable'}
    with tm.assert_raises_regex(ValueError, 'No tables found'):
        self.read_html(source, 'First Federal Bank of Florida',
                       attrs=missing_attrs)
def _bank_data(self, *args, **kwargs):
    """Read the 'Metcalf' table (``id='table'``) from the banklist file.

    Extra arguments are forwarded to ``self.read_html``.
    """
    tables = self.read_html(self.banklist_data, 'Metcalf',
                            attrs={'id': 'table'}, *args, **kwargs)
    return tables
@pytest.mark.slow
def test_multiindex_header(self):
    """Two header rows produce MultiIndex columns."""
    table = self._bank_data(header=[0, 1])[0]
    columns = table.columns
    assert isinstance(columns, MultiIndex)
@pytest.mark.slow
def test_multiindex_index(self):
    """Two index columns produce a MultiIndex index."""
    table = self._bank_data(index_col=[0, 1])[0]
    row_index = table.index
    assert isinstance(row_index, MultiIndex)
@pytest.mark.slow
def test_multiindex_header_index(self):
    """Two header rows and two index columns give MultiIndex on both axes."""
    table = self._bank_data(header=[0, 1], index_col=[0, 1])[0]
    for axis_labels in (table.columns, table.index):
        assert isinstance(axis_labels, MultiIndex)
@pytest.mark.slow
def test_multiindex_header_skiprows_tuples(self):
    """``tupleize_cols=True`` emits a FutureWarning and returns an Index."""
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        tables = self._bank_data(header=[0, 1], skiprows=1,
                                 tupleize_cols=True)
        table = tables[0]
        assert isinstance(table.columns, Index)
@pytest.mark.slow
def test_multiindex_header_skiprows(self):
    """Two header rows still give MultiIndex columns with ``skiprows=1``."""
    table = self._bank_data(header=[0, 1], skiprows=1)[0]
    columns = table.columns
    assert isinstance(columns, MultiIndex)
@pytest.mark.slow
def test_multiindex_header_index_skiprows(self):
    """MultiIndex on both axes is kept when ``skiprows=1`` is added."""
    table = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0]
    for axis_labels in (table.index, table.columns):
        assert isinstance(axis_labels, MultiIndex)
@pytest.mark.slow
def test_regex_idempotency(self):
    """A doubly compiled regex is accepted as ``match``."""
    file_url = file_path_to_url(os.path.abspath(self.banklist_data))
    # Compiling an already compiled pattern is the point of the test.
    pattern = re.compile(re.compile('Florida'))
    tables = self.read_html(file_url, match=pattern,
                            attrs={'id': 'table'})

    assert isinstance(tables, list)
    for table in tables:
        assert isinstance(table, DataFrame)
def test_negative_skiprows(self):
    """A negative ``skiprows`` is rejected with a ``ValueError``."""
    expected_msg = r'\(you passed a negative value\)'
    with tm.assert_raises_regex(ValueError, expected_msg):
        self.read_html(self.spam_data, 'Water', skiprows=-1)
@network
def test_multiple_matches(self):
    """Matching 'Python' on the Python 2 docs page returns several tables."""
    tables = self.read_html('https://docs.python.org/2/', match='Python')
    assert len(tables) > 1
@network
def test_python_docs_table(self):
    """Check the first four characters of each matched table's top-left cell."""
    tables = self.read_html('https://docs.python.org/2/', match='Python')
    prefixes = sorted(table.iloc[0, 0][0:4] for table in tables)
    assert prefixes == sorted(['Repo', 'What'])
@pytest.mark.slow
def test_thousands_macau_stats(self, datapath):
    """The second-to-last 'style1' table in macau.html has no missing values."""
    tables = self.read_html(datapath("io", "data", "macau.html"),
                            index_col=0,
                            attrs={'class': 'style1'})
    # index -2 selects the table expected to be free of NaN
    target = tables[-2]
    for _, column in target.iteritems():
        assert not column.isna().any()
@pytest.mark.slow
def test_thousands_macau_index_col(self, datapath):
    """With ``index_col=0, header=0`` the second-to-last table has no NaN."""
    tables = self.read_html(datapath('io', 'data', 'macau.html'),
                            index_col=0, header=0)
    # index -2 selects the table expected to be free of NaN
    target = tables[-2]
    for _, column in target.iteritems():
        assert not column.isna().any()
def test_empty_tables(self):
    """
    Make sure that read_html ignores empty tables.
    """
    populated = '''<table>
<thead>
<tr>
<th>A</th>
<th>B</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>2</td>
</tr>
</tbody>
</table>'''
    empty = '''<table>
<tbody>
</tbody>
</table>'''
    # Appending a table with an empty body must not change the parse result.
    without_empty = self.read_html(StringIO(populated))
    with_empty = self.read_html(StringIO(populated + empty))
    assert_framelist_equal(without_empty, with_empty)
def test_multiple_tbody(self):
    """Rows from every ``<tbody>`` of a single table are read (GH-20690)."""
    markup = '''<table>
<thead>
<tr>
<th>A</th>
<th>B</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>2</td>
</tr>
</tbody>
<tbody>
<tr>
<td>3</td>
<td>4</td>
</tr>
</tbody>
</table>'''
    parsed = self.read_html(StringIO(markup))[0]
    wanted = DataFrame({'A': [1, 3], 'B': [2, 4]})
    tm.assert_frame_equal(parsed, wanted)
def test_header_and_one_column(self):
    """
    Don't fail with bs4 when there is a header and only one column
    as described in issue #9178
    """
    markup = '''<html>
<body>
<table>
<thead>
<tr>
<th>Header</th>
</tr>
</thead>
<tbody>
<tr>
<td>first</td>
</tr>
</tbody>
</table>
</body>
</html>'''
    parsed = self.read_html(StringIO(markup))[0]
    wanted = DataFrame(data={'Header': 'first'}, index=[0])
    tm.assert_frame_equal(parsed, wanted)
def test_tfoot_read(self):
    """
    Make sure that read_html reads tfoot, containing td or th.
    Ignores empty tfoot
    """
    template = '''<table>
<thead>
<tr>
<th>A</th>
<th>B</th>
</tr>
</thead>
<tbody>
<tr>
<td>bodyA</td>
<td>bodyB</td>
</tr>
</tbody>
<tfoot>
{footer}
</tfoot>
</table>'''
    # (footer markup, expected frame contents)
    cases = [
        ("", {'A': ['bodyA'], 'B': ['bodyB']}),
        ("<tr><td>footA</td><th>footB</th></tr>",
         {'A': ['bodyA', 'footA'], 'B': ['bodyB', 'footB']}),
    ]
    for footer, wanted in cases:
        markup = template.format(footer=footer)
        tm.assert_frame_equal(self.read_html(markup)[0], DataFrame(wanted))
def test_countries_municipalities(self):
    """A header in ``<thead>`` and a first body row with ``header=0`` parse alike (GH5048)."""
    with_thead = StringIO('''<table>
<thead>
<tr>
<th>Country</th>
<th>Municipality</th>
<th>Year</th>
</tr>
</thead>
<tbody>
<tr>
<td>Ukraine</td>
<th>Odessa</th>
<td>1944</td>
</tr>
</tbody>
</table>''')
    header_in_body = StringIO('''
<table>
<tbody>
<tr>
<th>Country</th>
<th>Municipality</th>
<th>Year</th>
</tr>
<tr>
<td>Ukraine</td>
<th>Odessa</th>
<td>1944</td>
</tr>
</tbody>
</table>''')
    from_thead = self.read_html(with_thead)
    from_body = self.read_html(header_in_body, header=0)
    assert_framelist_equal(from_thead, from_body)
def test_nyse_wsj_commas_table(self, datapath):
    """The 'mdcTable' table in nyse_wsj.html has 100 rows and the expected columns."""
    tables = self.read_html(datapath('io', 'data', 'nyse_wsj.html'),
                            index_col=0, header=0,
                            attrs={'class': 'mdcTable'})
    frame = tables[0]
    wanted_columns = Index(['Issue(Roll over for charts and headlines)',
                            'Volume', 'Price', 'Chg', '% Chg'])
    assert frame.shape[0] == 100
    tm.assert_index_equal(frame.columns, wanted_columns)
@pytest.mark.slow
def test_banklist_header(self, datapath):
    """Compare the parsed bank list table against banklist.csv."""
    from pandas.io.html import _remove_whitespace

    def strip_ws(cell):
        # Cells for which _remove_whitespace raises AttributeError are
        # passed through unchanged.
        try:
            return _remove_whitespace(cell)
        except AttributeError:
            return cell

    parsed = self.read_html(self.banklist_data, 'Metcalf',
                            attrs={'id': 'table'})[0]
    timestamp_converters = {'Updated Date': Timestamp,
                            'Closing Date': Timestamp}
    ground_truth = read_csv(datapath('io', 'data', 'banklist.csv'),
                            converters=timestamp_converters)
    assert parsed.shape == ground_truth.shape

    # Bank names as they appear in the HTML (with trailing extra text)
    # and the plain names they are replaced with before comparing.
    html_names = ['First Vietnamese American BankIn Vietnamese',
                  'Westernbank Puerto RicoEn Espanol',
                  'R-G Premier Bank of Puerto RicoEn Espanol',
                  'EurobankEn Espanol', 'Sanderson State BankEn Espanol',
                  'Washington Mutual Bank(Including its subsidiary Washington '
                  'Mutual Bank FSB)',
                  'Silver State BankEn Espanol',
                  'AmTrade International BankEn Espanol',
                  'Hamilton Bank, NAEn Espanol',
                  'The Citizens Savings BankPioneer Community Bank, Inc.']
    plain_names = ['First Vietnamese American Bank', 'Westernbank Puerto Rico',
                   'R-G Premier Bank of Puerto Rico', 'Eurobank',
                   'Sanderson State Bank', 'Washington Mutual Bank',
                   'Silver State Bank', 'AmTrade International Bank',
                   'Hamilton Bank, NA', 'The Citizens Savings Bank']

    cleaned = parsed.applymap(strip_ws).replace(html_names, plain_names)
    cleaned_truth = ground_truth.applymap(strip_ws)
    converted = cleaned._convert(datetime=True, numeric=True)
    date_cols = ['Closing Date', 'Updated Date']
    converted[date_cols] = converted[date_cols]._convert(datetime=True,
                                                         coerce=True)
    tm.assert_frame_equal(converted, cleaned_truth)
@pytest.mark.slow
def test_gold_canyon(self):
    """'Gold Canyon' appears both in the raw file and in the parsed table."""
    needle = 'Gold Canyon'
    with open(self.banklist_data, 'r') as handle:
        assert needle in handle.read()
    tables = self.read_html(self.banklist_data, needle,
                            attrs={'id': 'table'})
    assert needle in tables[0].to_string()
def test_different_number_of_rows(self):
    """A row with fewer cells parses like one whose trailing cells are 'nan'."""
    padded_html = """<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>C_l0_g0</th>
<th>C_l0_g1</th>
<th>C_l0_g2</th>
<th>C_l0_g3</th>
<th>C_l0_g4</th>
</tr>
</thead>
<tbody>
<tr>
<th>R_l0_g0</th>
<td> 0.763</td>
<td> 0.233</td>
<td> nan</td>
<td> nan</td>
<td> nan</td>
</tr>
<tr>
<th>R_l0_g1</th>
<td> 0.244</td>
<td> 0.285</td>
<td> 0.392</td>
<td> 0.137</td>
<td> 0.222</td>
</tr>
</tbody>
</table>"""
    ragged_html = """<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>C_l0_g0</th>
<th>C_l0_g1</th>
<th>C_l0_g2</th>
<th>C_l0_g3</th>
<th>C_l0_g4</th>
</tr>
</thead>
<tbody>
<tr>
<th>R_l0_g0</th>
<td> 0.763</td>
<td> 0.233</td>
</tr>
<tr>
<th>R_l0_g1</th>
<td> 0.244</td>
<td> 0.285</td>
<td> 0.392</td>
<td> 0.137</td>
<td> 0.222</td>
</tr>
</tbody>
</table>"""
    from_padded = self.read_html(padded_html, index_col=0)[0]
    from_ragged = self.read_html(ragged_html, index_col=0)[0]
    tm.assert_frame_equal(from_padded, from_ragged)
def test_parse_dates_list(self):
    """``parse_dates`` accepts a column position or a column name."""
    source = DataFrame({'date': date_range('1/1/2001', periods=10)})
    markup = source.to_html()
    for date_columns in ([1], ['date']):
        tables = self.read_html(markup, parse_dates=date_columns,
                                index_col=0)
        tm.assert_frame_equal(source, tables[0])
def test_parse_dates_combine(self):
    """Separate date and time columns are combined via a ``parse_dates`` dict."""
    stamps = Series(date_range('1/1/2001', periods=10))
    split = DataFrame({'date': stamps.map(lambda ts: str(ts.date())),
                       'time': stamps.map(lambda ts: str(ts.time()))})
    tables = self.read_html(split.to_html(),
                            parse_dates={'datetime': [1, 2]},
                            index_col=1)
    combined = DataFrame({'datetime': stamps})
    tm.assert_frame_equal(combined, tables[0])
def test_computer_sales_page(self, datapath):
    """``header=[0, 1]`` raises ParserError here; ``header=[1, 2]`` parses."""
    expected_msg = (r"Passed header=\[0,1\] are "
                    r"too many rows for this "
                    r"multi_index of columns")
    page = datapath('io', 'data', 'computer_sales_page.html')
    with tm.assert_raises_regex(ParserError, expected_msg):
        self.read_html(page, header=[0, 1])
    # The path is looked up a second time, as in the original test.
    page = datapath('io', 'data', 'computer_sales_page.html')
    tables = self.read_html(page, header=[1, 2])
    assert tables
def test_wikipedia_states_table(self, datapath):
    """The 'sq mi' column of the Arizona table parses as float64."""
    states_file = datapath('io', 'data', 'wikipedia_states.html')
    # Sanity-check the data file before parsing it.
    assert os.path.isfile(states_file), '%r is not a file' % states_file
    assert os.path.getsize(states_file), '%r is an empty file' % states_file
    tables = self.read_html(states_file, 'Arizona', header=1)
    area = tables[0]['sq mi']
    assert area.dtype == np.dtype('float64')
def test_decimal_rows(self):
    """A custom ``decimal`` character yields a float64 column (GH 12907)."""
    markup = StringIO('''<html>
<body>
<table>
<thead>
<tr>
<th>Header</th>
</tr>
</thead>
<tbody>
<tr>
<td>1100#101</td>
</tr>
</tbody>
</table>
</body>
</html>''')
    wanted = DataFrame(data={'Header': 1100.101}, index=[0])
    parsed = self.read_html(markup, decimal='#')[0]
    header_dtype = parsed['Header'].dtype
    assert header_dtype == np.dtype('float64')
    tm.assert_frame_equal(parsed, wanted)
def test_bool_header_arg(self):
    """A boolean ``header`` argument is rejected with ``TypeError`` (GH 6114).

    Uses ``self.read_html`` like the other tests in this class, so the
    check runs through the same reader they do rather than the
    module-level ``read_html``.
    """
    for arg in [True, False]:
        with pytest.raises(TypeError):
            self.read_html(self.spam_data, header=arg)
def test_converters(self):
    """``converters`` is applied to the parsed column (GH 13461).

    Uses ``self.read_html`` like the other tests in this class. The header
    row is wrapped in a proper ``<tr>``; the markup previously closed a
    ``</tr>`` that was never opened.
    """
    html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> 0.763</td>
</tr>
<tr>
<td> 0.244</td>
</tr>
</tbody>
</table>"""
    expected_df = DataFrame({'a': ['0.763', '0.244']})
    html_df = self.read_html(html_data, converters={'a': str})[0]
    tm.assert_frame_equal(expected_df, html_df)
def test_na_values(self):
    """Values listed in ``na_values`` are read as NaN (GH 13461).

    Uses ``self.read_html`` like the other tests in this class. The header
    row is wrapped in a proper ``<tr>``; the markup previously closed a
    ``</tr>`` that was never opened.
    """
    html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> 0.763</td>
</tr>
<tr>
<td> 0.244</td>
</tr>
</tbody>
</table>"""
    expected_df = DataFrame({'a': [0.763, np.nan]})
    html_df = self.read_html(html_data, na_values=[0.244])[0]
    tm.assert_frame_equal(expected_df, html_df)
def test_keep_default_na(self):
    """``keep_default_na`` controls whether 'N/A' and 'NA' become NaN.

    Uses ``self.read_html`` like the other tests in this class. The header
    row is wrapped in a proper ``<tr>``; the markup previously closed a
    ``</tr>`` that was never opened.
    """
    html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> N/A</td>
</tr>
<tr>
<td> NA</td>
</tr>
</tbody>
</table>"""
    # With the default NA strings disabled the cells stay as text.
    expected_df = DataFrame({'a': ['N/A', 'NA']})
    html_df = self.read_html(html_data, keep_default_na=False)[0]
    tm.assert_frame_equal(expected_df, html_df)

    # With them enabled both cells are read as missing.
    expected_df = DataFrame({'a': [np.nan, np.nan]})
    html_df = self.read_html(html_data, keep_default_na=True)[0]
    tm.assert_frame_equal(expected_df, html_df)
def test_multiple_header_rows(self):
    """A frame with two column levels round-trips through HTML (issue #13434).

    Uses ``self.read_html`` like the other tests in this class, and drops
    the stray trailing comma from the call.
    """
    expected_df = DataFrame(data=[("Hillary", 68, "D"),
                                  ("Bernie", 74, "D"),
                                  ("Donald", 69, "R")])
    expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"],
                           ["Name", "Unnamed: 1_level_1",
                            "Unnamed: 2_level_1"]]
    html = expected_df.to_html(index=False)
    html_df = self.read_html(html)[0]
    tm.assert_frame_equal(expected_df, html_df)
def test_works_on_valid_markup(self, datapath):
    """valid_markup.html parses to a list whose first item is a DataFrame."""
    tables = self.read_html(datapath('io', 'data', 'valid_markup.html'),
                            index_col=0)
    assert isinstance(tables, list)
    first = tables[0]
    assert isinstance(first, DataFrame)
@pytest.mark.slow
def test_fallback_success(self, datapath):
    """Reading with a list of flavors completes without raising."""
    flavors = ['lxml', 'html5lib']
    banklist = datapath('io', 'data', 'banklist.html')
    self.read_html(banklist, '.*Water.*', flavor=flavors)
def test_to_html_timestamp(self):
    """The first index date shows up in the ``to_html`` output."""
    dates = date_range('2000-01-01', periods=10)
    frame = DataFrame(np.random.randn(10, 4), index=dates)
    rendered = frame.to_html()
    assert '2000-01-01' in rendered
@pytest.mark.parametrize("displayed_only,exp0,exp1", [
    (True, DataFrame(["foo"]), None),
    (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))])
def test_displayed_only(self, displayed_only, exp0, exp1):
    """``displayed_only`` controls whether ``display:none`` content is parsed (GH 20027)."""
    markup = StringIO("""<html>
<body>
<table>
<tr>
<td>
foo
<span style="display:none;text-align:center">bar</span>
<span style="display:none">baz</span>
<span style="display: none">qux</span>
</td>
</tr>
</table>
<table style="display: none">
<tr>
<td>foo</td>
</tr>
</table>
</body>
</html>""")
    tables = self.read_html(markup, displayed_only=displayed_only)
    tm.assert_frame_equal(tables[0], exp0)
    if exp1 is None:
        assert len(tables) == 1  # Should not parse hidden table
    else:
        tm.assert_frame_equal(tables[1], exp1)
def test_encode(self, html_encoding_file):
    """Bytes, a file-like object and a filename all parse to the same frame.

    The encoding is the part of the file's base name after the underscore.
    """
    stem = os.path.splitext(os.path.basename(html_encoding_file))[0]
    _, encoding = stem.split('_')
    read_options = dict(encoding=encoding, index_col=0)

    try:
        with open(html_encoding_file, 'rb') as fobj:
            from_string = self.read_html(fobj.read(), **read_options).pop()

        with open(html_encoding_file, 'rb') as fobj:
            from_file_like = self.read_html(BytesIO(fobj.read()),
                                            **read_options).pop()

        from_filename = self.read_html(html_encoding_file,
                                       **read_options).pop()
        tm.assert_frame_equal(from_string, from_file_like)
        tm.assert_frame_equal(from_string, from_filename)
    except Exception:
        # seems utf-16/32 fail on windows
        wide_encoding = '16' in encoding or '32' in encoding
        if is_platform_windows() and wide_encoding:
            pytest.skip()
        raise
def test_parse_failure_unseekable(self):
    """A second read of a non-seekable buffer raises ``ValueError`` (issue #17975)."""
    flavor = self.read_html.keywords.get('flavor')
    if flavor == 'lxml':
        pytest.skip("Not applicable for lxml")

    class UnseekableStringIO(StringIO):
        # Report the buffer as non-seekable.
        def seekable(self):
            return False

    unseekable = UnseekableStringIO('''
<table><tr><td>spam<foobr />eggs</td></tr></table>''')

    # The first read succeeds; the second one must raise.
    first_read = self.read_html(unseekable)
    assert first_read

    with pytest.raises(ValueError,
                       match='passed a non-rewindable file object'):
        self.read_html(unseekable)
def test_parse_failure_rewinds(self):
    """Both well-formed and odd markup parse from a seekable mock (issue #17975)."""

    class MockFile(object):
        # Minimal file-like object: ``read`` returns the whole payload once,
        # then '' until ``seek`` is called.
        def __init__(self, data):
            self.data = data
            self.at_end = False

        def read(self, size=None):
            payload = self.data if not self.at_end else ''
            self.at_end = True
            return payload

        def seek(self, offset):
            self.at_end = False

        def seekable(self):
            return True

    well_formed = MockFile('<table><tr><td>spam<br />eggs</td></tr></table>')
    odd_tag = MockFile('<table><tr><td>spam<foobr />eggs</td></tr></table>')

    for source in (well_formed, odd_tag):
        assert self.read_html(source)
@pytest.mark.slow
def test_importcheck_thread_safety(self, datapath):
    """Two threads calling ``read_html`` right after a reload both succeed.

    See gh-16928. Each thread records the exception it raised, if any, and
    the test asserts that neither raised.
    """

    class ErrorThread(threading.Thread):
        # Thread that stores any exception from ``run`` in ``self.err``
        # (``None`` on success) instead of letting it escape.
        def run(self):
            try:
                super(ErrorThread, self).run()
            except Exception as e:
                self.err = e
            else:
                self.err = None

    # force import check by reinitalising global vars in html.py
    reload(pandas.io.html)

    filename = datapath('io', 'data', 'valid_markup.html')
    helper_thread1 = ErrorThread(target=self.read_html, args=(filename,))
    helper_thread2 = ErrorThread(target=self.read_html, args=(filename,))

    helper_thread1.start()
    helper_thread2.start()

    # Block until both threads finish. join() replaces the previous
    # ``while ...is_alive(): pass`` loop, which spun the CPU and competed
    # with the threads being tested.
    helper_thread1.join()
    helper_thread2.join()

    assert None is helper_thread1.err is helper_thread2.err
@@ -0,0 +1,940 @@
import pytest
from warnings import catch_warnings
import os
import datetime
import glob
import numpy as np
from distutils.version import LooseVersion
from pandas import compat
from pandas.compat import u, PY3
from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
date_range, period_range, Index, Categorical,
Period, Interval)
from pandas.errors import PerformanceWarning
from pandas.io.packers import to_msgpack, read_msgpack
import pandas.util.testing as tm
from pandas.util.testing import (ensure_clean,
assert_categorical_equal,
assert_frame_equal,
assert_index_equal,
assert_series_equal,
patch)
from pandas.tests.test_panel import assert_panel_equal
import pandas
from pandas import Timestamp, NaT
from pandas._libs.tslib import iNaT
# Module-level alias for NaN.
nan = np.nan
# Record whether the optional ``blosc`` module can be imported; the
# blosc tests below skip themselves when this flag is False.
try:
import blosc # NOQA
except ImportError:
_BLOSC_INSTALLED = False
else:
_BLOSC_INSTALLED = True
# Same availability probe for ``zlib``.
try:
import zlib # NOQA
except ImportError:
_ZLIB_INSTALLED = False
else:
_ZLIB_INSTALLED = True
@pytest.fixture(scope='module')
def current_packers_data():
    """Module-scoped fixture returning the result of ``create_msgpack_data()``."""
    # Imported lazily, inside the fixture, as in the original.
    from pandas.tests.io import generate_legacy_storage_files as legacy
    return legacy.create_msgpack_data()
@pytest.fixture(scope='module')
def all_packers_data():
    """Module-scoped fixture returning the result of ``create_data()``."""
    # Imported lazily, inside the fixture, as in the original.
    from pandas.tests.io import generate_legacy_storage_files as legacy
    return legacy.create_data()
def check_arbitrary(a, b):
    """Assert that ``a`` and ``b`` are equal, dispatching on the type of ``a``.

    Lists and tuples are compared element by element, recursively. pandas
    containers use their dedicated assert helpers. Anything else falls
    back to ``==``.
    """
    sequence_types = (list, tuple)
    if isinstance(a, sequence_types) and isinstance(b, sequence_types):
        assert len(a) == len(b)
        for left, right in zip(a, b):
            check_arbitrary(left, right)
        return

    if isinstance(a, Panel):
        assert_panel_equal(a, b)
        return
    if isinstance(a, DataFrame):
        assert_frame_equal(a, b)
        return
    if isinstance(a, Series):
        assert_series_equal(a, b)
        return
    if isinstance(a, Index):
        assert_index_equal(a, b)
        return

    if isinstance(a, Categorical):
        # Temp,
        # Categorical.categories is changed from str to bytes in PY3
        # maybe the same as GH 13591
        skip_comparison = PY3 and b.categories.inferred_type == 'string'
        if not skip_comparison:
            tm.assert_categorical_equal(a, b)
        return

    if a is NaT:
        assert b is NaT
        return

    if isinstance(a, Timestamp):
        assert a == b
        assert a.freq == b.freq
        return

    assert a == b
class TestPackers(object):
    """Base class giving each test a scratch path and a round-trip helper."""

    def setup_method(self, method):
        # Random 10-character token keeps the file name unique per test.
        token = tm.rands(10)
        self.path = '__%s__.msg' % token

    def teardown_method(self, method):
        pass

    def encode_decode(self, x, compress=None, **kwargs):
        """Write ``x`` with ``to_msgpack`` to a temp file and read it back."""
        with ensure_clean(self.path) as tmp_path:
            to_msgpack(tmp_path, x, compress=compress, **kwargs)
            unpacked = read_msgpack(tmp_path, **kwargs)
        return unpacked
class TestAPI(TestPackers):
    """Tests for the public ``to_msgpack`` / ``read_msgpack`` entry points."""

    def test_string_io(self):
        """Round-trip a frame through bytes, a BytesIO and a file on disk."""
        df = DataFrame(np.random.randn(10, 2))

        # explicit ``None`` target returns the packed bytes
        s = df.to_msgpack(None)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        # no target at all behaves the same
        s = df.to_msgpack()
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        # packed bytes wrapped in a file-like object
        s = df.to_msgpack()
        result = read_msgpack(compat.BytesIO(s))
        tm.assert_frame_equal(result, df)

        # module-level function with a ``None`` target
        s = to_msgpack(None, df)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        with ensure_clean(self.path) as p:
            s = df.to_msgpack()
            # ``with`` guarantees the handle is closed even if write() fails
            with open(p, 'wb') as fh:
                fh.write(s)
            result = read_msgpack(p)
            tm.assert_frame_equal(result, df)

    def test_path_pathlib(self):
        """Round-trip using ``tm.round_trip_pathlib``."""
        df = tm.makeDataFrame()
        result = tm.round_trip_pathlib(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_path_localpath(self):
        """Round-trip using ``tm.round_trip_localpath``."""
        df = tm.makeDataFrame()
        result = tm.round_trip_localpath(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_iterator_with_string_io(self):
        """``iterator=True`` yields the packed frames in their original order."""
        dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)]
        s = to_msgpack(None, *dfs)
        for i, result in enumerate(read_msgpack(s, iterator=True)):
            tm.assert_frame_equal(result, dfs[i])

    def test_invalid_arg(self):
        """Unsupported ``path_or_buf`` values raise ``ValueError`` (GH10369)."""
        class A(object):
            # ``read`` is a plain attribute here, not a method
            def __init__(self):
                self.read = 0

        pytest.raises(ValueError, read_msgpack, path_or_buf=None)
        pytest.raises(ValueError, read_msgpack, path_or_buf={})
        pytest.raises(ValueError, read_msgpack, path_or_buf=A())
class TestNumpy(TestPackers):
    """Round-trip tests for Python and numpy scalars, lists, dicts and arrays."""

    def test_numpy_scalar_float(self):
        value = np.float32(np.random.rand())
        unpacked = self.encode_decode(value)
        tm.assert_almost_equal(value, unpacked)

    def test_numpy_scalar_complex(self):
        value = np.complex64(np.random.rand() + 1j * np.random.rand())
        unpacked = self.encode_decode(value)
        assert np.allclose(value, unpacked)

    def test_scalar_float(self):
        value = np.random.rand()
        unpacked = self.encode_decode(value)
        tm.assert_almost_equal(value, unpacked)

    def test_scalar_bool(self):
        # check the truthy value first, then the falsy one
        for raw in (1, 0):
            value = np.bool_(raw)
            unpacked = self.encode_decode(value)
            tm.assert_almost_equal(value, unpacked)

    def test_scalar_complex(self):
        value = np.random.rand() + 1j * np.random.rand()
        unpacked = self.encode_decode(value)
        assert np.allclose(value, unpacked)

    def test_list_numpy_float(self):
        values = [np.float32(np.random.rand()) for _ in range(5)]
        as_tuple = tuple(values)

        # current msgpack cannot distinguish list/tuple
        unpacked = self.encode_decode(values)
        tm.assert_almost_equal(as_tuple, unpacked)

        unpacked = self.encode_decode(tuple(values))
        tm.assert_almost_equal(as_tuple, unpacked)

    def test_list_numpy_float_complex(self):
        if not hasattr(np, 'complex128'):
            pytest.skip('numpy can not handle complex128')

        floats = [np.float32(np.random.rand()) for _ in range(5)]
        complexes = [np.complex128(np.random.rand() + 1j * np.random.rand())
                     for _ in range(5)]
        values = floats + complexes
        unpacked = self.encode_decode(values)
        assert np.allclose(values, unpacked)

    def test_list_float(self):
        values = [np.random.rand() for _ in range(5)]
        as_tuple = tuple(values)

        # current msgpack cannot distinguish list/tuple
        unpacked = self.encode_decode(values)
        tm.assert_almost_equal(as_tuple, unpacked)

        unpacked = self.encode_decode(tuple(values))
        tm.assert_almost_equal(as_tuple, unpacked)

    def test_list_float_complex(self):
        floats = [np.random.rand() for _ in range(5)]
        complexes = [(np.random.rand() + 1j * np.random.rand())
                     for _ in range(5)]
        values = floats + complexes
        unpacked = self.encode_decode(values)
        assert np.allclose(values, unpacked)

    def test_dict_float(self):
        mapping = {'foo': 1.0, 'bar': 2.0}
        unpacked = self.encode_decode(mapping)
        tm.assert_almost_equal(mapping, unpacked)

    def test_dict_complex(self):
        mapping = {'foo': 1.0 + 1.0j, 'bar': 2.0 + 2.0j}
        unpacked = self.encode_decode(mapping)
        tm.assert_dict_equal(mapping, unpacked)

        # the value classes must survive the round trip too
        for key in mapping:
            tm.assert_class_equal(mapping[key], unpacked[key],
                                  obj="complex value")

    def test_dict_numpy_float(self):
        mapping = {'foo': np.float32(1.0), 'bar': np.float32(2.0)}
        unpacked = self.encode_decode(mapping)
        tm.assert_almost_equal(mapping, unpacked)

    def test_dict_numpy_complex(self):
        mapping = {'foo': np.complex128(1.0 + 1.0j),
                   'bar': np.complex128(2.0 + 2.0j)}
        unpacked = self.encode_decode(mapping)
        tm.assert_dict_equal(mapping, unpacked)

        # the value classes must survive the round trip too
        for key in mapping:
            tm.assert_class_equal(mapping[key], unpacked[key],
                                  obj="numpy complex128")

    def test_numpy_array_float(self):
        # run multiple times
        for _ in range(10):
            values = np.random.rand(10)
            # the float64 pass re-casts the already float32-cast array
            for dtype in ['float32', 'float64']:
                values = values.astype(dtype)
                unpacked = self.encode_decode(values)
                tm.assert_almost_equal(values, unpacked)

    def test_numpy_array_complex(self):
        values = np.random.rand(5) + 1j * np.random.rand(5)
        values = values.astype(np.complex128)
        unpacked = self.encode_decode(values)

        same_items = all(left == right
                         for left, right in zip(values, unpacked))
        assert same_items and values.dtype == unpacked.dtype

    def test_list_mixed(self):
        values = [1.0, np.float32(3.5), np.complex128(4.25), u('foo'),
                  np.bool_(1)]
        as_tuple = tuple(values)

        # current msgpack cannot distinguish list/tuple
        unpacked = self.encode_decode(values)
        tm.assert_almost_equal(as_tuple, unpacked)

        unpacked = self.encode_decode(tuple(values))
        tm.assert_almost_equal(as_tuple, unpacked)
class TestBasic(TestPackers):
    """Round-trip tests for scalar date, time, period and interval values."""

    def test_timestamp(self):
        samples = [Timestamp('20130101'),
                   Timestamp('20130101', tz='US/Eastern'),
                   Timestamp('201301010501')]
        for stamp in samples:
            assert stamp == self.encode_decode(stamp)

    def test_nat(self):
        # NaT must come back as the very same singleton
        unpacked = self.encode_decode(NaT)
        assert NaT is unpacked

    def test_datetimes(self):
        samples = [datetime.datetime(2013, 1, 1),
                   datetime.datetime(2013, 1, 1, 5, 1),
                   datetime.date(2013, 1, 1),
                   np.datetime64(datetime.datetime(2013, 1, 5, 2, 15))]
        for moment in samples:
            assert moment == self.encode_decode(moment)

    def test_timedeltas(self):
        samples = [datetime.timedelta(days=1),
                   datetime.timedelta(days=1, seconds=10),
                   np.timedelta64(1000000)]
        for delta in samples:
            assert delta == self.encode_decode(delta)

    def test_periods(self):
        # 13463
        samples = [Period('2010-09', 'M'), Period('2014-Q1', 'Q')]
        for period in samples:
            assert period == self.encode_decode(period)

    def test_intervals(self):
        # 19967
        samples = [Interval(0, 1), Interval(0, 1, 'left'),
                   Interval(10, 25., 'right')]
        for interval in samples:
            assert interval == self.encode_decode(interval)
class TestIndex(TestPackers):
    """Round-trip tests for the various Index flavours."""

    def setup_method(self, method):
        super(TestIndex, self).setup_method(method)

        # one sample index per kind, keyed by a descriptive label
        self.d = {
            'string': tm.makeStringIndex(100),
            'date': tm.makeDateIndex(100),
            'int': tm.makeIntIndex(100),
            'rng': tm.makeRangeIndex(100),
            'float': tm.makeFloatIndex(100),
            'empty': Index([]),
            'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])),
            'period': Index(period_range('2012-1-1', freq='M', periods=3)),
            'date2': Index(date_range('2013-01-1', periods=10)),
            'bdate': Index(bdate_range('2013-01-02', periods=10)),
            'cat': tm.makeCategoricalIndex(100),
            'interval': tm.makeIntervalIndex(100),
            'timedelta': tm.makeTimedeltaIndex(100, 'H')
        }

        self.mi = {
            'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'),
                                           ('foo', 'two'),
                                           ('qux', 'one'), ('qux', 'two')],
                                          names=['first', 'second']),
        }

    def test_basic_index(self):
        for s, i in self.d.items():
            i_rec = self.encode_decode(i)
            tm.assert_index_equal(i, i_rec)

        # datetime with no freq (GH5506)
        i = Index([Timestamp('20130101'), Timestamp('20130103')])
        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

        # datetime with timezone
        i = Index([Timestamp('20130101 9:00:00'), Timestamp(
            '20130103 11:00:00')]).tz_localize('US/Eastern')
        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

    def test_multi_index(self):
        for s, i in self.mi.items():
            i_rec = self.encode_decode(i)
            tm.assert_index_equal(i, i_rec)

    def test_unicode(self):
        i = tm.makeUnicodeIndex(100)
        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

    def test_categorical_index(self):
        """A frame indexed by a categorical column round-trips (GH15487).

        This method was previously named ``categorical_index``. Without the
        ``test`` prefix pytest's default collection skipped it, so the
        check never ran.
        """
        df = DataFrame(np.random.randn(10, 2))
        df = df.astype({0: 'category'}).set_index(0)
        result = self.encode_decode(df)
        tm.assert_frame_equal(result, df)
class TestSeries(TestPackers):
    """Round-trip tests for Series of various dtypes."""

    def setup_method(self, method):
        super(TestSeries, self).setup_method(method)

        # sample Series keyed by a descriptive label
        self.d = {}

        s = tm.makeStringSeries()
        s.name = 'string'
        self.d['string'] = s

        s = tm.makeObjectSeries()
        s.name = 'object'
        self.d['object'] = s

        s = Series(iNaT, dtype='M8[ns]', index=range(5))
        self.d['date'] = s

        data = {
            'A': [0., 1., 2., 3., np.nan],
            'B': [0, 1, 0, 1, 0],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': date_range('1/1/2009', periods=5),
            'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
            'F': [Timestamp('20130102', tz='US/Eastern')] * 2 +
                 [Timestamp('20130603', tz='CET')] * 3,
            'G': [Timestamp('20130102', tz='US/Eastern')] * 5,
            'H': Categorical([1, 2, 3, 4, 5]),
            'I': Categorical([1, 2, 3, 4, 5], ordered=True),
            'J': (np.bool_(1), 2, 3, 4, 5),
        }

        self.d['float'] = Series(data['A'])
        self.d['int'] = Series(data['B'])
        self.d['mixed'] = Series(data['E'])
        self.d['dt_tz_mixed'] = Series(data['F'])
        self.d['dt_tz'] = Series(data['G'])
        # 'H' is the unordered Categorical and 'I' the ordered one; the two
        # labels used to be attached the wrong way round.
        self.d['cat_unordered'] = Series(data['H'])
        self.d['cat_ordered'] = Series(data['I'])
        self.d['numpy_bool_mixed'] = Series(data['J'])

    def test_basic(self):
        # run multiple times here
        for n in range(10):
            for s, i in self.d.items():
                i_rec = self.encode_decode(i)
                assert_series_equal(i, i_rec)
class TestCategorical(TestPackers):
    """Round-trip tests for standalone Categorical values."""

    def setup_method(self, method):
        super(TestCategorical, self).setup_method(method)

        letters = ['a', 'b', 'c', 'd', 'e']
        numbers = [5, 6, 7, 8]
        self.d = {}
        self.d['plain_str'] = Categorical(letters)
        self.d['plain_str_ordered'] = Categorical(letters, ordered=True)
        self.d['plain_int'] = Categorical(numbers)
        self.d['plain_int_ordered'] = Categorical(numbers, ordered=True)

    def test_basic(self):
        # run multiple times here
        for _ in range(10):
            for categorical in self.d.values():
                unpacked = self.encode_decode(categorical)
                assert_categorical_equal(categorical, unpacked)
class TestNDFrame(TestPackers):
    """Round-trip tests for DataFrame and Panel objects and mixed collections."""

    def setup_method(self, method):
        super(TestNDFrame, self).setup_method(method)

        data = {
            'A': [0., 1., 2., 3., np.nan],
            'B': [0, 1, 0, 1, 0],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': date_range('1/1/2009', periods=5),
            'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
            'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
            'G': [Timestamp('20130603', tz='CET')] * 5,
            'H': Categorical(['a', 'b', 'c', 'd', 'e']),
            'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
        }

        float_frame = DataFrame(dict(A=data['A'], B=Series(data['A']) + 1))
        int_frame = DataFrame(dict(A=data['B'], B=Series(data['B']) + 1))
        self.frame = {
            'float': float_frame,
            'int': int_frame,
            'mixed': DataFrame(data)}

        # any warnings raised while building the Panel are recorded,
        # not shown
        with catch_warnings(record=True):
            self.panel = {
                'float': Panel(dict(ItemA=self.frame['float'],
                                    ItemB=self.frame['float'] + 1))}

    def test_basic_frame(self):
        for frame in self.frame.values():
            unpacked = self.encode_decode(frame)
            assert_frame_equal(frame, unpacked)

    def test_basic_panel(self):
        with catch_warnings(record=True):
            for panel in self.panel.values():
                unpacked = self.encode_decode(panel)
                assert_panel_equal(panel, unpacked)

    def test_multi(self):
        # a dict of frames comes back keyed the same way
        unpacked = self.encode_decode(self.frame)
        for key in self.frame.keys():
            assert_frame_equal(self.frame[key], unpacked[key])

        floats = self.frame['float']

        items = tuple([floats, floats.A, floats.B, None])
        unpacked_items = self.encode_decode(items)
        check_arbitrary(items, unpacked_items)

        # this is an oddity in that packed lists will be returned as tuples
        items = [floats, floats.A, floats.B, None]
        unpacked_items = self.encode_decode(items)
        assert isinstance(unpacked_items, tuple)
        check_arbitrary(items, unpacked_items)

    def test_iterator(self):
        floats = self.frame['float']
        items = [floats, floats.A, floats.B, None]

        with ensure_clean(self.path) as path:
            to_msgpack(path, *items)
            unpacker = read_msgpack(path, iterator=True)
            for position, packed in enumerate(unpacker):
                check_arbitrary(packed, items[position])

    def tests_datetimeindex_freq_issue(self):
        # GH 5947
        # inferring freq on the datetimeindex
        cases = [([1, 2, 3], date_range('1/1/2013', '1/3/2013')),
                 ([1, 2], date_range('1/1/2013', '1/2/2013'))]
        for values, index in cases:
            frame = DataFrame(values, index=index)
            unpacked = self.encode_decode(frame)
            assert_frame_equal(unpacked, frame)

    def test_dataframe_duplicate_column_names(self):
        # GH 9618
        empty_dupes = DataFrame(columns=['a', 'a'])
        many_dupes = DataFrame(columns=[1] * 100)
        many_dupes.loc[0] = np.random.randn(100)
        mixed_dupes = DataFrame(columns=[1, 1])
        mixed_dupes.loc[0] = ['abc', np.nan]

        originals = [empty_dupes, many_dupes, mixed_dupes]
        # round-trip all three first, then compare, as the original did
        unpacked = [self.encode_decode(frame) for frame in originals]
        for result, expected in zip(unpacked, originals):
            assert_frame_equal(result, expected)
class TestSparse(TestPackers):
    """Sparse objects are expected to raise ``NotImplementedError`` when packed."""

    def _check_roundtrip(self, obj, comparator, **kwargs):
        # currently these are not implemented
        # i_rec = self.encode_decode(obj)
        # comparator(obj, i_rec, **kwargs)
        pytest.raises(NotImplementedError, self.encode_decode, obj)

    def test_sparse_series(self):
        series = tm.makeStringSeries()
        series[3:5] = np.nan

        # default, integer-kind and zero-fill variants, in that order
        default_sparse = series.to_sparse()
        self._check_roundtrip(default_sparse, tm.assert_series_equal,
                              check_series_type=True)

        integer_sparse = series.to_sparse(kind='integer')
        self._check_roundtrip(integer_sparse, tm.assert_series_equal,
                              check_series_type=True)

        zero_fill_sparse = series.to_sparse(fill_value=0)
        self._check_roundtrip(zero_fill_sparse, tm.assert_series_equal,
                              check_series_type=True)

    def test_sparse_frame(self):
        frame = tm.makeDataFrame()
        frame.loc[3:5, 1:3] = np.nan
        frame.loc[8:10, -2] = np.nan

        # default, integer-kind and zero-fill variants, in that order
        default_sparse = frame.to_sparse()
        self._check_roundtrip(default_sparse, tm.assert_frame_equal,
                              check_frame_type=True)

        integer_sparse = frame.to_sparse(kind='integer')
        self._check_roundtrip(integer_sparse, tm.assert_frame_equal,
                              check_frame_type=True)

        zero_fill_sparse = frame.to_sparse(fill_value=0)
        self._check_roundtrip(zero_fill_sparse, tm.assert_frame_equal,
                              check_frame_type=True)
class TestCompression(TestPackers):
"""See https://github.com/pandas-dev/pandas/pull/9783
"""
def setup_method(self, method):
    """Record sqlalchemy availability and build the 1000-row sample frames."""
    try:
        from sqlalchemy import create_engine
    except ImportError:
        self._SQLALCHEMY_INSTALLED = False
    else:
        self._create_sql_engine = create_engine
        self._SQLALCHEMY_INSTALLED = True

    super(TestCompression, self).setup_method(method)

    columns = {
        'A': np.arange(1000, dtype=np.float64),
        'B': np.arange(1000, dtype=np.int32),
        'C': list(100 * 'abcdefghij'),
        'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
        'E': [datetime.timedelta(days=x) for x in range(1000)],
    }
    # The original comprehensions looped over a repeated key
    # (['A', 'A'] and ['B', 'B']), which collapses to one column each.
    self.frame = {
        'float': DataFrame({'A': columns['A']}),
        'int': DataFrame({'B': columns['B']}),
        'mixed': DataFrame(columns),
    }
def test_plain(self):
    """The dict of frames round-trips without compression."""
    unpacked = self.encode_decode(self.frame)
    for key, original in self.frame.items():
        assert_frame_equal(original, unpacked[key])
def _test_compression(self, compress):
    """Round-trip the sample frames with ``compress`` and check they stay writable."""
    unpacked = self.encode_decode(self.frame, compress=compress)
    for key, original in self.frame.items():
        restored = unpacked[key]
        assert_frame_equal(restored, original)

        # make sure that we can write to the new frames
        for block in restored._data.blocks:
            assert block.values.flags.writeable
def test_compression_zlib(self):
    """Run the compression round-trip with zlib, if it is importable."""
    compressor = 'zlib'
    if not _ZLIB_INSTALLED:
        pytest.skip('no zlib')
    self._test_compression(compressor)
def test_compression_blosc(self):
    """Run the compression round-trip with blosc, if it is importable."""
    compressor = 'blosc'
    if not _BLOSC_INSTALLED:
        pytest.skip('no blosc')
    self._test_compression(compressor)
def _test_compression_warns_when_decompress_caches(self, compress):
# Check that unpacked frames stay writable when ``decompress`` keeps a
# reference to its result, and that mutating them leaves the retained
# buffers untouched. ``compress`` is the compressor module's name; the
# callers in this class pass 'zlib' or 'blosc'.
not_garbage = []
control = [] # copied data
# The module is looked up by name among this module's globals, so this
# relies on the guarded top-of-file import having succeeded.
compress_module = globals()[compress]
real_decompress = compress_module.decompress
def decompress(ob):
"""mock decompress function that delegates to the real
decompress but caches the result and a copy of the result.
"""
res = real_decompress(ob)
not_garbage.append(res) # hold a reference to this bytes object
control.append(bytearray(res)) # copy the data here to check later
return res
# types mapped to values to add in place.
rhs = {
np.dtype('float64'): 1.0,
np.dtype('int32'): 1,
np.dtype('object'): 'a',
np.dtype('datetime64[ns]'): np.timedelta64(1, 'ns'),
np.dtype('timedelta64[ns]'): np.timedelta64(1, 'ns'),
}
# NOTE(review): ``patch`` presumably swaps ``compress_module.decompress``
# for the mock only for the duration of this block - confirm against
# pandas.util.testing.patch. A PerformanceWarning is expected inside it.
with patch(compress_module, 'decompress', decompress), \
tm.assert_produces_warning(PerformanceWarning) as ws:
i_rec = self.encode_decode(self.frame, compress=compress)
for k in self.frame.keys():
value = i_rec[k]
expected = self.frame[k]
assert_frame_equal(value, expected)
# make sure that we can write to the new frames even though
# we needed to copy the data
for block in value._data.blocks:
assert block.values.flags.writeable
# mutate the data in some way
block.values[0] += rhs[block.dtype]
for w in ws:
# check the messages from our warnings
assert str(w.message) == ('copying data after decompressing; '
'this may mean that decompress is '
'caching its result')
# Compare each retained result with the copy taken when it was produced.
for buf, control_buf in zip(not_garbage, control):
# make sure none of our mutations above affected the
# original buffers
assert buf == control_buf
def test_compression_warns_when_decompress_caches_zlib(self):
    """Run the caching-decompress check with zlib, if it is importable."""
    compressor = 'zlib'
    if not _ZLIB_INSTALLED:
        pytest.skip('no zlib')
    self._test_compression_warns_when_decompress_caches(compressor)
def test_compression_warns_when_decompress_caches_blosc(self):
    """Run the caching-decompress warning check against 'blosc'."""
    if _BLOSC_INSTALLED:
        self._test_compression_warns_when_decompress_caches('blosc')
    else:
        pytest.skip('no blosc')
def _test_small_strings_no_warn(self, compress):
    """Round-trip an empty and a one-byte uint8 array without warnings.

    Both decoded arrays must be writeable, and writing to the one-byte
    array must change only that array.
    """
    def unpack_quietly(arr):
        # encode/decode while asserting that no warning is emitted
        with tm.assert_produces_warning(None):
            return self.encode_decode(arr, compress=compress)

    empty = np.array([], dtype='uint8')
    empty_unpacked = unpack_quietly(empty)
    tm.assert_numpy_array_equal(empty_unpacked, empty)
    assert empty_unpacked.flags.writeable

    char = np.array([ord(b'a')], dtype='uint8')
    char_unpacked = unpack_quietly(char)
    tm.assert_numpy_array_equal(char_unpacked, char)
    assert char_unpacked.flags.writeable

    # if this test fails I am sorry because the interpreter is now in a
    # bad state where b'a' points to 98 == ord(b'b').
    char_unpacked[0] = ord(b'b')

    # compare the ord of bytes b'a' with unicode u'a': they should always
    # agree, unless the write above mutated the shared character singleton
    # (in which case ord(b'a') == ord(b'b')).
    assert ord(b'a') == ord(u'a')
    mutated = np.array([ord(b'b')], dtype='uint8')
    tm.assert_numpy_array_equal(char_unpacked, mutated)
def test_small_strings_no_warn_zlib(self):
    """Run the small-array no-warning check against 'zlib'."""
    if _ZLIB_INSTALLED:
        self._test_small_strings_no_warn('zlib')
    else:
        pytest.skip('no zlib')
def test_small_strings_no_warn_blosc(self):
    """Run the small-array no-warning check against 'blosc'."""
    if _BLOSC_INSTALLED:
        self._test_small_strings_no_warn('blosc')
    else:
        pytest.skip('no blosc')
def test_readonly_axis_blosc(self):
    """Index membership must work on blosc round-tripped Series (GH11880)."""
    # GH11880
    if not _BLOSC_INSTALLED:
        pytest.skip('no blosc')

    int_indexed = DataFrame({'A': list('abcd')})
    float_indexed = DataFrame(int_indexed, index=[1., 2., 3., 4.])

    decoded_int = self.encode_decode(int_indexed['A'], compress='blosc')
    assert 1 in decoded_int
    decoded_float = self.encode_decode(float_indexed['A'], compress='blosc')
    assert 1. in decoded_float
def test_readonly_axis_zlib(self):
    """Index membership must work on zlib round-tripped Series (GH11880)."""
    # GH11880
    # Guard on zlib availability like every other zlib test in this class,
    # so a missing codec is reported as a skip rather than a failure.
    if not _ZLIB_INSTALLED:
        pytest.skip('no zlib')
    df1 = DataFrame({'A': list('abcd')})
    df2 = DataFrame(df1, index=[1., 2., 3., 4.])
    assert 1 in self.encode_decode(df1['A'], compress='zlib')
    assert 1. in self.encode_decode(df2['A'], compress='zlib')
def test_readonly_axis_blosc_to_sql(self):
    """Store a blosc round-tripped frame in SQLite and read it back."""
    # GH11880
    if not _BLOSC_INSTALLED:
        pytest.skip('no blosc')
    if not self._SQLALCHEMY_INSTALLED:
        pytest.skip('no sqlalchemy')

    original = DataFrame({'A': list('abcd')})
    decoded = self.encode_decode(original, compress='blosc')

    sql_engine = self._create_sql_engine("sqlite:///:memory:")
    decoded.to_sql('test', sql_engine, if_exists='append')
    stored = pandas.read_sql_table('test', sql_engine, index_col='index')
    # drop the index name picked up from the 'index' column
    stored.index.names = [None]
    assert_frame_equal(original, stored)
def test_readonly_axis_zlib_to_sql(self):
    """Store a zlib round-tripped frame in SQLite and read it back."""
    # GH11880
    if not _ZLIB_INSTALLED:
        pytest.skip('no zlib')
    if not self._SQLALCHEMY_INSTALLED:
        pytest.skip('no sqlalchemy')

    original = DataFrame({'A': list('abcd')})
    decoded = self.encode_decode(original, compress='zlib')

    sql_engine = self._create_sql_engine("sqlite:///:memory:")
    decoded.to_sql('test', sql_engine, if_exists='append')
    stored = pandas.read_sql_table('test', sql_engine, index_col='index')
    # drop the index name picked up from the 'index' column
    stored.index.names = [None]
    assert_frame_equal(original, stored)
class TestEncoding(TestPackers):
    """msgpack round-trips of frames under explicit and default encodings."""

    def setup_method(self, method):
        """Build the frames and the list of UTF encodings used by the tests."""
        super(TestEncoding, self).setup_method(method)
        n_rows = 1000
        data = {
            'A': [compat.u('\u2019')] * n_rows,
            'B': np.arange(n_rows, dtype=np.int32),
            'C': list(100 * 'abcdefghij'),
            'D': date_range(datetime.datetime(2015, 4, 1), periods=n_rows),
            'E': [datetime.timedelta(days=x) for x in range(n_rows)],
            'G': [400] * n_rows
        }
        # NOTE: despite its key, the 'float' frame holds column 'A'
        # (the unicode strings), exactly as the original construction did.
        self.frame = {
            'float': DataFrame({'A': data['A']}),
            'int': DataFrame({'B': data['B']}),
            'mixed': DataFrame(data),
        }
        self.utf_encodings = ['utf8', 'utf16', 'utf32']

    def test_utf(self):
        """Every frame survives a round trip under each UTF encoding."""
        # GH10581
        for encoding in self.utf_encodings:
            for frame in compat.itervalues(self.frame):
                decoded = self.encode_decode(frame, encoding=encoding)
                assert_frame_equal(decoded, frame)

    def test_default_encoding(self):
        """The default encoding produces the same bytes as 'utf8'."""
        for frame in compat.itervalues(self.frame):
            packed_default = frame.to_msgpack()
            packed_utf8 = frame.to_msgpack(encoding='utf8')
            assert packed_default == packed_utf8
            decoded = self.encode_decode(frame)
            assert_frame_equal(decoded, frame)
# every legacy msgpack file shipped next to this module, one level of
# sub-directory per pandas version
files = glob.glob(
    os.path.join(os.path.dirname(__file__),
                 "data", "legacy_msgpack", "*", "*.msgpack"))


@pytest.fixture(params=files)
def legacy_packer(request, datapath):
    """Resolve each discovered legacy msgpack file through ``datapath``."""
    msgpack_file = request.param
    return datapath(msgpack_file)
class TestMsgpack(object):
    """
    Compare msgpack files written by older versions with the current data.

    How to add msgpack tests:

    1. Install pandas version intended to output the msgpack.
    2. Execute "generate_legacy_storage_files.py" to create the msgpack.
    $ python generate_legacy_storage_files.py <output_dir> msgpack
    3. Move the created msgpack to "data/legacy_msgpack/<version>" directory.
    """

    # keys every unpacked legacy file is required to contain, per type
    minimum_structure = {'series': ['float', 'int', 'mixed',
                                    'ts', 'mi', 'dup'],
                         'frame': ['float', 'int', 'mixed', 'mi'],
                         'panel': ['float'],
                         'index': ['int', 'date', 'period'],
                         'mi': ['reg2']}

    def check_min_structure(self, data, version):
        """Assert that ``data`` has every key listed in minimum_structure."""
        for typ, kinds in self.minimum_structure.items():
            assert typ in data, '"{0}" not found in unpacked data'.format(typ)
            for kind in kinds:
                msg = '"{0}" not found in data["{1}"]'.format(kind, typ)
                assert kind in data[typ], msg

    def compare(self, current_data, all_data, vf, version):
        """Read ``vf`` and compare each entry with ``current_data``.

        A method named ``compare_<typ>_<dt>`` is used when this class
        defines one; otherwise ``check_arbitrary`` does the comparison.
        Returns the unpacked data.
        """
        # GH12277 encoding default used to be latin-1, now utf-8
        read_kwargs = {}
        if LooseVersion(version) < LooseVersion('0.18.0'):
            read_kwargs['encoding'] = 'latin-1'
        data = read_msgpack(vf, **read_kwargs)

        self.check_min_structure(data, version)
        for typ, dv in data.items():
            assert typ in all_data, (
                'unpacked data contains extra key "{0}"'.format(typ))
            for dt, result in dv.items():
                assert dt in current_data[typ], (
                    'data["{0}"] contains extra key "{1}"'.format(typ, dt))
                try:
                    expected = current_data[typ][dt]
                except KeyError:
                    continue

                # prefer a dedicated comparator when one exists
                comp_method = "compare_{typ}_{dt}".format(typ=typ, dt=dt)
                comparator = getattr(self, comp_method, None)
                if comparator is None:
                    check_arbitrary(result, expected)
                else:
                    comparator(result, expected, typ, version)
        return data

    def compare_series_dt_tz(self, result, expected, typ, version):
        """Compare tz-aware series; files older than 0.17.0 hold object."""
        # 8260
        # dtype is object < 0.17.0
        if LooseVersion(version) < LooseVersion('0.17.0'):
            expected = expected.astype(object)
        tm.assert_series_equal(result, expected)

    def compare_frame_dt_mixed_tzs(self, result, expected, typ, version):
        """Compare mixed-tz frames; files older than 0.17.0 hold object."""
        # 8260
        # dtype is object < 0.17.0
        if LooseVersion(version) < LooseVersion('0.17.0'):
            expected = expected.astype(object)
        tm.assert_frame_equal(result, expected)

    def test_msgpacks_legacy(self, current_packers_data, all_packers_data,
                             legacy_packer, datapath):
        """Compare one legacy msgpack file against the current data."""
        # the version is the name of the directory holding the file
        version = os.path.basename(os.path.dirname(legacy_packer))

        # GH12142 0.17 files packed in P2 can't be read in P3
        if (compat.PY3 and version.startswith('0.17.') and
                legacy_packer.split('.')[-4][-1] == '2'):
            template = "Files packed in Py2 can't be read in Py3 ({})"
            pytest.skip(template.format(version))

        try:
            with catch_warnings(record=True):
                self.compare(current_packers_data, all_packers_data,
                             legacy_packer, version)
        except ImportError:
            # blosc not installed
            pass
@@ -0,0 +1,504 @@
""" test parquet compat """
import pytest
import datetime
from distutils.version import LooseVersion
from warnings import catch_warnings
import numpy as np
import pandas as pd
from pandas.compat import PY3, is_platform_windows, is_platform_mac
from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
PyArrowImpl, FastParquetImpl)
from pandas.util import testing as tm
try:
import pyarrow # noqa
_HAVE_PYARROW = True
except ImportError:
_HAVE_PYARROW = False
try:
import fastparquet # noqa
_HAVE_FASTPARQUET = True
except ImportError:
_HAVE_FASTPARQUET = False
# setup engines & skips
@pytest.fixture(params=[
    pytest.param(
        'fastparquet',
        marks=pytest.mark.skipif(
            not _HAVE_FASTPARQUET,
            reason='fastparquet is not installed')),
    pytest.param(
        'pyarrow',
        marks=pytest.mark.skipif(
            not _HAVE_PYARROW,
            reason='pyarrow is not installed'))])
def engine(request):
    """Yield each parquet engine name, skipping engines not installed."""
    engine_name = request.param
    return engine_name
@pytest.fixture
def pa():
    """Return 'pyarrow', skipping the test when pyarrow failed to import."""
    if _HAVE_PYARROW:
        return 'pyarrow'
    pytest.skip("pyarrow is not installed")
    # unreachable in practice (skip raises); mirrors the original value
    return 'pyarrow'
@pytest.fixture
def pa_lt_070():
    """Return 'pyarrow' only when an installed pyarrow is older than 0.7.0."""
    if not _HAVE_PYARROW:
        pytest.skip("pyarrow is not installed")

    installed = LooseVersion(pyarrow.__version__)
    threshold = LooseVersion('0.7.0')
    if installed >= threshold:
        pytest.skip("pyarrow is >= 0.7.0")
    return 'pyarrow'
@pytest.fixture
def pa_ge_070():
    """Return 'pyarrow' only when an installed pyarrow is at least 0.7.0."""
    if not _HAVE_PYARROW:
        pytest.skip("pyarrow is not installed")

    installed = LooseVersion(pyarrow.__version__)
    threshold = LooseVersion('0.7.0')
    if installed < threshold:
        pytest.skip("pyarrow is < 0.7.0")
    return 'pyarrow'
@pytest.fixture
def fp():
    """Return 'fastparquet', skipping when fastparquet failed to import."""
    if _HAVE_FASTPARQUET:
        return 'fastparquet'
    pytest.skip("fastparquet is not installed")
    # unreachable in practice (skip raises); mirrors the original value
    return 'fastparquet'
@pytest.fixture
def fp_lt_014():
    """Return 'fastparquet' only when the installed one is older than 0.1.4."""
    if not _HAVE_FASTPARQUET:
        pytest.skip("fastparquet is not installed")

    installed = LooseVersion(fastparquet.__version__)
    threshold = LooseVersion('0.1.4')
    if installed >= threshold:
        pytest.skip("fastparquet is >= 0.1.4")
    return 'fastparquet'
@pytest.fixture
def df_compat():
    """Small two-column frame: integers in 'A', the scalar 'foo' in 'B'."""
    columns = {'A': [1, 2, 3], 'B': 'foo'}
    return pd.DataFrame(columns)
@pytest.fixture
def df_cross_compat():
    """Frame used by the tests that write with one engine and read with the other.

    Columns 'c' (uint8), 'g' (tz-aware dates) and 'h' (nanosecond-frequency
    dates) were disabled in the original fixture and remain excluded here.
    """
    columns = {
        'a': list('abc'),
        'b': list(range(1, 4)),
        'd': np.arange(4.0, 7.0, dtype='float64'),
        'e': [True, False, True],
        'f': pd.date_range('20130101', periods=3),
    }
    return pd.DataFrame(columns)
@pytest.fixture
def df_full():
    """Three-row frame with one column per dtype, some with missing values."""
    dates = pd.date_range('20130101', periods=3)
    columns = {
        'string': list('abc'),
        'string_with_nan': ['a', np.nan, 'c'],
        'string_with_none': ['a', None, 'c'],
        'bytes': [b'foo', b'bar', b'baz'],
        'unicode': [u'foo', u'bar', u'baz'],
        'int': list(range(1, 4)),
        'uint': np.arange(3, 6).astype('u1'),
        'float': np.arange(4.0, 7.0, dtype='float64'),
        'float_with_nan': [2., np.nan, 3.],
        'bool': [True, False, True],
        'datetime': dates,
        'datetime_with_nat': [pd.Timestamp('20130101'),
                              pd.NaT,
                              pd.Timestamp('20130103')],
    }
    return pd.DataFrame(columns)
def check_round_trip(df, engine=None, path=None,
                     write_kwargs=None, read_kwargs=None,
                     expected=None, check_names=True,
                     repeat=2):
    """Verify parquet serializer and deserializer produce the same results.

    Performs a pandas to disk and disk to pandas round trip,
    then compares the 2 resulting DataFrames to verify equality.

    Parameters
    ----------
    df: Dataframe
    engine: str, optional
        'pyarrow' or 'fastparquet'
    path: str, optional
        Target location; a temporary file is used when omitted
    write_kwargs: dict, optional
        Extra keyword arguments for ``to_parquet``; defaults to
        ``{'compression': None}`` when omitted or empty
    read_kwargs: dict, optional
        Extra keyword arguments for ``read_parquet``
    expected: DataFrame, optional
        Expected deserialization result, otherwise will be equal to `df`
    check_names: bool, optional
        Forwarded to ``assert_frame_equal`` as ``check_names``
    repeat: int, optional
        How many times to repeat the test
    """
    # Work on private copies: the engine is injected below and the caller's
    # dictionaries must not be modified as a side effect.
    write_opts = dict(write_kwargs) if write_kwargs else {'compression': None}
    read_opts = dict(read_kwargs) if read_kwargs else {}

    if expected is None:
        expected = df

    if engine:
        write_opts['engine'] = engine
        read_opts['engine'] = engine

    def compare(target):
        # write and read back `repeat` times against the same target
        for _ in range(repeat):
            df.to_parquet(target, **write_opts)
            with catch_warnings(record=True):
                actual = read_parquet(target, **read_opts)
            tm.assert_frame_equal(expected, actual,
                                  check_names=check_names)

    if path is None:
        with tm.ensure_clean() as tmp_path:
            compare(tmp_path)
    else:
        compare(path)
def test_invalid_engine(df_compat):
    """An unknown engine name must raise ValueError."""
    unknown_engine = 'foo'
    target = 'bar'
    with pytest.raises(ValueError):
        check_round_trip(df_compat, unknown_engine, target)
def test_options_py(df_compat, pa):
    """Round-trip with the engine chosen by the option set to 'pyarrow'."""
    # no engine argument: the option below decides
    engine_option = pd.option_context('io.parquet.engine', 'pyarrow')
    with engine_option:
        check_round_trip(df_compat)
def test_options_fp(df_compat, fp):
    """Round-trip with the engine chosen by the option set to 'fastparquet'."""
    # no engine argument: the option below decides
    engine_option = pd.option_context('io.parquet.engine', 'fastparquet')
    with engine_option:
        check_round_trip(df_compat)
def test_options_auto(df_compat, fp, pa):
    """Round-trip with the engine option set to 'auto'."""
    # no engine argument: the option below decides
    engine_option = pd.option_context('io.parquet.engine', 'auto')
    with engine_option:
        check_round_trip(df_compat)
def test_options_get_engine(fp, pa):
    """``get_engine`` honours explicit names and resolves 'auto' via option."""
    def check_explicit_names():
        # an explicit engine name always wins, whatever the option says
        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
        assert isinstance(get_engine('fastparquet'), FastParquetImpl)

    check_explicit_names()

    # (option value, implementation expected for get_engine('auto'))
    auto_resolution = [
        ('pyarrow', PyArrowImpl),
        ('fastparquet', FastParquetImpl),
        ('auto', PyArrowImpl),
    ]
    for option_value, auto_impl in auto_resolution:
        with pd.option_context('io.parquet.engine', option_value):
            assert isinstance(get_engine('auto'), auto_impl)
            check_explicit_names()
@pytest.mark.xfail(is_platform_windows() or is_platform_mac(),
                   reason="reading pa metadata failing on Windows/mac")
def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
    """Write with pyarrow, read back with fastparquet (full and subset)."""
    # cross-compat with differing reading/writing engines
    frame = df_cross_compat
    subset = ['a', 'd']
    with tm.ensure_clean() as path:
        frame.to_parquet(path, engine=pa, compression=None)

        everything = read_parquet(path, engine=fp)
        tm.assert_frame_equal(everything, frame)

        selected = read_parquet(path, engine=fp, columns=subset)
        tm.assert_frame_equal(selected, frame[subset])
def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
    """Write with fastparquet, read back with pyarrow (full and subset)."""
    # cross-compat with differing reading/writing engines
    frame = df_cross_compat
    subset = ['a', 'd']
    with tm.ensure_clean() as path:
        frame.to_parquet(path, engine=fp, compression=None)

        # warnings raised while reading are recorded, not propagated
        with catch_warnings(record=True):
            everything = read_parquet(path, engine=pa)
            tm.assert_frame_equal(everything, frame)

            selected = read_parquet(path, engine=pa, columns=subset)
            tm.assert_frame_equal(selected, frame[subset])
class Base(object):
    """Shared helper for the parquet test classes."""

    def check_error_on_write(self, df, engine, exc):
        """Assert that writing ``df`` with ``engine`` raises ``exc``."""
        # check that we are raising the exception on writing
        expect_failure = pytest.raises(exc)
        with tm.ensure_clean() as path:
            with expect_failure:
                to_parquet(df, path, engine, compression=None)
class TestBasic(Base):
    """Round-trip and error checks run against every available engine."""

    def test_error(self, engine):
        """Objects that are not DataFrames cannot be written."""
        non_frames = [
            pd.Series([1, 2, 3]),
            1,
            'foo',
            pd.Timestamp('20130101'),
            np.array([1, 2, 3]),
        ]
        for obj in non_frames:
            self.check_error_on_write(obj, engine, ValueError)

    def test_columns_dtypes(self, engine):
        """Unicode column labels round-trip."""
        df = pd.DataFrame({'string': list('abc'),
                           'int': list(range(1, 4))})

        # unicode
        df.columns = [u'foo', u'bar']
        check_round_trip(df, engine)

    def test_columns_dtypes_invalid(self, engine):
        """Numeric, bytes (PY3 only) and object column labels are rejected."""
        df = pd.DataFrame({'string': list('abc'),
                           'int': list(range(1, 4))})

        # numeric
        rejected_labels = [[0, 1]]
        if PY3:
            # bytes on PY3, on PY2 these are str
            rejected_labels.append([b'foo', b'bar'])
        # python object
        rejected_labels.append([datetime.datetime(2011, 1, 1, 0, 0),
                                datetime.datetime(2011, 1, 1, 1, 1)])

        for labels in rejected_labels:
            df.columns = labels
            self.check_error_on_write(df, engine, ValueError)

    @pytest.mark.parametrize('compression', [None, 'gzip', 'snappy', 'brotli'])
    def test_compression(self, engine, compression):
        """Round-trip under each compression, skipping missing codecs."""
        # the optional codecs are importable under their compression name
        if compression in ('snappy', 'brotli'):
            pytest.importorskip(compression)

        df = pd.DataFrame({'A': [1, 2, 3]})
        check_round_trip(df, engine, write_kwargs={'compression': compression})

    def test_read_columns(self, engine):
        """Reading with ``columns`` returns only the requested column."""
        # GH18154
        df = pd.DataFrame({'string': list('abc'),
                           'int': list(range(1, 4))})
        only_string = pd.DataFrame({'string': list('abc')})
        check_round_trip(df, engine, expected=only_string,
                         read_kwargs={'columns': ['string']})

    def test_write_index(self, engine):
        """Default, non-default and named indexes round-trip."""
        # index names are not compared for fastparquet
        check_names = engine != 'fastparquet'

        if engine == 'pyarrow':
            import pyarrow
            if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
                pytest.skip("pyarrow is < 0.7.0")

        df = pd.DataFrame({'A': [1, 2, 3]})
        check_round_trip(df, engine)

        # non-default index
        indexes = [
            [2, 3, 4],
            pd.date_range('20130101', periods=3),
            list('abc'),
            [1, 3, 4],
        ]
        for index in indexes:
            df.index = index
            check_round_trip(df, engine, check_names=check_names)

        # index with meta-data
        df.index = [0, 1, 2]
        df.index.name = 'foo'
        check_round_trip(df, engine)

    def test_write_multiindex(self, pa_ge_070):
        """A row MultiIndex round-trips with pyarrow >= 0.7.0."""
        # Not supported in fastparquet as of 0.1.3 or older pyarrow version
        engine = pa_ge_070

        df = pd.DataFrame({'A': [1, 2, 3]})
        df.index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
        check_round_trip(df, engine)

    def test_write_column_multiindex(self, engine):
        """MultiIndex columns are rejected on write."""
        # column multi-index
        mi_columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
        df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
        self.check_error_on_write(df, engine, ValueError)

    def test_multiindex_with_columns(self, pa_ge_070):
        """Named and unnamed row MultiIndexes round-trip, also with a subset."""
        engine = pa_ge_070
        dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
        df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
                          columns=list('ABC'))
        named = pd.MultiIndex.from_product(
            [['Level1', 'Level2'], dates],
            names=['level', 'date'])
        unnamed = named.copy(names=None)
        subset = ['A', 'B']
        for index in (named, unnamed):
            df.index = index

            check_round_trip(df, engine)
            check_round_trip(df, engine, read_kwargs={'columns': ['A', 'B']},
                             expected=df[subset])
class TestParquetPyArrow(Base):
    """Checks specific to the pyarrow engine."""

    def test_basic(self, pa, df_full):
        """The full frame plus pyarrow-only columns round-trips."""
        df = df_full

        # additional supported types for pyarrow
        import pyarrow
        supports_tz = (LooseVersion(pyarrow.__version__) >=
                       LooseVersion('0.7.0'))
        if supports_tz:
            df['datetime_tz'] = pd.date_range('20130101', periods=3,
                                              tz='Europe/Brussels')
        df['bool_with_none'] = [True, None, True]

        check_round_trip(df, pa)

    @pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)")
    def test_basic_subset_columns(self, pa, df_full):
        """Reading a column subset of a frame that has a tz-aware column."""
        # GH18628
        df = df_full
        # additional supported types for pyarrow
        df['datetime_tz'] = pd.date_range('20130101', periods=3,
                                          tz='Europe/Brussels')

        wanted = ['string', 'int']
        check_round_trip(df, pa, expected=df[wanted],
                         read_kwargs={'columns': ['string', 'int']})

    def test_duplicate_columns(self, pa):
        """Duplicate column labels are rejected on write."""
        # not currently able to handle duplicate columns
        values = np.arange(12).reshape(4, 3)
        df = pd.DataFrame(values, columns=list('aaa')).copy()
        self.check_error_on_write(df, pa, ValueError)

    def test_unsupported(self, pa):
        """Period, timedelta and mixed-object columns fail on write."""
        # period
        periods = pd.period_range('2013', freq='M', periods=3)
        self.check_error_on_write(pd.DataFrame({'a': periods}),
                                  pa, ValueError)

        # timedelta
        deltas = pd.timedelta_range('1 day', periods=3)
        self.check_error_on_write(pd.DataFrame({'a': deltas}),
                                  pa, NotImplementedError)

        # mixed python objects
        mixed = ['a', 1, 2.0]
        self.check_error_on_write(pd.DataFrame({'a': mixed}),
                                  pa, ValueError)

    def test_categorical(self, pa_ge_070):
        """Categoricals are written and come back as object dtype."""
        pa = pa_ge_070

        # supported in >= 0.7.0
        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})

        # de-serialized as object
        as_object = df.assign(a=df.a.astype(object))
        check_round_trip(df, pa, expected=as_object)

    def test_categorical_unsupported(self, pa_lt_070):
        """Categoricals are rejected by pyarrow older than 0.7.0."""
        pa = pa_lt_070

        # supported in >= 0.7.0
        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
        self.check_error_on_write(df, pa, NotImplementedError)

    def test_s3_roundtrip(self, df_compat, s3_resource, pa):
        """Round-trip through an s3:// path in the 'pandas-test' bucket."""
        # GH #19134
        s3_path = 's3://pandas-test/pyarrow.parquet'
        check_round_trip(df_compat, pa, path=s3_path)
class TestParquetFastParquet(Base):
    """Checks specific to the fastparquet engine."""

    def test_basic(self, fp, df_full):
        """The full frame plus fastparquet-only columns round-trips."""
        df = df_full

        # additional supported types for fastparquet
        supports_tz = (LooseVersion(fastparquet.__version__) >=
                       LooseVersion('0.1.4'))
        if supports_tz:
            df['datetime_tz'] = pd.date_range('20130101', periods=3,
                                              tz='US/Eastern')
        df['timedelta'] = pd.timedelta_range('1 day', periods=3)

        check_round_trip(df, fp)

    @pytest.mark.skip(reason="not supported")
    def test_duplicate_columns(self, fp):
        """Duplicate column labels are expected to be rejected on write."""
        # not currently able to handle duplicate columns
        values = np.arange(12).reshape(4, 3)
        df = pd.DataFrame(values, columns=list('aaa')).copy()
        self.check_error_on_write(df, fp, ValueError)

    def test_bool_with_none(self, fp):
        """Booleans with a missing value come back as float16."""
        df = pd.DataFrame({'a': [True, None, False]})
        as_float = pd.DataFrame({'a': [1.0, np.nan, 0.0]}, dtype='float16')
        check_round_trip(df, fp, expected=as_float)

    def test_unsupported(self, fp):
        """Period and mixed-object columns fail on write."""
        # period
        periods = pd.period_range('2013', freq='M', periods=3)
        self.check_error_on_write(pd.DataFrame({'a': periods}),
                                  fp, ValueError)

        # mixed
        mixed = ['a', 1, 2.0]
        self.check_error_on_write(pd.DataFrame({'a': mixed}),
                                  fp, ValueError)

    def test_categorical(self, fp):
        """Categoricals round-trip with fastparquet >= 0.1.3."""
        installed = LooseVersion(fastparquet.__version__)
        if installed < LooseVersion("0.1.3"):
            pytest.skip("CategoricalDtype not supported for older fp")

        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
        check_round_trip(df, fp)

    def test_datetime_tz(self, fp_lt_014):
        """tz-aware dates come back tz-naive on fastparquet < 0.1.4."""
        # fastparquet<0.1.4 doesn't preserve tz
        tz_dates = pd.date_range('20130101', periods=3, tz='US/Eastern')
        df = pd.DataFrame({'a': tz_dates})

        # warns on the coercion
        with catch_warnings(record=True):
            check_round_trip(df, fp_lt_014,
                             expected=df.astype('datetime64[ns]'))

    def test_filter_row_groups(self, fp):
        """A ``filters`` argument selects a single one-row row group."""
        df = pd.DataFrame({'a': list(range(0, 3))})
        with tm.ensure_clean() as path:
            # one row per row group, so the filter can drop whole groups
            df.to_parquet(path, fp, compression=None,
                          row_group_offsets=1)
            result = read_parquet(path, fp, filters=[('a', '==', 0)])
        assert len(result) == 1

    def test_s3_roundtrip(self, df_compat, s3_resource, fp):
        """Round-trip through an s3:// path in the 'pandas-test' bucket."""
        # GH #19134
        s3_path = 's3://pandas-test/fastparquet.parquet'
        check_round_trip(df_compat, fp, path=s3_path)
@@ -0,0 +1,480 @@
# pylint: disable=E1101,E1103,W0232
"""
manage legacy pickle tests
How to add pickle tests:
1. Install pandas version intended to output the pickle.
2. Execute "generate_legacy_storage_files.py" to create the pickle.
$ python generate_legacy_storage_files.py <output_dir> pickle
3. Move the created pickle to "data/legacy_pickle/<version>" directory.
"""
import glob
import pytest
from warnings import catch_warnings
import os
from distutils.version import LooseVersion
import pandas as pd
from pandas import Index
from pandas.compat import is_platform_little_endian, PY3
import pandas
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas.tseries.offsets import Day, MonthEnd
import shutil
@pytest.fixture(scope='module')
def current_pickle_data():
    """Pickle test data as generated by the current pandas version."""
    # imported lazily so the generator module is only loaded when needed
    from pandas.tests.io import generate_legacy_storage_files as generator
    return generator.create_pickle_data()
# ---------------------
# comparison functions
# ---------------------
def compare_element(result, expected, typ, version=None):
    """Compare one unpickled object with its expected value.

    Index objects use ``assert_index_equal``; ``sp_*`` types use the
    matching ``tm.assert_<typ>_equal`` with ``exact_indices=False``;
    timestamps are compared by value and ``freq`` (or identity for NaT);
    everything else uses ``tm.assert_<typ>_equal`` when it exists and
    ``tm.assert_almost_equal`` otherwise.
    """
    if isinstance(expected, Index):
        tm.assert_index_equal(expected, result)
        return

    if typ.startswith('sp_'):
        sparse_assert = getattr(tm, "assert_%s_equal" % typ)
        sparse_assert(result, expected, exact_indices=False)
        return

    if typ == 'timestamp':
        if expected is pd.NaT:
            assert result is pd.NaT
            return
        assert result == expected
        assert result.freq == expected.freq
        return

    generic_assert = getattr(tm, "assert_%s_equal" % typ,
                             tm.assert_almost_equal)
    generic_assert(result, expected)
def compare(data, vf, version):
    """Compare the legacy pickle ``vf`` against the current ``data``.

    Parameters
    ----------
    data : dict
        Current-version data, keyed by type and then by item name.
    vf : str
        Path of the legacy pickle file.
    version : str
        Version string of the pandas release that wrote ``vf``.

    Returns
    -------
    The unpickled legacy data, or None when the pickle protocol of ``vf``
    is not supported by the running interpreter.
    """
    # Keep the unpickled content in its own name. Rebinding ``data`` here
    # would make every object below be compared with itself.
    # py3 compat when reading py2 pickle
    try:
        legacy = pandas.read_pickle(vf)
    except ValueError as e:
        if 'unsupported pickle protocol:' in str(e):
            # trying to read a py3 pickle in py2
            return
        raise

    m = globals()
    for typ, dv in legacy.items():
        for dt, result in dv.items():
            try:
                expected = data[typ][dt]
            except KeyError:
                if version in ('0.10.1', '0.11.0') and dt == 'reg':
                    break
                raise

            # use a specific comparator
            # if available
            comparator_name = "compare_{typ}_{dt}".format(typ=typ, dt=dt)
            comparator = m.get(comparator_name, m['compare_element'])
            comparator(result, expected, typ, version)
    return legacy
def compare_sp_series_ts(res, exp, typ, version):
    """Compare sparse time series, ignoring the series type up to 0.12.0."""
    # SparseTimeSeries integrated into SparseSeries in 0.12.0
    # and deprecated in 0.17.0
    extra = {}
    if version and LooseVersion(version) <= LooseVersion("0.12.0"):
        extra['check_series_type'] = False
    tm.assert_sp_series_equal(res, exp, **extra)
def compare_series_ts(result, expected, typ, version):
    """Compare time series and check arithmetic on the index frequency."""
    # GH 7748
    tm.assert_series_equal(result, expected)
    assert result.index.freq == expected.index.freq
    assert not result.index.freq.normalize
    tm.assert_series_equal(result > 0, expected > 0)

    # GH 9291
    freq = result.index.freq
    assert freq + Day(1) == Day(2)

    # adding a Timedelta must produce a Timedelta of the summed length
    for delta_kwargs in ({'hours': 1}, {'nanoseconds': 1}):
        res = freq + pandas.Timedelta(**delta_kwargs)
        assert isinstance(res, pandas.Timedelta)
        assert res == pandas.Timedelta(days=1, **delta_kwargs)
def compare_series_dt_tz(result, expected, typ, version):
    """Compare tz-aware series; pickles older than 0.17.0 hold object dtype."""
    # 8260
    # dtype is object < 0.17.0
    written_before_0_17 = LooseVersion(version) < LooseVersion('0.17.0')
    if written_before_0_17:
        expected = expected.astype(object)
    tm.assert_series_equal(result, expected)
def compare_series_cat(result, expected, typ, version):
    """Compare categorical series, relaxing checks for old pickles."""
    # Categorical dtype is added in 0.15.0
    # ordered is changed in 0.16.0
    written_by = LooseVersion(version)
    if written_by < LooseVersion('0.15.0'):
        relaxed = {'check_dtype': False, 'check_categorical': False}
    elif written_by < LooseVersion('0.16.0'):
        relaxed = {'check_categorical': False}
    else:
        relaxed = {}
    tm.assert_series_equal(result, expected, **relaxed)
def compare_frame_dt_mixed_tzs(result, expected, typ, version):
    """Compare mixed-tz frames; pickles older than 0.17.0 hold object dtype."""
    # 8260
    # dtype is object < 0.17.0
    written_before_0_17 = LooseVersion(version) < LooseVersion('0.17.0')
    if written_before_0_17:
        expected = expected.astype(object)
    tm.assert_frame_equal(result, expected)
def compare_frame_cat_onecol(result, expected, typ, version):
    """Compare categorical frames, relaxing checks for old pickles."""
    # Categorical dtype is added in 0.15.0
    # ordered is changed in 0.16.0
    written_by = LooseVersion(version)
    if written_by < LooseVersion('0.15.0'):
        relaxed = {'check_dtype': False, 'check_categorical': False}
    elif written_by < LooseVersion('0.16.0'):
        relaxed = {'check_categorical': False}
    else:
        relaxed = {}
    tm.assert_frame_equal(result, expected, **relaxed)
def compare_frame_cat_and_float(result, expected, typ, version):
    """Apply the same comparison as for single-column categorical frames."""
    compare_frame_cat_onecol(result=result, expected=expected,
                             typ=typ, version=version)
def compare_index_period(result, expected, typ, version):
    """Compare period indexes and check the monthly frequency survived."""
    tm.assert_index_equal(result, expected)

    freq = result.freq
    assert isinstance(freq, MonthEnd)
    assert freq == MonthEnd()
    assert result.freqstr == 'M'

    # shifting must behave the same on both sides
    periods = 2
    tm.assert_index_equal(result.shift(periods), expected.shift(periods))
def compare_sp_frame_float(result, expected, typ, version):
    """Compare sparse float frames, relaxing checks for pickles <= 0.18.1."""
    relaxed = {}
    if LooseVersion(version) <= LooseVersion('0.18.1'):
        relaxed = {'exact_indices': False, 'check_dtype': False}
    tm.assert_sp_frame_equal(result, expected, **relaxed)
# every legacy pickle file shipped next to this module, one level of
# sub-directory per pandas version
files = glob.glob(
    os.path.join(os.path.dirname(__file__),
                 "data", "legacy_pickle", "*", "*.pickle"))


@pytest.fixture(params=files)
def legacy_pickle(request, datapath):
    """Resolve each discovered legacy pickle file through ``datapath``."""
    pickle_file = request.param
    return datapath(pickle_file)
# ---------------------
# tests
# ---------------------
def test_pickles(current_pickle_data, legacy_pickle):
    """Run ``compare`` on one legacy pickle file."""
    if not is_platform_little_endian():
        pytest.skip("known failure on non-little endian")

    # the version is the name of the directory holding the pickle
    version_dir = os.path.dirname(legacy_pickle)
    version = os.path.basename(version_dir)
    with catch_warnings(record=True):
        compare(current_pickle_data, legacy_pickle, version)
def test_round_trip_current(current_pickle_data):
    """Write each current object with every pickler, read with every reader.

    Writers: ``pd.to_pickle``, ``cPickle`` (when importable) and ``pickle``.
    Readers: ``pd.read_pickle``, ``cPickle`` (when importable) and ``pickle``.
    """
    # cPickle only exists on Python 2. Catch just the failed import so that
    # any other error is not silently swallowed.
    try:
        import cPickle as c_pickle
    except ImportError:
        c_pickler = None
        c_unpickler = None
    else:
        def c_pickler(obj, path):
            with open(path, 'wb') as fh:
                c_pickle.dump(obj, fh, protocol=-1)

        def c_unpickler(path):
            with open(path, 'rb') as fh:
                fh.seek(0)
                return c_pickle.load(fh)

    import pickle as python_pickle

    def python_pickler(obj, path):
        with open(path, 'wb') as fh:
            python_pickle.dump(obj, fh, protocol=-1)

    def python_unpickler(path):
        with open(path, 'rb') as fh:
            fh.seek(0)
            return python_pickle.load(fh)

    data = current_pickle_data
    for typ, dv in data.items():
        for dt, expected in dv.items():

            for writer in [pd.to_pickle, c_pickler, python_pickler]:
                if writer is None:
                    continue

                with tm.ensure_clean() as path:

                    # test writing with each pickler
                    writer(expected, path)

                    # test reading with each unpickler
                    result = pd.read_pickle(path)
                    compare_element(result, expected, typ)

                    if c_unpickler is not None:
                        result = c_unpickler(path)
                        compare_element(result, expected, typ)

                    result = python_unpickler(path)
                    compare_element(result, expected, typ)
def test_pickle_v0_14_1(datapath):
    """A Categorical pickled by v0.14.1 loads equal to a fresh one."""
    pickle_path = datapath('io', 'data', 'categorical_0_14_1.pickle')
    # This code was executed once on v0.14.1 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    expected = pd.Categorical(values=['a', 'b', 'c'], ordered=False,
                              categories=['a', 'b', 'c', 'd'])
    loaded = pd.read_pickle(pickle_path)
    tm.assert_categorical_equal(expected, loaded)
def test_pickle_v0_15_2(datapath):
    """A Categorical pickled by v0.15.2 loads equal to a fresh one."""
    # ordered -> _ordered
    # GH 9347
    pickle_path = datapath('io', 'data', 'categorical_0_15_2.pickle')
    # This code was executed once on v0.15.2 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    expected = pd.Categorical(values=['a', 'b', 'c'], ordered=False,
                              categories=['a', 'b', 'c', 'd'])
    loaded = pd.read_pickle(pickle_path)
    tm.assert_categorical_equal(expected, loaded)
def test_pickle_path_pathlib():
    """Round-trip a frame via ``tm.round_trip_pathlib``."""
    original = tm.makeDataFrame()
    restored = tm.round_trip_pathlib(original.to_pickle, pd.read_pickle)
    tm.assert_frame_equal(original, restored)
def test_pickle_path_localpath():
    """Round-trip a frame via ``tm.round_trip_localpath``."""
    original = tm.makeDataFrame()
    restored = tm.round_trip_localpath(original.to_pickle, pd.read_pickle)
    tm.assert_frame_equal(original, restored)
# ---------------------
# test pickle compression
# ---------------------
@pytest.fixture
def get_random_path():
    """Return a pickle file name built around ``tm.rands(10)``."""
    token = tm.rands(10)
    return u'__%s__.pickle' % token
class TestCompression(object):
    """Pickle round-trips through explicit and inferred compression."""

    # compression name -> file extension used by the *_infer tests to work
    # out which compression a given extension stands for
    _compression_to_extension = {
        None: ".none",
        'gzip': '.gz',
        'bz2': '.bz2',
        'zip': '.zip',
        'xz': '.xz',
    }

    def compress_file(self, src_path, dest_path, compression):
        """Write ``src_path`` to ``dest_path`` using ``compression``.

        ``compression`` is one of None (plain copy), 'gzip', 'bz2', 'zip'
        or 'xz'; any other value raises ValueError.
        """
        if compression is None:
            shutil.copyfile(src_path, dest_path)
            return

        if compression == 'zip':
            import zipfile
            # The archive must be closed for its central directory to be
            # written; the context manager guarantees that.
            with zipfile.ZipFile(dest_path, "w",
                                 compression=zipfile.ZIP_DEFLATED) as zf:
                zf.write(src_path, os.path.basename(src_path))
            return

        if compression == 'gzip':
            import gzip
            f = gzip.open(dest_path, "w")
        elif compression == 'bz2':
            import bz2
            f = bz2.BZ2File(dest_path, "w")
        elif compression == 'xz':
            lzma = pandas.compat.import_lzma()
            f = lzma.LZMAFile(dest_path, "w")
        else:
            msg = 'Unrecognized compression type: {}'.format(compression)
            raise ValueError(msg)

        # close the compressed handle even if reading or writing fails
        try:
            with open(src_path, "rb") as fh:
                f.write(fh.read())
        finally:
            f.close()

    def test_write_explicit(self, compression, get_random_path):
        """Write with explicit compression, decompress, read back plain."""
        base = get_random_path
        path1 = base + ".compressed"
        path2 = base + ".raw"

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to compressed file
            df.to_pickle(p1, compression=compression)

            # decompress
            with tm.decompress_file(p1, compression=compression) as f:
                with open(p2, "wb") as fh:
                    fh.write(f.read())

            # read decompressed file
            df2 = pd.read_pickle(p2, compression=None)

            tm.assert_frame_equal(df, df2)

    @pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z'])
    def test_write_explicit_bad(self, compression, get_random_path):
        """Unknown compression names raise ValueError on write."""
        with tm.assert_raises_regex(ValueError,
                                    "Unrecognized compression type"):
            with tm.ensure_clean(get_random_path) as path:
                df = tm.makeDataFrame()
                df.to_pickle(path, compression=compression)

    @pytest.mark.parametrize('ext', [
        '', '.gz', '.bz2', '.no_compress',
        pytest.param('.xz', marks=td.skip_if_no_lzma)
    ])
    def test_write_infer(self, ext, get_random_path):
        """Write with compression inferred from ``ext``, then verify."""
        base = get_random_path
        path1 = base + ext
        path2 = base + ".raw"

        # compression matching the extension; None when nothing matches
        compression = None
        for c in self._compression_to_extension:
            if self._compression_to_extension[c] == ext:
                compression = c
                break

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to compressed file by inferred compression method
            df.to_pickle(p1)

            # decompress
            with tm.decompress_file(p1, compression=compression) as f:
                with open(p2, "wb") as fh:
                    fh.write(f.read())

            # read decompressed file
            df2 = pd.read_pickle(p2, compression=None)

            tm.assert_frame_equal(df, df2)

    def test_read_explicit(self, compression, get_random_path):
        """Write plain, compress by hand, read with explicit compression."""
        base = get_random_path
        path1 = base + ".raw"
        path2 = base + ".compressed"

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to uncompressed file
            df.to_pickle(p1, compression=None)

            # compress
            self.compress_file(p1, p2, compression=compression)

            # read compressed file
            df2 = pd.read_pickle(p2, compression=compression)

            tm.assert_frame_equal(df, df2)

    @pytest.mark.parametrize('ext', [
        '', '.gz', '.bz2', '.zip', '.no_compress',
        pytest.param('.xz', marks=td.skip_if_no_lzma)
    ])
    def test_read_infer(self, ext, get_random_path):
        """Write plain, compress by hand, read with inferred compression."""
        base = get_random_path
        path1 = base + ".raw"
        path2 = base + ext

        # compression matching the extension; None when nothing matches
        compression = None
        for c in self._compression_to_extension:
            if self._compression_to_extension[c] == ext:
                compression = c
                break

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to uncompressed file
            df.to_pickle(p1, compression=None)

            # compress
            self.compress_file(p1, p2, compression=compression)

            # read compressed file by inferred compression method
            df2 = pd.read_pickle(p2)

            tm.assert_frame_equal(df, df2)
# ---------------------
# test pickle protocol
# ---------------------
class TestProtocol(object):
    """Pickle round-trips under explicit protocol numbers."""

    @pytest.mark.parametrize('protocol', [-1, 0, 1, 2])
    def test_read(self, protocol, get_random_path):
        """A frame written with ``protocol`` reads back unchanged."""
        with tm.ensure_clean(get_random_path) as path:
            original = tm.makeDataFrame()
            original.to_pickle(path, protocol=protocol)
            restored = pd.read_pickle(path)
            tm.assert_frame_equal(original, restored)

    @pytest.mark.parametrize('protocol', [3, 4])
    @pytest.mark.skipif(PY3, reason="Testing invalid parameters for Python 2")
    def test_read_bad_versions(self, protocol, get_random_path):
        """Protocols above 2 are rejected with ValueError (Python 2 only)."""
        # For Python 2, HIGHEST_PROTOCOL should be 2.
        template = ("pickle protocol {protocol} asked for; the highest "
                    "available protocol is 2")
        msg = template.format(protocol=protocol)
        with tm.assert_raises_regex(ValueError, msg):
            with tm.ensure_clean(get_random_path) as path:
                frame = tm.makeDataFrame()
                frame.to_pickle(path, protocol=protocol)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,8 @@
from pandas.io.common import is_s3_url
class TestS3URL(object):
    """Checks for the ``is_s3_url`` helper."""

    def test_is_s3_url(self):
        """Only the ``s3://`` scheme is recognised as an S3 URL."""
        s3_url = "s3://pandas/somethingelse.com"
        other_url = "s4://pandas/somethingelse.com"
        assert is_s3_url(s3_url)
        assert not is_s3_url(other_url)
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff