Static code analysis and corrections
@@ -0,0 +1,90 @@
from distutils.version import LooseVersion
import os

import pytest

import pandas.util.testing as tm

from pandas.io.parsers import read_csv


@pytest.fixture
def tips_file(datapath):
    """Path to the tips dataset"""
    return datapath('io', 'parser', 'data', 'tips.csv')


@pytest.fixture
def jsonl_file(datapath):
    """Path to a JSONL dataset"""
    return datapath('io', 'parser', 'data', 'items.jsonl')


@pytest.fixture
def salaries_table(datapath):
    """DataFrame with the salaries dataset"""
    return read_csv(datapath('io', 'parser', 'data', 'salaries.csv'), sep='\t')


@pytest.fixture
def s3_resource(tips_file, jsonl_file):
    """Fixture for mocking S3 interaction.

    The primary bucket name is "pandas-test". The following datasets
    are loaded:

    - tips.csv
    - tips.csv.gz
    - tips.csv.bz2
    - items.jsonl

    A private bucket "cant_get_it" is also created. The boto3 s3 resource
    is yielded by the fixture.
    """
    pytest.importorskip('s3fs')
    boto3 = pytest.importorskip('boto3')
    botocore = pytest.importorskip('botocore')

    if LooseVersion(botocore.__version__) < LooseVersion("1.11.0"):
        # botocore leaks an uncatchable ResourceWarning before 1.11.0;
        # see GH 23731 and https://github.com/boto/botocore/issues/1464
        pytest.skip("botocore is leaking resources before 1.11.0")

    with tm.ensure_safe_environment_variables():
        # temporary workaround as moto fails for botocore >= 1.11 otherwise,
        # see https://github.com/spulec/moto/issues/1924 & 1952
        os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
        os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")

        moto = pytest.importorskip('moto')

        test_s3_files = [
            ('tips.csv', tips_file),
            ('tips.csv.gz', tips_file + '.gz'),
            ('tips.csv.bz2', tips_file + '.bz2'),
            ('items.jsonl', jsonl_file),
        ]

        def add_tips_files(bucket_name):
            for s3_key, file_name in test_s3_files:
                with open(file_name, 'rb') as f:
                    conn.Bucket(bucket_name).put_object(
                        Key=s3_key,
                        Body=f)

        try:
            s3 = moto.mock_s3()
            s3.start()

            # see gh-16135
            bucket = 'pandas-test'
            conn = boto3.resource("s3", region_name="us-east-1")

            conn.create_bucket(Bucket=bucket)
            add_tips_files(bucket)

            conn.create_bucket(Bucket='cant_get_it', ACL='private')
            add_tips_files('cant_get_it')
            yield conn
        finally:
            s3.stop()
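For orientation, a test consumes this fixture by reading straight from the mocked bucket. A minimal sketch (not part of this commit; it assumes s3fs routes s3:// URLs to the moto endpoint while the fixture is active):

import pandas as pd


def test_read_csv_from_mock_s3(s3_resource):
    # "pandas-test" is the public bucket populated by the fixture above
    df = pd.read_csv('s3://pandas-test/tips.csv')
    assert not df.empty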
@@ -0,0 +1,74 @@
import pytest

from pandas.io.formats.console import detect_console_encoding


class MockEncoding(object):  # TODO(py27): replace with mock
    """
    Used to add a side effect when accessing the 'encoding' property. If the
    side effect is a str, that value is returned; otherwise the side effect
    should be an exception, which is raised.
    """
    def __init__(self, encoding):
        super(MockEncoding, self).__init__()
        self.val = encoding

    @property
    def encoding(self):
        return self.raise_or_return(self.val)

    @staticmethod
    def raise_or_return(val):
        if isinstance(val, str):
            return val
        else:
            raise val


@pytest.mark.parametrize('empty,filled', [
    ['stdin', 'stdout'],
    ['stdout', 'stdin']
])
def test_detect_console_encoding_from_stdout_stdin(monkeypatch, empty, filled):
    # Ensures that sys.stdout.encoding or sys.stdin.encoding is used when
    # it has a value.
    # GH 21552
    with monkeypatch.context() as context:
        context.setattr('sys.{}'.format(empty), MockEncoding(''))
        context.setattr('sys.{}'.format(filled), MockEncoding(filled))
        assert detect_console_encoding() == filled


@pytest.mark.parametrize('encoding', [
    AttributeError,
    IOError,
    'ascii'
])
def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding):
    # GH 21552
    with monkeypatch.context() as context:
        context.setattr('locale.getpreferredencoding', lambda: 'foo')
        context.setattr('sys.stdout', MockEncoding(encoding))
        assert detect_console_encoding() == 'foo'


@pytest.mark.parametrize('std,locale', [
    ['ascii', 'ascii'],
    ['ascii', Exception],
    [AttributeError, 'ascii'],
    [AttributeError, Exception],
    [IOError, 'ascii'],
    [IOError, Exception]
])
def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale):
    # When both the stdout/stdin encoding and locale preferred encoding checks
    # fail (or return 'ascii'), we should default to the sys default encoding.
    # GH 21552
    with monkeypatch.context() as context:
        context.setattr(
            'locale.getpreferredencoding',
            lambda: MockEncoding.raise_or_return(locale)
        )
        context.setattr('sys.stdout', MockEncoding(std))
        context.setattr('sys.getdefaultencoding', lambda: 'sysDefaultEncoding')
        assert detect_console_encoding() == 'sysDefaultEncoding'
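The three tests above pin down a three-step cascade. A simplified sketch of that order (an illustration of the behavior under test, not the actual pandas implementation):

import locale
import sys


def detect_console_encoding_sketch():
    # 1. prefer an encoding reported by stdout, falling back to stdin
    try:
        encoding = sys.stdout.encoding or sys.stdin.encoding
    except (AttributeError, IOError):
        encoding = None

    # 2. an empty or 'ascii' answer is treated as unreliable, so try
    #    the locale's preferred encoding next
    if not encoding or encoding.lower() == 'ascii':
        try:
            encoding = locale.getpreferredencoding()
        except Exception:
            encoding = None

    # 3. last resort: the interpreter's default encoding
    if not encoding or encoding.lower() == 'ascii':
        encoding = sys.getdefaultencoding()
    return encoding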
@@ -0,0 +1,187 @@
import pytest

from pandas.util import testing as tm

from pandas.io.formats.css import CSSResolver, CSSWarning


def assert_resolves(css, props, inherited=None):
    resolve = CSSResolver()
    actual = resolve(css, inherited=inherited)
    assert props == actual


def assert_same_resolution(css1, css2, inherited=None):
    resolve = CSSResolver()
    resolved1 = resolve(css1, inherited=inherited)
    resolved2 = resolve(css2, inherited=inherited)
    assert resolved1 == resolved2


@pytest.mark.parametrize('name,norm,abnorm', [
    ('whitespace', 'hello: world; foo: bar',
     ' \t hello \t :\n  world \n  ;  \n foo: \tbar\n\n'),
    ('case', 'hello: world; foo: bar', 'Hello: WORLD; foO: bar'),
    ('empty-decl', 'hello: world; foo: bar',
     '; hello: world;; foo: bar;\n; ;'),
    ('empty-list', '', ';'),
])
def test_css_parse_normalisation(name, norm, abnorm):
    assert_same_resolution(norm, abnorm)


@pytest.mark.parametrize(
    'invalid_css,remainder', [
        # No colon
        ('hello-world', ''),
        ('border-style: solid; hello-world', 'border-style: solid'),
        ('border-style: solid; hello-world; font-weight: bold',
         'border-style: solid; font-weight: bold'),
        # Unclosed string fail
        # Invalid size
        ('font-size: blah', 'font-size: 1em'),
        ('font-size: 1a2b', 'font-size: 1em'),
        ('font-size: 1e5pt', 'font-size: 1em'),
        ('font-size: 1+6pt', 'font-size: 1em'),
        ('font-size: 1unknownunit', 'font-size: 1em'),
        ('font-size: 10', 'font-size: 1em'),
        ('font-size: 10 pt', 'font-size: 1em'),
    ])
def test_css_parse_invalid(invalid_css, remainder):
    with tm.assert_produces_warning(CSSWarning):
        assert_same_resolution(invalid_css, remainder)

    # TODO: we should be checking that in other cases no warnings are raised


@pytest.mark.parametrize(
    'shorthand,expansions',
    [('margin', ['margin-top', 'margin-right',
                 'margin-bottom', 'margin-left']),
     ('padding', ['padding-top', 'padding-right',
                  'padding-bottom', 'padding-left']),
     ('border-width', ['border-top-width', 'border-right-width',
                       'border-bottom-width', 'border-left-width']),
     ('border-color', ['border-top-color', 'border-right-color',
                       'border-bottom-color', 'border-left-color']),
     ('border-style', ['border-top-style', 'border-right-style',
                       'border-bottom-style', 'border-left-style']),
     ])
def test_css_side_shorthands(shorthand, expansions):
    top, right, bottom, left = expansions

    assert_resolves('{shorthand}: 1pt'.format(shorthand=shorthand),
                    {top: '1pt', right: '1pt',
                     bottom: '1pt', left: '1pt'})

    assert_resolves('{shorthand}: 1pt 4pt'.format(shorthand=shorthand),
                    {top: '1pt', right: '4pt',
                     bottom: '1pt', left: '4pt'})

    assert_resolves('{shorthand}: 1pt 4pt 2pt'.format(shorthand=shorthand),
                    {top: '1pt', right: '4pt',
                     bottom: '2pt', left: '4pt'})

    assert_resolves('{shorthand}: 1pt 4pt 2pt 0pt'.format(shorthand=shorthand),
                    {top: '1pt', right: '4pt',
                     bottom: '2pt', left: '0pt'})

    with tm.assert_produces_warning(CSSWarning):
        assert_resolves(
            '{shorthand}: 1pt 1pt 1pt 1pt 1pt'.format(shorthand=shorthand), {})


@pytest.mark.parametrize('style,inherited,equiv', [
    ('margin: 1px; margin: 2px', '',
     'margin: 2px'),
    ('margin: 1px', 'margin: 2px',
     'margin: 1px'),
    ('margin: 1px; margin: inherit', 'margin: 2px',
     'margin: 2px'),
    ('margin: 1px; margin-top: 2px', '',
     'margin-left: 1px; margin-right: 1px; ' +
     'margin-bottom: 1px; margin-top: 2px'),
    ('margin-top: 2px', 'margin: 1px',
     'margin: 1px; margin-top: 2px'),
    ('margin: 1px', 'margin-top: 2px',
     'margin: 1px'),
    ('margin: 1px; margin-top: inherit', 'margin: 2px',
     'margin: 1px; margin-top: 2px'),
])
def test_css_precedence(style, inherited, equiv):
    resolve = CSSResolver()
    inherited_props = resolve(inherited)
    style_props = resolve(style, inherited=inherited_props)
    equiv_props = resolve(equiv)
    assert style_props == equiv_props


@pytest.mark.parametrize('style,equiv', [
    ('margin: 1px; margin-top: inherit',
     'margin-bottom: 1px; margin-right: 1px; margin-left: 1px'),
    ('margin-top: inherit', ''),
    ('margin-top: initial', ''),
])
def test_css_none_absent(style, equiv):
    assert_same_resolution(style, equiv)


@pytest.mark.parametrize('size,resolved', [
    ('xx-small', '6pt'),
    ('x-small', '{pt:f}pt'.format(pt=7.5)),
    ('small', '{pt:f}pt'.format(pt=9.6)),
    ('medium', '12pt'),
    ('large', '{pt:f}pt'.format(pt=13.5)),
    ('x-large', '18pt'),
    ('xx-large', '24pt'),

    ('8px', '6pt'),
    ('1.25pc', '15pt'),
    ('.25in', '18pt'),
    ('02.54cm', '72pt'),
    ('25.4mm', '72pt'),
    ('101.6q', '72pt'),
])
@pytest.mark.parametrize('relative_to',  # invariant to inherited size
                         [None, '16pt'])
def test_css_absolute_font_size(size, relative_to, resolved):
    if relative_to is None:
        inherited = None
    else:
        inherited = {'font-size': relative_to}
    assert_resolves('font-size: {size}'.format(size=size),
                    {'font-size': resolved}, inherited=inherited)


@pytest.mark.parametrize('size,relative_to,resolved', [
    ('1em', None, '12pt'),
    ('1.0em', None, '12pt'),
    ('1.25em', None, '15pt'),
    ('1em', '16pt', '16pt'),
    ('1.0em', '16pt', '16pt'),
    ('1.25em', '16pt', '20pt'),
    ('1rem', '16pt', '12pt'),
    ('1.0rem', '16pt', '12pt'),
    ('1.25rem', '16pt', '15pt'),
    ('100%', None, '12pt'),
    ('125%', None, '15pt'),
    ('100%', '16pt', '16pt'),
    ('125%', '16pt', '20pt'),
    ('2ex', None, '12pt'),
    ('2.0ex', None, '12pt'),
    ('2.50ex', None, '15pt'),
    ('inherit', '16pt', '16pt'),

    ('smaller', None, '10pt'),
    ('smaller', '18pt', '15pt'),
    ('larger', None, '{pt:f}pt'.format(pt=14.4)),
    ('larger', '15pt', '18pt'),
])
def test_css_relative_font_size(size, relative_to, resolved):
    if relative_to is None:
        inherited = None
    else:
        inherited = {'font-size': relative_to}
    assert_resolves('font-size: {size}'.format(size=size),
                    {'font-size': resolved}, inherited=inherited)
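For orientation, the resolver maps a CSS declaration string (plus optional inherited properties) to a flat property dict. A small sketch grounded in the shorthand cases above:

from pandas.io.formats.css import CSSResolver

resolve = CSSResolver()
props = resolve('margin: 1pt 4pt')
# the shorthand expands to the four sides:
# {'margin-top': '1pt', 'margin-right': '4pt',
#  'margin-bottom': '1pt', 'margin-left': '4pt'}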
@@ -0,0 +1,196 @@
import numpy as np

from pandas.compat import u

import pandas as pd
from pandas import DataFrame
from pandas.util import testing as tm

import pandas.io.formats.format as fmt


class TestEngFormatter(object):

    def test_eng_float_formatter(self):
        df = DataFrame({'A': [1.41, 141., 14100, 1410000.]})

        fmt.set_eng_float_format()
        result = df.to_string()
        expected = ('             A\n'
                    '0    1.410E+00\n'
                    '1  141.000E+00\n'
                    '2   14.100E+03\n'
                    '3    1.410E+06')
        assert result == expected

        fmt.set_eng_float_format(use_eng_prefix=True)
        result = df.to_string()
        expected = ('         A\n'
                    '0    1.410\n'
                    '1  141.000\n'
                    '2  14.100k\n'
                    '3   1.410M')
        assert result == expected

        fmt.set_eng_float_format(accuracy=0)
        result = df.to_string()
        expected = ('         A\n'
                    '0    1E+00\n'
                    '1  141E+00\n'
                    '2   14E+03\n'
                    '3    1E+06')
        assert result == expected

        tm.reset_display_options()

    def compare(self, formatter, input, output):
        formatted_input = formatter(input)
        assert formatted_input == output

    def compare_all(self, formatter, in_out):
        """
        Parameters
        ----------
        formatter : EngFormatter under test
        in_out : list of tuples. Each tuple = (number, expected_formatting)

        It is tested if 'formatter(number) == expected_formatting'.
        *number* should be >= 0 because 'formatter(-number) == fmt' is also
        tested. *fmt* is derived from *expected_formatting*.
        """
        for input, output in in_out:
            self.compare(formatter, input, output)
            self.compare(formatter, -input, "-" + output[1:])

    def test_exponents_with_eng_prefix(self):
        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
        f = np.sqrt(2)
        in_out = [
            (f * 10 ** -24, " 1.414y"), (f * 10 ** -23, " 14.142y"),
            (f * 10 ** -22, " 141.421y"), (f * 10 ** -21, " 1.414z"),
            (f * 10 ** -20, " 14.142z"), (f * 10 ** -19, " 141.421z"),
            (f * 10 ** -18, " 1.414a"), (f * 10 ** -17, " 14.142a"),
            (f * 10 ** -16, " 141.421a"), (f * 10 ** -15, " 1.414f"),
            (f * 10 ** -14, " 14.142f"), (f * 10 ** -13, " 141.421f"),
            (f * 10 ** -12, " 1.414p"), (f * 10 ** -11, " 14.142p"),
            (f * 10 ** -10, " 141.421p"), (f * 10 ** -9, " 1.414n"),
            (f * 10 ** -8, " 14.142n"), (f * 10 ** -7, " 141.421n"),
            (f * 10 ** -6, " 1.414u"), (f * 10 ** -5, " 14.142u"),
            (f * 10 ** -4, " 141.421u"), (f * 10 ** -3, " 1.414m"),
            (f * 10 ** -2, " 14.142m"), (f * 10 ** -1, " 141.421m"),
            (f * 10 ** 0, " 1.414"), (f * 10 ** 1, " 14.142"),
            (f * 10 ** 2, " 141.421"), (f * 10 ** 3, " 1.414k"),
            (f * 10 ** 4, " 14.142k"), (f * 10 ** 5, " 141.421k"),
            (f * 10 ** 6, " 1.414M"), (f * 10 ** 7, " 14.142M"),
            (f * 10 ** 8, " 141.421M"), (f * 10 ** 9, " 1.414G"),
            (f * 10 ** 10, " 14.142G"), (f * 10 ** 11, " 141.421G"),
            (f * 10 ** 12, " 1.414T"), (f * 10 ** 13, " 14.142T"),
            (f * 10 ** 14, " 141.421T"), (f * 10 ** 15, " 1.414P"),
            (f * 10 ** 16, " 14.142P"), (f * 10 ** 17, " 141.421P"),
            (f * 10 ** 18, " 1.414E"), (f * 10 ** 19, " 14.142E"),
            (f * 10 ** 20, " 141.421E"), (f * 10 ** 21, " 1.414Z"),
            (f * 10 ** 22, " 14.142Z"), (f * 10 ** 23, " 141.421Z"),
            (f * 10 ** 24, " 1.414Y"), (f * 10 ** 25, " 14.142Y"),
            (f * 10 ** 26, " 141.421Y")]
        self.compare_all(formatter, in_out)

    def test_exponents_without_eng_prefix(self):
        formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False)
        f = np.pi
        in_out = [
            (f * 10 ** -24, " 3.1416E-24"),
            (f * 10 ** -23, " 31.4159E-24"),
            (f * 10 ** -22, " 314.1593E-24"),
            (f * 10 ** -21, " 3.1416E-21"),
            (f * 10 ** -20, " 31.4159E-21"),
            (f * 10 ** -19, " 314.1593E-21"),
            (f * 10 ** -18, " 3.1416E-18"),
            (f * 10 ** -17, " 31.4159E-18"),
            (f * 10 ** -16, " 314.1593E-18"),
            (f * 10 ** -15, " 3.1416E-15"),
            (f * 10 ** -14, " 31.4159E-15"),
            (f * 10 ** -13, " 314.1593E-15"),
            (f * 10 ** -12, " 3.1416E-12"),
            (f * 10 ** -11, " 31.4159E-12"),
            (f * 10 ** -10, " 314.1593E-12"),
            (f * 10 ** -9, " 3.1416E-09"),
            (f * 10 ** -8, " 31.4159E-09"),
            (f * 10 ** -7, " 314.1593E-09"),
            (f * 10 ** -6, " 3.1416E-06"),
            (f * 10 ** -5, " 31.4159E-06"),
            (f * 10 ** -4, " 314.1593E-06"),
            (f * 10 ** -3, " 3.1416E-03"),
            (f * 10 ** -2, " 31.4159E-03"),
            (f * 10 ** -1, " 314.1593E-03"),
            (f * 10 ** 0, " 3.1416E+00"),
            (f * 10 ** 1, " 31.4159E+00"),
            (f * 10 ** 2, " 314.1593E+00"),
            (f * 10 ** 3, " 3.1416E+03"),
            (f * 10 ** 4, " 31.4159E+03"),
            (f * 10 ** 5, " 314.1593E+03"),
            (f * 10 ** 6, " 3.1416E+06"),
            (f * 10 ** 7, " 31.4159E+06"),
            (f * 10 ** 8, " 314.1593E+06"),
            (f * 10 ** 9, " 3.1416E+09"),
            (f * 10 ** 10, " 31.4159E+09"),
            (f * 10 ** 11, " 314.1593E+09"),
            (f * 10 ** 12, " 3.1416E+12"),
            (f * 10 ** 13, " 31.4159E+12"),
            (f * 10 ** 14, " 314.1593E+12"),
            (f * 10 ** 15, " 3.1416E+15"),
            (f * 10 ** 16, " 31.4159E+15"),
            (f * 10 ** 17, " 314.1593E+15"),
            (f * 10 ** 18, " 3.1416E+18"),
            (f * 10 ** 19, " 31.4159E+18"),
            (f * 10 ** 20, " 314.1593E+18"),
            (f * 10 ** 21, " 3.1416E+21"),
            (f * 10 ** 22, " 31.4159E+21"),
            (f * 10 ** 23, " 314.1593E+21"),
            (f * 10 ** 24, " 3.1416E+24"),
            (f * 10 ** 25, " 31.4159E+24"),
            (f * 10 ** 26, " 314.1593E+24")]
        self.compare_all(formatter, in_out)

    def test_rounding(self):
        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
        in_out = [(5.55555, ' 5.556'), (55.5555, ' 55.556'),
                  (555.555, ' 555.555'), (5555.55, ' 5.556k'),
                  (55555.5, ' 55.556k'), (555555, ' 555.555k')]
        self.compare_all(formatter, in_out)

        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
        in_out = [(5.55555, ' 5.6'), (55.5555, ' 55.6'), (555.555, ' 555.6'),
                  (5555.55, ' 5.6k'), (55555.5, ' 55.6k'), (555555, ' 555.6k')]
        self.compare_all(formatter, in_out)

        formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True)
        in_out = [(5.55555, ' 6'), (55.5555, ' 56'), (555.555, ' 556'),
                  (5555.55, ' 6k'), (55555.5, ' 56k'), (555555, ' 556k')]
        self.compare_all(formatter, in_out)

        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
        result = formatter(0)
        assert result == u(' 0.000')

    def test_nan(self):
        # Issue #11981

        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
        result = formatter(np.nan)
        assert result == u('NaN')

        df = pd.DataFrame({'a': [1.5, 10.3, 20.5],
                           'b': [50.3, 60.67, 70.12],
                           'c': [100.2, 101.33, 120.33]})
        pt = df.pivot_table(values='a', index='b', columns='c')
        fmt.set_eng_float_format(accuracy=1)
        result = pt.to_string()
        assert 'NaN' in result
        tm.reset_display_options()

    def test_inf(self):
        # Issue #11981

        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
        result = formatter(np.inf)
        assert result == u('inf')
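The formatter also works standalone; a short sketch whose outputs are taken from the rounding cases above:

import pandas.io.formats.format as fmt

formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
formatter(5555.55)   # returns ' 5.6k'
formatter(-5555.55)  # returns '-5.6k' (the sign replaces the leading space)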
File diff suppressed because it is too large
@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-
import numpy as np
import pytest

import pandas as pd
from pandas import compat
import pandas.core.config as cf

import pandas.io.formats.format as fmt
import pandas.io.formats.printing as printing


def test_adjoin():
    data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']]
    expected = 'a  dd  ggg\nb  ee  hhh\nc  ff  iii'

    adjoined = printing.adjoin(2, *data)

    assert adjoined == expected


def test_repr_binary_type():
    import string
    letters = string.ascii_letters
    btype = compat.binary_type
    try:
        raw = btype(letters, encoding=cf.get_option('display.encoding'))
    except TypeError:
        raw = btype(letters)
    b = compat.text_type(compat.bytes_to_str(raw))
    res = printing.pprint_thing(b, quote_strings=True)
    assert res == repr(b)
    res = printing.pprint_thing(b, quote_strings=False)
    assert res == b


class TestFormatBase(object):

    def test_adjoin(self):
        data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']]
        expected = 'a  dd  ggg\nb  ee  hhh\nc  ff  iii'

        adjoined = printing.adjoin(2, *data)

        assert adjoined == expected

    def test_adjoin_unicode(self):
        data = [[u'あ', 'b', 'c'], ['dd', u'ええ', 'ff'],
                ['ggg', 'hhh', u'いいい']]
        expected = u'あ  dd  ggg\nb  ええ  hhh\nc  ff  いいい'
        adjoined = printing.adjoin(2, *data)
        assert adjoined == expected

        adj = fmt.EastAsianTextAdjustment()

        expected = u"""あ  dd    ggg
b   ええ  hhh
c   ff    いいい"""

        adjoined = adj.adjoin(2, *data)
        assert adjoined == expected
        cols = adjoined.split('\n')
        assert adj.len(cols[0]) == 13
        assert adj.len(cols[1]) == 13
        assert adj.len(cols[2]) == 16

        expected = u"""あ       dd         ggg
b        ええ       hhh
c        ff         いいい"""

        adjoined = adj.adjoin(7, *data)
        assert adjoined == expected
        cols = adjoined.split('\n')
        assert adj.len(cols[0]) == 23
        assert adj.len(cols[1]) == 23
        assert adj.len(cols[2]) == 26

    def test_justify(self):
        adj = fmt.EastAsianTextAdjustment()

        def just(x, *args, **kwargs):
            # wrapper to test single str
            return adj.justify([x], *args, **kwargs)[0]

        assert just('abc', 5, mode='left') == 'abc  '
        assert just('abc', 5, mode='center') == ' abc '
        assert just('abc', 5, mode='right') == '  abc'
        assert just(u'abc', 5, mode='left') == 'abc  '
        assert just(u'abc', 5, mode='center') == ' abc '
        assert just(u'abc', 5, mode='right') == '  abc'

        assert just(u'パンダ', 5, mode='left') == u'パンダ'
        assert just(u'パンダ', 5, mode='center') == u'パンダ'
        assert just(u'パンダ', 5, mode='right') == u'パンダ'

        assert just(u'パンダ', 10, mode='left') == u'パンダ    '
        assert just(u'パンダ', 10, mode='center') == u'  パンダ  '
        assert just(u'パンダ', 10, mode='right') == u'    パンダ'

    def test_east_asian_len(self):
        adj = fmt.EastAsianTextAdjustment()

        assert adj.len('abc') == 3
        assert adj.len(u'abc') == 3

        assert adj.len(u'パンダ') == 6
        assert adj.len(u'ﾊﾟﾝﾀﾞ') == 5
        assert adj.len(u'パンダpanda') == 11
        assert adj.len(u'ﾊﾟﾝﾀﾞpanda') == 10

    def test_ambiguous_width(self):
        adj = fmt.EastAsianTextAdjustment()
        assert adj.len(u'¡¡ab') == 4

        with cf.option_context('display.unicode.ambiguous_as_wide', True):
            adj = fmt.EastAsianTextAdjustment()
            assert adj.len(u'¡¡ab') == 6

            data = [[u'あ', 'b', 'c'], ['dd', u'ええ', 'ff'],
                    ['ggg', u'¡¡ab', u'いいい']]
            expected = u'あ  dd    ggg \nb   ええ  ¡¡ab\nc   ff    いいい'
            adjoined = adj.adjoin(2, *data)
            assert adjoined == expected


class TestTableSchemaRepr(object):

    @classmethod
    def setup_class(cls):
        pytest.importorskip('IPython')

        from IPython.core.interactiveshell import InteractiveShell
        cls.display_formatter = InteractiveShell.instance().display_formatter

    def test_publishes(self):

        df = pd.DataFrame({"A": [1, 2]})
        objects = [df['A'], df]  # Series, then DataFrame
        expected_keys = [
            {'text/plain', 'application/vnd.dataresource+json'},
            {'text/plain', 'text/html', 'application/vnd.dataresource+json'},
        ]

        opt = pd.option_context('display.html.table_schema', True)
        for obj, expected in zip(objects, expected_keys):
            with opt:
                formatted = self.display_formatter.format(obj)
            assert set(formatted[0].keys()) == expected

        with_latex = pd.option_context('display.latex.repr', True)

        with opt, with_latex:
            formatted = self.display_formatter.format(obj)

        expected = {'text/plain', 'text/html', 'text/latex',
                    'application/vnd.dataresource+json'}
        assert set(formatted[0].keys()) == expected

    def test_publishes_not_implemented(self):
        # column MultiIndex
        # GH 15996
        midx = pd.MultiIndex.from_product([['A', 'B'], ['a', 'b', 'c']])
        df = pd.DataFrame(np.random.randn(5, len(midx)), columns=midx)

        opt = pd.option_context('display.html.table_schema', True)

        with opt:
            formatted = self.display_formatter.format(df)

        expected = {'text/plain', 'text/html'}
        assert set(formatted[0].keys()) == expected

    def test_config_on(self):
        df = pd.DataFrame({"A": [1, 2]})
        with pd.option_context("display.html.table_schema", True):
            result = df._repr_data_resource_()

        assert result is not None

    def test_config_default_off(self):
        df = pd.DataFrame({"A": [1, 2]})
        with pd.option_context("display.html.table_schema", False):
            result = df._repr_data_resource_()

        assert result is None

    def test_enable_data_resource_formatter(self):
        # GH 10491
        formatters = self.display_formatter.formatters
        mimetype = 'application/vnd.dataresource+json'

        with pd.option_context('display.html.table_schema', True):
            assert 'application/vnd.dataresource+json' in formatters
            assert formatters[mimetype].enabled

        # still there, just disabled
        assert 'application/vnd.dataresource+json' in formatters
        assert not formatters[mimetype].enabled

        # able to re-set
        with pd.option_context('display.html.table_schema', True):
            assert 'application/vnd.dataresource+json' in formatters
            assert formatters[mimetype].enabled
            # smoke test that it works
            self.display_formatter.format(cf)
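The table-schema option exercised by TestTableSchemaRepr can also be tried directly; a brief sketch grounded in test_config_on and test_config_default_off:

import pandas as pd

df = pd.DataFrame({'A': [1, 2]})
with pd.option_context('display.html.table_schema', True):
    payload = df._repr_data_resource_()
# payload is a Table Schema ("data resource") dict; with the option
# off (the default), _repr_data_resource_() returns None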
File diff suppressed because it is too large
@@ -0,0 +1,563 @@
# -*- coding: utf-8 -*-

import os
import sys

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, compat
from pandas.util import testing as tm


class TestToCSV(object):

    @pytest.mark.xfail((3, 6, 5) > sys.version_info >= (3, 5),
                       reason=("Python csv library bug "
                               "(see https://bugs.python.org/issue32255)"))
    def test_to_csv_with_single_column(self):
        # see gh-18676, https://bugs.python.org/issue32255
        #
        # Python's CSV library adds an extraneous '""'
        # before the newline when the NaN-value is in
        # the first row. Otherwise, only the newline
        # character is added. This behavior is inconsistent
        # and was patched in https://bugs.python.org/pull_request4672.
        df1 = DataFrame([None, 1])
        expected1 = """\
""
1.0
"""
        with tm.ensure_clean('test.csv') as path:
            df1.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected1

        df2 = DataFrame([1, None])
        expected2 = """\
1.0
""
"""
        with tm.ensure_clean('test.csv') as path:
            df2.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected2

    def test_to_csv_default_encoding(self):
        # GH17097
        df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]})

        with tm.ensure_clean('test.csv') as path:
            # the default to_csv encoding is ascii in Python 2
            # and utf-8 in Python 3.
            if pd.compat.PY2:
                # without an explicit encoding='utf-8', writing these
                # values fails under the ascii codec
                with pytest.raises(UnicodeEncodeError, match='ascii'):
                    df.to_csv(path)
            else:
                df.to_csv(path)
                tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)

    def test_to_csv_quotechar(self):
        df = DataFrame({'col': [1, 2]})
        expected = """\
"","col"
"0","1"
"1","2"
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1)  # 1=QUOTE_ALL
            with open(path, 'r') as f:
                assert f.read() == expected

        expected = """\
$$,$col$
$0$,$1$
$1$,$2$
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1, quotechar="$")
            with open(path, 'r') as f:
                assert f.read() == expected

        with tm.ensure_clean('test.csv') as path:
            with pytest.raises(TypeError, match='quotechar'):
                df.to_csv(path, quoting=1, quotechar=None)

    def test_to_csv_doublequote(self):
        df = DataFrame({'col': ['a"a', '"bb"']})
        expected = '''\
"","col"
"0","a""a"
"1","""bb"""
'''

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1, doublequote=True)  # QUOTE_ALL
            with open(path, 'r') as f:
                assert f.read() == expected

        from _csv import Error
        with tm.ensure_clean('test.csv') as path:
            with pytest.raises(Error, match='escapechar'):
                df.to_csv(path, doublequote=False)  # no escapechar set

    def test_to_csv_escapechar(self):
        df = DataFrame({'col': ['a"a', '"bb"']})
        expected = '''\
"","col"
"0","a\\"a"
"1","\\"bb\\""
'''

        with tm.ensure_clean('test.csv') as path:  # QUOTE_ALL
            df.to_csv(path, quoting=1, doublequote=False, escapechar='\\')
            with open(path, 'r') as f:
                assert f.read() == expected

        df = DataFrame({'col': ['a,a', ',bb,']})
        expected = """\
,col
0,a\\,a
1,\\,bb\\,
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=3, escapechar='\\')  # QUOTE_NONE
            with open(path, 'r') as f:
                assert f.read() == expected

    def test_csv_to_string(self):
        df = DataFrame({'col': [1, 2]})
        expected_rows = [',col',
                         '0,1',
                         '1,2']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected

    def test_to_csv_decimal(self):
        # see gh-781
        df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})

        expected_rows = [',col1,col2,col3',
                         '0,1,a,10.1']
        expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected_default

        expected_rows = [';col1;col2;col3',
                         '0;1;a;10,1']
        expected_european_excel = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(decimal=',', sep=';') == expected_european_excel

        expected_rows = [',col1,col2,col3',
                         '0,1,a,10.10']
        expected_float_format_default = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(float_format='%.2f') == expected_float_format_default

        expected_rows = [';col1;col2;col3',
                         '0;1;a;10,10']
        expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(decimal=',', sep=';',
                         float_format='%.2f') == expected_float_format

        # see gh-11553: testing if decimal is taken into account for '0.0'
        df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})

        expected_rows = ['a,b,c',
                         '0^0,2^2,1',
                         '1^1,3^3,1']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(index=False, decimal='^') == expected

        # same but for an index
        assert df.set_index('a').to_csv(decimal='^') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected

    def test_to_csv_float_format(self):
        # testing if float_format is taken into account for the index
        # GH 11553
        df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1})

        expected_rows = ['a,b,c',
                         '0,2.20,1',
                         '1,3.30,1']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.set_index('a').to_csv(float_format='%.2f') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(
            float_format='%.2f') == expected

    def test_to_csv_na_rep(self):
        # see gh-11553
        #
        # Testing if NaN values are correctly represented in the index.
        df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c',
                         '0.0,0,2',
                         '_,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

        # now with an index containing only NaNs
        df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c',
                         '_,0,2',
                         '_,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

        # check if na_rep parameter does not break anything when no NaN
        df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c',
                         '0,0,2',
                         '0,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

    def test_to_csv_date_format(self):
        # GH 10209
        df_sec = DataFrame({'A': pd.date_range('20130101', periods=5,
                                               freq='s')})
        df_day = DataFrame({'A': pd.date_range('20130101', periods=5,
                                               freq='d')})

        expected_rows = [',A',
                         '0,2013-01-01 00:00:00',
                         '1,2013-01-01 00:00:01',
                         '2,2013-01-01 00:00:02',
                         '3,2013-01-01 00:00:03',
                         '4,2013-01-01 00:00:04']
        expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv() == expected_default_sec

        expected_rows = [',A',
                         '0,2013-01-01 00:00:00',
                         '1,2013-01-02 00:00:00',
                         '2,2013-01-03 00:00:00',
                         '3,2013-01-04 00:00:00',
                         '4,2013-01-05 00:00:00']
        expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert (df_day.to_csv(date_format='%Y-%m-%d %H:%M:%S') ==
                expected_ymdhms_day)

        expected_rows = [',A',
                         '0,2013-01-01',
                         '1,2013-01-01',
                         '2,2013-01-01',
                         '3,2013-01-01',
                         '4,2013-01-01']
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv(date_format='%Y-%m-%d') == expected_ymd_sec

        expected_rows = [',A',
                         '0,2013-01-01',
                         '1,2013-01-02',
                         '2,2013-01-03',
                         '3,2013-01-04',
                         '4,2013-01-05']
        expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_day.to_csv() == expected_default_day
        assert df_day.to_csv(date_format='%Y-%m-%d') == expected_default_day

        # see gh-7791
        #
        # Testing if date_format parameter is taken into account
        # for multi-indexed DataFrames.
        df_sec['B'] = 0
        df_sec['C'] = 1

        expected_rows = ['A,B,C',
                         '2013-01-01,0,1']
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)

        df_sec_grouped = df_sec.groupby([pd.Grouper(key='A', freq='1h'), 'B'])
        assert (df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d') ==
                expected_ymd_sec)

    def test_to_csv_multi_index(self):
        # see gh-6618
        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))

        exp_rows = [',1',
                    ',2',
                    '0,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['1', '2', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]),
                       index=pd.MultiIndex.from_arrays([[1], [2]]))

        exp_rows = [',,1', ',,2', '1,2,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['1', '2', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame(
            [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']]))

        exp_rows = [',foo', ',bar', '0,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['foo', 'bar', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

    @pytest.mark.parametrize("ind,expected", [
        (pd.MultiIndex(levels=[[1.0]],
                       codes=[[0]],
                       names=["x"]),
         "x,data\n1.0,1\n"),
        (pd.MultiIndex(levels=[[1.], [2.]],
                       codes=[[0], [0]],
                       names=["x", "y"]),
         "x,y,data\n1.0,2.0,1\n")
    ])
    @pytest.mark.parametrize("klass", [
        pd.DataFrame, pd.Series
    ])
    def test_to_csv_single_level_multi_index(self, ind, expected, klass):
        # see gh-19589
        result = klass(pd.Series([1], ind, name="data")).to_csv(
            line_terminator="\n", header=True)
        assert result == expected

    def test_to_csv_string_array_ascii(self):
        # GH 10813
        str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
        df = pd.DataFrame(str_array)
        expected_ascii = '''\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
'''
        with tm.ensure_clean('str_test.csv') as path:
            df.to_csv(path, encoding='ascii')
            with open(path, 'r') as f:
                assert f.read() == expected_ascii

    @pytest.mark.xfail
    def test_to_csv_string_array_utf8(self):
        # GH 10813
        str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
        df = pd.DataFrame(str_array)
        expected_utf8 = '''\
,names
0,"[u'foo', u'bar']"
1,"[u'baz', u'qux']"
'''
        with tm.ensure_clean('unicode_test.csv') as path:
            df.to_csv(path, encoding='utf-8')
            with open(path, 'r') as f:
                assert f.read() == expected_utf8

    def test_to_csv_string_with_lf(self):
        # GH 20353
        data = {
            'int': [1, 2, 3],
            'str_lf': ['abc', 'd\nef', 'g\nh\n\ni']
        }
        df = pd.DataFrame(data)
        with tm.ensure_clean('lf_test.csv') as path:
            # case 1: the default line terminator (os.linesep; see PR 21406)
            os_linesep = os.linesep.encode('utf-8')
            expected_noarg = (
                b'int,str_lf' + os_linesep +
                b'1,abc' + os_linesep +
                b'2,"d\nef"' + os_linesep +
                b'3,"g\nh\n\ni"' + os_linesep
            )
            df.to_csv(path, index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean('lf_test.csv') as path:
            # case 2: LF as line terminator
            expected_lf = (
                b'int,str_lf\n'
                b'1,abc\n'
                b'2,"d\nef"\n'
                b'3,"g\nh\n\ni"\n'
            )
            df.to_csv(path, line_terminator='\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_lf
        with tm.ensure_clean('lf_test.csv') as path:
            # case 3: CRLF as line terminator
            # 'line_terminator' should not change inner element
            expected_crlf = (
                b'int,str_lf\r\n'
                b'1,abc\r\n'
                b'2,"d\nef"\r\n'
                b'3,"g\nh\n\ni"\r\n'
            )
            df.to_csv(path, line_terminator='\r\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_crlf

    def test_to_csv_string_with_crlf(self):
        # GH 20353
        data = {
            'int': [1, 2, 3],
            'str_crlf': ['abc', 'd\r\nef', 'g\r\nh\r\n\r\ni']
        }
        df = pd.DataFrame(data)
        with tm.ensure_clean('crlf_test.csv') as path:
            # case 1: the default line terminator (os.linesep; see PR 21406)
            os_linesep = os.linesep.encode('utf-8')
            expected_noarg = (
                b'int,str_crlf' + os_linesep +
                b'1,abc' + os_linesep +
                b'2,"d\r\nef"' + os_linesep +
                b'3,"g\r\nh\r\n\r\ni"' + os_linesep
            )
            df.to_csv(path, index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean('crlf_test.csv') as path:
            # case 2: LF as line terminator
            expected_lf = (
                b'int,str_crlf\n'
                b'1,abc\n'
                b'2,"d\r\nef"\n'
                b'3,"g\r\nh\r\n\r\ni"\n'
            )
            df.to_csv(path, line_terminator='\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_lf
        with tm.ensure_clean('crlf_test.csv') as path:
            # case 3: CRLF as line terminator
            # 'line_terminator' should not change inner element
            expected_crlf = (
                b'int,str_crlf\r\n'
                b'1,abc\r\n'
                b'2,"d\r\nef"\r\n'
                b'3,"g\r\nh\r\n\r\ni"\r\n'
            )
            df.to_csv(path, line_terminator='\r\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_crlf

    def test_to_csv_stdout_file(self, capsys):
        # GH 21561
        df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']],
                          columns=['name_1', 'name_2'])
        expected_rows = [',name_1,name_2',
                         '0,foo,bar',
                         '1,baz,qux']
        expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows)

        df.to_csv(sys.stdout, encoding='ascii')
        captured = capsys.readouterr()

        assert captured.out == expected_ascii
        assert not sys.stdout.closed

    @pytest.mark.xfail(
        compat.is_platform_windows(),
        reason=("Especially on Windows, a file stream should not be passed "
                "to the csv writer without the newline='' option. "
                "(https://docs.python.org/3.6/library/csv.html#csv.writer)"))
    def test_to_csv_write_to_open_file(self):
        # GH 21696
        df = pd.DataFrame({'a': ['x', 'y', 'z']})
        expected = '''\
manual header
x
y
z
'''
        with tm.ensure_clean('test.txt') as path:
            with open(path, 'w') as f:
                f.write('manual header\n')
                df.to_csv(f, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected

    @pytest.mark.skipif(compat.PY2, reason="Test case for Python 3")
    def test_to_csv_write_to_open_file_with_newline_py3(self):
        # see gh-21696
        # see gh-20353
        df = pd.DataFrame({'a': ['x', 'y', 'z']})
        expected_rows = ["x",
                         "y",
                         "z"]
        expected = ("manual header\n" +
                    tm.convert_rows_list_to_csv_str(expected_rows))
        with tm.ensure_clean('test.txt') as path:
            with open(path, 'w', newline='') as f:
                f.write('manual header\n')
                df.to_csv(f, header=None, index=None)

            with open(path, 'rb') as f:
                assert f.read() == bytes(expected, 'utf-8')

    @pytest.mark.skipif(compat.PY3, reason="Test case for Python 2")
    def test_to_csv_write_to_open_file_with_newline_py2(self):
        # see gh-21696
        # see gh-20353
        df = pd.DataFrame({'a': ['x', 'y', 'z']})
        expected_rows = ["x",
                         "y",
                         "z"]
        expected = ("manual header\n" +
                    tm.convert_rows_list_to_csv_str(expected_rows))
        with tm.ensure_clean('test.txt') as path:
            with open(path, 'wb') as f:
                f.write('manual header\n')
                df.to_csv(f, header=None, index=None)

            with open(path, 'rb') as f:
                assert f.read() == expected

    @pytest.mark.parametrize("to_infer", [True, False])
    @pytest.mark.parametrize("read_infer", [True, False])
    def test_to_csv_compression(self, compression_only,
                                read_infer, to_infer):
        # see gh-15008
        compression = compression_only

        if compression == "zip":
            pytest.skip("{compression} is not supported "
                        "for to_csv".format(compression=compression))

        # We'll complete the file extension subsequently.
        filename = "test."

        if compression == "gzip":
            filename += "gz"
        else:
            # xz --> .xz
            # bz2 --> .bz2
            filename += compression

        df = DataFrame({"A": [1]})

        to_compression = "infer" if to_infer else compression
        read_compression = "infer" if read_infer else compression

        with tm.ensure_clean(filename) as path:
            df.to_csv(path, compression=to_compression)
            result = pd.read_csv(path, index_col=0,
                                 compression=read_compression)
            tm.assert_frame_equal(result, df)
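A readability note on the numeric quoting codes used throughout these tests: they are the csv module's constants, so the calls can be written symbolically:

import csv

assert csv.QUOTE_ALL == 1   # used in the quotechar/doublequote tests
assert csv.QUOTE_NONE == 3  # used in the escapechar test

# e.g. df.to_csv(path, quoting=1) is equivalent to
#      df.to_csv(path, quoting=csv.QUOTE_ALL)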
@@ -0,0 +1,278 @@
|
||||
"""Tests formatting as writer-agnostic ExcelCells
|
||||
|
||||
ExcelFormatter is tested implicitly in pandas/tests/io/test_excel.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.io.formats.css import CSSWarning
|
||||
from pandas.io.formats.excel import CSSToExcelConverter
|
||||
|
||||
|
||||
@pytest.mark.parametrize('css,expected', [
|
||||
# FONT
|
||||
# - name
|
||||
('font-family: foo,bar', {'font': {'name': 'foo'}}),
|
||||
('font-family: "foo bar",baz', {'font': {'name': 'foo bar'}}),
|
||||
('font-family: foo,\nbar', {'font': {'name': 'foo'}}),
|
||||
('font-family: foo, bar, baz', {'font': {'name': 'foo'}}),
|
||||
('font-family: bar, foo', {'font': {'name': 'bar'}}),
|
||||
('font-family: \'foo bar\', baz', {'font': {'name': 'foo bar'}}),
|
||||
('font-family: \'foo \\\'bar\', baz', {'font': {'name': 'foo \'bar'}}),
|
||||
('font-family: "foo \\"bar", baz', {'font': {'name': 'foo "bar'}}),
|
||||
('font-family: "foo ,bar", baz', {'font': {'name': 'foo ,bar'}}),
|
||||
# - family
|
||||
('font-family: serif', {'font': {'name': 'serif', 'family': 1}}),
|
||||
('font-family: Serif', {'font': {'name': 'serif', 'family': 1}}),
|
||||
('font-family: roman, serif', {'font': {'name': 'roman', 'family': 1}}),
|
||||
('font-family: roman, sans-serif', {'font': {'name': 'roman',
|
||||
'family': 2}}),
|
||||
('font-family: roman, sans serif', {'font': {'name': 'roman'}}),
|
||||
('font-family: roman, sansserif', {'font': {'name': 'roman'}}),
|
||||
('font-family: roman, cursive', {'font': {'name': 'roman', 'family': 4}}),
|
||||
('font-family: roman, fantasy', {'font': {'name': 'roman', 'family': 5}}),
|
||||
# - size
|
||||
('font-size: 1em', {'font': {'size': 12}}),
|
||||
('font-size: xx-small', {'font': {'size': 6}}),
|
||||
('font-size: x-small', {'font': {'size': 7.5}}),
|
||||
('font-size: small', {'font': {'size': 9.6}}),
|
||||
('font-size: medium', {'font': {'size': 12}}),
|
||||
('font-size: large', {'font': {'size': 13.5}}),
|
||||
('font-size: x-large', {'font': {'size': 18}}),
|
||||
('font-size: xx-large', {'font': {'size': 24}}),
|
||||
('font-size: 50%', {'font': {'size': 6}}),
|
||||
# - bold
|
||||
('font-weight: 100', {'font': {'bold': False}}),
|
||||
('font-weight: 200', {'font': {'bold': False}}),
|
||||
('font-weight: 300', {'font': {'bold': False}}),
|
||||
('font-weight: 400', {'font': {'bold': False}}),
|
||||
('font-weight: normal', {'font': {'bold': False}}),
|
||||
('font-weight: lighter', {'font': {'bold': False}}),
|
||||
('font-weight: bold', {'font': {'bold': True}}),
|
||||
('font-weight: bolder', {'font': {'bold': True}}),
|
||||
('font-weight: 700', {'font': {'bold': True}}),
|
||||
('font-weight: 800', {'font': {'bold': True}}),
|
||||
('font-weight: 900', {'font': {'bold': True}}),
|
||||
# - italic
|
||||
('font-style: italic', {'font': {'italic': True}}),
|
||||
('font-style: oblique', {'font': {'italic': True}}),
|
||||
# - underline
|
||||
('text-decoration: underline',
|
||||
{'font': {'underline': 'single'}}),
|
||||
('text-decoration: overline',
|
||||
{}),
|
||||
('text-decoration: none',
|
||||
{}),
|
||||
# - strike
|
||||
('text-decoration: line-through',
|
||||
{'font': {'strike': True}}),
|
||||
('text-decoration: underline line-through',
|
||||
{'font': {'strike': True, 'underline': 'single'}}),
|
||||
('text-decoration: underline; text-decoration: line-through',
|
||||
{'font': {'strike': True}}),
|
||||
# - color
|
||||
('color: red', {'font': {'color': 'FF0000'}}),
|
||||
('color: #ff0000', {'font': {'color': 'FF0000'}}),
|
||||
('color: #f0a', {'font': {'color': 'FF00AA'}}),
|
||||
# - shadow
|
||||
('text-shadow: none', {'font': {'shadow': False}}),
|
||||
('text-shadow: 0px -0em 0px #CCC', {'font': {'shadow': False}}),
|
||||
('text-shadow: 0px -0em 0px #999', {'font': {'shadow': False}}),
|
||||
('text-shadow: 0px -0em 0px', {'font': {'shadow': False}}),
|
||||
('text-shadow: 2px -0em 0px #CCC', {'font': {'shadow': True}}),
|
||||
('text-shadow: 0px -2em 0px #CCC', {'font': {'shadow': True}}),
|
||||
('text-shadow: 0px -0em 2px #CCC', {'font': {'shadow': True}}),
|
||||
('text-shadow: 0px -0em 2px', {'font': {'shadow': True}}),
|
||||
('text-shadow: 0px -2em', {'font': {'shadow': True}}),
|
||||
|
||||
# FILL
|
||||
# - color, fillType
|
||||
('background-color: red', {'fill': {'fgColor': 'FF0000',
|
||||
'patternType': 'solid'}}),
|
||||
('background-color: #ff0000', {'fill': {'fgColor': 'FF0000',
|
||||
'patternType': 'solid'}}),
|
||||
('background-color: #f0a', {'fill': {'fgColor': 'FF00AA',
|
||||
'patternType': 'solid'}}),
|
||||
# BORDER
|
||||
# - style
|
||||
('border-style: solid',
|
||||
{'border': {'top': {'style': 'medium'},
|
||||
'bottom': {'style': 'medium'},
|
||||
'left': {'style': 'medium'},
|
||||
'right': {'style': 'medium'}}}),
|
||||
('border-style: solid; border-width: thin',
|
||||
{'border': {'top': {'style': 'thin'},
|
||||
'bottom': {'style': 'thin'},
|
||||
'left': {'style': 'thin'},
|
||||
'right': {'style': 'thin'}}}),
|
||||
|
||||
('border-top-style: solid; border-top-width: thin',
|
||||
{'border': {'top': {'style': 'thin'}}}),
|
||||
('border-top-style: solid; border-top-width: 1pt',
|
||||
{'border': {'top': {'style': 'thin'}}}),
|
||||
('border-top-style: solid',
|
||||
{'border': {'top': {'style': 'medium'}}}),
|
||||
('border-top-style: solid; border-top-width: medium',
|
||||
{'border': {'top': {'style': 'medium'}}}),
|
||||
('border-top-style: solid; border-top-width: 2pt',
|
||||
{'border': {'top': {'style': 'medium'}}}),
|
||||
('border-top-style: solid; border-top-width: thick',
|
||||
{'border': {'top': {'style': 'thick'}}}),
|
||||
('border-top-style: solid; border-top-width: 4pt',
|
||||
{'border': {'top': {'style': 'thick'}}}),
|
||||
|
||||
('border-top-style: dotted',
|
||||
{'border': {'top': {'style': 'mediumDashDotDot'}}}),
|
||||
('border-top-style: dotted; border-top-width: thin',
|
||||
{'border': {'top': {'style': 'dotted'}}}),
|
||||
('border-top-style: dashed',
|
||||
{'border': {'top': {'style': 'mediumDashed'}}}),
|
||||
('border-top-style: dashed; border-top-width: thin',
|
||||
{'border': {'top': {'style': 'dashed'}}}),
|
||||
('border-top-style: double',
|
||||
{'border': {'top': {'style': 'double'}}}),
|
||||
# - color
|
||||
('border-style: solid; border-color: #0000ff',
|
||||
{'border': {'top': {'style': 'medium', 'color': '0000FF'},
|
||||
'right': {'style': 'medium', 'color': '0000FF'},
|
||||
'bottom': {'style': 'medium', 'color': '0000FF'},
|
||||
'left': {'style': 'medium', 'color': '0000FF'}}}),
|
||||
('border-top-style: double; border-top-color: blue',
|
||||
{'border': {'top': {'style': 'double', 'color': '0000FF'}}}),
|
||||
('border-top-style: solid; border-top-color: #06c',
|
||||
{'border': {'top': {'style': 'medium', 'color': '0066CC'}}}),
|
||||
# ALIGNMENT
|
||||
# - horizontal
|
||||
('text-align: center',
|
||||
{'alignment': {'horizontal': 'center'}}),
|
||||
('text-align: left',
|
||||
{'alignment': {'horizontal': 'left'}}),
|
||||
('text-align: right',
|
||||
{'alignment': {'horizontal': 'right'}}),
|
||||
('text-align: justify',
|
||||
{'alignment': {'horizontal': 'justify'}}),
|
||||
# - vertical
|
||||
('vertical-align: top',
|
||||
{'alignment': {'vertical': 'top'}}),
|
||||
('vertical-align: text-top',
|
||||
{'alignment': {'vertical': 'top'}}),
|
||||
('vertical-align: middle',
|
||||
{'alignment': {'vertical': 'center'}}),
|
||||
('vertical-align: bottom',
|
||||
{'alignment': {'vertical': 'bottom'}}),
|
||||
('vertical-align: text-bottom',
|
||||
{'alignment': {'vertical': 'bottom'}}),
|
||||
# - wrap_text
|
||||
('white-space: nowrap',
|
||||
{'alignment': {'wrap_text': False}}),
|
||||
('white-space: pre',
|
||||
{'alignment': {'wrap_text': False}}),
|
||||
('white-space: pre-line',
|
||||
{'alignment': {'wrap_text': False}}),
|
||||
('white-space: normal',
|
||||
{'alignment': {'wrap_text': True}}),
|
||||
# NUMBER FORMAT
|
||||
('number-format: 0%',
|
||||
{'number_format': {'format_code': '0%'}}),
|
||||
])
|
||||
def test_css_to_excel(css, expected):
|
||||
convert = CSSToExcelConverter()
|
||||
assert expected == convert(css)
|
||||
|
||||
|
||||
def test_css_to_excel_multiple():
|
||||
convert = CSSToExcelConverter()
|
||||
actual = convert('''
|
||||
font-weight: bold;
|
||||
text-decoration: underline;
|
||||
color: red;
|
||||
border-width: thin;
|
||||
text-align: center;
|
||||
vertical-align: top;
|
||||
unused: something;
|
||||
''')
|
||||
assert {"font": {"bold": True, "underline": "single", "color": "FF0000"},
|
||||
"border": {"top": {"style": "thin"},
|
||||
"right": {"style": "thin"},
|
||||
"bottom": {"style": "thin"},
|
||||
"left": {"style": "thin"}},
|
||||
"alignment": {"horizontal": "center",
|
||||
"vertical": "top"}} == actual
|
||||
|
||||
|
||||
@pytest.mark.parametrize('css,inherited,expected', [
|
||||
('font-weight: bold', '',
|
||||
{'font': {'bold': True}}),
|
||||
('', 'font-weight: bold',
|
||||
{'font': {'bold': True}}),
|
||||
('font-weight: bold', 'font-style: italic',
|
||||
{'font': {'bold': True, 'italic': True}}),
|
||||
('font-style: normal', 'font-style: italic',
|
||||
{'font': {'italic': False}}),
|
||||
('font-style: inherit', '', {}),
|
||||
('font-style: normal; font-style: inherit', 'font-style: italic',
|
||||
{'font': {'italic': True}}),
|
||||
])
|
||||
def test_css_to_excel_inherited(css, inherited, expected):
|
||||
    convert = CSSToExcelConverter(inherited)
    assert expected == convert(css)


@pytest.mark.parametrize("input_color,output_color", (
    [(name, rgb) for name, rgb in CSSToExcelConverter.NAMED_COLORS.items()] +
    [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] +
    [("#F0F", "FF00FF"), ("#ABC", "AABBCC")])
)
def test_css_to_excel_good_colors(input_color, output_color):
    # see gh-18392
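    # Named colors, 6-digit hex, and 3-digit shorthand hex (each digit
    # doubled, e.g. #F0F -> FF00FF) should all convert without warning.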
css = ("border-top-color: {color}; "
|
||||
"border-right-color: {color}; "
|
||||
"border-bottom-color: {color}; "
|
||||
"border-left-color: {color}; "
|
||||
"background-color: {color}; "
|
||||
"color: {color}").format(color=input_color)
|
||||
|
||||
expected = dict()
|
||||
|
||||
expected["fill"] = {
|
||||
"patternType": "solid",
|
||||
"fgColor": output_color
|
||||
}
|
||||
|
||||
expected["font"] = {
|
||||
"color": output_color
|
||||
}
|
||||
|
||||
expected["border"] = {
|
||||
k: {
|
||||
"color": output_color,
|
||||
} for k in ("top", "right", "bottom", "left")
|
||||
}
|
||||
|
||||
with tm.assert_produces_warning(None):
|
||||
convert = CSSToExcelConverter()
|
||||
assert expected == convert(css)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_color", [None, "not-a-color"])
|
||||
def test_css_to_excel_bad_colors(input_color):
|
||||
# see gh-18392
|
||||
css = ("border-top-color: {color}; "
|
||||
"border-right-color: {color}; "
|
||||
"border-bottom-color: {color}; "
|
||||
"border-left-color: {color}; "
|
||||
"background-color: {color}; "
|
||||
"color: {color}").format(color=input_color)
|
||||
|
||||
expected = dict()
|
||||
|
||||
if input_color is not None:
|
||||
expected["fill"] = {
|
||||
"patternType": "solid"
|
||||
}
|
||||
|
||||
with tm.assert_produces_warning(CSSWarning):
|
||||
convert = CSSToExcelConverter()
|
||||
assert expected == convert(css)
|
||||
@@ -0,0 +1,602 @@
# -*- coding: utf-8 -*-

from datetime import datetime
from io import open
import re

import numpy as np
import pytest

from pandas.compat import StringIO, lrange, u

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, compat, option_context
from pandas.util import testing as tm

import pandas.io.formats.format as fmt


def expected_html(datapath, name):
    """
    Read HTML file from formats data directory.

    Parameters
    ----------
    datapath : pytest fixture
        The datapath fixture injected into a test by pytest.
    name : str
        The name of the HTML file without the suffix.

    Returns
    -------
    str : contents of HTML file.
    """
    filename = '.'.join([name, 'html'])
    filepath = datapath('io', 'formats', 'data', 'html', filename)
    with open(filepath, encoding='utf-8') as f:
        html = f.read()
    return html.rstrip()


@pytest.fixture(params=['mixed', 'empty'])
def biggie_df_fixture(request):
    """Fixture for a big mixed DataFrame and an empty DataFrame"""
    if request.param == 'mixed':
        df = DataFrame({'A': np.random.randn(200),
                        'B': tm.makeStringIndex(200)},
                       index=lrange(200))
        df.loc[:20, 'A'] = np.nan
        df.loc[:20, 'B'] = np.nan
        return df
    elif request.param == 'empty':
        df = DataFrame(index=np.arange(200))
        return df


@pytest.fixture(params=fmt._VALID_JUSTIFY_PARAMETERS)
def justify(request):
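    """Fixture cycling through each valid value of to_html's justify."""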
    return request.param


@pytest.mark.parametrize('col_space', [30, 50])
def test_to_html_with_col_space(col_space):
    df = DataFrame(np.random.random(size=(1, 3)))
    # check that col_space affects HTML generation
    # and be very brittle about it.
    result = df.to_html(col_space=col_space)
    hdrs = [x for x in result.split("\n") if re.search(r"<th[>\s]", x)]
    assert len(hdrs) > 0
    for h in hdrs:
        assert "min-width" in h
        assert str(col_space) in h


def test_to_html_with_empty_string_label():
    # GH 3547, to_html regards empty string labels as repeated labels
    data = {'c1': ['a', 'b'], 'c2': ['a', ''], 'data': [1, 2]}
    df = DataFrame(data).set_index(['c1', 'c2'])
    result = df.to_html()
    assert "rowspan" not in result


@pytest.mark.parametrize('df,expected', [
    (DataFrame({u('\u03c3'): np.arange(10.)}), 'unicode_1'),
    (DataFrame({'A': [u('\u03c3')]}), 'unicode_2')
])
def test_to_html_unicode(df, expected, datapath):
    expected = expected_html(datapath, expected)
    result = df.to_html()
    assert result == expected


def test_to_html_decimal(datapath):
    # GH 12031
    df = DataFrame({'A': [6.0, 3.1, 2.2]})
    result = df.to_html(decimal=',')
    expected = expected_html(datapath, 'gh12031_expected_output')
    assert result == expected


@pytest.mark.parametrize('kwargs,string,expected', [
    (dict(), "<type 'str'>", 'escaped'),
    (dict(escape=False), "<b>bold</b>", 'escape_disabled')
])
def test_to_html_escaped(kwargs, string, expected, datapath):
    a = 'str<ing1 &'
    b = 'stri>ng2 &'

    test_dict = {'co<l1': {a: string,
                           b: string},
                 'co>l2': {a: string,
                           b: string}}
    result = DataFrame(test_dict).to_html(**kwargs)
    expected = expected_html(datapath, expected)
    assert result == expected


@pytest.mark.parametrize('index_is_named', [True, False])
def test_to_html_multiindex_index_false(index_is_named, datapath):
    # GH 8452
    df = DataFrame({
        'a': range(2),
        'b': range(3, 5),
        'c': range(5, 7),
        'd': range(3, 5)
    })
    df.columns = MultiIndex.from_product([['a', 'b'], ['c', 'd']])
    if index_is_named:
        df.index = Index(df.index.values, name='idx')
    result = df.to_html(index=False)
    expected = expected_html(datapath, 'gh8452_expected_output')
    assert result == expected


@pytest.mark.parametrize('multi_sparse,expected', [
    (False, 'multiindex_sparsify_false_multi_sparse_1'),
    (False, 'multiindex_sparsify_false_multi_sparse_2'),
    (True, 'multiindex_sparsify_1'),
    (True, 'multiindex_sparsify_2')
])
def test_to_html_multiindex_sparsify(multi_sparse, expected, datapath):
    index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]],
                                   names=['foo', None])
    df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index)
    if expected.endswith('2'):
        df.columns = index[::2]
    with option_context('display.multi_sparse', multi_sparse):
        result = df.to_html()
    expected = expected_html(datapath, expected)
    assert result == expected


@pytest.mark.parametrize('max_rows,expected', [
    (60, 'gh14882_expected_output_1'),

    # Test that ... appears in a middle level
    (56, 'gh14882_expected_output_2')
])
def test_to_html_multiindex_odd_even_truncate(max_rows, expected, datapath):
    # GH 14882 - Issue on truncation with odd length DataFrame
    index = MultiIndex.from_product([[100, 200, 300],
                                     [10, 20, 30],
                                     [1, 2, 3, 4, 5, 6, 7]],
                                    names=['a', 'b', 'c'])
    df = DataFrame({'n': range(len(index))}, index=index)
    result = df.to_html(max_rows=max_rows)
    expected = expected_html(datapath, expected)
    assert result == expected


@pytest.mark.parametrize('df,formatters,expected', [
    (DataFrame(
        [[0, 1], [2, 3], [4, 5], [6, 7]],
        columns=['foo', None], index=lrange(4)),
     {'__index__': lambda x: 'abcd'[x]},
     'index_formatter'),

    (DataFrame(
        {'months': [datetime(2016, 1, 1), datetime(2016, 2, 2)]}),
     {'months': lambda x: x.strftime('%Y-%m')},
     'datetime64_monthformatter'),

    (DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'],
                                      format='%H:%M:%S.%f')}),
     {'hod': lambda x: x.strftime('%H:%M')},
     'datetime64_hourformatter')
])
def test_to_html_formatters(df, formatters, expected, datapath):
    expected = expected_html(datapath, expected)
    result = df.to_html(formatters=formatters)
    assert result == expected


def test_to_html_regression_GH6098():
    df = DataFrame({
        u('clé1'): [u('a'), u('a'), u('b'), u('b'), u('a')],
        u('clé2'): [u('1er'), u('2ème'), u('1er'), u('2ème'), u('1er')],
        'données1': np.random.randn(5),
        'données2': np.random.randn(5)})

    # it works
    df.pivot_table(index=[u('clé1')], columns=[u('clé2')])._repr_html_()


def test_to_html_truncate(datapath):
    index = pd.date_range(start='20010101', freq='D', periods=20)
    df = DataFrame(index=index, columns=range(20))
    result = df.to_html(max_rows=8, max_cols=4)
    expected = expected_html(datapath, 'truncate')
    assert result == expected


@pytest.mark.parametrize('sparsify,expected', [
    (True, 'truncate_multi_index'),
    (False, 'truncate_multi_index_sparse_off')
])
def test_to_html_truncate_multi_index(sparsify, expected, datapath):
    arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
              ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
    df = DataFrame(index=arrays, columns=arrays)
    result = df.to_html(max_rows=7, max_cols=7, sparsify=sparsify)
    expected = expected_html(datapath, expected)
    assert result == expected


@pytest.mark.parametrize('option,result,expected', [
    (None, lambda df: df.to_html(), '1'),
    (None, lambda df: df.to_html(border=0), '0'),
    (0, lambda df: df.to_html(), '0'),
    (0, lambda df: df._repr_html_(), '0'),
])
def test_to_html_border(option, result, expected):
    df = DataFrame({'A': [1, 2]})
    if option is None:
        result = result(df)
    else:
        with option_context('display.html.border', option):
            result = result(df)
    expected = 'border="{}"'.format(expected)
    assert expected in result


def test_display_option_warning():
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        pd.options.html.border


@pytest.mark.parametrize('biggie_df_fixture', ['mixed'], indirect=True)
def test_to_html(biggie_df_fixture):
    # TODO: split this test
    df = biggie_df_fixture
    s = df.to_html()

    buf = StringIO()
    retval = df.to_html(buf=buf)
    assert retval is None
    assert buf.getvalue() == s

    assert isinstance(s, compat.string_types)

    df.to_html(columns=['B', 'A'], col_space=17)
    df.to_html(columns=['B', 'A'],
               formatters={'A': lambda x: '{x:.1f}'.format(x=x)})

    df.to_html(columns=['B', 'A'], float_format=str)
    df.to_html(columns=['B', 'A'], col_space=12, float_format=str)


@pytest.mark.parametrize('biggie_df_fixture', ['empty'], indirect=True)
def test_to_html_empty_dataframe(biggie_df_fixture):
    df = biggie_df_fixture
    df.to_html()


def test_to_html_filename(biggie_df_fixture, tmpdir):
    df = biggie_df_fixture
    expected = df.to_html()
    path = tmpdir.join('test.html')
    df.to_html(path)
    result = path.read()
    assert result == expected


def test_to_html_with_no_bold():
    df = DataFrame({'x': np.random.randn(5)})
    html = df.to_html(bold_rows=False)
    result = html[html.find("</thead>"):]
    assert '<strong' not in result


def test_to_html_columns_arg():
    df = DataFrame(tm.getSeriesData())
    result = df.to_html(columns=['A'])
    assert '<th>B</th>' not in result


@pytest.mark.parametrize('columns,justify,expected', [
    (MultiIndex.from_tuples(
        list(zip(np.arange(2).repeat(2), np.mod(lrange(4), 2))),
        names=['CL0', 'CL1']),
     'left',
     'multiindex_1'),

    (MultiIndex.from_tuples(
        list(zip(range(4), np.mod(lrange(4), 2)))),
     'right',
     'multiindex_2')
])
def test_to_html_multiindex(columns, justify, expected, datapath):
    df = DataFrame([list('abcd'), list('efgh')], columns=columns)
    result = df.to_html(justify=justify)
    expected = expected_html(datapath, expected)
    assert result == expected


def test_to_html_justify(justify, datapath):
    df = DataFrame({'A': [6, 30000, 2],
                    'B': [1, 2, 70000],
                    'C': [223442, 0, 1]},
                   columns=['A', 'B', 'C'])
    result = df.to_html(justify=justify)
    expected = expected_html(datapath, 'justify').format(justify=justify)
    assert result == expected


@pytest.mark.parametrize("justify", ["super-right", "small-left",
                                     "noinherit", "tiny", "pandas"])
def test_to_html_invalid_justify(justify):
    # GH 17527
    df = DataFrame()
    msg = "Invalid value for justify parameter"

    with pytest.raises(ValueError, match=msg):
        df.to_html(justify=justify)


def test_to_html_index(datapath):
    # TODO: split this test
    index = ['foo', 'bar', 'baz']
    df = DataFrame({'A': [1, 2, 3],
                    'B': [1.2, 3.4, 5.6],
                    'C': ['one', 'two', np.nan]},
                   columns=['A', 'B', 'C'],
                   index=index)
    expected_with_index = expected_html(datapath, 'index_1')
    assert df.to_html() == expected_with_index

    expected_without_index = expected_html(datapath, 'index_2')
    result = df.to_html(index=False)
    for i in index:
        assert i not in result
    assert result == expected_without_index

    df.index = Index(['foo', 'bar', 'baz'], name='idx')
    expected_with_index = expected_html(datapath, 'index_3')
    assert df.to_html() == expected_with_index
    assert df.to_html(index=False) == expected_without_index

    tuples = [('foo', 'car'), ('foo', 'bike'), ('bar', 'car')]
    df.index = MultiIndex.from_tuples(tuples)

    expected_with_index = expected_html(datapath, 'index_4')
    assert df.to_html() == expected_with_index

    result = df.to_html(index=False)
    for i in ['foo', 'bar', 'car', 'bike']:
        assert i not in result
    # must be the same result as normal index
    assert result == expected_without_index

    df.index = MultiIndex.from_tuples(tuples, names=['idx1', 'idx2'])
    expected_with_index = expected_html(datapath, 'index_5')
    assert df.to_html() == expected_with_index
    assert df.to_html(index=False) == expected_without_index


@pytest.mark.parametrize('classes', [
    "sortable draggable",
    ["sortable", "draggable"]
])
def test_to_html_with_classes(classes, datapath):
    df = DataFrame()
    expected = expected_html(datapath, 'with_classes')
    result = df.to_html(classes=classes)
    assert result == expected


def test_to_html_no_index_max_rows(datapath):
    # GH 14998
    df = DataFrame({"A": [1, 2, 3, 4]})
    result = df.to_html(index=False, max_rows=1)
    expected = expected_html(datapath, 'gh14998_expected_output')
    assert result == expected


def test_to_html_multiindex_max_cols(datapath):
    # GH 6131
    index = MultiIndex(levels=[['ba', 'bb', 'bc'], ['ca', 'cb', 'cc']],
                       codes=[[0, 1, 2], [0, 1, 2]],
                       names=['b', 'c'])
    columns = MultiIndex(levels=[['d'], ['aa', 'ab', 'ac']],
                         codes=[[0, 0, 0], [0, 1, 2]],
                         names=[None, 'a'])
    data = np.array(
        [[1., np.nan, np.nan], [np.nan, 2., np.nan], [np.nan, np.nan, 3.]])
    df = DataFrame(data, index, columns)
    result = df.to_html(max_cols=2)
    expected = expected_html(datapath, 'gh6131_expected_output')
    assert result == expected


def test_to_html_multi_indexes_index_false(datapath):
    # GH 22579
    df = DataFrame({'a': range(10), 'b': range(10, 20), 'c': range(10, 20),
                    'd': range(10, 20)})
    df.columns = MultiIndex.from_product([['a', 'b'], ['c', 'd']])
    df.index = MultiIndex.from_product([['a', 'b'],
                                        ['c', 'd', 'e', 'f', 'g']])
    result = df.to_html(index=False)
    expected = expected_html(datapath, 'gh22579_expected_output')
    assert result == expected


@pytest.mark.parametrize('index_names', [True, False])
@pytest.mark.parametrize('header', [True, False])
@pytest.mark.parametrize('index', [True, False])
@pytest.mark.parametrize('column_index, column_type', [
    (Index([0, 1]), 'unnamed_standard'),
    (Index([0, 1], name='columns.name'), 'named_standard'),
    (MultiIndex.from_product([['a'], ['b', 'c']]), 'unnamed_multi'),
    (MultiIndex.from_product(
        [['a'], ['b', 'c']], names=['columns.name.0',
                                    'columns.name.1']), 'named_multi')
])
@pytest.mark.parametrize('row_index, row_type', [
    (Index([0, 1]), 'unnamed_standard'),
    (Index([0, 1], name='index.name'), 'named_standard'),
    (MultiIndex.from_product([['a'], ['b', 'c']]), 'unnamed_multi'),
    (MultiIndex.from_product(
        [['a'], ['b', 'c']], names=['index.name.0',
                                    'index.name.1']), 'named_multi')
])
def test_to_html_basic_alignment(
        datapath, row_index, row_type, column_index, column_type,
        index, header, index_names):
    # GH 22747, GH 22579
    df = DataFrame(np.zeros((2, 2), dtype=int),
                   index=row_index, columns=column_index)
    result = df.to_html(
        index=index, header=header, index_names=index_names)

    if not index:
        row_type = 'none'
    elif not index_names and row_type.startswith('named'):
        row_type = 'un' + row_type

    if not header:
        column_type = 'none'
    elif not index_names and column_type.startswith('named'):
        column_type = 'un' + column_type

    filename = 'index_' + row_type + '_columns_' + column_type
    expected = expected_html(datapath, filename)
    assert result == expected


@pytest.mark.parametrize('index_names', [True, False])
@pytest.mark.parametrize('header', [True, False])
@pytest.mark.parametrize('index', [True, False])
@pytest.mark.parametrize('column_index, column_type', [
    (Index(np.arange(8)), 'unnamed_standard'),
    (Index(np.arange(8), name='columns.name'), 'named_standard'),
    (MultiIndex.from_product(
        [['a', 'b'], ['c', 'd'], ['e', 'f']]), 'unnamed_multi'),
    (MultiIndex.from_product(
        [['a', 'b'], ['c', 'd'], ['e', 'f']], names=['foo', None, 'baz']),
     'named_multi')
])
@pytest.mark.parametrize('row_index, row_type', [
    (Index(np.arange(8)), 'unnamed_standard'),
    (Index(np.arange(8), name='index.name'), 'named_standard'),
    (MultiIndex.from_product(
        [['a', 'b'], ['c', 'd'], ['e', 'f']]), 'unnamed_multi'),
    (MultiIndex.from_product(
        [['a', 'b'], ['c', 'd'], ['e', 'f']], names=['foo', None, 'baz']),
     'named_multi')
])
def test_to_html_alignment_with_truncation(
        datapath, row_index, row_type, column_index, column_type,
        index, header, index_names):
    # GH 22747, GH 22579
    df = DataFrame(np.arange(64).reshape(8, 8),
                   index=row_index, columns=column_index)
    result = df.to_html(
        max_rows=4, max_cols=4,
        index=index, header=header, index_names=index_names)

    if not index:
        row_type = 'none'
    elif not index_names and row_type.startswith('named'):
        row_type = 'un' + row_type

    if not header:
        column_type = 'none'
    elif not index_names and column_type.startswith('named'):
        column_type = 'un' + column_type

    filename = 'trunc_df_index_' + row_type + '_columns_' + column_type
    expected = expected_html(datapath, filename)
    assert result == expected


@pytest.mark.parametrize('index', [False, 0])
def test_to_html_truncation_index_false_max_rows(datapath, index):
    # GH 15019
    data = [[1.764052, 0.400157],
            [0.978738, 2.240893],
            [1.867558, -0.977278],
            [0.950088, -0.151357],
            [-0.103219, 0.410599]]
    df = DataFrame(data)
    result = df.to_html(max_rows=4, index=index)
    expected = expected_html(datapath, 'gh15019_expected_output')
    assert result == expected


@pytest.mark.parametrize('index', [False, 0])
@pytest.mark.parametrize('col_index_named, expected_output', [
    (False, 'gh22783_expected_output'),
    (True, 'gh22783_named_columns_index')
])
def test_to_html_truncation_index_false_max_cols(
        datapath, index, col_index_named, expected_output):
    # GH 22783
    data = [[1.764052, 0.400157, 0.978738, 2.240893, 1.867558],
            [-0.977278, 0.950088, -0.151357, -0.103219, 0.410599]]
    df = DataFrame(data)
    if col_index_named:
        df.columns.rename('columns.name', inplace=True)
    result = df.to_html(max_cols=4, index=index)
    expected = expected_html(datapath, expected_output)
    assert result == expected


@pytest.mark.parametrize('notebook', [True, False])
def test_to_html_notebook_has_style(notebook):
    df = DataFrame({"A": [1, 2, 3]})
    result = df.to_html(notebook=notebook)

    if notebook:
        assert "tbody tr th:only-of-type" in result
        assert "vertical-align: middle;" in result
        assert "thead th" in result
    else:
        assert "tbody tr th:only-of-type" not in result
        assert "vertical-align: middle;" not in result
        assert "thead th" not in result


def test_to_html_with_index_names_false():
    # GH 16493
    df = DataFrame({"A": [1, 2]}, index=Index(['a', 'b'],
                                              name='myindexname'))
    result = df.to_html(index_names=False)
    assert 'myindexname' not in result


def test_to_html_with_id():
    # GH 8496
    df = DataFrame({"A": [1, 2]}, index=Index(['a', 'b'],
                                              name='myindexname'))
    result = df.to_html(index_names=False, table_id="TEST_ID")
    assert ' id="TEST_ID"' in result


@pytest.mark.parametrize('value,float_format,expected', [
    (0.19999, '%.3f', 'gh21625_expected_output'),
    (100.0, '%.0f', 'gh22270_expected_output'),
])
def test_to_html_float_format_no_fixed_width(
        value, float_format, expected, datapath):
    # GH 21625, GH 22270
    df = DataFrame({'x': [value]})
    expected = expected_html(datapath, expected)
    result = df.to_html(float_format=float_format)
    assert result == expected


@pytest.mark.parametrize("render_links,expected", [
    (True, 'render_links_true'),
    (False, 'render_links_false'),
])
def test_to_html_render_links(render_links, expected, datapath):
    # GH 2679
    data = [
        [0, 'http://pandas.pydata.org/?q1=a&q2=b', 'pydata.org'],
        [0, 'www.pydata.org', 'pydata.org']
    ]
    df = DataFrame(data, columns=['foo', 'bar', None])

    result = df.to_html(render_links=render_links)
    expected = expected_html(datapath, expected)
    assert result == expected
@@ -0,0 +1,737 @@
import codecs
from datetime import datetime

import pytest

from pandas.compat import u

import pandas as pd
from pandas import DataFrame, Series, compat
from pandas.util import testing as tm


@pytest.fixture
def frame():
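    """DataFrame built from tm.getSeriesData(), shared by the LaTeX tests."""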
    return DataFrame(tm.getSeriesData())


class TestToLatex(object):

    def test_to_latex_filename(self, frame):
        with tm.ensure_clean('test.tex') as path:
            frame.to_latex(path)

            with open(path, 'r') as f:
                assert frame.to_latex() == f.read()

        # test with utf-8 and encoding option (GH 7061)
        df = DataFrame([[u'au\xdfgangen']])
        with tm.ensure_clean('test.tex') as path:
            df.to_latex(path, encoding='utf-8')
            with codecs.open(path, 'r', encoding='utf-8') as f:
                assert df.to_latex() == f.read()

        # test with utf-8 without encoding option
        if compat.PY3:  # python3: pandas default encoding is utf-8
            with tm.ensure_clean('test.tex') as path:
                df.to_latex(path)
                with codecs.open(path, 'r', encoding='utf-8') as f:
                    assert df.to_latex() == f.read()
        else:
            # python2 default encoding is ascii, so an error should be raised
            with tm.ensure_clean('test.tex') as path:
                with pytest.raises(UnicodeEncodeError):
                    df.to_latex(path)

    def test_to_latex(self, frame):
        # it works!
        frame.to_latex()

        df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
        withindex_result = df.to_latex()
        withindex_expected = r"""\begin{tabular}{lrl}
\toprule
{} & a & b \\
\midrule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""

        assert withindex_result == withindex_expected

        withoutindex_result = df.to_latex(index=False)
        withoutindex_expected = r"""\begin{tabular}{rl}
\toprule
a & b \\
\midrule
1 & b1 \\
2 & b2 \\
\bottomrule
\end{tabular}
"""

        assert withoutindex_result == withoutindex_expected

    def test_to_latex_format(self, frame):
        # GH Bug #9402
        frame.to_latex(column_format='ccc')

        df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
        withindex_result = df.to_latex(column_format='ccc')
        withindex_expected = r"""\begin{tabular}{ccc}
\toprule
{} & a & b \\
\midrule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""

        assert withindex_result == withindex_expected

    def test_to_latex_empty(self):
        df = DataFrame()
        result = df.to_latex()
        expected = r"""\begin{tabular}{l}
\toprule
Empty DataFrame
Columns: Index([], dtype='object')
Index: Index([], dtype='object') \\
\bottomrule
\end{tabular}
"""
        assert result == expected

        result = df.to_latex(longtable=True)
        expected = r"""\begin{longtable}{l}
\toprule
Empty DataFrame
Columns: Index([], dtype='object')
Index: Index([], dtype='object') \\
\end{longtable}
"""
        assert result == expected

    def test_to_latex_with_formatters(self):
        df = DataFrame({'datetime64': [datetime(2016, 1, 1),
                                       datetime(2016, 2, 5),
                                       datetime(2016, 3, 3)],
                        'float': [1.0, 2.0, 3.0],
                        'int': [1, 2, 3],
                        'object': [(1, 2), True, False],
                        })

        formatters = {'datetime64': lambda x: x.strftime('%Y-%m'),
                      'float': lambda x: '[{x: 4.1f}]'.format(x=x),
                      'int': lambda x: '0x{x:x}'.format(x=x),
                      'object': lambda x: '-{x!s}-'.format(x=x),
                      '__index__': lambda x: 'index: {x}'.format(x=x)}
        result = df.to_latex(formatters=dict(formatters))

        expected = r"""\begin{tabular}{llrrl}
\toprule
{} & datetime64 & float & int & object \\
\midrule
index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\
index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\
index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\
\bottomrule
\end{tabular}
"""
        assert result == expected

    def test_to_latex_multiindex(self):
        df = DataFrame({('x', 'y'): ['a']})
        result = df.to_latex()
        expected = r"""\begin{tabular}{ll}
\toprule
{} & x \\
{} & y \\
\midrule
0 & a \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        result = df.T.to_latex()
        expected = r"""\begin{tabular}{lll}
\toprule
& & 0 \\
\midrule
x & y & a \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        df = DataFrame.from_dict({
            ('c1', 0): pd.Series({x: x for x in range(4)}),
            ('c1', 1): pd.Series({x: x + 4 for x in range(4)}),
            ('c2', 0): pd.Series({x: x for x in range(4)}),
            ('c2', 1): pd.Series({x: x + 4 for x in range(4)}),
            ('c3', 0): pd.Series({x: x for x in range(4)}),
        }).T
        result = df.to_latex()
        expected = r"""\begin{tabular}{llrrrr}
\toprule
& & 0 & 1 & 2 & 3 \\
\midrule
c1 & 0 & 0 & 1 & 2 & 3 \\
& 1 & 4 & 5 & 6 & 7 \\
c2 & 0 & 0 & 1 & 2 & 3 \\
& 1 & 4 & 5 & 6 & 7 \\
c3 & 0 & 0 & 1 & 2 & 3 \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        # GH 14184
        df = df.T
        df.columns.names = ['a', 'b']
        result = df.to_latex()
        expected = r"""\begin{tabular}{lrrrrr}
\toprule
a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
b & 0 & 1 & 0 & 1 & 0 \\
\midrule
0 & 0 & 4 & 0 & 4 & 0 \\
1 & 1 & 5 & 1 & 5 & 1 \\
2 & 2 & 6 & 2 & 6 & 2 \\
3 & 3 & 7 & 3 & 7 & 3 \\
\bottomrule
\end{tabular}
"""
        assert result == expected

        # GH 10660
        df = pd.DataFrame({'a': [0, 0, 1, 1],
                           'b': list('abab'),
                           'c': [1, 2, 3, 4]})
        result = df.set_index(['a', 'b']).to_latex()
        expected = r"""\begin{tabular}{llr}
\toprule
& & c \\
a & b & \\
\midrule
0 & a & 1 \\
& b & 2 \\
1 & a & 3 \\
& b & 4 \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        result = df.groupby('a').describe().to_latex()
        expected = r"""\begin{tabular}{lrrrrrrrr}
\toprule
{} & \multicolumn{8}{l}{c} \\
{} & count & mean & std & min & 25\% & 50\% & 75\% & max \\
a & & & & & & & & \\
\midrule
0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\
1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\
\bottomrule
\end{tabular}
"""

        assert result == expected

    def test_to_latex_multiindex_dupe_level(self):
        # see gh-14484
        #
        # If an index is repeated in subsequent rows, it should be
        # replaced with a blank in the created table. This should
        # ONLY happen if all higher order indices (to the left) are
        # equal too. In this test, 'c' has to be printed both times
        # because the higher order index 'A' != 'B'.
        df = pd.DataFrame(index=pd.MultiIndex.from_tuples(
            [('A', 'c'), ('B', 'c')]), columns=['col'])
        result = df.to_latex()
        expected = r"""\begin{tabular}{lll}
\toprule
& & col \\
\midrule
A & c & NaN \\
B & c & NaN \\
\bottomrule
\end{tabular}
"""
        assert result == expected

    def test_to_latex_multicolumnrow(self):
        df = pd.DataFrame({
            ('c1', 0): {x: x for x in range(5)},
            ('c1', 1): {x: x + 5 for x in range(5)},
            ('c2', 0): {x: x for x in range(5)},
            ('c2', 1): {x: x + 5 for x in range(5)},
            ('c3', 0): {x: x for x in range(5)}
        })
        result = df.to_latex()
        expected = r"""\begin{tabular}{lrrrrr}
\toprule
{} & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
{} & 0 & 1 & 0 & 1 & 0 \\
\midrule
0 & 0 & 5 & 0 & 5 & 0 \\
1 & 1 & 6 & 1 & 6 & 1 \\
2 & 2 & 7 & 2 & 7 & 2 \\
3 & 3 & 8 & 3 & 8 & 3 \\
4 & 4 & 9 & 4 & 9 & 4 \\
\bottomrule
\end{tabular}
"""
        assert result == expected

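        # multicolumn=False writes each top-level label once, leaving the
        # remaining spanned cells blank instead of using \multicolumn.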
        result = df.to_latex(multicolumn=False)
        expected = r"""\begin{tabular}{lrrrrr}
\toprule
{} & c1 & & c2 & & c3 \\
{} & 0 & 1 & 0 & 1 & 0 \\
\midrule
0 & 0 & 5 & 0 & 5 & 0 \\
1 & 1 & 6 & 1 & 6 & 1 \\
2 & 2 & 7 & 2 & 7 & 2 \\
3 & 3 & 8 & 3 & 8 & 3 \\
4 & 4 & 9 & 4 & 9 & 4 \\
\bottomrule
\end{tabular}
"""
        assert result == expected

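        # multirow=True groups repeated index labels with \multirow and
        # separates the groups with \cline rules.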
        result = df.T.to_latex(multirow=True)
        expected = r"""\begin{tabular}{llrrrrr}
\toprule
& & 0 & 1 & 2 & 3 & 4 \\
\midrule
\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\
& 1 & 5 & 6 & 7 & 8 & 9 \\
\cline{1-7}
\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\
& 1 & 5 & 6 & 7 & 8 & 9 \\
\cline{1-7}
c3 & 0 & 0 & 1 & 2 & 3 & 4 \\
\bottomrule
\end{tabular}
"""
        assert result == expected

        df.index = df.T.index
        result = df.T.to_latex(multirow=True, multicolumn=True,
                               multicolumn_format='c')
        expected = r"""\begin{tabular}{llrrrrr}
\toprule
& & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\
& & 0 & 1 & 0 & 1 & 0 \\
\midrule
\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\
& 1 & 5 & 6 & 7 & 8 & 9 \\
\cline{1-7}
\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\
& 1 & 5 & 6 & 7 & 8 & 9 \\
\cline{1-7}
c3 & 0 & 0 & 1 & 2 & 3 & 4 \\
\bottomrule
\end{tabular}
"""
        assert result == expected

    def test_to_latex_escape(self):
        a = 'a'
        b = 'b'

        test_dict = {u('co$e^x$'): {a: "a",
                                    b: "b"},
                     u('co^l1'): {a: "a",
                                  b: "b"}}

        unescaped_result = DataFrame(test_dict).to_latex(escape=False)
        escaped_result = DataFrame(test_dict).to_latex(
        )  # default: escape=True

        unescaped_expected = r'''\begin{tabular}{lll}
\toprule
{} & co$e^x$ & co^l1 \\
\midrule
a & a & a \\
b & b & b \\
\bottomrule
\end{tabular}
'''

        escaped_expected = r'''\begin{tabular}{lll}
\toprule
{} & co\$e\textasciicircum x\$ & co\textasciicircum l1 \\
\midrule
a & a & a \\
b & b & b \\
\bottomrule
\end{tabular}
'''

        assert unescaped_result == unescaped_expected
        assert escaped_result == escaped_expected

    def test_to_latex_special_escape(self):
        df = DataFrame([r"a\b\c", r"^a^b^c", r"~a~b~c"])

        escaped_result = df.to_latex()
        escaped_expected = r"""\begin{tabular}{ll}
\toprule
{} & 0 \\
\midrule
0 & a\textbackslash b\textbackslash c \\
1 & \textasciicircum a\textasciicircum b\textasciicircum c \\
2 & \textasciitilde a\textasciitilde b\textasciitilde c \\
\bottomrule
\end{tabular}
"""
        assert escaped_result == escaped_expected

    def test_to_latex_longtable(self, frame):
        frame.to_latex(longtable=True)

        df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
        withindex_result = df.to_latex(longtable=True)
        withindex_expected = r"""\begin{longtable}{lrl}
\toprule
{} & a & b \\
\midrule
\endhead
\midrule
\multicolumn{3}{r}{{Continued on next page}} \\
\midrule
\endfoot

\bottomrule
\endlastfoot
0 & 1 & b1 \\
1 & 2 & b2 \\
\end{longtable}
"""
        assert withindex_result == withindex_expected

        withoutindex_result = df.to_latex(index=False, longtable=True)
        withoutindex_expected = r"""\begin{longtable}{rl}
\toprule
a & b \\
\midrule
\endhead
\midrule
\multicolumn{2}{r}{{Continued on next page}} \\
\midrule
\endfoot

\bottomrule
\endlastfoot
1 & b1 \\
2 & b2 \\
\end{longtable}
"""

        assert withoutindex_result == withoutindex_expected

        df = DataFrame({'a': [1, 2]})
        with1column_result = df.to_latex(index=False, longtable=True)
        assert r"\multicolumn{1}" in with1column_result

        df = DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
        with3columns_result = df.to_latex(index=False, longtable=True)
        assert r"\multicolumn{3}" in with3columns_result

    def test_to_latex_escape_special_chars(self):
        special_characters = ['&', '%', '$', '#', '_', '{', '}', '~', '^',
                              '\\']
        df = DataFrame(data=special_characters)
        observed = df.to_latex()
        expected = r"""\begin{tabular}{ll}
\toprule
{} & 0 \\
\midrule
0 & \& \\
1 & \% \\
2 & \$ \\
3 & \# \\
4 & \_ \\
5 & \{ \\
6 & \} \\
7 & \textasciitilde \\
8 & \textasciicircum \\
9 & \textbackslash \\
\bottomrule
\end{tabular}
"""

        assert observed == expected

    def test_to_latex_no_header(self):
        # GH 7124
        df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
        withindex_result = df.to_latex(header=False)
        withindex_expected = r"""\begin{tabular}{lrl}
\toprule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""

        assert withindex_result == withindex_expected

        withoutindex_result = df.to_latex(index=False, header=False)
        withoutindex_expected = r"""\begin{tabular}{rl}
\toprule
1 & b1 \\
2 & b2 \\
\bottomrule
\end{tabular}
"""

        assert withoutindex_result == withoutindex_expected

    def test_to_latex_specified_header(self):
        # GH 7124
        df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
        withindex_result = df.to_latex(header=['AA', 'BB'])
        withindex_expected = r"""\begin{tabular}{lrl}
\toprule
{} & AA & BB \\
\midrule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""

        assert withindex_result == withindex_expected

        withoutindex_result = df.to_latex(header=['AA', 'BB'], index=False)
        withoutindex_expected = r"""\begin{tabular}{rl}
\toprule
AA & BB \\
\midrule
1 & b1 \\
2 & b2 \\
\bottomrule
\end{tabular}
"""

        assert withoutindex_result == withoutindex_expected

        withoutescape_result = df.to_latex(header=['$A$', '$B$'],
                                           escape=False)
        withoutescape_expected = r"""\begin{tabular}{lrl}
\toprule
{} & $A$ & $B$ \\
\midrule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""

        assert withoutescape_result == withoutescape_expected

        with pytest.raises(ValueError):
            df.to_latex(header=['A'])

    def test_to_latex_decimal(self, frame):
        # GH 12031
        frame.to_latex()

        df = DataFrame({'a': [1.0, 2.1], 'b': ['b1', 'b2']})
        withindex_result = df.to_latex(decimal=',')

        withindex_expected = r"""\begin{tabular}{lrl}
\toprule
{} & a & b \\
\midrule
0 & 1,0 & b1 \\
1 & 2,1 & b2 \\
\bottomrule
\end{tabular}
"""

        assert withindex_result == withindex_expected

    def test_to_latex_series(self):
        s = Series(['a', 'b', 'c'])
        withindex_result = s.to_latex()
        withindex_expected = r"""\begin{tabular}{ll}
\toprule
{} & 0 \\
\midrule
0 & a \\
1 & b \\
2 & c \\
\bottomrule
\end{tabular}
"""
        assert withindex_result == withindex_expected

    def test_to_latex_bold_rows(self):
        # GH 16707
        df = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
        observed = df.to_latex(bold_rows=True)
        expected = r"""\begin{tabular}{lrl}
\toprule
{} & a & b \\
\midrule
\textbf{0} & 1 & b1 \\
\textbf{1} & 2 & b2 \\
\bottomrule
\end{tabular}
"""
        assert observed == expected

    def test_to_latex_no_bold_rows(self):
        # GH 16707
        df = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
        observed = df.to_latex(bold_rows=False)
        expected = r"""\begin{tabular}{lrl}
\toprule
{} & a & b \\
\midrule
0 & 1 & b1 \\
1 & 2 & b2 \\
\bottomrule
\end{tabular}
"""
        assert observed == expected

    @pytest.mark.parametrize('name0', [None, 'named0'])
    @pytest.mark.parametrize('name1', [None, 'named1'])
    @pytest.mark.parametrize('axes', [[0], [1], [0, 1]])
    def test_to_latex_multiindex_names(self, name0, name1, axes):
        # GH 18667
        names = [name0, name1]
        mi = pd.MultiIndex.from_product([[1, 2], [3, 4]])
        df = pd.DataFrame(-1, index=mi.copy(), columns=mi.copy())
        for idx in axes:
            df.axes[idx].names = names

        idx_names = tuple(n or '{}' for n in names)
        idx_names_row = ('%s & %s & & & & \\\\\n' % idx_names
                         if (0 in axes and any(names)) else '')
        placeholder = '{}' if any(names) and 1 in axes else ' '
        col_names = [n if (bool(n) and 1 in axes) else placeholder
                     for n in names]
        observed = df.to_latex()
        expected = r"""\begin{tabular}{llrrrr}
\toprule
& %s & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} \\
& %s & 3 & 4 & 3 & 4 \\
%s\midrule
1 & 3 & -1 & -1 & -1 & -1 \\
& 4 & -1 & -1 & -1 & -1 \\
2 & 3 & -1 & -1 & -1 & -1 \\
& 4 & -1 & -1 & -1 & -1 \\
\bottomrule
\end{tabular}
""" % tuple(list(col_names) + [idx_names_row])
        assert observed == expected

    @pytest.mark.parametrize('one_row', [True, False])
    def test_to_latex_multiindex_nans(self, one_row):
        # GH 14249
        df = pd.DataFrame({'a': [None, 1], 'b': [2, 3], 'c': [4, 5]})
        if one_row:
            df = df.iloc[[0]]
        observed = df.set_index(['a', 'b']).to_latex()
        expected = r"""\begin{tabular}{llr}
\toprule
& & c \\
a & b & \\
\midrule
NaN & 2 & 4 \\
"""
        if not one_row:
            expected += r"""1.0 & 3 & 5 \\
"""
        expected += r"""\bottomrule
\end{tabular}
"""
        assert observed == expected

    def test_to_latex_non_string_index(self):
        # GH 19981
        observed = pd.DataFrame([[1, 2, 3]] * 2).set_index([0, 1]).to_latex()
        expected = r"""\begin{tabular}{llr}
\toprule
& & 2 \\
0 & 1 & \\
\midrule
1 & 2 & 3 \\
& 2 & 3 \\
\bottomrule
\end{tabular}
"""
        assert observed == expected

    def test_to_latex_midrule_location(self):
        # GH 18326
        df = pd.DataFrame({'a': [1, 2]})
        df.index.name = 'foo'
        observed = df.to_latex(index_names=False)
        expected = r"""\begin{tabular}{lr}
\toprule
{} & a \\
\midrule
0 & 1 \\
1 & 2 \\
\bottomrule
\end{tabular}
"""

        assert observed == expected

    def test_to_latex_multiindex_empty_name(self):
        # GH 18669
        mi = pd.MultiIndex.from_product([[1, 2]], names=[''])
        df = pd.DataFrame(-1, index=mi, columns=range(4))
        observed = df.to_latex()
        expected = r"""\begin{tabular}{lrrrr}
\toprule
& 0 & 1 & 2 & 3 \\
{} & & & & \\
\midrule
1 & -1 & -1 & -1 & -1 \\
2 & -1 & -1 & -1 & -1 \\
\bottomrule
\end{tabular}
"""
        assert observed == expected

    def test_to_latex_float_format_no_fixed_width(self):

        # GH 21625
        df = DataFrame({'x': [0.19999]})
        expected = r"""\begin{tabular}{lr}
\toprule
{} & x \\
\midrule
0 & 0.200 \\
\bottomrule
\end{tabular}
"""
        assert df.to_latex(float_format='%.3f') == expected

        # GH 22270
        df = DataFrame({'x': [100.0]})
        expected = r"""\begin{tabular}{lr}
\toprule
{} & x \\
\midrule
0 & 100 \\
\bottomrule
\end{tabular}
"""
        assert df.to_latex(float_format='%.0f') == expected
@@ -0,0 +1,369 @@
#!/usr/bin/env python

"""
self-contained script to write legacy storage (pickle/msgpack) files

To use this script, create an environment where you want to
generate pickles, say it is for 0.18.1, with your pandas clone
in ~/pandas

. activate pandas_0.18.1
cd ~/

$ python pandas/pandas/tests/io/generate_legacy_storage_files.py \
    pandas/pandas/tests/io/data/legacy_pickle/0.18.1/ pickle

This script generates a storage file for the current arch, system,
and python version
  pandas version: 0.18.1
  output dir : pandas/pandas/tests/io/data/legacy_pickle/0.18.1/
  storage format: pickle
created pickle file: 0.18.1_x86_64_darwin_3.5.2.pickle

The idea here is you are using the *current* version of the
generate_legacy_storage_files with an *older* version of pandas to
generate a pickle file. We will then check this file into a current
branch, and test using test_pickle.py. This will load the *older*
pickles and test versus the current data that is generated
(with master). These are then compared.

If we have cases where we changed the signature (e.g. we renamed
offset -> freq in Timestamp), then we have to conditionally execute
in generate_legacy_storage_files.py to make it
run under the older AND the newer version.

"""

from __future__ import print_function

from datetime import timedelta
from distutils.version import LooseVersion
import os
import platform as pl
import sys
from warnings import catch_warnings, filterwarnings

import numpy as np

from pandas.compat import u

import pandas
from pandas import (
    Categorical, DataFrame, Index, MultiIndex, NaT, Panel, Period, Series,
    SparseDataFrame, SparseSeries, Timestamp, bdate_range, date_range,
    period_range, timedelta_range, to_msgpack)

from pandas.tseries.offsets import (
    FY5253, BusinessDay, BusinessHour, CustomBusinessDay, DateOffset, Day,
    Easter, Hour, LastWeekOfMonth, Minute, MonthBegin, MonthEnd, QuarterBegin,
    QuarterEnd, SemiMonthBegin, SemiMonthEnd, Week, WeekOfMonth, YearBegin,
    YearEnd)

_loose_version = LooseVersion(pandas.__version__)


def _create_sp_series():
    nan = np.nan

    # nan-based
    arr = np.arange(15, dtype=np.float64)
    arr[7:12] = nan
    arr[-1:] = nan

    bseries = SparseSeries(arr, kind='block')
    bseries.name = u'bseries'
    return bseries


def _create_sp_tsseries():
    nan = np.nan

    # nan-based
    arr = np.arange(15, dtype=np.float64)
    arr[7:12] = nan
    arr[-1:] = nan

    date_index = bdate_range('1/1/2011', periods=len(arr))
    bseries = SparseSeries(arr, index=date_index, kind='block')
    bseries.name = u'btsseries'
    return bseries


def _create_sp_frame():
    nan = np.nan

    data = {u'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
            u'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
            u'C': np.arange(10).astype(np.int64),
            u'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}

    dates = bdate_range('1/1/2011', periods=10)
    return SparseDataFrame(data, index=dates)


def create_data():
    """ create the pickle/msgpack data """

    data = {
        u'A': [0., 1., 2., 3., np.nan],
        u'B': [0, 1, 0, 1, 0],
        u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'],
        u'D': date_range('1/1/2009', periods=5),
        u'E': [0., 1, Timestamp('20100101'), u'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'),
                   period=Period('2012', 'M'))

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10),
                 float=Index(np.arange(10, dtype=np.float64)),
                 uint=Index(np.arange(10, dtype=np.uint64)),
                 timedelta=timedelta_range('00:00:00', freq='30T', periods=10))

    if _loose_version >= LooseVersion('0.18'):
        from pandas import RangeIndex
        index['range'] = RangeIndex(10)

    if _loose_version >= LooseVersion('0.21'):
        from pandas import interval_range
        index['interval'] = interval_range(0, periods=10)

    mi = dict(reg2=MultiIndex.from_tuples(
        tuple(zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo',
                     u'foo', u'qux', u'qux'],
                    [u'one', u'two', u'one', u'two', u'one',
                     u'two', u'one', u'two']])),
        names=[u'first', u'second']))

    series = dict(float=Series(data[u'A']),
                  int=Series(data[u'B']),
                  mixed=Series(data[u'E']),
                  ts=Series(np.arange(10).astype(np.int64),
                            index=date_range('20130101', periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(
                                tuple(zip(*[[1, 1, 2, 2, 2],
                                            [3, 4, 3, 4, 5]])),
                                names=[u'one', u'two'])),
                  dup=Series(np.arange(5).astype(np.float64),
                             index=[u'A', u'B', u'C', u'D', u'A']),
                  cat=Series(Categorical([u'foo', u'bar', u'baz'])),
                  dt=Series(date_range('20130101', periods=5)),
                  dt_tz=Series(date_range('20130101', periods=5,
                                          tz='US/Eastern')),
                  period=Series([Period('2000Q1')] * 5))

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list(u"ABCDA")
    frame = dict(float=DataFrame({u'A': series[u'float'],
                                  u'B': series[u'float'] + 1}),
                 int=DataFrame({u'A': series[u'int'],
                                u'B': series[u'int'] + 1}),
                 mixed=DataFrame({k: data[k]
                                  for k in [u'A', u'B', u'C', u'D']}),
                 mi=DataFrame({u'A': np.arange(5).astype(np.float64),
                               u'B': np.arange(5).astype(np.int64)},
                              index=MultiIndex.from_tuples(
                                  tuple(zip(*[[u'bar', u'bar', u'baz',
                                               u'baz', u'baz'],
                                              [u'one', u'two', u'one',
                                               u'two', u'three']])),
                                  names=[u'first', u'second'])),
                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                               columns=[u'A', u'B', u'A']),
                 cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}),
                 cat_and_float=DataFrame({
                     u'A': Categorical([u'foo', u'bar', u'baz']),
                     u'B': np.arange(3).astype(np.int64)}),
                 mixed_dup=mixed_dup_df,
                 dt_mixed_tzs=DataFrame({
                     u'A': Timestamp('20130102', tz='US/Eastern'),
                     u'B': Timestamp('20130603', tz='CET')}, index=range(5)),
                 dt_mixed2_tzs=DataFrame({
                     u'A': Timestamp('20130102', tz='US/Eastern'),
                     u'B': Timestamp('20130603', tz='CET'),
                     u'C': Timestamp('20130603', tz='UTC')}, index=range(5))
                 )

    with catch_warnings(record=True):
        filterwarnings("ignore", "\\nPanel", FutureWarning)
        mixed_dup_panel = Panel({u'ItemA': frame[u'float'],
                                 u'ItemB': frame[u'int']})
        mixed_dup_panel.items = [u'ItemA', u'ItemA']
        panel = dict(float=Panel({u'ItemA': frame[u'float'],
                                  u'ItemB': frame[u'float'] + 1}),
                     dup=Panel(
                         np.arange(30).reshape(3, 5, 2).astype(np.float64),
                         items=[u'A', u'B', u'A']),
                     mixed_dup=mixed_dup_panel)

    cat = dict(int8=Categorical(list('abcdefg')),
               int16=Categorical(np.arange(1000)),
               int32=Categorical(np.arange(10000)))

    timestamp = dict(normal=Timestamp('2011-01-01'),
                     nat=NaT,
                     tz=Timestamp('2011-01-01', tz='US/Eastern'))

    if _loose_version < LooseVersion('0.19.2'):
        timestamp['freq'] = Timestamp('2011-01-01', offset='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      offset='M')
    else:
        timestamp['freq'] = Timestamp('2011-01-01', freq='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      freq='M')

    off = {'DateOffset': DateOffset(years=1),
           'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
           'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
           'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
           'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
           'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
           'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
           'MonthBegin': MonthBegin(1),
           'MonthEnd': MonthEnd(1),
           'QuarterBegin': QuarterBegin(1),
           'QuarterEnd': QuarterEnd(1),
           'Day': Day(1),
           'YearBegin': YearBegin(1),
           'YearEnd': YearEnd(1),
           'Week': Week(1),
           'Week_Tues': Week(2, normalize=False, weekday=1),
           'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
           'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
           'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
           'Easter': Easter(),
           'Hour': Hour(1),
           'Minute': Minute(1)}

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()),
                cat=cat,
                timestamp=timestamp,
                offsets=off)


def create_pickle_data():
    data = create_data()

    # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and
    # panels if their columns/items were non-unique.
    if _loose_version < LooseVersion('0.14.1'):
        del data['frame']['mixed_dup']
        del data['panel']['mixed_dup']
    if _loose_version < LooseVersion('0.17.0'):
        del data['series']['period']
        del data['scalars']['period']
    return data


def _u(x):
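    # recursively convert dict keys to unicode; non-dict values pass through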
    return {u(k): _u(x[k]) for k in x} if isinstance(x, dict) else x


def create_msgpack_data():
    data = create_data()
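    # drop structures that this (older) pandas version cannot serialise
    # to msgpack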
    if _loose_version < LooseVersion('0.17.0'):
        del data['frame']['mixed_dup']
        del data['panel']['mixed_dup']
        del data['frame']['dup']
        del data['panel']['dup']
    if _loose_version < LooseVersion('0.18.0'):
        del data['series']['dt_tz']
        del data['frame']['dt_mixed_tzs']
    # Not supported
    del data['sp_series']
    del data['sp_frame']
    del data['series']['cat']
    del data['series']['period']
    del data['frame']['cat_onecol']
    del data['frame']['cat_and_float']
    del data['scalars']['period']
    if _loose_version < LooseVersion('0.23.0'):
        del data['index']['interval']
    del data['offsets']
    return _u(data)


def platform_name():
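    # e.g. '0.18.1_x86_64_darwin_3.5.2' (cf. the module docstring example)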
    return '_'.join([str(pandas.__version__), str(pl.machine()),
                     str(pl.system().lower()), str(pl.python_version())])


def write_legacy_pickles(output_dir):

    # make sure we are < 0.13 compat (in py3)
    try:
        from pandas.compat import zip, cPickle as pickle  # noqa
    except ImportError:
        import pickle

    version = pandas.__version__

    print("This script generates a storage file for the current arch, system, "
          "and python version")
    print(" pandas version: {0}".format(version))
    print(" output dir : {0}".format(output_dir))
    print(" storage format: pickle")

    pth = '{0}.pickle'.format(platform_name())

    fh = open(os.path.join(output_dir, pth), 'wb')
    pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL)
    fh.close()

    print("created pickle file: %s" % pth)


def write_legacy_msgpack(output_dir, compress):

    version = pandas.__version__

    print("This script generates a storage file for the current arch, "
          "system, and python version")
    print(" pandas version: {0}".format(version))
    print(" output dir : {0}".format(output_dir))
    print(" storage format: msgpack")
    pth = '{0}.msgpack'.format(platform_name())
    to_msgpack(os.path.join(output_dir, pth), create_msgpack_data(),
               compress=compress)

    print("created msgpack file: %s" % pth)


def write_legacy_file():
    # force our cwd to be the first searched
    sys.path.insert(0, '.')

    if not (3 <= len(sys.argv) <= 4):
        exit("Specify output directory and storage type: generate_legacy_"
             "storage_files.py <output_dir> <storage_type> "
             "<msgpack_compress_type>")

    output_dir = str(sys.argv[1])
    storage_type = str(sys.argv[2])
    try:
        compress_type = str(sys.argv[3])
    except IndexError:
        compress_type = None

    if storage_type == 'pickle':
        write_legacy_pickles(output_dir=output_dir)
    elif storage_type == 'msgpack':
        write_legacy_msgpack(output_dir=output_dir, compress=compress_type)
    else:
        exit("storage_type must be one of {'pickle', 'msgpack'}")


if __name__ == '__main__':
    write_legacy_file()
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -0,0 +1,120 @@
import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal


def test_compression_roundtrip(compression):
    df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
                       [12.32112, 123123.2, 321321.2]],
                      index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        assert_frame_equal(df, pd.read_json(path,
                                            compression=compression))

        # explicitly ensure file was compressed.
        with tm.decompress_file(path, compression) as fh:
            result = fh.read().decode('utf8')
        assert_frame_equal(df, pd.read_json(result))


def test_read_zipped_json(datapath):
    uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
    uncompressed_df = pd.read_json(uncompressed_path)

    compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
    compressed_df = pd.read_json(compressed_path, compression='zip')

    assert_frame_equal(uncompressed_df, compressed_df)


@td.skip_if_not_us_locale
def test_with_s3_url(compression, s3_resource):
    # Bucket "pandas-test" created in tests/io/conftest.py

    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        with open(path, 'rb') as f:
            s3_resource.Bucket("pandas-test").put_object(Key='test-1', Body=f)

    roundtripped_df = pd.read_json('s3://pandas-test/test-1',
                                   compression=compression)
    assert_frame_equal(df, roundtripped_df)


def test_lines_with_compression(compression):

    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
        df.to_json(path, orient='records', lines=True,
                   compression=compression)
        roundtripped_df = pd.read_json(path, lines=True,
                                       compression=compression)
        assert_frame_equal(df, roundtripped_df)


def test_chunksize_with_compression(compression):

    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
        df.to_json(path, orient='records', lines=True,
                   compression=compression)

        res = pd.read_json(path, lines=True, chunksize=1,
                           compression=compression)
        roundtripped_df = pd.concat(res)
        assert_frame_equal(df, roundtripped_df)


def test_write_unsupported_compression_type():
    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            df.to_json(path, compression="unsupported")


def test_read_unsupported_compression_type():
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            pd.read_json(path, compression="unsupported")


@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_json_compression(compression_only,
                             read_infer, to_infer):
    # see gh-15008
    compression = compression_only

    if compression == "zip":
pytest.skip("{compression} is not supported "
|
||||
"for to_csv".format(compression=compression))

    # We'll complete file extension subsequently.
    filename = "test."

    if compression == "gzip":
        filename += "gz"
    else:
        # xz --> .xz
        # bz2 --> .bz2
        filename += compression

    df = pd.DataFrame({"A": [1]})

    to_compression = "infer" if to_infer else compression
    read_compression = "infer" if read_infer else compression

    with tm.ensure_clean(filename) as path:
        df.to_json(path, compression=to_compression)
        result = pd.read_json(path, compression=read_compression)
        tm.assert_frame_equal(result, df)
@@ -0,0 +1,580 @@
"""Tests for Table Schema integration."""
from collections import OrderedDict
import json

import numpy as np
import pytest

from pandas.core.dtypes.dtypes import (
    CategoricalDtype, DatetimeTZDtype, PeriodDtype)

import pandas as pd
from pandas import DataFrame
import pandas.util.testing as tm

from pandas.io.json.table_schema import (
    as_json_table_type, build_table_schema, convert_json_field_to_pandas_type,
    convert_pandas_type_to_json_field, set_default_names)


class TestBuildSchema(object):

    def setup_method(self, method):
        self.df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             'D': pd.timedelta_range('1H', periods=4, freq='T'),
             },
            index=pd.Index(range(4), name='idx'))

    def test_build_table_schema(self):
        result = build_table_schema(self.df, version=False)
        expected = {
            'fields': [{'name': 'idx', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'datetime'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primaryKey': ['idx']
        }
        assert result == expected
        result = build_table_schema(self.df)
        assert "pandas_version" in result

    def test_series(self):
        s = pd.Series([1, 2, 3], name='foo')
        result = build_table_schema(s, version=False)
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'foo', 'type': 'integer'}],
                    'primaryKey': ['index']}
        assert result == expected
        result = build_table_schema(s)
        assert 'pandas_version' in result

    def test_series_unnamed(self):
        result = build_table_schema(pd.Series([1, 2, 3]), version=False)
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'values', 'type': 'integer'}],
                    'primaryKey': ['index']}
        assert result == expected

    def test_multiindex(self):
        df = self.df.copy()
        idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])
        df.index = idx

        result = build_table_schema(df, version=False)
        expected = {
            'fields': [{'name': 'level_0', 'type': 'string'},
                       {'name': 'level_1', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'datetime'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primaryKey': ['level_0', 'level_1']
        }
        assert result == expected

        df.index.names = ['idx0', None]
        expected['fields'][0]['name'] = 'idx0'
        expected['primaryKey'] = ['idx0', 'level_1']
        result = build_table_schema(df, version=False)
        assert result == expected


class TestTableSchemaType(object):

    @pytest.mark.parametrize('int_type', [
        np.int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_data(self, int_type):
        int_data = [1, 2, 3]
        assert as_json_table_type(np.array(
            int_data, dtype=int_type)) == 'integer'

    @pytest.mark.parametrize('float_type', [
        np.float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_data(self, float_type):
        float_data = [1., 2., 3.]
        assert as_json_table_type(np.array(
            float_data, dtype=float_type)) == 'number'

    @pytest.mark.parametrize('bool_type', [bool, np.bool])
    def test_as_json_table_type_bool_data(self, bool_type):
        bool_data = [True, False]
        assert as_json_table_type(np.array(
            bool_data, dtype=bool_type)) == 'boolean'

    @pytest.mark.parametrize('date_data', [
        pd.to_datetime(['2016']),
        pd.to_datetime(['2016'], utc=True),
        pd.Series(pd.to_datetime(['2016'])),
        pd.Series(pd.to_datetime(['2016'], utc=True)),
        pd.period_range('2016', freq='A', periods=3)
    ])
    def test_as_json_table_type_date_data(self, date_data):
        assert as_json_table_type(date_data) == 'datetime'

    @pytest.mark.parametrize('str_data', [
        pd.Series(['a', 'b']), pd.Index(['a', 'b'])])
    def test_as_json_table_type_string_data(self, str_data):
        assert as_json_table_type(str_data) == 'string'

    @pytest.mark.parametrize('cat_data', [
        pd.Categorical(['a']),
        pd.Categorical([1]),
        pd.Series(pd.Categorical([1])),
        pd.CategoricalIndex([1]),
        pd.Categorical([1])])
    def test_as_json_table_type_categorical_data(self, cat_data):
        assert as_json_table_type(cat_data) == 'any'

    # ------
    # dtypes
    # ------
    @pytest.mark.parametrize('int_dtype', [
        np.int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_dtypes(self, int_dtype):
        assert as_json_table_type(int_dtype) == 'integer'

    @pytest.mark.parametrize('float_dtype', [
        np.float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_dtypes(self, float_dtype):
        assert as_json_table_type(float_dtype) == 'number'

    @pytest.mark.parametrize('bool_dtype', [bool, np.bool])
    def test_as_json_table_type_bool_dtypes(self, bool_dtype):
        assert as_json_table_type(bool_dtype) == 'boolean'

    @pytest.mark.parametrize('date_dtype', [
        np.datetime64, np.dtype("<M8[ns]"), PeriodDtype('D'),
        DatetimeTZDtype('ns', 'US/Central')])
    def test_as_json_table_type_date_dtypes(self, date_dtype):
        # TODO: datetime.date? datetime.time?
        assert as_json_table_type(date_dtype) == 'datetime'

    @pytest.mark.parametrize('td_dtype', [
        np.timedelta64, np.dtype("<m8[ns]")])
    def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
        assert as_json_table_type(td_dtype) == 'duration'

    @pytest.mark.parametrize('str_dtype', [object])  # TODO
    def test_as_json_table_type_string_dtypes(self, str_dtype):
        assert as_json_table_type(str_dtype) == 'string'

    def test_as_json_table_type_categorical_dtypes(self):
        # TODO: I think before is_categorical_dtype(Categorical)
        # returned True, but now it's False. Figure out why or
        # if it matters
        assert as_json_table_type(pd.Categorical(['a'])) == 'any'
        assert as_json_table_type(CategoricalDtype()) == 'any'


class TestTableOrient(object):

    def setup_method(self, method):
        self.df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             'D': pd.timedelta_range('1H', periods=4, freq='T'),
             'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                           ordered=True)),
             'G': [1., 2., 3, 4.],
             'H': pd.date_range('2016-01-01', freq='d', periods=4,
                                tz='US/Central'),
             },
            index=pd.Index(range(4), name='idx'))

    def test_build_series(self):
        s = pd.Series([1, 2], name='a')
        s.index.name = 'id'
        result = s.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result['schema']
        result['schema'].pop('pandas_version')

        fields = [{'name': 'id', 'type': 'integer'},
                  {'name': 'a', 'type': 'integer'}]

        schema = {
            'fields': fields,
            'primaryKey': ['id'],
        }

        expected = OrderedDict([
            ('schema', schema),
            ('data', [OrderedDict([('id', 0), ('a', 1)]),
                      OrderedDict([('id', 1), ('a', 2)])])])
        assert result == expected

    def test_to_json(self):
        df = self.df.copy()
        df.index.name = 'idx'
        result = df.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result['schema']
        result['schema'].pop('pandas_version')

        fields = [
            {'name': 'idx', 'type': 'integer'},
            {'name': 'A', 'type': 'integer'},
            {'name': 'B', 'type': 'string'},
            {'name': 'C', 'type': 'datetime'},
            {'name': 'D', 'type': 'duration'},
            {'constraints': {'enum': ['a', 'b', 'c']},
             'name': 'E',
             'ordered': False,
             'type': 'any'},
            {'constraints': {'enum': ['a', 'b', 'c']},
             'name': 'F',
             'ordered': True,
             'type': 'any'},
            {'name': 'G', 'type': 'number'},
            {'name': 'H', 'type': 'datetime', 'tz': 'US/Central'}
        ]

        schema = {
            'fields': fields,
            'primaryKey': ['idx'],
        }
        data = [
            OrderedDict([('idx', 0), ('A', 1), ('B', 'a'),
                         ('C', '2016-01-01T00:00:00.000Z'),
                         ('D', 'P0DT1H0M0S'),
                         ('E', 'a'), ('F', 'a'), ('G', 1.),
                         ('H', '2016-01-01T06:00:00.000Z')
                         ]),
            OrderedDict([('idx', 1), ('A', 2), ('B', 'b'),
                         ('C', '2016-01-02T00:00:00.000Z'),
                         ('D', 'P0DT1H1M0S'),
                         ('E', 'b'), ('F', 'b'), ('G', 2.),
                         ('H', '2016-01-02T06:00:00.000Z')
                         ]),
            OrderedDict([('idx', 2), ('A', 3), ('B', 'c'),
                         ('C', '2016-01-03T00:00:00.000Z'),
                         ('D', 'P0DT1H2M0S'),
                         ('E', 'c'), ('F', 'c'), ('G', 3.),
                         ('H', '2016-01-03T06:00:00.000Z')
                         ]),
            OrderedDict([('idx', 3), ('A', 4), ('B', 'c'),
                         ('C', '2016-01-04T00:00:00.000Z'),
                         ('D', 'P0DT1H3M0S'),
                         ('E', 'c'), ('F', 'c'), ('G', 4.),
                         ('H', '2016-01-04T06:00:00.000Z')
                         ]),
        ]
        expected = OrderedDict([('schema', schema), ('data', data)])
        assert result == expected

    def test_to_json_float_index(self):
        data = pd.Series(1, index=[1., 2.])
        result = data.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')

        expected = (
            OrderedDict([('schema', {
                'fields': [{'name': 'index', 'type': 'number'},
                           {'name': 'values', 'type': 'integer'}],
                'primaryKey': ['index']
            }),
                ('data', [OrderedDict([('index', 1.0), ('values', 1)]),
                          OrderedDict([('index', 2.0), ('values', 1)])])])
        )
        assert result == expected

    def test_to_json_period_index(self):
        idx = pd.period_range('2016', freq='Q-JAN', periods=2)
        data = pd.Series(1, idx)
        result = data.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')

        fields = [{'freq': 'Q-JAN', 'name': 'index', 'type': 'datetime'},
                  {'name': 'values', 'type': 'integer'}]

        schema = {'fields': fields, 'primaryKey': ['index']}
        data = [OrderedDict([('index', '2015-11-01T00:00:00.000Z'),
                             ('values', 1)]),
                OrderedDict([('index', '2016-02-01T00:00:00.000Z'),
                             ('values', 1)])]
        expected = OrderedDict([('schema', schema), ('data', data)])
        assert result == expected

    def test_to_json_categorical_index(self):
        data = pd.Series(1, pd.CategoricalIndex(['a', 'b']))
        result = data.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')

        expected = (
            OrderedDict([('schema',
                          {'fields': [{'name': 'index', 'type': 'any',
                                       'constraints': {'enum': ['a', 'b']},
                                       'ordered': False},
                                      {'name': 'values', 'type': 'integer'}],
                           'primaryKey': ['index']}),
                         ('data', [
                             OrderedDict([('index', 'a'),
                                          ('values', 1)]),
                             OrderedDict([('index', 'b'), ('values', 1)])])])
        )
        assert result == expected

    def test_date_format_raises(self):
        with pytest.raises(ValueError):
            self.df.to_json(orient='table', date_format='epoch')

        # others work
        self.df.to_json(orient='table', date_format='iso')
        self.df.to_json(orient='table')

    @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
    def test_convert_pandas_type_to_json_field_int(self, kind):
        data = [1, 2, 3]
        result = convert_pandas_type_to_json_field(kind(data, name='name'))
        expected = {"name": "name", "type": "integer"}
        assert result == expected

    @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
    def test_convert_pandas_type_to_json_field_float(self, kind):
        data = [1., 2., 3.]
        result = convert_pandas_type_to_json_field(kind(data, name='name'))
        expected = {"name": "name", "type": "number"}
        assert result == expected

    @pytest.mark.parametrize('dt_args,extra_exp', [
        ({}, {}), ({'utc': True}, {'tz': 'UTC'})])
    @pytest.mark.parametrize('wrapper', [None, pd.Series])
    def test_convert_pandas_type_to_json_field_datetime(self, dt_args,
                                                        extra_exp, wrapper):
        data = [1., 2., 3.]
        data = pd.to_datetime(data, **dt_args)
        if wrapper is pd.Series:
            data = pd.Series(data, name='values')
        result = convert_pandas_type_to_json_field(data)
        expected = {"name": "values", "type": 'datetime'}
        expected.update(extra_exp)
        assert result == expected

    def test_convert_pandas_type_to_json_period_range(self):
        arr = pd.period_range('2016', freq='A-DEC', periods=4)
        result = convert_pandas_type_to_json_field(arr)
        expected = {"name": "values", "type": 'datetime', "freq": "A-DEC"}
        assert result == expected

    @pytest.mark.parametrize('kind', [pd.Categorical, pd.CategoricalIndex])
    @pytest.mark.parametrize('ordered', [True, False])
    def test_convert_pandas_type_to_json_field_categorical(self, kind,
                                                           ordered):
        data = ['a', 'b', 'c']
        if kind is pd.Categorical:
            arr = pd.Series(kind(data, ordered=ordered), name='cats')
        elif kind is pd.CategoricalIndex:
            arr = kind(data, ordered=ordered, name='cats')

        result = convert_pandas_type_to_json_field(arr)
        expected = {"name": "cats", "type": "any",
                    "constraints": {"enum": data},
                    "ordered": ordered}
        assert result == expected

    @pytest.mark.parametrize("inp,exp", [
        ({'type': 'integer'}, 'int64'),
        ({'type': 'number'}, 'float64'),
        ({'type': 'boolean'}, 'bool'),
        ({'type': 'duration'}, 'timedelta64'),
        ({'type': 'datetime'}, 'datetime64[ns]'),
        ({'type': 'datetime', 'tz': 'US/Hawaii'}, 'datetime64[ns, US/Hawaii]'),
        ({'type': 'any'}, 'object'),
        ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
          'ordered': False}, CategoricalDtype(categories=['a', 'b', 'c'],
                                              ordered=False)),
        ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
          'ordered': True}, CategoricalDtype(categories=['a', 'b', 'c'],
                                             ordered=True)),
        ({'type': 'string'}, 'object')])
    def test_convert_json_field_to_pandas_type(self, inp, exp):
        field = {'name': 'foo'}
        field.update(inp)
        assert convert_json_field_to_pandas_type(field) == exp

    @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
    def test_convert_json_field_to_pandas_type_raises(self, inp):
        field = {'type': inp}
        with pytest.raises(ValueError, match=("Unsupported or invalid field "
                                              "type: {}".format(inp))):
            convert_json_field_to_pandas_type(field)

    def test_categorical(self):
        s = pd.Series(pd.Categorical(['a', 'b', 'a']))
        s.index.name = 'idx'
        result = s.to_json(orient='table', date_format='iso')
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result['schema'].pop('pandas_version')

        fields = [{'name': 'idx', 'type': 'integer'},
                  {'constraints': {'enum': ['a', 'b']},
                   'name': 'values',
                   'ordered': False,
                   'type': 'any'}]

        expected = OrderedDict([
            ('schema', {'fields': fields,
                        'primaryKey': ['idx']}),
            ('data', [OrderedDict([('idx', 0), ('values', 'a')]),
                      OrderedDict([('idx', 1), ('values', 'b')]),
                      OrderedDict([('idx', 2), ('values', 'a')])])])
        assert result == expected

    @pytest.mark.parametrize('idx,nm,prop', [
        (pd.Index([1]), 'index', 'name'),
        (pd.Index([1], name='myname'), 'myname', 'name'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')]),
         ['level_0', 'level_1'], 'names'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
                                    names=['n1', 'n2']),
         ['n1', 'n2'], 'names'),
        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
                                    names=['n1', None]),
         ['n1', 'level_1'], 'names')
    ])
    def test_set_names_unset(self, idx, nm, prop):
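        # unnamed (Multi)Index levels should get the 'index' / 'level_<n>'
        # defaults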
        data = pd.Series(1, idx)
        result = set_default_names(data)
        assert getattr(result.index, prop) == nm

    @pytest.mark.parametrize("idx", [
        pd.Index([], name='index'),
        pd.MultiIndex.from_arrays([['foo'], ['bar']],
                                  names=('level_0', 'level_1')),
        pd.MultiIndex.from_arrays([['foo'], ['bar']],
                                  names=('foo', 'level_1'))
    ])
    def test_warns_non_roundtrippable_names(self, idx):
        # GH 19130
        df = pd.DataFrame([[]], index=idx)
        df.index.name = 'index'
        with tm.assert_produces_warning():
            set_default_names(df)

    def test_timestamp_in_columns(self):
        df = pd.DataFrame([[1, 2]], columns=[pd.Timestamp('2016'),
                                             pd.Timedelta(10, unit='s')])
        result = df.to_json(orient="table")
        js = json.loads(result)
        assert js['schema']['fields'][1]['name'] == 1451606400000
        assert js['schema']['fields'][2]['name'] == 10000

    @pytest.mark.parametrize('case', [
        pd.Series([1], index=pd.Index([1], name='a'), name='a'),
        pd.DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
        pd.DataFrame({"A": [1]}, index=pd.MultiIndex.from_arrays([
            ['a'], [1]], names=["A", "a"]))
    ])
    def test_overlapping_names(self, case):
        with pytest.raises(ValueError, match='Overlapping'):
            case.to_json(orient='table')

    def test_mi_falsey_name(self):
        # GH 16203
        df = pd.DataFrame(np.random.randn(4, 4),
                          index=pd.MultiIndex.from_product([('A', 'B'),
                                                            ('a', 'b')]))
        result = [x['name'] for x in build_table_schema(df)['fields']]
        assert result == ['level_0', 'level_1', 0, 1, 2, 3]


class TestTableOrientReader(object):

    @pytest.mark.parametrize("index_nm", [
        None,
        "idx",
        pytest.param("index",
                     marks=pytest.mark.xfail),
        'level_0'])
    @pytest.mark.parametrize("vals", [
        {'ints': [1, 2, 3, 4]},
        {'objects': ['a', 'b', 'c', 'd']},
        {'date_ranges': pd.date_range('2016-01-01', freq='d', periods=4)},
        {'categoricals': pd.Series(pd.Categorical(['a', 'b', 'c', 'c']))},
        {'ordered_cats': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                                  ordered=True))},
        pytest.param({'floats': [1., 2., 3., 4.]},
                     marks=pytest.mark.xfail),
        {'floats': [1.1, 2.2, 3.3, 4.4]},
        {'bools': [True, False, False, True]}])
    def test_read_json_table_orient(self, index_nm, vals, recwarn):
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize("index_nm", [
        None, "idx", "index"])
    @pytest.mark.parametrize("vals", [
        {'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')},
        {'timezones': pd.date_range('2016-01-01', freq='d', periods=4,
                                    tz='US/Central')}])
    def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        with pytest.raises(NotImplementedError, match='can not yet read '):
            pd.read_json(out, orient="table")

    def test_comprehensive(self):
        df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             # 'D': pd.timedelta_range('1H', periods=4, freq='T'),
             'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
                                           ordered=True)),
             'G': [1.1, 2.2, 3.3, 4.4],
             # 'H': pd.date_range('2016-01-01', freq='d', periods=4,
             #                    tz='US/Central'),
             'I': [True, False, False, True],
             },
            index=pd.Index(range(4), name='idx'))

        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize("index_names", [
        [None, None], ['foo', 'bar'], ['foo', None], [None, 'foo'],
        ['index', 'foo']])
    def test_multiindex(self, index_names):
        # GH 18912
        df = pd.DataFrame(
            [["Arr", "alpha", [1, 2, 3, 4]],
             ["Bee", "Beta", [10, 20, 30, 40]]],
            index=[["A", "B"], ["Null", "Eins"]],
            columns=["Aussprache", "Griechisch", "Args"]
        )
        df.index.names = index_names
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize("strict_check", [
        pytest.param(True, marks=pytest.mark.xfail),
        False
    ])
    def test_empty_frame_roundtrip(self, strict_check):
        # GH 21287
        df = pd.DataFrame([], columns=['a', 'b', 'c'])
        expected = df.copy()
        out = df.to_json(orient='table')
        result = pd.read_json(out, orient='table')
        # TODO: When DF coercion issue (#21345) is resolved tighten type checks
        tm.assert_frame_equal(expected, result,
                              check_dtype=strict_check,
                              check_index_type=strict_check)
@@ -0,0 +1,462 @@
import json

import numpy as np
import pytest

from pandas import DataFrame, Index, compat
import pandas.util.testing as tm

from pandas.io.json import json_normalize
from pandas.io.json.normalize import nested_to_record


@pytest.fixture
def deep_nested():
    # deeply nested data
    return [{'country': 'USA',
             'states': [{'name': 'California',
                         'cities': [{'name': 'San Francisco',
                                     'pop': 12345},
                                    {'name': 'Los Angeles',
                                     'pop': 12346}]
                         },
                        {'name': 'Ohio',
                         'cities': [{'name': 'Columbus',
                                     'pop': 1234},
                                    {'name': 'Cleveland',
                                     'pop': 1236}]}
                        ]
             },
            {'country': 'Germany',
             'states': [{'name': 'Bayern',
                         'cities': [{'name': 'Munich', 'pop': 12347}]
                         },
                        {'name': 'Nordrhein-Westfalen',
                         'cities': [{'name': 'Duesseldorf', 'pop': 1238},
                                    {'name': 'Koeln', 'pop': 1239}]}
                        ]
             }
            ]


@pytest.fixture
def state_data():
    return [
        {'counties': [{'name': 'Dade', 'population': 12345},
                      {'name': 'Broward', 'population': 40000},
                      {'name': 'Palm Beach', 'population': 60000}],
         'info': {'governor': 'Rick Scott'},
         'shortname': 'FL',
         'state': 'Florida'},
        {'counties': [{'name': 'Summit', 'population': 1234},
                      {'name': 'Cuyahoga', 'population': 1337}],
         'info': {'governor': 'John Kasich'},
         'shortname': 'OH',
         'state': 'Ohio'}]


@pytest.fixture
def author_missing_data():
    return [
        {'info': None},
        {'info':
         {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
         'author_name':
         {'first': 'Jane', 'last_name': 'Doe'}
         }]


class TestJSONNormalize(object):

    def test_simple_records(self):
        recs = [{'a': 1, 'b': 2, 'c': 3},
                {'a': 4, 'b': 5, 'c': 6},
                {'a': 7, 'b': 8, 'c': 9},
                {'a': 10, 'b': 11, 'c': 12}]

        result = json_normalize(recs)
        expected = DataFrame(recs)

        tm.assert_frame_equal(result, expected)

    def test_simple_normalize(self, state_data):
        result = json_normalize(state_data[0], 'counties')
        expected = DataFrame(state_data[0]['counties'])
        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, 'counties')

        expected = []
        for rec in state_data:
            expected.extend(rec['counties'])
        expected = DataFrame(expected)

        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, 'counties', meta='state')
        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])

        tm.assert_frame_equal(result, expected)

    def test_empty_array(self):
        result = json_normalize([])
        expected = DataFrame()
        tm.assert_frame_equal(result, expected)

    def test_simple_normalize_with_separator(self, deep_nested):
        # GH 14883
        result = json_normalize({'A': {'A': 1, 'B': 2}})
        expected = DataFrame([[1, 2]], columns=['A.A', 'A.B'])
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_')
        expected = DataFrame([[1, 2]], columns=['A_A', 'A_B'])
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3')
        expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B'])
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        result = json_normalize(deep_nested, ['states', 'cities'],
                                meta=['country', ['states', 'name']],
                                sep='_')
        expected = Index(['name', 'pop',
                          'country', 'states_name']).sort_values()
        assert result.columns.sort_values().equals(expected)

    def test_value_array_record_prefix(self):
        # GH 21536
        result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
        expected = DataFrame([[1], [2]], columns=['Prefix.0'])
        tm.assert_frame_equal(result, expected)

    def test_nested_object_record_path(self):
        # GH 22706
        data = {'state': 'Florida',
                'info': {
                    'governor': 'Rick Scott',
                    'counties': [{'name': 'Dade', 'population': 12345},
                                 {'name': 'Broward', 'population': 40000},
                                 {'name': 'Palm Beach', 'population': 60000}]}}
        result = json_normalize(data, record_path=["info", "counties"])
        expected = DataFrame([['Dade', 12345],
                              ['Broward', 40000],
                              ['Palm Beach', 60000]],
                             columns=['name', 'population'])
        tm.assert_frame_equal(result, expected)

    def test_more_deeply_nested(self, deep_nested):

        result = json_normalize(deep_nested, ['states', 'cities'],
                                meta=['country', ['states', 'name']])
        # meta_prefix={'states': 'state_'})

        ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
                   'states.name': ['California', 'California', 'Ohio', 'Ohio',
                                   'Bayern', 'Nordrhein-Westfalen',
                                   'Nordrhein-Westfalen'],
                   'name': ['San Francisco', 'Los Angeles', 'Columbus',
                            'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'],
                   'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]}

        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)

    def test_shallow_nested(self):
        data = [{'state': 'Florida',
                 'shortname': 'FL',
                 'info': {
                     'governor': 'Rick Scott'
                 },
                 'counties': [{'name': 'Dade', 'population': 12345},
                              {'name': 'Broward', 'population': 40000},
                              {'name': 'Palm Beach', 'population': 60000}]},
                {'state': 'Ohio',
                 'shortname': 'OH',
                 'info': {
                     'governor': 'John Kasich'
                 },
                 'counties': [{'name': 'Summit', 'population': 1234},
                              {'name': 'Cuyahoga', 'population': 1337}]}]

        result = json_normalize(data, 'counties',
                                ['state', 'shortname',
                                 ['info', 'governor']])
        ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit',
                            'Cuyahoga'],
                   'state': ['Florida'] * 3 + ['Ohio'] * 2,
                   'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
                   'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
                   'population': [12345, 40000, 60000, 1234, 1337]}
        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)

    def test_meta_name_conflict(self):
        data = [{'foo': 'hello',
                 'bar': 'there',
                 'data': [{'foo': 'something', 'bar': 'else'},
                          {'foo': 'something2', 'bar': 'else2'}]}]

        msg = (r"Conflicting metadata name (foo|bar),"
               " need distinguishing prefix")
        with pytest.raises(ValueError, match=msg):
            json_normalize(data, 'data', meta=['foo', 'bar'])

        result = json_normalize(data, 'data', meta=['foo', 'bar'],
                                meta_prefix='meta')

        for val in ['metafoo', 'metabar', 'foo', 'bar']:
            assert val in result

    def test_meta_parameter_not_modified(self):
        # GH 18610
        data = [{'foo': 'hello',
                 'bar': 'there',
                 'data': [{'foo': 'something', 'bar': 'else'},
                          {'foo': 'something2', 'bar': 'else2'}]}]

        COLUMNS = ['foo', 'bar']
        result = json_normalize(data, 'data', meta=COLUMNS,
                                meta_prefix='meta')

        assert COLUMNS == ['foo', 'bar']
        for val in ['metafoo', 'metabar', 'foo', 'bar']:
            assert val in result

    def test_record_prefix(self, state_data):
        result = json_normalize(state_data[0], 'counties')
        expected = DataFrame(state_data[0]['counties'])
        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, 'counties',
                                meta='state',
                                record_prefix='county_')

        expected = []
        for rec in state_data:
            expected.extend(rec['counties'])
        expected = DataFrame(expected)
        expected = expected.rename(columns=lambda x: 'county_' + x)
        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])

        tm.assert_frame_equal(result, expected)

    def test_non_ascii_key(self):
        if compat.PY3:
            testjson = (
                b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
                b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
            ).decode('utf8')
        else:
            testjson = ('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
                        '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]')

        testdata = {
            u'sub.A': [1, 3],
            u'sub.B': [2, 4],
            b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1]
        }
        expected = DataFrame(testdata)

        result = json_normalize(json.loads(testjson))
        tm.assert_frame_equal(result, expected)

    def test_missing_field(self, author_missing_data):
        # GH20030:
        result = json_normalize(author_missing_data)
        ex_data = [
            {'info': np.nan,
             'author_name.first': np.nan,
             'author_name.last_name': np.nan,
             'info.created_at': np.nan,
             'info.last_updated': np.nan},
            {'info': None,
             'author_name.first': 'Jane',
             'author_name.last_name': 'Doe',
             'info.created_at': '11/08/1993',
             'info.last_updated': '26/05/2012'}
        ]
        expected = DataFrame(ex_data)
        tm.assert_frame_equal(result, expected)


class TestNestedToRecord(object):

    def test_flat_stays_flat(self):
        recs = [dict(flat1=1, flat2=2),
                dict(flat1=3, flat2=4),
                ]

        result = nested_to_record(recs)
        expected = recs
        assert result == expected

    def test_one_level_deep_flattens(self):
        data = dict(flat1=1,
                    dict1=dict(c=1, d=2))

        result = nested_to_record(data)
        expected = {'dict1.c': 1,
                    'dict1.d': 2,
                    'flat1': 1}

        assert result == expected

    def test_nested_flattens(self):
        data = dict(flat1=1,
                    dict1=dict(c=1, d=2),
                    nested=dict(e=dict(c=1, d=2),
                                d=2))

        result = nested_to_record(data)
        expected = {'dict1.c': 1,
                    'dict1.d': 2,
                    'flat1': 1,
                    'nested.d': 2,
                    'nested.e.c': 1,
                    'nested.e.d': 2}

        assert result == expected

    def test_json_normalize_errors(self):
        # GH14583: If meta keys are not always present
        # a new option to set errors='ignore' has been implemented
        i = {
            "Trades": [{
                "general": {
                    "tradeid": 100,
                    "trade_version": 1,
                    "stocks": [{

                        "symbol": "AAPL",
                        "name": "Apple",
                        "price": "0"
                    }, {
                        "symbol": "GOOG",
                        "name": "Google",
                        "price": "0"
                    }
                    ]
                }
            }, {
                "general": {
                    "tradeid": 100,
                    "stocks": [{
                        "symbol": "AAPL",
                        "name": "Apple",
                        "price": "0"
                    }, {
                        "symbol": "GOOG",
                        "name": "Google",
                        "price": "0"
                    }
                    ]
                }
            }
            ]
        }
        j = json_normalize(data=i['Trades'],
                           record_path=[['general', 'stocks']],
                           meta=[['general', 'tradeid'],
                                 ['general', 'trade_version']],
                           errors='ignore')
        expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
                    'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
                    'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
                    'price': {0: '0', 1: '0', 2: '0', 3: '0'},
                    'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}

        assert j.fillna('').to_dict() == expected

        msg = ("Try running with errors='ignore' as key 'trade_version'"
               " is not always present")
        with pytest.raises(KeyError, match=msg):
            json_normalize(
                data=i['Trades'],
                record_path=[['general', 'stocks']],
                meta=[['general', 'tradeid'],
                      ['general', 'trade_version']],
                errors='raise')

    def test_donot_drop_nonevalues(self):
        # GH21356
        data = [
            {'info': None,
             'author_name':
             {'first': 'Smith', 'last_name': 'Appleseed'}
             },
            {'info':
             {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
             'author_name':
             {'first': 'Jane', 'last_name': 'Doe'}
             }
        ]
        result = nested_to_record(data)
        expected = [
            {'info': None,
             'author_name.first': 'Smith',
             'author_name.last_name': 'Appleseed'},
            {'author_name.first': 'Jane',
             'author_name.last_name': 'Doe',
             'info.created_at': '11/08/1993',
             'info.last_updated': '26/05/2012'}]

        assert result == expected

    def test_nonetype_top_level_bottom_level(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it doesn't do a new_d.pop twice and except
        data = {
            "id": None,
            "location": {
                "country": {
                    "state": {
                        "id": None,
                        "town.info": {
                            "id": None,
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656}}}
            }
        }
        result = nested_to_record(data)
        expected = {
            'id': None,
            'location.country.state.id': None,
            'location.country.state.town.info.id': None,
            'location.country.state.town.info.region': None,
            'location.country.state.town.info.x': 49.151580810546875,
            'location.country.state.town.info.y': -33.148521423339844,
            'location.country.state.town.info.z': 27.572303771972656}
        assert result == expected

    def test_nonetype_multiple_levels(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it doesn't do a new_d.pop twice and except
        data = {
            "id": None,
            "location": {
                "id": None,
                "country": {
                    "id": None,
                    "state": {
                        "id": None,
                        "town.info": {
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656}}}
            }
        }
        result = nested_to_record(data)
        expected = {
            'id': None,
            'location.id': None,
            'location.country.id': None,
            'location.country.state.id': None,
            'location.country.state.town.info.region': None,
            'location.country.state.town.info.x': 49.151580810546875,
            'location.country.state.town.info.y': -33.148521423339844,
            'location.country.state.town.info.z': 27.572303771972656}
        assert result == expected
File diff suppressed because it is too large
@@ -0,0 +1,172 @@
# -*- coding: utf-8 -*-
import pytest

from pandas.compat import StringIO

import pandas as pd
from pandas import DataFrame, read_json
import pandas.util.testing as tm
from pandas.util.testing import (
    assert_frame_equal, assert_series_equal, ensure_clean)

from pandas.io.json.json import JsonReader


@pytest.fixture
def lines_json_df():
    df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    return df.to_json(lines=True, orient="records")


def test_read_jsonl():
    # GH9180
    result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
    expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    assert_frame_equal(result, expected)


def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK

    # simulate file handle
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    json = StringIO(json)
    result = read_json(json, lines=True)
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])
    assert_frame_equal(result, expected)

    # simulate string
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(json, lines=True)
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])
    assert_frame_equal(result, expected)


def test_to_jsonl():
    # GH9180
    df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
    assert result == expected

    df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
    assert result == expected
    assert_frame_equal(read_json(result, lines=True), df)

    # GH15096: escaped characters in columns and data
    df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
                   columns=["a\\", 'b'])
    result = df.to_json(orient="records", lines=True)
    expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
                '{"a\\\\":"foo\\"","b":"bar"}')
    assert result == expected
    assert_frame_equal(read_json(result, lines=True), df)


@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(lines_json_df, chunksize):
    # Basic test that read_json(chunks=True) gives the same result as
    # read_json(chunks=False)
    # GH17048: memory usage when lines=True

    unchunked = read_json(StringIO(lines_json_df), lines=True)
    reader = read_json(StringIO(lines_json_df), lines=True,
                       chunksize=chunksize)
    chunked = pd.concat(reader)

    assert_frame_equal(chunked, unchunked)


def test_readjson_chunksize_requires_lines(lines_json_df):
    msg = "chunksize can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)


def test_readjson_chunks_series():
    # Test reading line-format JSON to Series with chunksize param
    s = pd.Series({'A': 1, 'B': 2})

    strio = StringIO(s.to_json(lines=True, orient="records"))
    unchunked = pd.read_json(strio, lines=True, typ='Series')

    strio = StringIO(s.to_json(lines=True, orient="records"))
    chunked = pd.concat(pd.read_json(
        strio, lines=True, typ='Series', chunksize=1
    ))

    assert_series_equal(chunked, unchunked)


def test_readjson_each_chunk(lines_json_df):
    # Other tests check that the final result of read_json(chunksize=True)
    # is correct. This checks the intermediate chunks.
    chunks = list(
        pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
    )
    assert chunks[0].shape == (2, 2)
    assert chunks[1].shape == (1, 2)


def test_readjson_chunks_from_file():
    with ensure_clean('test.json') as path:
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
        unchunked = pd.read_json(path, lines=True)
        assert_frame_equal(unchunked, chunked)


@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
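    # whether chunked or not, read() should close the underlying stream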
    with ensure_clean('test.json') as path:
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        reader = JsonReader(
            path, orient=None, typ="frame", dtype=True, convert_axes=True,
            convert_dates=True, keep_default_dates=True, numpy=False,
            precise_float=False, date_unit=None, encoding=None,
            lines=True, chunksize=chunksize, compression=None)
        reader.read()
        assert reader.open_stream.closed, (
            "didn't close stream with chunksize = "
            "{chunksize}".format(chunksize=chunksize))


@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
    msg = r"'chunksize' must be an integer >=1"

    with pytest.raises(ValueError, match=msg):
        pd.read_json(StringIO(lines_json_df), lines=True,
                     chunksize=chunksize)


@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    j = """

    {"A":1,"B":4}



    {"A":2,"B":5}







    {"A":3,"B":6}
    """
    orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    test = pd.read_json(j, lines=True, chunksize=chunksize)
    if chunksize is not None:
        test = pd.concat(test)
    tm.assert_frame_equal(
        orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize))
File diff suppressed because it is too large
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -0,0 +1,9 @@
from pandas.compat import PY3

# array compat
if PY3:
    frombytes = lambda obj, data: obj.frombytes(data)
    tobytes = lambda obj: obj.tobytes()
else:
    frombytes = lambda obj, data: obj.fromstring(data)
    tobytes = lambda obj: obj.tostring()
@@ -0,0 +1,21 @@
# coding: utf-8

from pandas.io.msgpack import packb, unpackb

from .common import frombytes


def test_unpack_buffer():
    from array import array
    buf = array('b')
    frombytes(buf, packb((b'foo', b'bar')))
    obj = unpackb(buf, use_list=1)
    assert [b'foo', b'bar'] == obj


def test_unpack_bytearray():
    buf = bytearray(packb(('foo', 'bar')))
    obj = unpackb(buf, use_list=1)
    assert [b'foo', b'bar'] == obj
    expected_type = bytes
    assert all(type(s) == expected_type for s in obj)
@@ -0,0 +1,115 @@
# coding: utf-8

from pandas.io.msgpack import packb, unpackb


def check(length, obj):
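    # pack obj, verify the encoded byte length, then round-trip it back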
    v = packb(obj)
    assert len(v) == length, \
        "%r length should be %r but got %r" % (obj, length, len(v))
    assert unpackb(v, use_list=0) == obj


def test_1():
    for o in [None, True, False, 0, 1, (1 << 6), (1 << 7) - 1, -1,
              -((1 << 5) - 1), -(1 << 5)]:
        check(1, o)


def test_2():
    for o in [1 << 7, (1 << 8) - 1, -((1 << 5) + 1), -(1 << 7)]:
        check(2, o)


def test_3():
    for o in [1 << 8, (1 << 16) - 1, -((1 << 7) + 1), -(1 << 15)]:
        check(3, o)


def test_5():
    for o in [1 << 16, (1 << 32) - 1, -((1 << 15) + 1), -(1 << 31)]:
        check(5, o)


def test_9():
    for o in [1 << 32, (1 << 64) - 1, -((1 << 31) + 1), -(1 << 63), 1.0, 0.1,
              -0.1, -1.0]:
        check(9, o)


def check_raw(overhead, num):
    check(num + overhead, b" " * num)


def test_fixraw():
    check_raw(1, 0)
    check_raw(1, (1 << 5) - 1)


def test_raw16():
    check_raw(3, 1 << 5)
    check_raw(3, (1 << 16) - 1)


def test_raw32():
    check_raw(5, 1 << 16)


def check_array(overhead, num):
    check(num + overhead, (None, ) * num)


def test_fixarray():
    check_array(1, 0)
    check_array(1, (1 << 4) - 1)


def test_array16():
    check_array(3, 1 << 4)
    check_array(3, (1 << 16) - 1)


def test_array32():
    check_array(5, (1 << 16))


def match(obj, buf):
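    # exact two-way check: obj must pack to buf and buf must unpack to obj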
    assert packb(obj) == buf
    assert unpackb(buf, use_list=0) == obj


def test_match():
    cases = [
        (None, b'\xc0'),
        (False, b'\xc2'),
        (True, b'\xc3'),
        (0, b'\x00'),
        (127, b'\x7f'),
        (128, b'\xcc\x80'),
        (256, b'\xcd\x01\x00'),
        (-1, b'\xff'),
        (-33, b'\xd0\xdf'),
        (-129, b'\xd1\xff\x7f'),
        ({1: 1}, b'\x81\x01\x01'),
        (1.0, b"\xcb\x3f\xf0\x00\x00\x00\x00\x00\x00"),
        ((), b'\x90'),
        (tuple(range(15)), (b"\x9f\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09"
                            b"\x0a\x0b\x0c\x0d\x0e")),
        (tuple(range(16)), (b"\xdc\x00\x10\x00\x01\x02\x03\x04\x05\x06\x07"
                            b"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f")),
        ({}, b'\x80'),
        ({x: x for x in range(15)},
         (b'\x8f\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06\x06\x07'
          b'\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e')),
        ({x: x for x in range(16)},
         (b'\xde\x00\x10\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06'
          b'\x06\x07\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e'
          b'\x0f\x0f')),
    ]

    for v, p in cases:
        match(v, p)


def test_unicode():
    assert unpackb(packb('foobar'), use_list=1) == b'foobar'
@@ -0,0 +1,39 @@
# coding: utf-8

from datetime import datetime

import pytest

from pandas.io.msgpack import packb, unpackb


class DummyException(Exception):
    pass


class TestExceptions(object):

    def test_raise_on_find_unsupported_value(self):
        msg = "can\'t serialize datetime"
        with pytest.raises(TypeError, match=msg):
            packb(datetime.now())

    def test_raise_from_object_hook(self):
        def hook(_):
            raise DummyException()

        with pytest.raises(DummyException):
            unpackb(packb({}), object_hook=hook)
        with pytest.raises(DummyException):
            unpackb(packb({'fizz': 'buzz'}), object_hook=hook)
        with pytest.raises(DummyException):
            unpackb(packb({'fizz': 'buzz'}), object_pairs_hook=hook)
        with pytest.raises(DummyException):
            unpackb(packb({'fizz': {'buzz': 'spam'}}), object_hook=hook)
        with pytest.raises(DummyException):
            unpackb(packb({'fizz': {'buzz': 'spam'}}), object_pairs_hook=hook)

    def test_invalid_value(self):
        msg = "Unpack failed: error"
        with pytest.raises(ValueError, match=msg):
            unpackb(b"\xd9\x97#DL_")
@@ -0,0 +1,63 @@
from __future__ import print_function

import array

import pandas.io.msgpack as msgpack
from pandas.io.msgpack import ExtType

from .common import frombytes, tobytes


def test_pack_ext_type():
    def p(s):
        packer = msgpack.Packer()
        packer.pack_ext_type(0x42, s)
        return packer.bytes()

    assert p(b'A') == b'\xd4\x42A'  # fixext 1
    assert p(b'AB') == b'\xd5\x42AB'  # fixext 2
    assert p(b'ABCD') == b'\xd6\x42ABCD'  # fixext 4
    assert p(b'ABCDEFGH') == b'\xd7\x42ABCDEFGH'  # fixext 8
    assert p(b'A' * 16) == b'\xd8\x42' + b'A' * 16  # fixext 16
    assert p(b'ABC') == b'\xc7\x03\x42ABC'  # ext 8
    assert p(b'A' * 0x0123) == b'\xc8\x01\x23\x42' + b'A' * 0x0123  # ext 16
    assert (p(b'A' * 0x00012345) ==
            b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345)  # ext 32


def test_unpack_ext_type():
    def check(b, expected):
        assert msgpack.unpackb(b) == expected

    check(b'\xd4\x42A', ExtType(0x42, b'A'))  # fixext 1
    check(b'\xd5\x42AB', ExtType(0x42, b'AB'))  # fixext 2
    check(b'\xd6\x42ABCD', ExtType(0x42, b'ABCD'))  # fixext 4
    check(b'\xd7\x42ABCDEFGH', ExtType(0x42, b'ABCDEFGH'))  # fixext 8
    check(b'\xd8\x42' + b'A' * 16, ExtType(0x42, b'A' * 16))  # fixext 16
    check(b'\xc7\x03\x42ABC', ExtType(0x42, b'ABC'))  # ext 8
    check(b'\xc8\x01\x23\x42' + b'A' * 0x0123,
          ExtType(0x42, b'A' * 0x0123))  # ext 16
    check(b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345,
          ExtType(0x42, b'A' * 0x00012345))  # ext 32


def test_extension_type():
    def default(obj):
        print('default called', obj)
        if isinstance(obj, array.array):
            typecode = 123  # application specific typecode
            data = tobytes(obj)
            return ExtType(typecode, data)
        raise TypeError("Unknown type object %r" % (obj, ))

    def ext_hook(code, data):
        print('ext_hook called', code, data)
        assert code == 123
        obj = array.array('d')
        frombytes(obj, data)
        return obj

    obj = [42, b'hello', array.array('d', [1.1, 2.2, 3.3])]
    s = msgpack.packb(obj, default=default)
    obj2 = msgpack.unpackb(s, ext_hook=ext_hook)
    assert obj == obj2
@@ -0,0 +1,91 @@
# coding: utf-8

from pandas.io.msgpack import unpackb


def check(src, should, use_list=0):
    assert unpackb(src, use_list=use_list) == should


def testSimpleValue():
    check(b"\x93\xc0\xc2\xc3", (None, False, True, ))


def testFixnum():
    check(b"\x92\x93\x00\x40\x7f\x93\xe0\xf0\xff",
          ((0, 64, 127, ), (-32, -16, -1, ), ))


def testFixArray():
    check(b"\x92\x90\x91\x91\xc0", ((), ((None, ), ), ), )


def testFixRaw():
    check(b"\x94\xa0\xa1a\xa2bc\xa3def", (b"", b"a", b"bc", b"def", ), )


def testFixMap():
    check(b"\x82\xc2\x81\xc0\xc0\xc3\x81\xc0\x80",
          {False: {None: None}, True: {None: {}}}, )


def testUnsignedInt():
    check(b"\x99\xcc\x00\xcc\x80\xcc\xff\xcd\x00\x00\xcd\x80\x00"
          b"\xcd\xff\xff\xce\x00\x00\x00\x00\xce\x80\x00\x00\x00"
          b"\xce\xff\xff\xff\xff",
          (0, 128, 255, 0, 32768, 65535, 0, 2147483648, 4294967295, ), )


def testSignedInt():
    check(b"\x99\xd0\x00\xd0\x80\xd0\xff\xd1\x00\x00\xd1\x80\x00"
          b"\xd1\xff\xff\xd2\x00\x00\x00\x00\xd2\x80\x00\x00\x00"
          b"\xd2\xff\xff\xff\xff",
          (0, -128, -1, 0, -32768, -1, 0, -2147483648, -1, ))


def testRaw():
    check(b"\x96\xda\x00\x00\xda\x00\x01a\xda\x00\x02ab\xdb\x00\x00"
          b"\x00\x00\xdb\x00\x00\x00\x01a\xdb\x00\x00\x00\x02ab",
          (b"", b"a", b"ab", b"", b"a", b"ab"))


def testArray():
    check(b"\x96\xdc\x00\x00\xdc\x00\x01\xc0\xdc\x00\x02\xc2\xc3\xdd\x00"
          b"\x00\x00\x00\xdd\x00\x00\x00\x01\xc0\xdd\x00\x00\x00\x02"
          b"\xc2\xc3",
          ((), (None, ), (False, True), (), (None, ), (False, True)))


def testMap():
    check(b"\x96"
          b"\xde\x00\x00"
          b"\xde\x00\x01\xc0\xc2"
          b"\xde\x00\x02\xc0\xc2\xc3\xc2"
          b"\xdf\x00\x00\x00\x00"
          b"\xdf\x00\x00\x00\x01\xc0\xc2"
          b"\xdf\x00\x00\x00\x02\xc0\xc2\xc3\xc2",
          ({}, {None: False}, {True: False, None: False},
           {}, {None: False}, {True: False, None: False}))
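

def testMapHeaderSketch():
    # Illustrative companion check, not part of the original suite:
    # "map 16" (0xde) carries a two-byte big-endian entry count and
    # "map 32" (0xdf) a four-byte one, so both byte strings below
    # decode to the same single-entry map.
    check(b"\xde\x00\x01\xc0\xc2", {None: False})
    check(b"\xdf\x00\x00\x00\x01\xc0\xc2", {None: False})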
@@ -0,0 +1,109 @@
# coding: utf-8
from __future__ import (
    absolute_import, division, print_function, unicode_literals)

import pytest

from pandas.io.msgpack import ExtType, Packer, Unpacker, packb, unpackb


class TestLimits(object):

    def test_integer(self):
        x = -(2 ** 63)
        assert unpackb(packb(x)) == x
        msg = (r"((long |Python )?(int )?too (big|large) to convert"
               r"( to C (unsigned )?long))?")
        with pytest.raises((OverflowError, ValueError), match=msg):
            packb(x - 1)
        x = 2 ** 64 - 1
        assert unpackb(packb(x)) == x
        with pytest.raises((OverflowError, ValueError), match=msg):
            packb(x + 1)

    def test_array_header(self):
        packer = Packer()
        packer.pack_array_header(2 ** 32 - 1)
        with pytest.raises((OverflowError, ValueError)):
            packer.pack_array_header(2 ** 32)

    def test_map_header(self):
        packer = Packer()
        packer.pack_map_header(2 ** 32 - 1)
        with pytest.raises((OverflowError, ValueError)):
            packer.pack_map_header(2 ** 32)

    def test_max_str_len(self):
        d = 'x' * 3
        packed = packb(d)

        unpacker = Unpacker(max_str_len=3, encoding='utf-8')
        unpacker.feed(packed)
        assert unpacker.unpack() == d

        unpacker = Unpacker(max_str_len=2, encoding='utf-8')
        unpacker.feed(packed)

        msg = "3 exceeds max_str_len"
        with pytest.raises(ValueError, match=msg):
            unpacker.unpack()

    def test_max_bin_len(self):
        d = b'x' * 3
        packed = packb(d, use_bin_type=True)

        unpacker = Unpacker(max_bin_len=3)
        unpacker.feed(packed)
        assert unpacker.unpack() == d

        unpacker = Unpacker(max_bin_len=2)
        unpacker.feed(packed)

        msg = "3 exceeds max_bin_len"
        with pytest.raises(ValueError, match=msg):
            unpacker.unpack()

    def test_max_array_len(self):
        d = [1, 2, 3]
        packed = packb(d)

        unpacker = Unpacker(max_array_len=3)
        unpacker.feed(packed)
        assert unpacker.unpack() == d

        unpacker = Unpacker(max_array_len=2)
        unpacker.feed(packed)

        msg = "3 exceeds max_array_len"
        with pytest.raises(ValueError, match=msg):
            unpacker.unpack()

    def test_max_map_len(self):
        d = {1: 2, 3: 4, 5: 6}
        packed = packb(d)

        unpacker = Unpacker(max_map_len=3)
        unpacker.feed(packed)
        assert unpacker.unpack() == d

        unpacker = Unpacker(max_map_len=2)
        unpacker.feed(packed)

        msg = "3 exceeds max_map_len"
        with pytest.raises(ValueError, match=msg):
            unpacker.unpack()

    def test_max_ext_len(self):
        d = ExtType(42, b"abc")
        packed = packb(d)

        unpacker = Unpacker(max_ext_len=3)
        unpacker.feed(packed)
        assert unpacker.unpack() == d

        unpacker = Unpacker(max_ext_len=2)
        unpacker.feed(packed)

        msg = "4 exceeds max_ext_len"
        with pytest.raises(ValueError, match=msg):
            unpacker.unpack()
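

# The bounds pinned down by test_integer() above are exactly the
# msgpack integer range: -(2 ** 63) == -9223372036854775808 (int 64)
# at the bottom and 2 ** 64 - 1 == 18446744073709551615 (uint 64) at
# the top; anything outside must fail at pack time.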
@@ -0,0 +1,92 @@
# coding: utf-8

from pandas.io.msgpack import ExtType, packb, unpackb


def test_str8():
    header = b'\xd9'
    data = b'x' * 32
    b = packb(data.decode(), use_bin_type=True)
    assert len(b) == len(data) + 2
    assert b[0:2] == header + b'\x20'
    assert b[2:] == data
    assert unpackb(b) == data

    data = b'x' * 255
    b = packb(data.decode(), use_bin_type=True)
    assert len(b) == len(data) + 2
    assert b[0:2] == header + b'\xff'
    assert b[2:] == data
    assert unpackb(b) == data


def test_bin8():
    header = b'\xc4'
    data = b''
    b = packb(data, use_bin_type=True)
    assert len(b) == len(data) + 2
    assert b[0:2] == header + b'\x00'
    assert b[2:] == data
    assert unpackb(b) == data

    data = b'x' * 255
    b = packb(data, use_bin_type=True)
    assert len(b) == len(data) + 2
    assert b[0:2] == header + b'\xff'
    assert b[2:] == data
    assert unpackb(b) == data


def test_bin16():
    header = b'\xc5'
    data = b'x' * 256
    b = packb(data, use_bin_type=True)
    assert len(b) == len(data) + 3
    assert b[0:1] == header
    assert b[1:3] == b'\x01\x00'
    assert b[3:] == data
    assert unpackb(b) == data

    data = b'x' * 65535
    b = packb(data, use_bin_type=True)
    assert len(b) == len(data) + 3
    assert b[0:1] == header
    assert b[1:3] == b'\xff\xff'
    assert b[3:] == data
    assert unpackb(b) == data


def test_bin32():
    header = b'\xc6'
    data = b'x' * 65536
    b = packb(data, use_bin_type=True)
    assert len(b) == len(data) + 5
    assert b[0:1] == header
    assert b[1:5] == b'\x00\x01\x00\x00'
    assert b[5:] == data
    assert unpackb(b) == data


def test_ext():
    def check(ext, packed):
        assert packb(ext) == packed
        assert unpackb(packed) == ext

    check(ExtType(0x42, b'Z'), b'\xd4\x42Z')  # fixext 1
    check(ExtType(0x42, b'ZZ'), b'\xd5\x42ZZ')  # fixext 2
    check(ExtType(0x42, b'Z' * 4), b'\xd6\x42' + b'Z' * 4)  # fixext 4
    check(ExtType(0x42, b'Z' * 8), b'\xd7\x42' + b'Z' * 8)  # fixext 8
    check(ExtType(0x42, b'Z' * 16), b'\xd8\x42' + b'Z' * 16)  # fixext 16
    # ext 8
    check(ExtType(0x42, b''), b'\xc7\x00\x42')
    check(ExtType(0x42, b'Z' * 255), b'\xc7\xff\x42' + b'Z' * 255)
    # ext 16
    check(ExtType(0x42, b'Z' * 256), b'\xc8\x01\x00\x42' + b'Z' * 256)
    check(ExtType(0x42, b'Z' * 0xffff), b'\xc8\xff\xff\x42' + b'Z' * 0xffff)
    # ext 32
    check(ExtType(0x42, b'Z' * 0x10000),
          b'\xc9\x00\x01\x00\x00\x42' + b'Z' * 0x10000)
    # needs large memory
    # check(ExtType(0x42, b'Z'*0xffffffff),
    #       b'\xc9\xff\xff\xff\xff\x42' + b'Z'*0xffffffff)
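

# All of the "8-bit length" families above share the same header shape,
# a marker byte followed by an unsigned one-byte length (0xd9 for
# str 8, 0xc4 for bin 8), which is why each test asserts
# len(b) == len(data) + 2; the 16- and 32-bit variants simply widen the
# length field to two and four bytes.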
@@ -0,0 +1,74 @@
# coding: utf-8

import pytest

from pandas.io.msgpack import packb, unpackb


class DecodeError(Exception):
    pass


class TestObj(object):

    def _arr_to_str(self, arr):
        return ''.join(str(c) for c in arr)

    def bad_complex_decoder(self, o):
        raise DecodeError("Ooops!")

    def _decode_complex(self, obj):
        if b'__complex__' in obj:
            return complex(obj[b'real'], obj[b'imag'])
        return obj

    def _encode_complex(self, obj):
        if isinstance(obj, complex):
            return {b'__complex__': True, b'real': 1, b'imag': 2}
        return obj

    def test_encode_hook(self):
        packed = packb([3, 1 + 2j], default=self._encode_complex)
        unpacked = unpackb(packed, use_list=1)
        assert unpacked[1] == {b'__complex__': True, b'real': 1, b'imag': 2}

    def test_decode_hook(self):
        packed = packb([3, {b'__complex__': True, b'real': 1, b'imag': 2}])
        unpacked = unpackb(packed, object_hook=self._decode_complex,
                           use_list=1)
        assert unpacked[1] == 1 + 2j

    def test_decode_pairs_hook(self):
        packed = packb([3, {1: 2, 3: 4}])
        prod_sum = 1 * 2 + 3 * 4
        unpacked = unpackb(
            packed, object_pairs_hook=lambda l: sum(k * v for k, v in l),
            use_list=1)
        assert unpacked[1] == prod_sum

    def test_only_one_obj_hook(self):
        msg = "object_pairs_hook and object_hook are mutually exclusive"
        with pytest.raises(TypeError, match=msg):
            unpackb(b'', object_hook=lambda x: x,
                    object_pairs_hook=lambda x: x)

    def test_bad_hook(self):
        msg = r"can't serialize \(1\+2j\)"
        with pytest.raises(TypeError, match=msg):
            packed = packb([3, 1 + 2j], default=lambda o: o)
            unpacked = unpackb(packed, use_list=1)  # noqa

    def test_array_hook(self):
        packed = packb([1, 2, 3])
        unpacked = unpackb(packed, list_hook=self._arr_to_str, use_list=1)
        assert unpacked == '123'

    def test_an_exception_in_objecthook1(self):
        with pytest.raises(DecodeError, match='Ooops!'):
            packed = packb({1: {'__complex__': True, 'real': 1, 'imag': 2}})
            unpackb(packed, object_hook=self.bad_complex_decoder)

    def test_an_exception_in_objecthook2(self):
        with pytest.raises(DecodeError, match='Ooops!'):
            packed = packb({1: [{'__complex__': True, 'real': 1, 'imag': 2}]})
            unpackb(packed, list_hook=self.bad_complex_decoder, use_list=1)
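

# Note on test_decode_pairs_hook() above: object_pairs_hook receives
# the map as an ordered sequence of (key, value) tuples rather than a
# dict, so the lambda can fold the pairs directly into
# 1 * 2 + 3 * 4 == 14.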
@@ -0,0 +1,162 @@
# coding: utf-8

import struct

import pytest

from pandas.compat import OrderedDict, u

from pandas import compat

from pandas.io.msgpack import Packer, Unpacker, packb, unpackb


class TestPack(object):

    def check(self, data, use_list=False):
        re = unpackb(packb(data), use_list=use_list)
        assert re == data

    def testPack(self):
        test_data = [
            0, 1, 127, 128, 255, 256, 65535, 65536,
            -1, -32, -33, -128, -129, -32768, -32769,
            1.0,
            b"", b"a", b"a" * 31, b"a" * 32,
            None, True, False,
            (), ((),), ((), None,),
            {None: 0},
            (1 << 23),
        ]
        for td in test_data:
            self.check(td)

    def testPackUnicode(self):
        test_data = [u(""), u("abcd"), [u("defgh")], u("Русский текст"), ]
        for td in test_data:
            re = unpackb(
                packb(td, encoding='utf-8'), use_list=1, encoding='utf-8')
            assert re == td
            packer = Packer(encoding='utf-8')
            data = packer.pack(td)
            re = Unpacker(
                compat.BytesIO(data), encoding='utf-8', use_list=1).unpack()
            assert re == td

    def testPackUTF32(self):
        test_data = [
            compat.u(""),
            compat.u("abcd"),
            [compat.u("defgh")],
            compat.u("Русский текст"),
        ]
        for td in test_data:
            re = unpackb(
                packb(td, encoding='utf-32'), use_list=1, encoding='utf-32')
            assert re == td

    def testPackBytes(self):
        test_data = [b"", b"abcd", (b"defgh", ), ]
        for td in test_data:
            self.check(td)

    def testIgnoreUnicodeErrors(self):
        re = unpackb(
            packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore',
            use_list=1)
        assert re == "abcdef"

    def testStrictUnicodeUnpack(self):
        msg = (r"'utf-*8' codec can't decode byte 0xed in position 3:"
               " invalid continuation byte")
        with pytest.raises(UnicodeDecodeError, match=msg):
            unpackb(packb(b'abc\xeddef'), encoding='utf-8', use_list=1)

    def testStrictUnicodePack(self):
        msg = (r"'ascii' codec can't encode character u*'\\xed' in position"
               r" 3: ordinal not in range\(128\)")
        with pytest.raises(UnicodeEncodeError, match=msg):
            packb(compat.u("abc\xeddef"), encoding='ascii',
                  unicode_errors='strict')

    def testIgnoreErrorsPack(self):
        re = unpackb(
            packb(compat.u("abcФФФdef"), encoding='ascii',
                  unicode_errors='ignore'),
            encoding='utf-8', use_list=1)
        assert re == compat.u("abcdef")

    def testNoEncoding(self):
        msg = "Can't encode unicode string: no encoding is specified"
        with pytest.raises(TypeError, match=msg):
            packb(compat.u("abc"), encoding=None)

    def testDecodeBinary(self):
        re = unpackb(packb("abc"), encoding=None, use_list=1)
        assert re == b"abc"

    def testPackFloat(self):
        assert packb(1.0, use_single_float=True) == (
            b'\xca' + struct.pack('>f', 1.0))
        assert packb(1.0, use_single_float=False) == (
            b'\xcb' + struct.pack('>d', 1.0))

    def testArraySize(self, sizes=[0, 5, 50, 1000]):
        bio = compat.BytesIO()
        packer = Packer()
        for size in sizes:
            bio.write(packer.pack_array_header(size))
            for i in range(size):
                bio.write(packer.pack(i))

        bio.seek(0)
        unpacker = Unpacker(bio, use_list=1)
        for size in sizes:
            assert unpacker.unpack() == list(range(size))

    def test_manualreset(self, sizes=[0, 5, 50, 1000]):
        packer = Packer(autoreset=False)
        for size in sizes:
            packer.pack_array_header(size)
            for i in range(size):
                packer.pack(i)

        bio = compat.BytesIO(packer.bytes())
        unpacker = Unpacker(bio, use_list=1)
        for size in sizes:
            assert unpacker.unpack() == list(range(size))

        packer.reset()
        assert packer.bytes() == b''

    def testMapSize(self, sizes=[0, 5, 50, 1000]):
        bio = compat.BytesIO()
        packer = Packer()
        for size in sizes:
            bio.write(packer.pack_map_header(size))
            for i in range(size):
                bio.write(packer.pack(i))  # key
                bio.write(packer.pack(i * 2))  # value

        bio.seek(0)
        unpacker = Unpacker(bio)
        for size in sizes:
            assert unpacker.unpack() == {i: i * 2 for i in range(size)}

    def test_odict(self):
        seq = [(b'one', 1), (b'two', 2), (b'three', 3), (b'four', 4)]
        od = OrderedDict(seq)
        assert unpackb(packb(od), use_list=1) == dict(seq)

        def pair_hook(seq):
            return list(seq)

        assert unpackb(
            packb(od), object_pairs_hook=pair_hook, use_list=1) == seq

    def test_pairlist(self):
        pairlist = [(b'a', 1), (2, b'b'), (b'foo', b'bar')]
        packer = Packer()
        packed = packer.pack_map_pairs(pairlist)
        unpacked = unpackb(packed, object_pairs_hook=list)
        assert pairlist == unpacked
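

# testPackFloat() above pins down the two float encodings: 0xca is the
# four-byte big-endian IEEE-754 single ("float 32") and 0xcb the
# eight-byte double ("float 64"), matching struct's '>f' and '>d'
# formats.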
@@ -0,0 +1,71 @@
"""Test Unpacker's read_array_header and read_map_header methods"""
from pandas.io.msgpack import OutOfData, Unpacker, packb

UnexpectedTypeException = ValueError


def test_read_array_header():
    unpacker = Unpacker()
    unpacker.feed(packb(['a', 'b', 'c']))
    assert unpacker.read_array_header() == 3
    assert unpacker.unpack() == b'a'
    assert unpacker.unpack() == b'b'
    assert unpacker.unpack() == b'c'
    try:
        unpacker.unpack()
        assert 0, 'should raise exception'
    except OutOfData:
        assert 1, 'okay'


def test_read_map_header():
    unpacker = Unpacker()
    unpacker.feed(packb({'a': 'A'}))
    assert unpacker.read_map_header() == 1
    assert unpacker.unpack() == b'a'
    assert unpacker.unpack() == b'A'
    try:
        unpacker.unpack()
        assert 0, 'should raise exception'
    except OutOfData:
        assert 1, 'okay'


def test_incorrect_type_array():
    unpacker = Unpacker()
    unpacker.feed(packb(1))
    try:
        unpacker.read_array_header()
        assert 0, 'should raise exception'
    except UnexpectedTypeException:
        assert 1, 'okay'


def test_incorrect_type_map():
    unpacker = Unpacker()
    unpacker.feed(packb(1))
    try:
        unpacker.read_map_header()
        assert 0, 'should raise exception'
    except UnexpectedTypeException:
        assert 1, 'okay'


def test_incorrect_type_nested_array():
    unpacker = Unpacker()
    unpacker.feed(packb({'a': ['b', 'c', 'd']}))
    try:
        unpacker.read_array_header()
        assert 0, 'should raise exception'
    except UnexpectedTypeException:
        assert 1, 'okay'


def test_incorrect_type_nested_map():
    unpacker = Unpacker()
    unpacker.feed(packb([{'a': 'b'}]))
    try:
        unpacker.read_map_header()
        assert 0, 'should raise exception'
    except UnexpectedTypeException:
        assert 1, 'okay'
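

# read_array_header()/read_map_header() consume only the container
# header and return its element count; the caller is then expected to
# unpack exactly that many items (or key/value pairs) itself, which is
# the pattern the first two tests follow.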
@@ -0,0 +1,47 @@
# coding: utf-8

import io

import pandas.io.msgpack as msgpack

binarydata = bytes(bytearray(range(256)))


def gen_binary_data(idx):
    return binarydata[:idx % 300]


def test_exceeding_unpacker_read_size():
    dumpf = io.BytesIO()

    packer = msgpack.Packer()

    NUMBER_OF_STRINGS = 6
    read_size = 16

    # Historical failure thresholds, kept for reference:
    # 5 ok for read_size=16, while 6 hit "glibc detected *** python:
    # double free or corruption (fasttop)".
    # 20 ok for read_size=256, while 25 segfaulted / hit "glibc detected
    # *** python: double free or corruption (!prev)".
    # 40 ok for read_size=1024, while 50 introduced errors.
    # 7000 ok for read_size=1024*1024, while 8000 led to "glibc detected
    # *** python: double free or corruption (!prev)".

    for idx in range(NUMBER_OF_STRINGS):
        data = gen_binary_data(idx)
        dumpf.write(packer.pack(data))

    f = io.BytesIO(dumpf.getvalue())
    dumpf.close()

    unpacker = msgpack.Unpacker(f, read_size=read_size, use_list=1)

    read_count = 0
    for idx, o in enumerate(unpacker):
        assert type(o) == bytes
        assert o == gen_binary_data(idx)
        read_count += 1

    assert read_count == NUMBER_OF_STRINGS
@@ -0,0 +1,104 @@
# coding: utf-8

import pytest

from pandas import compat

from pandas.io.msgpack import BufferFull, OutOfData, Unpacker


class TestPack(object):

    def test_partial_data(self):
        unpacker = Unpacker()
        msg = "No more data to unpack"

        for data in [b"\xa5", b"h", b"a", b"l", b"l"]:
            unpacker.feed(data)
            with pytest.raises(StopIteration, match=msg):
                next(iter(unpacker))

        unpacker.feed(b"o")
        assert next(iter(unpacker)) == b"hallo"

    def test_foobar(self):
        unpacker = Unpacker(read_size=3, use_list=1)
        unpacker.feed(b'foobar')
        assert unpacker.unpack() == ord(b'f')
        assert unpacker.unpack() == ord(b'o')
        assert unpacker.unpack() == ord(b'o')
        assert unpacker.unpack() == ord(b'b')
        assert unpacker.unpack() == ord(b'a')
        assert unpacker.unpack() == ord(b'r')
        msg = "No more data to unpack"
        with pytest.raises(OutOfData, match=msg):
            unpacker.unpack()

        unpacker.feed(b'foo')
        unpacker.feed(b'bar')

        k = 0
        for o, e in zip(unpacker, 'foobarbaz'):
            assert o == ord(e)
            k += 1
        assert k == len(b'foobar')

    def test_foobar_skip(self):
        unpacker = Unpacker(read_size=3, use_list=1)
        unpacker.feed(b'foobar')
        assert unpacker.unpack() == ord(b'f')
        unpacker.skip()
        assert unpacker.unpack() == ord(b'o')
        unpacker.skip()
        assert unpacker.unpack() == ord(b'a')
        unpacker.skip()
        msg = "No more data to unpack"
        with pytest.raises(OutOfData, match=msg):
            unpacker.unpack()

    def test_maxbuffersize_read_size_exceeds_max_buffer_size(self):
        msg = "read_size should be less or equal to max_buffer_size"
        with pytest.raises(ValueError, match=msg):
            Unpacker(read_size=5, max_buffer_size=3)

    def test_maxbuffersize_bufferfull(self):
        unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1)
        unpacker.feed(b'foo')
        with pytest.raises(BufferFull, match=r'^$'):
            unpacker.feed(b'b')

    def test_maxbuffersize(self):
        unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1)
        unpacker.feed(b'foo')
        assert ord('f') == next(unpacker)
        unpacker.feed(b'b')
        assert ord('o') == next(unpacker)
        assert ord('o') == next(unpacker)
        assert ord('b') == next(unpacker)

    def test_readbytes(self):
        unpacker = Unpacker(read_size=3)
        unpacker.feed(b'foobar')
        assert unpacker.unpack() == ord(b'f')
        assert unpacker.read_bytes(3) == b'oob'
        assert unpacker.unpack() == ord(b'a')
        assert unpacker.unpack() == ord(b'r')

        # Test buffer refill
        unpacker = Unpacker(compat.BytesIO(b'foobar'), read_size=3)
        assert unpacker.unpack() == ord(b'f')
        assert unpacker.read_bytes(3) == b'oob'
        assert unpacker.unpack() == ord(b'a')
        assert unpacker.unpack() == ord(b'r')

    def test_issue124(self):
        unpacker = Unpacker()
        unpacker.feed(b'\xa1?\xa1!')
        assert tuple(unpacker) == (b'?', b'!')
        assert tuple(unpacker) == ()
        unpacker.feed(b"\xa1?\xa1")
        assert tuple(unpacker) == (b'?', )
        assert tuple(unpacker) == ()
        unpacker.feed(b"!")
        assert tuple(unpacker) == (b'!', )
        assert tuple(unpacker) == ()
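

# test_partial_data() above is the canonical streaming pattern: 0xa5
# announces a five-byte fixstr, so the Unpacker keeps reporting "No
# more data to unpack" while the bytes trickle in and only yields
# b"hallo" once the fifth payload byte arrives.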
@@ -0,0 +1,26 @@
# coding: utf-8

from collections import namedtuple

from pandas.io.msgpack import packb


class MyList(list):
    pass


class MyDict(dict):
    pass


class MyTuple(tuple):
    pass


MyNamedTuple = namedtuple('MyNamedTuple', 'x y')


def test_types():
    assert packb(MyDict()) == packb(dict())
    assert packb(MyList()) == packb(list())
    assert packb(MyNamedTuple(1, 2)) == packb((1, 2))
@@ -0,0 +1,67 @@
from io import BytesIO
import sys

import pytest

from pandas.io.msgpack import ExtType, OutOfData, Unpacker, packb


class TestUnpack(object):

    def test_unpack_array_header_from_file(self):
        f = BytesIO(packb([1, 2, 3, 4]))
        unpacker = Unpacker(f)
        assert unpacker.read_array_header() == 4
        assert unpacker.unpack() == 1
        assert unpacker.unpack() == 2
        assert unpacker.unpack() == 3
        assert unpacker.unpack() == 4
        msg = "No more data to unpack"
        with pytest.raises(OutOfData, match=msg):
            unpacker.unpack()

    def test_unpacker_hook_refcnt(self):
        if not hasattr(sys, 'getrefcount'):
            pytest.skip('no sys.getrefcount()')
        result = []

        def hook(x):
            result.append(x)
            return x

        basecnt = sys.getrefcount(hook)

        up = Unpacker(object_hook=hook, list_hook=hook)

        assert sys.getrefcount(hook) >= basecnt + 2

        up.feed(packb([{}]))
        up.feed(packb([{}]))
        assert up.unpack() == [{}]
        assert up.unpack() == [{}]
        assert result == [{}, [{}], {}, [{}]]

        del up

        assert sys.getrefcount(hook) == basecnt

    def test_unpacker_ext_hook(self):
        class MyUnpacker(Unpacker):

            def __init__(self):
                super(MyUnpacker, self).__init__(ext_hook=self._hook,
                                                 encoding='utf-8')

            def _hook(self, code, data):
                if code == 1:
                    return int(data)
                else:
                    return ExtType(code, data)

        unpacker = MyUnpacker()
        unpacker.feed(packb({'a': 1}, encoding='utf-8'))
        assert unpacker.unpack() == {'a': 1}
        unpacker.feed(packb({'a': ExtType(1, b'123')}, encoding='utf-8'))
        assert unpacker.unpack() == {'a': 123}
        unpacker.feed(packb({'a': ExtType(2, b'321')}, encoding='utf-8'))
        assert unpacker.unpack() == {'a': ExtType(2, b'321')}
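

# test_unpacker_ext_hook() above shows the usual ext_hook contract: the
# hook receives (code, data) for every ext value and either decodes the
# codes it recognises (code 1 -> int(data)) or returns
# ExtType(code, data) unchanged so that unknown extensions still
# round-trip.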
@@ -0,0 +1,30 @@
"""Tests for cases where the user seeks to obtain packed msgpack objects"""

import io

from pandas.io.msgpack import Unpacker, packb


def test_write_bytes():
    unpacker = Unpacker()
    unpacker.feed(b'abc')
    f = io.BytesIO()
    assert unpacker.unpack(f.write) == ord('a')
    assert f.getvalue() == b'a'
    f = io.BytesIO()
    assert unpacker.skip(f.write) is None
    assert f.getvalue() == b'b'
    f = io.BytesIO()
    assert unpacker.skip() is None
    assert f.getvalue() == b''


def test_write_bytes_multi_buffer():
    # A 100-element list packs to well over 100 bytes, so a 3-byte
    # read_size/max_buffer_size genuinely exercises the multi-buffer
    # path while the write callback reassembles the consumed stream.
    long_val = [5] * 100
    expected = packb(long_val)
    unpacker = Unpacker(io.BytesIO(expected), read_size=3, max_buffer_size=3)

    f = io.BytesIO()
    unpacked = unpacker.unpack(f.write)
    assert unpacked == long_val
    assert f.getvalue() == expected
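

# In both tests above, the optional callable passed to unpack()/skip()
# receives the raw packed bytes consumed for that one object, which is
# what lets test_write_bytes_multi_buffer() reassemble the original
# stream even though the internal buffer is capped at three bytes.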
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -0,0 +1,85 @@
import os

import pytest

from pandas import read_csv, read_table


class BaseParser(object):
    engine = None
    low_memory = True
    float_precision_choices = []

    def update_kwargs(self, kwargs):
        kwargs = kwargs.copy()
        kwargs.update(dict(engine=self.engine,
                           low_memory=self.low_memory))

        return kwargs

    def read_csv(self, *args, **kwargs):
        kwargs = self.update_kwargs(kwargs)
        return read_csv(*args, **kwargs)

    def read_table(self, *args, **kwargs):
        kwargs = self.update_kwargs(kwargs)
        return read_table(*args, **kwargs)


class CParser(BaseParser):
    engine = "c"
    float_precision_choices = [None, "high", "round_trip"]


class CParserHighMemory(CParser):
    low_memory = False


class CParserLowMemory(CParser):
    low_memory = True


class PythonParser(BaseParser):
    engine = "python"
    float_precision_choices = [None]


@pytest.fixture
def csv_dir_path(datapath):
    return datapath("io", "parser", "data")


@pytest.fixture
def csv1(csv_dir_path):
    return os.path.join(csv_dir_path, "test1.csv")


_cParserHighMemory = CParserHighMemory()
_cParserLowMemory = CParserLowMemory()
_pythonParser = PythonParser()

_py_parsers_only = [_pythonParser]
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
_all_parsers = _c_parsers_only + _py_parsers_only

_py_parser_ids = ["python"]
_c_parser_ids = ["c_high", "c_low"]
_all_parser_ids = _c_parser_ids + _py_parser_ids


@pytest.fixture(params=_all_parsers,
                ids=_all_parser_ids)
def all_parsers(request):
    return request.param


@pytest.fixture(params=_c_parsers_only,
                ids=_c_parser_ids)
def c_parser_only(request):
    return request.param


@pytest.fixture(params=_py_parsers_only,
                ids=_py_parser_ids)
def python_parser_only(request):
    return request.param
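

# A minimal sketch (hypothetical test, not part of this conftest) of
# how the parametrized fixtures above are consumed; pytest runs it once
# per parser flavour ("c_high", "c_low", "python"), and the shim's
# read_csv() injects the matching engine/low_memory keyword arguments:
#
#     def test_roundtrip_sketch(all_parsers):
#         from pandas.compat import StringIO
#         result = all_parsers.read_csv(StringIO("a,b\n1,2"))
#         assert list(result.columns) == ["a", "b"]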
@@ -0,0 +1,591 @@
# -*- coding: utf-8 -*-

"""
Tests that apply specifically to the CParser. Unless specifically
stated as a CParser-specific issue, the goal is to eventually move as
many of these tests as possible out of this module once the Python
parser can accept further arguments when parsing.
"""

from io import TextIOWrapper
import mmap
import os
import tarfile

import numpy as np
import pytest

from pandas.compat import PY3, BytesIO, StringIO, lrange, range
from pandas.errors import ParserError
import pandas.util._test_decorators as td

from pandas import DataFrame, concat
import pandas.util.testing as tm


@pytest.mark.parametrize(
    "malformed",
    ["1\r1\r1\r 1\r 1\r",
     "1\r1\r1\r 1\r 1\r11\r",
     "1\r1\r1\r 1\r 1\r11\r1\r"],
    ids=["words pointer", "stream pointer", "lines pointer"])
def test_buffer_overflow(c_parser_only, malformed):
    # see gh-9205: test certain malformed input files that cause
    # buffer overflows in tokenizer.c
    msg = "Buffer overflow caught - possible malformed input file."
    parser = c_parser_only

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(malformed))


def test_buffer_rd_bytes(c_parser_only):
    # see gh-12098: src->buffer in the C parser can be freed twice leading
    # to a segfault if a corrupt gzip file is read with 'read_csv', and the
    # buffer is filled more than once before gzip raises an Exception.

    data = "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09" \
           "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0" \
           "\xA6\x4D" + "\x55" * 267 + \
           "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00" \
           "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO"
    parser = c_parser_only

    for _ in range(100):
        try:
            parser.read_csv(StringIO(data), compression="gzip",
                            delim_whitespace=True)
        except Exception:
            pass
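

# The loop above deliberately swallows every exception and asserts
# nothing: the regression being guarded against is a hard crash
# (double free / segfault), so surviving all 100 attempts is the pass
# condition.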


def test_delim_whitespace_custom_terminator(c_parser_only):
    # See gh-12912
    data = "a b c~1 2 3~4 5 6~7 8 9"
    parser = c_parser_only

    df = parser.read_csv(StringIO(data), lineterminator="~",
                         delim_whitespace=True)
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["a", "b", "c"])
    tm.assert_frame_equal(df, expected)


def test_dtype_and_names_error(c_parser_only):
    # see gh-8833: passing both dtype and names
    # resulting in an error reporting issue
    parser = c_parser_only
    data = """
1.0 1
2.0 2
3.0 3
"""
    # base cases
    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data), sep=r"\s+",
                             header=None, names=["a", "b"])
    expected = DataFrame(
        [[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)

    # fallback casting
    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None,
                             names=["a", "b"], dtype={"a": np.int32})
    expected = DataFrame([[1, 1], [2, 2], [3, 3]],
                         columns=["a", "b"])
    expected["a"] = expected["a"].astype(np.int32)
    tm.assert_frame_equal(result, expected)

    data = """
1.0 1
nan 2
3.0 3
"""
    # fallback casting, but not castable
    with pytest.raises(ValueError, match="cannot safely convert"):
        parser.read_csv(StringIO(data), sep=r"\s+", header=None,
                        names=["a", "b"], dtype={"a": np.int32})


@pytest.mark.parametrize("match,kwargs", [
    # For each of these cases, all of the dtypes are valid, just unsupported.
    (("the dtype datetime64 is not supported for parsing, "
      "pass this column using parse_dates instead"),
     dict(dtype={"A": "datetime64", "B": "float64"})),

    (("the dtype datetime64 is not supported for parsing, "
      "pass this column using parse_dates instead"),
     dict(dtype={"A": "datetime64", "B": "float64"},
          parse_dates=["B"])),

    ("the dtype timedelta64 is not supported for parsing",
     dict(dtype={"A": "timedelta64", "B": "float64"})),

    ("the dtype <U8 is not supported for parsing",
     dict(dtype={"A": "U8"}))
], ids=["dt64-0", "dt64-1", "td64", "<U8"])
def test_unsupported_dtype(c_parser_only, match, kwargs):
    parser = c_parser_only
    df = DataFrame(np.random.rand(5, 2), columns=list("AB"),
                   index=["1A", "1B", "1C", "1D", "1E"])

    with tm.ensure_clean("__unsupported_dtype__.csv") as path:
        df.to_csv(path)

        with pytest.raises(TypeError, match=match):
            parser.read_csv(path, index_col=0, **kwargs)


@td.skip_if_32bit
def test_precise_conversion(c_parser_only):
    from decimal import Decimal
    parser = c_parser_only

    normal_errors = []
    precise_errors = []

    # test numbers between 1 and 2
    for num in np.linspace(1., 2., num=500):
        # 25 decimal digits of precision
        text = "a\n{0:.25}".format(num)

        normal_val = float(parser.read_csv(StringIO(text))["a"][0])
        precise_val = float(parser.read_csv(
            StringIO(text), float_precision="high")["a"][0])
        roundtrip_val = float(parser.read_csv(
            StringIO(text), float_precision="round_trip")["a"][0])
        actual_val = Decimal(text[2:])

        def error(val):
            return abs(Decimal("{0:.100}".format(val)) - actual_val)

        normal_errors.append(error(normal_val))
        precise_errors.append(error(precise_val))

        # round-trip should match float()
        assert roundtrip_val == float(text[2:])

    assert sum(precise_errors) <= sum(normal_errors)
    assert max(precise_errors) <= max(normal_errors)
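

# A note on the three float_precision modes compared above: None selects
# the parser's default fast float converter, "high" a slower but more
# precise variant of the same C path, and "round_trip" defers to
# Python's own round-trippable float parsing, which is why the test can
# demand exact equality with float() only for "round_trip".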


def test_usecols_dtypes(c_parser_only):
    parser = c_parser_only
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""

    result = parser.read_csv(StringIO(data), usecols=(0, 1, 2),
                             names=("a", "b", "c"),
                             header=None,
                             converters={"a": str},
                             dtype={"b": int, "c": float})
    result2 = parser.read_csv(StringIO(data), usecols=(0, 2),
                              names=("a", "b", "c"),
                              header=None,
                              converters={"a": str},
                              dtype={"b": int, "c": float})

    assert (result.dtypes == [object, np.int, np.float]).all()
    assert (result2.dtypes == [object, np.float]).all()


def test_disable_bool_parsing(c_parser_only):
    # see gh-2090

    parser = c_parser_only
    data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""

    result = parser.read_csv(StringIO(data), dtype=object)
    assert (result.dtypes == object).all()

    result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
    assert result["B"][2] == ""


def test_custom_lineterminator(c_parser_only):
    parser = c_parser_only
    data = "a,b,c~1,2,3~4,5,6"

    result = parser.read_csv(StringIO(data), lineterminator="~")
    expected = parser.read_csv(StringIO(data.replace("~", "\n")))

    tm.assert_frame_equal(result, expected)


def test_parse_ragged_csv(c_parser_only):
    parser = c_parser_only
    data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""

    nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
    result = parser.read_csv(StringIO(data), header=None,
                             names=["a", "b", "c", "d", "e"])

    expected = parser.read_csv(StringIO(nice_data), header=None,
                               names=["a", "b", "c", "d", "e"])

    tm.assert_frame_equal(result, expected)

    # too many columns, cause segfault if not careful
    data = "1,2\n3,4,5"

    result = parser.read_csv(StringIO(data), header=None,
                             names=lrange(50))
    expected = parser.read_csv(StringIO(data), header=None,
                               names=lrange(3)).reindex(columns=lrange(50))

    tm.assert_frame_equal(result, expected)


def test_tokenize_CR_with_quoting(c_parser_only):
    # see gh-3453
    parser = c_parser_only
    data = " a,b,c\r\"a,b\",\"e,d\",\"f,f\""

    result = parser.read_csv(StringIO(data), header=None)
    expected = parser.read_csv(StringIO(data.replace("\r", "\n")),
                               header=None)
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data))
    expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
    tm.assert_frame_equal(result, expected)


def test_grow_boundary_at_cap(c_parser_only):
    # See gh-12494
    #
    # The C parser was not increasing the buffer size when the desired
    # space would fill the buffer to capacity, which later caused a
    # buffer overflow error when checking the EOF terminator of the
    # CSV stream.
    parser = c_parser_only

    def test_empty_header_read(count):
        s = StringIO("," * count)
        expected = DataFrame(columns=[
            "Unnamed: {i}".format(i=i)
            for i in range(count + 1)])
        df = parser.read_csv(s)
        tm.assert_frame_equal(df, expected)

    for cnt in range(1, 101):
        test_empty_header_read(cnt)


def test_parse_trim_buffers(c_parser_only):
    # This test is part of a bugfix for gh-13703. It attempts to
    # stress the system memory allocator, to cause it to move the
    # stream buffer and either let the OS reclaim the region, or let
    # other memory requests of the parser otherwise modify the
    # contents of the memory space where it was formerly located.
    # This test is designed to cause a `segfault` with unpatched
    # `tokenizer.c`. Sometimes the test fails on `segfault`, other
    # times it fails due to memory corruption, which causes the
    # loaded DataFrame to differ from the expected one.

    parser = c_parser_only

    # Generate a large mixed-type CSV file on-the-fly (one record is
    # approx 1.5KiB).
    record_ = \
        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""

    # Set the number of lines so that a call to `parser_trim_buffers`
    # is triggered: after a couple of full chunks are consumed a
    # relatively small 'residual' chunk would cause reallocation
    # within the parser.
    chunksize, n_lines = 128, 2 * 128 + 15
    csv_data = "\n".join([record_] * n_lines) + "\n"

    # We will use StringIO to load the CSV from this text buffer.
    # pd.read_csv() will iterate over the file in chunks and will
    # finally read a residual chunk of really small size.

    # Generate the expected output: manually create the dataframe
    # by splitting by comma and repeating the record `n_lines` times.
    row = tuple(val_ if val_ else np.nan
                for val_ in record_.split(","))
    expected = DataFrame([row for _ in range(n_lines)],
                         dtype=object, columns=None, index=None)

    # Iterate over the CSV file in chunks of `chunksize` lines
    chunks_ = parser.read_csv(StringIO(csv_data), header=None,
                              dtype=object, chunksize=chunksize)
    result = concat(chunks_, axis=0, ignore_index=True)

    # Check for data corruption if there was no segfault
    tm.assert_frame_equal(result, expected)

    # This extra test was added to replicate the fault in gh-5291.
    # Force 'utf-8' encoding, so that `_string_convert` would take
    # a different execution branch.
    chunks_ = parser.read_csv(StringIO(csv_data), header=None,
                              dtype=object, chunksize=chunksize,
                              encoding="utf_8")
    result = concat(chunks_, axis=0, ignore_index=True)
    tm.assert_frame_equal(result, expected)


def test_internal_null_byte(c_parser_only):
    # see gh-14012
    #
    # The null byte ('\x00') should not be used as a
    # true line terminator, escape character, or comment
    # character, only as a placeholder to indicate that
    # none was specified.
    #
    # This test should be moved to test_common.py ONLY when
    # Python's csv class supports parsing '\x00'.
    parser = c_parser_only

    names = ["a", "b", "c"]
    data = "1,2,3\n4,\x00,6\n7,8,9"
    expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6],
                          [7, 8, 9]], columns=names)

    result = parser.read_csv(StringIO(data), names=names)
    tm.assert_frame_equal(result, expected)


def test_read_nrows_large(c_parser_only):
    # gh-7626 - Read only nrows of data in for large inputs (>262144b)
    parser = c_parser_only
    header_narrow = "\t".join(["COL_HEADER_" + str(i)
                               for i in range(10)]) + "\n"
    data_narrow = "\t".join(["somedatasomedatasomedata1"
                             for _ in range(10)]) + "\n"
    header_wide = "\t".join(["COL_HEADER_" + str(i)
                             for i in range(15)]) + "\n"
    data_wide = "\t".join(["somedatasomedatasomedata2"
                           for _ in range(15)]) + "\n"
    test_input = (header_narrow + data_narrow * 1050 +
                  header_wide + data_wide * 2)

    df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)

    assert df.size == 1010 * 10


def test_float_precision_round_trip_with_text(c_parser_only):
    # see gh-15140 - This should not segfault on Python 2.7+
    parser = c_parser_only
    df = parser.read_csv(StringIO("a"), header=None,
                         float_precision="round_trip")
    tm.assert_frame_equal(df, DataFrame({0: ["a"]}))


def test_large_difference_in_columns(c_parser_only):
    # see gh-14125
    parser = c_parser_only

    count = 10000
    large_row = ("X," * count)[:-1] + "\n"
    normal_row = "XXXXXX XXXXXX,111111111111111\n"
    test_input = (large_row + normal_row * 6)[:-1]

    result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])
    rows = test_input.split("\n")

    expected = DataFrame([row.split(",")[0] for row in rows])
    tm.assert_frame_equal(result, expected)


def test_data_after_quote(c_parser_only):
    # see gh-15910
    parser = c_parser_only

    data = "a\n1\n\"b\"a"
    result = parser.read_csv(StringIO(data))

    expected = DataFrame({"a": ["1", "ba"]})
    tm.assert_frame_equal(result, expected)


def test_comment_whitespace_delimited(c_parser_only, capsys):
    parser = c_parser_only
    test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
    df = parser.read_csv(StringIO(test_input), comment="#", header=None,
                         delimiter="\\s+", skiprows=0,
                         error_bad_lines=False)
    captured = capsys.readouterr()
    # skipped lines 2, 3, 4, 9
    for line_num in (2, 3, 4, 9):
        assert "Skipping line {}".format(line_num) in captured.err
    expected = DataFrame([[1, 2],
                          [5, 2],
                          [6, 2],
                          [7, np.nan],
                          [8, np.nan]])
    tm.assert_frame_equal(df, expected)


def test_file_like_no_next(c_parser_only):
    # gh-16530: the file-like need not have a "next" or "__next__"
    # attribute despite having an "__iter__" attribute.
    #
    # NOTE: This is only true for the C engine, not the Python engine.
    class NoNextBuffer(StringIO):
        def __next__(self):
            raise AttributeError("No next method")

        next = __next__

    parser = c_parser_only
    data = "a\n1"

    expected = DataFrame({"a": [1]})
    result = parser.read_csv(NoNextBuffer(data))

    tm.assert_frame_equal(result, expected)


def test_buffer_rd_bytes_bad_unicode(c_parser_only):
    # see gh-22748
    parser = c_parser_only
    t = BytesIO(b"\xB0")

    if PY3:
        msg = "'utf-8' codec can't encode character"
        t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
    else:
        msg = "'utf8' codec can't decode byte"

    with pytest.raises(UnicodeError, match=msg):
        parser.read_csv(t, encoding="UTF-8")


@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
    # see gh-16530
    #
    # Unfortunately, Python's CSV library can't handle
    # tarfile objects (expects string, not bytes when
    # iterating through a file-like).
    parser = c_parser_only
    tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)

    with tarfile.open(tar_path, "r") as tar:
        data_file = tar.extractfile("tar_data.csv")

        out = parser.read_csv(data_file)
        expected = DataFrame({"a": [1]})
        tm.assert_frame_equal(out, expected)


@pytest.mark.high_memory
def test_bytes_exceed_2gb(c_parser_only):
    # see gh-16798
    #
    # Read from a "CSV" that has a column larger than 2GB.
    parser = c_parser_only

    if parser.low_memory:
        pytest.skip("not a high_memory test")

    csv = StringIO("strings\n" + "\n".join(
        ["x" * (1 << 20) for _ in range(2100)]))
    df = parser.read_csv(csv)
    assert not df.empty


def test_chunk_whitespace_on_boundary(c_parser_only):
    # see gh-9735: this issue is C parser-specific (bug when
    # parsing whitespace and characters at chunk boundary)
    #
    # This test case has a field too large for the Python parser /
    # CSV library.
    parser = c_parser_only

    chunk1 = "a" * (1024 * 256 - 2) + "\na"
    chunk2 = "\n a"
    result = parser.read_csv(StringIO(chunk1 + chunk2), header=None)

    expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"])
    tm.assert_frame_equal(result, expected)


def test_file_handles_mmap(c_parser_only, csv1):
    # gh-14418
    #
    # Don't close user-provided file handles.
    parser = c_parser_only

    with open(csv1, "r") as f:
        m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        parser.read_csv(m)

        if PY3:
            assert not m.closed
        m.close()


def test_file_binary_mode(c_parser_only):
    # see gh-23779
    parser = c_parser_only
    expected = DataFrame([[1, 2, 3], [4, 5, 6]])

    with tm.ensure_clean() as path:
        with open(path, "w") as f:
            f.write("1,2,3\n4,5,6")

        with open(path, "rb") as f:
            result = parser.read_csv(f, header=None)
            tm.assert_frame_equal(result, expected)
@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-

"""
Tests that comments are properly handled during parsing
for all of the parsers defined in parsers.py
"""

import numpy as np
import pytest

from pandas.compat import StringIO

from pandas import DataFrame
import pandas.util.testing as tm


@pytest.mark.parametrize("na_values", [None, ["NaN"]])
def test_comment(all_parsers, na_values):
    parser = all_parsers
    data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
    expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
                         columns=["A", "B", "C"])
    result = parser.read_csv(StringIO(data), comment="#",
                             na_values=na_values)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("read_kwargs", [
    dict(),
    dict(lineterminator="*"),
    dict(delim_whitespace=True),
])
def test_line_comment(all_parsers, read_kwargs):
    parser = all_parsers
    data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
    if read_kwargs.get("delim_whitespace"):
        data = data.replace(",", " ")
    elif read_kwargs.get("lineterminator"):
        if parser.engine != "c":
            pytest.skip("Custom terminator not supported with Python engine")

        data = data.replace("\n", read_kwargs.get("lineterminator"))

    read_kwargs["comment"] = "#"
    result = parser.read_csv(StringIO(data), **read_kwargs)

    expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


def test_comment_skiprows(all_parsers):
    parser = all_parsers
    data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # This should ignore the first four lines (including comments).
    expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
                         columns=["A", "B", "C"])
    result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
    tm.assert_frame_equal(result, expected)


def test_comment_header(all_parsers):
    parser = all_parsers
    data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # Header should begin at the second non-comment line.
    expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
                         columns=["A", "B", "C"])
    result = parser.read_csv(StringIO(data), comment="#", header=1)
    tm.assert_frame_equal(result, expected)


def test_comment_skiprows_header(all_parsers):
    parser = all_parsers
    data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # skiprows should skip the first four lines (including comments),
    # and header=1 should then pick the second of the remaining lines
    # ("A,B,C") as the header row.
    expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
                         columns=["A", "B", "C"])
    result = parser.read_csv(StringIO(data), comment="#", skiprows=4,
                             header=1)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
def test_custom_comment_char(all_parsers, comment_char):
    parser = all_parsers
    data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
    result = parser.read_csv(StringIO(data.replace("#", comment_char)),
                             comment=comment_char)

    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("header", ["infer", None])
def test_comment_first_line(all_parsers, header):
    # see gh-4623
    parser = all_parsers
    data = "# notes\na,b,c\n# more notes\n1,2,3"

    if header is None:
        expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
    else:
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), comment="#", header=header)
    tm.assert_frame_equal(result, expected)
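

# The interplay exercised above: `comment` truncates any line at the
# marker, `skiprows` counts raw input lines (commented or not), and
# `header` counts lines only after both filters have been applied.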
File diff suppressed because it is too large
@@ -0,0 +1,154 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests compressed data parsing functionality for all
|
||||
of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import zipfile
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
|
||||
def buffer(request):
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def parser_and_data(all_parsers, csv1):
|
||||
parser = all_parsers
|
||||
|
||||
with open(csv1, "rb") as f:
|
||||
data = f.read()
|
||||
expected = parser.read_csv(csv1)
|
||||
|
||||
return parser, data, expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
|
||||
def test_zip(parser_and_data, compression):
|
||||
parser, data, expected = parser_and_data
|
||||
|
||||
with tm.ensure_clean("test_file.zip") as path:
|
||||
with zipfile.ZipFile(path, mode="w") as tmp:
|
||||
tmp.writestr("test_file", data)
|
||||
|
||||
if compression == "zip2":
|
||||
with open(path, "rb") as f:
|
||||
result = parser.read_csv(f, compression="zip")
|
||||
else:
|
||||
result = parser.read_csv(path, compression=compression)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("compression", ["zip", "infer"])
|
||||
def test_zip_error_multiple_files(parser_and_data, compression):
|
||||
parser, data, expected = parser_and_data
|
||||
|
||||
with tm.ensure_clean("combined_zip.zip") as path:
|
||||
inner_file_names = ["test_file", "second_file"]
|
||||
|
||||
with zipfile.ZipFile(path, mode="w") as tmp:
|
||||
for file_name in inner_file_names:
|
||||
tmp.writestr(file_name, data)
|
||||
|
||||
with pytest.raises(ValueError, match="Multiple files"):
|
||||
parser.read_csv(path, compression=compression)
|
||||
|
||||
|
||||
def test_zip_error_no_files(parser_and_data):
|
||||
parser, _, _ = parser_and_data
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
with zipfile.ZipFile(path, mode="w"):
|
||||
pass
|
||||
|
||||
with pytest.raises(ValueError, match="Zero files"):
|
||||
parser.read_csv(path, compression="zip")
|
||||
|
||||
|
||||
def test_zip_error_invalid_zip(parser_and_data):
|
||||
parser, _, _ = parser_and_data
|
||||
|
||||
with tm.ensure_clean() as path:
        # The handle must be opened for reading ("rb", not "wb"); a
        # write-only handle could never reach the zip validity check.
        with open(path, "rb") as f:
            with pytest.raises(zipfile.BadZipfile,
                               match="File is not a zip file"):
                parser.read_csv(f, compression="zip")


@pytest.mark.parametrize("filename", [None, "test.{ext}"])
def test_compression(parser_and_data, compression_only, buffer, filename):
    parser, data, expected = parser_and_data
    compress_type = compression_only

    ext = "gz" if compress_type == "gzip" else compress_type
    filename = filename if filename is None else filename.format(ext=ext)

    if filename and buffer:
        pytest.skip("Cannot deduce compression from "
                    "buffer of compressed data.")

    with tm.ensure_clean(filename=filename) as path:
        tm.write_to_compressed(compress_type, path, data)
        compression = "infer" if filename else compress_type

        if buffer:
            with open(path, "rb") as f:
                result = parser.read_csv(f, compression=compression)
        else:
            result = parser.read_csv(path, compression=compression)

        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("ext", [None, "gz", "bz2"])
def test_infer_compression(all_parsers, csv1, buffer, ext):
    # see gh-9770
    parser = all_parsers
    kwargs = dict(index_col=0, parse_dates=True)

    expected = parser.read_csv(csv1, **kwargs)
    kwargs["compression"] = "infer"

    if buffer:
        with open(csv1) as f:
            result = parser.read_csv(f, **kwargs)
    else:
        ext = "." + ext if ext else ""
        result = parser.read_csv(csv1 + ext, **kwargs)

    tm.assert_frame_equal(result, expected)
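

# Illustrative sketch (added for this review; not part of the suite,
# and it reuses the module's `pd` and `tm` imports): with the default
# compression="infer", read_csv deduces the codec from the extension,
# so a ".gz" path needs no explicit compression argument.
def _sketch_infer_from_extension():
    import gzip

    with tm.ensure_clean("demo.csv.gz") as path:  # hypothetical file name
        with gzip.open(path, "wb") as f:
            f.write(b"a,b\n1,2\n")
        return pd.read_csv(path)  # compression is inferred from ".gz"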


def test_compression_utf16_encoding(all_parsers, csv_dir_path):
    # see gh-18071
    parser = all_parsers
    path = os.path.join(csv_dir_path, "utf16_ex_small.zip")

    result = parser.read_csv(path, encoding="utf-16",
                             compression="zip", sep="\t")
    expected = pd.DataFrame({
        u"Country": [u"Venezuela", u"Venezuela"],
        u"Twitter": [u"Hugo Chávez Frías", u"Henrique Capriles R."]
    })

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
def test_invalid_compression(all_parsers, invalid_compression):
    parser = all_parsers
    compress_kwargs = dict(compression=invalid_compression)

    msg = ("Unrecognized compression "
           "type: {compression}".format(**compress_kwargs))

    with pytest.raises(ValueError, match=msg):
        parser.read_csv("test_file.zip", **compress_kwargs)
@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-

"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""

import numpy as np
import pytest

from pandas.compat import StringIO, lmap, parse_date

import pandas as pd
from pandas import DataFrame, Index
import pandas.util.testing as tm


def test_converters_type_must_be_dict(all_parsers):
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
"""

    with pytest.raises(TypeError, match="Type converters.+"):
        parser.read_csv(StringIO(data), converters=0)


@pytest.mark.parametrize("column", [3, "D"])
@pytest.mark.parametrize("converter", [
    parse_date,
    lambda x: int(x.split("/")[2])  # Produce integer.
])
def test_converters(all_parsers, column, converter):
    parser = all_parsers
    data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
    result = parser.read_csv(StringIO(data), converters={column: converter})

    expected = parser.read_csv(StringIO(data))
    expected["D"] = expected["D"].map(converter)

    tm.assert_frame_equal(result, expected)
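

# Illustrative sketch (added for this review; not part of the suite, and
# it reuses the module's `pd` and `StringIO` imports): a converter
# receives each cell of its column as a string, and its return value is
# used verbatim, bypassing the usual type inference.
def _sketch_converter_strips_currency():
    data = "price\n$1.50\n$2.75"
    df = pd.read_csv(StringIO(data),
                     converters={"price": lambda s: float(s.lstrip("$"))})
    return df["price"].tolist()  # [1.5, 2.75]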


def test_converters_no_implicit_conv(all_parsers):
    # see gh-2184
    parser = all_parsers
    data = """000102,1.2,A\n001245,2,B"""

    converters = {0: lambda x: x.strip()}
    result = parser.read_csv(StringIO(data), header=None,
                             converters=converters)

    # Column 0 should not be cast to numeric and should remain as object.
    expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
    tm.assert_frame_equal(result, expected)


def test_converters_euro_decimal_format(all_parsers):
    # see gh-583
    converters = dict()
    parser = all_parsers

    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,7387
2;121,12;14897,76;DEF;uyt;0,3773
3;878,158;108013,434;GHI;rez;2,7356"""
    converters["Number1"] = converters["Number2"] =\
        converters["Number3"] = lambda x: float(x.replace(",", "."))

    result = parser.read_csv(StringIO(data), sep=";", converters=converters)
    expected = DataFrame([[1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
                          [2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
                          [3, 878.158, 108013.434, "GHI", "rez", 2.7356]],
                         columns=["Id", "Number1", "Number2",
                                  "Text1", "Text2", "Number3"])
    tm.assert_frame_equal(result, expected)


def test_converters_corner_with_nans(all_parsers):
    parser = all_parsers
    data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

    # Example converters.
    def convert_days(x):
        x = x.strip()

        if not x:
            return np.nan

        is_plus = x.endswith("+")

        if is_plus:
            x = int(x[:-1]) + 1
        else:
            x = int(x)

        return x

    def convert_days_sentinel(x):
        x = x.strip()

        if not x:
            return np.nan

        is_plus = x.endswith("+")

        if is_plus:
            x = int(x[:-1]) + 1
        else:
            x = int(x)

        return x

    def convert_score(x):
        x = x.strip()

        if not x:
            return np.nan

        if x.find("-") > 0:
            val_min, val_max = lmap(int, x.split("-"))
            val = 0.5 * (val_min + val_max)
        else:
            val = float(x)

        return val

    results = []

    for day_converter in [convert_days, convert_days_sentinel]:
        result = parser.read_csv(StringIO(data),
                                 converters={"score": convert_score,
                                             "days": day_converter},
                                 na_values=["", None])
        assert pd.isna(result["days"][1])
        results.append(result)

    tm.assert_frame_equal(results[0], results[1])


def test_converter_index_col_bug(all_parsers):
    # see gh-1835
    parser = all_parsers
    data = "A;B\n1;2\n3;4"

    rs = parser.read_csv(StringIO(data), sep=";", index_col="A",
                         converters={"A": lambda x: x})

    xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A"))
    tm.assert_frame_equal(rs, xp)
@@ -0,0 +1,135 @@
# -*- coding: utf-8 -*-

"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""

import csv

import pytest

from pandas.compat import StringIO
from pandas.errors import ParserWarning

from pandas import DataFrame
import pandas.util.testing as tm


@pytest.fixture
def custom_dialect():
    dialect_name = "weird"
    dialect_kwargs = dict(doublequote=False, escapechar="~", delimiter=":",
                          skipinitialspace=False, quotechar="~", quoting=3)
    return dialect_name, dialect_kwargs


def test_dialect(all_parsers):
    parser = all_parsers
    data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

    dia = csv.excel()
    dia.quoting = csv.QUOTE_NONE
    df = parser.read_csv(StringIO(data), dialect=dia)

    data = """\
label1,label2,label3
index1,a,c,e
index2,b,d,f
"""
    exp = parser.read_csv(StringIO(data))
    exp.replace("a", "\"a", inplace=True)
    tm.assert_frame_equal(df, exp)


def test_dialect_str(all_parsers):
    dialect_name = "mydialect"
    parser = all_parsers
    data = """\
fruit:vegetable
apple:broccoli
pear:tomato
"""
    exp = DataFrame({
        "fruit": ["apple", "pear"],
        "vegetable": ["broccoli", "tomato"]
    })

    with tm.with_csv_dialect(dialect_name, delimiter=":"):
        df = parser.read_csv(StringIO(data), dialect=dialect_name)
        tm.assert_frame_equal(df, exp)
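

# Illustrative sketch (added for this review; not part of the suite):
# the plain-stdlib analogue of tm.with_csv_dialect is
# csv.register_dialect, after which the dialect can be passed to
# read_csv by name.  The dialect name "pipes" is hypothetical.
def _sketch_registered_dialect():
    import pandas as pd

    csv.register_dialect("pipes", delimiter="|")
    try:
        return pd.read_csv(StringIO("a|b\n1|2"), dialect="pipes")
    finally:
        csv.unregister_dialect("pipes")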


def test_invalid_dialect(all_parsers):
    class InvalidDialect(object):
        pass

    data = "a\n1"
    parser = all_parsers
    msg = "Invalid dialect"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), dialect=InvalidDialect)


@pytest.mark.parametrize("arg", [None, "doublequote", "escapechar",
                                 "skipinitialspace", "quotechar", "quoting"])
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect,
                                           arg, value):
    # see gh-23761.
    dialect_name, dialect_kwargs = custom_dialect
    parser = all_parsers

    expected = DataFrame({"a": [1], "b": [2]})
    data = "a:b\n1:2"

    warning_klass = None
    kwds = dict()

    # arg=None tests passing the dialect without any other arguments.
    # Compare the parametrized `value` itself, not the string literal
    # "value" (which would make the conflict branch run unconditionally).
    if arg is not None:
        if value == "dialect":  # No conflict --> no warning.
            kwds[arg] = dialect_kwargs[arg]
        elif value == "default":  # Default --> no warning.
            from pandas.io.parsers import _parser_defaults
            kwds[arg] = _parser_defaults[arg]
        else:  # Non-default + conflict with dialect --> warning.
            warning_klass = ParserWarning
            kwds[arg] = "blah"

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        with tm.assert_produces_warning(warning_klass):
            result = parser.read_csv(StringIO(data),
                                     dialect=dialect_name, **kwds)
            tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs,warning_klass", [
    (dict(sep=","), None),  # sep is default --> sep_override=True
    (dict(sep="."), ParserWarning),  # sep isn't default --> sep_override=False
    (dict(delimiter=":"), None),  # No conflict
    (dict(delimiter=None), None),  # Default arguments --> sep_override=True
    (dict(delimiter=","), ParserWarning),  # Conflict
    (dict(delimiter="."), ParserWarning),  # Conflict
], ids=["sep-override-true", "sep-override-false",
        "delimiter-no-conflict", "delimiter-default-arg",
        "delimiter-conflict", "delimiter-conflict2"])
def test_dialect_conflict_delimiter(all_parsers, custom_dialect,
                                    kwargs, warning_klass):
    # see gh-23761.
    dialect_name, dialect_kwargs = custom_dialect
    parser = all_parsers

    expected = DataFrame({"a": [1], "b": [2]})
    data = "a:b\n1:2"

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        with tm.assert_produces_warning(warning_klass):
            result = parser.read_csv(StringIO(data),
                                     dialect=dialect_name, **kwargs)
            tm.assert_frame_equal(result, expected)
@@ -0,0 +1,514 @@
# -*- coding: utf-8 -*-

"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""

import os

import numpy as np
import pytest

from pandas.compat import StringIO
from pandas.errors import ParserWarning

from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
from pandas import (
    Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat)
import pandas.util.testing as tm


@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
def test_dtype_all_columns(all_parsers, dtype, check_orig):
    # see gh-3795, gh-6607
    parser = all_parsers

    df = DataFrame(np.random.rand(5, 2).round(4), columns=list("AB"),
                   index=["1A", "1B", "1C", "1D", "1E"])

    with tm.ensure_clean("__passing_str_as_dtype__.csv") as path:
        df.to_csv(path)

        result = parser.read_csv(path, dtype=dtype, index_col=0)

        if check_orig:
            expected = df.copy()
            result = result.astype(float)
        else:
            expected = df.astype(str)

        tm.assert_frame_equal(result, expected)


def test_dtype_all_columns_empty(all_parsers):
    # see gh-12048
    parser = all_parsers
    result = parser.read_csv(StringIO("A,B"), dtype=str)

    expected = DataFrame({"A": [], "B": []}, index=[], dtype=str)
    tm.assert_frame_equal(result, expected)


def test_dtype_per_column(all_parsers):
    parser = all_parsers
    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
    expected = DataFrame([[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]],
                         columns=["one", "two"])
    expected["one"] = expected["one"].astype(np.float64)
    expected["two"] = expected["two"].astype(object)

    result = parser.read_csv(StringIO(data), dtype={"one": np.float64,
                                                    1: str})
    tm.assert_frame_equal(result, expected)


def test_invalid_dtype_per_column(all_parsers):
    parser = all_parsers
    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

    with pytest.raises(TypeError, match="data type 'foo' not understood"):
        parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})


@pytest.mark.parametrize("dtype", [
    "category",
    CategoricalDtype(),
    {"a": "category",
     "b": "category",
     "c": CategoricalDtype()}
])
def test_categorical_dtype(all_parsers, dtype):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame({"a": Categorical(["1", "1", "2"]),
                          "b": Categorical(["a", "a", "b"]),
                          "c": Categorical(["3.4", "3.4", "4.5"])})
    actual = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(actual, expected)
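

# Illustrative sketch (added for this review; not part of the suite, and
# it reuses the module's `pd` and `StringIO` imports): under
# dtype="category" every column is parsed as strings first and then
# wrapped in a Categorical, which is why the expected frames above hold
# string categories such as "3.4" rather than floats.
def _sketch_category_parses_strings():
    df = pd.read_csv(StringIO("x\n1\n1\n2"), dtype="category")
    return list(df["x"].cat.categories)  # ["1", "2"], not [1, 2]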


@pytest.mark.parametrize("dtype", [
    {"b": "category"},
    {1: "category"}
])
def test_categorical_dtype_single(all_parsers, dtype):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame({"a": [1, 1, 2],
                          "b": Categorical(["a", "a", "b"]),
                          "c": [3.4, 3.4, 4.5]})
    actual = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_unsorted(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
    expected = DataFrame({"a": Categorical(["1", "1", "2"]),
                          "b": Categorical(["b", "b", "a"]),
                          "c": Categorical(["3.4", "3.4", "4.5"])})
    actual = parser.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_missing(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
    expected = DataFrame({"a": Categorical(["1", "1", "2"]),
                          "b": Categorical(["b", np.nan, "a"]),
                          "c": Categorical(["3.4", "3.4", "4.5"])})
    actual = parser.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(actual, expected)


@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers):
    # see gh-18186
    parser = all_parsers
    data = np.sort([str(i) for i in range(524289)])
    expected = DataFrame({"a": Categorical(data, ordered=True)})

    actual = parser.read_csv(StringIO("a\n" + "\n".join(data)),
                             dtype="category")
    actual["a"] = actual["a"].cat.reorder_categories(
        np.sort(actual.a.cat.categories), ordered=True)
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
    # see gh-10153
    pth = os.path.join(csv_dir_path, "unicode_series.csv")
    parser = all_parsers
    encoding = "latin-1"

    expected = parser.read_csv(pth, header=None, encoding=encoding)
    expected[1] = Categorical(expected[1])

    actual = parser.read_csv(pth, header=None, encoding=encoding,
                             dtype={1: "category"})
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
    # see gh-10153
    pth = os.path.join(csv_dir_path, "utf16_ex.txt")
    parser = all_parsers
    encoding = "utf-16"
    sep = ","

    expected = parser.read_csv(pth, sep=sep, encoding=encoding)
    expected = expected.apply(Categorical)

    actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_infer_categories(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expecteds = [DataFrame({"a": [1, 1],
                            "b": Categorical(["a", "b"])}),
                 DataFrame({"a": [1, 2],
                            "b": Categorical(["b", "c"])},
                           index=[2, 3])]
    actuals = parser.read_csv(StringIO(data), dtype={"b": "category"},
                              chunksize=2)

    for actual, expected in zip(actuals, expecteds):
        tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    cats = ["a", "b", "c"]
    expecteds = [DataFrame({"a": [1, 1],
                            "b": Categorical(["a", "b"],
                                             categories=cats)}),
                 DataFrame({"a": [1, 2],
                            "b": Categorical(["b", "c"],
                                             categories=cats)},
                           index=[2, 3])]
    dtype = CategoricalDtype(cats)
    actuals = parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2)

    for actual, expected in zip(actuals, expecteds):
        tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize("categories", [
    ["a", "b", "c"],
    ["a", "c", "b"],
    ["a", "b", "c", "d"],
    ["c", "b", "a"],
])
def test_categorical_category_dtype(all_parsers, categories, ordered):
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expected = DataFrame({
        "a": [1, 1, 1, 2],
        "b": Categorical(["a", "b", "b", "c"],
                         categories=categories,
                         ordered=ordered)
    })

    dtype = {"b": CategoricalDtype(categories=categories,
                                   ordered=ordered)}
    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_category_dtype_unsorted(all_parsers):
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    dtype = CategoricalDtype(["c", "b", "a"])
    expected = DataFrame({
        "a": [1, 1, 1, 2],
        "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"])
    })

    result = parser.read_csv(StringIO(data), dtype={"b": dtype})
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_numeric(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype([1, 2, 3])}

    data = "b\n1\n1\n2\n3"
    expected = DataFrame({"b": Categorical([1, 1, 2, 3])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_datetime(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype(pd.date_range("2017", "2019", freq="AS"))}

    data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
    expected = DataFrame({"b": Categorical(dtype["b"].categories)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timestamp(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype([Timestamp("2014")])}

    data = "b\n2014-01-01\n2014-01-01T00:00:00"
    expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timedelta(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))}

    data = "b\n1H\n2H\n3H"
    expected = DataFrame({"b": Categorical(dtype["b"].categories)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data", [
    "b\nTrue\nFalse\nNA\nFalse",
    "b\ntrue\nfalse\nNA\nfalse",
    "b\nTRUE\nFALSE\nNA\nFALSE",
    "b\nTrue\nFalse\nNA\nFALSE",
])
def test_categorical_dtype_coerces_boolean(all_parsers, data):
    # see gh-20498
    parser = all_parsers
    dtype = {"b": CategoricalDtype([False, True])}
    expected = DataFrame({"b": Categorical([True, False, None, False])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_unexpected_categories(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}

    data = "b\nd\na\nc\nd"  # Unexpected c
    expected = DataFrame({"b": Categorical(list("dacd"),
                                           dtype=dtype["b"])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)
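

# Illustrative sketch (added for this review; not part of the suite, and
# it reuses the module's imports): values absent from an explicit
# CategoricalDtype come back as NaN rather than raising.
def _sketch_unexpected_value_becomes_nan():
    dtype = CategoricalDtype(["a", "b"])
    df = pd.read_csv(StringIO("x\na\nz"), dtype={"x": dtype})
    return df["x"].isna().tolist()  # [False, True]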


def test_empty_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two"
    result = parser.read_csv(StringIO(data), dtype={"one": "u1"})

    expected = DataFrame({"one": np.empty(0, dtype="u1"),
                          "two": np.empty(0, dtype=np.object)},
                         index=Index([], dtype=object))
    tm.assert_frame_equal(result, expected)


def test_empty_with_index_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two"
    result = parser.read_csv(StringIO(data), index_col=["one"],
                             dtype={"one": "u1", 1: "f"})

    expected = DataFrame({"two": np.empty(0, dtype="f")},
                         index=Index([], dtype="u1", name="one"))
    tm.assert_frame_equal(result, expected)


def test_empty_with_multi_index_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two,three"
    result = parser.read_csv(StringIO(data), index_col=["one", "two"],
                             dtype={"one": "u1", 1: "f8"})

    exp_idx = MultiIndex.from_arrays([np.empty(0, dtype="u1"),
                                      np.empty(0, dtype=np.float64)],
                                     names=["one", "two"])
    expected = DataFrame({"three": np.empty(0, dtype=np.object)},
                         index=exp_idx)
    tm.assert_frame_equal(result, expected)


def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
    parser = all_parsers

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"})

    expected = DataFrame({"one": np.empty(0, dtype="u1"),
                          "one.1": np.empty(0, dtype="f")},
                         index=Index([], dtype=object))
    tm.assert_frame_equal(result, expected)


def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
    parser = all_parsers

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})

    expected = DataFrame({"one": np.empty(0, dtype="u1"),
                          "one.1": np.empty(0, dtype="f")},
                         index=Index([], dtype=object))
    tm.assert_frame_equal(result, expected)


def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
    # see gh-9424
    parser = all_parsers
    expected = concat([Series([], name="one", dtype="u1"),
                       Series([], name="one.1", dtype="f")], axis=1)
    expected.index = expected.index.astype(object)

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
    tm.assert_frame_equal(result, expected)


def test_empty_with_dup_column_pass_dtype_by_indexes_warn(all_parsers):
    # see gh-9424
    parser = all_parsers
    expected = concat([Series([], name="one", dtype="u1"),
                       Series([], name="one.1", dtype="f")], axis=1)
    expected.index = expected.index.astype(object)

    with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
        data = ""
        result = parser.read_csv(StringIO(data), names=["one", "one"],
                                 dtype={0: "u1", 1: "f"})
        tm.assert_frame_equal(result, expected)


def test_raise_on_passed_int_dtype_with_nas(all_parsers):
    # see gh-2631
    parser = all_parsers
    data = """YEAR, DOY, a
2001,106380451,10
2001,,11
2001,106380451,67"""

    msg = ("Integer column has NA values" if parser.engine == "c" else
           "Unable to convert column DOY")
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), dtype={"DOY": np.int64},
                        skipinitialspace=True)


def test_dtype_with_converters(all_parsers):
    parser = all_parsers
    data = """a,b
1.1,2.2
1.2,2.3"""

    # The dtype spec is ignored if converters are specified.
    with tm.assert_produces_warning(ParserWarning):
        result = parser.read_csv(StringIO(data), dtype={"a": "i8"},
                                 converters={"a": lambda x: str(x)})
    expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype,expected", [
    (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
    ("category", DataFrame({"a": Categorical([]),
                            "b": Categorical([])},
                           index=[])),
    (dict(a="category", b="category"),
     DataFrame({"a": Categorical([]),
                "b": Categorical([])},
               index=[])),
    ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
    ("timedelta64[ns]", DataFrame({"a": Series([], dtype="timedelta64[ns]"),
                                   "b": Series([], dtype="timedelta64[ns]")},
                                  index=[])),
    (dict(a=np.int64,
          b=np.int32), DataFrame({"a": Series([], dtype=np.int64),
                                  "b": Series([], dtype=np.int32)},
                                 index=[])),
    ({0: np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64),
                                            "b": Series([], dtype=np.int32)},
                                           index=[])),
    ({"a": np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64),
                                              "b": Series([], dtype=np.int32)},
                                             index=[])),
])
def test_empty_dtype(all_parsers, dtype, expected):
    # see gh-14712
    parser = all_parsers
    data = "a,b"

    result = parser.read_csv(StringIO(data), header=0, dtype=dtype)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", list(np.typecodes["AllInteger"] +
                                       np.typecodes["Float"]))
def test_numeric_dtype(all_parsers, dtype):
    data = "0\n1"
    parser = all_parsers
    expected = DataFrame([0, 1], dtype=dtype)

    result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
    tm.assert_frame_equal(expected, result)
@@ -0,0 +1,428 @@
# -*- coding: utf-8 -*-

"""
Tests that the file header is properly handled or inferred
during parsing for all of the parsers defined in parsers.py
"""

from collections import namedtuple

import numpy as np
import pytest

from pandas.compat import StringIO, u
from pandas.errors import ParserError

from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm


def test_read_with_bad_header(all_parsers):
    parser = all_parsers
    msg = r"but only \d+ lines in file"

    with pytest.raises(ValueError, match=msg):
        s = StringIO(",,")
        parser.read_csv(s, header=[10])


@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(all_parsers, header):
    # see gh-6114
    parser = all_parsers
    data = """\
MyColumn
a
b
a
b"""
    msg = "Passing a bool to header is invalid"
    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), header=header)


def test_no_header_prefix(all_parsers):
    parser = all_parsers
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    result = parser.read_csv(StringIO(data), prefix="Field", header=None)
    expected = DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10],
                          [11, 12, 13, 14, 15]],
                         columns=["Field0", "Field1", "Field2",
                                  "Field3", "Field4"])
    tm.assert_frame_equal(result, expected)


def test_header_with_index_col(all_parsers):
    parser = all_parsers
    data = """foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    names = ["A", "B", "C"]
    result = parser.read_csv(StringIO(data), names=names)

    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         index=["foo", "bar", "baz"],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


def test_header_not_first_line(all_parsers):
    parser = all_parsers
    data = """got,to,ignore,this,line
got,to,ignore,this,line
index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
    data2 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""

    result = parser.read_csv(StringIO(data), header=2, index_col=0)
    expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
    tm.assert_frame_equal(result, expected)


def test_header_multi_index(all_parsers):
    parser = all_parsers
    expected = tm.makeCustomDataframe(
        5, 3, r_idx_nlevels=2, c_idx_nlevels=4)

    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3],
                             index_col=[0, 1])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs,msg", [
    (dict(index_col=["foo", "bar"]), ("index_col must only contain "
                                      "row numbers when specifying "
                                      "a multi-index header")),
    (dict(index_col=[0, 1], names=["foo", "bar"]), ("cannot specify names "
                                                    "when specifying a "
                                                    "multi-index header")),
    (dict(index_col=[0, 1], usecols=["foo", "bar"]), ("cannot specify "
                                                      "usecols when "
                                                      "specifying a "
                                                      "multi-index header")),
])
def test_header_multi_index_invalid(all_parsers, kwargs, msg):
    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)


_TestTuple = namedtuple("names", ["first", "second"])


@pytest.mark.parametrize("kwargs", [
    dict(header=[0, 1]),
    dict(skiprows=3,
         names=[("a", "q"), ("a", "r"), ("a", "s"),
                ("b", "t"), ("c", "u"), ("c", "v")]),
    dict(skiprows=3,
         names=[_TestTuple("a", "q"), _TestTuple("a", "r"),
                _TestTuple("a", "s"), _TestTuple("b", "t"),
                _TestTuple("c", "u"), _TestTuple("c", "v")])
])
def test_header_multi_index_common_format1(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
                         index=["one", "two"],
                         columns=MultiIndex.from_tuples(
                             [("a", "q"), ("a", "r"), ("a", "s"),
                              ("b", "t"), ("c", "u"), ("c", "v")]))
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [
    dict(header=[0, 1]),
    dict(skiprows=2,
         names=[("a", "q"), ("a", "r"), ("a", "s"),
                ("b", "t"), ("c", "u"), ("c", "v")]),
    dict(skiprows=2,
         names=[_TestTuple("a", "q"), _TestTuple("a", "r"),
                _TestTuple("a", "s"), _TestTuple("b", "t"),
                _TestTuple("c", "u"), _TestTuple("c", "v")])
])
def test_header_multi_index_common_format2(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
                         index=["one", "two"],
                         columns=MultiIndex.from_tuples(
                             [("a", "q"), ("a", "r"), ("a", "s"),
                              ("b", "t"), ("c", "u"), ("c", "v")]))
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [
    dict(header=[0, 1]),
    dict(skiprows=2,
         names=[("a", "q"), ("a", "r"), ("a", "s"),
                ("b", "t"), ("c", "u"), ("c", "v")]),
    dict(skiprows=2,
         names=[_TestTuple("a", "q"), _TestTuple("a", "r"),
                _TestTuple("a", "s"), _TestTuple("b", "t"),
                _TestTuple("c", "u"), _TestTuple("c", "v")])
])
def test_header_multi_index_common_format3(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
                         index=["one", "two"],
                         columns=MultiIndex.from_tuples(
                             [("a", "q"), ("a", "r"), ("a", "s"),
                              ("b", "t"), ("c", "u"), ("c", "v")]))
    expected = expected.reset_index(drop=True)
    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=None, **kwargs)
    tm.assert_frame_equal(result, expected)


def test_header_multi_index_common_format_malformed1(all_parsers):
    parser = all_parsers
    expected = DataFrame(np.array(
        [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
        index=Index([1, 7]),
        columns=MultiIndex(levels=[[u("a"), u("b"), u("c")],
                                   [u("r"), u("s"), u("t"),
                                    u("u"), u("v")]],
                           codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                           names=[u("a"), u("q")]))
    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(expected, result)


def test_header_multi_index_common_format_malformed2(all_parsers):
    parser = all_parsers
    expected = DataFrame(np.array(
        [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
        index=Index([1, 7]),
        columns=MultiIndex(levels=[[u("a"), u("b"), u("c")],
                                   [u("r"), u("s"), u("t"),
                                    u("u"), u("v")]],
                           codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                           names=[None, u("q")]))

    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(expected, result)


def test_header_multi_index_common_format_malformed3(all_parsers):
    parser = all_parsers
    expected = DataFrame(np.array(
        [[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"),
        index=MultiIndex(levels=[[1, 7], [2, 8]],
                         codes=[[0, 1], [0, 1]]),
        columns=MultiIndex(levels=[[u("a"), u("b"), u("c")],
                                   [u("s"), u("t"), u("u"), u("v")]],
                           codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
                           names=[None, u("q")]))
    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
    tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize("data,header", [
    ("1,2,3\n4,5,6", None),
    ("foo,bar,baz\n1,2,3\n4,5,6", 0),
])
def test_header_names_backward_compat(all_parsers, data, header):
    # see gh-2539
    parser = all_parsers
    expected = parser.read_csv(StringIO("1,2,3\n4,5,6"),
                               names=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), names=["a", "b", "c"],
                             header=header)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [
    dict(), dict(index_col=False)
])
def test_read_only_header_no_rows(all_parsers, kwargs):
    # see gh-7773
    parser = all_parsers
    expected = DataFrame(columns=["a", "b", "c"])

    result = parser.read_csv(StringIO("a,b,c"), **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs,names", [
    (dict(), [0, 1, 2, 3, 4]),
    (dict(prefix="X"), ["X0", "X1", "X2", "X3", "X4"]),
    (dict(names=["foo", "bar", "baz", "quux", "panda"]),
     ["foo", "bar", "baz", "quux", "panda"])
])
def test_no_header(all_parsers, kwargs, names):
    parser = all_parsers
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    expected = DataFrame([[1, 2, 3, 4, 5],
                          [6, 7, 8, 9, 10],
                          [11, 12, 13, 14, 15]], columns=names)
    result = parser.read_csv(StringIO(data), header=None, **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("header", [
    ["a", "b"],
    "string_header"
])
def test_non_int_header(all_parsers, header):
    # see gh-16338
    msg = "header must be integer or list of integers"
    data = """1,2\n3,4"""
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=header)


def test_singleton_header(all_parsers):
    # see gh-7757
    data = """a,b,c\n0,1,2\n1,2,3"""
    parser = all_parsers

    expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
    result = parser.read_csv(StringIO(data), header=[0])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,expected", [
    ("A,A,A,B\none,one,one,two\n0,40,34,0.1",
     DataFrame([[0, 40, 34, 0.1]],
               columns=MultiIndex.from_tuples(
                   [("A", "one"), ("A", "one.1"),
                    ("A", "one.2"), ("B", "two")]))),
    ("A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
     DataFrame([[0, 40, 34, 0.1]],
               columns=MultiIndex.from_tuples(
                   [("A", "one"), ("A", "one.1"),
                    ("A", "one.1.1"), ("B", "two")]))),
    ("A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
     DataFrame([[0, 40, 34, 0.1, 0.1]],
               columns=MultiIndex.from_tuples(
                   [("A", "one"), ("A", "one.1"),
                    ("A", "one.1.1"), ("B", "two"),
                    ("B", "two.1")])))
])
def test_mangles_multi_index(all_parsers, data, expected):
    # see gh-18062
    parser = all_parsers

    result = parser.read_csv(StringIO(data), header=[0, 1])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_col", [None, [0]])
@pytest.mark.parametrize("columns", [None,
                                     (["", "Unnamed"]),
                                     (["Unnamed", ""]),
                                     (["Unnamed", "NotUnnamed"])])
def test_multi_index_unnamed(all_parsers, index_col, columns):
    # see gh-23687
    #
    # When specifying a multi-index header, make sure that
    # we don't error just because one of the rows in our header
    # has ALL column names containing the string "Unnamed". The
    # correct condition to check is whether the row contains
    # ALL columns that did not have names (and instead were given
    # placeholder ones).
    parser = all_parsers
    header = [0, 1]

    if index_col is None:
        data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
    else:
        data = (",".join([""] + (columns or ["", ""])) +
                "\n,0,1\n0,2,3\n1,4,5\n")

    if columns is None:
        msg = (r"Passed header=\[0,1\] are too "
               r"many rows for this multi_index of columns")
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), header=header,
                            index_col=index_col)
    else:
        result = parser.read_csv(StringIO(data), header=header,
                                 index_col=index_col)
        template = "Unnamed: {i}_level_0"
        exp_columns = []

        for i, col in enumerate(columns):
            if not col:  # Unnamed.
                col = template.format(i=i if index_col is None else i + 1)

            exp_columns.append(col)

        columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
        expected = DataFrame([[2, 3], [4, 5]], columns=columns)
        tm.assert_frame_equal(result, expected)
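

# Illustrative sketch (added for this review; not part of the suite):
# nameless entries in a multi-row header receive "Unnamed: {i}_level_{j}"
# placeholders, and a header row is only rejected when every one of its
# entries is such a placeholder -- the condition test_multi_index_unnamed
# exercises above.
def _sketch_unnamed_placeholders():
    import pandas as pd

    data = ",Unnamed\n0,1\n2,3"
    df = pd.read_csv(StringIO(data), header=[0, 1])
    return list(df.columns)  # [("Unnamed: 0_level_0", "0"), ("Unnamed", "1")]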
@@ -0,0 +1,152 @@
# -*- coding: utf-8 -*-

"""
Tests that the specified index column (a.k.a "index_col")
is properly handled or inferred during parsing for all of
the parsers defined in parsers.py
"""

import pytest

from pandas.compat import StringIO

from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm


@pytest.mark.parametrize("with_header", [True, False])
def test_index_col_named(all_parsers, with_header):
    parser = all_parsers
    no_header = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""  # noqa
    header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"  # noqa

    if with_header:
        data = header + no_header

        result = parser.read_csv(StringIO(data), index_col="ID")
        expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
        tm.assert_frame_equal(result, expected)
    else:
        data = no_header
        msg = "Index ID invalid"

        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), index_col="ID")


def test_index_col_named2(all_parsers):
    parser = all_parsers
    data = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""

    expected = DataFrame({"a": [1, 5, 9], "b": [2, 6, 10],
                          "c": [3, 7, 11], "d": [4, 8, 12]},
                         index=Index(["hello", "world", "foo"],
                                     name="message"))
    names = ["a", "b", "c", "d", "message"]

    result = parser.read_csv(StringIO(data), names=names,
                             index_col=["message"])
    tm.assert_frame_equal(result, expected)


def test_index_col_is_true(all_parsers):
    # see gh-9798
    data = "a,b\n1,2"
    parser = all_parsers

    with pytest.raises(ValueError, match="The value of index_col "
                                         "couldn't be 'True'"):
        parser.read_csv(StringIO(data), index_col=True)


def test_infer_index_col(all_parsers):
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))

    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         index=["foo", "bar", "baz"],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)
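

# Illustrative sketch (added for this review; not part of the suite):
# when the header row has one fewer field than the data rows, read_csv
# infers that the leading unnamed column is the index, which is exactly
# what test_infer_index_col checks above.
def _sketch_implicit_index_col():
    import pandas as pd

    df = pd.read_csv(StringIO("A,B\nx,1,2\ny,3,4"))
    return list(df.index)  # ["x", "y"]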


@pytest.mark.parametrize("index_col,kwargs", [
    (None, dict(columns=["x", "y", "z"])),
    (False, dict(columns=["x", "y", "z"])),
    (0, dict(columns=["y", "z"], index=Index([], name="x"))),
    (1, dict(columns=["x", "z"], index=Index([], name="y"))),
    ("x", dict(columns=["y", "z"], index=Index([], name="x"))),
    ("y", dict(columns=["x", "z"], index=Index([], name="y"))),
    ([0, 1], dict(columns=["z"], index=MultiIndex.from_arrays(
        [[]] * 2, names=["x", "y"]))),
    (["x", "y"], dict(columns=["z"], index=MultiIndex.from_arrays(
        [[]] * 2, names=["x", "y"]))),
    ([1, 0], dict(columns=["z"], index=MultiIndex.from_arrays(
        [[]] * 2, names=["y", "x"]))),
    (["y", "x"], dict(columns=["z"], index=MultiIndex.from_arrays(
        [[]] * 2, names=["y", "x"]))),
])
def test_index_col_empty_data(all_parsers, index_col, kwargs):
    data = "x,y,z"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=index_col)

    expected = DataFrame([], **kwargs)
    tm.assert_frame_equal(result, expected)


def test_empty_with_index_col_false(all_parsers):
    # see gh-10413
    data = "x,y"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=False)

    expected = DataFrame([], columns=["x", "y"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_names", [
    ["", ""],
    ["foo", ""],
    ["", "bar"],
    ["foo", "bar"],
    ["NotReallyUnnamed", "Unnamed: 0"],
])
def test_multi_index_naming(all_parsers, index_names):
    parser = all_parsers

    # We don't want empty index names being replaced with "Unnamed: 0"
    data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
    result = parser.read_csv(StringIO(data), index_col=[0, 1])

    expected = DataFrame({"col": [1, 2, 3, 4]},
                         index=MultiIndex.from_product([["a", "b"],
                                                        ["c", "d"]]))
    expected.index.names = [name if name else None for name in index_names]
    tm.assert_frame_equal(result, expected)


def test_multi_index_naming_not_all_at_beginning(all_parsers):
    parser = all_parsers
    data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
    result = parser.read_csv(StringIO(data), index_col=[0, 2])

    expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]},
                         index=MultiIndex(
                             levels=[['a', 'b'], [1, 2, 3, 4]],
                             codes=[[0, 0, 1, 1], [0, 1, 2, 3]]))
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-

"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. In general, the expected result is that they are either thoroughly
de-duplicated (if mangling requested) or ignored otherwise.
"""

import pytest

from pandas.compat import StringIO

from pandas import DataFrame
import pandas.util.testing as tm


@pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)])
def test_basic(all_parsers, kwargs):
    # TODO: add test for condition "mangle_dupe_cols=False"
    # once it is actually supported (gh-12935)
    parser = all_parsers

    data = "a,a,b,b,b\n1,2,3,4,5"
    result = parser.read_csv(StringIO(data), sep=",", **kwargs)

    expected = DataFrame([[1, 2, 3, 4, 5]],
                         columns=["a", "a.1", "b", "b.1", "b.2"])
    tm.assert_frame_equal(result, expected)


def test_basic_names(all_parsers):
    # See gh-7160
    parser = all_parsers

    data = "a,b,a\n0,1,2\n3,4,5"
    expected = DataFrame([[0, 1, 2], [3, 4, 5]],
                         columns=["a", "b", "a.1"])

    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_basic_names_warn(all_parsers):
    # See gh-7160
    parser = all_parsers

    data = "0,1,2\n3,4,5"
    expected = DataFrame([[0, 1, 2], [3, 4, 5]],
                         columns=["a", "b", "a.1"])

    with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
        result = parser.read_csv(StringIO(data), names=["a", "b", "a"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,expected", [
    ("a,a,a.1\n1,2,3",
     DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])),
    ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
     DataFrame([[1, 2, 3, 4, 5, 6]], columns=["a", "a.1", "a.1.1", "a.1.1.1",
                                              "a.1.1.1.1", "a.1.1.1.1.1"])),
    ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
     DataFrame([[1, 2, 3, 4, 5, 6, 7]], columns=["a", "a.1", "a.3", "a.1.1",
                                                 "a.2", "a.2.1", "a.3.1"]))
])
def test_thorough_mangle_columns(all_parsers, data, expected):
    # see gh-17060
    parser = all_parsers

    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)
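

# Illustrative sketch (added for this review; not part of the suite):
# duplicate headers are mangled left to right by appending ".1", ".2",
# and so on; when a mangled name would itself collide with a name seen
# later, the suffixing continues (gh-17060).
def _sketch_mangle_dupes():
    import pandas as pd

    df = pd.read_csv(StringIO("a,a,a\n1,2,3"))
    return list(df.columns)  # ["a", "a.1", "a.2"]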


@pytest.mark.parametrize("data,names,expected", [
    ("a,b,b\n1,2,3",
     ["a.1", "a.1", "a.1.1"],
     DataFrame([["a", "b", "b"], ["1", "2", "3"]],
               columns=["a.1", "a.1.1", "a.1.1.1"])),
    ("a,b,c,d,e,f\n1,2,3,4,5,6",
     ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
     DataFrame([["a", "b", "c", "d", "e", "f"],
                ["1", "2", "3", "4", "5", "6"]],
               columns=["a", "a.1", "a.1.1", "a.1.1.1",
                        "a.1.1.1.1", "a.1.1.1.1.1"])),
    ("a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
     ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
     DataFrame([["a", "b", "c", "d", "e", "f", "g"],
                ["1", "2", "3", "4", "5", "6", "7"]],
               columns=["a", "a.1", "a.3", "a.1.1",
                        "a.2", "a.2.1", "a.3.1"])),
])
def test_thorough_mangle_names(all_parsers, data, names, expected):
    # see gh-17095
    parser = all_parsers

    with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
        result = parser.read_csv(StringIO(data), names=names)
    tm.assert_frame_equal(result, expected)


def test_mangled_unnamed_placeholders(all_parsers):
    # xref gh-13017
    orig_key = "0"
    parser = all_parsers

    orig_value = [1, 2, 3]
    df = DataFrame({orig_key: orig_value})

    # This test recursively updates `df`.
    for i in range(3):
        expected = DataFrame()

        for j in range(i + 1):
            expected["Unnamed: 0" + ".1" * j] = [0, 1, 2]

        expected[orig_key] = orig_value
        df = parser.read_csv(StringIO(df.to_csv()))

    tm.assert_frame_equal(df, expected)
@@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-

"""
Tests multithreading behaviour for reading and
parsing files for each parser defined in parsers.py
"""

from __future__ import division

from multiprocessing.pool import ThreadPool

import numpy as np

from pandas.compat import BytesIO, range

import pandas as pd
from pandas import DataFrame
import pandas.util.testing as tm


def _construct_dataframe(num_rows):
    """
    Construct a DataFrame for testing.

    Parameters
    ----------
    num_rows : int
        The number of rows for our DataFrame.

    Returns
    -------
    df : DataFrame
    """
    df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde"))
    df["foo"] = "foo"
    df["bar"] = "bar"
    df["baz"] = "baz"
    df["date"] = pd.date_range("20000101 09:00:00",
                               periods=num_rows,
                               freq="s")
    df["int"] = np.arange(num_rows, dtype="int64")
    return df


def test_multi_thread_string_io_read_csv(all_parsers):
    # see gh-11786
    parser = all_parsers
    max_row_range = 10000
    num_files = 100

    bytes_to_df = [
        "\n".join(
            ["%d,%d,%d" % (i, i, i) for i in range(max_row_range)]
        ).encode() for _ in range(num_files)]
    files = [BytesIO(b) for b in bytes_to_df]

    # Read all files in many threads.
    pool = ThreadPool(8)

    results = pool.map(parser.read_csv, files)
    first_result = results[0]

    for result in results:
        tm.assert_frame_equal(first_result, result)
|
||||
|
||||
|
||||
def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
|
||||
"""
|
||||
Generate a DataFrame via multi-thread.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
parser : BaseParser
|
||||
The parser object to use for reading the data.
|
||||
path : str
|
||||
The location of the CSV file to read.
|
||||
num_rows : int
|
||||
The number of rows to read per task.
|
||||
num_tasks : int
|
||||
The number of tasks to use for reading this DataFrame.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
"""
|
||||
def reader(arg):
|
||||
"""
|
||||
Create a reader for part of the CSV.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arg : tuple
|
||||
A tuple of the following:
|
||||
|
||||
* start : int
|
||||
                The row at which to start parsing the CSV.
            * nrows : int
                The number of rows to read.

        Returns
        -------
        df : DataFrame
        """
        start, nrows = arg

        if not start:
            return parser.read_csv(path, index_col=0, header=0,
                                   nrows=nrows, parse_dates=["date"])

        return parser.read_csv(path, index_col=0, header=None,
                               skiprows=int(start) + 1,
                               nrows=nrows, parse_dates=[9])

    tasks = [
        (num_rows * i // num_tasks,
         num_rows // num_tasks) for i in range(num_tasks)
    ]

    pool = ThreadPool(processes=num_tasks)
    results = pool.map(reader, tasks)

    header = results[0].columns

    for r in results[1:]:
        r.columns = header

    final_dataframe = pd.concat(results)
    return final_dataframe


def test_multi_thread_path_multipart_read_csv(all_parsers):
    # see gh-11786
    num_tasks = 4
    num_rows = 100000

    parser = all_parsers
    file_name = "__thread_pool_reader__.csv"
    df = _construct_dataframe(num_rows)

    with tm.ensure_clean(file_name) as path:
        df.to_csv(path)

        final_dataframe = _generate_multi_thread_dataframe(parser, path,
                                                           num_rows, num_tasks)
        tm.assert_frame_equal(df, final_dataframe)
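

# A quick standalone check of the (start, nrows) partitioning scheme used by
# _generate_multi_thread_dataframe above (illustrative only): each task reads
# num_rows // num_tasks rows from an evenly spaced offset, so any remainder
# rows at the end of the file are simply not assigned to a task.
def _partition_rows_sketch(num_rows, num_tasks):
    return [(num_rows * i // num_tasks, num_rows // num_tasks)
            for i in range(num_tasks)]


# _partition_rows_sketch(100000, 4)
# -> [(0, 25000), (25000, 25000), (50000, 25000), (75000, 25000)]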
@@ -0,0 +1,441 @@
# -*- coding: utf-8 -*-

"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""

import numpy as np
import pytest

from pandas.compat import StringIO, range

from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm

import pandas.io.common as com


def test_string_nas(all_parsers):
    parser = all_parsers
    data = """A,B,C
a,b,c
d,,f
,g,h
"""
    result = parser.read_csv(StringIO(data))
    expected = DataFrame([["a", "b", "c"],
                          ["d", np.nan, "f"],
                          [np.nan, "g", "h"]],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


def test_detect_string_na(all_parsers):
    parser = all_parsers
    data = """A,B
foo,bar
NA,baz
NaN,nan
"""
    expected = DataFrame([["foo", "bar"], [np.nan, "baz"],
                          [np.nan, np.nan]], columns=["A", "B"])
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("na_values", [
    ["-999.0", "-999"],
    [-999, -999.0],
    [-999.0, -999],
    ["-999.0"], ["-999"],
    [-999.0], [-999]
])
@pytest.mark.parametrize("data", [
    """A,B
-999,1.2
2,-999
3,4.5
""",
    """A,B
-999,1.200
2,-999.000
3,4.500
"""
])
def test_non_string_na_values(all_parsers, data, na_values):
    # see gh-3611: with an odd float format, we can't match
    # the string "-999.0" exactly but still need float matching
    parser = all_parsers
    expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
                          [3.0, 4.5]], columns=["A", "B"])

    result = parser.read_csv(StringIO(data), na_values=na_values)
    tm.assert_frame_equal(result, expected)


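# The gh-3611 cases above work because each na_values entry is expanded into
# both string and float forms before matching, so "-999" also catches
# "-999.000" once that token parses as the float -999.0. A rough sketch of
# the expansion (illustrative helper, not the parser's internal code):
def _expand_na_values_sketch(values):
    expanded = set(values)
    for value in values:
        try:
            expanded.add(float(value))  # match after numeric conversion
        except (TypeError, ValueError):
            continue
        expanded.add(str(value))        # match the literal token as well
    return expanded


# _expand_na_values_sketch(["-999.0", "-999"]) -> {"-999.0", "-999", -999.0}

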
def test_default_na_values(all_parsers):
    _NA_VALUES = {"-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A",
                  "N/A", "n/a", "NA", "#NA", "NULL", "null", "NaN", "nan",
                  "-NaN", "-nan", "#N/A N/A", ""}
    assert _NA_VALUES == com._NA_VALUES

    parser = all_parsers
    nv = len(_NA_VALUES)

    def f(i, v):
        if i == 0:
            buf = ""
        elif i > 0:
            buf = "".join([","] * i)

        buf = "{0}{1}".format(buf, v)

        if i < nv - 1:
            buf = "{0}{1}".format(buf, "".join([","] * (nv - i - 1)))

        return buf

    data = StringIO("\n".join(f(i, v) for i, v in enumerate(_NA_VALUES)))
    expected = DataFrame(np.nan, columns=range(nv), index=range(nv))

    result = parser.read_csv(data, header=None)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
def test_custom_na_values(all_parsers, na_values):
    parser = all_parsers
    data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
    expected = DataFrame([[1., np.nan, 3], [np.nan, 5, np.nan],
                          [7, 8, np.nan]], columns=["A", "B", "C"])
    result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
    tm.assert_frame_equal(result, expected)


def test_bool_na_values(all_parsers):
    data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))
    expected = DataFrame({"A": np.array([True, np.nan, False], dtype=object),
                          "B": np.array([False, True, np.nan], dtype=object),
                          "C": [True, False, True]})
    tm.assert_frame_equal(result, expected)


def test_na_value_dict(all_parsers):
    data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
    parser = all_parsers
    df = parser.read_csv(StringIO(data),
                         na_values={"A": ["foo"], "B": ["bar"]})
    expected = DataFrame({"A": [np.nan, "bar", np.nan, "bar"],
                          "B": [np.nan, "foo", np.nan, "foo"],
                          "C": [np.nan, "foo", np.nan, "foo"]})
    tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("index_col,expected", [
    ([0], DataFrame({"b": [np.nan], "c": [1], "d": [5]},
                    index=Index([0], name="a"))),
    ([0, 2], DataFrame({"b": [np.nan], "d": [5]},
                       index=MultiIndex.from_tuples(
                           [(0, 1)], names=["a", "c"]))),
    (["a", "c"], DataFrame({"b": [np.nan], "d": [5]},
                           index=MultiIndex.from_tuples(
                               [(0, 1)], names=["a", "c"]))),
])
def test_na_value_dict_multi_index(all_parsers, index_col, expected):
    data = """\
a,b,c,d
0,NA,1,5
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), na_values=set(),
                             index_col=index_col)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs,expected", [
    (dict(), DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                        "B": [1, 2, 3, 4, 5, 6, 7],
                        "C": ["one", "two", "three", np.nan, "five",
                              np.nan, "seven"]})),
    (dict(na_values={"A": [], "C": []}, keep_default_na=False),
     DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"],
                "B": [1, 2, 3, 4, 5, 6, 7],
                "C": ["one", "two", "three", "nan", "five", "", "seven"]})),
    (dict(na_values=["a"], keep_default_na=False),
     DataFrame({"A": [np.nan, "b", "", "d", "e", "nan", "g"],
                "B": [1, 2, 3, 4, 5, 6, 7],
                "C": ["one", "two", "three", "nan", "five", "", "seven"]})),
    (dict(na_values={"A": [], "C": []}),
     DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                "B": [1, 2, 3, 4, 5, 6, 7],
                "C": ["one", "two", "three", np.nan,
                      "five", np.nan, "seven"]})),
])
def test_na_values_keep_default(all_parsers, kwargs, expected):
    data = """\
A,B,C
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


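# One rule drives the parametrized cases above: the effective NA set is the
# user-supplied na_values, unioned with the default sentinels only when
# keep_default_na is True. A one-function sketch of that rule (hypothetical
# helper; for dict inputs the parser applies it per column):
def _effective_na_values_sketch(na_values, keep_default_na):
    effective = set(na_values or [])
    if keep_default_na:
        effective |= com._NA_VALUES  # "NA", "NaN", "null", "", ...
    return effective


# _effective_na_values_sketch([], keep_default_na=False) -> set()
# (i.e. NA detection is fully disabled for that column)

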
def test_no_na_values_no_keep_default(all_parsers):
    # see gh-4318: passing na_values=None and
    # keep_default_na=False yields "None" as an na_value
    data = """\
A,B,C
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), keep_default_na=False)

    expected = DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"],
                          "B": [1, 2, 3, 4, 5, 6, 7],
                          "C": ["None", "two", "None", "nan",
                                "five", "", "seven"]})
    tm.assert_frame_equal(result, expected)


def test_no_keep_default_na_dict_na_values(all_parsers):
    # see gh-19227
    data = "a,b\n,2"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), na_values={"b": ["2"]},
                             keep_default_na=False)
    expected = DataFrame({"a": [""], "b": [np.nan]})
    tm.assert_frame_equal(result, expected)


def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
    # see gh-19227
    #
    # Scalar values shouldn't cause the parsing to crash or fail.
    data = "a,b\n1,2"
    parser = all_parsers
    df = parser.read_csv(StringIO(data), na_values={"b": 2},
                         keep_default_na=False)
    expected = DataFrame({"a": [1], "b": [np.nan]})
    tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("col_zero_na_values", [
    113125, "113125"
])
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers,
                                                      col_zero_na_values):
    # see gh-19227
    data = """\
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
729639,"qwer","",asdfkj,466.681,,252.373
"""
    parser = all_parsers
    expected = DataFrame({0: [np.nan, 729639.0],
                          1: [np.nan, "qwer"],
                          2: ["/blaha", np.nan],
                          3: ["kjsdkj", "asdfkj"],
                          4: [412.166, 466.681],
                          5: ["225.874", ""],
                          6: [np.nan, 252.373]})

    result = parser.read_csv(StringIO(data), header=None,
                             keep_default_na=False,
                             na_values={2: "", 6: "214.008",
                                        1: "blah", 0: col_zero_na_values})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("na_filter,row_data", [
    (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
    (False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
])
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
    data = """\
A,B
1,A
nan,B
3,C
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), na_values=["B"],
                             na_filter=na_filter)

    expected = DataFrame(row_data, columns=["A", "B"])
    tm.assert_frame_equal(result, expected)


def test_na_trailing_columns(all_parsers):
    parser = all_parsers
    data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
2012-03-14,USD,AAPL,BUY,1000
2012-05-12,USD,SBUX,SELL,500"""

    # Trailing columns should be all NaN.
    result = parser.read_csv(StringIO(data))
    expected = DataFrame([
        ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
        ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
    ], columns=["Date", "Currency", "Symbol", "Type",
                "Units", "UnitPrice", "Cost", "Tax"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("na_values,row_data", [
    (1, [[np.nan, 2.0], [2.0, np.nan]]),
    ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
])
def test_na_values_scalar(all_parsers, na_values, row_data):
    # see gh-12224
    parser = all_parsers
    names = ["a", "b"]
    data = "1,2\n2,1"

    result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
    expected = DataFrame(row_data, columns=names)
    tm.assert_frame_equal(result, expected)


def test_na_values_dict_aliasing(all_parsers):
    parser = all_parsers
    na_values = {"a": 2, "b": 1}
    na_values_copy = na_values.copy()

    names = ["a", "b"]
    data = "1,2\n2,1"

    expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
    result = parser.read_csv(StringIO(data), names=names, na_values=na_values)

    tm.assert_frame_equal(result, expected)
    tm.assert_dict_equal(na_values, na_values_copy)


def test_na_values_dict_col_index(all_parsers):
    # see gh-14203
    data = "a\nfoo\n1"
    parser = all_parsers
    na_values = {0: "foo"}

    result = parser.read_csv(StringIO(data), na_values=na_values)
    expected = DataFrame({"a": [np.nan, 1]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,kwargs,expected", [
    (str(2**63) + "\n" + str(2**63 + 1),
     dict(na_values=[2**63]), DataFrame([str(2**63), str(2**63 + 1)])),
    (str(2**63) + ",1" + "\n,2",
     dict(), DataFrame([[str(2**63), 1], ["", 2]])),
    (str(2**63) + "\n1",
     dict(na_values=[2**63]), DataFrame([np.nan, 1])),
])
def test_na_values_uint64(all_parsers, data, kwargs, expected):
    # see gh-14983
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=None, **kwargs)
    tm.assert_frame_equal(result, expected)


def test_empty_na_values_no_default_with_index(all_parsers):
    # see gh-15835
    data = "a,1\nb,2"
    parser = all_parsers
    expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))

    result = parser.read_csv(StringIO(data), index_col=0,
                             keep_default_na=False)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("na_filter,index_data", [
    (False, ["", "5"]),
    (True, [np.nan, 5.0]),
])
def test_no_na_filter_on_index(all_parsers, na_filter, index_data):
    # see gh-5239
    #
    # Don't parse NA-values in index unless na_filter=True
    parser = all_parsers
    data = "a,b,c\n1,,3\n4,5,6"

    expected = DataFrame({"a": [1, 4], "c": [3, 6]},
                         index=Index(index_data, name="b"))
    result = parser.read_csv(StringIO(data), index_col=[1],
                             na_filter=na_filter)
    tm.assert_frame_equal(result, expected)


def test_inf_na_values_with_int_index(all_parsers):
    # see gh-17128
    parser = all_parsers
    data = "idx,col1,col2\n1,3,4\n2,inf,-inf"

    # Don't fail with OverflowError with inf's and integer index column.
    out = parser.read_csv(StringIO(data), index_col=[0],
                          na_values=["inf", "-inf"])
    expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
                         index=Index([1, 2], name="idx"))
    tm.assert_frame_equal(out, expected)


@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
    # see gh-20377
    parser = all_parsers
    data = "a,b,c\n1,,3\n4,5,6"

    # na_filter=True --> missing value becomes NaN.
    # na_filter=False --> missing value remains empty string.
    empty = np.nan if na_filter else ""
    expected = DataFrame({"a": ["1", "4"],
                          "b": [empty, "5"],
                          "c": ["3", "6"]})

    result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data, na_values", [
    ("false,1\n,1\ntrue", None),
    ("false,1\nnull,1\ntrue", None),
    ("false,1\nnan,1\ntrue", None),
    ("false,1\nfoo,1\ntrue", 'foo'),
    ("false,1\nfoo,1\ntrue", ['foo']),
    ("false,1\nfoo,1\ntrue", {'a': 'foo'}),
])
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
    parser = all_parsers
    msg = ("(Bool column has NA values in column [0a])|"
           "(cannot safely convert passed user dtype of "
           "bool for object dtyped data in column 0)")
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=None, names=['a', 'b'],
                        dtype={'a': 'bool'}, na_values=na_values)
@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-

"""
Tests parsers ability to read and parse non-local files
and hence require a network connection to be read.
"""
import logging

import numpy as np
import pytest

from pandas.compat import BytesIO, StringIO
import pandas.util._test_decorators as td

from pandas import DataFrame
import pandas.util.testing as tm

from pandas.io.parsers import read_csv


@pytest.mark.network
@pytest.mark.parametrize(
    "compress_type, extension", [
        ('gzip', '.gz'), ('bz2', '.bz2'), ('zip', '.zip'),
        pytest.param('xz', '.xz', marks=td.skip_if_no_lzma)
    ]
)
@pytest.mark.parametrize('mode', ['explicit', 'infer'])
@pytest.mark.parametrize('engine', ['python', 'c'])
def test_compressed_urls(salaries_table, compress_type, extension, mode,
                         engine):
    check_compressed_urls(salaries_table, compress_type, extension, mode,
                          engine)


@tm.network
def check_compressed_urls(salaries_table, compression, extension, mode,
                          engine):
    # test reading compressed urls with various engines and
    # extension inference
    base_url = ('https://github.com/pandas-dev/pandas/raw/master/'
                'pandas/tests/io/parser/data/salaries.csv')

    url = base_url + extension

    if mode != 'explicit':
        compression = mode

    url_table = read_csv(url, sep='\t', compression=compression, engine=engine)
    tm.assert_frame_equal(url_table, salaries_table)


@pytest.fixture
def tips_df(datapath):
    """DataFrame with the tips dataset."""
    return read_csv(datapath('io', 'parser', 'data', 'tips.csv'))


@pytest.mark.usefixtures("s3_resource")
@td.skip_if_not_us_locale()
class TestS3(object):

    def test_parse_public_s3_bucket(self, tips_df):
        pytest.importorskip('s3fs')

        # More of an integration test due to the not-public contents portion
        # below; that part could probably be mocked instead.
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' +
                          ext, compression=comp)
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

        # Read public file from bucket with not-public contents
        df = read_csv('s3://cant_get_it/tips.csv')
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3n_bucket(self, tips_df):

        # Read from AWS s3 as "s3n" URL
        df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3a_bucket(self, tips_df):
        # Read from AWS s3 as "s3a" URL
        df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_nrows(self, tips_df):
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' +
                          ext, nrows=10, compression=comp)
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_chunked(self, tips_df):
        # Read with a chunksize
        chunksize = 5
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                 chunksize=chunksize, compression=comp)
            assert df_reader.chunksize == chunksize
            for i_chunk in [0, 1, 2]:
                # Read a couple of chunks and make sure we see them
                # properly.
                df = df_reader.get_chunk()
                assert isinstance(df, DataFrame)
                assert not df.empty
                true_df = tips_df.iloc[
                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
                tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_chunked_python(self, tips_df):
        # Read with a chunksize using the Python parser
        chunksize = 5
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                 chunksize=chunksize, compression=comp,
                                 engine='python')
            assert df_reader.chunksize == chunksize
            for i_chunk in [0, 1, 2]:
                # Read a couple of chunks and make sure we see them properly.
                df = df_reader.get_chunk()
                assert isinstance(df, DataFrame)
                assert not df.empty
                true_df = tips_df.iloc[
                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
                tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_python(self, tips_df):
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                          compression=comp)
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_infer_s3_compression(self, tips_df):
        for ext in ['', '.gz', '.bz2']:
            df = read_csv('s3://pandas-test/tips.csv' + ext,
                          engine='python', compression='infer')
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3_bucket_nrows_python(self, tips_df):
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                          nrows=10, compression=comp)
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_s3_fails(self):
        with pytest.raises(IOError):
            read_csv('s3://nyqpug/asdf.csv')

        # Receive a permission error when trying to read a private bucket.
        # It's irrelevant here that this isn't actually a table.
        with pytest.raises(IOError):
            read_csv('s3://cant_get_it/')

    def test_read_csv_handles_boto_s3_object(self,
                                             s3_resource,
                                             tips_file):
        # see gh-16135

        s3_object = s3_resource.meta.client.get_object(
            Bucket='pandas-test',
            Key='tips.csv')

        result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8')
        assert isinstance(result, DataFrame)
        assert not result.empty

        expected = read_csv(tips_file)
        tm.assert_frame_equal(result, expected)

    def test_read_csv_chunked_download(self, s3_resource, caplog):
        # 8 MB file; s3fs uses 5 MB chunks.
        df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
        buf = BytesIO()
        str_buf = StringIO()

        df.to_csv(str_buf)

        buf = BytesIO(str_buf.getvalue().encode('utf-8'))

        s3_resource.Bucket("pandas-test").put_object(
            Key="large-file.csv",
            Body=buf)

        with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
            read_csv("s3://pandas-test/large-file.csv", nrows=5)
            # log of fetch_range (start, stop)
            assert ((0, 5505024) in {x.args[-2:] for x in caplog.records})
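

# test_infer_s3_compression above relies on extension-based inference; a
# minimal sketch of that mapping (hypothetical helper, pandas keeps its own
# table internally):
def _infer_compression_sketch(path):
    for ext, comp in (('.gz', 'gzip'), ('.bz2', 'bz2'),
                      ('.zip', 'zip'), ('.xz', 'xz')):
        if path.endswith(ext):
            return comp
    return None  # plain file such as "tips.csv": no decompression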
@@ -0,0 +1,849 @@
# -*- coding: utf-8 -*-

"""
Tests date parsing functionality for all of the
parsers defined in parsers.py
"""

from datetime import date, datetime

import numpy as np
import pytest
import pytz

from pandas._libs.tslib import Timestamp
from pandas._libs.tslibs import parsing
from pandas.compat import StringIO, lrange, parse_date
from pandas.compat.numpy import np_array_datetime64_compat

import pandas as pd
from pandas import DataFrame, DatetimeIndex, Index, MultiIndex
from pandas.core.indexes.datetimes import date_range
import pandas.util.testing as tm

import pandas.io.date_converters as conv
import pandas.io.parsers as parsers


def test_separator_date_conflict(all_parsers):
    # Regression test for gh-4678
    #
    # Make sure thousands separator and
    # date parsing do not conflict.
    parser = all_parsers
    data = "06-02-2013;13:00;1-000.215"
    expected = DataFrame([[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
                         columns=["Date", 2])

    df = parser.read_csv(StringIO(data), sep=";", thousands="-",
                         parse_dates={"Date": [0, 1]}, header=None)
    tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col_custom(all_parsers, keep_date_col):
    data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    parser = all_parsers

    def date_parser(*date_cols):
        """
        Test date parser.

        Parameters
        ----------
        date_cols : args
            The list of data columns to parse.

        Returns
        -------
        parsed : Series
        """
        return parsing.try_parse_dates(parsers._concat_date_cols(date_cols))

    result = parser.read_csv(StringIO(data), header=None,
                             date_parser=date_parser, prefix="X",
                             parse_dates={"actual": [1, 2],
                                          "nominal": [1, 3]},
                             keep_date_col=keep_date_col)
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
         "KORD", "19990127", " 19:00:00", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
         "KORD", "19990127", " 20:00:00", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
         "KORD", "19990127", " 21:00:00", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
         "KORD", "19990127", " 21:00:00", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
         "KORD", "19990127", " 22:00:00", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
         "KORD", "19990127", " 23:00:00", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["actual", "nominal", "X0", "X1", "X2",
                "X3", "X4", "X5", "X6", "X7", "X8"])

    if not keep_date_col:
        expected = expected.drop(["X1", "X2", "X3"], axis=1)
    elif parser.engine == "python":
        expected["X1"] = expected["X1"].astype(np.int64)

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)


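# For parse_dates={"actual": [1, 2], "nominal": [1, 3]}, the parser joins the
# raw string columns before handing them to date_parser, which is why
# "19990127" plus " 19:00:00" parses to datetime(1999, 1, 27, 19, 0). A rough
# sketch of that concatenation (the real helper is parsers._concat_date_cols;
# this standalone version is illustrative):
def _concat_date_cols_sketch(date_cols):
    if len(date_cols) == 1:
        return list(date_cols[0])
    return [" ".join(str(value) for value in row)
            for row in zip(*date_cols)]


# _concat_date_cols_sketch((["19990127"], [" 19:00:00"]))
# -> ["19990127  19:00:00"] (dateutil tolerates the doubled space)

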
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col(all_parsers, keep_date_col):
    data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=None,
                             prefix="X", parse_dates=[[1, 2], [1, 3]],
                             keep_date_col=keep_date_col)
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
         "KORD", "19990127", " 19:00:00", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
         "KORD", "19990127", " 20:00:00", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
         "KORD", "19990127", " 21:00:00", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
         "KORD", "19990127", " 21:00:00", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
         "KORD", "19990127", " 22:00:00", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
         "KORD", "19990127", " 23:00:00", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["X1_X2", "X1_X3", "X0", "X1", "X2",
                "X3", "X4", "X5", "X6", "X7", "X8"])

    if not keep_date_col:
        expected = expected.drop(["X1", "X2", "X3"], axis=1)
    elif parser.engine == "python":
        expected["X1"] = expected["X1"].astype(np.int64)

    tm.assert_frame_equal(result, expected)


def test_date_col_as_index_col(all_parsers):
    data = """\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=None, prefix="X",
                             parse_dates=[1], index_col=1)

    index = Index([datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 20, 0),
                   datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 0),
                   datetime(1999, 1, 27, 22, 0)], name="X1")
    expected = DataFrame([
        ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0],
        ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0],
        ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0],
        ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0],
        ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0],
    ], columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], index=index)
    tm.assert_frame_equal(result, expected)


def test_multiple_date_cols_int_cast(all_parsers):
    data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
            "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
            "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
            "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
            "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
            "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
    parse_dates = {"actual": [1, 2], "nominal": [1, 3]}
    parser = all_parsers

    result = parser.read_csv(StringIO(data), header=None,
                             date_parser=conv.parse_date_time,
                             parse_dates=parse_dates, prefix="X")
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
         "KORD", 0.81],
        [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
         "KORD", 0.01],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
         "KORD", -0.59],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
         "KORD", -0.99],
        [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
         "KORD", -0.59],
        [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
         "KORD", -0.59],
    ], columns=["actual", "nominal", "X0", "X4"])

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)


def test_multiple_date_col_timestamp_parse(all_parsers):
    parser = all_parsers
    data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""

    result = parser.read_csv(StringIO(data), parse_dates=[[0, 1]],
                             header=None, date_parser=Timestamp)
    expected = DataFrame([
        [Timestamp("05/31/2012, 15:30:00.029"),
         1306.25, 1, "E", 0, np.nan, 1306.25],
        [Timestamp("05/31/2012, 15:30:00.029"),
         1306.25, 8, "E", 0, np.nan, 1306.25]
    ], columns=["0_1", 2, 3, 4, 5, 6, 7])
    tm.assert_frame_equal(result, expected)


def test_multiple_date_cols_with_header(all_parsers):
    parser = all_parsers
    data = """\
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

    result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]})
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["nominal", "ID", "ActualTime", "TDew",
                "TAir", "Windspeed", "Precip", "WindDir"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,parse_dates,msg", [
    ("""\
date_NominalTime,date,NominalTime
KORD1,19990127, 19:00:00
KORD2,19990127, 20:00:00""", [[1, 2]], ("New date column already "
                                        "in dict date_NominalTime")),
    ("""\
ID,date,nominalTime
KORD,19990127, 19:00:00
KORD,19990127, 20:00:00""", dict(ID=[1, 2]), "Date column ID already in dict")
])
def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg):
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), parse_dates=parse_dates)


def test_date_parser_int_bug(all_parsers):
    # see gh-3071
    parser = all_parsers
    data = ("posix_timestamp,elapsed,sys,user,queries,query_time,rows,"
            "accountid,userid,contactid,level,silo,method\n"
            "1343103150,0.062353,0,4,6,0.01690,3,"
            "12345,1,-1,3,invoice_InvoiceResource,search\n")

    result = parser.read_csv(
        StringIO(data), index_col=0, parse_dates=[0],
        date_parser=lambda x: datetime.utcfromtimestamp(int(x)))
    expected = DataFrame([[0.062353, 0, 4, 6, 0.01690, 3, 12345, 1, -1,
                           3, "invoice_InvoiceResource", "search"]],
                         columns=["elapsed", "sys", "user", "queries",
                                  "query_time", "rows", "accountid",
                                  "userid", "contactid", "level",
                                  "silo", "method"],
                         index=Index([Timestamp("2012-07-24 04:12:30")],
                                     name="posix_timestamp"))
    tm.assert_frame_equal(result, expected)


def test_nat_parse(all_parsers):
    # see gh-3062
    parser = all_parsers
    df = DataFrame(dict({"A": np.asarray(lrange(10), dtype="float64"),
                         "B": pd.Timestamp("20010101")}))
    df.iloc[3:6, :] = np.nan

    with tm.ensure_clean("__nat_parse_.csv") as path:
        df.to_csv(path)

        result = parser.read_csv(path, index_col=0, parse_dates=["B"])
        tm.assert_frame_equal(result, df)


def test_csv_custom_parser(all_parsers):
    data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    parser = all_parsers
    result = parser.read_csv(
        StringIO(data),
        date_parser=lambda x: datetime.strptime(x, "%Y%m%d"))
    expected = parser.read_csv(StringIO(data), parse_dates=True)
    tm.assert_frame_equal(result, expected)


def test_parse_dates_implicit_first_col(all_parsers):
    data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), parse_dates=True)

    expected = parser.read_csv(StringIO(data), index_col=0,
                               parse_dates=True)
    tm.assert_frame_equal(result, expected)


def test_parse_dates_string(all_parsers):
    data = """date,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col="date",
                             parse_dates=["date"])
    index = date_range("1/1/2009", periods=3)
    index.name = "date"

    expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4],
                          "C": [2, 4, 5]}, index=index)
    tm.assert_frame_equal(result, expected)


# The bug in https://github.com/dateutil/dateutil/issues/217 has been
# addressed, but read_* still does not pass `yearfirst` through.
@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*")
@pytest.mark.parametrize("parse_dates", [
    [["date", "time"]],
    [[0, 1]]
])
def test_yy_format_with_year_first(all_parsers, parse_dates):
    data = """date,time,B,C
090131,0010,1,2
090228,1020,3,4
090331,0830,5,6
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=0,
                             parse_dates=parse_dates)
    index = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                           datetime(2009, 2, 28, 10, 20, 0),
                           datetime(2009, 3, 31, 8, 30, 0)],
                          dtype=object, name="date_time")
    expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]])
def test_parse_dates_column_list(all_parsers, parse_dates):
    data = "a,b,c\n01/01/2010,1,15/02/2010"
    parser = all_parsers

    expected = DataFrame({"a": [datetime(2010, 1, 1)], "b": [1],
                          "c": [datetime(2010, 2, 15)]})
    expected = expected.set_index(["a", "b"])

    result = parser.read_csv(StringIO(data), index_col=[0, 1],
                             parse_dates=parse_dates, dayfirst=True)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_parse_dates(all_parsers, index_col):
    data = """index1,index2,A,B,C
20090101,one,a,1,2
20090101,two,b,3,4
20090101,three,c,4,5
20090102,one,a,1,2
20090102,two,b,3,4
20090102,three,c,4,5
20090103,one,a,1,2
20090103,two,b,3,4
20090103,three,c,4,5
"""
    parser = all_parsers
    index = MultiIndex.from_product([
        (datetime(2009, 1, 1), datetime(2009, 1, 2),
         datetime(2009, 1, 3)), ("one", "two", "three")],
        names=["index1", "index2"])

    # Out of order.
    if index_col == [1, 0]:
        index = index.swaplevel(0, 1)

    expected = DataFrame([["a", 1, 2], ["b", 3, 4], ["c", 4, 5],
                          ["a", 1, 2], ["b", 3, 4], ["c", 4, 5],
                          ["a", 1, 2], ["b", 3, 4], ["c", 4, 5]],
                         columns=["A", "B", "C"], index=index)
    result = parser.read_csv(StringIO(data), index_col=index_col,
                             parse_dates=True)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [
    dict(dayfirst=True), dict(day_first=True)
])
def test_parse_dates_custom_euro_format(all_parsers, kwargs):
    parser = all_parsers
    data = """foo,bar,baz
31/01/2010,1,2
01/02/2010,1,NA
02/02/2010,1,2
"""
    if "dayfirst" in kwargs:
        df = parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
                             date_parser=lambda d: parse_date(d, **kwargs),
                             header=0, index_col=0, parse_dates=True,
                             na_values=["NA"])
        exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1),
                           datetime(2010, 2, 2)], name="time")
        expected = DataFrame({"Q": [1, 1, 1], "NTU": [2, np.nan, 2]},
                             index=exp_index, columns=["Q", "NTU"])
        tm.assert_frame_equal(df, expected)
    else:
        msg = "got an unexpected keyword argument 'day_first'"
        with pytest.raises(TypeError, match=msg):
            parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
                            date_parser=lambda d: parse_date(d, **kwargs),
                            skiprows=[0], index_col=0, parse_dates=True,
                            na_values=["NA"])


def test_parse_tz_aware(all_parsers):
    # See gh-1693
    parser = all_parsers
    data = "Date,x\n2012-06-13T01:39:00Z,0.5"

    result = parser.read_csv(StringIO(data), index_col=0,
                             parse_dates=True)
    expected = DataFrame({"x": [0.5]}, index=Index([Timestamp(
        "2012-06-13 01:39:00+00:00")], name="Date"))
    tm.assert_frame_equal(result, expected)
    assert result.index.tz is pytz.utc


@pytest.mark.parametrize("parse_dates,index_col", [
    ({"nominal": [1, 2]}, "nominal"),
    ({"nominal": [1, 2]}, 0),
    ([[1, 2]], 0),
])
def test_multiple_date_cols_index(all_parsers, parse_dates, index_col):
    parser = all_parsers
    data = """
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), "KORD1", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD2", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD3", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD4", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD5", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD6", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["nominal", "ID", "ActualTime", "TDew",
                "TAir", "Windspeed", "Precip", "WindDir"])
    expected = expected.set_index("nominal")

    if not isinstance(parse_dates, dict):
        expected.index.name = "date_NominalTime"

    result = parser.read_csv(StringIO(data), parse_dates=parse_dates,
                             index_col=index_col)
    tm.assert_frame_equal(result, expected)


def test_multiple_date_cols_chunked(all_parsers):
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""

    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"])
    expected = expected.set_index("nominal")

    reader = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]},
                             index_col="nominal", chunksize=2)
    chunks = list(reader)

    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])


def test_multiple_date_col_named_index_compat(all_parsers):
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""

    with_indices = parser.read_csv(StringIO(data),
                                   parse_dates={"nominal": [1, 2]},
                                   index_col="nominal")
    with_names = parser.read_csv(StringIO(data), index_col="nominal",
                                 parse_dates={"nominal": [
                                     "date", "nominalTime"]})
    tm.assert_frame_equal(with_indices, with_names)


def test_multiple_date_col_multiple_index_compat(all_parsers):
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    result = parser.read_csv(StringIO(data), index_col=["nominal", "ID"],
                             parse_dates={"nominal": [1, 2]})
    expected = parser.read_csv(StringIO(data),
                               parse_dates={"nominal": [1, 2]})

    expected = expected.set_index(["nominal", "ID"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [dict(), dict(index_col="C")])
def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs):
    # see gh-5636
    parser = all_parsers
    msg = ("Only booleans, lists, and dictionaries "
           "are accepted for the 'parse_dates' parameter")
    data = """A,B,C
1,2,2003-11-1"""

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), parse_dates="C", **kwargs)


@pytest.mark.parametrize("parse_dates", [
    (1,), np.array([4, 5]), {1, 3, 3}
])
def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
    parser = all_parsers
    msg = ("Only booleans, lists, and dictionaries "
           "are accepted for the 'parse_dates' parameter")
    data = """A,B,C
1,2,2003-11-1"""

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), parse_dates=parse_dates)


def test_parse_dates_empty_string(all_parsers):
    # see gh-2263
    parser = all_parsers
    data = "Date,test\n2012-01-01,1\n,2"
    result = parser.read_csv(StringIO(data), parse_dates=["Date"],
                             na_filter=False)

    expected = DataFrame([[datetime(2012, 1, 1), 1], [pd.NaT, 2]],
                         columns=["Date", "test"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,kwargs,expected", [
    ("a\n04.15.2016", dict(parse_dates=["a"]),
     DataFrame([datetime(2016, 4, 15)], columns=["a"])),
    ("a\n04.15.2016", dict(parse_dates=True, index_col=0),
     DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"))),
    ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=["a", "b"]),
     DataFrame([[datetime(2016, 4, 15), datetime(2013, 9, 16)]],
               columns=["a", "b"])),
    ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=True, index_col=[0, 1]),
     DataFrame(index=MultiIndex.from_tuples(
         [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"]))),
])
def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
    # see gh-14066
    parser = all_parsers

    result = parser.read_csv(StringIO(data), thousands=".", **kwargs)
    tm.assert_frame_equal(result, expected)


def test_parse_date_time_multi_level_column_name(all_parsers):
    data = """\
D,T,A,B
date, time,a,b
2001-01-05, 09:00:00, 0.0, 10.
2001-01-06, 00:00:00, 1.0, 11.
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=[0, 1],
                             parse_dates={"date_time": [0, 1]},
                             date_parser=conv.parse_date_time)

    expected_data = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.],
                     [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]]
    expected = DataFrame(expected_data,
                         columns=["date_time", ("A", "a"), ("B", "b")])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,kwargs,expected", [
    ("""\
date,time,a,b
2001-01-05, 10:00:00, 0.0, 10.
2001-01-05, 00:00:00, 1., 11.
""", dict(header=0, parse_dates={"date_time": [0, 1]}),
     DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10],
                [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0]],
               columns=["date_time", "a", "b"])),
    (("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
      "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
      "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
      "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
      "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
      "KORD,19990127, 23:00:00, 22:56:00, -0.5900"),
     dict(header=None, parse_dates={"actual": [1, 2], "nominal": [1, 3]}),
     DataFrame([
         [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
          "KORD", 0.81],
         [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
          "KORD", 0.01],
         [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
          "KORD", -0.59],
         [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
          "KORD", -0.99],
         [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
          "KORD", -0.59],
         [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
          "KORD", -0.59]], columns=["actual", "nominal", 0, 4])),
])
def test_parse_date_time(all_parsers, data, kwargs, expected):
    parser = all_parsers
    result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time,
                             **kwargs)

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)


def test_parse_date_fields(all_parsers):
    parser = all_parsers
    data = ("year,month,day,a\n2001,01,10,10.\n"
            "2001,02,1,11.")
    result = parser.read_csv(StringIO(data), header=0,
                             parse_dates={"ymd": [0, 1, 2]},
                             date_parser=conv.parse_date_fields)

    expected = DataFrame([[datetime(2001, 1, 10), 10.],
                          [datetime(2001, 2, 1), 11.]], columns=["ymd", "a"])
    tm.assert_frame_equal(result, expected)


def test_parse_date_all_fields(all_parsers):
    parser = all_parsers
    data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0,0.0,10.
2001,01,5,10,0,00,1.,11.
"""
    result = parser.read_csv(StringIO(data), header=0,
                             date_parser=conv.parse_all_fields,
                             parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]})
    expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0],
                          [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0]],
                         columns=["ymdHMS", "a", "b"])
    tm.assert_frame_equal(result, expected)


def test_datetime_fractional_seconds(all_parsers):
    parser = all_parsers
    data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0.123456,0.0,10.
2001,01,5,10,0,0.500000,1.,11.
"""
    result = parser.read_csv(StringIO(data), header=0,
                             date_parser=conv.parse_all_fields,
                             parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]})
    expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0,
                                    microsecond=123456), 0.0, 10.0],
                          [datetime(2001, 1, 5, 10, 0, 0,
                                    microsecond=500000), 1.0, 11.0]],
                         columns=["ymdHMS", "a", "b"])
    tm.assert_frame_equal(result, expected)


def test_generic(all_parsers):
    parser = all_parsers
    data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."

    result = parser.read_csv(StringIO(data), header=0,
                             parse_dates={"ym": [0, 1]},
                             date_parser=lambda y, m: date(year=int(y),
                                                           month=int(m),
                                                           day=1))
    expected = DataFrame([[date(2001, 1, 1), 10, 10.],
                          [date(2001, 2, 1), 1, 11.]],
                         columns=["ym", "day", "a"])
    tm.assert_frame_equal(result, expected)


def test_date_parser_resolution_if_not_ns(all_parsers):
    # see gh-10245
    parser = all_parsers
    data = """\
date,time,prn,rxstatus
2013-11-03,19:00:00,126,00E80000
2013-11-03,19:00:00,23,00E80000
2013-11-03,19:00:00,13,00E80000
"""

    def date_parser(dt, time):
        return np_array_datetime64_compat(dt + "T" + time + "Z",
                                          dtype="datetime64[s]")

    result = parser.read_csv(StringIO(data), date_parser=date_parser,
                             parse_dates={"datetime": ["date", "time"]},
                             index_col=["datetime", "prn"])

    datetimes = np_array_datetime64_compat(["2013-11-03T19:00:00Z"] * 3,
                                           dtype="datetime64[s]")
    expected = DataFrame(data={"rxstatus": ["00E80000"] * 3},
                         index=MultiIndex.from_tuples(
                             [(datetimes[0], 126), (datetimes[1], 23),
                              (datetimes[2], 13)], names=["datetime", "prn"]))
    tm.assert_frame_equal(result, expected)


def test_parse_date_column_with_empty_string(all_parsers):
    # see gh-6428
    parser = all_parsers
    data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, "
    result = parser.read_csv(StringIO(data), parse_dates=["opdate"])

    expected_data = [[7, "10/18/2006"],
                     [7, "10/18/2008"],
                     [621, " "]]
    expected = DataFrame(expected_data, columns=["case", "opdate"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,expected", [
    ("a\n135217135789158401\n1352171357E+5",
     DataFrame({"a": [135217135789158401,
                      135217135700000]}, dtype="float64")),
    ("a\n99999999999\n123456789012345\n1234E+0",
     DataFrame({"a": [99999999999,
                      123456789012345,
                      1234]}, dtype="float64"))
])
@pytest.mark.parametrize("parse_dates", [True, False])
def test_parse_date_float(all_parsers, data, expected, parse_dates):
    # see gh-2697
    #
    # Date parsing should fail, so we leave the data untouched
    # (i.e. float precision should remain unchanged).
    parser = all_parsers

    result = parser.read_csv(StringIO(data), parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)


def test_parse_timezone(all_parsers):
    # see gh-22256
    parser = all_parsers
    data = """dt,val
2018-01-04 09:01:00+09:00,23350
2018-01-04 09:02:00+09:00,23400
2018-01-04 09:03:00+09:00,23400
2018-01-04 09:04:00+09:00,23400
2018-01-04 09:05:00+09:00,23400"""
    result = parser.read_csv(StringIO(data), parse_dates=["dt"])

    dti = pd.date_range(start="2018-01-04 09:01:00",
                        end="2018-01-04 09:05:00", freq="1min",
                        tz=pytz.FixedOffset(540))
    expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]}

    expected = DataFrame(expected_data)
    tm.assert_frame_equal(result, expected)
+301
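
# Illustrative usage sketch of the combined-column date parsing exercised
# above: the dict form of `parse_dates` merges the named source columns and
# hands them to `date_parser` column-wise. The data here is made up.
from io import StringIO

import pandas as pd

_data = "date,time,val\n2013-11-03,19:00:00,1\n2013-11-03,19:05:00,2"
_df = pd.read_csv(StringIO(_data),
                  parse_dates={"datetime": ["date", "time"]},
                  date_parser=lambda d, t: pd.to_datetime(d + " " + t))
assert str(_df.loc[0, "datetime"]) == "2013-11-03 19:00:00"
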
@@ -0,0 +1,301 @@
# -*- coding: utf-8 -*-

"""
Tests that apply specifically to the Python parser. Unless specifically
stated as a Python-specific issue, the goal is to move as many of these
tests as possible out of this module once the C parser can accept further
arguments when parsing.
"""

import csv

import pytest

import pandas.compat as compat
from pandas.compat import BytesIO, StringIO, u
from pandas.errors import ParserError

from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm


def test_default_separator(python_parser_only):
    # see gh-17333
    #
    # csv.Sniffer in Python treats "o" as separator.
    data = "aob\n1o2\n3o4"
    parser = python_parser_only
    expected = DataFrame({"a": [1, 3], "b": [2, 4]})

    result = parser.read_csv(StringIO(data), sep=None)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
    # see gh-15925 (comment)
    data = "a\n1\n2"
    parser = python_parser_only
    msg = "skipfooter must be an integer"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=skipfooter)


def test_invalid_skipfooter_negative(python_parser_only):
    # see gh-15925 (comment)
    data = "a\n1\n2"
    parser = python_parser_only
    msg = "skipfooter cannot be negative"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=-1)


@pytest.mark.parametrize("kwargs", [
    dict(sep=None),
    dict(delimiter="|")
])
def test_sniff_delimiter(python_parser_only, kwargs):
    data = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    parser = python_parser_only
    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["A", "B", "C"],
                         index=Index(["foo", "bar", "baz"], name="index"))
    tm.assert_frame_equal(result, expected)


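# A short sketch of what `sep=None` asks the python engine to do: delegate
# delimiter detection to csv.Sniffer on a sample of the data.
import csv as _csv

_dialect = _csv.Sniffer().sniff("index|A|B|C")
assert _dialect.delimiter == "|"

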
@pytest.mark.parametrize("encoding", [None, "utf-8"])
|
||||
def test_sniff_delimiter_encoding(python_parser_only, encoding):
|
||||
parser = python_parser_only
|
||||
data = """ignore this
|
||||
ignore this too
|
||||
index|A|B|C
|
||||
foo|1|2|3
|
||||
bar|4|5|6
|
||||
baz|7|8|9
|
||||
"""
|
||||
|
||||
if encoding is not None:
|
||||
data = u(data).encode(encoding)
|
||||
data = BytesIO(data)
|
||||
|
||||
if compat.PY3:
|
||||
from io import TextIOWrapper
|
||||
data = TextIOWrapper(data, encoding=encoding)
|
||||
else:
|
||||
data = StringIO(data)
|
||||
|
||||
result = parser.read_csv(data, index_col=0, sep=None,
|
||||
skiprows=2, encoding=encoding)
|
||||
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=["A", "B", "C"],
|
||||
index=Index(["foo", "bar", "baz"], name="index"))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_single_line(python_parser_only):
|
||||
# see gh-6607: sniff separator
|
||||
parser = python_parser_only
|
||||
result = parser.read_csv(StringIO("1,2"), names=["a", "b"],
|
||||
header=None, sep=None)
|
||||
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [dict(skipfooter=2), dict(nrows=3)])
|
||||
def test_skipfooter(python_parser_only, kwargs):
|
||||
# see gh-6607
|
||||
data = """A,B,C
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
want to skip this
|
||||
also also skip this
|
||||
"""
|
||||
parser = python_parser_only
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=["A", "B", "C"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("compression,klass", [
|
||||
("gzip", "GzipFile"),
|
||||
("bz2", "BZ2File"),
|
||||
])
|
||||
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
|
||||
# see gh-6607
|
||||
parser = python_parser_only
|
||||
|
||||
with open(csv1, "rb") as f:
|
||||
data = f.read()
|
||||
|
||||
data = data.replace(b",", b"::")
|
||||
expected = parser.read_csv(csv1)
|
||||
|
||||
module = pytest.importorskip(compression)
|
||||
klass = getattr(module, klass)
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
tmp = klass(path, mode="wb")
|
||||
tmp.write(data)
|
||||
tmp.close()
|
||||
|
||||
result = parser.read_csv(path, sep="::",
|
||||
compression=compression)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_buglet_4x_multi_index(python_parser_only):
|
||||
# see gh-6607
|
||||
data = """ A B C D E
|
||||
one two three four
|
||||
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
|
||||
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
|
||||
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
|
||||
parser = python_parser_only
|
||||
|
||||
expected = DataFrame([[-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
|
||||
[0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
|
||||
[-0.6662, -0.5243, -0.3580, 0.89145, 2.5838]],
|
||||
columns=["A", "B", "C", "D", "E"],
|
||||
index=MultiIndex.from_tuples([
|
||||
("a", "b", 10.0032, 5),
|
||||
("a", "q", 20, 4),
|
||||
("x", "q", 30, 3),
|
||||
], names=["one", "two", "three", "four"]))
|
||||
result = parser.read_csv(StringIO(data), sep=r"\s+")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_buglet_4x_multi_index2(python_parser_only):
|
||||
# see gh-6893
|
||||
data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"
|
||||
parser = python_parser_only
|
||||
|
||||
expected = DataFrame.from_records(
|
||||
[(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
|
||||
columns=list("abcABC"), index=list("abc"))
|
||||
result = parser.read_csv(StringIO(data), sep=r"\s+")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("add_footer", [True, False])
|
||||
def test_skipfooter_with_decimal(python_parser_only, add_footer):
|
||||
# see gh-6971
|
||||
data = "1#2\n3#4"
|
||||
parser = python_parser_only
|
||||
expected = DataFrame({"a": [1.2, 3.4]})
|
||||
|
||||
if add_footer:
|
||||
# The stray footer line should not mess with the
|
||||
# casting of the first two lines if we skip it.
|
||||
kwargs = dict(skipfooter=1)
|
||||
data += "\nFooter"
|
||||
else:
|
||||
kwargs = dict()
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=["a"],
|
||||
decimal="#", **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sep", ["::", "#####", "!!!", "123", "#1!c5",
|
||||
"%!c!d", "@@#4:2", "_!pd#_"])
|
||||
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16-be", "utf-16-le",
|
||||
"utf-32", "cp037"])
|
||||
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
|
||||
# see gh-3404
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
parser = python_parser_only
|
||||
|
||||
data = "1" + sep + "2"
|
||||
encoded_data = data.encode(encoding)
|
||||
|
||||
result = parser.read_csv(BytesIO(encoded_data), sep=sep,
|
||||
names=["a", "b"], encoding=encoding)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
|
||||
def test_multi_char_sep_quotes(python_parser_only, quoting):
|
||||
# see gh-13374
|
||||
kwargs = dict(sep=",,")
|
||||
parser = python_parser_only
|
||||
|
||||
data = 'a,,b\n1,,a\n2,,"2,,b"'
|
||||
msg = "ignored when a multi-char delimiter is used"
|
||||
|
||||
def fail_read():
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
|
||||
|
||||
if quoting == csv.QUOTE_NONE:
|
||||
# We expect no match, so there should be an assertion
|
||||
# error out of the inner context manager.
|
||||
with pytest.raises(AssertionError):
|
||||
fail_read()
|
||||
else:
|
||||
fail_read()
|
||||
|
||||
|
||||
def test_none_delimiter(python_parser_only, capsys):
|
||||
# see gh-13374 and gh-17465
|
||||
parser = python_parser_only
|
||||
data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
|
||||
expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})
|
||||
|
||||
# We expect the third line in the data to be
|
||||
# skipped because it is malformed, but we do
|
||||
# not expect any errors to occur.
|
||||
result = parser.read_csv(StringIO(data), header=0,
|
||||
sep=None, warn_bad_lines=True,
|
||||
error_bad_lines=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert "Skipping line 3" in captured.err
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data", [
|
||||
'a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
|
||||
@pytest.mark.parametrize("skipfooter", [0, 1])
|
||||
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
|
||||
# see gh-13879 and gh-15910
|
||||
msg = "parsing errors in the skipped footer rows"
|
||||
parser = python_parser_only
|
||||
|
||||
def fail_read():
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), skipfooter=skipfooter)
|
||||
|
||||
if skipfooter:
|
||||
fail_read()
|
||||
else:
|
||||
# We expect no match, so there should be an assertion
|
||||
# error out of the inner context manager.
|
||||
with pytest.raises(AssertionError):
|
||||
fail_read()
|
||||
|
||||
|
||||
def test_malformed_skipfooter(python_parser_only):
|
||||
parser = python_parser_only
|
||||
data = """ignore
|
||||
A,B,C
|
||||
1,2,3 # comment
|
||||
1,2,3,4,5
|
||||
2,3,4
|
||||
footer
|
||||
"""
|
||||
msg = "Expected 3 fields in line 4, saw 5"
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=1,
|
||||
comment="#", skipfooter=1)
|
||||
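
# Hedged sketch of the skipfooter behavior exercised above: the python
# engine drops the trailing rows before type inference, so a text footer
# does not force the numeric columns to object dtype.
from io import StringIO

import pandas as pd

_df = pd.read_csv(StringIO("a,b\n1,2\n3,4\ntrailer"), skipfooter=1,
                  engine="python")
assert _df["a"].tolist() == [1, 3]
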
@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-

"""
Tests that quoting specifications are properly handled
during parsing for all of the parsers defined in parsers.py
"""

import csv

import pytest

from pandas.compat import PY2, StringIO, u
from pandas.errors import ParserError

from pandas import DataFrame
import pandas.util.testing as tm


@pytest.mark.parametrize("kwargs,msg", [
    (dict(quotechar="foo"), '"quotechar" must be a(n)? 1-character string'),
    (dict(quotechar=None, quoting=csv.QUOTE_MINIMAL),
     "quotechar must be set if quoting enabled"),
    (dict(quotechar=2), '"quotechar" must be string, not int')
])
def test_bad_quote_char(all_parsers, kwargs, msg):
    data = "1,2,3"
    parser = all_parsers

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), **kwargs)


@pytest.mark.parametrize("quoting,msg", [
    ("foo", '"quoting" must be an integer'),
    (5, 'bad "quoting" value'),  # quoting must be in the range [0, 3]
])
def test_bad_quoting(all_parsers, quoting, msg):
    data = "1,2,3"
    parser = all_parsers

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), quoting=quoting)


def test_quote_char_basic(all_parsers):
    parser = all_parsers
    data = 'a,b,c\n1,2,"cat"'
    expected = DataFrame([[1, 2, "cat"]],
                         columns=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), quotechar='"')
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
def test_quote_char_various(all_parsers, quote_char):
    parser = all_parsers
    expected = DataFrame([[1, 2, "cat"]],
                         columns=["a", "b", "c"])

    data = 'a,b,c\n1,2,"cat"'
    new_data = data.replace('"', quote_char)

    result = parser.read_csv(StringIO(new_data), quotechar=quote_char)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
@pytest.mark.parametrize("quote_char", ["", None])
def test_null_quote_char(all_parsers, quoting, quote_char):
    kwargs = dict(quotechar=quote_char, quoting=quoting)
    data = "a,b,c\n1,2,3"
    parser = all_parsers

    if quoting != csv.QUOTE_NONE:
        # Sanity checking.
        msg = "quotechar must be set if quoting enabled"

        with pytest.raises(TypeError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs,exp_data", [
    (dict(), [[1, 2, "foo"]]),  # Test default.

    # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
    (dict(quotechar='"', quoting=csv.QUOTE_MINIMAL), [[1, 2, "foo"]]),

    # QUOTE_ALL only applies to CSV writing, so no effect on reading.
    (dict(quotechar='"', quoting=csv.QUOTE_ALL), [[1, 2, "foo"]]),

    # QUOTE_NONE tells the reader to do no special handling
    # of quote characters and leave them alone.
    (dict(quotechar='"', quoting=csv.QUOTE_NONE), [[1, 2, '"foo"']]),

    # QUOTE_NONNUMERIC tells the reader to cast
    # all non-quoted fields to float
    (dict(quotechar='"', quoting=csv.QUOTE_NONNUMERIC), [[1.0, 2.0, "foo"]])
])
def test_quoting_various(all_parsers, kwargs, exp_data):
    data = '1,2,"foo"'
    parser = all_parsers
    columns = ["a", "b", "c"]

    result = parser.read_csv(StringIO(data), names=columns, **kwargs)
    expected = DataFrame(exp_data, columns=columns)
    tm.assert_frame_equal(result, expected)


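# Sketch of the csv quoting constants compared above: on the read side only
# QUOTE_NONE and QUOTE_NONNUMERIC change the result; the other modes matter
# for writing.
import csv
from io import StringIO

import pandas as pd

_raw = pd.read_csv(StringIO('1,2,"foo"'), names=list("abc"),
                   quoting=csv.QUOTE_NONE)
_num = pd.read_csv(StringIO('1,2,"foo"'), names=list("abc"),
                   quoting=csv.QUOTE_NONNUMERIC)
assert _raw.loc[0, "c"] == '"foo"'  # quotes kept verbatim
assert _num.loc[0, "a"] == 1.0      # unquoted fields cast to float

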
@pytest.mark.parametrize("doublequote,exp_data", [
|
||||
(True, [[3, '4 " 5']]),
|
||||
(False, [[3, '4 " 5"']]),
|
||||
])
|
||||
def test_double_quote(all_parsers, doublequote, exp_data):
|
||||
parser = all_parsers
|
||||
data = 'a,b\n3,"4 "" 5"'
|
||||
|
||||
result = parser.read_csv(StringIO(data), quotechar='"',
|
||||
doublequote=doublequote)
|
||||
expected = DataFrame(exp_data, columns=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quotechar", [
|
||||
u('"'),
|
||||
pytest.param(u('\u0001'), marks=pytest.mark.skipif(
|
||||
PY2, reason="Python 2.x does not handle unicode well."))])
|
||||
def test_quotechar_unicode(all_parsers, quotechar):
|
||||
# see gh-14477
|
||||
data = "a\n1"
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"a": [1]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), quotechar=quotechar)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("balanced", [True, False])
|
||||
def test_unbalanced_quoting(all_parsers, balanced):
|
||||
# see gh-22789.
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n1,2,\"3"
|
||||
|
||||
if balanced:
|
||||
# Re-balance the quoting and read in without errors.
|
||||
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
|
||||
result = parser.read_csv(StringIO(data + '"'))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
msg = ("EOF inside string starting at row 1" if parser.engine == "c"
|
||||
else "unexpected end of data")
|
||||
|
||||
with pytest.raises(ParserError, match=msg):
|
||||
parser.read_csv(StringIO(data))
|
||||
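
# Minimal sketch of the unbalanced-quote failure mode from the last test:
# an unterminated quoted field raises ParserError in both engines, with an
# engine-specific message.
from io import StringIO

import pandas as pd
from pandas.errors import ParserError

try:
    pd.read_csv(StringIO('a,b,c\n1,2,"3'))
except ParserError:
    pass  # "EOF inside string ..." (C) or "unexpected end of data" (python)
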
@@ -0,0 +1,580 @@
# -*- coding: utf-8 -*-

"""
Tests the 'read_fwf' function in parsers.py. This
test suite is independent of the others because the
engine is set to 'python-fwf' internally.
"""

from datetime import datetime

import numpy as np
import pytest

import pandas.compat as compat
from pandas.compat import BytesIO, StringIO

import pandas as pd
from pandas import DataFrame, DatetimeIndex
import pandas.util.testing as tm

from pandas.io.parsers import EmptyDataError, read_csv, read_fwf


def test_basic():
    data = """\
A         B            C            D
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    result = read_fwf(StringIO(data))
    expected = DataFrame([[201158, 360.242940, 149.910199, 11950.7],
                          [201159, 444.953632, 166.985655, 11788.4],
                          [201160, 364.136849, 183.628767, 11806.2],
                          [201161, 413.836124, 184.375703, 11916.8],
                          [201162, 502.953953, 173.237159, 12468.3]],
                         columns=["A", "B", "C", "D"])
    tm.assert_frame_equal(result, expected)


def test_colspecs():
    data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(data), colspecs=colspecs)

    expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7],
                          [2011, 59, 444.953632, 166.985655, 11788.4],
                          [2011, 60, 364.136849, 183.628767, 11806.2],
                          [2011, 61, 413.836124, 184.375703, 11916.8],
                          [2011, 62, 502.953953, 173.237159, 12468.3]],
                         columns=["A", "B", "C", "D", "E"])
    tm.assert_frame_equal(result, expected)


def test_widths():
    data = """\
A    B    C            D            E
2011 58   360.242940   149.910199   11950.7
2011 59   444.953632   166.985655   11788.4
2011 60   364.136849   183.628767   11806.2
2011 61   413.836124   184.375703   11916.8
2011 62   502.953953   173.237159   12468.3
"""
    result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7])

    expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7],
                          [2011, 59, 444.953632, 166.985655, 11788.4],
                          [2011, 60, 364.136849, 183.628767, 11806.2],
                          [2011, 61, 413.836124, 184.375703, 11916.8],
                          [2011, 62, 502.953953, 173.237159, 12468.3]],
                         columns=["A", "B", "C", "D", "E"])
    tm.assert_frame_equal(result, expected)


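# Hedged sketch of the relationship between `widths` and `colspecs` used in
# the tests above: `widths` is shorthand that is expanded into contiguous
# (start, end) column specifications (assuming no gaps between fields).
widths = [5, 5, 13, 13, 7]
colspecs, start = [], 0
for w in widths:
    colspecs.append((start, start + w))
    start += w
assert colspecs == [(0, 5), (5, 10), (10, 23), (23, 36), (36, 43)]

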
def test_non_space_filler():
    # From Thomas Kluyver:
    #
    # Apparently, some files use non-space filler characters; this can be
    # handled by specifying the 'delimiter' character:
    #
    # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
    data = """\
A~~~~B~~~~C~~~~~~~~~~~~D~~~~~~~~~~~~E
201158~~~~360.242940~~~149.910199~~~11950.7
201159~~~~444.953632~~~166.985655~~~11788.4
201160~~~~364.136849~~~183.628767~~~11806.2
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~")

    expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7],
                          [2011, 59, 444.953632, 166.985655, 11788.4],
                          [2011, 60, 364.136849, 183.628767, 11806.2],
                          [2011, 61, 413.836124, 184.375703, 11916.8],
                          [2011, 62, 502.953953, 173.237159, 12468.3]],
                         columns=["A", "B", "C", "D", "E"])
    tm.assert_frame_equal(result, expected)


def test_over_specified():
    data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]

    with pytest.raises(ValueError, match="must specify only one of"):
        read_fwf(StringIO(data), colspecs=colspecs, widths=[6, 10, 10, 7])


def test_under_specified():
    data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    with pytest.raises(ValueError, match="Must specify either"):
        read_fwf(StringIO(data), colspecs=None, widths=None)


def test_read_csv_compat():
    csv_data = """\
A,B,C,D,E
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
    expected = read_csv(StringIO(csv_data), engine="python")

    fwf_data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(fwf_data), colspecs=colspecs)
    tm.assert_frame_equal(result, expected)


def test_bytes_io_input():
    if not compat.PY3:
        pytest.skip("Bytes-related test - only needs to work on Python 3")

    result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')),
                      widths=[2, 2], encoding="utf8")
    expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
    tm.assert_frame_equal(result, expected)


def test_fwf_colspecs_is_list_or_tuple():
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    msg = "column specifications must be a list or tuple.+"

    with pytest.raises(TypeError, match=msg):
        read_fwf(StringIO(data), colspecs={"a": 1}, delimiter=",")


def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples():
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    msg = "Each column specification must be.+"

    with pytest.raises(TypeError, match=msg):
        read_fwf(StringIO(data), [("a", 1)])


@pytest.mark.parametrize("colspecs,exp_data", [
    ([(0, 3), (3, None)], [[123, 456], [456, 789]]),
    ([(None, 3), (3, 6)], [[123, 456], [456, 789]]),
    ([(0, None), (3, None)], [[123456, 456], [456789, 789]]),
    ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]),
])
def test_fwf_colspecs_none(colspecs, exp_data):
    # see gh-7079
    data = """\
123456
456789
"""
    expected = DataFrame(exp_data)

    result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("infer_nrows,exp_data", [
    # infer_nrows --> colspec == [(2, 3), (5, 6)]
    (1, [[1, 2], [3, 8]]),

    # infer_nrows > number of rows
    (10, [[1, 2], [123, 98]]),
])
def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data):
    # see gh-15138
    data = """\
  1  2
123 98
"""
    expected = DataFrame(exp_data)

    result = read_fwf(StringIO(data), infer_nrows=infer_nrows, header=None)
    tm.assert_frame_equal(result, expected)


def test_fwf_regression():
    # see gh-3594
    #
    # Turns out "T060" is parsable as a datetime slice!
    tz_list = [1, 10, 20, 30, 60, 80, 100]
    widths = [16] + [8] * len(tz_list)
    names = ["SST"] + ["T%03d" % z for z in tz_list[1:]]

    data = """  2009164202000   9.5403  9.4105  8.6571  7.8372  6.0612  5.8843  5.5192
  2009164203000   9.5435  9.2010  8.6167  7.8176  6.0804  5.8728  5.4869
  2009164204000   9.5873  9.1326  8.4694  7.5889  6.0422  5.8526  5.4657
  2009164205000   9.5810  9.0896  8.4009  7.4652  6.0322  5.8189  5.4379
  2009164210000   9.6034  9.0897  8.3822  7.4905  6.0908  5.7904  5.4039
"""

    result = read_fwf(StringIO(data), index_col=0, header=None, names=names,
                      widths=widths, parse_dates=True,
                      date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"))
    expected = DataFrame([
        [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192],
        [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869],
        [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657],
        [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379],
        [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039],
    ], index=DatetimeIndex(["2009-06-13 20:20:00", "2009-06-13 20:30:00",
                            "2009-06-13 20:40:00", "2009-06-13 20:50:00",
                            "2009-06-13 21:00:00"]),
        columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"])
    tm.assert_frame_equal(result, expected)


def test_fwf_for_uint8():
    data = """1421302965.213420    PRI=3 PGN=0xef00      DST=0x17 SRC=0x28    04 154 00 00 00 00 00 127
1421302964.226776    PRI=6 PGN=0xf002               SRC=0x47   243 00 00 255 247 00 00 71"""  # noqa
    df = read_fwf(StringIO(data),
                  colspecs=[(0, 17), (25, 26), (33, 37),
                            (49, 51), (58, 62), (63, 1000)],
                  names=["time", "pri", "pgn", "dst", "src", "data"],
                  converters={
                      "pgn": lambda x: int(x, 16),
                      "src": lambda x: int(x, 16),
                      "dst": lambda x: int(x, 16),
                      "data": lambda x: len(x.split(" "))})

    expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8],
                          [1421302964.226776, 6, 61442, None, 71, 8]],
                         columns=["time", "pri", "pgn",
                                  "dst", "src", "data"])
    expected["dst"] = expected["dst"].astype(object)
    tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("comment", ["#", "~", "!"])
def test_fwf_comment(comment):
    data = """\
  1   2.   4  #hello world
  5  NaN  10.0
"""
    data = data.replace("#", comment)

    colspecs = [(0, 3), (4, 9), (9, 25)]
    expected = DataFrame([[1, 2., 4], [5, np.nan, 10.]])

    result = read_fwf(StringIO(data), colspecs=colspecs,
                      header=None, comment=comment)
    tm.assert_almost_equal(result, expected)


@pytest.mark.parametrize("thousands", [",", "#", "~"])
def test_fwf_thousands(thousands):
    data = """\
 1 2,334.0   5
10   13     10.
"""
    data = data.replace(",", thousands)

    colspecs = [(0, 3), (3, 11), (12, 16)]
    expected = DataFrame([[1, 2334., 5], [10, 13, 10.]])

    result = read_fwf(StringIO(data), header=None,
                      colspecs=colspecs, thousands=thousands)
    tm.assert_almost_equal(result, expected)


@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(header):
    # see gh-6114
    data = """\
MyColumn
a
b
a
b"""

    msg = "Passing a bool to header is invalid"
    with pytest.raises(TypeError, match=msg):
        read_fwf(StringIO(data), header=header)


def test_full_file():
    # File with all values.
    test = """index                             A    B    C
2000-01-03T00:00:00  0.980268513777    3  foo
2000-01-04T00:00:00   1.04791624281   -4  bar
2000-01-05T00:00:00  0.498580885705   73  baz
2000-01-06T00:00:00   1.12020151869    1  foo
2000-01-07T00:00:00  0.487094399463    0  bar
2000-01-10T00:00:00  0.836648671666    2  baz
2000-01-11T00:00:00  0.157160753327   34  foo"""
    colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)


def test_full_file_with_missing():
    # File with missing values.
    test = """index                             A    B    C
2000-01-03T00:00:00  0.980268513777    3  foo
2000-01-04T00:00:00   1.04791624281   -4  bar
                     0.498580885705   73  baz
2000-01-06T00:00:00   1.12020151869    1  foo
2000-01-07T00:00:00                    0  bar
2000-01-10T00:00:00  0.836648671666    2  baz
                                      34"""
    colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)


def test_full_file_with_spaces():
    # File with spaces in columns.
    test = """
Account Name                  Balance     CreditLimit   AccountCreated
101     Keanu Reeves          9315.45     10000.00      1/17/1998
312     Gerard Butler         90.00       1000.00        8/6/2003
868     Jennifer Love Hewitt  0           17000.00      5/25/1985
761     Jada Pinkett-Smith    49654.87    100000.00     12/5/2006
317     Bill Murray           789.65      5000.00        2/5/2007
""".strip("\r\n")
    colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)


def test_full_file_with_spaces_and_missing():
    # File with spaces and missing values in columns.
    test = """
Account Name                  Balance     CreditLimit   AccountCreated
101                                       10000.00      1/17/1998
312     Gerard Butler         90.00       1000.00        8/6/2003
868                                                     5/25/1985
761     Jada Pinkett-Smith    49654.87    100000.00     12/5/2006
317     Bill Murray           789.65
""".strip("\r\n")
    colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)


def test_messed_up_data():
    # Completely messed up file.
    test = """
   Account          Name             Balance     Credit Limit   Account Created
       101                                       10000.00       1/17/1998
       312     Gerard Butler         90.00       1000.00

       761     Jada Pinkett-Smith    49654.87     100000.00      12/5/2006
  317          Bill Murray           789.65
""".strip("\r\n")
    colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)


def test_multiple_delimiters():
    test = r"""
col1~~~~~col2  col3++++++++++++++++++col4
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
  33+++122.33\\\bar.........Gerard Butler
++44~~~~12.01   baz~~Jennifer Love Hewitt
~~55       11+++foo++++Jada Pinkett-Smith
..66++++++.03~~~bar           Bill Murray
""".strip("\r\n")
    delimiter = " +~.\\"
    colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
    expected = read_fwf(StringIO(test), colspecs=colspecs,
                        delimiter=delimiter)

    result = read_fwf(StringIO(test), delimiter=delimiter)
    tm.assert_frame_equal(result, expected)


def test_variable_width_unicode():
    if not compat.PY3:
        pytest.skip("Bytes-related test - only needs to work on Python 3")

    data = """
שלום שלום
ום   שלל
של   ום
""".strip("\r\n")
    encoding = "utf8"
    kwargs = dict(header=None, encoding=encoding)

    expected = read_fwf(BytesIO(data.encode(encoding)),
                        colspecs=[(0, 4), (5, 9)], **kwargs)
    result = read_fwf(BytesIO(data.encode(encoding)), **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", [
    dict(), {"a": "float64", "b": str, "c": "int32"}
])
def test_dtype(dtype):
    data = """ a    b    c
1    2    3.2
3    4    5.2
"""
    colspecs = [(0, 5), (5, 10), (10, None)]
    result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype)

    expected = pd.DataFrame({
        "a": [1, 3], "b": [2, 4],
        "c": [3.2, 5.2]}, columns=["a", "b", "c"])

    for col, dt in dtype.items():
        expected[col] = expected[col].astype(dt)

    tm.assert_frame_equal(result, expected)


def test_skiprows_inference():
    # see gh-11256
    data = """
Text contained in the file header

DataCol1   DataCol2
     0.0        1.0
   101.6      956.1
""".strip()
    skiprows = 2
    expected = read_csv(StringIO(data), skiprows=skiprows,
                        delim_whitespace=True)

    result = read_fwf(StringIO(data), skiprows=skiprows)
    tm.assert_frame_equal(result, expected)


def test_skiprows_by_index_inference():
    data = """
To be skipped
Not  To  Be  Skipped
Once more to be skipped
123  34   8      123
456  78   9      456
""".strip()
    skiprows = [0, 2]
    expected = read_csv(StringIO(data), skiprows=skiprows,
                        delim_whitespace=True)

    result = read_fwf(StringIO(data), skiprows=skiprows)
    tm.assert_frame_equal(result, expected)


def test_skiprows_inference_empty():
    data = """
AA BBB C
12 345 6
78 901 2
""".strip()

    msg = "No rows from which to infer column width"
    with pytest.raises(EmptyDataError, match=msg):
        read_fwf(StringIO(data), skiprows=3)


def test_whitespace_preservation():
    # see gh-16772
    header = None
    csv_data = """
 a ,bbb
 cc,dd """

    fwf_data = """
 a bbb
 ccdd """
    result = read_fwf(StringIO(fwf_data), widths=[3, 3],
                      header=header, skiprows=[0], delimiter="\n\t")
    expected = read_csv(StringIO(csv_data), header=header)
    tm.assert_frame_equal(result, expected)


def test_default_delimiter():
    header = None
    csv_data = """
a,bbb
cc,dd"""

    fwf_data = """
a \tbbb
cc\tdd """
    result = read_fwf(StringIO(fwf_data), widths=[3, 3],
                      header=header, skiprows=[0])
    expected = read_csv(StringIO(csv_data), header=header)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("infer", [True, False, None])
def test_fwf_compression(compression_only, infer):
    data = """1111111111
2222222222
3333333333""".strip()

    compression = compression_only
    extension = "gz" if compression == "gzip" else compression

    kwargs = dict(widths=[5, 5], names=["one", "two"])
    expected = read_fwf(StringIO(data), **kwargs)

    if compat.PY3:
        data = bytes(data, encoding="utf-8")

    with tm.ensure_clean(filename="tmp." + extension) as path:
        tm.write_to_compressed(compression, path, data)

        if infer is not None:
            kwargs["compression"] = "infer" if infer else compression

        result = read_fwf(path, **kwargs)
        tm.assert_frame_equal(result, expected)
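
# Hedged sketch of the compression round trip in the final test above, via
# the default compression="infer": the ".gz" suffix selects gzip.
import gzip
import os
import tempfile

import pandas as pd

_path = os.path.join(tempfile.mkdtemp(), "tmp.gz")
with gzip.open(_path, "wb") as fh:
    fh.write(b"1111122222\n3333344444")
_df = pd.read_fwf(_path, widths=[5, 5], names=["one", "two"])
assert _df["one"].tolist() == [11111, 33333]
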
@@ -0,0 +1,222 @@
# -*- coding: utf-8 -*-

"""
Tests that skipped rows are properly handled during
parsing for all of the parsers defined in parsers.py
"""

from datetime import datetime

import numpy as np
import pytest

from pandas.compat import StringIO, lrange, range
from pandas.errors import EmptyDataError

from pandas import DataFrame, Index
import pandas.util.testing as tm


@pytest.mark.parametrize("skiprows", [lrange(6), 6])
def test_skip_rows_bug(all_parsers, skiprows):
    # see gh-505
    parser = all_parsers
    text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
    result = parser.read_csv(StringIO(text), skiprows=skiprows, header=None,
                             index_col=0, parse_dates=True)
    index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2),
                   datetime(2000, 1, 3)], name=0)

    expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
                         columns=[1, 2, 3], index=index)
    tm.assert_frame_equal(result, expected)


def test_deep_skip_rows(all_parsers):
    # see gh-4382
    parser = all_parsers
    data = "a,b,c\n" + "\n".join([",".join([str(i), str(i + 1), str(i + 2)])
                                  for i in range(10)])
    condensed_data = "a,b,c\n" + "\n".join([
        ",".join([str(i), str(i + 1), str(i + 2)])
        for i in [0, 1, 2, 3, 4, 6, 8, 9]])

    result = parser.read_csv(StringIO(data), skiprows=[6, 8])
    condensed_result = parser.read_csv(StringIO(condensed_data))
    tm.assert_frame_equal(result, condensed_result)


def test_skip_rows_blank(all_parsers):
    # see gh-9832
    parser = all_parsers
    text = """#foo,a,b,c
#foo,a,b,c

#foo,a,b,c
#foo,a,b,c

1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
    data = parser.read_csv(StringIO(text), skiprows=6, header=None,
                           index_col=0, parse_dates=True)
    index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2),
                   datetime(2000, 1, 3)], name=0)

    expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
                         columns=[1, 2, 3],
                         index=index)
    tm.assert_frame_equal(data, expected)


@pytest.mark.parametrize("data,kwargs,expected", [
    ("""id,text,num_lines
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
     dict(skiprows=[1]),
     DataFrame([[2, "line 21\nline 22", 2],
                [3, "line 31", 1]], columns=["id", "text", "num_lines"])),
    ("a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
     dict(quotechar="~", skiprows=[2]),
     DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"])),
    (("Text,url\n~example\n "
      "sentence\n one~,url1\n~"
      "example\n sentence\n two~,url2\n~"
      "example\n sentence\n three~,url3"),
     dict(quotechar="~", skiprows=[1, 3]),
     DataFrame([['example\n sentence\n two', 'url2']],
               columns=["Text", "url"]))
])
def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
    # see gh-12775 and gh-10911
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


def test_skip_row_with_quote(all_parsers):
    # see gh-12775 and gh-10911
    parser = all_parsers
    data = """id,text,num_lines
1,"line '11' line 12",2
2,"line '21' line 22",2
3,"line '31' line 32",1"""

    exp_data = [[2, "line '21' line 22", 2],
                [3, "line '31' line 32", 1]]
    expected = DataFrame(exp_data, columns=[
        "id", "text", "num_lines"])

    result = parser.read_csv(StringIO(data), skiprows=[1])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,exp_data", [
    ("""id,text,num_lines
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
     [[2, "line \n'21' line 22", 2],
      [3, "line \n'31' line 32", 1]]),
    ("""id,text,num_lines
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
     [[2, "line '21\n' line 22", 2],
      [3, "line '31\n' line 32", 1]]),
    ("""id,text,num_lines
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
     [[2, "line '21\n' \r\tline 22", 2],
      [3, "line '31\n' \r\tline 32", 1]]),
])
def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
    # see gh-12775 and gh-10911
    parser = all_parsers
    result = parser.read_csv(StringIO(data), skiprows=[1])

    expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("line_terminator", [
    "\n",  # "LF"
    "\r\n",  # "CRLF"
    "\r"  # "CR"
])
def test_skiprows_lineterminator(all_parsers, line_terminator):
    # see gh-9079
    parser = all_parsers
    data = "\n".join(["SMOSMANIA ThetaProbe-ML2X ",
                      "2007/01/01 01:00   0.2140 U M ",
                      "2007/01/01 02:00   0.2141 M O ",
                      "2007/01/01 04:00   0.2142 D M "])
    expected = DataFrame([["2007/01/01", "01:00", 0.2140, "U", "M"],
                          ["2007/01/01", "02:00", 0.2141, "M", "O"],
                          ["2007/01/01", "04:00", 0.2142, "D", "M"]],
                         columns=["date", "time", "var", "flag",
                                  "oflag"])

    if parser.engine == "python" and line_terminator == "\r":
        pytest.skip("'CR' line terminator is not respected "
                    "by the Python parser yet")

    data = data.replace("\n", line_terminator)
    result = parser.read_csv(StringIO(data), skiprows=1, delim_whitespace=True,
                             names=["date", "time", "var", "flag", "oflag"])
    tm.assert_frame_equal(result, expected)


def test_skiprows_infield_quote(all_parsers):
    # see gh-14459
    parser = all_parsers
    data = "a\"\nb\"\na\n1"
    expected = DataFrame({"a": [1]})

    result = parser.read_csv(StringIO(data), skiprows=2)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs,expected", [
    (dict(), DataFrame({"1": [3, 5]})),
    (dict(header=0, names=["foo"]), DataFrame({"foo": [3, 5]}))
])
def test_skip_rows_callable(all_parsers, kwargs, expected):
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"

    result = parser.read_csv(StringIO(data),
                             skiprows=lambda x: x % 2 == 0,
                             **kwargs)
    tm.assert_frame_equal(result, expected)


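# Sketch of the callable form of skiprows shown above: the callable gets
# each row index and a truthy return skips that row, header line included.
from io import StringIO

import pandas as pd

_df = pd.read_csv(StringIO("a\n1\n2\n3\n4\n5"),
                  skiprows=lambda i: i % 2 == 0)
# Row 0 ("a") is skipped, so "1" becomes the header and only the
# remaining odd-numbered physical rows survive.
assert list(_df.columns) == ["1"]
assert _df["1"].tolist() == [3, 5]

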
def test_skip_rows_skip_all(all_parsers):
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"
    msg = "No columns to parse from file"

    with pytest.raises(EmptyDataError, match=msg):
        parser.read_csv(StringIO(data), skiprows=lambda x: True)


def test_skip_rows_bad_callable(all_parsers):
    msg = "by zero"
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"

    with pytest.raises(ZeroDivisionError, match=msg):
        parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)
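
# Sketch of the skip-everything failure mode in test_skip_rows_skip_all:
# when the callable skips every row there is nothing left to parse.
from io import StringIO

import pandas as pd
from pandas.errors import EmptyDataError

try:
    pd.read_csv(StringIO("a\n1\n2"), skiprows=lambda i: True)
except EmptyDataError as err:
    assert "No columns to parse from file" in str(err)
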
@@ -0,0 +1,353 @@
# -*- coding: utf-8 -*-

"""
Tests the TextReader class in parsers.pyx, which
is integral to the C engine in parsers.py
"""

import os

import numpy as np
from numpy import nan
import pytest

import pandas._libs.parsers as parser
from pandas._libs.parsers import TextReader
import pandas.compat as compat
from pandas.compat import BytesIO, StringIO, map

from pandas import DataFrame
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal

from pandas.io.parsers import TextFileReader, read_csv


class TestTextReader(object):

    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        self.dirpath = datapath('io', 'parser', 'data')
        self.csv1 = os.path.join(self.dirpath, 'test1.csv')
        self.csv2 = os.path.join(self.dirpath, 'test2.csv')
        self.xls1 = os.path.join(self.dirpath, 'test.xls')

    def test_file_handle(self):
        with open(self.csv1, 'rb') as f:
            reader = TextReader(f)
            reader.read()

    def test_string_filename(self):
        reader = TextReader(self.csv1, header=None)
        reader.read()

    def test_file_handle_mmap(self):
        with open(self.csv1, 'rb') as f:
            reader = TextReader(f, memory_map=True, header=None)
            reader.read()

    def test_StringIO(self):
        with open(self.csv1, 'rb') as f:
            text = f.read()
        src = BytesIO(text)
        reader = TextReader(src, header=None)
        reader.read()

    def test_string_factorize(self):
        # should this be optional?
        data = 'a\nb\na\nb\na'
        reader = TextReader(StringIO(data), header=None)
        result = reader.read()
        assert len(set(map(id, result[0]))) == 2

    def test_skipinitialspace(self):
        data = ('a, b\n'
                'a, b\n'
                'a, b\n'
                'a, b')

        reader = TextReader(StringIO(data), skipinitialspace=True,
                            header=None)
        result = reader.read()

        tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a', 'a'],
                                                        dtype=np.object_))
        tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b', 'b'],
                                                        dtype=np.object_))

    def test_parse_booleans(self):
        data = 'True\nFalse\nTrue\nTrue'

        reader = TextReader(StringIO(data), header=None)
        result = reader.read()

        assert result[0].dtype == np.bool_

    def test_delimit_whitespace(self):
        data = 'a b\na\t\t "b"\n"a"\t \t b'

        reader = TextReader(StringIO(data), delim_whitespace=True,
                            header=None)
        result = reader.read()

        tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a'],
                                                        dtype=np.object_))
        tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b'],
                                                        dtype=np.object_))

    def test_embedded_newline(self):
        data = 'a\n"hello\nthere"\nthis'

        reader = TextReader(StringIO(data), header=None)
        result = reader.read()

        expected = np.array(['a', 'hello\nthere', 'this'], dtype=np.object_)
        tm.assert_numpy_array_equal(result[0], expected)

    def test_euro_decimal(self):
        data = '12345,67\n345,678'

        reader = TextReader(StringIO(data), delimiter=':',
                            decimal=',', header=None)
        result = reader.read()

        expected = np.array([12345.67, 345.678])
        tm.assert_almost_equal(result[0], expected)

    def test_integer_thousands(self):
        data = '123,456\n12,500'

        reader = TextReader(StringIO(data), delimiter=':',
                            thousands=',', header=None)
        result = reader.read()

        expected = np.array([123456, 12500], dtype=np.int64)
        tm.assert_almost_equal(result[0], expected)

    def test_integer_thousands_alt(self):
        data = '123.456\n12.500'

        reader = TextFileReader(StringIO(data), delimiter=':',
                                thousands='.', header=None)
        result = reader.read()

        expected = DataFrame([123456, 12500])
        tm.assert_frame_equal(result, expected)

    def test_skip_bad_lines(self, capsys):
        # too many lines, see #2430 for why
        data = ('a:b:c\n'
                'd:e:f\n'
                'g:h:i\n'
                'j:k:l:m\n'
                'l:m:n\n'
                'o:p:q:r')

        reader = TextReader(StringIO(data), delimiter=':',
                            header=None)
        msg = (r"Error tokenizing data\. C error: Expected 3 fields in"
               " line 4, saw 4")
        with pytest.raises(parser.ParserError, match=msg):
            reader.read()

        reader = TextReader(StringIO(data), delimiter=':',
                            header=None,
                            error_bad_lines=False,
                            warn_bad_lines=False)
        result = reader.read()
        expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object),
                    1: np.array(['b', 'e', 'h', 'm'], dtype=object),
                    2: np.array(['c', 'f', 'i', 'n'], dtype=object)}
        assert_array_dicts_equal(result, expected)

        reader = TextReader(StringIO(data), delimiter=':',
                            header=None,
                            error_bad_lines=False,
                            warn_bad_lines=True)
        reader.read()
        captured = capsys.readouterr()

        assert 'Skipping line 4' in captured.err
        assert 'Skipping line 6' in captured.err

    def test_header_not_enough_lines(self):
        data = ('skip this\n'
                'skip this\n'
                'a,b,c\n'
                '1,2,3\n'
                '4,5,6')

        reader = TextReader(StringIO(data), delimiter=',', header=2)
        header = reader.header
        expected = [['a', 'b', 'c']]
        assert header == expected

        recs = reader.read()
        expected = {0: np.array([1, 4], dtype=np.int64),
                    1: np.array([2, 5], dtype=np.int64),
                    2: np.array([3, 6], dtype=np.int64)}
        assert_array_dicts_equal(recs, expected)

    def test_escapechar(self):
        data = ('\\"hello world\"\n'
                '\\"hello world\"\n'
                '\\"hello world\"')

        reader = TextReader(StringIO(data), delimiter=',', header=None,
                            escapechar='\\')
        result = reader.read()
        expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
        assert_array_dicts_equal(result, expected)

    def test_eof_has_eol(self):
        # handling of new line at EOF
        pass

    def test_na_substitution(self):
        pass

    def test_numpy_string_dtype(self):
        data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=',', header=None,
                              **kwds)

        reader = _make_reader(dtype='S5,i4')
        result = reader.read()

        assert result[0].dtype == 'S5'

        ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], dtype='S5')
        assert (result[0] == ex_values).all()
        assert result[1].dtype == 'i4'

        reader = _make_reader(dtype='S4')
        result = reader.read()
        assert result[0].dtype == 'S4'
        ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
        assert (result[0] == ex_values).all()
        assert result[1].dtype == 'S4'

    def test_pass_dtype(self):
        data = """\
one,two
1,a
2,b
3,c
4,d"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=',', **kwds)

        reader = _make_reader(dtype={'one': 'u1', 1: 'S1'})
        result = reader.read()
        assert result[0].dtype == 'u1'
        assert result[1].dtype == 'S1'

        reader = _make_reader(dtype={'one': np.uint8, 1: object})
        result = reader.read()
        assert result[0].dtype == 'u1'
        assert result[1].dtype == 'O'

        reader = _make_reader(dtype={'one': np.dtype('u1'),
                                     1: np.dtype('O')})
        result = reader.read()
        assert result[0].dtype == 'u1'
        assert result[1].dtype == 'O'

    def test_usecols(self):
        data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=',', **kwds)

        reader = _make_reader(usecols=(1, 2))
        result = reader.read()

        exp = _make_reader().read()
        assert len(result) == 2
        assert (result[1] == exp[1]).all()
        assert (result[2] == exp[2]).all()

    def test_cr_delimited(self):
        def _test(text, **kwargs):
            nice_text = text.replace('\r', '\r\n')
            result = TextReader(StringIO(text), **kwargs).read()
            expected = TextReader(StringIO(nice_text), **kwargs).read()
            assert_array_dicts_equal(result, expected)

        data = 'a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12'
        _test(data, delimiter=',')

        data = 'a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12'
        _test(data, delim_whitespace=True)

        data = 'a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12'
        _test(data, delimiter=',')

        sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r'
                  'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r'
                  ',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0')
        _test(sample, delimiter=',')

        data = 'A B C\r 2 3\r4 5 6'
        _test(data, delim_whitespace=True)

        data = 'A B C\r2 3\r4 5 6'
        _test(data, delim_whitespace=True)

    def test_empty_field_eof(self):
        data = 'a,b,c\n1,2,3\n4,,'

        result = TextReader(StringIO(data), delimiter=',').read()

        expected = {0: np.array([1, 4], dtype=np.int64),
                    1: np.array(['2', ''], dtype=object),
                    2: np.array(['3', ''], dtype=object)}
        assert_array_dicts_equal(result, expected)

        # GH5664
        a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
        b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
                      columns=list('abcd'),
                      index=[1, 1])
        c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
                       [8, 9, 10, 11], [13, 14, nan, nan]],
                      columns=list('abcd'),
                      index=[0, 5, 7, 12])

        for _ in range(100):
            df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
                          names=['a'], engine='c')
            assert_frame_equal(df, a)

            df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2),
                          names=list("abcd"), engine='c')
            assert_frame_equal(df, b)

            df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
                          names=list('abcd'), engine='c')
            assert_frame_equal(df, c)

    def test_empty_csv_input(self):
        # GH14867
        df = read_csv(StringIO(), chunksize=20, header=None,
                      names=['a', 'b', 'c'])
        assert isinstance(df, TextFileReader)


def assert_array_dicts_equal(left, right):
    for k, v in compat.iteritems(left):
        tm.assert_numpy_array_equal(np.asarray(v),
                                    np.asarray(right[k]))
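
# Hedged sketch of the bad-line handling from test_skip_bad_lines, through
# the public API: error_bad_lines=False drops over-wide rows, and
# warn_bad_lines controls whether each skip is reported on stderr.
from io import StringIO

import pandas as pd

_df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6,7\n8,9,10"),
                  error_bad_lines=False, warn_bad_lines=False)
assert len(_df) == 2  # the four-field row was silently dropped
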
@@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-

"""
Tests that features that are currently unsupported in
either the Python or C parser are actually enforced
and are clearly communicated to the user.

Ultimately, the goal is to remove test cases from this
test suite as new feature support is added to the parsers.
"""

import pytest

from pandas.compat import StringIO
from pandas.errors import ParserError

import pandas.util.testing as tm

import pandas.io.parsers as parsers
from pandas.io.parsers import read_csv


@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
    return request.param


class TestUnsupportedFeatures(object):

    def test_mangle_dupe_cols_false(self):
        # see gh-12935
        data = 'a b c\n1 2 3'
        msg = 'is not supported'

        for engine in ('c', 'python'):
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine=engine,
                         mangle_dupe_cols=False)

    def test_c_engine(self):
        # see gh-6607
        data = 'a b c\n1 2 3'
        msg = 'does not support'

        # specify C engine with unsupported options (raise)
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine='c',
                     sep=None, delim_whitespace=False)
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine='c', sep=r'\s')
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine='c', sep='\t', quotechar=chr(128))
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine='c', skipfooter=1)

        # specify C-unsupported options without python-unsupported options
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep=None, delim_whitespace=False)
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep=r'\s')
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep='\t', quotechar=chr(128))
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), skipfooter=1)

        text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
        msg = 'Error tokenizing data'

        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), sep='\\s+')
        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), engine='c', sep='\\s+')

        msg = "Only length-1 thousands markers supported"
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands=',,')
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands='')

        msg = "Only length-1 line terminators supported"
        data = 'a,b,c~~1,2,3~~4,5,6'
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), lineterminator='~~')

    def test_python_engine(self, python_engine):
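        # _python_unsupported is the parser module's own registry of read_csv
        # options that the Python engine rejects, so every entry in it should
        # raise below.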
        from pandas.io.parsers import _python_unsupported as py_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for default in py_unsupported:
            msg = ('The %r option is not supported '
                   'with the %r engine' % (default, python_engine))

            kwargs = {default: object()}
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine=python_engine, **kwargs)

    def test_python_engine_file_no_next(self, python_engine):
        # see gh-16530
        class NoNextBuffer(object):
            def __init__(self, csv_data):
                self.data = csv_data

            def __iter__(self):
                return self

            def read(self):
                return self.data

        data = "a\n1"
        msg = "The 'python' engine cannot iterate"

        with pytest.raises(ValueError, match=msg):
            read_csv(NoNextBuffer(data), engine=python_engine)


class TestDeprecatedFeatures(object):

    @pytest.mark.parametrize("engine", ["c", "python"])
    @pytest.mark.parametrize("kwargs", [{"tupleize_cols": True},
                                        {"tupleize_cols": False}])
    def test_deprecated_args(self, engine, kwargs):
        data = "1,2,3"
        arg, _ = list(kwargs.items())[0]

        with tm.assert_produces_warning(
                FutureWarning, check_stacklevel=False):
            read_csv(StringIO(data), engine=engine, **kwargs)
@@ -0,0 +1,534 @@
# -*- coding: utf-8 -*-

"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""

import numpy as np
import pytest

from pandas._libs.tslib import Timestamp
from pandas.compat import StringIO

from pandas import DataFrame, Index
import pandas.util.testing as tm

_msg_validate_usecols_arg = ("'usecols' must either be list-like "
                             "of all strings, all unicode, all "
                             "integers or a callable.")
_msg_validate_usecols_names = ("Usecols do not match columns, columns "
                               "expected but not found: {0}")


def test_raise_on_mixed_dtype_usecols(all_parsers):
    # See gh-12678
    data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
    usecols = [0, "b", 2]
    parser = all_parsers

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        parser.read_csv(StringIO(data), usecols=usecols)


@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
def test_usecols(all_parsers, usecols):
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), usecols=usecols)

    expected = DataFrame([[2, 3], [5, 6], [8, 9],
                          [11, 12]], columns=["b", "c"])
    tm.assert_frame_equal(result, expected)


def test_usecols_with_names(all_parsers):
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    names = ["foo", "bar"]
    result = parser.read_csv(StringIO(data), names=names,
                             usecols=[1, 2], header=0)

    expected = DataFrame([[2, 3], [5, 6], [8, 9],
                          [11, 12]], columns=names)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("names,usecols", [
    (["b", "c"], [1, 2]),
    (["a", "b", "c"], ["b", "c"])
])
def test_usecols_relative_to_names(all_parsers, names, usecols):
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), names=names,
                             header=None, usecols=usecols)

    expected = DataFrame([[2, 3], [5, 6], [8, 9],
                          [11, 12]], columns=["b", "c"])
    tm.assert_frame_equal(result, expected)


def test_usecols_relative_to_names2(all_parsers):
    # see gh-5766
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), names=["a", "b"],
                             header=None, usecols=[0, 1])

    expected = DataFrame([[1, 2], [4, 5], [7, 8],
                          [10, 11]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)


def test_usecols_name_length_conflict(all_parsers):
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    msg = ("Number of passed names did not "
           "match number of header fields in the file"
           if parser.engine == "python" else
           "Passed header names mismatches usecols")

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), names=["a", "b"],
                        header=None, usecols=[1])


def test_usecols_single_string(all_parsers):
    # see gh-20558
    parser = all_parsers
    data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000"""

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        parser.read_csv(StringIO(data), usecols="foo")


@pytest.mark.parametrize("data", ["a,b,c,d\n1,2,3,4\n5,6,7,8",
                                  "a,b,c,d\n1,2,3,4,\n5,6,7,8,"])
def test_usecols_index_col_false(all_parsers, data):
    # see gh-9082
    parser = all_parsers
    usecols = ["a", "c", "d"]
    expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})

    result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_col", ["b", 0])
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
    # see gh-4201: test that index_col as integer reflects usecols
    parser = all_parsers
    data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
    expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))

    result = parser.read_csv(StringIO(data), usecols=usecols,
                             index_col=index_col)
    tm.assert_frame_equal(result, expected)


def test_usecols_index_col_conflict2(all_parsers):
    # see gh-4201: test that index_col as integer reflects usecols
    parser = all_parsers
    data = "a,b,c,d\nA,a,1,one\nB,b,2,two"

    expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
    expected = expected.set_index(["b", "c"])

    result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"],
                             index_col=["b", "c"])
    tm.assert_frame_equal(result, expected)


def test_usecols_implicit_index_col(all_parsers):
    # see gh-2654
    parser = all_parsers
    data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"

    result = parser.read_csv(StringIO(data), usecols=["a", "b"])
    expected = DataFrame({"a": ["apple", "orange"],
                          "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)


def test_usecols_regex_sep(all_parsers):
    # see gh-2733
    parser = all_parsers
    data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
    result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))

    expected = DataFrame({"a": ["apple", "orange"],
                          "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)


def test_usecols_with_whitespace(all_parsers):
    parser = all_parsers
    data = "a b c\n4 apple bat 5.7\n8 orange cow 10"

    result = parser.read_csv(StringIO(data), delim_whitespace=True,
                             usecols=("a", "b"))
    expected = DataFrame({"a": ["apple", "orange"],
                          "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols,expected", [
    # Column selection by index.
    ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]],
                       columns=["2", "0"])),

    # Column selection by name.
    (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]],
                           columns=["0", "1"])),
])
def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
    parser = all_parsers
    data = """2,0,1
1000,2000,3000
4000,5000,6000"""

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
    # see gh-9755
    data = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
    parser = all_parsers
    parse_dates = [[1, 2]]

    cols = {
        "a": [0, 0],
        "c_d": [
            Timestamp("2014-01-01 09:00:00"),
            Timestamp("2014-01-02 10:00:00")
        ]
    }
    expected = DataFrame(cols, columns=["c_d", "a"])
    result = parser.read_csv(StringIO(data), usecols=usecols,
                             parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)


def test_usecols_with_parse_dates2(all_parsers):
    # see gh-13604
    parser = all_parsers
    data = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65"""

    names = ["date", "values"]
    usecols = names[:]
    parse_dates = [0]

    index = Index([Timestamp("2008-02-07 09:40"),
                   Timestamp("2008-02-07 09:50"),
                   Timestamp("2008-02-07 10:00")],
                  name="date")
    cols = {"values": [1032.43, 1042.54, 1051.65]}
    expected = DataFrame(cols, index=index)

    result = parser.read_csv(StringIO(data), parse_dates=parse_dates,
                             index_col=0, usecols=usecols,
                             header=None, names=names)
    tm.assert_frame_equal(result, expected)


def test_usecols_with_parse_dates3(all_parsers):
    # see gh-14792
    parser = all_parsers
    data = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""

    usecols = list("abcdefghij")
    parse_dates = [0]

    cols = {"a": Timestamp("2016-09-21"),
            "b": [1], "c": [1], "d": [2],
            "e": [3], "f": [4], "g": [5],
            "h": [6], "i": [7], "j": [8]}
    expected = DataFrame(cols, columns=usecols)

    result = parser.read_csv(StringIO(data), usecols=usecols,
                             parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)


def test_usecols_with_parse_dates4(all_parsers):
    data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
    usecols = list("abcdefghij")
    parse_dates = [[0, 1]]
    parser = all_parsers

    cols = {"a_b": "2016/09/21 1",
            "c": [1], "d": [2], "e": [3], "f": [4],
            "g": [5], "h": [6], "i": [7], "j": [8]}
    expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))

    result = parser.read_csv(StringIO(data), usecols=usecols,
                             parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
@pytest.mark.parametrize("names", [
    list("abcde"),  # Names span all columns in original data.
    list("acd"),  # Names span only the selected columns.
])
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
    # see gh-9755
    s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
    parse_dates = [[1, 2]]
    parser = all_parsers

    cols = {
        "a": [0, 0],
        "c_d": [
            Timestamp("2014-01-01 09:00:00"),
            Timestamp("2014-01-02 10:00:00")
        ]
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    result = parser.read_csv(StringIO(s), names=names,
                             parse_dates=parse_dates,
                             usecols=usecols)
    tm.assert_frame_equal(result, expected)


def test_usecols_with_unicode_strings(all_parsers):
    # see gh-13219
    data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "AAA": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002
        },
        "BBB": {0: 8, 1: 2, 2: 7}
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=[u"AAA", u"BBB"])
    tm.assert_frame_equal(result, expected)


def test_usecols_with_single_byte_unicode_strings(all_parsers):
    # see gh-13219
    data = """A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "A": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002
        },
        "B": {0: 8, 1: 2, 2: 7}
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=[u"A", u"B"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [[u"AAA", b"BBB"], [b"AAA", u"BBB"]])
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
    data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        parser.read_csv(StringIO(data), usecols=usecols)


@pytest.mark.parametrize("usecols", [
    ["あああ", "いい"],
    [u"あああ", u"いい"]
])
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
    data = """あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "あああ": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002
        },
        "いい": {0: 8, 1: 2, 2: 7}
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)


def test_empty_usecols(all_parsers):
    data = "a,b,c\n1,2,3\n4,5,6"
    expected = DataFrame()
    parser = all_parsers

    result = parser.read_csv(StringIO(data), usecols=set())
    tm.assert_frame_equal(result, expected)


def test_np_array_usecols(all_parsers):
    # see gh-12546
    parser = all_parsers
    data = "a,b,c\n1,2,3"
    usecols = np.array(["a", "b"])

    expected = DataFrame([[1, 2]], columns=usecols)
    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols,expected", [
    (lambda x: x.upper() in ["AAA", "BBB", "DDD"],
     DataFrame({
         "AaA": {
             0: 0.056674972999999997,
             1: 2.6132309819999997,
             2: 3.5689350380000002
         },
         "bBb": {0: 8, 1: 2, 2: 7},
         "ddd": {0: "a", 1: "b", 2: "a"}
     })),
    (lambda x: False, DataFrame()),
])
def test_callable_usecols(all_parsers, usecols, expected):
    # see gh-14154
    data = """AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
def test_incomplete_first_row(all_parsers, usecols):
    # see gh-6710
    data = "1,2\n1,2,3"
    parser = all_parsers
    names = ["a", "b", "c"]
    expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})

    result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,usecols,kwargs,expected", [
    # see gh-8985
    ("19,29,39\n" * 2 + "10,20,30,40", [0, 1, 2],
     dict(header=None), DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]])),

    # see gh-9549
    (("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n"
      "1,2,3,,,1,\n1,2,3\n5,6,7"), ["A", "B", "C"],
     dict(), DataFrame({"A": [1, 3, 1, 1, 1, 5],
                        "B": [2, 4, 2, 2, 2, 6],
                        "C": [3, 5, 4, 3, 3, 7]})),
])
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
    # see gh-8985
    parser = all_parsers
    result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
    tm.assert_frame_equal(result, expected)


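# The expected failure messages below are treated as regular expressions by
# pytest.raises, which is why the missing-column lists are regex-escaped.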
@pytest.mark.parametrize("usecols,kwargs,expected,msg", [
|
||||
(["a", "b", "c", "d"], dict(),
|
||||
DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), None),
|
||||
(["a", "b", "c", "f"], dict(), None,
|
||||
_msg_validate_usecols_names.format(r"\['f'\]")),
|
||||
(["a", "b", "f"], dict(), None,
|
||||
_msg_validate_usecols_names.format(r"\['f'\]")),
|
||||
(["a", "b", "f", "g"], dict(), None,
|
||||
_msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]")),
|
||||
|
||||
# see gh-14671
|
||||
(None, dict(header=0, names=["A", "B", "C", "D"]),
|
||||
DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7],
|
||||
"D": [4, 8]}), None),
|
||||
(["A", "B", "C", "f"], dict(header=0, names=["A", "B", "C", "D"]),
|
||||
None, _msg_validate_usecols_names.format(r"\['f'\]")),
|
||||
(["A", "B", "f"], dict(names=["A", "B", "C", "D"]),
|
||||
None, _msg_validate_usecols_names.format(r"\['f'\]")),
|
||||
])
|
||||
def test_raises_on_usecols_names_mismatch(all_parsers, usecols,
|
||||
kwargs, expected, msg):
|
||||
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
|
||||
kwargs.update(usecols=usecols)
|
||||
parser = all_parsers
|
||||
|
||||
if expected is None:
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), **kwargs)
|
||||
else:
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
reason="see gh-16469: works on the C engine but not the Python engine",
|
||||
strict=False)
|
||||
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
|
||||
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
|
||||
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
|
||||
names = ["A", "B", "C", "D"]
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=0,
|
||||
names=names, usecols=usecols)
|
||||
expected = DataFrame({"A": [1, 5], "C": [3, 7]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -0,0 +1,25 @@
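# Tests for the top-level read_sas entry point: format inference from
# buffers and from paths without a recognizable extension.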
import pytest

from pandas.compat import StringIO

from pandas import read_sas
import pandas.util.testing as tm


class TestSas(object):

    def test_sas_buffer_format(self):
        # see gh-14947
        b = StringIO("")

        msg = ("If this is a buffer object rather than a string "
               "name, you must specify a format string")
        with pytest.raises(ValueError, match=msg):
            read_sas(b)

    def test_sas_read_no_format_or_extension(self):
        # see gh-24548
        msg = "unable to infer format of SAS file"
        with tm.ensure_clean('test_file_no_extension') as path:
            with pytest.raises(ValueError, match=msg):
                read_sas(path)
@@ -0,0 +1,227 @@
import io
import os

import numpy as np
import pytest

from pandas.compat import PY2
from pandas.errors import EmptyDataError
import pandas.util._test_decorators as td

import pandas as pd
import pandas.util.testing as tm


# https://github.com/cython/cython/issues/1720
@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning")
class TestSAS7BDAT(object):

    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        self.dirpath = datapath("io", "sas", "data")
        self.data = []
        self.test_ix = [list(range(1, 16)), [16]]
        for j in 1, 2:
            fname = os.path.join(
                self.dirpath, "test_sas7bdat_{j}.csv".format(j=j))
            df = pd.read_csv(fname)
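            # SAS stores dates as day counts from the 1960-01-01 epoch, so
            # the integer day columns in the reference CSVs are converted
            # to datetimes before comparison.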
            epoch = pd.datetime(1960, 1, 1)
            t1 = pd.to_timedelta(df["Column4"], unit='d')
            df["Column4"] = epoch + t1
            t2 = pd.to_timedelta(df["Column12"], unit='d')
            df["Column12"] = epoch + t2
            for k in range(df.shape[1]):
                col = df.iloc[:, k]
                if col.dtype == np.int64:
                    df.iloc[:, k] = df.iloc[:, k].astype(np.float64)
                elif col.dtype == np.dtype('O'):
                    if PY2:
                        f = lambda x: (x.decode('utf-8') if
                                       isinstance(x, str) else x)
                        df.iloc[:, k] = df.iloc[:, k].apply(f)
            self.data.append(df)

    def test_from_file(self):
        for j in 0, 1:
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = os.path.join(
                    self.dirpath, "test{k}.sas7bdat".format(k=k))
                df = pd.read_sas(fname, encoding='utf-8')
                tm.assert_frame_equal(df, df0)

    def test_from_buffer(self):
        for j in 0, 1:
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = os.path.join(
                    self.dirpath, "test{k}.sas7bdat".format(k=k))
                with open(fname, 'rb') as f:
                    byts = f.read()
                buf = io.BytesIO(byts)
                rdr = pd.read_sas(buf, format="sas7bdat",
                                  iterator=True, encoding='utf-8')
                df = rdr.read()
                tm.assert_frame_equal(df, df0, check_exact=False)
                rdr.close()

    def test_from_iterator(self):
        for j in 0, 1:
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = os.path.join(
                    self.dirpath, "test{k}.sas7bdat".format(k=k))
                rdr = pd.read_sas(fname, iterator=True, encoding='utf-8')
                df = rdr.read(2)
                tm.assert_frame_equal(df, df0.iloc[0:2, :])
                df = rdr.read(3)
                tm.assert_frame_equal(df, df0.iloc[2:5, :])
                rdr.close()

    @td.skip_if_no('pathlib')
    def test_path_pathlib(self):
        from pathlib import Path
        for j in 0, 1:
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = Path(os.path.join(
                    self.dirpath, "test{k}.sas7bdat".format(k=k)))
                df = pd.read_sas(fname, encoding='utf-8')
                tm.assert_frame_equal(df, df0)

    @td.skip_if_no('py.path')
    def test_path_localpath(self):
        from py.path import local as LocalPath
        for j in 0, 1:
            df0 = self.data[j]
            for k in self.test_ix[j]:
                fname = LocalPath(os.path.join(
                    self.dirpath, "test{k}.sas7bdat".format(k=k)))
                df = pd.read_sas(fname, encoding='utf-8')
                tm.assert_frame_equal(df, df0)

    def test_iterator_loop(self):
        # github #13654
        for j in 0, 1:
            for k in self.test_ix[j]:
                for chunksize in 3, 5, 10, 11:
                    fname = os.path.join(
                        self.dirpath, "test{k}.sas7bdat".format(k=k))
                    # pass the loop's chunksize instead of a hard-coded 10,
                    # which left the loop variable unused
                    rdr = pd.read_sas(fname, chunksize=chunksize,
                                      encoding='utf-8')
                    y = 0
                    for x in rdr:
                        y += x.shape[0]
                    assert y == rdr.row_count
                    rdr.close()

    def test_iterator_read_too_much(self):
        # github #14734
        k = self.test_ix[0][0]
        fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))
        rdr = pd.read_sas(fname, format="sas7bdat",
                          iterator=True, encoding='utf-8')
        d1 = rdr.read(rdr.row_count + 20)
        rdr.close()

        rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
        d2 = rdr.read(rdr.row_count + 20)
        tm.assert_frame_equal(d1, d2)
        rdr.close()


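# Column headers in the raw file are bytes; reading with and without an
# encoding should agree once those bytes are explicitly decoded.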
def test_encoding_options(datapath):
    fname = datapath("io", "sas", "data", "test1.sas7bdat")
    df1 = pd.read_sas(fname)
    df2 = pd.read_sas(fname, encoding='utf-8')
    for col in df1.columns:
        try:
            df1[col] = df1[col].str.decode('utf-8')
        except AttributeError:
            pass
    tm.assert_frame_equal(df1, df2)

    from pandas.io.sas.sas7bdat import SAS7BDATReader
    rdr = SAS7BDATReader(fname, convert_header_text=False)
    df3 = rdr.read()
    rdr.close()
    for x, y in zip(df1.columns, df3.columns):
        assert x == y.decode()


def test_productsales(datapath):
    fname = datapath("io", "sas", "data", "productsales.sas7bdat")
    df = pd.read_sas(fname, encoding='utf-8')
    fname = datapath("io", "sas", "data", "productsales.csv")
    df0 = pd.read_csv(fname, parse_dates=['MONTH'])
    vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
    df0[vn] = df0[vn].astype(np.float64)
    tm.assert_frame_equal(df, df0)


def test_12659(datapath):
    fname = datapath("io", "sas", "data", "test_12659.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "test_12659.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0)


def test_airline(datapath):
    fname = datapath("io", "sas", "data", "airline.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "airline.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0, check_exact=False)


def test_date_time(datapath):
    # Support of different SAS date/datetime formats (PR #15871)
    fname = datapath("io", "sas", "data", "datetime.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "datetime.csv")
    df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime',
                                          'DateTimeHi', 'Taiw'])
    # GH 19732: Timestamps imported from sas will incur floating point errors
    df.iloc[:, 3] = df.iloc[:, 3].dt.round('us')
    tm.assert_frame_equal(df, df0)


def test_compact_numerical_values(datapath):
    # Regression test for #21616
    fname = datapath("io", "sas", "data", "cars.sas7bdat")
    df = pd.read_sas(fname, encoding='latin-1')
    # The two columns CYL and WGT in cars.sas7bdat have column
    # width < 8 and only contain integral values.
    # Test that pandas doesn't corrupt the numbers by adding
    # decimals.
    result = df['WGT']
    expected = df['WGT'].round()
    tm.assert_series_equal(result, expected, check_exact=True)
    result = df['CYL']
    expected = df['CYL'].round()
    tm.assert_series_equal(result, expected, check_exact=True)


def test_many_columns(datapath):
    # Test for looking for column information in more places (PR #22628)
    fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
    df = pd.read_sas(fname, encoding='latin-1')
    fname = datapath("io", "sas", "data", "many_columns.csv")
    df0 = pd.read_csv(fname, encoding='latin-1')
    tm.assert_frame_equal(df, df0)


def test_inconsistent_number_of_rows(datapath):
    # Regression test for issue #16615. (PR #22628)
    fname = datapath("io", "sas", "data", "load_log.sas7bdat")
    df = pd.read_sas(fname, encoding='latin-1')
    assert len(df) == 2097


def test_zero_variables(datapath):
    # Check if the SAS file has zero variables (PR #18184)
    fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
    with pytest.raises(EmptyDataError):
        pd.read_sas(fname)
@@ -0,0 +1,146 @@
import os

import numpy as np
import pytest

import pandas as pd
import pandas.util.testing as tm

from pandas.io.sas.sasreader import read_sas

# CSV versions of test xpt files were obtained using the R foreign library

# Numbers in a SAS xport file are always float64, so need to convert
# before making comparisons.


def numeric_as_float(data):
    for v in data.columns:
        if data[v].dtype is np.dtype('int64'):
            data[v] = data[v].astype(np.float64)


class TestXport(object):

    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        self.dirpath = datapath("io", "sas", "data")
        self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt")
        self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt")
        self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt")
        self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt")

    def test1_basic(self):
        # Tests with DEMO_G.xpt (all numeric file)

        # Compare to this
        data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
        numeric_as_float(data_csv)

        # Read full file
        data = read_sas(self.file01, format="xport")
        tm.assert_frame_equal(data, data_csv)
        num_rows = data.shape[0]

        # Test reading beyond end of file
        reader = read_sas(self.file01, format="xport", iterator=True)
        data = reader.read(num_rows + 100)
        assert data.shape[0] == num_rows
        reader.close()

        # Test incremental read with `read` method.
        reader = read_sas(self.file01, format="xport", iterator=True)
        data = reader.read(10)
        reader.close()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])

        # Test incremental read with `get_chunk` method.
        reader = read_sas(self.file01, format="xport", chunksize=10)
        data = reader.get_chunk()
        reader.close()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])

        # Test read in loop
        m = 0
        reader = read_sas(self.file01, format="xport", chunksize=100)
        for x in reader:
            m += x.shape[0]
        reader.close()
        assert m == num_rows

        # Read full file with `read_sas` method
        data = read_sas(self.file01)
        tm.assert_frame_equal(data, data_csv)

    def test1_index(self):
        # Tests with DEMO_G.xpt using index (all numeric file)

        # Compare to this
        data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
        data_csv = data_csv.set_index("SEQN")
        numeric_as_float(data_csv)

        # Read full file
        data = read_sas(self.file01, index="SEQN", format="xport")
        tm.assert_frame_equal(data, data_csv, check_index_type=False)

        # Test incremental read with `read` method.
        reader = read_sas(self.file01, index="SEQN", format="xport",
                          iterator=True)
        data = reader.read(10)
        reader.close()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
                              check_index_type=False)

        # Test incremental read with `get_chunk` method.
        reader = read_sas(self.file01, index="SEQN", format="xport",
                          chunksize=10)
        data = reader.get_chunk()
        reader.close()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
                              check_index_type=False)

    def test1_incremental(self):
        # Test with DEMO_G.xpt, reading full file incrementally

        data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
        data_csv = data_csv.set_index("SEQN")
        numeric_as_float(data_csv)

        reader = read_sas(self.file01, index="SEQN", chunksize=1000)

        all_data = [x for x in reader]
        data = pd.concat(all_data, axis=0)

        tm.assert_frame_equal(data, data_csv, check_index_type=False)

    def test2(self):
        # Test with SSHSV1_A.xpt

        # Compare to this
        data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv"))
        numeric_as_float(data_csv)

        data = read_sas(self.file02)
        tm.assert_frame_equal(data, data_csv)

    def test_multiple_types(self):
        # Test with DRXFCD_G.xpt (contains text and numeric variables)

        # Compare to this
        data_csv = pd.read_csv(self.file03.replace(".xpt", ".csv"))

        data = read_sas(self.file03, encoding="utf-8")
        tm.assert_frame_equal(data, data_csv)

    def test_truncated_float_support(self):
        # Test with paxraw_d_short.xpt, a shortened version of:
        # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP
        # This file has truncated floats (5 bytes in this case).

        # GH 11713

        data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv"))

        data = read_sas(self.file04, format="xport")
        tm.assert_frame_equal(data.astype('int64'), data_csv)
@@ -0,0 +1,227 @@
# -*- coding: utf-8 -*-
from textwrap import dedent

import numpy as np
from numpy.random import randint
import pytest

from pandas.compat import PY2

import pandas as pd
from pandas import DataFrame, get_option, read_clipboard
from pandas.util import testing as tm
from pandas.util.testing import makeCustomDataframe as mkdf

from pandas.io.clipboard.exceptions import PyperclipException

try:
    DataFrame({'A': [1, 2]}).to_clipboard()
    _DEPS_INSTALLED = 1
except (PyperclipException, RuntimeError):
    _DEPS_INSTALLED = 0


def build_kwargs(sep, excel):
    kwargs = {}
    if excel != 'default':
        kwargs['excel'] = excel
    if sep != 'default':
        kwargs['sep'] = sep
    return kwargs


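# DataFrame fixture parametrized over the kinds of data that have
# historically been fragile in clipboard round-trips: embedded delimiters,
# non-ASCII text, frames longer than display.max_rows, and over-wide columns.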
@pytest.fixture(params=['delims', 'utf8', 'string', 'long', 'nonascii',
                        'colwidth', 'mixed', 'float', 'int'])
def df(request):
    data_type = request.param

    if data_type == 'delims':
        return pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'],
                             'b': ['hi\'j', 'k\'\'lm']})
    elif data_type == 'utf8':
        return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'],
                             'b': ['øπ∆˚¬', 'œ∑´®']})
    elif data_type == 'string':
        return mkdf(5, 3, c_idx_type='s', r_idx_type='i',
                    c_idx_names=[None], r_idx_names=[None])
    elif data_type == 'long':
        max_rows = get_option('display.max_rows')
        return mkdf(max_rows + 1, 3,
                    data_gen_f=lambda *args: randint(2),
                    c_idx_type='s', r_idx_type='i',
                    c_idx_names=[None], r_idx_names=[None])
    elif data_type == 'nonascii':
        return pd.DataFrame({'en': 'in English'.split(),
                             'es': 'en español'.split()})
    elif data_type == 'colwidth':
        _cw = get_option('display.max_colwidth') + 1
        return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw,
                    c_idx_type='s', r_idx_type='i',
                    c_idx_names=[None], r_idx_names=[None])
    elif data_type == 'mixed':
        return DataFrame({'a': np.arange(1.0, 6.0) + 0.01,
                          'b': np.arange(1, 6),
                          'c': list('abcde')})
    elif data_type == 'float':
        return mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01,
                    c_idx_type='s', r_idx_type='i',
                    c_idx_names=[None], r_idx_names=[None])
    elif data_type == 'int':
        return mkdf(5, 3, data_gen_f=lambda *args: randint(2),
                    c_idx_type='s', r_idx_type='i',
                    c_idx_names=[None], r_idx_names=[None])
    else:
        raise ValueError


@pytest.fixture
def mock_clipboard(monkeypatch, request):
    """Fixture mocking clipboard IO.

    This mocks pandas.io.clipboard.clipboard_get and
    pandas.io.clipboard.clipboard_set.

    This uses a local dict for storing data. The dictionary
    key used is the test ID, available with ``request.node.name``.

    This returns the local dictionary, for direct manipulation by
    tests.
    """

    # our local clipboard for tests
    _mock_data = {}

    def _mock_set(data):
        _mock_data[request.node.name] = data

    def _mock_get():
        return _mock_data[request.node.name]

    monkeypatch.setattr("pandas.io.clipboard.clipboard_set", _mock_set)
    monkeypatch.setattr("pandas.io.clipboard.clipboard_get", _mock_get)

    yield _mock_data


@pytest.mark.clipboard
def test_mock_clipboard(mock_clipboard):
    import pandas.io.clipboard
    pandas.io.clipboard.clipboard_set("abc")
    assert "abc" in set(mock_clipboard.values())
    result = pandas.io.clipboard.clipboard_get()
    assert result == "abc"


@pytest.mark.single
@pytest.mark.clipboard
@pytest.mark.skipif(not _DEPS_INSTALLED,
                    reason="clipboard primitives not installed")
@pytest.mark.usefixtures("mock_clipboard")
class TestClipboard(object):

    def check_round_trip_frame(self, data, excel=None, sep=None,
                               encoding=None):
        data.to_clipboard(excel=excel, sep=sep, encoding=encoding)
        result = read_clipboard(sep=sep or '\t', index_col=0,
                                encoding=encoding)
        tm.assert_frame_equal(data, result, check_dtype=False)

    # Test that default arguments copy as tab delimited
    def test_round_trip_frame(self, df):
        self.check_round_trip_frame(df)

    # Test that explicit delimiters are respected
    @pytest.mark.parametrize('sep', ['\t', ',', '|'])
    def test_round_trip_frame_sep(self, df, sep):
        self.check_round_trip_frame(df, sep=sep)

    # Test white space separator
    def test_round_trip_frame_string(self, df):
        df.to_clipboard(excel=False, sep=None)
        result = read_clipboard()
        assert df.to_string() == result.to_string()
        assert df.shape == result.shape

    # Two character separator is not supported in to_clipboard
    # Test that multi-character separators are not silently passed
    def test_excel_sep_warning(self, df):
        with tm.assert_produces_warning():
            df.to_clipboard(excel=True, sep=r'\t')

    # Separator is ignored when excel=False and should produce a warning
    def test_copy_delim_warning(self, df):
        with tm.assert_produces_warning():
            df.to_clipboard(excel=False, sep='\t')

    # Tests that the default behavior of to_clipboard is tab
    # delimited and excel="True"
    @pytest.mark.parametrize('sep', ['\t', None, 'default'])
    @pytest.mark.parametrize('excel', [True, None, 'default'])
    def test_clipboard_copy_tabs_default(self, sep, excel, df, request,
                                         mock_clipboard):
        kwargs = build_kwargs(sep, excel)
        df.to_clipboard(**kwargs)
        if PY2:
            # to_clipboard copies unicode, to_csv produces bytes. This is
            # expected behavior
            result = mock_clipboard[request.node.name].encode('utf-8')
            expected = df.to_csv(sep='\t')
            assert result == expected
        else:
            assert mock_clipboard[request.node.name] == df.to_csv(sep='\t')

    # Tests reading of white space separated tables
    @pytest.mark.parametrize('sep', [None, 'default'])
    @pytest.mark.parametrize('excel', [False])
    def test_clipboard_copy_strings(self, sep, excel, df):
        kwargs = build_kwargs(sep, excel)
        df.to_clipboard(**kwargs)
        result = read_clipboard(sep=r'\s+')
        assert result.to_string() == df.to_string()
        assert df.shape == result.shape

    def test_read_clipboard_infer_excel(self, request,
                                        mock_clipboard):
        # gh-19010: avoid warnings
        clip_kwargs = dict(engine="python")

        text = dedent("""
            John James Charlie Mingus
            1 2
            4 Harry Carney
            """.strip())
        mock_clipboard[request.node.name] = text
        df = pd.read_clipboard(**clip_kwargs)

        # excel data is parsed correctly
        assert df.iloc[1][1] == 'Harry Carney'

        # having diff tab counts doesn't trigger it
        text = dedent("""
            a\t b
            1 2
            3 4
            """.strip())
        mock_clipboard[request.node.name] = text
        res = pd.read_clipboard(**clip_kwargs)

        text = dedent("""
            a b
            1 2
            3 4
            """.strip())
        mock_clipboard[request.node.name] = text
        exp = pd.read_clipboard(**clip_kwargs)

        tm.assert_frame_equal(res, exp)

    def test_invalid_encoding(self, df):
        # test case for testing invalid encoding
        with pytest.raises(ValueError):
            df.to_clipboard(encoding='ascii')
        with pytest.raises(NotImplementedError):
            pd.read_clipboard(encoding='ascii')

    @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8'])
    def test_round_trip_valid_encodings(self, enc, df):
        self.check_round_trip_frame(df, encoding=enc)
@@ -0,0 +1,357 @@
"""
Tests for the pandas.io.common functionalities
"""
import mmap
import os

import pytest

from pandas.compat import FileNotFoundError, StringIO, is_platform_windows
import pandas.util._test_decorators as td

import pandas as pd
import pandas.util.testing as tm

import pandas.io.common as icom


class CustomFSPath(object):
    """For testing fspath on unknown objects"""
    def __init__(self, path):
        self.path = path

    def __fspath__(self):
        return self.path


# Functions that consume a string path and return a string or path-like object
path_types = [str, CustomFSPath]

try:
    from pathlib import Path
    path_types.append(Path)
except ImportError:
    pass

try:
    from py.path import local as LocalPath
    path_types.append(LocalPath)
except ImportError:
    pass

HERE = os.path.abspath(os.path.dirname(__file__))


# https://github.com/cython/cython/issues/1720
@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning")
class TestCommonIOCapabilities(object):
    data1 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    def test_expand_user(self):
        filename = '~/sometest'
        expanded_name = icom._expand_user(filename)

        assert expanded_name != filename
        assert os.path.isabs(expanded_name)
        assert os.path.expanduser(filename) == expanded_name

    def test_expand_user_normal_path(self):
        filename = '/somefolder/sometest'
        expanded_name = icom._expand_user(filename)

        assert expanded_name == filename
        assert os.path.expanduser(filename) == expanded_name

    @td.skip_if_no('pathlib')
    def test_stringify_path_pathlib(self):
        rel_path = icom._stringify_path(Path('.'))
        assert rel_path == '.'
        redundant_path = icom._stringify_path(Path('foo//bar'))
        assert redundant_path == os.path.join('foo', 'bar')

    @td.skip_if_no('py.path')
    def test_stringify_path_localpath(self):
        path = os.path.join('foo', 'bar')
        abs_path = os.path.abspath(path)
        lpath = LocalPath(path)
        assert icom._stringify_path(lpath) == abs_path

    def test_stringify_path_fspath(self):
        p = CustomFSPath('foo/bar.csv')
        result = icom._stringify_path(p)
        assert result == 'foo/bar.csv'

    @pytest.mark.parametrize('extension,expected', [
        ('', None),
        ('.gz', 'gzip'),
        ('.bz2', 'bz2'),
        ('.zip', 'zip'),
        ('.xz', 'xz'),
    ])
    @pytest.mark.parametrize('path_type', path_types)
    def test_infer_compression_from_path(self, extension, expected, path_type):
        path = path_type('foo/bar.csv' + extension)
        compression = icom._infer_compression(path, compression='infer')
        assert compression == expected

    def test_get_filepath_or_buffer_with_path(self):
        filename = '~/sometest'
        filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
            filename)
        assert filepath_or_buffer != filename
        assert os.path.isabs(filepath_or_buffer)
        assert os.path.expanduser(filename) == filepath_or_buffer
        assert not should_close

    def test_get_filepath_or_buffer_with_buffer(self):
        input_buffer = StringIO()
        filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
            input_buffer)
        assert filepath_or_buffer == input_buffer
        assert not should_close

    def test_iterator(self):
        reader = pd.read_csv(StringIO(self.data1), chunksize=1)
        result = pd.concat(reader, ignore_index=True)
        expected = pd.read_csv(StringIO(self.data1))
        tm.assert_frame_equal(result, expected)

        # GH12153
        it = pd.read_csv(StringIO(self.data1), chunksize=1)
        first = next(it)
        tm.assert_frame_equal(first, expected.iloc[[0]])
        tm.assert_frame_equal(pd.concat(it), expected.iloc[1:])

    @pytest.mark.parametrize('reader, module, error_class, fn_ext', [
        (pd.read_csv, 'os', FileNotFoundError, 'csv'),
        (pd.read_fwf, 'os', FileNotFoundError, 'txt'),
        (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'),
        (pd.read_feather, 'feather', Exception, 'feather'),
        (pd.read_hdf, 'tables', FileNotFoundError, 'h5'),
        (pd.read_stata, 'os', FileNotFoundError, 'dta'),
        (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'),
        (pd.read_json, 'os', ValueError, 'json'),
        (pd.read_msgpack, 'os', ValueError, 'mp'),
        (pd.read_pickle, 'os', FileNotFoundError, 'pickle'),
    ])
    def test_read_non_existant(self, reader, module, error_class, fn_ext):
        pytest.importorskip(module)

        path = os.path.join(HERE, 'data', 'does_not_exist.' + fn_ext)
        msg1 = (r"File (b')?.+does_not_exist\.{}'? does not exist"
                .format(fn_ext))
        msg2 = (r"\[Errno 2\] No such file or directory: '.+does_not_exist"
                r"\.{}'").format(fn_ext)
        msg3 = "Expected object or value"
        msg4 = "path_or_buf needs to be a string file path or file-like"
        msg5 = (r"\[Errno 2\] File .+does_not_exist\.{} does not exist:"
                r" '.+does_not_exist\.{}'").format(fn_ext, fn_ext)
        with pytest.raises(error_class, match=r"({}|{}|{}|{}|{})".format(
                msg1, msg2, msg3, msg4, msg5)):
            reader(path)

    @pytest.mark.parametrize('reader, module, error_class, fn_ext', [
        (pd.read_csv, 'os', FileNotFoundError, 'csv'),
        (pd.read_fwf, 'os', FileNotFoundError, 'txt'),
        (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'),
        (pd.read_feather, 'feather', Exception, 'feather'),
        (pd.read_hdf, 'tables', FileNotFoundError, 'h5'),
        (pd.read_stata, 'os', FileNotFoundError, 'dta'),
        (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'),
        (pd.read_json, 'os', ValueError, 'json'),
        (pd.read_msgpack, 'os', ValueError, 'mp'),
        (pd.read_pickle, 'os', FileNotFoundError, 'pickle'),
    ])
    def test_read_expands_user_home_dir(self, reader, module,
                                        error_class, fn_ext, monkeypatch):
        pytest.importorskip(module)

        path = os.path.join('~', 'does_not_exist.' + fn_ext)
        monkeypatch.setattr(icom, '_expand_user',
                            lambda x: os.path.join('foo', x))

        msg1 = (r"File (b')?.+does_not_exist\.{}'? does not exist"
                .format(fn_ext))
        msg2 = (r"\[Errno 2\] No such file or directory:"
                r" '.+does_not_exist\.{}'").format(fn_ext)
        msg3 = "Unexpected character found when decoding 'false'"
        msg4 = "path_or_buf needs to be a string file path or file-like"
        msg5 = (r"\[Errno 2\] File .+does_not_exist\.{} does not exist:"
                r" '.+does_not_exist\.{}'").format(fn_ext, fn_ext)

        with pytest.raises(error_class, match=r"({}|{}|{}|{}|{})".format(
                msg1, msg2, msg3, msg4, msg5)):
            reader(path)

    def test_read_non_existant_read_table(self):
        path = os.path.join(HERE, 'data', 'does_not_exist.' + 'csv')
        msg1 = r"File b'.+does_not_exist\.csv' does not exist"
        msg2 = (r"\[Errno 2\] File .+does_not_exist\.csv does not exist:"
                r" '.+does_not_exist\.csv'")
        with pytest.raises(FileNotFoundError, match=r"({}|{})".format(
                msg1, msg2)):
            with tm.assert_produces_warning(FutureWarning):
                pd.read_table(path)

    @pytest.mark.parametrize('reader, module, path', [
        (pd.read_csv, 'os', ('io', 'data', 'iris.csv')),
        (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')),
        (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')),
        (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')),
        (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf',
                                 'datetimetz_object.h5')),
        (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')),
        (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')),
        (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')),
        (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')),
        (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')),
    ])
    def test_read_fspath_all(self, reader, module, path, datapath):
        pytest.importorskip(module)
        path = datapath(*path)

        mypath = CustomFSPath(path)
        result = reader(mypath)
        expected = reader(path)

        if path.endswith('.pickle'):
            # categorical
            tm.assert_categorical_equal(result, expected)
        else:
            tm.assert_frame_equal(result, expected)

    def test_read_fspath_all_read_table(self, datapath):
        path = datapath('io', 'data', 'iris.csv')

        mypath = CustomFSPath(path)
        with tm.assert_produces_warning(FutureWarning):
            result = pd.read_table(mypath)
        with tm.assert_produces_warning(FutureWarning):
            expected = pd.read_table(path)

        if path.endswith('.pickle'):
            # categorical
            tm.assert_categorical_equal(result, expected)
        else:
            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize('writer_name, writer_kwargs, module', [
        ('to_csv', {}, 'os'),
        ('to_excel', {'engine': 'xlwt'}, 'xlwt'),
        ('to_feather', {}, 'feather'),
        ('to_html', {}, 'os'),
        ('to_json', {}, 'os'),
        ('to_latex', {}, 'os'),
        ('to_msgpack', {}, 'os'),
        ('to_pickle', {}, 'os'),
        ('to_stata', {}, 'os'),
    ])
    def test_write_fspath_all(self, writer_name, writer_kwargs, module):
        p1 = tm.ensure_clean('string')
        p2 = tm.ensure_clean('fspath')
        df = pd.DataFrame({"A": [1, 2]})

        with p1 as string, p2 as fspath:
            pytest.importorskip(module)
            mypath = CustomFSPath(fspath)
            writer = getattr(df, writer_name)

            writer(string, **writer_kwargs)
            with open(string, 'rb') as f:
                expected = f.read()

            writer(mypath, **writer_kwargs)
            with open(fspath, 'rb') as f:
                result = f.read()

            assert result == expected

    def test_write_fspath_hdf5(self):
        # Same test as write_fspath_all, except HDF5 files aren't
        # necessarily byte-for-byte identical for a given dataframe, so we'll
        # have to read and compare equality
        pytest.importorskip('tables')

        df = pd.DataFrame({"A": [1, 2]})
        p1 = tm.ensure_clean('string')
        p2 = tm.ensure_clean('fspath')

        with p1 as string, p2 as fspath:
            mypath = CustomFSPath(fspath)
            df.to_hdf(mypath, key='bar')
            df.to_hdf(string, key='bar')

            result = pd.read_hdf(fspath, key='bar')
            expected = pd.read_hdf(string, key='bar')

            tm.assert_frame_equal(result, expected)


@pytest.fixture
def mmap_file(datapath):
    return datapath('io', 'data', 'test_mmap.csv')


class TestMMapWrapper(object):

    def test_constructor_bad_file(self, mmap_file):
        non_file = StringIO('I am not a file')
        non_file.fileno = lambda: -1

        # the error raised is different on Windows
        if is_platform_windows():
            msg = "The parameter is incorrect"
            err = OSError
        else:
            # ``match`` is a regex, so the brackets in the errno prefix
            # must be escaped to match literally
            msg = r"\[Errno 22\]"
            err = mmap.error

        with pytest.raises(err, match=msg):
            icom.MMapWrapper(non_file)

        target = open(mmap_file, 'r')
        target.close()

        msg = "I/O operation on closed file"
        with pytest.raises(ValueError, match=msg):
            icom.MMapWrapper(target)

    def test_get_attr(self, mmap_file):
        with open(mmap_file, 'r') as target:
            wrapper = icom.MMapWrapper(target)

        attrs = dir(wrapper.mmap)
        attrs = [attr for attr in attrs
                 if not attr.startswith('__')]
        attrs.append('__next__')

        for attr in attrs:
            assert hasattr(wrapper, attr)

        assert not hasattr(wrapper, 'foo')

    def test_next(self, mmap_file):
        with open(mmap_file, 'r') as target:
            wrapper = icom.MMapWrapper(target)
            lines = target.readlines()

        for line in lines:
            next_line = next(wrapper)
            assert next_line.strip() == line.strip()

        with pytest.raises(StopIteration, match=r'^$'):
            next(wrapper)

    def test_unknown_engine(self):
        with tm.ensure_clean() as path:
            df = tm.makeDataFrame()
            df.to_csv(path)
            with pytest.raises(ValueError, match='Unknown engine'):
                pd.read_csv(path, engine='pyt')
@@ -0,0 +1,116 @@
import contextlib
import os
import warnings

import pytest

import pandas as pd
import pandas.util.testing as tm

import pandas.io.common as icom


@contextlib.contextmanager
def catch_to_csv_depr():
    # Catching warnings because Series.to_csv has
    # been deprecated. Remove this context when
    # Series.to_csv has been aligned.

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("ignore", FutureWarning)
        yield

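# Illustrative use of catch_to_csv_depr (a sketch; 'out.csv' is a
# hypothetical path): the context swallows only the FutureWarning that
# Series.to_csv currently emits.
#
#     s = pd.Series([1, 2, 3])
#     with catch_to_csv_depr():
#         s.to_csv('out.csv')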
@pytest.mark.parametrize('obj', [
    pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567],
                        [12.32112, 123123.2, 321321.2]],
                 columns=['X', 'Y', 'Z']),
    pd.Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
@pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv'])
def test_compression_size(obj, method, compression_only):
    with tm.ensure_clean() as path:
        with catch_to_csv_depr():
            getattr(obj, method)(path, compression=compression_only)
            compressed_size = os.path.getsize(path)
            getattr(obj, method)(path, compression=None)
            uncompressed_size = os.path.getsize(path)
            assert uncompressed_size > compressed_size


@pytest.mark.parametrize('obj', [
    pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567],
                        [12.32112, 123123.2, 321321.2]],
                 columns=['X', 'Y', 'Z']),
    pd.Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
@pytest.mark.parametrize('method', ['to_csv', 'to_json'])
def test_compression_size_fh(obj, method, compression_only):
    with tm.ensure_clean() as path:
        f, handles = icom._get_handle(path, 'w', compression=compression_only)
        with catch_to_csv_depr():
            with f:
                getattr(obj, method)(f)
                assert not f.closed
            assert f.closed
        compressed_size = os.path.getsize(path)
    with tm.ensure_clean() as path:
        f, handles = icom._get_handle(path, 'w', compression=None)
        with catch_to_csv_depr():
            with f:
                getattr(obj, method)(f)
                assert not f.closed
            assert f.closed
        uncompressed_size = os.path.getsize(path)
    assert uncompressed_size > compressed_size


@pytest.mark.parametrize('write_method, write_kwargs, read_method', [
    ('to_csv', {'index': False}, pd.read_csv),
    ('to_json', {}, pd.read_json),
    ('to_pickle', {}, pd.read_pickle),
])
def test_dataframe_compression_defaults_to_infer(
        write_method, write_kwargs, read_method, compression_only):
    # GH22004
    input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=['X', 'Y', 'Z'])
    extension = icom._compression_to_extension[compression_only]
    with tm.ensure_clean('compressed' + extension) as path:
        getattr(input, write_method)(path, **write_kwargs)
        output = read_method(path, compression=compression_only)
    tm.assert_frame_equal(output, input)

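# icom._compression_to_extension is an internal mapping from a compression
# protocol name to its conventional file extension (assumed here to be of the
# form {'gzip': '.gz', 'bz2': '.bz2', 'zip': '.zip', 'xz': '.xz'}), so the
# tests above and below exercise the default compression='infer' behaviour
# via the file name alone.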
@pytest.mark.parametrize('write_method,write_kwargs,read_method,read_kwargs', [
    ('to_csv', {'index': False, 'header': True},
     pd.read_csv, {'squeeze': True}),
    ('to_json', {}, pd.read_json, {'typ': 'series'}),
    ('to_pickle', {}, pd.read_pickle, {}),
])
def test_series_compression_defaults_to_infer(
        write_method, write_kwargs, read_method, read_kwargs,
        compression_only):
    # GH22004
    input = pd.Series([0, 5, -2, 10], name='X')
    extension = icom._compression_to_extension[compression_only]
    with tm.ensure_clean('compressed' + extension) as path:
        getattr(input, write_method)(path, **write_kwargs)
        output = read_method(path, compression=compression_only, **read_kwargs)
    tm.assert_series_equal(output, input, check_names=False)


def test_compression_warning(compression_only):
    # Assert that passing a file object to to_csv while explicitly specifying a
    # compression protocol triggers a RuntimeWarning, as per GH21227.
    # Note that pytest has an issue that causes assert_produces_warning to fail
    # in Python 2 if the warning has occurred in previous tests
    # (see https://git.io/fNEBm & https://git.io/fNEBC). Hence, should this
    # test fail in just Python 2 builds, it likely indicates that other tests
    # are producing RuntimeWarnings, thereby triggering the pytest bug.
    df = pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567],
                             [12.32112, 123123.2, 321321.2]],
                      columns=['X', 'Y', 'Z'])
    with tm.ensure_clean() as path:
        f, handles = icom._get_handle(path, 'w', compression=compression_only)
        with tm.assert_produces_warning(RuntimeWarning,
                                        check_stacklevel=False):
            with f:
                df.to_csv(f, compression=compression_only)
@@ -0,0 +1,43 @@
from datetime import datetime

import numpy as np

import pandas.util.testing as tm

import pandas.io.date_converters as conv


def test_parse_date_time():
    dates = np.array(['2007/1/3', '2008/2/4'], dtype=object)
    times = np.array(['05:07:09', '06:08:00'], dtype=object)
    expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
                         datetime(2008, 2, 4, 6, 8, 0)])

    result = conv.parse_date_time(dates, times)
    tm.assert_numpy_array_equal(result, expected)


def test_parse_date_fields():
    days = np.array([3, 4])
    months = np.array([1, 2])
    years = np.array([2007, 2008])
    result = conv.parse_date_fields(years, months, days)

    expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)])
    tm.assert_numpy_array_equal(result, expected)


def test_parse_all_fields():
    hours = np.array([5, 6])
    minutes = np.array([7, 8])
    seconds = np.array([9, 0])

    days = np.array([3, 4])
    years = np.array([2007, 2008])
    months = np.array([1, 2])

    result = conv.parse_all_fields(years, months, days,
                                   hours, minutes, seconds)
    expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
                         datetime(2008, 2, 4, 6, 8, 0)])
    tm.assert_numpy_array_equal(result, expected)
File diff suppressed because it is too large
@@ -0,0 +1,158 @@
""" test feather-format compat """
from distutils.version import LooseVersion

import numpy as np
import pytest

import pandas as pd
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, ensure_clean

from pandas.io.feather_format import read_feather, to_feather  # noqa:E402

pyarrow = pytest.importorskip('pyarrow')


pyarrow_version = LooseVersion(pyarrow.__version__)


@pytest.mark.single
class TestFeather(object):

    def check_error_on_write(self, df, exc):
        # check that we are raising the exception
        # on writing

        with pytest.raises(exc):
            with ensure_clean() as path:
                to_feather(df, path)

    def check_round_trip(self, df, expected=None, **kwargs):

        if expected is None:
            expected = df

        with ensure_clean() as path:
            to_feather(df, path)

            result = read_feather(path, **kwargs)
            assert_frame_equal(result, expected)

    def test_error(self):

        for obj in [pd.Series([1, 2, 3]), 1, 'foo', pd.Timestamp('20130101'),
                    np.array([1, 2, 3])]:
            self.check_error_on_write(obj, ValueError)

    def test_basic(self):

        df = pd.DataFrame({'string': list('abc'),
                           'int': list(range(1, 4)),
                           'uint': np.arange(3, 6).astype('u1'),
                           'float': np.arange(4.0, 7.0, dtype='float64'),
                           'float_with_null': [1., np.nan, 3],
                           'bool': [True, False, True],
                           'bool_with_null': [True, np.nan, False],
                           'cat': pd.Categorical(list('abc')),
                           'dt': pd.date_range('20130101', periods=3),
                           'dttz': pd.date_range('20130101', periods=3,
                                                 tz='US/Eastern'),
                           'dt_with_null': [pd.Timestamp('20130101'), pd.NaT,
                                            pd.Timestamp('20130103')],
                           'dtns': pd.date_range('20130101', periods=3,
                                                 freq='ns')})

        assert df.dttz.dtype.tz.zone == 'US/Eastern'
        self.check_round_trip(df)

    def test_duplicate_columns(self):

        # https://github.com/wesm/feather/issues/53
        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3),
                          columns=list('aaa')).copy()
        self.check_error_on_write(df, ValueError)

    def test_stringify_columns(self):

        df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy()
        self.check_error_on_write(df, ValueError)

    def test_read_columns(self):
        # GH 24025
        df = pd.DataFrame({'col1': list('abc'),
                           'col2': list(range(1, 4)),
                           'col3': list('xyz'),
                           'col4': list(range(4, 7))})
        columns = ['col1', 'col3']
        self.check_round_trip(df, expected=df[columns],
                              columns=columns)

    def test_unsupported_other(self):

        # period
        df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
        # Some versions raise ValueError, others raise ArrowInvalid.
        self.check_error_on_write(df, Exception)

    def test_rw_nthreads(self):
        df = pd.DataFrame({'A': np.arange(100000)})
        expected_warning = (
            "the 'nthreads' keyword is deprecated, "
            "use 'use_threads' instead"
        )
        # TODO: make the warning work with check_stacklevel=True
        with tm.assert_produces_warning(
                FutureWarning, check_stacklevel=False) as w:
            self.check_round_trip(df, nthreads=2)
        # we have an extra FutureWarning because of GH 23752
        assert any(expected_warning in str(x) for x in w)

        # TODO: make the warning work with check_stacklevel=True
        with tm.assert_produces_warning(
                FutureWarning, check_stacklevel=False) as w:
            self.check_round_trip(df, nthreads=1)
        # we have an extra FutureWarning because of GH 23752
        assert any(expected_warning in str(x) for x in w)

    def test_rw_use_threads(self):
        df = pd.DataFrame({'A': np.arange(100000)})
        self.check_round_trip(df, use_threads=True)
        self.check_round_trip(df, use_threads=False)

    def test_write_with_index(self):

        df = pd.DataFrame({'A': [1, 2, 3]})
        self.check_round_trip(df)

        # non-default index
        for index in [[2, 3, 4],
                      pd.date_range('20130101', periods=3),
                      list('abc'),
                      [1, 3, 4],
                      pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
                                                 ('b', 1)]),
                      ]:

            df.index = index
            self.check_error_on_write(df, ValueError)

        # index with meta-data
        df.index = [0, 1, 2]
        df.index.name = 'foo'
        self.check_error_on_write(df, ValueError)

        # column multi-index
        df.index = [0, 1, 2]
        df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
        self.check_error_on_write(df, ValueError)

    def test_path_pathlib(self):
        df = tm.makeDataFrame().reset_index()
        result = tm.round_trip_pathlib(df.to_feather, pd.read_feather)
        tm.assert_frame_equal(df, result)

    def test_path_localpath(self):
        df = tm.makeDataFrame().reset_index()
        result = tm.round_trip_localpath(df.to_feather, pd.read_feather)
        tm.assert_frame_equal(df, result)
@@ -0,0 +1,153 @@
from datetime import datetime
import os
import platform

import numpy as np
import pytest
import pytz

from pandas.compat import range

import pandas as pd
from pandas import DataFrame, compat
import pandas.util.testing as tm

api_exceptions = pytest.importorskip("google.api_core.exceptions")
bigquery = pytest.importorskip("google.cloud.bigquery")
service_account = pytest.importorskip("google.oauth2.service_account")
pandas_gbq = pytest.importorskip("pandas_gbq")

PROJECT_ID = None
PRIVATE_KEY_JSON_PATH = None
PRIVATE_KEY_JSON_CONTENTS = None

if compat.PY3:
    DATASET_ID = 'pydata_pandas_bq_testing_py3'
else:
    DATASET_ID = 'pydata_pandas_bq_testing_py2'

TABLE_ID = 'new_test'
DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID)

VERSION = platform.python_version()


def _skip_if_no_project_id():
    if not _get_project_id():
        pytest.skip(
            "Cannot run integration tests without a project id")


def _skip_if_no_private_key_path():
    if not _get_private_key_path():
        pytest.skip("Cannot run integration tests without a "
                    "private key json file path")


def _in_travis_environment():
    return 'TRAVIS_BUILD_DIR' in os.environ and \
           'GBQ_PROJECT_ID' in os.environ


def _get_project_id():
    if _in_travis_environment():
        return os.environ.get('GBQ_PROJECT_ID')
    return PROJECT_ID or os.environ.get('GBQ_PROJECT_ID')


def _get_private_key_path():
    if _in_travis_environment():
        return os.path.join(*[os.environ.get('TRAVIS_BUILD_DIR'), 'ci',
                              'travis_gbq.json'])

    private_key_path = PRIVATE_KEY_JSON_PATH
    if not private_key_path:
        private_key_path = os.environ.get('GBQ_GOOGLE_APPLICATION_CREDENTIALS')
    return private_key_path


def _get_credentials():
    private_key_path = _get_private_key_path()
    if private_key_path:
        return service_account.Credentials.from_service_account_file(
            private_key_path)

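# Note: when no private key path is configured, _get_credentials() falls
# through and returns None; bigquery.Client then falls back to the
# environment's default credentials (an assumption about the google-cloud
# client's behaviour, not something this diff asserts).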
def _get_client():
    project_id = _get_project_id()
    credentials = _get_credentials()
    return bigquery.Client(project=project_id, credentials=credentials)


def make_mixed_dataframe_v2(test_size):
    # create df to test for all BQ datatypes except RECORD
    bools = np.random.randint(2, size=(1, test_size)).astype(bool)
    flts = np.random.randn(1, test_size)
    ints = np.random.randint(1, 10, size=(1, test_size))
    strs = np.random.randint(1, 10, size=(1, test_size)).astype(str)
    times = [datetime.now(pytz.timezone('US/Arizona'))
             for t in range(test_size)]
    return DataFrame({'bools': bools[0],
                      'flts': flts[0],
                      'ints': ints[0],
                      'strs': strs[0],
                      'times': times},
                     index=range(test_size))

def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
    # Default dialect is changing to standard SQL. See:
    # https://github.com/pydata/pandas-gbq/issues/195

    def mock_read_gbq(*args, **kwargs):
        return DataFrame([[1.0]])

    monkeypatch.setattr(pandas_gbq, 'read_gbq', mock_read_gbq)
    with tm.assert_produces_warning(FutureWarning):
        pd.read_gbq("SELECT 1")


@pytest.mark.single
class TestToGBQIntegrationWithServiceAccountKeyPath(object):

    @classmethod
    def setup_class(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *BEFORE*
        # executing *ALL* tests described below.

        _skip_if_no_project_id()
        _skip_if_no_private_key_path()

        cls.client = _get_client()
        cls.dataset = cls.client.dataset(DATASET_ID + "1")
        try:
            # Clean-up previous test runs.
            cls.client.delete_dataset(cls.dataset, delete_contents=True)
        except api_exceptions.NotFound:
            pass  # It's OK if the dataset doesn't already exist.

        cls.client.create_dataset(bigquery.Dataset(cls.dataset))

    @classmethod
    def teardown_class(cls):
        # - GLOBAL CLASS FIXTURES -
        # put here any instruction you want to execute only *ONCE* *AFTER*
        # executing all tests.
        cls.client.delete_dataset(cls.dataset, delete_contents=True)

    def test_roundtrip(self):
        destination_table = DESTINATION_TABLE + "1"

        test_size = 20001
        df = make_mixed_dataframe_v2(test_size)

        df.to_gbq(destination_table, _get_project_id(), chunksize=None,
                  credentials=_get_credentials())

        result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                             .format(destination_table),
                             project_id=_get_project_id(),
                             credentials=_get_credentials(),
                             dialect="standard")
        assert result['num_rows'][0] == test_size
@@ -0,0 +1,72 @@
import numpy as np
import pytest

from pandas.compat import StringIO

from pandas import DataFrame, date_range, read_csv
from pandas.util import _test_decorators as td
from pandas.util.testing import assert_frame_equal

from pandas.io.common import is_gcs_url


def test_is_gcs_url():
    assert is_gcs_url("gcs://pandas/somethingelse.com")
    assert is_gcs_url("gs://pandas/somethingelse.com")
    assert not is_gcs_url("s3://pandas/somethingelse.com")


@td.skip_if_no('gcsfs')
def test_read_csv_gcs(monkeypatch):
    df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
                     'dt': date_range('2018-06-18', periods=2)})

    class MockGCSFileSystem():
        def open(*args):
            return StringIO(df1.to_csv(index=False))

    monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem)
    df2 = read_csv('gs://test/test.csv', parse_dates=['dt'])

    assert_frame_equal(df1, df2)


@td.skip_if_no('gcsfs')
def test_to_csv_gcs(monkeypatch):
    df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
                     'dt': date_range('2018-06-18', periods=2)})
    s = StringIO()

    class MockGCSFileSystem():
        def open(*args):
            return s

    monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem)
    df1.to_csv('gs://test/test.csv', index=True)
    df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)

    assert_frame_equal(df1, df2)


@td.skip_if_no('gcsfs')
def test_gcs_get_filepath_or_buffer(monkeypatch):
    df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'],
                     'dt': date_range('2018-06-18', periods=2)})

    def mock_get_filepath_or_buffer(*args, **kwargs):
        return (StringIO(df1.to_csv(index=False)),
                None, None, False)

    monkeypatch.setattr('pandas.io.gcs.get_filepath_or_buffer',
                        mock_get_filepath_or_buffer)
    df2 = read_csv('gs://test/test.csv', parse_dates=['dt'])

    assert_frame_equal(df1, df2)


@pytest.mark.skipif(td.safe_import('gcsfs'),
                    reason='Only check when gcsfs not installed')
def test_gcs_not_present_exception():
    with pytest.raises(ImportError) as e:
        read_csv('gs://test/test.csv')
    assert 'gcsfs library is required' in str(e.value)
File diff suppressed because it is too large
@@ -0,0 +1,954 @@
import datetime
from distutils.version import LooseVersion
import glob
import os
from warnings import catch_warnings

import numpy as np
import pytest

from pandas._libs.tslib import iNaT
from pandas.compat import PY3, u
from pandas.errors import PerformanceWarning

import pandas
from pandas import (
    Categorical, DataFrame, Index, Interval, MultiIndex, NaT, Panel, Period,
    Series, Timestamp, bdate_range, compat, date_range, period_range)
from pandas.tests.test_panel import assert_panel_equal
import pandas.util.testing as tm
from pandas.util.testing import (
    assert_categorical_equal, assert_frame_equal, assert_index_equal,
    assert_series_equal, ensure_clean)

from pandas.io.packers import read_msgpack, to_msgpack

nan = np.nan

try:
    import blosc  # NOQA
except ImportError:
    _BLOSC_INSTALLED = False
else:
    _BLOSC_INSTALLED = True

try:
    import zlib  # NOQA
except ImportError:
    _ZLIB_INSTALLED = False
else:
    _ZLIB_INSTALLED = True


@pytest.fixture(scope='module')
def current_packers_data():
    # our current version packers data
    from pandas.tests.io.generate_legacy_storage_files import (
        create_msgpack_data)
    return create_msgpack_data()


@pytest.fixture(scope='module')
def all_packers_data():
    # all of our current version packers data
    from pandas.tests.io.generate_legacy_storage_files import (
        create_data)
    return create_data()


def check_arbitrary(a, b):

    if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)):
        assert(len(a) == len(b))
        for a_, b_ in zip(a, b):
            check_arbitrary(a_, b_)
    elif isinstance(a, Panel):
        assert_panel_equal(a, b)
    elif isinstance(a, DataFrame):
        assert_frame_equal(a, b)
    elif isinstance(a, Series):
        assert_series_equal(a, b)
    elif isinstance(a, Index):
        assert_index_equal(a, b)
    elif isinstance(a, Categorical):
        # Temporary:
        # Categorical.categories is changed from str to bytes in PY3,
        # maybe the same as GH 13591
        if PY3 and b.categories.inferred_type == 'string':
            pass
        else:
            tm.assert_categorical_equal(a, b)
    elif a is NaT:
        assert b is NaT
    elif isinstance(a, Timestamp):
        assert a == b
        assert a.freq == b.freq
    else:
        assert(a == b)

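# check_arbitrary dispatches on type and recurses through containers; e.g.
# (illustrative) check_arbitrary((df, s), (df_rec, s_rec)) compares df with
# df_rec via assert_frame_equal and s with s_rec via assert_series_equal.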
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestPackers(object):

    def setup_method(self, method):
        self.path = '__%s__.msg' % tm.rands(10)

    def teardown_method(self, method):
        pass

    def encode_decode(self, x, compress=None, **kwargs):
        with ensure_clean(self.path) as p:
            to_msgpack(p, x, compress=compress, **kwargs)
            return read_msgpack(p, **kwargs)

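# encode_decode above is the round-trip helper used throughout this file: it
# writes any object with to_msgpack to a temp file and reads it back with
# read_msgpack, e.g. (illustrative):
#
#     x_rec = self.encode_decode(np.float64(0.5), compress='zlib')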
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestAPI(TestPackers):

    def test_string_io(self):

        df = DataFrame(np.random.randn(10, 2))
        s = df.to_msgpack(None)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        s = df.to_msgpack()
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        s = df.to_msgpack()
        result = read_msgpack(compat.BytesIO(s))
        tm.assert_frame_equal(result, df)

        s = to_msgpack(None, df)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        with ensure_clean(self.path) as p:

            s = df.to_msgpack()
            with open(p, 'wb') as fh:
                fh.write(s)
            result = read_msgpack(p)
            tm.assert_frame_equal(result, df)

    def test_path_pathlib(self):
        df = tm.makeDataFrame()
        result = tm.round_trip_pathlib(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_path_localpath(self):
        df = tm.makeDataFrame()
        result = tm.round_trip_localpath(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_iterator_with_string_io(self):

        dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)]
        s = to_msgpack(None, *dfs)
        for i, result in enumerate(read_msgpack(s, iterator=True)):
            tm.assert_frame_equal(result, dfs[i])

    def test_invalid_arg(self):
        # GH10369
        class A(object):

            def __init__(self):
                self.read = 0

        msg = (r"Invalid file path or buffer object type: <(class|type)"
               r" '{}'>")
        with pytest.raises(ValueError, match=msg.format('NoneType')):
            read_msgpack(path_or_buf=None)
        with pytest.raises(ValueError, match=msg.format('dict')):
            read_msgpack(path_or_buf={})
        with pytest.raises(ValueError, match=msg.format(r'.*\.A')):
            read_msgpack(path_or_buf=A())


class TestNumpy(TestPackers):

    def test_numpy_scalar_float(self):
        x = np.float32(np.random.rand())
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_numpy_scalar_complex(self):
        x = np.complex64(np.random.rand() + 1j * np.random.rand())
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_scalar_float(self):
        x = np.random.rand()
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_scalar_bool(self):
        x = np.bool_(1)
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

        x = np.bool_(0)
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_scalar_complex(self):
        x = np.random.rand() + 1j * np.random.rand()
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_list_numpy_float(self):
        x = [np.float32(np.random.rand()) for i in range(5)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)

    def test_list_numpy_float_complex(self):
        if not hasattr(np, 'complex128'):
            pytest.skip('numpy can not handle complex128')

        x = [np.float32(np.random.rand()) for i in range(5)] + \
            [np.complex128(np.random.rand() + 1j * np.random.rand())
             for i in range(5)]
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_list_float(self):
        x = [np.random.rand() for i in range(5)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)

    def test_list_float_complex(self):
        x = [np.random.rand() for i in range(5)] + \
            [(np.random.rand() + 1j * np.random.rand()) for i in range(5)]
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_dict_float(self):
        x = {'foo': 1.0, 'bar': 2.0}
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_dict_complex(self):
        x = {'foo': 1.0 + 1.0j, 'bar': 2.0 + 2.0j}
        x_rec = self.encode_decode(x)
        tm.assert_dict_equal(x, x_rec)

        for key in x:
            tm.assert_class_equal(x[key], x_rec[key], obj="complex value")

    def test_dict_numpy_float(self):
        x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)}
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_dict_numpy_complex(self):
        x = {'foo': np.complex128(1.0 + 1.0j),
             'bar': np.complex128(2.0 + 2.0j)}
        x_rec = self.encode_decode(x)
        tm.assert_dict_equal(x, x_rec)

        for key in x:
            tm.assert_class_equal(x[key], x_rec[key], obj="numpy complex128")

    def test_numpy_array_float(self):

        # run multiple times
        for n in range(10):
            x = np.random.rand(10)
            for dtype in ['float32', 'float64']:
                x = x.astype(dtype)
                x_rec = self.encode_decode(x)
                tm.assert_almost_equal(x, x_rec)

    def test_numpy_array_complex(self):
        x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
        x_rec = self.encode_decode(x)
        assert (all(map(lambda x, y: x == y, x, x_rec)) and
                x.dtype == x_rec.dtype)

    def test_list_mixed(self):
        x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo'), np.bool_(1)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)


class TestBasic(TestPackers):

    def test_timestamp(self):

        for i in [Timestamp(
                '20130101'), Timestamp('20130101', tz='US/Eastern'),
                Timestamp('201301010501')]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_nat(self):
        nat_rec = self.encode_decode(NaT)
        assert NaT is nat_rec

    def test_datetimes(self):

        for i in [datetime.datetime(2013, 1, 1),
                  datetime.datetime(2013, 1, 1, 5, 1),
                  datetime.date(2013, 1, 1),
                  np.datetime64(datetime.datetime(2013, 1, 5, 2, 15))]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_timedeltas(self):

        for i in [datetime.timedelta(days=1),
                  datetime.timedelta(days=1, seconds=10),
                  np.timedelta64(1000000)]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_periods(self):
        # 13463
        for i in [Period('2010-09', 'M'), Period('2014-Q1', 'Q')]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_intervals(self):
        # 19967
        for i in [Interval(0, 1), Interval(0, 1, 'left'),
                  Interval(10, 25., 'right')]:
            i_rec = self.encode_decode(i)
            assert i == i_rec


class TestIndex(TestPackers):

    def setup_method(self, method):
        super(TestIndex, self).setup_method(method)

        self.d = {
            'string': tm.makeStringIndex(100),
            'date': tm.makeDateIndex(100),
            'int': tm.makeIntIndex(100),
            'rng': tm.makeRangeIndex(100),
            'float': tm.makeFloatIndex(100),
            'empty': Index([]),
            'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])),
            'period': Index(period_range('2012-1-1', freq='M', periods=3)),
            'date2': Index(date_range('2013-01-1', periods=10)),
            'bdate': Index(bdate_range('2013-01-02', periods=10)),
            'cat': tm.makeCategoricalIndex(100),
            'interval': tm.makeIntervalIndex(100),
            'timedelta': tm.makeTimedeltaIndex(100, 'H')
        }

        self.mi = {
            'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'),
                                           ('foo', 'two'),
                                           ('qux', 'one'), ('qux', 'two')],
                                          names=['first', 'second']),
        }

    def test_basic_index(self):

        for s, i in self.d.items():
            i_rec = self.encode_decode(i)
            tm.assert_index_equal(i, i_rec)

        # datetime with no freq (GH5506)
        i = Index([Timestamp('20130101'), Timestamp('20130103')])
        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

        # datetime with timezone
        i = Index([Timestamp('20130101 9:00:00'), Timestamp(
            '20130103 11:00:00')]).tz_localize('US/Eastern')
        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

    def test_multi_index(self):

        for s, i in self.mi.items():
            i_rec = self.encode_decode(i)
            tm.assert_index_equal(i, i_rec)

    def test_unicode(self):
        i = tm.makeUnicodeIndex(100)

        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

    def test_categorical_index(self):
        # GH15487
        df = DataFrame(np.random.randn(10, 2))
        df = df.astype({0: 'category'}).set_index(0)
        result = self.encode_decode(df)
        tm.assert_frame_equal(result, df)


class TestSeries(TestPackers):

    def setup_method(self, method):
        super(TestSeries, self).setup_method(method)

        self.d = {}

        s = tm.makeStringSeries()
        s.name = 'string'
        self.d['string'] = s

        s = tm.makeObjectSeries()
        s.name = 'object'
        self.d['object'] = s

        s = Series(iNaT, dtype='M8[ns]', index=range(5))
        self.d['date'] = s

        data = {
            'A': [0., 1., 2., 3., np.nan],
            'B': [0, 1, 0, 1, 0],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': date_range('1/1/2009', periods=5),
            'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
            'F': [Timestamp('20130102', tz='US/Eastern')] * 2 +
                 [Timestamp('20130603', tz='CET')] * 3,
            'G': [Timestamp('20130102', tz='US/Eastern')] * 5,
            'H': Categorical([1, 2, 3, 4, 5]),
            'I': Categorical([1, 2, 3, 4, 5], ordered=True),
            'J': (np.bool_(1), 2, 3, 4, 5),
        }

        self.d['float'] = Series(data['A'])
        self.d['int'] = Series(data['B'])
        self.d['mixed'] = Series(data['E'])
        self.d['dt_tz_mixed'] = Series(data['F'])
        self.d['dt_tz'] = Series(data['G'])
        self.d['cat_unordered'] = Series(data['H'])
        self.d['cat_ordered'] = Series(data['I'])
        self.d['numpy_bool_mixed'] = Series(data['J'])

    def test_basic(self):

        # run multiple times here
        for n in range(10):
            for s, i in self.d.items():
                i_rec = self.encode_decode(i)
                assert_series_equal(i, i_rec)


class TestCategorical(TestPackers):

    def setup_method(self, method):
        super(TestCategorical, self).setup_method(method)

        self.d = {}

        self.d['plain_str'] = Categorical(['a', 'b', 'c', 'd', 'e'])
        self.d['plain_str_ordered'] = Categorical(['a', 'b', 'c', 'd', 'e'],
                                                  ordered=True)

        self.d['plain_int'] = Categorical([5, 6, 7, 8])
        self.d['plain_int_ordered'] = Categorical([5, 6, 7, 8], ordered=True)

    def test_basic(self):

        # run multiple times here
        for n in range(10):
            for s, i in self.d.items():
                i_rec = self.encode_decode(i)
                assert_categorical_equal(i, i_rec)


@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestNDFrame(TestPackers):

    def setup_method(self, method):
        super(TestNDFrame, self).setup_method(method)

        data = {
            'A': [0., 1., 2., 3., np.nan],
            'B': [0, 1, 0, 1, 0],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': date_range('1/1/2009', periods=5),
            'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
            'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
            'G': [Timestamp('20130603', tz='CET')] * 5,
            'H': Categorical(['a', 'b', 'c', 'd', 'e']),
            'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
        }

        self.frame = {
            'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)),
            'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)),
            'mixed': DataFrame(data)}

        self.panel = {
            'float': Panel(dict(ItemA=self.frame['float'],
                                ItemB=self.frame['float'] + 1))}

    def test_basic_frame(self):

        for s, i in self.frame.items():
            i_rec = self.encode_decode(i)
            assert_frame_equal(i, i_rec)

    def test_basic_panel(self):

        with catch_warnings(record=True):
            for s, i in self.panel.items():
                i_rec = self.encode_decode(i)
                assert_panel_equal(i, i_rec)

    def test_multi(self):

        i_rec = self.encode_decode(self.frame)
        for k in self.frame.keys():
            assert_frame_equal(self.frame[k], i_rec[k])

        packed_items = tuple([self.frame['float'], self.frame['float'].A,
                              self.frame['float'].B, None])
        l_rec = self.encode_decode(packed_items)
        check_arbitrary(packed_items, l_rec)

        # this is an oddity in that packed lists will be returned as tuples
        packed_items = [self.frame['float'], self.frame['float'].A,
                        self.frame['float'].B, None]
        l_rec = self.encode_decode(packed_items)
        assert isinstance(l_rec, tuple)
        check_arbitrary(packed_items, l_rec)

    def test_iterator(self):

        packed_items = [self.frame['float'], self.frame['float'].A,
                        self.frame['float'].B, None]

        with ensure_clean(self.path) as path:
            to_msgpack(path, *packed_items)
            for i, packed in enumerate(read_msgpack(path, iterator=True)):
                check_arbitrary(packed, packed_items[i])

    def tests_datetimeindex_freq_issue(self):

        # GH 5947
        # inferring freq on the datetimeindex
        df = DataFrame([1, 2, 3], index=date_range('1/1/2013', '1/3/2013'))
        result = self.encode_decode(df)
        assert_frame_equal(result, df)

        df = DataFrame([1, 2], index=date_range('1/1/2013', '1/2/2013'))
        result = self.encode_decode(df)
        assert_frame_equal(result, df)

    def test_dataframe_duplicate_column_names(self):

        # GH 9618
        expected_1 = DataFrame(columns=['a', 'a'])
        expected_2 = DataFrame(columns=[1] * 100)
        expected_2.loc[0] = np.random.randn(100)
        expected_3 = DataFrame(columns=[1, 1])
        expected_3.loc[0] = ['abc', np.nan]

        result_1 = self.encode_decode(expected_1)
        result_2 = self.encode_decode(expected_2)
        result_3 = self.encode_decode(expected_3)

        assert_frame_equal(result_1, expected_1)
        assert_frame_equal(result_2, expected_2)
        assert_frame_equal(result_3, expected_3)


class TestSparse(TestPackers):

    def _check_roundtrip(self, obj, comparator, **kwargs):

        # currently these are not implemented
        # i_rec = self.encode_decode(obj)
        # comparator(obj, i_rec, **kwargs)
        msg = r"msgpack sparse (series|frame) is not implemented"
        with pytest.raises(NotImplementedError, match=msg):
            self.encode_decode(obj)

    def test_sparse_series(self):

        s = tm.makeStringSeries()
        s[3:5] = np.nan
        ss = s.to_sparse()
        self._check_roundtrip(ss, tm.assert_series_equal,
                              check_series_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_roundtrip(ss2, tm.assert_series_equal,
                              check_series_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_series_equal,
                              check_series_type=True)

    def test_sparse_frame(self):

        s = tm.makeDataFrame()
        s.loc[3:5, 1:3] = np.nan
        s.loc[8:10, -2] = np.nan
        ss = s.to_sparse()

        self._check_roundtrip(ss, tm.assert_frame_equal,
                              check_frame_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_roundtrip(ss2, tm.assert_frame_equal,
                              check_frame_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_frame_equal,
                              check_frame_type=True)


class TestCompression(TestPackers):
    """See https://github.com/pandas-dev/pandas/pull/9783
    """

    def setup_method(self, method):
        try:
            from sqlalchemy import create_engine
            self._create_sql_engine = create_engine
        except ImportError:
            self._SQLALCHEMY_INSTALLED = False
        else:
            self._SQLALCHEMY_INSTALLED = True

        super(TestCompression, self).setup_method(method)
        data = {
            'A': np.arange(1000, dtype=np.float64),
            'B': np.arange(1000, dtype=np.int32),
            'C': list(100 * 'abcdefghij'),
            'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
            'E': [datetime.timedelta(days=x) for x in range(1000)],
        }
        self.frame = {
            'float': DataFrame({k: data[k] for k in ['A', 'A']}),
            'int': DataFrame({k: data[k] for k in ['B', 'B']}),
            'mixed': DataFrame(data),
        }

    def test_plain(self):
        i_rec = self.encode_decode(self.frame)
        for k in self.frame.keys():
            assert_frame_equal(self.frame[k], i_rec[k])

    def _test_compression(self, compress):
        i_rec = self.encode_decode(self.frame, compress=compress)
        for k in self.frame.keys():
            value = i_rec[k]
            expected = self.frame[k]
            assert_frame_equal(value, expected)
            # make sure that we can write to the new frames
            for block in value._data.blocks:
                assert block.values.flags.writeable

    def test_compression_zlib(self):
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        self._test_compression('zlib')

    def test_compression_blosc(self):
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        self._test_compression('blosc')

    def _test_compression_warns_when_decompress_caches(
            self, monkeypatch, compress):
        not_garbage = []
        control = []  # copied data

        compress_module = globals()[compress]
        real_decompress = compress_module.decompress

        def decompress(ob):
            """mock decompress function that delegates to the real
            decompress but caches the result and a copy of the result.
            """
            res = real_decompress(ob)
            not_garbage.append(res)  # hold a reference to this bytes object
            control.append(bytearray(res))  # copy the data here to check later
            return res

        # types mapped to values to add in place.
        rhs = {
            np.dtype('float64'): 1.0,
            np.dtype('int32'): 1,
            np.dtype('object'): 'a',
            np.dtype('datetime64[ns]'): np.timedelta64(1, 'ns'),
            np.dtype('timedelta64[ns]'): np.timedelta64(1, 'ns'),
        }

        with monkeypatch.context() as m, \
                tm.assert_produces_warning(PerformanceWarning) as ws:
            m.setattr(compress_module, 'decompress', decompress)
            i_rec = self.encode_decode(self.frame, compress=compress)
            for k in self.frame.keys():

                value = i_rec[k]
                expected = self.frame[k]
                assert_frame_equal(value, expected)
                # make sure that we can write to the new frames even though
                # we needed to copy the data
                for block in value._data.blocks:
                    assert block.values.flags.writeable
                    # mutate the data in some way
                    block.values[0] += rhs[block.dtype]

        for w in ws:
            # check the messages from our warnings
            assert str(w.message) == ('copying data after decompressing; '
                                      'this may mean that decompress is '
                                      'caching its result')

        for buf, control_buf in zip(not_garbage, control):
            # make sure none of our mutations above affected the
            # original buffers
            assert buf == control_buf

    def test_compression_warns_when_decompress_caches_zlib(self, monkeypatch):
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        self._test_compression_warns_when_decompress_caches(
            monkeypatch, 'zlib')

    def test_compression_warns_when_decompress_caches_blosc(self, monkeypatch):
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        self._test_compression_warns_when_decompress_caches(
            monkeypatch, 'blosc')

    def _test_small_strings_no_warn(self, compress):
        empty = np.array([], dtype='uint8')
        with tm.assert_produces_warning(None):
            empty_unpacked = self.encode_decode(empty, compress=compress)

        tm.assert_numpy_array_equal(empty_unpacked, empty)
        assert empty_unpacked.flags.writeable

        char = np.array([ord(b'a')], dtype='uint8')
        with tm.assert_produces_warning(None):
            char_unpacked = self.encode_decode(char, compress=compress)

        tm.assert_numpy_array_equal(char_unpacked, char)
        assert char_unpacked.flags.writeable
        # if this test fails I am sorry because the interpreter is now in a
        # bad state where b'a' points to 98 == ord(b'b').
        char_unpacked[0] = ord(b'b')

        # we compare the ord of bytes b'a' with unicode u'a' because they
        # should always be the same (unless we were able to mutate the shared
        # character singleton, in which case ord(b'a') == ord(b'b')).
        assert ord(b'a') == ord(u'a')
        tm.assert_numpy_array_equal(
            char_unpacked,
            np.array([ord(b'b')], dtype='uint8'),
        )

    def test_small_strings_no_warn_zlib(self):
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        self._test_small_strings_no_warn('zlib')

    def test_small_strings_no_warn_blosc(self):
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        self._test_small_strings_no_warn('blosc')

    def test_readonly_axis_blosc(self):
        # GH11880
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        df1 = DataFrame({'A': list('abcd')})
        df2 = DataFrame(df1, index=[1., 2., 3., 4.])
        assert 1 in self.encode_decode(df1['A'], compress='blosc')
        assert 1. in self.encode_decode(df2['A'], compress='blosc')

    def test_readonly_axis_zlib(self):
        # GH11880
        df1 = DataFrame({'A': list('abcd')})
        df2 = DataFrame(df1, index=[1., 2., 3., 4.])
        assert 1 in self.encode_decode(df1['A'], compress='zlib')
        assert 1. in self.encode_decode(df2['A'], compress='zlib')

    def test_readonly_axis_blosc_to_sql(self):
        # GH11880
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        if not self._SQLALCHEMY_INSTALLED:
            pytest.skip('no sqlalchemy')
        expected = DataFrame({'A': list('abcd')})
        df = self.encode_decode(expected, compress='blosc')
        eng = self._create_sql_engine("sqlite:///:memory:")
        df.to_sql('test', eng, if_exists='append')
        result = pandas.read_sql_table('test', eng, index_col='index')
        result.index.names = [None]
        assert_frame_equal(expected, result)

    def test_readonly_axis_zlib_to_sql(self):
        # GH11880
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        if not self._SQLALCHEMY_INSTALLED:
            pytest.skip('no sqlalchemy')
        expected = DataFrame({'A': list('abcd')})
        df = self.encode_decode(expected, compress='zlib')
        eng = self._create_sql_engine("sqlite:///:memory:")
        df.to_sql('test', eng, if_exists='append')
        result = pandas.read_sql_table('test', eng, index_col='index')
        result.index.names = [None]
        assert_frame_equal(expected, result)


class TestEncoding(TestPackers):

    def setup_method(self, method):
        super(TestEncoding, self).setup_method(method)
        data = {
            'A': [compat.u('\u2019')] * 1000,
            'B': np.arange(1000, dtype=np.int32),
            'C': list(100 * 'abcdefghij'),
            'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
            'E': [datetime.timedelta(days=x) for x in range(1000)],
            'G': [400] * 1000
        }
        self.frame = {
            'float': DataFrame({k: data[k] for k in ['A', 'A']}),
            'int': DataFrame({k: data[k] for k in ['B', 'B']}),
            'mixed': DataFrame(data),
        }
        self.utf_encodings = ['utf8', 'utf16', 'utf32']

    def test_utf(self):
        # GH10581
        for encoding in self.utf_encodings:
            for frame in compat.itervalues(self.frame):
                result = self.encode_decode(frame, encoding=encoding)
                assert_frame_equal(result, frame)

    def test_default_encoding(self):
        for frame in compat.itervalues(self.frame):
            result = frame.to_msgpack()
            expected = frame.to_msgpack(encoding='utf8')
            assert result == expected
            result = self.encode_decode(frame)
            assert_frame_equal(result, frame)


files = glob.glob(os.path.join(os.path.dirname(__file__), "data",
                               "legacy_msgpack", "*", "*.msgpack"))


@pytest.fixture(params=files)
def legacy_packer(request, datapath):
    return datapath(request.param)


@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestMsgpack(object):
    """
    How to add msgpack tests:

    1. Install pandas version intended to output the msgpack.
    2. Execute "generate_legacy_storage_files.py" to create the msgpack.
    $ python generate_legacy_storage_files.py <output_dir> msgpack

    3. Move the created msgpack to "data/legacy_msgpack/<version>" directory.
    """

    minimum_structure = {'series': ['float', 'int', 'mixed',
                                    'ts', 'mi', 'dup'],
                         'frame': ['float', 'int', 'mixed', 'mi'],
                         'panel': ['float'],
                         'index': ['int', 'date', 'period'],
                         'mi': ['reg2']}

    def check_min_structure(self, data, version):
        for typ, v in self.minimum_structure.items():
            assert typ in data, '"{0}" not found in unpacked data'.format(typ)
            for kind in v:
                msg = '"{0}" not found in data["{1}"]'.format(kind, typ)
                assert kind in data[typ], msg

    def compare(self, current_data, all_data, vf, version):
        # GH12277 encoding default used to be latin-1, now utf-8
        if LooseVersion(version) < LooseVersion('0.18.0'):
            data = read_msgpack(vf, encoding='latin-1')
        else:
            data = read_msgpack(vf)
        self.check_min_structure(data, version)
        for typ, dv in data.items():
            assert typ in all_data, ('unpacked data contains '
                                     'extra key "{0}"'
                                     .format(typ))
            for dt, result in dv.items():
                assert dt in current_data[typ], ('data["{0}"] contains extra '
                                                 'key "{1}"'.format(typ, dt))
                try:
                    expected = current_data[typ][dt]
                except KeyError:
                    continue

                # use a specific comparator
                # if available
                comp_method = "compare_{typ}_{dt}".format(typ=typ, dt=dt)
                comparator = getattr(self, comp_method, None)
                if comparator is not None:
                    comparator(result, expected, typ, version)
                else:
                    check_arbitrary(result, expected)

        return data

    def compare_series_dt_tz(self, result, expected, typ, version):
        # 8260
        # dtype is object < 0.17.0
        if LooseVersion(version) < LooseVersion('0.17.0'):
            expected = expected.astype(object)
            tm.assert_series_equal(result, expected)
        else:
            tm.assert_series_equal(result, expected)

    def compare_frame_dt_mixed_tzs(self, result, expected, typ, version):
        # 8260
        # dtype is object < 0.17.0
        if LooseVersion(version) < LooseVersion('0.17.0'):
            expected = expected.astype(object)
            tm.assert_frame_equal(result, expected)
        else:
            tm.assert_frame_equal(result, expected)

    def test_msgpacks_legacy(self, current_packers_data, all_packers_data,
                             legacy_packer, datapath):

        version = os.path.basename(os.path.dirname(legacy_packer))

        # GH12142 0.17 files packed in P2 can't be read in P3
        if (compat.PY3 and version.startswith('0.17.') and
                legacy_packer.split('.')[-4][-1] == '2'):
            msg = "Files packed in Py2 can't be read in Py3 ({})"
            pytest.skip(msg.format(version))
        try:
            with catch_warnings(record=True):
                self.compare(current_packers_data, all_packers_data,
                             legacy_packer, version)
        except ImportError:
            # blosc not installed
            pass

    def test_msgpack_period_freq(self):
        # https://github.com/pandas-dev/pandas/issues/24135
        s = Series(np.random.rand(5), index=date_range('20130101', periods=5))
        r = read_msgpack(s.to_msgpack())
        repr(r)
@@ -0,0 +1,541 @@
""" test parquet compat """
import datetime
from distutils.version import LooseVersion
import os
from warnings import catch_warnings

import numpy as np
import pytest

from pandas.compat import PY3
import pandas.util._test_decorators as td

import pandas as pd
from pandas.util import testing as tm

from pandas.io.parquet import (
    FastParquetImpl, PyArrowImpl, get_engine, read_parquet, to_parquet)

try:
    import pyarrow  # noqa
    _HAVE_PYARROW = True
except ImportError:
    _HAVE_PYARROW = False

try:
    import fastparquet  # noqa
    _HAVE_FASTPARQUET = True
except ImportError:
    _HAVE_FASTPARQUET = False


# setup engines & skips
@pytest.fixture(params=[
    pytest.param('fastparquet',
                 marks=pytest.mark.skipif(not _HAVE_FASTPARQUET,
                                          reason='fastparquet is '
                                                 'not installed')),
    pytest.param('pyarrow',
                 marks=pytest.mark.skipif(not _HAVE_PYARROW,
                                          reason='pyarrow is '
                                                 'not installed'))])
def engine(request):
    return request.param


@pytest.fixture
def pa():
    if not _HAVE_PYARROW:
        pytest.skip("pyarrow is not installed")
    return 'pyarrow'


@pytest.fixture
def fp():
    if not _HAVE_FASTPARQUET:
        pytest.skip("fastparquet is not installed")
    return 'fastparquet'


@pytest.fixture
def df_compat():
    return pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'})


@pytest.fixture
def df_cross_compat():
    df = pd.DataFrame({'a': list('abc'),
                       'b': list(range(1, 4)),
                       # 'c': np.arange(3, 6).astype('u1'),
                       'd': np.arange(4.0, 7.0, dtype='float64'),
                       'e': [True, False, True],
                       'f': pd.date_range('20130101', periods=3),
                       # 'g': pd.date_range('20130101', periods=3,
                       #                    tz='US/Eastern'),
                       # 'h': pd.date_range('20130101', periods=3, freq='ns')
                       })
    return df


@pytest.fixture
def df_full():
    return pd.DataFrame(
        {'string': list('abc'),
         'string_with_nan': ['a', np.nan, 'c'],
         'string_with_none': ['a', None, 'c'],
         'bytes': [b'foo', b'bar', b'baz'],
         'unicode': [u'foo', u'bar', u'baz'],
         'int': list(range(1, 4)),
         'uint': np.arange(3, 6).astype('u1'),
         'float': np.arange(4.0, 7.0, dtype='float64'),
         'float_with_nan': [2., np.nan, 3.],
         'bool': [True, False, True],
         'datetime': pd.date_range('20130101', periods=3),
         'datetime_with_nat': [pd.Timestamp('20130101'),
                               pd.NaT,
                               pd.Timestamp('20130103')]})


def check_round_trip(df, engine=None, path=None,
                     write_kwargs=None, read_kwargs=None,
                     expected=None, check_names=True,
                     repeat=2):
    """Verify parquet serializer and deserializer produce the same results.

    Performs a pandas to disk and disk to pandas round trip,
    then compares the 2 resulting DataFrames to verify equality.

    Parameters
    ----------
    df: DataFrame
|
||||
engine: str, optional
|
||||
'pyarrow' or 'fastparquet'
|
||||
path: str, optional
|
||||
write_kwargs: dict of str:str, optional
|
||||
read_kwargs: dict of str:str, optional
|
||||
expected: DataFrame, optional
|
||||
Expected deserialization result, otherwise will be equal to `df`
|
||||
check_names: list of str, optional
|
||||
Closed set of column names to be compared
|
||||
repeat: int, optional
|
||||
How many times to repeat the test
|
||||
"""

    write_kwargs = write_kwargs or {'compression': None}
    read_kwargs = read_kwargs or {}

    if expected is None:
        expected = df

    if engine:
        write_kwargs['engine'] = engine
        read_kwargs['engine'] = engine

    def compare(repeat):
        for _ in range(repeat):
            df.to_parquet(path, **write_kwargs)
            with catch_warnings(record=True):
                actual = read_parquet(path, **read_kwargs)
                tm.assert_frame_equal(expected, actual,
                                      check_names=check_names)

    if path is None:
        with tm.ensure_clean() as path:
            compare(repeat)
    else:
        compare(repeat)
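

# A typical call from the tests below, e.g.
# ``check_round_trip(df_compat, 'pyarrow')``, writes the frame to a temporary
# file, reads it back (twice, by default), and asserts the frames are equal.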


def test_invalid_engine(df_compat):
    with pytest.raises(ValueError):
        check_round_trip(df_compat, 'foo', 'bar')


def test_options_py(df_compat, pa):
    # use the set option

    with pd.option_context('io.parquet.engine', 'pyarrow'):
        check_round_trip(df_compat)


def test_options_fp(df_compat, fp):
    # use the set option

    with pd.option_context('io.parquet.engine', 'fastparquet'):
        check_round_trip(df_compat)


def test_options_auto(df_compat, fp, pa):
    # use the set option

    with pd.option_context('io.parquet.engine', 'auto'):
        check_round_trip(df_compat)


def test_options_get_engine(fp, pa):
    assert isinstance(get_engine('pyarrow'), PyArrowImpl)
    assert isinstance(get_engine('fastparquet'), FastParquetImpl)

    with pd.option_context('io.parquet.engine', 'pyarrow'):
        assert isinstance(get_engine('auto'), PyArrowImpl)
        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
        assert isinstance(get_engine('fastparquet'), FastParquetImpl)

    with pd.option_context('io.parquet.engine', 'fastparquet'):
        assert isinstance(get_engine('auto'), FastParquetImpl)
        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
        assert isinstance(get_engine('fastparquet'), FastParquetImpl)

    with pd.option_context('io.parquet.engine', 'auto'):
        assert isinstance(get_engine('auto'), PyArrowImpl)
        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
        assert isinstance(get_engine('fastparquet'), FastParquetImpl)
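
    # resolution order, as asserted above: an explicit engine name always
    # wins, while 'auto' follows the ``io.parquet.engine`` option and prefers
    # pyarrow when the option itself is left at 'auto'.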


def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines

    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=pa, compression=None)

        result = read_parquet(path, engine=fp)
        tm.assert_frame_equal(result, df)

        result = read_parquet(path, engine=fp, columns=['a', 'd'])
        tm.assert_frame_equal(result, df[['a', 'd']])


def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
    # cross-compat with differing reading/writing engines

    df = df_cross_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=fp, compression=None)

        with catch_warnings(record=True):
            result = read_parquet(path, engine=pa)
            tm.assert_frame_equal(result, df)

            result = read_parquet(path, engine=pa, columns=['a', 'd'])
            tm.assert_frame_equal(result, df[['a', 'd']])


class Base(object):

    def check_error_on_write(self, df, engine, exc):
        # check that we are raising the exception on writing
        with tm.ensure_clean() as path:
            with pytest.raises(exc):
                to_parquet(df, path, engine, compression=None)


class TestBasic(Base):

    def test_error(self, engine):
        for obj in [pd.Series([1, 2, 3]), 1, 'foo', pd.Timestamp('20130101'),
                    np.array([1, 2, 3])]:
            self.check_error_on_write(obj, engine, ValueError)

    def test_columns_dtypes(self, engine):
        df = pd.DataFrame({'string': list('abc'),
                           'int': list(range(1, 4))})

        # unicode
        df.columns = [u'foo', u'bar']
        check_round_trip(df, engine)

    def test_columns_dtypes_invalid(self, engine):
        df = pd.DataFrame({'string': list('abc'),
                           'int': list(range(1, 4))})

        # numeric
        df.columns = [0, 1]
        self.check_error_on_write(df, engine, ValueError)

        if PY3:
            # bytes on PY3, on PY2 these are str
            df.columns = [b'foo', b'bar']
            self.check_error_on_write(df, engine, ValueError)

        # python object
        df.columns = [datetime.datetime(2011, 1, 1, 0, 0),
                      datetime.datetime(2011, 1, 1, 1, 1)]
        self.check_error_on_write(df, engine, ValueError)

    @pytest.mark.parametrize('compression', [None, 'gzip', 'snappy', 'brotli'])
    def test_compression(self, engine, compression):

        if compression == 'snappy':
            pytest.importorskip('snappy')

        elif compression == 'brotli':
            pytest.importorskip('brotli')

        df = pd.DataFrame({'A': [1, 2, 3]})
        check_round_trip(df, engine, write_kwargs={'compression': compression})

    def test_read_columns(self, engine):
        # GH18154
        df = pd.DataFrame({'string': list('abc'),
                           'int': list(range(1, 4))})

        expected = pd.DataFrame({'string': list('abc')})
        check_round_trip(df, engine, expected=expected,
                         read_kwargs={'columns': ['string']})

    def test_write_index(self, engine):
        check_names = engine != 'fastparquet'

        df = pd.DataFrame({'A': [1, 2, 3]})
        check_round_trip(df, engine)

        indexes = [
            [2, 3, 4],
            pd.date_range('20130101', periods=3),
            list('abc'),
            [1, 3, 4],
        ]
        # non-default index
        for index in indexes:
            df.index = index
            check_round_trip(df, engine, check_names=check_names)

        # index with meta-data
        df.index = [0, 1, 2]
        df.index.name = 'foo'
        check_round_trip(df, engine)

    def test_write_multiindex(self, pa):
        # Not supported in fastparquet as of 0.1.3 or in older pyarrow versions
        engine = pa

        df = pd.DataFrame({'A': [1, 2, 3]})
        index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
        df.index = index
        check_round_trip(df, engine)

    def test_write_column_multiindex(self, engine):
        # column multi-index
        mi_columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
        df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
        self.check_error_on_write(df, engine, ValueError)

    def test_multiindex_with_columns(self, pa):
        engine = pa
        dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
        df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
                          columns=list('ABC'))
        index1 = pd.MultiIndex.from_product(
            [['Level1', 'Level2'], dates],
            names=['level', 'date'])
        index2 = index1.copy(names=None)
        for index in [index1, index2]:
            df.index = index

            check_round_trip(df, engine)
            check_round_trip(df, engine, read_kwargs={'columns': ['A', 'B']},
                             expected=df[['A', 'B']])

    def test_write_ignoring_index(self, engine):
        # ENH 20768
        # Ensure index=False omits the index from the written Parquet file.
        df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']})

        write_kwargs = {
            'compression': None,
            'index': False,
        }

        # Because we're dropping the index, we expect the loaded dataframe to
        # have the default integer index.
        expected = df.reset_index(drop=True)

        check_round_trip(df, engine, write_kwargs=write_kwargs,
                         expected=expected)

        # Ignore custom index
        df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']},
                          index=['zyx', 'wvu', 'tsr'])

        check_round_trip(df, engine, write_kwargs=write_kwargs,
                         expected=expected)

        # Ignore multi-indexes as well.
        arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        df = pd.DataFrame({'one': list(range(8)),
                           'two': [-i for i in range(8)]}, index=arrays)

        expected = df.reset_index(drop=True)
        check_round_trip(df, engine, write_kwargs=write_kwargs,
                         expected=expected)


class TestParquetPyArrow(Base):

    def test_basic(self, pa, df_full):

        df = df_full

        # additional supported types for pyarrow
        df['datetime_tz'] = pd.date_range('20130101', periods=3,
                                          tz='Europe/Brussels')
        df['bool_with_none'] = [True, None, True]

        check_round_trip(df, pa)

    # TODO: This doesn't fail on all systems; track down which
    @pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)",
                       strict=False)
    def test_basic_subset_columns(self, pa, df_full):
        # GH18628

        df = df_full
        # additional supported types for pyarrow
        df['datetime_tz'] = pd.date_range('20130101', periods=3,
                                          tz='Europe/Brussels')

        check_round_trip(df, pa, expected=df[['string', 'int']],
                         read_kwargs={'columns': ['string', 'int']})

    def test_duplicate_columns(self, pa):
        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3),
                          columns=list('aaa')).copy()
        self.check_error_on_write(df, pa, ValueError)

    def test_unsupported(self, pa):
        # period
        df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
        # pyarrow 0.11 raises ArrowTypeError
        # older pyarrows raise ArrowInvalid
        self.check_error_on_write(df, pa, Exception)

        # timedelta
        df = pd.DataFrame({'a': pd.timedelta_range('1 day',
                                                   periods=3)})
        self.check_error_on_write(df, pa, NotImplementedError)

        # mixed python objects
        df = pd.DataFrame({'a': ['a', 1, 2.0]})
        # pyarrow 0.11 raises ArrowTypeError
        # older pyarrows raise ArrowInvalid
        self.check_error_on_write(df, pa, Exception)

    def test_categorical(self, pa):

        # supported in >= 0.7.0
        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})

        # de-serialized as object
        expected = df.assign(a=df.a.astype(object))
        check_round_trip(df, pa, expected=expected)

    def test_s3_roundtrip(self, df_compat, s3_resource, pa):
        # GH #19134
        check_round_trip(df_compat, pa,
                         path='s3://pandas-test/pyarrow.parquet')

    def test_partition_cols_supported(self, pa, df_full):
        # GH #23283
        partition_cols = ['bool', 'int']
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols,
                          compression=None)
            import pyarrow.parquet as pq
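            # reading the directory back as a ParquetDataset exposes the
            # partition levels (one directory level per partition column)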
            dataset = pq.ParquetDataset(path, validate_schema=False)
            assert len(dataset.partitions.partition_names) == 2
            assert dataset.partitions.partition_names == set(partition_cols)


class TestParquetFastParquet(Base):

    @td.skip_if_no('fastparquet', min_version="0.2.1")
    def test_basic(self, fp, df_full):
        df = df_full

        # additional supported types for fastparquet
        if LooseVersion(fastparquet.__version__) >= LooseVersion('0.1.4'):
            df['datetime_tz'] = pd.date_range('20130101', periods=3,
                                              tz='US/Eastern')
        df['timedelta'] = pd.timedelta_range('1 day', periods=3)
        check_round_trip(df, fp)

    @pytest.mark.skip(reason="not supported")
    def test_duplicate_columns(self, fp):

        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3),
                          columns=list('aaa')).copy()
        self.check_error_on_write(df, fp, ValueError)

    def test_bool_with_none(self, fp):
        df = pd.DataFrame({'a': [True, None, False]})
        expected = pd.DataFrame({'a': [1.0, np.nan, 0.0]}, dtype='float16')
        check_round_trip(df, fp, expected=expected)

    def test_unsupported(self, fp):

        # period
        df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
        self.check_error_on_write(df, fp, ValueError)

        # mixed
        df = pd.DataFrame({'a': ['a', 1, 2.0]})
        self.check_error_on_write(df, fp, ValueError)

    def test_categorical(self, fp):
        if LooseVersion(fastparquet.__version__) < LooseVersion("0.1.3"):
            pytest.skip("CategoricalDtype not supported for older fp")
        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
        check_round_trip(df, fp)

    def test_filter_row_groups(self, fp):
        d = {'a': list(range(0, 3))}
        df = pd.DataFrame(d)
        with tm.ensure_clean() as path:
            df.to_parquet(path, fp, compression=None,
                          row_group_offsets=1)
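            # with roughly one row per row group, fastparquet can prune the
            # non-matching row groups, leaving a single-row result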
            result = read_parquet(path, fp, filters=[('a', '==', 0)])
            assert len(result) == 1

    def test_s3_roundtrip(self, df_compat, s3_resource, fp):
        # GH #19134
        check_round_trip(df_compat, fp,
                         path='s3://pandas-test/fastparquet.parquet')

    def test_partition_cols_supported(self, fp, df_full):
        # GH #23283
        partition_cols = ['bool', 'int']
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, engine="fastparquet",
                          partition_cols=partition_cols, compression=None)
            assert os.path.exists(path)
            import fastparquet
            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
            assert len(actual_partition_cols) == 2

    def test_partition_on_supported(self, fp, df_full):
        # GH #23283
        partition_cols = ['bool', 'int']
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, engine="fastparquet", compression=None,
                          partition_on=partition_cols)
            assert os.path.exists(path)
            import fastparquet
            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
            assert len(actual_partition_cols) == 2

    def test_error_on_using_partition_cols_and_partition_on(self, fp, df_full):
        # GH #23283
        partition_cols = ['bool', 'int']
        df = df_full
        with pytest.raises(ValueError):
            with tm.ensure_clean_dir() as path:
                df.to_parquet(path, engine="fastparquet", compression=None,
                              partition_on=partition_cols,
                              partition_cols=partition_cols)
@@ -0,0 +1,481 @@
# pylint: disable=E1101,E1103,W0232

"""
manage legacy pickle tests

How to add pickle tests:

1. Install pandas version intended to output the pickle.

2. Execute "generate_legacy_storage_files.py" to create the pickle.
   $ python generate_legacy_storage_files.py <output_dir> pickle

3. Move the created pickle to "data/legacy_pickle/<version>" directory.
"""
from distutils.version import LooseVersion
import glob
import os
import shutil
from warnings import catch_warnings, simplefilter

import pytest

from pandas.compat import PY3, is_platform_little_endian
import pandas.util._test_decorators as td

import pandas as pd
from pandas import Index
import pandas.util.testing as tm

from pandas.tseries.offsets import Day, MonthEnd


@pytest.fixture(scope='module')
def current_pickle_data():
    # our current version pickle data
    from pandas.tests.io.generate_legacy_storage_files import (
        create_pickle_data)
    return create_pickle_data()


# ---------------------
# comparison functions
# ---------------------
def compare_element(result, expected, typ, version=None):
    if isinstance(expected, Index):
        tm.assert_index_equal(expected, result)
        return

    if typ.startswith('sp_'):
        comparator = getattr(tm, "assert_%s_equal" % typ)
        comparator(result, expected, exact_indices=False)
    elif typ == 'timestamp':
        if expected is pd.NaT:
            assert result is pd.NaT
        else:
            assert result == expected
            assert result.freq == expected.freq
    else:
        comparator = getattr(tm, "assert_%s_equal" %
                             typ, tm.assert_almost_equal)
        comparator(result, expected)


def compare(data, vf, version):

    # py3 compat when reading py2 pickle
    try:
        data = pd.read_pickle(vf)
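        # note: this rebinds the ``data`` argument, so ``expected`` below is
        # looked up in the unpickled contents rather than in the data passed
        # in by the caller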
    except ValueError as e:
        if 'unsupported pickle protocol:' in str(e):
            # trying to read a py3 pickle in py2
            return
        else:
            raise

    m = globals()
    for typ, dv in data.items():
        for dt, result in dv.items():
            try:
                expected = data[typ][dt]
            except KeyError:
                if version in ('0.10.1', '0.11.0') and dt == 'reg':
                    break
                else:
                    raise

            # use a specific comparator
            # if available
            comparator = "compare_{typ}_{dt}".format(typ=typ, dt=dt)

            comparator = m.get(comparator, m['compare_element'])
            comparator(result, expected, typ, version)
    return data


def compare_sp_series_ts(res, exp, typ, version):
    # SparseTimeSeries integrated into SparseSeries in 0.12.0
    # and deprecated in 0.17.0
    if version and LooseVersion(version) <= LooseVersion("0.12.0"):
        tm.assert_sp_series_equal(res, exp, check_series_type=False)
    else:
        tm.assert_sp_series_equal(res, exp)


def compare_series_ts(result, expected, typ, version):
    # GH 7748
    tm.assert_series_equal(result, expected)
    assert result.index.freq == expected.index.freq
    assert not result.index.freq.normalize
    tm.assert_series_equal(result > 0, expected > 0)

    # GH 9291
    freq = result.index.freq
    assert freq + Day(1) == Day(2)

    res = freq + pd.Timedelta(hours=1)
    assert isinstance(res, pd.Timedelta)
    assert res == pd.Timedelta(days=1, hours=1)

    res = freq + pd.Timedelta(nanoseconds=1)
    assert isinstance(res, pd.Timedelta)
    assert res == pd.Timedelta(days=1, nanoseconds=1)


def compare_series_dt_tz(result, expected, typ, version):
    # 8260
    # dtype is object < 0.17.0
    if LooseVersion(version) < LooseVersion('0.17.0'):
        expected = expected.astype(object)
        tm.assert_series_equal(result, expected)
    else:
        tm.assert_series_equal(result, expected)


def compare_series_cat(result, expected, typ, version):
    # Categorical dtype is added in 0.15.0
    # ordered is changed in 0.16.0
    if LooseVersion(version) < LooseVersion('0.15.0'):
        tm.assert_series_equal(result, expected, check_dtype=False,
                               check_categorical=False)
    elif LooseVersion(version) < LooseVersion('0.16.0'):
        tm.assert_series_equal(result, expected, check_categorical=False)
    else:
        tm.assert_series_equal(result, expected)


def compare_frame_dt_mixed_tzs(result, expected, typ, version):
    # 8260
    # dtype is object < 0.17.0
    if LooseVersion(version) < LooseVersion('0.17.0'):
        expected = expected.astype(object)
        tm.assert_frame_equal(result, expected)
    else:
        tm.assert_frame_equal(result, expected)


def compare_frame_cat_onecol(result, expected, typ, version):
    # Categorical dtype is added in 0.15.0
    # ordered is changed in 0.16.0
    if LooseVersion(version) < LooseVersion('0.15.0'):
        tm.assert_frame_equal(result, expected, check_dtype=False,
                              check_categorical=False)
    elif LooseVersion(version) < LooseVersion('0.16.0'):
        tm.assert_frame_equal(result, expected, check_categorical=False)
    else:
        tm.assert_frame_equal(result, expected)


def compare_frame_cat_and_float(result, expected, typ, version):
    compare_frame_cat_onecol(result, expected, typ, version)


def compare_index_period(result, expected, typ, version):
    tm.assert_index_equal(result, expected)
    assert isinstance(result.freq, MonthEnd)
    assert result.freq == MonthEnd()
    assert result.freqstr == 'M'
    tm.assert_index_equal(result.shift(2), expected.shift(2))


def compare_sp_frame_float(result, expected, typ, version):
    if LooseVersion(version) <= LooseVersion('0.18.1'):
        tm.assert_sp_frame_equal(result, expected, exact_indices=False,
                                 check_dtype=False)
    else:
        tm.assert_sp_frame_equal(result, expected)


files = glob.glob(os.path.join(os.path.dirname(__file__), "data",
                               "legacy_pickle", "*", "*.pickle"))
@pytest.fixture(params=files)
def legacy_pickle(request, datapath):
    return datapath(request.param)


# ---------------------
# tests
# ---------------------
def test_pickles(current_pickle_data, legacy_pickle):
    if not is_platform_little_endian():
        pytest.skip("known failure on non-little endian")

    version = os.path.basename(os.path.dirname(legacy_pickle))
    with catch_warnings(record=True):
        simplefilter("ignore")
        compare(current_pickle_data, legacy_pickle, version)


def test_round_trip_current(current_pickle_data):
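
    # cPickle only exists on Python 2; when the import fails, the c_pickle
    # writer/reader pair is simply skipped below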

    try:
        import cPickle as c_pickle

        def c_pickler(obj, path):
            with open(path, 'wb') as fh:
                c_pickle.dump(obj, fh, protocol=-1)

        def c_unpickler(path):
            with open(path, 'rb') as fh:
                fh.seek(0)
                return c_pickle.load(fh)
    except ImportError:
        c_pickler = None
        c_unpickler = None

    import pickle as python_pickle

    def python_pickler(obj, path):
        with open(path, 'wb') as fh:
            python_pickle.dump(obj, fh, protocol=-1)

    def python_unpickler(path):
        with open(path, 'rb') as fh:
            fh.seek(0)
            return python_pickle.load(fh)

    data = current_pickle_data
    for typ, dv in data.items():
        for dt, expected in dv.items():

            for writer in [pd.to_pickle, c_pickler, python_pickler]:
                if writer is None:
                    continue

                with tm.ensure_clean() as path:

                    # test writing with each pickler
                    writer(expected, path)

                    # test reading with each unpickler
                    result = pd.read_pickle(path)
                    compare_element(result, expected, typ)

                    if c_unpickler is not None:
                        result = c_unpickler(path)
                        compare_element(result, expected, typ)

                    result = python_unpickler(path)
                    compare_element(result, expected, typ)


def test_pickle_v0_14_1(datapath):

    cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False,
                         categories=['a', 'b', 'c', 'd'])
    pickle_path = datapath('io', 'data', 'categorical_0_14_1.pickle')
    # This code was executed once on v0.14.1 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))


def test_pickle_v0_15_2(datapath):
    # ordered -> _ordered
    # GH 9347

    cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False,
                         categories=['a', 'b', 'c', 'd'])
    pickle_path = datapath('io', 'data', 'categorical_0_15_2.pickle')
    # This code was executed once on v0.15.2 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))


def test_pickle_path_pathlib():
    df = tm.makeDataFrame()
    result = tm.round_trip_pathlib(df.to_pickle, pd.read_pickle)
    tm.assert_frame_equal(df, result)


def test_pickle_path_localpath():
    df = tm.makeDataFrame()
    result = tm.round_trip_localpath(df.to_pickle, pd.read_pickle)
    tm.assert_frame_equal(df, result)


# ---------------------
# test pickle compression
# ---------------------

@pytest.fixture
def get_random_path():
    return u'__%s__.pickle' % tm.rands(10)


class TestCompression(object):
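
    # expected filename extension for each compression method; the infer
    # tests below map extensions back through this dict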
    _compression_to_extension = {
        None: ".none",
        'gzip': '.gz',
        'bz2': '.bz2',
        'zip': '.zip',
        'xz': '.xz',
    }

    def compress_file(self, src_path, dest_path, compression):
        if compression is None:
            shutil.copyfile(src_path, dest_path)
            return

        if compression == 'gzip':
            import gzip
            f = gzip.open(dest_path, "w")
        elif compression == 'bz2':
            import bz2
            f = bz2.BZ2File(dest_path, "w")
        elif compression == 'zip':
            import zipfile
            with zipfile.ZipFile(dest_path, "w",
                                 compression=zipfile.ZIP_DEFLATED) as f:
                f.write(src_path, os.path.basename(src_path))
        elif compression == 'xz':
            lzma = pd.compat.import_lzma()
            f = lzma.LZMAFile(dest_path, "w")
        else:
            msg = 'Unrecognized compression type: {}'.format(compression)
            raise ValueError(msg)
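
        # the zip branch above already wrote dest_path through its context
        # manager; the other compressors return an open stream, so copy the
        # raw bytes through it here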
        if compression != "zip":
            with open(src_path, "rb") as fh, f:
                f.write(fh.read())

    def test_write_explicit(self, compression, get_random_path):
        base = get_random_path
        path1 = base + ".compressed"
        path2 = base + ".raw"

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to compressed file
            df.to_pickle(p1, compression=compression)

            # decompress
            with tm.decompress_file(p1, compression=compression) as f:
                with open(p2, "wb") as fh:
                    fh.write(f.read())

            # read decompressed file
            df2 = pd.read_pickle(p2, compression=None)

            tm.assert_frame_equal(df, df2)

    @pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z'])
    def test_write_explicit_bad(self, compression, get_random_path):
        with pytest.raises(ValueError, match="Unrecognized compression type"):
            with tm.ensure_clean(get_random_path) as path:
                df = tm.makeDataFrame()
                df.to_pickle(path, compression=compression)

    @pytest.mark.parametrize('ext', [
        '', '.gz', '.bz2', '.no_compress',
        pytest.param('.xz', marks=td.skip_if_no_lzma)
    ])
    def test_write_infer(self, ext, get_random_path):
        base = get_random_path
        path1 = base + ext
        path2 = base + ".raw"
        compression = None
        for c in self._compression_to_extension:
            if self._compression_to_extension[c] == ext:
                compression = c
                break

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to compressed file by inferred compression method
            df.to_pickle(p1)

            # decompress
            with tm.decompress_file(p1, compression=compression) as f:
                with open(p2, "wb") as fh:
                    fh.write(f.read())

            # read decompressed file
            df2 = pd.read_pickle(p2, compression=None)

            tm.assert_frame_equal(df, df2)

    def test_read_explicit(self, compression, get_random_path):
        base = get_random_path
        path1 = base + ".raw"
        path2 = base + ".compressed"

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to uncompressed file
            df.to_pickle(p1, compression=None)

            # compress
            self.compress_file(p1, p2, compression=compression)

            # read compressed file
            df2 = pd.read_pickle(p2, compression=compression)

            tm.assert_frame_equal(df, df2)

    @pytest.mark.parametrize('ext', [
        '', '.gz', '.bz2', '.zip', '.no_compress',
        pytest.param('.xz', marks=td.skip_if_no_lzma)
    ])
    def test_read_infer(self, ext, get_random_path):
        base = get_random_path
        path1 = base + ".raw"
        path2 = base + ext
        compression = None
        for c in self._compression_to_extension:
            if self._compression_to_extension[c] == ext:
                compression = c
                break

        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
            df = tm.makeDataFrame()

            # write to uncompressed file
            df.to_pickle(p1, compression=None)

            # compress
            self.compress_file(p1, p2, compression=compression)

            # read compressed file by inferred compression method
            df2 = pd.read_pickle(p2)

            tm.assert_frame_equal(df, df2)


# ---------------------
# test pickle protocol
# ---------------------

class TestProtocol(object):

    @pytest.mark.parametrize('protocol', [-1, 0, 1, 2])
    def test_read(self, protocol, get_random_path):
        with tm.ensure_clean(get_random_path) as path:
            df = tm.makeDataFrame()
            df.to_pickle(path, protocol=protocol)
            df2 = pd.read_pickle(path)
            tm.assert_frame_equal(df, df2)

    @pytest.mark.parametrize('protocol', [3, 4])
    @pytest.mark.skipif(PY3, reason="Testing invalid parameters for Python 2")
    def test_read_bad_versions(self, protocol, get_random_path):
        # For Python 2, HIGHEST_PROTOCOL should be 2.
        msg = ("pickle protocol {protocol} asked for; the highest available "
               "protocol is 2").format(protocol=protocol)
        with pytest.raises(ValueError, match=msg):
            with tm.ensure_clean(get_random_path) as path:
                df = tm.makeDataFrame()
                df.to_pickle(path, protocol=protocol)
File diff suppressed because it is too large
@@ -0,0 +1,29 @@
import pytest

from pandas.compat import BytesIO

from pandas import read_csv

from pandas.io.common import is_s3_url


class TestS3URL(object):

    def test_is_s3_url(self):
        assert is_s3_url("s3://pandas/somethingelse.com")
        assert not is_s3_url("s4://pandas/somethingelse.com")


def test_streaming_s3_objects():
    # GH17135
    # botocore gained iteration support in 1.10.47, can now be used in read_*
    pytest.importorskip('botocore', minversion='1.10.47')
    from botocore.response import StreamingBody

    data = [
        b'foo,bar,baz\n1,2,3\n4,5,6\n',
        b'just,the,header\n',
    ]
    for el in data:
        body = StreamingBody(BytesIO(el), content_length=len(el))
        read_csv(body)
File diff suppressed because it is too large
File diff suppressed because it is too large