demo + utils venv
This commit is contained in:
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -0,0 +1,85 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import read_csv, read_table
|
||||
|
||||
|
||||
class BaseParser(object):
    """Thin wrapper binding ``read_csv``/``read_table`` to a fixed engine.

    Subclasses set ``engine``/``low_memory``; every read call then merges
    those settings into the user-supplied keyword arguments.
    """

    engine = None                  # parser engine forwarded to pandas
    low_memory = True              # chunked low-memory parsing by default
    float_precision_choices = []   # float_precision values valid for engine

    def update_kwargs(self, kwargs):
        """Return a copy of *kwargs* with this parser's engine settings merged in."""
        merged = dict(kwargs)
        merged["engine"] = self.engine
        merged["low_memory"] = self.low_memory
        return merged

    def read_csv(self, *args, **kwargs):
        """Call ``pandas.read_csv`` with this parser's engine configuration."""
        return read_csv(*args, **self.update_kwargs(kwargs))

    def read_table(self, *args, **kwargs):
        """Call ``pandas.read_table`` with this parser's engine configuration."""
        return read_table(*args, **self.update_kwargs(kwargs))
|
||||
|
||||
|
||||
class CParser(BaseParser):
    """Parser variant that uses the C engine."""

    engine = "c"
    # The C engine supports all three float_precision modes.
    float_precision_choices = [None, "high", "round_trip"]
|
||||
|
||||
|
||||
class CParserHighMemory(CParser):
    """C engine with low_memory disabled (whole-file reads)."""

    low_memory = False
|
||||
|
||||
|
||||
class CParserLowMemory(CParser):
    """C engine with low_memory enabled (chunked internal reads)."""

    low_memory = True
|
||||
|
||||
|
||||
class PythonParser(BaseParser):
    """Parser variant that uses the pure-Python engine."""

    engine = "python"
    # The Python engine only supports the default float precision.
    float_precision_choices = [None]
|
||||
|
||||
|
||||
@pytest.fixture
def csv_dir_path(datapath):
    """Path to the parser test-data directory."""
    return datapath("io", "parser", "data")
|
||||
|
||||
|
||||
@pytest.fixture
def csv1(csv_dir_path):
    """Path to the shared ``test1.csv`` sample file."""
    return os.path.join(csv_dir_path, "test1.csv")
|
||||
|
||||
|
||||
# Singleton parser instances shared by the fixtures below.
_cParserHighMemory = CParserHighMemory()
_cParserLowMemory = CParserLowMemory()
_pythonParser = PythonParser()

# Parser groupings; element order must match the id lists below.
_py_parsers_only = [_pythonParser]
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
_all_parsers = _c_parsers_only + _py_parsers_only

# pytest ids, index-aligned with the parser lists above.
_py_parser_ids = ["python"]
_c_parser_ids = ["c_high", "c_low"]
_all_parser_ids = _c_parser_ids + _py_parser_ids
|
||||
|
||||
|
||||
@pytest.fixture(params=_all_parsers,
                ids=_all_parser_ids)
def all_parsers(request):
    """Fixture: every parser variant (c_high, c_low, python)."""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=_c_parsers_only,
                ids=_c_parser_ids)
def c_parser_only(request):
    """Fixture: only the C-engine parser variants."""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=_py_parsers_only,
                ids=_py_parser_ids)
def python_parser_only(request):
    """Fixture: only the Python-engine parser variant."""
    return request.param
|
||||
@@ -0,0 +1,591 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that apply specifically to the CParser. Unless specifically stated
|
||||
as a CParser-specific issue, the goal is to eventually move as many of
|
||||
these tests out of this module as soon as the Python parser can accept
|
||||
further arguments when parsing.
|
||||
"""
|
||||
|
||||
from io import TextIOWrapper
|
||||
import mmap
|
||||
import os
|
||||
import tarfile
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import PY3, BytesIO, StringIO, lrange, range
|
||||
from pandas.errors import ParserError
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import DataFrame, concat
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "malformed",
    ["1\r1\r1\r 1\r 1\r",
     "1\r1\r1\r 1\r 1\r11\r",
     "1\r1\r1\r 1\r 1\r11\r1\r"],
    ids=["words pointer", "stream pointer", "lines pointer"])
def test_buffer_overflow(c_parser_only, malformed):
    """see gh-9205: malformed input must raise rather than overflow
    buffers in tokenizer.c."""
    parser = c_parser_only
    expected_msg = "Buffer overflow caught - possible malformed input file."

    with pytest.raises(ParserError, match=expected_msg):
        parser.read_csv(StringIO(malformed))
|
||||
|
||||
|
||||
def test_buffer_rd_bytes(c_parser_only):
    """see gh-12098: src->buffer in the C parser can be freed twice,
    segfaulting when a corrupt gzip file is read with read_csv and the
    buffer is filled more than once before gzip raises."""
    parser = c_parser_only
    data = ("\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09"
            "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0"
            "\xA6\x4D" + "\x55" * 267 +
            "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00"
            "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO")

    # Any exception is acceptable; the point is not to crash.
    for _ in range(100):
        try:
            parser.read_csv(StringIO(data), compression="gzip",
                            delim_whitespace=True)
        except Exception:
            pass
|
||||
|
||||
|
||||
def test_delim_whitespace_custom_terminator(c_parser_only):
    """See gh-12912: custom lineterminator combined with delim_whitespace."""
    parser = c_parser_only
    data = "a b c~1 2 3~4 5 6~7 8 9"

    result = parser.read_csv(StringIO(data), delim_whitespace=True,
                             lineterminator="~")
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_dtype_and_names_error(c_parser_only):
    """see gh-8833: passing both dtype and names previously produced a
    confusing error report."""
    parser = c_parser_only
    data = """
1.0 1
2.0 2
3.0 3
"""
    # base cases
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None,
                             names=["a", "b"])
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]],
                         columns=["a", "b"])
    tm.assert_frame_equal(result, expected)

    # fallback casting: floats that are exactly representable as int32
    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None,
                             names=["a", "b"], dtype={"a": np.int32})
    expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"])
    expected["a"] = expected["a"].astype(np.int32)
    tm.assert_frame_equal(result, expected)

    data = """
1.0 1
nan 2
3.0 3
"""
    # fallback casting, but not castable (NaN cannot become int32)
    with pytest.raises(ValueError, match="cannot safely convert"):
        parser.read_csv(StringIO(data), sep=r"\s+", header=None,
                        names=["a", "b"], dtype={"a": np.int32})
|
||||
|
||||
|
||||
@pytest.mark.parametrize("match,kwargs", [
    # For each of these cases, all of the dtypes are valid, just unsupported.
    (("the dtype datetime64 is not supported for parsing, "
      "pass this column using parse_dates instead"),
     dict(dtype={"A": "datetime64", "B": "float64"})),

    (("the dtype datetime64 is not supported for parsing, "
      "pass this column using parse_dates instead"),
     dict(dtype={"A": "datetime64", "B": "float64"},
          parse_dates=["B"])),

    ("the dtype timedelta64 is not supported for parsing",
     dict(dtype={"A": "timedelta64", "B": "float64"})),

    ("the dtype <U8 is not supported for parsing",
     dict(dtype={"A": "U8"}))
], ids=["dt64-0", "dt64-1", "td64", "<U8"])
def test_unsupported_dtype(c_parser_only, match, kwargs):
    """Valid-but-unsupported dtypes must raise a helpful TypeError."""
    parser = c_parser_only
    frame = DataFrame(np.random.rand(5, 2), columns=list("AB"),
                      index=["1A", "1B", "1C", "1D", "1E"])

    with tm.ensure_clean("__unsupported_dtype__.csv") as path:
        frame.to_csv(path)

        with pytest.raises(TypeError, match=match):
            parser.read_csv(path, index_col=0, **kwargs)
|
||||
|
||||
|
||||
@td.skip_if_32bit
def test_precise_conversion(c_parser_only):
    """float_precision="high" must never be less accurate than the default,
    and "round_trip" must match Python's float() exactly."""
    from decimal import Decimal
    parser = c_parser_only

    normal_errors = []
    precise_errors = []

    def error(val, actual_val):
        # Absolute error against the exact decimal value of the input text.
        return abs(Decimal("{0:.100}".format(val)) - actual_val)

    # test numbers between 1 and 2
    for num in np.linspace(1., 2., num=500):
        # 25 decimal digits of precision
        text = "a\n{0:.25}".format(num)
        actual_val = Decimal(text[2:])

        normal_val = float(parser.read_csv(StringIO(text))["a"][0])
        precise_val = float(parser.read_csv(
            StringIO(text), float_precision="high")["a"][0])
        roundtrip_val = float(parser.read_csv(
            StringIO(text), float_precision="round_trip")["a"][0])

        normal_errors.append(error(normal_val, actual_val))
        precise_errors.append(error(precise_val, actual_val))

        # round-trip should match float()
        assert roundtrip_val == float(text[2:])

    assert sum(precise_errors) <= sum(normal_errors)
    assert max(precise_errors) <= max(normal_errors)
|
||||
|
||||
|
||||
def test_usecols_dtypes(c_parser_only):
    """usecols combined with converters/dtype gives per-column dtypes.

    Fix: the asserts used ``np.int`` and ``np.float``, which were merely
    deprecated aliases of the builtin ``int``/``float`` and were removed
    in NumPy 1.24; use the builtins directly (identical behavior).
    """
    parser = c_parser_only
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""

    result = parser.read_csv(StringIO(data), usecols=(0, 1, 2),
                             names=("a", "b", "c"),
                             header=None,
                             converters={"a": str},
                             dtype={"b": int, "c": float})
    result2 = parser.read_csv(StringIO(data), usecols=(0, 2),
                              names=("a", "b", "c"),
                              header=None,
                              converters={"a": str},
                              dtype={"b": int, "c": float})

    # Column "a" is converted to str (object); b/c follow the dtype map.
    assert (result.dtypes == [object, int, float]).all()
    assert (result2.dtypes == [object, float]).all()
|
||||
|
||||
|
||||
def test_disable_bool_parsing(c_parser_only):
    """see gh-2090: dtype=object disables boolean inference."""
    parser = c_parser_only
    data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""

    # With dtype=object, boolean-looking strings stay as strings.
    result = parser.read_csv(StringIO(data), dtype=object)
    assert (result.dtypes == object).all()

    # Without NA filtering, the empty field survives as "".
    result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
    assert result["B"][2] == ""
|
||||
|
||||
|
||||
def test_custom_lineterminator(c_parser_only):
    """A custom lineterminator parses identically to newline-delimited data."""
    parser = c_parser_only
    data = "a,b,c~1,2,3~4,5,6"

    expected = parser.read_csv(StringIO(data.replace("~", "\n")))
    result = parser.read_csv(StringIO(data), lineterminator="~")

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_parse_ragged_csv(c_parser_only):
    """Ragged rows are padded to the given names without crashing."""
    parser = c_parser_only
    columns = ["a", "b", "c", "d", "e"]
    data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""

    nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
    result = parser.read_csv(StringIO(data), header=None, names=columns)
    expected = parser.read_csv(StringIO(nice_data), header=None,
                               names=columns)
    tm.assert_frame_equal(result, expected)

    # too many columns, cause segfault if not careful
    data = "1,2\n3,4,5"

    result = parser.read_csv(StringIO(data), header=None, names=lrange(50))
    expected = parser.read_csv(StringIO(data), header=None,
                               names=lrange(3)).reindex(columns=lrange(50))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_tokenize_CR_with_quoting(c_parser_only):
    """see gh-3453: bare \\r line endings combined with quoted fields."""
    parser = c_parser_only
    data = " a,b,c\r\"a,b\",\"e,d\",\"f,f\""
    lf_data = data.replace("\r", "\n")

    result = parser.read_csv(StringIO(data), header=None)
    expected = parser.read_csv(StringIO(lf_data), header=None)
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data))
    expected = parser.read_csv(StringIO(lf_data))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_grow_boundary_at_cap(c_parser_only):
    """See gh-12494.

    The C parser did not grow its buffer when the needed space would
    exactly fill it to capacity, later causing a buffer overflow error
    when checking the EOF terminator of the CSV stream.
    """
    parser = c_parser_only

    def assert_empty_header_read(count):
        # A header of `count` commas yields count+1 unnamed columns.
        expected = DataFrame(columns=["Unnamed: {i}".format(i=i)
                                      for i in range(count + 1)])
        result = parser.read_csv(StringIO("," * count))
        tm.assert_frame_equal(result, expected)

    for cnt in range(1, 101):
        assert_empty_header_read(cnt)
|
||||
|
||||
|
||||
def test_parse_trim_buffers(c_parser_only):
    """Bugfix regression test for gh-13703.

    Stresses the system memory allocator so it moves the stream buffer,
    letting the OS reclaim the region or other parser allocations clobber
    the memory where it formerly lived. Designed to segfault with an
    unpatched `tokenizer.c`; failure may also show up as memory corruption
    making the loaded DataFrame differ from the expected one.
    """
    parser = c_parser_only

    # Generate a large mixed-type CSV record on-the-fly (approx 1.5KiB).
    record_ = (
        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z"""
        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,"""
        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9"""
        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,"""
        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9."""
        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999."""
        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ"""
        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ"""
        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z"""
        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,"""
        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,"""
        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,"""
        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999"""
        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9."""
        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,"""
        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z"""
        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ"""
        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99"""
        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-"""
        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9"""
        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,"""
        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9."""
        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ"""
        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ"""
        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ"""
        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ"""
        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99"""
        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9"""
        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,""")

    # Set the number of lines so that a call to `parser_trim_buffers`
    # is triggered: after a couple of full chunks are consumed, a
    # relatively small 'residual' chunk causes reallocation within
    # the parser.
    chunksize, n_lines = 128, 2 * 128 + 15
    csv_data = "\n".join([record_] * n_lines) + "\n"

    # Expected output: the record split on commas, repeated n_lines
    # times, with empty fields read back as NaN.
    row = tuple(val_ if val_ else np.nan
                for val_ in record_.split(","))
    expected = DataFrame([row for _ in range(n_lines)],
                         dtype=object, columns=None, index=None)

    # Iterate over the CSV in `chunksize`-line chunks so a really small
    # residual chunk is read at the end.
    chunks_ = parser.read_csv(StringIO(csv_data), header=None,
                              dtype=object, chunksize=chunksize)
    result = concat(chunks_, axis=0, ignore_index=True)

    # Check for data corruption if there was no segfault.
    tm.assert_frame_equal(result, expected)

    # This extra test was added to replicate the fault in gh-5291.
    # Force 'utf-8' encoding, so that `_string_convert` would take
    # a different execution branch.
    chunks_ = parser.read_csv(StringIO(csv_data), header=None,
                              dtype=object, chunksize=chunksize,
                              encoding="utf_8")
    result = concat(chunks_, axis=0, ignore_index=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_internal_null_byte(c_parser_only):
    """see gh-14012.

    The null byte ('\\x00') should not act as a true line terminator,
    escape character, or comment character — it is only a placeholder
    meaning "none specified". Move to test_common.py ONLY once Python's
    csv module can parse '\\x00'.
    """
    parser = c_parser_only
    names = ["a", "b", "c"]
    data = "1,2,3\n4,\x00,6\n7,8,9"

    expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]],
                         columns=names)
    result = parser.read_csv(StringIO(data), names=names)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_nrows_large(c_parser_only):
    """gh-7626: nrows must be honored for large inputs (>262144 bytes)."""
    parser = c_parser_only
    header_narrow = "\t".join("COL_HEADER_" + str(i)
                              for i in range(10)) + "\n"
    data_narrow = "\t".join("somedatasomedatasomedata1"
                            for _ in range(10)) + "\n"
    header_wide = "\t".join("COL_HEADER_" + str(i)
                            for i in range(15)) + "\n"
    data_wide = "\t".join("somedatasomedatasomedata2"
                          for _ in range(15)) + "\n"
    test_input = (header_narrow + data_narrow * 1050 +
                  header_wide + data_wide * 2)

    result = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)

    # Only the requested 1010 rows x 10 narrow columns were read.
    assert result.size == 1010 * 10
|
||||
|
||||
|
||||
def test_float_precision_round_trip_with_text(c_parser_only):
    """see gh-15140: must not segfault on non-numeric text (Python 2.7+)."""
    parser = c_parser_only
    result = parser.read_csv(StringIO("a"), header=None,
                             float_precision="round_trip")
    tm.assert_frame_equal(result, DataFrame({0: ["a"]}))
|
||||
|
||||
|
||||
def test_large_difference_in_columns(c_parser_only):
    """see gh-14125: usecols with wildly different row widths."""
    parser = c_parser_only

    count = 10000
    large_row = ("X," * count)[:-1] + "\n"
    normal_row = "XXXXXX XXXXXX,111111111111111\n"
    test_input = (large_row + normal_row * 6)[:-1]

    result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])

    # Expected: just the first field of every line.
    expected = DataFrame([line.split(",")[0]
                          for line in test_input.split("\n")])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_data_after_quote(c_parser_only):
    """see gh-15910: trailing text after a closing quote is appended."""
    parser = c_parser_only
    data = "a\n1\n\"b\"a"

    result = parser.read_csv(StringIO(data))
    expected = DataFrame({"a": ["1", "ba"]})

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_comment_whitespace_delimited(c_parser_only, capsys):
    """Bad (3-field) lines are skipped with a warning; comments and
    whitespace delimiters are handled together."""
    parser = c_parser_only
    test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
    result = parser.read_csv(StringIO(test_input), comment="#", header=None,
                             delimiter="\\s+", skiprows=0,
                             error_bad_lines=False)
    captured = capsys.readouterr()

    # Lines 2, 3, 4 and 9 have three fields and must be reported as skipped.
    for line_num in (2, 3, 4, 9):
        assert "Skipping line {}".format(line_num) in captured.err

    expected = DataFrame([[1, 2],
                          [5, 2],
                          [6, 2],
                          [7, np.nan],
                          [8, np.nan]])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_file_like_no_next(c_parser_only):
    """gh-16530: a file-like needs "__iter__" but not "next"/"__next__".

    NOTE: This is only true for the C engine, not the Python engine.
    """
    class NoNextBuffer(StringIO):
        def __next__(self):
            raise AttributeError("No next method")

        next = __next__

    parser = c_parser_only
    result = parser.read_csv(NoNextBuffer("a\n1"))

    expected = DataFrame({"a": [1]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
    """see gh-22748: undecodable bytes raise UnicodeError, not crash."""
    parser = c_parser_only
    stream = BytesIO(b"\xB0")

    if PY3:
        msg = "'utf-8' codec can't encode character"
        stream = TextIOWrapper(stream, encoding="ascii",
                               errors="surrogateescape")
    else:
        msg = "'utf8' codec can't decode byte"

    with pytest.raises(UnicodeError, match=msg):
        parser.read_csv(stream, encoding="UTF-8")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
    """see gh-16530.

    Python's CSV library can't handle tarfile objects (it expects str,
    not bytes, when iterating through a file-like) — C engine only.
    """
    parser = c_parser_only
    tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)

    with tarfile.open(tar_path, "r") as tar:
        data_file = tar.extractfile("tar_data.csv")
        result = parser.read_csv(data_file)

    expected = DataFrame({"a": [1]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.high_memory
def test_bytes_exceed_2gb(c_parser_only):
    """see gh-16798: read a "CSV" whose single column exceeds 2GB."""
    parser = c_parser_only

    if parser.low_memory:
        pytest.skip("not a high_memory test")

    # 2100 rows of 1MiB strings -> > 2GB of column data.
    csv = StringIO("strings\n" + "\n".join(
        "x" * (1 << 20) for _ in range(2100)))
    result = parser.read_csv(csv)
    assert not result.empty
|
||||
|
||||
|
||||
def test_chunk_whitespace_on_boundary(c_parser_only):
    """see gh-9735: C parser-specific bug parsing whitespace and
    characters at a chunk boundary.

    The field here is too large for the Python parser / CSV library.
    """
    parser = c_parser_only

    head = "a" * (1024 * 256 - 2)
    payload = head + "\na" + "\n a"
    result = parser.read_csv(StringIO(payload), header=None)

    expected = DataFrame([head, "a", " a"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_file_handles_mmap(c_parser_only, csv1):
    """gh-14418: don't close user-provided (mmap) file handles."""
    parser = c_parser_only

    with open(csv1, "r") as f:
        mapped = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        parser.read_csv(mapped)

        if PY3:
            # The parser must leave the caller's handle open.
            assert not mapped.closed
        mapped.close()
|
||||
|
||||
|
||||
def test_file_binary_mode(c_parser_only):
    """see gh-23779: handles opened in binary mode are accepted."""
    parser = c_parser_only
    expected = DataFrame([[1, 2, 3], [4, 5, 6]])

    with tm.ensure_clean() as path:
        with open(path, "w") as f:
            f.write("1,2,3\n4,5,6")

        with open(path, "rb") as f:
            result = parser.read_csv(f, header=None)

        tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,136 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that comments are properly handled during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_values", [None, ["NaN"]])
def test_comment(all_parsers, na_values):
    """Trailing comments are stripped regardless of na_values handling."""
    parser = all_parsers
    data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
    result = parser.read_csv(StringIO(data), comment="#",
                             na_values=na_values)
    expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("read_kwargs", [
    dict(),
    dict(lineterminator="*"),
    dict(delim_whitespace=True),
])
def test_line_comment(all_parsers, read_kwargs):
    """Whole-line and trailing comments are dropped for every engine.

    Fix: copy ``read_kwargs`` before mutating it — pytest creates the
    parametrized dicts once and shares them across all parser fixtures,
    so the in-place ``read_kwargs["comment"] = "#"`` leaked state
    between test runs.
    """
    parser = all_parsers
    data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
    # Work on a private copy so the shared parametrized dict stays pristine.
    read_kwargs = dict(read_kwargs)
    if read_kwargs.get("delim_whitespace"):
        data = data.replace(",", " ")
    elif read_kwargs.get("lineterminator"):
        if parser.engine != "c":
            pytest.skip("Custom terminator not supported with Python engine")

        data = data.replace("\n", read_kwargs.get("lineterminator"))

    read_kwargs["comment"] = "#"
    result = parser.read_csv(StringIO(data), **read_kwargs)

    expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_comment_skiprows(all_parsers):
    """skiprows counts commented lines too."""
    parser = all_parsers
    data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # The first four lines (comments included) are ignored.
    result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
    expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_comment_header(all_parsers):
    """header is counted over non-comment lines only."""
    parser = all_parsers
    data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # Header should begin at the second non-comment line.
    result = parser.read_csv(StringIO(data), comment="#", header=1)
    expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_comment_skiprows_header(all_parsers):
    """skiprows and header compose: skiprows eats the first 4 lines
    (comments included), then header starts at the second remaining
    non-commented line (line 5 overall)."""
    parser = all_parsers
    data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
    expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
def test_custom_comment_char(all_parsers, comment_char):
    """Any single character works as the comment marker."""
    parser = all_parsers
    data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"

    result = parser.read_csv(StringIO(data.replace("#", comment_char)),
                             comment=comment_char)
    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("header", ["infer", None])
def test_comment_first_line(all_parsers, header):
    """see gh-4623: a comment on the very first line is skipped."""
    parser = all_parsers
    data = "# notes\na,b,c\n# more notes\n1,2,3"

    if header is None:
        expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
    else:
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), comment="#", header=header)
    tm.assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,154 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests compressed data parsing functionality for all
|
||||
of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import zipfile
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def buffer(request):
    """Whether to read from an open file object (True) or a path (False)."""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture
def parser_and_data(all_parsers, csv1):
    """Provide a parser plus the raw bytes and parsed frame of csv1."""
    parser = all_parsers

    with open(csv1, "rb") as f:
        data = f.read()
    # Uncompressed baseline used as the expected result in the tests.
    expected = parser.read_csv(csv1)

    return parser, data, expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
def test_zip(parser_and_data, compression):
    """Single-member zip archives are readable by path or open buffer."""
    parser, data, expected = parser_and_data

    with tm.ensure_clean("test_file.zip") as path:
        with zipfile.ZipFile(path, mode="w") as archive:
            archive.writestr("test_file", data)

        if compression == "zip2":
            # "zip2" means: pass an already-open binary buffer instead
            # of a path, with compression stated explicitly.
            with open(path, "rb") as f:
                result = parser.read_csv(f, compression="zip")
        else:
            result = parser.read_csv(path, compression=compression)

        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("compression", ["zip", "infer"])
def test_zip_error_multiple_files(parser_and_data, compression):
    """Archives holding more than one member cannot be parsed."""
    parser, data, _ = parser_and_data

    with tm.ensure_clean("combined_zip.zip") as path:
        with zipfile.ZipFile(path, mode="w") as archive:
            for member in ["test_file", "second_file"]:
                archive.writestr(member, data)

        with pytest.raises(ValueError, match="Multiple files"):
            parser.read_csv(path, compression=compression)
|
||||
|
||||
|
||||
def test_zip_error_no_files(parser_and_data):
    """An empty zip archive raises a 'Zero files' error."""
    parser = parser_and_data[0]

    with tm.ensure_clean() as path:
        # Create a valid but completely empty archive.
        with zipfile.ZipFile(path, mode="w"):
            pass

        with pytest.raises(ValueError, match="Zero files"):
            parser.read_csv(path, compression="zip")
|
||||
|
||||
|
||||
def test_zip_error_invalid_zip(parser_and_data):
    """Non-zip bytes with compression="zip" raise BadZipFile.

    Fix: use ``zipfile.BadZipFile`` — the canonical spelling since
    Python 3.2 — instead of the legacy ``BadZipfile`` alias (the two
    are the same class, so behavior is unchanged).
    """
    parser, _, _ = parser_and_data

    with tm.ensure_clean() as path:
        # The file exists but contains no zip data at all.
        with open(path, "wb") as f:
            with pytest.raises(zipfile.BadZipFile,
                               match="File is not a zip file"):
                parser.read_csv(f, compression="zip")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", [None, "test.{ext}"])
def test_compression(parser_and_data, compression_only, buffer, filename):
    """Round-trip every supported codec, by path or buffer, with and
    without a codec-revealing file extension."""
    parser, data, expected = parser_and_data
    compress_type = compression_only

    ext = "gz" if compress_type == "gzip" else compress_type
    if filename is not None:
        filename = filename.format(ext=ext)

    if filename and buffer:
        pytest.skip("Cannot deduce compression from "
                    "buffer of compressed data.")

    with tm.ensure_clean(filename=filename) as path:
        tm.write_to_compressed(compress_type, path, data)
        # With an extension present, let pandas infer the codec.
        compression = "infer" if filename else compress_type

        if buffer:
            with open(path, "rb") as f:
                result = parser.read_csv(f, compression=compression)
        else:
            result = parser.read_csv(path, compression=compression)

        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("ext", [None, "gz", "bz2"])
def test_infer_compression(all_parsers, csv1, buffer, ext):
    """compression="infer" matches reading the plain file directly."""
    # see gh-9770
    parser = all_parsers
    kwargs = {"index_col": 0, "parse_dates": True}

    expected = parser.read_csv(csv1, **kwargs)
    kwargs["compression"] = "infer"

    if buffer:
        with open(csv1) as handle:
            result = parser.read_csv(handle, **kwargs)
    else:
        suffix = "." + ext if ext else ""
        result = parser.read_csv(csv1 + suffix, **kwargs)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_compression_utf16_encoding(all_parsers, csv_dir_path):
    """Decompression composes with a non-default encoding (utf-16)."""
    # see gh-18071
    parser = all_parsers
    path = os.path.join(csv_dir_path, "utf16_ex_small.zip")

    expected = pd.DataFrame({
        u"Country": [u"Venezuela", u"Venezuela"],
        u"Twitter": [u"Hugo Chávez Frías", u"Henrique Capriles R."]
    })
    result = parser.read_csv(path, encoding="utf-16",
                             compression="zip", sep="\t")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
def test_invalid_compression(all_parsers, invalid_compression):
    """Unknown compression names raise a descriptive ValueError."""
    parser = all_parsers
    msg = ("Unrecognized compression "
           "type: {}".format(invalid_compression))

    with pytest.raises(ValueError, match=msg):
        parser.read_csv("test_file.zip", compression=invalid_compression)
|
||||
@@ -0,0 +1,158 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests column conversion functionality during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO, lmap, parse_date
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def test_converters_type_must_be_dict(all_parsers):
    """`converters` must be a dict; anything else raises TypeError."""
    data = """index,A,B,C,D
foo,2,3,4,5
"""
    parser = all_parsers

    with pytest.raises(TypeError, match="Type converters.+"):
        parser.read_csv(StringIO(data), converters=0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("column", [3, "D"])
@pytest.mark.parametrize("converter", [
    parse_date,
    lambda x: int(x.split("/")[2])  # Produce integer.
])
def test_converters(all_parsers, column, converter):
    """Converters can be keyed by position or by column name."""
    parser = all_parsers
    data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
    # Applying the converter by hand to the unconverted frame must
    # match applying it through read_csv.
    expected = parser.read_csv(StringIO(data))
    expected["D"] = expected["D"].map(converter)

    result = parser.read_csv(StringIO(data), converters={column: converter})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_converters_no_implicit_conv(all_parsers):
    """A converted column must stay object dtype, with no numeric cast."""
    # see gh-2184
    parser = all_parsers
    data = """000102,1.2,A\n001245,2,B"""

    result = parser.read_csv(StringIO(data), header=None,
                             converters={0: lambda x: x.strip()})

    # Column 0 should not be casted to numeric and should remain as object.
    expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_converters_euro_decimal_format(all_parsers):
    """Comma decimal separators can be normalized via converters."""
    # see gh-583
    parser = all_parsers
    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,7387
2;121,12;14897,76;DEF;uyt;0,3773
3;878,158;108013,434;GHI;rez;2,7356"""

    def to_float(x):
        # "1521,1541" -> 1521.1541
        return float(x.replace(",", "."))

    converters = {name: to_float
                  for name in ("Number1", "Number2", "Number3")}

    result = parser.read_csv(StringIO(data), sep=";", converters=converters)
    expected = DataFrame([[1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
                          [2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
                          [3, 878.158, 108013.434, "GHI", "rez", 2.7356]],
                         columns=["Id", "Number1", "Number2",
                                  "Text1", "Text2", "Number3"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_converters_corner_with_nans(all_parsers):
    """Converters returning NaN interact correctly with na_values."""
    parser = all_parsers
    data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

    # Example converters.
    def convert_days(x):
        x = x.strip()
        if not x:
            return np.nan
        # A trailing "+" means "more than": bump the value by one.
        return int(x[:-1]) + 1 if x.endswith("+") else int(x)

    def convert_days_sentinel(x):
        x = x.strip()
        if not x:
            return np.nan
        return int(x[:-1]) + 1 if x.endswith("+") else int(x)

    def convert_score(x):
        x = x.strip()
        if not x:
            return np.nan
        # Ranges like "2-5" collapse to their midpoint.
        if x.find("-") > 0:
            lo, hi = lmap(int, x.split("-"))
            return 0.5 * (lo + hi)
        return float(x)

    results = []
    for day_converter in [convert_days, convert_days_sentinel]:
        result = parser.read_csv(StringIO(data),
                                 converters={"score": convert_score,
                                             "days": day_converter},
                                 na_values=["", None])
        assert pd.isna(result["days"][1])
        results.append(result)

    # Both day converters behave identically.
    tm.assert_frame_equal(results[0], results[1])
|
||||
|
||||
|
||||
def test_converter_index_col_bug(all_parsers):
    """A converter on the index column does not break index_col."""
    # see gh-1835
    parser = all_parsers
    data = "A;B\n1;2\n3;4"

    result = parser.read_csv(StringIO(data), sep=";", index_col="A",
                             converters={"A": lambda x: x})

    expected = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A"))
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,135 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that dialects are properly handled during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import csv
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO
|
||||
from pandas.errors import ParserWarning
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def custom_dialect():
    """Return (name, kwargs) for a deliberately unusual csv dialect."""
    dialect_kwargs = {
        "doublequote": False,
        "escapechar": "~",
        "delimiter": ":",
        "skipinitialspace": False,
        "quotechar": "~",
        "quoting": 3,
    }
    return "weird", dialect_kwargs
|
||||
|
||||
|
||||
def test_dialect(all_parsers):
    """A csv.Dialect instance is honored by read_csv."""
    parser = all_parsers
    data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

    dia = csv.excel()
    dia.quoting = csv.QUOTE_NONE
    result = parser.read_csv(StringIO(data), dialect=dia)

    data = """\
label1,label2,label3
index1,a,c,e
index2,b,d,f
"""
    expected = parser.read_csv(StringIO(data))
    # With QUOTE_NONE the stray quote stays part of the value.
    expected.replace("a", "\"a", inplace=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_dialect_str(all_parsers):
    """A registered dialect may be referenced by name."""
    parser = all_parsers
    data = """\
fruit:vegetable
apple:broccoli
pear:tomato
"""
    expected = DataFrame({
        "fruit": ["apple", "pear"],
        "vegetable": ["broccoli", "tomato"]
    })

    with tm.with_csv_dialect("mydialect", delimiter=":"):
        result = parser.read_csv(StringIO(data), dialect="mydialect")
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_invalid_dialect(all_parsers):
    """Objects lacking the csv.Dialect attributes are rejected."""
    class InvalidDialect(object):
        pass

    parser = all_parsers

    with pytest.raises(ValueError, match="Invalid dialect"):
        parser.read_csv(StringIO("a\n1"), dialect=InvalidDialect)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("arg", [None, "doublequote", "escapechar",
                                 "skipinitialspace", "quotechar", "quoting"])
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect,
                                           arg, value):
    """Warn only when a keyword genuinely conflicts with the dialect.

    see gh-23761.

    Bug fix: the original compared the string literal ``"value"``
    (``if "value" == "dialect"``) instead of the ``value`` parameter, so
    the no-conflict and default branches were never taken and every
    parametrization exercised only the warning path.
    """
    dialect_name, dialect_kwargs = custom_dialect
    parser = all_parsers

    expected = DataFrame({"a": [1], "b": [2]})
    data = "a:b\n1:2"

    warning_klass = None
    kwds = dict()

    # arg=None tests when we pass in the dialect without any other arguments.
    if arg is not None:
        if value == "dialect":  # No conflict --> no warning.
            kwds[arg] = dialect_kwargs[arg]
        elif value == "default":  # Default --> no warning.
            from pandas.io.parsers import _parser_defaults
            kwds[arg] = _parser_defaults[arg]
        else:  # Non-default + conflict with dialect --> warning.
            warning_klass = ParserWarning
            kwds[arg] = "blah"

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        with tm.assert_produces_warning(warning_klass):
            result = parser.read_csv(StringIO(data),
                                     dialect=dialect_name, **kwds)
            tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs,warning_klass", [
    (dict(sep=","), None),  # sep is default --> sep_override=True
    (dict(sep="."), ParserWarning),  # sep isn't default --> sep_override=False
    (dict(delimiter=":"), None),  # No conflict
    (dict(delimiter=None), None),  # Default arguments --> sep_override=True
    (dict(delimiter=","), ParserWarning),  # Conflict
    (dict(delimiter="."), ParserWarning),  # Conflict
], ids=["sep-override-true", "sep-override-false",
        "delimiter-no-conflict", "delimiter-default-arg",
        "delimiter-conflict", "delimiter-conflict2"])
def test_dialect_conflict_delimiter(all_parsers, custom_dialect,
                                    kwargs, warning_klass):
    """sep/delimiter conflicts with a dialect produce ParserWarning."""
    # see gh-23761.
    dialect_name, dialect_kwargs = custom_dialect
    parser = all_parsers

    data = "a:b\n1:2"
    expected = DataFrame({"a": [1], "b": [2]})

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        with tm.assert_produces_warning(warning_klass):
            result = parser.read_csv(StringIO(data),
                                     dialect=dialect_name, **kwargs)
            tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,514 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests dtype specification during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO
|
||||
from pandas.errors import ParserWarning
|
||||
|
||||
from pandas.core.dtypes.dtypes import CategoricalDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat)
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
def test_dtype_all_columns(all_parsers, dtype, check_orig):
    """dtype=str/object applies to every column on round-trip."""
    # see gh-3795, gh-6607
    parser = all_parsers

    df = DataFrame(np.random.rand(5, 2).round(4), columns=list("AB"),
                   index=["1A", "1B", "1C", "1D", "1E"])

    with tm.ensure_clean("__passing_str_as_dtype__.csv") as path:
        df.to_csv(path)

        result = parser.read_csv(path, dtype=dtype, index_col=0)

        if check_orig:
            # Cast back to float and compare against the source frame.
            expected, result = df.copy(), result.astype(float)
        else:
            expected = df.astype(str)

        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_dtype_all_columns_empty(all_parsers):
    """dtype=str is honored even when there are no data rows."""
    # see gh-12048
    parser = all_parsers

    expected = DataFrame({"A": [], "B": []}, index=[], dtype=str)
    result = parser.read_csv(StringIO("A,B"), dtype=str)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_dtype_per_column(all_parsers):
    """dtype may mix column names and positional keys."""
    parser = all_parsers
    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
    expected = DataFrame([[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]],
                         columns=["one", "two"])
    expected["one"] = expected["one"].astype(np.float64)
    expected["two"] = expected["two"].astype(object)

    result = parser.read_csv(StringIO(data),
                             dtype={"one": np.float64, 1: str})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_invalid_dtype_per_column(all_parsers):
    """An unknown dtype string raises TypeError."""
    parser = all_parsers
    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

    with pytest.raises(TypeError, match="data type 'foo' not understood"):
        parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [
    "category",
    CategoricalDtype(),
    {"a": "category",
     "b": "category",
     "c": CategoricalDtype()}
])
def test_categorical_dtype(all_parsers, dtype):
    """Every column can be read as categorical."""
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame({"a": Categorical(["1", "1", "2"]),
                          "b": Categorical(["a", "a", "b"]),
                          "c": Categorical(["3.4", "3.4", "4.5"])})
    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [
    {"b": "category"},
    {1: "category"}
])
def test_categorical_dtype_single(all_parsers, dtype):
    """A single column (by name or position) can be made categorical."""
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame({"a": [1, 1, 2],
                          "b": Categorical(["a", "a", "b"]),
                          "c": [3.4, 3.4, 4.5]})
    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_dtype_unsorted(all_parsers):
    """Inferred categories follow order of appearance, not sort order."""
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
    expected = DataFrame({"a": Categorical(["1", "1", "2"]),
                          "b": Categorical(["b", "b", "a"]),
                          "c": Categorical(["3.4", "3.4", "4.5"])})
    result = parser.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_dtype_missing(all_parsers):
    """"nan" strings become missing values, not categories."""
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
    expected = DataFrame({"a": Categorical(["1", "1", "2"]),
                          "b": Categorical(["b", np.nan, "a"]),
                          "c": Categorical(["3.4", "3.4", "4.5"])})
    result = parser.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers):
    """Categorical parsing scales to very many distinct values."""
    # see gh-18186
    parser = all_parsers
    data = np.sort([str(i) for i in range(524289)])
    expected = DataFrame({"a": Categorical(data, ordered=True)})

    result = parser.read_csv(StringIO("a\n" + "\n".join(data)),
                             dtype="category")
    # Impose a deterministic, sorted category order before comparing.
    result["a"] = result["a"].cat.reorder_categories(
        np.sort(result.a.cat.categories), ordered=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
    """Categorical parsing honors a latin-1 encoded file."""
    # see gh-10153
    parser = all_parsers
    pth = os.path.join(csv_dir_path, "unicode_series.csv")
    encoding = "latin-1"

    expected = parser.read_csv(pth, header=None, encoding=encoding)
    expected[1] = Categorical(expected[1])

    result = parser.read_csv(pth, header=None, encoding=encoding,
                             dtype={1: "category"})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
    """Categorical parsing honors a utf-16 encoded file."""
    # see gh-10153
    parser = all_parsers
    pth = os.path.join(csv_dir_path, "utf16_ex.txt")

    expected = parser.read_csv(pth, sep=",", encoding="utf-16")
    expected = expected.apply(Categorical)

    result = parser.read_csv(pth, sep=",", encoding="utf-16",
                             dtype="category")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_dtype_chunksize_infer_categories(all_parsers):
    """Each chunk infers its own categories from the rows it sees."""
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expecteds = [DataFrame({"a": [1, 1],
                            "b": Categorical(["a", "b"])}),
                 DataFrame({"a": [1, 2],
                            "b": Categorical(["b", "c"])},
                           index=[2, 3])]
    reader = parser.read_csv(StringIO(data), dtype={"b": "category"},
                             chunksize=2)

    for result, expected in zip(reader, expecteds):
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
    """Explicit categories are shared across all chunks."""
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    cats = ["a", "b", "c"]
    expecteds = [DataFrame({"a": [1, 1],
                            "b": Categorical(["a", "b"],
                                             categories=cats)}),
                 DataFrame({"a": [1, 2],
                            "b": Categorical(["b", "c"],
                                             categories=cats)},
                           index=[2, 3])]
    reader = parser.read_csv(StringIO(data),
                             dtype={"b": CategoricalDtype(cats)},
                             chunksize=2)

    for result, expected in zip(reader, expecteds):
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize("categories", [
    ["a", "b", "c"],
    ["a", "c", "b"],
    ["a", "b", "c", "d"],
    ["c", "b", "a"],
])
def test_categorical_category_dtype(all_parsers, categories, ordered):
    """An explicit CategoricalDtype fixes categories and ordering."""
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expected = DataFrame({
        "a": [1, 1, 1, 2],
        "b": Categorical(["a", "b", "b", "c"],
                         categories=categories,
                         ordered=ordered)
    })

    result = parser.read_csv(StringIO(data), dtype={
        "b": CategoricalDtype(categories=categories, ordered=ordered)})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_category_dtype_unsorted(all_parsers):
    """Explicit category order is preserved even when unsorted."""
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expected = DataFrame({
        "a": [1, 1, 1, 2],
        "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"])
    })

    result = parser.read_csv(
        StringIO(data), dtype={"b": CategoricalDtype(["c", "b", "a"])})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_coerces_numeric(all_parsers):
    """Parsed strings are coerced to match numeric categories."""
    parser = all_parsers
    dtype = {"b": CategoricalDtype([1, 2, 3])}

    expected = DataFrame({"b": Categorical([1, 1, 2, 3])})
    result = parser.read_csv(StringIO("b\n1\n1\n2\n3"), dtype=dtype)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_coerces_datetime(all_parsers):
    """Parsed strings are coerced to match datetime categories."""
    parser = all_parsers
    dtype = {"b": CategoricalDtype(pd.date_range("2017", "2019", freq="AS"))}

    data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
    # The data rows cover the categories exactly, in order.
    expected = DataFrame({"b": Categorical(dtype["b"].categories)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_coerces_timestamp(all_parsers):
    """Equivalent timestamp spellings map to one Timestamp category."""
    parser = all_parsers
    dtype = {"b": CategoricalDtype([Timestamp("2014")])}

    expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})
    result = parser.read_csv(StringIO("b\n2014-01-01\n2014-01-01T00:00:00"),
                             dtype=dtype)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_coerces_timedelta(all_parsers):
    """Parsed strings are coerced to match timedelta categories."""
    parser = all_parsers
    dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))}

    data = "b\n1H\n2H\n3H"
    # The data rows cover the categories exactly, in order.
    expected = DataFrame({"b": Categorical(dtype["b"].categories)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data", [
    "b\nTrue\nFalse\nNA\nFalse",
    "b\ntrue\nfalse\nNA\nfalse",
    "b\nTRUE\nFALSE\nNA\nFALSE",
    "b\nTrue\nFalse\nNA\nFALSE",
])
def test_categorical_dtype_coerces_boolean(all_parsers, data):
    """All boolean spellings are coerced to bool categories."""
    # see gh-20498
    parser = all_parsers
    expected = DataFrame({"b": Categorical([True, False, None, False])})

    result = parser.read_csv(StringIO(data),
                             dtype={"b": CategoricalDtype([False, True])})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_unexpected_categories(all_parsers):
    """Values outside the declared categories become missing."""
    parser = all_parsers
    dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}

    data = "b\nd\na\nc\nd"  # Unexpected c
    expected = DataFrame({"b": Categorical(list("dacd"),
                                           dtype=dtype["b"])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_pass_dtype(all_parsers):
    """dtype is honored for a header-only (empty) file."""
    parser = all_parsers

    result = parser.read_csv(StringIO("one,two"), dtype={"one": "u1"})

    expected = DataFrame({"one": np.empty(0, dtype="u1"),
                          "two": np.empty(0, dtype=np.object)},
                         index=Index([], dtype=object))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_with_index_pass_dtype(all_parsers):
    """dtype applies to the index column of an empty frame."""
    parser = all_parsers

    result = parser.read_csv(StringIO("one,two"), index_col=["one"],
                             dtype={"one": "u1", 1: "f"})

    expected = DataFrame({"two": np.empty(0, dtype="f")},
                         index=Index([], dtype="u1", name="one"))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_with_multi_index_pass_dtype(all_parsers):
    """dtype applies to each level of an empty MultiIndex."""
    parser = all_parsers

    result = parser.read_csv(StringIO("one,two,three"),
                             index_col=["one", "two"],
                             dtype={"one": "u1", 1: "f8"})

    exp_idx = MultiIndex.from_arrays([np.empty(0, dtype="u1"),
                                      np.empty(0, dtype=np.float64)],
                                     names=["one", "two"])
    expected = DataFrame({"three": np.empty(0, dtype=np.object)},
                         index=exp_idx)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
    """dtype keys may use the mangled (deduplicated) column names."""
    parser = all_parsers

    result = parser.read_csv(StringIO("one,one"),
                             dtype={"one": "u1", "one.1": "f"})

    expected = DataFrame({"one": np.empty(0, dtype="u1"),
                          "one.1": np.empty(0, dtype="f")},
                         index=Index([], dtype=object))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
    """Positional dtype keys work on mangled duplicate column names."""
    parser = all_parsers

    result = parser.read_csv(StringIO("one,one"), dtype={0: "u1", 1: "f"})

    expected = DataFrame({"one": np.empty(0, dtype="u1"),
                          "one.1": np.empty(0, dtype="f")},
                         index=Index([], dtype=object))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
    """Positional dtype keys work with duplicated column names."""
    # see gh-9424
    parser = all_parsers
    expected = concat([Series([], name="one", dtype="u1"),
                       Series([], name="one.1", dtype="f")], axis=1)
    expected.index = expected.index.astype(object)

    result = parser.read_csv(StringIO("one,one"), dtype={0: "u1", 1: "f"})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_with_dup_column_pass_dtype_by_indexes_warn(all_parsers):
    """Explicit duplicate names warn, but dtypes are still applied."""
    # see gh-9424
    parser = all_parsers
    expected = concat([Series([], name="one", dtype="u1"),
                       Series([], name="one.1", dtype="f")], axis=1)
    expected.index = expected.index.astype(object)

    with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
        result = parser.read_csv(StringIO(""), names=["one", "one"],
                                 dtype={0: "u1", 1: "f"})
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_raise_on_passed_int_dtype_with_nas(all_parsers):
    """An integer dtype cannot absorb missing values."""
    # see gh-2631
    parser = all_parsers
    data = """YEAR, DOY, a
2001,106380451,10
2001,,11
2001,106380451,67"""

    # The two engines phrase the failure differently.
    msg = ("Integer column has NA values" if parser.engine == "c" else
           "Unable to convert column DOY")
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), dtype={"DOY": np.int64},
                        skipinitialspace=True)
|
||||
|
||||
|
||||
def test_dtype_with_converters(all_parsers):
    """A converter overrides a conflicting dtype, with a warning."""
    parser = all_parsers
    data = """a,b
1.1,2.2
1.2,2.3"""

    # Dtype spec is ignored when a converter is specified for the column.
    with tm.assert_produces_warning(ParserWarning):
        result = parser.read_csv(StringIO(data), dtype={"a": "i8"},
                                 converters={"a": str})
    expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype,expected", [
    (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
    ("category", DataFrame({"a": Categorical([]),
                            "b": Categorical([])},
                           index=[])),
    (dict(a="category", b="category"),
     DataFrame({"a": Categorical([]),
                "b": Categorical([])},
               index=[])),
    ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
    ("timedelta64[ns]", DataFrame({"a": Series([], dtype="timedelta64[ns]"),
                                   "b": Series([], dtype="timedelta64[ns]")},
                                  index=[])),
    (dict(a=np.int64,
          b=np.int32), DataFrame({"a": Series([], dtype=np.int64),
                                  "b": Series([], dtype=np.int32)},
                                 index=[])),
    ({0: np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64),
                                            "b": Series([], dtype=np.int32)},
                                           index=[])),
    ({"a": np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64),
                                              "b": Series([], dtype=np.int32)},
                                             index=[])),
])
def test_empty_dtype(all_parsers, dtype, expected):
    """Empty frames honor every flavor of dtype specification."""
    # see gh-14712
    parser = all_parsers

    result = parser.read_csv(StringIO("a,b"), header=0, dtype=dtype)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", list(np.typecodes["AllInteger"] +
                                       np.typecodes["Float"]))
def test_numeric_dtype(all_parsers, dtype):
    """Every numpy integer/float typecode is accepted as a dtype."""
    parser = all_parsers
    expected = DataFrame([0, 1], dtype=dtype)

    result = parser.read_csv(StringIO("0\n1"), header=None, dtype=dtype)
    tm.assert_frame_equal(expected, result)
|
||||
@@ -0,0 +1,428 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that the file header is properly handled or inferred
|
||||
during parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO, u
|
||||
from pandas.errors import ParserError
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def test_read_with_bad_header(all_parsers):
    """A header row index past the end of the file raises ValueError."""
    parser = all_parsers

    with pytest.raises(ValueError, match=r"but only \d+ lines in file"):
        parser.read_csv(StringIO(",,"), header=[10])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(all_parsers, header):
    """Booleans are rejected for the header argument."""
    # see gh-6114
    parser = all_parsers
    data = """\
MyColumn
a
b
a
b"""
    with pytest.raises(TypeError,
                       match="Passing a bool to header is invalid"):
        parser.read_csv(StringIO(data), header=header)
|
||||
|
||||
|
||||
def test_no_header_prefix(all_parsers):
    """With header=None, prefix seeds the generated column names."""
    parser = all_parsers
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    expected = DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10],
                          [11, 12, 13, 14, 15]],
                         columns=["Field0", "Field1", "Field2",
                                  "Field3", "Field4"])
    result = parser.read_csv(StringIO(data), prefix="Field", header=None)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_header_with_index_col(all_parsers):
    """Names shorter than the row width make the first column the index."""
    parser = all_parsers
    data = """foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    result = parser.read_csv(StringIO(data), names=["A", "B", "C"])

    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         index=["foo", "bar", "baz"],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_header_not_first_line(all_parsers):
    """header=N skips the N leading rows before the header row."""
    parser = all_parsers
    data = """got,to,ignore,this,line
got,to,ignore,this,line
index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
    data2 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""

    # Skipping two junk lines must match reading the clean data.
    result = parser.read_csv(StringIO(data), header=2, index_col=0)
    expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_header_multi_index(all_parsers):
    """A 4-level column header with a 2-level row index round-trips."""
    parser = all_parsers
    expected = tm.makeCustomDataframe(
        5, 3, r_idx_nlevels=2, c_idx_nlevels=4)

    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3],
                             index_col=[0, 1])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs,msg", [
    (dict(index_col=["foo", "bar"]),
     ("index_col must only contain row numbers "
      "when specifying a multi-index header")),
    (dict(index_col=[0, 1], names=["foo", "bar"]),
     ("cannot specify names when specifying a "
      "multi-index header")),
    (dict(index_col=[0, 1], usecols=["foo", "bar"]),
     ("cannot specify usecols when specifying a "
      "multi-index header")),
])
def test_header_multi_index_invalid(all_parsers, kwargs, msg):
    """Invalid kwarg combinations with a multi-index header raise."""
    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)
|
||||
|
||||
|
||||
# Named-tuple column labels used below to exercise tuple-like `names`.
_TestTuple = namedtuple("names", ["first", "second"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [
    dict(header=[0, 1]),
    dict(skiprows=3,
         names=[("a", "q"), ("a", "r"), ("a", "s"),
                ("b", "t"), ("c", "u"), ("c", "v")]),
    dict(skiprows=3,
         names=[_TestTuple("a", "q"), _TestTuple("a", "r"),
                _TestTuple("a", "s"), _TestTuple("b", "t"),
                _TestTuple("c", "u"), _TestTuple("c", "v")])
])
def test_header_multi_index_common_format1(all_parsers, kwargs):
    """Common multi-index format including a blank separator row."""
    parser = all_parsers
    cols = MultiIndex.from_tuples([("a", "q"), ("a", "r"), ("a", "s"),
                                   ("b", "t"), ("c", "u"), ("c", "v")])
    expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
                         index=["one", "two"], columns=cols)
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    res = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [
    dict(header=[0, 1]),
    dict(skiprows=2,
         names=[("a", "q"), ("a", "r"), ("a", "s"),
                ("b", "t"), ("c", "u"), ("c", "v")]),
    dict(skiprows=2,
         names=[_TestTuple("a", "q"), _TestTuple("a", "r"),
                _TestTuple("a", "s"), _TestTuple("b", "t"),
                _TestTuple("c", "u"), _TestTuple("c", "v")])
])
def test_header_multi_index_common_format2(all_parsers, kwargs):
    """Common multi-index format without a blank separator row."""
    parser = all_parsers
    cols = MultiIndex.from_tuples([("a", "q"), ("a", "r"), ("a", "s"),
                                   ("b", "t"), ("c", "u"), ("c", "v")])
    expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
                         index=["one", "two"], columns=cols)
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    res = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [
    dict(header=[0, 1]),
    dict(skiprows=2,
         names=[("a", "q"), ("a", "r"), ("a", "s"),
                ("b", "t"), ("c", "u"), ("c", "v")]),
    dict(skiprows=2,
         names=[_TestTuple("a", "q"), _TestTuple("a", "r"),
                _TestTuple("a", "s"), _TestTuple("b", "t"),
                _TestTuple("c", "u"), _TestTuple("c", "v")])
])
def test_header_multi_index_common_format3(all_parsers, kwargs):
    """Common multi-index format without any index column."""
    parser = all_parsers
    cols = MultiIndex.from_tuples([("a", "q"), ("a", "r"), ("a", "s"),
                                   ("b", "t"), ("c", "u"), ("c", "v")])
    expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
                         index=["one", "two"], columns=cols)
    # No index column in the data, so a plain RangeIndex is expected.
    expected = expected.reset_index(drop=True)
    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    res = parser.read_csv(StringIO(data), index_col=None, **kwargs)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_header_multi_index_common_format_malformed1(all_parsers):
    """Malformed common format: header labels also cover the index column."""
    parser = all_parsers
    cols = MultiIndex(levels=[[u("a"), u("b"), u("c")],
                              [u("r"), u("s"), u("t"), u("u"), u("v")]],
                      codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                      names=[u("a"), u("q")])
    expected = DataFrame(np.array([[2, 3, 4, 5, 6],
                                   [8, 9, 10, 11, 12]], dtype="int64"),
                         index=Index([1, 7]), columns=cols)
    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    res = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(expected, res)
|
||||
|
||||
|
||||
def test_header_multi_index_common_format_malformed2(all_parsers):
    """Malformed common format: unnamed first level over the index column."""
    parser = all_parsers
    cols = MultiIndex(levels=[[u("a"), u("b"), u("c")],
                              [u("r"), u("s"), u("t"), u("u"), u("v")]],
                      codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                      names=[None, u("q")])
    expected = DataFrame(np.array([[2, 3, 4, 5, 6],
                                   [8, 9, 10, 11, 12]], dtype="int64"),
                         index=Index([1, 7]), columns=cols)
    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    res = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(expected, res)
|
||||
|
||||
|
||||
def test_header_multi_index_common_format_malformed3(all_parsers):
    """Malformed common format read with a two-column multi-index."""
    parser = all_parsers
    idx = MultiIndex(levels=[[1, 7], [2, 8]],
                     codes=[[0, 1], [0, 1]])
    cols = MultiIndex(levels=[[u("a"), u("b"), u("c")],
                              [u("s"), u("t"), u("u"), u("v")]],
                      codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
                      names=[None, u("q")])
    expected = DataFrame(np.array([[3, 4, 5, 6],
                                   [9, 10, 11, 12]], dtype="int64"),
                         index=idx, columns=cols)
    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    res = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
    tm.assert_frame_equal(expected, res)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,header", [
    ("1,2,3\n4,5,6", None),
    ("foo,bar,baz\n1,2,3\n4,5,6", 0),
])
def test_header_names_backward_compat(all_parsers, data, header):
    """see gh-2539: explicit names yield the same frame for any header."""
    parser = all_parsers
    expected = parser.read_csv(StringIO("1,2,3\n4,5,6"),
                               names=["a", "b", "c"])
    res = parser.read_csv(StringIO(data), names=["a", "b", "c"],
                          header=header)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [dict(), dict(index_col=False)])
def test_read_only_header_no_rows(all_parsers, kwargs):
    """See gh-7773: a header-only CSV yields an empty frame."""
    parser = all_parsers
    expected = DataFrame(columns=["a", "b", "c"])
    res = parser.read_csv(StringIO("a,b,c"), **kwargs)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs,names", [
    (dict(), [0, 1, 2, 3, 4]),
    (dict(prefix="X"), ["X0", "X1", "X2", "X3", "X4"]),
    (dict(names=["foo", "bar", "baz", "quux", "panda"]),
     ["foo", "bar", "baz", "quux", "panda"])
])
def test_no_header(all_parsers, kwargs, names):
    """header=None: columns are numbered, prefixed, or explicitly named."""
    parser = all_parsers
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    expected = DataFrame([[1, 2, 3, 4, 5],
                          [6, 7, 8, 9, 10],
                          [11, 12, 13, 14, 15]], columns=names)
    res = parser.read_csv(StringIO(data), header=None, **kwargs)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("header", [
    ["a", "b"],
    "string_header"
])
def test_non_int_header(all_parsers, header):
    """see gh-16338: non-integer header values raise ValueError."""
    parser = all_parsers
    data = """1,2\n3,4"""
    msg = "header must be integer or list of integers"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=header)
|
||||
|
||||
|
||||
def test_singleton_header(all_parsers):
    """see gh-7757: header=[0] behaves like header=0."""
    parser = all_parsers
    data = """a,b,c\n0,1,2\n1,2,3"""
    expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
    res = parser.read_csv(StringIO(data), header=[0])
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,expected", [
    ("A,A,A,B\none,one,one,two\n0,40,34,0.1",
     DataFrame([[0, 40, 34, 0.1]],
               columns=MultiIndex.from_tuples(
                   [("A", "one"), ("A", "one.1"),
                    ("A", "one.2"), ("B", "two")]))),
    ("A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
     DataFrame([[0, 40, 34, 0.1]],
               columns=MultiIndex.from_tuples(
                   [("A", "one"), ("A", "one.1"),
                    ("A", "one.1.1"), ("B", "two")]))),
    ("A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
     DataFrame([[0, 40, 34, 0.1, 0.1]],
               columns=MultiIndex.from_tuples(
                   [("A", "one"), ("A", "one.1"),
                    ("A", "one.1.1"), ("B", "two"),
                    ("B", "two.1")])))
])
def test_mangles_multi_index(all_parsers, data, expected):
    """see gh-18062: duplicate labels in a multi-index header are mangled."""
    res = all_parsers.read_csv(StringIO(data), header=[0, 1])
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index_col", [None, [0]])
@pytest.mark.parametrize("columns", [
    None,
    ["", "Unnamed"],
    ["Unnamed", ""],
    ["Unnamed", "NotUnnamed"],
])
def test_multi_index_unnamed(all_parsers, index_col, columns):
    """see gh-23687: partially-unnamed multi-index headers parse.

    Only a header row in which EVERY column lacked a name (and got a
    placeholder) should error — not one that merely contains the string
    "Unnamed" somewhere.
    """
    parser = all_parsers
    header = [0, 1]

    if index_col is None:
        data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
    else:
        data = (",".join([""] + (columns or ["", ""])) +
                "\n,0,1\n0,2,3\n1,4,5\n")

    if columns is None:
        msg = (r"Passed header=\[0,1\] are too "
               r"many rows for this multi_index of columns")
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), header=header,
                            index_col=index_col)
    else:
        res = parser.read_csv(StringIO(data), header=header,
                              index_col=index_col)
        template = "Unnamed: {i}_level_0"
        exp_columns = []

        for i, col in enumerate(columns):
            if not col:  # unnamed -> placeholder label
                col = template.format(i=i if index_col is None else i + 1)
            exp_columns.append(col)

        mi = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
        expected = DataFrame([[2, 3], [4, 5]], columns=mi)
        tm.assert_frame_equal(res, expected)
|
||||
@@ -0,0 +1,152 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that the specified index column (a.k.a "index_col")
|
||||
is properly handled or inferred during parsing for all of
|
||||
the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("with_header", [True, False])
def test_index_col_named(all_parsers, with_header):
    """Named index_col works with a header and raises without one."""
    parser = all_parsers
    no_header = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""  # noqa
    header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"  # noqa

    if with_header:
        data = header + no_header

        res = parser.read_csv(StringIO(data), index_col="ID")
        expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
        tm.assert_frame_equal(res, expected)
    else:
        data = no_header
        msg = "Index ID invalid"

        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), index_col="ID")
|
||||
|
||||
|
||||
def test_index_col_named2(all_parsers):
    """A name listed in index_col becomes the (named) index."""
    parser = all_parsers
    data = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
    col_names = ["a", "b", "c", "d", "message"]
    expected = DataFrame({"a": [1, 5, 9], "b": [2, 6, 10],
                          "c": [3, 7, 11], "d": [4, 8, 12]},
                         index=Index(["hello", "world", "foo"],
                                     name="message"))

    res = parser.read_csv(StringIO(data), names=col_names,
                          index_col=["message"])
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_index_col_is_true(all_parsers):
    """see gh-9798: index_col=True is rejected outright."""
    parser = all_parsers
    data = "a,b\n1,2"

    with pytest.raises(ValueError, match="The value of index_col "
                                         "couldn't be 'True'"):
        parser.read_csv(StringIO(data), index_col=True)
|
||||
|
||||
|
||||
def test_infer_index_col(all_parsers):
    """One more data column than header names: first column is the index."""
    parser = all_parsers
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         index=["foo", "bar", "baz"],
                         columns=["A", "B", "C"])
    res = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index_col,kwargs", [
    (None, dict(columns=["x", "y", "z"])),
    (False, dict(columns=["x", "y", "z"])),
    (0, dict(columns=["y", "z"], index=Index([], name="x"))),
    (1, dict(columns=["x", "z"], index=Index([], name="y"))),
    ("x", dict(columns=["y", "z"], index=Index([], name="x"))),
    ("y", dict(columns=["x", "z"], index=Index([], name="y"))),
    ([0, 1], dict(columns=["z"], index=MultiIndex.from_arrays(
        [[]] * 2, names=["x", "y"]))),
    (["x", "y"], dict(columns=["z"], index=MultiIndex.from_arrays(
        [[]] * 2, names=["x", "y"]))),
    ([1, 0], dict(columns=["z"], index=MultiIndex.from_arrays(
        [[]] * 2, names=["y", "x"]))),
    (["y", "x"], dict(columns=["z"], index=MultiIndex.from_arrays(
        [[]] * 2, names=["y", "x"]))),
])
def test_index_col_empty_data(all_parsers, index_col, kwargs):
    """Header-only input honors every index_col flavor."""
    parser = all_parsers
    res = parser.read_csv(StringIO("x,y,z"), index_col=index_col)

    expected = DataFrame([], **kwargs)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_empty_with_index_col_false(all_parsers):
    """see gh-10413: index_col=False on header-only input keeps all columns."""
    parser = all_parsers
    res = parser.read_csv(StringIO("x,y"), index_col=False)

    expected = DataFrame([], columns=["x", "y"])
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index_names", [
    ["", ""],
    ["foo", ""],
    ["", "bar"],
    ["foo", "bar"],
    ["NotReallyUnnamed", "Unnamed: 0"],
])
def test_multi_index_naming(all_parsers, index_names):
    """Empty index names stay None, not "Unnamed: 0" placeholders."""
    parser = all_parsers
    data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
    res = parser.read_csv(StringIO(data), index_col=[0, 1])

    expected = DataFrame({"col": [1, 2, 3, 4]},
                         index=MultiIndex.from_product([["a", "b"],
                                                        ["c", "d"]]))
    expected.index.names = [name if name else None for name in index_names]
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_multi_index_naming_not_all_at_beginning(all_parsers):
    """index_col may reference non-adjacent columns."""
    parser = all_parsers
    data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
    res = parser.read_csv(StringIO(data), index_col=[0, 2])

    idx = MultiIndex(levels=[['a', 'b'], [1, 2, 3, 4]],
                     codes=[[0, 0, 1, 1], [0, 1, 2, 3]])
    expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]}, index=idx)
    tm.assert_frame_equal(res, expected)
|
||||
@@ -0,0 +1,119 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that duplicate columns are handled appropriately when parsed by the
|
||||
CSV engine. In general, the expected result is that they are either thoroughly
|
||||
de-duplicated (if mangling requested) or ignored otherwise.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)])
def test_basic(all_parsers, kwargs):
    """Duplicate header labels are de-duplicated with numeric suffixes."""
    # TODO: add test for condition "mangle_dupe_cols=False"
    # once it is actually supported (gh-12935)
    parser = all_parsers
    data = "a,a,b,b,b\n1,2,3,4,5"

    expected = DataFrame([[1, 2, 3, 4, 5]],
                         columns=["a", "a.1", "b", "b.1", "b.2"])
    res = parser.read_csv(StringIO(data), sep=",", **kwargs)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_basic_names(all_parsers):
    """See gh-7160: duplicate labels in the file header are mangled."""
    parser = all_parsers
    data = "a,b,a\n0,1,2\n3,4,5"

    expected = DataFrame([[0, 1, 2], [3, 4, 5]],
                         columns=["a", "b", "a.1"])
    res = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_basic_names_warn(all_parsers):
    """See gh-7160: duplicate user-supplied names warn and get mangled."""
    parser = all_parsers
    data = "0,1,2\n3,4,5"
    expected = DataFrame([[0, 1, 2], [3, 4, 5]],
                         columns=["a", "b", "a.1"])

    with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
        res = parser.read_csv(StringIO(data), names=["a", "b", "a"])
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,expected", [
    ("a,a,a.1\n1,2,3",
     DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])),
    ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
     DataFrame([[1, 2, 3, 4, 5, 6]],
               columns=["a", "a.1", "a.1.1", "a.1.1.1",
                        "a.1.1.1.1", "a.1.1.1.1.1"])),
    ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
     DataFrame([[1, 2, 3, 4, 5, 6, 7]],
               columns=["a", "a.1", "a.3", "a.1.1",
                        "a.2", "a.2.1", "a.3.1"]))
])
def test_thorough_mangle_columns(all_parsers, data, expected):
    """see gh-17060: mangled names never collide with existing labels."""
    res = all_parsers.read_csv(StringIO(data))
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,names,expected", [
    ("a,b,b\n1,2,3",
     ["a.1", "a.1", "a.1.1"],
     DataFrame([["a", "b", "b"], ["1", "2", "3"]],
               columns=["a.1", "a.1.1", "a.1.1.1"])),
    ("a,b,c,d,e,f\n1,2,3,4,5,6",
     ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
     DataFrame([["a", "b", "c", "d", "e", "f"],
                ["1", "2", "3", "4", "5", "6"]],
               columns=["a", "a.1", "a.1.1", "a.1.1.1",
                        "a.1.1.1.1", "a.1.1.1.1.1"])),
    ("a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
     ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
     DataFrame([["a", "b", "c", "d", "e", "f", "g"],
                ["1", "2", "3", "4", "5", "6", "7"]],
               columns=["a", "a.1", "a.3", "a.1.1",
                        "a.2", "a.2.1", "a.3.1"])),
])
def test_thorough_mangle_names(all_parsers, data, names, expected):
    """see gh-17095: duplicate user names warn and are fully mangled."""
    parser = all_parsers

    with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
        res = parser.read_csv(StringIO(data), names=names)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_mangled_unnamed_placeholders(all_parsers):
    """xref gh-13017: repeated round-trips keep mangling unnamed columns."""
    orig_key = "0"
    parser = all_parsers

    orig_value = [1, 2, 3]
    df = DataFrame({orig_key: orig_value})

    # Each to_csv/read_csv round-trip adds one more
    # "Unnamed: 0[.1...]" column derived from the written index.
    for i in range(3):
        expected = DataFrame()

        for j in range(i + 1):
            expected["Unnamed: 0" + ".1" * j] = [0, 1, 2]

        expected[orig_key] = orig_value
        df = parser.read_csv(StringIO(df.to_csv()))

        tm.assert_frame_equal(df, expected)
|
||||
@@ -0,0 +1,145 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests multithreading behaviour for reading and
|
||||
parsing files for each parser defined in parsers.py
|
||||
"""
|
||||
|
||||
from __future__ import division
|
||||
|
||||
from multiprocessing.pool import ThreadPool
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat import BytesIO, range
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def _construct_dataframe(num_rows):
|
||||
"""
|
||||
Construct a DataFrame for testing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
num_rows : int
|
||||
The number of rows for our DataFrame.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
"""
|
||||
df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde"))
|
||||
df["foo"] = "foo"
|
||||
df["bar"] = "bar"
|
||||
df["baz"] = "baz"
|
||||
df["date"] = pd.date_range("20000101 09:00:00",
|
||||
periods=num_rows,
|
||||
freq="s")
|
||||
df["int"] = np.arange(num_rows, dtype="int64")
|
||||
return df
|
||||
|
||||
|
||||
def test_multi_thread_string_io_read_csv(all_parsers):
    """see gh-11786: concurrent read_csv calls on identical buffers agree."""
    parser = all_parsers
    max_row_range = 10000
    num_files = 100

    bytes_to_df = [
        "\n".join(
            ["%d,%d,%d" % (i, i, i) for i in range(max_row_range)]
        ).encode() for _ in range(num_files)]
    files = [BytesIO(b) for b in bytes_to_df]

    # Read all files in many threads.  The pool was previously leaked;
    # close/join it even if a worker raises.
    pool = ThreadPool(8)
    try:
        results = pool.map(parser.read_csv, files)
    finally:
        pool.close()
        pool.join()

    first_result = results[0]

    for result in results:
        tm.assert_frame_equal(first_result, result)
|
||||
|
||||
|
||||
def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
|
||||
"""
|
||||
Generate a DataFrame via multi-thread.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
parser : BaseParser
|
||||
The parser object to use for reading the data.
|
||||
path : str
|
||||
The location of the CSV file to read.
|
||||
num_rows : int
|
||||
The number of rows to read per task.
|
||||
num_tasks : int
|
||||
The number of tasks to use for reading this DataFrame.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
"""
|
||||
def reader(arg):
|
||||
"""
|
||||
Create a reader for part of the CSV.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arg : tuple
|
||||
A tuple of the following:
|
||||
|
||||
* start : int
|
||||
The starting row to start for parsing CSV
|
||||
* nrows : int
|
||||
The number of rows to read.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
"""
|
||||
start, nrows = arg
|
||||
|
||||
if not start:
|
||||
return parser.read_csv(path, index_col=0, header=0,
|
||||
nrows=nrows, parse_dates=["date"])
|
||||
|
||||
return parser.read_csv(path, index_col=0, header=None,
|
||||
skiprows=int(start) + 1,
|
||||
nrows=nrows, parse_dates=[9])
|
||||
|
||||
tasks = [
|
||||
(num_rows * i // num_tasks,
|
||||
num_rows // num_tasks) for i in range(num_tasks)
|
||||
]
|
||||
|
||||
pool = ThreadPool(processes=num_tasks)
|
||||
results = pool.map(reader, tasks)
|
||||
|
||||
header = results[0].columns
|
||||
|
||||
for r in results[1:]:
|
||||
r.columns = header
|
||||
|
||||
final_dataframe = pd.concat(results)
|
||||
return final_dataframe
|
||||
|
||||
|
||||
def test_multi_thread_path_multipart_read_csv(all_parsers):
    """see gh-11786: a multi-threaded chunked read reassembles the frame."""
    num_tasks = 4
    num_rows = 100000

    parser = all_parsers
    file_name = "__thread_pool_reader__.csv"
    df = _construct_dataframe(num_rows)

    with tm.ensure_clean(file_name) as path:
        df.to_csv(path)

        rebuilt = _generate_multi_thread_dataframe(parser, path,
                                                   num_rows, num_tasks)
        tm.assert_frame_equal(df, rebuilt)
|
||||
@@ -0,0 +1,441 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that NA values are properly handled during
|
||||
parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO, range
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex
|
||||
import pandas.util.testing as tm
|
||||
|
||||
import pandas.io.common as com
|
||||
|
||||
|
||||
def test_string_nas(all_parsers):
    """Empty fields among strings parse as NaN."""
    parser = all_parsers
    data = """A,B,C
a,b,c
d,,f
,g,h
"""
    expected = DataFrame([["a", "b", "c"],
                          ["d", np.nan, "f"],
                          [np.nan, "g", "h"]],
                         columns=["A", "B", "C"])
    res = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_detect_string_na(all_parsers):
    """NA/NaN/nan string sentinels are detected by default."""
    parser = all_parsers
    data = """A,B
foo,bar
NA,baz
NaN,nan
"""
    expected = DataFrame([["foo", "bar"], [np.nan, "baz"],
                          [np.nan, np.nan]], columns=["A", "B"])
    res = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_values", [
    ["-999.0", "-999"],
    [-999, -999.0],
    [-999.0, -999],
    ["-999.0"], ["-999"],
    [-999.0], [-999]
])
@pytest.mark.parametrize("data", [
    """A,B
-999,1.2
2,-999
3,4.5
""",
    """A,B
-999,1.200
2,-999.000
3,4.500
"""
])
def test_non_string_na_values(all_parsers, data, na_values):
    """see gh-3611: numeric and string NA sentinels both match floats."""
    # With an odd float format we can't match the string "999.0"
    # exactly, but float matching still has to succeed.
    parser = all_parsers
    expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
                          [3.0, 4.5]], columns=["A", "B"])

    res = parser.read_csv(StringIO(data), na_values=na_values)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_default_na_values(all_parsers):
    """Every default NA sentinel is recognized in every column position."""
    _NA_VALUES = {"-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A",
                  "N/A", "n/a", "NA", "#NA", "NULL", "null", "NaN", "nan",
                  "-NaN", "-nan", "#N/A N/A", ""}
    assert _NA_VALUES == com._NA_VALUES

    parser = all_parsers
    nv = len(_NA_VALUES)

    def make_row(i, v):
        # Place sentinel v in column i of an nv-wide CSV row.
        return "," * i + v + "," * (nv - i - 1)

    data = StringIO("\n".join(make_row(i, v)
                              for i, v in enumerate(_NA_VALUES)))
    expected = DataFrame(np.nan, columns=range(nv), index=range(nv))

    res = parser.read_csv(data, header=None)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
def test_custom_na_values(all_parsers, na_values):
    """User-supplied NA values are added to the default sentinel set."""
    parser = all_parsers
    data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
    expected = DataFrame([[1., np.nan, 3], [np.nan, 5, np.nan],
                          [7, 8, np.nan]], columns=["A", "B", "C"])
    res = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_bool_na_values(all_parsers):
    """NA mixed into boolean columns yields object dtype with NaN."""
    data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""
    parser = all_parsers
    expected = DataFrame({"A": np.array([True, np.nan, False], dtype=object),
                          "B": np.array([False, True, np.nan], dtype=object),
                          "C": [True, False, True]})
    res = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_na_value_dict(all_parsers):
    """Per-column na_values dicts apply only to their own columns."""
    data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
    parser = all_parsers
    expected = DataFrame({"A": [np.nan, "bar", np.nan, "bar"],
                          "B": [np.nan, "foo", np.nan, "foo"],
                          "C": [np.nan, "foo", np.nan, "foo"]})
    res = parser.read_csv(StringIO(data),
                          na_values={"A": ["foo"], "B": ["bar"]})
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index_col,expected", [
    ([0], DataFrame({"b": [np.nan], "c": [1], "d": [5]},
                    index=Index([0], name="a"))),
    ([0, 2], DataFrame({"b": [np.nan], "d": [5]},
                       index=MultiIndex.from_tuples(
                           [(0, 1)], names=["a", "c"]))),
    (["a", "c"], DataFrame({"b": [np.nan], "d": [5]},
                           index=MultiIndex.from_tuples(
                               [(0, 1)], names=["a", "c"]))),
])
def test_na_value_dict_multi_index(all_parsers, index_col, expected):
    """An empty na_values set combined with various index_col choices."""
    data = """\
a,b,c,d
0,NA,1,5
"""
    parser = all_parsers
    res = parser.read_csv(StringIO(data), na_values=set(),
                          index_col=index_col)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs,expected", [
    (dict(), DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                        "B": [1, 2, 3, 4, 5, 6, 7],
                        "C": ["one", "two", "three", np.nan, "five",
                              np.nan, "seven"]})),
    (dict(na_values={"A": [], "C": []}, keep_default_na=False),
     DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"],
                "B": [1, 2, 3, 4, 5, 6, 7],
                "C": ["one", "two", "three", "nan", "five", "", "seven"]})),
    (dict(na_values=["a"], keep_default_na=False),
     DataFrame({"A": [np.nan, "b", "", "d", "e", "nan", "g"],
                "B": [1, 2, 3, 4, 5, 6, 7],
                "C": ["one", "two", "three", "nan", "five", "", "seven"]})),
    (dict(na_values={"A": [], "C": []}),
     DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                "B": [1, 2, 3, 4, 5, 6, 7],
                "C": ["one", "two", "three", np.nan,
                      "five", np.nan, "seven"]})),
])
def test_na_values_keep_default(all_parsers, kwargs, expected):
    """keep_default_na toggles the built-in NA sentinel list."""
    data = """\
A,B,C
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    parser = all_parsers
    res = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_no_na_values_no_keep_default(all_parsers):
    """see gh-4318: na_values=None + keep_default_na=False keeps all text."""
    data = """\
A,B,C
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    parser = all_parsers
    expected = DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"],
                          "B": [1, 2, 3, 4, 5, 6, 7],
                          "C": ["None", "two", "None", "nan",
                                "five", "", "seven"]})
    res = parser.read_csv(StringIO(data), keep_default_na=False)
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_no_keep_default_na_dict_na_values(all_parsers):
|
||||
# see gh-19227
|
||||
data = "a,b\n,2"
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), na_values={"b": ["2"]},
|
||||
keep_default_na=False)
|
||||
expected = DataFrame({"a": [""], "b": [np.nan]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
    # see gh-19227: scalar (non-list) entries in the na_values dict
    # must not cause parsing to crash or fail.
    result = all_parsers.read_csv(StringIO("a,b\n1,2"),
                                  na_values={"b": 2},
                                  keep_default_na=False)
    expected = DataFrame({"a": [1], "b": [np.nan]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("col_zero_na_values", [
    113125, "113125"
])
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers,
                                                      col_zero_na_values):
    # see gh-19227: an na_values sentinel should match whether it is
    # supplied as an int or as its string representation.
    data = ('113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008\n'
            '729639,"qwer","",asdfkj,466.681,,252.373\n')
    expected = DataFrame({0: [np.nan, 729639.0],
                          1: [np.nan, "qwer"],
                          2: ["/blaha", np.nan],
                          3: ["kjsdkj", "asdfkj"],
                          4: [412.166, 466.681],
                          5: ["225.874", ""],
                          6: [np.nan, 252.373]})
    result = all_parsers.read_csv(StringIO(data), header=None,
                                  keep_default_na=False,
                                  na_values={2: "", 6: "214.008",
                                             1: "blah",
                                             0: col_zero_na_values})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_filter,row_data", [
    (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
    (False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
])
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
    # na_filter=False disables NA detection entirely, overriding
    # any explicit na_values list.
    expected = DataFrame(row_data, columns=["A", "B"])
    result = all_parsers.read_csv(StringIO("A,B\n1,A\nnan,B\n3,C\n"),
                                  na_values=["B"],
                                  na_filter=na_filter)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_na_trailing_columns(all_parsers):
    # Rows shorter than the header: the missing trailing fields
    # should be filled in with NaN.
    data = ("Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax\n"
            "2012-03-14,USD,AAPL,BUY,1000\n"
            "2012-05-12,USD,SBUX,SELL,500")
    expected = DataFrame([
        ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
        ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
    ], columns=["Date", "Currency", "Symbol", "Type",
                "Units", "UnitPrice", "Cost", "Tax"])
    result = all_parsers.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_values,row_data", [
    (1, [[np.nan, 2.0], [2.0, np.nan]]),
    ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
])
def test_na_values_scalar(all_parsers, na_values, row_data):
    # see gh-12224: a scalar na_values applies to every column,
    # while a dict applies per column.
    names = ["a", "b"]
    result = all_parsers.read_csv(StringIO("1,2\n2,1"),
                                  names=names, na_values=na_values)
    tm.assert_frame_equal(result, DataFrame(row_data, columns=names))
|
||||
|
||||
|
||||
def test_na_values_dict_aliasing(all_parsers):
    # The parser must not mutate the caller's na_values dict.
    na_values = {"a": 2, "b": 1}
    na_values_copy = na_values.copy()
    names = ["a", "b"]

    expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
    result = all_parsers.read_csv(StringIO("1,2\n2,1"),
                                  names=names, na_values=na_values)

    tm.assert_frame_equal(result, expected)
    tm.assert_dict_equal(na_values, na_values_copy)
|
||||
|
||||
|
||||
def test_na_values_dict_col_index(all_parsers):
|
||||
# see gh-14203
|
||||
data = "a\nfoo\n1"
|
||||
parser = all_parsers
|
||||
na_values = {0: "foo"}
|
||||
|
||||
result = parser.read_csv(StringIO(data), na_values=na_values)
|
||||
expected = DataFrame({"a": [np.nan, 1]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,kwargs,expected", [
    (str(2**63) + "\n" + str(2**63 + 1),
     dict(na_values=[2**63]), DataFrame([str(2**63), str(2**63 + 1)])),
    (str(2**63) + ",1" + "\n,2",
     dict(), DataFrame([[str(2**63), 1], ['', 2]])),
    (str(2**63) + "\n1",
     dict(na_values=[2**63]), DataFrame([np.nan, 1])),
])
def test_na_values_uint64(all_parsers, data, kwargs, expected):
    # see gh-14983: NA handling of values at and beyond the uint64
    # boundary (2**63) must not overflow or mis-convert.
    frame = all_parsers.read_csv(StringIO(data), header=None, **kwargs)
    tm.assert_frame_equal(frame, expected)
|
||||
|
||||
|
||||
def test_empty_na_values_no_default_with_index(all_parsers):
|
||||
# see gh-15835
|
||||
data = "a,1\nb,2"
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))
|
||||
|
||||
result = parser.read_csv(StringIO(data), index_col=0,
|
||||
keep_default_na=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_filter,index_data", [
    (False, ["", "5"]),
    (True, [np.nan, 5.0]),
])
def test_no_na_filter_on_index(all_parsers, na_filter, index_data):
    # see gh-5239: NA values in an index column are only converted
    # when na_filter=True.
    expected = DataFrame({"a": [1, 4], "c": [3, 6]},
                         index=Index(index_data, name="b"))
    result = all_parsers.read_csv(StringIO("a,b,c\n1,,3\n4,5,6"),
                                  index_col=[1],
                                  na_filter=na_filter)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_inf_na_values_with_int_index(all_parsers):
    # see gh-17128: inf/-inf sentinels combined with an integer index
    # column must not raise OverflowError.
    data = "idx,col1,col2\n1,3,4\n2,inf,-inf"
    expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
                         index=Index([1, 2], name="idx"))
    out = all_parsers.read_csv(StringIO(data), index_col=[0],
                               na_values=["inf", "-inf"])
    tm.assert_frame_equal(out, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
    # see gh-20377: with dtype=str, na_filter decides whether a missing
    # field becomes NaN (True) or stays an empty string (False).
    empty = np.nan if na_filter else ""
    expected = DataFrame({"a": ["1", "4"],
                          "b": [empty, "5"],
                          "c": ["3", "6"]})
    result = all_parsers.read_csv(StringIO("a,b,c\n1,,3\n4,5,6"),
                                  na_filter=na_filter, dtype=str)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data, na_values", [
    ("false,1\n,1\ntrue", None),
    ("false,1\nnull,1\ntrue", None),
    ("false,1\nnan,1\ntrue", None),
    ("false,1\nfoo,1\ntrue", 'foo'),
    ("false,1\nfoo,1\ntrue", ['foo']),
    ("false,1\nfoo,1\ntrue", {'a': 'foo'}),
])
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
    # Requesting a bool dtype for a column that contains NA values
    # must raise; the C and Python engines word the error differently,
    # so match either message.
    msg = ("(Bool column has NA values in column [0a])|"
           "(cannot safely convert passed user dtype of "
           "bool for object dtyped data in column 0)")
    with pytest.raises(ValueError, match=msg):
        all_parsers.read_csv(StringIO(data), header=None, names=['a', 'b'],
                             dtype={'a': 'bool'}, na_values=na_values)
|
||||
@@ -0,0 +1,204 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests parsers ability to read and parse non-local files
|
||||
and hence require a network connection to be read.
|
||||
"""
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import BytesIO, StringIO
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.io.parsers import read_csv
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize(
    "compress_type, extension", [
        ('gzip', '.gz'), ('bz2', '.bz2'), ('zip', '.zip'),
        pytest.param('xz', '.xz', marks=td.skip_if_no_lzma)
    ]
)
@pytest.mark.parametrize('mode', ['explicit', 'infer'])
@pytest.mark.parametrize('engine', ['python', 'c'])
def test_compressed_urls(salaries_table, compress_type, extension, mode,
                         engine):
    # Thin parametrized entry point; the actual body lives in
    # check_compressed_urls so the @tm.network decorator can wrap it.
    check_compressed_urls(salaries_table, compress_type, extension, mode,
                          engine)
|
||||
|
||||
|
||||
@tm.network
def check_compressed_urls(salaries_table, compression, extension, mode,
                          engine):
    # test reading compressed urls with various engines and
    # extension inference
    base_url = ('https://github.com/pandas-dev/pandas/raw/master/'
                'pandas/tests/io/parser/data/salaries.csv')

    url = base_url + extension

    # mode == 'infer' is passed through as the compression argument so
    # that read_csv infers the codec from the URL's file extension.
    if mode != 'explicit':
        compression = mode

    url_table = read_csv(url, sep='\t', compression=compression, engine=engine)
    tm.assert_frame_equal(url_table, salaries_table)
|
||||
|
||||
|
||||
@pytest.fixture
def tips_df(datapath):
    """DataFrame with the tips dataset, read from the local test data dir."""
    return read_csv(datapath('io', 'parser', 'data', 'tips.csv'))
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("s3_resource")
@td.skip_if_not_us_locale()
class TestS3(object):
    """Integration tests for reading CSVs from S3 URLs (s3/s3n/s3a schemes)."""

    def test_parse_public_s3_bucket(self, tips_df):
        pytest.importorskip('s3fs')

        # more of an integration test due to the not-public contents portion
        # can probably mock this though.
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' +
                          ext, compression=comp)
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

        # Read public file from bucket with not-public contents
        df = read_csv('s3://cant_get_it/tips.csv')
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3n_bucket(self, tips_df):

        # Read from AWS s3 as "s3n" URL
        df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3a_bucket(self, tips_df):
        # Read from AWS s3 as "s3a" URL
        df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_nrows(self, tips_df):
        # nrows should work identically for each compression codec.
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' +
                          ext, nrows=10, compression=comp)
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_chunked(self, tips_df):
        # Read with a chunksize
        chunksize = 5
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                 chunksize=chunksize, compression=comp)
            assert df_reader.chunksize == chunksize
            for i_chunk in [0, 1, 2]:
                # Read a couple of chunks and make sure we see them
                # properly.
                df = df_reader.get_chunk()
                assert isinstance(df, DataFrame)
                assert not df.empty
                true_df = tips_df.iloc[
                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
                tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_chunked_python(self, tips_df):
        # Read with a chunksize using the Python parser
        chunksize = 5
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                 chunksize=chunksize, compression=comp,
                                 engine='python')
            assert df_reader.chunksize == chunksize
            for i_chunk in [0, 1, 2]:
                # Read a couple of chunks and make sure we see them properly.
                df = df_reader.get_chunk()
                assert isinstance(df, DataFrame)
                assert not df.empty
                true_df = tips_df.iloc[
                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
                tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_python(self, tips_df):
        # Same as test_parse_public_s3_bucket but forcing the Python engine.
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                          compression=comp)
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_infer_s3_compression(self, tips_df):
        # compression='infer' should pick the codec from the URL extension.
        for ext in ['', '.gz', '.bz2']:
            df = read_csv('s3://pandas-test/tips.csv' + ext,
                          engine='python', compression='infer')
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3_bucket_nrows_python(self, tips_df):
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                          nrows=10, compression=comp)
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_s3_fails(self):
        # Nonexistent bucket should surface as IOError.
        with pytest.raises(IOError):
            read_csv('s3://nyqpug/asdf.csv')

        # Receive a permission error when trying to read a private bucket.
        # It's irrelevant here that this isn't actually a table.
        with pytest.raises(IOError):
            read_csv('s3://cant_get_it/')

    def test_read_csv_handles_boto_s3_object(self,
                                             s3_resource,
                                             tips_file):
        # see gh-16135

        s3_object = s3_resource.meta.client.get_object(
            Bucket='pandas-test',
            Key='tips.csv')

        result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8')
        assert isinstance(result, DataFrame)
        assert not result.empty

        expected = read_csv(tips_file)
        tm.assert_frame_equal(result, expected)

    def test_read_csv_chunked_download(self, s3_resource, caplog):
        # 8 MB, S3FS uses 5MB chunks
        df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
        buf = BytesIO()
        str_buf = StringIO()

        df.to_csv(str_buf)

        buf = BytesIO(str_buf.getvalue().encode('utf-8'))

        s3_resource.Bucket("pandas-test").put_object(
            Key="large-file.csv",
            Body=buf)

        with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
            read_csv("s3://pandas-test/large-file.csv", nrows=5)
            # log of fetch_range (start, stop) — with nrows=5 only the
            # first ~5MB block should ever be downloaded.
            assert ((0, 5505024) in {x.args[-2:] for x in caplog.records})
|
||||
@@ -0,0 +1,849 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests date parsing functionality for all of the
|
||||
parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
from datetime import date, datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
from pandas._libs.tslib import Timestamp
|
||||
from pandas._libs.tslibs import parsing
|
||||
from pandas.compat import StringIO, lrange, parse_date
|
||||
from pandas.compat.numpy import np_array_datetime64_compat
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, DatetimeIndex, Index, MultiIndex
|
||||
from pandas.core.indexes.datetimes import date_range
|
||||
import pandas.util.testing as tm
|
||||
|
||||
import pandas.io.date_converters as conv
|
||||
import pandas.io.parsers as parsers
|
||||
|
||||
|
||||
def test_separator_date_conflict(all_parsers):
    # Regression test for gh-4678: the thousands separator must not
    # interfere with date parsing when both are in play.
    expected = DataFrame([[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
                         columns=["Date", 2])
    result = all_parsers.read_csv(StringIO("06-02-2013;13:00;1-000.215"),
                                  sep=";", thousands="-",
                                  parse_dates={"Date": [0, 1]}, header=None)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col_custom(all_parsers, keep_date_col):
    # Combining multiple raw columns into date columns via a custom
    # date_parser; keep_date_col controls whether the raw source
    # columns (X1, X2, X3) survive in the output.
    data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    parser = all_parsers

    def date_parser(*date_cols):
        """
        Test date parser.

        Parameters
        ----------
        date_cols : args
            The list of data columns to parse.

        Returns
        -------
        parsed : Series
        """
        return parsing.try_parse_dates(parsers._concat_date_cols(date_cols))

    result = parser.read_csv(StringIO(data), header=None,
                             date_parser=date_parser, prefix="X",
                             parse_dates={"actual": [1, 2],
                                          "nominal": [1, 3]},
                             keep_date_col=keep_date_col)
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
         "KORD", "19990127", " 19:00:00", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
         "KORD", "19990127", " 20:00:00", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
         "KORD", "19990127", " 21:00:00", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
         "KORD", "19990127", " 21:00:00", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
         "KORD", "19990127", " 22:00:00", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
         "KORD", "19990127", " 23:00:00", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["actual", "nominal", "X0", "X1", "X2",
                "X3", "X4", "X5", "X6", "X7", "X8"])

    if not keep_date_col:
        expected = expected.drop(["X1", "X2", "X3"], axis=1)
    elif parser.engine == "python":
        expected["X1"] = expected["X1"].astype(np.int64)

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col(all_parsers, keep_date_col):
    # Same as test_multiple_date_col_custom but using the list-of-lists
    # parse_dates spec and the default date parser; combined columns
    # are auto-named "X1_X2" / "X1_X3".
    data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=None,
                             prefix="X", parse_dates=[[1, 2], [1, 3]],
                             keep_date_col=keep_date_col)
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
         "KORD", "19990127", " 19:00:00", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
         "KORD", "19990127", " 20:00:00", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
         "KORD", "19990127", " 21:00:00", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
         "KORD", "19990127", " 21:00:00", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
         "KORD", "19990127", " 22:00:00", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
         "KORD", "19990127", " 23:00:00", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["X1_X2", "X1_X3", "X0", "X1", "X2",
                "X3", "X4", "X5", "X6", "X7", "X8"])

    if not keep_date_col:
        expected = expected.drop(["X1", "X2", "X3"], axis=1)
    elif parser.engine == "python":
        expected["X1"] = expected["X1"].astype(np.int64)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_date_col_as_index_col(all_parsers):
    # A column can simultaneously be parsed as dates and used as the
    # index (parse_dates=[1] + index_col=1).
    data = """\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=None, prefix="X",
                             parse_dates=[1], index_col=1)

    index = Index([datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 20, 0),
                   datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 0),
                   datetime(1999, 1, 27, 22, 0)], name="X1")
    expected = DataFrame([
        ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0],
        ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0],
        ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0],
        ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0],
        ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0],
    ], columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], index=index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multiple_date_cols_int_cast(all_parsers):
    # Combined date columns fed through conv.parse_date_time: integer
    # date fields (19990127) must be cast to strings before parsing.
    data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
            "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
            "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
            "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
            "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
            "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
    parse_dates = {"actual": [1, 2], "nominal": [1, 3]}
    parser = all_parsers

    result = parser.read_csv(StringIO(data), header=None,
                             date_parser=conv.parse_date_time,
                             parse_dates=parse_dates, prefix="X")
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
         "KORD", 0.81],
        [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
         "KORD", 0.01],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
         "KORD", -0.59],
        [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
         "KORD", -0.99],
        [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
         "KORD", -0.59],
        [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
         "KORD", -0.59],
    ], columns=["actual", "nominal", "X0", "X4"])

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multiple_date_col_timestamp_parse(all_parsers):
    # date_parser may be any callable, e.g. Timestamp applied to the
    # concatenation of the two combined columns.
    data = ("05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25\n"
            "05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25")
    expected = DataFrame([
        [Timestamp("05/31/2012, 15:30:00.029"),
         1306.25, 1, "E", 0, np.nan, 1306.25],
        [Timestamp("05/31/2012, 15:30:00.029"),
         1306.25, 8, "E", 0, np.nan, 1306.25]
    ], columns=["0_1", 2, 3, 4, 5, 6, 7])
    result = all_parsers.read_csv(StringIO(data), parse_dates=[[0, 1]],
                                  header=None, date_parser=Timestamp)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multiple_date_cols_with_header(all_parsers):
    # With a real header row, the dict form of parse_dates names the
    # combined column ("nominal") and the raw columns are dropped.
    parser = all_parsers
    data = """\
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

    result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]})
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["nominal", "ID", "ActualTime", "TDew",
                "TAir", "Windspeed", "Precip", "WindDir"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,parse_dates,msg", [
    ("""\
date_NominalTime,date,NominalTime
KORD1,19990127, 19:00:00
KORD2,19990127, 20:00:00""", [[1, 2]], ("New date column already "
                                        "in dict date_NominalTime")),
    ("""\
ID,date,nominalTime
KORD,19990127, 19:00:00
KORD,19990127, 20:00:00""", dict(ID=[1, 2]), "Date column ID already in dict")
])
def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg):
    # Combining date columns must fail loudly when the resulting
    # column name collides with an existing column.
    with pytest.raises(ValueError, match=msg):
        all_parsers.read_csv(StringIO(data), parse_dates=parse_dates)
|
||||
|
||||
|
||||
def test_date_parser_int_bug(all_parsers):
    # see gh-3071: a date_parser converting integer POSIX timestamps
    # used as the index column.
    data = ("posix_timestamp,elapsed,sys,user,queries,query_time,rows,"
            "accountid,userid,contactid,level,silo,method\n"
            "1343103150,0.062353,0,4,6,0.01690,3,"
            "12345,1,-1,3,invoice_InvoiceResource,search\n")
    result = all_parsers.read_csv(
        StringIO(data), index_col=0, parse_dates=[0],
        date_parser=lambda x: datetime.utcfromtimestamp(int(x)))

    expected = DataFrame([[0.062353, 0, 4, 6, 0.01690, 3, 12345, 1, -1,
                           3, "invoice_InvoiceResource", "search"]],
                         columns=["elapsed", "sys", "user", "queries",
                                  "query_time", "rows", "accountid",
                                  "userid", "contactid", "level",
                                  "silo", "method"],
                         index=Index([Timestamp("2012-07-24 04:12:30")],
                                     name="posix_timestamp"))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nat_parse(all_parsers):
    # see gh-3062: NaT values must round-trip through to_csv/read_csv.
    df = DataFrame({"A": np.arange(10, dtype="float64"),
                    "B": pd.Timestamp("20010101")})
    df.iloc[3:6, :] = np.nan

    with tm.ensure_clean("__nat_parse_.csv") as path:
        df.to_csv(path)
        result = all_parsers.read_csv(path, index_col=0, parse_dates=["B"])
        tm.assert_frame_equal(result, df)
|
||||
|
||||
|
||||
def test_csv_custom_parser(all_parsers):
    # A strptime-based date_parser must agree with parse_dates=True
    # for %Y%m%d-formatted index values.
    data = ("A,B,C\n"
            "20090101,a,1,2\n"
            "20090102,b,3,4\n"
            "20090103,c,4,5\n")
    result = all_parsers.read_csv(
        StringIO(data),
        date_parser=lambda x: datetime.strptime(x, "%Y%m%d"))
    expected = all_parsers.read_csv(StringIO(data), parse_dates=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_parse_dates_implicit_first_col(all_parsers):
    # When rows are one field longer than the header, the first column
    # becomes an implicit index; parse_dates=True must treat it exactly
    # like an explicit index_col=0.
    data = ("A,B,C\n"
            "20090101,a,1,2\n"
            "20090102,b,3,4\n"
            "20090103,c,4,5\n")
    result = all_parsers.read_csv(StringIO(data), parse_dates=True)
    expected = all_parsers.read_csv(StringIO(data), index_col=0,
                                    parse_dates=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_parse_dates_string(all_parsers):
    # parse_dates and index_col may reference a column by name
    # instead of by position.
    data = ("date,A,B,C\n"
            "20090101,a,1,2\n"
            "20090102,b,3,4\n"
            "20090103,c,4,5\n")
    result = all_parsers.read_csv(StringIO(data), index_col="date",
                                  parse_dates=["date"])

    index = date_range("1/1/2009", periods=3)
    index.name = "date"
    expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4],
                          "C": [2, 4, 5]}, index=index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
# Bug in https://github.com/dateutil/dateutil/issues/217
# has been addressed, but we just don't pass in the `yearfirst`
@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*")
@pytest.mark.parametrize("parse_dates", [
    [["date", "time"]],
    [[0, 1]]
])
def test_yy_format_with_year_first(all_parsers, parse_dates):
    # Two-digit years ("090131") should be interpreted year-first;
    # currently impossible because read_* does not expose `yearfirst`.
    data = """date,time,B,C
090131,0010,1,2
090228,1020,3,4
090331,0830,5,6
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=0,
                             parse_dates=parse_dates)
    index = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                           datetime(2009, 2, 28, 10, 20, 0),
                           datetime(2009, 3, 31, 8, 30, 0)],
                          dtype=object, name="date_time")
    expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]])
def test_parse_dates_column_list(all_parsers, parse_dates):
    # Date columns may be selected positionally or by name, with
    # dayfirst=True controlling dd/mm interpretation.
    expected = DataFrame({"a": [datetime(2010, 1, 1)], "b": [1],
                          "c": [datetime(2010, 2, 15)]})
    expected = expected.set_index(["a", "b"])

    result = all_parsers.read_csv(StringIO("a,b,c\n01/01/2010,1,15/02/2010"),
                                  index_col=[0, 1],
                                  parse_dates=parse_dates, dayfirst=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_parse_dates(all_parsers, index_col):
    # parse_dates=True on a MultiIndex: only the date-like level is
    # parsed, and index_col order may be given out of order.
    data = """index1,index2,A,B,C
20090101,one,a,1,2
20090101,two,b,3,4
20090101,three,c,4,5
20090102,one,a,1,2
20090102,two,b,3,4
20090102,three,c,4,5
20090103,one,a,1,2
20090103,two,b,3,4
20090103,three,c,4,5
"""
    parser = all_parsers
    index = MultiIndex.from_product([
        (datetime(2009, 1, 1), datetime(2009, 1, 2),
         datetime(2009, 1, 3)), ("one", "two", "three")],
        names=["index1", "index2"])

    # Out of order.
    if index_col == [1, 0]:
        index = index.swaplevel(0, 1)

    expected = DataFrame([["a", 1, 2], ["b", 3, 4], ["c", 4, 5],
                          ["a", 1, 2], ["b", 3, 4], ["c", 4, 5],
                          ["a", 1, 2], ["b", 3, 4], ["c", 4, 5]],
                         columns=["A", "B", "C"], index=index)
    result = parser.read_csv(StringIO(data), index_col=index_col,
                             parse_dates=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [
    dict(dayfirst=True), dict(day_first=True)
])
def test_parse_dates_custom_euro_format(all_parsers, kwargs):
    """A custom date_parser handles day-first dates; a bad kwarg raises."""
    parser = all_parsers
    data = """foo,bar,baz
31/01/2010,1,2
01/02/2010,1,NA
02/02/2010,1,2
"""
    if "dayfirst" in kwargs:
        df = parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
                             date_parser=lambda d: parse_date(d, **kwargs),
                             header=0, index_col=0, parse_dates=True,
                             na_values=["NA"])
        exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1),
                           datetime(2010, 2, 2)], name="time")
        expected = DataFrame({"Q": [1, 1, 1], "NTU": [2, np.nan, 2]},
                             index=exp_index, columns=["Q", "NTU"])
        tm.assert_frame_equal(df, expected)
    else:
        # "day_first" is not a valid dateutil keyword, so the
        # date_parser lambda blows up with a TypeError.
        msg = "got an unexpected keyword argument 'day_first'"

        with pytest.raises(TypeError, match=msg):
            parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
                            date_parser=lambda d: parse_date(d, **kwargs),
                            skiprows=[0], index_col=0, parse_dates=True,
                            na_values=["NA"])
|
||||
|
||||
|
||||
def test_parse_tz_aware(all_parsers):
    """ISO-8601 "Z" timestamps should yield a UTC-aware index (gh-1693)."""
    parser = all_parsers
    data = "Date,x\n2012-06-13T01:39:00Z,0.5"

    expected = DataFrame(
        {"x": [0.5]},
        index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date"))

    result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True)
    tm.assert_frame_equal(result, expected)
    assert result.index.tz is pytz.utc
|
||||
|
||||
|
||||
@pytest.mark.parametrize("parse_dates,index_col", [
    ({"nominal": [1, 2]}, "nominal"),
    ({"nominal": [1, 2]}, 0),
    ([[1, 2]], 0),
])
def test_multiple_date_cols_index(all_parsers, parse_dates, index_col):
    """A combined date column can serve as index, by name or position."""
    parser = all_parsers
    data = """
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), "KORD1", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD2", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD3", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD4", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD5", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD6", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["nominal", "ID", "ActualTime", "TDew",
                "TAir", "Windspeed", "Precip", "WindDir"]).set_index("nominal")

    if not isinstance(parse_dates, dict):
        # List-based specs get an auto-generated combined-column name.
        expected.index.name = "date_NominalTime"

    result = parser.read_csv(StringIO(data), parse_dates=parse_dates,
                             index_col=index_col)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multiple_date_cols_chunked(all_parsers):
    """Combined date columns should be consistent across chunks."""
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    expected = DataFrame([
        [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00",
         0.81, 2.81, 7.2, 0.0, 280.0],
        [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00",
         0.01, 2.21, 7.2, 0.0, 260.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00",
         -0.59, 2.21, 5.7, 0.0, 280.0],
        [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00",
         -0.99, 2.01, 3.6, 0.0, 270.0],
        [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00",
         -0.59, 1.71, 5.1, 0.0, 290.0],
        [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00",
         -0.59, 1.71, 4.6, 0.0, 280.0],
    ], columns=["nominal", "ID", "actualTime",
                "A", "B", "C", "D", "E"]).set_index("nominal")

    reader = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]},
                             index_col="nominal", chunksize=2)
    chunks = list(reader)

    # Three chunks of two rows each.
    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])
|
||||
|
||||
|
||||
def test_multiple_date_col_named_index_compat(all_parsers):
    """Positional and named parse_dates specs should produce equal results."""
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    with_indices = parser.read_csv(StringIO(data),
                                   parse_dates={"nominal": [1, 2]},
                                   index_col="nominal")
    with_names = parser.read_csv(StringIO(data), index_col="nominal",
                                 parse_dates={"nominal": [
                                     "date", "nominalTime"]})
    tm.assert_frame_equal(with_indices, with_names)
|
||||
|
||||
|
||||
def test_multiple_date_col_multiple_index_compat(all_parsers):
    """index_col with a combined date column equals set_index afterwards."""
    parser = all_parsers
    data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    result = parser.read_csv(StringIO(data), index_col=["nominal", "ID"],
                             parse_dates={"nominal": [1, 2]})

    expected = parser.read_csv(StringIO(data),
                               parse_dates={"nominal": [1, 2]})
    expected = expected.set_index(["nominal", "ID"])

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [dict(), dict(index_col="C")])
def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs):
    """A scalar (non-bool) parse_dates value must raise TypeError (gh-5636)."""
    parser = all_parsers
    data = """A,B,C
1,2,2003-11-1"""
    msg = ("Only booleans, lists, and dictionaries "
           "are accepted for the 'parse_dates' parameter")

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), parse_dates="C", **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("parse_dates", [
    (1,), np.array([4, 5]), {1, 3, 3}
])
def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
    """Non-list/dict/bool parse_dates values must raise TypeError.

    Bug fix: the body previously hard-coded ``parse_dates=(1,)`` instead of
    using the parametrized value, so the ndarray and set cases were never
    actually exercised.
    """
    parser = all_parsers
    msg = ("Only booleans, lists, and dictionaries "
           "are accepted for the 'parse_dates' parameter")
    data = """A,B,C
1,2,2003-11-1"""

    with pytest.raises(TypeError, match=msg):
        # Use the parametrized value, not a hard-coded tuple.
        parser.read_csv(StringIO(data), parse_dates=parse_dates)
|
||||
|
||||
|
||||
def test_parse_dates_empty_string(all_parsers):
    """An empty date field should become NaT even with na_filter off (gh-2263)."""
    parser = all_parsers
    data = "Date,test\n2012-01-01,1\n,2"

    expected = DataFrame([[datetime(2012, 1, 1), 1], [pd.NaT, 2]],
                         columns=["Date", "test"])
    result = parser.read_csv(StringIO(data), parse_dates=["Date"],
                             na_filter=False)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,kwargs,expected", [
    ("a\n04.15.2016", dict(parse_dates=["a"]),
     DataFrame([datetime(2016, 4, 15)], columns=["a"])),
    ("a\n04.15.2016", dict(parse_dates=True, index_col=0),
     DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"))),
    ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=["a", "b"]),
     DataFrame([[datetime(2016, 4, 15), datetime(2013, 9, 16)]],
               columns=["a", "b"])),
    ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=True, index_col=[0, 1]),
     DataFrame(index=MultiIndex.from_tuples(
         [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"]))),
])
def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
    """thousands="." must not mangle date columns before parsing (gh-14066)."""
    parser = all_parsers

    result = parser.read_csv(StringIO(data), thousands=".", **kwargs)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_parse_date_time_multi_level_column_name(all_parsers):
    """Date/time columns can be combined under a multi-level header."""
    data = """\
D,T,A,B
date, time,a,b
2001-01-05, 09:00:00, 0.0, 10.
2001-01-06, 00:00:00, 1.0, 11.
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=[0, 1],
                             parse_dates={"date_time": [0, 1]},
                             date_parser=conv.parse_date_time)

    expected = DataFrame([[datetime(2001, 1, 5, 9, 0, 0), 0., 10.],
                          [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]],
                         columns=["date_time", ("A", "a"), ("B", "b")])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,kwargs,expected", [
    ("""\
date,time,a,b
2001-01-05, 10:00:00, 0.0, 10.
2001-01-05, 00:00:00, 1., 11.
""", dict(header=0, parse_dates={"date_time": [0, 1]}),
     DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10],
                [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0]],
               columns=["date_time", "a", "b"])),
    (("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
      "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
      "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
      "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
      "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
      "KORD,19990127, 23:00:00, 22:56:00, -0.5900"),
     dict(header=None, parse_dates={"actual": [1, 2], "nominal": [1, 3]}),
     DataFrame([
         [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56),
          "KORD", 0.81],
         [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56),
          "KORD", 0.01],
         [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56),
          "KORD", -0.59],
         [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18),
          "KORD", -0.99],
         [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56),
          "KORD", -0.59],
         [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56),
          "KORD", -0.59]], columns=["actual", "nominal", 0, 4])),
])
def test_parse_date_time(all_parsers, data, kwargs, expected):
    """conv.parse_date_time should merge separate date and time columns."""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time,
                             **kwargs)

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_parse_date_fields(all_parsers):
    """Separate year/month/day columns combine via conv.parse_date_fields."""
    parser = all_parsers
    data = ("year,month,day,a\n2001,01,10,10.\n"
            "2001,02,1,11.")

    expected = DataFrame([[datetime(2001, 1, 10), 10.],
                          [datetime(2001, 2, 1), 11.]], columns=["ymd", "a"])
    result = parser.read_csv(StringIO(data), header=0,
                             parse_dates={"ymd": [0, 1, 2]},
                             date_parser=conv.parse_date_fields)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_parse_date_all_fields(all_parsers):
    """Year through second columns combine via conv.parse_all_fields."""
    parser = all_parsers
    data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0,0.0,10.
2001,01,5,10,0,00,1.,11.
"""
    expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0],
                          [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0]],
                         columns=["ymdHMS", "a", "b"])
    result = parser.read_csv(StringIO(data), header=0,
                             date_parser=conv.parse_all_fields,
                             parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_datetime_fractional_seconds(all_parsers):
    """Fractional seconds should survive field combination as microseconds."""
    parser = all_parsers
    data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0.123456,0.0,10.
2001,01,5,10,0,0.500000,1.,11.
"""
    expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0,
                                    microsecond=123456), 0.0, 10.0],
                          [datetime(2001, 1, 5, 10, 0, 0,
                                    microsecond=500000), 1.0, 11.0]],
                         columns=["ymdHMS", "a", "b"])
    result = parser.read_csv(StringIO(data), header=0,
                             date_parser=conv.parse_all_fields,
                             parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_generic(all_parsers):
    """Any user callable can combine columns via date_parser."""
    parser = all_parsers
    data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."

    result = parser.read_csv(StringIO(data), header=0,
                             parse_dates={"ym": [0, 1]},
                             date_parser=lambda y, m: date(year=int(y),
                                                           month=int(m),
                                                           day=1))
    expected = DataFrame([[date(2001, 1, 1), 10, 10.],
                          [date(2001, 2, 1), 1, 11.]],
                         columns=["ym", "day", "a"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_date_parser_resolution_if_not_ns(all_parsers):
    """A date_parser returning non-ns resolution is preserved (gh-10245)."""
    parser = all_parsers
    data = """\
date,time,prn,rxstatus
2013-11-03,19:00:00,126,00E80000
2013-11-03,19:00:00,23,00E80000
2013-11-03,19:00:00,13,00E80000
"""

    def date_parser(dt, time):
        # Combine the two string columns into second-resolution datetimes.
        return np_array_datetime64_compat(dt + "T" + time + "Z",
                                          dtype="datetime64[s]")

    result = parser.read_csv(StringIO(data), date_parser=date_parser,
                             parse_dates={"datetime": ["date", "time"]},
                             index_col=["datetime", "prn"])

    datetimes = np_array_datetime64_compat(["2013-11-03T19:00:00Z"] * 3,
                                           dtype="datetime64[s]")
    expected = DataFrame(data={"rxstatus": ["00E80000"] * 3},
                         index=MultiIndex.from_tuples(
                             [(datetimes[0], 126), (datetimes[1], 23),
                              (datetimes[2], 13)], names=["datetime", "prn"]))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_parse_date_column_with_empty_string(all_parsers):
    """A blank date cell leaves the column unparsed as strings (gh-6428)."""
    parser = all_parsers
    data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, "

    expected = DataFrame([[7, "10/18/2006"],
                          [7, "10/18/2008"],
                          [621, " "]],
                         columns=["case", "opdate"])
    result = parser.read_csv(StringIO(data), parse_dates=["opdate"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,expected", [
    ("a\n135217135789158401\n1352171357E+5",
     DataFrame({"a": [135217135789158401,
                      135217135700000]}, dtype="float64")),
    ("a\n99999999999\n123456789012345\n1234E+0",
     DataFrame({"a": [99999999999,
                      123456789012345,
                      1234]}, dtype="float64"))
])
@pytest.mark.parametrize("parse_dates", [True, False])
def test_parse_date_float(all_parsers, data, expected, parse_dates):
    """Failed date parsing must leave float precision untouched (gh-2697)."""
    parser = all_parsers

    result = parser.read_csv(StringIO(data), parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_parse_timezone(all_parsers):
    """Fixed-offset timestamps should parse tz-aware (gh-22256)."""
    parser = all_parsers
    data = """dt,val
2018-01-04 09:01:00+09:00,23350
2018-01-04 09:02:00+09:00,23400
2018-01-04 09:03:00+09:00,23400
2018-01-04 09:04:00+09:00,23400
2018-01-04 09:05:00+09:00,23400"""

    # +09:00 is 540 minutes east of UTC.
    dti = pd.date_range(start="2018-01-04 09:01:00",
                        end="2018-01-04 09:05:00", freq="1min",
                        tz=pytz.FixedOffset(540))
    expected = DataFrame({"dt": dti,
                          "val": [23350, 23400, 23400, 23400, 23400]})

    result = parser.read_csv(StringIO(data), parse_dates=["dt"])
    tm.assert_frame_equal(result, expected)
|
||||
+301
@@ -0,0 +1,301 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that apply specifically to the Python parser. Unless specifically
|
||||
stated as a Python-specific issue, the goal is to eventually move as many of
|
||||
these tests out of this module as soon as the C parser can accept further
|
||||
arguments when parsing.
|
||||
"""
|
||||
|
||||
import csv
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas.compat as compat
|
||||
from pandas.compat import BytesIO, StringIO, u
|
||||
from pandas.errors import ParserError
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def test_default_separator(python_parser_only):
    """Sniffing must not pick a letter as separator (gh-17333).

    csv.Sniffer in Python treats "o" as separator.
    """
    data = "aob\n1o2\n3o4"
    parser = python_parser_only
    expected = DataFrame({"a": [1, 3], "b": [2, 4]})

    result = parser.read_csv(StringIO(data), sep=None)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
    """Non-integer skipfooter values must raise (gh-15925 comment)."""
    data = "a\n1\n2"
    parser = python_parser_only
    msg = "skipfooter must be an integer"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=skipfooter)
|
||||
|
||||
|
||||
def test_invalid_skipfooter_negative(python_parser_only):
    """Negative skipfooter values must raise (gh-15925 comment)."""
    data = "a\n1\n2"
    parser = python_parser_only
    msg = "skipfooter cannot be negative"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=-1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [
    dict(sep=None),
    dict(delimiter="|")
])
def test_sniff_delimiter(python_parser_only, kwargs):
    """A sniffed separator should match an explicitly given one."""
    data = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    parser = python_parser_only
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["A", "B", "C"],
                         index=Index(["foo", "bar", "baz"], name="index"))

    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_sniff_delimiter_encoding(python_parser_only, encoding):
    """Sniffing works for both text and encoded byte inputs."""
    parser = python_parser_only
    data = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""

    if encoding is not None:
        data = u(data).encode(encoding)
        data = BytesIO(data)

        if compat.PY3:
            # Wrap the bytes so the sniffer still sees decoded text.
            from io import TextIOWrapper
            data = TextIOWrapper(data, encoding=encoding)
    else:
        data = StringIO(data)

    result = parser.read_csv(data, index_col=0, sep=None,
                             skiprows=2, encoding=encoding)
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["A", "B", "C"],
                         index=Index(["foo", "bar", "baz"], name="index"))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_single_line(python_parser_only):
    """Separator sniffing should handle a one-line input (gh-6607)."""
    parser = python_parser_only
    expected = DataFrame({"a": [1], "b": [2]})

    result = parser.read_csv(StringIO("1,2"), names=["a", "b"],
                             header=None, sep=None)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [dict(skipfooter=2), dict(nrows=3)])
def test_skipfooter(python_parser_only, kwargs):
    """skipfooter and nrows both drop the trailing junk rows (gh-6607)."""
    data = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
    parser = python_parser_only
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["A", "B", "C"])

    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("compression,klass", [
    ("gzip", "GzipFile"),
    ("bz2", "BZ2File"),
])
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
    """Multi-char separators should work with compressed input (gh-6607)."""
    parser = python_parser_only

    with open(csv1, "rb") as f:
        data = f.read()
    data = data.replace(b",", b"::")

    expected = parser.read_csv(csv1)

    module = pytest.importorskip(compression)
    klass = getattr(module, klass)

    with tm.ensure_clean() as path:
        tmp = klass(path, mode="wb")
        tmp.write(data)
        tmp.close()

        result = parser.read_csv(path, sep="::",
                                 compression=compression)
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_buglet_4x_multi_index(python_parser_only):
    """Whitespace-separated data with a 4-level row index parses (gh-6607)."""
    data = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""
    parser = python_parser_only

    expected = DataFrame([[-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
                          [0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
                          [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838]],
                         columns=["A", "B", "C", "D", "E"],
                         index=MultiIndex.from_tuples([
                             ("a", "b", 10.0032, 5),
                             ("a", "q", 20, 4),
                             ("x", "q", 30, 3),
                         ], names=["one", "two", "three", "four"]))
    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_buglet_4x_multi_index2(python_parser_only):
    """Short whitespace-separated data with implied index parses (gh-6893)."""
    data = "      A       B       C\na  b  c\n1 3   7 0 3 6\n3 1 4 1 5 9"
    parser = python_parser_only

    expected = DataFrame.from_records(
        [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
        columns=list("abcABC"), index=list("abc"))
    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("add_footer", [True, False])
def test_skipfooter_with_decimal(python_parser_only, add_footer):
    """A custom decimal marker still casts when a footer is skipped (gh-6971)."""
    data = "1#2\n3#4"
    parser = python_parser_only
    expected = DataFrame({"a": [1.2, 3.4]})

    if add_footer:
        # The stray footer line should not mess with the
        # casting of the first two lines if we skip it.
        kwargs = dict(skipfooter=1)
        data += "\nFooter"
    else:
        kwargs = dict()

    result = parser.read_csv(StringIO(data), names=["a"],
                             decimal="#", **kwargs)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sep", ["::", "#####", "!!!", "123", "#1!c5",
                                 "%!c!d", "@@#4:2", "_!pd#_"])
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16-be", "utf-16-le",
                                      "utf-32", "cp037"])
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
    """Multi-char separators should work under non-UTF-8 encodings (gh-3404)."""
    expected = DataFrame({"a": [1], "b": [2]})
    parser = python_parser_only

    data = "1" + sep + "2"
    encoded_data = data.encode(encoding)

    result = parser.read_csv(BytesIO(encoded_data), sep=sep,
                             names=["a", "b"], encoding=encoding)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
def test_multi_char_sep_quotes(python_parser_only, quoting):
    """Quoting is rejected when a multi-char delimiter is used (gh-13374)."""
    kwargs = dict(sep=",,")
    parser = python_parser_only

    data = 'a,,b\n1,,a\n2,,"2,,b"'
    msg = "ignored when a multi-char delimiter is used"

    def fail_read():
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), quoting=quoting, **kwargs)

    if quoting == csv.QUOTE_NONE:
        # We expect no match, so there should be an assertion
        # error out of the inner context manager.
        with pytest.raises(AssertionError):
            fail_read()
    else:
        fail_read()
|
||||
|
||||
|
||||
def test_none_delimiter(python_parser_only, capsys):
    """sep=None sniffing should skip malformed rows without raising.

    See gh-13374 and gh-17465.
    """
    parser = python_parser_only
    data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
    expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})

    # The third data line has too many fields; with
    # error_bad_lines=False it is skipped with only a warning.
    result = parser.read_csv(StringIO(data), header=0,
                             sep=None, warn_bad_lines=True,
                             error_bad_lines=False)
    tm.assert_frame_equal(result, expected)

    captured = capsys.readouterr()
    assert "Skipping line 3" in captured.err
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data", [
    'a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
    """Malformed rows inside skipped footers raise (gh-13879, gh-15910)."""
    msg = "parsing errors in the skipped footer rows"
    parser = python_parser_only

    def fail_read():
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), skipfooter=skipfooter)

    if skipfooter:
        fail_read()
    else:
        # We expect no match, so there should be an assertion
        # error out of the inner context manager.
        with pytest.raises(AssertionError):
            fail_read()
|
||||
|
||||
|
||||
def test_malformed_skipfooter(python_parser_only):
    """A malformed row is still reported when skipfooter is in use."""
    parser = python_parser_only
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
    msg = "Expected 3 fields in line 4, saw 5"
    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), header=1,
                        comment="#", skipfooter=1)
|
||||
@@ -0,0 +1,158 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that quoting specifications are properly handled
|
||||
during parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import csv
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.compat import PY2, StringIO, u
|
||||
from pandas.errors import ParserError
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs,msg", [
    (dict(quotechar="foo"), '"quotechar" must be a(n)? 1-character string'),
    (dict(quotechar=None, quoting=csv.QUOTE_MINIMAL),
     "quotechar must be set if quoting enabled"),
    (dict(quotechar=2), '"quotechar" must be string, not int')
])
def test_bad_quote_char(all_parsers, kwargs, msg):
    """Invalid quotechar values must raise TypeError with a clear message."""
    data = "1,2,3"
    parser = all_parsers

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quoting,msg", [
    ("foo", '"quoting" must be an integer'),
    (5, 'bad "quoting" value'),  # quoting must be in the range [0, 3]
])
def test_bad_quoting(all_parsers, quoting, msg):
    """Invalid quoting values must raise TypeError with a clear message."""
    data = "1,2,3"
    parser = all_parsers

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), quoting=quoting)
|
||||
|
||||
|
||||
def test_quote_char_basic(all_parsers):
    """The default double-quote character strips quotes from fields."""
    parser = all_parsers
    data = 'a,b,c\n1,2,"cat"'
    expected = DataFrame([[1, 2, "cat"]],
                         columns=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), quotechar='"')
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
def test_quote_char_various(all_parsers, quote_char):
    """Arbitrary quote characters behave like the default double quote."""
    parser = all_parsers
    expected = DataFrame([[1, 2, "cat"]],
                         columns=["a", "b", "c"])

    data = 'a,b,c\n1,2,"cat"'
    new_data = data.replace('"', quote_char)

    result = parser.read_csv(StringIO(new_data), quotechar=quote_char)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
@pytest.mark.parametrize("quote_char", ["", None])
def test_null_quote_char(all_parsers, quoting, quote_char):
    """An empty quotechar is only acceptable with QUOTE_NONE."""
    kwargs = dict(quotechar=quote_char, quoting=quoting)
    data = "a,b,c\n1,2,3"
    parser = all_parsers

    if quoting == csv.QUOTE_NONE:
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)
    else:
        # Sanity checking.
        msg = "quotechar must be set if quoting enabled"

        with pytest.raises(TypeError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs,exp_data", [
    (dict(), [[1, 2, "foo"]]),  # Test default.

    # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
    (dict(quotechar='"', quoting=csv.QUOTE_MINIMAL), [[1, 2, "foo"]]),

    # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
    (dict(quotechar='"', quoting=csv.QUOTE_ALL), [[1, 2, "foo"]]),

    # QUOTE_NONE tells the reader to do no special handling
    # of quote characters and leave them alone.
    (dict(quotechar='"', quoting=csv.QUOTE_NONE), [[1, 2, '"foo"']]),

    # QUOTE_NONNUMERIC tells the reader to cast
    # all non-quoted fields to float
    (dict(quotechar='"', quoting=csv.QUOTE_NONNUMERIC), [[1.0, 2.0, "foo"]])
])
def test_quoting_various(all_parsers, kwargs, exp_data):
    """Each csv.QUOTE_* mode affects reading as documented above."""
    data = '1,2,"foo"'
    parser = all_parsers
    columns = ["a", "b", "c"]

    expected = DataFrame(exp_data, columns=columns)
    result = parser.read_csv(StringIO(data), names=columns, **kwargs)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("doublequote,exp_data", [
    (True, [[3, '4 " 5']]),
    (False, [[3, '4 " 5"']]),
])
def test_double_quote(all_parsers, doublequote, exp_data):
    # Two consecutive quote characters collapse into one
    # only when `doublequote` is enabled.
    parser = all_parsers
    data = 'a,b\n3,"4 "" 5"'

    expected = DataFrame(exp_data, columns=["a", "b"])
    result = parser.read_csv(StringIO(data), quotechar='"',
                             doublequote=doublequote)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quotechar", [
|
||||
u('"'),
|
||||
pytest.param(u('\u0001'), marks=pytest.mark.skipif(
|
||||
PY2, reason="Python 2.x does not handle unicode well."))])
|
||||
def test_quotechar_unicode(all_parsers, quotechar):
|
||||
# see gh-14477
|
||||
data = "a\n1"
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"a": [1]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), quotechar=quotechar)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("balanced", [True, False])
def test_unbalanced_quoting(all_parsers, balanced):
    # see gh-22789.
    parser = all_parsers
    # The final field opens a quote that is never closed.
    data = "a,b,c\n1,2,\"3"

    if balanced:
        # Re-balance the quoting and read in without errors.
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        result = parser.read_csv(StringIO(data + '"'))
        tm.assert_frame_equal(result, expected)
    else:
        # An unterminated quoted field should raise; the exact
        # message depends on which engine is parsing.
        msg = ("EOF inside string starting at row 1" if parser.engine == "c"
               else "unexpected end of data")

        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data))
|
||||
@@ -0,0 +1,580 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests the 'read_fwf' function in parsers.py. This
|
||||
test suite is independent of the others because the
|
||||
engine is set to 'python-fwf' internally.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.compat as compat
|
||||
from pandas.compat import BytesIO, StringIO
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, DatetimeIndex
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.io.parsers import EmptyDataError, read_csv, read_fwf
|
||||
|
||||
|
||||
def test_basic():
    # With neither `colspecs` nor `widths` given, read_fwf infers the
    # column boundaries from the data itself.
    data = """\
A         B            C            D
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    rows = [[201158, 360.242940, 149.910199, 11950.7],
            [201159, 444.953632, 166.985655, 11788.4],
            [201160, 364.136849, 183.628767, 11806.2],
            [201161, 413.836124, 184.375703, 11916.8],
            [201162, 502.953953, 173.237159, 12468.3]]
    expected = DataFrame(rows, columns=["A", "B", "C", "D"])

    result = read_fwf(StringIO(data))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_colspecs():
    # Explicit (start, end) byte ranges override width inference.
    data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(data), colspecs=colspecs)

    rows = [[2011, 58, 360.242940, 149.910199, 11950.7],
            [2011, 59, 444.953632, 166.985655, 11788.4],
            [2011, 60, 364.136849, 183.628767, 11806.2],
            [2011, 61, 413.836124, 184.375703, 11916.8],
            [2011, 62, 502.953953, 173.237159, 12468.3]]
    expected = DataFrame(rows, columns=["A", "B", "C", "D", "E"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_widths():
    # Fixed field widths are converted to colspecs internally.
    data = """\
A    B    C            D            E
2011 58   360.242940   149.910199   11950.7
2011 59   444.953632   166.985655   11788.4
2011 60   364.136849   183.628767   11806.2
2011 61   413.836124   184.375703   11916.8
2011 62   502.953953   173.237159   12468.3
"""
    result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7])

    rows = [[2011, 58, 360.242940, 149.910199, 11950.7],
            [2011, 59, 444.953632, 166.985655, 11788.4],
            [2011, 60, 364.136849, 183.628767, 11806.2],
            [2011, 61, 413.836124, 184.375703, 11916.8],
            [2011, 62, 502.953953, 173.237159, 12468.3]]
    expected = DataFrame(rows, columns=["A", "B", "C", "D", "E"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_non_space_filler():
|
||||
# From Thomas Kluyver:
|
||||
#
|
||||
# Apparently, some non-space filler characters can be seen, this is
|
||||
# supported by specifying the 'delimiter' character:
|
||||
#
|
||||
# http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
|
||||
data = """\
|
||||
A~~~~B~~~~C~~~~~~~~~~~~D~~~~~~~~~~~~E
|
||||
201158~~~~360.242940~~~149.910199~~~11950.7
|
||||
201159~~~~444.953632~~~166.985655~~~11788.4
|
||||
201160~~~~364.136849~~~183.628767~~~11806.2
|
||||
201161~~~~413.836124~~~184.375703~~~11916.8
|
||||
201162~~~~502.953953~~~173.237159~~~12468.3
|
||||
"""
|
||||
colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
|
||||
result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~")
|
||||
|
||||
expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7],
|
||||
[2011, 59, 444.953632, 166.985655, 11788.4],
|
||||
[2011, 60, 364.136849, 183.628767, 11806.2],
|
||||
[2011, 61, 413.836124, 184.375703, 11916.8],
|
||||
[2011, 62, 502.953953, 173.237159, 12468.3]],
|
||||
columns=["A", "B", "C", "D", "E"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_over_specified():
|
||||
data = """\
|
||||
A B C D E
|
||||
201158 360.242940 149.910199 11950.7
|
||||
201159 444.953632 166.985655 11788.4
|
||||
201160 364.136849 183.628767 11806.2
|
||||
201161 413.836124 184.375703 11916.8
|
||||
201162 502.953953 173.237159 12468.3
|
||||
"""
|
||||
colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
|
||||
|
||||
with pytest.raises(ValueError, match="must specify only one of"):
|
||||
read_fwf(StringIO(data), colspecs=colspecs, widths=[6, 10, 10, 7])
|
||||
|
||||
|
||||
def test_under_specified():
|
||||
data = """\
|
||||
A B C D E
|
||||
201158 360.242940 149.910199 11950.7
|
||||
201159 444.953632 166.985655 11788.4
|
||||
201160 364.136849 183.628767 11806.2
|
||||
201161 413.836124 184.375703 11916.8
|
||||
201162 502.953953 173.237159 12468.3
|
||||
"""
|
||||
with pytest.raises(ValueError, match="Must specify either"):
|
||||
read_fwf(StringIO(data), colspecs=None, widths=None)
|
||||
|
||||
|
||||
def test_read_csv_compat():
    # The same table read as delimited CSV and as fixed-width text
    # should produce identical frames.
    fwf_data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(fwf_data), colspecs=colspecs)

    csv_data = """\
A,B,C,D,E
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
    expected = read_csv(StringIO(csv_data), engine="python")

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_bytes_io_input():
|
||||
if not compat.PY3:
|
||||
pytest.skip("Bytes-related test - only needs to work on Python 3")
|
||||
|
||||
result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')),
|
||||
widths=[2, 2], encoding="utf8")
|
||||
expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_fwf_colspecs_is_list_or_tuple():
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
|
||||
msg = "column specifications must be a list or tuple.+"
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
read_fwf(StringIO(data), colspecs={"a": 1}, delimiter=",")
|
||||
|
||||
|
||||
def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples():
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
|
||||
msg = "Each column specification must be.+"
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
read_fwf(StringIO(data), [("a", 1)])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("colspecs,exp_data", [
    ([(0, 3), (3, None)], [[123, 456], [456, 789]]),
    ([(None, 3), (3, 6)], [[123, 456], [456, 789]]),
    ([(0, None), (3, None)], [[123456, 456], [456789, 789]]),
    ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]),
])
def test_fwf_colspecs_none(colspecs, exp_data):
    # see gh-7079: a None boundary in a colspec means "edge of line".
    data = "123456\n456789\n"
    result = read_fwf(StringIO(data), colspecs=colspecs, header=None)

    expected = DataFrame(exp_data)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("infer_nrows,exp_data", [
|
||||
# infer_nrows --> colspec == [(2, 3), (5, 6)]
|
||||
(1, [[1, 2], [3, 8]]),
|
||||
|
||||
# infer_nrows > number of rows
|
||||
(10, [[1, 2], [123, 98]]),
|
||||
])
|
||||
def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data):
|
||||
# see gh-15138
|
||||
data = """\
|
||||
1 2
|
||||
123 98
|
||||
"""
|
||||
expected = DataFrame(exp_data)
|
||||
|
||||
result = read_fwf(StringIO(data), infer_nrows=infer_nrows, header=None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_fwf_regression():
|
||||
# see gh-3594
|
||||
#
|
||||
# Turns out "T060" is parsable as a datetime slice!
|
||||
tz_list = [1, 10, 20, 30, 60, 80, 100]
|
||||
widths = [16] + [8] * len(tz_list)
|
||||
names = ["SST"] + ["T%03d" % z for z in tz_list[1:]]
|
||||
|
||||
data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192
|
||||
2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869
|
||||
2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657
|
||||
2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379
|
||||
2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039
|
||||
"""
|
||||
|
||||
result = read_fwf(StringIO(data), index_col=0, header=None, names=names,
|
||||
widths=widths, parse_dates=True,
|
||||
date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"))
|
||||
expected = DataFrame([
|
||||
[9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192],
|
||||
[9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869],
|
||||
[9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657],
|
||||
[9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379],
|
||||
[9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039],
|
||||
], index=DatetimeIndex(["2009-06-13 20:20:00", "2009-06-13 20:30:00",
|
||||
"2009-06-13 20:40:00", "2009-06-13 20:50:00",
|
||||
"2009-06-13 21:00:00"]),
|
||||
columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_fwf_for_uint8():
|
||||
data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127
|
||||
1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa
|
||||
df = read_fwf(StringIO(data),
|
||||
colspecs=[(0, 17), (25, 26), (33, 37),
|
||||
(49, 51), (58, 62), (63, 1000)],
|
||||
names=["time", "pri", "pgn", "dst", "src", "data"],
|
||||
converters={
|
||||
"pgn": lambda x: int(x, 16),
|
||||
"src": lambda x: int(x, 16),
|
||||
"dst": lambda x: int(x, 16),
|
||||
"data": lambda x: len(x.split(" "))})
|
||||
|
||||
expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8],
|
||||
[1421302964.226776, 6, 61442, None, 71, 8]],
|
||||
columns=["time", "pri", "pgn",
|
||||
"dst", "src", "data"])
|
||||
expected["dst"] = expected["dst"].astype(object)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("comment", ["#", "~", "!"])
|
||||
def test_fwf_comment(comment):
|
||||
data = """\
|
||||
1 2. 4 #hello world
|
||||
5 NaN 10.0
|
||||
"""
|
||||
data = data.replace("#", comment)
|
||||
|
||||
colspecs = [(0, 3), (4, 9), (9, 25)]
|
||||
expected = DataFrame([[1, 2., 4], [5, np.nan, 10.]])
|
||||
|
||||
result = read_fwf(StringIO(data), colspecs=colspecs,
|
||||
header=None, comment=comment)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("thousands", [",", "#", "~"])
|
||||
def test_fwf_thousands(thousands):
|
||||
data = """\
|
||||
1 2,334.0 5
|
||||
10 13 10.
|
||||
"""
|
||||
data = data.replace(",", thousands)
|
||||
|
||||
colspecs = [(0, 3), (3, 11), (12, 16)]
|
||||
expected = DataFrame([[1, 2334., 5], [10, 13, 10.]])
|
||||
|
||||
result = read_fwf(StringIO(data), header=None,
|
||||
colspecs=colspecs, thousands=thousands)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("header", [True, False])
|
||||
def test_bool_header_arg(header):
|
||||
# see gh-6114
|
||||
data = """\
|
||||
MyColumn
|
||||
a
|
||||
b
|
||||
a
|
||||
b"""
|
||||
|
||||
msg = "Passing a bool to header is invalid"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
read_fwf(StringIO(data), header=header)
|
||||
|
||||
|
||||
def test_full_file():
|
||||
# File with all values.
|
||||
test = """index A B C
|
||||
2000-01-03T00:00:00 0.980268513777 3 foo
|
||||
2000-01-04T00:00:00 1.04791624281 -4 bar
|
||||
2000-01-05T00:00:00 0.498580885705 73 baz
|
||||
2000-01-06T00:00:00 1.12020151869 1 foo
|
||||
2000-01-07T00:00:00 0.487094399463 0 bar
|
||||
2000-01-10T00:00:00 0.836648671666 2 baz
|
||||
2000-01-11T00:00:00 0.157160753327 34 foo"""
|
||||
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs)
|
||||
|
||||
result = read_fwf(StringIO(test))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_full_file_with_missing():
|
||||
# File with missing values.
|
||||
test = """index A B C
|
||||
2000-01-03T00:00:00 0.980268513777 3 foo
|
||||
2000-01-04T00:00:00 1.04791624281 -4 bar
|
||||
0.498580885705 73 baz
|
||||
2000-01-06T00:00:00 1.12020151869 1 foo
|
||||
2000-01-07T00:00:00 0 bar
|
||||
2000-01-10T00:00:00 0.836648671666 2 baz
|
||||
34"""
|
||||
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs)
|
||||
|
||||
result = read_fwf(StringIO(test))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_full_file_with_spaces():
|
||||
# File with spaces in columns.
|
||||
test = """
|
||||
Account Name Balance CreditLimit AccountCreated
|
||||
101 Keanu Reeves 9315.45 10000.00 1/17/1998
|
||||
312 Gerard Butler 90.00 1000.00 8/6/2003
|
||||
868 Jennifer Love Hewitt 0 17000.00 5/25/1985
|
||||
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
|
||||
317 Bill Murray 789.65 5000.00 2/5/2007
|
||||
""".strip("\r\n")
|
||||
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs)
|
||||
|
||||
result = read_fwf(StringIO(test))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_full_file_with_spaces_and_missing():
|
||||
# File with spaces and missing values in columns.
|
||||
test = """
|
||||
Account Name Balance CreditLimit AccountCreated
|
||||
101 10000.00 1/17/1998
|
||||
312 Gerard Butler 90.00 1000.00 8/6/2003
|
||||
868 5/25/1985
|
||||
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
|
||||
317 Bill Murray 789.65
|
||||
""".strip("\r\n")
|
||||
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs)
|
||||
|
||||
result = read_fwf(StringIO(test))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_messed_up_data():
|
||||
# Completely messed up file.
|
||||
test = """
|
||||
Account Name Balance Credit Limit Account Created
|
||||
101 10000.00 1/17/1998
|
||||
312 Gerard Butler 90.00 1000.00
|
||||
|
||||
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
|
||||
317 Bill Murray 789.65
|
||||
""".strip("\r\n")
|
||||
colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs)
|
||||
|
||||
result = read_fwf(StringIO(test))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multiple_delimiters():
|
||||
test = r"""
|
||||
col1~~~~~col2 col3++++++++++++++++++col4
|
||||
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
|
||||
33+++122.33\\\bar.........Gerard Butler
|
||||
++44~~~~12.01 baz~~Jennifer Love Hewitt
|
||||
~~55 11+++foo++++Jada Pinkett-Smith
|
||||
..66++++++.03~~~bar Bill Murray
|
||||
""".strip("\r\n")
|
||||
delimiter = " +~.\\"
|
||||
colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter)
|
||||
|
||||
result = read_fwf(StringIO(test), delimiter=delimiter)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_variable_width_unicode():
|
||||
if not compat.PY3:
|
||||
pytest.skip("Bytes-related test - only needs to work on Python 3")
|
||||
|
||||
data = """
|
||||
שלום שלום
|
||||
ום שלל
|
||||
של ום
|
||||
""".strip("\r\n")
|
||||
encoding = "utf8"
|
||||
kwargs = dict(header=None, encoding=encoding)
|
||||
|
||||
expected = read_fwf(BytesIO(data.encode(encoding)),
|
||||
colspecs=[(0, 4), (5, 9)], **kwargs)
|
||||
result = read_fwf(BytesIO(data.encode(encoding)), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [
|
||||
dict(), {"a": "float64", "b": str, "c": "int32"}
|
||||
])
|
||||
def test_dtype(dtype):
|
||||
data = """ a b c
|
||||
1 2 3.2
|
||||
3 4 5.2
|
||||
"""
|
||||
colspecs = [(0, 5), (5, 10), (10, None)]
|
||||
result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype)
|
||||
|
||||
expected = pd.DataFrame({
|
||||
"a": [1, 3], "b": [2, 4],
|
||||
"c": [3.2, 5.2]}, columns=["a", "b", "c"])
|
||||
|
||||
for col, dt in dtype.items():
|
||||
expected[col] = expected[col].astype(dt)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_skiprows_inference():
|
||||
# see gh-11256
|
||||
data = """
|
||||
Text contained in the file header
|
||||
|
||||
DataCol1 DataCol2
|
||||
0.0 1.0
|
||||
101.6 956.1
|
||||
""".strip()
|
||||
skiprows = 2
|
||||
expected = read_csv(StringIO(data), skiprows=skiprows,
|
||||
delim_whitespace=True)
|
||||
|
||||
result = read_fwf(StringIO(data), skiprows=skiprows)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_skiprows_by_index_inference():
|
||||
data = """
|
||||
To be skipped
|
||||
Not To Be Skipped
|
||||
Once more to be skipped
|
||||
123 34 8 123
|
||||
456 78 9 456
|
||||
""".strip()
|
||||
skiprows = [0, 2]
|
||||
expected = read_csv(StringIO(data), skiprows=skiprows,
|
||||
delim_whitespace=True)
|
||||
|
||||
result = read_fwf(StringIO(data), skiprows=skiprows)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_skiprows_inference_empty():
|
||||
data = """
|
||||
AA BBB C
|
||||
12 345 6
|
||||
78 901 2
|
||||
""".strip()
|
||||
|
||||
msg = "No rows from which to infer column width"
|
||||
with pytest.raises(EmptyDataError, match=msg):
|
||||
read_fwf(StringIO(data), skiprows=3)
|
||||
|
||||
|
||||
def test_whitespace_preservation():
|
||||
# see gh-16772
|
||||
header = None
|
||||
csv_data = """
|
||||
a ,bbb
|
||||
cc,dd """
|
||||
|
||||
fwf_data = """
|
||||
a bbb
|
||||
ccdd """
|
||||
result = read_fwf(StringIO(fwf_data), widths=[3, 3],
|
||||
header=header, skiprows=[0], delimiter="\n\t")
|
||||
expected = read_csv(StringIO(csv_data), header=header)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_default_delimiter():
|
||||
header = None
|
||||
csv_data = """
|
||||
a,bbb
|
||||
cc,dd"""
|
||||
|
||||
fwf_data = """
|
||||
a \tbbb
|
||||
cc\tdd """
|
||||
result = read_fwf(StringIO(fwf_data), widths=[3, 3],
|
||||
header=header, skiprows=[0])
|
||||
expected = read_csv(StringIO(csv_data), header=header)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("infer", [True, False, None])
|
||||
def test_fwf_compression(compression_only, infer):
|
||||
data = """1111111111
|
||||
2222222222
|
||||
3333333333""".strip()
|
||||
|
||||
compression = compression_only
|
||||
extension = "gz" if compression == "gzip" else compression
|
||||
|
||||
kwargs = dict(widths=[5, 5], names=["one", "two"])
|
||||
expected = read_fwf(StringIO(data), **kwargs)
|
||||
|
||||
if compat.PY3:
|
||||
data = bytes(data, encoding="utf-8")
|
||||
|
||||
with tm.ensure_clean(filename="tmp." + extension) as path:
|
||||
tm.write_to_compressed(compression, path, data)
|
||||
|
||||
if infer is not None:
|
||||
kwargs["compression"] = "infer" if infer else compression
|
||||
|
||||
result = read_fwf(path, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,222 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that skipped rows are properly handled during
|
||||
parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO, lrange, range
|
||||
from pandas.errors import EmptyDataError
|
||||
|
||||
from pandas import DataFrame, Index
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("skiprows", [lrange(6), 6])
|
||||
def test_skip_rows_bug(all_parsers, skiprows):
|
||||
# see gh-505
|
||||
parser = all_parsers
|
||||
text = """#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
1/1/2000,1.,2.,3.
|
||||
1/2/2000,4,5,6
|
||||
1/3/2000,7,8,9
|
||||
"""
|
||||
result = parser.read_csv(StringIO(text), skiprows=skiprows, header=None,
|
||||
index_col=0, parse_dates=True)
|
||||
index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2),
|
||||
datetime(2000, 1, 3)], name=0)
|
||||
|
||||
expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
|
||||
columns=[1, 2, 3], index=index)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_deep_skip_rows(all_parsers):
|
||||
# see gh-4382
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n" + "\n".join([",".join([str(i), str(i + 1), str(i + 2)])
|
||||
for i in range(10)])
|
||||
condensed_data = "a,b,c\n" + "\n".join([
|
||||
",".join([str(i), str(i + 1), str(i + 2)])
|
||||
for i in [0, 1, 2, 3, 4, 6, 8, 9]])
|
||||
|
||||
result = parser.read_csv(StringIO(data), skiprows=[6, 8])
|
||||
condensed_result = parser.read_csv(StringIO(condensed_data))
|
||||
tm.assert_frame_equal(result, condensed_result)
|
||||
|
||||
|
||||
def test_skip_rows_blank(all_parsers):
|
||||
# see gh-9832
|
||||
parser = all_parsers
|
||||
text = """#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
|
||||
1/1/2000,1.,2.,3.
|
||||
1/2/2000,4,5,6
|
||||
1/3/2000,7,8,9
|
||||
"""
|
||||
data = parser.read_csv(StringIO(text), skiprows=6, header=None,
|
||||
index_col=0, parse_dates=True)
|
||||
index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2),
|
||||
datetime(2000, 1, 3)], name=0)
|
||||
|
||||
expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
|
||||
columns=[1, 2, 3],
|
||||
index=index)
|
||||
tm.assert_frame_equal(data, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,kwargs,expected", [
|
||||
("""id,text,num_lines
|
||||
1,"line 11
|
||||
line 12",2
|
||||
2,"line 21
|
||||
line 22",2
|
||||
3,"line 31",1""",
|
||||
dict(skiprows=[1]),
|
||||
DataFrame([[2, "line 21\nline 22", 2],
|
||||
[3, "line 31", 1]], columns=["id", "text", "num_lines"])),
|
||||
("a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
|
||||
dict(quotechar="~", skiprows=[2]),
|
||||
DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"])),
|
||||
(("Text,url\n~example\n "
|
||||
"sentence\n one~,url1\n~"
|
||||
"example\n sentence\n two~,url2\n~"
|
||||
"example\n sentence\n three~,url3"),
|
||||
dict(quotechar="~", skiprows=[1, 3]),
|
||||
DataFrame([['example\n sentence\n two', 'url2']],
|
||||
columns=["Text", "url"]))
|
||||
])
|
||||
def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
|
||||
# see gh-12775 and gh-10911
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_skip_row_with_quote(all_parsers):
|
||||
# see gh-12775 and gh-10911
|
||||
parser = all_parsers
|
||||
data = """id,text,num_lines
|
||||
1,"line '11' line 12",2
|
||||
2,"line '21' line 22",2
|
||||
3,"line '31' line 32",1"""
|
||||
|
||||
exp_data = [[2, "line '21' line 22", 2],
|
||||
[3, "line '31' line 32", 1]]
|
||||
expected = DataFrame(exp_data, columns=[
|
||||
"id", "text", "num_lines"])
|
||||
|
||||
result = parser.read_csv(StringIO(data), skiprows=[1])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data,exp_data", [
|
||||
("""id,text,num_lines
|
||||
1,"line \n'11' line 12",2
|
||||
2,"line \n'21' line 22",2
|
||||
3,"line \n'31' line 32",1""",
|
||||
[[2, "line \n'21' line 22", 2],
|
||||
[3, "line \n'31' line 32", 1]]),
|
||||
("""id,text,num_lines
|
||||
1,"line '11\n' line 12",2
|
||||
2,"line '21\n' line 22",2
|
||||
3,"line '31\n' line 32",1""",
|
||||
[[2, "line '21\n' line 22", 2],
|
||||
[3, "line '31\n' line 32", 1]]),
|
||||
("""id,text,num_lines
|
||||
1,"line '11\n' \r\tline 12",2
|
||||
2,"line '21\n' \r\tline 22",2
|
||||
3,"line '31\n' \r\tline 32",1""",
|
||||
[[2, "line '21\n' \r\tline 22", 2],
|
||||
[3, "line '31\n' \r\tline 32", 1]]),
|
||||
])
|
||||
def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
|
||||
# see gh-12775 and gh-10911
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), skiprows=[1])
|
||||
|
||||
expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("line_terminator", [
|
||||
"\n", # "LF"
|
||||
"\r\n", # "CRLF"
|
||||
"\r" # "CR"
|
||||
])
|
||||
def test_skiprows_lineterminator(all_parsers, line_terminator):
|
||||
# see gh-9079
|
||||
parser = all_parsers
|
||||
data = "\n".join(["SMOSMANIA ThetaProbe-ML2X ",
|
||||
"2007/01/01 01:00 0.2140 U M ",
|
||||
"2007/01/01 02:00 0.2141 M O ",
|
||||
"2007/01/01 04:00 0.2142 D M "])
|
||||
expected = DataFrame([["2007/01/01", "01:00", 0.2140, "U", "M"],
|
||||
["2007/01/01", "02:00", 0.2141, "M", "O"],
|
||||
["2007/01/01", "04:00", 0.2142, "D", "M"]],
|
||||
columns=["date", "time", "var", "flag",
|
||||
"oflag"])
|
||||
|
||||
if parser.engine == "python" and line_terminator == "\r":
|
||||
pytest.skip("'CR' not respect with the Python parser yet")
|
||||
|
||||
data = data.replace("\n", line_terminator)
|
||||
result = parser.read_csv(StringIO(data), skiprows=1, delim_whitespace=True,
|
||||
names=["date", "time", "var", "flag", "oflag"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_skiprows_infield_quote(all_parsers):
|
||||
# see gh-14459
|
||||
parser = all_parsers
|
||||
data = "a\"\nb\"\na\n1"
|
||||
expected = DataFrame({"a": [1]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), skiprows=2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs,expected", [
|
||||
(dict(), DataFrame({"1": [3, 5]})),
|
||||
(dict(header=0, names=["foo"]), DataFrame({"foo": [3, 5]}))
|
||||
])
|
||||
def test_skip_rows_callable(all_parsers, kwargs, expected):
|
||||
parser = all_parsers
|
||||
data = "a\n1\n2\n3\n4\n5"
|
||||
|
||||
result = parser.read_csv(StringIO(data),
|
||||
skiprows=lambda x: x % 2 == 0,
|
||||
**kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_skip_rows_skip_all(all_parsers):
    # A callable that skips every row leaves nothing to parse.
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"
    msg = "No columns to parse from file"

    with pytest.raises(EmptyDataError, match=msg):
        parser.read_csv(StringIO(data), skiprows=lambda x: True)
|
||||
|
||||
|
||||
def test_skip_rows_bad_callable(all_parsers):
    # An exception raised inside the `skiprows` callable propagates out.
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"
    msg = "by zero"

    with pytest.raises(ZeroDivisionError, match=msg):
        parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)
|
||||
@@ -0,0 +1,353 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests the TextReader class in parsers.pyx, which
|
||||
is integral to the C engine in parsers.py
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
from numpy import nan
|
||||
import pytest
|
||||
|
||||
import pandas._libs.parsers as parser
|
||||
from pandas._libs.parsers import TextReader
|
||||
import pandas.compat as compat
|
||||
from pandas.compat import BytesIO, StringIO, map
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
from pandas.io.parsers import TextFileReader, read_csv
|
||||
|
||||
|
||||
class TestTextReader(object):
    """Low-level tests for the C ``TextReader`` tokenizer and its options."""

    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        # Resolve the shared fixture-file paths once per test.
        self.dirpath = datapath('io', 'parser', 'data')
        self.csv1 = os.path.join(self.dirpath, 'test1.csv')
        self.csv2 = os.path.join(self.dirpath, 'test2.csv')
        self.xls1 = os.path.join(self.dirpath, 'test.xls')

    def test_file_handle(self):
        """TextReader accepts an open binary file handle."""
        with open(self.csv1, 'rb') as f:
            reader = TextReader(f)
            reader.read()

    def test_string_filename(self):
        """TextReader accepts a filesystem path string."""
        reader = TextReader(self.csv1, header=None)
        reader.read()

    def test_file_handle_mmap(self):
        """TextReader works with memory_map=True on a file handle."""
        with open(self.csv1, 'rb') as f:
            reader = TextReader(f, memory_map=True, header=None)
            reader.read()

    def test_StringIO(self):
        """TextReader accepts an in-memory bytes buffer."""
        with open(self.csv1, 'rb') as f:
            text = f.read()
        src = BytesIO(text)
        reader = TextReader(src, header=None)
        reader.read()

    def test_string_factorize(self):
        # should this be optional?
        data = 'a\nb\na\nb\na'
        reader = TextReader(StringIO(data), header=None)
        result = reader.read()
        # Repeated strings share objects: only two distinct ids expected.
        assert len(set(map(id, result[0]))) == 2

    def test_skipinitialspace(self):
        """Whitespace following the delimiter is stripped."""
        data = ('a, b\n'
                'a, b\n'
                'a, b\n'
                'a, b')

        reader = TextReader(StringIO(data), skipinitialspace=True,
                            header=None)
        result = reader.read()

        tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a', 'a'],
                                                        dtype=np.object_))
        tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b', 'b'],
                                                        dtype=np.object_))

    def test_parse_booleans(self):
        """True/False strings are inferred as a boolean column."""
        data = 'True\nFalse\nTrue\nTrue'

        reader = TextReader(StringIO(data), header=None)
        result = reader.read()

        assert result[0].dtype == np.bool_

    def test_delimit_whitespace(self):
        """delim_whitespace treats runs of blanks/tabs as one delimiter."""
        data = 'a b\na\t\t "b"\n"a"\t \t b'

        reader = TextReader(StringIO(data), delim_whitespace=True,
                            header=None)
        result = reader.read()

        tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a'],
                                                        dtype=np.object_))
        tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b'],
                                                        dtype=np.object_))

    def test_embedded_newline(self):
        """Newlines inside quoted fields are preserved."""
        data = 'a\n"hello\nthere"\nthis'

        reader = TextReader(StringIO(data), header=None)
        result = reader.read()

        expected = np.array(['a', 'hello\nthere', 'this'], dtype=np.object_)
        tm.assert_numpy_array_equal(result[0], expected)

    def test_euro_decimal(self):
        """A comma decimal marker parses European-style floats."""
        data = '12345,67\n345,678'

        reader = TextReader(StringIO(data), delimiter=':',
                            decimal=',', header=None)
        result = reader.read()

        expected = np.array([12345.67, 345.678])
        tm.assert_almost_equal(result[0], expected)

    def test_integer_thousands(self):
        """A comma thousands separator parses grouped integers."""
        data = '123,456\n12,500'

        reader = TextReader(StringIO(data), delimiter=':',
                            thousands=',', header=None)
        result = reader.read()

        expected = np.array([123456, 12500], dtype=np.int64)
        tm.assert_almost_equal(result[0], expected)

    def test_integer_thousands_alt(self):
        """Same grouping but via the high-level TextFileReader with '.'."""
        data = '123.456\n12.500'

        reader = TextFileReader(StringIO(data), delimiter=':',
                                thousands='.', header=None)
        result = reader.read()

        expected = DataFrame([123456, 12500])
        tm.assert_frame_equal(result, expected)

    def test_skip_bad_lines(self, capsys):
        """Bad lines: raise by default, or skip silently / with warning."""
        # too many lines, see #2430 for why
        data = ('a:b:c\n'
                'd:e:f\n'
                'g:h:i\n'
                'j:k:l:m\n'
                'l:m:n\n'
                'o:p:q:r')

        reader = TextReader(StringIO(data), delimiter=':',
                            header=None)
        msg = (r"Error tokenizing data\. C error: Expected 3 fields in"
               " line 4, saw 4")
        with pytest.raises(parser.ParserError, match=msg):
            reader.read()

        # error_bad_lines=False + warn_bad_lines=False: skip silently.
        reader = TextReader(StringIO(data), delimiter=':',
                            header=None,
                            error_bad_lines=False,
                            warn_bad_lines=False)
        result = reader.read()
        expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object),
                    1: np.array(['b', 'e', 'h', 'm'], dtype=object),
                    2: np.array(['c', 'f', 'i', 'n'], dtype=object)}
        assert_array_dicts_equal(result, expected)

        # warn_bad_lines=True: skipped line numbers are reported on stderr.
        reader = TextReader(StringIO(data), delimiter=':',
                            header=None,
                            error_bad_lines=False,
                            warn_bad_lines=True)
        reader.read()
        captured = capsys.readouterr()

        assert 'Skipping line 4' in captured.err
        assert 'Skipping line 6' in captured.err

    def test_header_not_enough_lines(self):
        """header=N skips the N leading rows before the header row."""
        data = ('skip this\n'
                'skip this\n'
                'a,b,c\n'
                '1,2,3\n'
                '4,5,6')

        reader = TextReader(StringIO(data), delimiter=',', header=2)
        header = reader.header
        expected = [['a', 'b', 'c']]
        assert header == expected

        recs = reader.read()
        expected = {0: np.array([1, 4], dtype=np.int64),
                    1: np.array([2, 5], dtype=np.int64),
                    2: np.array([3, 6], dtype=np.int64)}
        assert_array_dicts_equal(recs, expected)

    def test_escapechar(self):
        """Escaped quote characters are unescaped in the output."""
        data = ('\\"hello world\"\n'
                '\\"hello world\"\n'
                '\\"hello world\"')

        reader = TextReader(StringIO(data), delimiter=',', header=None,
                            escapechar='\\')
        result = reader.read()
        expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
        assert_array_dicts_equal(result, expected)

    def test_eof_has_eol(self):
        # handling of new line at EOF
        pass

    def test_na_substitution(self):
        pass

    def test_numpy_string_dtype(self):
        """Fixed-width string dtypes are honored (and values truncated)."""
        data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=',', header=None,
                              **kwds)

        reader = _make_reader(dtype='S5,i4')
        result = reader.read()

        assert result[0].dtype == 'S5'

        ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], dtype='S5')
        assert (result[0] == ex_values).all()
        assert result[1].dtype == 'i4'

        # A single dtype applies to every column; 'aaaaa' truncates to 'aaaa'.
        reader = _make_reader(dtype='S4')
        result = reader.read()
        assert result[0].dtype == 'S4'
        ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
        assert (result[0] == ex_values).all()
        assert result[1].dtype == 'S4'

    def test_pass_dtype(self):
        """Per-column dtypes accept strings, numpy types and dtype objects."""
        data = """\
one,two
1,a
2,b
3,c
4,d"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=',', **kwds)

        reader = _make_reader(dtype={'one': 'u1', 1: 'S1'})
        result = reader.read()
        assert result[0].dtype == 'u1'
        assert result[1].dtype == 'S1'

        reader = _make_reader(dtype={'one': np.uint8, 1: object})
        result = reader.read()
        assert result[0].dtype == 'u1'
        assert result[1].dtype == 'O'

        reader = _make_reader(dtype={'one': np.dtype('u1'),
                                     1: np.dtype('O')})
        result = reader.read()
        assert result[0].dtype == 'u1'
        assert result[1].dtype == 'O'

    def test_usecols(self):
        """usecols restricts the returned columns but keeps their values."""
        data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=',', **kwds)

        reader = _make_reader(usecols=(1, 2))
        result = reader.read()

        exp = _make_reader().read()
        assert len(result) == 2
        assert (result[1] == exp[1]).all()
        assert (result[2] == exp[2]).all()

    def test_cr_delimited(self):
        """Bare-\\r line endings parse identically to \\r\\n."""
        def _test(text, **kwargs):
            nice_text = text.replace('\r', '\r\n')
            result = TextReader(StringIO(text), **kwargs).read()
            expected = TextReader(StringIO(nice_text), **kwargs).read()
            assert_array_dicts_equal(result, expected)

        data = 'a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12'
        _test(data, delimiter=',')

        data = 'a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12'
        _test(data, delim_whitespace=True)

        data = 'a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12'
        _test(data, delimiter=',')

        sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r'
                  'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r'
                  ',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0')
        _test(sample, delimiter=',')

        data = 'A B C\r 2 3\r4 5 6'
        _test(data, delim_whitespace=True)

        data = 'A B C\r2 3\r4 5 6'
        _test(data, delim_whitespace=True)

    def test_empty_field_eof(self):
        """Empty trailing fields at EOF come back as empty strings / NaN."""
        data = 'a,b,c\n1,2,3\n4,,'

        result = TextReader(StringIO(data), delimiter=',').read()

        expected = {0: np.array([1, 4], dtype=np.int64),
                    1: np.array(['2', ''], dtype=object),
                    2: np.array(['3', ''], dtype=object)}
        assert_array_dicts_equal(result, expected)

        # GH5664
        a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
        b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
                      columns=list('abcd'),
                      index=[1, 1])
        c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
                       [8, 9, 10, 11], [13, 14, nan, nan]],
                      columns=list('abcd'),
                      index=[0, 5, 7, 12])

        # Repeat to flush out buffer-boundary/state bugs in the C reader.
        for _ in range(100):
            df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
                          names=['a'], engine='c')
            assert_frame_equal(df, a)

            df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2),
                          names=list("abcd"), engine='c')
            assert_frame_equal(df, b)

            df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
                          names=list('abcd'), engine='c')
            assert_frame_equal(df, c)

    def test_empty_csv_input(self):
        """Empty input plus chunksize yields a TextFileReader (gh-14867)."""
        # GH14867
        df = read_csv(StringIO(), chunksize=20, header=None,
                      names=['a', 'b', 'c'])
        assert isinstance(df, TextFileReader)
|
||||
def assert_array_dicts_equal(left, right):
    """Assert that two {column -> array-like} dicts hold equal arrays.

    Only keys present in ``left`` are compared.  Note that
    ``tm.assert_numpy_array_equal`` raises on mismatch and returns
    ``None``, so its return value must not itself be wrapped in an
    ``assert`` (``assert None`` would always fail).
    """
    for k, v in compat.iteritems(left):
        tm.assert_numpy_array_equal(np.asarray(v),
                                    np.asarray(right[k]))
@@ -0,0 +1,140 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that features that are currently unsupported in
|
||||
either the Python or C parser are actually enforced
|
||||
and are clearly communicated to the user.
|
||||
|
||||
Ultimately, the goal is to remove test cases from this
|
||||
test suite as new feature support is added to the parsers.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO
|
||||
from pandas.errors import ParserError
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
import pandas.io.parsers as parsers
|
||||
from pandas.io.parsers import read_csv
|
||||
|
||||
|
||||
@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
    """Return each Python-based engine name ('python', 'python-fwf')."""
    return request.param
|
||||
|
||||
class TestUnsupportedFeatures(object):
    """Verify that per-engine unsupported options fail loudly."""

    def test_mangle_dupe_cols_false(self):
        """mangle_dupe_cols=False is rejected by both engines."""
        # see gh-12935
        data = 'a b c\n1 2 3'
        msg = 'is not supported'

        for engine in ('c', 'python'):
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine=engine,
                         mangle_dupe_cols=False)

    def test_c_engine(self):
        """C-engine-unsupported options raise when engine='c' is explicit,
        and fall back with a ParserWarning when no engine is given."""
        # see gh-6607
        data = 'a b c\n1 2 3'
        msg = 'does not support'

        # specify C engine with unsupported options (raise)
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine='c',
                     sep=None, delim_whitespace=False)
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine='c', sep=r'\s')
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine='c', sep='\t', quotechar=chr(128))
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine='c', skipfooter=1)

        # specify C-unsupported options without python-unsupported options
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep=None, delim_whitespace=False)
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep=r'\s')
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep='\t', quotechar=chr(128))
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), skipfooter=1)

        # Ragged rows tokenize-error on both engines.
        text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
        msg = 'Error tokenizing data'

        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), sep='\\s+')
        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), engine='c', sep='\\s+')

        # Multi-char / empty thousands markers are rejected.
        msg = "Only length-1 thousands markers supported"
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands=',,')
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands='')

        # Multi-char line terminators are rejected.
        msg = "Only length-1 line terminators supported"
        data = 'a,b,c~~1,2,3~~4,5,6'
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), lineterminator='~~')

    def test_python_engine(self, python_engine):
        """Every option listed as C-only raises for the Python engines."""
        from pandas.io.parsers import _python_unsupported as py_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for default in py_unsupported:
            msg = ('The %r option is not supported '
                   'with the %r engine' % (default, python_engine))

            kwargs = {default: object()}
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine=python_engine, **kwargs)

    def test_python_engine_file_no_next(self, python_engine):
        """A readable but non-iterable buffer is rejected (see gh-16530)."""
        # see gh-16530
        class NoNextBuffer(object):
            # Defines read() and __iter__ but no __next__, so the Python
            # engine cannot iterate it line by line.
            def __init__(self, csv_data):
                self.data = csv_data

            def __iter__(self):
                return self

            def read(self):
                return self.data

        data = "a\n1"
        msg = "The 'python' engine cannot iterate"

        with pytest.raises(ValueError, match=msg):
            read_csv(NoNextBuffer(data), engine=python_engine)
||||
|
||||
class TestDeprecatedFeatures(object):
    """Verify that deprecated read_csv keywords emit FutureWarning."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    @pytest.mark.parametrize("kwargs", [{"tupleize_cols": True},
                                        {"tupleize_cols": False}])
    def test_deprecated_args(self, engine, kwargs):
        """Passing a deprecated keyword warns on both engines.

        The previously-present ``arg, _ = list(kwargs.items())[0]``
        binding was dead code (never read) and has been removed.
        """
        data = "1,2,3"

        # check_stacklevel=False: the warning originates inside the
        # parser plumbing, not at the direct call site.
        with tm.assert_produces_warning(
                FutureWarning, check_stacklevel=False):
            read_csv(StringIO(data), engine=engine, **kwargs)
||||
@@ -0,0 +1,534 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests the usecols functionality during parsing
|
||||
for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslib import Timestamp
|
||||
from pandas.compat import StringIO
|
||||
|
||||
from pandas import DataFrame, Index
|
||||
import pandas.util.testing as tm
|
||||
|
||||
# Canonical error messages shared by the tests below; kept in one place so
# the ``pytest.raises(..., match=...)`` patterns stay in sync with the
# messages produced by pandas.io.parsers.
_msg_validate_usecols_arg = ("'usecols' must either be list-like "
                             "of all strings, all unicode, all "
                             "integers or a callable.")
# Template: {0} is the repr of the missing column labels.
_msg_validate_usecols_names = ("Usecols do not match columns, columns "
                               "expected but not found: {0}")
|
||||
|
||||
def test_raise_on_mixed_dtype_usecols(all_parsers):
    """Mixing positional and label entries in usecols raises ValueError."""
    # See gh-12678
    data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
    usecols = [0, "b", 2]
    parser = all_parsers

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        parser.read_csv(StringIO(data), usecols=usecols)
||||
|
||||
@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
def test_usecols(all_parsers, usecols):
    """Positional and label usecols select the same columns."""
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), usecols=usecols)

    expected = DataFrame([[2, 3], [5, 6], [8, 9],
                          [11, 12]], columns=["b", "c"])
    tm.assert_frame_equal(result, expected)
||||
|
||||
def test_usecols_with_names(all_parsers):
    """Positional usecols are honored when ``names`` renames the header."""
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    names = ["foo", "bar"]
    result = parser.read_csv(StringIO(data), names=names,
                             usecols=[1, 2], header=0)

    expected = DataFrame([[2, 3], [5, 6], [8, 9],
                          [11, 12]], columns=names)
    tm.assert_frame_equal(result, expected)
||||
|
||||
@pytest.mark.parametrize("names,usecols", [
    (["b", "c"], [1, 2]),
    (["a", "b", "c"], ["b", "c"])
])
def test_usecols_relative_to_names(all_parsers, names, usecols):
    """usecols resolves against user-provided ``names`` (no header row)."""
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), names=names,
                             header=None, usecols=usecols)

    expected = DataFrame([[2, 3], [5, 6], [8, 9],
                          [11, 12]], columns=["b", "c"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_usecols_relative_to_names2(all_parsers):
    """Positional usecols with fewer names than data columns."""
    # see gh-5766
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), names=["a", "b"],
                             header=None, usecols=[0, 1])

    expected = DataFrame([[1, 2], [4, 5], [7, 8],
                          [10, 11]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_usecols_name_length_conflict(all_parsers):
    """names/usecols length mismatch raises; message differs per engine."""
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    # The two engines validate in different places, hence the two messages.
    msg = ("Number of passed names did not "
           "match number of header fields in the file"
           if parser.engine == "python" else
           "Passed header names mismatches usecols")

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), names=["a", "b"],
                        header=None, usecols=[1])
|
||||
|
||||
def test_usecols_single_string(all_parsers):
    """A bare (non-list) string passed as usecols must be rejected."""
    # see gh-20558
    csv_text = ("foo, bar, baz\n"
                "1000, 2000, 3000\n"
                "4000, 5000, 6000")

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        all_parsers.read_csv(StringIO(csv_text), usecols="foo")
|
||||
|
||||
@pytest.mark.parametrize("data", ["a,b,c,d\n1,2,3,4\n5,6,7,8",
                                  "a,b,c,d\n1,2,3,4,\n5,6,7,8,"])
def test_usecols_index_col_false(all_parsers, data):
    """index_col=False keeps all selected columns as data columns."""
    # see gh-9082
    parser = all_parsers
    usecols = ["a", "c", "d"]
    expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})

    result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("index_col", ["b", 0])
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
    """An integer index_col is interpreted relative to usecols."""
    # see gh-4201: test that index_col as integer reflects usecols
    parser = all_parsers
    data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
    expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))

    result = parser.read_csv(StringIO(data), usecols=usecols,
                             index_col=index_col)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_usecols_index_col_conflict2(all_parsers):
    """A multi-column index_col cooperates with a usecols selection."""
    # see gh-4201: test that index_col as integer reflects usecols
    parser = all_parsers
    data = "a,b,c,d\nA,a,1,one\nB,b,2,two"

    expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
    expected = expected.set_index(["b", "c"])

    result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"],
                             index_col=["b", "c"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_usecols_implicit_index_col(all_parsers):
    """usecols works alongside an implicit (unnamed) first index column."""
    # see gh-2654
    csv_text = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"
    expected = DataFrame({"a": ["apple", "orange"],
                          "b": ["bat", "cow"]}, index=[4, 8])

    frame = all_parsers.read_csv(StringIO(csv_text), usecols=["a", "b"])
    tm.assert_frame_equal(frame, expected)
|
||||
|
||||
def test_usecols_regex_sep(all_parsers):
    """usecols combines with a regex separator."""
    # see gh-2733
    csv_text = "a b c\n4 apple bat 5.7\n8 orange cow 10"
    frame = all_parsers.read_csv(StringIO(csv_text), sep=r"\s+",
                                 usecols=("a", "b"))

    expected = DataFrame({"a": ["apple", "orange"],
                          "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(frame, expected)
|
||||
|
||||
def test_usecols_with_whitespace(all_parsers):
    """usecols combines with delim_whitespace parsing."""
    csv_text = "a b c\n4 apple bat 5.7\n8 orange cow 10"
    expected = DataFrame({"a": ["apple", "orange"],
                          "b": ["bat", "cow"]}, index=[4, 8])

    frame = all_parsers.read_csv(StringIO(csv_text), delim_whitespace=True,
                                 usecols=("a", "b"))
    tm.assert_frame_equal(frame, expected)
|
||||
|
||||
@pytest.mark.parametrize("usecols,expected", [
    # Column selection by index.
    ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]],
                       columns=["2", "0"])),

    # Column selection by name.
    (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]],
                           columns=["0", "1"])),
])
def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
    """Integer-looking header labels are still string labels for usecols."""
    parser = all_parsers
    data = """2,0,1
1000,2000,3000
4000,5000,6000"""

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
    """Date-column merging works on columns selected via usecols."""
    # see gh-9755
    data = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
    parser = all_parsers
    parse_dates = [[1, 2]]

    cols = {
        "a": [0, 0],
        "c_d": [
            Timestamp("2014-01-01 09:00:00"),
            Timestamp("2014-01-02 10:00:00")
        ]
    }
    expected = DataFrame(cols, columns=["c_d", "a"])
    result = parser.read_csv(StringIO(data), usecols=usecols,
                             parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_usecols_with_parse_dates2(all_parsers):
    """parse_dates with string usecols, names and an index column."""
    # see gh-13604
    parser = all_parsers
    data = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65"""

    names = ["date", "values"]
    usecols = names[:]
    parse_dates = [0]

    index = Index([Timestamp("2008-02-07 09:40"),
                   Timestamp("2008-02-07 09:50"),
                   Timestamp("2008-02-07 10:00")],
                  name="date")
    cols = {"values": [1032.43, 1042.54, 1051.65]}
    expected = DataFrame(cols, index=index)

    result = parser.read_csv(StringIO(data), parse_dates=parse_dates,
                             index_col=0, usecols=usecols,
                             header=None, names=names)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_usecols_with_parse_dates3(all_parsers):
    """parse_dates on a usecols selection spanning every column."""
    # see gh-14792
    parser = all_parsers
    data = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""

    usecols = list("abcdefghij")
    parse_dates = [0]

    cols = {"a": Timestamp("2016-09-21"),
            "b": [1], "c": [1], "d": [2],
            "e": [3], "f": [4], "g": [5],
            "h": [6], "i": [7], "j": [8]}
    expected = DataFrame(cols, columns=usecols)

    result = parser.read_csv(StringIO(data), usecols=usecols,
                             parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_usecols_with_parse_dates4(all_parsers):
    """Merged parse_dates columns combined with a full usecols selection;
    the expected merged column keeps the raw joined string."""
    data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
    usecols = list("abcdefghij")
    parse_dates = [[0, 1]]
    parser = all_parsers

    cols = {"a_b": "2016/09/21 1",
            "c": [1], "d": [2], "e": [3], "f": [4],
            "g": [5], "h": [6], "i": [7], "j": [8]}
    expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))

    result = parser.read_csv(StringIO(data), usecols=usecols,
                             parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
@pytest.mark.parametrize("names", [
    list("abcde"),  # Names span all columns in original data.
    list("acd"),    # Names span only the selected columns.
])
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
    """parse_dates, usecols and names interact consistently."""
    # see gh-9755
    s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
    parse_dates = [[1, 2]]
    parser = all_parsers

    cols = {
        "a": [0, 0],
        "c_d": [
            Timestamp("2014-01-01 09:00:00"),
            Timestamp("2014-01-02 10:00:00")
        ]
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    result = parser.read_csv(StringIO(s), names=names,
                             parse_dates=parse_dates,
                             usecols=usecols)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_usecols_with_unicode_strings(all_parsers):
    """Unicode column labels work in usecols."""
    # see gh-13219
    data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "AAA": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002
        },
        "BBB": {0: 8, 1: 2, 2: 7}
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=[u"AAA", u"BBB"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_usecols_with_single_byte_unicode_strings(all_parsers):
    """Single-character unicode labels work in usecols."""
    # see gh-13219
    data = """A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "A": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002
        },
        "B": {0: 8, 1: 2, 2: 7}
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=[u"A", u"B"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("usecols", [[u"AAA", b"BBB"], [b"AAA", u"BBB"]])
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
    """Mixing bytes and str labels in usecols raises ValueError."""
    data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        parser.read_csv(StringIO(data), usecols=usecols)
|
||||
|
||||
@pytest.mark.parametrize("usecols", [
    ["あああ", "いい"],
    [u"あああ", u"いい"]
])
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
    """Multi-byte (non-ASCII) column labels work in usecols."""
    data = """あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "あああ": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002
        },
        "いい": {0: 8, 1: 2, 2: 7}
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_empty_usecols(all_parsers):
    """An empty usecols set yields an empty DataFrame."""
    csv_text = "a,b,c\n1,2,3\n4,5,6"

    frame = all_parsers.read_csv(StringIO(csv_text), usecols=set())
    tm.assert_frame_equal(frame, DataFrame())
|
||||
|
||||
def test_np_array_usecols(all_parsers):
    """usecols may be supplied as a numpy array of labels."""
    # see gh-12546
    wanted = np.array(["a", "b"])
    expected = DataFrame([[1, 2]], columns=wanted)

    frame = all_parsers.read_csv(StringIO("a,b,c\n1,2,3"), usecols=wanted)
    tm.assert_frame_equal(frame, expected)
|
||||
|
||||
@pytest.mark.parametrize("usecols,expected", [
    (lambda x: x.upper() in ["AAA", "BBB", "DDD"],
     DataFrame({
         "AaA": {
             0: 0.056674972999999997,
             1: 2.6132309819999997,
             2: 3.5689350380000002
         },
         "bBb": {0: 8, 1: 2, 2: 7},
         "ddd": {0: "a", 1: "b", 2: "a"}
     })),
    (lambda x: False, DataFrame()),
])
def test_callable_usecols(all_parsers, usecols, expected):
    """usecols may be a callable filtering column names."""
    # see gh-14154
    data = """AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
def test_incomplete_first_row(all_parsers, usecols):
    """A short first row does not break usecols selection."""
    # see gh-6710
    data = "1,2\n1,2,3"
    parser = all_parsers
    names = ["a", "b", "c"]
    expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})

    result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("data,usecols,kwargs,expected", [
    # see gh-8985
    ("19,29,39\n" * 2 + "10,20,30,40", [0, 1, 2],
     dict(header=None), DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]])),

    # see gh-9549
    (("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n"
      "1,2,3,,,1,\n1,2,3\n5,6,7"), ["A", "B", "C"],
     dict(), DataFrame({"A": [1, 3, 1, 1, 1, 5],
                        "B": [2, 4, 2, 2, 2, 6],
                        "C": [3, 5, 4, 3, 3, 7]})),
])
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
    """Rows with extra trailing fields are truncated to the usecols set."""
    # see gh-8985
    parser = all_parsers
    result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("usecols,kwargs,expected,msg", [
    (["a", "b", "c", "d"], dict(),
     DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), None),
    (["a", "b", "c", "f"], dict(), None,
     _msg_validate_usecols_names.format(r"\['f'\]")),
    (["a", "b", "f"], dict(), None,
     _msg_validate_usecols_names.format(r"\['f'\]")),
    (["a", "b", "f", "g"], dict(), None,
     _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]")),

    # see gh-14671
    (None, dict(header=0, names=["A", "B", "C", "D"]),
     DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7],
                "D": [4, 8]}), None),
    (["A", "B", "C", "f"], dict(header=0, names=["A", "B", "C", "D"]),
     None, _msg_validate_usecols_names.format(r"\['f'\]")),
    (["A", "B", "f"], dict(names=["A", "B", "C", "D"]),
     None, _msg_validate_usecols_names.format(r"\['f'\]")),
])
def test_raises_on_usecols_names_mismatch(all_parsers, usecols,
                                          kwargs, expected, msg):
    """usecols entries missing from the (possibly renamed) header raise;
    a case with expected=None verifies the error message instead."""
    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    kwargs.update(usecols=usecols)
    parser = all_parsers

    if expected is None:
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.xfail(
    reason="see gh-16469: works on the C engine but not the Python engine",
    strict=False)
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
    """usecols subset of renamed header; xfail on the Python engine."""
    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    names = ["A", "B", "C", "D"]
    parser = all_parsers

    result = parser.read_csv(StringIO(data), header=0,
                             names=names, usecols=usecols)
    expected = DataFrame({"A": [1, 5], "C": [3, 7]})
    tm.assert_frame_equal(result, expected)
Reference in New Issue
Block a user