started work on backend
[24 binary files changed; contents not shown]
@@ -0,0 +1,487 @@
# -*- coding: utf-8 -*-

"""
Tests that apply specifically to the CParser. Unless specifically stated
as a CParser-specific issue, the goal is to eventually move as many of
these tests as possible out of this module once the Python parser can
accept further arguments when parsing.
"""

import os
import sys
import tarfile

import pytest
import numpy as np

import pandas as pd
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas import DataFrame
from pandas.compat import StringIO, range, lrange


class CParserTests(object):

    def test_buffer_overflow(self):
        # see gh-9205: test certain malformed input files that cause
        # buffer overflows in tokenizer.c

        malfw = "1\r1\r1\r 1\r 1\r"  # buffer overflow in words pointer
        malfs = "1\r1\r1\r 1\r 1\r11\r"  # buffer overflow in stream pointer
        malfl = "1\r1\r1\r 1\r 1\r11\r1\r"  # buffer overflow in lines pointer

        cperr = 'Buffer overflow caught - possible malformed input file.'

        for malf in (malfw, malfs, malfl):
            try:
                self.read_table(StringIO(malf))
            except Exception as err:
                assert cperr in str(err)

    def test_buffer_rd_bytes(self):
        # see gh-12098: src->buffer in the C parser can be freed twice leading
        # to a segfault if a corrupt gzip file is read with 'read_csv' and the
        # buffer is filled more than once before gzip throws an exception

        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
               '\xA6\x4D' + '\x55' * 267 + \
               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
        for i in range(100):
            try:
                self.read_csv(StringIO(data),
                              compression='gzip',
                              delim_whitespace=True)
            except Exception:
                pass

    def test_delim_whitespace_custom_terminator(self):
        # See gh-12912
        data = """a b c~1 2 3~4 5 6~7 8 9"""
        df = self.read_csv(StringIO(data), lineterminator='~',
                           delim_whitespace=True)
        expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                             columns=['a', 'b', 'c'])
        tm.assert_frame_equal(df, expected)

    def test_dtype_and_names_error(self):
        # see gh-8833: passing both dtype and names
        # resulting in an error reporting issue
        data = """
1.0 1
2.0 2
3.0 3
"""
        # base cases
        result = self.read_csv(StringIO(data), sep=r'\s+', header=None)
        expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
        tm.assert_frame_equal(result, expected)

        result = self.read_csv(StringIO(data), sep=r'\s+',
                               header=None, names=['a', 'b'])
        expected = DataFrame(
            [[1.0, 1], [2.0, 2], [3.0, 3]], columns=['a', 'b'])
        tm.assert_frame_equal(result, expected)

        # fallback casting
        result = self.read_csv(StringIO(
            data), sep=r'\s+', header=None,
            names=['a', 'b'], dtype={'a': np.int32})
        expected = DataFrame([[1, 1], [2, 2], [3, 3]],
                             columns=['a', 'b'])
        expected['a'] = expected['a'].astype(np.int32)
        tm.assert_frame_equal(result, expected)

        data = """
1.0 1
nan 2
3.0 3
"""
        # fallback casting, but not castable
        with tm.assert_raises_regex(ValueError, 'cannot safely convert'):
            self.read_csv(StringIO(data), sep=r'\s+', header=None,
                          names=['a', 'b'], dtype={'a': np.int32})

    def test_unsupported_dtype(self):
        df = DataFrame(np.random.rand(5, 2), columns=list(
            'AB'), index=['1A', '1B', '1C', '1D', '1E'])

        with tm.ensure_clean('__unsupported_dtype__.csv') as path:
            df.to_csv(path)

            # valid but we don't support it (date)
            pytest.raises(TypeError, self.read_csv, path,
                          dtype={'A': 'datetime64', 'B': 'float64'},
                          index_col=0)
            pytest.raises(TypeError, self.read_csv, path,
                          dtype={'A': 'datetime64', 'B': 'float64'},
                          index_col=0, parse_dates=['B'])

            # valid but we don't support it
            pytest.raises(TypeError, self.read_csv, path,
                          dtype={'A': 'timedelta64', 'B': 'float64'},
                          index_col=0)

            # valid but unsupported - fixed width unicode string
            pytest.raises(TypeError, self.read_csv, path,
                          dtype={'A': 'U8'},
                          index_col=0)

    @td.skip_if_32bit
    def test_precise_conversion(self):
        from decimal import Decimal

        normal_errors = []
        precise_errors = []

        # test numbers between 1 and 2
        for num in np.linspace(1., 2., num=500):
            # 25 decimal digits of precision
            text = 'a\n{0:.25}'.format(num)

            normal_val = float(self.read_csv(StringIO(text))['a'][0])
            precise_val = float(self.read_csv(
                StringIO(text), float_precision='high')['a'][0])
            roundtrip_val = float(self.read_csv(
                StringIO(text), float_precision='round_trip')['a'][0])
            actual_val = Decimal(text[2:])

            def error(val):
                return abs(Decimal('{0:.100}'.format(val)) - actual_val)

            normal_errors.append(error(normal_val))
            precise_errors.append(error(precise_val))

            # round-trip should match float()
            assert roundtrip_val == float(text[2:])

        assert sum(precise_errors) <= sum(normal_errors)
        assert max(precise_errors) <= max(normal_errors)

    def test_usecols_dtypes(self):
        data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""

        result = self.read_csv(StringIO(data), usecols=(0, 1, 2),
                               names=('a', 'b', 'c'),
                               header=None,
                               converters={'a': str},
                               dtype={'b': int, 'c': float},
                               )
        result2 = self.read_csv(StringIO(data), usecols=(0, 2),
                                names=('a', 'b', 'c'),
                                header=None,
                                converters={'a': str},
                                dtype={'b': int, 'c': float},
                                )
        assert (result.dtypes == [object, np.int, np.float]).all()
        assert (result2.dtypes == [object, np.float]).all()

    def test_disable_bool_parsing(self):
        # #2090

        data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""

        result = self.read_csv(StringIO(data), dtype=object)
        assert (result.dtypes == object).all()

        result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
        assert result['B'][2] == ''

    def test_custom_lineterminator(self):
        data = 'a,b,c~1,2,3~4,5,6'

        result = self.read_csv(StringIO(data), lineterminator='~')
        expected = self.read_csv(StringIO(data.replace('~', '\n')))

        tm.assert_frame_equal(result, expected)

    def test_parse_ragged_csv(self):
        data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""

        nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
        result = self.read_csv(StringIO(data), header=None,
                               names=['a', 'b', 'c', 'd', 'e'])

        expected = self.read_csv(StringIO(nice_data), header=None,
                                 names=['a', 'b', 'c', 'd', 'e'])

        tm.assert_frame_equal(result, expected)

        # too many columns, cause segfault if not careful
        data = "1,2\n3,4,5"

        result = self.read_csv(StringIO(data), header=None,
                               names=lrange(50))
        expected = self.read_csv(StringIO(data), header=None,
                                 names=lrange(3)).reindex(columns=lrange(50))

        tm.assert_frame_equal(result, expected)

    def test_tokenize_CR_with_quoting(self):
        # see gh-3453

        data = ' a,b,c\r"a,b","e,d","f,f"'

        result = self.read_csv(StringIO(data), header=None)
        expected = self.read_csv(StringIO(data.replace('\r', '\n')),
                                 header=None)
        tm.assert_frame_equal(result, expected)

        result = self.read_csv(StringIO(data))
        expected = self.read_csv(StringIO(data.replace('\r', '\n')))
        tm.assert_frame_equal(result, expected)

    def test_grow_boundary_at_cap(self):
        # See gh-12494
        #
        # Cause of error was that the C parser
        # was not increasing the buffer size when
        # the desired space would fill the buffer
        # to capacity, which would later cause a
        # buffer overflow error when checking the
        # EOF terminator of the CSV stream
        def test_empty_header_read(count):
            s = StringIO(',' * count)
            expected = DataFrame(columns=[
                'Unnamed: {i}'.format(i=i)
                for i in range(count + 1)])
            df = self.read_csv(s)
            tm.assert_frame_equal(df, expected)

        for count in range(1, 101):
            test_empty_header_read(count)

    def test_parse_trim_buffers(self):
        # This test is part of a bugfix for issue #13703. It attempts
        # to stress the system memory allocator, to cause it to move the
        # stream buffer and either let the OS reclaim the region, or let
        # other memory requests of parser otherwise modify the contents
        # of memory space, where it was formerly located.
        # This test is designed to cause a `segfault` with unpatched
        # `tokenizer.c`. Sometimes the test fails on `segfault`, other
        # times it fails due to memory corruption, which causes the
        # loaded DataFrame to differ from the expected one.

        # Generate a large mixed-type CSV file on-the-fly (one record is
        # approx 1.5KiB).
        record_ = \
            """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
            """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
            """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
            """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
            """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
            """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
            """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
            """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
            """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
            """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
            """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
            """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
            """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
            """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
            """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
            """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
            """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
            """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
            """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
            """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
            """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
            """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
            """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
            """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
            """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
            """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
            """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
            """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
            """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""

        # Set the number of lines so that a call to `parser_trim_buffers`
        # is triggered: after a couple of full chunks are consumed a
        # relatively small 'residual' chunk would cause reallocation
        # within the parser.
        chunksize, n_lines = 128, 2 * 128 + 15
        csv_data = "\n".join([record_] * n_lines) + "\n"

        # We will use StringIO to load the CSV from this text buffer.
        # pd.read_csv() will iterate over the file in chunks and will
        # finally read a residual chunk of really small size.

        # Generate the expected output: manually create the dataframe
        # by splitting by comma and repeating the `n_lines` times.
        row = tuple(val_ if val_ else np.nan
                    for val_ in record_.split(","))
        expected = pd.DataFrame([row for _ in range(n_lines)],
                                dtype=object, columns=None, index=None)

        # Iterate over the CSV file in chunks of `chunksize` lines
        chunks_ = self.read_csv(StringIO(csv_data), header=None,
                                dtype=object, chunksize=chunksize)
        result = pd.concat(chunks_, axis=0, ignore_index=True)

        # Check for data corruption if there was no segfault
        tm.assert_frame_equal(result, expected)

        # This extra test was added to replicate the fault in gh-5291.
        # Force 'utf-8' encoding, so that `_string_convert` would take
        # a different execution branch.
        chunks_ = self.read_csv(StringIO(csv_data), header=None,
                                dtype=object, chunksize=chunksize,
                                encoding='utf_8')
        result = pd.concat(chunks_, axis=0, ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_internal_null_byte(self):
        # see gh-14012
        #
        # The null byte ('\x00') should not be used as a
        # true line terminator, escape character, or comment
        # character, only as a placeholder to indicate that
        # none was specified.
        #
        # This test should be moved to common.py ONLY when
        # Python's csv class supports parsing '\x00'.
        names = ['a', 'b', 'c']
        data = "1,2,3\n4,\x00,6\n7,8,9"
        expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6],
                                 [7, 8, 9]], columns=names)

        result = self.read_csv(StringIO(data), names=names)
        tm.assert_frame_equal(result, expected)

    def test_read_nrows_large(self):
        # gh-7626 - read in only nrows of data for large inputs (>262144b)
        header_narrow = '\t'.join(['COL_HEADER_' + str(i)
                                   for i in range(10)]) + '\n'
        data_narrow = '\t'.join(['somedatasomedatasomedata1'
                                 for i in range(10)]) + '\n'
        header_wide = '\t'.join(['COL_HEADER_' + str(i)
                                 for i in range(15)]) + '\n'
        data_wide = '\t'.join(['somedatasomedatasomedata2'
                               for i in range(15)]) + '\n'
        test_input = (header_narrow + data_narrow * 1050 +
                      header_wide + data_wide * 2)

        df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010)

        assert df.size == 1010 * 10

    def test_float_precision_round_trip_with_text(self):
        # gh-15140 - This should not segfault on Python 2.7+
        df = self.read_csv(StringIO('a'),
                           float_precision='round_trip',
                           header=None)
        tm.assert_frame_equal(df, DataFrame({0: ['a']}))

    def test_large_difference_in_columns(self):
        # gh-14125
        count = 10000
        large_row = ('X,' * count)[:-1] + '\n'
        normal_row = 'XXXXXX XXXXXX,111111111111111\n'
        test_input = (large_row + normal_row * 6)[:-1]
        result = self.read_csv(StringIO(test_input), header=None, usecols=[0])
        rows = test_input.split('\n')
        expected = DataFrame([row.split(',')[0] for row in rows])

        tm.assert_frame_equal(result, expected)

    def test_data_after_quote(self):
        # see gh-15910

        data = 'a\n1\n"b"a'
        result = self.read_csv(StringIO(data))
        expected = DataFrame({'a': ['1', 'ba']})

        tm.assert_frame_equal(result, expected)

    @tm.capture_stderr
    def test_comment_whitespace_delimited(self):
        test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
        df = self.read_csv(StringIO(test_input), comment='#', header=None,
                           delimiter='\\s+', skiprows=0,
                           error_bad_lines=False)
        error = sys.stderr.getvalue()
        # skipped lines 2, 3, 4, 9
        for line_num in (2, 3, 4, 9):
            assert 'Skipping line {}'.format(line_num) in error, error
        expected = DataFrame([[1, 2],
                              [5, 2],
                              [6, 2],
                              [7, np.nan],
                              [8, np.nan]])
        tm.assert_frame_equal(df, expected)

    def test_file_like_no_next(self):
        # gh-16530: the file-like need not have a "next" or "__next__"
        # attribute despite having an "__iter__" attribute.
        #
        # NOTE: This is only true for the C engine, not Python engine.
        class NoNextBuffer(StringIO):
            def __next__(self):
                raise AttributeError("No next method")

            next = __next__

        data = "a\n1"

        expected = pd.DataFrame({"a": [1]})
        result = self.read_csv(NoNextBuffer(data))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
    def test_read_tarfile(self, tar_suffix):
        # see gh-16530
        #
        # Unfortunately, Python's CSV library can't handle
        # tarfile objects (expects string, not bytes when
        # iterating through a file-like).
        tar_path = os.path.join(self.dirpath, "tar_csv" + tar_suffix)

        with tarfile.open(tar_path, "r") as tar:
            data_file = tar.extractfile("tar_data.csv")

            out = self.read_csv(data_file)
            expected = pd.DataFrame({"a": [1]})
            tm.assert_frame_equal(out, expected)

    @pytest.mark.high_memory
    def test_bytes_exceed_2gb(self):
        """Read from a "CSV" that has a column larger than 2GB.

        GH 16798
        """
        if self.low_memory:
            pytest.skip("not a high_memory test")

        csv = StringIO('strings\n' + '\n'.join(
            ['x' * (1 << 20) for _ in range(2100)]))
        df = self.read_csv(csv, low_memory=False)
        assert not df.empty
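Note: CParserTests above (like the other test mixin classes in this diff) calls self.read_csv / self.read_table and reads attributes such as self.engine, self.low_memory and self.dirpath, so it is intended to be mixed into a concrete test class that supplies those. The class name and wiring below are illustrative assumptions, not part of this commit; a minimal sketch of how such a subclass could look:

    class TestCParserHighMemory(CParserTests):
        # hypothetical concrete class; the name and attribute values
        # are assumptions for illustration only
        engine = 'c'
        low_memory = False

        def read_csv(self, *args, **kwds):
            kwds = kwds.copy()
            kwds['engine'] = self.engine          # force the C engine
            kwds['low_memory'] = self.low_memory
            return pd.read_csv(*args, **kwds)

        def read_table(self, *args, **kwds):
            kwds = kwds.copy()
            kwds['engine'] = self.engine
            kwds['low_memory'] = self.low_memory
            return pd.read_table(*args, **kwds)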
@@ -0,0 +1,118 @@
# -*- coding: utf-8 -*-

"""
Tests that comments are properly handled during parsing
for all of the parsers defined in parsers.py
"""

import numpy as np
import pandas.util.testing as tm

from pandas import DataFrame
from pandas.compat import StringIO


class CommentTests(object):

    def test_comment(self):
        data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
        expected = np.array([[1., 2., 4.],
                             [5., np.nan, 10.]])
        df = self.read_csv(StringIO(data), comment='#')
        tm.assert_numpy_array_equal(df.values, expected)

        df = self.read_table(StringIO(data), sep=',', comment='#',
                             na_values=['NaN'])
        tm.assert_numpy_array_equal(df.values, expected)

    def test_line_comment(self):
        data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
        expected = np.array([[1., 2., 4.],
                             [5., np.nan, 10.]])
        df = self.read_csv(StringIO(data), comment='#')
        tm.assert_numpy_array_equal(df.values, expected)

        # check with delim_whitespace=True
        df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#',
                           delim_whitespace=True)
        tm.assert_almost_equal(df.values, expected)

        # custom line terminator is not supported
        # with the Python parser yet
        if self.engine == 'c':
            expected = np.array([[1., 2., 4.],
                                 [5., np.nan, 10.]])
            df = self.read_csv(StringIO(data.replace('\n', '*')),
                               comment='#', lineterminator='*')
            tm.assert_numpy_array_equal(df.values, expected)

    def test_comment_skiprows(self):
        data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
        # this should ignore the first four lines (including comments)
        expected = np.array([[1., 2., 4.], [5., np.nan, 10.]])
        df = self.read_csv(StringIO(data), comment='#', skiprows=4)
        tm.assert_numpy_array_equal(df.values, expected)

    def test_comment_header(self):
        data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
        # header should begin at the second non-comment line
        expected = np.array([[1., 2., 4.], [5., np.nan, 10.]])
        df = self.read_csv(StringIO(data), comment='#', header=1)
        tm.assert_numpy_array_equal(df.values, expected)

    def test_comment_skiprows_header(self):
        data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
        # skiprows should skip the first 4 lines (including comments), while
        # header should start from the second non-commented line starting
        # with line 5
        expected = np.array([[1., 2., 4.], [5., np.nan, 10.]])
        df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1)
        tm.assert_numpy_array_equal(df.values, expected)

    def test_custom_comment_char(self):
        data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"

        result = self.read_csv(StringIO(data), comment='#')
        expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
        tm.assert_frame_equal(result, expected)

    def test_comment_first_line(self):
        # see gh-4623
        data = '# notes\na,b,c\n# more notes\n1,2,3'

        expected = DataFrame([[1, 2, 3]], columns=['a', 'b', 'c'])
        result = self.read_csv(StringIO(data), comment='#')
        tm.assert_frame_equal(result, expected)

        expected = DataFrame({0: ['a', '1'], 1: ['b', '2'], 2: ['c', '3']})
        result = self.read_csv(StringIO(data), comment='#', header=None)
        tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large
@@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-

"""
Tests compressed data parsing functionality for all
of the parsers defined in parsers.py
"""

import pytest

import pandas as pd
import pandas.compat as compat
import pandas.util.testing as tm
import pandas.util._test_decorators as td

import gzip
import bz2
try:
    lzma = compat.import_lzma()
except ImportError:
    lzma = None


class CompressionTests(object):

    def test_zip(self):
        import zipfile

        with open(self.csv1, 'rb') as data_file:
            data = data_file.read()
            expected = self.read_csv(self.csv1)

        with tm.ensure_clean('test_file.zip') as path:
            tmp = zipfile.ZipFile(path, mode='w')
            tmp.writestr('test_file', data)
            tmp.close()

            result = self.read_csv(path, compression='zip')
            tm.assert_frame_equal(result, expected)

            result = self.read_csv(path, compression='infer')
            tm.assert_frame_equal(result, expected)

            if self.engine != 'python':
                with open(path, 'rb') as f:
                    result = self.read_csv(f, compression='zip')
                    tm.assert_frame_equal(result, expected)

        with tm.ensure_clean('combined_zip.zip') as path:
            inner_file_names = ['test_file', 'second_file']
            tmp = zipfile.ZipFile(path, mode='w')
            for file_name in inner_file_names:
                tmp.writestr(file_name, data)
            tmp.close()

            tm.assert_raises_regex(ValueError, 'Multiple files',
                                   self.read_csv, path, compression='zip')

            tm.assert_raises_regex(ValueError, 'Multiple files',
                                   self.read_csv, path,
                                   compression='infer')

        with tm.ensure_clean() as path:
            tmp = zipfile.ZipFile(path, mode='w')
            tmp.close()

            tm.assert_raises_regex(ValueError, 'Zero files',
                                   self.read_csv, path, compression='zip')

        with tm.ensure_clean() as path:
            with open(path, 'wb') as f:
                pytest.raises(zipfile.BadZipfile, self.read_csv,
                              f, compression='zip')

    @pytest.mark.parametrize('compress_type, compress_method, ext', [
        ('gzip', gzip.GzipFile, 'gz'),
        ('bz2', bz2.BZ2File, 'bz2'),
        pytest.param('xz', getattr(lzma, 'LZMAFile', None), 'xz',
                     marks=td.skip_if_no_lzma)
    ])
    def test_other_compression(self, compress_type, compress_method, ext):

        with open(self.csv1, 'rb') as data_file:
            data = data_file.read()
            expected = self.read_csv(self.csv1)

        with tm.ensure_clean() as path:
            tmp = compress_method(path, mode='wb')
            tmp.write(data)
            tmp.close()

            result = self.read_csv(path, compression=compress_type)
            tm.assert_frame_equal(result, expected)

            if compress_type == 'bz2':
                pytest.raises(ValueError, self.read_csv,
                              path, compression='bz3')

            with open(path, 'rb') as fin:
                result = self.read_csv(fin, compression=compress_type)
                tm.assert_frame_equal(result, expected)

        with tm.ensure_clean('test.{}'.format(ext)) as path:
            tmp = compress_method(path, mode='wb')
            tmp.write(data)
            tmp.close()
            result = self.read_csv(path, compression='infer')
            tm.assert_frame_equal(result, expected)

    def test_read_csv_infer_compression(self):
        # see gh-9770
        expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)

        with open(self.csv1) as f:
            inputs = [self.csv1, self.csv1 + '.gz',
                      self.csv1 + '.bz2', f]

            for inp in inputs:
                df = self.read_csv(inp, index_col=0, parse_dates=True,
                                   compression='infer')

                tm.assert_frame_equal(expected, df)

    def test_read_csv_compressed_utf16_example(self, datapath):
        # GH18071
        path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip')

        result = self.read_csv(path, encoding='utf-16',
                               compression='zip', sep='\t')
        expected = pd.DataFrame({
            u'Country': [u'Venezuela', u'Venezuela'],
            u'Twitter': [u'Hugo Chávez Frías', u'Henrique Capriles R.']
        })

        tm.assert_frame_equal(result, expected)

    def test_invalid_compression(self):
        msg = 'Unrecognized compression type: sfark'
        with tm.assert_raises_regex(ValueError, msg):
            self.read_csv('test_file.zip', compression='sfark')
@@ -0,0 +1,153 @@
# -*- coding: utf-8 -*-

"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""

from datetime import datetime

import pytest

import numpy as np
import pandas as pd
import pandas.util.testing as tm

from pandas._libs.tslib import Timestamp
from pandas import DataFrame, Index
from pandas.compat import parse_date, StringIO, lmap


class ConverterTests(object):

    def test_converters_type_must_be_dict(self):
        data = """index,A,B,C,D
foo,2,3,4,5
"""
        with tm.assert_raises_regex(TypeError, 'Type converters.+'):
            self.read_csv(StringIO(data), converters=0)

    def test_converters(self):
        data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
        result = self.read_csv(StringIO(data), converters={'D': parse_date})
        result2 = self.read_csv(StringIO(data), converters={3: parse_date})

        expected = self.read_csv(StringIO(data))
        expected['D'] = expected['D'].map(parse_date)

        assert isinstance(result['D'][0], (datetime, Timestamp))
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected)

        # produce integer
        converter = lambda x: int(x.split('/')[2])
        result = self.read_csv(StringIO(data), converters={'D': converter})
        expected = self.read_csv(StringIO(data))
        expected['D'] = expected['D'].map(converter)
        tm.assert_frame_equal(result, expected)

    def test_converters_no_implicit_conv(self):
        # see gh-2184
        data = """000102,1.2,A\n001245,2,B"""
        f = lambda x: x.strip()
        converter = {0: f}
        df = self.read_csv(StringIO(data), header=None, converters=converter)
        assert df[0].dtype == object

    def test_converters_euro_decimal_format(self):
        data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""
        f = lambda x: float(x.replace(",", "."))
        converter = {'Number1': f, 'Number2': f, 'Number3': f}
        df2 = self.read_csv(StringIO(data), sep=';', converters=converter)
        assert df2['Number1'].dtype == float
        assert df2['Number2'].dtype == float
        assert df2['Number3'].dtype == float

    def test_converter_return_string_bug(self):
        # see gh-583
        data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""
        f = lambda x: float(x.replace(",", "."))
        converter = {'Number1': f, 'Number2': f, 'Number3': f}
        df2 = self.read_csv(StringIO(data), sep=';', converters=converter)
        assert df2['Number1'].dtype == float

    def test_converters_corner_with_nas(self):
        # skip aberration observed on Win64 Python 3.2.2
        if hash(np.int64(-1)) != -2:
            pytest.skip("skipping because of windows hash on Python"
                        " 3.2.2")

        data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

        def convert_days(x):
            x = x.strip()
            if not x:
                return np.nan

            is_plus = x.endswith('+')
            if is_plus:
                x = int(x[:-1]) + 1
            else:
                x = int(x)
            return x

        def convert_days_sentinel(x):
            x = x.strip()
            if not x:
                return np.nan

            is_plus = x.endswith('+')
            if is_plus:
                x = int(x[:-1]) + 1
            else:
                x = int(x)
            return x

        def convert_score(x):
            x = x.strip()
            if not x:
                return np.nan
            if x.find('-') > 0:
                valmin, valmax = lmap(int, x.split('-'))
                val = 0.5 * (valmin + valmax)
            else:
                val = float(x)

            return val

        fh = StringIO(data)
        result = self.read_csv(fh, converters={'score': convert_score,
                                               'days': convert_days},
                               na_values=['', None])
        assert pd.isna(result['days'][1])

        fh = StringIO(data)
        result2 = self.read_csv(fh, converters={'score': convert_score,
                                                'days': convert_days_sentinel},
                                na_values=['', None])
        tm.assert_frame_equal(result, result2)

    def test_converter_index_col_bug(self):
        # see gh-1835
        data = "A;B\n1;2\n3;4"

        rs = self.read_csv(StringIO(data), sep=';', index_col='A',
                           converters={'A': lambda x: x})

        xp = DataFrame({'B': [2, 4]}, index=Index([1, 3], name='A'))
        tm.assert_frame_equal(rs, xp)
        assert rs.index.name == xp.index.name
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-

"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""

import csv

from pandas import DataFrame
from pandas.compat import StringIO
from pandas.errors import ParserWarning

import pandas.util.testing as tm


class DialectTests(object):

    def test_dialect(self):
        data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

        dia = csv.excel()
        dia.quoting = csv.QUOTE_NONE
        with tm.assert_produces_warning(ParserWarning):
            df = self.read_csv(StringIO(data), dialect=dia)

        data = '''\
label1,label2,label3
index1,a,c,e
index2,b,d,f
'''
        exp = self.read_csv(StringIO(data))
        exp.replace('a', '"a', inplace=True)
        tm.assert_frame_equal(df, exp)

    def test_dialect_str(self):
        data = """\
fruit:vegetable
apple:brocolli
pear:tomato
"""
        exp = DataFrame({
            'fruit': ['apple', 'pear'],
            'vegetable': ['brocolli', 'tomato']
        })
        csv.register_dialect('mydialect', delimiter=':')
        with tm.assert_produces_warning(ParserWarning):
            df = self.read_csv(StringIO(data), dialect='mydialect')

        tm.assert_frame_equal(df, exp)
        csv.unregister_dialect('mydialect')

    def test_invalid_dialect(self):
        class InvalidDialect(object):
            pass

        data = 'a\n1'
        msg = 'Invalid dialect'

        with tm.assert_raises_regex(ValueError, msg):
            self.read_csv(StringIO(data), dialect=InvalidDialect)

    def test_dialect_conflict(self):
        data = 'a,b\n1,2'
        dialect = 'excel'
        exp = DataFrame({'a': [1], 'b': [2]})

        with tm.assert_produces_warning(None):
            df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect)
            tm.assert_frame_equal(df, exp)

        with tm.assert_produces_warning(ParserWarning):
            df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect)
            tm.assert_frame_equal(df, exp)
@@ -0,0 +1,399 @@
# -*- coding: utf-8 -*-

"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""

import pytest

import numpy as np
import pandas as pd
import pandas.util.testing as tm

from pandas import DataFrame, Series, Index, MultiIndex, Categorical
from pandas.compat import StringIO
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.errors import ParserWarning


class DtypeTests(object):

    def test_passing_dtype(self):
        # see gh-6607
        df = DataFrame(np.random.rand(5, 2).round(4), columns=list(
            'AB'), index=['1A', '1B', '1C', '1D', '1E'])

        with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
            df.to_csv(path)

            # see gh-3795: passing 'str' as the dtype
            result = self.read_csv(path, dtype=str, index_col=0)
            expected = df.astype(str)
            tm.assert_frame_equal(result, expected)

            # for parsing, interpret object as str
            result = self.read_csv(path, dtype=object, index_col=0)
            tm.assert_frame_equal(result, expected)

            # we expect all object columns, so need to
            # convert to test for equivalence
            result = result.astype(float)
            tm.assert_frame_equal(result, df)

            # invalid dtype
            pytest.raises(TypeError, self.read_csv, path,
                          dtype={'A': 'foo', 'B': 'float64'},
                          index_col=0)

        # see gh-12048: empty frame
        actual = self.read_csv(StringIO('A,B'), dtype=str)
        expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
        tm.assert_frame_equal(actual, expected)

    def test_pass_dtype(self):
        data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

        result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'})
        assert result['one'].dtype == 'u1'
        assert result['two'].dtype == 'object'

    def test_categorical_dtype(self):
        # GH 10153
        data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
                                 'b': Categorical(['a', 'a', 'b']),
                                 'c': Categorical(['3.4', '3.4', '4.5'])})
        actual = self.read_csv(StringIO(data), dtype='category')
        tm.assert_frame_equal(actual, expected)

        actual = self.read_csv(StringIO(data), dtype=CategoricalDtype())
        tm.assert_frame_equal(actual, expected)

        actual = self.read_csv(StringIO(data), dtype={'a': 'category',
                                                      'b': 'category',
                                                      'c': CategoricalDtype()})
        tm.assert_frame_equal(actual, expected)

        actual = self.read_csv(StringIO(data), dtype={'b': 'category'})
        expected = pd.DataFrame({'a': [1, 1, 2],
                                 'b': Categorical(['a', 'a', 'b']),
                                 'c': [3.4, 3.4, 4.5]})
        tm.assert_frame_equal(actual, expected)

        actual = self.read_csv(StringIO(data), dtype={1: 'category'})
        tm.assert_frame_equal(actual, expected)

        # unsorted
        data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
                                 'b': Categorical(['b', 'b', 'a']),
                                 'c': Categorical(['3.4', '3.4', '4.5'])})
        actual = self.read_csv(StringIO(data), dtype='category')
        tm.assert_frame_equal(actual, expected)

        # missing
        data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
                                 'b': Categorical(['b', np.nan, 'a']),
                                 'c': Categorical(['3.4', '3.4', '4.5'])})
        actual = self.read_csv(StringIO(data), dtype='category')
        tm.assert_frame_equal(actual, expected)

    @pytest.mark.slow
    def test_categorical_dtype_high_cardinality_numeric(self):
        # GH 18186
        data = np.sort([str(i) for i in range(524289)])
        expected = DataFrame({'a': Categorical(data, ordered=True)})
        actual = self.read_csv(StringIO('a\n' + '\n'.join(data)),
                               dtype='category')
        actual["a"] = actual["a"].cat.reorder_categories(
            np.sort(actual.a.cat.categories), ordered=True)
        tm.assert_frame_equal(actual, expected)

    def test_categorical_dtype_encoding(self, datapath):
        # GH 10153
        pth = datapath('io', 'parser', 'data', 'unicode_series.csv')
        encoding = 'latin-1'
        expected = self.read_csv(pth, header=None, encoding=encoding)
        expected[1] = Categorical(expected[1])
        actual = self.read_csv(pth, header=None, encoding=encoding,
                               dtype={1: 'category'})
        tm.assert_frame_equal(actual, expected)

        pth = datapath('io', 'parser', 'data', 'utf16_ex.txt')
        encoding = 'utf-16'
        expected = self.read_table(pth, encoding=encoding)
        expected = expected.apply(Categorical)
        actual = self.read_table(pth, encoding=encoding, dtype='category')
        tm.assert_frame_equal(actual, expected)

    def test_categorical_dtype_chunksize(self):
        # GH 10153
        data = """a,b
1,a
1,b
1,b
2,c"""
        expecteds = [pd.DataFrame({'a': [1, 1],
                                   'b': Categorical(['a', 'b'])}),
                     pd.DataFrame({'a': [1, 2],
                                   'b': Categorical(['b', 'c'])},
                                  index=[2, 3])]
        actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
                                chunksize=2)

        for actual, expected in zip(actuals, expecteds):
            tm.assert_frame_equal(actual, expected)

    @pytest.mark.parametrize('ordered', [False, True])
    @pytest.mark.parametrize('categories', [
        ['a', 'b', 'c'],
        ['a', 'c', 'b'],
        ['a', 'b', 'c', 'd'],
        ['c', 'b', 'a'],
    ])
    def test_categorical_categoricaldtype(self, categories, ordered):
        data = """a,b
1,a
1,b
1,b
2,c"""
        expected = pd.DataFrame({
            "a": [1, 1, 1, 2],
            "b": Categorical(['a', 'b', 'b', 'c'],
                             categories=categories,
                             ordered=ordered)
        })
        dtype = {"b": CategoricalDtype(categories=categories,
                                       ordered=ordered)}
        result = self.read_csv(StringIO(data), dtype=dtype)
        tm.assert_frame_equal(result, expected)

    def test_categorical_categoricaldtype_unsorted(self):
        data = """a,b
1,a
1,b
1,b
2,c"""
        dtype = CategoricalDtype(['c', 'b', 'a'])
        expected = pd.DataFrame({
            'a': [1, 1, 1, 2],
            'b': Categorical(['a', 'b', 'b', 'c'], categories=['c', 'b', 'a'])
        })
        result = self.read_csv(StringIO(data), dtype={'b': dtype})
        tm.assert_frame_equal(result, expected)

    def test_categoricaldtype_coerces_numeric(self):
        dtype = {'b': CategoricalDtype([1, 2, 3])}
        data = "b\n1\n1\n2\n3"
        expected = pd.DataFrame({'b': Categorical([1, 1, 2, 3])})
        result = self.read_csv(StringIO(data), dtype=dtype)
        tm.assert_frame_equal(result, expected)

    def test_categoricaldtype_coerces_datetime(self):
        dtype = {
            'b': CategoricalDtype(pd.date_range('2017', '2019', freq='AS'))
        }
        data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
        expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
        result = self.read_csv(StringIO(data), dtype=dtype)
        tm.assert_frame_equal(result, expected)

        dtype = {
            'b': CategoricalDtype([pd.Timestamp("2014")])
        }
        data = "b\n2014-01-01\n2014-01-01T00:00:00"
        expected = pd.DataFrame({'b': Categorical([pd.Timestamp('2014')] * 2)})
        result = self.read_csv(StringIO(data), dtype=dtype)
        tm.assert_frame_equal(result, expected)

    def test_categoricaldtype_coerces_timedelta(self):
        dtype = {'b': CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))}
        data = "b\n1H\n2H\n3H"
        expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
        result = self.read_csv(StringIO(data), dtype=dtype)
        tm.assert_frame_equal(result, expected)

    def test_categoricaldtype_unexpected_categories(self):
        dtype = {'b': CategoricalDtype(['a', 'b', 'd', 'e'])}
        data = "b\nd\na\nc\nd"  # Unexpected c
        expected = pd.DataFrame({"b": Categorical(list('dacd'),
                                                  dtype=dtype['b'])})
        result = self.read_csv(StringIO(data), dtype=dtype)
        tm.assert_frame_equal(result, expected)

    def test_categorical_categoricaldtype_chunksize(self):
        # GH 10153
        data = """a,b
1,a
1,b
1,b
2,c"""
        cats = ['a', 'b', 'c']
        expecteds = [pd.DataFrame({'a': [1, 1],
                                   'b': Categorical(['a', 'b'],
                                                    categories=cats)}),
                     pd.DataFrame({'a': [1, 2],
                                   'b': Categorical(['b', 'c'],
                                                    categories=cats)},
                                  index=[2, 3])]
        dtype = CategoricalDtype(cats)
        actuals = self.read_csv(StringIO(data), dtype={'b': dtype},
                                chunksize=2)

        for actual, expected in zip(actuals, expecteds):
            tm.assert_frame_equal(actual, expected)

    def test_empty_pass_dtype(self):
        data = 'one,two'
        result = self.read_csv(StringIO(data), dtype={'one': 'u1'})

        expected = DataFrame({'one': np.empty(0, dtype='u1'),
                              'two': np.empty(0, dtype=np.object)})
        tm.assert_frame_equal(result, expected, check_index_type=False)

    def test_empty_with_index_pass_dtype(self):
        data = 'one,two'
        result = self.read_csv(StringIO(data), index_col=['one'],
                               dtype={'one': 'u1', 1: 'f'})

        expected = DataFrame({'two': np.empty(0, dtype='f')},
                             index=Index([], dtype='u1', name='one'))
        tm.assert_frame_equal(result, expected, check_index_type=False)

    def test_empty_with_multiindex_pass_dtype(self):
        data = 'one,two,three'
        result = self.read_csv(StringIO(data), index_col=['one', 'two'],
                               dtype={'one': 'u1', 1: 'f8'})

        exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'),
                                          np.empty(0, dtype='O')],
                                         names=['one', 'two'])
        expected = DataFrame(
            {'three': np.empty(0, dtype=np.object)}, index=exp_idx)
        tm.assert_frame_equal(result, expected, check_index_type=False)

    def test_empty_with_mangled_column_pass_dtype_by_names(self):
        data = 'one,one'
        result = self.read_csv(StringIO(data), dtype={
            'one': 'u1', 'one.1': 'f'})

        expected = DataFrame(
            {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
        tm.assert_frame_equal(result, expected, check_index_type=False)

    def test_empty_with_mangled_column_pass_dtype_by_indexes(self):
        data = 'one,one'
        result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})

        expected = DataFrame(
            {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
        tm.assert_frame_equal(result, expected, check_index_type=False)

    def test_empty_with_dup_column_pass_dtype_by_indexes(self):
        # see gh-9424
        expected = pd.concat([Series([], name='one', dtype='u1'),
                              Series([], name='one.1', dtype='f')], axis=1)

        data = 'one,one'
        result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
        tm.assert_frame_equal(result, expected, check_index_type=False)

        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
            data = ''
            result = self.read_csv(StringIO(data), names=['one', 'one'],
                                   dtype={0: 'u1', 1: 'f'})
            tm.assert_frame_equal(result, expected, check_index_type=False)

    def test_raise_on_passed_int_dtype_with_nas(self):
        # see gh-2631
        data = """YEAR, DOY, a
2001,106380451,10
2001,,11
2001,106380451,67"""
        pytest.raises(ValueError, self.read_csv, StringIO(data),
                      sep=",", skipinitialspace=True,
                      dtype={'DOY': np.int64})

    def test_dtype_with_converter(self):
        data = """a,b
1.1,2.2
1.2,2.3"""
        # dtype spec is ignored if a converter is specified
        with tm.assert_produces_warning(ParserWarning):
            result = self.read_csv(StringIO(data), dtype={'a': 'i8'},
                                   converters={'a': lambda x: str(x)})
        expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]})
        tm.assert_frame_equal(result, expected)

    def test_empty_dtype(self):
        # see gh-14712
        data = 'a,b'

        expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64)
        result = self.read_csv(StringIO(data), header=0, dtype=np.float64)
        tm.assert_frame_equal(result, expected)

        expected = pd.DataFrame({'a': pd.Categorical([]),
                                 'b': pd.Categorical([])},
                                index=[])
        result = self.read_csv(StringIO(data), header=0,
                               dtype='category')
        tm.assert_frame_equal(result, expected)
        result = self.read_csv(StringIO(data), header=0,
                               dtype={'a': 'category', 'b': 'category'})
        tm.assert_frame_equal(result, expected)

        expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
        result = self.read_csv(StringIO(data), header=0,
                               dtype='datetime64[ns]')
        tm.assert_frame_equal(result, expected)

        expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'),
                                 'b': pd.Series([], dtype='timedelta64[ns]')},
                                index=[])
        result = self.read_csv(StringIO(data), header=0,
                               dtype='timedelta64[ns]')
        tm.assert_frame_equal(result, expected)

        expected = pd.DataFrame(columns=['a', 'b'])
        expected['a'] = expected['a'].astype(np.float64)
        result = self.read_csv(StringIO(data), header=0,
                               dtype={'a': np.float64})
        tm.assert_frame_equal(result, expected)

        expected = pd.DataFrame(columns=['a', 'b'])
        expected['a'] = expected['a'].astype(np.float64)
        result = self.read_csv(StringIO(data), header=0,
                               dtype={0: np.float64})
        tm.assert_frame_equal(result, expected)

        expected = pd.DataFrame(columns=['a', 'b'])
        expected['a'] = expected['a'].astype(np.int32)
        expected['b'] = expected['b'].astype(np.float64)
        result = self.read_csv(StringIO(data), header=0,
                               dtype={'a': np.int32, 1: np.float64})
        tm.assert_frame_equal(result, expected)

    def test_numeric_dtype(self):
        data = '0\n1'

        for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
            expected = pd.DataFrame([0, 1], dtype=dt)
            result = self.read_csv(StringIO(data), header=None, dtype=dt)
            tm.assert_frame_equal(expected, result)
@@ -0,0 +1,312 @@
# -*- coding: utf-8 -*-

"""
Tests that the file header is properly handled or inferred
during parsing for all of the parsers defined in parsers.py
"""

import pytest

import numpy as np
import pandas.util.testing as tm

from pandas import DataFrame, Index, MultiIndex
from pandas.compat import StringIO, lrange, u


class HeaderTests(object):

    def test_read_with_bad_header(self):
        errmsg = r"but only \d+ lines in file"

        with tm.assert_raises_regex(ValueError, errmsg):
            s = StringIO(',,')
            self.read_csv(s, header=[10])

    def test_bool_header_arg(self):
        # see gh-6114
        data = """\
MyColumn
a
b
a
b"""
        for arg in [True, False]:
            with pytest.raises(TypeError):
                self.read_csv(StringIO(data), header=arg)
            with pytest.raises(TypeError):
                self.read_table(StringIO(data), header=arg)

    def test_no_header_prefix(self):
        data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
        df_pref = self.read_table(StringIO(data), sep=',', prefix='Field',
                                  header=None)

        expected = np.array([[1, 2, 3, 4, 5],
                             [6, 7, 8, 9, 10],
                             [11, 12, 13, 14, 15]], dtype=np.int64)
        tm.assert_almost_equal(df_pref.values, expected)

        tm.assert_index_equal(df_pref.columns,
                              Index(['Field0', 'Field1', 'Field2',
                                     'Field3', 'Field4']))

    def test_header_with_index_col(self):
        data = """foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
        names = ['A', 'B', 'C']
        df = self.read_csv(StringIO(data), names=names)

        assert list(df.columns) == ['A', 'B', 'C']

        values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
        expected = DataFrame(values, index=['foo', 'bar', 'baz'],
                             columns=['A', 'B', 'C'])
        tm.assert_frame_equal(df, expected)

    def test_header_not_first_line(self):
        data = """got,to,ignore,this,line
got,to,ignore,this,line
index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
        data2 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""

        df = self.read_csv(StringIO(data), header=2, index_col=0)
        expected = self.read_csv(StringIO(data2), header=0, index_col=0)
        tm.assert_frame_equal(df, expected)

    def test_header_multi_index(self):
        expected = tm.makeCustomDataframe(
            5, 3, r_idx_nlevels=2, c_idx_nlevels=4)

        data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""

        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
                           index_col=[0, 1])
        tm.assert_frame_equal(df, expected)

        # skipping lines in the header
        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
                           index_col=[0, 1])
        tm.assert_frame_equal(df, expected)

        # INVALID OPTIONS

        # names
        pytest.raises(ValueError, self.read_csv,
                      StringIO(data), header=[0, 1, 2, 3],
                      index_col=[0, 1], names=['foo', 'bar'])

        # usecols
        pytest.raises(ValueError, self.read_csv,
                      StringIO(data), header=[0, 1, 2, 3],
                      index_col=[0, 1], usecols=['foo', 'bar'])

        # non-numeric index_col
        pytest.raises(ValueError, self.read_csv,
                      StringIO(data), header=[0, 1, 2, 3],
                      index_col=['foo', 'bar'])

    def test_header_multiindex_common_format(self):

        df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
                       index=['one', 'two'],
                       columns=MultiIndex.from_tuples(
                           [('a', 'q'), ('a', 'r'), ('a', 's'),
                            ('b', 't'), ('c', 'u'), ('c', 'v')]))

        # to_csv
        data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
        tm.assert_frame_equal(df, result)

        # common
        data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
        tm.assert_frame_equal(df, result)

        # common, no index_col
        data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=None)
        tm.assert_frame_equal(df.reset_index(drop=True), result)

        # malformed case 1
        expected = DataFrame(np.array(
            [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'),
            index=Index([1, 7]),
            columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
                                       [u('r'), u('s'), u('t'),
                                        u('u'), u('v')]],
                               labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                               names=[u('a'), u('q')]))

        data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
        tm.assert_frame_equal(expected, result)

        # malformed case 2
        expected = DataFrame(np.array(
            [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'),
            index=Index([1, 7]),
            columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
                                       [u('r'), u('s'), u('t'),
                                        u('u'), u('v')]],
                               labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
                               names=[None, u('q')]))

        data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
        tm.assert_frame_equal(expected, result)

        # mi on columns and index (malformed)
        expected = DataFrame(np.array(
            [[3, 4, 5, 6], [9, 10, 11, 12]], dtype='int64'),
            index=MultiIndex(levels=[[1, 7], [2, 8]],
                             labels=[[0, 1], [0, 1]]),
            columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
                                       [u('s'), u('t'), u('u'), u('v')]],
                               labels=[[0, 1, 2, 2], [0, 1, 2, 3]],
                               names=[None, u('q')]))

        data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

        result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
        tm.assert_frame_equal(expected, result)

    def test_header_names_backward_compat(self):
        # #2539
        data = '1,2,3\n4,5,6'

        result = self.read_csv(StringIO(data), names=['a', 'b', 'c'])
        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                 header=None)
        tm.assert_frame_equal(result, expected)

        data2 = 'foo,bar,baz\n' + data
        result = self.read_csv(StringIO(data2), names=['a', 'b', 'c'],
                               header=0)
        tm.assert_frame_equal(result, expected)

    def test_read_only_header_no_rows(self):
        # See gh-7773
        expected = DataFrame(columns=['a', 'b', 'c'])

        df = self.read_csv(StringIO('a,b,c'))
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO('a,b,c'), index_col=False)
        tm.assert_frame_equal(df, expected)

    def test_no_header(self):
        data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
        df = self.read_table(StringIO(data), sep=',', header=None)
        df_pref = self.read_table(StringIO(data), sep=',', prefix='X',
                                  header=None)

        names = ['foo', 'bar', 'baz', 'quux', 'panda']
        df2 = self.read_table(StringIO(data), sep=',', names=names)
        expected = np.array([[1, 2, 3, 4, 5],
                             [6, 7, 8, 9, 10],
                             [11, 12, 13, 14, 15]], dtype=np.int64)
        tm.assert_almost_equal(df.values, expected)
        tm.assert_almost_equal(df.values, df2.values)

        tm.assert_index_equal(df_pref.columns,
                              Index(['X0', 'X1', 'X2', 'X3', 'X4']))
        tm.assert_index_equal(df.columns, Index(lrange(5)))

        tm.assert_index_equal(df2.columns, Index(names))

    def test_non_int_header(self):
        # GH 16338
        msg = 'header must be integer or list of integers'
        data = """1,2\n3,4"""
        with tm.assert_raises_regex(ValueError, msg):
            self.read_csv(StringIO(data), sep=',', header=['a', 'b'])
        with tm.assert_raises_regex(ValueError, msg):
            self.read_csv(StringIO(data), sep=',', header='string_header')

    def test_singleton_header(self):
        # See GH #7757
        data = """a,b,c\n0,1,2\n1,2,3"""
        df = self.read_csv(StringIO(data), header=[0])
        expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
        tm.assert_frame_equal(df, expected)

    def test_mangles_multi_index(self):
        # See GH 18062
        data = """A,A,A,B\none,one,one,two\n0,40,34,0.1"""
        df = self.read_csv(StringIO(data), header=[0, 1])
        expected = DataFrame([[0, 40, 34, 0.1]],
                             columns=MultiIndex.from_tuples(
                                 [('A', 'one'), ('A', 'one.1'),
                                  ('A', 'one.2'), ('B', 'two')]))
        tm.assert_frame_equal(df, expected)

        data = """A,A,A,B\none,one,one.1,two\n0,40,34,0.1"""
        df = self.read_csv(StringIO(data), header=[0, 1])
        expected = DataFrame([[0, 40, 34, 0.1]],
                             columns=MultiIndex.from_tuples(
                                 [('A', 'one'), ('A', 'one.1'),
                                  ('A', 'one.1.1'), ('B', 'two')]))
        tm.assert_frame_equal(df, expected)

        data = """A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1"""
        df = self.read_csv(StringIO(data), header=[0, 1])
        expected = DataFrame([[0, 40, 34, 0.1, 0.1]],
                             columns=MultiIndex.from_tuples(
                                 [('A', 'one'), ('A', 'one.1'),
                                  ('A', 'one.1.1'), ('B', 'two'),
                                  ('B', 'two.1')]))
        tm.assert_frame_equal(df, expected)
@@ -0,0 +1,143 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that the specified index column (a.k.a. 'index_col')
|
||||
is properly handled or inferred during parsing for all of
|
||||
the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex
|
||||
from pandas.compat import StringIO
|
||||
|
||||
|
||||
class IndexColTests(object):
|
||||
|
||||
def test_index_col_named(self):
|
||||
no_header = """\
|
||||
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
|
||||
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
|
||||
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
|
||||
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
|
||||
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
|
||||
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa
|
||||
|
||||
h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" # noqa
|
||||
data = h + no_header
|
||||
rs = self.read_csv(StringIO(data), index_col='ID')
|
||||
xp = self.read_csv(StringIO(data), header=0).set_index('ID')
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
pytest.raises(ValueError, self.read_csv, StringIO(no_header),
|
||||
index_col='ID')
|
||||
|
||||
data = """\
|
||||
1,2,3,4,hello
|
||||
5,6,7,8,world
|
||||
9,10,11,12,foo
|
||||
"""
|
||||
names = ['a', 'b', 'c', 'd', 'message']
|
||||
xp = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10], 'c': [3, 7, 11],
|
||||
'd': [4, 8, 12]},
|
||||
index=Index(['hello', 'world', 'foo'], name='message'))
|
||||
rs = self.read_csv(StringIO(data), names=names, index_col=['message'])
|
||||
tm.assert_frame_equal(xp, rs)
|
||||
assert xp.index.name == rs.index.name
|
||||
|
||||
rs = self.read_csv(StringIO(data), names=names, index_col='message')
|
||||
tm.assert_frame_equal(xp, rs)
|
||||
assert xp.index.name == rs.index.name
|
||||
|
||||
def test_index_col_is_true(self):
|
||||
# see gh-9798
|
||||
pytest.raises(ValueError, self.read_csv,
|
||||
StringIO(self.ts_data), index_col=True)
|
||||
|
||||
def test_infer_index_col(self):
|
||||
data = """A,B,C
|
||||
foo,1,2,3
|
||||
bar,4,5,6
|
||||
baz,7,8,9
|
||||
"""
|
||||
data = self.read_csv(StringIO(data))
|
||||
assert data.index.equals(Index(['foo', 'bar', 'baz']))
|
||||
|
||||
def test_empty_index_col_scenarios(self):
|
||||
data = 'x,y,z'
|
||||
|
||||
# None, no index
|
||||
index_col, expected = None, DataFrame([], columns=list('xyz'))
|
||||
tm.assert_frame_equal(self.read_csv(
|
||||
StringIO(data), index_col=index_col), expected)
|
||||
|
||||
# False, no index
|
||||
index_col, expected = False, DataFrame([], columns=list('xyz'))
|
||||
tm.assert_frame_equal(self.read_csv(
|
||||
StringIO(data), index_col=index_col), expected)
|
||||
|
||||
# int, first column
|
||||
index_col, expected = 0, DataFrame(
|
||||
[], columns=['y', 'z'], index=Index([], name='x'))
|
||||
tm.assert_frame_equal(self.read_csv(
|
||||
StringIO(data), index_col=index_col), expected)
|
||||
|
||||
# int, not first column
|
||||
index_col, expected = 1, DataFrame(
|
||||
[], columns=['x', 'z'], index=Index([], name='y'))
|
||||
tm.assert_frame_equal(self.read_csv(
|
||||
StringIO(data), index_col=index_col), expected)
|
||||
|
||||
# str, first column
|
||||
index_col, expected = 'x', DataFrame(
|
||||
[], columns=['y', 'z'], index=Index([], name='x'))
|
||||
tm.assert_frame_equal(self.read_csv(
|
||||
StringIO(data), index_col=index_col), expected)
|
||||
|
||||
# str, not the first column
|
||||
index_col, expected = 'y', DataFrame(
|
||||
[], columns=['x', 'z'], index=Index([], name='y'))
|
||||
tm.assert_frame_equal(self.read_csv(
|
||||
StringIO(data), index_col=index_col), expected)
|
||||
|
||||
# list of int
|
||||
index_col, expected = [0, 1], DataFrame(
|
||||
[], columns=['z'], index=MultiIndex.from_arrays(
|
||||
[[]] * 2, names=['x', 'y']))
|
||||
tm.assert_frame_equal(self.read_csv(
|
||||
StringIO(data), index_col=index_col),
|
||||
expected, check_index_type=False)
|
||||
|
||||
# list of str
|
||||
index_col = ['x', 'y']
|
||||
expected = DataFrame([], columns=['z'],
|
||||
index=MultiIndex.from_arrays(
|
||||
[[]] * 2, names=['x', 'y']))
|
||||
tm.assert_frame_equal(self.read_csv(StringIO(
|
||||
data), index_col=index_col),
|
||||
expected, check_index_type=False)
|
||||
|
||||
# list of int, reversed sequence
|
||||
index_col = [1, 0]
|
||||
expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays(
|
||||
[[]] * 2, names=['y', 'x']))
|
||||
tm.assert_frame_equal(self.read_csv(
|
||||
StringIO(data), index_col=index_col),
|
||||
expected, check_index_type=False)
|
||||
|
||||
# list of str, reversed sequence
|
||||
index_col = ['y', 'x']
|
||||
expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays(
|
||||
[[]] * 2, names=['y', 'x']))
|
||||
tm.assert_frame_equal(self.read_csv(StringIO(
|
||||
data), index_col=index_col),
|
||||
expected, check_index_type=False)
|
||||
|
||||
def test_empty_with_index_col_false(self):
|
||||
# see gh-10413
|
||||
data = 'x,y'
|
||||
result = self.read_csv(StringIO(data), index_col=False)
|
||||
expected = DataFrame([], columns=['x', 'y'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,88 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that duplicate columns are handled appropriately when parsed by the
|
||||
CSV engine. In general, the expected result is that they are either thoroughly
|
||||
de-duplicated (if mangling requested) or ignored otherwise.
|
||||
"""
|
||||
|
||||
from pandas.compat import StringIO
|
||||
from pandas import DataFrame
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
class DupeColumnTests(object):
|
||||
def test_basic(self):
|
||||
# TODO: add test for condition "mangle_dupe_cols=False"
|
||||
# once it is actually supported (gh-12935)
|
||||
data = "a,a,b,b,b\n1,2,3,4,5"
|
||||
|
||||
for method in ("read_csv", "read_table"):
|
||||
# Check default behavior.
|
||||
expected = ["a", "a.1", "b", "b.1", "b.2"]
|
||||
df = getattr(self, method)(StringIO(data), sep=",")
|
||||
assert list(df.columns) == expected
|
||||
|
||||
df = getattr(self, method)(StringIO(data), sep=",",
|
||||
mangle_dupe_cols=True)
|
||||
assert list(df.columns) == expected
|
||||
|
||||
def test_basic_names(self):
|
||||
# See gh-7160
|
||||
data = "a,b,a\n0,1,2\n3,4,5"
|
||||
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
|
||||
columns=["a", "b", "a.1"])
|
||||
|
||||
df = self.read_csv(StringIO(data))
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
|
||||
data = "0,1,2\n3,4,5"
|
||||
df = self.read_csv(StringIO(data),
|
||||
names=["a", "b", "a"])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_thorough_mangle_columns(self):
|
||||
# see gh-17060
|
||||
data = "a,a,a.1\n1,2,3"
|
||||
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
|
||||
assert list(df.columns) == ["a", "a.1", "a.1.1"]
|
||||
|
||||
data = "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6"
|
||||
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
|
||||
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
|
||||
"a.1.1.1.1", "a.1.1.1.1.1"]
|
||||
|
||||
data = "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7"
|
||||
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
|
||||
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
|
||||
"a.2", "a.2.1", "a.3.1"]
|
||||
|
||||
def test_thorough_mangle_names(self):
|
||||
# see gh-17095
|
||||
data = "a,b,b\n1,2,3"
|
||||
names = ["a.1", "a.1", "a.1.1"]
|
||||
|
||||
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
|
||||
df = self.read_csv(StringIO(data), sep=",", names=names,
|
||||
mangle_dupe_cols=True)
|
||||
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
|
||||
|
||||
data = "a,b,c,d,e,f\n1,2,3,4,5,6"
|
||||
names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]
|
||||
|
||||
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
|
||||
df = self.read_csv(StringIO(data), sep=",", names=names,
|
||||
mangle_dupe_cols=True)
|
||||
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
|
||||
"a.1.1.1.1", "a.1.1.1.1.1"]
|
||||
|
||||
data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
|
||||
names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]
|
||||
|
||||
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
|
||||
df = self.read_csv(StringIO(data), sep=",", names=names,
|
||||
mangle_dupe_cols=True)
|
||||
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
|
||||
"a.2", "a.2.1", "a.3.1"]
|
||||
@@ -0,0 +1,99 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests multithreading behaviour for reading and
|
||||
parsing files for each parser defined in parsers.py
|
||||
"""
|
||||
|
||||
from __future__ import division
|
||||
from multiprocessing.pool import ThreadPool
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.compat import BytesIO, range
|
||||
|
||||
|
||||
def _construct_dataframe(num_rows):
|
||||
|
||||
df = DataFrame(np.random.rand(num_rows, 5), columns=list('abcde'))
|
||||
df['foo'] = 'foo'
|
||||
df['bar'] = 'bar'
|
||||
df['baz'] = 'baz'
|
||||
df['date'] = pd.date_range('20000101 09:00:00',
|
||||
periods=num_rows,
|
||||
freq='s')
|
||||
df['int'] = np.arange(num_rows, dtype='int64')
|
||||
return df
|
||||
|
||||
|
||||
class MultithreadTests(object):
|
||||
|
||||
def _generate_multithread_dataframe(self, path, num_rows, num_tasks):
|
||||
|
||||
def reader(arg):
|
||||
start, nrows = arg
|
||||
|
||||
if not start:
|
||||
return self.read_csv(path, index_col=0, header=0,
|
||||
nrows=nrows, parse_dates=['date'])
|
||||
|
||||
return self.read_csv(path,
|
||||
index_col=0,
|
||||
header=None,
|
||||
skiprows=int(start) + 1,
|
||||
nrows=nrows,
|
||||
parse_dates=[9])
|
||||
|
||||
tasks = [
|
||||
(num_rows * i // num_tasks,
|
||||
num_rows // num_tasks) for i in range(num_tasks)
|
||||
]
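# Each task is a (start_row, nrows) pair covering a contiguous slice of the
# file. The first task also reads the header; the others skip start + 1
# physical lines (the extra line being the header) and read positionally.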
|
||||
|
||||
pool = ThreadPool(processes=num_tasks)
|
||||
|
||||
results = pool.map(reader, tasks)
|
||||
|
||||
header = results[0].columns
|
||||
for r in results[1:]:
|
||||
r.columns = header
|
||||
|
||||
final_dataframe = pd.concat(results)
|
||||
|
||||
return final_dataframe
|
||||
|
||||
def test_multithread_stringio_read_csv(self):
|
||||
# see gh-11786
|
||||
max_row_range = 10000
|
||||
num_files = 100
|
||||
|
||||
bytes_to_df = [
|
||||
'\n'.join(
|
||||
['%d,%d,%d' % (i, i, i) for i in range(max_row_range)]
|
||||
).encode() for j in range(num_files)]
|
||||
files = [BytesIO(b) for b in bytes_to_df]
|
||||
|
||||
# read all files in many threads
|
||||
pool = ThreadPool(8)
|
||||
results = pool.map(self.read_csv, files)
|
||||
first_result = results[0]
|
||||
|
||||
for result in results:
|
||||
tm.assert_frame_equal(first_result, result)
|
||||
|
||||
def test_multithread_path_multipart_read_csv(self):
|
||||
# see gh-11786
|
||||
num_tasks = 4
|
||||
file_name = '__threadpool_reader__.csv'
|
||||
num_rows = 100000
|
||||
|
||||
df = _construct_dataframe(num_rows)
|
||||
|
||||
with tm.ensure_clean(file_name) as path:
|
||||
df.to_csv(path)
|
||||
|
||||
final_dataframe = self._generate_multithread_dataframe(
|
||||
path, num_rows, num_tasks)
|
||||
tm.assert_frame_equal(df, final_dataframe)
|
||||
@@ -0,0 +1,371 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that NA values are properly handled during
|
||||
parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from numpy import nan
|
||||
|
||||
import pandas.io.common as com
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex
|
||||
from pandas.compat import StringIO, range
|
||||
|
||||
|
||||
class NAvaluesTests(object):
|
||||
|
||||
def test_string_nas(self):
|
||||
data = """A,B,C
|
||||
a,b,c
|
||||
d,,f
|
||||
,g,h
|
||||
"""
|
||||
result = self.read_csv(StringIO(data))
|
||||
expected = DataFrame([['a', 'b', 'c'],
|
||||
['d', np.nan, 'f'],
|
||||
[np.nan, 'g', 'h']],
|
||||
columns=['A', 'B', 'C'])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_detect_string_na(self):
|
||||
data = """A,B
|
||||
foo,bar
|
||||
NA,baz
|
||||
NaN,nan
|
||||
"""
|
||||
expected = np.array([['foo', 'bar'], [nan, 'baz'], [nan, nan]],
|
||||
dtype=np.object_)
|
||||
df = self.read_csv(StringIO(data))
|
||||
tm.assert_numpy_array_equal(df.values, expected)
|
||||
|
||||
def test_non_string_na_values(self):
|
||||
# see gh-3611: with an odd float format, we can't match
|
||||
# the string '999.0' exactly but still need float matching
|
||||
nice = """A,B
|
||||
-999,1.2
|
||||
2,-999
|
||||
3,4.5
|
||||
"""
|
||||
ugly = """A,B
|
||||
-999,1.200
|
||||
2,-999.000
|
||||
3,4.500
|
||||
"""
|
||||
na_values_param = [['-999.0', '-999'],
|
||||
[-999, -999.0],
|
||||
[-999.0, -999],
|
||||
['-999.0'], ['-999'],
|
||||
[-999.0], [-999]]
|
||||
expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
|
||||
[3.0, 4.5]], columns=['A', 'B'])
|
||||
|
||||
for data in (nice, ugly):
|
||||
for na_values in na_values_param:
|
||||
out = self.read_csv(StringIO(data), na_values=na_values)
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
def test_default_na_values(self):
|
||||
_NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
|
||||
'#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null',
|
||||
'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', ''])
|
||||
assert _NA_VALUES == com._NA_VALUES
|
||||
nv = len(_NA_VALUES)
|
||||
|
||||
def f(i, v):
|
||||
if i == 0:
|
||||
buf = ''
|
||||
elif i > 0:
|
||||
buf = ''.join([','] * i)
|
||||
|
||||
buf = "{0}{1}".format(buf, v)
|
||||
|
||||
if i < nv - 1:
|
||||
buf = "{0}{1}".format(buf, ''.join([','] * (nv - i - 1)))
|
||||
|
||||
return buf
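# f(i, v) builds row i with the i-th default NA string in column i and
# empty fields everywhere else, so the parsed result should be an
# nv x nv frame that is entirely NaN.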
|
||||
|
||||
data = StringIO('\n'.join(f(i, v) for i, v in enumerate(_NA_VALUES)))
|
||||
expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
|
||||
df = self.read_csv(data, header=None)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_custom_na_values(self):
|
||||
data = """A,B,C
|
||||
ignore,this,row
|
||||
1,NA,3
|
||||
-1.#IND,5,baz
|
||||
7,8,NaN
|
||||
"""
|
||||
expected = np.array([[1., nan, 3],
|
||||
[nan, 5, nan],
|
||||
[7, 8, nan]])
|
||||
|
||||
df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1])
|
||||
tm.assert_numpy_array_equal(df.values, expected)
|
||||
|
||||
df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'],
|
||||
skiprows=[1])
|
||||
tm.assert_numpy_array_equal(df2.values, expected)
|
||||
|
||||
df3 = self.read_table(StringIO(data), sep=',', na_values='baz',
|
||||
skiprows=[1])
|
||||
tm.assert_numpy_array_equal(df3.values, expected)
|
||||
|
||||
def test_bool_na_values(self):
|
||||
data = """A,B,C
|
||||
True,False,True
|
||||
NA,True,False
|
||||
False,NA,True"""
|
||||
|
||||
result = self.read_csv(StringIO(data))
|
||||
expected = DataFrame({'A': np.array([True, nan, False], dtype=object),
|
||||
'B': np.array([False, True, nan], dtype=object),
|
||||
'C': [True, False, True]})
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_na_value_dict(self):
|
||||
data = """A,B,C
|
||||
foo,bar,NA
|
||||
bar,foo,foo
|
||||
foo,bar,NA
|
||||
bar,foo,foo"""
|
||||
|
||||
df = self.read_csv(StringIO(data),
|
||||
na_values={'A': ['foo'], 'B': ['bar']})
|
||||
expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
|
||||
'B': [np.nan, 'foo', np.nan, 'foo'],
|
||||
'C': [np.nan, 'foo', np.nan, 'foo']})
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
data = """\
|
||||
a,b,c,d
|
||||
0,NA,1,5
|
||||
"""
|
||||
xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0])
|
||||
xp.index.name = 'a'
|
||||
df = self.read_csv(StringIO(data), na_values={}, index_col=0)
|
||||
tm.assert_frame_equal(df, xp)
|
||||
|
||||
xp = DataFrame({'b': [np.nan], 'd': [5]},
|
||||
MultiIndex.from_tuples([(0, 1)]))
|
||||
xp.index.names = ['a', 'c']
|
||||
df = self.read_csv(StringIO(data), na_values={}, index_col=[0, 2])
|
||||
tm.assert_frame_equal(df, xp)
|
||||
|
||||
xp = DataFrame({'b': [np.nan], 'd': [5]},
|
||||
MultiIndex.from_tuples([(0, 1)]))
|
||||
xp.index.names = ['a', 'c']
|
||||
df = self.read_csv(StringIO(data), na_values={}, index_col=['a', 'c'])
|
||||
tm.assert_frame_equal(df, xp)
|
||||
|
||||
def test_na_values_keep_default(self):
|
||||
data = """\
|
||||
One,Two,Three
|
||||
a,1,one
|
||||
b,2,two
|
||||
,3,three
|
||||
d,4,nan
|
||||
e,5,five
|
||||
nan,6,
|
||||
g,7,seven
|
||||
"""
|
||||
df = self.read_csv(StringIO(data))
|
||||
xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
|
||||
'Two': [1, 2, 3, 4, 5, 6, 7],
|
||||
'Three': ['one', 'two', 'three', np.nan, 'five',
|
||||
np.nan, 'seven']})
|
||||
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
|
||||
|
||||
df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []},
|
||||
keep_default_na=False)
|
||||
xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
|
||||
'Two': [1, 2, 3, 4, 5, 6, 7],
|
||||
'Three': ['one', 'two', 'three', 'nan', 'five',
|
||||
'', 'seven']})
|
||||
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
|
||||
|
||||
df = self.read_csv(
|
||||
StringIO(data), na_values=['a'], keep_default_na=False)
|
||||
xp = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'],
|
||||
'Two': [1, 2, 3, 4, 5, 6, 7],
|
||||
'Three': ['one', 'two', 'three', 'nan', 'five', '',
|
||||
'seven']})
|
||||
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
|
||||
|
||||
df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []})
|
||||
xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
|
||||
'Two': [1, 2, 3, 4, 5, 6, 7],
|
||||
'Three': ['one', 'two', 'three', np.nan, 'five',
|
||||
np.nan, 'seven']})
|
||||
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
|
||||
|
||||
# see gh-4318: passing na_values=None and
|
||||
# keep_default_na=False yields 'None' as a na_value
|
||||
data = """\
|
||||
One,Two,Three
|
||||
a,1,None
|
||||
b,2,two
|
||||
,3,None
|
||||
d,4,nan
|
||||
e,5,five
|
||||
nan,6,
|
||||
g,7,seven
|
||||
"""
|
||||
df = self.read_csv(
|
||||
StringIO(data), keep_default_na=False)
|
||||
xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
|
||||
'Two': [1, 2, 3, 4, 5, 6, 7],
|
||||
'Three': ['None', 'two', 'None', 'nan', 'five', '',
|
||||
'seven']})
|
||||
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
|
||||
|
||||
def test_no_keep_default_na_dict_na_values(self):
|
||||
# see gh-19227
|
||||
data = "a,b\n,2"
|
||||
|
||||
df = self.read_csv(StringIO(data), na_values={"b": ["2"]},
|
||||
keep_default_na=False)
|
||||
expected = DataFrame({"a": [""], "b": [np.nan]})
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
# Scalar values shouldn't cause the parsing to crash or fail.
|
||||
data = "a,b\n1,2"
|
||||
|
||||
df = self.read_csv(StringIO(data), na_values={"b": 2},
|
||||
keep_default_na=False)
|
||||
expected = DataFrame({"a": [1], "b": [np.nan]})
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
data = """\
|
||||
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
|
||||
729639,"qwer","",asdfkj,466.681,,252.373
|
||||
"""
|
||||
expected = DataFrame({0: [np.nan, 729639.0],
|
||||
1: [np.nan, "qwer"],
|
||||
2: ["/blaha", np.nan],
|
||||
3: ["kjsdkj", "asdfkj"],
|
||||
4: [412.166, 466.681],
|
||||
5: ["225.874", ""],
|
||||
6: [np.nan, 252.373]})
|
||||
|
||||
df = self.read_csv(StringIO(data), header=None, keep_default_na=False,
|
||||
na_values={2: "", 6: "214.008",
|
||||
1: "blah", 0: 113125})
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
df = self.read_csv(StringIO(data), header=None, keep_default_na=False,
|
||||
na_values={2: "", 6: "214.008",
|
||||
1: "blah", 0: "113125"})
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_na_values_na_filter_override(self):
|
||||
data = """\
|
||||
A,B
|
||||
1,A
|
||||
nan,B
|
||||
3,C
|
||||
"""
|
||||
|
||||
expected = DataFrame([[1, 'A'], [np.nan, np.nan], [3, 'C']],
|
||||
columns=['A', 'B'])
|
||||
out = self.read_csv(StringIO(data), na_values=['B'], na_filter=True)
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
expected = DataFrame([['1', 'A'], ['nan', 'B'], ['3', 'C']],
|
||||
columns=['A', 'B'])
|
||||
out = self.read_csv(StringIO(data), na_values=['B'], na_filter=False)
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
def test_na_trailing_columns(self):
|
||||
data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax
|
||||
2012-03-14,USD,AAPL,BUY,1000
|
||||
2012-05-12,USD,SBUX,SELL,500"""
|
||||
|
||||
result = self.read_csv(StringIO(data))
|
||||
assert result['Date'][1] == '2012-05-12'
|
||||
assert result['UnitPrice'].isna().all()
|
||||
|
||||
def test_na_values_scalar(self):
|
||||
# see gh-12224
|
||||
names = ['a', 'b']
|
||||
data = '1,2\n2,1'
|
||||
|
||||
expected = DataFrame([[np.nan, 2.0], [2.0, np.nan]],
|
||||
columns=names)
|
||||
out = self.read_csv(StringIO(data), names=names, na_values=1)
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]],
|
||||
columns=names)
|
||||
out = self.read_csv(StringIO(data), names=names,
|
||||
na_values={'a': 2, 'b': 1})
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
def test_na_values_dict_aliasing(self):
|
||||
na_values = {'a': 2, 'b': 1}
|
||||
na_values_copy = na_values.copy()
|
||||
|
||||
names = ['a', 'b']
|
||||
data = '1,2\n2,1'
|
||||
|
||||
expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
|
||||
out = self.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
|
||||
tm.assert_frame_equal(out, expected)
|
||||
tm.assert_dict_equal(na_values, na_values_copy)
|
||||
|
||||
def test_na_values_dict_col_index(self):
|
||||
# see gh-14203
|
||||
|
||||
data = 'a\nfoo\n1'
|
||||
na_values = {0: 'foo'}
|
||||
|
||||
out = self.read_csv(StringIO(data), na_values=na_values)
|
||||
expected = DataFrame({'a': [np.nan, 1]})
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
def test_na_values_uint64(self):
|
||||
# see gh-14983
|
||||
|
||||
na_values = [2**63]
|
||||
data = str(2**63) + '\n' + str(2**63 + 1)
|
||||
expected = DataFrame([str(2**63), str(2**63 + 1)])
|
||||
out = self.read_csv(StringIO(data), header=None, na_values=na_values)
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
data = str(2**63) + ',1' + '\n,2'
|
||||
expected = DataFrame([[str(2**63), 1], ['', 2]])
|
||||
out = self.read_csv(StringIO(data), header=None)
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
def test_empty_na_values_no_default_with_index(self):
|
||||
# see gh-15835
|
||||
data = "a,1\nb,2"
|
||||
|
||||
expected = DataFrame({'1': [2]}, index=Index(["b"], name="a"))
|
||||
out = self.read_csv(StringIO(data), keep_default_na=False, index_col=0)
|
||||
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
def test_no_na_filter_on_index(self):
|
||||
# see gh-5239
|
||||
data = "a,b,c\n1,,3\n4,5,6"
|
||||
|
||||
# Don't parse NA-values in index when na_filter=False.
|
||||
out = self.read_csv(StringIO(data), index_col=[1], na_filter=False)
|
||||
|
||||
expected = DataFrame({"a": [1, 4], "c": [3, 6]},
|
||||
index=Index(["", "5"], name="b"))
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
# Parse NA-values in index when na_filter=True.
|
||||
out = self.read_csv(StringIO(data), index_col=[1], na_filter=True)
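# With na_filter=True the empty field in the index becomes NaN, which in
# turn forces the otherwise-integer index to float64.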
|
||||
|
||||
expected = DataFrame({"a": [1, 4], "c": [3, 6]},
|
||||
index=Index([np.nan, 5.0], name="b"))
|
||||
tm.assert_frame_equal(out, expected)
|
||||
@@ -0,0 +1,676 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests date parsing functionality for all of the
|
||||
parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
from distutils.version import LooseVersion
|
||||
from datetime import datetime, date
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
from pandas._libs.tslibs import parsing
|
||||
from pandas._libs.tslib import Timestamp
|
||||
|
||||
import pandas as pd
|
||||
import pandas.io.parsers as parsers
|
||||
import pandas.core.tools.datetimes as tools
|
||||
import pandas.util.testing as tm
|
||||
|
||||
import pandas.io.date_converters as conv
|
||||
from pandas import DataFrame, Series, Index, DatetimeIndex, MultiIndex
|
||||
from pandas import compat
|
||||
from pandas.compat import parse_date, StringIO, lrange
|
||||
from pandas.compat.numpy import np_array_datetime64_compat
|
||||
from pandas.core.indexes.datetimes import date_range
|
||||
|
||||
|
||||
class ParseDatesTests(object):
|
||||
|
||||
def test_separator_date_conflict(self):
|
||||
# Regression test for gh-4678: make sure thousands separator and
|
||||
# date parsing do not conflict.
|
||||
data = '06-02-2013;13:00;1-000.215'
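# Columns 0 and 1 are merged into a single 'Date' column via the
# parse_dates dict, and the '-' thousands separator should only be applied
# to the remaining numeric field, not inside the date components.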
|
||||
expected = DataFrame(
|
||||
[[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
|
||||
columns=['Date', 2]
|
||||
)
|
||||
|
||||
df = self.read_csv(StringIO(data), sep=';', thousands='-',
|
||||
parse_dates={'Date': [0, 1]}, header=None)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_multiple_date_col(self):
|
||||
# Can use multiple date parsers
|
||||
data = """\
|
||||
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
|
||||
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
|
||||
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
|
||||
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
|
||||
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
|
||||
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
|
||||
"""
|
||||
|
||||
def func(*date_cols):
|
||||
res = parsing.try_parse_dates(parsers._concat_date_cols(date_cols))
|
||||
return res
|
||||
|
||||
df = self.read_csv(StringIO(data), header=None,
|
||||
date_parser=func,
|
||||
prefix='X',
|
||||
parse_dates={'nominal': [1, 2],
|
||||
'actual': [1, 3]})
|
||||
assert 'nominal' in df
|
||||
assert 'actual' in df
|
||||
assert 'X1' not in df
|
||||
assert 'X2' not in df
|
||||
assert 'X3' not in df
|
||||
|
||||
d = datetime(1999, 1, 27, 19, 0)
|
||||
assert df.loc[0, 'nominal'] == d
|
||||
|
||||
df = self.read_csv(StringIO(data), header=None,
|
||||
date_parser=func,
|
||||
parse_dates={'nominal': [1, 2],
|
||||
'actual': [1, 3]},
|
||||
keep_date_col=True)
|
||||
assert 'nominal' in df
|
||||
assert 'actual' in df
|
||||
|
||||
assert 1 in df
|
||||
assert 2 in df
|
||||
assert 3 in df
|
||||
|
||||
data = """\
|
||||
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
|
||||
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
|
||||
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
|
||||
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
|
||||
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
|
||||
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
|
||||
"""
|
||||
df = self.read_csv(StringIO(data), header=None,
|
||||
prefix='X', parse_dates=[[1, 2], [1, 3]])
|
||||
|
||||
assert 'X1_X2' in df
|
||||
assert 'X1_X3' in df
|
||||
assert 'X1' not in df
|
||||
assert 'X2' not in df
|
||||
assert 'X3' not in df
|
||||
|
||||
d = datetime(1999, 1, 27, 19, 0)
|
||||
assert df.loc[0, 'X1_X2'] == d
|
||||
|
||||
df = self.read_csv(StringIO(data), header=None,
|
||||
parse_dates=[[1, 2], [1, 3]], keep_date_col=True)
|
||||
|
||||
assert '1_2' in df
|
||||
assert '1_3' in df
|
||||
assert 1 in df
|
||||
assert 2 in df
|
||||
assert 3 in df
|
||||
|
||||
data = '''\
|
||||
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
|
||||
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
|
||||
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
|
||||
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
|
||||
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
|
||||
'''
|
||||
df = self.read_csv(StringIO(data), sep=',', header=None,
|
||||
parse_dates=[1], index_col=1)
|
||||
d = datetime(1999, 1, 27, 19, 0)
|
||||
assert df.index[0] == d
|
||||
|
||||
def test_multiple_date_cols_int_cast(self):
|
||||
data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
|
||||
"KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
|
||||
"KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
|
||||
"KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
|
||||
"KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
|
||||
"KORD,19990127, 23:00:00, 22:56:00, -0.5900")
|
||||
date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
|
||||
import pandas.io.date_converters as conv
|
||||
|
||||
# it works!
|
||||
df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec,
|
||||
date_parser=conv.parse_date_time)
|
||||
assert 'nominal' in df
|
||||
|
||||
def test_multiple_date_col_timestamp_parse(self):
|
||||
data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
|
||||
05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""
|
||||
result = self.read_csv(StringIO(data), sep=',', header=None,
|
||||
parse_dates=[[0, 1]], date_parser=Timestamp)
|
||||
|
||||
ex_val = Timestamp('05/31/2012 15:30:00.029')
|
||||
assert result['0_1'][0] == ex_val
|
||||
|
||||
def test_multiple_date_cols_with_header(self):
|
||||
data = """\
|
||||
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
|
||||
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
|
||||
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
|
||||
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
|
||||
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
|
||||
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
|
||||
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
|
||||
|
||||
df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]})
|
||||
assert not isinstance(df.nominal[0], compat.string_types)
|
||||
|
||||
ts_data = """\
|
||||
ID,date,nominalTime,actualTime,A,B,C,D,E
|
||||
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
|
||||
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
|
||||
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
|
||||
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
|
||||
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
|
||||
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
|
||||
"""
|
||||
|
||||
def test_multiple_date_col_name_collision(self):
|
||||
with pytest.raises(ValueError):
|
||||
self.read_csv(StringIO(self.ts_data), parse_dates={'ID': [1, 2]})
|
||||
|
||||
data = """\
|
||||
date_NominalTime,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
|
||||
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
|
||||
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
|
||||
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
|
||||
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
|
||||
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
|
||||
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
self.read_csv(StringIO(data), parse_dates=[[1, 2]])
|
||||
|
||||
def test_date_parser_int_bug(self):
|
||||
# See gh-3071
|
||||
log_file = StringIO(
|
||||
'posix_timestamp,elapsed,sys,user,queries,query_time,rows,'
|
||||
'accountid,userid,contactid,level,silo,method\n'
|
||||
'1343103150,0.062353,0,4,6,0.01690,3,'
|
||||
'12345,1,-1,3,invoice_InvoiceResource,search\n'
|
||||
)
|
||||
|
||||
def f(posix_string):
|
||||
return datetime.utcfromtimestamp(int(posix_string))
|
||||
|
||||
# it works!
|
||||
self.read_csv(log_file, index_col=0, parse_dates=[0], date_parser=f)
|
||||
|
||||
def test_nat_parse(self):
|
||||
# See gh-3062
|
||||
df = DataFrame(dict({
|
||||
'A': np.asarray(lrange(10), dtype='float64'),
|
||||
'B': pd.Timestamp('20010101')}))
|
||||
df.iloc[3:6, :] = np.nan
|
||||
|
||||
with tm.ensure_clean('__nat_parse_.csv') as path:
|
||||
df.to_csv(path)
|
||||
result = self.read_csv(path, index_col=0, parse_dates=['B'])
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
expected = Series(dict(A='float64', B='datetime64[ns]'))
|
||||
tm.assert_series_equal(expected, result.dtypes)
|
||||
|
||||
# test with NaT for the nan_rep
|
||||
# we don't have a method to specify the Datetime na_rep
|
||||
# (it defaults to '')
|
||||
df.to_csv(path)
|
||||
result = self.read_csv(path, index_col=0, parse_dates=['B'])
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
def test_csv_custom_parser(self):
|
||||
data = """A,B,C
|
||||
20090101,a,1,2
|
||||
20090102,b,3,4
|
||||
20090103,c,4,5
|
||||
"""
|
||||
f = lambda x: datetime.strptime(x, '%Y%m%d')
|
||||
df = self.read_csv(StringIO(data), date_parser=f)
|
||||
expected = self.read_csv(StringIO(data), parse_dates=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_parse_dates_implicit_first_col(self):
|
||||
data = """A,B,C
|
||||
20090101,a,1,2
|
||||
20090102,b,3,4
|
||||
20090103,c,4,5
|
||||
"""
|
||||
df = self.read_csv(StringIO(data), parse_dates=True)
|
||||
expected = self.read_csv(StringIO(data), index_col=0, parse_dates=True)
|
||||
assert isinstance(
|
||||
df.index[0], (datetime, np.datetime64, Timestamp))
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_parse_dates_string(self):
|
||||
data = """date,A,B,C
|
||||
20090101,a,1,2
|
||||
20090102,b,3,4
|
||||
20090103,c,4,5
|
||||
"""
|
||||
rs = self.read_csv(
|
||||
StringIO(data), index_col='date', parse_dates=['date'])
|
||||
idx = date_range('1/1/2009', periods=3)
|
||||
idx.name = 'date'
|
||||
xp = DataFrame({'A': ['a', 'b', 'c'],
|
||||
'B': [1, 3, 4],
|
||||
'C': [2, 4, 5]}, idx)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
def test_yy_format_with_yearfirst(self):
|
||||
data = """date,time,B,C
|
||||
090131,0010,1,2
|
||||
090228,1020,3,4
|
||||
090331,0830,5,6
|
||||
"""
|
||||
|
||||
# See gh-217
|
||||
import dateutil
|
||||
if LooseVersion(dateutil.__version__) >= LooseVersion('2.5.0'):
|
||||
pytest.skip("testing yearfirst=True not-support"
|
||||
"on datetutil < 2.5.0 this works but"
|
||||
"is wrong")
|
||||
|
||||
rs = self.read_csv(StringIO(data), index_col=0,
|
||||
parse_dates=[['date', 'time']])
|
||||
idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
|
||||
datetime(2009, 2, 28, 10, 20, 0),
|
||||
datetime(2009, 3, 31, 8, 30, 0)],
|
||||
dtype=object, name='date_time')
|
||||
xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
rs = self.read_csv(StringIO(data), index_col=0,
|
||||
parse_dates=[[0, 1]])
|
||||
idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
|
||||
datetime(2009, 2, 28, 10, 20, 0),
|
||||
datetime(2009, 3, 31, 8, 30, 0)],
|
||||
dtype=object, name='date_time')
|
||||
xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)
|
||||
tm.assert_frame_equal(rs, xp)
|
||||
|
||||
def test_parse_dates_column_list(self):
|
||||
data = 'a,b,c\n01/01/2010,1,15/02/2010'
|
||||
|
||||
expected = DataFrame({'a': [datetime(2010, 1, 1)], 'b': [1],
|
||||
'c': [datetime(2010, 2, 15)]})
|
||||
expected = expected.set_index(['a', 'b'])
|
||||
|
||||
df = self.read_csv(StringIO(data), index_col=[0, 1],
|
||||
parse_dates=[0, 2], dayfirst=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
df = self.read_csv(StringIO(data), index_col=[0, 1],
|
||||
parse_dates=['a', 'c'], dayfirst=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_multi_index_parse_dates(self):
|
||||
data = """index1,index2,A,B,C
|
||||
20090101,one,a,1,2
|
||||
20090101,two,b,3,4
|
||||
20090101,three,c,4,5
|
||||
20090102,one,a,1,2
|
||||
20090102,two,b,3,4
|
||||
20090102,three,c,4,5
|
||||
20090103,one,a,1,2
|
||||
20090103,two,b,3,4
|
||||
20090103,three,c,4,5
|
||||
"""
|
||||
df = self.read_csv(StringIO(data), index_col=[0, 1], parse_dates=True)
|
||||
assert isinstance(df.index.levels[0][0],
|
||||
(datetime, np.datetime64, Timestamp))
|
||||
|
||||
# specify columns out of order!
|
||||
df2 = self.read_csv(StringIO(data), index_col=[1, 0], parse_dates=True)
|
||||
assert isinstance(df2.index.levels[1][0],
|
||||
(datetime, np.datetime64, Timestamp))
|
||||
|
||||
def test_parse_dates_custom_euroformat(self):
|
||||
text = """foo,bar,baz
|
||||
31/01/2010,1,2
|
||||
01/02/2010,1,NA
|
||||
02/02/2010,1,2
|
||||
"""
|
||||
parser = lambda d: parse_date(d, dayfirst=True)
|
||||
df = self.read_csv(StringIO(text),
|
||||
names=['time', 'Q', 'NTU'], header=0,
|
||||
index_col=0, parse_dates=True,
|
||||
date_parser=parser, na_values=['NA'])
|
||||
|
||||
exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1),
|
||||
datetime(2010, 2, 2)], name='time')
|
||||
expected = DataFrame({'Q': [1, 1, 1], 'NTU': [2, np.nan, 2]},
|
||||
index=exp_index, columns=['Q', 'NTU'])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
parser = lambda d: parse_date(d, day_first=True)
|
||||
pytest.raises(TypeError, self.read_csv,
|
||||
StringIO(text), skiprows=[0],
|
||||
names=['time', 'Q', 'NTU'], index_col=0,
|
||||
parse_dates=True, date_parser=parser,
|
||||
na_values=['NA'])
|
||||
|
||||
def test_parse_tz_aware(self):
|
||||
# See gh-1693
|
||||
import pytz
|
||||
data = StringIO("Date,x\n2012-06-13T01:39:00Z,0.5")
|
||||
|
||||
# it works
|
||||
result = self.read_csv(data, index_col=0, parse_dates=True)
|
||||
stamp = result.index[0]
|
||||
assert stamp.minute == 39
|
||||
try:
|
||||
assert result.index.tz is pytz.utc
|
||||
except AssertionError: # hello Yaroslav
|
||||
arr = result.index.to_pydatetime()
|
||||
result = tools.to_datetime(arr, utc=True)[0]
|
||||
assert stamp.minute == result.minute
|
||||
assert stamp.hour == result.hour
|
||||
assert stamp.day == result.day
|
||||
|
||||
def test_multiple_date_cols_index(self):
|
||||
data = """
|
||||
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
|
||||
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
|
||||
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
|
||||
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
|
||||
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
|
||||
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
|
||||
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
|
||||
"""
|
||||
|
||||
xp = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]})
|
||||
df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]},
|
||||
index_col='nominal')
|
||||
tm.assert_frame_equal(xp.set_index('nominal'), df)
|
||||
df2 = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]},
|
||||
index_col=0)
|
||||
tm.assert_frame_equal(df2, df)
|
||||
|
||||
df3 = self.read_csv(StringIO(data), parse_dates=[[1, 2]], index_col=0)
|
||||
tm.assert_frame_equal(df3, df, check_names=False)
|
||||
|
||||
def test_multiple_date_cols_chunked(self):
|
||||
df = self.read_csv(StringIO(self.ts_data), parse_dates={
|
||||
'nominal': [1, 2]}, index_col='nominal')
|
||||
reader = self.read_csv(StringIO(self.ts_data),
|
||||
parse_dates={'nominal': [1, 2]},
|
||||
index_col='nominal', chunksize=2)
|
||||
|
||||
chunks = list(reader)
|
||||
|
||||
assert 'nominalTime' not in df
|
||||
|
||||
tm.assert_frame_equal(chunks[0], df[:2])
|
||||
tm.assert_frame_equal(chunks[1], df[2:4])
|
||||
tm.assert_frame_equal(chunks[2], df[4:])
|
||||
|
||||
def test_multiple_date_col_named_components(self):
|
||||
xp = self.read_csv(StringIO(self.ts_data),
|
||||
parse_dates={'nominal': [1, 2]},
|
||||
index_col='nominal')
|
||||
colspec = {'nominal': ['date', 'nominalTime']}
|
||||
df = self.read_csv(StringIO(self.ts_data), parse_dates=colspec,
|
||||
index_col='nominal')
|
||||
tm.assert_frame_equal(df, xp)
|
||||
|
||||
def test_multiple_date_col_multiple_index(self):
|
||||
df = self.read_csv(StringIO(self.ts_data),
|
||||
parse_dates={'nominal': [1, 2]},
|
||||
index_col=['nominal', 'ID'])
|
||||
|
||||
xp = self.read_csv(StringIO(self.ts_data),
|
||||
parse_dates={'nominal': [1, 2]})
|
||||
|
||||
tm.assert_frame_equal(xp.set_index(['nominal', 'ID']), df)
|
||||
|
||||
def test_read_with_parse_dates_scalar_non_bool(self):
|
||||
# See gh-5636
|
||||
errmsg = ("Only booleans, lists, and "
|
||||
"dictionaries are accepted "
|
||||
"for the 'parse_dates' parameter")
|
||||
data = """A,B,C
|
||||
1,2,2003-11-1"""
|
||||
|
||||
tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
|
||||
StringIO(data), parse_dates="C")
|
||||
tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
|
||||
StringIO(data), parse_dates="C",
|
||||
index_col="C")
|
||||
|
||||
def test_read_with_parse_dates_invalid_type(self):
|
||||
errmsg = ("Only booleans, lists, and "
|
||||
"dictionaries are accepted "
|
||||
"for the 'parse_dates' parameter")
|
||||
data = """A,B,C
|
||||
1,2,2003-11-1"""
|
||||
|
||||
tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
|
||||
StringIO(data), parse_dates=(1,))
|
||||
tm.assert_raises_regex(TypeError, errmsg,
|
||||
self.read_csv, StringIO(data),
|
||||
parse_dates=np.array([4, 5]))
|
||||
tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
|
||||
StringIO(data), parse_dates=set([1, 3, 3]))
|
||||
|
||||
def test_parse_dates_empty_string(self):
|
||||
# see gh-2263
|
||||
data = "Date, test\n2012-01-01, 1\n,2"
|
||||
result = self.read_csv(StringIO(data), parse_dates=["Date"],
|
||||
na_filter=False)
|
||||
assert result['Date'].isna()[1]
|
||||
|
||||
def test_parse_dates_noconvert_thousands(self):
|
||||
# see gh-14066
|
||||
data = 'a\n04.15.2016'
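# The '.' thousands separator must not be stripped from columns that are
# being parsed as dates; otherwise '04.15.2016' would effectively become
# '04152016' before date parsing ever sees it.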
|
||||
|
||||
expected = DataFrame([datetime(2016, 4, 15)], columns=['a'])
|
||||
result = self.read_csv(StringIO(data), parse_dates=['a'],
|
||||
thousands='.')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
exp_index = DatetimeIndex(['2016-04-15'], name='a')
|
||||
expected = DataFrame(index=exp_index)
|
||||
result = self.read_csv(StringIO(data), index_col=0,
|
||||
parse_dates=True, thousands='.')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
data = 'a,b\n04.15.2016,09.16.2013'
|
||||
|
||||
expected = DataFrame([[datetime(2016, 4, 15),
|
||||
datetime(2013, 9, 16)]],
|
||||
columns=['a', 'b'])
|
||||
result = self.read_csv(StringIO(data), parse_dates=['a', 'b'],
|
||||
thousands='.')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[datetime(2016, 4, 15),
|
||||
datetime(2013, 9, 16)]],
|
||||
columns=['a', 'b'])
|
||||
expected = expected.set_index(['a', 'b'])
|
||||
result = self.read_csv(StringIO(data), index_col=[0, 1],
|
||||
parse_dates=True, thousands='.')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_parse_date_time_multi_level_column_name(self):
|
||||
data = """\
|
||||
D,T,A,B
|
||||
date, time,a,b
|
||||
2001-01-05, 09:00:00, 0.0, 10.
|
||||
2001-01-06, 00:00:00, 1.0, 11.
|
||||
"""
|
||||
datecols = {'date_time': [0, 1]}
|
||||
result = self.read_csv(StringIO(data), sep=',', header=[0, 1],
|
||||
parse_dates=datecols,
|
||||
date_parser=conv.parse_date_time)
|
||||
|
||||
expected_data = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.],
|
||||
[datetime(2001, 1, 6, 0, 0, 0), 1., 11.]]
|
||||
expected = DataFrame(expected_data,
|
||||
columns=['date_time', ('A', 'a'), ('B', 'b')])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_parse_date_time(self):
|
||||
dates = np.array(['2007/1/3', '2008/2/4'], dtype=object)
|
||||
times = np.array(['05:07:09', '06:08:00'], dtype=object)
|
||||
expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
|
||||
datetime(2008, 2, 4, 6, 8, 0)])
|
||||
|
||||
result = conv.parse_date_time(dates, times)
|
||||
assert (result == expected).all()
|
||||
|
||||
data = """\
|
||||
date, time, a, b
|
||||
2001-01-05, 10:00:00, 0.0, 10.
|
||||
2001-01-05, 00:00:00, 1., 11.
|
||||
"""
|
||||
datecols = {'date_time': [0, 1]}
|
||||
df = self.read_csv(StringIO(data), sep=',', header=0,
|
||||
parse_dates=datecols,
|
||||
date_parser=conv.parse_date_time)
|
||||
assert 'date_time' in df
|
||||
assert df.date_time.loc[0] == datetime(2001, 1, 5, 10, 0, 0)
|
||||
|
||||
data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
|
||||
"KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
|
||||
"KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
|
||||
"KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
|
||||
"KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
|
||||
"KORD,19990127, 23:00:00, 22:56:00, -0.5900")
|
||||
|
||||
date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
|
||||
df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec,
|
||||
date_parser=conv.parse_date_time)
|
||||
|
||||
def test_parse_date_fields(self):
|
||||
years = np.array([2007, 2008])
|
||||
months = np.array([1, 2])
|
||||
days = np.array([3, 4])
|
||||
result = conv.parse_date_fields(years, months, days)
|
||||
expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)])
|
||||
assert (result == expected).all()
|
||||
|
||||
data = ("year, month, day, a\n 2001 , 01 , 10 , 10.\n"
|
||||
"2001 , 02 , 1 , 11.")
|
||||
datecols = {'ymd': [0, 1, 2]}
|
||||
df = self.read_csv(StringIO(data), sep=',', header=0,
|
||||
parse_dates=datecols,
|
||||
date_parser=conv.parse_date_fields)
|
||||
assert 'ymd' in df
|
||||
assert df.ymd.loc[0] == datetime(2001, 1, 10)
|
||||
|
||||
def test_datetime_six_col(self):
|
||||
years = np.array([2007, 2008])
|
||||
months = np.array([1, 2])
|
||||
days = np.array([3, 4])
|
||||
hours = np.array([5, 6])
|
||||
minutes = np.array([7, 8])
|
||||
seconds = np.array([9, 0])
|
||||
expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
|
||||
datetime(2008, 2, 4, 6, 8, 0)])
|
||||
|
||||
result = conv.parse_all_fields(years, months, days,
|
||||
hours, minutes, seconds)
|
||||
|
||||
assert (result == expected).all()
|
||||
|
||||
data = """\
|
||||
year, month, day, hour, minute, second, a, b
|
||||
2001, 01, 05, 10, 00, 0, 0.0, 10.
|
||||
2001, 01, 5, 10, 0, 00, 1., 11.
|
||||
"""
|
||||
datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]}
|
||||
df = self.read_csv(StringIO(data), sep=',', header=0,
|
||||
parse_dates=datecols,
|
||||
date_parser=conv.parse_all_fields)
|
||||
assert 'ymdHMS' in df
|
||||
assert df.ymdHMS.loc[0] == datetime(2001, 1, 5, 10, 0, 0)
|
||||
|
||||
def test_datetime_fractional_seconds(self):
|
||||
data = """\
|
||||
year, month, day, hour, minute, second, a, b
|
||||
2001, 01, 05, 10, 00, 0.123456, 0.0, 10.
|
||||
2001, 01, 5, 10, 0, 0.500000, 1., 11.
|
||||
"""
|
||||
datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]}
|
||||
df = self.read_csv(StringIO(data), sep=',', header=0,
|
||||
parse_dates=datecols,
|
||||
date_parser=conv.parse_all_fields)
|
||||
assert 'ymdHMS' in df
|
||||
assert df.ymdHMS.loc[0] == datetime(2001, 1, 5, 10, 0, 0,
|
||||
microsecond=123456)
|
||||
assert df.ymdHMS.loc[1] == datetime(2001, 1, 5, 10, 0, 0,
|
||||
microsecond=500000)
|
||||
|
||||
def test_generic(self):
|
||||
data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11."
|
||||
datecols = {'ym': [0, 1]}
|
||||
dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1)
|
||||
df = self.read_csv(StringIO(data), sep=',', header=0,
|
||||
parse_dates=datecols,
|
||||
date_parser=dateconverter)
|
||||
assert 'ym' in df
|
||||
assert df.ym.loc[0] == date(2001, 1, 1)
|
||||
|
||||
def test_dateparser_resolution_if_not_ns(self):
|
||||
# GH 10245
|
||||
data = """\
|
||||
date,time,prn,rxstatus
|
||||
2013-11-03,19:00:00,126,00E80000
|
||||
2013-11-03,19:00:00,23,00E80000
|
||||
2013-11-03,19:00:00,13,00E80000
|
||||
"""
|
||||
|
||||
def date_parser(date, time):
|
||||
datetime = np_array_datetime64_compat(
|
||||
date + 'T' + time + 'Z', dtype='datetime64[s]')
|
||||
return datetime
|
||||
|
||||
df = self.read_csv(StringIO(data), date_parser=date_parser,
|
||||
parse_dates={'datetime': ['date', 'time']},
|
||||
index_col=['datetime', 'prn'])
|
||||
|
||||
datetimes = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3,
|
||||
dtype='datetime64[s]')
|
||||
df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3},
|
||||
index=MultiIndex.from_tuples(
|
||||
[(datetimes[0], 126),
|
||||
(datetimes[1], 23),
|
||||
(datetimes[2], 13)],
|
||||
names=['datetime', 'prn']))
|
||||
tm.assert_frame_equal(df, df_correct)
|
||||
|
||||
def test_parse_date_column_with_empty_string(self):
|
||||
# GH 6428
|
||||
data = """case,opdate
|
||||
7,10/18/2006
|
||||
7,10/18/2008
|
||||
621, """
|
||||
result = self.read_csv(StringIO(data), parse_dates=['opdate'])
|
||||
expected_data = [[7, '10/18/2006'],
|
||||
[7, '10/18/2008'],
|
||||
[621, ' ']]
|
||||
expected = DataFrame(expected_data, columns=['case', 'opdate'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("data,expected", [
|
||||
("a\n135217135789158401\n1352171357E+5",
|
||||
DataFrame({"a": [135217135789158401,
|
||||
135217135700000]}, dtype="float64")),
|
||||
("a\n99999999999\n123456789012345\n1234E+0",
|
||||
DataFrame({"a": [99999999999,
|
||||
123456789012345,
|
||||
1234]}, dtype="float64"))
|
||||
])
|
||||
@pytest.mark.parametrize("parse_dates", [True, False])
|
||||
def test_parse_date_float(self, data, expected, parse_dates):
|
||||
# see gh-2697
|
||||
#
|
||||
# Date parsing should fail, so we leave the data untouched
|
||||
# (i.e. float precision should remain unchanged).
|
||||
result = self.read_csv(StringIO(data), parse_dates=parse_dates)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,263 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that apply specifically to the Python parser. Unless specifically
|
||||
stated as a Python-specific issue, the goal is to eventually move as many of
these tests out of this module as possible once the C parser can accept further
|
||||
arguments when parsing.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import pytest
|
||||
|
||||
import pandas.util.testing as tm
|
||||
from pandas import DataFrame, Index
|
||||
from pandas import compat
|
||||
from pandas.errors import ParserError
|
||||
from pandas.compat import StringIO, BytesIO, u
|
||||
|
||||
|
||||
class PythonParserTests(object):
|
||||
|
||||
def test_default_separator(self):
|
||||
# GH17333
|
||||
# csv.Sniffer in Python's csv module treats 'o' as a separator here.
|
||||
text = 'aob\n1o2\n3o4'
|
||||
expected = DataFrame({'a': [1, 3], 'b': [2, 4]})
|
||||
|
||||
result = self.read_csv(StringIO(text), sep=None)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_invalid_skipfooter(self):
|
||||
text = "a\n1\n2"
|
||||
|
||||
# see gh-15925 (comment)
|
||||
msg = "skipfooter must be an integer"
|
||||
with tm.assert_raises_regex(ValueError, msg):
|
||||
self.read_csv(StringIO(text), skipfooter="foo")
|
||||
|
||||
with tm.assert_raises_regex(ValueError, msg):
|
||||
self.read_csv(StringIO(text), skipfooter=1.5)
|
||||
|
||||
with tm.assert_raises_regex(ValueError, msg):
|
||||
self.read_csv(StringIO(text), skipfooter=True)
|
||||
|
||||
msg = "skipfooter cannot be negative"
|
||||
with tm.assert_raises_regex(ValueError, msg):
|
||||
self.read_csv(StringIO(text), skipfooter=-1)
|
||||
|
||||
def test_sniff_delimiter(self):
|
||||
text = """index|A|B|C
|
||||
foo|1|2|3
|
||||
bar|4|5|6
|
||||
baz|7|8|9
|
||||
"""
|
||||
data = self.read_csv(StringIO(text), index_col=0, sep=None)
|
||||
tm.assert_index_equal(data.index,
|
||||
Index(['foo', 'bar', 'baz'], name='index'))
|
||||
|
||||
data2 = self.read_csv(StringIO(text), index_col=0, delimiter='|')
|
||||
tm.assert_frame_equal(data, data2)
|
||||
|
||||
text = """ignore this
|
||||
ignore this too
|
||||
index|A|B|C
|
||||
foo|1|2|3
|
||||
bar|4|5|6
|
||||
baz|7|8|9
|
||||
"""
|
||||
data3 = self.read_csv(StringIO(text), index_col=0,
|
||||
sep=None, skiprows=2)
|
||||
tm.assert_frame_equal(data, data3)
|
||||
|
||||
text = u("""ignore this
|
||||
ignore this too
|
||||
index|A|B|C
|
||||
foo|1|2|3
|
||||
bar|4|5|6
|
||||
baz|7|8|9
|
||||
""").encode('utf-8')
|
||||
|
||||
s = BytesIO(text)
|
||||
if compat.PY3:
|
||||
# somewhat artificial, since the parser never actually sees raw bytes here
|
||||
from io import TextIOWrapper
|
||||
s = TextIOWrapper(s, encoding='utf-8')
|
||||
|
||||
data4 = self.read_csv(s, index_col=0, sep=None, skiprows=2,
|
||||
encoding='utf-8')
|
||||
tm.assert_frame_equal(data, data4)
|
||||
|
||||
def test_BytesIO_input(self):
|
||||
if not compat.PY3:
|
||||
pytest.skip(
|
||||
"Bytes-related test - only needs to work on Python 3")
|
||||
|
||||
data = BytesIO("שלום::1234\n562::123".encode('cp1255'))
|
||||
result = self.read_table(data, sep="::", encoding='cp1255')
|
||||
expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_single_line(self):
|
||||
# see gh-6607: sniff separator
|
||||
df = self.read_csv(StringIO('1,2'), names=['a', 'b'],
|
||||
header=None, sep=None)
|
||||
tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df)
|
||||
|
||||
def test_skipfooter(self):
|
||||
# see gh-6607
|
||||
data = """A,B,C
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
want to skip this
|
||||
also also skip this
|
||||
"""
|
||||
result = self.read_csv(StringIO(data), skipfooter=2)
|
||||
no_footer = '\n'.join(data.split('\n')[:-3])
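# (added note) data ends with a newline, so split('\n') leaves a trailing
# empty string; slicing off the last three entries drops the two footer
# lines plus that empty string, mirroring skipfooter=2.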
|
||||
expected = self.read_csv(StringIO(no_footer))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = self.read_csv(StringIO(data), nrows=3)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# skipfooter alias
|
||||
result = self.read_csv(StringIO(data), skipfooter=2)
|
||||
no_footer = '\n'.join(data.split('\n')[:-3])
|
||||
expected = self.read_csv(StringIO(no_footer))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_decompression_regex_sep(self):
|
||||
# see gh-6607
|
||||
|
||||
try:
|
||||
import gzip
|
||||
import bz2
|
||||
except ImportError:
|
||||
pytest.skip('need gzip and bz2 to run')
|
||||
|
||||
with open(self.csv1, 'rb') as f:
|
||||
data = f.read()
|
||||
data = data.replace(b',', b'::')
|
||||
expected = self.read_csv(self.csv1)
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
tmp = gzip.GzipFile(path, mode='wb')
|
||||
tmp.write(data)
|
||||
tmp.close()
|
||||
|
||||
result = self.read_csv(path, sep='::', compression='gzip')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
tmp = bz2.BZ2File(path, mode='wb')
|
||||
tmp.write(data)
|
||||
tmp.close()
|
||||
|
||||
result = self.read_csv(path, sep='::', compression='bz2')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
pytest.raises(ValueError, self.read_csv,
|
||||
path, compression='bz3')
|
||||
|
||||
def test_read_table_buglet_4x_multiindex(self):
|
||||
# see gh-6607
|
||||
text = """ A B C D E
|
||||
one two three four
|
||||
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
|
||||
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
|
||||
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
|
||||
|
||||
df = self.read_table(StringIO(text), sep=r'\s+')
|
||||
assert df.index.names == ('one', 'two', 'three', 'four')
|
||||
|
||||
# see gh-6893
|
||||
data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
|
||||
expected = DataFrame.from_records(
|
||||
[(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
|
||||
columns=list('abcABC'), index=list('abc'))
|
||||
actual = self.read_table(StringIO(data), sep=r'\s+')
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
def test_skipfooter_with_decimal(self):
|
||||
# see gh-6971
|
||||
data = '1#2\n3#4'
|
||||
expected = DataFrame({'a': [1.2, 3.4]})
|
||||
|
||||
result = self.read_csv(StringIO(data), names=['a'],
|
||||
decimal='#')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# the stray footer line should not mess with the
|
||||
# casting of the first two lines if we skip it
|
||||
data = data + '\nFooter'
|
||||
result = self.read_csv(StringIO(data), names=['a'],
|
||||
decimal='#', skipfooter=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_encoding_non_utf8_multichar_sep(self):
|
||||
# see gh-3404
|
||||
expected = DataFrame({'a': [1], 'b': [2]})
|
||||
|
||||
for sep in ['::', '#####', '!!!', '123', '#1!c5',
|
||||
'%!c!d', '@@#4:2', '_!pd#_']:
|
||||
data = '1' + sep + '2'
|
||||
|
||||
for encoding in ['utf-16', 'utf-16-be', 'utf-16-le',
|
||||
'utf-32', 'cp037']:
|
||||
encoded_data = data.encode(encoding)
|
||||
result = self.read_csv(BytesIO(encoded_data),
|
||||
sep=sep, names=['a', 'b'],
|
||||
encoding=encoding)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_multi_char_sep_quotes(self):
|
||||
# see gh-13374
|
||||
|
||||
data = 'a,,b\n1,,a\n2,,"2,,b"'
|
||||
msg = 'ignored when a multi-char delimiter is used'
|
||||
|
||||
with tm.assert_raises_regex(ParserError, msg):
|
||||
self.read_csv(StringIO(data), sep=',,')
|
||||
|
||||
# We expect no match, so there should be an assertion
|
||||
# error out of the inner context manager.
|
||||
with pytest.raises(AssertionError):
|
||||
with tm.assert_raises_regex(ParserError, msg):
|
||||
self.read_csv(StringIO(data), sep=',,',
|
||||
quoting=csv.QUOTE_NONE)
|
||||
|
||||
def test_none_delimiter(self):
|
||||
# see gh-13374 and gh-17465
|
||||
|
||||
data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
|
||||
expected = DataFrame({'a': [0, 7],
|
||||
'b': [1, 8],
|
||||
'c': [2, 9]})
|
||||
|
||||
# We expect the third line in the data to be
|
||||
# skipped because it is malformed,
|
||||
# but we do not expect any errors to occur.
|
||||
result = self.read_csv(StringIO(data), header=0,
|
||||
sep=None,
|
||||
error_bad_lines=False,
|
||||
warn_bad_lines=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_skipfooter_bad_row(self):
|
||||
# see gh-13879
|
||||
# see gh-15910
|
||||
|
||||
msg = 'parsing errors in the skipped footer rows'
|
||||
|
||||
for data in ('a\n1\n"b"a',
|
||||
'a,b,c\ncat,foo,bar\ndog,foo,"baz'):
|
||||
with tm.assert_raises_regex(ParserError, msg):
|
||||
self.read_csv(StringIO(data), skipfooter=1)
|
||||
|
||||
# We expect no match, so there should be an assertion
|
||||
# error out of the inner context manager.
|
||||
with pytest.raises(AssertionError):
|
||||
with tm.assert_raises_regex(ParserError, msg):
|
||||
self.read_csv(StringIO(data))
|
||||
@@ -0,0 +1,153 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that quoting specifications are properly handled
|
||||
during parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
import csv
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.compat import PY3, StringIO, u
|
||||
|
||||
|
||||
class QuotingTests(object):
|
||||
|
||||
def test_bad_quote_char(self):
|
||||
data = '1,2,3'
|
||||
|
||||
# Python 2.x: "...must be an 1-character..."
|
||||
# Python 3.x: "...must be a 1-character..."
|
||||
msg = '"quotechar" must be a(n)? 1-character string'
|
||||
tm.assert_raises_regex(TypeError, msg, self.read_csv,
|
||||
StringIO(data), quotechar='foo')
|
||||
|
||||
msg = 'quotechar must be set if quoting enabled'
|
||||
tm.assert_raises_regex(TypeError, msg, self.read_csv,
|
||||
StringIO(data), quotechar=None,
|
||||
quoting=csv.QUOTE_MINIMAL)
|
||||
|
||||
msg = '"quotechar" must be string, not int'
|
||||
tm.assert_raises_regex(TypeError, msg, self.read_csv,
|
||||
StringIO(data), quotechar=2)
|
||||
|
||||
def test_bad_quoting(self):
|
||||
data = '1,2,3'
|
||||
|
||||
msg = '"quoting" must be an integer'
|
||||
tm.assert_raises_regex(TypeError, msg, self.read_csv,
|
||||
StringIO(data), quoting='foo')
|
||||
|
||||
# quoting must be in the range [0, 3]
|
||||
msg = 'bad "quoting" value'
|
||||
tm.assert_raises_regex(TypeError, msg, self.read_csv,
|
||||
StringIO(data), quoting=5)
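# (added note) the valid range comes from the csv module's constants:
# QUOTE_MINIMAL == 0, QUOTE_ALL == 1, QUOTE_NONNUMERIC == 2, QUOTE_NONE == 3.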
|
||||
|
||||
def test_quote_char_basic(self):
|
||||
data = 'a,b,c\n1,2,"cat"'
|
||||
expected = DataFrame([[1, 2, 'cat']],
|
||||
columns=['a', 'b', 'c'])
|
||||
result = self.read_csv(StringIO(data), quotechar='"')
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_quote_char_various(self):
|
||||
data = 'a,b,c\n1,2,"cat"'
|
||||
expected = DataFrame([[1, 2, 'cat']],
|
||||
columns=['a', 'b', 'c'])
|
||||
quote_chars = ['~', '*', '%', '$', '@', 'P']
|
||||
|
||||
for quote_char in quote_chars:
|
||||
new_data = data.replace('"', quote_char)
|
||||
result = self.read_csv(StringIO(new_data), quotechar=quote_char)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_null_quote_char(self):
|
||||
data = 'a,b,c\n1,2,3'
|
||||
|
||||
# sanity checks
|
||||
msg = 'quotechar must be set if quoting enabled'
|
||||
|
||||
tm.assert_raises_regex(TypeError, msg, self.read_csv,
|
||||
StringIO(data), quotechar=None,
|
||||
quoting=csv.QUOTE_MINIMAL)
|
||||
|
||||
tm.assert_raises_regex(TypeError, msg, self.read_csv,
|
||||
StringIO(data), quotechar='',
|
||||
quoting=csv.QUOTE_MINIMAL)
|
||||
|
||||
# no errors should be raised if quoting is None
|
||||
expected = DataFrame([[1, 2, 3]],
|
||||
columns=['a', 'b', 'c'])
|
||||
|
||||
result = self.read_csv(StringIO(data), quotechar=None,
|
||||
quoting=csv.QUOTE_NONE)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = self.read_csv(StringIO(data), quotechar='',
|
||||
quoting=csv.QUOTE_NONE)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_quoting_various(self):
|
||||
data = '1,2,"foo"'
|
||||
cols = ['a', 'b', 'c']
|
||||
|
||||
# QUOTE_MINIMAL and QUOTE_ALL apply only to
|
||||
# the CSV writer, so they should have no
|
||||
# special effect for the CSV reader
|
||||
expected = DataFrame([[1, 2, 'foo']], columns=cols)
|
||||
|
||||
# test default (afterwards, arguments are all explicit)
|
||||
result = self.read_csv(StringIO(data), names=cols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = self.read_csv(StringIO(data), quotechar='"',
|
||||
quoting=csv.QUOTE_MINIMAL, names=cols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = self.read_csv(StringIO(data), quotechar='"',
|
||||
quoting=csv.QUOTE_ALL, names=cols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# QUOTE_NONE tells the reader to do no special handling
|
||||
# of quote characters and leave them alone
|
||||
expected = DataFrame([[1, 2, '"foo"']], columns=cols)
|
||||
result = self.read_csv(StringIO(data), quotechar='"',
|
||||
quoting=csv.QUOTE_NONE, names=cols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# QUOTE_NONNUMERIC tells the reader to cast
|
||||
# all non-quoted fields to float
|
||||
expected = DataFrame([[1.0, 2.0, 'foo']], columns=cols)
|
||||
result = self.read_csv(StringIO(data), quotechar='"',
|
||||
quoting=csv.QUOTE_NONNUMERIC,
|
||||
names=cols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_double_quote(self):
|
||||
data = 'a,b\n3,"4 "" 5"'
|
||||
|
||||
expected = DataFrame([[3, '4 " 5']],
|
||||
columns=['a', 'b'])
|
||||
result = self.read_csv(StringIO(data), quotechar='"',
|
||||
doublequote=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[3, '4 " 5"']],
|
||||
columns=['a', 'b'])
|
||||
result = self.read_csv(StringIO(data), quotechar='"',
|
||||
doublequote=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_quotechar_unicode(self):
|
||||
# See gh-14477
|
||||
data = 'a\n1'
|
||||
expected = DataFrame({'a': [1]})
|
||||
|
||||
result = self.read_csv(StringIO(data), quotechar=u('"'))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Compared to Python 3.x, Python 2.x does not handle unicode well.
|
||||
if PY3:
|
||||
result = self.read_csv(StringIO(data), quotechar=u('\u0001'))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,225 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests that skipped rows are properly handled during
|
||||
parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.errors import EmptyDataError
|
||||
from pandas.compat import StringIO, range, lrange
|
||||
|
||||
|
||||
class SkipRowsTests(object):
|
||||
|
||||
def test_skiprows_bug(self):
|
||||
# see gh-505
|
||||
text = """#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
1/1/2000,1.,2.,3.
|
||||
1/2/2000,4,5,6
|
||||
1/3/2000,7,8,9
|
||||
"""
|
||||
data = self.read_csv(StringIO(text), skiprows=lrange(6), header=None,
|
||||
index_col=0, parse_dates=True)
|
||||
|
||||
data2 = self.read_csv(StringIO(text), skiprows=6, header=None,
|
||||
index_col=0, parse_dates=True)
|
||||
|
||||
expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
|
||||
columns=[1, 2, 3],
|
||||
index=[datetime(2000, 1, 1), datetime(2000, 1, 2),
|
||||
datetime(2000, 1, 3)])
|
||||
expected.index.name = 0
|
||||
tm.assert_frame_equal(data, expected)
|
||||
tm.assert_frame_equal(data, data2)
|
||||
|
||||
def test_deep_skiprows(self):
|
||||
# see gh-4382
|
||||
text = "a,b,c\n" + \
|
||||
"\n".join([",".join([str(i), str(i + 1), str(i + 2)])
|
||||
for i in range(10)])
|
||||
condensed_text = "a,b,c\n" + \
|
||||
"\n".join([",".join([str(i), str(i + 1), str(i + 2)])
|
||||
for i in [0, 1, 2, 3, 4, 6, 8, 9]])
|
||||
data = self.read_csv(StringIO(text), skiprows=[6, 8])
|
||||
condensed_data = self.read_csv(StringIO(condensed_text))
|
||||
tm.assert_frame_equal(data, condensed_data)
|
||||
|
||||
def test_skiprows_blank(self):
|
||||
# see gh-9832
|
||||
text = """#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
|
||||
#foo,a,b,c
|
||||
#foo,a,b,c
|
||||
|
||||
1/1/2000,1.,2.,3.
|
||||
1/2/2000,4,5,6
|
||||
1/3/2000,7,8,9
|
||||
"""
|
||||
data = self.read_csv(StringIO(text), skiprows=6, header=None,
|
||||
index_col=0, parse_dates=True)
|
||||
|
||||
expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
|
||||
columns=[1, 2, 3],
|
||||
index=[datetime(2000, 1, 1), datetime(2000, 1, 2),
|
||||
datetime(2000, 1, 3)])
|
||||
expected.index.name = 0
|
||||
tm.assert_frame_equal(data, expected)
|
||||
|
||||
def test_skiprow_with_newline(self):
|
||||
# see gh-12775 and gh-10911
|
||||
data = """id,text,num_lines
|
||||
1,"line 11
|
||||
line 12",2
|
||||
2,"line 21
|
||||
line 22",2
|
||||
3,"line 31",1"""
|
||||
expected = [[2, 'line 21\nline 22', 2],
|
||||
[3, 'line 31', 1]]
|
||||
expected = DataFrame(expected, columns=[
|
||||
'id', 'text', 'num_lines'])
|
||||
df = self.read_csv(StringIO(data), skiprows=[1])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
data = ('a,b,c\n~a\n b~,~e\n d~,'
|
||||
'~f\n f~\n1,2,~12\n 13\n 14~')
|
||||
expected = [['a\n b', 'e\n d', 'f\n f']]
|
||||
expected = DataFrame(expected, columns=[
|
||||
'a', 'b', 'c'])
|
||||
df = self.read_csv(StringIO(data),
|
||||
quotechar="~",
|
||||
skiprows=[2])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
data = ('Text,url\n~example\n '
|
||||
'sentence\n one~,url1\n~'
|
||||
'example\n sentence\n two~,url2\n~'
|
||||
'example\n sentence\n three~,url3')
|
||||
expected = [['example\n sentence\n two', 'url2']]
|
||||
expected = DataFrame(expected, columns=[
|
||||
'Text', 'url'])
|
||||
df = self.read_csv(StringIO(data),
|
||||
quotechar="~",
|
||||
skiprows=[1, 3])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_skiprow_with_quote(self):
|
||||
# see gh-12775 and gh-10911
|
||||
data = """id,text,num_lines
|
||||
1,"line '11' line 12",2
|
||||
2,"line '21' line 22",2
|
||||
3,"line '31' line 32",1"""
|
||||
expected = [[2, "line '21' line 22", 2],
|
||||
[3, "line '31' line 32", 1]]
|
||||
expected = DataFrame(expected, columns=[
|
||||
'id', 'text', 'num_lines'])
|
||||
df = self.read_csv(StringIO(data), skiprows=[1])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_skiprow_with_newline_and_quote(self):
|
||||
# see gh-12775 and gh-10911
|
||||
data = """id,text,num_lines
|
||||
1,"line \n'11' line 12",2
|
||||
2,"line \n'21' line 22",2
|
||||
3,"line \n'31' line 32",1"""
|
||||
expected = [[2, "line \n'21' line 22", 2],
|
||||
[3, "line \n'31' line 32", 1]]
|
||||
expected = DataFrame(expected, columns=[
|
||||
'id', 'text', 'num_lines'])
|
||||
df = self.read_csv(StringIO(data), skiprows=[1])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
data = """id,text,num_lines
|
||||
1,"line '11\n' line 12",2
|
||||
2,"line '21\n' line 22",2
|
||||
3,"line '31\n' line 32",1"""
|
||||
expected = [[2, "line '21\n' line 22", 2],
|
||||
[3, "line '31\n' line 32", 1]]
|
||||
expected = DataFrame(expected, columns=[
|
||||
'id', 'text', 'num_lines'])
|
||||
df = self.read_csv(StringIO(data), skiprows=[1])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
data = """id,text,num_lines
|
||||
1,"line '11\n' \r\tline 12",2
|
||||
2,"line '21\n' \r\tline 22",2
|
||||
3,"line '31\n' \r\tline 32",1"""
|
||||
expected = [[2, "line '21\n' \r\tline 22", 2],
|
||||
[3, "line '31\n' \r\tline 32", 1]]
|
||||
expected = DataFrame(expected, columns=[
|
||||
'id', 'text', 'num_lines'])
|
||||
df = self.read_csv(StringIO(data), skiprows=[1])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_skiprows_lineterminator(self):
|
||||
# see gh-9079
|
||||
data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ',
|
||||
'2007/01/01 01:00 0.2140 U M ',
|
||||
'2007/01/01 02:00 0.2141 M O ',
|
||||
'2007/01/01 04:00 0.2142 D M '])
|
||||
expected = DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'],
|
||||
['2007/01/01', '02:00', 0.2141, 'M', 'O'],
|
||||
['2007/01/01', '04:00', 0.2142, 'D', 'M']],
|
||||
columns=['date', 'time', 'var', 'flag',
|
||||
'oflag'])
|
||||
|
||||
# test with default line terminators "LF" and "CRLF"
|
||||
df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True,
|
||||
names=['date', 'time', 'var', 'flag', 'oflag'])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
df = self.read_csv(StringIO(data.replace('\n', '\r\n')),
|
||||
skiprows=1, delim_whitespace=True,
|
||||
names=['date', 'time', 'var', 'flag', 'oflag'])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
# "CR" is not respected with the Python parser yet
|
||||
if self.engine == 'c':
|
||||
df = self.read_csv(StringIO(data.replace('\n', '\r')),
|
||||
skiprows=1, delim_whitespace=True,
|
||||
names=['date', 'time', 'var', 'flag', 'oflag'])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_skiprows_infield_quote(self):
|
||||
# see gh-14459
|
||||
data = 'a"\nb"\na\n1'
|
||||
expected = DataFrame({'a': [1]})
|
||||
|
||||
df = self.read_csv(StringIO(data), skiprows=2)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_skiprows_callable(self):
|
||||
data = 'a\n1\n2\n3\n4\n5'
|
||||
|
||||
skiprows = lambda x: x % 2 == 0
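# (added note) the callable receives each row's index; rows where it returns
# True (here the even-numbered ones, including the header row 0) are skipped,
# so the first surviving row ('1') becomes the header below.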
|
||||
expected = DataFrame({'1': [3, 5]})
|
||||
df = self.read_csv(StringIO(data), skiprows=skiprows)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
expected = DataFrame({'foo': [3, 5]})
|
||||
df = self.read_csv(StringIO(data), skiprows=skiprows,
|
||||
header=0, names=['foo'])
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
skiprows = lambda x: True
|
||||
msg = "No columns to parse from file"
|
||||
with tm.assert_raises_regex(EmptyDataError, msg):
|
||||
self.read_csv(StringIO(data), skiprows=skiprows)
|
||||
|
||||
# This is a bad callable and should raise.
|
||||
msg = "by zero"
|
||||
skiprows = lambda x: 1 / 0
|
||||
with tm.assert_raises_regex(ZeroDivisionError, msg):
|
||||
self.read_csv(StringIO(data), skiprows=skiprows)
|
||||
@@ -0,0 +1,200 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests the parsers' ability to read and parse non-local files,
|
||||
which therefore require a network connection to be read.
|
||||
"""
|
||||
import logging
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
import pandas.util.testing as tm
|
||||
import pandas.util._test_decorators as td
|
||||
from pandas import DataFrame
|
||||
from pandas.io.parsers import read_csv, read_table
|
||||
from pandas.compat import BytesIO, StringIO
|
||||
|
||||
|
||||
@pytest.mark.network
|
||||
@pytest.mark.parametrize(
|
||||
"compress_type, extension", [
|
||||
('gzip', '.gz'), ('bz2', '.bz2'), ('zip', '.zip'),
|
||||
pytest.param('xz', '.xz', marks=td.skip_if_no_lzma)
|
||||
]
|
||||
)
|
||||
@pytest.mark.parametrize('mode', ['explicit', 'infer'])
|
||||
@pytest.mark.parametrize('engine', ['python', 'c'])
|
||||
def test_compressed_urls(salaries_table, compress_type, extension, mode,
|
||||
engine):
|
||||
check_compressed_urls(salaries_table, compress_type, extension, mode,
|
||||
engine)
|
||||
|
||||
|
||||
@tm.network
|
||||
def check_compressed_urls(salaries_table, compression, extension, mode,
|
||||
engine):
|
||||
# test reading compressed urls with various engines and
|
||||
# extension inference
|
||||
base_url = ('https://github.com/pandas-dev/pandas/raw/master/'
|
||||
'pandas/tests/io/parser/data/salaries.csv')
|
||||
|
||||
url = base_url + extension
|
||||
|
||||
if mode != 'explicit':
|
||||
compression = mode
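# (added note) in 'infer' mode the explicit codec is replaced with
# compression='infer', so read_table has to deduce it from the URL extension.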
|
||||
|
||||
url_table = read_table(url, compression=compression, engine=engine)
|
||||
tm.assert_frame_equal(url_table, salaries_table)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tips_df(datapath):
|
||||
"""DataFrame with the tips dataset."""
|
||||
return read_csv(datapath('io', 'parser', 'data', 'tips.csv'))
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("s3_resource")
|
||||
class TestS3(object):
|
||||
|
||||
def test_parse_public_s3_bucket(self, tips_df):
|
||||
pytest.importorskip('s3fs')
|
||||
# This is more of an integration test because of the not-public contents
|
||||
# portion below; that part could probably be mocked instead.
|
||||
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
|
||||
df = read_csv('s3://pandas-test/tips.csv' +
|
||||
ext, compression=comp)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
# Read public file from bucket with not-public contents
|
||||
df = read_csv('s3://cant_get_it/tips.csv')
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
def test_parse_public_s3n_bucket(self, tips_df):
|
||||
|
||||
# Read from AWS s3 as "s3n" URL
|
||||
df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_parse_public_s3a_bucket(self, tips_df):
|
||||
# Read from AWS s3 as "s3a" URL
|
||||
df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_parse_public_s3_bucket_nrows(self, tips_df):
|
||||
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
|
||||
df = read_csv('s3://pandas-test/tips.csv' +
|
||||
ext, nrows=10, compression=comp)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_parse_public_s3_bucket_chunked(self, tips_df):
|
||||
# Read with a chunksize
|
||||
chunksize = 5
|
||||
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
|
||||
df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
|
||||
chunksize=chunksize, compression=comp)
|
||||
assert df_reader.chunksize == chunksize
|
||||
for i_chunk in [0, 1, 2]:
|
||||
# Read a couple of chunks and make sure we see them
|
||||
# properly.
|
||||
df = df_reader.get_chunk()
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
true_df = tips_df.iloc[
|
||||
chunksize * i_chunk: chunksize * (i_chunk + 1)]
|
||||
tm.assert_frame_equal(true_df, df)
|
||||
|
||||
def test_parse_public_s3_bucket_chunked_python(self, tips_df):
|
||||
# Read with a chunksize using the Python parser
|
||||
chunksize = 5
|
||||
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
|
||||
df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
|
||||
chunksize=chunksize, compression=comp,
|
||||
engine='python')
|
||||
assert df_reader.chunksize == chunksize
|
||||
for i_chunk in [0, 1, 2]:
|
||||
# Read a couple of chunks and make sure we see them properly.
|
||||
df = df_reader.get_chunk()
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
true_df = tips_df.iloc[
|
||||
chunksize * i_chunk: chunksize * (i_chunk + 1)]
|
||||
tm.assert_frame_equal(true_df, df)
|
||||
|
||||
def test_parse_public_s3_bucket_python(self, tips_df):
|
||||
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
|
||||
df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
|
||||
compression=comp)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
def test_infer_s3_compression(self, tips_df):
|
||||
for ext in ['', '.gz', '.bz2']:
|
||||
df = read_csv('s3://pandas-test/tips.csv' + ext,
|
||||
engine='python', compression='infer')
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
def test_parse_public_s3_bucket_nrows_python(self, tips_df):
|
||||
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
|
||||
df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
|
||||
nrows=10, compression=comp)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_s3_fails(self):
|
||||
with pytest.raises(IOError):
|
||||
read_csv('s3://nyqpug/asdf.csv')
|
||||
|
||||
# Receive a permission error when trying to read a private bucket.
|
||||
# It's irrelevant here that this isn't actually a table.
|
||||
with pytest.raises(IOError):
|
||||
read_csv('s3://cant_get_it/')
|
||||
|
||||
def test_read_csv_handles_boto_s3_object(self,
|
||||
s3_resource,
|
||||
tips_file):
|
||||
# see gh-16135
|
||||
|
||||
s3_object = s3_resource.meta.client.get_object(
|
||||
Bucket='pandas-test',
|
||||
Key='tips.csv')
|
||||
|
||||
result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8')
|
||||
assert isinstance(result, DataFrame)
|
||||
assert not result.empty
|
||||
|
||||
expected = read_csv(tips_file)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_read_csv_chunked_download(self, s3_resource, caplog):
|
||||
# 8 MB, s3fs uses 5 MB chunks
|
||||
df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
|
||||
buf = BytesIO()
|
||||
str_buf = StringIO()
|
||||
|
||||
df.to_csv(str_buf)
|
||||
|
||||
buf = BytesIO(str_buf.getvalue().encode('utf-8'))
|
||||
|
||||
s3_resource.Bucket("pandas-test").put_object(
|
||||
Key="large-file.csv",
|
||||
Body=buf)
|
||||
|
||||
with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
|
||||
read_csv("s3://pandas-test/large-file.csv", nrows=5)
|
||||
# log of fetch_range (start, stop)
|
||||
assert ((0, 5505024) in set(x.args[-2:] for x in caplog.records))
|
||||
@@ -0,0 +1,154 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import pytest
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas import read_csv, read_table, DataFrame
|
||||
import pandas.core.common as com
|
||||
from pandas._libs.tslib import Timestamp
|
||||
from pandas.compat import StringIO
|
||||
|
||||
from .common import ParserTests
|
||||
from .header import HeaderTests
|
||||
from .comment import CommentTests
|
||||
from .dialect import DialectTests
|
||||
from .quoting import QuotingTests
|
||||
from .usecols import UsecolsTests
|
||||
from .skiprows import SkipRowsTests
|
||||
from .index_col import IndexColTests
|
||||
from .na_values import NAvaluesTests
|
||||
from .converters import ConverterTests
|
||||
from .c_parser_only import CParserTests
|
||||
from .parse_dates import ParseDatesTests
|
||||
from .compression import CompressionTests
|
||||
from .mangle_dupes import DupeColumnTests
|
||||
from .multithread import MultithreadTests
|
||||
from .python_parser_only import PythonParserTests
|
||||
from .dtypes import DtypeTests
|
||||
|
||||
|
||||
class BaseParser(CommentTests, CompressionTests,
|
||||
ConverterTests, DialectTests,
|
||||
DtypeTests, DupeColumnTests,
|
||||
HeaderTests, IndexColTests,
|
||||
MultithreadTests, NAvaluesTests,
|
||||
ParseDatesTests, ParserTests,
|
||||
SkipRowsTests, UsecolsTests,
|
||||
QuotingTests):
|
||||
|
||||
def read_csv(self, *args, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
def read_table(self, *args, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
def float_precision_choices(self):
|
||||
raise com.AbstractMethodError(self)
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup_method(self, datapath):
|
||||
self.dirpath = datapath('io', 'parser', 'data')
|
||||
self.csv1 = os.path.join(self.dirpath, 'test1.csv')
|
||||
self.csv2 = os.path.join(self.dirpath, 'test2.csv')
|
||||
self.xls1 = os.path.join(self.dirpath, 'test.xls')
|
||||
self.csv_shiftjs = os.path.join(self.dirpath, 'sauron.SHIFT_JIS.csv')
|
||||
|
||||
|
||||
class TestCParserHighMemory(BaseParser, CParserTests):
|
||||
engine = 'c'
|
||||
low_memory = False
|
||||
float_precision_choices = [None, 'high', 'round_trip']
|
||||
|
||||
def read_csv(self, *args, **kwds):
|
||||
kwds = kwds.copy()
|
||||
kwds['engine'] = self.engine
|
||||
kwds['low_memory'] = self.low_memory
|
||||
return read_csv(*args, **kwds)
|
||||
|
||||
def read_table(self, *args, **kwds):
|
||||
kwds = kwds.copy()
|
||||
kwds['engine'] = self.engine
|
||||
kwds['low_memory'] = self.low_memory
|
||||
return read_table(*args, **kwds)
|
||||
|
||||
|
||||
class TestCParserLowMemory(BaseParser, CParserTests):
|
||||
engine = 'c'
|
||||
low_memory = True
|
||||
float_precision_choices = [None, 'high', 'round_trip']
|
||||
|
||||
def read_csv(self, *args, **kwds):
|
||||
kwds = kwds.copy()
|
||||
kwds['engine'] = self.engine
|
||||
kwds['low_memory'] = self.low_memory
|
||||
return read_csv(*args, **kwds)
|
||||
|
||||
def read_table(self, *args, **kwds):
|
||||
kwds = kwds.copy()
|
||||
kwds['engine'] = self.engine
|
||||
kwds['low_memory'] = True
|
||||
return read_table(*args, **kwds)
|
||||
|
||||
|
||||
class TestPythonParser(BaseParser, PythonParserTests):
|
||||
engine = 'python'
|
||||
float_precision_choices = [None]
|
||||
|
||||
def read_csv(self, *args, **kwds):
|
||||
kwds = kwds.copy()
|
||||
kwds['engine'] = self.engine
|
||||
return read_csv(*args, **kwds)
|
||||
|
||||
def read_table(self, *args, **kwds):
|
||||
kwds = kwds.copy()
|
||||
kwds['engine'] = self.engine
|
||||
return read_table(*args, **kwds)
|
||||
|
||||
|
||||
class TestUnsortedUsecols(object):
|
||||
def test_override__set_noconvert_columns(self):
|
||||
# GH 17351 - usecols needs to be sorted in _set_noconvert_columns
|
||||
# based on the test_usecols_with_parse_dates test from usecols.py
|
||||
from pandas.io.parsers import CParserWrapper, TextFileReader
|
||||
|
||||
s = """a,b,c,d,e
|
||||
0,1,20140101,0900,4
|
||||
0,1,20140102,1000,4"""
|
||||
|
||||
parse_dates = [[1, 2]]
|
||||
cols = {
|
||||
'a': [0, 0],
|
||||
'c_d': [
|
||||
Timestamp('2014-01-01 09:00:00'),
|
||||
Timestamp('2014-01-02 10:00:00')
|
||||
]
|
||||
}
|
||||
expected = DataFrame(cols, columns=['c_d', 'a'])
|
||||
|
||||
class MyTextFileReader(TextFileReader):
|
||||
def __init__(self):
|
||||
self._currow = 0
|
||||
self.squeeze = False
|
||||
|
||||
class MyCParserWrapper(CParserWrapper):
|
||||
def _set_noconvert_columns(self):
|
||||
if self.usecols_dtype == 'integer':
|
||||
# self.usecols is a set, which is documented as unordered
|
||||
# but in practice, a CPython set of integers is sorted.
|
||||
# In other implementations this assumption does not hold.
|
||||
# The following code simulates a different order, which
|
||||
# before GH 17351 would cause the wrong columns to be
|
||||
# converted via the parse_dates parameter
|
||||
self.usecols = list(self.usecols)
|
||||
self.usecols.reverse()
|
||||
return CParserWrapper._set_noconvert_columns(self)
|
||||
|
||||
parser = MyTextFileReader()
|
||||
parser.options = {'usecols': [0, 2, 3],
|
||||
'parse_dates': parse_dates,
|
||||
'delimiter': ','}
|
||||
parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
|
||||
df = parser.read()
|
||||
|
||||
tm.assert_frame_equal(df, expected)
|
||||
@@ -0,0 +1,436 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests the 'read_fwf' function in parsers.py. This
|
||||
test suite is independent of the others because the
|
||||
engine is set to 'python-fwf' internally.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas import compat
|
||||
from pandas.compat import StringIO, BytesIO
|
||||
from pandas.io.parsers import read_csv, read_fwf, EmptyDataError
|
||||
|
||||
|
||||
class TestFwfParsing(object):
|
||||
|
||||
def test_fwf(self):
|
||||
data_expected = """\
|
||||
2011,58,360.242940,149.910199,11950.7
|
||||
2011,59,444.953632,166.985655,11788.4
|
||||
2011,60,364.136849,183.628767,11806.2
|
||||
2011,61,413.836124,184.375703,11916.8
|
||||
2011,62,502.953953,173.237159,12468.3
|
||||
"""
|
||||
expected = read_csv(StringIO(data_expected),
|
||||
engine='python', header=None)
|
||||
|
||||
data1 = """\
|
||||
201158 360.242940 149.910199 11950.7
|
||||
201159 444.953632 166.985655 11788.4
|
||||
201160 364.136849 183.628767 11806.2
|
||||
201161 413.836124 184.375703 11916.8
|
||||
201162 502.953953 173.237159 12468.3
|
||||
"""
|
||||
colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
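# (added note) each colspec is a half-open [start, stop) slice of character
# positions, so (0, 4) picks up '2011' and (4, 8) the field that follows it.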
|
||||
df = read_fwf(StringIO(data1), colspecs=colspecs, header=None)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
data2 = """\
|
||||
2011 58 360.242940 149.910199 11950.7
|
||||
2011 59 444.953632 166.985655 11788.4
|
||||
2011 60 364.136849 183.628767 11806.2
|
||||
2011 61 413.836124 184.375703 11916.8
|
||||
2011 62 502.953953 173.237159 12468.3
|
||||
"""
|
||||
df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
# From Thomas Kluyver: some files pad fields with non-space filler
|
||||
# characters; this is supported by specifying the 'delimiter' character:
|
||||
# http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
|
||||
data3 = """\
|
||||
201158~~~~360.242940~~~149.910199~~~11950.7
|
||||
201159~~~~444.953632~~~166.985655~~~11788.4
|
||||
201160~~~~364.136849~~~183.628767~~~11806.2
|
||||
201161~~~~413.836124~~~184.375703~~~11916.8
|
||||
201162~~~~502.953953~~~173.237159~~~12468.3
|
||||
"""
|
||||
df = read_fwf(
|
||||
StringIO(data3), colspecs=colspecs, delimiter='~', header=None)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
with tm.assert_raises_regex(ValueError,
|
||||
"must specify only one of"):
|
||||
read_fwf(StringIO(data3), colspecs=colspecs, widths=[6, 10, 10, 7])
|
||||
|
||||
with tm.assert_raises_regex(ValueError, "Must specify either"):
|
||||
read_fwf(StringIO(data3), colspecs=None, widths=None)
|
||||
|
||||
def test_BytesIO_input(self):
|
||||
if not compat.PY3:
|
||||
pytest.skip(
|
||||
"Bytes-related test - only needs to work on Python 3")
|
||||
|
||||
result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[
|
||||
2, 2], encoding='utf8')
|
||||
expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fwf_colspecs_is_list_or_tuple(self):
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
|
||||
with tm.assert_raises_regex(TypeError,
|
||||
'column specifications must '
|
||||
'be a list or tuple.+'):
|
||||
pd.io.parsers.FixedWidthReader(StringIO(data),
|
||||
{'a': 1}, ',', '#')
|
||||
|
||||
def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self):
|
||||
data = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
qux,12,13,14,15
|
||||
foo2,12,13,14,15
|
||||
bar2,12,13,14,15
|
||||
"""
|
||||
|
||||
with tm.assert_raises_regex(TypeError,
|
||||
'Each column specification '
|
||||
'must be.+'):
|
||||
read_fwf(StringIO(data), [('a', 1)])
|
||||
|
||||
def test_fwf_colspecs_None(self):
|
||||
# GH 7079
|
||||
data = """\
|
||||
123456
|
||||
456789
|
||||
"""
|
||||
colspecs = [(0, 3), (3, None)]
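# (added note) a None boundary means "from the start of the line" or
# "to the end of the line", so (3, None) reads everything after column 2.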
|
||||
result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
|
||||
expected = DataFrame([[123, 456], [456, 789]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
colspecs = [(None, 3), (3, 6)]
|
||||
result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
|
||||
expected = DataFrame([[123, 456], [456, 789]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
colspecs = [(0, None), (3, None)]
|
||||
result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
|
||||
expected = DataFrame([[123456, 456], [456789, 789]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
colspecs = [(None, None), (3, 6)]
|
||||
result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
|
||||
expected = DataFrame([[123456, 456], [456789, 789]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fwf_regression(self):
|
||||
# GH 3594
|
||||
# turns out 'T060' is parsable as a datetime slice!
|
||||
|
||||
tzlist = [1, 10, 20, 30, 60, 80, 100]
|
||||
ntz = len(tzlist)
|
||||
tcolspecs = [16] + [8] * ntz
|
||||
tcolnames = ['SST'] + ["T%03d" % z for z in tzlist[1:]]
|
||||
data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192
|
||||
2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869
|
||||
2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657
|
||||
2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379
|
||||
2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039
|
||||
"""
|
||||
|
||||
df = read_fwf(StringIO(data),
|
||||
index_col=0,
|
||||
header=None,
|
||||
names=tcolnames,
|
||||
widths=tcolspecs,
|
||||
parse_dates=True,
|
||||
date_parser=lambda s: datetime.strptime(s, '%Y%j%H%M%S'))
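# (added note) '%Y%j%H%M%S' is year + day-of-year + HHMMSS, so an index
# value such as '2009164202000' parses as 2009, day 164, 20:20:00.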
|
||||
|
||||
for c in df.columns:
|
||||
res = df.loc[:, c]
|
||||
assert len(res)
|
||||
|
||||
def test_fwf_for_uint8(self):
|
||||
data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127
|
||||
1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa
|
||||
df = read_fwf(StringIO(data),
|
||||
colspecs=[(0, 17), (25, 26), (33, 37),
|
||||
(49, 51), (58, 62), (63, 1000)],
|
||||
names=['time', 'pri', 'pgn', 'dst', 'src', 'data'],
|
||||
converters={
|
||||
'pgn': lambda x: int(x, 16),
|
||||
'src': lambda x: int(x, 16),
|
||||
'dst': lambda x: int(x, 16),
|
||||
'data': lambda x: len(x.split(' '))})
|
||||
|
||||
expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8],
|
||||
[1421302964.226776, 6, 61442, None, 71, 8]],
|
||||
columns=["time", "pri", "pgn",
|
||||
"dst", "src", "data"])
|
||||
expected["dst"] = expected["dst"].astype(object)
|
||||
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_fwf_compression(self):
|
||||
try:
|
||||
import gzip
|
||||
import bz2
|
||||
except ImportError:
|
||||
pytest.skip("Need gzip and bz2 to run this test")
|
||||
|
||||
data = """1111111111
|
||||
2222222222
|
||||
3333333333""".strip()
|
||||
widths = [5, 5]
|
||||
names = ['one', 'two']
|
||||
expected = read_fwf(StringIO(data), widths=widths, names=names)
|
||||
if compat.PY3:
|
||||
data = bytes(data, encoding='utf-8')
|
||||
comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)]
|
||||
for comp_name, compresser in comps:
|
||||
with tm.ensure_clean() as path:
|
||||
tmp = compresser(path, mode='wb')
|
||||
tmp.write(data)
|
||||
tmp.close()
|
||||
result = read_fwf(path, widths=widths, names=names,
|
||||
compression=comp_name)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_comment_fwf(self):
|
||||
data = """
|
||||
1 2. 4 #hello world
|
||||
5 NaN 10.0
|
||||
"""
|
||||
expected = np.array([[1, 2., 4],
|
||||
[5, np.nan, 10.]])
|
||||
df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)],
|
||||
comment='#')
|
||||
tm.assert_almost_equal(df.values, expected)
|
||||
|
||||
def test_1000_fwf(self):
|
||||
data = """
|
||||
1 2,334.0 5
|
||||
10 13 10.
|
||||
"""
|
||||
expected = np.array([[1, 2334., 5],
|
||||
[10, 13, 10]])
|
||||
df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)],
|
||||
thousands=',')
|
||||
tm.assert_almost_equal(df.values, expected)
|
||||
|
||||
def test_bool_header_arg(self):
|
||||
# see gh-6114
|
||||
data = """\
|
||||
MyColumn
|
||||
a
|
||||
b
|
||||
a
|
||||
b"""
|
||||
for arg in [True, False]:
|
||||
with pytest.raises(TypeError):
|
||||
read_fwf(StringIO(data), header=arg)
|
||||
|
||||
def test_full_file(self):
|
||||
# File with all values
|
||||
test = """index A B C
|
||||
2000-01-03T00:00:00 0.980268513777 3 foo
|
||||
2000-01-04T00:00:00 1.04791624281 -4 bar
|
||||
2000-01-05T00:00:00 0.498580885705 73 baz
|
||||
2000-01-06T00:00:00 1.12020151869 1 foo
|
||||
2000-01-07T00:00:00 0.487094399463 0 bar
|
||||
2000-01-10T00:00:00 0.836648671666 2 baz
|
||||
2000-01-11T00:00:00 0.157160753327 34 foo"""
|
||||
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs)
|
||||
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
|
||||
|
||||
def test_full_file_with_missing(self):
|
||||
# File with missing values
|
||||
test = """index A B C
|
||||
2000-01-03T00:00:00 0.980268513777 3 foo
|
||||
2000-01-04T00:00:00 1.04791624281 -4 bar
|
||||
0.498580885705 73 baz
|
||||
2000-01-06T00:00:00 1.12020151869 1 foo
|
||||
2000-01-07T00:00:00 0 bar
|
||||
2000-01-10T00:00:00 0.836648671666 2 baz
|
||||
34"""
|
||||
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs)
|
||||
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
|
||||
|
||||
def test_full_file_with_spaces(self):
|
||||
# File with spaces in columns
|
||||
test = """
|
||||
Account Name Balance CreditLimit AccountCreated
|
||||
101 Keanu Reeves 9315.45 10000.00 1/17/1998
|
||||
312 Gerard Butler 90.00 1000.00 8/6/2003
|
||||
868 Jennifer Love Hewitt 0 17000.00 5/25/1985
|
||||
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
|
||||
317 Bill Murray 789.65 5000.00 2/5/2007
|
||||
""".strip('\r\n')
|
||||
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs)
|
||||
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
|
||||
|
||||
def test_full_file_with_spaces_and_missing(self):
|
||||
# File with spaces and missing values in columns
|
||||
test = """
|
||||
Account Name Balance CreditLimit AccountCreated
|
||||
101 10000.00 1/17/1998
|
||||
312 Gerard Butler 90.00 1000.00 8/6/2003
|
||||
868 5/25/1985
|
||||
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
|
||||
317 Bill Murray 789.65
|
||||
""".strip('\r\n')
|
||||
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs)
|
||||
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
|
||||
|
||||
def test_messed_up_data(self):
|
||||
# Completely messed up file
|
||||
test = """
|
||||
Account Name Balance Credit Limit Account Created
|
||||
101 10000.00 1/17/1998
|
||||
312 Gerard Butler 90.00 1000.00
|
||||
|
||||
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
|
||||
317 Bill Murray 789.65
|
||||
""".strip('\r\n')
|
||||
colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs)
|
||||
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
|
||||
|
||||
def test_multiple_delimiters(self):
|
||||
test = r"""
|
||||
col1~~~~~col2 col3++++++++++++++++++col4
|
||||
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
|
||||
33+++122.33\\\bar.........Gerard Butler
|
||||
++44~~~~12.01 baz~~Jennifer Love Hewitt
|
||||
~~55 11+++foo++++Jada Pinkett-Smith
|
||||
..66++++++.03~~~bar Bill Murray
|
||||
""".strip('\r\n')
|
||||
colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
|
||||
expected = read_fwf(StringIO(test), colspecs=colspecs,
|
||||
delimiter=' +~.\\')
|
||||
tm.assert_frame_equal(expected, read_fwf(StringIO(test),
|
||||
delimiter=' +~.\\'))
|
||||
|
||||
def test_variable_width_unicode(self):
|
||||
if not compat.PY3:
|
||||
pytest.skip(
|
||||
'Bytes-related test - only needs to work on Python 3')
|
||||
test = """
|
||||
שלום שלום
|
||||
ום שלל
|
||||
של ום
|
||||
""".strip('\r\n')
|
||||
expected = read_fwf(BytesIO(test.encode('utf8')),
|
||||
colspecs=[(0, 4), (5, 9)],
|
||||
header=None, encoding='utf8')
|
||||
tm.assert_frame_equal(expected, read_fwf(
|
||||
BytesIO(test.encode('utf8')), header=None, encoding='utf8'))
|
||||
|
||||
def test_dtype(self):
|
||||
data = """ a b c
|
||||
1 2 3.2
|
||||
3 4 5.2
|
||||
"""
|
||||
colspecs = [(0, 5), (5, 10), (10, None)]
|
||||
result = pd.read_fwf(StringIO(data), colspecs=colspecs)
|
||||
expected = pd.DataFrame({
|
||||
'a': [1, 3],
|
||||
'b': [2, 4],
|
||||
'c': [3.2, 5.2]}, columns=['a', 'b', 'c'])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected['a'] = expected['a'].astype('float64')
|
||||
expected['b'] = expected['b'].astype(str)
|
||||
expected['c'] = expected['c'].astype('int32')
|
||||
result = pd.read_fwf(StringIO(data), colspecs=colspecs,
|
||||
dtype={'a': 'float64', 'b': str, 'c': 'int32'})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_skiprows_inference(self):
|
||||
# GH11256
|
||||
test = """
|
||||
Text contained in the file header
|
||||
|
||||
DataCol1 DataCol2
|
||||
0.0 1.0
|
||||
101.6 956.1
|
||||
""".strip()
|
||||
expected = read_csv(StringIO(test), skiprows=2,
|
||||
delim_whitespace=True)
|
||||
tm.assert_frame_equal(expected, read_fwf(
|
||||
StringIO(test), skiprows=2))
|
||||
|
||||
def test_skiprows_by_index_inference(self):
|
||||
test = """
|
||||
To be skipped
|
||||
Not To Be Skipped
|
||||
Once more to be skipped
|
||||
123 34 8 123
|
||||
456 78 9 456
|
||||
""".strip()
|
||||
|
||||
expected = read_csv(StringIO(test), skiprows=[0, 2],
|
||||
delim_whitespace=True)
|
||||
tm.assert_frame_equal(expected, read_fwf(
|
||||
StringIO(test), skiprows=[0, 2]))
|
||||
|
||||
def test_skiprows_inference_empty(self):
|
||||
test = """
|
||||
AA BBB C
|
||||
12 345 6
|
||||
78 901 2
|
||||
""".strip()
|
||||
|
||||
with pytest.raises(EmptyDataError):
|
||||
read_fwf(StringIO(test), skiprows=3)
|
||||
|
||||
def test_whitespace_preservation(self):
|
||||
# Addresses Issue #16772
|
||||
data_expected = """
|
||||
a ,bbb
|
||||
cc,dd """
|
||||
expected = read_csv(StringIO(data_expected), header=None)
|
||||
|
||||
test_data = """
|
||||
a bbb
|
||||
ccdd """
|
||||
result = read_fwf(StringIO(test_data), widths=[3, 3],
|
||||
header=None, skiprows=[0], delimiter="\n\t")
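# (added note) limiting the filler characters to '\n' and '\t' means plain
# spaces are kept as data, which is what preserves the whitespace here.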
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_default_delimiter(self):
|
||||
data_expected = """
|
||||
a,bbb
|
||||
cc,dd"""
|
||||
expected = read_csv(StringIO(data_expected), header=None)
|
||||
|
||||
test_data = """
|
||||
a \tbbb
|
||||
cc\tdd """
|
||||
result = read_fwf(StringIO(test_data), widths=[3, 3],
|
||||
header=None, skiprows=[0])
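# (added note, assuming the default fillers are whitespace) with no
# delimiter given, read_fwf strips spaces and tabs from each field, so the
# '\t' padding above disappears from the parsed values.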
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,354 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Tests the TextReader class in parsers.pyx, which
|
||||
is integral to the C engine in parsers.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.compat import StringIO, BytesIO, map
|
||||
from pandas import compat
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
from numpy import nan
|
||||
import numpy as np
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.io.parsers import (read_csv, TextFileReader)
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas._libs.parsers import TextReader
|
||||
import pandas._libs.parsers as parser
|
||||
|
||||
|
||||
class TestTextReader(object):
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup_method(self, datapath):
|
||||
self.dirpath = datapath('io', 'parser', 'data')
|
||||
self.csv1 = os.path.join(self.dirpath, 'test1.csv')
|
||||
self.csv2 = os.path.join(self.dirpath, 'test2.csv')
|
||||
self.xls1 = os.path.join(self.dirpath, 'test.xls')
|
||||
|
||||
def test_file_handle(self):
|
||||
with open(self.csv1, 'rb') as f:
|
||||
reader = TextReader(f)
|
||||
reader.read()
|
||||
|
||||
def test_string_filename(self):
|
||||
reader = TextReader(self.csv1, header=None)
|
||||
reader.read()
|
||||
|
||||
def test_file_handle_mmap(self):
|
||||
with open(self.csv1, 'rb') as f:
|
||||
reader = TextReader(f, memory_map=True, header=None)
|
||||
reader.read()
|
||||
|
||||
def test_StringIO(self):
|
||||
with open(self.csv1, 'rb') as f:
|
||||
text = f.read()
|
||||
src = BytesIO(text)
|
||||
reader = TextReader(src, header=None)
|
||||
reader.read()
|
||||
|
||||
def test_string_factorize(self):
|
||||
# should this be optional?
|
||||
data = 'a\nb\na\nb\na'
|
||||
reader = TextReader(StringIO(data), header=None)
|
||||
result = reader.read()
|
||||
assert len(set(map(id, result[0]))) == 2
|
||||
|
||||
def test_skipinitialspace(self):
|
||||
data = ('a, b\n'
|
||||
'a, b\n'
|
||||
'a, b\n'
|
||||
'a, b')
|
||||
|
||||
reader = TextReader(StringIO(data), skipinitialspace=True,
|
||||
header=None)
|
||||
result = reader.read()
|
||||
|
||||
tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a', 'a'],
|
||||
dtype=np.object_))
|
||||
tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b', 'b'],
|
||||
dtype=np.object_))
|
||||
|
||||
def test_parse_booleans(self):
|
||||
data = 'True\nFalse\nTrue\nTrue'
|
||||
|
||||
reader = TextReader(StringIO(data), header=None)
|
||||
result = reader.read()
|
||||
|
||||
assert result[0].dtype == np.bool_
|
||||
|
||||
def test_delimit_whitespace(self):
|
||||
data = 'a b\na\t\t "b"\n"a"\t \t b'
|
||||
|
||||
reader = TextReader(StringIO(data), delim_whitespace=True,
|
||||
header=None)
|
||||
result = reader.read()
|
||||
|
||||
tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a'],
|
||||
dtype=np.object_))
|
||||
tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b'],
|
||||
dtype=np.object_))
|
||||
|
||||
def test_embedded_newline(self):
|
||||
data = 'a\n"hello\nthere"\nthis'
|
||||
|
||||
reader = TextReader(StringIO(data), header=None)
|
||||
result = reader.read()
|
||||
|
||||
expected = np.array(['a', 'hello\nthere', 'this'], dtype=np.object_)
|
||||
tm.assert_numpy_array_equal(result[0], expected)
|
||||
|
||||
def test_euro_decimal(self):
|
||||
data = '12345,67\n345,678'
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=':',
|
||||
decimal=',', header=None)
|
||||
result = reader.read()
|
||||
|
||||
expected = np.array([12345.67, 345.678])
|
||||
tm.assert_almost_equal(result[0], expected)
|
||||
|
||||
def test_integer_thousands(self):
|
||||
data = '123,456\n12,500'
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=':',
|
||||
thousands=',', header=None)
|
||||
result = reader.read()
|
||||
|
||||
expected = np.array([123456, 12500], dtype=np.int64)
|
||||
tm.assert_almost_equal(result[0], expected)
|
||||
|
||||
def test_integer_thousands_alt(self):
|
||||
data = '123.456\n12.500'
|
||||
|
||||
reader = TextFileReader(StringIO(data), delimiter=':',
|
||||
thousands='.', header=None)
|
||||
result = reader.read()
|
||||
|
||||
expected = DataFrame([123456, 12500])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@tm.capture_stderr
|
||||
def test_skip_bad_lines(self):
|
||||
# too many lines, see #2430 for why
|
||||
data = ('a:b:c\n'
|
||||
'd:e:f\n'
|
||||
'g:h:i\n'
|
||||
'j:k:l:m\n'
|
||||
'l:m:n\n'
|
||||
'o:p:q:r')
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=':',
|
||||
header=None)
|
||||
pytest.raises(parser.ParserError, reader.read)
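# (added note) lines 4 and 6 carry four fields instead of three, so the
# default error_bad_lines=True path above raises ParserError.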
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=':',
|
||||
header=None,
|
||||
error_bad_lines=False,
|
||||
warn_bad_lines=False)
|
||||
result = reader.read()
|
||||
expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object),
|
||||
1: np.array(['b', 'e', 'h', 'm'], dtype=object),
|
||||
2: np.array(['c', 'f', 'i', 'n'], dtype=object)}
|
||||
assert_array_dicts_equal(result, expected)
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=':',
|
||||
header=None,
|
||||
error_bad_lines=False,
|
||||
warn_bad_lines=True)
|
||||
reader.read()
|
||||
val = sys.stderr.getvalue()
|
||||
|
||||
assert 'Skipping line 4' in val
|
||||
assert 'Skipping line 6' in val
|
||||
|
||||
def test_header_not_enough_lines(self):
|
||||
data = ('skip this\n'
|
||||
'skip this\n'
|
||||
'a,b,c\n'
|
||||
'1,2,3\n'
|
||||
'4,5,6')
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=',', header=2)
|
||||
header = reader.header
|
||||
expected = [['a', 'b', 'c']]
|
||||
assert header == expected
|
||||
|
||||
recs = reader.read()
|
||||
expected = {0: np.array([1, 4], dtype=np.int64),
|
||||
1: np.array([2, 5], dtype=np.int64),
|
||||
2: np.array([3, 6], dtype=np.int64)}
|
||||
assert_array_dicts_equal(recs, expected)
|
||||
|
||||
def test_escapechar(self):
|
||||
data = ('\\"hello world\"\n'
|
||||
'\\"hello world\"\n'
|
||||
'\\"hello world\"')
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=',', header=None,
|
||||
escapechar='\\')
|
||||
result = reader.read()
|
||||
expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
|
||||
assert_array_dicts_equal(result, expected)
|
||||
|
||||
def test_eof_has_eol(self):
|
||||
# handling of new line at EOF
|
||||
pass
|
||||
|
||||
def test_na_substitution(self):
|
||||
pass
|
||||
|
||||
def test_numpy_string_dtype(self):
|
||||
data = """\
|
||||
a,1
|
||||
aa,2
|
||||
aaa,3
|
||||
aaaa,4
|
||||
aaaaa,5"""
|
||||
|
||||
def _make_reader(**kwds):
|
||||
return TextReader(StringIO(data), delimiter=',', header=None,
|
||||
**kwds)
|
||||
|
||||
reader = _make_reader(dtype='S5,i4')
|
||||
result = reader.read()
|
||||
|
||||
assert result[0].dtype == 'S5'
|
||||
|
||||
ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], dtype='S5')
|
||||
assert (result[0] == ex_values).all()
|
||||
assert result[1].dtype == 'i4'
|
||||
|
||||
reader = _make_reader(dtype='S4')
|
||||
result = reader.read()
|
||||
assert result[0].dtype == 'S4'
|
||||
ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
|
||||
assert (result[0] == ex_values).all()
|
||||
assert result[1].dtype == 'S4'
|
||||
|
||||
def test_pass_dtype(self):
|
||||
data = """\
|
||||
one,two
|
||||
1,a
|
||||
2,b
|
||||
3,c
|
||||
4,d"""
|
||||
|
||||
def _make_reader(**kwds):
|
||||
return TextReader(StringIO(data), delimiter=',', **kwds)
|
||||
|
||||
reader = _make_reader(dtype={'one': 'u1', 1: 'S1'})
|
||||
result = reader.read()
|
||||
assert result[0].dtype == 'u1'
|
||||
assert result[1].dtype == 'S1'
|
||||
|
||||
reader = _make_reader(dtype={'one': np.uint8, 1: object})
|
||||
result = reader.read()
|
||||
assert result[0].dtype == 'u1'
|
||||
assert result[1].dtype == 'O'
|
||||
|
||||
reader = _make_reader(dtype={'one': np.dtype('u1'),
|
||||
1: np.dtype('O')})
|
||||
result = reader.read()
|
||||
assert result[0].dtype == 'u1'
|
||||
assert result[1].dtype == 'O'
|
||||
|
||||
def test_usecols(self):
|
||||
data = """\
|
||||
a,b,c
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
10,11,12"""
|
||||
|
||||
def _make_reader(**kwds):
|
||||
return TextReader(StringIO(data), delimiter=',', **kwds)
|
||||
|
||||
reader = _make_reader(usecols=(1, 2))
|
||||
result = reader.read()
|
||||
|
||||
exp = _make_reader().read()
|
||||
assert len(result) == 2
|
||||
assert (result[1] == exp[1]).all()
|
||||
assert (result[2] == exp[2]).all()
|
||||
|
||||
def test_cr_delimited(self):
|
||||
def _test(text, **kwargs):
|
||||
nice_text = text.replace('\r', '\r\n')
|
||||
result = TextReader(StringIO(text), **kwargs).read()
|
||||
expected = TextReader(StringIO(nice_text), **kwargs).read()
|
||||
assert_array_dicts_equal(result, expected)
|
||||
|
||||
data = 'a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12'
|
||||
_test(data, delimiter=',')
|
||||
|
||||
data = 'a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12'
|
||||
_test(data, delim_whitespace=True)
|
||||
|
||||
data = 'a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12'
|
||||
_test(data, delimiter=',')
|
||||
|
||||
sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r'
|
||||
'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r'
|
||||
',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0')
|
||||
_test(sample, delimiter=',')
|
||||
|
||||
data = 'A B C\r 2 3\r4 5 6'
|
||||
_test(data, delim_whitespace=True)
|
||||
|
||||
data = 'A B C\r2 3\r4 5 6'
|
||||
_test(data, delim_whitespace=True)
|
||||
|
||||
    def test_empty_field_eof(self):
        data = 'a,b,c\n1,2,3\n4,,'

        result = TextReader(StringIO(data), delimiter=',').read()

        expected = {0: np.array([1, 4], dtype=np.int64),
                    1: np.array(['2', ''], dtype=object),
                    2: np.array(['3', ''], dtype=object)}
        assert_array_dicts_equal(result, expected)

        # GH5664
        a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
        b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
                      columns=list('abcd'),
                      index=[1, 1])
        c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
                       [8, 9, 10, 11], [13, 14, nan, nan]],
                      columns=list('abcd'),
                      index=[0, 5, 7, 12])

        for _ in range(100):
            df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
                          names=['a'], engine='c')
            assert_frame_equal(df, a)

            df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2),
                          names=list("abcd"), engine='c')
            assert_frame_equal(df, b)

            df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
                          names=list('abcd'), engine='c')
            assert_frame_equal(df, c)
    def test_empty_csv_input(self):
        # GH14867
        df = read_csv(StringIO(), chunksize=20, header=None,
                      names=['a', 'b', 'c'])
        assert isinstance(df, TextFileReader)


def assert_array_dicts_equal(left, right):
    for k, v in compat.iteritems(left):
        tm.assert_numpy_array_equal(np.asarray(v),
                                    np.asarray(right[k]))
@@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-

"""
Tests that features that are currently unsupported in
either the Python or C parser are actually enforced
and are clearly communicated to the user.

Ultimately, the goal is to remove test cases from this
test suite as new feature support is added to the parsers.
"""

import pandas.io.parsers as parsers
import pandas.util.testing as tm

from pandas.compat import StringIO
from pandas.errors import ParserError
from pandas.io.parsers import read_csv, read_table

import pytest


@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
    return request.param
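# Illustrative sketch (not part of the original test module): as the module
# docstring above says, unsupported options should be rejected loudly. The
# helper below is hypothetical and only shows the kind of error a user sees;
# it is not collected as a test.
def _example_unsupported_option_error():
    try:
        # The C engine cannot honour skipfooter, so read_csv raises a
        # ValueError up front rather than silently ignoring the option.
        read_csv(StringIO('a b c\n1 2 3'), engine='c', skipfooter=1)
    except ValueError as err:
        # e.g. a message along the lines of
        # "the 'c' engine does not support skipfooter"
        return str(err)
    return None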
class TestUnsupportedFeatures(object):

    def test_mangle_dupe_cols_false(self):
        # see gh-12935
        data = 'a b c\n1 2 3'
        msg = 'is not supported'

        for engine in ('c', 'python'):
            with tm.assert_raises_regex(ValueError, msg):
                read_csv(StringIO(data), engine=engine,
                         mangle_dupe_cols=False)
    def test_c_engine(self):
        # see gh-6607
        data = 'a b c\n1 2 3'
        msg = 'does not support'

        # specify C engine with unsupported options (raise)
        with tm.assert_raises_regex(ValueError, msg):
            read_table(StringIO(data), engine='c',
                       sep=None, delim_whitespace=False)
        with tm.assert_raises_regex(ValueError, msg):
            read_table(StringIO(data), engine='c', sep=r'\s')
        with tm.assert_raises_regex(ValueError, msg):
            read_table(StringIO(data), engine='c', quotechar=chr(128))
        with tm.assert_raises_regex(ValueError, msg):
            read_table(StringIO(data), engine='c', skipfooter=1)

        # specify C-unsupported options without python-unsupported options
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_table(StringIO(data), sep=None, delim_whitespace=False)
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_table(StringIO(data), quotechar=chr(128))
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_table(StringIO(data), sep=r'\s')
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_table(StringIO(data), skipfooter=1)

        text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
        msg = 'Error tokenizing data'

        with tm.assert_raises_regex(ParserError, msg):
            read_table(StringIO(text), sep='\\s+')
        with tm.assert_raises_regex(ParserError, msg):
            read_table(StringIO(text), engine='c', sep='\\s+')

        msg = "Only length-1 thousands markers supported"
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        with tm.assert_raises_regex(ValueError, msg):
            read_csv(StringIO(data), thousands=',,')
        with tm.assert_raises_regex(ValueError, msg):
            read_csv(StringIO(data), thousands='')

        msg = "Only length-1 line terminators supported"
        data = 'a,b,c~~1,2,3~~4,5,6'
        with tm.assert_raises_regex(ValueError, msg):
            read_csv(StringIO(data), lineterminator='~~')
    def test_python_engine(self, python_engine):
        from pandas.io.parsers import _python_unsupported as py_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for default in py_unsupported:
            msg = ('The %r option is not supported '
                   'with the %r engine' % (default, python_engine))

            kwargs = {default: object()}
            with tm.assert_raises_regex(ValueError, msg):
                read_csv(StringIO(data), engine=python_engine, **kwargs)

    def test_python_engine_file_no_next(self, python_engine):
        # see gh-16530
        class NoNextBuffer(object):
            def __init__(self, csv_data):
                self.data = csv_data

            def __iter__(self):
                return self

            def read(self):
                return self.data

        data = "a\n1"
        msg = "The 'python' engine cannot iterate"

        with tm.assert_raises_regex(ValueError, msg):
            read_csv(NoNextBuffer(data), engine=python_engine)
class TestDeprecatedFeatures(object):

    @pytest.mark.parametrize("engine", ["c", "python"])
    @pytest.mark.parametrize("kwargs", [{"tupleize_cols": True},
                                        {"tupleize_cols": False}])
    def test_deprecated_args(self, engine, kwargs):
        data = "1,2,3"
        arg, _ = list(kwargs.items())[0]

        with tm.assert_produces_warning(
                FutureWarning, check_stacklevel=False):
            read_csv(StringIO(data), engine=engine, **kwargs)
@@ -0,0 +1,549 @@
# -*- coding: utf-8 -*-

"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""

import pytest

import numpy as np
import pandas.util.testing as tm

from pandas import DataFrame, Index
from pandas._libs.tslib import Timestamp
from pandas.compat import StringIO
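# Illustrative sketch (not part of the original test module): the docstring
# above covers the usecols argument, which accepts positional indices,
# column labels, or a callable applied to each column name. This helper is
# hypothetical and only demonstrates those three accepted forms; it is not
# collected as a test.
def _example_usecols_forms():
    import pandas as pd

    data = 'a,b,c\n1,2,3\n4,5,6'

    by_position = pd.read_csv(StringIO(data), usecols=[0, 2])
    by_label = pd.read_csv(StringIO(data), usecols=['a', 'c'])
    by_callable = pd.read_csv(StringIO(data), usecols=lambda col: col != 'b')

    # All three selections yield the same two-column frame.
    return by_position, by_label, by_callable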
class UsecolsTests(object):
    msg_validate_usecols_arg = ("'usecols' must either be list-like of all "
                                "strings, all unicode, all integers or a "
                                "callable.")
    msg_validate_usecols_names = ("Usecols do not match columns, columns "
                                  "expected but not found: {0}")

    def test_raise_on_mixed_dtype_usecols(self):
        # See gh-12678
        data = """a,b,c
1000,2000,3000
4000,5000,6000
"""

        usecols = [0, 'b', 2]

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(data), usecols=usecols)
    def test_usecols(self):
        data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

        result = self.read_csv(StringIO(data), usecols=(1, 2))
        result2 = self.read_csv(StringIO(data), usecols=('b', 'c'))
        exp = self.read_csv(StringIO(data))

        assert len(result.columns) == 2
        assert (result['b'] == exp['b']).all()
        assert (result['c'] == exp['c']).all()

        tm.assert_frame_equal(result, result2)

        result = self.read_csv(StringIO(data), usecols=[1, 2], header=0,
                               names=['foo', 'bar'])
        expected = self.read_csv(StringIO(data), usecols=[1, 2])
        expected.columns = ['foo', 'bar']
        tm.assert_frame_equal(result, expected)

        data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
        result = self.read_csv(StringIO(data), names=['b', 'c'],
                               header=None, usecols=[1, 2])

        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                 header=None)
        expected = expected[['b', 'c']]
        tm.assert_frame_equal(result, expected)

        result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                header=None, usecols=['b', 'c'])
        tm.assert_frame_equal(result2, result)

        # see gh-5766
        result = self.read_csv(StringIO(data), names=['a', 'b'],
                               header=None, usecols=[0, 1])

        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                 header=None)
        expected = expected[['a', 'b']]
        tm.assert_frame_equal(result, expected)

        # length conflict, passed names and usecols disagree
        pytest.raises(ValueError, self.read_csv, StringIO(data),
                      names=['a', 'b'], usecols=[1], header=None)
    def test_usecols_single_string(self):
        # GH 20558
        data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000
"""

        usecols = 'foo'

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(data), usecols=usecols)

    def test_usecols_index_col_False(self):
        # see gh-9082
        s = "a,b,c,d\n1,2,3,4\n5,6,7,8"
        s_malformed = "a,b,c,d\n1,2,3,4,\n5,6,7,8,"
        cols = ['a', 'c', 'd']
        expected = DataFrame({'a': [1, 5], 'c': [3, 7], 'd': [4, 8]})
        df = self.read_csv(StringIO(s), usecols=cols, index_col=False)
        tm.assert_frame_equal(expected, df)
        df = self.read_csv(StringIO(s_malformed),
                           usecols=cols, index_col=False)
        tm.assert_frame_equal(expected, df)
    def test_usecols_index_col_conflict(self):
        # see gh-4201: test that index_col as integer reflects usecols
        data = 'a,b,c,d\nA,a,1,one\nB,b,2,two'
        expected = DataFrame({'c': [1, 2]}, index=Index(
            ['a', 'b'], name='b'))

        df = self.read_csv(StringIO(data), usecols=['b', 'c'],
                           index_col=0)
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=['b', 'c'],
                           index_col='b')
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=[1, 2],
                           index_col='b')
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=[1, 2],
                           index_col=0)
        tm.assert_frame_equal(expected, df)

        expected = DataFrame(
            {'b': ['a', 'b'], 'c': [1, 2], 'd': ('one', 'two')})
        expected = expected.set_index(['b', 'c'])
        df = self.read_csv(StringIO(data), usecols=['b', 'c', 'd'],
                           index_col=['b', 'c'])
        tm.assert_frame_equal(expected, df)
    def test_usecols_implicit_index_col(self):
        # see gh-2654
        data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'

        result = self.read_csv(StringIO(data), usecols=['a', 'b'])
        expected = DataFrame({'a': ['apple', 'orange'],
                              'b': ['bat', 'cow']}, index=[4, 8])

        tm.assert_frame_equal(result, expected)

    def test_usecols_regex_sep(self):
        # see gh-2733
        data = 'a b c\n4 apple bat 5.7\n8 orange cow 10'

        df = self.read_csv(StringIO(data), sep=r'\s+', usecols=('a', 'b'))

        expected = DataFrame({'a': ['apple', 'orange'],
                              'b': ['bat', 'cow']}, index=[4, 8])
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_whitespace(self):
        data = 'a b c\n4 apple bat 5.7\n8 orange cow 10'

        result = self.read_csv(StringIO(data), delim_whitespace=True,
                               usecols=('a', 'b'))
        expected = DataFrame({'a': ['apple', 'orange'],
                              'b': ['bat', 'cow']}, index=[4, 8])

        tm.assert_frame_equal(result, expected)
    def test_usecols_with_integer_like_header(self):
        data = """2,0,1
1000,2000,3000
4000,5000,6000
"""

        usecols = [0, 1]  # column selection by index
        expected = DataFrame(data=[[1000, 2000],
                                   [4000, 5000]],
                             columns=['2', '0'])
        df = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(df, expected)

        usecols = ['0', '1']  # column selection by name
        expected = DataFrame(data=[[2000, 3000],
                                   [5000, 6000]],
                             columns=['0', '1'])
        df = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(df, expected)
    def test_usecols_with_parse_dates(self):
        # See gh-9755
        s = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
        parse_dates = [[1, 2]]

        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        df = self.read_csv(StringIO(s), usecols=[0, 2, 3],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(s), usecols=[3, 0, 2],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        # See gh-13604
        s = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65
"""
        parse_dates = [0]
        names = ['date', 'values']
        usecols = names[:]

        index = Index([Timestamp('2008-02-07 09:40'),
                       Timestamp('2008-02-07 09:50'),
                       Timestamp('2008-02-07 10:00')],
                      name='date')
        cols = {'values': [1032.43, 1042.54, 1051.65]}
        expected = DataFrame(cols, index=index)

        df = self.read_csv(StringIO(s), parse_dates=parse_dates, index_col=0,
                           usecols=usecols, header=None, names=names)
        tm.assert_frame_equal(df, expected)

        # See gh-14792
        s = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""
        parse_dates = [0]
        usecols = list('abcdefghij')
        cols = {'a': Timestamp('2016-09-21'),
                'b': [1], 'c': [1], 'd': [2],
                'e': [3], 'f': [4], 'g': [5],
                'h': [6], 'i': [7], 'j': [8]}
        expected = DataFrame(cols, columns=usecols)
        df = self.read_csv(StringIO(s), usecols=usecols,
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
        parse_dates = [[0, 1]]
        usecols = list('abcdefghij')
        cols = {'a_b': '2016/09/21 1',
                'c': [1], 'd': [2], 'e': [3], 'f': [4],
                'g': [5], 'h': [6], 'i': [7], 'j': [8]}
        expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
        df = self.read_csv(StringIO(s), usecols=usecols,
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)
    def test_usecols_with_parse_dates_and_full_names(self):
        # See gh-9755
        s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
        parse_dates = [[1, 2]]
        names = list('abcde')

        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[0, 2, 3],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[3, 0, 2],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_parse_dates_and_usecol_names(self):
        # See gh-9755
        s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
        parse_dates = [[1, 2]]
        names = list('acd')

        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[0, 2, 3],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[3, 0, 2],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)
    def test_usecols_with_unicode_strings(self):
        # see gh-13219

        s = '''AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        data = {
            'AAA': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'BBB': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=[u'AAA', u'BBB'])
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_single_byte_unicode_strings(self):
        # see gh-13219

        s = '''A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        data = {
            'A': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'B': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=[u'A', u'B'])
        tm.assert_frame_equal(df, expected)
    def test_usecols_with_mixed_encoding_strings(self):
        s = '''AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB'])

    def test_usecols_with_multibyte_characters(self):
        s = '''あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''
        data = {
            'あああ': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'いい': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=['あああ', 'いい'])
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_multibyte_unicode_characters(self):
        pytest.skip('TODO: see gh-13253')

        s = '''あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''
        data = {
            'あああ': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'いい': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい'])
        tm.assert_frame_equal(df, expected)
    def test_empty_usecols(self):
        # should not raise
        data = 'a,b,c\n1,2,3\n4,5,6'
        expected = DataFrame()
        result = self.read_csv(StringIO(data), usecols=set([]))
        tm.assert_frame_equal(result, expected)

    def test_np_array_usecols(self):
        # See gh-12546
        data = 'a,b,c\n1,2,3'
        usecols = np.array(['a', 'b'])

        expected = DataFrame([[1, 2]], columns=usecols)
        result = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(result, expected)

    def test_callable_usecols(self):
        # See gh-14154
        s = '''AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        data = {
            'AaA': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'bBb': {0: 8, 1: 2, 2: 7},
            'ddd': {0: 'a', 1: 'b', 2: 'a'}
        }
        expected = DataFrame(data)
        df = self.read_csv(StringIO(s), usecols=lambda x:
                           x.upper() in ['AAA', 'BBB', 'DDD'])
        tm.assert_frame_equal(df, expected)

        # Check that a callable returning only False returns
        # an empty DataFrame
        expected = DataFrame()
        df = self.read_csv(StringIO(s), usecols=lambda x: False)
        tm.assert_frame_equal(df, expected)
    def test_incomplete_first_row(self):
        # see gh-6710
        data = '1,2\n1,2,3'
        names = ['a', 'b', 'c']
        expected = DataFrame({'a': [1, 1],
                              'c': [np.nan, 3]})

        usecols = ['a', 'c']
        df = self.read_csv(StringIO(data), names=names, usecols=usecols)
        tm.assert_frame_equal(df, expected)

        usecols = lambda x: x in ['a', 'c']
        df = self.read_csv(StringIO(data), names=names, usecols=usecols)
        tm.assert_frame_equal(df, expected)

    def test_uneven_length_cols(self):
        # see gh-8985
        usecols = [0, 1, 2]
        data = '19,29,39\n' * 2 + '10,20,30,40'
        expected = DataFrame([[19, 29, 39],
                              [19, 29, 39],
                              [10, 20, 30]])
        df = self.read_csv(StringIO(data), header=None, usecols=usecols)
        tm.assert_frame_equal(df, expected)

        # see gh-9549
        usecols = ['A', 'B', 'C']
        data = ('A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n'
                '1,2,3,,,1,\n1,2,3\n5,6,7')
        expected = DataFrame({'A': [1, 3, 1, 1, 1, 5],
                              'B': [2, 4, 2, 2, 2, 6],
                              'C': [3, 5, 4, 3, 3, 7]})
        df = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(df, expected)
    def test_raise_on_usecols_names_mismatch(self):
        # GH 14671
        data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'

        usecols = ['a', 'b', 'c', 'd']
        df = self.read_csv(StringIO(data), usecols=usecols)
        expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
                              'd': [4, 8]})
        tm.assert_frame_equal(df, expected)

        usecols = ['a', 'b', 'c', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), usecols=usecols)

        usecols = ['a', 'b', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), usecols=usecols)

        usecols = ['a', 'b', 'f', 'g']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\[('f', 'g'|'g', 'f')\]")):
            self.read_csv(StringIO(data), usecols=usecols)

        names = ['A', 'B', 'C', 'D']

        df = self.read_csv(StringIO(data), header=0, names=names)
        expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7],
                              'D': [4, 8]})
        tm.assert_frame_equal(df, expected)

        # TODO: https://github.com/pandas-dev/pandas/issues/16469
        # usecols = ['A','C']
        # df = self.read_csv(StringIO(data), header=0, names=names,
        #                    usecols=usecols)
        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
        # tm.assert_frame_equal(df, expected)
        #
        # usecols = [0,2]
        # df = self.read_csv(StringIO(data), header=0, names=names,
        #                    usecols=usecols)
        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
        # tm.assert_frame_equal(df, expected)

        usecols = ['A', 'B', 'C', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), header=0, names=names,
                          usecols=usecols)
        usecols = ['A', 'B', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), names=names, usecols=usecols)