started work on backend

2019-01-21 17:36:00 +01:00
parent a1a8bca34b
commit 9f9a7e4974
4032 changed files with 745079 additions and 0 deletions
@@ -0,0 +1,74 @@
+import pytest
+from pandas.io.parsers import read_table
+
+
+@pytest.fixture
+def tips_file(datapath):
+    """Return the ``datapath`` result for io/parser/data/tips.csv."""
+    path_parts = ('io', 'parser', 'data', 'tips.csv')
+    return datapath(*path_parts)
+
+
+@pytest.fixture
+def jsonl_file(datapath):
+    """Return the ``datapath`` result for io/parser/data/items.jsonl."""
+    path_parts = ('io', 'parser', 'data', 'items.jsonl')
+    return datapath(*path_parts)
+
+
+@pytest.fixture
+def salaries_table(datapath):
+    """Return io/parser/data/salaries.csv parsed with ``read_table``."""
+    salaries_path = datapath('io', 'parser', 'data', 'salaries.csv')
+    return read_table(salaries_path)
+
+
+@pytest.fixture
+def s3_resource(tips_file, jsonl_file):
+    """Fixture for mocking S3 interaction.
+
+    The primary bucket name is "pandas-test". The following datasets
+    are loaded.
+
+    - tips.csv
+    - tips.csv.gz
+    - tips.csv.bz2
+    - items.jsonl
+
+    A private bucket "cant_get_it" is also created. The boto3 s3 resource
+    is yielded by the fixture.
+
+    The requesting test is skipped when s3fs, boto3 or moto cannot be
+    imported, or when creating/populating the mocked buckets raises.
+    """
+    pytest.importorskip('s3fs')
+    boto3 = pytest.importorskip('boto3')
+    moto = pytest.importorskip('moto')
+
+    test_s3_files = [
+        ('tips.csv', tips_file),
+        ('tips.csv.gz', tips_file + '.gz'),
+        ('tips.csv.bz2', tips_file + '.bz2'),
+        ('items.jsonl', jsonl_file),
+    ]
+
+    def add_tips_files(bucket_name):
+        # ``conn`` is a closure variable; it is assigned below before this
+        # helper is called for the first time.
+        for s3_key, file_name in test_s3_files:
+            with open(file_name, 'rb') as f:
+                conn.Bucket(bucket_name).put_object(
+                    Key=s3_key,
+                    Body=f)
+
+    # Build the mock on its own so that ``s3`` is guaranteed to be bound by
+    # the time the ``finally`` clause below runs; previously a failure here
+    # surfaced as a NameError from ``s3.stop()`` instead of a skip.
+    try:
+        s3 = moto.mock_s3()
+    except Exception:
+        pytest.skip("failure to use s3 resource")
+
+    try:
+        s3.start()
+
+        # see gh-16135
+        bucket = 'pandas-test'
+        conn = boto3.resource("s3", region_name="us-east-1")
+
+        conn.create_bucket(Bucket=bucket)
+        add_tips_files(bucket)
+
+        conn.create_bucket(Bucket='cant_get_it', ACL='private')
+        add_tips_files('cant_get_it')
+        yield conn
+    except Exception:
+        # Only ordinary errors become a skip; KeyboardInterrupt, SystemExit
+        # and GeneratorExit propagate untouched.
+        pytest.skip("failure to use s3 resource")
+    finally:
+        s3.stop()
@@ -0,0 +1,186 @@
+import pytest
+
+from pandas.util import testing as tm
+from pandas.io.formats.css import CSSResolver, CSSWarning
+
+
+def assert_resolves(css, props, inherited=None):
+    """Assert that a fresh ``CSSResolver`` resolves ``css`` to ``props``.
+
+    ``inherited`` is forwarded unchanged to the resolver call.
+    """
+    resolved = CSSResolver()(css, inherited=inherited)
+    assert props == resolved
+
+
+def assert_same_resolution(css1, css2, inherited=None):
+    """Assert that ``css1`` and ``css2`` resolve to equal results.
+
+    Both strings go through the same ``CSSResolver`` instance, ``css1``
+    first, each with the same ``inherited`` argument.
+    """
+    resolver = CSSResolver()
+    first, second = [resolver(css, inherited=inherited)
+                     for css in (css1, css2)]
+    assert first == second
+
+
+@pytest.mark.parametrize('name,norm,abnorm', [
+    ('whitespace', 'hello: world; foo: bar',
+     ' \t hello \t :\n  world \n  ;  \n foo: \tbar\n\n'),
+    ('case', 'hello: world; foo: bar', 'Hello: WORLD; foO: bar'),
+    ('empty-decl', 'hello: world; foo: bar',
+     '; hello: world;; foo: bar;\n; ;'),
+    ('empty-list', '', ';'),
+])
+def test_css_parse_normalisation(name, norm, abnorm):
+    """Check that ``abnorm`` resolves exactly like its tidy form ``norm``.
+
+    ``name`` is not used in the body; it only labels the case in the
+    parameter list.
+    """
+    assert_same_resolution(norm, abnorm)
+
+
+@pytest.mark.parametrize(
+    'invalid_css,remainder', [
+        # No colon
+        ('hello-world', ''),
+        ('border-style: solid; hello-world', 'border-style: solid'),
+        ('border-style: solid; hello-world; font-weight: bold',
+         'border-style: solid; font-weight: bold'),
+        # Unclosed string fail
+        # (no cases are listed for this category)
+        # Invalid size
+        ('font-size: blah', 'font-size: 1em'),
+        ('font-size: 1a2b', 'font-size: 1em'),
+        ('font-size: 1e5pt', 'font-size: 1em'),
+        ('font-size: 1+6pt', 'font-size: 1em'),
+        ('font-size: 1unknownunit', 'font-size: 1em'),
+        ('font-size: 10', 'font-size: 1em'),
+        ('font-size: 10 pt', 'font-size: 1em'),
+    ])
+def test_css_parse_invalid(invalid_css, remainder):
+    """Check that ``invalid_css`` warns and resolves like ``remainder``.
+
+    The comparison must produce a ``CSSWarning``, and ``invalid_css`` must
+    resolve to the same result as the ``remainder`` declaration string.
+    """
+    with tm.assert_produces_warning(CSSWarning):
+        assert_same_resolution(invalid_css, remainder)
+
+    # TODO: we should be checking that in other cases no warnings are raised
+
+
+@pytest.mark.parametrize(
+    'shorthand,expansions',
+    [('margin', ['margin-top', 'margin-right',
+                 'margin-bottom', 'margin-left']),
+     ('padding', ['padding-top', 'padding-right',
+                  'padding-bottom', 'padding-left']),
+     ('border-width', ['border-top-width', 'border-right-width',
+                       'border-bottom-width', 'border-left-width']),
+     ('border-color', ['border-top-color', 'border-right-color',
+                       'border-bottom-color', 'border-left-color']),
+     ('border-style', ['border-top-style', 'border-right-style',
+                       'border-bottom-style', 'border-left-style']),
+     ])
+def test_css_side_shorthands(shorthand, expansions):
+    """Check how one to four shorthand values map onto the four sides.
+
+    ``expansions`` lists the per-side property names in the order
+    top, right, bottom, left. Five values must produce a ``CSSWarning``
+    and resolve to an empty dict.
+    """
+    top, right, bottom, left = expansions
+    sides = (top, right, bottom, left)
+
+    # (declaration template, expected values in top/right/bottom/left order)
+    cases = [
+        ('{shorthand}: 1pt', ('1pt', '1pt', '1pt', '1pt')),
+        ('{shorthand}: 1pt 4pt', ('1pt', '4pt', '1pt', '4pt')),
+        ('{shorthand}: 1pt 4pt 2pt', ('1pt', '4pt', '2pt', '4pt')),
+        ('{shorthand}: 1pt 4pt 2pt 0pt', ('1pt', '4pt', '2pt', '0pt')),
+    ]
+    for template, values in cases:
+        declaration = template.format(shorthand=shorthand)
+        assert_resolves(declaration, dict(zip(sides, values)))
+
+    too_many = '{shorthand}: 1pt 1pt 1pt 1pt 1pt'.format(shorthand=shorthand)
+    with tm.assert_produces_warning(CSSWarning):
+        assert_resolves(too_many, {})
+
+
+@pytest.mark.parametrize('style,inherited,equiv', [
+    ('margin: 1px; margin: 2px', '',
+     'margin: 2px'),
+    ('margin: 1px', 'margin: 2px',
+     'margin: 1px'),
+    ('margin: 1px; margin: inherit', 'margin: 2px',
+     'margin: 2px'),
+    ('margin: 1px; margin-top: 2px', '',
+     'margin-left: 1px; margin-right: 1px; ' +
+     'margin-bottom: 1px; margin-top: 2px'),
+    ('margin-top: 2px', 'margin: 1px',
+     'margin: 1px; margin-top: 2px'),
+    ('margin: 1px', 'margin-top: 2px',
+     'margin: 1px'),
+    ('margin: 1px; margin-top: inherit', 'margin: 2px',
+     'margin: 1px; margin-top: 2px'),
+])
+def test_css_precedence(style, inherited, equiv):
+    """Check ``style`` resolved on top of ``inherited`` against ``equiv``.
+
+    ``inherited`` is resolved first and its result is passed as the
+    inherited properties when resolving ``style``. The outcome must equal
+    the result of resolving ``equiv`` alone.
+    """
+    resolver = CSSResolver()
+    parent_props = resolver(inherited)
+    actual = resolver(style, inherited=parent_props)
+    expected = resolver(equiv)
+    assert actual == expected
+
+
+@pytest.mark.parametrize('style,equiv', [
+    ('margin: 1px; margin-top: inherit',
+     'margin-bottom: 1px; margin-right: 1px; margin-left: 1px'),
+    ('margin-top: inherit', ''),
+    ('margin-top: initial', ''),
+])
+def test_css_none_absent(style, equiv):
+    """Check that ``style`` resolves the same as ``equiv``.
+
+    Each ``equiv`` string omits the property that ``style`` sets to
+    ``inherit``/``initial``; no inherited properties are supplied.
+    """
+    assert_same_resolution(style, equiv)
+
+
+@pytest.mark.parametrize('size,resolved', [
+    ('xx-small', '6pt'),
+    ('x-small', '{pt:f}pt'.format(pt=7.5)),
+    ('small', '{pt:f}pt'.format(pt=9.6)),
+    ('medium', '12pt'),
+    ('large', '{pt:f}pt'.format(pt=13.5)),
+    ('x-large', '18pt'),
+    ('xx-large', '24pt'),
+
+    ('8px', '6pt'),
+    ('1.25pc', '15pt'),
+    ('.25in', '18pt'),
+    ('02.54cm', '72pt'),
+    ('25.4mm', '72pt'),
+    # this case used to be listed twice, producing a redundant duplicate
+    ('101.6q', '72pt'),
+])
+@pytest.mark.parametrize('relative_to',  # invariant to inherited size
+                         [None, '16pt'])
+def test_css_absolute_font_size(size, relative_to, resolved):
+    """Check that ``font-size: <size>`` resolves to ``resolved``.
+
+    Every size is checked both without inherited properties and with an
+    inherited ``font-size`` of ``relative_to``; the expected value is the
+    same in both runs.
+    """
+    inherited = None if relative_to is None else {'font-size': relative_to}
+    assert_resolves('font-size: {size}'.format(size=size),
+                    {'font-size': resolved}, inherited=inherited)
+
+
+@pytest.mark.parametrize('size,relative_to,resolved', [
+    ('1em', None, '12pt'),
+    ('1.0em', None, '12pt'),
+    ('1.25em', None, '15pt'),
+    ('1em', '16pt', '16pt'),
+    ('1.0em', '16pt', '16pt'),
+    ('1.25em', '16pt', '20pt'),
+    ('1rem', '16pt', '12pt'),
+    ('1.0rem', '16pt', '12pt'),
+    ('1.25rem', '16pt', '15pt'),
+    ('100%', None, '12pt'),
+    ('125%', None, '15pt'),
+    ('100%', '16pt', '16pt'),
+    ('125%', '16pt', '20pt'),
+    ('2ex', None, '12pt'),
+    ('2.0ex', None, '12pt'),
+    ('2.50ex', None, '15pt'),
+    ('inherit', '16pt', '16pt'),
+
+    ('smaller', None, '10pt'),
+    ('smaller', '18pt', '15pt'),
+    ('larger', None, '{pt:f}pt'.format(pt=14.4)),
+    ('larger', '15pt', '18pt'),
+])
+def test_css_relative_font_size(size, relative_to, resolved):
+    """Check that ``font-size: <size>`` resolves to ``resolved``.
+
+    When ``relative_to`` is not None it is supplied as the inherited
+    ``font-size``; otherwise no inherited properties are passed.
+    """
+    inherited = None if relative_to is None else {'font-size': relative_to}
+    declaration = 'font-size: {size}'.format(size=size)
+    expected = {'font-size': resolved}
+    assert_resolves(declaration, expected, inherited=inherited)
@@ -0,0 +1,193 @@
+import numpy as np
+import pandas as pd
+from pandas import DataFrame
+from pandas.compat import u
+import pandas.io.formats.format as fmt
+from pandas.util import testing as tm
+
+
+class TestEngFormatter(object):
+    """Tests for ``fmt.EngFormatter`` and ``fmt.set_eng_float_format``."""
+
+    def test_eng_float_formatter(self):
+        """Check ``DataFrame.to_string`` under three
+        ``set_eng_float_format`` configurations."""
+        df = DataFrame({'A': [1.41, 141., 14100, 1410000.]})
+
+        # Reset the display options in ``finally`` so that a failing
+        # assertion cannot leave the engineering format active for the
+        # tests that run afterwards.
+        try:
+            fmt.set_eng_float_format()
+            result = df.to_string()
+            expected = ('             A\n'
+                        '0    1.410E+00\n'
+                        '1  141.000E+00\n'
+                        '2   14.100E+03\n'
+                        '3    1.410E+06')
+            assert result == expected
+
+            fmt.set_eng_float_format(use_eng_prefix=True)
+            result = df.to_string()
+            expected = ('         A\n'
+                        '0    1.410\n'
+                        '1  141.000\n'
+                        '2  14.100k\n'
+                        '3   1.410M')
+            assert result == expected
+
+            fmt.set_eng_float_format(accuracy=0)
+            result = df.to_string()
+            expected = ('         A\n'
+                        '0    1E+00\n'
+                        '1  141E+00\n'
+                        '2   14E+03\n'
+                        '3    1E+06')
+            assert result == expected
+        finally:
+            tm.reset_display_options()
+
+    def compare(self, formatter, input, output):
+        """Assert that ``formatter(input)`` equals ``output``."""
+        formatted_input = formatter(input)
+        assert formatted_input == output
+
+    def compare_all(self, formatter, in_out):
+        """
+        Parameters:
+        -----------
+        formatter: EngFormatter under test
+        in_out: list of tuples. Each tuple = (number, expected_formatting)
+
+        It is tested if 'formatter(number) == expected_formatting'.
+        *number* should be >= 0 because formatter(-number) == fmt is also
+        tested. *fmt* is derived from *expected_formatting* by replacing
+        its first character with "-".
+        """
+        for input, output in in_out:
+            self.compare(formatter, input, output)
+            self.compare(formatter, -input, "-" + output[1:])
+
+    def test_exponents_with_eng_prefix(self):
+        """Format sqrt(2) scaled by 10**-24 .. 10**26 with SI prefixes."""
+        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
+        f = np.sqrt(2)
+        in_out = [
+            (f * 10 ** -24, " 1.414y"), (f * 10 ** -23, " 14.142y"),
+            (f * 10 ** -22, " 141.421y"), (f * 10 ** -21, " 1.414z"),
+            (f * 10 ** -20, " 14.142z"), (f * 10 ** -19, " 141.421z"),
+            (f * 10 ** -18, " 1.414a"), (f * 10 ** -17, " 14.142a"),
+            (f * 10 ** -16, " 141.421a"), (f * 10 ** -15, " 1.414f"),
+            (f * 10 ** -14, " 14.142f"), (f * 10 ** -13, " 141.421f"),
+            (f * 10 ** -12, " 1.414p"), (f * 10 ** -11, " 14.142p"),
+            (f * 10 ** -10, " 141.421p"), (f * 10 ** -9, " 1.414n"),
+            (f * 10 ** -8, " 14.142n"), (f * 10 ** -7, " 141.421n"),
+            (f * 10 ** -6, " 1.414u"), (f * 10 ** -5, " 14.142u"),
+            (f * 10 ** -4, " 141.421u"), (f * 10 ** -3, " 1.414m"),
+            (f * 10 ** -2, " 14.142m"), (f * 10 ** -1, " 141.421m"),
+            (f * 10 ** 0, " 1.414"), (f * 10 ** 1, " 14.142"),
+            (f * 10 ** 2, " 141.421"), (f * 10 ** 3, " 1.414k"),
+            (f * 10 ** 4, " 14.142k"), (f * 10 ** 5, " 141.421k"),
+            (f * 10 ** 6, " 1.414M"), (f * 10 ** 7, " 14.142M"),
+            (f * 10 ** 8, " 141.421M"), (f * 10 ** 9, " 1.414G"),
+            (f * 10 ** 10, " 14.142G"), (f * 10 ** 11, " 141.421G"),
+            (f * 10 ** 12, " 1.414T"), (f * 10 ** 13, " 14.142T"),
+            (f * 10 ** 14, " 141.421T"), (f * 10 ** 15, " 1.414P"),
+            (f * 10 ** 16, " 14.142P"), (f * 10 ** 17, " 141.421P"),
+            (f * 10 ** 18, " 1.414E"), (f * 10 ** 19, " 14.142E"),
+            (f * 10 ** 20, " 141.421E"), (f * 10 ** 21, " 1.414Z"),
+            (f * 10 ** 22, " 14.142Z"), (f * 10 ** 23, " 141.421Z"),
+            (f * 10 ** 24, " 1.414Y"), (f * 10 ** 25, " 14.142Y"),
+            (f * 10 ** 26, " 141.421Y")]
+        self.compare_all(formatter, in_out)
+
+    def test_exponents_without_eng_prefix(self):
+        """Format pi scaled by 10**-24 .. 10**26 with E-notation exponents."""
+        formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False)
+        f = np.pi
+        in_out = [
+            (f * 10 ** -24, " 3.1416E-24"),
+            (f * 10 ** -23, " 31.4159E-24"),
+            (f * 10 ** -22, " 314.1593E-24"),
+            (f * 10 ** -21, " 3.1416E-21"),
+            (f * 10 ** -20, " 31.4159E-21"),
+            (f * 10 ** -19, " 314.1593E-21"),
+            (f * 10 ** -18, " 3.1416E-18"),
+            (f * 10 ** -17, " 31.4159E-18"),
+            (f * 10 ** -16, " 314.1593E-18"),
+            (f * 10 ** -15, " 3.1416E-15"),
+            (f * 10 ** -14, " 31.4159E-15"),
+            (f * 10 ** -13, " 314.1593E-15"),
+            (f * 10 ** -12, " 3.1416E-12"),
+            (f * 10 ** -11, " 31.4159E-12"),
+            (f * 10 ** -10, " 314.1593E-12"),
+            (f * 10 ** -9, " 3.1416E-09"),
+            (f * 10 ** -8, " 31.4159E-09"),
+            (f * 10 ** -7, " 314.1593E-09"),
+            (f * 10 ** -6, " 3.1416E-06"),
+            (f * 10 ** -5, " 31.4159E-06"),
+            (f * 10 ** -4, " 314.1593E-06"),
+            (f * 10 ** -3, " 3.1416E-03"),
+            (f * 10 ** -2, " 31.4159E-03"),
+            (f * 10 ** -1, " 314.1593E-03"),
+            (f * 10 ** 0, " 3.1416E+00"),
+            (f * 10 ** 1, " 31.4159E+00"),
+            (f * 10 ** 2, " 314.1593E+00"),
+            (f * 10 ** 3, " 3.1416E+03"),
+            (f * 10 ** 4, " 31.4159E+03"),
+            (f * 10 ** 5, " 314.1593E+03"),
+            (f * 10 ** 6, " 3.1416E+06"),
+            (f * 10 ** 7, " 31.4159E+06"),
+            (f * 10 ** 8, " 314.1593E+06"),
+            (f * 10 ** 9, " 3.1416E+09"),
+            (f * 10 ** 10, " 31.4159E+09"),
+            (f * 10 ** 11, " 314.1593E+09"),
+            (f * 10 ** 12, " 3.1416E+12"),
+            (f * 10 ** 13, " 31.4159E+12"),
+            (f * 10 ** 14, " 314.1593E+12"),
+            (f * 10 ** 15, " 3.1416E+15"),
+            (f * 10 ** 16, " 31.4159E+15"),
+            (f * 10 ** 17, " 314.1593E+15"),
+            (f * 10 ** 18, " 3.1416E+18"),
+            (f * 10 ** 19, " 31.4159E+18"),
+            (f * 10 ** 20, " 314.1593E+18"),
+            (f * 10 ** 21, " 3.1416E+21"),
+            (f * 10 ** 22, " 31.4159E+21"),
+            (f * 10 ** 23, " 314.1593E+21"),
+            (f * 10 ** 24, " 3.1416E+24"),
+            (f * 10 ** 25, " 31.4159E+24"),
+            (f * 10 ** 26, " 314.1593E+24")]
+        self.compare_all(formatter, in_out)
+
+    def test_rounding(self):
+        """Check rounding at accuracy 3, 1 and 0, plus formatting of 0."""
+        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
+        in_out = [(5.55555, ' 5.556'), (55.5555, ' 55.556'),
+                  (555.555, ' 555.555'), (5555.55, ' 5.556k'),
+                  (55555.5, ' 55.556k'), (555555, ' 555.555k')]
+        self.compare_all(formatter, in_out)
+
+        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
+        in_out = [(5.55555, ' 5.6'), (55.5555, ' 55.6'), (555.555, ' 555.6'),
+                  (5555.55, ' 5.6k'), (55555.5, ' 55.6k'), (555555, ' 556k'[:0]
+                   + ' 555.6k')]
+        self.compare_all(formatter, in_out)
+
+        formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True)
+        in_out = [(5.55555, ' 6'), (55.5555, ' 56'), (555.555, ' 556'),
+                  (5555.55, ' 6k'), (55555.5, ' 56k'), (555555, ' 556k')]
+        self.compare_all(formatter, in_out)
+
+        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
+        result = formatter(0)
+        assert result == u(' 0.000')
+
+    def test_nan(self):
+        """Check that NaN is rendered as 'NaN' by the formatter and in a
+        pivot table printed with the engineering float format."""
+        # Issue #11981
+
+        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
+        result = formatter(np.nan)
+        assert result == u('NaN')
+
+        df = pd.DataFrame({'a': [1.5, 10.3, 20.5],
+                           'b': [50.3, 60.67, 70.12],
+                           'c': [100.2, 101.33, 120.33]})
+        pt = df.pivot_table(values='a', index='b', columns='c')
+        # Reset in ``finally`` so a failing assertion does not leave the
+        # engineering format active for later tests.
+        try:
+            fmt.set_eng_float_format(accuracy=1)
+            result = pt.to_string()
+            assert 'NaN' in result
+        finally:
+            tm.reset_display_options()
+
+    def test_inf(self):
+        """Check that positive infinity is rendered as 'inf'."""
+        # Issue #11981
+
+        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
+        result = formatter(np.inf)
+        assert result == u('inf')
@@ -0,0 +1,204 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+import numpy as np
+import pandas as pd
+
+from pandas import compat
+import pandas.io.formats.printing as printing
+import pandas.io.formats.format as fmt
+import pandas.core.config as cf
+
+
+def test_adjoin():
+    """Check ``printing.adjoin`` on three ASCII lists with a spacing of 2."""
+    columns = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']]
+    result = printing.adjoin(2, *columns)
+    assert result == 'a  dd  ggg\nb  ee  hhh\nc  ff  iii'
+
+
+def test_repr_binary_type():
+    """Check ``pprint_thing`` on text that went through the binary type.
+
+    With ``quote_strings=True`` the result must equal ``repr`` of the text;
+    with ``quote_strings=False`` it must equal the text itself.
+    """
+    from string import ascii_letters
+
+    to_binary = compat.binary_type
+    try:
+        encoding = cf.get_option('display.encoding')
+        raw = to_binary(ascii_letters, encoding=encoding)
+    except TypeError:
+        # fall back to the single-argument form
+        raw = to_binary(ascii_letters)
+    text = compat.text_type(compat.bytes_to_str(raw))
+
+    quoted = printing.pprint_thing(text, quote_strings=True)
+    assert quoted == repr(text)
+    unquoted = printing.pprint_thing(text, quote_strings=False)
+    assert unquoted == text
+
+
+class TestFormattBase(object):
+    """Tests for ``printing.adjoin`` and ``fmt.EastAsianTextAdjustment``."""
+
+    def test_adjoin(self):
+        """Check ``printing.adjoin`` on three ASCII lists, spacing 2."""
+        columns = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']]
+        result = printing.adjoin(2, *columns)
+        assert result == 'a  dd  ggg\nb  ee  hhh\nc  ff  iii'
+
+    def test_adjoin_unicode(self):
+        """Compare ``printing.adjoin`` with the East Asian aware adjoin,
+        including the measured width of every produced line."""
+        columns = [[u'あ', 'b', 'c'], ['dd', u'ええ', 'ff'], ['ggg', 'hhh', u'いいい']]
+        plain = printing.adjoin(2, *columns)
+        assert plain == u'あ  dd  ggg\nb  ええ  hhh\nc  ff  いいい'
+
+        east_asian = fmt.EastAsianTextAdjustment()
+
+        spaced_by_2 = u"""あ  dd    ggg
+b   ええ  hhh
+c   ff    いいい"""
+
+        joined = east_asian.adjoin(2, *columns)
+        assert joined == spaced_by_2
+        for line, width in zip(joined.split('\n'), (13, 13, 16)):
+            assert east_asian.len(line) == width
+
+        spaced_by_7 = u"""あ       dd         ggg
+b        ええ       hhh
+c        ff         いいい"""
+
+        joined = east_asian.adjoin(7, *columns)
+        assert joined == spaced_by_7
+        for line, width in zip(joined.split('\n'), (23, 23, 26)):
+            assert east_asian.len(line) == width
+
+    def test_justify(self):
+        """Check left/center/right justification of ASCII and katakana
+        text at widths 5 and 10."""
+        east_asian = fmt.EastAsianTextAdjustment()
+
+        def justify_one(text, *args, **kwargs):
+            # ``justify`` works on a list; wrap and unwrap a single string
+            justified = east_asian.justify([text], *args, **kwargs)
+            return justified[0]
+
+        for text in ('abc', u'abc'):
+            assert justify_one(text, 5, mode='left') == 'abc  '
+            assert justify_one(text, 5, mode='center') == ' abc '
+            assert justify_one(text, 5, mode='right') == '  abc'
+
+        # at width 5 the katakana text comes back unchanged in every mode
+        for mode in ('left', 'center', 'right'):
+            assert justify_one(u'パンダ', 5, mode=mode) == u'パンダ'
+
+        padded_to_10 = [('left', u'パンダ    '),
+                        ('center', u'  パンダ  '),
+                        ('right', u'    パンダ')]
+        for mode, expected in padded_to_10:
+            assert justify_one(u'パンダ', 10, mode=mode) == expected
+
+    def test_east_asian_len(self):
+        """Check ``EastAsianTextAdjustment.len`` on ASCII, full-width and
+        half-width katakana, and mixed strings."""
+        east_asian = fmt.EastAsianTextAdjustment()
+
+        expected_lengths = [
+            ('abc', 3),
+            (u'abc', 3),
+            (u'パンダ', 6),
+            (u'ﾊﾟﾝﾀﾞ', 5),
+            (u'パンダpanda', 11),
+            (u'ﾊﾟﾝﾀﾞpanda', 10),
+        ]
+        for text, length in expected_lengths:
+            assert east_asian.len(text) == length
+
+    def test_ambiguous_width(self):
+        """Check the effect of ``display.unicode.ambiguous_as_wide`` on the
+        measured length of text containing the character ¡."""
+        default_adj = fmt.EastAsianTextAdjustment()
+        assert default_adj.len(u'¡¡ab') == 4
+
+        with cf.option_context('display.unicode.ambiguous_as_wide', True):
+            wide_adj = fmt.EastAsianTextAdjustment()
+            assert wide_adj.len(u'¡¡ab') == 6
+
+        # the instance created inside the option context is used here,
+        # after the context has exited
+        columns = [[u'あ', 'b', 'c'], ['dd', u'ええ', 'ff'],
+                   ['ggg', u'¡¡ab', u'いいい']]
+        joined = wide_adj.adjoin(2, *columns)
+        assert joined == u'あ  dd    ggg \nb   ええ  ¡¡ab\nc   ff    いいい'
+
+
+class TestTableSchemaRepr(object):
+    """Tests for the ``application/vnd.dataresource+json`` representation
+    as seen through IPython's display formatter."""
+
+    @classmethod
+    def setup_class(cls):
+        """Skip the class when IPython is missing; otherwise store the
+        shell's display formatter on the class."""
+        pytest.importorskip('IPython')
+
+        from IPython.core.interactiveshell import InteractiveShell
+        cls.display_formatter = InteractiveShell.instance().display_formatter
+
+    def test_publishes(self):
+        """Check which mimetypes are published for a Series and a
+        DataFrame, and for a DataFrame with the latex repr enabled."""
+        df = pd.DataFrame({"A": [1, 2]})
+        # (object, expected mimetypes): the Series first, then the DataFrame.
+        # The list of objects used to carry a third entry that ``zip``
+        # silently dropped because only two expectations exist.
+        cases = [
+            (df['A'],
+             {'text/plain', 'application/vnd.dataresource+json'}),
+            (df,
+             {'text/plain', 'text/html', 'application/vnd.dataresource+json'}),
+        ]
+
+        opt = pd.option_context('display.html.table_schema', True)
+        for obj, expected in cases:
+            with opt:
+                formatted = self.display_formatter.format(obj)
+            assert set(formatted[0].keys()) == expected
+
+        with_latex = pd.option_context('display.latex.repr', True)
+
+        # Format the DataFrame explicitly instead of relying on the loop
+        # variable still being bound to it after the loop.
+        with opt, with_latex:
+            formatted = self.display_formatter.format(df)
+
+        expected = {'text/plain', 'text/html', 'text/latex',
+                    'application/vnd.dataresource+json'}
+        assert set(formatted[0].keys()) == expected
+
+    def test_publishes_not_implemented(self):
+        """Check that a frame with MultiIndex columns publishes only
+        text/plain and text/html."""
+        # column MultiIndex
+        # GH 15996
+        midx = pd.MultiIndex.from_product([['A', 'B'], ['a', 'b', 'c']])
+        df = pd.DataFrame(np.random.randn(5, len(midx)), columns=midx)
+
+        opt = pd.option_context('display.html.table_schema', True)
+
+        with opt:
+            formatted = self.display_formatter.format(df)
+
+        expected = {'text/plain', 'text/html'}
+        assert set(formatted[0].keys()) == expected
+
+    def test_config_on(self):
+        """With the option on, ``_repr_data_resource_`` returns a value."""
+        df = pd.DataFrame({"A": [1, 2]})
+        with pd.option_context("display.html.table_schema", True):
+            result = df._repr_data_resource_()
+
+        assert result is not None
+
+    def test_config_default_off(self):
+        """With the option off, ``_repr_data_resource_`` returns None."""
+        df = pd.DataFrame({"A": [1, 2]})
+        with pd.option_context("display.html.table_schema", False):
+            result = df._repr_data_resource_()
+
+        assert result is None
+
+    def test_enable_data_resource_formatter(self):
+        """Check that the formatter for the data-resource mimetype is
+        enabled inside the option context and disabled outside it."""
+        # GH 10491
+        formatters = self.display_formatter.formatters
+        mimetype = 'application/vnd.dataresource+json'
+
+        with pd.option_context('display.html.table_schema', True):
+            assert mimetype in formatters
+            assert formatters[mimetype].enabled
+
+        # still there, just disabled
+        assert mimetype in formatters
+        assert not formatters[mimetype].enabled
+
+        # able to re-set
+        with pd.option_context('display.html.table_schema', True):
+            assert mimetype in formatters
+            assert formatters[mimetype].enabled
+            # smoke test that it works
+            self.display_formatter.format(cf)
@@ -0,0 +1,302 @@
+# -*- coding: utf-8 -*-
+
+import sys
+import numpy as np
+import pandas as pd
+import pytest
+from pandas import DataFrame
+from pandas.util import testing as tm
+
+
+class TestToCSV(object):
+    """Tests for ``DataFrame.to_csv`` output formatting."""
+
+    @pytest.mark.xfail((3, 6, 5) > sys.version_info >= (3, 5),
+                       reason=("Python csv library bug "
+                               "(see https://bugs.python.org/issue32255)"))
+    def test_to_csv_with_single_column(self):
+        """Check single-column output when a missing value is in the first
+        or in the last row."""
+        # see gh-18676, https://bugs.python.org/issue32255
+        #
+        # Python's CSV library adds an extraneous '""'
+        # before the newline when the NaN-value is in
+        # the first row. Otherwise, only the newline
+        # character is added. This behavior is inconsistent
+        # and was patched in https://bugs.python.org/pull_request4672.
+        df1 = DataFrame([None, 1])
+        expected1 = """\
+""
+1.0
+"""
+        with tm.ensure_clean('test.csv') as path:
+            df1.to_csv(path, header=None, index=None)
+            with open(path, 'r') as f:
+                assert f.read() == expected1
+
+        df2 = DataFrame([1, None])
+        expected2 = """\
+1.0
+""
+"""
+        with tm.ensure_clean('test.csv') as path:
+            df2.to_csv(path, header=None, index=None)
+            with open(path, 'r') as f:
+                assert f.read() == expected2
+
+    def test_to_csv_defualt_encoding(self):
+        """Check writing non-ASCII text without an explicit encoding."""
+        # GH17097
+        df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]})
+
+        with tm.ensure_clean('test.csv') as path:
+            # the default to_csv encoding in Python 2 is ascii, and that in
+            # Python 3 is utf-8.
+            if pd.compat.PY2:
+                # the encoding argument parameter should be utf-8
+                with tm.assert_raises_regex(UnicodeEncodeError, 'ascii'):
+                    df.to_csv(path)
+            else:
+                df.to_csv(path)
+                tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)
+
+    def test_to_csv_quotechar(self):
+        """Check QUOTE_ALL output with the default and a custom quotechar,
+        and that ``quotechar=None`` raises TypeError."""
+        df = DataFrame({'col': [1, 2]})
+        expected = """\
+"","col"
+"0","1"
+"1","2"
+"""
+
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=1)  # 1=QUOTE_ALL
+            with open(path, 'r') as f:
+                assert f.read() == expected
+
+        expected = """\
+$$,$col$
+$0$,$1$
+$1$,$2$
+"""
+
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=1, quotechar="$")
+            with open(path, 'r') as f:
+                assert f.read() == expected
+
+        with tm.ensure_clean('test.csv') as path:
+            with tm.assert_raises_regex(TypeError, 'quotechar'):
+                df.to_csv(path, quoting=1, quotechar=None)
+
+    def test_to_csv_doublequote(self):
+        """Check quote doubling, and the error raised when doubling is
+        disabled without an escapechar."""
+        df = DataFrame({'col': ['a"a', '"bb"']})
+        expected = '''\
+"","col"
+"0","a""a"
+"1","""bb"""
+'''
+
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=1, doublequote=True)  # QUOTE_ALL
+            with open(path, 'r') as f:
+                assert f.read() == expected
+
+        # ``csv.Error`` is the public name of the exception that used to be
+        # imported from the private ``_csv`` module here.
+        from csv import Error
+        with tm.ensure_clean('test.csv') as path:
+            with tm.assert_raises_regex(Error, 'escapechar'):
+                df.to_csv(path, doublequote=False)  # no escapechar set
+
+    def test_to_csv_escapechar(self):
+        """Check escaping of quotes (QUOTE_ALL) and of separators
+        (QUOTE_NONE) with a backslash escapechar."""
+        df = DataFrame({'col': ['a"a', '"bb"']})
+        expected = '''\
+"","col"
+"0","a\\"a"
+"1","\\"bb\\""
+'''
+
+        with tm.ensure_clean('test.csv') as path:  # QUOTE_ALL
+            df.to_csv(path, quoting=1, doublequote=False, escapechar='\\')
+            with open(path, 'r') as f:
+                assert f.read() == expected
+
+        df = DataFrame({'col': ['a,a', ',bb,']})
+        expected = """\
+,col
+0,a\\,a
+1,\\,bb\\,
+"""
+
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=3, escapechar='\\')  # QUOTE_NONE
+            with open(path, 'r') as f:
+                assert f.read() == expected
+
+    def test_csv_to_string(self):
+        """Check that ``to_csv()`` without a path returns the CSV text."""
+        df = DataFrame({'col': [1, 2]})
+        expected = ',col\n0,1\n1,2\n'
+        assert df.to_csv() == expected
+
+    def test_to_csv_decimal(self):
+        """Check the ``decimal`` argument for columns, an index and a
+        multi-index, alone and combined with ``float_format``."""
+        # GH 781
+        df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})
+
+        expected_default = ',col1,col2,col3\n0,1,a,10.1\n'
+        assert df.to_csv() == expected_default
+
+        expected_european_excel = ';col1;col2;col3\n0;1;a;10,1\n'
+        assert df.to_csv(decimal=',', sep=';') == expected_european_excel
+
+        expected_float_format_default = ',col1,col2,col3\n0,1,a,10.10\n'
+        assert df.to_csv(float_format='%.2f') == expected_float_format_default
+
+        expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
+        assert df.to_csv(decimal=',', sep=';',
+                         float_format='%.2f') == expected_float_format
+
+        # GH 11553: testing if decimal is taken into account for '0.0'
+        df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
+        expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
+        assert df.to_csv(index=False, decimal='^') == expected
+
+        # same but for an index
+        assert df.set_index('a').to_csv(decimal='^') == expected
+
+        # same for a multi-index
+        assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected
+
+    def test_to_csv_float_format(self):
+        """Check ``float_format`` with an index and with a multi-index."""
+        # testing if float_format is taken into account for the index
+        # GH 11553
+        df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1})
+        expected = 'a,b,c\n0,2.20,1\n1,3.30,1\n'
+        assert df.set_index('a').to_csv(float_format='%.2f') == expected
+
+        # same for a multi-index
+        assert df.set_index(['a', 'b']).to_csv(
+            float_format='%.2f') == expected
+
+    def test_to_csv_na_rep(self):
+        """Check ``na_rep`` for NaN values located in the index."""
+        # testing if NaN values are correctly represented in the index
+        # GH 11553
+        df = DataFrame({'a': [0, np.nan], 'b': [0, 1], 'c': [2, 3]})
+        expected = "a,b,c\n0.0,0,2\n_,1,3\n"
+        assert df.set_index('a').to_csv(na_rep='_') == expected
+        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected
+
+        # now with an index containing only NaNs
+        df = DataFrame({'a': np.nan, 'b': [0, 1], 'c': [2, 3]})
+        expected = "a,b,c\n_,0,2\n_,1,3\n"
+        assert df.set_index('a').to_csv(na_rep='_') == expected
+        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected
+
+        # check if na_rep parameter does not break anything when no NaN
+        df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]})
+        expected = "a,b,c\n0,0,2\n0,1,3\n"
+        assert df.set_index('a').to_csv(na_rep='_') == expected
+        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected
+
+    def test_to_csv_date_format(self):
+        """Check default and explicit ``date_format`` output for second-
+        and day-frequency datetime columns and a grouped multi-index."""
+        # GH 10209
+        df_sec = DataFrame({'A': pd.date_range('20130101', periods=5, freq='s')
+                            })
+        df_day = DataFrame({'A': pd.date_range('20130101', periods=5, freq='d')
+                            })
+
+        expected_default_sec = (',A\n0,2013-01-01 00:00:00\n1,'
+                                '2013-01-01 00:00:01\n2,2013-01-01 00:00:02'
+                                '\n3,2013-01-01 00:00:03\n4,'
+                                '2013-01-01 00:00:04\n')
+        assert df_sec.to_csv() == expected_default_sec
+
+        expected_ymdhms_day = (',A\n0,2013-01-01 00:00:00\n1,'
+                               '2013-01-02 00:00:00\n2,2013-01-03 00:00:00'
+                               '\n3,2013-01-04 00:00:00\n4,'
+                               '2013-01-05 00:00:00\n')
+        assert (df_day.to_csv(date_format='%Y-%m-%d %H:%M:%S') ==
+                expected_ymdhms_day)
+
+        expected_ymd_sec = (',A\n0,2013-01-01\n1,2013-01-01\n2,'
+                            '2013-01-01\n3,2013-01-01\n4,2013-01-01\n')
+        assert df_sec.to_csv(date_format='%Y-%m-%d') == expected_ymd_sec
+
+        expected_default_day = (',A\n0,2013-01-01\n1,2013-01-02\n2,'
+                                '2013-01-03\n3,2013-01-04\n4,2013-01-05\n')
+        assert df_day.to_csv() == expected_default_day
+        assert df_day.to_csv(date_format='%Y-%m-%d') == expected_default_day
+
+        # testing if date_format parameter is taken into account for
+        # multi-indexed dataframes (GH 7791)
+        df_sec['B'] = 0
+        df_sec['C'] = 1
+        expected_ymd_sec = 'A,B,C\n2013-01-01,0,1\n'
+        df_sec_grouped = df_sec.groupby([pd.Grouper(key='A', freq='1h'), 'B'])
+        assert (df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d') ==
+                expected_ymd_sec)
+
+    def test_to_csv_multi_index(self):
+        """Check output for MultiIndex columns, with and without a
+        MultiIndex index and with ``index=False``."""
+        # GH 6618
+        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))
+
+        exp = ",1\n,2\n0,1\n"
+        assert df.to_csv() == exp
+
+        exp = "1\n2\n1\n"
+        assert df.to_csv(index=False) == exp
+
+        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]),
+                       index=pd.MultiIndex.from_arrays([[1], [2]]))
+
+        exp = ",,1\n,,2\n1,2,1\n"
+        assert df.to_csv() == exp
+
+        exp = "1\n2\n1\n"
+        assert df.to_csv(index=False) == exp
+
+        df = DataFrame(
+            [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']]))
+
+        exp = ",foo\n,bar\n0,1\n"
+        assert df.to_csv() == exp
+
+        exp = "foo\nbar\n1\n"
+        assert df.to_csv(index=False) == exp
+
+    def test_to_csv_string_array_ascii(self):
+        """Check a column of string lists written with ascii encoding."""
+        # GH 10813
+        str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
+        df = pd.DataFrame(str_array)
+        expected_ascii = '''\
+,names
+0,"['foo', 'bar']"
+1,"['baz', 'qux']"
+'''
+        with tm.ensure_clean('str_test.csv') as path:
+            df.to_csv(path, encoding='ascii')
+            with open(path, 'r') as f:
+                assert f.read() == expected_ascii
+
+    @pytest.mark.xfail
+    def test_to_csv_string_array_utf8(self):
+        """Check a column of string lists written with utf-8 encoding
+        (marked as an expected failure)."""
+        # GH 10813
+        str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
+        df = pd.DataFrame(str_array)
+        expected_utf8 = '''\
+,names
+0,"[u'foo', u'bar']"
+1,"[u'baz', u'qux']"
+'''
+        with tm.ensure_clean('unicode_test.csv') as path:
+            df.to_csv(path, encoding='utf-8')
+            with open(path, 'r') as f:
+                assert f.read() == expected_utf8
+
+    @tm.capture_stdout
+    def test_to_csv_stdout_file(self):
+        """Check writing to ``sys.stdout`` and that the stream stays open."""
+        # GH 21561
+        df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']],
+                          columns=['name_1', 'name_2'])
+        expected_ascii = '''\
+,name_1,name_2
+0,foo,bar
+1,baz,qux
+'''
+        df.to_csv(sys.stdout, encoding='ascii')
+        output = sys.stdout.getvalue()
+        assert output == expected_ascii
+        assert not sys.stdout.closed
@@ -0,0 +1,274 @@
+"""Tests formatting as writer-agnostic ExcelCells
+
+ExcelFormatter is tested implicitly in pandas/tests/io/test_excel.py
+"""
+
+import pytest
+import pandas.util.testing as tm
+
+from warnings import catch_warnings
+from pandas.io.formats.excel import CSSToExcelConverter
+
+
+@pytest.mark.parametrize('css,expected', [
+    # FONT
+    # - name
+    ('font-family: foo,bar', {'font': {'name': 'foo'}}),
+    ('font-family: "foo bar",baz', {'font': {'name': 'foo bar'}}),
+    ('font-family: foo,\nbar', {'font': {'name': 'foo'}}),
+    ('font-family: foo, bar,    baz', {'font': {'name': 'foo'}}),
+    ('font-family: bar, foo', {'font': {'name': 'bar'}}),
+    ('font-family: \'foo bar\', baz', {'font': {'name': 'foo bar'}}),
+    ('font-family: \'foo \\\'bar\', baz', {'font': {'name': 'foo \'bar'}}),
+    ('font-family: "foo \\"bar", baz', {'font': {'name': 'foo "bar'}}),
+    ('font-family: "foo ,bar", baz', {'font': {'name': 'foo ,bar'}}),
+    # - family
+    ('font-family: serif', {'font': {'name': 'serif', 'family': 1}}),
+    ('font-family: Serif', {'font': {'name': 'serif', 'family': 1}}),
+    ('font-family: roman, serif', {'font': {'name': 'roman', 'family': 1}}),
+    ('font-family: roman, sans-serif', {'font': {'name': 'roman',
+                                                 'family': 2}}),
+    ('font-family: roman, sans serif', {'font': {'name': 'roman'}}),
+    ('font-family: roman, sansserif', {'font': {'name': 'roman'}}),
+    ('font-family: roman, cursive', {'font': {'name': 'roman', 'family': 4}}),
+    ('font-family: roman, fantasy', {'font': {'name': 'roman', 'family': 5}}),
+    # - size
+    ('font-size: 1em', {'font': {'size': 12}}),
+    ('font-size: xx-small', {'font': {'size': 6}}),
+    ('font-size: x-small', {'font': {'size': 7.5}}),
+    ('font-size: small', {'font': {'size': 9.6}}),
+    ('font-size: medium', {'font': {'size': 12}}),
+    ('font-size: large', {'font': {'size': 13.5}}),
+    ('font-size: x-large', {'font': {'size': 18}}),
+    ('font-size: xx-large', {'font': {'size': 24}}),
+    ('font-size: 50%', {'font': {'size': 6}}),
+    # - bold
+    ('font-weight: 100', {'font': {'bold': False}}),
+    ('font-weight: 200', {'font': {'bold': False}}),
+    ('font-weight: 300', {'font': {'bold': False}}),
+    ('font-weight: 400', {'font': {'bold': False}}),
+    ('font-weight: normal', {'font': {'bold': False}}),
+    ('font-weight: lighter', {'font': {'bold': False}}),
+    ('font-weight: bold', {'font': {'bold': True}}),
+    ('font-weight: bolder', {'font': {'bold': True}}),
+    ('font-weight: 700', {'font': {'bold': True}}),
+    ('font-weight: 800', {'font': {'bold': True}}),
+    ('font-weight: 900', {'font': {'bold': True}}),
+    # - italic
+    ('font-style: italic', {'font': {'italic': True}}),
+    ('font-style: oblique', {'font': {'italic': True}}),
+    # - underline
+    ('text-decoration: underline',
+     {'font': {'underline': 'single'}}),
+    ('text-decoration: overline',
+     {}),
+    ('text-decoration: none',
+     {}),
+    # - strike
+    ('text-decoration: line-through',
+     {'font': {'strike': True}}),
+    ('text-decoration: underline line-through',
+     {'font': {'strike': True, 'underline': 'single'}}),
+    ('text-decoration: underline; text-decoration: line-through',
+     {'font': {'strike': True}}),
+    # - color
+    ('color: red', {'font': {'color': 'FF0000'}}),
+    ('color: #ff0000', {'font': {'color': 'FF0000'}}),
+    ('color: #f0a', {'font': {'color': 'FF00AA'}}),
+    # - shadow
+    ('text-shadow: none', {'font': {'shadow': False}}),
+    ('text-shadow: 0px -0em 0px #CCC', {'font': {'shadow': False}}),
+    ('text-shadow: 0px -0em 0px #999', {'font': {'shadow': False}}),
+    ('text-shadow: 0px -0em 0px', {'font': {'shadow': False}}),
+    ('text-shadow: 2px -0em 0px #CCC', {'font': {'shadow': True}}),
+    ('text-shadow: 0px -2em 0px #CCC', {'font': {'shadow': True}}),
+    ('text-shadow: 0px -0em 2px #CCC', {'font': {'shadow': True}}),
+    ('text-shadow: 0px -0em 2px', {'font': {'shadow': True}}),
+    ('text-shadow: 0px -2em', {'font': {'shadow': True}}),
+
+    # FILL
+    # - color, fillType
+    ('background-color: red', {'fill': {'fgColor': 'FF0000',
+                                        'patternType': 'solid'}}),
+    ('background-color: #ff0000', {'fill': {'fgColor': 'FF0000',
+                                            'patternType': 'solid'}}),
+    ('background-color: #f0a', {'fill': {'fgColor': 'FF00AA',
+                                         'patternType': 'solid'}}),
+    # BORDER
+    # - style
+    ('border-style: solid',
+     {'border': {'top': {'style': 'medium'},
+                 'bottom': {'style': 'medium'},
+                 'left': {'style': 'medium'},
+                 'right': {'style': 'medium'}}}),
+    ('border-style: solid; border-width: thin',
+     {'border': {'top': {'style': 'thin'},
+                 'bottom': {'style': 'thin'},
+                 'left': {'style': 'thin'},
+                 'right': {'style': 'thin'}}}),
+
+    ('border-top-style: solid; border-top-width: thin',
+     {'border': {'top': {'style': 'thin'}}}),
+    ('border-top-style: solid; border-top-width: 1pt',
+     {'border': {'top': {'style': 'thin'}}}),
+    ('border-top-style: solid',
+     {'border': {'top': {'style': 'medium'}}}),
+    ('border-top-style: solid; border-top-width: medium',
+     {'border': {'top': {'style': 'medium'}}}),
+    ('border-top-style: solid; border-top-width: 2pt',
+     {'border': {'top': {'style': 'medium'}}}),
+    ('border-top-style: solid; border-top-width: thick',
+     {'border': {'top': {'style': 'thick'}}}),
+    ('border-top-style: solid; border-top-width: 4pt',
+     {'border': {'top': {'style': 'thick'}}}),
+
+    ('border-top-style: dotted',
+     {'border': {'top': {'style': 'mediumDashDotDot'}}}),
+    ('border-top-style: dotted; border-top-width: thin',
+     {'border': {'top': {'style': 'dotted'}}}),
+    ('border-top-style: dashed',
+     {'border': {'top': {'style': 'mediumDashed'}}}),
+    ('border-top-style: dashed; border-top-width: thin',
+     {'border': {'top': {'style': 'dashed'}}}),
+    ('border-top-style: double',
+     {'border': {'top': {'style': 'double'}}}),
+    # - color
+    ('border-style: solid; border-color: #0000ff',
+     {'border': {'top': {'style': 'medium', 'color': '0000FF'},
+                 'right': {'style': 'medium', 'color': '0000FF'},
+                 'bottom': {'style': 'medium', 'color': '0000FF'},
+                 'left': {'style': 'medium', 'color': '0000FF'}}}),
+    ('border-top-style: double; border-top-color: blue',
+     {'border': {'top': {'style': 'double', 'color': '0000FF'}}}),
+    ('border-top-style: solid; border-top-color: #06c',
+     {'border': {'top': {'style': 'medium', 'color': '0066CC'}}}),
+    # ALIGNMENT
+    # - horizontal
+    ('text-align: center',
+     {'alignment': {'horizontal': 'center'}}),
+    ('text-align: left',
+     {'alignment': {'horizontal': 'left'}}),
+    ('text-align: right',
+     {'alignment': {'horizontal': 'right'}}),
+    ('text-align: justify',
+     {'alignment': {'horizontal': 'justify'}}),
+    # - vertical
+    ('vertical-align: top',
+     {'alignment': {'vertical': 'top'}}),
+    ('vertical-align: text-top',
+     {'alignment': {'vertical': 'top'}}),
+    ('vertical-align: middle',
+     {'alignment': {'vertical': 'center'}}),
+    ('vertical-align: bottom',
+     {'alignment': {'vertical': 'bottom'}}),
+    ('vertical-align: text-bottom',
+     {'alignment': {'vertical': 'bottom'}}),
+    # - wrap_text
+    ('white-space: nowrap',
+     {'alignment': {'wrap_text': False}}),
+    ('white-space: pre',
+     {'alignment': {'wrap_text': False}}),
+    ('white-space: pre-line',
+     {'alignment': {'wrap_text': False}}),
+    ('white-space: normal',
+     {'alignment': {'wrap_text': True}}),
+])
+def test_css_to_excel(css, expected):
+    convert = CSSToExcelConverter()
+    assert expected == convert(css)
+
+
+def test_css_to_excel_multiple():
+    convert = CSSToExcelConverter()
+    actual = convert('''
+        font-weight: bold;
+        text-decoration: underline;
+        color: red;
+        border-width: thin;
+        text-align: center;
+        vertical-align: top;
+        unused: something;
+    ''')
+    assert {"font": {"bold": True, "underline": "single", "color": "FF0000"},
+            "border": {"top": {"style": "thin"},
+                       "right": {"style": "thin"},
+                       "bottom": {"style": "thin"},
+                       "left": {"style": "thin"}},
+            "alignment": {"horizontal": "center",
+                          "vertical": "top"}} == actual
+
+
+@pytest.mark.parametrize('css,inherited,expected', [
+    ('font-weight: bold', '',
+     {'font': {'bold': True}}),
+    ('', 'font-weight: bold',
+     {'font': {'bold': True}}),
+    ('font-weight: bold', 'font-style: italic',
+     {'font': {'bold': True, 'italic': True}}),
+    ('font-style: normal', 'font-style: italic',
+     {'font': {'italic': False}}),
+    ('font-style: inherit', '', {}),
+    ('font-style: normal; font-style: inherit', 'font-style: italic',
+     {'font': {'italic': True}}),
+])
+def test_css_to_excel_inherited(css, inherited, expected):
+    convert = CSSToExcelConverter(inherited)
+    assert expected == convert(css)
+
+
+@pytest.mark.parametrize("input_color,output_color", (
+    [(name, rgb) for name, rgb in CSSToExcelConverter.NAMED_COLORS.items()] +
+    [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] +
+    [("#F0F", "FF00FF"), ("#ABC", "AABBCC")])
+)
+def test_css_to_excel_good_colors(input_color, output_color):
+    # see gh-18392
+    css = ("border-top-color: {color}; "
+           "border-right-color: {color}; "
+           "border-bottom-color: {color}; "
+           "border-left-color: {color}; "
+           "background-color: {color}; "
+           "color: {color}").format(color=input_color)
+
+    expected = dict()
+
+    expected["fill"] = {
+        "patternType": "solid",
+        "fgColor": output_color
+    }
+
+    expected["font"] = {
+        "color": output_color
+    }
+
+    expected["border"] = {
+        k: {
+            "color": output_color,
+        } for k in ("top", "right", "bottom", "left")
+    }
+
+    with tm.assert_produces_warning(None):
+        convert = CSSToExcelConverter()
+        assert expected == convert(css)
+
+
+@pytest.mark.parametrize("input_color", [None, "not-a-color"])
+def test_css_to_excel_bad_colors(input_color):
+    # see gh-18392
+    css = ("border-top-color: {color}; "
+           "border-right-color: {color}; "
+           "border-bottom-color: {color}; "
+           "border-left-color: {color}; "
+           "background-color: {color}; "
+           "color: {color}").format(color=input_color)
+
+    expected = dict()
+
+    if input_color is not None:
+        expected["fill"] = {
+            "patternType": "solid"
+        }
+
+    with catch_warnings(record=True):
+        convert = CSSToExcelConverter()
+        assert expected == convert(css)
@@ -0,0 +1,710 @@
+from datetime import datetime
+
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, compat, Series
+from pandas.util import testing as tm
+from pandas.compat import u
+import codecs
+
+
+@pytest.fixture
+def frame():
+    return DataFrame(tm.getSeriesData())
+
+
+class TestToLatex(object):
+
+    def test_to_latex_filename(self, frame):
+        with tm.ensure_clean('test.tex') as path:
+            frame.to_latex(path)
+
+            with open(path, 'r') as f:
+                assert frame.to_latex() == f.read()
+
+        # test with utf-8 and encoding option (GH 7061)
+        df = DataFrame([[u'au\xdfgangen']])
+        with tm.ensure_clean('test.tex') as path:
+            df.to_latex(path, encoding='utf-8')
+            with codecs.open(path, 'r', encoding='utf-8') as f:
+                assert df.to_latex() == f.read()
+
+        # test with utf-8 without encoding option
+        if compat.PY3:  # python3: pandas default encoding is utf-8
+            with tm.ensure_clean('test.tex') as path:
+                df.to_latex(path)
+                with codecs.open(path, 'r', encoding='utf-8') as f:
+                    assert df.to_latex() == f.read()
+        else:
+            # python2 default encoding is ascii, so an error should be raised
+            with tm.ensure_clean('test.tex') as path:
+                with pytest.raises(UnicodeEncodeError):
+                    df.to_latex(path)
+
+    def test_to_latex(self, frame):
+        # it works!
+        frame.to_latex()
+
+        df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+        withindex_result = df.to_latex()
+        withindex_expected = r"""\begin{tabular}{lrl}
+\toprule
+{} &  a &   b \\
+\midrule
+0 &  1 &  b1 \\
+1 &  2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withindex_result == withindex_expected
+
+        withoutindex_result = df.to_latex(index=False)
+        withoutindex_expected = r"""\begin{tabular}{rl}
+\toprule
+ a &   b \\
+\midrule
+ 1 &  b1 \\
+ 2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withoutindex_result == withoutindex_expected
+
+    def test_to_latex_format(self, frame):
+        # GH Bug #9402
+        frame.to_latex(column_format='ccc')
+
+        df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+        withindex_result = df.to_latex(column_format='ccc')
+        withindex_expected = r"""\begin{tabular}{ccc}
+\toprule
+{} &  a &   b \\
+\midrule
+0 &  1 &  b1 \\
+1 &  2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withindex_result == withindex_expected
+
+    def test_to_latex_empty(self):
+        df = DataFrame()
+        result = df.to_latex()
+        expected = r"""\begin{tabular}{l}
+\toprule
+Empty DataFrame
+Columns: Index([], dtype='object')
+Index: Index([], dtype='object') \\
+\bottomrule
+\end{tabular}
+"""
+        assert result == expected
+
+        result = df.to_latex(longtable=True)
+        expected = r"""\begin{longtable}{l}
+\toprule
+Empty DataFrame
+Columns: Index([], dtype='object')
+Index: Index([], dtype='object') \\
+\end{longtable}
+"""
+        assert result == expected
+
+    def test_to_latex_with_formatters(self):
+        df = DataFrame({'datetime64': [datetime(2016, 1, 1),
+                                       datetime(2016, 2, 5),
+                                       datetime(2016, 3, 3)],
+                        'float': [1.0, 2.0, 3.0],
+                        'int': [1, 2, 3],
+                        'object': [(1, 2), True, False],
+                        })
+
+        formatters = {'datetime64': lambda x: x.strftime('%Y-%m'),
+                      'float': lambda x: '[{x: 4.1f}]'.format(x=x),
+                      'int': lambda x: '0x{x:x}'.format(x=x),
+                      'object': lambda x: '-{x!s}-'.format(x=x),
+                      '__index__': lambda x: 'index: {x}'.format(x=x)}
+        result = df.to_latex(formatters=dict(formatters))
+
+        expected = r"""\begin{tabular}{llrrl}
+\toprule
+{} & datetime64 &  float & int &    object \\
+\midrule
+index: 0 &    2016-01 & [ 1.0] & 0x1 &  -(1, 2)- \\
+index: 1 &    2016-02 & [ 2.0] & 0x2 &    -True- \\
+index: 2 &    2016-03 & [ 3.0] & 0x3 &   -False- \\
+\bottomrule
+\end{tabular}
+"""
+        assert result == expected
+
+    def test_to_latex_multiindex(self):
+        df = DataFrame({('x', 'y'): ['a']})
+        result = df.to_latex()
+        expected = r"""\begin{tabular}{ll}
+\toprule
+{} &  x \\
+{} &  y \\
+\midrule
+0 &  a \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert result == expected
+
+        result = df.T.to_latex()
+        expected = r"""\begin{tabular}{lll}
+\toprule
+  &   &  0 \\
+\midrule
+x & y &  a \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert result == expected
+
+        df = DataFrame.from_dict({
+            ('c1', 0): pd.Series({x: x for x in range(4)}),
+            ('c1', 1): pd.Series({x: x + 4 for x in range(4)}),
+            ('c2', 0): pd.Series({x: x for x in range(4)}),
+            ('c2', 1): pd.Series({x: x + 4 for x in range(4)}),
+            ('c3', 0): pd.Series({x: x for x in range(4)}),
+        }).T
+        result = df.to_latex()
+        expected = r"""\begin{tabular}{llrrrr}
+\toprule
+   &   &  0 &  1 &  2 &  3 \\
+\midrule
+c1 & 0 &  0 &  1 &  2 &  3 \\
+   & 1 &  4 &  5 &  6 &  7 \\
+c2 & 0 &  0 &  1 &  2 &  3 \\
+   & 1 &  4 &  5 &  6 &  7 \\
+c3 & 0 &  0 &  1 &  2 &  3 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert result == expected
+
+        # GH 14184
+        df = df.T
+        df.columns.names = ['a', 'b']
+        result = df.to_latex()
+        expected = r"""\begin{tabular}{lrrrrr}
+\toprule
+a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
+b &  0 &  1 &  0 &  1 &  0 \\
+\midrule
+0 &  0 &  4 &  0 &  4 &  0 \\
+1 &  1 &  5 &  1 &  5 &  1 \\
+2 &  2 &  6 &  2 &  6 &  2 \\
+3 &  3 &  7 &  3 &  7 &  3 \\
+\bottomrule
+\end{tabular}
+"""
+        assert result == expected
+
+        # GH 10660
+        df = pd.DataFrame({'a': [0, 0, 1, 1],
+                           'b': list('abab'),
+                           'c': [1, 2, 3, 4]})
+        result = df.set_index(['a', 'b']).to_latex()
+        expected = r"""\begin{tabular}{llr}
+\toprule
+  &   &  c \\
+a & b &    \\
+\midrule
+0 & a &  1 \\
+  & b &  2 \\
+1 & a &  3 \\
+  & b &  4 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert result == expected
+
+        result = df.groupby('a').describe().to_latex()
+        expected = r"""\begin{tabular}{lrrrrrrrr}
+\toprule
+{} & \multicolumn{8}{l}{c} \\
+{} & count & mean &       std &  min &   25\% &  50\% &   75\% &  max \\
+a &       &      &           &      &       &      &       &      \\
+\midrule
+0 &   2.0 &  1.5 &  0.707107 &  1.0 &  1.25 &  1.5 &  1.75 &  2.0 \\
+1 &   2.0 &  3.5 &  0.707107 &  3.0 &  3.25 &  3.5 &  3.75 &  4.0 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert result == expected
+
+    def test_to_latex_multiindex_dupe_level(self):
+        # see gh-14484
+        #
+        # If an index is repeated in subsequent rows, it should be
+        # replaced with a blank in the created table. This should
+        # ONLY happen if all higher order indices (to the left) are
+        # equal too. In this test, 'c' has to be printed both times
+        # because the higher order index 'A' != 'B'.
+        df = pd.DataFrame(index=pd.MultiIndex.from_tuples(
+            [('A', 'c'), ('B', 'c')]), columns=['col'])
+        result = df.to_latex()
+        expected = r"""\begin{tabular}{lll}
+\toprule
+  &   &  col \\
+\midrule
+A & c &  NaN \\
+B & c &  NaN \\
+\bottomrule
+\end{tabular}
+"""
+        assert result == expected
+
+    def test_to_latex_multicolumnrow(self):
+        df = pd.DataFrame({
+            ('c1', 0): {x: x for x in range(5)},
+            ('c1', 1): {x: x + 5 for x in range(5)},
+            ('c2', 0): {x: x for x in range(5)},
+            ('c2', 1): {x: x + 5 for x in range(5)},
+            ('c3', 0): {x: x for x in range(5)}
+        })
+        result = df.to_latex()
+        expected = r"""\begin{tabular}{lrrrrr}
+\toprule
+{} & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
+{} &  0 &  1 &  0 &  1 &  0 \\
+\midrule
+0 &  0 &  5 &  0 &  5 &  0 \\
+1 &  1 &  6 &  1 &  6 &  1 \\
+2 &  2 &  7 &  2 &  7 &  2 \\
+3 &  3 &  8 &  3 &  8 &  3 \\
+4 &  4 &  9 &  4 &  9 &  4 \\
+\bottomrule
+\end{tabular}
+"""
+        assert result == expected
+
+        result = df.to_latex(multicolumn=False)
+        expected = r"""\begin{tabular}{lrrrrr}
+\toprule
+{} & c1 &    & c2 &    & c3 \\
+{} &  0 &  1 &  0 &  1 &  0 \\
+\midrule
+0 &  0 &  5 &  0 &  5 &  0 \\
+1 &  1 &  6 &  1 &  6 &  1 \\
+2 &  2 &  7 &  2 &  7 &  2 \\
+3 &  3 &  8 &  3 &  8 &  3 \\
+4 &  4 &  9 &  4 &  9 &  4 \\
+\bottomrule
+\end{tabular}
+"""
+        assert result == expected
+
+        result = df.T.to_latex(multirow=True)
+        expected = r"""\begin{tabular}{llrrrrr}
+\toprule
+   &   &  0 &  1 &  2 &  3 &  4 \\
+\midrule
+\multirow{2}{*}{c1} & 0 &  0 &  1 &  2 &  3 &  4 \\
+   & 1 &  5 &  6 &  7 &  8 &  9 \\
+\cline{1-7}
+\multirow{2}{*}{c2} & 0 &  0 &  1 &  2 &  3 &  4 \\
+   & 1 &  5 &  6 &  7 &  8 &  9 \\
+\cline{1-7}
+c3 & 0 &  0 &  1 &  2 &  3 &  4 \\
+\bottomrule
+\end{tabular}
+"""
+        assert result == expected
+
+        df.index = df.T.index
+        result = df.T.to_latex(multirow=True, multicolumn=True,
+                               multicolumn_format='c')
+        expected = r"""\begin{tabular}{llrrrrr}
+\toprule
+   &   & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\
+   &   &  0 &  1 &  0 &  1 &  0 \\
+\midrule
+\multirow{2}{*}{c1} & 0 &  0 &  1 &  2 &  3 &  4 \\
+   & 1 &  5 &  6 &  7 &  8 &  9 \\
+\cline{1-7}
+\multirow{2}{*}{c2} & 0 &  0 &  1 &  2 &  3 &  4 \\
+   & 1 &  5 &  6 &  7 &  8 &  9 \\
+\cline{1-7}
+c3 & 0 &  0 &  1 &  2 &  3 &  4 \\
+\bottomrule
+\end{tabular}
+"""
+        assert result == expected
+
+    def test_to_latex_escape(self):
+        a = 'a'
+        b = 'b'
+
+        test_dict = {u('co$e^x$'): {a: "a",
+                                    b: "b"},
+                     u('co^l1'): {a: "a",
+                                  b: "b"}}
+
+        unescaped_result = DataFrame(test_dict).to_latex(escape=False)
+        escaped_result = DataFrame(test_dict).to_latex(
+        )  # default: escape=True
+
+        unescaped_expected = r'''\begin{tabular}{lll}
+\toprule
+{} & co$e^x$ & co^l1 \\
+\midrule
+a &       a &     a \\
+b &       b &     b \\
+\bottomrule
+\end{tabular}
+'''
+
+        escaped_expected = r'''\begin{tabular}{lll}
+\toprule
+{} & co\$e\textasciicircum x\$ & co\textasciicircum l1 \\
+\midrule
+a &       a &     a \\
+b &       b &     b \\
+\bottomrule
+\end{tabular}
+'''
+
+        assert unescaped_result == unescaped_expected
+        assert escaped_result == escaped_expected
+
+    def test_to_latex_special_escape(self):
+        df = DataFrame([r"a\b\c", r"^a^b^c", r"~a~b~c"])
+
+        escaped_result = df.to_latex()
+        escaped_expected = r"""\begin{tabular}{ll}
+\toprule
+{} &       0 \\
+\midrule
+0 &   a\textbackslash b\textbackslash c \\
+1 &  \textasciicircum a\textasciicircum b\textasciicircum c \\
+2 &  \textasciitilde a\textasciitilde b\textasciitilde c \\
+\bottomrule
+\end{tabular}
+"""
+        assert escaped_result == escaped_expected
+
+    def test_to_latex_longtable(self, frame):
+        frame.to_latex(longtable=True)
+
+        df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+        withindex_result = df.to_latex(longtable=True)
+        withindex_expected = r"""\begin{longtable}{lrl}
+\toprule
+{} &  a &   b \\
+\midrule
+\endhead
+\midrule
+\multicolumn{3}{r}{{Continued on next page}} \\
+\midrule
+\endfoot
+
+\bottomrule
+\endlastfoot
+0 &  1 &  b1 \\
+1 &  2 &  b2 \\
+\end{longtable}
+"""
+        assert withindex_result == withindex_expected
+
+        withoutindex_result = df.to_latex(index=False, longtable=True)
+        withoutindex_expected = r"""\begin{longtable}{rl}
+\toprule
+ a &   b \\
+\midrule
+\endhead
+\midrule
+\multicolumn{2}{r}{{Continued on next page}} \\
+\midrule
+\endfoot
+
+\bottomrule
+\endlastfoot
+ 1 &  b1 \\
+ 2 &  b2 \\
+\end{longtable}
+"""
+
+        assert withoutindex_result == withoutindex_expected
+
+        df = DataFrame({'a': [1, 2]})
+        with1column_result = df.to_latex(index=False, longtable=True)
+        assert r"\multicolumn{1}" in with1column_result
+
+        df = DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
+        with3columns_result = df.to_latex(index=False, longtable=True)
+        assert r"\multicolumn{3}" in with3columns_result
+
+    def test_to_latex_escape_special_chars(self):
+        special_characters = ['&', '%', '$', '#', '_', '{', '}', '~', '^',
+                              '\\']
+        df = DataFrame(data=special_characters)
+        observed = df.to_latex()
+        expected = r"""\begin{tabular}{ll}
+\toprule
+{} &  0 \\
+\midrule
+0 &  \& \\
+1 &  \% \\
+2 &  \$ \\
+3 &  \# \\
+4 &  \_ \\
+5 &  \{ \\
+6 &  \} \\
+7 &  \textasciitilde  \\
+8 &  \textasciicircum  \\
+9 &  \textbackslash  \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert observed == expected
+
+    def test_to_latex_no_header(self):
+        # GH 7124
+        df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+        withindex_result = df.to_latex(header=False)
+        withindex_expected = r"""\begin{tabular}{lrl}
+\toprule
+0 &  1 &  b1 \\
+1 &  2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withindex_result == withindex_expected
+
+        withoutindex_result = df.to_latex(index=False, header=False)
+        withoutindex_expected = r"""\begin{tabular}{rl}
+\toprule
+ 1 &  b1 \\
+ 2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withoutindex_result == withoutindex_expected
+
+    def test_to_latex_specified_header(self):
+        # GH 7124
+        df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+        withindex_result = df.to_latex(header=['AA', 'BB'])
+        withindex_expected = r"""\begin{tabular}{lrl}
+\toprule
+{} & AA &  BB \\
+\midrule
+0 &  1 &  b1 \\
+1 &  2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withindex_result == withindex_expected
+
+        withoutindex_result = df.to_latex(header=['AA', 'BB'], index=False)
+        withoutindex_expected = r"""\begin{tabular}{rl}
+\toprule
+AA &  BB \\
+\midrule
+ 1 &  b1 \\
+ 2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withoutindex_result == withoutindex_expected
+
+        withoutescape_result = df.to_latex(header=['$A$', '$B$'], escape=False)
+        withoutescape_expected = r"""\begin{tabular}{lrl}
+\toprule
+{} & $A$ & $B$ \\
+\midrule
+0 &   1 &  b1 \\
+1 &   2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withoutescape_result == withoutescape_expected
+
+        with pytest.raises(ValueError):
+            df.to_latex(header=['A'])
+
+    def test_to_latex_decimal(self, frame):
+        # GH 12031
+        frame.to_latex()
+
+        df = DataFrame({'a': [1.0, 2.1], 'b': ['b1', 'b2']})
+        withindex_result = df.to_latex(decimal=',')
+
+        withindex_expected = r"""\begin{tabular}{lrl}
+\toprule
+{} &    a &   b \\
+\midrule
+0 &  1,0 &  b1 \\
+1 &  2,1 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert withindex_result == withindex_expected
+
+    def test_to_latex_series(self):
+        s = Series(['a', 'b', 'c'])
+        withindex_result = s.to_latex()
+        withindex_expected = r"""\begin{tabular}{ll}
+\toprule
+{} &  0 \\
+\midrule
+0 &  a \\
+1 &  b \\
+2 &  c \\
+\bottomrule
+\end{tabular}
+"""
+        assert withindex_result == withindex_expected
+
+    def test_to_latex_bold_rows(self):
+        # GH 16707
+        df = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+        observed = df.to_latex(bold_rows=True)
+        expected = r"""\begin{tabular}{lrl}
+\toprule
+{} &  a &   b \\
+\midrule
+\textbf{0} &  1 &  b1 \\
+\textbf{1} &  2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+        assert observed == expected
+
+    def test_to_latex_no_bold_rows(self):
+        # GH 16707
+        df = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']})
+        observed = df.to_latex(bold_rows=False)
+        expected = r"""\begin{tabular}{lrl}
+\toprule
+{} &  a &   b \\
+\midrule
+0 &  1 &  b1 \\
+1 &  2 &  b2 \\
+\bottomrule
+\end{tabular}
+"""
+        assert observed == expected
+
+    @pytest.mark.parametrize('name0', [None, 'named0'])
+    @pytest.mark.parametrize('name1', [None, 'named1'])
+    @pytest.mark.parametrize('axes', [[0], [1], [0, 1]])
+    def test_to_latex_multiindex_names(self, name0, name1, axes):
+        # GH 18667
+        names = [name0, name1]
+        mi = pd.MultiIndex.from_product([[1, 2], [3, 4]])
+        df = pd.DataFrame(-1, index=mi.copy(), columns=mi.copy())
+        for idx in axes:
+            df.axes[idx].names = names
+
+        idx_names = tuple(n or '{}' for n in names)
+        idx_names_row = ('%s & %s &    &    &    &    \\\\\n' % idx_names
+                         if (0 in axes and any(names)) else '')
+        placeholder = '{}' if any(names) and 1 in axes else ' '
+        col_names = [n if (bool(n) and 1 in axes) else placeholder
+                     for n in names]
+        observed = df.to_latex()
+        expected = r"""\begin{tabular}{llrrrr}
+\toprule
+  & %s & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} \\
+  & %s &  3 &  4 &  3 &  4 \\
+%s\midrule
+1 & 3 & -1 & -1 & -1 & -1 \\
+  & 4 & -1 & -1 & -1 & -1 \\
+2 & 3 & -1 & -1 & -1 & -1 \\
+  & 4 & -1 & -1 & -1 & -1 \\
+\bottomrule
+\end{tabular}
+""" % tuple(list(col_names) + [idx_names_row])
+        assert observed == expected
+
+    @pytest.mark.parametrize('one_row', [True, False])
+    def test_to_latex_multiindex_nans(self, one_row):
+        # GH 14249
+        df = pd.DataFrame({'a': [None, 1], 'b': [2, 3], 'c': [4, 5]})
+        if one_row:
+            df = df.iloc[[0]]
+        observed = df.set_index(['a', 'b']).to_latex()
+        expected = r"""\begin{tabular}{llr}
+\toprule
+    &   &  c \\
+a & b &    \\
+\midrule
+NaN & 2 &  4 \\
+"""
+        if not one_row:
+            expected += r"""1.0 & 3 &  5 \\
+"""
+        expected += r"""\bottomrule
+\end{tabular}
+"""
+        assert observed == expected
+
+    def test_to_latex_non_string_index(self):
+        # GH 19981
+        observed = pd.DataFrame([[1, 2, 3]] * 2).set_index([0, 1]).to_latex()
+        expected = r"""\begin{tabular}{llr}
+\toprule
+  &   &  2 \\
+0 & 1 &    \\
+\midrule
+1 & 2 &  3 \\
+  & 2 &  3 \\
+\bottomrule
+\end{tabular}
+"""
+        assert observed == expected
+
+    def test_to_latex_midrule_location(self):
+        # GH 18326
+        df = pd.DataFrame({'a': [1, 2]})
+        df.index.name = 'foo'
+        observed = df.to_latex(index_names=False)
+        expected = r"""\begin{tabular}{lr}
+\toprule
+{} &  a \\
+\midrule
+0 &  1 \\
+1 &  2 \\
+\bottomrule
+\end{tabular}
+"""
+
+        assert observed == expected
+
+    def test_to_latex_multiindex_empty_name(self):
+        # GH 18669
+        mi = pd.MultiIndex.from_product([[1, 2]], names=[''])
+        df = pd.DataFrame(-1, index=mi, columns=range(4))
+        observed = df.to_latex()
+        expected = r"""\begin{tabular}{lrrrr}
+\toprule
+  &  0 &  1 &  2 &  3 \\
+{} &    &    &    &    \\
+\midrule
+1 & -1 & -1 & -1 & -1 \\
+2 & -1 & -1 & -1 & -1 \\
+\bottomrule
+\end{tabular}
+"""
+        assert observed == expected
@@ -0,0 +1,367 @@
+#!/usr/bin/env python
+
+"""
+Self-contained script to write legacy storage (pickle/msgpack) files.
+
+To use this script, create an environment where you want to
+generate pickles, say it's for 0.18.1, with your pandas clone
+in ~/pandas
+
+. activate pandas_0.18.1
+cd ~/
+
+$ python pandas/pandas/tests/io/generate_legacy_storage_files.py \
+    pandas/pandas/tests/io/data/legacy_pickle/0.18.1/ pickle
+
+This script generates a storage file for the current arch, system,
+and python version
+  pandas version: 0.18.1
+  output dir    : pandas/pandas/tests/io/data/legacy_pickle/0.18.1/
+  storage format: pickle
+created pickle file: 0.18.1_x86_64_darwin_3.5.2.pickle
+
+The idea here is you are using the *current* version of the
+generate_legacy_storage_files with an *older* version of pandas to
+generate a pickle file. We will then check this file into a current
+branch, and test using test_pickle.py. This will load the *older*
+pickles and test versus the current data that is generated
+(with master). These are then compared.
+
+If we have cases where we changed the signature (e.g. we renamed
+offset -> freq in Timestamp), then we have to conditionally execute
+code in generate_legacy_storage_files.py to make it
+run under the older AND the newer version.
+
+"""
+
+from __future__ import print_function
+from warnings import catch_warnings
+from distutils.version import LooseVersion
+from pandas import (Series, DataFrame, Panel,
+                    SparseSeries, SparseDataFrame,
+                    Index, MultiIndex, bdate_range, to_msgpack,
+                    date_range, period_range, timedelta_range,
+                    Timestamp, NaT, Categorical, Period)
+from pandas.tseries.offsets import (
+    DateOffset, Hour, Minute, Day,
+    MonthBegin, MonthEnd, YearBegin,
+    YearEnd, Week, WeekOfMonth, LastWeekOfMonth,
+    BusinessDay, BusinessHour, CustomBusinessDay, FY5253,
+    Easter,
+    SemiMonthEnd, SemiMonthBegin,
+    QuarterBegin, QuarterEnd)
+from pandas.compat import u
+import os
+import sys
+import numpy as np
+import pandas
+import platform as pl
+from datetime import timedelta
+
+_loose_version = LooseVersion(pandas.__version__)
+
+
+def _create_sp_series():
+    """Build a block-kind SparseSeries of 15 floats with NaN gaps."""
+    values = np.arange(15, dtype=np.float64)
+
+    # nan-based: blank out positions 7..11 and the final element
+    values[7:12] = np.nan
+    values[-1:] = np.nan
+
+    sparse = SparseSeries(values, kind='block')
+    sparse.name = u'bseries'
+    return sparse
+
+
+def _create_sp_tsseries():
+    """Build a NaN-gapped block SparseSeries on a business-day index."""
+    values = np.arange(15, dtype=np.float64)
+
+    # nan-based: blank out positions 7..11 and the final element
+    values[7:12] = np.nan
+    values[-1:] = np.nan
+
+    bdays = bdate_range('1/1/2011', periods=len(values))
+    sparse = SparseSeries(values, index=bdays, kind='block')
+    sparse.name = u'btsseries'
+    return sparse
+
+
+def _create_sp_frame():
+    """Build a four-column SparseDataFrame over ten business days."""
+    nan = np.nan
+    index = bdate_range('1/1/2011', periods=10)
+
+    columns = {
+        u'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
+        u'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
+        u'C': np.arange(10).astype(np.int64),
+        u'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
+    }
+    return SparseDataFrame(columns, index=index)
+
+
+def create_data():
+    """ create the pickle/msgpack data """
+
+    data = {
+        u'A': [0., 1., 2., 3., np.nan],
+        u'B': [0, 1, 0, 1, 0],
+        u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'],
+        u'D': date_range('1/1/2009', periods=5),
+        u'E': [0., 1, Timestamp('20100101'), u'foo', 2.]
+    }
+
+    scalars = dict(timestamp=Timestamp('20130101'),
+                   period=Period('2012', 'M'))
+
+    index = dict(int=Index(np.arange(10)),
+                 date=date_range('20130101', periods=10),
+                 period=period_range('2013-01-01', freq='M', periods=10),
+                 float=Index(np.arange(10, dtype=np.float64)),
+                 uint=Index(np.arange(10, dtype=np.uint64)),
+                 timedelta=timedelta_range('00:00:00', freq='30T', periods=10))
+
+    if _loose_version >= LooseVersion('0.18'):
+        from pandas import RangeIndex
+        index['range'] = RangeIndex(10)
+
+    if _loose_version >= LooseVersion('0.21'):
+        from pandas import interval_range
+        index['interval'] = interval_range(0, periods=10)
+
+    mi = dict(reg2=MultiIndex.from_tuples(
+        tuple(zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo',
+                     u'foo', u'qux', u'qux'],
+                    [u'one', u'two', u'one', u'two', u'one',
+                     u'two', u'one', u'two']])),
+        names=[u'first', u'second']))
+
+    series = dict(float=Series(data[u'A']),
+                  int=Series(data[u'B']),
+                  mixed=Series(data[u'E']),
+                  ts=Series(np.arange(10).astype(np.int64),
+                            index=date_range('20130101', periods=10)),
+                  mi=Series(np.arange(5).astype(np.float64),
+                            index=MultiIndex.from_tuples(
+                                tuple(zip(*[[1, 1, 2, 2, 2],
+                                            [3, 4, 3, 4, 5]])),
+                                names=[u'one', u'two'])),
+                  dup=Series(np.arange(5).astype(np.float64),
+                             index=[u'A', u'B', u'C', u'D', u'A']),
+                  cat=Series(Categorical([u'foo', u'bar', u'baz'])),
+                  dt=Series(date_range('20130101', periods=5)),
+                  dt_tz=Series(date_range('20130101', periods=5,
+                                          tz='US/Eastern')),
+                  period=Series([Period('2000Q1')] * 5))
+
+    mixed_dup_df = DataFrame(data)
+    mixed_dup_df.columns = list(u"ABCDA")
+    frame = dict(float=DataFrame({u'A': series[u'float'],
+                                  u'B': series[u'float'] + 1}),
+                 int=DataFrame({u'A': series[u'int'],
+                                u'B': series[u'int'] + 1}),
+                 mixed=DataFrame({k: data[k]
+                                  for k in [u'A', u'B', u'C', u'D']}),
+                 mi=DataFrame({u'A': np.arange(5).astype(np.float64),
+                               u'B': np.arange(5).astype(np.int64)},
+                              index=MultiIndex.from_tuples(
+                                  tuple(zip(*[[u'bar', u'bar', u'baz',
+                                               u'baz', u'baz'],
+                                              [u'one', u'two', u'one',
+                                               u'two', u'three']])),
+                                  names=[u'first', u'second'])),
+                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
+                               columns=[u'A', u'B', u'A']),
+                 cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}),
+                 cat_and_float=DataFrame({
+                     u'A': Categorical([u'foo', u'bar', u'baz']),
+                     u'B': np.arange(3).astype(np.int64)}),
+                 mixed_dup=mixed_dup_df,
+                 dt_mixed_tzs=DataFrame({
+                     u'A': Timestamp('20130102', tz='US/Eastern'),
+                     u'B': Timestamp('20130603', tz='CET')}, index=range(5)),
+                 dt_mixed2_tzs=DataFrame({
+                     u'A': Timestamp('20130102', tz='US/Eastern'),
+                     u'B': Timestamp('20130603', tz='CET'),
+                     u'C': Timestamp('20130603', tz='UTC')}, index=range(5))
+                 )
+
+    with catch_warnings(record=True):
+        mixed_dup_panel = Panel({u'ItemA': frame[u'float'],
+                                 u'ItemB': frame[u'int']})
+        mixed_dup_panel.items = [u'ItemA', u'ItemA']
+        panel = dict(float=Panel({u'ItemA': frame[u'float'],
+                                  u'ItemB': frame[u'float'] + 1}),
+                     dup=Panel(
+                         np.arange(30).reshape(3, 5, 2).astype(np.float64),
+                         items=[u'A', u'B', u'A']),
+                     mixed_dup=mixed_dup_panel)
+
+    cat = dict(int8=Categorical(list('abcdefg')),
+               int16=Categorical(np.arange(1000)),
+               int32=Categorical(np.arange(10000)))
+
+    timestamp = dict(normal=Timestamp('2011-01-01'),
+                     nat=NaT,
+                     tz=Timestamp('2011-01-01', tz='US/Eastern'))
+
+    if _loose_version < LooseVersion('0.19.2'):
+        timestamp['freq'] = Timestamp('2011-01-01', offset='D')
+        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
+                                      offset='M')
+    else:
+        timestamp['freq'] = Timestamp('2011-01-01', freq='D')
+        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
+                                      freq='M')
+
+    off = {'DateOffset': DateOffset(years=1),
+           'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
+           'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
+           'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
+           'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
+           'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
+           'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
+           'MonthBegin': MonthBegin(1),
+           'MonthEnd': MonthEnd(1),
+           'QuarterBegin': QuarterBegin(1),
+           'QuarterEnd': QuarterEnd(1),
+           'Day': Day(1),
+           'YearBegin': YearBegin(1),
+           'YearEnd': YearEnd(1),
+           'Week': Week(1),
+           'Week_Tues': Week(2, normalize=False, weekday=1),
+           'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
+           'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
+           'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
+           'Easter': Easter(),
+           'Hour': Hour(1),
+           'Minute': Minute(1)}
+
+    return dict(series=series,
+                frame=frame,
+                panel=panel,
+                index=index,
+                scalars=scalars,
+                mi=mi,
+                sp_series=dict(float=_create_sp_series(),
+                               ts=_create_sp_tsseries()),
+                sp_frame=dict(float=_create_sp_frame()),
+                cat=cat,
+                timestamp=timestamp,
+                offsets=off)
+
+
+def create_pickle_data():
+    """Return the ``create_data()`` payload, trimmed for older pandas."""
+    payload = create_data()
+
+    # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and
+    # panels if their columns/items were non-unique.
+    if _loose_version < LooseVersion('0.14.1'):
+        for section in ('frame', 'panel'):
+            del payload[section]['mixed_dup']
+    # Period entries are dropped for anything older than 0.17.0.
+    if _loose_version < LooseVersion('0.17.0'):
+        for section in ('series', 'scalars'):
+            del payload[section]['period']
+    return payload
+
+
+def _u(x):
+    """Recursively apply ``u`` to dict keys; return non-dicts unchanged."""
+    if not isinstance(x, dict):
+        return x
+    return {u(key): _u(x[key]) for key in x}
+
+
+def create_msgpack_data():
+    """Return the ``create_data()`` payload trimmed for msgpack output.
+
+    Entries are removed per pandas version, the groups marked "Not
+    supported" are dropped unconditionally, and the remaining dict keys
+    are passed through ``_u`` before being returned.
+    """
+    data = create_data()
+    if _loose_version < LooseVersion('0.17.0'):
+        del data['frame']['mixed_dup']
+        del data['panel']['mixed_dup']
+        del data['frame']['dup']
+        del data['panel']['dup']
+    if _loose_version < LooseVersion('0.18.0'):
+        del data['series']['dt_tz']
+        del data['frame']['dt_mixed_tzs']
+    # Not supported
+    del data['sp_series']
+    del data['sp_frame']
+    del data['series']['cat']
+    del data['series']['period']
+    del data['frame']['cat_onecol']
+    del data['frame']['cat_and_float']
+    del data['scalars']['period']
+    if _loose_version < LooseVersion('0.23.0'):
+        # create_data() only adds index['interval'] for pandas >= 0.21, so
+        # the key is absent on older versions; pop() with a default avoids
+        # the KeyError a plain ``del`` would raise there.
+        data['index'].pop('interval', None)
+    del data['offsets']
+    return _u(data)
+
+
+def platform_name():
+    """Return '<pandas version>_<machine>_<system>_<python version>'."""
+    parts = (pandas.__version__, pl.machine(),
+             pl.system().lower(), pl.python_version())
+    return '_'.join(str(part) for part in parts)
+
+
+def write_legacy_pickles(output_dir):
+    """Pickle ``create_pickle_data()`` into ``output_dir``.
+
+    The file is named ``<platform_name()>.pickle`` and written with
+    ``pickle.HIGHEST_PROTOCOL``; progress is printed to stdout.
+    """
+
+    # make sure we are < 0.13 compat (in py3)
+    try:
+        from pandas.compat import zip, cPickle as pickle  # noqa
+    except ImportError:
+        # only a failed import should trigger the fallback; a bare except
+        # would also swallow KeyboardInterrupt/SystemExit
+        import pickle
+
+    version = pandas.__version__
+
+    print("This script generates a storage file for the current arch, system, "
+          "and python version")
+    print("  pandas version: {0}".format(version))
+    print("  output dir    : {0}".format(output_dir))
+    print("  storage format: pickle")
+
+    pth = '{0}.pickle'.format(platform_name())
+
+    # the context manager closes the file even if pickling raises
+    with open(os.path.join(output_dir, pth), 'wb') as fh:
+        pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL)
+
+    print("created pickle file: %s" % pth)
+
+
+def write_legacy_msgpack(output_dir, compress):
+    """Write ``create_msgpack_data()`` to ``<platform_name()>.msgpack``
+    inside ``output_dir``, forwarding ``compress`` to ``to_msgpack``."""
+
+    print("This script generates a storage file for the current arch, "
+          "system, and python version")
+    print("  pandas version: {0}".format(pandas.__version__))
+    print("  output dir    : {0}".format(output_dir))
+    print("  storage format: msgpack")
+
+    file_name = '{0}.msgpack'.format(platform_name())
+    target = os.path.join(output_dir, file_name)
+    to_msgpack(target, create_msgpack_data(), compress=compress)
+
+    print("created msgpack file: %s" % file_name)
+
+
+def write_legacy_file():
+    """Command-line entry point: parse ``sys.argv`` and write one file.
+
+    Expects ``<output_dir> <storage_type> [<msgpack_compress_type>]``;
+    exits with a usage message when the argument count is wrong or the
+    storage type is neither 'pickle' nor 'msgpack'.
+    """
+    # force our cwd to be the first searched
+    sys.path.insert(0, '.')
+
+    # sys.exit is used rather than the ``exit`` builtin: the latter is
+    # injected by the ``site`` module and is missing under ``python -S``
+    if not (3 <= len(sys.argv) <= 4):
+        sys.exit("Specify output directory and storage type: generate_legacy_"
+                 "storage_files.py <output_dir> <storage_type> "
+                 "<msgpack_compress_type>")
+
+    output_dir = str(sys.argv[1])
+    storage_type = str(sys.argv[2])
+    # the compression type is optional and only used for msgpack
+    compress_type = str(sys.argv[3]) if len(sys.argv) > 3 else None
+
+    if storage_type == 'pickle':
+        write_legacy_pickles(output_dir=output_dir)
+    elif storage_type == 'msgpack':
+        write_legacy_msgpack(output_dir=output_dir, compress=compress_type)
+    else:
+        sys.exit("storage_type must be one of {'pickle', 'msgpack'}")
+
+
+if __name__ == '__main__':
+    write_legacy_file()
@@ -0,0 +1,90 @@
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_raises_regex
+
+
+def test_compression_roundtrip(compression):
+    df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
+                       [12.32112, 123123.2, 321321.2]],
+                      index=['A', 'B'], columns=['X', 'Y', 'Z'])
+
+    with tm.ensure_clean() as path:
+        df.to_json(path, compression=compression)
+        assert_frame_equal(df, pd.read_json(path,
+                                            compression=compression))
+
+        # explicitly ensure file was compressed.
+        with tm.decompress_file(path, compression) as fh:
+            result = fh.read().decode('utf8')
+        assert_frame_equal(df, pd.read_json(result))
+
+
+def test_read_zipped_json(datapath):
+    uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
+    uncompressed_df = pd.read_json(uncompressed_path)
+
+    compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
+    compressed_df = pd.read_json(compressed_path, compression='zip')
+
+    assert_frame_equal(uncompressed_df, compressed_df)
+
+
+def test_with_s3_url(compression):
+    boto3 = pytest.importorskip('boto3')
+    pytest.importorskip('s3fs')
+    moto = pytest.importorskip('moto')
+
+    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+    with moto.mock_s3():
+        conn = boto3.resource("s3", region_name="us-east-1")
+        bucket = conn.create_bucket(Bucket="pandas-test")
+
+        with tm.ensure_clean() as path:
+            df.to_json(path, compression=compression)
+            with open(path, 'rb') as f:
+                bucket.put_object(Key='test-1', Body=f)
+
+        roundtripped_df = pd.read_json('s3://pandas-test/test-1',
+                                       compression=compression)
+        assert_frame_equal(df, roundtripped_df)
+
+
+def test_lines_with_compression(compression):
+
+    with tm.ensure_clean() as path:
+        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+        df.to_json(path, orient='records', lines=True,
+                   compression=compression)
+        roundtripped_df = pd.read_json(path, lines=True,
+                                       compression=compression)
+        assert_frame_equal(df, roundtripped_df)
+
+
+def test_chunksize_with_compression(compression):
+
+    with tm.ensure_clean() as path:
+        df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
+        df.to_json(path, orient='records', lines=True,
+                   compression=compression)
+
+        res = pd.read_json(path, lines=True, chunksize=1,
+                           compression=compression)
+        roundtripped_df = pd.concat(res)
+        assert_frame_equal(df, roundtripped_df)
+
+
+def test_write_unsupported_compression_type():
+    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+    with tm.ensure_clean() as path:
+        msg = "Unrecognized compression type: unsupported"
+        assert_raises_regex(ValueError, msg, df.to_json,
+                            path, compression="unsupported")
+
+
+def test_read_unsupported_compression_type():
+    with tm.ensure_clean() as path:
+        msg = "Unrecognized compression type: unsupported"
+        assert_raises_regex(ValueError, msg, pd.read_json,
+                            path, compression="unsupported")
@@ -0,0 +1,575 @@
+"""Tests for Table Schema integration."""
+import json
+from collections import OrderedDict
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from pandas import DataFrame
+from pandas.core.dtypes.dtypes import (
+    PeriodDtype, CategoricalDtype, DatetimeTZDtype)
+from pandas.io.json.table_schema import (
+    as_json_table_type,
+    build_table_schema,
+    convert_pandas_type_to_json_field,
+    convert_json_field_to_pandas_type,
+    set_default_names)
+import pandas.util.testing as tm
+
+
+class TestBuildSchema(object):
+
+    def setup_method(self, method):
+        self.df = DataFrame(
+            {'A': [1, 2, 3, 4],
+             'B': ['a', 'b', 'c', 'c'],
+             'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             'D': pd.timedelta_range('1H', periods=4, freq='T'),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+    def test_build_table_schema(self):
+        result = build_table_schema(self.df, version=False)
+        expected = {
+            'fields': [{'name': 'idx', 'type': 'integer'},
+                       {'name': 'A', 'type': 'integer'},
+                       {'name': 'B', 'type': 'string'},
+                       {'name': 'C', 'type': 'datetime'},
+                       {'name': 'D', 'type': 'duration'},
+                       ],
+            'primaryKey': ['idx']
+        }
+        assert result == expected
+        result = build_table_schema(self.df)
+        assert "pandas_version" in result
+
+    def test_series(self):
+        s = pd.Series([1, 2, 3], name='foo')
+        result = build_table_schema(s, version=False)
+        expected = {'fields': [{'name': 'index', 'type': 'integer'},
+                               {'name': 'foo', 'type': 'integer'}],
+                    'primaryKey': ['index']}
+        assert result == expected
+        result = build_table_schema(s)
+        assert 'pandas_version' in result
+
+    def test_series_unnamed(self):
+        result = build_table_schema(pd.Series([1, 2, 3]), version=False)
+        expected = {'fields': [{'name': 'index', 'type': 'integer'},
+                               {'name': 'values', 'type': 'integer'}],
+                    'primaryKey': ['index']}
+        assert result == expected
+
+    def test_multiindex(self):
+        df = self.df.copy()
+        idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])
+        df.index = idx
+
+        result = build_table_schema(df, version=False)
+        expected = {
+            'fields': [{'name': 'level_0', 'type': 'string'},
+                       {'name': 'level_1', 'type': 'integer'},
+                       {'name': 'A', 'type': 'integer'},
+                       {'name': 'B', 'type': 'string'},
+                       {'name': 'C', 'type': 'datetime'},
+                       {'name': 'D', 'type': 'duration'},
+                       ],
+            'primaryKey': ['level_0', 'level_1']
+        }
+        assert result == expected
+
+        df.index.names = ['idx0', None]
+        expected['fields'][0]['name'] = 'idx0'
+        expected['primaryKey'] = ['idx0', 'level_1']
+        result = build_table_schema(df, version=False)
+        assert result == expected
+
+
+class TestTableSchemaType(object):
+
+    @pytest.mark.parametrize('int_type', [
+        np.int, np.int16, np.int32, np.int64])
+    def test_as_json_table_type_int_data(self, int_type):
+        int_data = [1, 2, 3]
+        assert as_json_table_type(np.array(
+            int_data, dtype=int_type)) == 'integer'
+
+    @pytest.mark.parametrize('float_type', [
+        np.float, np.float16, np.float32, np.float64])
+    def test_as_json_table_type_float_data(self, float_type):
+        float_data = [1., 2., 3.]
+        assert as_json_table_type(np.array(
+            float_data, dtype=float_type)) == 'number'
+
+    @pytest.mark.parametrize('bool_type', [bool, np.bool])
+    def test_as_json_table_type_bool_data(self, bool_type):
+        bool_data = [True, False]
+        assert as_json_table_type(np.array(
+            bool_data, dtype=bool_type)) == 'boolean'
+
+    @pytest.mark.parametrize('date_data', [
+        pd.to_datetime(['2016']),
+        pd.to_datetime(['2016'], utc=True),
+        pd.Series(pd.to_datetime(['2016'])),
+        pd.Series(pd.to_datetime(['2016'], utc=True)),
+        pd.period_range('2016', freq='A', periods=3)
+    ])
+    def test_as_json_table_type_date_data(self, date_data):
+        assert as_json_table_type(date_data) == 'datetime'
+
+    @pytest.mark.parametrize('str_data', [
+        pd.Series(['a', 'b']), pd.Index(['a', 'b'])])
+    def test_as_json_table_type_string_data(self, str_data):
+        assert as_json_table_type(str_data) == 'string'
+
+    @pytest.mark.parametrize('cat_data', [
+        pd.Categorical(['a']),
+        pd.Categorical([1]),
+        pd.Series(pd.Categorical([1])),
+        pd.CategoricalIndex([1]),
+        pd.Categorical([1])])
+    def test_as_json_table_type_categorical_data(self, cat_data):
+        assert as_json_table_type(cat_data) == 'any'
+
+    # ------
+    # dtypes
+    # ------
+    @pytest.mark.parametrize('int_dtype', [
+        np.int, np.int16, np.int32, np.int64])
+    def test_as_json_table_type_int_dtypes(self, int_dtype):
+        assert as_json_table_type(int_dtype) == 'integer'
+
+    @pytest.mark.parametrize('float_dtype', [
+        np.float, np.float16, np.float32, np.float64])
+    def test_as_json_table_type_float_dtypes(self, float_dtype):
+        assert as_json_table_type(float_dtype) == 'number'
+
+    @pytest.mark.parametrize('bool_dtype', [bool, np.bool])
+    def test_as_json_table_type_bool_dtypes(self, bool_dtype):
+        assert as_json_table_type(bool_dtype) == 'boolean'
+
+    @pytest.mark.parametrize('date_dtype', [
+        np.datetime64, np.dtype("<M8[ns]"), PeriodDtype(),
+        DatetimeTZDtype('ns', 'US/Central')])
+    def test_as_json_table_type_date_dtypes(self, date_dtype):
+        # TODO: datetime.date? datetime.time?
+        assert as_json_table_type(date_dtype) == 'datetime'
+
+    @pytest.mark.parametrize('td_dtype', [
+        np.timedelta64, np.dtype("<m8[ns]")])
+    def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
+        assert as_json_table_type(td_dtype) == 'duration'
+
+    @pytest.mark.parametrize('str_dtype', [object])  # TODO
+    def test_as_json_table_type_string_dtypes(self, str_dtype):
+        assert as_json_table_type(str_dtype) == 'string'
+
+    def test_as_json_table_type_categorical_dtypes(self):
+        # TODO: I think before is_categorical_dtype(Categorical)
+        # returned True, but now it's False. Figure out why or
+        # if it matters
+        assert as_json_table_type(pd.Categorical(['a'])) == 'any'
+        assert as_json_table_type(CategoricalDtype()) == 'any'
+
+
+class TestTableOrient(object):
+
+    def setup_method(self, method):
+        self.df = DataFrame(
+            {'A': [1, 2, 3, 4],
+             'B': ['a', 'b', 'c', 'c'],
+             'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             'D': pd.timedelta_range('1H', periods=4, freq='T'),
+             'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
+             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+                                           ordered=True)),
+             'G': [1., 2., 3, 4.],
+             'H': pd.date_range('2016-01-01', freq='d', periods=4,
+                                tz='US/Central'),
+             },
+            index=pd.Index(range(4), name='idx'))
+
+    def test_build_series(self):
+        s = pd.Series([1, 2], name='a')
+        s.index.name = 'id'
+        result = s.to_json(orient='table', date_format='iso')
+        result = json.loads(result, object_pairs_hook=OrderedDict)
+
+        assert "pandas_version" in result['schema']
+        result['schema'].pop('pandas_version')
+
+        fields = [{'name': 'id', 'type': 'integer'},
+                  {'name': 'a', 'type': 'integer'}]
+
+        schema = {
+            'fields': fields,
+            'primaryKey': ['id'],
+        }
+
+        expected = OrderedDict([
+            ('schema', schema),
+            ('data', [OrderedDict([('id', 0), ('a', 1)]),
+                      OrderedDict([('id', 1), ('a', 2)])])])
+        assert result == expected
+
+    def test_to_json(self):
+        df = self.df.copy()
+        df.index.name = 'idx'
+        result = df.to_json(orient='table', date_format='iso')
+        result = json.loads(result, object_pairs_hook=OrderedDict)
+
+        assert "pandas_version" in result['schema']
+        result['schema'].pop('pandas_version')
+
+        fields = [
+            {'name': 'idx', 'type': 'integer'},
+            {'name': 'A', 'type': 'integer'},
+            {'name': 'B', 'type': 'string'},
+            {'name': 'C', 'type': 'datetime'},
+            {'name': 'D', 'type': 'duration'},
+            {'constraints': {'enum': ['a', 'b', 'c']},
+             'name': 'E',
+             'ordered': False,
+             'type': 'any'},
+            {'constraints': {'enum': ['a', 'b', 'c']},
+             'name': 'F',
+             'ordered': True,
+             'type': 'any'},
+            {'name': 'G', 'type': 'number'},
+            {'name': 'H', 'type': 'datetime', 'tz': 'US/Central'}
+        ]
+
+        schema = {
+            'fields': fields,
+            'primaryKey': ['idx'],
+        }
+        data = [
+            OrderedDict([('idx', 0), ('A', 1), ('B', 'a'),
+                         ('C', '2016-01-01T00:00:00.000Z'),
+                         ('D', 'P0DT1H0M0S'),
+                         ('E', 'a'), ('F', 'a'), ('G', 1.),
+                         ('H', '2016-01-01T06:00:00.000Z')
+                         ]),
+            OrderedDict([('idx', 1), ('A', 2), ('B', 'b'),
+                         ('C', '2016-01-02T00:00:00.000Z'),
+                         ('D', 'P0DT1H1M0S'),
+                         ('E', 'b'), ('F', 'b'), ('G', 2.),
+                         ('H', '2016-01-02T06:00:00.000Z')
+                         ]),
+            OrderedDict([('idx', 2), ('A', 3), ('B', 'c'),
+                         ('C', '2016-01-03T00:00:00.000Z'),
+                         ('D', 'P0DT1H2M0S'),
+                         ('E', 'c'), ('F', 'c'), ('G', 3.),
+                         ('H', '2016-01-03T06:00:00.000Z')
+                         ]),
+            OrderedDict([('idx', 3), ('A', 4), ('B', 'c'),
+                         ('C', '2016-01-04T00:00:00.000Z'),
+                         ('D', 'P0DT1H3M0S'),
+                         ('E', 'c'), ('F', 'c'), ('G', 4.),
+                         ('H', '2016-01-04T06:00:00.000Z')
+                         ]),
+        ]
+        expected = OrderedDict([('schema', schema), ('data', data)])
+        assert result == expected
+
+    def test_to_json_float_index(self):
+        data = pd.Series(1, index=[1., 2.])
+        result = data.to_json(orient='table', date_format='iso')
+        result = json.loads(result, object_pairs_hook=OrderedDict)
+        result['schema'].pop('pandas_version')
+
+        expected = (
+            OrderedDict([('schema', {
+                'fields': [{'name': 'index', 'type': 'number'},
+                           {'name': 'values', 'type': 'integer'}],
+                'primaryKey': ['index']
+            }),
+                ('data', [OrderedDict([('index', 1.0), ('values', 1)]),
+                          OrderedDict([('index', 2.0), ('values', 1)])])])
+        )
+        assert result == expected
+
+    def test_to_json_period_index(self):
+        idx = pd.period_range('2016', freq='Q-JAN', periods=2)
+        data = pd.Series(1, idx)
+        result = data.to_json(orient='table', date_format='iso')
+        result = json.loads(result, object_pairs_hook=OrderedDict)
+        result['schema'].pop('pandas_version')
+
+        fields = [{'freq': 'Q-JAN', 'name': 'index', 'type': 'datetime'},
+                  {'name': 'values', 'type': 'integer'}]
+
+        schema = {'fields': fields, 'primaryKey': ['index']}
+        data = [OrderedDict([('index', '2015-11-01T00:00:00.000Z'),
+                             ('values', 1)]),
+                OrderedDict([('index', '2016-02-01T00:00:00.000Z'),
+                             ('values', 1)])]
+        expected = OrderedDict([('schema', schema), ('data', data)])
+        assert result == expected
+
+    def test_to_json_categorical_index(self):
+        data = pd.Series(1, pd.CategoricalIndex(['a', 'b']))
+        result = data.to_json(orient='table', date_format='iso')
+        result = json.loads(result, object_pairs_hook=OrderedDict)
+        result['schema'].pop('pandas_version')
+
+        expected = (
+            OrderedDict([('schema',
+                          {'fields': [{'name': 'index', 'type': 'any',
+                                       'constraints': {'enum': ['a', 'b']},
+                                       'ordered': False},
+                                      {'name': 'values', 'type': 'integer'}],
+                           'primaryKey': ['index']}),
+                         ('data', [
+                             OrderedDict([('index', 'a'),
+                                          ('values', 1)]),
+                             OrderedDict([('index', 'b'), ('values', 1)])])])
+        )
+        assert result == expected
+
+    def test_date_format_raises(self):
+        with pytest.raises(ValueError):
+            self.df.to_json(orient='table', date_format='epoch')
+
+        # others work
+        self.df.to_json(orient='table', date_format='iso')
+        self.df.to_json(orient='table')
+
+    @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
+    def test_convert_pandas_type_to_json_field_int(self, kind):
+        data = [1, 2, 3]
+        result = convert_pandas_type_to_json_field(kind(data, name='name'))
+        expected = {"name": "name", "type": "integer"}
+        assert result == expected
+
+    @pytest.mark.parametrize('kind', [pd.Series, pd.Index])
+    def test_convert_pandas_type_to_json_field_float(self, kind):
+        data = [1., 2., 3.]
+        result = convert_pandas_type_to_json_field(kind(data, name='name'))
+        expected = {"name": "name", "type": "number"}
+        assert result == expected
+
+    @pytest.mark.parametrize('dt_args,extra_exp', [
+        ({}, {}), ({'utc': True}, {'tz': 'UTC'})])
+    @pytest.mark.parametrize('wrapper', [None, pd.Series])
+    def test_convert_pandas_type_to_json_field_datetime(self, dt_args,
+                                                        extra_exp, wrapper):
+        data = [1., 2., 3.]
+        data = pd.to_datetime(data, **dt_args)
+        if wrapper is pd.Series:
+            data = pd.Series(data, name='values')
+        result = convert_pandas_type_to_json_field(data)
+        expected = {"name": "values", "type": 'datetime'}
+        expected.update(extra_exp)
+        assert result == expected
+
+    def test_convert_pandas_type_to_json_period_range(self):
+        arr = pd.period_range('2016', freq='A-DEC', periods=4)
+        result = convert_pandas_type_to_json_field(arr)
+        expected = {"name": "values", "type": 'datetime', "freq": "A-DEC"}
+        assert result == expected
+
+    @pytest.mark.parametrize('kind', [pd.Categorical, pd.CategoricalIndex])
+    @pytest.mark.parametrize('ordered', [True, False])
+    def test_convert_pandas_type_to_json_field_categorical(self, kind,
+                                                           ordered):
+        data = ['a', 'b', 'c']
+        if kind is pd.Categorical:
+            arr = pd.Series(kind(data, ordered=ordered), name='cats')
+        elif kind is pd.CategoricalIndex:
+            arr = kind(data, ordered=ordered, name='cats')
+
+        result = convert_pandas_type_to_json_field(arr)
+        expected = {"name": "cats", "type": "any",
+                    "constraints": {"enum": data},
+                    "ordered": ordered}
+        assert result == expected
+
+    @pytest.mark.parametrize("inp,exp", [
+        ({'type': 'integer'}, 'int64'),
+        ({'type': 'number'}, 'float64'),
+        ({'type': 'boolean'}, 'bool'),
+        ({'type': 'duration'}, 'timedelta64'),
+        ({'type': 'datetime'}, 'datetime64[ns]'),
+        ({'type': 'datetime', 'tz': 'US/Hawaii'}, 'datetime64[ns, US/Hawaii]'),
+        ({'type': 'any'}, 'object'),
+        ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
+          'ordered': False}, CategoricalDtype(categories=['a', 'b', 'c'],
+                                              ordered=False)),
+        ({'type': 'any', 'constraints': {'enum': ['a', 'b', 'c']},
+          'ordered': True}, CategoricalDtype(categories=['a', 'b', 'c'],
+                                             ordered=True)),
+        ({'type': 'string'}, 'object')])
+    def test_convert_json_field_to_pandas_type(self, inp, exp):
+        field = {'name': 'foo'}
+        field.update(inp)
+        assert convert_json_field_to_pandas_type(field) == exp
+
+    @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
+    def test_convert_json_field_to_pandas_type_raises(self, inp):
+        field = {'type': inp}
+        with tm.assert_raises_regex(ValueError, "Unsupported or invalid field "
+                                    "type: {}".format(inp)):
+            convert_json_field_to_pandas_type(field)
+
+    def test_categorical(self):
+        s = pd.Series(pd.Categorical(['a', 'b', 'a']))
+        s.index.name = 'idx'
+        result = s.to_json(orient='table', date_format='iso')
+        result = json.loads(result, object_pairs_hook=OrderedDict)
+        result['schema'].pop('pandas_version')
+
+        fields = [{'name': 'idx', 'type': 'integer'},
+                  {'constraints': {'enum': ['a', 'b']},
+                   'name': 'values',
+                   'ordered': False,
+                   'type': 'any'}]
+
+        expected = OrderedDict([
+            ('schema', {'fields': fields,
+                        'primaryKey': ['idx']}),
+            ('data', [OrderedDict([('idx', 0), ('values', 'a')]),
+                      OrderedDict([('idx', 1), ('values', 'b')]),
+                      OrderedDict([('idx', 2), ('values', 'a')])])])
+        assert result == expected
+
+    @pytest.mark.parametrize('idx,nm,prop', [
+        (pd.Index([1]), 'index', 'name'),
+        (pd.Index([1], name='myname'), 'myname', 'name'),
+        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')]),
+         ['level_0', 'level_1'], 'names'),
+        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
+                                    names=['n1', 'n2']),
+         ['n1', 'n2'], 'names'),
+        (pd.MultiIndex.from_product([('a', 'b'), ('c', 'd')],
+                                    names=['n1', None]),
+         ['n1', 'level_1'], 'names')
+    ])
+    def test_set_names_unset(self, idx, nm, prop):
+        data = pd.Series(1, idx)
+        result = set_default_names(data)
+        assert getattr(result.index, prop) == nm
+
+    @pytest.mark.parametrize("idx", [
+        pd.Index([], name='index'),
+        pd.MultiIndex.from_arrays([['foo'], ['bar']],
+                                  names=('level_0', 'level_1')),
+        pd.MultiIndex.from_arrays([['foo'], ['bar']],
+                                  names=('foo', 'level_1'))
+    ])
+    def test_warns_non_roundtrippable_names(self, idx):
+        # GH 19130
+        df = pd.DataFrame([[]], index=idx)
+        df.index.name = 'index'
+        with tm.assert_produces_warning():
+            set_default_names(df)
+
+    def test_timestamp_in_columns(self):
+        df = pd.DataFrame([[1, 2]], columns=[pd.Timestamp('2016'),
+                                             pd.Timedelta(10, unit='s')])
+        result = df.to_json(orient="table")
+        js = json.loads(result)
+        assert js['schema']['fields'][1]['name'] == 1451606400000
+        assert js['schema']['fields'][2]['name'] == 10000
+
+    @pytest.mark.parametrize('case', [
+        pd.Series([1], index=pd.Index([1], name='a'), name='a'),
+        pd.DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
+        pd.DataFrame({"A": [1]}, index=pd.MultiIndex.from_arrays([
+            ['a'], [1]], names=["A", "a"]))
+    ])
+    def test_overlapping_names(self, case):
+        with tm.assert_raises_regex(ValueError, 'Overlapping'):
+            case.to_json(orient='table')
+
+    def test_mi_falsey_name(self):
+        # GH 16203
+        df = pd.DataFrame(np.random.randn(4, 4),
+                          index=pd.MultiIndex.from_product([('A', 'B'),
+                                                            ('a', 'b')]))
+        result = [x['name'] for x in build_table_schema(df)['fields']]
+        assert result == ['level_0', 'level_1', 0, 1, 2, 3]
+
+
+class TestTableOrientReader(object):
+
+    @pytest.mark.parametrize("index_nm", [
+        None, "idx", pytest.param("index", marks=pytest.mark.xfail),
+        'level_0'])
+    @pytest.mark.parametrize("vals", [
+        {'ints': [1, 2, 3, 4]},
+        {'objects': ['a', 'b', 'c', 'd']},
+        {'date_ranges': pd.date_range('2016-01-01', freq='d', periods=4)},
+        {'categoricals': pd.Series(pd.Categorical(['a', 'b', 'c', 'c']))},
+        {'ordered_cats': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+                                                  ordered=True))},
+        pytest.param({'floats': [1., 2., 3., 4.]}, marks=pytest.mark.xfail),
+        {'floats': [1.1, 2.2, 3.3, 4.4]},
+        {'bools': [True, False, False, True]}])
+    def test_read_json_table_orient(self, index_nm, vals, recwarn):
+        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.parametrize("index_nm", [
+        None, "idx", "index"])
+    @pytest.mark.parametrize("vals", [
+        {'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')},
+        {'timezones': pd.date_range('2016-01-01', freq='d', periods=4,
+                                    tz='US/Central')}])
+    def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
+        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
+        out = df.to_json(orient="table")
+        with tm.assert_raises_regex(NotImplementedError, 'can not yet read '):
+            pd.read_json(out, orient="table")
+
+    def test_comprehensive(self):
+        df = DataFrame(
+            {'A': [1, 2, 3, 4],
+             'B': ['a', 'b', 'c', 'c'],
+             'C': pd.date_range('2016-01-01', freq='d', periods=4),
+             # 'D': pd.timedelta_range('1H', periods=4, freq='T'),
+             'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
+             'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'],
+                                           ordered=True)),
+             'G': [1.1, 2.2, 3.3, 4.4],
+             # 'H': pd.date_range('2016-01-01', freq='d', periods=4,
+             #                   tz='US/Central'),
+             'I': [True, False, False, True],
+             },
+            index=pd.Index(range(4), name='idx'))
+
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.parametrize("index_names", [
+        [None, None], ['foo', 'bar'], ['foo', None], [None, 'foo'],
+        ['index', 'foo']])
+    def test_multiindex(self, index_names):
+        # GH 18912
+        df = pd.DataFrame(
+            [["Arr", "alpha", [1, 2, 3, 4]],
+             ["Bee", "Beta", [10, 20, 30, 40]]],
+            index=[["A", "B"], ["Null", "Eins"]],
+            columns=["Aussprache", "Griechisch", "Args"]
+        )
+        df.index.names = index_names
+        out = df.to_json(orient="table")
+        result = pd.read_json(out, orient="table")
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.parametrize("strict_check", [
+        pytest.param(True, marks=pytest.mark.xfail), False])
+    def test_empty_frame_roundtrip(self, strict_check):
+        # GH 21287
+        df = pd.DataFrame([], columns=['a', 'b', 'c'])
+        expected = df.copy()
+        out = df.to_json(orient='table')
+        result = pd.read_json(out, orient='table')
+        # TODO: When DF coercion issue (#21345) is resolved tighten type checks
+        tm.assert_frame_equal(expected, result,
+                              check_dtype=strict_check,
+                              check_index_type=strict_check)
@@ -0,0 +1,442 @@
+import pytest
+import numpy as np
+import json
+
+import pandas.util.testing as tm
+from pandas import compat, Index, DataFrame
+
+from pandas.io.json import json_normalize
+from pandas.io.json.normalize import nested_to_record
+
+
+@pytest.fixture
+def deep_nested():
+    # deeply nested data
+    return [{'country': 'USA',
+             'states': [{'name': 'California',
+                         'cities': [{'name': 'San Francisco',
+                                     'pop': 12345},
+                                    {'name': 'Los Angeles',
+                                     'pop': 12346}]
+                         },
+                        {'name': 'Ohio',
+                         'cities': [{'name': 'Columbus',
+                                     'pop': 1234},
+                                    {'name': 'Cleveland',
+                                     'pop': 1236}]}
+                        ]
+             },
+            {'country': 'Germany',
+             'states': [{'name': 'Bayern',
+                         'cities': [{'name': 'Munich', 'pop': 12347}]
+                         },
+                        {'name': 'Nordrhein-Westfalen',
+                         'cities': [{'name': 'Duesseldorf', 'pop': 1238},
+                                    {'name': 'Koeln', 'pop': 1239}]}
+                        ]
+             }
+            ]
+
+
+@pytest.fixture
+def state_data():
+    return [
+        {'counties': [{'name': 'Dade', 'population': 12345},
+                      {'name': 'Broward', 'population': 40000},
+                      {'name': 'Palm Beach', 'population': 60000}],
+         'info': {'governor': 'Rick Scott'},
+         'shortname': 'FL',
+         'state': 'Florida'},
+        {'counties': [{'name': 'Summit', 'population': 1234},
+                      {'name': 'Cuyahoga', 'population': 1337}],
+         'info': {'governor': 'John Kasich'},
+         'shortname': 'OH',
+         'state': 'Ohio'}]
+
+
+@pytest.fixture
+def author_missing_data():
+    return [
+        {'info': None},
+        {'info':
+            {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
+            'author_name':
+         {'first': 'Jane', 'last_name': 'Doe'}
+         }]
+
+
+class TestJSONNormalize(object):
+
+    def test_simple_records(self):
+        recs = [{'a': 1, 'b': 2, 'c': 3},
+                {'a': 4, 'b': 5, 'c': 6},
+                {'a': 7, 'b': 8, 'c': 9},
+                {'a': 10, 'b': 11, 'c': 12}]
+
+        result = json_normalize(recs)
+        expected = DataFrame(recs)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_simple_normalize(self, state_data):
+        result = json_normalize(state_data[0], 'counties')
+        expected = DataFrame(state_data[0]['counties'])
+        tm.assert_frame_equal(result, expected)
+
+        result = json_normalize(state_data, 'counties')
+
+        expected = []
+        for rec in state_data:
+            expected.extend(rec['counties'])
+        expected = DataFrame(expected)
+
+        tm.assert_frame_equal(result, expected)
+
+        result = json_normalize(state_data, 'counties', meta='state')
+        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_array(self):
+        result = json_normalize([])
+        expected = DataFrame()
+        tm.assert_frame_equal(result, expected)
+
+    def test_simple_normalize_with_separator(self, deep_nested):
+        # GH 14883
+        result = json_normalize({'A': {'A': 1, 'B': 2}})
+        expected = DataFrame([[1, 2]], columns=['A.A', 'A.B'])
+        tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+        result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_')
+        expected = DataFrame([[1, 2]], columns=['A_A', 'A_B'])
+        tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+        result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3')
+        expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B'])
+        tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+        result = json_normalize(deep_nested, ['states', 'cities'],
+                                meta=['country', ['states', 'name']],
+                                sep='_')
+        expected = Index(['name', 'pop',
+                          'country', 'states_name']).sort_values()
+        assert result.columns.sort_values().equals(expected)
+
+    def test_value_array_record_prefix(self):
+        # GH 21536
+        result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
+        expected = DataFrame([[1], [2]], columns=['Prefix.0'])
+        tm.assert_frame_equal(result, expected)
+
+    def test_more_deeply_nested(self, deep_nested):
+
+        result = json_normalize(deep_nested, ['states', 'cities'],
+                                meta=['country', ['states', 'name']])
+        # meta_prefix={'states': 'state_'})
+
+        ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
+                   'states.name': ['California', 'California', 'Ohio', 'Ohio',
+                                   'Bayern', 'Nordrhein-Westfalen',
+                                   'Nordrhein-Westfalen'],
+                   'name': ['San Francisco', 'Los Angeles', 'Columbus',
+                            'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'],
+                   'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]}
+
+        expected = DataFrame(ex_data, columns=result.columns)
+        tm.assert_frame_equal(result, expected)
+
+    def test_shallow_nested(self):
+        data = [{'state': 'Florida',
+                 'shortname': 'FL',
+                 'info': {
+                     'governor': 'Rick Scott'
+                 },
+                 'counties': [{'name': 'Dade', 'population': 12345},
+                              {'name': 'Broward', 'population': 40000},
+                              {'name': 'Palm Beach', 'population': 60000}]},
+                {'state': 'Ohio',
+                 'shortname': 'OH',
+                 'info': {
+                     'governor': 'John Kasich'
+                 },
+                 'counties': [{'name': 'Summit', 'population': 1234},
+                              {'name': 'Cuyahoga', 'population': 1337}]}]
+
+        result = json_normalize(data, 'counties',
+                                ['state', 'shortname',
+                                 ['info', 'governor']])
+        ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit',
+                            'Cuyahoga'],
+                   'state': ['Florida'] * 3 + ['Ohio'] * 2,
+                   'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
+                   'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
+                   'population': [12345, 40000, 60000, 1234, 1337]}
+        expected = DataFrame(ex_data, columns=result.columns)
+        tm.assert_frame_equal(result, expected)
+
+    def test_meta_name_conflict(self):
+        data = [{'foo': 'hello',
+                 'bar': 'there',
+                 'data': [{'foo': 'something', 'bar': 'else'},
+                          {'foo': 'something2', 'bar': 'else2'}]}]
+
+        with pytest.raises(ValueError):
+            json_normalize(data, 'data', meta=['foo', 'bar'])
+
+        result = json_normalize(data, 'data', meta=['foo', 'bar'],
+                                meta_prefix='meta')
+
+        for val in ['metafoo', 'metabar', 'foo', 'bar']:
+            assert val in result
+
+    def test_meta_parameter_not_modified(self):
+        # GH 18610
+        data = [{'foo': 'hello',
+                 'bar': 'there',
+                 'data': [{'foo': 'something', 'bar': 'else'},
+                          {'foo': 'something2', 'bar': 'else2'}]}]
+
+        COLUMNS = ['foo', 'bar']
+        result = json_normalize(data, 'data', meta=COLUMNS,
+                                meta_prefix='meta')
+
+        assert COLUMNS == ['foo', 'bar']
+        for val in ['metafoo', 'metabar', 'foo', 'bar']:
+            assert val in result
+
+    def test_record_prefix(self, state_data):
+        result = json_normalize(state_data[0], 'counties')
+        expected = DataFrame(state_data[0]['counties'])
+        tm.assert_frame_equal(result, expected)
+
+        result = json_normalize(state_data, 'counties',
+                                meta='state',
+                                record_prefix='county_')
+
+        expected = []
+        for rec in state_data:
+            expected.extend(rec['counties'])
+        expected = DataFrame(expected)
+        expected = expected.rename(columns=lambda x: 'county_' + x)
+        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_non_ascii_key(self):
+        if compat.PY3:
+            testjson = (
+                b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
+                b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
+            ).decode('utf8')
+        else:
+            testjson = ('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
+                        '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]')
+
+        testdata = {
+            u'sub.A': [1, 3],
+            u'sub.B': [2, 4],
+            b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1]
+        }
+        expected = DataFrame(testdata)
+
+        result = json_normalize(json.loads(testjson))
+        tm.assert_frame_equal(result, expected)
+
+    def test_missing_field(self, author_missing_data):
+        # GH20030:
+        result = json_normalize(author_missing_data)
+        ex_data = [
+            {'info': np.nan,
+             'author_name.first': np.nan,
+             'author_name.last_name': np.nan,
+             'info.created_at': np.nan,
+             'info.last_updated': np.nan},
+            {'info': None,
+             'author_name.first': 'Jane',
+             'author_name.last_name': 'Doe',
+             'info.created_at': '11/08/1993',
+             'info.last_updated': '26/05/2012'}
+        ]
+        expected = DataFrame(ex_data)
+        tm.assert_frame_equal(result, expected)
+
+
+class TestNestedToRecord(object):
+
+    def test_flat_stays_flat(self):
+        recs = [dict(flat1=1, flat2=2),
+                dict(flat1=3, flat2=4),
+                ]
+
+        result = nested_to_record(recs)
+        expected = recs
+        assert result == expected
+
+    def test_one_level_deep_flattens(self):
+        data = dict(flat1=1,
+                    dict1=dict(c=1, d=2))
+
+        result = nested_to_record(data)
+        expected = {'dict1.c': 1,
+                    'dict1.d': 2,
+                    'flat1': 1}
+
+        assert result == expected
+
+    def test_nested_flattens(self):
+        data = dict(flat1=1,
+                    dict1=dict(c=1, d=2),
+                    nested=dict(e=dict(c=1, d=2),
+                                d=2))
+
+        result = nested_to_record(data)
+        expected = {'dict1.c': 1,
+                    'dict1.d': 2,
+                    'flat1': 1,
+                    'nested.d': 2,
+                    'nested.e.c': 1,
+                    'nested.e.d': 2}
+
+        assert result == expected
+
+    def test_json_normalize_errors(self):
+        # GH14583: If meta keys are not always present
+        # a new option to set errors='ignore' has been implemented
+        i = {
+            "Trades": [{
+                "general": {
+                    "tradeid": 100,
+                    "trade_version": 1,
+                    "stocks": [{
+
+                        "symbol": "AAPL",
+                        "name": "Apple",
+                        "price": "0"
+                    }, {
+                        "symbol": "GOOG",
+                        "name": "Google",
+                        "price": "0"
+                    }
+                    ]
+                }
+            }, {
+                "general": {
+                    "tradeid": 100,
+                    "stocks": [{
+                        "symbol": "AAPL",
+                        "name": "Apple",
+                        "price": "0"
+                    }, {
+                        "symbol": "GOOG",
+                        "name": "Google",
+                        "price": "0"
+                    }
+                    ]
+                }
+            }
+            ]
+        }
+        j = json_normalize(data=i['Trades'],
+                           record_path=[['general', 'stocks']],
+                           meta=[['general', 'tradeid'],
+                                 ['general', 'trade_version']],
+                           errors='ignore')
+        expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
+                    'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
+                    'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
+                    'price': {0: '0', 1: '0', 2: '0', 3: '0'},
+                    'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
+
+        assert j.fillna('').to_dict() == expected
+
+        pytest.raises(KeyError,
+                      json_normalize, data=i['Trades'],
+                      record_path=[['general', 'stocks']],
+                      meta=[['general', 'tradeid'],
+                            ['general', 'trade_version']],
+                      errors='raise'
+                      )
+
+    def test_donot_drop_nonevalues(self):
+        # GH21356
+        data = [
+            {'info': None,
+             'author_name':
+             {'first': 'Smith', 'last_name': 'Appleseed'}
+             },
+            {'info':
+                {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
+             'author_name':
+                {'first': 'Jane', 'last_name': 'Doe'}
+             }
+        ]
+        result = nested_to_record(data)
+        expected = [
+            {'info': None,
+             'author_name.first': 'Smith',
+             'author_name.last_name': 'Appleseed'},
+            {'author_name.first': 'Jane',
+             'author_name.last_name': 'Doe',
+             'info.created_at': '11/08/1993',
+             'info.last_updated': '26/05/2012'}]
+
+        assert result == expected
+
+    def test_nonetype_top_level_bottom_level(self):
+        # GH21158: If inner level json has a key with a null value
+        # make sure it doesnt do a new_d.pop twice and except
+        data = {
+            "id": None,
+            "location": {
+                "country": {
+                    "state": {
+                        "id": None,
+                        "town.info": {
+                            "id": None,
+                            "region": None,
+                            "x": 49.151580810546875,
+                            "y": -33.148521423339844,
+                            "z": 27.572303771972656}}}
+            }
+        }
+        result = nested_to_record(data)
+        expected = {
+            'id': None,
+            'location.country.state.id': None,
+            'location.country.state.town.info.id': None,
+            'location.country.state.town.info.region': None,
+            'location.country.state.town.info.x': 49.151580810546875,
+            'location.country.state.town.info.y': -33.148521423339844,
+            'location.country.state.town.info.z': 27.572303771972656}
+        assert result == expected
+
+    def test_nonetype_multiple_levels(self):
+        # GH21158: If inner level json has a key with a null value
+        # make sure it doesnt do a new_d.pop twice and except
+        data = {
+            "id": None,
+            "location": {
+                "id": None,
+                "country": {
+                    "id": None,
+                    "state": {
+                        "id": None,
+                        "town.info": {
+                            "region": None,
+                            "x": 49.151580810546875,
+                            "y": -33.148521423339844,
+                            "z": 27.572303771972656}}}
+            }
+        }
+        result = nested_to_record(data)
+        expected = {
+            'id': None,
+            'location.id': None,
+            'location.country.id': None,
+            'location.country.state.id': None,
+            'location.country.state.town.info.region': None,
+            'location.country.state.town.info.x': 49.151580810546875,
+            'location.country.state.town.info.y': -33.148521423339844,
+            'location.country.state.town.info.z': 27.572303771972656}
+        assert result == expected
@@ -0,0 +1,169 @@
+# -*- coding: utf-8 -*-
+import pytest
+import pandas as pd
+from pandas import DataFrame, read_json
+from pandas.compat import StringIO
+from pandas.io.json.json import JsonReader
+import pandas.util.testing as tm
+from pandas.util.testing import (assert_frame_equal, assert_series_equal,
+                                 ensure_clean)
+
+
+@pytest.fixture
+def lines_json_df():
+    df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+    return df.to_json(lines=True, orient="records")
+
+
+def test_read_jsonl():
+    # GH9180
+    result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
+    expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+    assert_frame_equal(result, expected)
+
+
+def test_read_jsonl_unicode_chars():
+    # GH15132: non-ascii unicode characters
+    # \u201d == RIGHT DOUBLE QUOTATION MARK
+
+    # simulate file handle
+    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
+    json = StringIO(json)
+    result = read_json(json, lines=True)
+    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
+                         columns=['a', 'b'])
+    assert_frame_equal(result, expected)
+
+    # simulate string
+    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
+    result = read_json(json, lines=True)
+    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
+                         columns=['a', 'b'])
+    assert_frame_equal(result, expected)
+
+
+def test_to_jsonl():
+    # GH9180
+    df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+    result = df.to_json(orient="records", lines=True)
+    expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
+    assert result == expected
+
+    df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
+    result = df.to_json(orient="records", lines=True)
+    expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
+    assert result == expected
+    assert_frame_equal(read_json(result, lines=True), df)
+
+    # GH15096: escaped characters in columns and data
+    df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
+                   columns=["a\\", 'b'])
+    result = df.to_json(orient="records", lines=True)
+    expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
+                '{"a\\\\":"foo\\"","b":"bar"}')
+    assert result == expected
+    assert_frame_equal(read_json(result, lines=True), df)
+
+
+@pytest.mark.parametrize("chunksize", [1, 1.0])
+def test_readjson_chunks(lines_json_df, chunksize):
+    # Basic test that read_json(chunks=True) gives the same result as
+    # read_json(chunks=False)
+    # GH17048: memory usage when lines=True
+
+    unchunked = read_json(StringIO(lines_json_df), lines=True)
+    reader = read_json(StringIO(lines_json_df), lines=True,
+                       chunksize=chunksize)
+    chunked = pd.concat(reader)
+
+    assert_frame_equal(chunked, unchunked)
+
+
+def test_readjson_chunksize_requires_lines(lines_json_df):
+    msg = "chunksize can only be passed if lines=True"
+    with tm.assert_raises_regex(ValueError, msg):
+        pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
+
+
+def test_readjson_chunks_series():
+    # Test reading line-format JSON to Series with chunksize param
+    s = pd.Series({'A': 1, 'B': 2})
+
+    strio = StringIO(s.to_json(lines=True, orient="records"))
+    unchunked = pd.read_json(strio, lines=True, typ='Series')
+
+    strio = StringIO(s.to_json(lines=True, orient="records"))
+    chunked = pd.concat(pd.read_json(
+        strio, lines=True, typ='Series', chunksize=1
+    ))
+
+    assert_series_equal(chunked, unchunked)
+
+
+def test_readjson_each_chunk(lines_json_df):
+    # Other tests check that the final result of read_json(chunksize=True)
+    # is correct. This checks the intermediate chunks.
+    chunks = list(
+        pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
+    )
+    assert chunks[0].shape == (2, 2)
+    assert chunks[1].shape == (1, 2)
+
+
+def test_readjson_chunks_from_file():
+    with ensure_clean('test.json') as path:
+        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+        df.to_json(path, lines=True, orient="records")
+        chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
+        unchunked = pd.read_json(path, lines=True)
+        assert_frame_equal(unchunked, chunked)
+
+
+@pytest.mark.parametrize("chunksize", [None, 1])
+def test_readjson_chunks_closes(chunksize):
+    with ensure_clean('test.json') as path:
+        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+        df.to_json(path, lines=True, orient="records")
+        reader = JsonReader(
+            path, orient=None, typ="frame", dtype=True, convert_axes=True,
+            convert_dates=True, keep_default_dates=True, numpy=False,
+            precise_float=False, date_unit=None, encoding=None,
+            lines=True, chunksize=chunksize, compression=None)
+        reader.read()
+        assert reader.open_stream.closed, "didn't close stream with \
+            chunksize = {chunksize}".format(chunksize=chunksize)
+
+
+@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
+def test_readjson_invalid_chunksize(lines_json_df, chunksize):
+    msg = r"'chunksize' must be an integer >=1"
+
+    with tm.assert_raises_regex(ValueError, msg):
+        pd.read_json(StringIO(lines_json_df), lines=True,
+                     chunksize=chunksize)
+
+
+@pytest.mark.parametrize("chunksize", [None, 1, 2])
+def test_readjson_chunks_multiple_empty_lines(chunksize):
+    j = """
+
+    {"A":1,"B":4}
+
+
+
+    {"A":2,"B":5}
+
+
+
+
+
+
+
+    {"A":3,"B":6}
+    """
+    orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+    test = pd.read_json(j, lines=True, chunksize=chunksize)
+    if chunksize is not None:
+        test = pd.concat(test)
+    tm.assert_frame_equal(
+        orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize))
@@ -0,0 +1,10 @@
+from pandas.compat import PY3
+
+
+# array compat
+if PY3:
+    frombytes = lambda obj, data: obj.frombytes(data)
+    tobytes = lambda obj: obj.tobytes()
+else:
+    frombytes = lambda obj, data: obj.fromstring(data)
+    tobytes = lambda obj: obj.tostring()
@@ -0,0 +1,20 @@
+# coding: utf-8
+
+from pandas.io.msgpack import packb, unpackb
+from .common import frombytes
+
+
+def test_unpack_buffer():
+    from array import array
+    buf = array('b')
+    frombytes(buf, packb((b'foo', b'bar')))
+    obj = unpackb(buf, use_list=1)
+    assert [b'foo', b'bar'] == obj
+
+
+def test_unpack_bytearray():
+    buf = bytearray(packb(('foo', 'bar')))
+    obj = unpackb(buf, use_list=1)
+    assert [b'foo', b'bar'] == obj
+    expected_type = bytes
+    assert all(type(s) == expected_type for s in obj)
@@ -0,0 +1,115 @@
+# coding: utf-8
+
+from pandas.io.msgpack import packb, unpackb
+
+
+def check(length, obj):
+    v = packb(obj)
+    assert len(v) == length, \
+        "%r length should be %r but get %r" % (obj, length, len(v))
+    assert unpackb(v, use_list=0) == obj
+
+
+def test_1():
+    for o in [None, True, False, 0, 1, (1 << 6), (1 << 7) - 1, -1,
+              -((1 << 5) - 1), -(1 << 5)]:
+        check(1, o)
+
+
+def test_2():
+    for o in [1 << 7, (1 << 8) - 1, -((1 << 5) + 1), -(1 << 7)]:
+        check(2, o)
+
+
+def test_3():
+    for o in [1 << 8, (1 << 16) - 1, -((1 << 7) + 1), -(1 << 15)]:
+        check(3, o)
+
+
+def test_5():
+    for o in [1 << 16, (1 << 32) - 1, -((1 << 15) + 1), -(1 << 31)]:
+        check(5, o)
+
+
+def test_9():
+    for o in [1 << 32, (1 << 64) - 1, -((1 << 31) + 1), -(1 << 63), 1.0, 0.1,
+              -0.1, -1.0]:
+        check(9, o)
+
+
+def check_raw(overhead, num):
+    check(num + overhead, b" " * num)
+
+
+def test_fixraw():
+    check_raw(1, 0)
+    check_raw(1, (1 << 5) - 1)
+
+
+def test_raw16():
+    check_raw(3, 1 << 5)
+    check_raw(3, (1 << 16) - 1)
+
+
+def test_raw32():
+    check_raw(5, 1 << 16)
+
+
+def check_array(overhead, num):
+    check(num + overhead, (None, ) * num)
+
+
+def test_fixarray():
+    check_array(1, 0)
+    check_array(1, (1 << 4) - 1)
+
+
+def test_array16():
+    check_array(3, 1 << 4)
+    check_array(3, (1 << 16) - 1)
+
+
+def test_array32():
+    check_array(5, (1 << 16))
+
+
+def match(obj, buf):
+    assert packb(obj) == buf
+    assert unpackb(buf, use_list=0) == obj
+
+
+def test_match():
+    cases = [
+        (None, b'\xc0'),
+        (False, b'\xc2'),
+        (True, b'\xc3'),
+        (0, b'\x00'),
+        (127, b'\x7f'),
+        (128, b'\xcc\x80'),
+        (256, b'\xcd\x01\x00'),
+        (-1, b'\xff'),
+        (-33, b'\xd0\xdf'),
+        (-129, b'\xd1\xff\x7f'),
+        ({1: 1}, b'\x81\x01\x01'),
+        (1.0, b"\xcb\x3f\xf0\x00\x00\x00\x00\x00\x00"),
+        ((), b'\x90'),
+        (tuple(range(15)), (b"\x9f\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09"
+                            b"\x0a\x0b\x0c\x0d\x0e")),
+        (tuple(range(16)), (b"\xdc\x00\x10\x00\x01\x02\x03\x04\x05\x06\x07"
+                            b"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f")),
+        ({}, b'\x80'),
+        ({x: x for x in range(15)},
+         (b'\x8f\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06\x06\x07'
+          b'\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e')),
+        ({x: x for x in range(16)},
+         (b'\xde\x00\x10\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06'
+          b'\x06\x07\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e'
+          b'\x0f\x0f')),
+    ]
+
+    for v, p in cases:
+        match(v, p)
+
+
+def test_unicode():
+    assert unpackb(packb('foobar'), use_list=1) == b'foobar'
@@ -0,0 +1,39 @@
+# coding: utf-8
+
+from datetime import datetime
+from pandas.io.msgpack import packb, unpackb
+
+import pytest
+import pandas.util.testing as tm
+
+
+class DummyException(Exception):
+    pass
+
+
+class TestExceptions(object):
+
+    def test_raise_on_find_unsupported_value(self):
+        msg = "can\'t serialize datetime"
+        with tm.assert_raises_regex(TypeError, msg):
+            packb(datetime.now())
+
+    def test_raise_from_object_hook(self):
+        def hook(_):
+            raise DummyException()
+
+        pytest.raises(DummyException, unpackb, packb({}), object_hook=hook)
+        pytest.raises(DummyException, unpackb, packb({'fizz': 'buzz'}),
+                      object_hook=hook)
+        pytest.raises(DummyException, unpackb, packb({'fizz': 'buzz'}),
+                      object_pairs_hook=hook)
+        pytest.raises(DummyException, unpackb,
+                      packb({'fizz': {'buzz': 'spam'}}), object_hook=hook)
+        pytest.raises(DummyException, unpackb,
+                      packb({'fizz': {'buzz': 'spam'}}),
+                      object_pairs_hook=hook)
+
+    def test_invalid_value(self):
+        msg = "Unpack failed: error"
+        with tm.assert_raises_regex(ValueError, msg):
+            unpackb(b"\xd9\x97#DL_")
@@ -0,0 +1,61 @@
+from __future__ import print_function
+import array
+
+import pandas.io.msgpack as msgpack
+from pandas.io.msgpack import ExtType
+from .common import frombytes, tobytes
+
+
+def test_pack_ext_type():
+    def p(s):
+        packer = msgpack.Packer()
+        packer.pack_ext_type(0x42, s)
+        return packer.bytes()
+
+    assert p(b'A') == b'\xd4\x42A'  # fixext 1
+    assert p(b'AB') == b'\xd5\x42AB'  # fixext 2
+    assert p(b'ABCD') == b'\xd6\x42ABCD'  # fixext 4
+    assert p(b'ABCDEFGH') == b'\xd7\x42ABCDEFGH'  # fixext 8
+    assert p(b'A' * 16) == b'\xd8\x42' + b'A' * 16  # fixext 16
+    assert p(b'ABC') == b'\xc7\x03\x42ABC'  # ext 8
+    assert p(b'A' * 0x0123) == b'\xc8\x01\x23\x42' + b'A' * 0x0123  # ext 16
+    assert (p(b'A' * 0x00012345) ==
+            b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345)  # ext 32
+
+
+def test_unpack_ext_type():
+    def check(b, expected):
+        assert msgpack.unpackb(b) == expected
+
+    check(b'\xd4\x42A', ExtType(0x42, b'A'))  # fixext 1
+    check(b'\xd5\x42AB', ExtType(0x42, b'AB'))  # fixext 2
+    check(b'\xd6\x42ABCD', ExtType(0x42, b'ABCD'))  # fixext 4
+    check(b'\xd7\x42ABCDEFGH', ExtType(0x42, b'ABCDEFGH'))  # fixext 8
+    check(b'\xd8\x42' + b'A' * 16, ExtType(0x42, b'A' * 16))  # fixext 16
+    check(b'\xc7\x03\x42ABC', ExtType(0x42, b'ABC'))  # ext 8
+    check(b'\xc8\x01\x23\x42' + b'A' * 0x0123,
+          ExtType(0x42, b'A' * 0x0123))  # ext 16
+    check(b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345,
+          ExtType(0x42, b'A' * 0x00012345))  # ext 32
+
+
+def test_extension_type():
+    def default(obj):
+        print('default called', obj)
+        if isinstance(obj, array.array):
+            typecode = 123  # application specific typecode
+            data = tobytes(obj)
+            return ExtType(typecode, data)
+        raise TypeError("Unknown type object %r" % (obj, ))
+
+    def ext_hook(code, data):
+        print('ext_hook called', code, data)
+        assert code == 123
+        obj = array.array('d')
+        frombytes(obj, data)
+        return obj
+
+    obj = [42, b'hello', array.array('d', [1.1, 2.2, 3.3])]
+    s = msgpack.packb(obj, default=default)
+    obj2 = msgpack.unpackb(s, ext_hook=ext_hook)
+    assert obj == obj2
@@ -0,0 +1,91 @@
+# coding: utf-8
+
+from pandas.io.msgpack import unpackb
+
+
+def check(src, should, use_list=0):
+    assert unpackb(src, use_list=use_list) == should
+
+
+def testSimpleValue():
+    check(b"\x93\xc0\xc2\xc3", (None, False, True, ))
+
+
+def testFixnum():
+    check(b"\x92\x93\x00\x40\x7f\x93\xe0\xf0\xff", ((0,
+                                                     64,
+                                                     127, ),
+                                                    (-32,
+                                                     -16,
+                                                     -1, ), ))
+
+
+def testFixArray():
+    check(b"\x92\x90\x91\x91\xc0", ((), ((None, ), ), ), )
+
+
+def testFixRaw():
+    check(b"\x94\xa0\xa1a\xa2bc\xa3def", (b"", b"a", b"bc", b"def", ), )
+
+
+def testFixMap():
+    check(b"\x82\xc2\x81\xc0\xc0\xc3\x81\xc0\x80",
+          {False: {None: None},
+           True: {None: {}}}, )
+
+
+def testUnsignedInt():
+    check(b"\x99\xcc\x00\xcc\x80\xcc\xff\xcd\x00\x00\xcd\x80\x00"
+          b"\xcd\xff\xff\xce\x00\x00\x00\x00\xce\x80\x00\x00\x00"
+          b"\xce\xff\xff\xff\xff",
+          (0,
+           128,
+           255,
+           0,
+           32768,
+           65535,
+           0,
+           2147483648,
+           4294967295, ), )
+
+
+def testSignedInt():
+    check(b"\x99\xd0\x00\xd0\x80\xd0\xff\xd1\x00\x00\xd1\x80\x00"
+          b"\xd1\xff\xff\xd2\x00\x00\x00\x00\xd2\x80\x00\x00\x00"
+          b"\xd2\xff\xff\xff\xff", (0,
+                                    -128,
+                                    -1,
+                                    0,
+                                    -32768,
+                                    -1,
+                                    0,
+                                    -2147483648,
+                                    -1, ))
+
+
+def testRaw():
+    check(b"\x96\xda\x00\x00\xda\x00\x01a\xda\x00\x02ab\xdb\x00\x00"
+          b"\x00\x00\xdb\x00\x00\x00\x01a\xdb\x00\x00\x00\x02ab",
+          (b"", b"a", b"ab", b"", b"a", b"ab"))
+
+
+def testArray():
+    check(b"\x96\xdc\x00\x00\xdc\x00\x01\xc0\xdc\x00\x02\xc2\xc3\xdd\x00"
+          b"\x00\x00\x00\xdd\x00\x00\x00\x01\xc0\xdd\x00\x00\x00\x02"
+          b"\xc2\xc3", ((), (None, ), (False, True), (), (None, ),
+                        (False, True)))
+
+
+def testMap():
+    check(b"\x96"
+          b"\xde\x00\x00"
+          b"\xde\x00\x01\xc0\xc2"
+          b"\xde\x00\x02\xc0\xc2\xc3\xc2"
+          b"\xdf\x00\x00\x00\x00"
+          b"\xdf\x00\x00\x00\x01\xc0\xc2"
+          b"\xdf\x00\x00\x00\x02\xc0\xc2\xc3\xc2", ({}, {None: False},
+                                                    {True: False,
+                                                     None: False}, {},
+                                                    {None: False},
+                                                    {True: False,
+                                                     None: False}))
@@ -0,0 +1,105 @@
+# coding: utf-8
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+from pandas.io.msgpack import packb, unpackb, Packer, Unpacker, ExtType
+
+import pytest
+import pandas.util.testing as tm
+
+
+class TestLimits(object):
+
+    def test_integer(self):
+        x = -(2 ** 63)
+        assert unpackb(packb(x)) == x
+        pytest.raises((OverflowError, ValueError), packb, x - 1)
+        x = 2 ** 64 - 1
+        assert unpackb(packb(x)) == x
+        pytest.raises((OverflowError, ValueError), packb, x + 1)
+
+    def test_array_header(self):
+        packer = Packer()
+        packer.pack_array_header(2 ** 32 - 1)
+        pytest.raises((OverflowError, ValueError),
+                      packer.pack_array_header, 2 ** 32)
+
+    def test_map_header(self):
+        packer = Packer()
+        packer.pack_map_header(2 ** 32 - 1)
+        pytest.raises((OverflowError, ValueError),
+                      packer.pack_array_header, 2 ** 32)
+
+    def test_max_str_len(self):
+        d = 'x' * 3
+        packed = packb(d)
+
+        unpacker = Unpacker(max_str_len=3, encoding='utf-8')
+        unpacker.feed(packed)
+        assert unpacker.unpack() == d
+
+        unpacker = Unpacker(max_str_len=2, encoding='utf-8')
+        unpacker.feed(packed)
+
+        msg = "3 exceeds max_str_len"
+        with tm.assert_raises_regex(ValueError, msg):
+            unpacker.unpack()
+
+    def test_max_bin_len(self):
+        d = b'x' * 3
+        packed = packb(d, use_bin_type=True)
+
+        unpacker = Unpacker(max_bin_len=3)
+        unpacker.feed(packed)
+        assert unpacker.unpack() == d
+
+        unpacker = Unpacker(max_bin_len=2)
+        unpacker.feed(packed)
+
+        msg = "3 exceeds max_bin_len"
+        with tm.assert_raises_regex(ValueError, msg):
+            unpacker.unpack()
+
+    def test_max_array_len(self):
+        d = [1, 2, 3]
+        packed = packb(d)
+
+        unpacker = Unpacker(max_array_len=3)
+        unpacker.feed(packed)
+        assert unpacker.unpack() == d
+
+        unpacker = Unpacker(max_array_len=2)
+        unpacker.feed(packed)
+
+        msg = "3 exceeds max_array_len"
+        with tm.assert_raises_regex(ValueError, msg):
+            unpacker.unpack()
+
+    def test_max_map_len(self):
+        d = {1: 2, 3: 4, 5: 6}
+        packed = packb(d)
+
+        unpacker = Unpacker(max_map_len=3)
+        unpacker.feed(packed)
+        assert unpacker.unpack() == d
+
+        unpacker = Unpacker(max_map_len=2)
+        unpacker.feed(packed)
+
+        msg = "3 exceeds max_map_len"
+        with tm.assert_raises_regex(ValueError, msg):
+            unpacker.unpack()
+
+    def test_max_ext_len(self):
+        d = ExtType(42, b"abc")
+        packed = packb(d)
+
+        unpacker = Unpacker(max_ext_len=3)
+        unpacker.feed(packed)
+        assert unpacker.unpack() == d
+
+        unpacker = Unpacker(max_ext_len=2)
+        unpacker.feed(packed)
+
+        msg = "4 exceeds max_ext_len"
+        with tm.assert_raises_regex(ValueError, msg):
+            unpacker.unpack()
@@ -0,0 +1,92 @@
+# coding: utf-8
+
+from pandas.io.msgpack import packb, unpackb, ExtType
+
+
+def test_str8():
+    header = b'\xd9'
+    data = b'x' * 32
+    b = packb(data.decode(), use_bin_type=True)
+    assert len(b) == len(data) + 2
+    assert b[0:2] == header + b'\x20'
+    assert b[2:] == data
+    assert unpackb(b) == data
+
+    data = b'x' * 255
+    b = packb(data.decode(), use_bin_type=True)
+    assert len(b) == len(data) + 2
+    assert b[0:2] == header + b'\xff'
+    assert b[2:] == data
+    assert unpackb(b) == data
+
+
+def test_bin8():
+    header = b'\xc4'
+    data = b''
+    b = packb(data, use_bin_type=True)
+    assert len(b) == len(data) + 2
+    assert b[0:2] == header + b'\x00'
+    assert b[2:] == data
+    assert unpackb(b) == data
+
+    data = b'x' * 255
+    b = packb(data, use_bin_type=True)
+    assert len(b) == len(data) + 2
+    assert b[0:2] == header + b'\xff'
+    assert b[2:] == data
+    assert unpackb(b) == data
+
+
+def test_bin16():
+    header = b'\xc5'
+    data = b'x' * 256
+    b = packb(data, use_bin_type=True)
+    assert len(b) == len(data) + 3
+    assert b[0:1] == header
+    assert b[1:3] == b'\x01\x00'
+    assert b[3:] == data
+    assert unpackb(b) == data
+
+    data = b'x' * 65535
+    b = packb(data, use_bin_type=True)
+    assert len(b) == len(data) + 3
+    assert b[0:1] == header
+    assert b[1:3] == b'\xff\xff'
+    assert b[3:] == data
+    assert unpackb(b) == data
+
+
+def test_bin32():
+    header = b'\xc6'
+    data = b'x' * 65536
+    b = packb(data, use_bin_type=True)
+    assert len(b) == len(data) + 5
+    assert b[0:1] == header
+    assert b[1:5] == b'\x00\x01\x00\x00'
+    assert b[5:] == data
+    assert unpackb(b) == data
+
+
+def test_ext():
+    def check(ext, packed):
+        assert packb(ext) == packed
+        assert unpackb(packed) == ext
+
+    check(ExtType(0x42, b'Z'), b'\xd4\x42Z')  # fixext 1
+    check(ExtType(0x42, b'ZZ'), b'\xd5\x42ZZ')  # fixext 2
+    check(ExtType(0x42, b'Z' * 4), b'\xd6\x42' + b'Z' * 4)  # fixext 4
+    check(ExtType(0x42, b'Z' * 8), b'\xd7\x42' + b'Z' * 8)  # fixext 8
+    check(ExtType(0x42, b'Z' * 16), b'\xd8\x42' + b'Z' * 16)  # fixext 16
+    # ext 8
+    check(ExtType(0x42, b''), b'\xc7\x00\x42')
+    check(ExtType(0x42, b'Z' * 255), b'\xc7\xff\x42' + b'Z' * 255)
+    # ext 16
+    check(ExtType(0x42, b'Z' * 256), b'\xc8\x01\x00\x42' + b'Z' * 256)
+    check(ExtType(0x42, b'Z' * 0xffff), b'\xc8\xff\xff\x42' + b'Z' * 0xffff)
+    # ext 32
+    check(
+        ExtType(0x42, b'Z' *
+                0x10000), b'\xc9\x00\x01\x00\x00\x42' + b'Z' * 0x10000)
+    # needs large memory
+    # check(ExtType(0x42, b'Z'*0xffffffff),
+    #              b'\xc9\xff\xff\xff\xff\x42' + b'Z'*0xffffffff)
@@ -0,0 +1,77 @@
+# coding: utf-8
+
+import pytest
+
+from pandas.io.msgpack import packb, unpackb
+
+
+class DecodeError(Exception):
+    pass
+
+
+class TestObj(object):
+
+    def _arr_to_str(self, arr):
+        return ''.join(str(c) for c in arr)
+
+    def bad_complex_decoder(self, o):
+        raise DecodeError("Ooops!")
+
+    def _decode_complex(self, obj):
+        if b'__complex__' in obj:
+            return complex(obj[b'real'], obj[b'imag'])
+        return obj
+
+    def _encode_complex(self, obj):
+        if isinstance(obj, complex):
+            return {b'__complex__': True, b'real': 1, b'imag': 2}
+        return obj
+
+    def test_encode_hook(self):
+        packed = packb([3, 1 + 2j], default=self._encode_complex)
+        unpacked = unpackb(packed, use_list=1)
+        assert unpacked[1] == {b'__complex__': True, b'real': 1, b'imag': 2}
+
+    def test_decode_hook(self):
+        packed = packb([3, {b'__complex__': True, b'real': 1, b'imag': 2}])
+        unpacked = unpackb(packed, object_hook=self._decode_complex,
+                           use_list=1)
+        assert unpacked[1] == 1 + 2j
+
+    def test_decode_pairs_hook(self):
+        packed = packb([3, {1: 2, 3: 4}])
+        prod_sum = 1 * 2 + 3 * 4
+        unpacked = unpackb(
+            packed, object_pairs_hook=lambda l: sum(k * v for k, v in l),
+            use_list=1)
+        assert unpacked[1] == prod_sum
+
+    def test_only_one_obj_hook(self):
+        pytest.raises(TypeError, unpackb, b'', object_hook=lambda x: x,
+                      object_pairs_hook=lambda x: x)
+
+    def test_bad_hook(self):
+        def f():
+            packed = packb([3, 1 + 2j], default=lambda o: o)
+            unpacked = unpackb(packed, use_list=1)  # noqa
+
+        pytest.raises(TypeError, f)
+
+    def test_array_hook(self):
+        packed = packb([1, 2, 3])
+        unpacked = unpackb(packed, list_hook=self._arr_to_str, use_list=1)
+        assert unpacked == '123'
+
+    def test_an_exception_in_objecthook1(self):
+        def f():
+            packed = packb({1: {'__complex__': True, 'real': 1, 'imag': 2}})
+            unpackb(packed, object_hook=self.bad_complex_decoder)
+
+        pytest.raises(DecodeError, f)
+
+    def test_an_exception_in_objecthook2(self):
+        def f():
+            packed = packb({1: [{'__complex__': True, 'real': 1, 'imag': 2}]})
+            unpackb(packed, list_hook=self.bad_complex_decoder, use_list=1)
+
+        pytest.raises(DecodeError, f)
@@ -0,0 +1,153 @@
+# coding: utf-8
+
+import pytest
+
+import struct
+
+from pandas import compat
+from pandas.compat import u, OrderedDict
+from pandas.io.msgpack import packb, unpackb, Unpacker, Packer
+
+
+class TestPack(object):
+
+    def check(self, data, use_list=False):
+        re = unpackb(packb(data), use_list=use_list)
+        assert re == data
+
+    def testPack(self):
+        test_data = [
+            0, 1, 127, 128, 255, 256, 65535, 65536,
+            -1, -32, -33, -128, -129, -32768, -32769,
+            1.0,
+            b"", b"a", b"a" * 31, b"a" * 32,
+            None, True, False,
+            (), ((),), ((), None,),
+            {None: 0},
+            (1 << 23),
+        ]
+        for td in test_data:
+            self.check(td)
+
+    def testPackUnicode(self):
+        test_data = [u(""), u("abcd"), [u("defgh")], u("Русский текст"), ]
+        for td in test_data:
+            re = unpackb(
+                packb(td, encoding='utf-8'), use_list=1, encoding='utf-8')
+            assert re == td
+            packer = Packer(encoding='utf-8')
+            data = packer.pack(td)
+            re = Unpacker(
+                compat.BytesIO(data), encoding='utf-8', use_list=1).unpack()
+            assert re == td
+
+    def testPackUTF32(self):
+        test_data = [
+            compat.u(""),
+            compat.u("abcd"),
+            [compat.u("defgh")],
+            compat.u("Русский текст"),
+        ]
+        for td in test_data:
+            re = unpackb(
+                packb(td, encoding='utf-32'), use_list=1, encoding='utf-32')
+            assert re == td
+
+    def testPackBytes(self):
+        test_data = [b"", b"abcd", (b"defgh", ), ]
+        for td in test_data:
+            self.check(td)
+
+    def testIgnoreUnicodeErrors(self):
+        re = unpackb(
+            packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore',
+            use_list=1)
+        assert re == "abcdef"
+
+    def testStrictUnicodeUnpack(self):
+        pytest.raises(UnicodeDecodeError, unpackb, packb(b'abc\xeddef'),
+                      encoding='utf-8', use_list=1)
+
+    def testStrictUnicodePack(self):
+        pytest.raises(UnicodeEncodeError, packb, compat.u("abc\xeddef"),
+                      encoding='ascii', unicode_errors='strict')
+
+    def testIgnoreErrorsPack(self):
+        re = unpackb(
+            packb(
+                compat.u("abcФФФdef"), encoding='ascii',
+                unicode_errors='ignore'), encoding='utf-8', use_list=1)
+        assert re == compat.u("abcdef")
+
+    def testNoEncoding(self):
+        pytest.raises(TypeError, packb, compat.u("abc"), encoding=None)
+
+    def testDecodeBinary(self):
+        re = unpackb(packb("abc"), encoding=None, use_list=1)
+        assert re == b"abc"
+
+    def testPackFloat(self):
+        assert packb(1.0,
+                     use_single_float=True) == b'\xca' + struct.pack('>f', 1.0)
+        assert packb(
+            1.0, use_single_float=False) == b'\xcb' + struct.pack('>d', 1.0)
+
+    def testArraySize(self, sizes=[0, 5, 50, 1000]):
+        bio = compat.BytesIO()
+        packer = Packer()
+        for size in sizes:
+            bio.write(packer.pack_array_header(size))
+            for i in range(size):
+                bio.write(packer.pack(i))
+
+        bio.seek(0)
+        unpacker = Unpacker(bio, use_list=1)
+        for size in sizes:
+            assert unpacker.unpack() == list(range(size))
+
+    def test_manualreset(self, sizes=[0, 5, 50, 1000]):
+        packer = Packer(autoreset=False)
+        for size in sizes:
+            packer.pack_array_header(size)
+            for i in range(size):
+                packer.pack(i)
+
+        bio = compat.BytesIO(packer.bytes())
+        unpacker = Unpacker(bio, use_list=1)
+        for size in sizes:
+            assert unpacker.unpack() == list(range(size))
+
+        packer.reset()
+        assert packer.bytes() == b''
+
+    def testMapSize(self, sizes=[0, 5, 50, 1000]):
+        bio = compat.BytesIO()
+        packer = Packer()
+        for size in sizes:
+            bio.write(packer.pack_map_header(size))
+            for i in range(size):
+                bio.write(packer.pack(i))  # key
+                bio.write(packer.pack(i * 2))  # value
+
+        bio.seek(0)
+        unpacker = Unpacker(bio)
+        for size in sizes:
+            assert unpacker.unpack() == {i: i * 2 for i in range(size)}
+
+    def test_odict(self):
+        seq = [(b'one', 1), (b'two', 2), (b'three', 3), (b'four', 4)]
+        od = OrderedDict(seq)
+        assert unpackb(packb(od), use_list=1) == dict(seq)
+
+        def pair_hook(seq):
+            return list(seq)
+
+        assert unpackb(
+            packb(od), object_pairs_hook=pair_hook, use_list=1) == seq
+
+    def test_pairlist(self):
+        pairlist = [(b'a', 1), (2, b'b'), (b'foo', b'bar')]
+        packer = Packer()
+        packed = packer.pack_map_pairs(pairlist)
+        unpacked = unpackb(packed, object_pairs_hook=list)
+        assert pairlist == unpacked
@@ -0,0 +1,70 @@
+"""Test Unpacker's read_array_header and read_map_header methods"""
+from pandas.io.msgpack import packb, Unpacker, OutOfData
+# Exception type the header-reading tests in this module expect when the
+# next object is not of the requested container type.
+UnexpectedTypeException = ValueError
+
+
+def test_read_array_header():
+    unpacker = Unpacker()
+    unpacker.feed(packb(['a', 'b', 'c']))
+    assert unpacker.read_array_header() == 3
+    assert unpacker.unpack() == b'a'
+    assert unpacker.unpack() == b'b'
+    assert unpacker.unpack() == b'c'
+    try:
+        unpacker.unpack()
+        assert 0, 'should raise exception'
+    except OutOfData:
+        assert 1, 'okay'
+
+
+def test_read_map_header():
+    unpacker = Unpacker()
+    unpacker.feed(packb({'a': 'A'}))
+    assert unpacker.read_map_header() == 1
+    assert unpacker.unpack() == B'a'
+    assert unpacker.unpack() == B'A'
+    try:
+        unpacker.unpack()
+        assert 0, 'should raise exception'
+    except OutOfData:
+        assert 1, 'okay'
+
+
+def test_incorrect_type_array():
+    unpacker = Unpacker()
+    unpacker.feed(packb(1))
+    try:
+        unpacker.read_array_header()
+        assert 0, 'should raise exception'
+    except UnexpectedTypeException:
+        assert 1, 'okay'
+
+
+def test_incorrect_type_map():
+    unpacker = Unpacker()
+    unpacker.feed(packb(1))
+    try:
+        unpacker.read_map_header()
+        assert 0, 'should raise exception'
+    except UnexpectedTypeException:
+        assert 1, 'okay'
+
+
+def test_correct_type_nested_array():
+    unpacker = Unpacker()
+    unpacker.feed(packb({'a': ['b', 'c', 'd']}))
+    try:
+        unpacker.read_array_header()
+        assert 0, 'should raise exception'
+    except UnexpectedTypeException:
+        assert 1, 'okay'
+
+
+def test_incorrect_type_nested_map():
+    unpacker = Unpacker()
+    unpacker.feed(packb([{'a': 'b'}]))
+    try:
+        unpacker.read_map_header()
+        assert 0, 'should raise exception'
+    except UnexpectedTypeException:
+        assert 1, 'okay'
@@ -0,0 +1,46 @@
+# coding: utf-8
+
+import io
+import pandas.io.msgpack as msgpack
+
+# The 256 byte values 0..255 in order; test payloads are prefixes of it.
+binarydata = bytes(bytearray(range(256)))
+
+
+def gen_binary_data(idx):
+    return binarydata[:idx % 300]
+
+
+def test_exceeding_unpacker_read_size():
+    dumpf = io.BytesIO()
+
+    packer = msgpack.Packer()
+
+    NUMBER_OF_STRINGS = 6
+    read_size = 16
+
+    # 5 ok for read_size=16, while 6 glibc detected *** python: double free or
+    # corruption (fasttop):
+
+    # 20 ok for read_size=256, while 25 segfaults / glibc detected *** python:
+    # double free or corruption (!prev)
+
+    # 40 ok for read_size=1024, while 50 introduces errors
+    # 7000 ok for read_size=1024*1024, while 8000 leads to glibc detected ***
+    # python: double free or corruption (!prev):
+
+    for idx in range(NUMBER_OF_STRINGS):
+        data = gen_binary_data(idx)
+        dumpf.write(packer.pack(data))
+
+    f = io.BytesIO(dumpf.getvalue())
+    dumpf.close()
+
+    unpacker = msgpack.Unpacker(f, read_size=read_size, use_list=1)
+
+    read_count = 0
+    for idx, o in enumerate(unpacker):
+        assert type(o) == bytes
+        assert o == gen_binary_data(idx)
+        read_count += 1
+
+    assert read_count == NUMBER_OF_STRINGS
@@ -0,0 +1,93 @@
+# coding: utf-8
+
+from pandas import compat
+from pandas.io.msgpack import Unpacker, BufferFull
+from pandas.io.msgpack import OutOfData
+
+import pytest
+import pandas.util.testing as tm
+
+
+class TestPack(object):
+
+    def test_partial_data(self):
+        unpacker = Unpacker()
+        msg = "No more data to unpack"
+
+        for data in [b"\xa5", b"h", b"a", b"l", b"l"]:
+            unpacker.feed(data)
+            with tm.assert_raises_regex(StopIteration, msg):
+                next(iter(unpacker))
+
+        unpacker.feed(b"o")
+        assert next(iter(unpacker)) == b"hallo"
+
+    def test_foobar(self):
+        unpacker = Unpacker(read_size=3, use_list=1)
+        unpacker.feed(b'foobar')
+        assert unpacker.unpack() == ord(b'f')
+        assert unpacker.unpack() == ord(b'o')
+        assert unpacker.unpack() == ord(b'o')
+        assert unpacker.unpack() == ord(b'b')
+        assert unpacker.unpack() == ord(b'a')
+        assert unpacker.unpack() == ord(b'r')
+        pytest.raises(OutOfData, unpacker.unpack)
+
+        unpacker.feed(b'foo')
+        unpacker.feed(b'bar')
+
+        k = 0
+        for o, e in zip(unpacker, 'foobarbaz'):
+            assert o == ord(e)
+            k += 1
+        assert k == len(b'foobar')
+
+    def test_foobar_skip(self):
+        unpacker = Unpacker(read_size=3, use_list=1)
+        unpacker.feed(b'foobar')
+        assert unpacker.unpack() == ord(b'f')
+        unpacker.skip()
+        assert unpacker.unpack() == ord(b'o')
+        unpacker.skip()
+        assert unpacker.unpack() == ord(b'a')
+        unpacker.skip()
+        pytest.raises(OutOfData, unpacker.unpack)
+
+    def test_maxbuffersize(self):
+        pytest.raises(ValueError, Unpacker, read_size=5, max_buffer_size=3)
+        unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1)
+        unpacker.feed(b'fo')
+        pytest.raises(BufferFull, unpacker.feed, b'ob')
+        unpacker.feed(b'o')
+        assert ord('f') == next(unpacker)
+        unpacker.feed(b'b')
+        assert ord('o') == next(unpacker)
+        assert ord('o') == next(unpacker)
+        assert ord('b') == next(unpacker)
+
+    def test_readbytes(self):
+        unpacker = Unpacker(read_size=3)
+        unpacker.feed(b'foobar')
+        assert unpacker.unpack() == ord(b'f')
+        assert unpacker.read_bytes(3) == b'oob'
+        assert unpacker.unpack() == ord(b'a')
+        assert unpacker.unpack() == ord(b'r')
+
+        # Test buffer refill
+        unpacker = Unpacker(compat.BytesIO(b'foobar'), read_size=3)
+        assert unpacker.unpack() == ord(b'f')
+        assert unpacker.read_bytes(3) == b'oob'
+        assert unpacker.unpack() == ord(b'a')
+        assert unpacker.unpack() == ord(b'r')
+
+    def test_issue124(self):
+        unpacker = Unpacker()
+        unpacker.feed(b'\xa1?\xa1!')
+        assert tuple(unpacker) == (b'?', b'!')
+        assert tuple(unpacker) == ()
+        unpacker.feed(b"\xa1?\xa1")
+        assert tuple(unpacker) == (b'?', )
+        assert tuple(unpacker) == ()
+        unpacker.feed(b"!")
+        assert tuple(unpacker) == (b'!', )
+        assert tuple(unpacker) == ()
@@ -0,0 +1,25 @@
+# coding: utf-8
+
+from pandas.io.msgpack import packb
+from collections import namedtuple
+
+
+class MyList(list):
+    pass
+
+
+class MyDict(dict):
+    pass
+
+
+class MyTuple(tuple):
+    pass
+
+
+MyNamedTuple = namedtuple('MyNamedTuple', 'x y')
+
+
+def test_types():
+    assert packb(MyDict()) == packb(dict())
+    assert packb(MyList()) == packb(list())
+    assert packb(MyNamedTuple(1, 2)) == packb((1, 2))
@@ -0,0 +1,63 @@
+from io import BytesIO
+import sys
+from pandas.io.msgpack import Unpacker, packb, OutOfData, ExtType
+import pytest
+
+
+class TestUnpack(object):
+
+    def test_unpack_array_header_from_file(self):
+        f = BytesIO(packb([1, 2, 3, 4]))
+        unpacker = Unpacker(f)
+        assert unpacker.read_array_header() == 4
+        assert unpacker.unpack() == 1
+        assert unpacker.unpack() == 2
+        assert unpacker.unpack() == 3
+        assert unpacker.unpack() == 4
+        pytest.raises(OutOfData, unpacker.unpack)
+
+    def test_unpacker_hook_refcnt(self):
+        if not hasattr(sys, 'getrefcount'):
+            pytest.skip('no sys.getrefcount()')
+        result = []
+
+        def hook(x):
+            result.append(x)
+            return x
+
+        basecnt = sys.getrefcount(hook)
+
+        up = Unpacker(object_hook=hook, list_hook=hook)
+
+        assert sys.getrefcount(hook) >= basecnt + 2
+
+        up.feed(packb([{}]))
+        up.feed(packb([{}]))
+        assert up.unpack() == [{}]
+        assert up.unpack() == [{}]
+        assert result == [{}, [{}], {}, [{}]]
+
+        del up
+
+        assert sys.getrefcount(hook) == basecnt
+
+    def test_unpacker_ext_hook(self):
+        class MyUnpacker(Unpacker):
+
+            def __init__(self):
+                super(MyUnpacker, self).__init__(ext_hook=self._hook,
+                                                 encoding='utf-8')
+
+            def _hook(self, code, data):
+                if code == 1:
+                    return int(data)
+                else:
+                    return ExtType(code, data)
+
+        unpacker = MyUnpacker()
+        unpacker.feed(packb({'a': 1}, encoding='utf-8'))
+        assert unpacker.unpack() == {'a': 1}
+        unpacker.feed(packb({'a': ExtType(1, b'123')}, encoding='utf-8'))
+        assert unpacker.unpack() == {'a': 123}
+        unpacker.feed(packb({'a': ExtType(2, b'321')}, encoding='utf-8'))
+        assert unpacker.unpack() == {'a': ExtType(2, b'321')}
@@ -0,0 +1,29 @@
+"""Tests for cases where the user seeks to obtain packed msgpack objects"""
+
+import io
+from pandas.io.msgpack import Unpacker, packb
+
+
+def test_write_bytes():
+    unpacker = Unpacker()
+    unpacker.feed(b'abc')
+    f = io.BytesIO()
+    assert unpacker.unpack(f.write) == ord('a')
+    assert f.getvalue() == b'a'
+    f = io.BytesIO()
+    assert unpacker.skip(f.write) is None
+    assert f.getvalue() == b'b'
+    f = io.BytesIO()
+    assert unpacker.skip() is None
+    assert f.getvalue() == b''
+
+
+def test_write_bytes_multi_buffer():
+    long_val = (5) * 100
+    expected = packb(long_val)
+    unpacker = Unpacker(io.BytesIO(expected), read_size=3, max_buffer_size=3)
+
+    f = io.BytesIO()
+    unpacked = unpacker.unpack(f.write)
+    assert unpacked == long_val
+    assert f.getvalue() == expected
@@ -0,0 +1,487 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that apply specifically to the CParser. Unless specifically stated
+as a CParser-specific issue, the goal is to eventually move as many of
+these tests out of this module as soon as the Python parser can accept
+further arguments when parsing.
+"""
+
+import os
+import sys
+import tarfile
+
+import pytest
+import numpy as np
+
+import pandas as pd
+import pandas.util.testing as tm
+import pandas.util._test_decorators as td
+from pandas import DataFrame
+from pandas.compat import StringIO, range, lrange
+
+
+class CParserTests(object):
+
+    def test_buffer_overflow(self):
+        # see gh-9205: test certain malformed input files that cause
+        # buffer overflows in tokenizer.c
+
+        malfw = "1\r1\r1\r 1\r 1\r"         # buffer overflow in words pointer
+        malfs = "1\r1\r1\r 1\r 1\r11\r"     # buffer overflow in stream pointer
+        malfl = "1\r1\r1\r 1\r 1\r11\r1\r"  # buffer overflow in lines pointer
+
+        cperr = 'Buffer overflow caught - possible malformed input file.'
+
+        for malf in (malfw, malfs, malfl):
+            try:
+                self.read_table(StringIO(malf))
+            except Exception as err:
+                assert cperr in str(err)
+
+    def test_buffer_rd_bytes(self):
+        """Reading a corrupt gzip payload repeatedly must not crash."""
+        # see gh-12098: src->buffer in the C parser can be freed twice leading
+        # to a segfault if a corrupt gzip file is read with 'read_csv' and the
+        # buffer is filled more than once before gzip throws an exception
+
+        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
+               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
+               '\xA6\x4D' + '\x55' * 267 + \
+               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
+               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
+        # the read is attempted 100 times; no assertion is made on the result
+        for i in range(100):
+            try:
+                self.read_csv(StringIO(data),
+                              compression='gzip',
+                              delim_whitespace=True)
+            except Exception:
+                # a Python-level exception is tolerated here; the failure
+                # described above is a segfault, which cannot be caught
+                pass
+
+    def test_delim_whitespace_custom_terminator(self):
+        # See gh-12912
+        data = """a b c~1 2 3~4 5 6~7 8 9"""
+        df = self.read_csv(StringIO(data), lineterminator='~',
+                           delim_whitespace=True)
+        expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                             columns=['a', 'b', 'c'])
+        tm.assert_frame_equal(df, expected)
+
+    def test_dtype_and_names_error(self):
+        # see gh-8833: passing both dtype and names
+        # resulting in an error reporting issue
+        data = """
+1.0 1
+2.0 2
+3.0 3
+"""
+        # base cases
+        result = self.read_csv(StringIO(data), sep=r'\s+', header=None)
+        expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data), sep=r'\s+',
+                               header=None, names=['a', 'b'])
+        expected = DataFrame(
+            [[1.0, 1], [2.0, 2], [3.0, 3]], columns=['a', 'b'])
+        tm.assert_frame_equal(result, expected)
+
+        # fallback casting
+        result = self.read_csv(StringIO(
+            data), sep=r'\s+', header=None,
+            names=['a', 'b'], dtype={'a': np.int32})
+        expected = DataFrame([[1, 1], [2, 2], [3, 3]],
+                             columns=['a', 'b'])
+        expected['a'] = expected['a'].astype(np.int32)
+        tm.assert_frame_equal(result, expected)
+
+        data = """
+1.0 1
+nan 2
+3.0 3
+"""
+        # fallback casting, but not castable
+        with tm.assert_raises_regex(ValueError, 'cannot safely convert'):
+            self.read_csv(StringIO(data), sep=r'\s+', header=None,
+                          names=['a', 'b'], dtype={'a': np.int32})
+
+    def test_unsupported_dtype(self):
+        df = DataFrame(np.random.rand(5, 2), columns=list(
+            'AB'), index=['1A', '1B', '1C', '1D', '1E'])
+
+        with tm.ensure_clean('__unsupported_dtype__.csv') as path:
+            df.to_csv(path)
+
+            # valid but we don't support it (date)
+            pytest.raises(TypeError, self.read_csv, path,
+                          dtype={'A': 'datetime64', 'B': 'float64'},
+                          index_col=0)
+            pytest.raises(TypeError, self.read_csv, path,
+                          dtype={'A': 'datetime64', 'B': 'float64'},
+                          index_col=0, parse_dates=['B'])
+
+            # valid but we don't support it
+            pytest.raises(TypeError, self.read_csv, path,
+                          dtype={'A': 'timedelta64', 'B': 'float64'},
+                          index_col=0)
+
+            # valid but unsupported - fixed width unicode string
+            pytest.raises(TypeError, self.read_csv, path,
+                          dtype={'A': 'U8'},
+                          index_col=0)
+
+    @td.skip_if_32bit
+    def test_precise_conversion(self):
+        from decimal import Decimal
+
+        normal_errors = []
+        precise_errors = []
+
+        # test numbers between 1 and 2
+        for num in np.linspace(1., 2., num=500):
+            # 25 decimal digits of precision
+            text = 'a\n{0:.25}'.format(num)
+
+            normal_val = float(self.read_csv(StringIO(text))['a'][0])
+            precise_val = float(self.read_csv(
+                StringIO(text), float_precision='high')['a'][0])
+            roundtrip_val = float(self.read_csv(
+                StringIO(text), float_precision='round_trip')['a'][0])
+            actual_val = Decimal(text[2:])
+
+            def error(val):
+                return abs(Decimal('{0:.100}'.format(val)) - actual_val)
+
+            normal_errors.append(error(normal_val))
+            precise_errors.append(error(precise_val))
+
+            # round-trip should match float()
+            assert roundtrip_val == float(text[2:])
+
+        assert sum(precise_errors) <= sum(normal_errors)
+        assert max(precise_errors) <= max(normal_errors)
+
+    def test_usecols_dtypes(self):
+        data = """\
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+
+        result = self.read_csv(StringIO(data), usecols=(0, 1, 2),
+                               names=('a', 'b', 'c'),
+                               header=None,
+                               converters={'a': str},
+                               dtype={'b': int, 'c': float},
+                               )
+        result2 = self.read_csv(StringIO(data), usecols=(0, 2),
+                                names=('a', 'b', 'c'),
+                                header=None,
+                                converters={'a': str},
+                                dtype={'b': int, 'c': float},
+                                )
+        assert (result.dtypes == [object, np.int, np.float]).all()
+        assert (result2.dtypes == [object, np.float]).all()
+
+    def test_disable_bool_parsing(self):
+        # #2090
+
+        data = """A,B,C
+Yes,No,Yes
+No,Yes,Yes
+Yes,,Yes
+No,No,No"""
+
+        result = self.read_csv(StringIO(data), dtype=object)
+        assert (result.dtypes == object).all()
+
+        result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
+        assert result['B'][2] == ''
+
+    def test_custom_lineterminator(self):
+        data = 'a,b,c~1,2,3~4,5,6'
+
+        result = self.read_csv(StringIO(data), lineterminator='~')
+        expected = self.read_csv(StringIO(data.replace('~', '\n')))
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_parse_ragged_csv(self):
+        data = """1,2,3
+1,2,3,4
+1,2,3,4,5
+1,2
+1,2,3,4"""
+
+        nice_data = """1,2,3,,
+1,2,3,4,
+1,2,3,4,5
+1,2,,,
+1,2,3,4,"""
+        result = self.read_csv(StringIO(data), header=None,
+                               names=['a', 'b', 'c', 'd', 'e'])
+
+        expected = self.read_csv(StringIO(nice_data), header=None,
+                                 names=['a', 'b', 'c', 'd', 'e'])
+
+        tm.assert_frame_equal(result, expected)
+
+        # too many columns, cause segfault if not careful
+        data = "1,2\n3,4,5"
+
+        result = self.read_csv(StringIO(data), header=None,
+                               names=lrange(50))
+        expected = self.read_csv(StringIO(data), header=None,
+                                 names=lrange(3)).reindex(columns=lrange(50))
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_tokenize_CR_with_quoting(self):
+        # see gh-3453
+
+        data = ' a,b,c\r"a,b","e,d","f,f"'
+
+        result = self.read_csv(StringIO(data), header=None)
+        expected = self.read_csv(StringIO(data.replace('\r', '\n')),
+                                 header=None)
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data))
+        expected = self.read_csv(StringIO(data.replace('\r', '\n')))
+        tm.assert_frame_equal(result, expected)
+
+    def test_grow_boundary_at_cap(self):
+        # See gh-12494
+        #
+        # Cause of error was that the C parser
+        # was not increasing the buffer size when
+        # the desired space would fill the buffer
+        # to capacity, which would later cause a
+        # buffer overflow error when checking the
+        # EOF terminator of the CSV stream
+        def test_empty_header_read(count):
+            s = StringIO(',' * count)
+            expected = DataFrame(columns=[
+                'Unnamed: {i}'.format(i=i)
+                for i in range(count + 1)])
+            df = self.read_csv(s)
+            tm.assert_frame_equal(df, expected)
+
+        for count in range(1, 101):
+            test_empty_header_read(count)
+
+    def test_parse_trim_buffers(self):
+        """Chunked reads that end in a small residual chunk stay intact."""
+        # This test is part of a bugfix for issue #13703. It attempts
+        # to stress the system memory allocator, to cause it to move the
+        # stream buffer and either let the OS reclaim the region, or let
+        # other memory requests of parser otherwise modify the contents
+        # of memory space, where it was formerly located.
+        # This test is designed to cause a `segfault` with unpatched
+        # `tokenizer.c`. Sometimes the test fails on `segfault`, other
+        # times it fails due to memory corruption, which causes the
+        # loaded DataFrame to differ from the expected one.
+
+        # Generate a large mixed-type CSV file on-the-fly (one record is
+        # approx 1.5KiB).
+        record_ = \
+            """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
+            """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
+            """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
+            """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
+            """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
+            """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
+            """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
+            """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
+            """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
+            """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
+            """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
+            """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
+            """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
+            """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
+            """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
+            """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
+            """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
+            """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
+            """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
+            """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
+            """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
+            """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
+            """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
+            """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
+            """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
+            """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
+            """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
+            """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
+            """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
+
+        # Set the number of lines so that a call to `parser_trim_buffers`
+        # is triggered: after a couple of full chunks are consumed a
+        # relatively small 'residual' chunk would cause reallocation
+        # within the parser.
+        chunksize, n_lines = 128, 2 * 128 + 15
+        csv_data = "\n".join([record_] * n_lines) + "\n"
+
+        # We will use StringIO to load the CSV from this text buffer.
+        # pd.read_csv() will iterate over the file in chunks and will
+        # finally read a residual chunk of really small size.
+
+        # Generate the expected output: manually create the dataframe
+        # by splitting by comma and repeating the `n_lines` times.
+        # Empty fields are mapped to NaN to match what the parser returns.
+        row = tuple(val_ if val_ else np.nan
+                    for val_ in record_.split(","))
+        expected = pd.DataFrame([row for _ in range(n_lines)],
+                                dtype=object, columns=None, index=None)
+
+        # Iterate over the CSV file in chunks of `chunksize` lines
+        chunks_ = self.read_csv(StringIO(csv_data), header=None,
+                                dtype=object, chunksize=chunksize)
+        result = pd.concat(chunks_, axis=0, ignore_index=True)
+
+        # Check for data corruption if there was no segfault
+        tm.assert_frame_equal(result, expected)
+
+        # This extra test was added to replicate the fault in gh-5291.
+        # Force 'utf-8' encoding, so that `_string_convert` would take
+        # a different execution branch.
+        chunks_ = self.read_csv(StringIO(csv_data), header=None,
+                                dtype=object, chunksize=chunksize,
+                                encoding='utf_8')
+        result = pd.concat(chunks_, axis=0, ignore_index=True)
+        tm.assert_frame_equal(result, expected)
+
+    def test_internal_null_byte(self):
+        # see gh-14012
+        #
+        # The null byte ('\x00') should not be used as a
+        # true line terminator, escape character, or comment
+        # character, only as a placeholder to indicate that
+        # none was specified.
+        #
+        # This test should be moved to common.py ONLY when
+        # Python's csv class supports parsing '\x00'.
+        names = ['a', 'b', 'c']
+        data = "1,2,3\n4,\x00,6\n7,8,9"
+        expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6],
+                                 [7, 8, 9]], columns=names)
+
+        result = self.read_csv(StringIO(data), names=names)
+        tm.assert_frame_equal(result, expected)
+
+    def test_read_nrows_large(self):
+        # gh-7626 - Read only nrows of data in for large inputs (>262144b)
+        header_narrow = '\t'.join(['COL_HEADER_' + str(i)
+                                   for i in range(10)]) + '\n'
+        data_narrow = '\t'.join(['somedatasomedatasomedata1'
+                                 for i in range(10)]) + '\n'
+        header_wide = '\t'.join(['COL_HEADER_' + str(i)
+                                 for i in range(15)]) + '\n'
+        data_wide = '\t'.join(['somedatasomedatasomedata2'
+                               for i in range(15)]) + '\n'
+        test_input = (header_narrow + data_narrow * 1050 +
+                      header_wide + data_wide * 2)
+
+        df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010)
+
+        assert df.size == 1010 * 10
+
+    def test_float_precision_round_trip_with_text(self):
+        # gh-15140 - This should not segfault on Python 2.7+
+        df = self.read_csv(StringIO('a'),
+                           float_precision='round_trip',
+                           header=None)
+        tm.assert_frame_equal(df, DataFrame({0: ['a']}))
+
+    def test_large_difference_in_columns(self):
+        # gh-14125
+        count = 10000
+        large_row = ('X,' * count)[:-1] + '\n'
+        normal_row = 'XXXXXX XXXXXX,111111111111111\n'
+        test_input = (large_row + normal_row * 6)[:-1]
+        result = self.read_csv(StringIO(test_input), header=None, usecols=[0])
+        rows = test_input.split('\n')
+        expected = DataFrame([row.split(',')[0] for row in rows])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_data_after_quote(self):
+        # see gh-15910
+
+        data = 'a\n1\n"b"a'
+        result = self.read_csv(StringIO(data))
+        expected = DataFrame({'a': ['1', 'ba']})
+
+        tm.assert_frame_equal(result, expected)
+
+    @tm.capture_stderr
+    def test_comment_whitespace_delimited(self):
+        test_input = """\
+1 2
+2 2 3
+3 2 3 # 3 fields
+4 2 3# 3 fields
+5 2 # 2 fields
+6 2# 2 fields
+7 # 1 field, NaN
+8# 1 field, NaN
+9 2 3 # skipped line
+# comment"""
+        df = self.read_csv(StringIO(test_input), comment='#', header=None,
+                           delimiter='\\s+', skiprows=0,
+                           error_bad_lines=False)
+        error = sys.stderr.getvalue()
+        # skipped lines 2, 3, 4, 9
+        for line_num in (2, 3, 4, 9):
+            assert 'Skipping line {}'.format(line_num) in error, error
+        expected = DataFrame([[1, 2],
+                              [5, 2],
+                              [6, 2],
+                              [7, np.nan],
+                              [8, np.nan]])
+        tm.assert_frame_equal(df, expected)
+
+    def test_file_like_no_next(self):
+        # gh-16530: the file-like need not have a "next" or "__next__"
+        # attribute despite having an "__iter__" attribute.
+        #
+        # NOTE: This is only true for the C engine, not Python engine.
+        class NoNextBuffer(StringIO):
+            def __next__(self):
+                raise AttributeError("No next method")
+
+            next = __next__
+
+        data = "a\n1"
+
+        expected = pd.DataFrame({"a": [1]})
+        result = self.read_csv(NoNextBuffer(data))
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
+    def test_read_tarfile(self, tar_suffix):
+        # see gh-16530
+        #
+        # Unfortunately, Python's CSV library can't handle
+        # tarfile objects (expects string, not bytes when
+        # iterating through a file-like).
+        tar_path = os.path.join(self.dirpath, "tar_csv" + tar_suffix)
+
+        with tarfile.open(tar_path, "r") as tar:
+            data_file = tar.extractfile("tar_data.csv")
+
+            out = self.read_csv(data_file)
+            expected = pd.DataFrame({"a": [1]})
+            tm.assert_frame_equal(out, expected)
+
+    @pytest.mark.high_memory
+    def test_bytes_exceed_2gb(self):
+        """Read from a "CSV" that has a column larger than 2GB.
+
+        GH 16798
+        """
+        if self.low_memory:
+            pytest.skip("not a high_memory test")
+
+        csv = StringIO('strings\n' + '\n'.join(
+            ['x' * (1 << 20) for _ in range(2100)]))
+        df = self.read_csv(csv, low_memory=False)
+        assert not df.empty
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that comments are properly handled during parsing
+for all of the parsers defined in parsers.py
+"""
+
+import numpy as np
+import pandas.util.testing as tm
+
+from pandas import DataFrame
+from pandas.compat import StringIO
+
+
+class CommentTests(object):
+
+    def test_comment(self):
+        data = """A,B,C
+1,2.,4.#hello world
+5.,NaN,10.0
+"""
+        expected = np.array([[1., 2., 4.],
+                             [5., np.nan, 10.]])
+        df = self.read_csv(StringIO(data), comment='#')
+        tm.assert_numpy_array_equal(df.values, expected)
+
+        df = self.read_table(StringIO(data), sep=',', comment='#',
+                             na_values=['NaN'])
+        tm.assert_numpy_array_equal(df.values, expected)
+
+    def test_line_comment(self):
+        data = """# empty
+A,B,C
+1,2.,4.#hello world
+#ignore this line
+5.,NaN,10.0
+"""
+        expected = np.array([[1., 2., 4.],
+                             [5., np.nan, 10.]])
+        df = self.read_csv(StringIO(data), comment='#')
+        tm.assert_numpy_array_equal(df.values, expected)
+
+        # check with delim_whitespace=True
+        df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#',
+                           delim_whitespace=True)
+        tm.assert_almost_equal(df.values, expected)
+
+        # custom line terminator is not supported
+        # with the Python parser yet
+        if self.engine == 'c':
+            expected = np.array([[1., 2., 4.],
+                                 [5., np.nan, 10.]])
+            df = self.read_csv(StringIO(data.replace('\n', '*')),
+                               comment='#', lineterminator='*')
+            tm.assert_numpy_array_equal(df.values, expected)
+
+    def test_comment_skiprows(self):
+        data = """# empty
+random line
+# second empty line
+1,2,3
+A,B,C
+1,2.,4.
+5.,NaN,10.0
+"""
+        # this should ignore the first four lines (including comments)
+        expected = np.array([[1., 2., 4.], [5., np.nan, 10.]])
+        df = self.read_csv(StringIO(data), comment='#', skiprows=4)
+        tm.assert_numpy_array_equal(df.values, expected)
+
+    def test_comment_header(self):
+        data = """# empty
+# second empty line
+1,2,3
+A,B,C
+1,2.,4.
+5.,NaN,10.0
+"""
+        # header should begin at the second non-comment line
+        expected = np.array([[1., 2., 4.], [5., np.nan, 10.]])
+        df = self.read_csv(StringIO(data), comment='#', header=1)
+        tm.assert_numpy_array_equal(df.values, expected)
+
+    def test_comment_skiprows_header(self):
+        data = """# empty
+# second empty line
+# third empty line
+X,Y,Z
+1,2,3
+A,B,C
+1,2.,4.
+5.,NaN,10.0
+"""
+        # skiprows should skip the first 4 lines (including comments), while
+        # header should start from the second non-commented line starting
+        # with line 5
+        expected = np.array([[1., 2., 4.], [5., np.nan, 10.]])
+        df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1)
+        tm.assert_numpy_array_equal(df.values, expected)
+
+    def test_custom_comment_char(self):
+        data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
+
+        result = self.read_csv(StringIO(data), comment='#')
+        expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_commment_first_line(self):
+        # see gh-4623
+        data = '# notes\na,b,c\n# more notes\n1,2,3'
+
+        expected = DataFrame([[1, 2, 3]], columns=['a', 'b', 'c'])
+        result = self.read_csv(StringIO(data), comment='#')
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame({0: ['a', '1'], 1: ['b', '2'], 2: ['c', '3']})
+        result = self.read_csv(StringIO(data), comment='#', header=None)
+        tm.assert_frame_equal(result, expected)
@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests compressed data parsing functionality for all
+of the parsers defined in parsers.py
+"""
+
+import pytest
+
+import pandas as pd
+import pandas.compat as compat
+import pandas.util.testing as tm
+import pandas.util._test_decorators as td
+
+import gzip
+import bz2
+try:
+    lzma = compat.import_lzma()
+except ImportError:
+    lzma = None
+
+
+class CompressionTests(object):
+
+    def test_zip(self):
+        import zipfile
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean('test_file.zip') as path:
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.writestr('test_file', data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='zip')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+            if self.engine is not 'python':
+                with open(path, 'rb') as f:
+                    result = self.read_csv(f, compression='zip')
+                    tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean('combined_zip.zip') as path:
+            inner_file_names = ['test_file', 'second_file']
+            tmp = zipfile.ZipFile(path, mode='w')
+            for file_name in inner_file_names:
+                tmp.writestr(file_name, data)
+            tmp.close()
+
+            tm.assert_raises_regex(ValueError, 'Multiple files',
+                                   self.read_csv, path, compression='zip')
+
+            tm.assert_raises_regex(ValueError, 'Multiple files',
+                                   self.read_csv, path,
+                                   compression='infer')
+
+        with tm.ensure_clean() as path:
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.close()
+
+            tm.assert_raises_regex(ValueError, 'Zero files',
+                                   self.read_csv, path, compression='zip')
+
+        with tm.ensure_clean() as path:
+            with open(path, 'wb') as f:
+                pytest.raises(zipfile.BadZipfile, self.read_csv,
+                              f, compression='zip')
+
+    @pytest.mark.parametrize('compress_type, compress_method, ext', [
+        ('gzip', gzip.GzipFile, 'gz'),
+        ('bz2', bz2.BZ2File, 'bz2'),
+        pytest.param('xz', getattr(lzma, 'LZMAFile', None), 'xz',
+                     marks=td.skip_if_no_lzma)
+    ])
+    def test_other_compression(self, compress_type, compress_method, ext):
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = compress_method(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression=compress_type)
+            tm.assert_frame_equal(result, expected)
+
+            if compress_type == 'bz2':
+                pytest.raises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+            with open(path, 'rb') as fin:
+                result = self.read_csv(fin, compression=compress_type)
+                tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean('test.{}'.format(ext)) as path:
+            tmp = compress_method(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+    def test_read_csv_infer_compression(self):
+        # see gh-9770
+        expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)
+
+        with open(self.csv1) as f:
+            inputs = [self.csv1, self.csv1 + '.gz',
+                      self.csv1 + '.bz2', f]
+
+            for inp in inputs:
+                df = self.read_csv(inp, index_col=0, parse_dates=True,
+                                   compression='infer')
+
+                tm.assert_frame_equal(expected, df)
+
+    def test_read_csv_compressed_utf16_example(self, datapath):
+        # GH18071
+        path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip')
+
+        result = self.read_csv(path, encoding='utf-16',
+                               compression='zip', sep='\t')
+        expected = pd.DataFrame({
+            u'Country': [u'Venezuela', u'Venezuela'],
+            u'Twitter': [u'Hugo Chávez Frías', u'Henrique Capriles R.']
+        })
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_invalid_compression(self):
+        msg = 'Unrecognized compression type: sfark'
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv('test_file.zip', compression='sfark')
@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests column conversion functionality during parsing
+for all of the parsers defined in parsers.py
+"""
+
+from datetime import datetime
+
+import pytest
+
+import numpy as np
+import pandas as pd
+import pandas.util.testing as tm
+
+from pandas._libs.tslib import Timestamp
+from pandas import DataFrame, Index
+from pandas.compat import parse_date, StringIO, lmap
+
+
+class ConverterTests(object):
+
+    def test_converters_type_must_be_dict(self):
+        data = """index,A,B,C,D
+foo,2,3,4,5
+"""
+        with tm.assert_raises_regex(TypeError, 'Type converters.+'):
+            self.read_csv(StringIO(data), converters=0)
+
+    def test_converters(self):
+        data = """A,B,C,D
+a,1,2,01/01/2009
+b,3,4,01/02/2009
+c,4,5,01/03/2009
+"""
+        result = self.read_csv(StringIO(data), converters={'D': parse_date})
+        result2 = self.read_csv(StringIO(data), converters={3: parse_date})
+
+        expected = self.read_csv(StringIO(data))
+        expected['D'] = expected['D'].map(parse_date)
+
+        assert isinstance(result['D'][0], (datetime, Timestamp))
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result2, expected)
+
+        # produce integer
+        converter = lambda x: int(x.split('/')[2])
+        result = self.read_csv(StringIO(data), converters={'D': converter})
+        expected = self.read_csv(StringIO(data))
+        expected['D'] = expected['D'].map(converter)
+        tm.assert_frame_equal(result, expected)
+
+    def test_converters_no_implicit_conv(self):
+        # see gh-2184
+        data = """000102,1.2,A\n001245,2,B"""
+        f = lambda x: x.strip()
+        converter = {0: f}
+        df = self.read_csv(StringIO(data), header=None, converters=converter)
+        assert df[0].dtype == object
+
+    def test_converters_euro_decimal_format(self):
+        data = """Id;Number1;Number2;Text1;Text2;Number3
+1;1521,1541;187101,9543;ABC;poi;4,738797819
+2;121,12;14897,76;DEF;uyt;0,377320872
+3;878,158;108013,434;GHI;rez;2,735694704"""
+        f = lambda x: float(x.replace(",", "."))
+        converter = {'Number1': f, 'Number2': f, 'Number3': f}
+        df2 = self.read_csv(StringIO(data), sep=';', converters=converter)
+        assert df2['Number1'].dtype == float
+        assert df2['Number2'].dtype == float
+        assert df2['Number3'].dtype == float
+
+    def test_converter_return_string_bug(self):
+        # see gh-583
+        data = """Id;Number1;Number2;Text1;Text2;Number3
+1;1521,1541;187101,9543;ABC;poi;4,738797819
+2;121,12;14897,76;DEF;uyt;0,377320872
+3;878,158;108013,434;GHI;rez;2,735694704"""
+        f = lambda x: float(x.replace(",", "."))
+        converter = {'Number1': f, 'Number2': f, 'Number3': f}
+        df2 = self.read_csv(StringIO(data), sep=';', converters=converter)
+        assert df2['Number1'].dtype == float
+
+    def test_converters_corner_with_nas(self):
+        # skip aberration observed on Win64 Python 3.2.2
+        if hash(np.int64(-1)) != -2:
+            pytest.skip("skipping because of windows hash on Python"
+                        " 3.2.2")
+
+        data = """id,score,days
+1,2,12
+2,2-5,
+3,,14+
+4,6-12,2"""
+
+        def convert_days(x):
+            x = x.strip()
+            if not x:
+                return np.nan
+
+            is_plus = x.endswith('+')
+            if is_plus:
+                x = int(x[:-1]) + 1
+            else:
+                x = int(x)
+            return x
+
+        def convert_days_sentinel(x):
+            x = x.strip()
+            if not x:
+                return np.nan
+
+            is_plus = x.endswith('+')
+            if is_plus:
+                x = int(x[:-1]) + 1
+            else:
+                x = int(x)
+            return x
+
+        def convert_score(x):
+            x = x.strip()
+            if not x:
+                return np.nan
+            if x.find('-') > 0:
+                valmin, valmax = lmap(int, x.split('-'))
+                val = 0.5 * (valmin + valmax)
+            else:
+                val = float(x)
+
+            return val
+
+        fh = StringIO(data)
+        result = self.read_csv(fh, converters={'score': convert_score,
+                                               'days': convert_days},
+                               na_values=['', None])
+        assert pd.isna(result['days'][1])
+
+        fh = StringIO(data)
+        result2 = self.read_csv(fh, converters={'score': convert_score,
+                                                'days': convert_days_sentinel},
+                                na_values=['', None])
+        tm.assert_frame_equal(result, result2)
+
+    def test_converter_index_col_bug(self):
+        # see gh-1835
+        data = "A;B\n1;2\n3;4"
+
+        rs = self.read_csv(StringIO(data), sep=';', index_col='A',
+                           converters={'A': lambda x: x})
+
+        xp = DataFrame({'B': [2, 4]}, index=Index([1, 3], name='A'))
+        tm.assert_frame_equal(rs, xp)
+        assert rs.index.name == xp.index.name
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that dialects are properly handled during parsing
+for all of the parsers defined in parsers.py
+"""
+
+import csv
+
+from pandas import DataFrame
+from pandas.compat import StringIO
+from pandas.errors import ParserWarning
+
+import pandas.util.testing as tm
+
+
+class DialectTests(object):
+
+    def test_dialect(self):
+        data = """\
+label1,label2,label3
+index1,"a,c,e
+index2,b,d,f
+"""
+
+        dia = csv.excel()
+        dia.quoting = csv.QUOTE_NONE
+        with tm.assert_produces_warning(ParserWarning):
+            df = self.read_csv(StringIO(data), dialect=dia)
+
+        data = '''\
+label1,label2,label3
+index1,a,c,e
+index2,b,d,f
+'''
+        exp = self.read_csv(StringIO(data))
+        exp.replace('a', '"a', inplace=True)
+        tm.assert_frame_equal(df, exp)
+
+    def test_dialect_str(self):
+        data = """\
+fruit:vegetable
+apple:brocolli
+pear:tomato
+"""
+        exp = DataFrame({
+            'fruit': ['apple', 'pear'],
+            'vegetable': ['brocolli', 'tomato']
+        })
+        csv.register_dialect('mydialect', delimiter=':')
+        with tm.assert_produces_warning(ParserWarning):
+            df = self.read_csv(StringIO(data), dialect='mydialect')
+
+        tm.assert_frame_equal(df, exp)
+        csv.unregister_dialect('mydialect')
+
+    def test_invalid_dialect(self):
+        class InvalidDialect(object):
+            pass
+
+        data = 'a\n1'
+        msg = 'Invalid dialect'
+
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), dialect=InvalidDialect)
+
+    def test_dialect_conflict(self):
+        data = 'a,b\n1,2'
+        dialect = 'excel'
+        exp = DataFrame({'a': [1], 'b': [2]})
+
+        with tm.assert_produces_warning(None):
+            df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect)
+            tm.assert_frame_equal(df, exp)
+
+        with tm.assert_produces_warning(ParserWarning):
+            df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect)
+            tm.assert_frame_equal(df, exp)
@@ -0,0 +1,399 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests dtype specification during parsing
+for all of the parsers defined in parsers.py
+"""
+
+import pytest
+
+import numpy as np
+import pandas as pd
+import pandas.util.testing as tm
+
+from pandas import DataFrame, Series, Index, MultiIndex, Categorical
+from pandas.compat import StringIO
+from pandas.core.dtypes.dtypes import CategoricalDtype
+from pandas.errors import ParserWarning
+
+
+class DtypeTests(object):
+
+    def test_passing_dtype(self):
+        # see gh-6607
+        df = DataFrame(np.random.rand(5, 2).round(4), columns=list(
+            'AB'), index=['1A', '1B', '1C', '1D', '1E'])
+
+        with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
+            df.to_csv(path)
+
+            # see gh-3795: passing 'str' as the dtype
+            result = self.read_csv(path, dtype=str, index_col=0)
+            expected = df.astype(str)
+            tm.assert_frame_equal(result, expected)
+
+            # for parsing, interpret object as str
+            result = self.read_csv(path, dtype=object, index_col=0)
+            tm.assert_frame_equal(result, expected)
+
+            # we expect all object columns, so need to
+            # convert to test for equivalence
+            result = result.astype(float)
+            tm.assert_frame_equal(result, df)
+
+            # invalid dtype
+            pytest.raises(TypeError, self.read_csv, path,
+                          dtype={'A': 'foo', 'B': 'float64'},
+                          index_col=0)
+
+        # see gh-12048: empty frame
+        actual = self.read_csv(StringIO('A,B'), dtype=str)
+        expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
+        tm.assert_frame_equal(actual, expected)
+
+    def test_pass_dtype(self):
+        data = """\
+one,two
+1,2.5
+2,3.5
+3,4.5
+4,5.5"""
+
+        result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'})
+        assert result['one'].dtype == 'u1'
+        assert result['two'].dtype == 'object'
+
+    def test_categorical_dtype(self):
+        """Parse columns as categoricals through the ``dtype`` argument.
+
+        ``data``, ``expected`` and ``actual`` are rebound for each scenario,
+        so the statements below are order-dependent.
+        """
+        # GH 10153
+        data = """a,b,c
+1,a,3.4
+1,a,3.4
+2,b,4.5"""
+        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
+                                 'b': Categorical(['a', 'a', 'b']),
+                                 'c': Categorical(['3.4', '3.4', '4.5'])})
+        # every column categorical: dtype given as the string 'category'
+        actual = self.read_csv(StringIO(data), dtype='category')
+        tm.assert_frame_equal(actual, expected)
+
+        # ... as a CategoricalDtype instance
+        actual = self.read_csv(StringIO(data), dtype=CategoricalDtype())
+        tm.assert_frame_equal(actual, expected)
+
+        # ... as a per-column mapping mixing both spellings
+        actual = self.read_csv(StringIO(data), dtype={'a': 'category',
+                                                      'b': 'category',
+                                                      'c': CategoricalDtype()})
+        tm.assert_frame_equal(actual, expected)
+
+        # only column 'b' categorical, keyed by name
+        actual = self.read_csv(StringIO(data), dtype={'b': 'category'})
+        expected = pd.DataFrame({'a': [1, 1, 2],
+                                 'b': Categorical(['a', 'a', 'b']),
+                                 'c': [3.4, 3.4, 4.5]})
+        tm.assert_frame_equal(actual, expected)
+
+        # same expectation, column keyed by position
+        actual = self.read_csv(StringIO(data), dtype={1: 'category'})
+        tm.assert_frame_equal(actual, expected)
+
+        # unsorted
+        data = """a,b,c
+1,b,3.4
+1,b,3.4
+2,a,4.5"""
+        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
+                                 'b': Categorical(['b', 'b', 'a']),
+                                 'c': Categorical(['3.4', '3.4', '4.5'])})
+        actual = self.read_csv(StringIO(data), dtype='category')
+        tm.assert_frame_equal(actual, expected)
+
+        # missing
+        data = """a,b,c
+1,b,3.4
+1,nan,3.4
+2,a,4.5"""
+        expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
+                                 'b': Categorical(['b', np.nan, 'a']),
+                                 'c': Categorical(['3.4', '3.4', '4.5'])})
+        actual = self.read_csv(StringIO(data), dtype='category')
+        tm.assert_frame_equal(actual, expected)
+
+    @pytest.mark.slow
+    def test_categorical_dtype_high_cardinality_numeric(self):
+        # GH 18186
+        data = np.sort([str(i) for i in range(524289)])
+        expected = DataFrame({'a': Categorical(data, ordered=True)})
+        actual = self.read_csv(StringIO('a\n' + '\n'.join(data)),
+                               dtype='category')
+        actual["a"] = actual["a"].cat.reorder_categories(
+            np.sort(actual.a.cat.categories), ordered=True)
+        tm.assert_frame_equal(actual, expected)
+
+    def test_categorical_dtype_encoding(self, datapath):
+        # GH 10153
+        pth = datapath('io', 'parser', 'data', 'unicode_series.csv')
+        encoding = 'latin-1'
+        expected = self.read_csv(pth, header=None, encoding=encoding)
+        expected[1] = Categorical(expected[1])
+        actual = self.read_csv(pth, header=None, encoding=encoding,
+                               dtype={1: 'category'})
+        tm.assert_frame_equal(actual, expected)
+
+        pth = datapath('io', 'parser', 'data', 'utf16_ex.txt')
+        encoding = 'utf-16'
+        expected = self.read_table(pth, encoding=encoding)
+        expected = expected.apply(Categorical)
+        actual = self.read_table(pth, encoding=encoding, dtype='category')
+        tm.assert_frame_equal(actual, expected)
+
+    def test_categorical_dtype_chunksize(self):
+        # GH 10153
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        expecteds = [pd.DataFrame({'a': [1, 1],
+                                   'b': Categorical(['a', 'b'])}),
+                     pd.DataFrame({'a': [1, 2],
+                                   'b': Categorical(['b', 'c'])},
+                                  index=[2, 3])]
+        actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
+                                chunksize=2)
+
+        for actual, expected in zip(actuals, expecteds):
+            tm.assert_frame_equal(actual, expected)
+
+    @pytest.mark.parametrize('ordered', [False, True])
+    @pytest.mark.parametrize('categories', [
+        ['a', 'b', 'c'],
+        ['a', 'c', 'b'],
+        ['a', 'b', 'c', 'd'],
+        ['c', 'b', 'a'],
+    ])
+    def test_categorical_categoricaldtype(self, categories, ordered):
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        expected = pd.DataFrame({
+            "a": [1, 1, 1, 2],
+            "b": Categorical(['a', 'b', 'b', 'c'],
+                             categories=categories,
+                             ordered=ordered)
+        })
+        dtype = {"b": CategoricalDtype(categories=categories,
+                                       ordered=ordered)}
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categorical_categoricaldtype_unsorted(self):
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        dtype = CategoricalDtype(['c', 'b', 'a'])
+        expected = pd.DataFrame({
+            'a': [1, 1, 1, 2],
+            'b': Categorical(['a', 'b', 'b', 'c'], categories=['c', 'b', 'a'])
+        })
+        result = self.read_csv(StringIO(data), dtype={'b': dtype})
+        tm.assert_frame_equal(result, expected)
+
+    def test_categoricaldtype_coerces_numeric(self):
+        dtype = {'b': CategoricalDtype([1, 2, 3])}
+        data = "b\n1\n1\n2\n3"
+        expected = pd.DataFrame({'b': Categorical([1, 1, 2, 3])})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categoricaldtype_coerces_datetime(self):
+        dtype = {
+            'b': CategoricalDtype(pd.date_range('2017', '2019', freq='AS'))
+        }
+        data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
+        expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+        dtype = {
+            'b': CategoricalDtype([pd.Timestamp("2014")])
+        }
+        data = "b\n2014-01-01\n2014-01-01T00:00:00"
+        expected = pd.DataFrame({'b': Categorical([pd.Timestamp('2014')] * 2)})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categoricaldtype_coerces_timedelta(self):
+        dtype = {'b': CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))}
+        data = "b\n1H\n2H\n3H"
+        expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categoricaldtype_unexpected_categories(self):
+        dtype = {'b': CategoricalDtype(['a', 'b', 'd', 'e'])}
+        data = "b\nd\na\nc\nd"  # Unexpected c
+        expected = pd.DataFrame({"b": Categorical(list('dacd'),
+                                                  dtype=dtype['b'])})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categorical_categoricaldtype_chunksize(self):
+        # GH 10153
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        cats = ['a', 'b', 'c']
+        expecteds = [pd.DataFrame({'a': [1, 1],
+                                   'b': Categorical(['a', 'b'],
+                                                    categories=cats)}),
+                     pd.DataFrame({'a': [1, 2],
+                                   'b': Categorical(['b', 'c'],
+                                                    categories=cats)},
+                                  index=[2, 3])]
+        dtype = CategoricalDtype(cats)
+        actuals = self.read_csv(StringIO(data), dtype={'b': dtype},
+                                chunksize=2)
+
+        for actual, expected in zip(actuals, expecteds):
+            tm.assert_frame_equal(actual, expected)
+
+    def test_empty_pass_dtype(self):
+        data = 'one,two'
+        result = self.read_csv(StringIO(data), dtype={'one': 'u1'})
+
+        expected = DataFrame({'one': np.empty(0, dtype='u1'),
+                              'two': np.empty(0, dtype=np.object)})
+        tm.assert_frame_equal(result, expected, check_index_type=False)
+
+    def test_empty_with_index_pass_dtype(self):
+        data = 'one,two'
+        result = self.read_csv(StringIO(data), index_col=['one'],
+                               dtype={'one': 'u1', 1: 'f'})
+
+        expected = DataFrame({'two': np.empty(0, dtype='f')},
+                             index=Index([], dtype='u1', name='one'))
+        tm.assert_frame_equal(result, expected, check_index_type=False)
+
+    def test_empty_with_multiindex_pass_dtype(self):
+        data = 'one,two,three'
+        result = self.read_csv(StringIO(data), index_col=['one', 'two'],
+                               dtype={'one': 'u1', 1: 'f8'})
+
+        exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'),
+                                          np.empty(0, dtype='O')],
+                                         names=['one', 'two'])
+        expected = DataFrame(
+            {'three': np.empty(0, dtype=np.object)}, index=exp_idx)
+        tm.assert_frame_equal(result, expected, check_index_type=False)
+
+    def test_empty_with_mangled_column_pass_dtype_by_names(self):
+        data = 'one,one'
+        result = self.read_csv(StringIO(data), dtype={
+            'one': 'u1', 'one.1': 'f'})
+
+        expected = DataFrame(
+            {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
+        tm.assert_frame_equal(result, expected, check_index_type=False)
+
+    def test_empty_with_mangled_column_pass_dtype_by_indexes(self):
+        data = 'one,one'
+        result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
+
+        expected = DataFrame(
+            {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
+        tm.assert_frame_equal(result, expected, check_index_type=False)
+
+    def test_empty_with_dup_column_pass_dtype_by_indexes(self):
+        # see gh-9424
+        expected = pd.concat([Series([], name='one', dtype='u1'),
+                              Series([], name='one.1', dtype='f')], axis=1)
+
+        data = 'one,one'
+        result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
+        tm.assert_frame_equal(result, expected, check_index_type=False)
+
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            data = ''
+            result = self.read_csv(StringIO(data), names=['one', 'one'],
+                                   dtype={0: 'u1', 1: 'f'})
+            tm.assert_frame_equal(result, expected, check_index_type=False)
+
+    def test_raise_on_passed_int_dtype_with_nas(self):
+        # see gh-2631
+        data = """YEAR, DOY, a
+2001,106380451,10
+2001,,11
+2001,106380451,67"""
+        pytest.raises(ValueError, self.read_csv, StringIO(data),
+                      sep=",", skipinitialspace=True,
+                      dtype={'DOY': np.int64})
+
+    def test_dtype_with_converter(self):
+        data = """a,b
+1.1,2.2
+1.2,2.3"""
+        # dtype spec ignored if converted specified
+        with tm.assert_produces_warning(ParserWarning):
+            result = self.read_csv(StringIO(data), dtype={'a': 'i8'},
+                                   converters={'a': lambda x: str(x)})
+        expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_dtype(self):
+        """Header-only input honours the requested dtype(s).
+
+        ``expected`` and ``result`` are rebound for each dtype scenario,
+        so the statements below are order-dependent.
+        """
+        # see gh-14712
+        data = 'a,b'
+
+        # single dtype applied to every column: float64
+        expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64)
+        result = self.read_csv(StringIO(data), header=0, dtype=np.float64)
+        tm.assert_frame_equal(result, expected)
+
+        # category, given globally and then per column
+        expected = pd.DataFrame({'a': pd.Categorical([]),
+                                 'b': pd.Categorical([])},
+                                index=[])
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype='category')
+        tm.assert_frame_equal(result, expected)
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype={'a': 'category', 'b': 'category'})
+        tm.assert_frame_equal(result, expected)
+
+        # datetime64[ns]
+        expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype='datetime64[ns]')
+        tm.assert_frame_equal(result, expected)
+
+        # timedelta64[ns]
+        expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'),
+                                 'b': pd.Series([], dtype='timedelta64[ns]')},
+                                index=[])
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype='timedelta64[ns]')
+        tm.assert_frame_equal(result, expected)
+
+        # dtype for column 'a' only, keyed by name
+        expected = pd.DataFrame(columns=['a', 'b'])
+        expected['a'] = expected['a'].astype(np.float64)
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype={'a': np.float64})
+        tm.assert_frame_equal(result, expected)
+
+        # dtype for the first column only, keyed by position
+        expected = pd.DataFrame(columns=['a', 'b'])
+        expected['a'] = expected['a'].astype(np.float64)
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype={0: np.float64})
+        tm.assert_frame_equal(result, expected)
+
+        # mapping that mixes a name key and a position key
+        expected = pd.DataFrame(columns=['a', 'b'])
+        expected['a'] = expected['a'].astype(np.int32)
+        expected['b'] = expected['b'].astype(np.float64)
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype={'a': np.int32, 1: np.float64})
+        tm.assert_frame_equal(result, expected)
+
+    def test_numeric_dtype(self):
+        data = '0\n1'
+
+        for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
+            expected = pd.DataFrame([0, 1], dtype=dt)
+            result = self.read_csv(StringIO(data), header=None, dtype=dt)
+            tm.assert_frame_equal(expected, result)
@@ -0,0 +1,312 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that the file header is properly handled or inferred
+during parsing for all of the parsers defined in parsers.py
+"""
+
+import pytest
+
+import numpy as np
+import pandas.util.testing as tm
+
+from pandas import DataFrame, Index, MultiIndex
+from pandas.compat import StringIO, lrange, u
+
+
+class HeaderTests(object):
+
+    def test_read_with_bad_header(self):
+        errmsg = r"but only \d+ lines in file"
+
+        with tm.assert_raises_regex(ValueError, errmsg):
+            s = StringIO(',,')
+            self.read_csv(s, header=[10])
+
+    def test_bool_header_arg(self):
+        # see gh-6114
+        data = """\
+MyColumn
+   a
+   b
+   a
+   b"""
+        for arg in [True, False]:
+            with pytest.raises(TypeError):
+                self.read_csv(StringIO(data), header=arg)
+            with pytest.raises(TypeError):
+                self.read_table(StringIO(data), header=arg)
+
+    def test_no_header_prefix(self):
+        data = """1,2,3,4,5
+6,7,8,9,10
+11,12,13,14,15
+"""
+        df_pref = self.read_table(StringIO(data), sep=',', prefix='Field',
+                                  header=None)
+
+        expected = np.array([[1, 2, 3, 4, 5],
+                             [6, 7, 8, 9, 10],
+                             [11, 12, 13, 14, 15]], dtype=np.int64)
+        tm.assert_almost_equal(df_pref.values, expected)
+
+        tm.assert_index_equal(df_pref.columns,
+                              Index(['Field0', 'Field1', 'Field2',
+                                     'Field3', 'Field4']))
+
+    def test_header_with_index_col(self):
+        data = """foo,1,2,3
+bar,4,5,6
+baz,7,8,9
+"""
+        names = ['A', 'B', 'C']
+        df = self.read_csv(StringIO(data), names=names)
+
+        assert list(df.columns) == ['A', 'B', 'C']
+
+        values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
+        expected = DataFrame(values, index=['foo', 'bar', 'baz'],
+                             columns=['A', 'B', 'C'])
+        tm.assert_frame_equal(df, expected)
+
+    def test_header_not_first_line(self):
+        data = """got,to,ignore,this,line
+got,to,ignore,this,line
+index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+"""
+        data2 = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+"""
+
+        df = self.read_csv(StringIO(data), header=2, index_col=0)
+        expected = self.read_csv(StringIO(data2), header=0, index_col=0)
+        tm.assert_frame_equal(df, expected)
+
+    def test_header_multi_index(self):
+        expected = tm.makeCustomDataframe(
+            5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
+
+        data = """\
+C0,,C_l0_g0,C_l0_g1,C_l0_g2
+
+C1,,C_l1_g0,C_l1_g1,C_l1_g2
+C2,,C_l2_g0,C_l2_g1,C_l2_g2
+C3,,C_l3_g0,C_l3_g1,C_l3_g2
+R0,R1,,,
+R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
+R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
+R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
+R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
+R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
+"""
+
+        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
+                           index_col=[0, 1])
+        tm.assert_frame_equal(df, expected)
+
+        # skipping lines in the header
+        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
+                           index_col=[0, 1])
+        tm.assert_frame_equal(df, expected)
+
+        # INVALID OPTIONS
+
+        # names
+        pytest.raises(ValueError, self.read_csv,
+                      StringIO(data), header=[0, 1, 2, 3],
+                      index_col=[0, 1], names=['foo', 'bar'])
+
+        # usecols
+        pytest.raises(ValueError, self.read_csv,
+                      StringIO(data), header=[0, 1, 2, 3],
+                      index_col=[0, 1], usecols=['foo', 'bar'])
+
+        # non-numeric index_col
+        pytest.raises(ValueError, self.read_csv,
+                      StringIO(data), header=[0, 1, 2, 3],
+                      index_col=['foo', 'bar'])
+
+    def test_header_multiindex_common_format(self):
+        """Read two-row column headers in several file layouts.
+
+        ``df`` is the expectation for the first three layouts; the
+        "malformed" layouts each build their own ``expected`` frame.
+        ``data`` and ``result`` are rebound for every layout.
+        """
+
+        df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
+                       index=['one', 'two'],
+                       columns=MultiIndex.from_tuples(
+                           [('a', 'q'), ('a', 'r'), ('a', 's'),
+                            ('b', 't'), ('c', 'u'), ('c', 'v')]))
+
+        # to_csv
+        data = """,a,a,a,b,c,c
+,q,r,s,t,u,v
+,,,,,,
+one,1,2,3,4,5,6
+two,7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
+        tm.assert_frame_equal(df, result)
+
+        # common
+        data = """,a,a,a,b,c,c
+,q,r,s,t,u,v
+one,1,2,3,4,5,6
+two,7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
+        tm.assert_frame_equal(df, result)
+
+        # common, no index_col
+        data = """a,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0, 1], index_col=None)
+        tm.assert_frame_equal(df.reset_index(drop=True), result)
+
+        # malformed case 1
+        # (same text as above, but now the first column is used as index)
+        expected = DataFrame(np.array(
+            [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'),
+            index=Index([1, 7]),
+            columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
+                                       [u('r'), u('s'), u('t'),
+                                        u('u'), u('v')]],
+                               labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
+                               names=[u('a'), u('q')]))
+
+        data = """a,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
+        tm.assert_frame_equal(expected, result)
+
+        # malformed case 2
+        # (first header cell empty, second header row starts with 'q')
+        expected = DataFrame(np.array(
+            [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'),
+            index=Index([1, 7]),
+            columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
+                                       [u('r'), u('s'), u('t'),
+                                        u('u'), u('v')]],
+                               labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
+                               names=[None, u('q')]))
+
+        data = """,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0, 1], index_col=0)
+        tm.assert_frame_equal(expected, result)
+
+        # mi on columns and index (malformed)
+        expected = DataFrame(np.array(
+            [[3, 4, 5, 6], [9, 10, 11, 12]], dtype='int64'),
+            index=MultiIndex(levels=[[1, 7], [2, 8]],
+                             labels=[[0, 1], [0, 1]]),
+            columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
+                                       [u('s'), u('t'), u('u'), u('v')]],
+                               labels=[[0, 1, 2, 2], [0, 1, 2, 3]],
+                               names=[None, u('q')]))
+
+        data = """,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
+        tm.assert_frame_equal(expected, result)
+
+    def test_header_names_backward_compat(self):
+        # #2539
+        data = '1,2,3\n4,5,6'
+
+        result = self.read_csv(StringIO(data), names=['a', 'b', 'c'])
+        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
+                                 header=None)
+        tm.assert_frame_equal(result, expected)
+
+        data2 = 'foo,bar,baz\n' + data
+        result = self.read_csv(StringIO(data2), names=['a', 'b', 'c'],
+                               header=0)
+        tm.assert_frame_equal(result, expected)
+
+    def test_read_only_header_no_rows(self):
+        # See gh-7773
+        expected = DataFrame(columns=['a', 'b', 'c'])
+
+        df = self.read_csv(StringIO('a,b,c'))
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_csv(StringIO('a,b,c'), index_col=False)
+        tm.assert_frame_equal(df, expected)
+
+    def test_no_header(self):
+        data = """1,2,3,4,5
+6,7,8,9,10
+11,12,13,14,15
+"""
+        df = self.read_table(StringIO(data), sep=',', header=None)
+        df_pref = self.read_table(StringIO(data), sep=',', prefix='X',
+                                  header=None)
+
+        names = ['foo', 'bar', 'baz', 'quux', 'panda']
+        df2 = self.read_table(StringIO(data), sep=',', names=names)
+        expected = np.array([[1, 2, 3, 4, 5],
+                             [6, 7, 8, 9, 10],
+                             [11, 12, 13, 14, 15]], dtype=np.int64)
+        tm.assert_almost_equal(df.values, expected)
+        tm.assert_almost_equal(df.values, df2.values)
+
+        tm.assert_index_equal(df_pref.columns,
+                              Index(['X0', 'X1', 'X2', 'X3', 'X4']))
+        tm.assert_index_equal(df.columns, Index(lrange(5)))
+
+        tm.assert_index_equal(df2.columns, Index(names))
+
+    def test_non_int_header(self):
+        # GH 16338
+        msg = 'header must be integer or list of integers'
+        data = """1,2\n3,4"""
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), sep=',', header=['a', 'b'])
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), sep=',', header='string_header')
+
+    def test_singleton_header(self):
+        # See GH #7757
+        data = """a,b,c\n0,1,2\n1,2,3"""
+        df = self.read_csv(StringIO(data), header=[0])
+        expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
+        tm.assert_frame_equal(df, expected)
+
+    def test_mangles_multi_index(self):
+        # See GH 18062
+        data = """A,A,A,B\none,one,one,two\n0,40,34,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.2'), ('B', 'two')]))
+        tm.assert_frame_equal(df, expected)
+
+        data = """A,A,A,B\none,one,one.1,two\n0,40,34,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.1.1'), ('B', 'two')]))
+        tm.assert_frame_equal(df, expected)
+
+        data = """A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.1.1'), ('B', 'two'),
+                                  ('B', 'two.1')]))
+        tm.assert_frame_equal(df, expected)
@@ -0,0 +1,143 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that the specified index column (a.k.a 'index_col')
+is properly handled or inferred during parsing for all of
+the parsers defined in parsers.py
+"""
+
+import pytest
+
+import pandas.util.testing as tm
+
+from pandas import DataFrame, Index, MultiIndex
+from pandas.compat import StringIO
+
+
+class IndexColTests(object):
+
+    def test_index_col_named(self):
+        """Select the index column by name instead of by position.
+
+        ``data``, ``rs`` and ``xp`` are rebound between the two halves of
+        the test.
+        """
+        no_header = """\
+KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""  # noqa
+
+        h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"  # noqa
+        data = h + no_header
+        # index_col='ID' must match reading everything and then set_index
+        rs = self.read_csv(StringIO(data), index_col='ID')
+        xp = self.read_csv(StringIO(data), header=0).set_index('ID')
+        tm.assert_frame_equal(rs, xp)
+
+        # without the header line ``h`` there is no column called 'ID'
+        pytest.raises(ValueError, self.read_csv, StringIO(no_header),
+                      index_col='ID')
+
+        data = """\
+1,2,3,4,hello
+5,6,7,8,world
+9,10,11,12,foo
+"""
+        names = ['a', 'b', 'c', 'd', 'message']
+        xp = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10], 'c': [3, 7, 11],
+                        'd': [4, 8, 12]},
+                       index=Index(['hello', 'world', 'foo'], name='message'))
+        # index column named via ``names``, passed as a one-element list
+        rs = self.read_csv(StringIO(data), names=names, index_col=['message'])
+        tm.assert_frame_equal(xp, rs)
+        assert xp.index.name == rs.index.name
+
+        # ... and passed as a plain string
+        rs = self.read_csv(StringIO(data), names=names, index_col='message')
+        tm.assert_frame_equal(xp, rs)
+        assert xp.index.name == rs.index.name
+
+    def test_index_col_is_true(self):
+        # see gh-9798
+        pytest.raises(ValueError, self.read_csv,
+                      StringIO(self.ts_data), index_col=True)
+
+    def test_infer_index_col(self):
+        data = """A,B,C
+foo,1,2,3
+bar,4,5,6
+baz,7,8,9
+"""
+        data = self.read_csv(StringIO(data))
+        assert data.index.equals(Index(['foo', 'bar', 'baz']))
+
+    def test_empty_index_col_scenarios(self):
+        data = 'x,y,z'
+
+        # None, no index
+        index_col, expected = None, DataFrame([], columns=list('xyz')),
+        tm.assert_frame_equal(self.read_csv(
+            StringIO(data), index_col=index_col), expected)
+
+        # False, no index
+        index_col, expected = False, DataFrame([], columns=list('xyz')),
+        tm.assert_frame_equal(self.read_csv(
+            StringIO(data), index_col=index_col), expected)
+
+        # int, first column
+        index_col, expected = 0, DataFrame(
+            [], columns=['y', 'z'], index=Index([], name='x'))
+        tm.assert_frame_equal(self.read_csv(
+            StringIO(data), index_col=index_col), expected)
+
+        # int, not first column
+        index_col, expected = 1, DataFrame(
+            [], columns=['x', 'z'], index=Index([], name='y'))
+        tm.assert_frame_equal(self.read_csv(
+            StringIO(data), index_col=index_col), expected)
+
+        # str, first column
+        index_col, expected = 'x', DataFrame(
+            [], columns=['y', 'z'], index=Index([], name='x'))
+        tm.assert_frame_equal(self.read_csv(
+            StringIO(data), index_col=index_col), expected)
+
+        # str, not the first column
+        index_col, expected = 'y', DataFrame(
+            [], columns=['x', 'z'], index=Index([], name='y'))
+        tm.assert_frame_equal(self.read_csv(
+            StringIO(data), index_col=index_col), expected)
+
+        # list of int
+        index_col, expected = [0, 1], DataFrame(
+            [], columns=['z'], index=MultiIndex.from_arrays(
+                [[]] * 2, names=['x', 'y']))
+        tm.assert_frame_equal(self.read_csv(
+            StringIO(data), index_col=index_col),
+            expected, check_index_type=False)
+
+        # list of str
+        index_col = ['x', 'y']
+        expected = DataFrame([], columns=['z'],
+                             index=MultiIndex.from_arrays(
+                                 [[]] * 2, names=['x', 'y']))
+        tm.assert_frame_equal(self.read_csv(StringIO(
+            data), index_col=index_col),
+            expected, check_index_type=False)
+
+        # list of int, reversed sequence
+        index_col = [1, 0]
+        expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays(
+            [[]] * 2, names=['y', 'x']))
+        tm.assert_frame_equal(self.read_csv(
+            StringIO(data), index_col=index_col),
+            expected, check_index_type=False)
+
+        # list of str, reversed sequence
+        index_col = ['y', 'x']
+        expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays(
+            [[]] * 2, names=['y', 'x']))
+        tm.assert_frame_equal(self.read_csv(StringIO(
+            data), index_col=index_col),
+            expected, check_index_type=False)
+
+    def test_empty_with_index_col_false(self):
+        # see gh-10413
+        data = 'x,y'
+        result = self.read_csv(StringIO(data), index_col=False)
+        expected = DataFrame([], columns=['x', 'y'])
+        tm.assert_frame_equal(result, expected)
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that duplicate columns are handled appropriately when parsed by the
+CSV engine. In general, the expected result is that they are either thoroughly
+de-duplicated (if mangling requested) or ignored otherwise.
+"""
+
+from pandas.compat import StringIO
+from pandas import DataFrame
+
+import pandas.util.testing as tm
+
+
+class DupeColumnTests(object):
+    """Checks how duplicated column labels come out of the CSV parsers."""
+
+    def test_basic(self):
+        """Repeated header labels get numeric suffixes by default."""
+        # TODO: add test for condition "mangle_dupe_cols=False"
+        # once it is actually supported (gh-12935)
+        data = "a,a,b,b,b\n1,2,3,4,5"
+        expected = ["a", "a.1", "b", "b.1", "b.2"]
+
+        for method in ("read_csv", "read_table"):
+            reader = getattr(self, method)
+
+            # Check default behavior.
+            df = reader(StringIO(data), sep=",")
+            assert list(df.columns) == expected
+
+            # Asking for mangling explicitly must give the same labels.
+            df = reader(StringIO(data), sep=",", mangle_dupe_cols=True)
+            assert list(df.columns) == expected
+
+    def test_basic_names(self):
+        """Duplicates are mangled whether they come from header or names."""
+        # See gh-7160
+        expected = DataFrame([[0, 1, 2], [3, 4, 5]],
+                             columns=["a", "b", "a.1"])
+
+        with_header = "a,b,a\n0,1,2\n3,4,5"
+        df = self.read_csv(StringIO(with_header))
+        tm.assert_frame_equal(df, expected)
+
+        # Passing duplicated ``names`` is expected to warn.
+        without_header = "0,1,2\n3,4,5"
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            df = self.read_csv(StringIO(without_header),
+                               names=["a", "b", "a"])
+            tm.assert_frame_equal(df, expected)
+
+    def test_thorough_mangle_columns(self):
+        """Mangling header labels ends with every label unique."""
+        # see gh-17060
+        cases = [
+            ("a,a,a.1\n1,2,3",
+             ["a", "a.1", "a.1.1"]),
+            ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
+             ["a", "a.1", "a.1.1", "a.1.1.1",
+              "a.1.1.1.1", "a.1.1.1.1.1"]),
+            ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
+             ["a", "a.1", "a.3", "a.1.1",
+              "a.2", "a.2.1", "a.3.1"]),
+        ]
+
+        for data, expected in cases:
+            df = self.read_csv(StringIO(data), sep=",",
+                               mangle_dupe_cols=True)
+            assert list(df.columns) == expected
+
+    def test_thorough_mangle_names(self):
+        """Mangling ``names`` entries ends with every label unique."""
+        # see gh-17095
+        cases = [
+            ("a,b,b\n1,2,3",
+             ["a.1", "a.1", "a.1.1"],
+             ["a.1", "a.1.1", "a.1.1.1"]),
+            ("a,b,c,d,e,f\n1,2,3,4,5,6",
+             ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
+             ["a", "a.1", "a.1.1", "a.1.1.1",
+              "a.1.1.1.1", "a.1.1.1.1.1"]),
+            ("a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
+             ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
+             ["a", "a.1", "a.3", "a.1.1",
+              "a.2", "a.2.1", "a.3.1"]),
+        ]
+
+        for data, names, expected in cases:
+            with tm.assert_produces_warning(UserWarning,
+                                            check_stacklevel=False):
+                df = self.read_csv(StringIO(data), sep=",", names=names,
+                                   mangle_dupe_cols=True)
+                assert list(df.columns) == expected
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests multithreading behaviour for reading and
+parsing files for each parser defined in parsers.py
+"""
+
+from __future__ import division
+from multiprocessing.pool import ThreadPool
+
+import numpy as np
+import pandas as pd
+import pandas.util.testing as tm
+
+from pandas import DataFrame
+from pandas.compat import BytesIO, range
+
+
+def _construct_dataframe(num_rows):
+    """Build a ``num_rows``-row frame mixing floats, strings, dates, ints.
+
+    Columns ``a``-``e`` hold random floats, ``foo``/``bar``/``baz`` hold
+    their own name as a constant string, ``date`` advances in one-second
+    steps from 2000-01-01 09:00:00 and ``int`` counts up from zero.
+    """
+    frame = DataFrame(np.random.rand(num_rows, 5), columns=list('abcde'))
+
+    for label in ('foo', 'bar', 'baz'):
+        frame[label] = label
+
+    timestamps = pd.date_range('20000101 09:00:00',
+                               periods=num_rows,
+                               freq='s')
+    frame['date'] = timestamps
+    frame['int'] = np.arange(num_rows, dtype='int64')
+    return frame
+
+
+class MultithreadTests(object):
+    """Checks that ``read_csv`` can be driven from several threads at once."""
+
+    def _generate_multithread_dataframe(self, path, num_rows, num_tasks):
+        """Read the CSV at ``path`` in ``num_tasks`` chunks on a thread pool.
+
+        Each task reads ``num_rows // num_tasks`` rows; the pieces are
+        concatenated in task order and returned as one DataFrame.
+        """
+
+        def reader(arg):
+            start, nrows = arg
+
+            # Only the first chunk sees the header row, so it can refer to
+            # the date column by name.
+            if not start:
+                return self.read_csv(path, index_col=0, header=0,
+                                     nrows=nrows, parse_dates=['date'])
+
+            # Later chunks skip the header plus all preceding rows and
+            # address the date column by position instead.
+            return self.read_csv(path,
+                                 index_col=0,
+                                 header=None,
+                                 skiprows=int(start) + 1,
+                                 nrows=nrows,
+                                 parse_dates=[9])
+
+        tasks = [
+            (num_rows * i // num_tasks,
+             num_rows // num_tasks) for i in range(num_tasks)
+        ]
+
+        pool = ThreadPool(processes=num_tasks)
+        try:
+            results = pool.map(reader, tasks)
+        finally:
+            # Always shut the workers down so no threads outlive the test,
+            # even when one of the readers raises.
+            pool.close()
+            pool.join()
+
+        # Chunks read with header=None carry positional labels; give them
+        # the labels parsed by the first chunk before concatenating.
+        header = results[0].columns
+        for r in results[1:]:
+            r.columns = header
+
+        final_dataframe = pd.concat(results)
+
+        return final_dataframe
+
+    def test_multithread_stringio_read_csv(self):
+        """Concurrent reads of identical buffers yield identical frames."""
+        # see gh-11786
+        max_row_range = 10000
+        num_files = 100
+
+        # Every buffer holds the same bytes, so build the payload once.
+        payload = '\n'.join(
+            ['%d,%d,%d' % (i, i, i) for i in range(max_row_range)]
+        ).encode()
+        files = [BytesIO(payload) for _ in range(num_files)]
+
+        # read all files in many threads
+        pool = ThreadPool(8)
+        try:
+            results = pool.map(self.read_csv, files)
+        finally:
+            # Release the worker threads whether or not parsing succeeded.
+            pool.close()
+            pool.join()
+
+        first_result = results[0]
+
+        for result in results:
+            tm.assert_frame_equal(first_result, result)
+
+    def test_multithread_path_multipart_read_csv(self):
+        """A file read in parallel chunks reassembles into the original."""
+        # see gh-11786
+        num_tasks = 4
+        file_name = '__threadpool_reader__.csv'
+        num_rows = 100000
+
+        df = _construct_dataframe(num_rows)
+
+        with tm.ensure_clean(file_name) as path:
+            df.to_csv(path)
+
+            final_dataframe = self._generate_multithread_dataframe(
+                path, num_rows, num_tasks)
+            tm.assert_frame_equal(df, final_dataframe)
@@ -0,0 +1,371 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that NA values are properly handled during
+parsing for all of the parsers defined in parsers.py
+"""
+
+import numpy as np
+from numpy import nan
+
+import pandas.io.common as com
+import pandas.util.testing as tm
+
+from pandas import DataFrame, Index, MultiIndex
+from pandas.compat import StringIO, range
+
+
+class NAvaluesTests(object):
+
+    def test_string_nas(self):
+        """Empty fields in string columns are read back as NaN."""
+        data = """A,B,C
+a,b,c
+d,,f
+,g,h
+"""
+        parsed = self.read_csv(StringIO(data))
+
+        rows = [['a', 'b', 'c'],
+                ['d', np.nan, 'f'],
+                [np.nan, 'g', 'h']]
+        expected = DataFrame(rows, columns=['A', 'B', 'C'])
+        tm.assert_frame_equal(parsed, expected)
+
+    def test_detect_string_na(self):
+        """The literal strings NA, NaN and nan are parsed as missing."""
+        data = """A,B
+foo,bar
+NA,baz
+NaN,nan
+"""
+        expected = np.array([['foo', 'bar'],
+                             [nan, 'baz'],
+                             [nan, nan]], dtype=np.object_)
+
+        parsed = self.read_csv(StringIO(data))
+        tm.assert_numpy_array_equal(parsed.values, expected)
+
+    def test_non_string_na_values(self):
+        """Numeric and string spellings of -999 all act as NA markers."""
+        # see gh-3611: with an odd float format, we can't match
+        # the string '999.0' exactly but still need float matching
+        nice = """A,B
+-999,1.2
+2,-999
+3,4.5
+"""
+        ugly = """A,B
+-999,1.200
+2,-999.000
+3,4.500
+"""
+        # str, int and float spellings, in pairs and on their own.
+        candidates = [
+            ['-999.0', '-999'],
+            [-999, -999.0],
+            [-999.0, -999],
+            ['-999.0'],
+            ['-999'],
+            [-999.0],
+            [-999],
+        ]
+        expected = DataFrame([[np.nan, 1.2],
+                              [2.0, np.nan],
+                              [3.0, 4.5]], columns=['A', 'B'])
+
+        for text in (nice, ugly):
+            for na_values in candidates:
+                parsed = self.read_csv(StringIO(text), na_values=na_values)
+                tm.assert_frame_equal(parsed, expected)
+
+    def test_default_na_values(self):
+        """All built-in NA markers parse as NaN."""
+        _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
+                          '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null',
+                          'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', ''])
+        assert _NA_VALUES == com._NA_VALUES
+        nv = len(_NA_VALUES)
+
+        def f(i, v):
+            # Place ``v`` in column ``i`` of an otherwise empty row that is
+            # ``nv`` fields wide.
+            leading = ',' * i
+            trailing = ',' * (nv - i - 1)
+            return "{0}{1}{2}".format(leading, v, trailing)
+
+        lines = [f(i, v) for i, v in enumerate(_NA_VALUES)]
+        data = StringIO('\n'.join(lines))
+
+        df = self.read_csv(data, header=None)
+        expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
+        tm.assert_frame_equal(df, expected)
+
+    def test_custom_na_values(self):
+        """A user-supplied NA marker is honoured alongside the defaults."""
+        data = """A,B,C
+ignore,this,row
+1,NA,3
+-1.#IND,5,baz
+7,8,NaN
+"""
+        expected = np.array([[1., nan, 3],
+                             [nan, 5, nan],
+                             [7, 8, nan]])
+
+        from_csv = self.read_csv(StringIO(data), na_values=['baz'],
+                                 skiprows=[1])
+        tm.assert_numpy_array_equal(from_csv.values, expected)
+
+        # The marker may be given as a list or as a bare string.
+        for marker in (['baz'], 'baz'):
+            from_table = self.read_table(StringIO(data), sep=',',
+                                         na_values=marker, skiprows=[1])
+            tm.assert_numpy_array_equal(from_table.values, expected)
+
+    def test_bool_na_values(self):
+        """Boolean columns that contain NA come back with object dtype."""
+        data = """A,B,C
+True,False,True
+NA,True,False
+False,NA,True"""
+
+        parsed = self.read_csv(StringIO(data))
+
+        col_a = np.array([True, nan, False], dtype=object)
+        col_b = np.array([False, True, nan], dtype=object)
+        expected = DataFrame({'A': col_a,
+                              'B': col_b,
+                              'C': [True, False, True]})
+        tm.assert_frame_equal(parsed, expected)
+
+    def test_na_value_dict(self):
+        """Dict ``na_values`` apply per column on top of the defaults."""
+        data = """A,B,C
+foo,bar,NA
+bar,foo,foo
+foo,bar,NA
+bar,foo,foo"""
+
+        per_column = {'A': ['foo'], 'B': ['bar']}
+        df = self.read_csv(StringIO(data), na_values=per_column)
+        expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
+                              'B': [np.nan, 'foo', np.nan, 'foo'],
+                              'C': [np.nan, 'foo', np.nan, 'foo']})
+        tm.assert_frame_equal(df, expected)
+
+        data = """\
+a,b,c,d
+0,NA,1,5
+"""
+        # An empty dict with a single index column.
+        xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0])
+        xp.index.name = 'a'
+        df = self.read_csv(StringIO(data), na_values={}, index_col=0)
+        tm.assert_frame_equal(df, xp)
+
+        # Two index columns, addressed by position and then by label.
+        for index_col in ([0, 2], ['a', 'c']):
+            xp = DataFrame({'b': [np.nan], 'd': [5]},
+                           MultiIndex.from_tuples([(0, 1)]))
+            xp.index.names = ['a', 'c']
+            df = self.read_csv(StringIO(data), na_values={},
+                               index_col=index_col)
+            tm.assert_frame_equal(df, xp)
+
+    def test_na_values_keep_default(self):
+        """``keep_default_na`` controls whether built-in markers apply."""
+
+        def check(text, columns, **kwargs):
+            # The expectation is reindexed to the parsed column order, so
+            # the order of a dict-built frame is not relied upon.
+            df = self.read_csv(StringIO(text), **kwargs)
+            xp = DataFrame(columns)
+            tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
+
+        data = """\
+One,Two,Three
+a,1,one
+b,2,two
+,3,three
+d,4,nan
+e,5,five
+nan,6,
+g,7,seven
+"""
+        # '' and 'nan' both treated as missing.
+        with_defaults = {'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
+                         'Two': [1, 2, 3, 4, 5, 6, 7],
+                         'Three': ['one', 'two', 'three', np.nan, 'five',
+                                   np.nan, 'seven']}
+        # '' and 'nan' both kept as plain strings.
+        raw_strings = {'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
+                       'Two': [1, 2, 3, 4, 5, 6, 7],
+                       'Three': ['one', 'two', 'three', 'nan', 'five',
+                                 '', 'seven']}
+
+        check(data, with_defaults)
+
+        check(data, raw_strings,
+              na_values={'One': [], 'Three': []}, keep_default_na=False)
+
+        check(data,
+              {'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'],
+               'Two': [1, 2, 3, 4, 5, 6, 7],
+               'Three': ['one', 'two', 'three', 'nan', 'five', '',
+                         'seven']},
+              na_values=['a'], keep_default_na=False)
+
+        check(data, with_defaults, na_values={'One': [], 'Three': []})
+
+        # see gh-4318: passing na_values=None and
+        # keep_default_na=False yields 'None' as a na_value
+        data = """\
+One,Two,Three
+a,1,None
+b,2,two
+,3,None
+d,4,nan
+e,5,five
+nan,6,
+g,7,seven
+"""
+        check(data,
+              {'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
+               'Two': [1, 2, 3, 4, 5, 6, 7],
+               'Three': ['None', 'two', 'None', 'nan', 'five', '',
+                         'seven']},
+              keep_default_na=False)
+
+    def test_no_keep_default_na_dict_na_values(self):
+        """Dict ``na_values`` still work with ``keep_default_na=False``."""
+        # see gh-19227
+        df = self.read_csv(StringIO("a,b\n,2"), na_values={"b": ["2"]},
+                           keep_default_na=False)
+        expected = DataFrame({"a": [""], "b": [np.nan]})
+        tm.assert_frame_equal(df, expected)
+
+        # Scalar values shouldn't cause the parsing to crash or fail.
+        df = self.read_csv(StringIO("a,b\n1,2"), na_values={"b": 2},
+                           keep_default_na=False)
+        expected = DataFrame({"a": [1], "b": [np.nan]})
+        tm.assert_frame_equal(df, expected)
+
+        data = """\
+113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
+729639,"qwer","",asdfkj,466.681,,252.373
+"""
+        expected = DataFrame({0: [np.nan, 729639.0],
+                              1: [np.nan, "qwer"],
+                              2: ["/blaha", np.nan],
+                              3: ["kjsdkj", "asdfkj"],
+                              4: [412.166, 466.681],
+                              5: ["225.874", ""],
+                              6: [np.nan, 252.373]})
+
+        # The marker for column 0 is tried as an int and as its string form.
+        for first_marker in (113125, "113125"):
+            na_values = {2: "", 6: "214.008",
+                         1: "blah", 0: first_marker}
+            df = self.read_csv(StringIO(data), header=None,
+                               keep_default_na=False, na_values=na_values)
+            tm.assert_frame_equal(df, expected)
+
+    def test_na_values_na_filter_override(self):
+        """``na_filter=False`` switches off all NA detection."""
+        data = """\
+A,B
+1,A
+nan,B
+3,C
+"""
+        cases = [
+            # Filtering on: the default 'nan' and the custom 'B' are NA.
+            (True, [[1, 'A'], [np.nan, np.nan], [3, 'C']]),
+            # Filtering off: every field stays a string.
+            (False, [['1', 'A'], ['nan', 'B'], ['3', 'C']]),
+        ]
+
+        for na_filter, rows in cases:
+            expected = DataFrame(rows, columns=['A', 'B'])
+            out = self.read_csv(StringIO(data), na_values=['B'],
+                                na_filter=na_filter)
+            tm.assert_frame_equal(out, expected)
+
+    def test_na_trailing_columns(self):
+        """Rows shorter than the header get NA in the missing columns."""
+        data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax
+2012-03-14,USD,AAPL,BUY,1000
+2012-05-12,USD,SBUX,SELL,500"""
+
+        parsed = self.read_csv(StringIO(data))
+
+        # Leading columns stay aligned with the header ...
+        assert parsed['Date'][1] == '2012-05-12'
+        # ... while a column absent from every row is entirely NA.
+        assert parsed['UnitPrice'].isna().all()
+
+    def test_na_values_scalar(self):
+        """A scalar works as ``na_values``, globally or per column."""
+        # see gh-12224
+        names = ['a', 'b']
+        data = '1,2\n2,1'
+
+        # Bare scalar: applied to every column.
+        out = self.read_csv(StringIO(data), names=names, na_values=1)
+        expected = DataFrame([[np.nan, 2.0], [2.0, np.nan]],
+                             columns=names)
+        tm.assert_frame_equal(out, expected)
+
+        # Dict of scalars: each column gets its own marker.
+        per_column = {'a': 2, 'b': 1}
+        out = self.read_csv(StringIO(data), names=names,
+                            na_values=per_column)
+        expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]],
+                             columns=names)
+        tm.assert_frame_equal(out, expected)
+
+    def test_na_values_dict_aliasing(self):
+        """The caller's ``na_values`` dict is unchanged after parsing."""
+        na_values = {'a': 2, 'b': 1}
+        snapshot = na_values.copy()
+
+        columns = ['a', 'b']
+        expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]],
+                             columns=columns)
+        out = self.read_csv(StringIO('1,2\n2,1'), names=columns,
+                            na_values=na_values)
+
+        tm.assert_frame_equal(out, expected)
+        tm.assert_dict_equal(na_values, snapshot)
+
+    def test_na_values_dict_col_index(self):
+        """Dict ``na_values`` may key a column by its integer position."""
+        # see gh-14203
+        by_position = {0: 'foo'}
+
+        out = self.read_csv(StringIO('a\nfoo\n1'), na_values=by_position)
+        expected = DataFrame({'a': [np.nan, 1]})
+        tm.assert_frame_equal(out, expected)
+
+    def test_na_values_uint64(self):
+        """Exercise NA handling for columns holding 2**63 and above."""
+        # see gh-14983
+        big = str(2**63)
+        bigger = str(2**63 + 1)
+
+        # With 2**63 itself listed as an NA marker.
+        expected = DataFrame([big, bigger])
+        out = self.read_csv(StringIO(big + '\n' + bigger), header=None,
+                            na_values=[2**63])
+        tm.assert_frame_equal(out, expected)
+
+        # With an empty field sharing the column with 2**63.
+        expected = DataFrame([[big, 1], ['', 2]])
+        out = self.read_csv(StringIO(big + ',1' + '\n,2'), header=None)
+        tm.assert_frame_equal(out, expected)
+
+    def test_empty_na_values_no_default_with_index(self):
+        """``keep_default_na=False`` with no ``na_values`` and an index."""
+        # see gh-15835
+        expected = DataFrame({'1': [2]}, index=Index(["b"], name="a"))
+
+        out = self.read_csv(StringIO("a,1\nb,2"), keep_default_na=False,
+                            index_col=0)
+        tm.assert_frame_equal(out, expected)
+
+    def test_no_na_filter_on_index(self):
+        """``na_filter`` is respected for values that land in the index."""
+        # see gh-5239
+        data = "a,b,c\n1,,3\n4,5,6"
+        cases = [
+            # Don't parse NA-values in index when na_filter=False.
+            (False, Index(["", "5"], name="b")),
+            # Parse NA-values in index when na_filter=True.
+            (True, Index([np.nan, 5.0], name="b")),
+        ]
+
+        for na_filter, index in cases:
+            out = self.read_csv(StringIO(data), index_col=[1],
+                                na_filter=na_filter)
+            expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=index)
+            tm.assert_frame_equal(out, expected)
@@ -0,0 +1,676 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests date parsing functionality for all of the
+parsers defined in parsers.py
+"""
+
+from distutils.version import LooseVersion
+from datetime import datetime, date
+
+import pytest
+import numpy as np
+from pandas._libs.tslibs import parsing
+from pandas._libs.tslib import Timestamp
+
+import pandas as pd
+import pandas.io.parsers as parsers
+import pandas.core.tools.datetimes as tools
+import pandas.util.testing as tm
+
+import pandas.io.date_converters as conv
+from pandas import DataFrame, Series, Index, DatetimeIndex, MultiIndex
+from pandas import compat
+from pandas.compat import parse_date, StringIO, lrange
+from pandas.compat.numpy import np_array_datetime64_compat
+from pandas.core.indexes.datetimes import date_range
+
+
+class ParseDatesTests(object):
+
+    def test_separator_date_conflict(self):
+        """A thousands separator of '-' must not disturb date parsing."""
+        # Regression test for gh-4678: make sure thousands separator and
+        # date parsing do not conflict.
+        data = '06-02-2013;13:00;1-000.215'
+        stamp = datetime(2013, 6, 2, 13, 0, 0)
+        expected = DataFrame([[stamp, 1000.215]], columns=['Date', 2])
+
+        df = self.read_csv(StringIO(data), sep=';', thousands='-',
+                           parse_dates={'Date': [0, 1]}, header=None)
+        tm.assert_frame_equal(df, expected)
+
+    def test_multiple_date_col(self):
+        """Combine several source columns into date columns.
+
+        Covers dict and list-of-lists ``parse_dates`` specs, a custom
+        ``date_parser``, ``prefix`` and ``keep_date_col``, and finally a
+        single already-combined date column used as the index.
+        """
+        # Can use multiple date parsers
+        data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+
+        # Custom parser: join the given columns, then try to parse them.
+        def func(*date_cols):
+            res = parsing.try_parse_dates(parsers._concat_date_cols(date_cols))
+            return res
+
+        # Dict spec with a prefix: the named results are present and the
+        # prefixed source columns are gone.
+        df = self.read_csv(StringIO(data), header=None,
+                           date_parser=func,
+                           prefix='X',
+                           parse_dates={'nominal': [1, 2],
+                                        'actual': [1, 3]})
+        assert 'nominal' in df
+        assert 'actual' in df
+        assert 'X1' not in df
+        assert 'X2' not in df
+        assert 'X3' not in df
+
+        d = datetime(1999, 1, 27, 19, 0)
+        assert df.loc[0, 'nominal'] == d
+
+        # Same spec with keep_date_col=True: the source columns remain.
+        df = self.read_csv(StringIO(data), header=None,
+                           date_parser=func,
+                           parse_dates={'nominal': [1, 2],
+                                        'actual': [1, 3]},
+                           keep_date_col=True)
+        assert 'nominal' in df
+        assert 'actual' in df
+
+        assert 1 in df
+        assert 2 in df
+        assert 3 in df
+
+        data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+        # List-of-lists spec: result names are the source labels joined
+        # with an underscore.
+        df = self.read_csv(StringIO(data), header=None,
+                           prefix='X', parse_dates=[[1, 2], [1, 3]])
+
+        assert 'X1_X2' in df
+        assert 'X1_X3' in df
+        assert 'X1' not in df
+        assert 'X2' not in df
+        assert 'X3' not in df
+
+        d = datetime(1999, 1, 27, 19, 0)
+        assert df.loc[0, 'X1_X2'] == d
+
+        # Without a prefix the joined names use the integer positions.
+        df = self.read_csv(StringIO(data), header=None,
+                           parse_dates=[[1, 2], [1, 3]], keep_date_col=True)
+
+        assert '1_2' in df
+        assert '1_3' in df
+        assert 1 in df
+        assert 2 in df
+        assert 3 in df
+
+        # Date and time already share one field here; parse it and use
+        # it as the index.
+        data = '''\
+KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+'''
+        df = self.read_csv(StringIO(data), sep=',', header=None,
+                           parse_dates=[1], index_col=1)
+        d = datetime(1999, 1, 27, 19, 0)
+        assert df.index[0] == d
+
+    def test_multiple_date_cols_int_cast(self):
+        """Combine date and time columns with ``conv.parse_date_time``.
+
+        Smoke test: the read must succeed and yield a ``nominal`` column.
+        """
+        data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
+                "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
+                "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
+                "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
+                "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
+                "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
+        date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
+
+        # ``conv`` is the module-level pandas.io.date_converters import;
+        # no local re-import is needed.
+        # it works!
+        df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec,
+                           date_parser=conv.parse_date_time)
+        assert 'nominal' in df
+
+    def test_multiple_date_col_timestamp_parse(self):
+        """``Timestamp`` works as the parser for a combined date column."""
+        data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
+05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""
+        parsed = self.read_csv(StringIO(data), sep=',', header=None,
+                               parse_dates=[[0, 1]], date_parser=Timestamp)
+
+        expected_first = Timestamp('05/31/2012 15:30:00.029')
+        assert parsed['0_1'][0] == expected_first
+
+    def test_multiple_date_cols_with_header(self):
+        """Combining date columns also works when the file has a header."""
+        data = """\
+ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
+
+        parsed = self.read_csv(StringIO(data),
+                               parse_dates={'nominal': [1, 2]})
+
+        # The combined values must have been converted, not left as text.
+        first = parsed.nominal[0]
+        assert not isinstance(first, compat.string_types)
+
+    # Sample CSV text with a header row, read through ``self.ts_data``
+    # (for example by test_multiple_date_col_name_collision).
+    ts_data = """\
+ID,date,nominalTime,actualTime,A,B,C,D,E
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+
+    def test_multiple_date_col_name_collision(self):
+        """A combined date column may not reuse an existing column name."""
+        # Explicit result name that clashes with the existing 'ID' column.
+        clashing_spec = {'ID': [1, 2]}
+        with pytest.raises(ValueError):
+            self.read_csv(StringIO(self.ts_data), parse_dates=clashing_spec)
+
+        # Here the first column is already called 'date_NominalTime',
+        # while columns 1 and 2 are 'date' and 'NominalTime'.
+        data = """\
+date_NominalTime,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
+KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""  # noqa
+
+        positional_spec = [[1, 2]]
+        with pytest.raises(ValueError):
+            self.read_csv(StringIO(data), parse_dates=positional_spec)
+
+    def test_date_parser_int_bug(self):
+        """A ``date_parser`` that converts POSIX seconds must not crash."""
+        # See gh-3071
+        header = ('posix_timestamp,elapsed,sys,user,queries,query_time,rows,'
+                  'accountid,userid,contactid,level,silo,method\n')
+        row = ('1343103150,0.062353,0,4,6,0.01690,3,'
+               '12345,1,-1,3,invoice_InvoiceResource,search\n')
+        log_file = StringIO(header + row)
+
+        def from_posix(posix_string):
+            return datetime.utcfromtimestamp(int(posix_string))
+
+        # it works!
+        self.read_csv(log_file, index_col=0, parse_dates=[0],
+                      date_parser=from_posix)
+
+    def test_nat_parse(self):
+        """Missing datetimes survive a ``to_csv``/``read_csv`` round trip."""
+        # See gh-3062
+        df = DataFrame({'A': np.asarray(lrange(10), dtype='float64'),
+                        'B': pd.Timestamp('20010101')})
+        df.iloc[3:6, :] = np.nan
+
+        with tm.ensure_clean('__nat_parse_.csv') as path:
+            df.to_csv(path)
+            result = self.read_csv(path, index_col=0, parse_dates=['B'])
+            tm.assert_frame_equal(result, df)
+
+            expected_dtypes = Series(dict(A='float64', B='datetime64[ns]'))
+            tm.assert_series_equal(expected_dtypes, result.dtypes)
+
+            # test with NaT for the nan_rep
+            # we don't have a method to specify the Datetime na_rep
+            # (it defaults to '')
+            df.to_csv(path)
+            result = self.read_csv(path, index_col=0, parse_dates=['B'])
+            tm.assert_frame_equal(result, df)
+
+    def test_csv_custom_parser(self):
+        """A custom ``date_parser`` matches ``parse_dates=True``."""
+        data = """A,B,C
+20090101,a,1,2
+20090102,b,3,4
+20090103,c,4,5
+"""
+
+        def parse_yyyymmdd(text):
+            return datetime.strptime(text, '%Y%m%d')
+
+        df = self.read_csv(StringIO(data), date_parser=parse_yyyymmdd)
+        expected = self.read_csv(StringIO(data), parse_dates=True)
+        tm.assert_frame_equal(df, expected)
+
+    def test_parse_dates_implicit_first_col(self):
+        """``parse_dates=True`` parses the implicit first-column index."""
+        data = """A,B,C
+20090101,a,1,2
+20090102,b,3,4
+20090103,c,4,5
+"""
+        implicit = self.read_csv(StringIO(data), parse_dates=True)
+        explicit = self.read_csv(StringIO(data), index_col=0,
+                                 parse_dates=True)
+
+        datetime_types = (datetime, np.datetime64, Timestamp)
+        assert isinstance(implicit.index[0], datetime_types)
+        tm.assert_frame_equal(implicit, explicit)
+
+    def test_parse_dates_string(self):
+        """Columns can be named by label in ``index_col``/``parse_dates``."""
+        data = """date,A,B,C
+20090101,a,1,2
+20090102,b,3,4
+20090103,c,4,5
+"""
+        result = self.read_csv(StringIO(data), index_col='date',
+                               parse_dates=['date'])
+
+        index = date_range('1/1/2009', periods=3)
+        index.name = 'date'
+        columns = {'A': ['a', 'b', 'c'],
+                   'B': [1, 3, 4],
+                   'C': [2, 4, 5]}
+        expected = DataFrame(columns, index)
+        tm.assert_frame_equal(result, expected)
+
+    def test_yy_format_with_yearfirst(self):
+        """Parse yymmdd dates combined with an hhmm time column.
+
+        The test is skipped when dateutil is 2.5.0 or newer.
+        """
+        data = """date,time,B,C
+090131,0010,1,2
+090228,1020,3,4
+090331,0830,5,6
+"""
+
+        # See gh-217
+        import dateutil
+        if LooseVersion(dateutil.__version__) >= LooseVersion('2.5.0'):
+            # The pieces are joined by implicit concatenation, so each one
+            # carries its own trailing space.
+            pytest.skip("testing yearfirst=True not-support "
+                        "on dateutil < 2.5.0 this works but "
+                        "is wrong")
+
+        # The date/time pair is given by label first, then by position;
+        # both must produce the same combined 'date_time' index.
+        for date_cols in (['date', 'time'], [0, 1]):
+            rs = self.read_csv(StringIO(data), index_col=0,
+                               parse_dates=[date_cols])
+            idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
+                                 datetime(2009, 2, 28, 10, 20, 0),
+                                 datetime(2009, 3, 31, 8, 30, 0)],
+                                dtype=object, name='date_time')
+            xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)
+            tm.assert_frame_equal(rs, xp)
+
+    def test_parse_dates_column_list(self):
+        """``parse_dates`` accepts a list of positions or of labels."""
+        data = 'a,b,c\n01/01/2010,1,15/02/2010'
+
+        frame = DataFrame({'a': [datetime(2010, 1, 1)], 'b': [1],
+                           'c': [datetime(2010, 2, 15)]})
+        expected = frame.set_index(['a', 'b'])
+
+        for date_cols in ([0, 2], ['a', 'c']):
+            df = self.read_csv(StringIO(data), index_col=[0, 1],
+                               parse_dates=date_cols, dayfirst=True)
+            tm.assert_frame_equal(df, expected)
+
+    def test_multi_index_parse_dates(self):
+        data = """index1,index2,A,B,C
+20090101,one,a,1,2
+20090101,two,b,3,4
+20090101,three,c,4,5
+20090102,one,a,1,2
+20090102,two,b,3,4
+20090102,three,c,4,5
+20090103,one,a,1,2
+20090103,two,b,3,4
+20090103,three,c,4,5
+"""
+        df = self.read_csv(StringIO(data), index_col=[0, 1], parse_dates=True)
+        assert isinstance(df.index.levels[0][0],
+                          (datetime, np.datetime64, Timestamp))
+
+        # specify columns out of order!
+        df2 = self.read_csv(StringIO(data), index_col=[1, 0], parse_dates=True)
+        assert isinstance(df2.index.levels[1][0],
+                          (datetime, np.datetime64, Timestamp))
+
+    def test_parse_dates_custom_euroformat(self):
+        text = """foo,bar,baz
+31/01/2010,1,2
+01/02/2010,1,NA
+02/02/2010,1,2
+"""
+        parser = lambda d: parse_date(d, dayfirst=True)
+        df = self.read_csv(StringIO(text),
+                           names=['time', 'Q', 'NTU'], header=0,
+                           index_col=0, parse_dates=True,
+                           date_parser=parser, na_values=['NA'])
+
+        exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1),
+                           datetime(2010, 2, 2)], name='time')
+        expected = DataFrame({'Q': [1, 1, 1], 'NTU': [2, np.nan, 2]},
+                             index=exp_index, columns=['Q', 'NTU'])
+        tm.assert_frame_equal(df, expected)
+
+        parser = lambda d: parse_date(d, day_first=True)
+        pytest.raises(TypeError, self.read_csv,
+                      StringIO(text), skiprows=[0],
+                      names=['time', 'Q', 'NTU'], index_col=0,
+                      parse_dates=True, date_parser=parser,
+                      na_values=['NA'])
+
+    def test_parse_tz_aware(self):
+        # See gh-1693
+        import pytz
+        data = StringIO("Date,x\n2012-06-13T01:39:00Z,0.5")
+
+        # it works
+        result = self.read_csv(data, index_col=0, parse_dates=True)
+        stamp = result.index[0]
+        assert stamp.minute == 39
+        try:
+            assert result.index.tz is pytz.utc
+        except AssertionError:  # hello Yaroslav
+            arr = result.index.to_pydatetime()
+            result = tools.to_datetime(arr, utc=True)[0]
+            assert stamp.minute == result.minute
+            assert stamp.hour == result.hour
+            assert stamp.day == result.day
+
+    def test_multiple_date_cols_index(self):
+        data = """
+ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
+KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+
+        xp = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]})
+        df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]},
+                           index_col='nominal')
+        tm.assert_frame_equal(xp.set_index('nominal'), df)
+        df2 = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]},
+                            index_col=0)
+        tm.assert_frame_equal(df2, df)
+
+        df3 = self.read_csv(StringIO(data), parse_dates=[[1, 2]], index_col=0)
+        tm.assert_frame_equal(df3, df, check_names=False)
+
+    def test_multiple_date_cols_chunked(self):
+        df = self.read_csv(StringIO(self.ts_data), parse_dates={
+            'nominal': [1, 2]}, index_col='nominal')
+        reader = self.read_csv(StringIO(self.ts_data),
+                               parse_dates={'nominal': [1, 2]},
+                               index_col='nominal', chunksize=2)
+
+        chunks = list(reader)
+
+        assert 'nominalTime' not in df
+
+        tm.assert_frame_equal(chunks[0], df[:2])
+        tm.assert_frame_equal(chunks[1], df[2:4])
+        tm.assert_frame_equal(chunks[2], df[4:])
+
+    def test_multiple_date_col_named_components(self):
+        xp = self.read_csv(StringIO(self.ts_data),
+                           parse_dates={'nominal': [1, 2]},
+                           index_col='nominal')
+        colspec = {'nominal': ['date', 'nominalTime']}
+        df = self.read_csv(StringIO(self.ts_data), parse_dates=colspec,
+                           index_col='nominal')
+        tm.assert_frame_equal(df, xp)
+
+    def test_multiple_date_col_multiple_index(self):
+        df = self.read_csv(StringIO(self.ts_data),
+                           parse_dates={'nominal': [1, 2]},
+                           index_col=['nominal', 'ID'])
+
+        xp = self.read_csv(StringIO(self.ts_data),
+                           parse_dates={'nominal': [1, 2]})
+
+        tm.assert_frame_equal(xp.set_index(['nominal', 'ID']), df)
+
+    def test_read_with_parse_dates_scalar_non_bool(self):
+        # See gh-5636
+        errmsg = ("Only booleans, lists, and "
+                  "dictionaries are accepted "
+                  "for the 'parse_dates' parameter")
+        data = """A,B,C
+        1,2,2003-11-1"""
+
+        tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
+                               StringIO(data), parse_dates="C")
+        tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
+                               StringIO(data), parse_dates="C",
+                               index_col="C")
+
+    def test_read_with_parse_dates_invalid_type(self):
+        errmsg = ("Only booleans, lists, and "
+                  "dictionaries are accepted "
+                  "for the 'parse_dates' parameter")
+        data = """A,B,C
+        1,2,2003-11-1"""
+
+        tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
+                               StringIO(data), parse_dates=(1,))
+        tm.assert_raises_regex(TypeError, errmsg,
+                               self.read_csv, StringIO(data),
+                               parse_dates=np.array([4, 5]))
+        tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
+                               StringIO(data), parse_dates=set([1, 3, 3]))
+
+    def test_parse_dates_empty_string(self):
+        # see gh-2263
+        data = "Date, test\n2012-01-01, 1\n,2"
+        result = self.read_csv(StringIO(data), parse_dates=["Date"],
+                               na_filter=False)
+        assert result['Date'].isna()[1]
+
+    def test_parse_dates_noconvert_thousands(self):
+        # see gh-14066
+        data = 'a\n04.15.2016'
+
+        expected = DataFrame([datetime(2016, 4, 15)], columns=['a'])
+        result = self.read_csv(StringIO(data), parse_dates=['a'],
+                               thousands='.')
+        tm.assert_frame_equal(result, expected)
+
+        exp_index = DatetimeIndex(['2016-04-15'], name='a')
+        expected = DataFrame(index=exp_index)
+        result = self.read_csv(StringIO(data), index_col=0,
+                               parse_dates=True, thousands='.')
+        tm.assert_frame_equal(result, expected)
+
+        data = 'a,b\n04.15.2016,09.16.2013'
+
+        expected = DataFrame([[datetime(2016, 4, 15),
+                               datetime(2013, 9, 16)]],
+                             columns=['a', 'b'])
+        result = self.read_csv(StringIO(data), parse_dates=['a', 'b'],
+                               thousands='.')
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame([[datetime(2016, 4, 15),
+                               datetime(2013, 9, 16)]],
+                             columns=['a', 'b'])
+        expected = expected.set_index(['a', 'b'])
+        result = self.read_csv(StringIO(data), index_col=[0, 1],
+                               parse_dates=True, thousands='.')
+        tm.assert_frame_equal(result, expected)
+
+    def test_parse_date_time_multi_level_column_name(self):
+        data = """\
+D,T,A,B
+date, time,a,b
+2001-01-05, 09:00:00, 0.0, 10.
+2001-01-06, 00:00:00, 1.0, 11.
+"""
+        datecols = {'date_time': [0, 1]}
+        result = self.read_csv(StringIO(data), sep=',', header=[0, 1],
+                               parse_dates=datecols,
+                               date_parser=conv.parse_date_time)
+
+        expected_data = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.],
+                         [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]]
+        expected = DataFrame(expected_data,
+                             columns=['date_time', ('A', 'a'), ('B', 'b')])
+        tm.assert_frame_equal(result, expected)
+
+    def test_parse_date_time(self):
+        dates = np.array(['2007/1/3', '2008/2/4'], dtype=object)
+        times = np.array(['05:07:09', '06:08:00'], dtype=object)
+        expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
+                             datetime(2008, 2, 4, 6, 8, 0)])
+
+        result = conv.parse_date_time(dates, times)
+        assert (result == expected).all()
+
+        data = """\
+date, time, a, b
+2001-01-05, 10:00:00, 0.0, 10.
+2001-01-05, 00:00:00, 1., 11.
+"""
+        datecols = {'date_time': [0, 1]}
+        df = self.read_csv(StringIO(data), sep=',', header=0,
+                           parse_dates=datecols,
+                           date_parser=conv.parse_date_time)
+        assert 'date_time' in df
+        assert df.date_time.loc[0] == datetime(2001, 1, 5, 10, 0, 0)
+
+        data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
+                "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
+                "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
+                "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
+                "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
+                "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
+
+        date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
+        df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec,
+                           date_parser=conv.parse_date_time)
+
+    def test_parse_date_fields(self):
+        years = np.array([2007, 2008])
+        months = np.array([1, 2])
+        days = np.array([3, 4])
+        result = conv.parse_date_fields(years, months, days)
+        expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)])
+        assert (result == expected).all()
+
+        data = ("year, month, day, a\n 2001 , 01 , 10 , 10.\n"
+                "2001 , 02 , 1 , 11.")
+        datecols = {'ymd': [0, 1, 2]}
+        df = self.read_csv(StringIO(data), sep=',', header=0,
+                           parse_dates=datecols,
+                           date_parser=conv.parse_date_fields)
+        assert 'ymd' in df
+        assert df.ymd.loc[0] == datetime(2001, 1, 10)
+
+    def test_datetime_six_col(self):
+        years = np.array([2007, 2008])
+        months = np.array([1, 2])
+        days = np.array([3, 4])
+        hours = np.array([5, 6])
+        minutes = np.array([7, 8])
+        seconds = np.array([9, 0])
+        expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
+                             datetime(2008, 2, 4, 6, 8, 0)])
+
+        result = conv.parse_all_fields(years, months, days,
+                                       hours, minutes, seconds)
+
+        assert (result == expected).all()
+
+        data = """\
+year, month, day, hour, minute, second, a, b
+2001, 01, 05, 10, 00, 0, 0.0, 10.
+2001, 01, 5, 10, 0, 00, 1., 11.
+"""
+        datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]}
+        df = self.read_csv(StringIO(data), sep=',', header=0,
+                           parse_dates=datecols,
+                           date_parser=conv.parse_all_fields)
+        assert 'ymdHMS' in df
+        assert df.ymdHMS.loc[0] == datetime(2001, 1, 5, 10, 0, 0)
+
+    def test_datetime_fractional_seconds(self):
+        data = """\
+year, month, day, hour, minute, second, a, b
+2001, 01, 05, 10, 00, 0.123456, 0.0, 10.
+2001, 01, 5, 10, 0, 0.500000, 1., 11.
+"""
+        datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]}
+        df = self.read_csv(StringIO(data), sep=',', header=0,
+                           parse_dates=datecols,
+                           date_parser=conv.parse_all_fields)
+        assert 'ymdHMS' in df
+        assert df.ymdHMS.loc[0] == datetime(2001, 1, 5, 10, 0, 0,
+                                            microsecond=123456)
+        assert df.ymdHMS.loc[1] == datetime(2001, 1, 5, 10, 0, 0,
+                                            microsecond=500000)
+
+    def test_generic(self):
+        data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11."
+        datecols = {'ym': [0, 1]}
+        dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1)
+        df = self.read_csv(StringIO(data), sep=',', header=0,
+                           parse_dates=datecols,
+                           date_parser=dateconverter)
+        assert 'ym' in df
+        assert df.ym.loc[0] == date(2001, 1, 1)
+
+    def test_dateparser_resolution_if_not_ns(self):
+        # GH 10245
+        data = """\
+date,time,prn,rxstatus
+2013-11-03,19:00:00,126,00E80000
+2013-11-03,19:00:00,23,00E80000
+2013-11-03,19:00:00,13,00E80000
+"""
+
+        def date_parser(date, time):
+            datetime = np_array_datetime64_compat(
+                date + 'T' + time + 'Z', dtype='datetime64[s]')
+            return datetime
+
+        df = self.read_csv(StringIO(data), date_parser=date_parser,
+                           parse_dates={'datetime': ['date', 'time']},
+                           index_col=['datetime', 'prn'])
+
+        datetimes = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3,
+                                               dtype='datetime64[s]')
+        df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3},
+                               index=MultiIndex.from_tuples(
+                                   [(datetimes[0], 126),
+                                    (datetimes[1], 23),
+                                    (datetimes[2], 13)],
+                               names=['datetime', 'prn']))
+        tm.assert_frame_equal(df, df_correct)
+
+    def test_parse_date_column_with_empty_string(self):
+        # GH 6428
+        data = """case,opdate
+                  7,10/18/2006
+                  7,10/18/2008
+                  621, """
+        result = self.read_csv(StringIO(data), parse_dates=['opdate'])
+        expected_data = [[7, '10/18/2006'],
+                         [7, '10/18/2008'],
+                         [621, ' ']]
+        expected = DataFrame(expected_data, columns=['case', 'opdate'])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("data,expected", [
+        ("a\n135217135789158401\n1352171357E+5",
+         DataFrame({"a": [135217135789158401,
+                          135217135700000]}, dtype="float64")),
+        ("a\n99999999999\n123456789012345\n1234E+0",
+         DataFrame({"a": [99999999999,
+                          123456789012345,
+                          1234]}, dtype="float64"))
+    ])
+    @pytest.mark.parametrize("parse_dates", [True, False])
+    def test_parse_date_float(self, data, expected, parse_dates):
+        # see gh-2697
+        #
+        # Date parsing should fail, so we leave the data untouched
+        # (i.e. float precision should remain unchanged).
+        result = self.read_csv(StringIO(data), parse_dates=parse_dates)
+        tm.assert_frame_equal(result, expected)
@@ -0,0 +1,263 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that apply specifically to the Python parser. Unless specifically
+stated as a Python-specific issue, the goal is to eventually move as many of
+these tests out of this module as soon as the C parser can accept further
+arguments when parsing.
+"""
+
+import csv
+import pytest
+
+import pandas.util.testing as tm
+from pandas import DataFrame, Index
+from pandas import compat
+from pandas.errors import ParserError
+from pandas.compat import StringIO, BytesIO, u
+
+
+class PythonParserTests(object):
+
+    def test_default_separator(self):
+        # GH17333
+        # csv.Sniffer in Python treats 'o' as separator.
+        text = 'aob\n1o2\n3o4'
+        expected = DataFrame({'a': [1, 3], 'b': [2, 4]})
+
+        result = self.read_csv(StringIO(text), sep=None)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_invalid_skipfooter(self):
+        text = "a\n1\n2"
+
+        # see gh-15925 (comment)
+        msg = "skipfooter must be an integer"
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(text), skipfooter="foo")
+
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(text), skipfooter=1.5)
+
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(text), skipfooter=True)
+
+        msg = "skipfooter cannot be negative"
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(text), skipfooter=-1)
+
+    def test_sniff_delimiter(self):
+        text = """index|A|B|C
+foo|1|2|3
+bar|4|5|6
+baz|7|8|9
+"""
+        data = self.read_csv(StringIO(text), index_col=0, sep=None)
+        tm.assert_index_equal(data.index,
+                              Index(['foo', 'bar', 'baz'], name='index'))
+
+        data2 = self.read_csv(StringIO(text), index_col=0, delimiter='|')
+        tm.assert_frame_equal(data, data2)
+
+        text = """ignore this
+ignore this too
+index|A|B|C
+foo|1|2|3
+bar|4|5|6
+baz|7|8|9
+"""
+        data3 = self.read_csv(StringIO(text), index_col=0,
+                              sep=None, skiprows=2)
+        tm.assert_frame_equal(data, data3)
+
+        text = u("""ignore this
+ignore this too
+index|A|B|C
+foo|1|2|3
+bar|4|5|6
+baz|7|8|9
+""").encode('utf-8')
+
+        s = BytesIO(text)
+        if compat.PY3:
+            # somewhat False since the code never sees bytes
+            from io import TextIOWrapper
+            s = TextIOWrapper(s, encoding='utf-8')
+
+        data4 = self.read_csv(s, index_col=0, sep=None, skiprows=2,
+                              encoding='utf-8')
+        tm.assert_frame_equal(data, data4)
+
+    def test_BytesIO_input(self):
+        if not compat.PY3:
+            pytest.skip(
+                "Bytes-related test - only needs to work on Python 3")
+
+        data = BytesIO("שלום::1234\n562::123".encode('cp1255'))
+        result = self.read_table(data, sep="::", encoding='cp1255')
+        expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
+        tm.assert_frame_equal(result, expected)
+
+    def test_single_line(self):
+        # see gh-6607: sniff separator
+        df = self.read_csv(StringIO('1,2'), names=['a', 'b'],
+                           header=None, sep=None)
+        tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df)
+
+    def test_skipfooter(self):
+        # see gh-6607
+        data = """A,B,C
+1,2,3
+4,5,6
+7,8,9
+want to skip this
+also also skip this
+"""
+        result = self.read_csv(StringIO(data), skipfooter=2)
+        no_footer = '\n'.join(data.split('\n')[:-3])
+        expected = self.read_csv(StringIO(no_footer))
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data), nrows=3)
+        tm.assert_frame_equal(result, expected)
+
+        # skipfooter alias
+        result = self.read_csv(StringIO(data), skipfooter=2)
+        no_footer = '\n'.join(data.split('\n')[:-3])
+        expected = self.read_csv(StringIO(no_footer))
+        tm.assert_frame_equal(result, expected)
+
+    def test_decompression_regex_sep(self):
+        # see gh-6607
+
+        try:
+            import gzip
+            import bz2
+        except ImportError:
+            pytest.skip('need gzip and bz2 to run')
+
+        with open(self.csv1, 'rb') as f:
+            data = f.read()
+        data = data.replace(b',', b'::')
+        expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, sep='::', compression='gzip')
+            tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, sep='::', compression='bz2')
+            tm.assert_frame_equal(result, expected)
+
+            pytest.raises(ValueError, self.read_csv,
+                          path, compression='bz3')
+
+    def test_read_table_buglet_4x_multiindex(self):
+        # see gh-6607
+        text = """                      A       B       C       D        E
+one two three   four
+a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
+a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
+x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""
+
+        df = self.read_table(StringIO(text), sep=r'\s+')
+        assert df.index.names == ('one', 'two', 'three', 'four')
+
+        # see gh-6893
+        data = '      A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
+        expected = DataFrame.from_records(
+            [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
+            columns=list('abcABC'), index=list('abc'))
+        actual = self.read_table(StringIO(data), sep=r'\s+')
+        tm.assert_frame_equal(actual, expected)
+
+    def test_skipfooter_with_decimal(self):
+        # see gh-6971
+        data = '1#2\n3#4'
+        expected = DataFrame({'a': [1.2, 3.4]})
+
+        result = self.read_csv(StringIO(data), names=['a'],
+                               decimal='#')
+        tm.assert_frame_equal(result, expected)
+
+        # the stray footer line should not mess with the
+        # casting of the first t    wo lines if we skip it
+        data = data + '\nFooter'
+        result = self.read_csv(StringIO(data), names=['a'],
+                               decimal='#', skipfooter=1)
+        tm.assert_frame_equal(result, expected)
+
+    def test_encoding_non_utf8_multichar_sep(self):
+        # see gh-3404
+        expected = DataFrame({'a': [1], 'b': [2]})
+
+        for sep in ['::', '#####', '!!!', '123', '#1!c5',
+                    '%!c!d', '@@#4:2', '_!pd#_']:
+            data = '1' + sep + '2'
+
+            for encoding in ['utf-16', 'utf-16-be', 'utf-16-le',
+                             'utf-32', 'cp037']:
+                encoded_data = data.encode(encoding)
+                result = self.read_csv(BytesIO(encoded_data),
+                                       sep=sep, names=['a', 'b'],
+                                       encoding=encoding)
+                tm.assert_frame_equal(result, expected)
+
+    def test_multi_char_sep_quotes(self):
+        # see gh-13374
+
+        data = 'a,,b\n1,,a\n2,,"2,,b"'
+        msg = 'ignored when a multi-char delimiter is used'
+
+        with tm.assert_raises_regex(ParserError, msg):
+            self.read_csv(StringIO(data), sep=',,')
+
+        # We expect no match, so there should be an assertion
+        # error out of the inner context manager.
+        with pytest.raises(AssertionError):
+            with tm.assert_raises_regex(ParserError, msg):
+                self.read_csv(StringIO(data), sep=',,',
+                              quoting=csv.QUOTE_NONE)
+
+    def test_none_delimiter(self):
+        # see gh-13374 and gh-17465
+
+        data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
+        expected = DataFrame({'a': [0, 7],
+                              'b': [1, 8],
+                              'c': [2, 9]})
+
+        # We expect the third line in the data to be
+        # skipped because it is malformed,
+        # but we do not expect any errors to occur.
+        result = self.read_csv(StringIO(data), header=0,
+                               sep=None,
+                               error_bad_lines=False,
+                               warn_bad_lines=True)
+        tm.assert_frame_equal(result, expected)
+
+    def test_skipfooter_bad_row(self):
+        # see gh-13879
+        # see gh-15910
+
+        msg = 'parsing errors in the skipped footer rows'
+
+        for data in ('a\n1\n"b"a',
+                     'a,b,c\ncat,foo,bar\ndog,foo,"baz'):
+            with tm.assert_raises_regex(ParserError, msg):
+                self.read_csv(StringIO(data), skipfooter=1)
+
+            # We expect no match, so there should be an assertion
+            # error out of the inner context manager.
+            with pytest.raises(AssertionError):
+                with tm.assert_raises_regex(ParserError, msg):
+                    self.read_csv(StringIO(data))
@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that quoting specifications are properly handled
+during parsing for all of the parsers defined in parsers.py
+"""
+
+import csv
+import pandas.util.testing as tm
+
+from pandas import DataFrame
+from pandas.compat import PY3, StringIO, u
+
+
+class QuotingTests(object):
+
+    def test_bad_quote_char(self):
+        data = '1,2,3'
+
+        # Python 2.x: "...must be an 1-character..."
+        # Python 3.x: "...must be a 1-character..."
+        msg = '"quotechar" must be a(n)? 1-character string'
+        tm.assert_raises_regex(TypeError, msg, self.read_csv,
+                               StringIO(data), quotechar='foo')
+
+        msg = 'quotechar must be set if quoting enabled'
+        tm.assert_raises_regex(TypeError, msg, self.read_csv,
+                               StringIO(data), quotechar=None,
+                               quoting=csv.QUOTE_MINIMAL)
+
+        msg = '"quotechar" must be string, not int'
+        tm.assert_raises_regex(TypeError, msg, self.read_csv,
+                               StringIO(data), quotechar=2)
+
+    def test_bad_quoting(self):
+        data = '1,2,3'
+
+        msg = '"quoting" must be an integer'
+        tm.assert_raises_regex(TypeError, msg, self.read_csv,
+                               StringIO(data), quoting='foo')
+
+        # quoting must in the range [0, 3]
+        msg = 'bad "quoting" value'
+        tm.assert_raises_regex(TypeError, msg, self.read_csv,
+                               StringIO(data), quoting=5)
+
+    def test_quote_char_basic(self):
+        data = 'a,b,c\n1,2,"cat"'
+        expected = DataFrame([[1, 2, 'cat']],
+                             columns=['a', 'b', 'c'])
+        result = self.read_csv(StringIO(data), quotechar='"')
+        tm.assert_frame_equal(result, expected)
+
+    def test_quote_char_various(self):
+        data = 'a,b,c\n1,2,"cat"'
+        expected = DataFrame([[1, 2, 'cat']],
+                             columns=['a', 'b', 'c'])
+        quote_chars = ['~', '*', '%', '$', '@', 'P']
+
+        for quote_char in quote_chars:
+            new_data = data.replace('"', quote_char)
+            result = self.read_csv(StringIO(new_data), quotechar=quote_char)
+            tm.assert_frame_equal(result, expected)
+
+    def test_null_quote_char(self):
+        data = 'a,b,c\n1,2,3'
+
+        # sanity checks
+        msg = 'quotechar must be set if quoting enabled'
+
+        tm.assert_raises_regex(TypeError, msg, self.read_csv,
+                               StringIO(data), quotechar=None,
+                               quoting=csv.QUOTE_MINIMAL)
+
+        tm.assert_raises_regex(TypeError, msg, self.read_csv,
+                               StringIO(data), quotechar='',
+                               quoting=csv.QUOTE_MINIMAL)
+
+        # no errors should be raised if quoting is None
+        expected = DataFrame([[1, 2, 3]],
+                             columns=['a', 'b', 'c'])
+
+        result = self.read_csv(StringIO(data), quotechar=None,
+                               quoting=csv.QUOTE_NONE)
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data), quotechar='',
+                               quoting=csv.QUOTE_NONE)
+        tm.assert_frame_equal(result, expected)
+
+    def test_quoting_various(self):
+        data = '1,2,"foo"'
+        cols = ['a', 'b', 'c']
+
+        # QUOTE_MINIMAL and QUOTE_ALL apply only to
+        # the CSV writer, so they should have no
+        # special effect for the CSV reader
+        expected = DataFrame([[1, 2, 'foo']], columns=cols)
+
+        # test default (afterwards, arguments are all explicit)
+        result = self.read_csv(StringIO(data), names=cols)
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               quoting=csv.QUOTE_MINIMAL, names=cols)
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               quoting=csv.QUOTE_ALL, names=cols)
+        tm.assert_frame_equal(result, expected)
+
+        # QUOTE_NONE tells the reader to do no special handling
+        # of quote characters and leave them alone
+        expected = DataFrame([[1, 2, '"foo"']], columns=cols)
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               quoting=csv.QUOTE_NONE, names=cols)
+        tm.assert_frame_equal(result, expected)
+
+        # QUOTE_NONNUMERIC tells the reader to cast
+        # all non-quoted fields to float
+        expected = DataFrame([[1.0, 2.0, 'foo']], columns=cols)
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               quoting=csv.QUOTE_NONNUMERIC,
+                               names=cols)
+        tm.assert_frame_equal(result, expected)
+
+    def test_double_quote(self):
+        data = 'a,b\n3,"4 "" 5"'
+
+        expected = DataFrame([[3, '4 " 5']],
+                             columns=['a', 'b'])
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               doublequote=True)
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame([[3, '4 " 5"']],
+                             columns=['a', 'b'])
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               doublequote=False)
+        tm.assert_frame_equal(result, expected)
+
+    def test_quotechar_unicode(self):
+        # See gh-14477
+        data = 'a\n1'
+        expected = DataFrame({'a': [1]})
+
+        result = self.read_csv(StringIO(data), quotechar=u('"'))
+        tm.assert_frame_equal(result, expected)
+
+        # Compared to Python 3.x, Python 2.x does not handle unicode well.
+        if PY3:
+            result = self.read_csv(StringIO(data), quotechar=u('\u0001'))
+            tm.assert_frame_equal(result, expected)
@@ -0,0 +1,225 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that skipped rows are properly handled during
+parsing for all of the parsers defined in parsers.py
+"""
+
+from datetime import datetime
+
+import numpy as np
+
+import pandas.util.testing as tm
+
+from pandas import DataFrame
+from pandas.errors import EmptyDataError
+from pandas.compat import StringIO, range, lrange
+
+
+class SkipRowsTests(object):
+
+    def test_skiprows_bug(self):
+        # see gh-505
+        text = """#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+1/1/2000,1.,2.,3.
+1/2/2000,4,5,6
+1/3/2000,7,8,9
+"""
+        data = self.read_csv(StringIO(text), skiprows=lrange(6), header=None,
+                             index_col=0, parse_dates=True)
+
+        data2 = self.read_csv(StringIO(text), skiprows=6, header=None,
+                              index_col=0, parse_dates=True)
+
+        expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
+                             columns=[1, 2, 3],
+                             index=[datetime(2000, 1, 1), datetime(2000, 1, 2),
+                                    datetime(2000, 1, 3)])
+        expected.index.name = 0
+        tm.assert_frame_equal(data, expected)
+        tm.assert_frame_equal(data, data2)
+
+    def test_deep_skiprows(self):
+        # see gh-4382
+        def build_csv(row_starts):
+            # "a,b,c" header followed by one "i,i+1,i+2" row per start value
+            rows = ["%d,%d,%d" % (i, i + 1, i + 2) for i in row_starts]
+            return "a,b,c\n" + "\n".join(rows)
+        text = build_csv(range(10))
+        condensed_text = build_csv([0, 1, 2, 3, 4, 6, 8, 9])
+        data = self.read_csv(StringIO(text), skiprows=[6, 8])
+        condensed_data = self.read_csv(StringIO(condensed_text))
+        tm.assert_frame_equal(data, condensed_data)
+
+    def test_skiprows_blank(self):
+        # see gh-9832
+        text = """#foo,a,b,c
+#foo,a,b,c
+
+#foo,a,b,c
+#foo,a,b,c
+
+1/1/2000,1.,2.,3.
+1/2/2000,4,5,6
+1/3/2000,7,8,9
+"""
+        data = self.read_csv(StringIO(text), skiprows=6, header=None,
+                             index_col=0, parse_dates=True)
+
+        expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
+                             columns=[1, 2, 3],
+                             index=[datetime(2000, 1, 1), datetime(2000, 1, 2),
+                                    datetime(2000, 1, 3)])
+        expected.index.name = 0
+        tm.assert_frame_equal(data, expected)
+
+    def test_skiprow_with_newline(self):
+        # see gh-12775 and gh-10911
+        data = """id,text,num_lines
+1,"line 11
+line 12",2
+2,"line 21
+line 22",2
+3,"line 31",1"""
+        expected = [[2, 'line 21\nline 22', 2],
+                    [3, 'line 31', 1]]
+        expected = DataFrame(expected, columns=[
+            'id', 'text', 'num_lines'])
+        df = self.read_csv(StringIO(data), skiprows=[1])
+        tm.assert_frame_equal(df, expected)
+
+        data = ('a,b,c\n~a\n b~,~e\n d~,'
+                '~f\n f~\n1,2,~12\n 13\n 14~')
+        expected = [['a\n b', 'e\n d', 'f\n f']]
+        expected = DataFrame(expected, columns=[
+            'a', 'b', 'c'])
+        df = self.read_csv(StringIO(data),
+                           quotechar="~",
+                           skiprows=[2])
+        tm.assert_frame_equal(df, expected)
+
+        data = ('Text,url\n~example\n '
+                'sentence\n one~,url1\n~'
+                'example\n sentence\n two~,url2\n~'
+                'example\n sentence\n three~,url3')
+        expected = [['example\n sentence\n two', 'url2']]
+        expected = DataFrame(expected, columns=[
+            'Text', 'url'])
+        df = self.read_csv(StringIO(data),
+                           quotechar="~",
+                           skiprows=[1, 3])
+        tm.assert_frame_equal(df, expected)
+
+    def test_skiprow_with_quote(self):
+        # see gh-12775 and gh-10911
+        data = """id,text,num_lines
+1,"line '11' line 12",2
+2,"line '21' line 22",2
+3,"line '31' line 32",1"""
+        expected = [[2, "line '21' line 22", 2],
+                    [3, "line '31' line 32", 1]]
+        expected = DataFrame(expected, columns=[
+            'id', 'text', 'num_lines'])
+        df = self.read_csv(StringIO(data), skiprows=[1])
+        tm.assert_frame_equal(df, expected)
+
+    def test_skiprow_with_newline_and_quote(self):
+        # see gh-12775 and gh-10911
+        data = """id,text,num_lines
+1,"line \n'11' line 12",2
+2,"line \n'21' line 22",2
+3,"line \n'31' line 32",1"""
+        expected = [[2, "line \n'21' line 22", 2],
+                    [3, "line \n'31' line 32", 1]]
+        expected = DataFrame(expected, columns=[
+            'id', 'text', 'num_lines'])
+        df = self.read_csv(StringIO(data), skiprows=[1])
+        tm.assert_frame_equal(df, expected)
+
+        data = """id,text,num_lines
+1,"line '11\n' line 12",2
+2,"line '21\n' line 22",2
+3,"line '31\n' line 32",1"""
+        expected = [[2, "line '21\n' line 22", 2],
+                    [3, "line '31\n' line 32", 1]]
+        expected = DataFrame(expected, columns=[
+            'id', 'text', 'num_lines'])
+        df = self.read_csv(StringIO(data), skiprows=[1])
+        tm.assert_frame_equal(df, expected)
+
+        data = """id,text,num_lines
+1,"line '11\n' \r\tline 12",2
+2,"line '21\n' \r\tline 22",2
+3,"line '31\n' \r\tline 32",1"""
+        expected = [[2, "line '21\n' \r\tline 22", 2],
+                    [3, "line '31\n' \r\tline 32", 1]]
+        expected = DataFrame(expected, columns=[
+            'id', 'text', 'num_lines'])
+        df = self.read_csv(StringIO(data), skiprows=[1])
+        tm.assert_frame_equal(df, expected)
+
+    def test_skiprows_lineterminator(self):
+        # see gh-9079
+        names = ['date', 'time', 'var', 'flag', 'oflag']
+        data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ',
+                          '2007/01/01 01:00   0.2140 U M ',
+                          '2007/01/01 02:00   0.2141 M O ',
+                          '2007/01/01 04:00   0.2142 D M '])
+        expected = DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'],
+                              ['2007/01/01', '02:00', 0.2141, 'M', 'O'],
+                              ['2007/01/01', '04:00', 0.2142, 'D', 'M']],
+                             columns=names)
+
+        # test with default line terminators "LF" and "CRLF"
+        terminators = ['\n', '\r\n']
+
+        # "CR" is not respected with the Python parser yet
+        if self.engine == 'c':
+            terminators.append('\r')
+
+        # The first line of ``data`` is not a data row, so it is skipped
+        # and the column names are supplied explicitly.
+        for terminator in terminators:
+            text = data.replace('\n', terminator)
+            df = self.read_csv(StringIO(text),
+                               skiprows=1,
+                               delim_whitespace=True,
+                               names=names)
+            tm.assert_frame_equal(df, expected)
+
+    def test_skiprows_infield_quote(self):
+        # see gh-14459
+        data = 'a"\nb"\na\n1'
+        expected = DataFrame({'a': [1]})
+
+        df = self.read_csv(StringIO(data), skiprows=2)
+        tm.assert_frame_equal(df, expected)
+
+    def test_skiprows_callable(self):
+        data = 'a\n1\n2\n3\n4\n5'
+
+        skiprows = lambda x: x % 2 == 0
+        expected = DataFrame({'1': [3, 5]})
+        df = self.read_csv(StringIO(data), skiprows=skiprows)
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame({'foo': [3, 5]})
+        df = self.read_csv(StringIO(data), skiprows=skiprows,
+                           header=0, names=['foo'])
+        tm.assert_frame_equal(df, expected)
+
+        skiprows = lambda x: True
+        msg = "No columns to parse from file"
+        with tm.assert_raises_regex(EmptyDataError, msg):
+            self.read_csv(StringIO(data), skiprows=skiprows)
+
+        # This is a bad callable and should raise.
+        msg = "by zero"
+        skiprows = lambda x: 1 / 0
+        with tm.assert_raises_regex(ZeroDivisionError, msg):
+            self.read_csv(StringIO(data), skiprows=skiprows)
@@ -0,0 +1,200 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests the parsers' ability to read and parse non-local files,
+which hence require a network connection to be read.
+"""
+import logging
+
+import pytest
+import numpy as np
+
+import pandas.util.testing as tm
+import pandas.util._test_decorators as td
+from pandas import DataFrame
+from pandas.io.parsers import read_csv, read_table
+from pandas.compat import BytesIO, StringIO
+
+
+@pytest.mark.network
+@pytest.mark.parametrize(
+    "compress_type, extension", [
+        ('gzip', '.gz'), ('bz2', '.bz2'), ('zip', '.zip'),
+        pytest.param('xz', '.xz', marks=td.skip_if_no_lzma)
+    ]
+)
+@pytest.mark.parametrize('mode', ['explicit', 'infer'])
+@pytest.mark.parametrize('engine', ['python', 'c'])
+def test_compressed_urls(salaries_table, compress_type, extension, mode,
+                         engine):
+    check_compressed_urls(salaries_table, compress_type, extension, mode,
+                          engine)
+
+
+@tm.network
+def check_compressed_urls(salaries_table, compression, extension, mode,
+                          engine):
+    # test reading compressed urls with various engines and
+    # extension inference
+    base_url = ('https://github.com/pandas-dev/pandas/raw/master/'
+                'pandas/tests/io/parser/data/salaries.csv')
+
+    url = base_url + extension
+
+    if mode != 'explicit':
+        compression = mode
+
+    url_table = read_table(url, compression=compression, engine=engine)
+    tm.assert_frame_equal(url_table, salaries_table)
+
+
+@pytest.fixture
+def tips_df(datapath):
+    """DataFrame with the tips dataset."""
+    return read_csv(datapath('io', 'parser', 'data', 'tips.csv'))
+
+
+@pytest.mark.usefixtures("s3_resource")
+class TestS3(object):
+
+    def test_parse_public_s3_bucket(self, tips_df):
+        pytest.importorskip('s3fs')
+        # more of an integration test due to the not-public contents portion
+        # can probably mock this though.
+        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+            df = read_csv('s3://pandas-test/tips.csv' +
+                          ext, compression=comp)
+            assert isinstance(df, DataFrame)
+            assert not df.empty
+            tm.assert_frame_equal(df, tips_df)
+
+        # Read public file from bucket with not-public contents
+        df = read_csv('s3://cant_get_it/tips.csv')
+        assert isinstance(df, DataFrame)
+        assert not df.empty
+        tm.assert_frame_equal(df, tips_df)
+
+    def test_parse_public_s3n_bucket(self, tips_df):
+
+        # Read from AWS s3 as "s3n" URL
+        df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
+        assert isinstance(df, DataFrame)
+        assert not df.empty
+        tm.assert_frame_equal(tips_df.iloc[:10], df)
+
+    def test_parse_public_s3a_bucket(self, tips_df):
+        # Read from AWS s3 as "s3a" URL
+        df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
+        assert isinstance(df, DataFrame)
+        assert not df.empty
+        tm.assert_frame_equal(tips_df.iloc[:10], df)
+
+    def test_parse_public_s3_bucket_nrows(self, tips_df):
+        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+            df = read_csv('s3://pandas-test/tips.csv' +
+                          ext, nrows=10, compression=comp)
+            assert isinstance(df, DataFrame)
+            assert not df.empty
+            tm.assert_frame_equal(tips_df.iloc[:10], df)
+
+    def test_parse_public_s3_bucket_chunked(self, tips_df):
+        # Read with a chunksize
+        chunksize = 5
+        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
+                                 chunksize=chunksize, compression=comp)
+            assert df_reader.chunksize == chunksize
+            for i_chunk in [0, 1, 2]:
+                # Read a couple of chunks and make sure we see them
+                # properly.
+                df = df_reader.get_chunk()
+                assert isinstance(df, DataFrame)
+                assert not df.empty
+                true_df = tips_df.iloc[
+                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
+                tm.assert_frame_equal(true_df, df)
+
+    def test_parse_public_s3_bucket_chunked_python(self, tips_df):
+        # Read with a chunksize using the Python parser
+        chunksize = 5
+        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
+                                 chunksize=chunksize, compression=comp,
+                                 engine='python')
+            assert df_reader.chunksize == chunksize
+            for i_chunk in [0, 1, 2]:
+                # Read a couple of chunks and make sure we see them properly.
+                df = df_reader.get_chunk()
+                assert isinstance(df, DataFrame)
+                assert not df.empty
+                true_df = tips_df.iloc[
+                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
+                tm.assert_frame_equal(true_df, df)
+
+    def test_parse_public_s3_bucket_python(self, tips_df):
+        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
+                          compression=comp)
+            assert isinstance(df, DataFrame)
+            assert not df.empty
+            tm.assert_frame_equal(df, tips_df)
+
+    def test_infer_s3_compression(self, tips_df):
+        for ext in ['', '.gz', '.bz2']:
+            df = read_csv('s3://pandas-test/tips.csv' + ext,
+                          engine='python', compression='infer')
+            assert isinstance(df, DataFrame)
+            assert not df.empty
+            tm.assert_frame_equal(df, tips_df)
+
+    def test_parse_public_s3_bucket_nrows_python(self, tips_df):
+        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
+                          nrows=10, compression=comp)
+            assert isinstance(df, DataFrame)
+            assert not df.empty
+            tm.assert_frame_equal(tips_df.iloc[:10], df)
+
+    def test_s3_fails(self):
+        with pytest.raises(IOError):
+            read_csv('s3://nyqpug/asdf.csv')
+
+        # Receive a permission error when trying to read a private bucket.
+        # It's irrelevant here that this isn't actually a table.
+        with pytest.raises(IOError):
+            read_csv('s3://cant_get_it/')
+
+    def test_read_csv_handles_boto_s3_object(self,
+                                             s3_resource,
+                                             tips_file):
+        # see gh-16135
+
+        s3_object = s3_resource.meta.client.get_object(
+            Bucket='pandas-test',
+            Key='tips.csv')
+
+        result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8')
+        assert isinstance(result, DataFrame)
+        assert not result.empty
+
+        expected = read_csv(tips_file)
+        tm.assert_frame_equal(result, expected)
+
+    def test_read_csv_chunked_download(self, s3_resource, caplog):
+        # 8 MB, S3FS uses 5MB chunks
+        df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
+        str_buf = StringIO()
+
+        df.to_csv(str_buf)
+
+        # upload the CSV text as UTF-8 encoded bytes
+        buf = BytesIO(str_buf.getvalue().encode('utf-8'))
+
+        s3_resource.Bucket("pandas-test").put_object(
+            Key="large-file.csv",
+            Body=buf)
+
+        with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
+            read_csv("s3://pandas-test/large-file.csv", nrows=5)
+            # log of fetch_range (start, stop)
+            assert ((0, 5505024) in set(x.args[-2:] for x in caplog.records))
@@ -0,0 +1,154 @@
+# -*- coding: utf-8 -*-
+
+import os
+import pytest
+import pandas.util.testing as tm
+
+from pandas import read_csv, read_table, DataFrame
+import pandas.core.common as com
+from pandas._libs.tslib import Timestamp
+from pandas.compat import StringIO
+
+from .common import ParserTests
+from .header import HeaderTests
+from .comment import CommentTests
+from .dialect import DialectTests
+from .quoting import QuotingTests
+from .usecols import UsecolsTests
+from .skiprows import SkipRowsTests
+from .index_col import IndexColTests
+from .na_values import NAvaluesTests
+from .converters import ConverterTests
+from .c_parser_only import CParserTests
+from .parse_dates import ParseDatesTests
+from .compression import CompressionTests
+from .mangle_dupes import DupeColumnTests
+from .multithread import MultithreadTests
+from .python_parser_only import PythonParserTests
+from .dtypes import DtypeTests
+
+
+class BaseParser(CommentTests, CompressionTests,
+                 ConverterTests, DialectTests,
+                 DtypeTests, DupeColumnTests,
+                 HeaderTests, IndexColTests,
+                 MultithreadTests, NAvaluesTests,
+                 ParseDatesTests, ParserTests,
+                 SkipRowsTests, UsecolsTests,
+                 QuotingTests):
+
+    def read_csv(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def read_table(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def float_precision_choices(self):
+        raise com.AbstractMethodError(self)
+
+    @pytest.fixture(autouse=True)
+    def setup_method(self, datapath):
+        self.dirpath = datapath('io', 'parser', 'data')
+        self.csv1 = os.path.join(self.dirpath, 'test1.csv')
+        self.csv2 = os.path.join(self.dirpath, 'test2.csv')
+        self.xls1 = os.path.join(self.dirpath, 'test.xls')
+        self.csv_shiftjs = os.path.join(self.dirpath, 'sauron.SHIFT_JIS.csv')
+
+
+class TestCParserHighMemory(BaseParser, CParserTests):
+    engine = 'c'
+    low_memory = False
+    float_precision_choices = [None, 'high', 'round_trip']
+
+    def read_csv(self, *args, **kwds):
+        # force this class's engine settings onto a copy of the kwargs
+        options = dict(kwds, engine=self.engine,
+                       low_memory=self.low_memory)
+        return read_csv(*args, **options)
+
+    def read_table(self, *args, **kwds):
+        # force this class's engine settings onto a copy of the kwargs
+        options = dict(kwds, engine=self.engine,
+                       low_memory=self.low_memory)
+        return read_table(*args, **options)
+
+
+class TestCParserLowMemory(BaseParser, CParserTests):
+    engine = 'c'  # forced onto every read_csv / read_table call below
+    low_memory = True  # forced onto every read_csv / read_table call below
+    float_precision_choices = [None, 'high', 'round_trip']
+
+    def read_csv(self, *args, **kwds):
+        kwds = kwds.copy()
+        kwds['engine'] = self.engine
+        kwds['low_memory'] = self.low_memory
+        return read_csv(*args, **kwds)
+
+    def read_table(self, *args, **kwds):
+        kwds = kwds.copy()
+        kwds['engine'] = self.engine
+        kwds['low_memory'] = self.low_memory  # same source as read_csv
+        return read_table(*args, **kwds)
+
+
+class TestPythonParser(BaseParser, PythonParserTests):
+    engine = 'python'
+    float_precision_choices = [None]
+
+    def read_csv(self, *args, **kwds):
+        # always parse with this class's engine, whatever the caller asked
+        options = dict(kwds, engine=self.engine)
+        return read_csv(*args, **options)
+
+    def read_table(self, *args, **kwds):
+        # always parse with this class's engine, whatever the caller asked
+        options = dict(kwds, engine=self.engine)
+        return read_table(*args, **options)
+
+
+class TestUnsortedUsecols(object):
+    def test_override__set_noconvert_columns(self):
+        # GH 17351 - usecols needs to be sorted in _set_noconvert_columns
+        # based on the test_usecols_with_parse_dates test from usecols.py
+        from pandas.io.parsers import CParserWrapper, TextFileReader
+
+        s = """a,b,c,d,e
+        0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+
+        parse_dates = [[1, 2]]
+        cols = {
+            'a': [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        class MyTextFileReader(TextFileReader):
+            def __init__(self):
+                self._currow = 0
+                self.squeeze = False
+
+        class MyCParserWrapper(CParserWrapper):
+            def _set_noconvert_columns(self):
+                if self.usecols_dtype == 'integer':
+                    # self.usecols is a set, which is documented as unordered
+                    # but in practice, a CPython set of integers is sorted.
+                    # In other implementations this assumption does not hold.
+                    # The following code simulates a different order, which
+                    # before GH 17351 would cause the wrong columns to be
+                    # converted via the parse_dates parameter
+                    self.usecols = list(self.usecols)
+                    self.usecols.reverse()
+                return CParserWrapper._set_noconvert_columns(self)
+
+        parser = MyTextFileReader()
+        parser.options = {'usecols': [0, 2, 3],
+                          'parse_dates': parse_dates,
+                          'delimiter': ','}
+        parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
+        df = parser.read()
+
+        tm.assert_frame_equal(df, expected)
@@ -0,0 +1,436 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests the 'read_fwf' function in parsers.py. This
+test suite is independent of the others because the
+engine is set to 'python-fwf' internally.
+"""
+
+from datetime import datetime
+
+import pytest
+import numpy as np
+import pandas as pd
+import pandas.util.testing as tm
+
+from pandas import DataFrame
+from pandas import compat
+from pandas.compat import StringIO, BytesIO
+from pandas.io.parsers import read_csv, read_fwf, EmptyDataError
+
+
+class TestFwfParsing(object):
+
+    def test_fwf(self):
+        data_expected = """\
+2011,58,360.242940,149.910199,11950.7
+2011,59,444.953632,166.985655,11788.4
+2011,60,364.136849,183.628767,11806.2
+2011,61,413.836124,184.375703,11916.8
+2011,62,502.953953,173.237159,12468.3
+"""
+        expected = read_csv(StringIO(data_expected),
+                            engine='python', header=None)
+
+        data1 = """\
+201158    360.242940   149.910199   11950.7
+201159    444.953632   166.985655   11788.4
+201160    364.136849   183.628767   11806.2
+201161    413.836124   184.375703   11916.8
+201162    502.953953   173.237159   12468.3
+"""
+        colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
+        df = read_fwf(StringIO(data1), colspecs=colspecs, header=None)
+        tm.assert_frame_equal(df, expected)
+
+        data2 = """\
+2011 58   360.242940   149.910199   11950.7
+2011 59   444.953632   166.985655   11788.4
+2011 60   364.136849   183.628767   11806.2
+2011 61   413.836124   184.375703   11916.8
+2011 62   502.953953   173.237159   12468.3
+"""
+        df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None)
+        tm.assert_frame_equal(df, expected)
+
+        # From Thomas Kluyver: apparently some non-space filler characters can
+        # be seen, this is supported by specifying the 'delimiter' character:
+        # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
+        data3 = """\
+201158~~~~360.242940~~~149.910199~~~11950.7
+201159~~~~444.953632~~~166.985655~~~11788.4
+201160~~~~364.136849~~~183.628767~~~11806.2
+201161~~~~413.836124~~~184.375703~~~11916.8
+201162~~~~502.953953~~~173.237159~~~12468.3
+"""
+        df = read_fwf(
+            StringIO(data3), colspecs=colspecs, delimiter='~', header=None)
+        tm.assert_frame_equal(df, expected)
+
+        with tm.assert_raises_regex(ValueError,
+                                    "must specify only one of"):
+            read_fwf(StringIO(data3), colspecs=colspecs, widths=[6, 10, 10, 7])
+
+        with tm.assert_raises_regex(ValueError, "Must specify either"):
+            read_fwf(StringIO(data3), colspecs=None, widths=None)
+
+    def test_BytesIO_input(self):
+        if not compat.PY3:
+            pytest.skip(
+                "Bytes-related test - only needs to work on Python 3")
+
+        result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[
+            2, 2], encoding='utf8')
+        expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
+        tm.assert_frame_equal(result, expected)
+
+    def test_fwf_colspecs_is_list_or_tuple(self):
+        data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+
+        with tm.assert_raises_regex(TypeError,
+                                    'column specifications must '
+                                    'be a list or tuple.+'):
+            pd.io.parsers.FixedWidthReader(StringIO(data),
+                                           {'a': 1}, ',', '#')
+
+    def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self):
+        data = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+
+        with tm.assert_raises_regex(TypeError,
+                                    'Each column specification '
+                                    'must be.+'):
+            read_fwf(StringIO(data), [('a', 1)])
+
+    def test_fwf_colspecs_None(self):
+        # GH 7079
+        data = """\
+123456
+456789
+"""
+        colspecs = [(0, 3), (3, None)]
+        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
+        expected = DataFrame([[123, 456], [456, 789]])
+        tm.assert_frame_equal(result, expected)
+
+        colspecs = [(None, 3), (3, 6)]
+        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
+        expected = DataFrame([[123, 456], [456, 789]])
+        tm.assert_frame_equal(result, expected)
+
+        colspecs = [(0, None), (3, None)]
+        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
+        expected = DataFrame([[123456, 456], [456789, 789]])
+        tm.assert_frame_equal(result, expected)
+
+        colspecs = [(None, None), (3, 6)]
+        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
+        expected = DataFrame([[123456, 456], [456789, 789]])
+        tm.assert_frame_equal(result, expected)
+
+    def test_fwf_regression(self):
+        # GH 3594
+        # turns out 'T060' is parsable as a datetime slice!
+
+        tzlist = [1, 10, 20, 30, 60, 80, 100]
+        ntz = len(tzlist)
+        tcolspecs = [16] + [8] * ntz
+        tcolnames = ['SST'] + ["T%03d" % z for z in tzlist[1:]]
+        data = """  2009164202000   9.5403  9.4105  8.6571  7.8372  6.0612  5.8843  5.5192
+  2009164203000   9.5435  9.2010  8.6167  7.8176  6.0804  5.8728  5.4869
+  2009164204000   9.5873  9.1326  8.4694  7.5889  6.0422  5.8526  5.4657
+  2009164205000   9.5810  9.0896  8.4009  7.4652  6.0322  5.8189  5.4379
+  2009164210000   9.6034  9.0897  8.3822  7.4905  6.0908  5.7904  5.4039
+"""
+
+        df = read_fwf(StringIO(data),
+                      index_col=0,
+                      header=None,
+                      names=tcolnames,
+                      widths=tcolspecs,
+                      parse_dates=True,
+                      date_parser=lambda s: datetime.strptime(s, '%Y%j%H%M%S'))
+
+        for c in df.columns:
+            res = df.loc[:, c]
+            assert len(res)
+
+    def test_fwf_for_uint8(self):
+        data = """1421302965.213420    PRI=3 PGN=0xef00      DST=0x17 SRC=0x28    04 154 00 00 00 00 00 127
+1421302964.226776    PRI=6 PGN=0xf002               SRC=0x47    243 00 00 255 247 00 00 71"""  # noqa
+        df = read_fwf(StringIO(data),
+                      colspecs=[(0, 17), (25, 26), (33, 37),
+                                (49, 51), (58, 62), (63, 1000)],
+                      names=['time', 'pri', 'pgn', 'dst', 'src', 'data'],
+                      converters={
+                          'pgn': lambda x: int(x, 16),
+                          'src': lambda x: int(x, 16),
+                          'dst': lambda x: int(x, 16),
+                          'data': lambda x: len(x.split(' '))})
+
+        expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8],
+                              [1421302964.226776, 6, 61442, None, 71, 8]],
+                             columns=["time", "pri", "pgn",
+                                      "dst", "src", "data"])
+        expected["dst"] = expected["dst"].astype(object)
+
+        tm.assert_frame_equal(df, expected)
+
+    def test_fwf_compression(self):
+        try:
+            import gzip
+            import bz2
+        except ImportError:
+            pytest.skip("Need gzip and bz2 to run this test")
+
+        data = """1111111111
+        2222222222
+        3333333333""".strip()
+        widths = [5, 5]
+        names = ['one', 'two']
+        expected = read_fwf(StringIO(data), widths=widths, names=names)
+        if compat.PY3:
+            data = bytes(data, encoding='utf-8')
+        comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)]
+        for comp_name, compresser in comps:
+            with tm.ensure_clean() as path:
+                # close (and flush) the archive even if the write fails
+                with compresser(path, mode='wb') as tmp:
+                    tmp.write(data)
+                result = read_fwf(path, widths=widths, names=names,
+                                  compression=comp_name)
+                tm.assert_frame_equal(result, expected)
+
+    def test_comment_fwf(self):
+        data = """
+  1   2.   4  #hello world
+  5  NaN  10.0
+"""
+        expected = np.array([[1, 2., 4],
+                             [5, np.nan, 10.]])
+        df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)],
+                      comment='#')
+        tm.assert_almost_equal(df.values, expected)
+
+    def test_1000_fwf(self):
+        data = """
+ 1 2,334.0    5
+10   13     10.
+"""
+        expected = np.array([[1, 2334., 5],
+                             [10, 13, 10]])
+        df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)],
+                      thousands=',')
+        tm.assert_almost_equal(df.values, expected)
+
+    def test_bool_header_arg(self):
+        # see gh-6114
+        data = """\
+MyColumn
+   a
+   b
+   a
+   b"""
+        for arg in [True, False]:
+            with pytest.raises(TypeError):
+                read_fwf(StringIO(data), header=arg)
+
+    def test_full_file(self):
+        # File with all values
+        test = """index                             A    B    C
+2000-01-03T00:00:00  0.980268513777    3  foo
+2000-01-04T00:00:00  1.04791624281    -4  bar
+2000-01-05T00:00:00  0.498580885705   73  baz
+2000-01-06T00:00:00  1.12020151869     1  foo
+2000-01-07T00:00:00  0.487094399463    0  bar
+2000-01-10T00:00:00  0.836648671666    2  baz
+2000-01-11T00:00:00  0.157160753327   34  foo"""
+        colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_full_file_with_missing(self):
+        # File with missing values
+        test = """index                             A    B    C
+2000-01-03T00:00:00  0.980268513777    3  foo
+2000-01-04T00:00:00  1.04791624281    -4  bar
+                     0.498580885705   73  baz
+2000-01-06T00:00:00  1.12020151869     1  foo
+2000-01-07T00:00:00                    0  bar
+2000-01-10T00:00:00  0.836648671666    2  baz
+                                      34"""
+        colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_full_file_with_spaces(self):
+        # File with spaces in columns
+        test = """
+Account                 Name  Balance     CreditLimit   AccountCreated
+101     Keanu Reeves          9315.45     10000.00           1/17/1998
+312     Gerard Butler         90.00       1000.00             8/6/2003
+868     Jennifer Love Hewitt  0           17000.00           5/25/1985
+761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
+317     Bill Murray           789.65      5000.00             2/5/2007
+""".strip('\r\n')
+        colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_full_file_with_spaces_and_missing(self):
+        # File with spaces and missing values in columns
+        test = """
+Account               Name    Balance     CreditLimit   AccountCreated
+101                           10000.00                       1/17/1998
+312     Gerard Butler         90.00       1000.00             8/6/2003
+868                                                          5/25/1985
+761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
+317     Bill Murray           789.65
+""".strip('\r\n')
+        colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_messed_up_data(self):
+        # Completely messed up file
+        test = """
+   Account          Name             Balance     Credit Limit   Account Created
+       101                           10000.00                       1/17/1998
+       312     Gerard Butler         90.00       1000.00
+
+       761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
+  317          Bill Murray           789.65
+""".strip('\r\n')
+        colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_multiple_delimiters(self):
+        test = r"""
+col1~~~~~col2  col3++++++++++++++++++col4
+~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
+  33+++122.33\\\bar.........Gerard Butler
++44~~~~12.01   baz~~Jennifer Love Hewitt
+~~55       11+++foo++++Jada Pinkett-Smith
+..66++++++.03~~~bar           Bill Murray
+""".strip('\r\n')
+        colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
+        expected = read_fwf(StringIO(test), colspecs=colspecs,
+                            delimiter=' +~.\\')
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test),
+                                                 delimiter=' +~.\\'))
+
+    def test_variable_width_unicode(self):
+        if not compat.PY3:
+            pytest.skip(
+                'Bytes-related test - only needs to work on Python 3')
+        test = """
+שלום שלום
+ום   שלל
+של   ום
+""".strip('\r\n')
+        expected = read_fwf(BytesIO(test.encode('utf8')),
+                            colspecs=[(0, 4), (5, 9)],
+                            header=None, encoding='utf8')
+        tm.assert_frame_equal(expected, read_fwf(
+            BytesIO(test.encode('utf8')), header=None, encoding='utf8'))
+
+    def test_dtype(self):
+        # Parses the same fixed-width text twice: first without a ``dtype``
+        # argument, then with an explicit per-column ``dtype`` mapping.
+        data = """ a    b    c
+1    2    3.2
+3    4    5.2
+"""
+        # The last column spec is open-ended (its end is None).
+        colspecs = [(0, 5), (5, 10), (10, None)]
+        result = pd.read_fwf(StringIO(data), colspecs=colspecs)
+        expected = pd.DataFrame({
+            'a': [1, 3],
+            'b': [2, 4],
+            'c': [3.2, 5.2]}, columns=['a', 'b', 'c'])
+        tm.assert_frame_equal(result, expected)
+
+        # Cast the expected frame column by column to the same dtypes that
+        # are then requested from read_fwf.
+        expected['a'] = expected['a'].astype('float64')
+        expected['b'] = expected['b'].astype(str)
+        expected['c'] = expected['c'].astype('int32')
+        result = pd.read_fwf(StringIO(data), colspecs=colspecs,
+                             dtype={'a': 'float64', 'b': str, 'c': 'int32'})
+        tm.assert_frame_equal(result, expected)
+
+    def test_skiprows_inference(self):
+        # GH11256
+        test = """
+Text contained in the file header
+
+DataCol1   DataCol2
+     0.0        1.0
+   101.6      956.1
+""".strip()
+        expected = read_csv(StringIO(test), skiprows=2,
+                            delim_whitespace=True)
+        tm.assert_frame_equal(expected, read_fwf(
+            StringIO(test), skiprows=2))
+
+    def test_skiprows_by_index_inference(self):
+        test = """
+To be skipped
+Not  To  Be  Skipped
+Once more to be skipped
+123  34   8      123
+456  78   9      456
+""".strip()
+
+        expected = read_csv(StringIO(test), skiprows=[0, 2],
+                            delim_whitespace=True)
+        tm.assert_frame_equal(expected, read_fwf(
+            StringIO(test), skiprows=[0, 2]))
+
+    def test_skiprows_inference_empty(self):
+        test = """
+AA   BBB  C
+12   345  6
+78   901  2
+""".strip()
+
+        with pytest.raises(EmptyDataError):
+            read_fwf(StringIO(test), skiprows=3)
+
+    def test_whitespace_preservation(self):
+        # Addresses Issue #16772
+        data_expected = """
+ a ,bbb
+ cc,dd """
+        expected = read_csv(StringIO(data_expected), header=None)
+
+        test_data = """
+ a bbb
+ ccdd """
+        result = read_fwf(StringIO(test_data), widths=[3, 3],
+                          header=None, skiprows=[0], delimiter="\n\t")
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_default_delimiter(self):
+        data_expected = """
+a,bbb
+cc,dd"""
+        expected = read_csv(StringIO(data_expected), header=None)
+
+        test_data = """
+a \tbbb
+cc\tdd """
+        result = read_fwf(StringIO(test_data), widths=[3, 3],
+                          header=None, skiprows=[0])
+
+        tm.assert_frame_equal(result, expected)
@@ -0,0 +1,354 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests the TextReader class in parsers.pyx, which
+is integral to the C engine in parsers.py
+"""
+
+import pytest
+
+from pandas.compat import StringIO, BytesIO, map
+from pandas import compat
+
+import os
+import sys
+
+from numpy import nan
+import numpy as np
+
+from pandas import DataFrame
+from pandas.io.parsers import (read_csv, TextFileReader)
+from pandas.util.testing import assert_frame_equal
+
+import pandas.util.testing as tm
+
+from pandas._libs.parsers import TextReader
+import pandas._libs.parsers as parser
+
+
+class TestTextReader(object):
+
+    @pytest.fixture(autouse=True)
+    def setup_method(self, datapath):
+        # Autouse fixture: resolves the parser data directory through the
+        # ``datapath`` fixture and stores the derived file paths on the
+        # instance.
+        self.dirpath = datapath('io', 'parser', 'data')
+        self.csv1 = os.path.join(self.dirpath, 'test1.csv')
+        self.csv2 = os.path.join(self.dirpath, 'test2.csv')
+        self.xls1 = os.path.join(self.dirpath, 'test.xls')
+
+    def test_file_handle(self):
+        with open(self.csv1, 'rb') as f:
+            reader = TextReader(f)
+            reader.read()
+
+    def test_string_filename(self):
+        reader = TextReader(self.csv1, header=None)
+        reader.read()
+
+    def test_file_handle_mmap(self):
+        with open(self.csv1, 'rb') as f:
+            reader = TextReader(f, memory_map=True, header=None)
+            reader.read()
+
+    def test_StringIO(self):
+        with open(self.csv1, 'rb') as f:
+            text = f.read()
+        src = BytesIO(text)
+        reader = TextReader(src, header=None)
+        reader.read()
+
+    def test_string_factorize(self):
+        # should this be optional?
+        # Five values but only two distinct strings ('a' and 'b').
+        data = 'a\nb\na\nb\na'
+        reader = TextReader(StringIO(data), header=None)
+        result = reader.read()
+        # Counts distinct object identities in the parsed column, so this
+        # only passes when equal strings share a single object.
+        assert len(set(map(id, result[0]))) == 2
+
+    def test_skipinitialspace(self):
+        data = ('a,   b\n'
+                'a,   b\n'
+                'a,   b\n'
+                'a,   b')
+
+        reader = TextReader(StringIO(data), skipinitialspace=True,
+                            header=None)
+        result = reader.read()
+
+        tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a', 'a'],
+                                                        dtype=np.object_))
+        tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b', 'b'],
+                                                        dtype=np.object_))
+
+    def test_parse_booleans(self):
+        data = 'True\nFalse\nTrue\nTrue'
+
+        reader = TextReader(StringIO(data), header=None)
+        result = reader.read()
+
+        assert result[0].dtype == np.bool_
+
+    def test_delimit_whitespace(self):
+        data = 'a  b\na\t\t "b"\n"a"\t \t b'
+
+        reader = TextReader(StringIO(data), delim_whitespace=True,
+                            header=None)
+        result = reader.read()
+
+        tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a'],
+                                                        dtype=np.object_))
+        tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b'],
+                                                        dtype=np.object_))
+
+    def test_embedded_newline(self):
+        data = 'a\n"hello\nthere"\nthis'
+
+        reader = TextReader(StringIO(data), header=None)
+        result = reader.read()
+
+        expected = np.array(['a', 'hello\nthere', 'this'], dtype=np.object_)
+        tm.assert_numpy_array_equal(result[0], expected)
+
+    def test_euro_decimal(self):
+        # ',' acts as the decimal mark here. The delimiter is ':' (absent
+        # from the data), so each line is read as a single field.
+        data = '12345,67\n345,678'
+
+        reader = TextReader(StringIO(data), delimiter=':',
+                            decimal=',', header=None)
+        result = reader.read()
+
+        expected = np.array([12345.67, 345.678])
+        tm.assert_almost_equal(result[0], expected)
+
+    def test_integer_thousands(self):
+        data = '123,456\n12,500'
+
+        reader = TextReader(StringIO(data), delimiter=':',
+                            thousands=',', header=None)
+        result = reader.read()
+
+        expected = np.array([123456, 12500], dtype=np.int64)
+        tm.assert_almost_equal(result[0], expected)
+
+    def test_integer_thousands_alt(self):
+        data = '123.456\n12.500'
+
+        reader = TextFileReader(StringIO(data), delimiter=':',
+                                thousands='.', header=None)
+        result = reader.read()
+
+        expected = DataFrame([123456, 12500])
+        tm.assert_frame_equal(result, expected)
+
+    @tm.capture_stderr
+    def test_skip_bad_lines(self):
+        # too many lines, see #2430 for why
+        # The 4th and 6th lines below carry four fields; every other line
+        # carries three.
+        data = ('a:b:c\n'
+                'd:e:f\n'
+                'g:h:i\n'
+                'j:k:l:m\n'
+                'l:m:n\n'
+                'o:p:q:r')
+
+        # Without the bad-line options, reading is expected to raise
+        # ParserError.
+        reader = TextReader(StringIO(data), delimiter=':',
+                            header=None)
+        pytest.raises(parser.ParserError, reader.read)
+
+        # error_bad_lines=False: the expected arrays hold only the four
+        # three-field lines.
+        reader = TextReader(StringIO(data), delimiter=':',
+                            header=None,
+                            error_bad_lines=False,
+                            warn_bad_lines=False)
+        result = reader.read()
+        expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object),
+                    1: np.array(['b', 'e', 'h', 'm'], dtype=object),
+                    2: np.array(['c', 'f', 'i', 'n'], dtype=object)}
+        assert_array_dicts_equal(result, expected)
+
+        # warn_bad_lines=True: the skipped line numbers are expected in the
+        # text read back from sys.stderr.
+        reader = TextReader(StringIO(data), delimiter=':',
+                            header=None,
+                            error_bad_lines=False,
+                            warn_bad_lines=True)
+        reader.read()
+        val = sys.stderr.getvalue()
+
+        assert 'Skipping line 4' in val
+        assert 'Skipping line 6' in val
+
+    def test_header_not_enough_lines(self):
+        # The first two lines are filler; header=2 points at the third
+        # line ('a,b,c').
+        data = ('skip this\n'
+                'skip this\n'
+                'a,b,c\n'
+                '1,2,3\n'
+                '4,5,6')
+
+        reader = TextReader(StringIO(data), delimiter=',', header=2)
+        # The header is compared against a list holding one row of names.
+        header = reader.header
+        expected = [['a', 'b', 'c']]
+        assert header == expected
+
+        # Only the two lines after the header are expected as data.
+        recs = reader.read()
+        expected = {0: np.array([1, 4], dtype=np.int64),
+                    1: np.array([2, 5], dtype=np.int64),
+                    2: np.array([3, 6], dtype=np.int64)}
+        assert_array_dicts_equal(recs, expected)
+
+    def test_escapechar(self):
+        # After Python processes the literals, each line reads
+        # \"hello world" -- the backslash precedes only the opening quote.
+        data = ('\\"hello world\"\n'
+                '\\"hello world\"\n'
+                '\\"hello world\"')
+
+        reader = TextReader(StringIO(data), delimiter=',', header=None,
+                            escapechar='\\')
+        result = reader.read()
+        # Expected values: the backslash is gone and both quotes are kept.
+        expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
+        assert_array_dicts_equal(result, expected)
+
+    def test_eof_has_eol(self):
+        # handling of new line at EOF
+        # NOTE(review): placeholder -- the body is empty, so nothing is
+        # verified by this test yet.
+        pass
+
+    def test_na_substitution(self):
+        # NOTE(review): placeholder -- the body is empty, so nothing is
+        # verified by this test yet.
+        pass
+
+    def test_numpy_string_dtype(self):
+        data = """\
+a,1
+aa,2
+aaa,3
+aaaa,4
+aaaaa,5"""
+
+        def _make_reader(**kwds):
+            return TextReader(StringIO(data), delimiter=',', header=None,
+                              **kwds)
+
+        reader = _make_reader(dtype='S5,i4')
+        result = reader.read()
+
+        assert result[0].dtype == 'S5'
+
+        ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], dtype='S5')
+        assert (result[0] == ex_values).all()
+        assert result[1].dtype == 'i4'
+
+        reader = _make_reader(dtype='S4')
+        result = reader.read()
+        assert result[0].dtype == 'S4'
+        ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
+        assert (result[0] == ex_values).all()
+        assert result[1].dtype == 'S4'
+
+    def test_pass_dtype(self):
+        data = """\
+one,two
+1,a
+2,b
+3,c
+4,d"""
+
+        def _make_reader(**kwds):
+            return TextReader(StringIO(data), delimiter=',', **kwds)
+
+        reader = _make_reader(dtype={'one': 'u1', 1: 'S1'})
+        result = reader.read()
+        assert result[0].dtype == 'u1'
+        assert result[1].dtype == 'S1'
+
+        reader = _make_reader(dtype={'one': np.uint8, 1: object})
+        result = reader.read()
+        assert result[0].dtype == 'u1'
+        assert result[1].dtype == 'O'
+
+        reader = _make_reader(dtype={'one': np.dtype('u1'),
+                                     1: np.dtype('O')})
+        result = reader.read()
+        assert result[0].dtype == 'u1'
+        assert result[1].dtype == 'O'
+
+    def test_usecols(self):
+        data = """\
+a,b,c
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+
+        def _make_reader(**kwds):
+            return TextReader(StringIO(data), delimiter=',', **kwds)
+
+        reader = _make_reader(usecols=(1, 2))
+        result = reader.read()
+
+        exp = _make_reader().read()
+        assert len(result) == 2
+        assert (result[1] == exp[1]).all()
+        assert (result[2] == exp[2]).all()
+
+    def test_cr_delimited(self):
+        # Every sample below uses a bare '\r' between lines.
+        def _test(text, **kwargs):
+            # Parse the text as given and again with each '\r' replaced by
+            # '\r\n'; both parses must produce equal arrays.
+            nice_text = text.replace('\r', '\r\n')
+            result = TextReader(StringIO(text), **kwargs).read()
+            expected = TextReader(StringIO(nice_text), **kwargs).read()
+            assert_array_dicts_equal(result, expected)
+
+        data = 'a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12'
+        _test(data, delimiter=',')
+
+        data = 'a  b  c\r1  2  3\r4  5  6\r7  8  9\r10  11  12'
+        _test(data, delim_whitespace=True)
+
+        # A line that starts with an empty field.
+        data = 'a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12'
+        _test(data, delimiter=',')
+
+        # Wider sample whose last line starts with an empty field.
+        sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r'
+                  'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r'
+                  ',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0')
+        _test(sample, delimiter=',')
+
+        # A line that begins with whitespace.
+        data = 'A  B  C\r  2  3\r4  5  6'
+        _test(data, delim_whitespace=True)
+
+        # A line with fewer fields than the first line.
+        data = 'A B C\r2 3\r4 5 6'
+        _test(data, delim_whitespace=True)
+
+    def test_empty_field_eof(self):
+        data = 'a,b,c\n1,2,3\n4,,'
+
+        result = TextReader(StringIO(data), delimiter=',').read()
+
+        expected = {0: np.array([1, 4], dtype=np.int64),
+                    1: np.array(['2', ''], dtype=object),
+                    2: np.array(['3', ''], dtype=object)}
+        assert_array_dicts_equal(result, expected)
+
+        # GH5664
+        a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
+        b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
+                      columns=list('abcd'),
+                      index=[1, 1])
+        c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
+                       [8, 9, 10, 11], [13, 14, nan, nan]],
+                      columns=list('abcd'),
+                      index=[0, 5, 7, 12])
+
+        for _ in range(100):
+            df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
+                          names=['a'], engine='c')
+            assert_frame_equal(df, a)
+
+            df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2),
+                          names=list("abcd"), engine='c')
+            assert_frame_equal(df, b)
+
+            df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
+                          names=list('abcd'), engine='c')
+            assert_frame_equal(df, c)
+
+    def test_empty_csv_input(self):
+        # GH14867
+        df = read_csv(StringIO(), chunksize=20, header=None,
+                      names=['a', 'b', 'c'])
+        assert isinstance(df, TextFileReader)
+
+
+def assert_array_dicts_equal(left, right):
+    for k, v in compat.iteritems(left):
+        assert tm.assert_numpy_array_equal(np.asarray(v),
+                                           np.asarray(right[k]))
@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that features that are currently unsupported in
+either the Python or C parser are actually enforced
+and are clearly communicated to the user.
+
+Ultimately, the goal is to remove test cases from this
+test suite as new feature support is added to the parsers.
+"""
+
+import pandas.io.parsers as parsers
+import pandas.util.testing as tm
+
+from pandas.compat import StringIO
+from pandas.errors import ParserError
+from pandas.io.parsers import read_csv, read_table
+
+import pytest
+
+
+@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
+def python_engine(request):
+    # Parametrized fixture: returns "python" and "python-fwf" in turn, and
+    # uses the same string as the test id.
+    return request.param
+
+
+class TestUnsupportedFeatures(object):
+
+    def test_mangle_dupe_cols_false(self):
+        # see gh-12935
+        data = 'a b c\n1 2 3'
+        msg = 'is not supported'
+
+        for engine in ('c', 'python'):
+            with tm.assert_raises_regex(ValueError, msg):
+                read_csv(StringIO(data), engine=engine,
+                         mangle_dupe_cols=False)
+
+    def test_c_engine(self):
+        # see gh-6607
+        data = 'a b c\n1 2 3'
+        msg = 'does not support'
+
+        # specify C engine with unsupported options (raise)
+        with tm.assert_raises_regex(ValueError, msg):
+            read_table(StringIO(data), engine='c',
+                       sep=None, delim_whitespace=False)
+        with tm.assert_raises_regex(ValueError, msg):
+            read_table(StringIO(data), engine='c', sep=r'\s')
+        with tm.assert_raises_regex(ValueError, msg):
+            read_table(StringIO(data), engine='c', quotechar=chr(128))
+        with tm.assert_raises_regex(ValueError, msg):
+            read_table(StringIO(data), engine='c', skipfooter=1)
+
+        # specify C-unsupported options without python-unsupported options
+        with tm.assert_produces_warning(parsers.ParserWarning):
+            read_table(StringIO(data), sep=None, delim_whitespace=False)
+        with tm.assert_produces_warning(parsers.ParserWarning):
+            read_table(StringIO(data), quotechar=chr(128))
+        with tm.assert_produces_warning(parsers.ParserWarning):
+            read_table(StringIO(data), sep=r'\s')
+        with tm.assert_produces_warning(parsers.ParserWarning):
+            read_table(StringIO(data), skipfooter=1)
+
+        text = """                      A       B       C       D        E
+one two three   four
+a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
+a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
+x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""
+        msg = 'Error tokenizing data'
+
+        with tm.assert_raises_regex(ParserError, msg):
+            read_table(StringIO(text), sep='\\s+')
+        with tm.assert_raises_regex(ParserError, msg):
+            read_table(StringIO(text), engine='c', sep='\\s+')
+
+        msg = "Only length-1 thousands markers supported"
+        data = """A|B|C
+1|2,334|5
+10|13|10.
+"""
+        with tm.assert_raises_regex(ValueError, msg):
+            read_csv(StringIO(data), thousands=',,')
+        with tm.assert_raises_regex(ValueError, msg):
+            read_csv(StringIO(data), thousands='')
+
+        msg = "Only length-1 line terminators supported"
+        data = 'a,b,c~~1,2,3~~4,5,6'
+        with tm.assert_raises_regex(ValueError, msg):
+            read_csv(StringIO(data), lineterminator='~~')
+
+    def test_python_engine(self, python_engine):
+        # Every option named in ``_python_unsupported`` must be rejected
+        # with a ValueError whose message names the option and the engine.
+        from pandas.io.parsers import _python_unsupported as py_unsupported
+
+        data = """1,2,3,,
+1,2,3,4,
+1,2,3,4,5
+1,2,,,
+1,2,3,4,"""
+
+        for default in py_unsupported:
+            msg = ('The %r option is not supported '
+                   'with the %r engine' % (default, python_engine))
+
+            # A fresh object() is passed as the option's value.
+            kwargs = {default: object()}
+            with tm.assert_raises_regex(ValueError, msg):
+                read_csv(StringIO(data), engine=python_engine, **kwargs)
+
+    def test_python_engine_file_no_next(self, python_engine):
+        # see gh-16530
+        # NoNextBuffer defines __iter__ and read() but no next/__next__
+        # method; passing it to read_csv is expected to raise ValueError.
+        class NoNextBuffer(object):
+            def __init__(self, csv_data):
+                self.data = csv_data
+
+            def __iter__(self):
+                return self
+
+            def read(self):
+                return self.data
+
+        data = "a\n1"
+        msg = "The 'python' engine cannot iterate"
+
+        with tm.assert_raises_regex(ValueError, msg):
+            read_csv(NoNextBuffer(data), engine=python_engine)
+
+
+class TestDeprecatedFeatures(object):
+
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    @pytest.mark.parametrize("kwargs", [{"tupleize_cols": True},
+                                        {"tupleize_cols": False}])
+    def test_deprecated_args(self, engine, kwargs):
+        data = "1,2,3"
+        arg, _ = list(kwargs.items())[0]
+
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            read_csv(StringIO(data), engine=engine, **kwargs)
@@ -0,0 +1,549 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests the usecols functionality during parsing
+for all of the parsers defined in parsers.py
+"""
+
+import pytest
+
+import numpy as np
+import pandas.util.testing as tm
+
+from pandas import DataFrame, Index
+from pandas._libs.tslib import Timestamp
+from pandas.compat import StringIO
+
+
+class UsecolsTests(object):
+    msg_validate_usecols_arg = ("'usecols' must either be list-like of all "
+                                "strings, all unicode, all integers or a "
+                                "callable.")
+    msg_validate_usecols_names = ("Usecols do not match columns, columns "
+                                  "expected but not found: {0}")
+
+    def test_raise_on_mixed_dtype_usecols(self):
+        # See gh-12678
+        data = """a,b,c
+        1000,2000,3000
+        4000,5000,6000
+        """
+
+        usecols = [0, 'b', 2]
+
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
+            self.read_csv(StringIO(data), usecols=usecols)
+
+    def test_usecols(self):
+        data = """\
+a,b,c
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+
+        result = self.read_csv(StringIO(data), usecols=(1, 2))
+        result2 = self.read_csv(StringIO(data), usecols=('b', 'c'))
+        exp = self.read_csv(StringIO(data))
+
+        assert len(result.columns) == 2
+        assert (result['b'] == exp['b']).all()
+        assert (result['c'] == exp['c']).all()
+
+        tm.assert_frame_equal(result, result2)
+
+        result = self.read_csv(StringIO(data), usecols=[1, 2], header=0,
+                               names=['foo', 'bar'])
+        expected = self.read_csv(StringIO(data), usecols=[1, 2])
+        expected.columns = ['foo', 'bar']
+        tm.assert_frame_equal(result, expected)
+
+        data = """\
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+        result = self.read_csv(StringIO(data), names=['b', 'c'],
+                               header=None, usecols=[1, 2])
+
+        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
+                                 header=None)
+        expected = expected[['b', 'c']]
+        tm.assert_frame_equal(result, expected)
+
+        result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
+                                header=None, usecols=['b', 'c'])
+        tm.assert_frame_equal(result2, result)
+
+        # see gh-5766
+        result = self.read_csv(StringIO(data), names=['a', 'b'],
+                               header=None, usecols=[0, 1])
+
+        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
+                                 header=None)
+        expected = expected[['a', 'b']]
+        tm.assert_frame_equal(result, expected)
+
+        # length conflict, passed names and usecols disagree
+        pytest.raises(ValueError, self.read_csv, StringIO(data),
+                      names=['a', 'b'], usecols=[1], header=None)
+
+    def test_usecols_single_string(self):
+        # GH 20558
+        data = """foo, bar, baz
+        1000, 2000, 3000
+        4000, 5000, 6000
+        """
+
+        usecols = 'foo'
+
+        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
+            self.read_csv(StringIO(data), usecols=usecols)
+
+    def test_usecols_index_col_False(self):
+        # see gh-9082
+        # ``s_malformed`` differs from ``s`` only by a trailing comma on
+        # each data row; both inputs are expected to give the same frame
+        # when read with usecols and index_col=False.
+        s = "a,b,c,d\n1,2,3,4\n5,6,7,8"
+        s_malformed = "a,b,c,d\n1,2,3,4,\n5,6,7,8,"
+        cols = ['a', 'c', 'd']
+        expected = DataFrame({'a': [1, 5], 'c': [3, 7], 'd': [4, 8]})
+        df = self.read_csv(StringIO(s), usecols=cols, index_col=False)
+        tm.assert_frame_equal(expected, df)
+        df = self.read_csv(StringIO(s_malformed),
+                           usecols=cols, index_col=False)
+        tm.assert_frame_equal(expected, df)
+
+    def test_usecols_index_col_conflict(self):
+        # see gh-4201: test that index_col as integer reflects usecols
+        data = 'a,b,c,d\nA,a,1,one\nB,b,2,two'
+        expected = DataFrame({'c': [1, 2]}, index=Index(
+            ['a', 'b'], name='b'))
+
+        df = self.read_csv(StringIO(data), usecols=['b', 'c'],
+                           index_col=0)
+        tm.assert_frame_equal(expected, df)
+
+        df = self.read_csv(StringIO(data), usecols=['b', 'c'],
+                           index_col='b')
+        tm.assert_frame_equal(expected, df)
+
+        df = self.read_csv(StringIO(data), usecols=[1, 2],
+                           index_col='b')
+        tm.assert_frame_equal(expected, df)
+
+        df = self.read_csv(StringIO(data), usecols=[1, 2],
+                           index_col=0)
+        tm.assert_frame_equal(expected, df)
+
+        expected = DataFrame(
+            {'b': ['a', 'b'], 'c': [1, 2], 'd': ('one', 'two')})
+        expected = expected.set_index(['b', 'c'])
+        df = self.read_csv(StringIO(data), usecols=['b', 'c', 'd'],
+                           index_col=['b', 'c'])
+        tm.assert_frame_equal(expected, df)
+
+    def test_usecols_implicit_index_col(self):
+        # see gh-2654
+        data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
+
+        result = self.read_csv(StringIO(data), usecols=['a', 'b'])
+        expected = DataFrame({'a': ['apple', 'orange'],
+                              'b': ['bat', 'cow']}, index=[4, 8])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_usecols_regex_sep(self):
+        # see gh-2733
+        data = 'a  b  c\n4  apple  bat  5.7\n8  orange  cow  10'
+
+        df = self.read_csv(StringIO(data), sep=r'\s+', usecols=('a', 'b'))
+
+        expected = DataFrame({'a': ['apple', 'orange'],
+                              'b': ['bat', 'cow']}, index=[4, 8])
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_whitespace(self):
+        data = 'a  b  c\n4  apple  bat  5.7\n8  orange  cow  10'
+
+        result = self.read_csv(StringIO(data), delim_whitespace=True,
+                               usecols=('a', 'b'))
+        expected = DataFrame({'a': ['apple', 'orange'],
+                              'b': ['bat', 'cow']}, index=[4, 8])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_usecols_with_integer_like_header(self):
+        data = """2,0,1
+        1000,2000,3000
+        4000,5000,6000
+        """
+
+        usecols = [0, 1]  # column selection by index
+        expected = DataFrame(data=[[1000, 2000],
+                                   [4000, 5000]],
+                             columns=['2', '0'])
+        df = self.read_csv(StringIO(data), usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
+        usecols = ['0', '1']  # column selection by name
+        expected = DataFrame(data=[[2000, 3000],
+                                   [5000, 6000]],
+                             columns=['0', '1'])
+        df = self.read_csv(StringIO(data), usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_parse_dates(self):
+        # See gh-9755
+        s = """a,b,c,d,e
+        0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        parse_dates = [[1, 2]]
+
+        cols = {
+            'a': [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        df = self.read_csv(StringIO(s), usecols=[0, 2, 3],
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        df = self.read_csv(StringIO(s), usecols=[3, 0, 2],
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        # See gh-13604
+        s = """2008-02-07 09:40,1032.43
+        2008-02-07 09:50,1042.54
+        2008-02-07 10:00,1051.65
+        """
+        parse_dates = [0]
+        names = ['date', 'values']
+        usecols = names[:]
+
+        index = Index([Timestamp('2008-02-07 09:40'),
+                       Timestamp('2008-02-07 09:50'),
+                       Timestamp('2008-02-07 10:00')],
+                      name='date')
+        cols = {'values': [1032.43, 1042.54, 1051.65]}
+        expected = DataFrame(cols, index=index)
+
+        df = self.read_csv(StringIO(s), parse_dates=parse_dates, index_col=0,
+                           usecols=usecols, header=None, names=names)
+        tm.assert_frame_equal(df, expected)
+
+        # See gh-14792
+        s = """a,b,c,d,e,f,g,h,i,j
+        2016/09/21,1,1,2,3,4,5,6,7,8"""
+        parse_dates = [0]
+        usecols = list('abcdefghij')
+        cols = {'a': Timestamp('2016-09-21'),
+                'b': [1], 'c': [1], 'd': [2],
+                'e': [3], 'f': [4], 'g': [5],
+                'h': [6], 'i': [7], 'j': [8]}
+        expected = DataFrame(cols, columns=usecols)
+        df = self.read_csv(StringIO(s), usecols=usecols,
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
+        parse_dates = [[0, 1]]
+        usecols = list('abcdefghij')
+        cols = {'a_b': '2016/09/21 1',
+                'c': [1], 'd': [2], 'e': [3], 'f': [4],
+                'g': [5], 'h': [6], 'i': [7], 'j': [8]}
+        expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
+        df = self.read_csv(StringIO(s), usecols=usecols,
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_parse_dates_and_full_names(self):
+        # See gh-9755: ``names`` labels all five columns; the order in which
+        # ``usecols`` lists the selected positions must not change the result.
+        raw = """0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        date_groups = [[1, 2]]
+        all_names = list('abcde')
+
+        stamps = [Timestamp('2014-01-01 09:00:00'),
+                  Timestamp('2014-01-02 10:00:00')]
+        expected = DataFrame({'a': [0, 0], 'c_d': stamps},
+                             columns=['c_d', 'a'])
+
+        # Same selection, written in two different orders.
+        for positions in ([0, 2, 3], [3, 0, 2]):
+            result = self.read_csv(StringIO(raw), names=all_names,
+                                   usecols=positions,
+                                   parse_dates=date_groups)
+            tm.assert_frame_equal(result, expected)
+
+    def test_usecols_with_parse_dates_and_usecol_names(self):
+        # See gh-9755: here ``names`` labels only the three selected columns;
+        # the order of ``usecols`` must still not change the result.
+        raw = """0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        date_groups = [[1, 2]]
+        selected_names = list('acd')
+
+        stamps = [Timestamp('2014-01-01 09:00:00'),
+                  Timestamp('2014-01-02 10:00:00')]
+        expected = DataFrame({'a': [0, 0], 'c_d': stamps},
+                             columns=['c_d', 'a'])
+
+        # Same selection, written in two different orders.
+        for positions in ([0, 2, 3], [3, 0, 2]):
+            result = self.read_csv(StringIO(raw), names=selected_names,
+                                   usecols=positions,
+                                   parse_dates=date_groups)
+            tm.assert_frame_equal(result, expected)
+
+    def test_usecols_with_unicode_strings(self):
+        # see gh-13219: column labels passed as unicode literals
+        raw = '''AAA,BBB,CCC,DDD
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+        wanted = [u'AAA', u'BBB']
+
+        # Expected values keyed by row position.
+        floats = {0: 0.056674972999999997,
+                  1: 2.6132309819999997,
+                  2: 3.5689350380000002}
+        ints = {0: 8, 1: 2, 2: 7}
+        expected = DataFrame({'AAA': floats, 'BBB': ints})
+
+        result = self.read_csv(StringIO(raw), usecols=wanted)
+        tm.assert_frame_equal(result, expected)
+
+    def test_usecols_with_single_byte_unicode_strings(self):
+        # see gh-13219: one-character column labels passed as unicode
+        raw = '''A,B,C,D
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+        wanted = [u'A', u'B']
+
+        # Expected values keyed by row position.
+        floats = {0: 0.056674972999999997,
+                  1: 2.6132309819999997,
+                  2: 3.5689350380000002}
+        ints = {0: 8, 1: 2, 2: 7}
+        expected = DataFrame({'A': floats, 'B': ints})
+
+        result = self.read_csv(StringIO(raw), usecols=wanted)
+        tm.assert_frame_equal(result, expected)
+
+    def test_usecols_with_mixed_encoding_strings(self):
+        # Mixing unicode and byte-string labels in ``usecols`` must raise
+        # ValueError, whichever kind comes first.
+        raw = '''AAA,BBB,CCC,DDD
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+
+        for mixed in ([u'AAA', b'BBB'], [b'AAA', u'BBB']):
+            with tm.assert_raises_regex(ValueError,
+                                        self.msg_validate_usecols_arg):
+                self.read_csv(StringIO(raw), usecols=mixed)
+
+    def test_usecols_with_multibyte_characters(self):
+        # Column labels made of multibyte characters, given as plain strings.
+        raw = '''あああ,いい,ううう,ええええ
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+        wanted = ['あああ', 'いい']
+
+        # Expected values keyed by row position.
+        floats = {0: 0.056674972999999997,
+                  1: 2.6132309819999997,
+                  2: 3.5689350380000002}
+        ints = {0: 8, 1: 2, 2: 7}
+        expected = DataFrame({'あああ': floats, 'いい': ints})
+
+        result = self.read_csv(StringIO(raw), usecols=wanted)
+        tm.assert_frame_equal(result, expected)
+
+    def test_usecols_with_multibyte_unicode_characters(self):
+        # Currently skipped unconditionally; the body below documents the
+        # intended check for multibyte labels given as unicode literals.
+        pytest.skip('TODO: see gh-13253')
+
+        raw = '''あああ,いい,ううう,ええええ
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+        wanted = [u'あああ', u'いい']
+
+        # Expected values keyed by row position.
+        floats = {0: 0.056674972999999997,
+                  1: 2.6132309819999997,
+                  2: 3.5689350380000002}
+        ints = {0: 8, 1: 2, 2: 7}
+        expected = DataFrame({'あああ': floats, 'いい': ints})
+
+        result = self.read_csv(StringIO(raw), usecols=wanted)
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_usecols(self):
+        # An empty ``usecols`` must not raise; the result is an empty frame.
+        csv_text = 'a,b,c\n1,2,3\n4,5,6'
+        no_columns = set()
+        empty_frame = DataFrame()
+
+        parsed = self.read_csv(StringIO(csv_text), usecols=no_columns)
+        tm.assert_frame_equal(parsed, empty_frame)
+
+    def test_np_array_usecols(self):
+        # See gh-12546: ``usecols`` given as a NumPy array of labels
+        csv_text = 'a,b,c\n1,2,3'
+        labels = np.array(['a', 'b'])
+
+        wanted = DataFrame([[1, 2]], columns=labels)
+        parsed = self.read_csv(StringIO(csv_text), usecols=labels)
+        tm.assert_frame_equal(parsed, wanted)
+
+    def test_callable_usecols(self):
+        # See gh-14154: ``usecols`` may be a predicate over column labels
+        raw = '''AaA,bBb,CCC,ddd
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+
+        def keep_selected(label):
+            # Case-insensitive membership test on the label.
+            return label.upper() in ['AAA', 'BBB', 'DDD']
+
+        def keep_nothing(label):
+            return False
+
+        floats = {0: 0.056674972999999997,
+                  1: 2.6132309819999997,
+                  2: 3.5689350380000002}
+        expected = DataFrame({'AaA': floats,
+                              'bBb': {0: 8, 1: 2, 2: 7},
+                              'ddd': {0: 'a', 1: 'b', 2: 'a'}})
+        result = self.read_csv(StringIO(raw), usecols=keep_selected)
+        tm.assert_frame_equal(result, expected)
+
+        # A predicate that rejects every label yields an empty DataFrame.
+        expected = DataFrame()
+        result = self.read_csv(StringIO(raw), usecols=keep_nothing)
+        tm.assert_frame_equal(result, expected)
+
+    def test_incomplete_first_row(self):
+        # see gh-6710: the first data row has fewer fields than ``names``
+        csv_text = '1,2\n1,2,3'
+        labels = ['a', 'b', 'c']
+        wanted = DataFrame({'a': [1, 1],
+                            'c': [np.nan, 3]})
+
+        # The same selection as a list of labels and as a predicate.
+        selectors = (['a', 'c'], lambda x: x in ['a', 'c'])
+        for selector in selectors:
+            parsed = self.read_csv(StringIO(csv_text), names=labels,
+                                   usecols=selector)
+            tm.assert_frame_equal(parsed, wanted)
+
+    def test_uneven_length_cols(self):
+        # see gh-8985: the last row carries one field more than the others
+        positions = [0, 1, 2]
+        csv_text = '19,29,39\n' * 2 + '10,20,30,40'
+        wanted = DataFrame([[19, 29, 39],
+                            [19, 29, 39],
+                            [10, 20, 30]])
+        parsed = self.read_csv(StringIO(csv_text), header=None,
+                               usecols=positions)
+        tm.assert_frame_equal(parsed, wanted)
+
+        # see gh-9549: rows of differing lengths below a three-column header
+        labels = ['A', 'B', 'C']
+        csv_text = ('A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n'
+                    '1,2,3,,,1,\n1,2,3\n5,6,7')
+        wanted = DataFrame({'A': [1, 3, 1, 1, 1, 5],
+                            'B': [2, 4, 2, 2, 2, 6],
+                            'C': [3, 5, 4, 3, 3, 7]})
+        parsed = self.read_csv(StringIO(csv_text), usecols=labels)
+        tm.assert_frame_equal(parsed, wanted)
+
+    def test_raise_on_usecols_names_mismatch(self):
+        # GH 14671
+        data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
+        # Regex fragments substituted into the expected error message.
+        only_f = r"\['f'\]"
+        f_and_g = r"\[('f', 'g'|'g', 'f')\]"
+
+        # Every requested label is present in the header: the read succeeds.
+        df = self.read_csv(StringIO(data), usecols=['a', 'b', 'c', 'd'])
+        expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
+                              'd': [4, 8]})
+        tm.assert_frame_equal(df, expected)
+
+        # Requests that include labels absent from the header must raise.
+        header_cases = [
+            (['a', 'b', 'c', 'f'], only_f),
+            (['a', 'b', 'f'], only_f),
+            (['a', 'b', 'f', 'g'], f_and_g),
+        ]
+        for usecols, fragment in header_cases:
+            msg = self.msg_validate_usecols_names.format(fragment)
+            with tm.assert_raises_regex(ValueError, msg):
+                self.read_csv(StringIO(data), usecols=usecols)
+
+        # Replace the header row with explicit names.
+        names = ['A', 'B', 'C', 'D']
+
+        df = self.read_csv(StringIO(data), header=0, names=names)
+        expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7],
+                              'D': [4, 8]})
+        tm.assert_frame_equal(df, expected)
+
+        # TODO: https://github.com/pandas-dev/pandas/issues/16469
+        # usecols = ['A','C']
+        # df = self.read_csv(StringIO(data), header=0, names=names,
+        #                    usecols=usecols)
+        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
+        # tm.assert_frame_equal(df, expected)
+        #
+        # usecols = [0,2]
+        # df = self.read_csv(StringIO(data), header=0, names=names,
+        #                    usecols=usecols)
+        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
+        # tm.assert_frame_equal(df, expected)
+
+        # Unknown labels must also raise when explicit names are supplied,
+        # with and without header=0.
+        msg = self.msg_validate_usecols_names.format(only_f)
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), header=0, names=names,
+                          usecols=['A', 'B', 'C', 'f'])
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), names=names,
+                          usecols=['A', 'B', 'f'])
@@ -0,0 +1,16 @@
+from pandas.compat import StringIO
+from pandas import read_sas
+
+import pandas.util.testing as tm
+
+
+class TestSas(object):
+    # Argument checks of ``read_sas`` that need no data file.
+
+    def test_sas_buffer_format(self):
+        # see gh-14947: a buffer passed without ``format`` must raise
+        expected_msg = ("If this is a buffer object rather than a string "
+                        "name, you must specify a format string")
+        empty_buffer = StringIO("")
+
+        with tm.assert_raises_regex(ValueError, expected_msg):
+            read_sas(empty_buffer)
@@ -0,0 +1,190 @@
+import pandas as pd
+from pandas.compat import PY2
+import pandas.util.testing as tm
+import pandas.util._test_decorators as td
+from pandas.errors import EmptyDataError
+import os
+import io
+import numpy as np
+import pytest
+
+
+class TestSAS7BDAT(object):
+    # Compares the test{k}.sas7bdat data files with two CSV baselines.
+
+    @pytest.fixture(autouse=True)
+    def setup_method(self, datapath):
+        # Load both CSV baselines and coerce them to the dtypes that the
+        # assertions below compare against.
+        self.dirpath = datapath("io", "sas", "data")
+        self.data = []
+        # test_ix[j] lists the k of every test{k}.sas7bdat file that is
+        # compared with self.data[j].
+        self.test_ix = [list(range(1, 16)), [16]]
+        for j in 1, 2:
+            fname = os.path.join(
+                self.dirpath, "test_sas7bdat_{j}.csv".format(j=j))
+            df = pd.read_csv(fname)
+            # Column4 and Column12 are treated as day counts since
+            # 1960-01-01 and converted to timestamps.
+            epoch = pd.datetime(1960, 1, 1)
+            t1 = pd.to_timedelta(df["Column4"], unit='d')
+            df["Column4"] = epoch + t1
+            t2 = pd.to_timedelta(df["Column12"], unit='d')
+            df["Column12"] = epoch + t2
+            for k in range(df.shape[1]):
+                col = df.iloc[:, k]
+                if col.dtype == np.int64:
+                    # The baselines are compared as float64.
+                    df.iloc[:, k] = df.iloc[:, k].astype(np.float64)
+                elif col.dtype == np.dtype('O'):
+                    if PY2:
+                        # Python 2 only: turn byte strings into unicode.
+                        f = lambda x: (x.decode('utf-8') if
+                                       isinstance(x, str) else x)
+                        df.iloc[:, k] = df.iloc[:, k].apply(f)
+            self.data.append(df)
+
+    def _sas_path(self, k):
+        # Path of the test{k}.sas7bdat data file.
+        return os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))
+
+    def test_from_file(self):
+        for j in 0, 1:
+            df0 = self.data[j]
+            for k in self.test_ix[j]:
+                fname = self._sas_path(k)
+                df = pd.read_sas(fname, encoding='utf-8')
+                tm.assert_frame_equal(df, df0)
+
+    def test_from_buffer(self):
+        for j in 0, 1:
+            df0 = self.data[j]
+            for k in self.test_ix[j]:
+                fname = self._sas_path(k)
+                with open(fname, 'rb') as f:
+                    byts = f.read()
+                buf = io.BytesIO(byts)
+                rdr = pd.read_sas(buf, format="sas7bdat",
+                                  iterator=True, encoding='utf-8')
+                try:
+                    df = rdr.read()
+                    tm.assert_frame_equal(df, df0, check_exact=False)
+                finally:
+                    # Close the reader even when the comparison fails.
+                    rdr.close()
+
+    def test_from_iterator(self):
+        for j in 0, 1:
+            df0 = self.data[j]
+            for k in self.test_ix[j]:
+                fname = self._sas_path(k)
+                rdr = pd.read_sas(fname, iterator=True, encoding='utf-8')
+                try:
+                    # Two consecutive reads continue where the previous
+                    # one stopped.
+                    df = rdr.read(2)
+                    tm.assert_frame_equal(df, df0.iloc[0:2, :])
+                    df = rdr.read(3)
+                    tm.assert_frame_equal(df, df0.iloc[2:5, :])
+                finally:
+                    rdr.close()
+
+    @td.skip_if_no('pathlib')
+    def test_path_pathlib(self):
+        from pathlib import Path
+        for j in 0, 1:
+            df0 = self.data[j]
+            for k in self.test_ix[j]:
+                fname = Path(self._sas_path(k))
+                df = pd.read_sas(fname, encoding='utf-8')
+                tm.assert_frame_equal(df, df0)
+
+    @td.skip_if_no('py.path')
+    def test_path_localpath(self):
+        from py.path import local as LocalPath
+        for j in 0, 1:
+            df0 = self.data[j]
+            for k in self.test_ix[j]:
+                fname = LocalPath(self._sas_path(k))
+                df = pd.read_sas(fname, encoding='utf-8')
+                tm.assert_frame_equal(df, df0)
+
+    def test_iterator_loop(self):
+        # github #13654
+        for j in 0, 1:
+            for k in self.test_ix[j]:
+                for chunksize in 3, 5, 10, 11:
+                    fname = self._sas_path(k)
+                    # Pass the loop's chunk size through; a fixed value
+                    # would leave the other sizes untested.
+                    rdr = pd.read_sas(fname, chunksize=chunksize,
+                                      encoding='utf-8')
+                    try:
+                        y = 0
+                        for x in rdr:
+                            y += x.shape[0]
+                        # The chunks together must cover every row.
+                        assert y == rdr.row_count
+                    finally:
+                        rdr.close()
+
+    def test_iterator_read_too_much(self):
+        # github #14734
+        k = self.test_ix[0][0]
+        fname = self._sas_path(k)
+        rdr = pd.read_sas(fname, format="sas7bdat",
+                          iterator=True, encoding='utf-8')
+        try:
+            # Ask for more rows than the file holds.
+            d1 = rdr.read(rdr.row_count + 20)
+        finally:
+            rdr.close()
+
+        rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
+        try:
+            d2 = rdr.read(rdr.row_count + 20)
+            tm.assert_frame_equal(d1, d2)
+        finally:
+            rdr.close()
+
+
+def test_encoding_options(datapath):
+    # Decoding the undecoded columns by hand must give the same frame as
+    # reading with encoding='utf-8'.
+    fname = datapath("io", "sas", "data", "test1.sas7bdat")
+    undecoded = pd.read_sas(fname)
+    decoded = pd.read_sas(fname, encoding='utf-8')
+    for name in undecoded.columns:
+        try:
+            undecoded[name] = undecoded[name].str.decode('utf-8')
+        except AttributeError:
+            # Columns without a .str accessor are left as they are.
+            pass
+    tm.assert_frame_equal(undecoded, decoded)
+
+    # With convert_header_text=False the column names come back as bytes.
+    from pandas.io.sas.sas7bdat import SAS7BDATReader
+    rdr = SAS7BDATReader(fname, convert_header_text=False)
+    raw_header = rdr.read()
+    rdr.close()
+    for text_name, byte_name in zip(undecoded.columns, raw_header.columns):
+        assert text_name == byte_name.decode()
+
+
+def test_productsales(datapath):
+    # productsales.sas7bdat must match its CSV baseline once the numeric
+    # baseline columns are cast to float64.
+    sas_path = datapath("io", "sas", "data", "productsales.sas7bdat")
+    csv_path = datapath("io", "sas", "data", "productsales.csv")
+    float_cols = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
+
+    result = pd.read_sas(sas_path, encoding='utf-8')
+    baseline = pd.read_csv(csv_path, parse_dates=['MONTH'])
+    baseline[float_cols] = baseline[float_cols].astype(np.float64)
+    tm.assert_frame_equal(result, baseline)
+
+
+def test_12659(datapath):
+    # test_12659.sas7bdat must equal its CSV baseline cast to float64.
+    sas_path = datapath("io", "sas", "data", "test_12659.sas7bdat")
+    csv_path = datapath("io", "sas", "data", "test_12659.csv")
+
+    result = pd.read_sas(sas_path)
+    baseline = pd.read_csv(csv_path).astype(np.float64)
+    tm.assert_frame_equal(result, baseline)
+
+
+def test_airline(datapath):
+    # airline.sas7bdat must match its CSV baseline cast to float64; the
+    # comparison is not exact (check_exact=False).
+    sas_path = datapath("io", "sas", "data", "airline.sas7bdat")
+    csv_path = datapath("io", "sas", "data", "airline.csv")
+
+    result = pd.read_sas(sas_path)
+    baseline = pd.read_csv(csv_path).astype(np.float64)
+    tm.assert_frame_equal(result, baseline, check_exact=False)
+
+
+def test_date_time(datapath):
+    # Support of different SAS date/datetime formats (PR #15871)
+    date_cols = ['Date1', 'Date2', 'DateTime', 'DateTimeHi', 'Taiw']
+    sas_path = datapath("io", "sas", "data", "datetime.sas7bdat")
+    csv_path = datapath("io", "sas", "data", "datetime.csv")
+
+    result = pd.read_sas(sas_path)
+    baseline = pd.read_csv(csv_path, parse_dates=date_cols)
+    # GH 19732: Timestamps imported from sas will incur floating point
+    # errors, so the fourth column is rounded to microseconds first.
+    result.iloc[:, 3] = result.iloc[:, 3].dt.round('us')
+    tm.assert_frame_equal(result, baseline)
+
+
+def test_zero_variables(datapath):
+    # A SAS file with zero variables must raise EmptyDataError (PR #18184)
+    empty_file = datapath("io", "sas", "data", "zero_variables.sas7bdat")
+    with pytest.raises(EmptyDataError):
+        pd.read_sas(empty_file)
@@ -0,0 +1,143 @@
+import pytest
+import pandas as pd
+import pandas.util.testing as tm
+from pandas.io.sas.sasreader import read_sas
+import numpy as np
+import os
+
+# CSV versions of test xpt files were obtained using the R foreign library
+
+# Numbers in a SAS xport file are always float64, so need to convert
+# before making comparisons.
+
+
+def numeric_as_float(data):
+    # Cast every int64 column of ``data`` to float64, modifying the frame
+    # in place; nothing is returned.
+    int64 = np.dtype('int64')
+    for v in data.columns:
+        # Compare dtypes with ``==``: identity only happens to hold because
+        # NumPy reuses one object per built-in dtype.
+        if data[v].dtype == int64:
+            data[v] = data[v].astype(np.float64)
+
+
+class TestXport(object):
+    # Reads the SAS XPORT (.xpt) data files and compares them with the CSV
+    # file of the same base name.
+
+    @pytest.fixture(autouse=True)
+    def setup_method(self, datapath):
+        # Resolve the paths of the four .xpt files used below.
+        self.dirpath = datapath("io", "sas", "data")
+        self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt")
+        self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt")
+        self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt")
+        self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt")
+
+    def test1_basic(self):
+        # Tests with DEMO_G.xpt (all numeric file)
+
+        # Compare to this
+        data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
+        numeric_as_float(data_csv)
+
+        # Read full file
+        data = read_sas(self.file01, format="xport")
+        tm.assert_frame_equal(data, data_csv)
+        num_rows = data.shape[0]
+
+        # Test reading beyond end of file
+        reader = read_sas(self.file01, format="xport", iterator=True)
+        data = reader.read(num_rows + 100)
+        assert data.shape[0] == num_rows
+        reader.close()
+
+        # Test incremental read with `read` method.
+        reader = read_sas(self.file01, format="xport", iterator=True)
+        data = reader.read(10)
+        reader.close()
+        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
+
+        # Test incremental read with `get_chunk` method.
+        reader = read_sas(self.file01, format="xport", chunksize=10)
+        data = reader.get_chunk()
+        reader.close()
+        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
+
+        # Test read in loop
+        m = 0
+        reader = read_sas(self.file01, format="xport", chunksize=100)
+        for x in reader:
+            m += x.shape[0]
+        reader.close()
+        assert m == num_rows
+
+        # Read full file with `read_sas` method
+        data = read_sas(self.file01)
+        tm.assert_frame_equal(data, data_csv)
+
+    def test1_index(self):
+        # Tests with DEMO_G.xpt using index (all numeric file)
+
+        # Compare to this
+        data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
+        data_csv = data_csv.set_index("SEQN")
+        numeric_as_float(data_csv)
+
+        # Read full file
+        data = read_sas(self.file01, index="SEQN", format="xport")
+        tm.assert_frame_equal(data, data_csv, check_index_type=False)
+
+        # Test incremental read with `read` method.
+        reader = read_sas(self.file01, index="SEQN", format="xport",
+                          iterator=True)
+        data = reader.read(10)
+        reader.close()
+        tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
+                              check_index_type=False)
+
+        # Test incremental read with `get_chunk` method.
+        reader = read_sas(self.file01, index="SEQN", format="xport",
+                          chunksize=10)
+        data = reader.get_chunk()
+        reader.close()
+        tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
+                              check_index_type=False)
+
+    def test1_incremental(self):
+        # Test with DEMO_G.xpt, reading full file incrementally
+
+        data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
+        data_csv = data_csv.set_index("SEQN")
+        numeric_as_float(data_csv)
+
+        reader = read_sas(self.file01, index="SEQN", chunksize=1000)
+        try:
+            all_data = list(reader)
+        finally:
+            # Close the reader like every other test in this class does.
+            reader.close()
+        data = pd.concat(all_data, axis=0)
+
+        tm.assert_frame_equal(data, data_csv, check_index_type=False)
+
+    def test2(self):
+        # Test with SSHSV1_A.xpt
+
+        # Compare to this
+        data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv"))
+        numeric_as_float(data_csv)
+
+        data = read_sas(self.file02)
+        tm.assert_frame_equal(data, data_csv)
+
+    def test_multiple_types(self):
+        # Test with DRXFCD_G.xpt (contains text and numeric variables)
+
+        # Compare to this
+        data_csv = pd.read_csv(self.file03.replace(".xpt", ".csv"))
+
+        data = read_sas(self.file03, encoding="utf-8")
+        tm.assert_frame_equal(data, data_csv)
+
+    def test_truncated_float_support(self):
+        # Test with paxraw_d_short.xpt, a shortened version of:
+        # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP
+        # This file has truncated floats (5 bytes in this case).
+
+        # GH 11713
+
+        data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv"))
+
+        data = read_sas(self.file04, format="xport")
+        # The frame read from the .xpt file is cast to int64 before the
+        # comparison with the CSV baseline.
+        tm.assert_frame_equal(data.astype('int64'), data_csv)
@@ -0,0 +1,184 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+from numpy.random import randint
+from textwrap import dedent
+
+import pytest
+import pandas as pd
+
+from pandas import DataFrame
+from pandas import read_clipboard
+from pandas import get_option
+from pandas.compat import PY2
+from pandas.util import testing as tm
+from pandas.util.testing import makeCustomDataframe as mkdf
+from pandas.io.clipboard.exceptions import PyperclipException
+from pandas.io.clipboard import clipboard_set, clipboard_get
+
+
+# Probe once at import time whether a clipboard backend is usable; the
+# result gates the whole TestClipboard class.
+try:
+    DataFrame({'A': [1, 2]}).to_clipboard()
+except (PyperclipException, RuntimeError):
+    _DEPS_INSTALLED = 0
+else:
+    _DEPS_INSTALLED = 1
+
+
+def build_kwargs(sep, excel):
+    # Return the keyword arguments to forward: an argument whose value is
+    # the string 'default' is left out, so the callee's own default applies.
+    candidates = (('excel', excel), ('sep', sep))
+    return {key: value for key, value in candidates if value != 'default'}
+
+
+@pytest.fixture(params=['delims', 'utf8', 'string', 'long', 'nonascii',
+                        'colwidth', 'mixed', 'float', 'int'])
+def df(request):
+    # Build the frame named by ``request.param``; an unknown name raises
+    # ValueError.
+    data_type = request.param
+    # Index options shared by every frame that is built through mkdf.
+    idx_opts = dict(c_idx_type='s', r_idx_type='i',
+                    c_idx_names=[None], r_idx_names=[None])
+
+    if data_type == 'delims':
+        return pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'],
+                             'b': ['hi\'j', 'k\'\'lm']})
+    if data_type == 'utf8':
+        return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'],
+                             'b': ['øπ∆˚¬', 'œ∑´®']})
+    if data_type == 'string':
+        return mkdf(5, 3, **idx_opts)
+    if data_type == 'long':
+        # One row more than the display limit.
+        max_rows = get_option('display.max_rows')
+        return mkdf(max_rows + 1, 3,
+                    data_gen_f=lambda *args: randint(2),
+                    **idx_opts)
+    if data_type == 'nonascii':
+        return pd.DataFrame({'en': 'in English'.split(),
+                             'es': 'en español'.split()})
+    if data_type == 'colwidth':
+        # Cell text one character longer than the display column width.
+        _cw = get_option('display.max_colwidth') + 1
+        return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw,
+                    **idx_opts)
+    if data_type == 'mixed':
+        return DataFrame({'a': np.arange(1.0, 6.0) + 0.01,
+                          'b': np.arange(1, 6),
+                          'c': list('abcde')})
+    if data_type == 'float':
+        return mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01,
+                    **idx_opts)
+    if data_type == 'int':
+        return mkdf(5, 3, data_gen_f=lambda *args: randint(2),
+                    **idx_opts)
+    raise ValueError
+
+
+# The whole class is skipped when the import-time clipboard probe failed.
+@pytest.mark.single
+@pytest.mark.skipif(not _DEPS_INSTALLED,
+                    reason="clipboard primitives not installed")
+class TestClipboard(object):
+    # Copy ``data`` to the clipboard, read it back (tab-separated unless
+    # ``sep`` is given, first column as the index) and compare the frames
+    # without checking dtypes.
+    def check_round_trip_frame(self, data, excel=None, sep=None,
+                               encoding=None):
+        data.to_clipboard(excel=excel, sep=sep, encoding=encoding)
+        result = read_clipboard(sep=sep or '\t', index_col=0,
+                                encoding=encoding)
+        tm.assert_frame_equal(data, result, check_dtype=False)
+
+    # Test that default arguments copy as tab delimited
+    def test_round_trip_frame(self, df):
+        self.check_round_trip_frame(df)
+
+    # Test that explicit delimiters are respected
+    @pytest.mark.parametrize('sep', ['\t', ',', '|'])
+    def test_round_trip_frame_sep(self, df, sep):
+        self.check_round_trip_frame(df, sep=sep)
+
+    # Test white space separator
+    def test_round_trip_frame_string(self, df):
+        df.to_clipboard(excel=False, sep=None)
+        result = read_clipboard()
+        assert df.to_string() == result.to_string()
+        assert df.shape == result.shape
+
+    # Two character separator is not supported in to_clipboard
+    # Test that multi-character separators are not silently passed
+    def test_excel_sep_warning(self, df):
+        with tm.assert_produces_warning():
+            # r'\t' is the two characters backslash and 't', not a tab.
+            df.to_clipboard(excel=True, sep=r'\t')
+
+    # Separator is ignored when excel=False and should produce a warning
+    def test_copy_delim_warning(self, df):
+        with tm.assert_produces_warning():
+            df.to_clipboard(excel=False, sep='\t')
+
+    # Tests that the default behavior of to_clipboard is tab
+    # delimited and excel="True"
+    @pytest.mark.parametrize('sep', ['\t', None, 'default'])
+    @pytest.mark.parametrize('excel', [True, None, 'default'])
+    def test_clipboard_copy_tabs_default(self, sep, excel, df):
+        # 'default' means "do not pass the argument at all".
+        kwargs = build_kwargs(sep, excel)
+        df.to_clipboard(**kwargs)
+        if PY2:
+            # to_clipboard copies unicode, to_csv produces bytes. This is
+            # expected behavior
+            assert clipboard_get().encode('utf-8') == df.to_csv(sep='\t')
+        else:
+            assert clipboard_get() == df.to_csv(sep='\t')
+
+    # Tests reading of white space separated tables
+    @pytest.mark.parametrize('sep', [None, 'default'])
+    @pytest.mark.parametrize('excel', [False])
+    def test_clipboard_copy_strings(self, sep, excel, df):
+        kwargs = build_kwargs(sep, excel)
+        df.to_clipboard(**kwargs)
+        result = read_clipboard(sep=r'\s+')
+        assert result.to_string() == df.to_string()
+        assert df.shape == result.shape
+
+    def test_read_clipboard_infer_excel(self):
+        # gh-19010: avoid warnings
+        clip_kwargs = dict(engine="python")
+
+        # NOTE: .strip() runs before dedent(), so the first line has no
+        # indentation left and dedent() removes nothing from the other
+        # lines. The fields below are separated by literal tab characters.
+        text = dedent("""
+            John James	Charlie Mingus
+            1	2
+            4	Harry Carney
+            """.strip())
+        clipboard_set(text)
+        df = pd.read_clipboard(**clip_kwargs)
+
+        # excel data is parsed correctly
+        assert df.iloc[1][1] == 'Harry Carney'
+
+        # having diff tab counts doesn't trigger it
+        text = dedent("""
+            a\t b
+            1  2
+            3  4
+            """.strip())
+        clipboard_set(text)
+        res = pd.read_clipboard(**clip_kwargs)
+
+        # Same table without any tab: both must parse to the same frame.
+        text = dedent("""
+            a  b
+            1  2
+            3  4
+            """.strip())
+        clipboard_set(text)
+        exp = pd.read_clipboard(**clip_kwargs)
+
+        tm.assert_frame_equal(res, exp)
+
+    def test_invalid_encoding(self, df):
+        # test case for testing invalid encoding
+        with pytest.raises(ValueError):
+            df.to_clipboard(encoding='ascii')
+        with pytest.raises(NotImplementedError):
+            pd.read_clipboard(encoding='ascii')
+
+    # Every accepted spelling of UTF-8 must round-trip.
+    @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8'])
+    def test_round_trip_valid_encodings(self, enc, df):
+        self.check_round_trip_frame(df, encoding=enc)
@@ -0,0 +1,288 @@
+"""
+    Tests for the pandas.io.common functionalities
+"""
+import mmap
+import pytest
+import os
+from os.path import isabs
+
+import pandas as pd
+import pandas.util.testing as tm
+import pandas.util._test_decorators as td
+
+from pandas.io import common
+from pandas.compat import is_platform_windows, StringIO, FileNotFoundError
+
+from pandas import read_csv, concat
+
+
+class CustomFSPath(object):
+    """Minimal path-like object for testing fspath on unknown objects."""
+
+    def __init__(self, path):
+        # stored untouched; __fspath__ hands back this exact value
+        self.path = path
+
+    def __fspath__(self):
+        """Return the wrapped path exactly as it was supplied."""
+        wrapped = self.path
+        return wrapped
+
+
+# Callables that consume a string path and return a string or path-like
+# object; the optional path classes below are appended only when importable
+path_types = [str, CustomFSPath]
+
+# pathlib may be unavailable, in which case Path is simply not registered
+try:
+    from pathlib import Path
+    path_types.append(Path)
+except ImportError:
+    pass
+
+# py.path is an optional dependency as well
+try:
+    from py.path import local as LocalPath
+    path_types.append(LocalPath)
+except ImportError:
+    pass
+
+# absolute path of the directory containing this test module
+HERE = os.path.abspath(os.path.dirname(__file__))
+
+
+class TestCommonIOCapabilities(object):
+    data1 = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+
+    def test_expand_user(self):
+        """A leading '~' is expanded into an absolute path."""
+        tilde_path = '~/sometest'
+        result = common._expand_user(tilde_path)
+
+        assert result != tilde_path
+        assert isabs(result)
+        # must agree with the standard library's own expansion
+        assert os.path.expanduser(tilde_path) == result
+
+    def test_expand_user_normal_path(self):
+        """A path without '~' comes back from _expand_user unchanged."""
+        plain_path = '/somefolder/sometest'
+        result = common._expand_user(plain_path)
+
+        assert result == plain_path
+        assert os.path.expanduser(plain_path) == result
+
+    @td.skip_if_no('pathlib')
+    def test_stringify_path_pathlib(self):
+        """pathlib.Path objects are converted to their string form."""
+        assert common._stringify_path(Path('.')) == '.'
+
+        # the doubled separator is expected to come back as a single one
+        expected = os.path.join('foo', 'bar')
+        assert common._stringify_path(Path('foo//bar')) == expected
+
+    @td.skip_if_no('py.path')
+    def test_stringify_path_localpath(self):
+        """A py.path.local object stringifies to the absolute path."""
+        relative = os.path.join('foo', 'bar')
+        expected = os.path.abspath(relative)
+        assert common._stringify_path(LocalPath(relative)) == expected
+
+    def test_stringify_path_fspath(self):
+        """An object with __fspath__ stringifies to the value it returns."""
+        raw = 'foo/bar.csv'
+        assert common._stringify_path(CustomFSPath(raw)) == raw
+
+    @pytest.mark.parametrize('extension,expected', [
+        ('', None),
+        ('.gz', 'gzip'),
+        ('.bz2', 'bz2'),
+        ('.zip', 'zip'),
+        ('.xz', 'xz'),
+    ])
+    @pytest.mark.parametrize('path_type', path_types)
+    def test_infer_compression_from_path(self, extension, expected, path_type):
+        """compression='infer' maps each file extension to its codec name."""
+        target = path_type('foo/bar.csv' + extension)
+        inferred = common._infer_compression(target, compression='infer')
+        assert inferred == expected
+
+    def test_get_filepath_or_buffer_with_path(self):
+        """A '~' path is returned expanded and not flagged for closing."""
+        tilde_path = '~/sometest'
+        # only the first and the last of the four returned items are checked
+        expanded, _, _, should_close = common.get_filepath_or_buffer(
+            tilde_path)
+
+        assert expanded != tilde_path
+        assert isabs(expanded)
+        assert os.path.expanduser(tilde_path) == expanded
+        assert not should_close
+
+    def test_get_filepath_or_buffer_with_buffer(self):
+        """A buffer is handed back as-is and not flagged for closing."""
+        source = StringIO()
+        returned, _, _, should_close = common.get_filepath_or_buffer(source)
+
+        assert returned == source
+        assert not should_close
+
+    def test_iterator(self):
+        """Reading in chunks of one row gives the same frame as one read."""
+        expected = read_csv(StringIO(self.data1))
+
+        chunks = read_csv(StringIO(self.data1), chunksize=1)
+        tm.assert_frame_equal(concat(chunks, ignore_index=True), expected)
+
+        # GH12153
+        # pull the first chunk by hand, then concatenate what is left
+        reader = read_csv(StringIO(self.data1), chunksize=1)
+        head = next(reader)
+        tm.assert_frame_equal(head, expected.iloc[[0]])
+        tm.assert_frame_equal(concat(reader), expected.iloc[1:])
+
+    @pytest.mark.parametrize('reader, module, error_class, fn_ext', [
+        (pd.read_csv, 'os', FileNotFoundError, 'csv'),
+        (pd.read_table, 'os', FileNotFoundError, 'csv'),
+        (pd.read_fwf, 'os', FileNotFoundError, 'txt'),
+        (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'),
+        (pd.read_feather, 'feather', Exception, 'feather'),
+        (pd.read_hdf, 'tables', FileNotFoundError, 'h5'),
+        (pd.read_stata, 'os', FileNotFoundError, 'dta'),
+        (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'),
+        (pd.read_json, 'os', ValueError, 'json'),
+        (pd.read_msgpack, 'os', ValueError, 'mp'),
+        (pd.read_pickle, 'os', FileNotFoundError, 'pickle'),
+    ])
+    def test_read_non_existant(self, reader, module, error_class, fn_ext):
+        """Each reader raises its listed error for a file that is absent."""
+        # skip the case when the module backing the reader is not installed
+        pytest.importorskip(module)
+
+        missing = os.path.join(HERE, 'data', 'does_not_exist.' + fn_ext)
+        with pytest.raises(error_class):
+            reader(missing)
+
+    @pytest.mark.parametrize('reader, module, path', [
+        (pd.read_csv, 'os', ('io', 'data', 'iris.csv')),
+        (pd.read_table, 'os', ('io', 'data', 'iris.csv')),
+        (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')),
+        (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')),
+        (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')),
+        (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf',
+                                 'datetimetz_object.h5')),
+        (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')),
+        (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')),
+        (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')),
+        (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')),
+        (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')),
+    ])
+    def test_read_fspath_all(self, reader, module, path, datapath):
+        """Readers give equal results for a str path and an fspath object."""
+        pytest.importorskip(module)
+        path = datapath(*path)
+
+        result = reader(CustomFSPath(path))
+        expected = reader(path)
+
+        # the pickle file is compared as a categorical, the rest as frames
+        compare = (tm.assert_categorical_equal if path.endswith('.pickle')
+                   else tm.assert_frame_equal)
+        compare(result, expected)
+
+    @pytest.mark.parametrize('writer_name, writer_kwargs, module', [
+        ('to_csv', {}, 'os'),
+        ('to_excel', {'engine': 'xlwt'}, 'xlwt'),
+        ('to_feather', {}, 'feather'),
+        ('to_html', {}, 'os'),
+        ('to_json', {}, 'os'),
+        ('to_latex', {}, 'os'),
+        ('to_msgpack', {}, 'os'),
+        ('to_pickle', {}, 'os'),
+        ('to_stata', {}, 'os'),
+    ])
+    def test_write_fspath_all(self, writer_name, writer_kwargs, module):
+        """Writers produce equal bytes for a str path and an fspath path."""
+        df = pd.DataFrame({"A": [1, 2]})
+
+        with tm.ensure_clean('string') as string, \
+                tm.ensure_clean('fspath') as fspath:
+            pytest.importorskip(module)
+            writer = getattr(df, writer_name)
+
+            # reference output: written through the plain string path
+            writer(string, **writer_kwargs)
+            with open(string, 'rb') as f:
+                expected = f.read()
+
+            # output under test: written through the __fspath__ wrapper
+            writer(CustomFSPath(fspath), **writer_kwargs)
+            with open(fspath, 'rb') as f:
+                result = f.read()
+
+            assert result == expected
+
+    def test_write_fspath_hdf5(self):
+        """to_hdf stores the same frame via a str path and an fspath path.
+
+        HDF5 files are not necessarily byte-for-byte identical for a given
+        frame, so both files are read back and the frames are compared.
+        """
+        pytest.importorskip('tables')
+
+        df = pd.DataFrame({"A": [1, 2]})
+
+        with tm.ensure_clean('string') as string, \
+                tm.ensure_clean('fspath') as fspath:
+            df.to_hdf(CustomFSPath(fspath), key='bar')
+            df.to_hdf(string, key='bar')
+
+            result = pd.read_hdf(fspath, key='bar')
+            expected = pd.read_hdf(string, key='bar')
+
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.fixture
+def mmap_file(datapath):
+    """Path to the CSV data file used by the MMapWrapper tests."""
+    csv_path = datapath('io', 'data', 'test_mmap.csv')
+    return csv_path
+
+
+class TestMMapWrapper(object):
+
+    def test_constructor_bad_file(self, mmap_file):
+        """MMapWrapper raises for a fake file and for a closed file."""
+        fake = StringIO('I am not a file')
+        fake.fileno = lambda: -1
+
+        # the error raised is different on Windows
+        on_windows = is_platform_windows()
+        err = OSError if on_windows else mmap.error
+        msg = "The parameter is incorrect" if on_windows else "[Errno 22]"
+
+        tm.assert_raises_regex(err, msg, common.MMapWrapper, fake)
+
+        # leave the block right away so that only a closed handle remains
+        with open(mmap_file, 'r') as closed_file:
+            pass
+
+        tm.assert_raises_regex(
+            ValueError, "I/O operation on closed file",
+            common.MMapWrapper, closed_file)
+
+    def test_get_attr(self, mmap_file):
+        """The wrapper exposes every non-dunder attribute of its mmap."""
+        with open(mmap_file, 'r') as target:
+            wrapper = common.MMapWrapper(target)
+
+        # non-dunder names of the wrapped mmap, plus the iterator protocol
+        expected_attrs = [name for name in dir(wrapper.mmap)
+                          if not name.startswith('__')] + ['__next__']
+
+        for name in expected_attrs:
+            assert hasattr(wrapper, name)
+
+        assert not hasattr(wrapper, 'foo')
+
+    def test_next(self, mmap_file):
+        """next() on the wrapper yields the file's lines, then stops."""
+        with open(mmap_file, 'r') as target:
+            wrapper = common.MMapWrapper(target)
+            expected_lines = target.readlines()
+
+        for expected_line in expected_lines:
+            assert next(wrapper).strip() == expected_line.strip()
+
+        # once every line was consumed the wrapper must be exhausted
+        with pytest.raises(StopIteration):
+            next(wrapper)
+
+    def test_unknown_engine(self):
+        """read_csv raises ValueError for an unrecognised engine name."""
+        with tm.ensure_clean() as path:
+            tm.makeDataFrame().to_csv(path)
+            with tm.assert_raises_regex(ValueError, 'Unknown engine'):
+                read_csv(path, engine='pyt')
@@ -0,0 +1,145 @@
+""" test feather-format compat """
+from distutils.version import LooseVersion
+from warnings import catch_warnings
+
+import numpy as np
+
+import pandas as pd
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, ensure_clean
+
+import pytest
+feather = pytest.importorskip('feather')
+from feather import FeatherError  # noqa:E402
+
+from pandas.io.feather_format import to_feather, read_feather  # noqa:E402
+
+fv = LooseVersion(feather.__version__)
+
+
+@pytest.mark.single
+class TestFeather(object):
+
+    def check_error_on_write(self, df, exc):
+        """Assert that writing ``df`` with to_feather raises ``exc``."""
+        with pytest.raises(exc), ensure_clean() as path:
+            to_feather(df, path)
+
+    def check_round_trip(self, df, **kwargs):
+        """Write ``df`` to a temporary feather file and read it back equal.
+
+        Extra keyword arguments are forwarded to read_feather.
+        """
+        with ensure_clean() as path:
+            to_feather(df, path)
+
+            # warnings emitted while reading are captured, not displayed
+            with catch_warnings(record=True):
+                round_tripped = read_feather(path, **kwargs)
+            assert_frame_equal(round_tripped, df)
+
+    def test_error(self):
+        """Writing an object that is not a DataFrame raises ValueError."""
+        not_frames = [pd.Series([1, 2, 3]), 1, 'foo',
+                      pd.Timestamp('20130101'), np.array([1, 2, 3])]
+        for obj in not_frames:
+            self.check_error_on_write(obj, ValueError)
+
+    def test_basic(self):
+        """A frame mixing the supported column types round-trips."""
+        columns = {'string': list('abc'),
+                   'int': list(range(1, 4)),
+                   'uint': np.arange(3, 6).astype('u1'),
+                   'float': np.arange(4.0, 7.0, dtype='float64'),
+                   'float_with_null': [1., np.nan, 3],
+                   'bool': [True, False, True],
+                   'bool_with_null': [True, np.nan, False],
+                   'cat': pd.Categorical(list('abc')),
+                   'dt': pd.date_range('20130101', periods=3),
+                   'dttz': pd.date_range('20130101', periods=3,
+                                         tz='US/Eastern'),
+                   'dt_with_null': [pd.Timestamp('20130101'), pd.NaT,
+                                    pd.Timestamp('20130103')],
+                   'dtns': pd.date_range('20130101', periods=3,
+                                         freq='ns')}
+        df = pd.DataFrame(columns)
+
+        # make sure the tz-aware column really carries its time zone
+        assert df.dttz.dtype.tz.zone == 'US/Eastern'
+        self.check_round_trip(df)
+
+    @pytest.mark.skipif(fv >= LooseVersion('0.4.0'), reason='fixed in 0.4.0')
+    def test_strided_data_issues(self):
+        """Writing strided data raises FeatherError on old feather."""
+        # strided data issue: https://github.com/wesm/feather/issues/97
+        values = np.arange(12).reshape(4, 3)
+        df = pd.DataFrame(values, columns=list('abc'))
+        self.check_error_on_write(df, FeatherError)
+
+    def test_duplicate_columns(self):
+        """Duplicate column names make to_feather raise ValueError."""
+        # https://github.com/wesm/feather/issues/53
+        # not currently able to handle duplicate columns
+        values = np.arange(12).reshape(4, 3)
+        df = pd.DataFrame(values, columns=list('aaa')).copy()
+        self.check_error_on_write(df, ValueError)
+
+    def test_stringify_columns(self):
+        """A frame with default integer column labels is rejected."""
+        values = np.arange(12).reshape(4, 3)
+        df = pd.DataFrame(values).copy()
+        self.check_error_on_write(df, ValueError)
+
+    @pytest.mark.skipif(fv >= LooseVersion('0.4.0'), reason='fixed in 0.4.0')
+    def test_unsupported(self):
+        """Timedelta and mixed object columns fail on old feather."""
+        # timedelta
+        timedeltas = pd.timedelta_range('1 day', periods=3)
+        self.check_error_on_write(pd.DataFrame({'a': timedeltas}),
+                                  FeatherError)
+
+        # non-strings
+        mixed = ['a', 1, 2.0]
+        self.check_error_on_write(pd.DataFrame({'a': mixed}), ValueError)
+
+    def test_unsupported_other(self):
+        """A period column makes to_feather raise ValueError."""
+        # period
+        periods = pd.period_range('2013', freq='M', periods=3)
+        self.check_error_on_write(pd.DataFrame({'a': periods}), ValueError)
+
+    @pytest.mark.skipif(fv < LooseVersion('0.4.0'), reason='new in 0.4.0')
+    def test_rw_nthreads(self):
+        """Round-trip a frame while passing nthreads=2 to the reader."""
+        values = np.arange(100000)
+        self.check_round_trip(pd.DataFrame({'A': values}), nthreads=2)
+
+    def test_write_with_index(self):
+        """Only a default, unnamed index with plain columns is writable.
+
+        The frame with its default index round-trips. Any other index, a
+        named index, or MultiIndex columns must make to_feather raise
+        ValueError.
+        """
+        df = pd.DataFrame({'A': [1, 2, 3]})
+        self.check_round_trip(df)
+
+        # non-default index
+        for index in [[2, 3, 4],
+                      pd.date_range('20130101', periods=3),
+                      list('abc'),
+                      [1, 3, 4],
+                      pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
+                                                 ('b', 1)]),
+                      ]:
+
+            df.index = index
+            self.check_error_on_write(df, ValueError)
+
+        # index with meta-data
+        df.index = [0, 1, 2]
+        df.index.name = 'foo'
+        self.check_error_on_write(df, ValueError)
+
+        # column multi-index
+        # The frame has one column, so the MultiIndex needs exactly one
+        # entry. No trailing comma after the call: it would assign a
+        # 1-tuple wrapping the MultiIndex instead of the MultiIndex itself.
+        df.index = [0, 1, 2]
+        df.columns = pd.MultiIndex.from_tuples([('a', 1)])
+        self.check_error_on_write(df, ValueError)
+
+    def test_path_pathlib(self):
+        """Round-trip a frame through tm.round_trip_pathlib."""
+        expected = tm.makeDataFrame().reset_index()
+        result = tm.round_trip_pathlib(expected.to_feather, pd.read_feather)
+        tm.assert_frame_equal(expected, result)
+
+    def test_path_localpath(self):
+        """Round-trip a frame through tm.round_trip_localpath."""
+        expected = tm.makeDataFrame().reset_index()
+        result = tm.round_trip_localpath(expected.to_feather,
+                                         pd.read_feather)
+        tm.assert_frame_equal(expected, result)
@@ -0,0 +1,135 @@
+import pytest
+from datetime import datetime
+import pytz
+import platform
+from time import sleep
+import os
+
+import numpy as np
+import pandas as pd
+from pandas import compat, DataFrame
+
+from pandas.compat import range
+
+pandas_gbq = pytest.importorskip('pandas_gbq')
+
+# Fallbacks used by the _get_* helpers below when not running on Travis;
+# leaving PROJECT_ID / PRIVATE_KEY_JSON_PATH as None makes the tests skip.
+PROJECT_ID = None
+PRIVATE_KEY_JSON_PATH = None
+PRIVATE_KEY_JSON_CONTENTS = None
+
+# a separate dataset name prefix is used for Python 3 and Python 2 runs
+if compat.PY3:
+    DATASET_ID = 'pydata_pandas_bq_testing_py3'
+else:
+    DATASET_ID = 'pydata_pandas_bq_testing_py2'
+
+TABLE_ID = 'new_test'
+# "<dataset prefix>1.<table prefix>"; the test appends a table number
+DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID)
+
+VERSION = platform.python_version()
+
+
+def _skip_if_no_project_id():
+    """Skip the calling test when no project id is configured."""
+    if _get_project_id():
+        return
+    pytest.skip(
+        "Cannot run integration tests without a project id")
+
+
+def _skip_if_no_private_key_path():
+    """Skip the calling test when no private key path is configured."""
+    if _get_private_key_path():
+        return
+    pytest.skip("Cannot run integration tests without a "
+                "private key json file path")
+
+
+def _in_travis_environment():
+    """Return True when both Travis-related variables are defined."""
+    required = ('TRAVIS_BUILD_DIR', 'GBQ_PROJECT_ID')
+    return all(name in os.environ for name in required)
+
+
+def _get_project_id():
+    """Return the project id from the environment on Travis, else the
+    module-level PROJECT_ID."""
+    if not _in_travis_environment():
+        return PROJECT_ID
+    return os.environ.get('GBQ_PROJECT_ID')
+
+
+def _get_private_key_path():
+    """Return the key file path inside the Travis build directory, or the
+    module-level PRIVATE_KEY_JSON_PATH when not running on Travis."""
+    if not _in_travis_environment():
+        return PRIVATE_KEY_JSON_PATH
+    build_dir = os.environ.get('TRAVIS_BUILD_DIR')
+    return os.path.join(build_dir, 'ci', 'travis_gbq.json')
+
+
+def clean_gbq_environment(private_key=None):
+    """Delete the numbered test datasets, and their numbered test tables.
+
+    Datasets named ``DATASET_ID + "1"`` .. ``DATASET_ID + "9"`` are
+    removed when present, after deleting any of their tables named
+    ``TABLE_ID + "1"`` .. ``TABLE_ID + "19"``.
+
+    Parameters
+    ----------
+    private_key : optional
+        Forwarded unchanged to the pandas_gbq dataset and table helpers.
+    """
+    project_id = _get_project_id()
+    dataset = pandas_gbq.gbq._Dataset(project_id, private_key=private_key)
+
+    # List the datasets once instead of once per candidate name.
+    # NOTE(review): assumes datasets()/tables() return a reusable
+    # container (e.g. a list), not a one-shot iterator -- confirm.
+    existing_datasets = dataset.datasets()
+
+    for i in range(1, 10):
+        dataset_id = DATASET_ID + str(i)
+        if dataset_id not in existing_datasets:
+            continue
+
+        table = pandas_gbq.gbq._Table(project_id, dataset_id,
+                                      private_key=private_key)
+        # likewise list this dataset's tables a single time
+        existing_tables = dataset.tables(dataset_id)
+        for j in range(1, 20):
+            table_id = TABLE_ID + str(j)
+            if table_id in existing_tables:
+                table.delete(table_id)
+
+        dataset.delete(dataset_id)
+
+
+def make_mixed_dataframe_v2(test_size):
+    """Build a random frame with bool, float, int, str and time columns.
+
+    Parameters
+    ----------
+    test_size : int
+        Number of rows; the frame is indexed by ``range(test_size)``.
+
+    Returns
+    -------
+    DataFrame
+        Columns 'bools', 'flts', 'ints', 'strs' and 'times'.
+    """
+    # create df to test for all BQ datatypes except RECORD
+    bools = np.random.randint(2, size=(1, test_size)).astype(bool)
+    flts = np.random.randn(1, test_size)
+    ints = np.random.randint(1, 10, size=(1, test_size))
+    strs = np.random.randint(1, 10, size=(1, test_size)).astype(str)
+    times = [datetime.now(pytz.timezone('US/Arizona'))
+             for _ in range(test_size)]
+    # The random arrays are 2-D with a single row, hence the [0]. ``times``
+    # is already a flat list of test_size values and is used whole:
+    # indexing it with [0] would broadcast one timestamp over every row
+    # and raise IndexError for test_size == 0.
+    return DataFrame({'bools': bools[0],
+                      'flts': flts[0],
+                      'ints': ints[0],
+                      'strs': strs[0],
+                      'times': times},
+                     index=range(test_size))
+
+
+@pytest.mark.single
+class TestToGBQIntegrationWithServiceAccountKeyPath(object):
+    """Round-trip integration test using a service account key file."""
+
+    @classmethod
+    def setup_class(cls):
+        """Run once before all tests: skip, or prepare a fresh dataset."""
+        _skip_if_no_project_id()
+        _skip_if_no_private_key_path()
+
+        key_path = _get_private_key_path()
+        clean_gbq_environment(key_path)
+        dataset = pandas_gbq.gbq._Dataset(_get_project_id(),
+                                          private_key=key_path)
+        dataset.create(DATASET_ID + "1")
+
+    @classmethod
+    def teardown_class(cls):
+        """Run once after all tests: remove the test datasets again."""
+        clean_gbq_environment(_get_private_key_path())
+
+    def test_roundtrip(self):
+        """Upload a frame and check the row count returned by a query."""
+        destination_table = DESTINATION_TABLE + "1"
+        test_size = 20001
+        project_id = _get_project_id()
+        key_path = _get_private_key_path()
+
+        df = make_mixed_dataframe_v2(test_size)
+        df.to_gbq(destination_table, project_id, chunksize=10000,
+                  private_key=key_path)
+
+        # fixed wait between the upload and the query
+        sleep(30)
+
+        query = "SELECT COUNT(*) AS num_rows FROM {0}".format(
+            destination_table)
+        result = pd.read_gbq(query, project_id=project_id,
+                             private_key=key_path)
+        assert result['num_rows'][0] == test_size
@@ -0,0 +1,947 @@
+from __future__ import print_function
+
+import os
+import re
+import threading
+
+from functools import partial
+
+import pytest
+
+import numpy as np
+from numpy.random import rand
+
+from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
+                    date_range, Series)
+from pandas.compat import (map, zip, StringIO, BytesIO,
+                           is_platform_windows, PY3, reload)
+from pandas.io.common import URLError, file_path_to_url
+import pandas.io.html
+from pandas.io.html import read_html
+from pandas._libs.parsers import ParserError
+
+import pandas.util.testing as tm
+import pandas.util._test_decorators as td
+from pandas.util.testing import makeCustomDataframe as mkdf, network
+
+HERE = os.path.dirname(__file__)
+
+
+@pytest.fixture(params=[
+    'chinese_utf-16.html',
+    'chinese_utf-32.html',
+    'chinese_utf-8.html',
+    'letz_latin1.html',
+])
+def html_encoding_file(request, datapath):
+    """Path of one HTML encoding sample file per fixture parameter."""
+    file_name = request.param
+    return datapath('io', 'data', 'html_encoding', file_name)
+
+
+def assert_framelist_equal(list1, list2, *args, **kwargs):
+    """Assert that two lists hold pairwise equal, non-empty DataFrames.
+
+    Extra positional and keyword arguments are forwarded to
+    tm.assert_frame_equal for every pair.
+    """
+    size1, size2 = len(list1), len(list2)
+    assert size1 == size2, ('lists are not of equal size '
+                            'len(list1) == {0}, '
+                            'len(list2) == {1}'.format(size1, size2))
+
+    both_frames = all(
+        isinstance(left, DataFrame) and isinstance(right, DataFrame)
+        for left, right in zip(list1, list2))
+    assert both_frames, 'not all list elements are DataFrames'
+
+    for left, right in zip(list1, list2):
+        tm.assert_frame_equal(left, right, *args, **kwargs)
+        # the frames are equal here, so checking one of them is enough
+        assert not left.empty, 'frames are both empty'
+
+
+@td.skip_if_no('bs4')
+def test_bs4_version_fails(monkeypatch, datapath):
+    """read_html raises when bs4 reports a version of '4.2'."""
+    import bs4
+    monkeypatch.setattr(bs4, '__version__', '4.2')
+    spam_path = datapath("io", "data", "spam.html")
+    with tm.assert_raises_regex(ValueError, "minimum version"):
+        read_html(spam_path, flavor='bs4')
+
+
+def test_invalid_flavor():
+    """An unknown flavor name makes read_html raise ValueError."""
+    bogus_flavor = 'not a* valid**++ flaver'
+    with pytest.raises(ValueError):
+        read_html('google.com', 'google', flavor=bogus_flavor)
+
+
+@td.skip_if_no('bs4')
+@td.skip_if_no('lxml')
+def test_same_ordering(datapath):
+    """The lxml and bs4 flavors return equal lists of tables."""
+    markup = datapath('io', 'data', 'valid_markup.html')
+    from_lxml = read_html(markup, index_col=0, flavor=['lxml'])
+    from_bs4 = read_html(markup, index_col=0, flavor=['bs4'])
+    assert_framelist_equal(from_lxml, from_bs4)
+
+
+# Every test of the class runs once per parser flavor. Each flavor is
+# skipped when its own parser package cannot be imported; the bs4 entry
+# must therefore probe 'bs4' rather than 'lxml'.
+@pytest.mark.parametrize("flavor", [
+    pytest.param('bs4', marks=pytest.mark.skipif(
+        not td.safe_import('bs4'), reason='No bs4')),
+    pytest.param('lxml', marks=pytest.mark.skipif(
+        not td.safe_import('lxml'), reason='No lxml'))], scope="class")
+class TestReadHtml(object):
+
+    @pytest.fixture(autouse=True)
+    def set_files(self, datapath):
+        """Attach the spam and banklist sample file paths to the test."""
+        self.spam_data = datapath('io', 'data', 'spam.html')
+        # extra keyword arguments for open(); an encoding only on Python 3
+        self.spam_data_kwargs = {'encoding': 'UTF-8'} if PY3 else {}
+        self.banklist_data = datapath("io", "data", "banklist.html")
+
+    @pytest.fixture(autouse=True, scope="function")
+    def set_defaults(self, flavor, request):
+        """Bind read_html to the parametrized parser flavor."""
+        flavored_reader = partial(read_html, flavor=flavor)
+        self.read_html = flavored_reader
+        yield
+
+    def test_to_html_compat(self):
+        """A frame rendered with to_html is read back equal."""
+        raw = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False,
+                   r_idx_names=False)
+        # values are formatted to three decimals before the comparison
+        df = raw.applymap('{0:.3f}'.format).astype(float)
+        html = df.to_html()
+        parsed = self.read_html(html, attrs={'class': 'dataframe'},
+                                index_col=0)
+        tm.assert_frame_equal(parsed[0], df)
+
+    @network
+    def test_banklist_url(self):
+        """Two match strings select the same table from the live page."""
+        url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
+        by_first_federal = self.read_html(
+            url, 'First Federal Bank of Florida', attrs={"id": 'table'})
+        by_metcalf = self.read_html(url, 'Metcalf Bank',
+                                    attrs={'id': 'table'})
+
+        assert_framelist_equal(by_first_federal, by_metcalf)
+
+    @network
+    def test_spam_url(self):
+        """A regex and a plain match find the same live spam tables."""
+        url = ('http://ndb.nal.usda.gov/ndb/foods/show/300772?fg=&man=&'
+               'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam')
+        water_tables = self.read_html(url, '.*Water.*')
+        unit_tables = self.read_html(url, 'Unit')
+
+        assert_framelist_equal(water_tables, unit_tables)
+
+    @pytest.mark.slow
+    def test_banklist(self):
+        """A regex and a plain match select the same local banklist table."""
+        florida_tables = self.read_html(
+            self.banklist_data, '.*Florida.*', attrs={'id': 'table'})
+        metcalf_tables = self.read_html(
+            self.banklist_data, 'Metcalf Bank', attrs={'id': 'table'})
+
+        assert_framelist_equal(florida_tables, metcalf_tables)
+
+    def test_spam_no_types(self):
+        """Both match strings find the spam table with the expected cells."""
+        # infer_types removed in #10892
+        water_tables = self.read_html(self.spam_data, '.*Water.*')
+        unit_tables = self.read_html(self.spam_data, 'Unit')
+        assert_framelist_equal(water_tables, unit_tables)
+
+        first = water_tables[0]
+        assert first.iloc[0, 0] == 'Proximates'
+        assert first.columns[0] == 'Nutrient'
+
+    def test_spam_with_types(self):
+        """Check the first cell and first column label of the spam table."""
+        water_tables = self.read_html(self.spam_data, '.*Water.*')
+        unit_tables = self.read_html(self.spam_data, 'Unit')
+        assert_framelist_equal(water_tables, unit_tables)
+
+        first = water_tables[0]
+        assert first.iloc[0, 0] == 'Proximates'
+        assert first.columns[0] == 'Nutrient'
+
+    def test_spam_no_match(self):
+        """Without a match argument every returned item is a DataFrame."""
+        for table in self.read_html(self.spam_data):
+            assert isinstance(table, DataFrame)
+
+    def test_banklist_no_match(self):
+        """Selecting by attrs only still yields DataFrames."""
+        tables = self.read_html(self.banklist_data, attrs={'id': 'table'})
+        for table in tables:
+            assert isinstance(table, DataFrame)
+
+    def test_spam_header(self):
+        """header=1 makes 'Proximates' the first column label."""
+        tables = self.read_html(self.spam_data, '.*Water.*', header=1)
+        table = tables[0]
+        assert table.columns[0] == 'Proximates'
+        assert not table.empty
+
+    def test_skiprows_int(self):
+        """skiprows given as an int works with both match strings."""
+        water_tables = self.read_html(self.spam_data, '.*Water.*',
+                                      skiprows=1)
+        unit_tables = self.read_html(self.spam_data, 'Unit', skiprows=1)
+
+        assert_framelist_equal(water_tables, unit_tables)
+
+    def test_skiprows_xrange(self):
+        """skiprows given as a range works with both match strings."""
+        water_tables = self.read_html(self.spam_data, '.*Water.*',
+                                      skiprows=range(2))
+        unit_tables = self.read_html(self.spam_data, 'Unit',
+                                     skiprows=range(2))
+        # only the first table of each result is compared
+        tm.assert_frame_equal(water_tables[0], unit_tables[0])
+
+    def test_skiprows_list(self):
+        """skiprows as a list gives equal results in either order."""
+        ascending = self.read_html(self.spam_data, '.*Water.*',
+                                   skiprows=[1, 2])
+        descending = self.read_html(self.spam_data, 'Unit',
+                                    skiprows=[2, 1])
+
+        assert_framelist_equal(ascending, descending)
+
+    def test_skiprows_set(self):
+        """skiprows given as a set works with both match strings."""
+        rows_a = set([1, 2])
+        rows_b = set([2, 1])
+        water_tables = self.read_html(self.spam_data, '.*Water.*',
+                                      skiprows=rows_a)
+        unit_tables = self.read_html(self.spam_data, 'Unit', skiprows=rows_b)
+
+        assert_framelist_equal(water_tables, unit_tables)
+
+    def test_skiprows_slice(self):
+        """skiprows given as a one-row slice works with both matches.
+
+        ``slice(1)`` is passed so that the slice code path is exercised;
+        passing the int ``1`` would only repeat test_skiprows_int.
+        """
+        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(1))
+        df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(1))
+
+        assert_framelist_equal(df1, df2)
+
+    def test_skiprows_slice_short(self):
+        """skiprows given as slice(2) works with both match strings."""
+        water_tables = self.read_html(self.spam_data, '.*Water.*',
+                                      skiprows=slice(2))
+        unit_tables = self.read_html(self.spam_data, 'Unit',
+                                     skiprows=slice(2))
+
+        assert_framelist_equal(water_tables, unit_tables)
+
+    def test_skiprows_slice_long(self):
+        """A forward slice and a backward slice give equal tables."""
+        forward = self.read_html(self.spam_data, '.*Water.*',
+                                 skiprows=slice(2, 5))
+        backward = self.read_html(self.spam_data, 'Unit',
+                                  skiprows=slice(4, 1, -1))
+
+        assert_framelist_equal(forward, backward)
+
+    def test_skiprows_ndarray(self):
+        """skiprows given as a numpy array works with both match strings."""
+        water_tables = self.read_html(self.spam_data, '.*Water.*',
+                                      skiprows=np.arange(2))
+        unit_tables = self.read_html(self.spam_data, 'Unit',
+                                     skiprows=np.arange(2))
+
+        assert_framelist_equal(water_tables, unit_tables)
+
+    def test_skiprows_invalid(self):
+        """A string passed as skiprows raises TypeError."""
+        expected_msg = 'is not a valid type for skipping rows'
+        with tm.assert_raises_regex(TypeError, expected_msg):
+            self.read_html(self.spam_data, '.*Water.*', skiprows='asdf')
+
+    def test_index(self):
+        """index_col=0 gives equal tables for both match strings."""
+        water_tables = self.read_html(self.spam_data, '.*Water.*',
+                                      index_col=0)
+        unit_tables = self.read_html(self.spam_data, 'Unit', index_col=0)
+        assert_framelist_equal(water_tables, unit_tables)
+
+    def test_header_and_index_no_types(self):
+        """header=1 with index_col=0 gives equal tables for both matches."""
+        water_tables = self.read_html(self.spam_data, '.*Water.*',
+                                      header=1, index_col=0)
+        unit_tables = self.read_html(self.spam_data, 'Unit',
+                                     header=1, index_col=0)
+        assert_framelist_equal(water_tables, unit_tables)
+
+    def test_header_and_index_with_types(self):
+        """Compare both matches when header and index_col are given."""
+        water_tables = self.read_html(self.spam_data, '.*Water.*',
+                                      header=1, index_col=0)
+        unit_tables = self.read_html(self.spam_data, 'Unit',
+                                     header=1, index_col=0)
+        assert_framelist_equal(water_tables, unit_tables)
+
+    def test_infer_types(self):
+        """Compare both matches with index_col=0 (infer_types is gone)."""
+        # 10892 infer_types removed
+        water_tables = self.read_html(self.spam_data, '.*Water.*',
+                                      index_col=0)
+        unit_tables = self.read_html(self.spam_data, 'Unit', index_col=0)
+        assert_framelist_equal(water_tables, unit_tables)
+
+    def test_string_io(self):
+        """read_html accepts the document wrapped in a StringIO."""
+        # each read_html call gets its own freshly filled buffer
+        with open(self.spam_data, **self.spam_data_kwargs) as f:
+            water_buffer = StringIO(f.read())
+
+        with open(self.spam_data, **self.spam_data_kwargs) as f:
+            unit_buffer = StringIO(f.read())
+
+        water_tables = self.read_html(water_buffer, '.*Water.*')
+        unit_tables = self.read_html(unit_buffer, 'Unit')
+        assert_framelist_equal(water_tables, unit_tables)
+
+    def test_string(self):
+        """read_html accepts the document as a plain string."""
+        with open(self.spam_data, **self.spam_data_kwargs) as f:
+            markup = f.read()
+
+        water_tables = self.read_html(markup, '.*Water.*')
+        unit_tables = self.read_html(markup, 'Unit')
+
+        assert_framelist_equal(water_tables, unit_tables)
+
+    def test_file_like(self):
+        """read_html accepts an open file object."""
+        with open(self.spam_data, **self.spam_data_kwargs) as handle:
+            water_tables = self.read_html(handle, '.*Water.*')
+
+        # the file is reopened for the second parse
+        with open(self.spam_data, **self.spam_data_kwargs) as handle:
+            unit_tables = self.read_html(handle, 'Unit')
+
+        assert_framelist_equal(water_tables, unit_tables)
+
+    @network
+    def test_bad_url_protocol(self):
+        """A git:// URL makes read_html raise URLError."""
+        bad_url = 'git://github.com'
+        with pytest.raises(URLError):
+            self.read_html(bad_url, match='.*Water.*')
+
+    @network
+    def test_invalid_url(self):
+        """An unreachable host raises URLError, or 'No tables found'.
+
+        URLError is the expected outcome. A ValueError is tolerated only
+        when its message is exactly 'No tables found'.
+        """
+        try:
+            with pytest.raises(URLError):
+                self.read_html('http://www.a23950sdfa908sd.com',
+                               match='.*Water.*')
+        except ValueError as e:
+            # a ValueError passes through pytest.raises and is checked here
+            assert str(e) == 'No tables found'
+
+    @pytest.mark.slow
+    def test_file_url(self):
+        """read_html accepts a file:// style URL of a local document."""
+        file_url = file_path_to_url(os.path.abspath(self.banklist_data))
+        tables = self.read_html(file_url, 'First', attrs={'id': 'table'})
+
+        assert isinstance(tables, list)
+        for table in tables:
+            assert isinstance(table, DataFrame)
+
+    @pytest.mark.slow
+    def test_invalid_table_attrs(self):
+        """Attributes matching no table raise 'No tables found'."""
+        wrong_attrs = {'id': 'tasdfable'}
+        with tm.assert_raises_regex(ValueError, 'No tables found'):
+            self.read_html(self.banklist_data,
+                           'First Federal Bank of Florida',
+                           attrs=wrong_attrs)
+
+    def _bank_data(self, *args, **kwargs):
+        """Read the 'Metcalf' banklist table, forwarding extra arguments."""
+        return self.read_html(self.banklist_data, 'Metcalf', *args,
+                              attrs={'id': 'table'}, **kwargs)
+
+    @pytest.mark.slow
+    def test_multiindex_header(self):
+        """Two header rows produce MultiIndex columns."""
+        table = self._bank_data(header=[0, 1])[0]
+        assert isinstance(table.columns, MultiIndex)
+
+    @pytest.mark.slow
+    def test_multiindex_index(self):
+        """Two index columns produce a MultiIndex index."""
+        table = self._bank_data(index_col=[0, 1])[0]
+        assert isinstance(table.index, MultiIndex)
+
+    @pytest.mark.slow
+    def test_multiindex_header_index(self):
+        """Two header rows and two index columns give two MultiIndexes."""
+        table = self._bank_data(header=[0, 1], index_col=[0, 1])[0]
+        for axis_labels in (table.columns, table.index):
+            assert isinstance(axis_labels, MultiIndex)
+
+    @pytest.mark.slow
+    def test_multiindex_header_skiprows_tuples(self):
+        """tupleize_cols=True warns and still yields Index columns."""
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            tables = self._bank_data(header=[0, 1], skiprows=1,
+                                     tupleize_cols=True)
+            assert isinstance(tables[0].columns, Index)
+
+    @pytest.mark.slow
+    def test_multiindex_header_skiprows(self):
+        """Two header rows with skiprows=1 give MultiIndex columns."""
+        table = self._bank_data(header=[0, 1], skiprows=1)[0]
+        assert isinstance(table.columns, MultiIndex)
+
+    @pytest.mark.slow
+    def test_multiindex_header_index_skiprows(self):
+        """MultiIndex on both axes is kept when one row is skipped."""
+        tables = self._bank_data(header=[0, 1], index_col=[0, 1],
+                                 skiprows=1)
+        table = tables[0]
+        for axis_labels in (table.index, table.columns):
+            assert isinstance(axis_labels, MultiIndex)
+
+    @pytest.mark.slow
+    def test_regex_idempotency(self):
+        url = self.banklist_data
+        dfs = self.read_html(file_path_to_url(os.path.abspath(url)),
+                             match=re.compile(re.compile('Florida')),
+                             attrs={'id': 'table'})
+        assert isinstance(dfs, list)
+        for df in dfs:
+            assert isinstance(df, DataFrame)
+
+    def test_negative_skiprows(self):
+        with tm.assert_raises_regex(ValueError,
+                                    r'\(you passed a negative value\)'):
+            self.read_html(self.spam_data, 'Water', skiprows=-1)
+
+    @network
+    def test_multiple_matches(self):
+        url = 'https://docs.python.org/2/'
+        dfs = self.read_html(url, match='Python')
+        assert len(dfs) > 1
+
+    @network
+    def test_python_docs_table(self):
+        url = 'https://docs.python.org/2/'
+        dfs = self.read_html(url, match='Python')
+        zz = [df.iloc[0, 0][0:4] for df in dfs]
+        assert sorted(zz) == sorted(['Repo', 'What'])
+
+    @pytest.mark.slow
+    def test_thousands_macau_stats(self, datapath):
+        all_non_nan_table_index = -2
+        macau_data = datapath("io", "data", "macau.html")
+        dfs = self.read_html(macau_data, index_col=0,
+                             attrs={'class': 'style1'})
+        df = dfs[all_non_nan_table_index]
+
+        assert not any(s.isna().any() for _, s in df.iteritems())
+
+    @pytest.mark.slow
+    def test_thousands_macau_index_col(self, datapath):
+        all_non_nan_table_index = -2
+        macau_data = datapath('io', 'data', 'macau.html')
+        dfs = self.read_html(macau_data, index_col=0, header=0)
+        df = dfs[all_non_nan_table_index]
+
+        assert not any(s.isna().any() for _, s in df.iteritems())
+
+    def test_empty_tables(self):
+        """
+        Make sure that read_html ignores empty tables.
+        """
+        data1 = '''<table>
+            <thead>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>1</td>
+                    <td>2</td>
+                </tr>
+            </tbody>
+        </table>'''
+        data2 = data1 + '''<table>
+            <tbody>
+            </tbody>
+        </table>'''
+        res1 = self.read_html(StringIO(data1))
+        res2 = self.read_html(StringIO(data2))
+        assert_framelist_equal(res1, res2)
+
+    def test_multiple_tbody(self):
+        # GH-20690
+        # Read all tbody tags within a single table.
+        data = '''<table>
+            <thead>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>1</td>
+                    <td>2</td>
+                </tr>
+            </tbody>
+            <tbody>
+                <tr>
+                    <td>3</td>
+                    <td>4</td>
+                </tr>
+            </tbody>
+        </table>'''
+        expected = DataFrame({'A': [1, 3], 'B': [2, 4]})
+        result = self.read_html(StringIO(data))[0]
+        tm.assert_frame_equal(result, expected)
+
+    def test_header_and_one_column(self):
+        """
+        Don't fail with bs4 when there is a header and only one column
+        as described in issue #9178
+        """
+        data = StringIO('''<html>
+            <body>
+             <table>
+                <thead>
+                    <tr>
+                        <th>Header</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <td>first</td>
+                    </tr>
+                </tbody>
+            </table>
+            </body>
+        </html>''')
+        expected = DataFrame(data={'Header': 'first'}, index=[0])
+        result = self.read_html(data)[0]
+        tm.assert_frame_equal(result, expected)
+
+    def test_tfoot_read(self):
+        """
+        Make sure that read_html reads tfoot, containing td or th.
+        Ignores empty tfoot
+        """
+        data_template = '''<table>
+            <thead>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>bodyA</td>
+                    <td>bodyB</td>
+                </tr>
+            </tbody>
+            <tfoot>
+                {footer}
+            </tfoot>
+        </table>'''
+
+        data1 = data_template.format(footer="")
+        data2 = data_template.format(
+            footer="<tr><td>footA</td><th>footB</th></tr>")
+
+        d1 = {'A': ['bodyA'], 'B': ['bodyB']}
+        d2 = {'A': ['bodyA', 'footA'], 'B': ['bodyB', 'footB']}
+
+        tm.assert_frame_equal(self.read_html(data1)[0], DataFrame(d1))
+        tm.assert_frame_equal(self.read_html(data2)[0], DataFrame(d2))
+
+    def test_countries_municipalities(self):
+        # GH5048
+        data1 = StringIO('''<table>
+            <thead>
+                <tr>
+                    <th>Country</th>
+                    <th>Municipality</th>
+                    <th>Year</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>Ukraine</td>
+                    <th>Odessa</th>
+                    <td>1944</td>
+                </tr>
+            </tbody>
+        </table>''')
+        data2 = StringIO('''
+        <table>
+            <tbody>
+                <tr>
+                    <th>Country</th>
+                    <th>Municipality</th>
+                    <th>Year</th>
+                </tr>
+                <tr>
+                    <td>Ukraine</td>
+                    <th>Odessa</th>
+                    <td>1944</td>
+                </tr>
+            </tbody>
+        </table>''')
+        res1 = self.read_html(data1)
+        res2 = self.read_html(data2, header=0)
+        assert_framelist_equal(res1, res2)
+
+    def test_nyse_wsj_commas_table(self, datapath):
+        data = datapath('io', 'data', 'nyse_wsj.html')
+        df = self.read_html(data, index_col=0, header=0,
+                            attrs={'class': 'mdcTable'})[0]
+
+        columns = Index(['Issue(Roll over for charts and headlines)',
+                         'Volume', 'Price', 'Chg', '% Chg'])
+        nrows = 100
+        assert df.shape[0] == nrows
+        tm.assert_index_equal(df.columns, columns)
+
+    @pytest.mark.slow
+    def test_banklist_header(self, datapath):
+        from pandas.io.html import _remove_whitespace
+
+        def try_remove_ws(x):
+            try:
+                return _remove_whitespace(x)
+            except AttributeError:
+                return x
+
+        df = self.read_html(self.banklist_data, 'Metcalf',
+                            attrs={'id': 'table'})[0]
+        ground_truth = read_csv(datapath('io', 'data', 'banklist.csv'),
+                                converters={'Updated Date': Timestamp,
+                                            'Closing Date': Timestamp})
+        assert df.shape == ground_truth.shape
+        old = ['First Vietnamese American BankIn Vietnamese',
+               'Westernbank Puerto RicoEn Espanol',
+               'R-G Premier Bank of Puerto RicoEn Espanol',
+               'EurobankEn Espanol', 'Sanderson State BankEn Espanol',
+               'Washington Mutual Bank(Including its subsidiary Washington '
+               'Mutual Bank FSB)',
+               'Silver State BankEn Espanol',
+               'AmTrade International BankEn Espanol',
+               'Hamilton Bank, NAEn Espanol',
+               'The Citizens Savings BankPioneer Community Bank, Inc.']
+        new = ['First Vietnamese American Bank', 'Westernbank Puerto Rico',
+               'R-G Premier Bank of Puerto Rico', 'Eurobank',
+               'Sanderson State Bank', 'Washington Mutual Bank',
+               'Silver State Bank', 'AmTrade International Bank',
+               'Hamilton Bank, NA', 'The Citizens Savings Bank']
+        dfnew = df.applymap(try_remove_ws).replace(old, new)
+        gtnew = ground_truth.applymap(try_remove_ws)
+        converted = dfnew._convert(datetime=True, numeric=True)
+        date_cols = ['Closing Date', 'Updated Date']
+        converted[date_cols] = converted[date_cols]._convert(datetime=True,
+                                                             coerce=True)
+        tm.assert_frame_equal(converted, gtnew)
+
+    @pytest.mark.slow
+    def test_gold_canyon(self):
+        gc = 'Gold Canyon'
+        with open(self.banklist_data, 'r') as f:
+            raw_text = f.read()
+
+        assert gc in raw_text
+        df = self.read_html(self.banklist_data, 'Gold Canyon',
+                            attrs={'id': 'table'})[0]
+        assert gc in df.to_string()
+
+    def test_different_number_of_rows(self):
+        expected = """<table border="1" class="dataframe">
+                        <thead>
+                            <tr style="text-align: right;">
+                            <th></th>
+                            <th>C_l0_g0</th>
+                            <th>C_l0_g1</th>
+                            <th>C_l0_g2</th>
+                            <th>C_l0_g3</th>
+                            <th>C_l0_g4</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr>
+                            <th>R_l0_g0</th>
+                            <td> 0.763</td>
+                            <td> 0.233</td>
+                            <td> nan</td>
+                            <td> nan</td>
+                            <td> nan</td>
+                            </tr>
+                            <tr>
+                            <th>R_l0_g1</th>
+                            <td> 0.244</td>
+                            <td> 0.285</td>
+                            <td> 0.392</td>
+                            <td> 0.137</td>
+                            <td> 0.222</td>
+                            </tr>
+                        </tbody>
+                    </table>"""
+        out = """<table border="1" class="dataframe">
+                    <thead>
+                        <tr style="text-align: right;">
+                        <th></th>
+                        <th>C_l0_g0</th>
+                        <th>C_l0_g1</th>
+                        <th>C_l0_g2</th>
+                        <th>C_l0_g3</th>
+                        <th>C_l0_g4</th>
+                        </tr>
+                    </thead>
+                    <tbody>
+                        <tr>
+                        <th>R_l0_g0</th>
+                        <td> 0.763</td>
+                        <td> 0.233</td>
+                        </tr>
+                        <tr>
+                        <th>R_l0_g1</th>
+                        <td> 0.244</td>
+                        <td> 0.285</td>
+                        <td> 0.392</td>
+                        <td> 0.137</td>
+                        <td> 0.222</td>
+                        </tr>
+                    </tbody>
+                 </table>"""
+        expected = self.read_html(expected, index_col=0)[0]
+        res = self.read_html(out, index_col=0)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_parse_dates_list(self):
+        df = DataFrame({'date': date_range('1/1/2001', periods=10)})
+        expected = df.to_html()
+        res = self.read_html(expected, parse_dates=[1], index_col=0)
+        tm.assert_frame_equal(df, res[0])
+        res = self.read_html(expected, parse_dates=['date'], index_col=0)
+        tm.assert_frame_equal(df, res[0])
+
+    def test_parse_dates_combine(self):
+        raw_dates = Series(date_range('1/1/2001', periods=10))
+        df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())),
+                        'time': raw_dates.map(lambda x: str(x.time()))})
+        res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]},
+                             index_col=1)
+        newdf = DataFrame({'datetime': raw_dates})
+        tm.assert_frame_equal(newdf, res[0])
+
+    def test_computer_sales_page(self, datapath):
+        data = datapath('io', 'data', 'computer_sales_page.html')
+        with tm.assert_raises_regex(ParserError,
+                                    r"Passed header=\[0,1\] are "
+                                    r"too many rows for this "
+                                    r"multi_index of columns"):
+            self.read_html(data, header=[0, 1])
+
+        data = datapath('io', 'data', 'computer_sales_page.html')
+        assert self.read_html(data, header=[1, 2])
+
+    def test_wikipedia_states_table(self, datapath):
+        data = datapath('io', 'data', 'wikipedia_states.html')
+        assert os.path.isfile(data), '%r is not a file' % data
+        assert os.path.getsize(data), '%r is an empty file' % data
+        result = self.read_html(data, 'Arizona', header=1)[0]
+        assert result['sq mi'].dtype == np.dtype('float64')
+
+    def test_decimal_rows(self):
+
+        # GH 12907
+        data = StringIO('''<html>
+            <body>
+             <table>
+                <thead>
+                    <tr>
+                        <th>Header</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <td>1100#101</td>
+                    </tr>
+                </tbody>
+            </table>
+            </body>
+        </html>''')
+        expected = DataFrame(data={'Header': 1100.101}, index=[0])
+        result = self.read_html(data, decimal='#')[0]
+        assert result['Header'].dtype == np.dtype('float64')
+        tm.assert_frame_equal(result, expected)
+
+    def test_bool_header_arg(self):
+        # GH 6114
+        for arg in [True, False]:
+            with pytest.raises(TypeError):
+                read_html(self.spam_data, header=arg)
+
+    def test_converters(self):
+        # GH 13461
+        html_data = """<table>
+                        <thead>
+                            <th>a</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr>
+                            <td> 0.763</td>
+                            </tr>
+                            <tr>
+                            <td> 0.244</td>
+                            </tr>
+                        </tbody>
+                    </table>"""
+
+        expected_df = DataFrame({'a': ['0.763', '0.244']})
+        html_df = read_html(html_data, converters={'a': str})[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
+    def test_na_values(self):
+        # GH 13461
+        html_data = """<table>
+                        <thead>
+                            <th>a</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr>
+                            <td> 0.763</td>
+                            </tr>
+                            <tr>
+                            <td> 0.244</td>
+                            </tr>
+                        </tbody>
+                    </table>"""
+
+        expected_df = DataFrame({'a': [0.763, np.nan]})
+        html_df = read_html(html_data, na_values=[0.244])[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
+    def test_keep_default_na(self):
+        html_data = """<table>
+                        <thead>
+                            <th>a</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr>
+                            <td> N/A</td>
+                            </tr>
+                            <tr>
+                            <td> NA</td>
+                            </tr>
+                        </tbody>
+                    </table>"""
+
+        expected_df = DataFrame({'a': ['N/A', 'NA']})
+        html_df = read_html(html_data, keep_default_na=False)[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
+        expected_df = DataFrame({'a': [np.nan, np.nan]})
+        html_df = read_html(html_data, keep_default_na=True)[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
+    def test_multiple_header_rows(self):
+        # Issue #13434
+        expected_df = DataFrame(data=[("Hillary", 68, "D"),
+                                      ("Bernie", 74, "D"),
+                                      ("Donald", 69, "R")])
+        expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"],
+                               ["Name", "Unnamed: 1_level_1",
+                                "Unnamed: 2_level_1"]]
+        html = expected_df.to_html(index=False)
+        html_df = read_html(html, )[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
+    def test_works_on_valid_markup(self, datapath):
+        filename = datapath('io', 'data', 'valid_markup.html')
+        dfs = self.read_html(filename, index_col=0)
+        assert isinstance(dfs, list)
+        assert isinstance(dfs[0], DataFrame)
+
+    @pytest.mark.slow
+    def test_fallback_success(self, datapath):
+        banklist_data = datapath('io', 'data', 'banklist.html')
+        self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib'])
+
+    def test_to_html_timestamp(self):
+        rng = date_range('2000-01-01', periods=10)
+        df = DataFrame(np.random.randn(10, 4), index=rng)
+
+        result = df.to_html()
+        assert '2000-01-01' in result
+
+    @pytest.mark.parametrize("displayed_only,exp0,exp1", [
+        (True, DataFrame(["foo"]), None),
+        (False, DataFrame(["foo  bar  baz  qux"]), DataFrame(["foo"]))])
+    def test_displayed_only(self, displayed_only, exp0, exp1):
+        # GH 20027
+        data = StringIO("""<html>
+          <body>
+            <table>
+              <tr>
+                <td>
+                  foo
+                  <span style="display:none;text-align:center">bar</span>
+                  <span style="display:none">baz</span>
+                  <span style="display: none">qux</span>
+                </td>
+              </tr>
+            </table>
+            <table style="display: none">
+              <tr>
+                <td>foo</td>
+              </tr>
+            </table>
+          </body>
+        </html>""")
+
+        dfs = self.read_html(data, displayed_only=displayed_only)
+        tm.assert_frame_equal(dfs[0], exp0)
+
+        if exp1 is not None:
+            tm.assert_frame_equal(dfs[1], exp1)
+        else:
+            assert len(dfs) == 1  # Should not parse hidden table
+
+    def test_encode(self, html_encoding_file):
+        _, encoding = os.path.splitext(
+            os.path.basename(html_encoding_file)
+        )[0].split('_')
+
+        try:
+            with open(html_encoding_file, 'rb') as fobj:
+                from_string = self.read_html(fobj.read(), encoding=encoding,
+                                             index_col=0).pop()
+
+            with open(html_encoding_file, 'rb') as fobj:
+                from_file_like = self.read_html(BytesIO(fobj.read()),
+                                                encoding=encoding,
+                                                index_col=0).pop()
+
+            from_filename = self.read_html(html_encoding_file,
+                                           encoding=encoding,
+                                           index_col=0).pop()
+            tm.assert_frame_equal(from_string, from_file_like)
+            tm.assert_frame_equal(from_string, from_filename)
+        except Exception:
+            # seems utf-16/32 fail on windows
+            if is_platform_windows():
+                if '16' in encoding or '32' in encoding:
+                    pytest.skip()
+                raise
+
+    def test_parse_failure_unseekable(self):
+        # Issue #17975
+
+        if self.read_html.keywords.get('flavor') == 'lxml':
+            pytest.skip("Not applicable for lxml")
+
+        class UnseekableStringIO(StringIO):
+            def seekable(self):
+                return False
+
+        bad = UnseekableStringIO('''
+            <table><tr><td>spam<foobr />eggs</td></tr></table>''')
+
+        assert self.read_html(bad)
+
+        with pytest.raises(ValueError,
+                           match='passed a non-rewindable file object'):
+            self.read_html(bad)
+
+    def test_parse_failure_rewinds(self):
+        # Issue #17975
+
+        class MockFile(object):
+            def __init__(self, data):
+                self.data = data
+                self.at_end = False
+
+            def read(self, size=None):
+                data = '' if self.at_end else self.data
+                self.at_end = True
+                return data
+
+            def seek(self, offset):
+                self.at_end = False
+
+            def seekable(self):
+                return True
+
+        good = MockFile('<table><tr><td>spam<br />eggs</td></tr></table>')
+        bad = MockFile('<table><tr><td>spam<foobr />eggs</td></tr></table>')
+
+        assert self.read_html(good)
+        assert self.read_html(bad)
+
+    @pytest.mark.slow
+    def test_importcheck_thread_safety(self, datapath):
+        # see gh-16928
+
+        class ErrorThread(threading.Thread):
+            def run(self):
+                try:
+                    super(ErrorThread, self).run()
+                except Exception as e:
+                    self.err = e
+                else:
+                    self.err = None
+
+        # force import check by reinitalising global vars in html.py
+        reload(pandas.io.html)
+
+        filename = datapath('io', 'data', 'valid_markup.html')
+        helper_thread1 = ErrorThread(target=self.read_html, args=(filename,))
+        helper_thread2 = ErrorThread(target=self.read_html, args=(filename,))
+
+        helper_thread1.start()
+        helper_thread2.start()
+
+        while helper_thread1.is_alive() or helper_thread2.is_alive():
+            pass
+        assert None is helper_thread1.err is helper_thread2.err
@@ -0,0 +1,940 @@
+import pytest
+
+from warnings import catch_warnings
+import os
+import datetime
+import glob
+import numpy as np
+from distutils.version import LooseVersion
+
+from pandas import compat
+from pandas.compat import u, PY3
+from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
+                    date_range, period_range, Index, Categorical,
+                    Period, Interval)
+from pandas.errors import PerformanceWarning
+from pandas.io.packers import to_msgpack, read_msgpack
+import pandas.util.testing as tm
+from pandas.util.testing import (ensure_clean,
+                                 assert_categorical_equal,
+                                 assert_frame_equal,
+                                 assert_index_equal,
+                                 assert_series_equal,
+                                 patch)
+from pandas.tests.test_panel import assert_panel_equal
+
+import pandas
+from pandas import Timestamp, NaT
+from pandas._libs.tslib import iNaT
+
+nan = np.nan
+
+try:
+    import blosc  # NOQA
+except ImportError:
+    _BLOSC_INSTALLED = False
+else:
+    _BLOSC_INSTALLED = True
+
+try:
+    import zlib  # NOQA
+except ImportError:
+    _ZLIB_INSTALLED = False
+else:
+    _ZLIB_INSTALLED = True
+
+
+@pytest.fixture(scope='module')
+def current_packers_data():
+    # our current version packers data
+    from pandas.tests.io.generate_legacy_storage_files import (
+        create_msgpack_data)
+    return create_msgpack_data()
+
+
+@pytest.fixture(scope='module')
+def all_packers_data():
+    # our all of our current version packers data
+    from pandas.tests.io.generate_legacy_storage_files import (
+        create_data)
+    return create_data()
+
+
+def check_arbitrary(a, b):
+
+    if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)):
+        assert(len(a) == len(b))
+        for a_, b_ in zip(a, b):
+            check_arbitrary(a_, b_)
+    elif isinstance(a, Panel):
+        assert_panel_equal(a, b)
+    elif isinstance(a, DataFrame):
+        assert_frame_equal(a, b)
+    elif isinstance(a, Series):
+        assert_series_equal(a, b)
+    elif isinstance(a, Index):
+        assert_index_equal(a, b)
+    elif isinstance(a, Categorical):
+        # Temp,
+        # Categorical.categories is changed from str to bytes in PY3
+        # maybe the same as GH 13591
+        if PY3 and b.categories.inferred_type == 'string':
+            pass
+        else:
+            tm.assert_categorical_equal(a, b)
+    elif a is NaT:
+        assert b is NaT
+    elif isinstance(a, Timestamp):
+        assert a == b
+        assert a.freq == b.freq
+    else:
+        assert(a == b)
+
+
+class TestPackers(object):
+
+    def setup_method(self, method):
+        self.path = '__%s__.msg' % tm.rands(10)
+
+    def teardown_method(self, method):
+        pass
+
+    def encode_decode(self, x, compress=None, **kwargs):
+        with ensure_clean(self.path) as p:
+            to_msgpack(p, x, compress=compress, **kwargs)
+            return read_msgpack(p, **kwargs)
+
+
+class TestAPI(TestPackers):
+
+    def test_string_io(self):
+
+        df = DataFrame(np.random.randn(10, 2))
+        s = df.to_msgpack(None)
+        result = read_msgpack(s)
+        tm.assert_frame_equal(result, df)
+
+        s = df.to_msgpack()
+        result = read_msgpack(s)
+        tm.assert_frame_equal(result, df)
+
+        s = df.to_msgpack()
+        result = read_msgpack(compat.BytesIO(s))
+        tm.assert_frame_equal(result, df)
+
+        s = to_msgpack(None, df)
+        result = read_msgpack(s)
+        tm.assert_frame_equal(result, df)
+
+        with ensure_clean(self.path) as p:
+
+            s = df.to_msgpack()
+            fh = open(p, 'wb')
+            fh.write(s)
+            fh.close()
+            result = read_msgpack(p)
+            tm.assert_frame_equal(result, df)
+
+    def test_path_pathlib(self):
+        df = tm.makeDataFrame()
+        result = tm.round_trip_pathlib(df.to_msgpack, read_msgpack)
+        tm.assert_frame_equal(df, result)
+
+    def test_path_localpath(self):
+        df = tm.makeDataFrame()
+        result = tm.round_trip_localpath(df.to_msgpack, read_msgpack)
+        tm.assert_frame_equal(df, result)
+
+    def test_iterator_with_string_io(self):
+
+        dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)]
+        s = to_msgpack(None, *dfs)
+        for i, result in enumerate(read_msgpack(s, iterator=True)):
+            tm.assert_frame_equal(result, dfs[i])
+
+    def test_invalid_arg(self):
+        # GH10369
+        class A(object):
+
+            def __init__(self):
+                self.read = 0
+
+        pytest.raises(ValueError, read_msgpack, path_or_buf=None)
+        pytest.raises(ValueError, read_msgpack, path_or_buf={})
+        pytest.raises(ValueError, read_msgpack, path_or_buf=A())
+
+
+class TestNumpy(TestPackers):
+
+    def test_numpy_scalar_float(self):
+        x = np.float32(np.random.rand())
+        x_rec = self.encode_decode(x)
+        tm.assert_almost_equal(x, x_rec)
+
+    def test_numpy_scalar_complex(self):
+        x = np.complex64(np.random.rand() + 1j * np.random.rand())
+        x_rec = self.encode_decode(x)
+        assert np.allclose(x, x_rec)
+
+    def test_scalar_float(self):
+        x = np.random.rand()
+        x_rec = self.encode_decode(x)
+        tm.assert_almost_equal(x, x_rec)
+
+    def test_scalar_bool(self):
+        x = np.bool_(1)
+        x_rec = self.encode_decode(x)
+        tm.assert_almost_equal(x, x_rec)
+
+        x = np.bool_(0)
+        x_rec = self.encode_decode(x)
+        tm.assert_almost_equal(x, x_rec)
+
+    def test_scalar_complex(self):
+        x = np.random.rand() + 1j * np.random.rand()
+        x_rec = self.encode_decode(x)
+        assert np.allclose(x, x_rec)
+
+    def test_list_numpy_float(self):
+        x = [np.float32(np.random.rand()) for i in range(5)]
+        x_rec = self.encode_decode(x)
+        # current msgpack cannot distinguish list/tuple
+        tm.assert_almost_equal(tuple(x), x_rec)
+
+        x_rec = self.encode_decode(tuple(x))
+        tm.assert_almost_equal(tuple(x), x_rec)
+
+    def test_list_numpy_float_complex(self):
+        if not hasattr(np, 'complex128'):
+            pytest.skip('numpy can not handle complex128')
+
+        x = [np.float32(np.random.rand()) for i in range(5)] + \
+            [np.complex128(np.random.rand() + 1j * np.random.rand())
+             for i in range(5)]
+        x_rec = self.encode_decode(x)
+        assert np.allclose(x, x_rec)
+
+    def test_list_float(self):
+        x = [np.random.rand() for i in range(5)]
+        x_rec = self.encode_decode(x)
+        # current msgpack cannot distinguish list/tuple
+        tm.assert_almost_equal(tuple(x), x_rec)
+
+        x_rec = self.encode_decode(tuple(x))
+        tm.assert_almost_equal(tuple(x), x_rec)
+
+    def test_list_float_complex(self):
+        x = [np.random.rand() for i in range(5)] + \
+            [(np.random.rand() + 1j * np.random.rand()) for i in range(5)]
+        x_rec = self.encode_decode(x)
+        assert np.allclose(x, x_rec)
+
+    def test_dict_float(self):
+        x = {'foo': 1.0, 'bar': 2.0}
+        x_rec = self.encode_decode(x)
+        tm.assert_almost_equal(x, x_rec)
+
+    def test_dict_complex(self):
+        x = {'foo': 1.0 + 1.0j, 'bar': 2.0 + 2.0j}
+        x_rec = self.encode_decode(x)
+        tm.assert_dict_equal(x, x_rec)
+
+        for key in x:
+            tm.assert_class_equal(x[key], x_rec[key], obj="complex value")
+
+    def test_dict_numpy_float(self):
+        x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)}
+        x_rec = self.encode_decode(x)
+        tm.assert_almost_equal(x, x_rec)
+
+    def test_dict_numpy_complex(self):
+        x = {'foo': np.complex128(1.0 + 1.0j),
+             'bar': np.complex128(2.0 + 2.0j)}
+        x_rec = self.encode_decode(x)
+        tm.assert_dict_equal(x, x_rec)
+
+        for key in x:
+            tm.assert_class_equal(x[key], x_rec[key], obj="numpy complex128")
+
+    def test_numpy_array_float(self):
+
+        # run multiple times
+        for n in range(10):
+            x = np.random.rand(10)
+            for dtype in ['float32', 'float64']:
+                x = x.astype(dtype)
+                x_rec = self.encode_decode(x)
+                tm.assert_almost_equal(x, x_rec)
+
+    def test_numpy_array_complex(self):
+        x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
+        x_rec = self.encode_decode(x)
+        assert (all(map(lambda x, y: x == y, x, x_rec)) and
+                x.dtype == x_rec.dtype)
+
+    def test_list_mixed(self):
+        x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo'), np.bool_(1)]
+        x_rec = self.encode_decode(x)
+        # current msgpack cannot distinguish list/tuple
+        tm.assert_almost_equal(tuple(x), x_rec)
+
+        x_rec = self.encode_decode(tuple(x))
+        tm.assert_almost_equal(tuple(x), x_rec)
+
+
+class TestBasic(TestPackers):
+
+    def test_timestamp(self):
+
+        for i in [Timestamp(
+            '20130101'), Timestamp('20130101', tz='US/Eastern'),
+                Timestamp('201301010501')]:
+            i_rec = self.encode_decode(i)
+            assert i == i_rec
+
+    def test_nat(self):
+        nat_rec = self.encode_decode(NaT)
+        assert NaT is nat_rec
+
+    def test_datetimes(self):
+
+        for i in [datetime.datetime(2013, 1, 1),
+                  datetime.datetime(2013, 1, 1, 5, 1),
+                  datetime.date(2013, 1, 1),
+                  np.datetime64(datetime.datetime(2013, 1, 5, 2, 15))]:
+            i_rec = self.encode_decode(i)
+            assert i == i_rec
+
+    def test_timedeltas(self):
+
+        for i in [datetime.timedelta(days=1),
+                  datetime.timedelta(days=1, seconds=10),
+                  np.timedelta64(1000000)]:
+            i_rec = self.encode_decode(i)
+            assert i == i_rec
+
+    def test_periods(self):
+        # 13463
+        for i in [Period('2010-09', 'M'), Period('2014-Q1', 'Q')]:
+            i_rec = self.encode_decode(i)
+            assert i == i_rec
+
+    def test_intervals(self):
+        # 19967
+        for i in [Interval(0, 1), Interval(0, 1, 'left'),
+                  Interval(10, 25., 'right')]:
+            i_rec = self.encode_decode(i)
+            assert i == i_rec
+
+
+class TestIndex(TestPackers):
+
+    def setup_method(self, method):
+        super(TestIndex, self).setup_method(method)
+
+        self.d = {
+            'string': tm.makeStringIndex(100),
+            'date': tm.makeDateIndex(100),
+            'int': tm.makeIntIndex(100),
+            'rng': tm.makeRangeIndex(100),
+            'float': tm.makeFloatIndex(100),
+            'empty': Index([]),
+            'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])),
+            'period': Index(period_range('2012-1-1', freq='M', periods=3)),
+            'date2': Index(date_range('2013-01-1', periods=10)),
+            'bdate': Index(bdate_range('2013-01-02', periods=10)),
+            'cat': tm.makeCategoricalIndex(100),
+            'interval': tm.makeIntervalIndex(100),
+            'timedelta': tm.makeTimedeltaIndex(100, 'H')
+        }
+
+        self.mi = {
+            'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'),
+                                           ('foo', 'two'),
+                                           ('qux', 'one'), ('qux', 'two')],
+                                          names=['first', 'second']),
+        }
+
+    def test_basic_index(self):
+
+        for s, i in self.d.items():
+            i_rec = self.encode_decode(i)
+            tm.assert_index_equal(i, i_rec)
+
+        # datetime with no freq (GH5506)
+        i = Index([Timestamp('20130101'), Timestamp('20130103')])
+        i_rec = self.encode_decode(i)
+        tm.assert_index_equal(i, i_rec)
+
+        # datetime with timezone
+        i = Index([Timestamp('20130101 9:00:00'), Timestamp(
+            '20130103 11:00:00')]).tz_localize('US/Eastern')
+        i_rec = self.encode_decode(i)
+        tm.assert_index_equal(i, i_rec)
+
+    def test_multi_index(self):
+
+        for s, i in self.mi.items():
+            i_rec = self.encode_decode(i)
+            tm.assert_index_equal(i, i_rec)
+
+    def test_unicode(self):
+        i = tm.makeUnicodeIndex(100)
+
+        i_rec = self.encode_decode(i)
+        tm.assert_index_equal(i, i_rec)
+
+    def categorical_index(self):
+        # GH15487
+        df = DataFrame(np.random.randn(10, 2))
+        df = df.astype({0: 'category'}).set_index(0)
+        result = self.encode_decode(df)
+        tm.assert_frame_equal(result, df)
+
+
+class TestSeries(TestPackers):
+
+    def setup_method(self, method):
+        super(TestSeries, self).setup_method(method)
+
+        self.d = {}
+
+        s = tm.makeStringSeries()
+        s.name = 'string'
+        self.d['string'] = s
+
+        s = tm.makeObjectSeries()
+        s.name = 'object'
+        self.d['object'] = s
+
+        s = Series(iNaT, dtype='M8[ns]', index=range(5))
+        self.d['date'] = s
+
+        data = {
+            'A': [0., 1., 2., 3., np.nan],
+            'B': [0, 1, 0, 1, 0],
+            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
+            'D': date_range('1/1/2009', periods=5),
+            'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
+            'F': [Timestamp('20130102', tz='US/Eastern')] * 2 +
+                 [Timestamp('20130603', tz='CET')] * 3,
+            'G': [Timestamp('20130102', tz='US/Eastern')] * 5,
+            'H': Categorical([1, 2, 3, 4, 5]),
+            'I': Categorical([1, 2, 3, 4, 5], ordered=True),
+            'J': (np.bool_(1), 2, 3, 4, 5),
+        }
+
+        self.d['float'] = Series(data['A'])
+        self.d['int'] = Series(data['B'])
+        self.d['mixed'] = Series(data['E'])
+        self.d['dt_tz_mixed'] = Series(data['F'])
+        self.d['dt_tz'] = Series(data['G'])
+        self.d['cat_ordered'] = Series(data['H'])
+        self.d['cat_unordered'] = Series(data['I'])
+        self.d['numpy_bool_mixed'] = Series(data['J'])
+
+    def test_basic(self):
+
+        # run multiple times here
+        for n in range(10):
+            for s, i in self.d.items():
+                i_rec = self.encode_decode(i)
+                assert_series_equal(i, i_rec)
+
+
+class TestCategorical(TestPackers):
+
+    def setup_method(self, method):
+        super(TestCategorical, self).setup_method(method)
+
+        self.d = {}
+
+        self.d['plain_str'] = Categorical(['a', 'b', 'c', 'd', 'e'])
+        self.d['plain_str_ordered'] = Categorical(['a', 'b', 'c', 'd', 'e'],
+                                                  ordered=True)
+
+        self.d['plain_int'] = Categorical([5, 6, 7, 8])
+        self.d['plain_int_ordered'] = Categorical([5, 6, 7, 8], ordered=True)
+
+    def test_basic(self):
+
+        # run multiple times here
+        for n in range(10):
+            for s, i in self.d.items():
+                i_rec = self.encode_decode(i)
+                assert_categorical_equal(i, i_rec)
+
+
+class TestNDFrame(TestPackers):
+
+    def setup_method(self, method):
+        super(TestNDFrame, self).setup_method(method)
+
+        data = {
+            'A': [0., 1., 2., 3., np.nan],
+            'B': [0, 1, 0, 1, 0],
+            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
+            'D': date_range('1/1/2009', periods=5),
+            'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
+            'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
+            'G': [Timestamp('20130603', tz='CET')] * 5,
+            'H': Categorical(['a', 'b', 'c', 'd', 'e']),
+            'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
+        }
+
+        self.frame = {
+            'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)),
+            'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)),
+            'mixed': DataFrame(data)}
+
+        with catch_warnings(record=True):
+            self.panel = {
+                'float': Panel(dict(ItemA=self.frame['float'],
+                                    ItemB=self.frame['float'] + 1))}
+
+    def test_basic_frame(self):
+
+        for s, i in self.frame.items():
+            i_rec = self.encode_decode(i)
+            assert_frame_equal(i, i_rec)
+
+    def test_basic_panel(self):
+
+        with catch_warnings(record=True):
+            for s, i in self.panel.items():
+                i_rec = self.encode_decode(i)
+                assert_panel_equal(i, i_rec)
+
+    def test_multi(self):
+
+        i_rec = self.encode_decode(self.frame)
+        for k in self.frame.keys():
+            assert_frame_equal(self.frame[k], i_rec[k])
+
+        l = tuple([self.frame['float'], self.frame['float'].A,
+                   self.frame['float'].B, None])
+        l_rec = self.encode_decode(l)
+        check_arbitrary(l, l_rec)
+
+        # this is an oddity in that packed lists will be returned as tuples
+        l = [self.frame['float'], self.frame['float']
+             .A, self.frame['float'].B, None]
+        l_rec = self.encode_decode(l)
+        assert isinstance(l_rec, tuple)
+        check_arbitrary(l, l_rec)
+
+    def test_iterator(self):
+
+        l = [self.frame['float'], self.frame['float']
+             .A, self.frame['float'].B, None]
+
+        with ensure_clean(self.path) as path:
+            to_msgpack(path, *l)
+            for i, packed in enumerate(read_msgpack(path, iterator=True)):
+                check_arbitrary(packed, l[i])
+
+    def tests_datetimeindex_freq_issue(self):
+
+        # GH 5947
+        # inferring freq on the datetimeindex
+        df = DataFrame([1, 2, 3], index=date_range('1/1/2013', '1/3/2013'))
+        result = self.encode_decode(df)
+        assert_frame_equal(result, df)
+
+        df = DataFrame([1, 2], index=date_range('1/1/2013', '1/2/2013'))
+        result = self.encode_decode(df)
+        assert_frame_equal(result, df)
+
+    def test_dataframe_duplicate_column_names(self):
+
+        # GH 9618
+        expected_1 = DataFrame(columns=['a', 'a'])
+        expected_2 = DataFrame(columns=[1] * 100)
+        expected_2.loc[0] = np.random.randn(100)
+        expected_3 = DataFrame(columns=[1, 1])
+        expected_3.loc[0] = ['abc', np.nan]
+
+        result_1 = self.encode_decode(expected_1)
+        result_2 = self.encode_decode(expected_2)
+        result_3 = self.encode_decode(expected_3)
+
+        assert_frame_equal(result_1, expected_1)
+        assert_frame_equal(result_2, expected_2)
+        assert_frame_equal(result_3, expected_3)
+
+
+class TestSparse(TestPackers):
+
+    def _check_roundtrip(self, obj, comparator, **kwargs):
+
+        # currently these are not implemetned
+        # i_rec = self.encode_decode(obj)
+        # comparator(obj, i_rec, **kwargs)
+        pytest.raises(NotImplementedError, self.encode_decode, obj)
+
+    def test_sparse_series(self):
+
+        s = tm.makeStringSeries()
+        s[3:5] = np.nan
+        ss = s.to_sparse()
+        self._check_roundtrip(ss, tm.assert_series_equal,
+                              check_series_type=True)
+
+        ss2 = s.to_sparse(kind='integer')
+        self._check_roundtrip(ss2, tm.assert_series_equal,
+                              check_series_type=True)
+
+        ss3 = s.to_sparse(fill_value=0)
+        self._check_roundtrip(ss3, tm.assert_series_equal,
+                              check_series_type=True)
+
+    def test_sparse_frame(self):
+
+        s = tm.makeDataFrame()
+        s.loc[3:5, 1:3] = np.nan
+        s.loc[8:10, -2] = np.nan
+        ss = s.to_sparse()
+
+        self._check_roundtrip(ss, tm.assert_frame_equal,
+                              check_frame_type=True)
+
+        ss2 = s.to_sparse(kind='integer')
+        self._check_roundtrip(ss2, tm.assert_frame_equal,
+                              check_frame_type=True)
+
+        ss3 = s.to_sparse(fill_value=0)
+        self._check_roundtrip(ss3, tm.assert_frame_equal,
+                              check_frame_type=True)
+
+
+class TestCompression(TestPackers):
+    """See https://github.com/pandas-dev/pandas/pull/9783
+    """
+
+    def setup_method(self, method):
+        try:
+            from sqlalchemy import create_engine
+            self._create_sql_engine = create_engine
+        except ImportError:
+            self._SQLALCHEMY_INSTALLED = False
+        else:
+            self._SQLALCHEMY_INSTALLED = True
+
+        super(TestCompression, self).setup_method(method)
+        data = {
+            'A': np.arange(1000, dtype=np.float64),
+            'B': np.arange(1000, dtype=np.int32),
+            'C': list(100 * 'abcdefghij'),
+            'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
+            'E': [datetime.timedelta(days=x) for x in range(1000)],
+        }
+        self.frame = {
+            'float': DataFrame({k: data[k] for k in ['A', 'A']}),
+            'int': DataFrame({k: data[k] for k in ['B', 'B']}),
+            'mixed': DataFrame(data),
+        }
+
+    def test_plain(self):
+        i_rec = self.encode_decode(self.frame)
+        for k in self.frame.keys():
+            assert_frame_equal(self.frame[k], i_rec[k])
+
+    def _test_compression(self, compress):
+        i_rec = self.encode_decode(self.frame, compress=compress)
+        for k in self.frame.keys():
+            value = i_rec[k]
+            expected = self.frame[k]
+            assert_frame_equal(value, expected)
+            # make sure that we can write to the new frames
+            for block in value._data.blocks:
+                assert block.values.flags.writeable
+
+    def test_compression_zlib(self):
+        """Run the compression round-trip check with zlib, if available."""
+        if not _ZLIB_INSTALLED:
+            pytest.skip('no zlib')
+        self._test_compression('zlib')
+
+    def test_compression_blosc(self):
+        """Run the compression round-trip check with blosc, if available."""
+        if not _BLOSC_INSTALLED:
+            pytest.skip('no blosc')
+        self._test_compression('blosc')
+
+    def _test_compression_warns_when_decompress_caches(self, compress):
+        """Check the behaviour when ``decompress`` keeps its result alive.
+
+        The ``decompress`` function of the module named by ``compress`` is
+        replaced by a wrapper that holds on to every result. Decoding must
+        then produce a ``PerformanceWarning``, still hand back writeable
+        data, and leave the buffers held by the wrapper untouched when the
+        decoded frames are mutated.
+
+        Parameters
+        ----------
+        compress : str
+            Name of the compression module, looked up in this module's
+            globals (the callers below pass 'zlib' or 'blosc').
+        """
+        not_garbage = []
+        control = []  # copied data
+
+        # the compression module is fetched by name from the module globals
+        compress_module = globals()[compress]
+        real_decompress = compress_module.decompress
+
+        def decompress(ob):
+            """mock decompress function that delegates to the real
+            decompress but caches the result and a copy of the result.
+            """
+            res = real_decompress(ob)
+            not_garbage.append(res)  # hold a reference to this bytes object
+            control.append(bytearray(res))  # copy the data here to check later
+            return res
+
+        # types mapped to values to add in place.
+        rhs = {
+            np.dtype('float64'): 1.0,
+            np.dtype('int32'): 1,
+            np.dtype('object'): 'a',
+            np.dtype('datetime64[ns]'): np.timedelta64(1, 'ns'),
+            np.dtype('timedelta64[ns]'): np.timedelta64(1, 'ns'),
+        }
+
+        # NOTE(review): ``patch`` is defined outside this block; presumably it
+        # swaps ``compress_module.decompress`` for the duration of the
+        # ``with`` body -- confirm against its definition.
+        with patch(compress_module, 'decompress', decompress), \
+                tm.assert_produces_warning(PerformanceWarning) as ws:
+
+            i_rec = self.encode_decode(self.frame, compress=compress)
+            for k in self.frame.keys():
+
+                value = i_rec[k]
+                expected = self.frame[k]
+                assert_frame_equal(value, expected)
+                # make sure that we can write to the new frames even though
+                # we needed to copy the data
+                for block in value._data.blocks:
+                    assert block.values.flags.writeable
+                    # mutate the data in some way
+                    block.values[0] += rhs[block.dtype]
+
+        for w in ws:
+            # check the messages from our warnings
+            assert str(w.message) == ('copying data after decompressing; '
+                                      'this may mean that decompress is '
+                                      'caching its result')
+
+        for buf, control_buf in zip(not_garbage, control):
+            # make sure none of our mutations above affected the
+            # original buffers
+            assert buf == control_buf
+
+    def test_compression_warns_when_decompress_caches_zlib(self):
+        """Run the caching-decompress warning check with zlib, if available."""
+        if not _ZLIB_INSTALLED:
+            pytest.skip('no zlib')
+        self._test_compression_warns_when_decompress_caches('zlib')
+
+    def test_compression_warns_when_decompress_caches_blosc(self):
+        """Run the caching-decompress warning check with blosc, if available."""
+        if not _BLOSC_INSTALLED:
+            pytest.skip('no blosc')
+        self._test_compression_warns_when_decompress_caches('blosc')
+
+    def _test_small_strings_no_warn(self, compress):
+        """Check that empty and one-byte arrays decode without a warning.
+
+        Both a zero-length and a single-element ``uint8`` array are packed
+        with the ``compress`` codec; the results must equal the inputs, be
+        writeable, and writing to them must not alter ``b'a'`` itself.
+        """
+        empty = np.array([], dtype='uint8')
+        with tm.assert_produces_warning(None):
+            empty_unpacked = self.encode_decode(empty, compress=compress)
+
+        tm.assert_numpy_array_equal(empty_unpacked, empty)
+        assert empty_unpacked.flags.writeable
+
+        char = np.array([ord(b'a')], dtype='uint8')
+        with tm.assert_produces_warning(None):
+            char_unpacked = self.encode_decode(char, compress=compress)
+
+        tm.assert_numpy_array_equal(char_unpacked, char)
+        assert char_unpacked.flags.writeable
+        # if this test fails I am sorry because the interpreter is now in a
+        # bad state where b'a' points to 98 == ord(b'b').
+        char_unpacked[0] = ord(b'b')
+
+        # we compare the ord of bytes b'a' with unicode u'a' because they
+        # should always be the same (unless we were able to mutate the shared
+        # character singleton in which case ord(b'a') == ord(b'b')).
+        assert ord(b'a') == ord(u'a')
+        tm.assert_numpy_array_equal(
+            char_unpacked,
+            np.array([ord(b'b')], dtype='uint8'),
+        )
+
+    def test_small_strings_no_warn_zlib(self):
+        """Run the small-array no-warning check with zlib, if available."""
+        if not _ZLIB_INSTALLED:
+            pytest.skip('no zlib')
+        self._test_small_strings_no_warn('zlib')
+
+    def test_small_strings_no_warn_blosc(self):
+        """Run the small-array no-warning check with blosc, if available."""
+        if not _BLOSC_INSTALLED:
+            pytest.skip('no blosc')
+        self._test_small_strings_no_warn('blosc')
+
+    def test_readonly_axis_blosc(self):
+        """Membership tests must work on blosc-decoded Series (GH11880).
+
+        Covers both the default integer index and a float index.
+        """
+        # GH11880
+        if not _BLOSC_INSTALLED:
+            pytest.skip('no blosc')
+        df1 = DataFrame({'A': list('abcd')})
+        df2 = DataFrame(df1, index=[1., 2., 3., 4.])
+        assert 1 in self.encode_decode(df1['A'], compress='blosc')
+        assert 1. in self.encode_decode(df2['A'], compress='blosc')
+
+    def test_readonly_axis_zlib(self):
+        # GH11880
+        df1 = DataFrame({'A': list('abcd')})
+        df2 = DataFrame(df1, index=[1., 2., 3., 4.])
+        assert 1 in self.encode_decode(df1['A'], compress='zlib')
+        assert 1. in self.encode_decode(df2['A'], compress='zlib')
+
+    def test_readonly_axis_blosc_to_sql(self):
+        # GH11880
+        if not _BLOSC_INSTALLED:
+            pytest.skip('no blosc')
+        if not self._SQLALCHEMY_INSTALLED:
+            pytest.skip('no sqlalchemy')
+        expected = DataFrame({'A': list('abcd')})
+        df = self.encode_decode(expected, compress='blosc')
+        eng = self._create_sql_engine("sqlite:///:memory:")
+        df.to_sql('test', eng, if_exists='append')
+        result = pandas.read_sql_table('test', eng, index_col='index')
+        result.index.names = [None]
+        assert_frame_equal(expected, result)
+
+    def test_readonly_axis_zlib_to_sql(self):
+        # GH11880
+        if not _ZLIB_INSTALLED:
+            pytest.skip('no zlib')
+        if not self._SQLALCHEMY_INSTALLED:
+            pytest.skip('no sqlalchemy')
+        expected = DataFrame({'A': list('abcd')})
+        df = self.encode_decode(expected, compress='zlib')
+        eng = self._create_sql_engine("sqlite:///:memory:")
+        df.to_sql('test', eng, if_exists='append')
+        result = pandas.read_sql_table('test', eng, index_col='index')
+        result.index.names = [None]
+        assert_frame_equal(expected, result)
+
+
+class TestEncoding(TestPackers):
+
+    def setup_method(self, method):
+        super(TestEncoding, self).setup_method(method)
+        data = {
+            'A': [compat.u('\u2019')] * 1000,
+            'B': np.arange(1000, dtype=np.int32),
+            'C': list(100 * 'abcdefghij'),
+            'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
+            'E': [datetime.timedelta(days=x) for x in range(1000)],
+            'G': [400] * 1000
+        }
+        self.frame = {
+            'float': DataFrame({k: data[k] for k in ['A', 'A']}),
+            'int': DataFrame({k: data[k] for k in ['B', 'B']}),
+            'mixed': DataFrame(data),
+        }
+        self.utf_encodings = ['utf8', 'utf16', 'utf32']
+
+    def test_utf(self):
+        # GH10581
+        for encoding in self.utf_encodings:
+            for frame in compat.itervalues(self.frame):
+                result = self.encode_decode(frame, encoding=encoding)
+                assert_frame_equal(result, frame)
+
+    def test_default_encoding(self):
+        for frame in compat.itervalues(self.frame):
+            result = frame.to_msgpack()
+            expected = frame.to_msgpack(encoding='utf8')
+            assert result == expected
+            result = self.encode_decode(frame)
+            assert_frame_equal(result, frame)
+
+
+# Legacy msgpack files found next to this module, matching
+# data/legacy_msgpack/<subdirectory>/*.msgpack; used to parametrize
+# the ``legacy_packer`` fixture.
+files = glob.glob(os.path.join(os.path.dirname(__file__), "data",
+                               "legacy_msgpack", "*", "*.msgpack"))
+
+
+@pytest.fixture(params=files)
+def legacy_packer(request, datapath):
+    """Return ``datapath`` applied to one entry of ``files`` per test run."""
+    return datapath(request.param)
+
+
+class TestMsgpack(object):
+    """
+    How to add msgpack tests:
+
+    1. Install pandas version intended to output the msgpack.
+
+    2. Execute "generate_legacy_storage_files.py" to create the msgpack.
+    $ python generate_legacy_storage_files.py <output_dir> msgpack
+
+    3. Move the created msgpack to "data/legacy_msgpack/<version>" directory.
+    """
+
+    # Keys that every unpacked legacy file must provide: top-level type name
+    # mapped to the entries required under it (see ``check_min_structure``).
+    minimum_structure = {'series': ['float', 'int', 'mixed',
+                                    'ts', 'mi', 'dup'],
+                         'frame': ['float', 'int', 'mixed', 'mi'],
+                         'panel': ['float'],
+                         'index': ['int', 'date', 'period'],
+                         'mi': ['reg2']}
+
+    def check_min_structure(self, data, version):
+        for typ, v in self.minimum_structure.items():
+            assert typ in data, '"{0}" not found in unpacked data'.format(typ)
+            for kind in v:
+                msg = '"{0}" not found in data["{1}"]'.format(kind, typ)
+                assert kind in data[typ], msg
+
+    def compare(self, current_data, all_data, vf, version):
+        # GH12277 encoding default used to be latin-1, now utf-8
+        if LooseVersion(version) < LooseVersion('0.18.0'):
+            data = read_msgpack(vf, encoding='latin-1')
+        else:
+            data = read_msgpack(vf)
+        self.check_min_structure(data, version)
+        for typ, dv in data.items():
+            assert typ in all_data, ('unpacked data contains '
+                                     'extra key "{0}"'
+                                     .format(typ))
+            for dt, result in dv.items():
+                assert dt in current_data[typ], ('data["{0}"] contains extra '
+                                                 'key "{1}"'.format(typ, dt))
+                try:
+                    expected = current_data[typ][dt]
+                except KeyError:
+                    continue
+
+                # use a specific comparator
+                # if available
+                comp_method = "compare_{typ}_{dt}".format(typ=typ, dt=dt)
+                comparator = getattr(self, comp_method, None)
+                if comparator is not None:
+                    comparator(result, expected, typ, version)
+                else:
+                    check_arbitrary(result, expected)
+
+        return data
+
+    def compare_series_dt_tz(self, result, expected, typ, version):
+        # 8260
+        # dtype is object < 0.17.0
+        if LooseVersion(version) < LooseVersion('0.17.0'):
+            expected = expected.astype(object)
+            tm.assert_series_equal(result, expected)
+        else:
+            tm.assert_series_equal(result, expected)
+
+    def compare_frame_dt_mixed_tzs(self, result, expected, typ, version):
+        # 8260
+        # dtype is object < 0.17.0
+        if LooseVersion(version) < LooseVersion('0.17.0'):
+            expected = expected.astype(object)
+            tm.assert_frame_equal(result, expected)
+        else:
+            tm.assert_frame_equal(result, expected)
+
+    def test_msgpacks_legacy(self, current_packers_data, all_packers_data,
+                             legacy_packer, datapath):
+
+        version = os.path.basename(os.path.dirname(legacy_packer))
+
+        # GH12142 0.17 files packed in P2 can't be read in P3
+        if (compat.PY3 and version.startswith('0.17.') and
+                legacy_packer.split('.')[-4][-1] == '2'):
+            msg = "Files packed in Py2 can't be read in Py3 ({})"
+            pytest.skip(msg.format(version))
+        try:
+            with catch_warnings(record=True):
+                self.compare(current_packers_data, all_packers_data,
+                             legacy_packer, version)
+        except ImportError:
+            # blosc not installed
+            pass
@@ -0,0 +1,504 @@
+""" test parquet compat """
+
+import pytest
+import datetime
+from distutils.version import LooseVersion
+from warnings import catch_warnings
+
+import numpy as np
+import pandas as pd
+from pandas.compat import PY3, is_platform_windows, is_platform_mac
+from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
+                               PyArrowImpl, FastParquetImpl)
+from pandas.util import testing as tm
+
+try:
+    import pyarrow  # noqa
+    _HAVE_PYARROW = True
+except ImportError:
+    _HAVE_PYARROW = False
+
+try:
+    import fastparquet  # noqa
+    _HAVE_FASTPARQUET = True
+except ImportError:
+    _HAVE_FASTPARQUET = False
+
+
+# setup engines & skips
+@pytest.fixture(params=[
+    pytest.param('fastparquet',
+                 marks=pytest.mark.skipif(not _HAVE_FASTPARQUET,
+                                          reason='fastparquet is '
+                                                 'not installed')),
+    pytest.param('pyarrow',
+                 marks=pytest.mark.skipif(not _HAVE_PYARROW,
+                                          reason='pyarrow is '
+                                                 'not installed'))])
+def engine(request):
+    """Engine name, parametrized over 'fastparquet' and 'pyarrow'.
+
+    Each parameter carries a skipif mark for when its library is missing.
+    """
+    return request.param
+
+
+@pytest.fixture
+def pa():
+    """Return the engine name 'pyarrow'; skip if pyarrow is not installed."""
+    if not _HAVE_PYARROW:
+        pytest.skip("pyarrow is not installed")
+    return 'pyarrow'
+
+
+@pytest.fixture
+def pa_lt_070():
+    if not _HAVE_PYARROW:
+        pytest.skip("pyarrow is not installed")
+    if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
+        pytest.skip("pyarrow is >= 0.7.0")
+    return 'pyarrow'
+
+
+@pytest.fixture
+def pa_ge_070():
+    if not _HAVE_PYARROW:
+        pytest.skip("pyarrow is not installed")
+    if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
+        pytest.skip("pyarrow is < 0.7.0")
+    return 'pyarrow'
+
+
+@pytest.fixture
+def fp():
+    """Return the engine name 'fastparquet'; skip if it is not installed."""
+    if not _HAVE_FASTPARQUET:
+        pytest.skip("fastparquet is not installed")
+    return 'fastparquet'
+
+
+@pytest.fixture
+def fp_lt_014():
+    if not _HAVE_FASTPARQUET:
+        pytest.skip("fastparquet is not installed")
+    if LooseVersion(fastparquet.__version__) >= LooseVersion('0.1.4'):
+        pytest.skip("fastparquet is >= 0.1.4")
+    return 'fastparquet'
+
+
+@pytest.fixture
+def df_compat():
+    """Small frame with an integer column 'A' and a constant string 'B'."""
+    return pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'})
+
+
+@pytest.fixture
+def df_cross_compat():
+    df = pd.DataFrame({'a': list('abc'),
+                       'b': list(range(1, 4)),
+                       # 'c': np.arange(3, 6).astype('u1'),
+                       'd': np.arange(4.0, 7.0, dtype='float64'),
+                       'e': [True, False, True],
+                       'f': pd.date_range('20130101', periods=3),
+                       # 'g': pd.date_range('20130101', periods=3,
+                       #                    tz='US/Eastern'),
+                       # 'h': pd.date_range('20130101', periods=3, freq='ns')
+                       })
+    return df
+
+
+@pytest.fixture
+def df_full():
+    return pd.DataFrame(
+        {'string': list('abc'),
+         'string_with_nan': ['a', np.nan, 'c'],
+         'string_with_none': ['a', None, 'c'],
+         'bytes': [b'foo', b'bar', b'baz'],
+         'unicode': [u'foo', u'bar', u'baz'],
+         'int': list(range(1, 4)),
+         'uint': np.arange(3, 6).astype('u1'),
+         'float': np.arange(4.0, 7.0, dtype='float64'),
+         'float_with_nan': [2., np.nan, 3.],
+         'bool': [True, False, True],
+         'datetime': pd.date_range('20130101', periods=3),
+         'datetime_with_nat': [pd.Timestamp('20130101'),
+                               pd.NaT,
+                               pd.Timestamp('20130103')]})
+
+
+def check_round_trip(df, engine=None, path=None,
+                     write_kwargs=None, read_kwargs=None,
+                     expected=None, check_names=True,
+                     repeat=2):
+    """Verify parquet serializer and deserializer produce the same results.
+
+    Performs a pandas to disk and disk to pandas round trip,
+    then compares the 2 resulting DataFrames to verify equality.
+
+    Parameters
+    ----------
+    df: Dataframe
+    engine: str, optional
+        'pyarrow' or 'fastparquet'
+    path: str, optional
+    write_kwargs: dict of str:str, optional
+    read_kwargs: dict of str:str, optional
+    expected: DataFrame, optional
+        Expected deserialization result, otherwise will be equal to `df`
+    check_names: list of str, optional
+        Closed set of column names to be compared
+    repeat: int, optional
+        How many times to repeat the test
+    """
+
+    write_kwargs = write_kwargs or {'compression': None}
+    read_kwargs = read_kwargs or {}
+
+    if expected is None:
+        expected = df
+
+    if engine:
+        write_kwargs['engine'] = engine
+        read_kwargs['engine'] = engine
+
+    def compare(repeat):
+        for _ in range(repeat):
+            df.to_parquet(path, **write_kwargs)
+            with catch_warnings(record=True):
+                actual = read_parquet(path, **read_kwargs)
+            tm.assert_frame_equal(expected, actual,
+                                  check_names=check_names)
+
+    if path is None:
+        with tm.ensure_clean() as path:
+            compare(repeat)
+    else:
+        compare(repeat)
+
+
+def test_invalid_engine(df_compat):
+    """An unknown engine name ('foo') must raise ``ValueError``."""
+    with pytest.raises(ValueError):
+        check_round_trip(df_compat, 'foo', 'bar')
+
+
+def test_options_py(df_compat, pa):
+    """Round trip with the engine taken from the option set to 'pyarrow'."""
+    # use the set option
+
+    with pd.option_context('io.parquet.engine', 'pyarrow'):
+        check_round_trip(df_compat)
+
+
+def test_options_fp(df_compat, fp):
+    """Round trip with the engine taken from the option set to 'fastparquet'."""
+    # use the set option
+
+    with pd.option_context('io.parquet.engine', 'fastparquet'):
+        check_round_trip(df_compat)
+
+
+def test_options_auto(df_compat, fp, pa):
+    """Round trip with the engine option set to 'auto'."""
+    # use the set option
+
+    with pd.option_context('io.parquet.engine', 'auto'):
+        check_round_trip(df_compat)
+
+
+def test_options_get_engine(fp, pa):
+    assert isinstance(get_engine('pyarrow'), PyArrowImpl)
+    assert isinstance(get_engine('fastparquet'), FastParquetImpl)
+
+    with pd.option_context('io.parquet.engine', 'pyarrow'):
+        assert isinstance(get_engine('auto'), PyArrowImpl)
+        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
+        assert isinstance(get_engine('fastparquet'), FastParquetImpl)
+
+    with pd.option_context('io.parquet.engine', 'fastparquet'):
+        assert isinstance(get_engine('auto'), FastParquetImpl)
+        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
+        assert isinstance(get_engine('fastparquet'), FastParquetImpl)
+
+    with pd.option_context('io.parquet.engine', 'auto'):
+        assert isinstance(get_engine('auto'), PyArrowImpl)
+        assert isinstance(get_engine('pyarrow'), PyArrowImpl)
+        assert isinstance(get_engine('fastparquet'), FastParquetImpl)
+
+
+@pytest.mark.xfail(is_platform_windows() or is_platform_mac(),
+                   reason="reading pa metadata failing on Windows/mac")
+def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
+    # cross-compat with differing reading/writing engines
+
+    df = df_cross_compat
+    with tm.ensure_clean() as path:
+        df.to_parquet(path, engine=pa, compression=None)
+
+        result = read_parquet(path, engine=fp)
+        tm.assert_frame_equal(result, df)
+
+        result = read_parquet(path, engine=fp, columns=['a', 'd'])
+        tm.assert_frame_equal(result, df[['a', 'd']])
+
+
+def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
+    """Write with the ``fp`` engine, then read back with the ``pa`` engine.
+
+    Reads happen inside ``catch_warnings(record=True)`` so warnings emitted
+    while reading are recorded instead of being shown.
+    """
+    frame = df_cross_compat
+    with tm.ensure_clean() as path:
+        frame.to_parquet(path, engine=fp, compression=None)
+
+        with catch_warnings(record=True):
+            # full read
+            everything = read_parquet(path, engine=pa)
+            tm.assert_frame_equal(everything, frame)
+
+            # column subset
+            subset = read_parquet(path, engine=pa, columns=['a', 'd'])
+            tm.assert_frame_equal(subset, frame[['a', 'd']])
+
+
+class Base(object):
+    """Helper methods shared by the parquet test classes."""
+
+    def check_error_on_write(self, df, engine, exc):
+        """Assert that writing ``df`` with ``engine`` raises ``exc``.
+
+        The write goes to a temporary path obtained from
+        ``tm.ensure_clean`` and uses ``compression=None``.
+        """
+        with tm.ensure_clean() as path, pytest.raises(exc):
+            to_parquet(df, path, engine, compression=None)
+
+
+class TestBasic(Base):
+    """Round-trip and write-error checks driven by engine fixtures."""
+
+    def test_error(self, engine):
+        """Writing any of these non-DataFrame objects raises ValueError."""
+        not_frames = [pd.Series([1, 2, 3]), 1, 'foo',
+                      pd.Timestamp('20130101'), np.array([1, 2, 3])]
+        for obj in not_frames:
+            self.check_error_on_write(obj, engine, ValueError)
+
+    def test_columns_dtypes(self, engine):
+        """Unicode column labels round-trip."""
+        frame = pd.DataFrame({'string': list('abc'),
+                              'int': list(range(1, 4))})
+
+        # unicode
+        frame.columns = [u'foo', u'bar']
+        check_round_trip(frame, engine)
+
+    def test_columns_dtypes_invalid(self, engine):
+        """Numeric, bytes (PY3 only) and datetime labels raise ValueError."""
+        frame = pd.DataFrame({'string': list('abc'),
+                              'int': list(range(1, 4))})
+
+        # numeric
+        rejected_labels = [[0, 1]]
+        if PY3:
+            # bytes on PY3, on PY2 these are str
+            rejected_labels.append([b'foo', b'bar'])
+        # python object
+        rejected_labels.append([datetime.datetime(2011, 1, 1, 0, 0),
+                                datetime.datetime(2011, 1, 1, 1, 1)])
+
+        for labels in rejected_labels:
+            frame.columns = labels
+            self.check_error_on_write(frame, engine, ValueError)
+
+    @pytest.mark.parametrize('compression', [None, 'gzip', 'snappy', 'brotli'])
+    def test_compression(self, engine, compression):
+        """Round-trip a small frame with each compression setting."""
+        # snappy and brotli are importable under their own names; skip the
+        # case when the corresponding module is missing
+        if compression in ('snappy', 'brotli'):
+            pytest.importorskip(compression)
+
+        frame = pd.DataFrame({'A': [1, 2, 3]})
+        check_round_trip(frame, engine,
+                         write_kwargs={'compression': compression})
+
+    def test_read_columns(self, engine):
+        """Reading with ``columns=['string']`` yields only that column."""
+        # GH18154
+        written = pd.DataFrame({'string': list('abc'),
+                                'int': list(range(1, 4))})
+        only_string = pd.DataFrame({'string': list('abc')})
+
+        check_round_trip(written, engine, expected=only_string,
+                         read_kwargs={'columns': ['string']})
+
+    def test_write_index(self, engine):
+        """Round-trip frames carrying default, custom and named indexes."""
+        if engine == 'pyarrow':
+            import pyarrow
+            if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
+                pytest.skip("pyarrow is < 0.7.0")
+
+        # names are not checked when the engine is 'fastparquet'
+        check_names = engine != 'fastparquet'
+
+        frame = pd.DataFrame({'A': [1, 2, 3]})
+        check_round_trip(frame, engine)
+
+        # non-default index
+        custom_indexes = (
+            [2, 3, 4],
+            pd.date_range('20130101', periods=3),
+            list('abc'),
+            [1, 3, 4],
+        )
+        for index in custom_indexes:
+            frame.index = index
+            check_round_trip(frame, engine, check_names=check_names)
+
+        # index with meta-data
+        frame.index = [0, 1, 2]
+        frame.index.name = 'foo'
+        check_round_trip(frame, engine)
+
+    def test_write_multiindex(self, pa_ge_070):
+        """Round-trip a frame whose row index is a MultiIndex."""
+        # Not supported in fastparquet as of 0.1.3 or older pyarrow version
+        engine = pa_ge_070
+
+        frame = pd.DataFrame({'A': [1, 2, 3]})
+        frame.index = pd.MultiIndex.from_tuples(
+            [('a', 1), ('a', 2), ('b', 1)])
+        check_round_trip(frame, engine)
+
+    def test_write_column_multiindex(self, engine):
+        """A MultiIndex on the columns raises ValueError on write."""
+        # column multi-index
+        column_index = pd.MultiIndex.from_tuples(
+            [('a', 1), ('a', 2), ('b', 1)])
+        frame = pd.DataFrame(np.random.randn(4, 3), columns=column_index)
+        self.check_error_on_write(frame, engine, ValueError)
+
+    def test_multiindex_with_columns(self, pa_ge_070):
+        """Round-trip a MultiIndex-ed frame, fully and with a column subset."""
+        engine = pa_ge_070
+        dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
+        frame = pd.DataFrame(np.random.randn(2 * len(dates), 3),
+                             columns=list('ABC'))
+        product_index = pd.MultiIndex.from_product(
+            [['Level1', 'Level2'], dates],
+            names=['level', 'date'])
+        copied_index = product_index.copy(names=None)
+
+        for index in (product_index, copied_index):
+            frame.index = index
+
+            check_round_trip(frame, engine)
+            check_round_trip(frame, engine,
+                             read_kwargs={'columns': ['A', 'B']},
+                             expected=frame[['A', 'B']])
+
+
+class TestParquetPyArrow(Base):
+    """Checks that use the ``pa`` family of engine fixtures."""
+
+    def test_basic(self, pa, df_full):
+        """Round-trip ``df_full`` plus extra columns added here."""
+        import pyarrow
+
+        frame = df_full
+
+        # additional supported types for pyarrow
+        if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
+            frame['datetime_tz'] = pd.date_range('20130101', periods=3,
+                                                 tz='Europe/Brussels')
+        frame['bool_with_none'] = [True, None, True]
+
+        check_round_trip(frame, pa)
+
+    @pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)")
+    def test_basic_subset_columns(self, pa, df_full):
+        """Read back only 'string' and 'int' from a frame with a tz column."""
+        # GH18628
+        frame = df_full
+
+        # additional supported types for pyarrow
+        frame['datetime_tz'] = pd.date_range('20130101', periods=3,
+                                             tz='Europe/Brussels')
+
+        wanted = frame[['string', 'int']]
+        check_round_trip(frame, pa, expected=wanted,
+                         read_kwargs={'columns': ['string', 'int']})
+
+    def test_duplicate_columns(self, pa):
+        """Duplicate column labels raise ValueError on write."""
+        # not currently able to handle duplicate columns
+        values = np.arange(12).reshape(4, 3)
+        frame = pd.DataFrame(values, columns=list('aaa')).copy()
+        self.check_error_on_write(frame, pa, ValueError)
+
+    def test_unsupported(self, pa):
+        """Period, timedelta and mixed-object columns fail to write."""
+        unsupported = [
+            # period
+            (pd.DataFrame({'a': pd.period_range('2013', freq='M',
+                                                periods=3)}),
+             ValueError),
+            # timedelta
+            (pd.DataFrame({'a': pd.timedelta_range('1 day', periods=3)}),
+             NotImplementedError),
+            # mixed python objects
+            (pd.DataFrame({'a': ['a', 1, 2.0]}),
+             ValueError),
+        ]
+        for frame, exc in unsupported:
+            self.check_error_on_write(frame, pa, exc)
+
+    def test_categorical(self, pa_ge_070):
+        """A categorical column is expected back as object dtype."""
+        pa = pa_ge_070
+
+        # supported in >= 0.7.0
+        frame = pd.DataFrame({'a': pd.Categorical(list('abc'))})
+
+        # de-serialized as object
+        as_object = frame.assign(a=frame.a.astype(object))
+        check_round_trip(frame, pa, expected=as_object)
+
+    def test_categorical_unsupported(self, pa_lt_070):
+        """With ``pa_lt_070``, categoricals raise NotImplementedError."""
+        pa = pa_lt_070
+
+        # supported in >= 0.7.0
+        frame = pd.DataFrame({'a': pd.Categorical(list('abc'))})
+        self.check_error_on_write(frame, pa, NotImplementedError)
+
+    def test_s3_roundtrip(self, df_compat, s3_resource, pa):
+        """Round-trip ``df_compat`` through an ``s3://`` path."""
+        # GH #19134
+        s3_path = 's3://pandas-test/pyarrow.parquet'
+        check_round_trip(df_compat, pa, path=s3_path)
+
+
+class TestParquetFastParquet(Base):
+    """Checks that use the ``fp`` family of engine fixtures."""
+
+    def test_basic(self, fp, df_full):
+        """Round-trip ``df_full`` plus extra columns added here."""
+        frame = df_full
+
+        # additional supported types for fastparquet
+        if LooseVersion(fastparquet.__version__) >= LooseVersion('0.1.4'):
+            frame['datetime_tz'] = pd.date_range('20130101', periods=3,
+                                                 tz='US/Eastern')
+        frame['timedelta'] = pd.timedelta_range('1 day', periods=3)
+        check_round_trip(frame, fp)
+
+    @pytest.mark.skip(reason="not supported")
+    def test_duplicate_columns(self, fp):
+        """Duplicate column labels raise ValueError (currently skipped)."""
+        # not currently able to handle duplicate columns
+        values = np.arange(12).reshape(4, 3)
+        frame = pd.DataFrame(values, columns=list('aaa')).copy()
+        self.check_error_on_write(frame, fp, ValueError)
+
+    def test_bool_with_none(self, fp):
+        """Booleans mixed with None are expected back as float16."""
+        frame = pd.DataFrame({'a': [True, None, False]})
+        as_float = pd.DataFrame({'a': [1.0, np.nan, 0.0]}, dtype='float16')
+        check_round_trip(frame, fp, expected=as_float)
+
+    def test_unsupported(self, fp):
+        """Period and mixed-object columns raise ValueError on write."""
+        unsupported = [
+            # period
+            pd.DataFrame({'a': pd.period_range('2013', freq='M',
+                                               periods=3)}),
+            # mixed
+            pd.DataFrame({'a': ['a', 1, 2.0]}),
+        ]
+        for frame in unsupported:
+            self.check_error_on_write(frame, fp, ValueError)
+
+    def test_categorical(self, fp):
+        """Round-trip a categorical column (needs fastparquet >= 0.1.3)."""
+        too_old = (LooseVersion(fastparquet.__version__) <
+                   LooseVersion("0.1.3"))
+        if too_old:
+            pytest.skip("CategoricalDtype not supported for older fp")
+
+        frame = pd.DataFrame({'a': pd.Categorical(list('abc'))})
+        check_round_trip(frame, fp)
+
+    def test_datetime_tz(self, fp_lt_014):
+        """With ``fp_lt_014`` a tz-aware column comes back tz-naive."""
+        # fastparquet<0.1.4 doesn't preserve tz
+        tz_aware = pd.date_range('20130101', periods=3, tz='US/Eastern')
+        frame = pd.DataFrame({'a': tz_aware})
+
+        # warns on the coercion
+        with catch_warnings(record=True):
+            check_round_trip(frame, fp_lt_014,
+                             expected=frame.astype('datetime64[ns]'))
+
+    def test_filter_row_groups(self, fp):
+        """A ``filters`` read on one-row row groups returns a single row."""
+        frame = pd.DataFrame({'a': list(range(0, 3))})
+        with tm.ensure_clean() as path:
+            frame.to_parquet(path, fp, compression=None,
+                             row_group_offsets=1)
+            filtered = read_parquet(path, fp, filters=[('a', '==', 0)])
+        assert len(filtered) == 1
+
+    def test_s3_roundtrip(self, df_compat, s3_resource, fp):
+        """Round-trip ``df_compat`` through an ``s3://`` path."""
+        # GH #19134
+        s3_path = 's3://pandas-test/fastparquet.parquet'
+        check_round_trip(df_compat, fp, path=s3_path)
@@ -0,0 +1,480 @@
+# pylint: disable=E1101,E1103,W0232
+
+"""
+manage legacy pickle tests
+
+How to add pickle tests:
+
+1. Install pandas version intended to output the pickle.
+
+2. Execute "generate_legacy_storage_files.py" to create the pickle.
+$ python generate_legacy_storage_files.py <output_dir> pickle
+
+3. Move the created pickle to "data/legacy_pickle/<version>" directory.
+"""
+import glob
+import pytest
+from warnings import catch_warnings
+
+import os
+from distutils.version import LooseVersion
+import pandas as pd
+from pandas import Index
+from pandas.compat import is_platform_little_endian, PY3
+import pandas
+import pandas.util.testing as tm
+import pandas.util._test_decorators as td
+from pandas.tseries.offsets import Day, MonthEnd
+import shutil
+
+
+@pytest.fixture(scope='module')
+def current_pickle_data():
+    """Return the data produced by ``create_pickle_data``.
+
+    Module scoped, so the data is built once per test module.
+    """
+    # our current version pickle data
+    from pandas.tests.io import generate_legacy_storage_files as generator
+    return generator.create_pickle_data()
+
+
+# ---------------------
+# comparison functions
+# ---------------------
+def compare_element(result, expected, typ, version=None):
+    """Assert that ``result`` equals ``expected`` using a check chosen by type.
+
+    Parameters
+    ----------
+    result, expected : object
+        The objects to compare.
+    typ : str
+        Kind of object; selects ``tm.assert_<typ>_equal`` when it exists.
+    version : str, optional
+        Accepted so the signature matches the other comparators; not used.
+    """
+    if isinstance(expected, Index):
+        tm.assert_index_equal(expected, result)
+        return
+
+    if typ.startswith('sp_'):
+        # sparse kinds: the matching assert must exist on ``tm``
+        assert_sparse = getattr(tm, "assert_%s_equal" % typ)
+        assert_sparse(result, expected, exact_indices=False)
+        return
+
+    if typ == 'timestamp':
+        if expected is pd.NaT:
+            assert result is pd.NaT
+            return
+        assert result == expected
+        assert result.freq == expected.freq
+        return
+
+    # anything else: use the dedicated assert, else assert_almost_equal
+    assert_equal = getattr(tm, "assert_%s_equal" % typ,
+                           tm.assert_almost_equal)
+    assert_equal(result, expected)
+
+
+def compare(data, vf, version):
+    """Compare every object stored in a pickle file against ``data``.
+
+    Parameters
+    ----------
+    data : mapping
+        Expected objects, looked up as ``data[typ][dt]``.
+    vf : str
+        Path of the pickle file to load with ``pandas.read_pickle``.
+    version : str
+        Version label of the pickle; forwarded to the comparators.
+
+    Returns
+    -------
+    mapping or None
+        The unpickled content of ``vf``, or None when the file uses a
+        pickle protocol this interpreter does not support.
+    """
+    # py3 compat when reading py2 pickle
+    try:
+        # keep the loaded objects under their own name: rebinding ``data``
+        # here would make the loop below compare the pickle with itself
+        legacy = pandas.read_pickle(vf)
+    except ValueError as e:
+        if 'unsupported pickle protocol:' in str(e):
+            # trying to read a py3 pickle in py2
+            return
+        raise
+
+    m = globals()
+    for typ, dv in legacy.items():
+        for dt, result in dv.items():
+            try:
+                expected = data[typ][dt]
+            except KeyError:
+                if version in ('0.10.1', '0.11.0') and dt == 'reg':
+                    # stop checking this ``typ`` for these two versions
+                    break
+                raise
+
+            # use a specific comparator
+            # if available
+            comparator = m.get(
+                "compare_{typ}_{dt}".format(typ=typ, dt=dt),
+                m['compare_element'])
+            comparator(result, expected, typ, version)
+    return legacy
+
+
+def compare_sp_series_ts(res, exp, typ, version):
+    """Compare sparse series, skipping the series-type check for <= 0.12.0.
+
+    When ``version`` is empty or None the full check is used.
+    """
+    # SparseTimeSeries integrated into SparseSeries in 0.12.0
+    # and deprecated in 0.17.0
+    extra_kwargs = {}
+    if version and LooseVersion(version) <= LooseVersion("0.12.0"):
+        extra_kwargs['check_series_type'] = False
+    tm.assert_sp_series_equal(res, exp, **extra_kwargs)
+
+
+def compare_series_ts(result, expected, typ, version):
+    """Compare time series and exercise the unpickled index frequency."""
+    # GH 7748
+    tm.assert_series_equal(result, expected)
+    assert result.index.freq == expected.index.freq
+    assert not result.index.freq.normalize
+    tm.assert_series_equal(result > 0, expected > 0)
+
+    # GH 9291
+    freq = result.index.freq
+    assert freq + Day(1) == Day(2)
+
+    # adding a Timedelta must give a Timedelta of one day plus that amount
+    for amount in ({'hours': 1}, {'nanoseconds': 1}):
+        total = freq + pandas.Timedelta(**amount)
+        assert isinstance(total, pandas.Timedelta)
+        assert total == pandas.Timedelta(days=1, **amount)
+
+
+def compare_series_dt_tz(result, expected, typ, version):
+    """Compare series, casting ``expected`` to object for versions < 0.17.0."""
+    # 8260
+    # dtype is object < 0.17.0
+    if LooseVersion(version) < LooseVersion('0.17.0'):
+        expected = expected.astype(object)
+    tm.assert_series_equal(result, expected)
+
+
+def compare_series_cat(result, expected, typ, version):
+    """Compare categorical series with checks relaxed for old versions."""
+    # Categorical dtype is added in 0.15.0
+    # ordered is changed in 0.16.0
+    pickled_with = LooseVersion(version)
+    if pickled_with < LooseVersion('0.15.0'):
+        relaxed = {'check_dtype': False, 'check_categorical': False}
+    elif pickled_with < LooseVersion('0.16.0'):
+        relaxed = {'check_categorical': False}
+    else:
+        relaxed = {}
+    tm.assert_series_equal(result, expected, **relaxed)
+
+
+def compare_frame_dt_mixed_tzs(result, expected, typ, version):
+    """Compare frames, casting ``expected`` to object for versions < 0.17.0."""
+    # 8260
+    # dtype is object < 0.17.0
+    if LooseVersion(version) < LooseVersion('0.17.0'):
+        expected = expected.astype(object)
+    tm.assert_frame_equal(result, expected)
+
+
+def compare_frame_cat_onecol(result, expected, typ, version):
+    """Compare categorical frames with checks relaxed for old versions."""
+    # Categorical dtype is added in 0.15.0
+    # ordered is changed in 0.16.0
+    pickled_with = LooseVersion(version)
+    if pickled_with < LooseVersion('0.15.0'):
+        relaxed = {'check_dtype': False, 'check_categorical': False}
+    elif pickled_with < LooseVersion('0.16.0'):
+        relaxed = {'check_categorical': False}
+    else:
+        relaxed = {}
+    tm.assert_frame_equal(result, expected, **relaxed)
+
+
+def compare_frame_cat_and_float(result, expected, typ, version):
+    """Delegate to ``compare_frame_cat_onecol`` with the same arguments."""
+    compare_frame_cat_onecol(result, expected, typ=typ, version=version)
+
+
+def compare_index_period(result, expected, typ, version):
+    """Compare period indexes and check the monthly frequency of ``result``."""
+    tm.assert_index_equal(result, expected)
+
+    # the frequency must be a MonthEnd offset, spelled 'M'
+    assert isinstance(result.freq, MonthEnd)
+    assert result.freq == MonthEnd()
+    assert result.freqstr == 'M'
+
+    # shifting must give the same outcome on both sides
+    shifted_result = result.shift(2)
+    shifted_expected = expected.shift(2)
+    tm.assert_index_equal(shifted_result, shifted_expected)
+
+
+def compare_sp_frame_float(result, expected, typ, version):
+    """Compare sparse frames, relaxing the checks for versions <= 0.18.1."""
+    relaxed = {}
+    if LooseVersion(version) <= LooseVersion('0.18.1'):
+        relaxed = {'exact_indices': False, 'check_dtype': False}
+    tm.assert_sp_frame_equal(result, expected, **relaxed)
+
+
+# every "*.pickle" file found in a sub-directory of data/legacy_pickle
+# next to this module
+_legacy_pickle_pattern = os.path.join(os.path.dirname(__file__), "data",
+                                      "legacy_pickle", "*", "*.pickle")
+files = glob.glob(_legacy_pickle_pattern)
+
+
+@pytest.fixture(params=files)
+def legacy_pickle(request, datapath):
+    """Yield each entry of ``files`` in turn, passed through ``datapath``."""
+    pickle_file = request.param
+    return datapath(pickle_file)
+
+
+# ---------------------
+# tests
+# ---------------------
+def test_pickles(current_pickle_data, legacy_pickle):
+    """Run ``compare`` on one legacy pickle; skipped on big-endian platforms."""
+    if not is_platform_little_endian():
+        pytest.skip("known failure on non-little endian")
+
+    # the directory holding the pickle is named after its version
+    version_dir = os.path.dirname(legacy_pickle)
+    version = os.path.basename(version_dir)
+
+    with catch_warnings(record=True):
+        compare(current_pickle_data, legacy_pickle, version)
+
+
+def test_round_trip_current(current_pickle_data):
+    """Round-trip every current object through each writer/reader pair.
+
+    Writers are ``pd.to_pickle``, ``cPickle`` (only when it can be
+    imported) and ``pickle``; each file is then read back with
+    ``pd.read_pickle`` and with the same optional/stdlib modules, and every
+    result is checked with ``compare_element``.
+    """
+    try:
+        import cPickle as c_pickle
+    except ImportError:
+        # only a missing module is tolerated here; any other error should
+        # surface instead of being swallowed by a bare except
+        c_pickle = None
+
+    import pickle as python_pickle
+
+    def make_pickler(module):
+        # writer using ``module.dump`` with the highest protocol (-1)
+        def pickler(obj, path):
+            with open(path, 'wb') as fh:
+                module.dump(obj, fh, protocol=-1)
+        return pickler
+
+    def make_unpickler(module):
+        # reader using ``module.load``
+        def unpickler(path):
+            with open(path, 'rb') as fh:
+                return module.load(fh)
+        return unpickler
+
+    # same order as before: pandas, cPickle (if present), pickle
+    writers = [pd.to_pickle]
+    readers = [pd.read_pickle]
+    if c_pickle is not None:
+        writers.append(make_pickler(c_pickle))
+        readers.append(make_unpickler(c_pickle))
+    writers.append(make_pickler(python_pickle))
+    readers.append(make_unpickler(python_pickle))
+
+    for typ, dv in current_pickle_data.items():
+        for dt, expected in dv.items():
+            for writer in writers:
+                with tm.ensure_clean() as path:
+                    # test writing with each pickler
+                    writer(expected, path)
+
+                    # test reading with each unpickler
+                    for reader in readers:
+                        result = reader(path)
+                        compare_element(result, expected, typ)
+
+
+def test_pickle_v0_14_1(datapath):
+    """The stored 0.14.1 Categorical pickle loads as the expected object."""
+    # This code was executed once on v0.14.1 to generate the pickle:
+    #
+    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
+    #                   name='foobar')
+    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
+    #
+    pickle_path = datapath('io', 'data', 'categorical_0_14_1.pickle')
+
+    expected = pd.Categorical(values=['a', 'b', 'c'], ordered=False,
+                              categories=['a', 'b', 'c', 'd'])
+    loaded = pd.read_pickle(pickle_path)
+    tm.assert_categorical_equal(expected, loaded)
+
+
+def test_pickle_v0_15_2(datapath):
+    """The stored 0.15.2 Categorical pickle loads as the expected object."""
+    # ordered -> _ordered
+    # GH 9347
+
+    # This code was executed once on v0.15.2 to generate the pickle:
+    #
+    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
+    #                   name='foobar')
+    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
+    #
+    pickle_path = datapath('io', 'data', 'categorical_0_15_2.pickle')
+
+    expected = pd.Categorical(values=['a', 'b', 'c'], ordered=False,
+                              categories=['a', 'b', 'c', 'd'])
+    loaded = pd.read_pickle(pickle_path)
+    tm.assert_categorical_equal(expected, loaded)
+
+
+def test_pickle_path_pathlib():
+    """Round-trip a frame through ``tm.round_trip_pathlib``."""
+    frame = tm.makeDataFrame()
+    roundtripped = tm.round_trip_pathlib(frame.to_pickle, pd.read_pickle)
+    tm.assert_frame_equal(frame, roundtripped)
+
+
+def test_pickle_path_localpath():
+    """Round-trip a frame through ``tm.round_trip_localpath``."""
+    frame = tm.makeDataFrame()
+    roundtripped = tm.round_trip_localpath(frame.to_pickle, pd.read_pickle)
+    tm.assert_frame_equal(frame, roundtripped)
+
+
+# ---------------------
+# test pickle compression
+# ---------------------
+
+@pytest.fixture
+def get_random_path():
+    """Return a file name built around ``tm.rands(10)``."""
+    token = tm.rands(10)
+    return u'__%s__.pickle' % token
+
+
+class TestCompression(object):
+    """Pickle reading and writing with explicit and inferred compression."""
+
+    # compression name -> file extension used by the infer tests
+    _compression_to_extension = {
+        None: ".none",
+        'gzip': '.gz',
+        'bz2': '.bz2',
+        'zip': '.zip',
+        'xz': '.xz',
+    }
+
+    def _compression_for_extension(self, ext):
+        """Return the compression mapped to ``ext``, or None if unmapped."""
+        for name, extension in self._compression_to_extension.items():
+            if extension == ext:
+                return name
+        return None
+
+    def compress_file(self, src_path, dest_path, compression):
+        """Write the content of ``src_path`` to ``dest_path``, compressed.
+
+        Parameters
+        ----------
+        src_path : str
+            File to read.
+        dest_path : str
+            File to create.
+        compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
+            None copies the file unchanged.
+
+        Raises
+        ------
+        ValueError
+            If ``compression`` is not one of the values above.
+        """
+        if compression is None:
+            shutil.copyfile(src_path, dest_path)
+            return
+
+        if compression == 'zip':
+            import zipfile
+            # close the archive explicitly (via ``with``) so it is
+            # finalised before anyone reads it, rather than whenever the
+            # object happens to be garbage collected
+            with zipfile.ZipFile(dest_path, "w",
+                                 compression=zipfile.ZIP_DEFLATED) as archive:
+                archive.write(src_path, os.path.basename(src_path))
+            return
+
+        if compression == 'gzip':
+            import gzip
+            f = gzip.open(dest_path, "w")
+        elif compression == 'bz2':
+            import bz2
+            f = bz2.BZ2File(dest_path, "w")
+        elif compression == 'xz':
+            lzma = pandas.compat.import_lzma()
+            f = lzma.LZMAFile(dest_path, "w")
+        else:
+            msg = 'Unrecognized compression type: {}'.format(compression)
+            raise ValueError(msg)
+
+        # always release the compressed handle, even if the copy fails
+        try:
+            with open(src_path, "rb") as fh:
+                f.write(fh.read())
+        finally:
+            f.close()
+
+    def test_write_explicit(self, compression, get_random_path):
+        """Write with explicit compression, decompress by hand, read raw."""
+        base = get_random_path
+        path1 = base + ".compressed"
+        path2 = base + ".raw"
+
+        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
+            df = tm.makeDataFrame()
+
+            # write to compressed file
+            df.to_pickle(p1, compression=compression)
+
+            # decompress
+            with tm.decompress_file(p1, compression=compression) as f:
+                with open(p2, "wb") as fh:
+                    fh.write(f.read())
+
+            # read decompressed file
+            df2 = pd.read_pickle(p2, compression=None)
+
+            tm.assert_frame_equal(df, df2)
+
+    @pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z'])
+    def test_write_explicit_bad(self, compression, get_random_path):
+        """Unknown compression names raise ValueError on write."""
+        with tm.assert_raises_regex(ValueError,
+                                    "Unrecognized compression type"):
+            with tm.ensure_clean(get_random_path) as path:
+                df = tm.makeDataFrame()
+                df.to_pickle(path, compression=compression)
+
+    @pytest.mark.parametrize('ext', [
+        '', '.gz', '.bz2', '.no_compress',
+        pytest.param('.xz', marks=td.skip_if_no_lzma)
+    ])
+    def test_write_infer(self, ext, get_random_path):
+        """Write without ``compression``; decompress according to ``ext``."""
+        base = get_random_path
+        path1 = base + ext
+        path2 = base + ".raw"
+        compression = self._compression_for_extension(ext)
+
+        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
+            df = tm.makeDataFrame()
+
+            # write to compressed file by inferred compression method
+            df.to_pickle(p1)
+
+            # decompress
+            with tm.decompress_file(p1, compression=compression) as f:
+                with open(p2, "wb") as fh:
+                    fh.write(f.read())
+
+            # read decompressed file
+            df2 = pd.read_pickle(p2, compression=None)
+
+            tm.assert_frame_equal(df, df2)
+
+    def test_read_explicit(self, compression, get_random_path):
+        """Compress a raw pickle by hand, read it with explicit compression."""
+        base = get_random_path
+        path1 = base + ".raw"
+        path2 = base + ".compressed"
+
+        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
+            df = tm.makeDataFrame()
+
+            # write to uncompressed file
+            df.to_pickle(p1, compression=None)
+
+            # compress
+            self.compress_file(p1, p2, compression=compression)
+
+            # read compressed file
+            df2 = pd.read_pickle(p2, compression=compression)
+
+            tm.assert_frame_equal(df, df2)
+
+    @pytest.mark.parametrize('ext', [
+        '', '.gz', '.bz2', '.zip', '.no_compress',
+        pytest.param('.xz', marks=td.skip_if_no_lzma)
+    ])
+    def test_read_infer(self, ext, get_random_path):
+        """Compress a raw pickle by hand, read it without ``compression``."""
+        base = get_random_path
+        path1 = base + ".raw"
+        path2 = base + ext
+        compression = self._compression_for_extension(ext)
+
+        with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
+            df = tm.makeDataFrame()
+
+            # write to uncompressed file
+            df.to_pickle(p1, compression=None)
+
+            # compress
+            self.compress_file(p1, p2, compression=compression)
+
+            # read compressed file by inferred compression method
+            df2 = pd.read_pickle(p2)
+
+            tm.assert_frame_equal(df, df2)
+
+
+# ---------------------
+# test pickle protocol
+# ---------------------
+
+class TestProtocol(object):
+    """Pickle round trips for the ``protocol`` argument of ``to_pickle``."""
+
+    @pytest.mark.parametrize('protocol', [-1, 0, 1, 2])
+    def test_read(self, protocol, get_random_path):
+        """A frame written with each listed protocol reads back equal."""
+        with tm.ensure_clean(get_random_path) as path:
+            written = tm.makeDataFrame()
+            written.to_pickle(path, protocol=protocol)
+            loaded = pd.read_pickle(path)
+            tm.assert_frame_equal(written, loaded)
+
+    @pytest.mark.parametrize('protocol', [3, 4])
+    @pytest.mark.skipif(PY3, reason="Testing invalid parameters for Python 2")
+    def test_read_bad_versions(self, protocol, get_random_path):
+        """Protocols 3 and 4 raise ValueError (test is skipped on PY3)."""
+        # For Python 2, HIGHEST_PROTOCOL should be 2.
+        template = ("pickle protocol {protocol} asked for; the highest available "
+                    "protocol is 2")
+        msg = template.format(protocol=protocol)
+        with tm.assert_raises_regex(ValueError, msg):
+            with tm.ensure_clean(get_random_path) as path:
+                frame = tm.makeDataFrame()
+                frame.to_pickle(path, protocol=protocol)
@@ -0,0 +1,8 @@
+from pandas.io.common import is_s3_url
+
+
+class TestS3URL(object):
+    """Checks for ``is_s3_url``."""
+
+    def test_is_s3_url(self):
+        """An ``s3://`` URL is accepted; an ``s4://`` URL is rejected."""
+        accepted = "s3://pandas/somethingelse.com"
+        rejected = "s4://pandas/somethingelse.com"
+        assert is_s3_url(accepted)
+        assert not is_s3_url(rejected)