started work on backend

2019-01-21 17:36:00 +01:00
parent a1a8bca34b
commit 9f9a7e4974
4032 changed files with 745079 additions and 0 deletions
@@ -0,0 +1,987 @@
+""":mod:`pandas.io.html` is a module containing functionality for dealing with
+HTML IO.
+
+"""
+
+import os
+import re
+import numbers
+import collections
+
+from distutils.version import LooseVersion
+
+import numpy as np
+
+from pandas.core.dtypes.common import is_list_like
+from pandas.errors import EmptyDataError
+from pandas.io.common import _is_url, urlopen, _validate_header_arg
+from pandas.io.parsers import TextParser
+from pandas.compat import (lrange, lmap, u, string_types, iteritems,
+                           raise_with_traceback, binary_type)
+from pandas import Series
+import pandas.core.common as com
+from pandas.io.formats.printing import pprint_thing
+
+# Lazy-import state.  ``_IMPORTS`` records whether ``_importers`` has already
+# probed for the optional parser libraries; each ``_HAS_*`` flag is switched
+# to True by ``_importers`` when the corresponding import succeeds.
+_IMPORTS = False
+_HAS_BS4 = False
+_HAS_LXML = False
+_HAS_HTML5LIB = False
+
+
+def _importers():
+    """Probe once for the optional HTML parsing libraries.
+
+    Tries to import ``bs4``, ``lxml`` and ``html5lib`` and records each
+    outcome in the module-level ``_HAS_BS4``, ``_HAS_LXML`` and
+    ``_HAS_HTML5LIB`` flags.  ``_IMPORTS`` is set at the end so that later
+    calls return immediately without importing again.
+    """
+    global _IMPORTS, _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
+
+    if _IMPORTS:
+        # the probe already ran on an earlier call
+        return
+
+    try:
+        import bs4  # noqa
+    except ImportError:
+        pass
+    else:
+        _HAS_BS4 = True
+
+    try:
+        import lxml  # noqa
+    except ImportError:
+        pass
+    else:
+        _HAS_LXML = True
+
+    try:
+        import html5lib  # noqa
+    except ImportError:
+        pass
+    else:
+        _HAS_HTML5LIB = True
+
+    _IMPORTS = True
+
+
+#############
+# READ HTML #
+#############
+# Matches a run of line breaks, or two or more consecutive whitespace
+# characters; default pattern for ``_remove_whitespace``.
+_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}')
+
+
+# Text and byte string types; ``_read`` uses this to recognise string input.
+char_types = string_types + (binary_type,)
+
+
+def _remove_whitespace(s, regex=_RE_WHITESPACE):
+    """Collapse extra whitespace inside a string down to single spaces.
+
+    Leading and trailing whitespace is stripped first; every match of
+    `regex` in what remains is then replaced by one space.
+
+    Parameters
+    ----------
+    s : str or unicode
+        The string to clean up.
+
+    regex : regex
+        Compiled pattern describing what counts as extra whitespace.
+
+    Returns
+    -------
+    subd : str or unicode
+        The stripped string with each match of `regex` replaced by ' '.
+    """
+    trimmed = s.strip()
+    return regex.sub(' ', trimmed)
+
+
+def _get_skiprows(skiprows):
+    """Normalize the ``skiprows`` argument.
+
+    Parameters
+    ----------
+    skiprows : int, slice, container or None
+        Rows to skip, in any of the accepted spellings.
+
+    Raises
+    ------
+    TypeError
+        * If `skiprows` is not None, a slice, an integer, or list-like
+
+    Returns
+    -------
+    it : int or iterable
+        ``0`` for None, a list of row numbers for a slice, and the input
+        itself for an integer or a list-like.
+    """
+    if skiprows is None:
+        return 0
+
+    if isinstance(skiprows, slice):
+        # missing start/step fall back to 0 and 1 respectively
+        first = skiprows.start or 0
+        stride = skiprows.step or 1
+        return lrange(first, skiprows.stop, stride)
+
+    if isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
+        return skiprows
+
+    raise TypeError('%r is not a valid type for skipping rows' %
+                    type(skiprows).__name__)
+
+
+def _read(obj):
+    """Fetch raw content from a URL, a file-like object, a path or a string.
+
+    Parameters
+    ----------
+    obj : str, unicode, or file-like
+
+    Returns
+    -------
+    raw_text : str
+        The downloaded or read content.  A string that names an existing
+        file yields that file's bytes; any other string is returned
+        unchanged.
+
+    Raises
+    ------
+    TypeError
+        * If `obj` is not a URL, has no ``read`` attribute and is not a
+          string.
+    """
+    if _is_url(obj):
+        with urlopen(obj) as response:
+            return response.read()
+
+    if hasattr(obj, 'read'):
+        return obj.read()
+
+    if not isinstance(obj, char_types):
+        raise TypeError("Cannot read object of type %r" % type(obj).__name__)
+
+    try:
+        if os.path.isfile(obj):
+            with open(obj, 'rb') as handle:
+                return handle.read()
+    except (TypeError, ValueError):
+        # not usable as a path; fall through and treat it as raw text
+        pass
+    return obj
+
+
+class _HtmlFrameParser(object):
+    """Base class for parsers that parse HTML into DataFrames.
+
+    Parameters
+    ----------
+    io : str or file-like
+        This can be either a string of raw HTML, a valid URL using the HTTP,
+        FTP, or FILE protocols or a file-like object.
+
+    match : str or regex
+        The text to match in the document.
+
+    attrs : dict
+        List of HTML <table> element attributes to match.
+
+    encoding : str
+        Encoding to be used by parser
+
+    displayed_only : bool
+        Whether or not items with "display:none" should be ignored
+
+        .. versionadded:: 0.23.0
+
+    Attributes
+    ----------
+    io : str or file-like
+        raw HTML, URL, or file-like object
+
+    match : regex
+        The text to match in the raw HTML
+
+    attrs : dict-like
+        A dictionary of valid table attributes to use to search for table
+        elements.
+
+    encoding : str
+        Encoding to be used by parser
+
+    displayed_only : bool
+        Whether or not items with "display:none" should be ignored
+
+        .. versionadded:: 0.23.0
+
+    Notes
+    -----
+    To subclass this class effectively you must override the following methods:
+        * :func:`_build_doc`
+        * :func:`_text_getter`
+        * :func:`_parse_td`
+        * :func:`_parse_tables`
+        * :func:`_parse_tr`
+        * :func:`_parse_thead`
+        * :func:`_parse_tbody`
+        * :func:`_parse_tfoot`
+    See each method's respective documentation for details on their
+    functionality.
+    """
+
+    def __init__(self, io, match, attrs, encoding, displayed_only):
+        # The arguments are only stored here; nothing is read or parsed
+        # until ``parse_tables`` is called.
+        self.io = io
+        self.match = match
+        self.attrs = attrs
+        self.encoding = encoding
+        self.displayed_only = displayed_only
+
+    def parse_tables(self):
+        """Build the document and return the parsed tables.
+
+        Returns
+        -------
+        tables : generator of tuple
+            Yields the ``(header, body, footer)`` triple produced by
+            ``_build_table`` for each table returned by ``_parse_tables``.
+        """
+        # ``_build_doc`` and ``_parse_tables`` run right here, so their
+        # errors surface from this call; ``_build_table`` runs lazily as the
+        # returned generator is consumed.
+        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
+        return (self._build_table(table) for table in tables)
+
+    def _parse_raw_data(self, rows):
+        """Parse the raw data into a list of lists.
+
+        The cells of each row are found with ``self._parse_td`` and their
+        text is extracted with ``self._text_getter``; both must be defined
+        by subclasses.
+
+        Parameters
+        ----------
+        rows : iterable of node-like
+            A list of row elements.
+
+        Returns
+        -------
+        data : list of list of strings
+            One inner list per row, holding each cell's text after
+            ``_remove_whitespace`` has been applied.
+        """
+        data = [[_remove_whitespace(self._text_getter(col)) for col in
+                 self._parse_td(row)] for row in rows]
+        return data
+
+    def _text_getter(self, obj):
+        """Return the text of an individual DOM node.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        Returns
+        -------
+        text : str or unicode
+            The text from an individual DOM node.
+        """
+        raise com.AbstractMethodError(self)
+
+    def _parse_td(self, obj):
+        """Return the td elements from a row element.
+
+        Parameters
+        ----------
+        obj : node-like
+
+        Returns
+        -------
+        columns : list of node-like
+            These are the elements of each row, i.e., the columns.
+        """
+        raise com.AbstractMethodError(self)
+
+    def _parse_tables(self, doc, match, attrs):
+        """Return all tables from the parsed DOM.
+
+        Parameters
+        ----------
+        doc : tree-like
+            The DOM from which to parse the table element.
+
+        match : str or regular expression
+            The text to search for in the DOM tree.
+
+        attrs : dict
+            A dictionary of table attributes that can be used to disambiguate
+            multiple tables on a page.
+
+        Raises
+        ------
+        ValueError
+            * If `match` does not match any text in the document.
+
+        Returns
+        -------
+        tables : list of node-like
+            A list of <table> elements to be parsed into raw data.
+        """
+        raise com.AbstractMethodError(self)
+
+    def _parse_tr(self, table):
+        """Return the list of row elements from the parsed table element.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element that contains row elements.
+
+        Returns
+        -------
+        rows : list of node-like
+            A list row elements of a table, usually <tr> or <th> elements.
+        """
+        raise com.AbstractMethodError(self)
+
+    def _parse_thead(self, table):
+        """Return the header of a table.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element that contains row elements.
+
+        Returns
+        -------
+        thead : node-like
+            A <thead>...</thead> element.
+        """
+        raise com.AbstractMethodError(self)
+
+    def _parse_tbody(self, table):
+        """Return the list of tbody elements from the parsed table element.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element that contains row elements.
+
+        Returns
+        -------
+        tbodys : list of node-like
+            A list of <tbody>...</tbody> elements
+        """
+        raise com.AbstractMethodError(self)
+
+    def _parse_tfoot(self, table):
+        """Return the footer of the table if any.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element that contains row elements.
+
+        Returns
+        -------
+        tfoot : node-like
+            A <tfoot>...</tfoot> element.
+        """
+        raise com.AbstractMethodError(self)
+
+    def _build_doc(self):
+        """Return a tree-like object that can be used to iterate over the DOM.
+
+        Returns
+        -------
+        obj : tree-like
+        """
+        raise com.AbstractMethodError(self)
+
+    def _build_table(self, table):
+        """Split one table element into raw header, body and footer data.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element.
+
+        Returns
+        -------
+        header, body, footer : tuple
+            The results of ``_parse_raw_thead``, ``_parse_raw_tbody`` and
+            ``_parse_raw_tfoot`` for `table`, in that order.
+        """
+        header = self._parse_raw_thead(table)
+        body = self._parse_raw_tbody(table)
+        footer = self._parse_raw_tfoot(table)
+        return header, body, footer
+
+    def _parse_raw_thead(self, table):
+        """Return the text of the header rows of a table.
+
+        Only the first element returned by ``_parse_thead`` is inspected.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element.
+
+        Returns
+        -------
+        res : list of list
+            One list of cell texts per header row; rows whose cells are all
+            empty strings are dropped.  Empty if the table has no header.
+        """
+        thead = self._parse_thead(table)
+        res = []
+        if thead:
+            trs = self._parse_tr(thead[0])
+            for tr in trs:
+                cols = lmap(self._text_getter, self._parse_td(tr))
+                # skip rows that carry no text at all
+                if any(col != '' for col in cols):
+                    res.append(cols)
+        return res
+
+    def _parse_raw_tfoot(self, table):
+        """Return the text of the footer cells of a table.
+
+        Only the first element returned by ``_parse_tfoot`` is inspected.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element.
+
+        Returns
+        -------
+        res : list or numpy.ndarray
+            The cell texts as a list, except that a footer with exactly one
+            cell comes back as a one-element 1-d array.  Empty list if the
+            table has no footer.
+        """
+        tfoot = self._parse_tfoot(table)
+        res = []
+        if tfoot:
+            res = lmap(self._text_getter, self._parse_td(tfoot[0]))
+        return np.atleast_1d(
+            np.array(res).squeeze()) if res and len(res) == 1 else res
+
+    def _parse_raw_tbody(self, table):
+        """Return the text of the body rows of a table.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element.
+
+        Returns
+        -------
+        data : list of list of strings
+            The result of ``_parse_raw_data`` on the collected rows.
+        """
+        tbodies = self._parse_tbody(table)
+
+        raw_data = []
+
+        if tbodies:
+            for tbody in tbodies:
+                raw_data.extend(self._parse_tr(tbody))
+        else:
+            # no <tbody> found: take the rows straight from the table
+            raw_data.extend(self._parse_tr(table))
+
+        return self._parse_raw_data(raw_data)
+
+    def _handle_hidden_tables(self, tbl_list, attr_name):
+        """Returns list of tables, potentially removing hidden elements
+
+        Parameters
+        ----------
+        tbl_list : list of Tag or list of Element
+            Type of list elements will vary depending upon parser used
+        attr_name : str
+            Name of the accessor for retrieving HTML attributes
+
+        Returns
+        -------
+        list of Tag or list of Element
+            Return type matches `tbl_list`
+        """
+        if not self.displayed_only:
+            return tbl_list
+
+        # spaces are stripped from the style value before the substring
+        # test, so "display: none" is caught as well as "display:none"
+        return [x for x in tbl_list if "display:none" not in
+                getattr(x, attr_name).get('style', '').replace(" ", "")]
+
+
+class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
+    """Frame parser backed by BeautifulSoup 4 using the html5lib builder.
+
+    See Also
+    --------
+    pandas.io.html._HtmlFrameParser
+    pandas.io.html._LxmlFrameParser
+
+    Notes
+    -----
+    The overridden methods are documented on the base class
+    :class:`pandas.io.html._HtmlFrameParser`.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args,
+                                                                **kwargs)
+        # bs4 is imported only once a parser instance is actually created
+        from bs4 import SoupStrainer
+        self._strainer = SoupStrainer('table')
+
+    def _setup_build_doc(self):
+        # read the input up front and refuse to continue on empty content
+        content = _read(self.io)
+        if content:
+            return content
+        raise ValueError('No text parsed from document: {doc}'
+                         .format(doc=self.io))
+
+    def _build_doc(self):
+        from bs4 import BeautifulSoup
+        markup = self._setup_build_doc()
+        return BeautifulSoup(markup, features='html5lib',
+                             from_encoding=self.encoding)
+
+    def _text_getter(self, obj):
+        return obj.text
+
+    def _parse_td(self, row):
+        return row.find_all(('td', 'th'))
+
+    def _parse_tr(self, element):
+        return element.find_all('tr')
+
+    def _parse_th(self, element):
+        return element.find_all('th')
+
+    def _parse_thead(self, table):
+        return table.find_all('thead')
+
+    def _parse_tbody(self, table):
+        return table.find_all('tbody')
+
+    def _parse_tfoot(self, table):
+        return table.find_all('tfoot')
+
+    def _parse_tables(self, doc, match, attrs):
+        candidates = doc.find_all(self._strainer.name, attrs=attrs)
+        if not candidates:
+            raise ValueError('No tables found')
+
+        candidates = self._handle_hidden_tables(candidates, "attrs")
+        hidden_style = re.compile(r"display:\s*none")
+
+        matched = []
+        seen = set()
+        for table in candidates:
+            if self.displayed_only:
+                # drop hidden descendants before searching the table text
+                for hidden in table.find_all(style=hidden_style):
+                    hidden.decompose()
+
+            if table in seen:
+                continue
+            seen.add(table)
+
+            if table.find(text=match) is not None:
+                matched.append(table)
+
+        if not matched:
+            raise ValueError("No tables found matching pattern {patt!r}"
+                             .format(patt=match.pattern))
+        return matched
+
+
+def _build_xpath_expr(attrs):
+    """Turn a dict of HTML attributes into an XPath predicate.
+
+    This emulates, for the lxml parser, bs4's ability to search by
+    attribute keyword arguments.
+
+    Parameters
+    ----------
+    attrs : dict
+        A dict of HTML attributes. These are NOT checked for validity.
+        A ``'class_'`` key is renamed to ``'class'`` in this dict itself,
+        so the caller's object is modified.
+
+    Returns
+    -------
+    expr : unicode
+        An XPath predicate of the form ``[@key='val' and ...]``.
+    """
+    # ``class`` is a Python keyword, so it may be spelled ``class_``
+    if 'class_' in attrs:
+        attrs['class'] = attrs.pop('class_')
+
+    predicates = []
+    for key, val in iteritems(attrs):
+        predicates.append(u("@{key}={val!r}").format(key=key, val=val))
+
+    joined = ' and '.join(predicates)
+    return u('[{expr}]').format(expr=joined)
+
+
+# Namespace map handed to ``doc.xpath`` in ``_LxmlFrameParser._parse_tables``
+# so that the ``re:test`` function used in its query can be resolved.
+_re_namespace = {'re': 'http://exslt.org/regular-expressions'}
+# URL schemes named in the ``_LxmlFrameParser`` docstring warning; not
+# referenced elsewhere in the visible part of this module.
+_valid_schemes = 'http', 'file', 'ftp'
+
+
+class _LxmlFrameParser(_HtmlFrameParser):
+    """HTML to DataFrame parser that uses lxml under the hood.
+
+    Warning
+    -------
+    This parser can only handle HTTP, FTP, and FILE urls.
+
+    See Also
+    --------
+    _HtmlFrameParser
+    _BeautifulSoupHtml5LibFrameParser
+
+    Notes
+    -----
+    Documentation strings for this class are in the base class
+    :class:`_HtmlFrameParser`.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(_LxmlFrameParser, self).__init__(*args, **kwargs)
+
+    def _text_getter(self, obj):
+        return obj.text_content()
+
+    def _parse_td(self, row):
+        return row.xpath('.//td|.//th')
+
+    def _parse_tr(self, table):
+        return table.xpath('.//tr')
+
+    def _parse_tables(self, doc, match, kwargs):
+        """Return the tables whose text matches `match`.
+
+        Parameters
+        ----------
+        doc : tree-like
+            The parsed document, as returned by ``_build_doc``.
+
+        match : compiled regular expression
+            Only its ``pattern`` attribute is used.
+
+        kwargs : dict
+            Table attributes; when non-empty they are appended to the query
+            as an XPath predicate built by ``_build_xpath_expr``.
+
+        Raises
+        ------
+        ValueError
+            * If no table is left after matching and hidden-table
+              filtering.
+
+        Returns
+        -------
+        tables : list of node-like
+        """
+        pattern = match.pattern
+
+        # 1. check all descendants for the given pattern and only search tables
+        # 2. go up the tree until we find a table
+        query = '//table//*[re:test(text(), {patt!r})]/ancestor::table'
+        xpath_expr = u(query).format(patt=pattern)
+
+        # if any table attributes were given build an xpath expression to
+        # search for them
+        if kwargs:
+            xpath_expr += _build_xpath_expr(kwargs)
+
+        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
+
+        tables = self._handle_hidden_tables(tables, "attrib")
+        if self.displayed_only:
+            for table in tables:
+                # lxml utilizes XPATH 1.0 which does not have regex
+                # support. As a result, we find all elements with a style
+                # attribute and iterate them to check for display:none
+                for elem in table.xpath('.//*[@style]'):
+                    if "display:none" in elem.attrib.get(
+                            "style", "").replace(" ", ""):
+                        elem.getparent().remove(elem)
+
+        if not tables:
+            raise ValueError("No tables found matching regex {patt!r}"
+                             .format(patt=pattern))
+        return tables
+
+    def _build_doc(self):
+        """
+        Raises
+        ------
+        ValueError
+            * If a URL that lxml cannot parse is passed.
+
+        Exception
+            * Any other ``Exception`` thrown. For example, trying to parse a
+              URL that is syntactically correct on a machine with no internet
+              connection will fail.
+
+        See Also
+        --------
+        pandas.io.html._HtmlFrameParser._build_doc
+        """
+        from lxml.html import parse, fromstring, HTMLParser
+        from lxml.etree import XMLSyntaxError
+        parser = HTMLParser(recover=True, encoding=self.encoding)
+
+        try:
+            if _is_url(self.io):
+                with urlopen(self.io) as f:
+                    r = parse(f, parser=parser)
+            else:
+                # try to parse the input in the simplest way
+                r = parse(self.io, parser=parser)
+            try:
+                r = r.getroot()
+            except AttributeError:
+                pass
+        except (UnicodeDecodeError, IOError) as e:
+            # if the input is a blob of html goop
+            if not _is_url(self.io):
+                r = fromstring(self.io, parser=parser)
+
+                try:
+                    r = r.getroot()
+                except AttributeError:
+                    pass
+            else:
+                raise e
+        else:
+            # a parse that yields no usable root element is reported as a
+            # syntax error
+            if not hasattr(r, 'text_content'):
+                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
+        return r
+
+    def _parse_tbody(self, table):
+        return table.xpath('.//tbody')
+
+    def _parse_thead(self, table):
+        return table.xpath('.//thead')
+
+    def _parse_tfoot(self, table):
+        return table.xpath('.//tfoot')
+
+    def _parse_raw_thead(self, table):
+        """Return the whitespace-normalized text of the header rows.
+
+        Only the first ``<thead>`` of `table` is inspected.  ``<th>``
+        elements that are direct children of it are treated as a single
+        header row; otherwise each ``<tr>`` inside it is one row.  Rows whose
+        cells are all empty are dropped.
+
+        Returns
+        -------
+        res : list of list of strings
+        """
+        expr = './/thead'
+        thead = table.xpath(expr)
+        res = []
+        if thead:
+            # Grab any directly descending table headers first
+            ths = thead[0].xpath('./th')
+            if ths:
+                cols = [_remove_whitespace(x.text_content()) for x in ths]
+                if any(col != '' for col in cols):
+                    res.append(cols)
+            else:
+                trs = self._parse_tr(thead[0])
+
+                for tr in trs:
+                    cols = [_remove_whitespace(x.text_content()) for x in
+                            self._parse_td(tr)]
+
+                    if any(col != '' for col in cols):
+                        res.append(cols)
+        return res
+
+    def _parse_raw_tfoot(self, table):
+        """Return the whitespace-normalized text of this table's footer cells.
+
+        Returns
+        -------
+        res : list of strings
+            Text of every ``<th>`` and ``<td>`` found under a ``<tfoot>`` of
+            `table`; empty if there is none.
+        """
+        # Both alternatives must start with ``.`` to stay inside `table`.
+        # An unanchored ``//tfoot//td`` is evaluated from the document root
+        # and would return the footer cells of every table in the document.
+        expr = './/tfoot//th|.//tfoot//td'
+        return [_remove_whitespace(x.text_content()) for x in
+                table.xpath(expr)]
+
+
+def _expand_elements(body):
+    """Pad the shorter rows of `body` with empty strings, in place.
+
+    Every row shorter than the longest one gets ``''`` entries appended
+    until it has the same length.
+
+    Parameters
+    ----------
+    body : list of list
+        Rows of cell values; modified in place.
+    """
+    row_lengths = Series(lmap(len, body))
+    widest = row_lengths.max()
+    short_rows = row_lengths[row_lengths != widest]
+
+    for row_idx, row_len in iteritems(short_rows):
+        padding = [''] * (widest - row_len)
+        body[row_idx] += padding
+
+
+def _data_to_frame(**kwargs):
+    """Build a DataFrame from one parsed ``(head, body, foot)`` triple.
+
+    Parameters
+    ----------
+    **kwargs
+        Must contain ``data`` (the triple), ``header`` and ``skiprows``.
+        ``data`` and ``header`` are removed, ``skiprows`` is normalized with
+        ``_get_skiprows``, and what remains is forwarded to ``TextParser``.
+
+    Returns
+    -------
+    df : DataFrame
+        The result of ``TextParser(...).read()``.
+    """
+    head, body, foot = kwargs.pop('data')
+    header = kwargs.pop('header')
+    kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
+
+    if head:
+        # header rows go in front of the body rows
+        body = head + body
+        if header is None:  # special case when a table has <th> elements
+            n_head = len(head)
+            header = 0 if n_head == 1 else lrange(n_head)
+
+    if foot:
+        body += [foot]
+
+    # fill out elements of body that are "ragged"
+    _expand_elements(body)
+    return TextParser(body, header=header, **kwargs).read()
+
+
+# Maps each accepted ``flavor`` value to its parser class.  ``None`` maps to
+# the lxml parser, and 'bs4' and 'html5lib' both map to the BeautifulSoup
+# parser.
+_valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser,
+                  'html5lib': _BeautifulSoupHtml5LibFrameParser,
+                  'bs4': _BeautifulSoupHtml5LibFrameParser}
+
+
+def _parser_dispatch(flavor):
+    """Look up the parser class for `flavor` and check it is usable.
+
+    Parameters
+    ----------
+    flavor : str
+        The type of parser to use. This must be a valid backend.
+
+    Returns
+    -------
+    cls : _HtmlFrameParser subclass
+        The parser class registered for `flavor`.
+
+    Raises
+    ------
+    ValueError
+        * If `flavor` is not a valid backend.
+        * If the installed BeautifulSoup is version 4.2.0 or older.
+    ImportError
+        * If the library needed by `flavor` was not found
+    """
+    valid_parsers = list(_valid_parsers.keys())
+    if flavor not in valid_parsers:
+        raise ValueError('{invalid!r} is not a valid flavor, valid flavors '
+                         'are {valid}'
+                         .format(invalid=flavor, valid=valid_parsers))
+
+    if flavor not in ('bs4', 'html5lib'):
+        # every remaining flavor is served by lxml
+        if not _HAS_LXML:
+            raise ImportError("lxml not found, please install it")
+        return _valid_parsers[flavor]
+
+    if not _HAS_HTML5LIB:
+        raise ImportError("html5lib not found, please install it")
+    if not _HAS_BS4:
+        raise ImportError(
+            "BeautifulSoup4 (bs4) not found, please install it")
+
+    import bs4
+    installed = LooseVersion(bs4.__version__)
+    if installed <= LooseVersion('4.2.0'):
+        raise ValueError("A minimum version of BeautifulSoup 4.2.1 "
+                         "is required")
+    return _valid_parsers[flavor]
+
+
+def _print_as_set(s):
+    """Format the elements of `s` as a brace-delimited, comma-separated set.
+
+    Parameters
+    ----------
+    s : iterable
+        Elements to display; each is rendered with ``pprint_thing``.
+
+    Returns
+    -------
+    str
+        For example ``'{lxml, bs4}'``; ``'{}'`` for an empty iterable.
+    """
+    # ``{{`` and ``}}`` are escaped literal braces and the inner ``{arg}`` is
+    # the substitution field.  The previous ``'{{arg}}'`` escaped both braces
+    # and so always produced the literal text ``{arg}``.
+    return '{{{arg}}}'.format(arg=', '.join(pprint_thing(el) for el in s))
+
+
+def _validate_flavor(flavor):
+    """Normalize the `flavor` argument into a tuple of flavor names.
+
+    Parameters
+    ----------
+    flavor : str, iterable of str, or None
+        ``None`` selects ``('lxml', 'bs4')``; a single string is wrapped in
+        a one-element tuple.
+
+    Returns
+    -------
+    flavor : tuple of str
+
+    Raises
+    ------
+    TypeError
+        * If `flavor` is an iterable containing a non-string element.
+    ValueError
+        * If `flavor` is neither None, a string nor an iterable.
+        * If none of the given flavors is a key of ``_valid_parsers``.
+    """
+    # ``collections.Iterable`` was removed in Python 3.10, so the ABC is
+    # taken from ``collections.abc``, falling back for Python 2.
+    try:
+        from collections.abc import Iterable
+    except ImportError:
+        from collections import Iterable
+
+    if flavor is None:
+        flavor = 'lxml', 'bs4'
+    elif isinstance(flavor, string_types):
+        flavor = flavor,
+    elif isinstance(flavor, Iterable):
+        if not all(isinstance(flav, string_types) for flav in flavor):
+            raise TypeError('Object of type {typ!r} is not an iterable of '
+                            'strings'
+                            .format(typ=type(flavor).__name__))
+    else:
+        # strings were handled above, so `flavor` is never one here
+        raise ValueError('{flavor} is not a valid flavor'
+                         .format(flavor=flavor))
+
+    flavor = tuple(flavor)
+    valid_flavors = set(_valid_parsers)
+    flavor_set = set(flavor)
+
+    # at least one requested flavor has to be a known one
+    if not flavor_set & valid_flavors:
+        raise ValueError('{invalid} is not a valid set of flavors, valid '
+                         'flavors are {valid}'
+                         .format(invalid=_print_as_set(flavor_set),
+                                 valid=_print_as_set(valid_flavors)))
+    return flavor
+
+
+def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
+    """Parse `io` with the first flavor that succeeds and build the frames.
+
+    Parameters
+    ----------
+    flavor : str, iterable of str, or None
+        Flavor(s) to try in order, after ``_validate_flavor``.
+    io : str or file-like
+        Passed through to the parser class.
+    match : str or compiled regular expression
+        Passed through ``re.compile`` before being handed to the parser.
+    attrs : dict or None
+        Table attributes, passed through to the parser class.
+    encoding : str or None
+        Passed through to the parser class.
+    displayed_only : bool
+        Passed through to the parser class.
+    **kwargs
+        Forwarded to ``_data_to_frame`` for every parsed table.
+
+    Returns
+    -------
+    ret : list
+        One ``_data_to_frame`` result per table; tables that raise
+        ``EmptyDataError`` are skipped.
+
+    Raises
+    ------
+    ValueError
+        * If a flavor fails and `io` reports that it is not seekable.
+    Exception
+        * The error raised by the last flavor tried, when all of them fail.
+    """
+    flavor = _validate_flavor(flavor)
+    compiled_match = re.compile(match)  # you can pass a compiled regex here
+
+    # hack around python 3 deleting the exception variable
+    retained = None
+    for flav in flavor:
+        parser = _parser_dispatch(flav)
+        p = parser(io, compiled_match, attrs, encoding, displayed_only)
+
+        try:
+            tables = p.parse_tables()
+        except Exception as caught:
+            # if `io` is an io-like object, check if it's seekable
+            # and try to rewind it before trying the next parser
+            if hasattr(io, 'seekable') and io.seekable():
+                io.seek(0)
+            elif hasattr(io, 'seekable') and not io.seekable():
+                # if we couldn't rewind it, let the user know
+                raise ValueError('The flavor {} failed to parse your input. '
+                                 'Since you passed a non-rewindable file '
+                                 'object, we can\'t rewind it to try '
+                                 'another parser. Try read_html() with a '
+                                 'different flavor.'.format(flav))
+
+            retained = caught
+        else:
+            # this flavor worked; stop trying others
+            break
+    else:
+        # the loop ended without a ``break``: every flavor failed, so
+        # re-raise the last error that was caught
+        raise_with_traceback(retained)
+
+    ret = []
+    for table in tables:
+        try:
+            ret.append(_data_to_frame(data=table, **kwargs))
+        except EmptyDataError:  # empty table
+            continue
+    return ret
+
+
+def read_html(io, match='.+', flavor=None, header=None, index_col=None,
+              skiprows=None, attrs=None, parse_dates=False,
+              tupleize_cols=None, thousands=',', encoding=None,
+              decimal='.', converters=None, na_values=None,
+              keep_default_na=True, displayed_only=True):
+    r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
+
+    Parameters
+    ----------
+    io : str or file-like
+        A URL, a file-like object, or a raw string containing HTML. Note that
+        lxml only accepts the http, ftp and file url protocols. If you have a
+        URL that starts with ``'https'`` you might try removing the ``'s'``.
+
+    match : str or compiled regular expression, optional
+        The set of tables containing text matching this regex or string will be
+        returned. Unless the HTML is extremely simple you will probably need to
+        pass a non-empty string here. Defaults to '.+' (match any non-empty
+        string). The default value will return all tables contained on a page.
+        This value is converted to a regular expression so that there is
+        consistent behavior between Beautiful Soup and lxml.
+
+    flavor : str or None, container of strings
+        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
+        each other, they are both there for backwards compatibility. The
+        default of ``None`` tries to use ``lxml`` to parse and if that fails it
+        falls back on ``bs4`` + ``html5lib``.
+
+    header : int or list-like or None, optional
+        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
+        make the columns headers.
+
+    index_col : int or list-like or None, optional
+        The column (or list of columns) to use to create the index.
+
+    skiprows : int or list-like or slice or None, optional
+        0-based. Number of rows to skip after parsing the column integer. If a
+        sequence of integers or a slice is given, will skip the rows indexed by
+        that sequence.  Note that a single element sequence means 'skip the nth
+        row' whereas an integer means 'skip n rows'.
+
+    attrs : dict or None, optional
+        This is a dictionary of attributes that you can pass to use to identify
+        the table in the HTML. These are not checked for validity before being
+        passed to lxml or Beautiful Soup. However, these attributes must be
+        valid HTML table attributes to work correctly. For example, ::
+
+            attrs = {'id': 'table'}
+
+        is a valid attribute dictionary because the 'id' HTML tag attribute is
+        a valid HTML attribute for *any* HTML tag as per `this document
+        <http://www.w3.org/TR/html-markup/global-attributes.html>`__. ::
+
+            attrs = {'asdf': 'table'}
+
+        is *not* a valid attribute dictionary because 'asdf' is not a valid
+        HTML attribute even if it is a valid XML attribute.  Valid HTML 4.01
+        table attributes can be found `here
+        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
+        working draft of the HTML 5 spec can be found `here
+        <http://www.w3.org/TR/html-markup/table.html>`__. It contains the
+        latest information on table attributes for the modern web.
+
+    parse_dates : bool, optional
+        See :func:`~pandas.read_csv` for more details.
+
+    tupleize_cols : bool, optional
+        If ``False`` try to parse multiple header rows into a
+        :class:`~pandas.MultiIndex`, otherwise return raw tuples. Defaults to
+        ``False``.
+
+        .. deprecated:: 0.21.0
+           This argument will be removed and will always convert to MultiIndex
+
+    thousands : str, optional
+        Separator to use to parse thousands. Defaults to ``','``.
+
+    encoding : str or None, optional
+        The encoding used to decode the web page. Defaults to ``None``.
+        ``None`` preserves the previous encoding behavior, which depends on
+        the underlying parser library (e.g., the parser library will try to
+        use the encoding provided by the document).
+
+    decimal : str, default '.'
+        Character to recognize as decimal point (e.g. use ',' for European
+        data).
+
+        .. versionadded:: 0.19.0
+
+    converters : dict, default None
+        Dict of functions for converting values in certain columns. Keys can
+        either be integers or column labels, values are functions that take one
+        input argument, the cell (not column) content, and return the
+        transformed content.
+
+        .. versionadded:: 0.19.0
+
+    na_values : iterable, default None
+        Custom NA values
+
+        .. versionadded:: 0.19.0
+
+    keep_default_na : bool, default True
+        If na_values are specified and keep_default_na is False the default NaN
+        values are overridden, otherwise they're appended to
+
+        .. versionadded:: 0.19.0
+
+    displayed_only : bool, default True
+        If ``True``, elements styled with "display: none" are ignored; if
+        ``False`` they are parsed like any other element.
+
+        .. versionadded:: 0.23.0
+
+    Returns
+    -------
+    dfs : list of DataFrames
+
+    Raises
+    ------
+    ValueError
+        * If `skiprows` is a negative integer.
+
+    Notes
+    -----
+    Before using this function you should read the :ref:`gotchas about the
+    HTML parsing libraries <io.html.gotchas>`.
+
+    Expect to do some cleanup after you call this function. For example, you
+    might need to manually assign column names if the column names are
+    converted to NaN when you pass the `header=0` argument. We try to assume as
+    little as possible about the structure of the table and push the
+    idiosyncrasies of the HTML contained in the table to the user.
+
+    This function searches for ``<table>`` elements and only for ``<tr>``
+    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
+    element in the table. ``<td>`` stands for "table data".
+
+    Similar to :func:`~pandas.read_csv` the `header` argument is applied
+    **after** `skiprows` is applied.
+
+    This function will *always* return a list of :class:`DataFrame` *or*
+    it will fail, e.g., it will *not* return an empty list.
+
+    Examples
+    --------
+    See the :ref:`read_html documentation in the IO section of the docs
+    <io.read_html>` for some examples of reading in HTML tables.
+
+    See Also
+    --------
+    pandas.read_csv
+    """
+    # make sure the optional parser libraries have been probed for
+    _importers()
+
+    # Type check here. We don't want to parse only to fail because of an
+    # invalid value of an integer skiprows.
+    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
+        raise ValueError('cannot skip rows starting from the end of the '
+                         'data (you passed a negative value)')
+    _validate_header_arg(header)
+    return _parse(flavor=flavor, io=io, match=match, header=header,
+                  index_col=index_col, skiprows=skiprows,
+                  parse_dates=parse_dates, tupleize_cols=tupleize_cols,
+                  thousands=thousands, attrs=attrs, encoding=encoding,
+                  decimal=decimal, converters=converters, na_values=na_values,
+                  keep_default_na=keep_default_na,
+                  displayed_only=displayed_only)