started work on backend
This commit is contained in:
@@ -0,0 +1,987 @@
|
||||
""":mod:`pandas.io.html` is a module containing functionality for dealing with
|
||||
HTML IO.
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import numbers
|
||||
import collections
|
||||
|
||||
from distutils.version import LooseVersion
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
from pandas.errors import EmptyDataError
|
||||
from pandas.io.common import _is_url, urlopen, _validate_header_arg
|
||||
from pandas.io.parsers import TextParser
|
||||
from pandas.compat import (lrange, lmap, u, string_types, iteritems,
|
||||
raise_with_traceback, binary_type)
|
||||
from pandas import Series
|
||||
import pandas.core.common as com
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
# Lazy-import bookkeeping. `_IMPORTS` records whether `_importers()` has
# already probed for the optional parser libraries; each `_HAS_*` flag is
# switched to True by `_importers()` when the matching library imports.
_IMPORTS = False
_HAS_BS4 = False
_HAS_LXML = False
_HAS_HTML5LIB = False
|
||||
|
||||
|
||||
def _importers():
    """Probe for the optional HTML parsing libraries on first use.

    Tries to import ``bs4``, ``lxml`` and ``html5lib`` and records the
    outcome in the module-level ``_HAS_BS4``, ``_HAS_LXML`` and
    ``_HAS_HTML5LIB`` flags.  Once ``_IMPORTS`` is true the function
    returns immediately, so the probing happens at most once.
    """
    global _IMPORTS, _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB

    if not _IMPORTS:
        try:
            import bs4  # noqa
        except ImportError:
            pass
        else:
            _HAS_BS4 = True

        try:
            import lxml  # noqa
        except ImportError:
            pass
        else:
            _HAS_LXML = True

        try:
            import html5lib  # noqa
        except ImportError:
            pass
        else:
            _HAS_HTML5LIB = True

        # remember that the probe ran so later calls are no-ops
        _IMPORTS = True
|
||||
|
||||
|
||||
#############
|
||||
# READ HTML #
|
||||
#############
|
||||
# Matches a run of carriage returns / newlines, or any run of two or more
# whitespace characters; used as the default pattern of
# `_remove_whitespace`.
_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}')

# Types that `_read` accepts as raw text or a path: the text string types
# plus the binary string type.
char_types = string_types + (binary_type,)
|
||||
|
||||
|
||||
def _remove_whitespace(s, regex=_RE_WHITESPACE):
    """Collapse extra whitespace inside a string into single spaces.

    Parameters
    ----------
    s : str or unicode
        The string to clean up.

    regex : regex
        Compiled pattern describing what counts as extra whitespace.

    Returns
    -------
    subd : str or unicode
        `s` stripped at both ends, with every match of `regex` replaced by
        one space.
    """
    trimmed = s.strip()
    return regex.sub(' ', trimmed)
|
||||
|
||||
|
||||
def _get_skiprows(skiprows):
    """Normalise the ``skiprows`` argument.

    Parameters
    ----------
    skiprows : int, slice, container
        Rows to skip; ``None`` is also accepted.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, list-like or ``None``

    Returns
    -------
    it : int or iterable
        ``0`` for ``None``, a list of row numbers for a slice, and the
        argument itself for an integer or list-like.
    """
    if skiprows is None:
        return 0

    if isinstance(skiprows, slice):
        # missing start/step fall back to 0 and 1 respectively
        first = skiprows.start or 0
        stride = skiprows.step or 1
        return lrange(first, skiprows.stop, stride)

    if isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return skiprows

    kind = type(skiprows).__name__
    raise TypeError('%r is not a valid type for skipping rows' % kind)
|
||||
|
||||
|
||||
def _read(obj):
    """Fetch raw text from a URL, a file-like object, a path or a string.

    Parameters
    ----------
    obj : str, unicode, or file-like

    Returns
    -------
    raw_text : str
        The downloaded / read content, or `obj` itself when it is a string
        that does not name an existing file.

    Raises
    ------
    TypeError
        If `obj` is not a URL, has no ``read`` attribute and is not a
        string type.
    """
    if _is_url(obj):
        with urlopen(obj) as response:
            payload = response.read()
        return payload

    if hasattr(obj, 'read'):
        return obj.read()

    if not isinstance(obj, char_types):
        raise TypeError("Cannot read object of type %r" % type(obj).__name__)

    # A string may be a path to a file on disk; if checking or reading it
    # raises TypeError/ValueError the string is treated as the content.
    try:
        if os.path.isfile(obj):
            with open(obj, 'rb') as handle:
                return handle.read()
    except (TypeError, ValueError):
        pass
    return obj
|
||||
|
||||
|
||||
class _HtmlFrameParser(object):
    """Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

        .. versionadded:: 0.23.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

        .. versionadded:: 0.23.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_tables`
        * :func:`_parse_tr`
        * :func:`_parse_thead`
        * :func:`_parse_tbody`
        * :func:`_parse_tfoot`
    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(self, io, match, attrs, encoding, displayed_only):
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only

    def parse_tables(self):
        """Build the document, find the matching tables and parse them.

        Returns
        -------
        tables : generator of (header, body, footer) tuples
            One tuple per matching table, produced lazily by
            :meth:`_build_table`.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._build_table(table) for table in tables)

    def _parse_raw_data(self, rows):
        """Parse the raw data into a list of lists.

        Parameters
        ----------
        rows : iterable of node-like
            A list of row elements.

        Returns
        -------
        data : list of list of strings
            For every row, the whitespace-normalised text of each cell
            returned by :meth:`_parse_td`.
        """
        data = [[_remove_whitespace(self._text_getter(col))
                 for col in self._parse_td(row)]
                for row in rows]
        return data

    def _text_getter(self, obj):
        """Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise com.AbstractMethodError(self)

    def _parse_td(self, obj):
        """Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like

        Returns
        -------
        columns : list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise com.AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError
            * If `match` does not match any text in the document.

        Returns
        -------
        tables : list of node-like
            A list of <table> elements to be parsed into raw data.
        """
        raise com.AbstractMethodError(self)

    def _parse_tr(self, table):
        """Return the list of row elements from the parsed table element.

        Parameters
        ----------
        table : node-like
            A table element that contains row elements.

        Returns
        -------
        rows : list of node-like
            A list row elements of a table, usually <tr> or <th> elements.
        """
        raise com.AbstractMethodError(self)

    def _parse_thead(self, table):
        """Return the header of a table.

        Parameters
        ----------
        table : node-like
            A table element that contains row elements.

        Returns
        -------
        thead : list of node-like
            The <thead>...</thead> elements; only the first one is used by
            :meth:`_parse_raw_thead`.
        """
        raise com.AbstractMethodError(self)

    def _parse_tbody(self, table):
        """Return the list of tbody elements from the parsed table element.

        Parameters
        ----------
        table : node-like
            A table element that contains row elements.

        Returns
        -------
        tbodys : list of node-like
            A list of <tbody>...</tbody> elements
        """
        raise com.AbstractMethodError(self)

    def _parse_tfoot(self, table):
        """Return the footer of the table if any.

        Parameters
        ----------
        table : node-like
            A table element that contains row elements.

        Returns
        -------
        tfoot : list of node-like
            The <tfoot>...</tfoot> elements; only the first one is used by
            :meth:`_parse_raw_tfoot`.
        """
        raise com.AbstractMethodError(self)

    def _build_doc(self):
        """Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        obj : tree-like
        """
        raise com.AbstractMethodError(self)

    def _build_table(self, table):
        """Split one table element into its raw header, body and footer.

        Returns
        -------
        header, body, footer : tuple
            The results of :meth:`_parse_raw_thead`,
            :meth:`_parse_raw_tbody` and :meth:`_parse_raw_tfoot`.
        """
        header = self._parse_raw_thead(table)
        body = self._parse_raw_tbody(table)
        footer = self._parse_raw_tfoot(table)
        return header, body, footer

    def _parse_raw_thead(self, table):
        """Return the header rows of `table` as a list of lists of text.

        Only the first ``<thead>`` is inspected, and rows whose cells are
        all empty strings are dropped.
        """
        thead = self._parse_thead(table)
        res = []
        if thead:
            for tr in self._parse_tr(thead[0]):
                cols = [self._text_getter(cell) for cell in self._parse_td(tr)]
                if any(col != '' for col in cols):
                    res.append(cols)
        return res

    def _parse_raw_tfoot(self, table):
        """Return the cell texts of the first ``<tfoot>`` as a list.

        The result is always a plain list (empty when there is no footer).
        A footer with exactly one cell used to come back as a NumPy array,
        which cannot be padded in place like the other rows and differed
        from what the lxml-based parser returns.
        """
        tfoot = self._parse_tfoot(table)
        if not tfoot:
            return []
        return [self._text_getter(cell) for cell in self._parse_td(tfoot[0])]

    def _parse_raw_tbody(self, table):
        """Return the body rows of `table` as a list of lists of text.

        Rows are collected from every ``<tbody>``; when the table has none,
        rows are taken from the table element itself.
        """
        tbodies = self._parse_tbody(table)

        raw_data = []

        if tbodies:
            for tbody in tbodies:
                raw_data.extend(self._parse_tr(tbody))
        else:
            raw_data.extend(self._parse_tr(table))

        return self._parse_raw_data(raw_data)

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """Returns list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of Tag or list of Element
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of Tag or list of Element
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        # spaces are stripped so "display: none" and "display:none" both hit
        return [x for x in tbl_list if "display:none" not in
                getattr(x, attr_name).get('style', '').replace(" ", "")]
|
||||
|
||||
|
||||
class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """HTML to DataFrame parser built on BeautifulSoup with html5lib.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    The contract of each overridden method is documented on the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs):
        super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args,
                                                                **kwargs)
        # imported here so bs4 is only required when this parser is used
        from bs4 import SoupStrainer
        self._strainer = SoupStrainer('table')

    def _text_getter(self, obj):
        return obj.text

    def _parse_td(self, row):
        cell_tags = ('td', 'th')
        return row.find_all(cell_tags)

    def _parse_tr(self, element):
        return element.find_all('tr')

    def _parse_th(self, element):
        return element.find_all('th')

    def _parse_thead(self, table):
        return table.find_all('thead')

    def _parse_tbody(self, table):
        return table.find_all('tbody')

    def _parse_tfoot(self, table):
        return table.find_all('tfoot')

    def _parse_tables(self, doc, match, attrs):
        candidates = doc.find_all(self._strainer.name, attrs=attrs)
        if not candidates:
            raise ValueError('No tables found')

        candidates = self._handle_hidden_tables(candidates, "attrs")
        hidden_style = re.compile(r"display:\s*none")

        matched = []
        seen = set()
        for table in candidates:
            if self.displayed_only:
                # drop hidden descendants before searching for the text
                for hidden in table.find_all(style=hidden_style):
                    hidden.decompose()

            if table in seen or table.find(text=match) is None:
                continue
            matched.append(table)
            seen.add(table)

        if not matched:
            raise ValueError("No tables found matching pattern {patt!r}"
                             .format(patt=match.pattern))
        return matched

    def _setup_build_doc(self):
        """Read ``self.io`` and fail early when nothing could be read."""
        raw_text = _read(self.io)
        if raw_text:
            return raw_text
        raise ValueError('No text parsed from document: {doc}'
                         .format(doc=self.io))

    def _build_doc(self):
        from bs4 import BeautifulSoup
        markup = self._setup_build_doc()
        return BeautifulSoup(markup, features='html5lib',
                             from_encoding=self.encoding)
|
||||
|
||||
|
||||
def _build_xpath_expr(attrs):
    """Turn a dict of HTML attributes into an XPath predicate.

    This gives the lxml parser the same "search by attributes" ability that
    bs4 offers through keyword arguments.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.
        A ``'class_'`` key is renamed to ``'class'`` in place, i.e. the
        caller's dict is modified.

    Returns
    -------
    expr : unicode
        An XPath expression that checks for the given HTML attributes.
    """
    # callers spell the attribute `class_` because `class` is a keyword
    if 'class_' in attrs:
        attrs['class'] = attrs.pop('class_')

    predicates = []
    for name, value in iteritems(attrs):
        predicates.append(u("@{key}={val!r}").format(key=name, val=value))

    joined = ' and '.join(predicates)
    return u('[{expr}]').format(expr=joined)
|
||||
|
||||
|
||||
# Namespace mapping handed to lxml's `xpath(..., namespaces=...)` so that
# the `re:test(...)` function can be used in XPath queries.
_re_namespace = {'re': 'http://exslt.org/regular-expressions'}
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably the URL schemes lxml can open -- confirm before relying on it.
_valid_schemes = 'http', 'file', 'ftp'
|
||||
|
||||
|
||||
class _LxmlFrameParser(_HtmlFrameParser):
    """HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs):
        super(_LxmlFrameParser, self).__init__(*args, **kwargs)

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        return row.xpath('.//td|.//th')

    def _parse_tr(self, table):
        return table.xpath('.//tr')

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # 2. go up the tree until we find a table
        query = '//table//*[re:test(text(), {patt!r})]/ancestor::table'
        xpath_expr = u(query).format(patt=pattern)

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath('.//*[@style]'):
                    if "display:none" in elem.attrib.get(
                            "style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError("No tables found matching regex {patt!r}"
                             .format(patt=pattern))
        return tables

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError
        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if _is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError):
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                # a URL that could not be fetched/decoded: re-raise unchanged
                raise
        else:
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r

    def _parse_tbody(self, table):
        return table.xpath('.//tbody')

    def _parse_thead(self, table):
        return table.xpath('.//thead')

    def _parse_tfoot(self, table):
        return table.xpath('.//tfoot')

    def _parse_raw_thead(self, table):
        """Return the header rows of `table` as lists of cleaned text.

        ``<th>`` elements that are direct children of the first ``<thead>``
        form a single header row; otherwise every ``<tr>`` in it is read.
        Rows consisting only of empty strings are dropped.
        """
        expr = './/thead'
        thead = table.xpath(expr)
        res = []
        if thead:
            # Grab any directly descending table headers first
            ths = thead[0].xpath('./th')
            if ths:
                cols = [_remove_whitespace(x.text_content()) for x in ths]
                if any(col != '' for col in cols):
                    res.append(cols)
            else:
                trs = self._parse_tr(thead[0])

                for tr in trs:
                    cols = [_remove_whitespace(x.text_content()) for x in
                            self._parse_td(tr)]

                    if any(col != '' for col in cols):
                        res.append(cols)
        return res

    def _parse_raw_tfoot(self, table):
        """Return the cleaned text of the footer cells of `table`.

        Both alternatives of the XPath union are relative (``.//``) so that
        only footer cells belonging to `table` are returned; a leading
        ``//`` would select ``<td>`` footer cells from the whole document.
        """
        expr = './/tfoot//th|.//tfoot//td'
        return [_remove_whitespace(x.text_content()) for x in
                table.xpath(expr)]
|
||||
|
||||
|
||||
def _expand_elements(body):
    """Pad ragged rows of `body` in place with empty strings.

    Every row shorter than the longest row is extended with ``''`` entries
    until it has the same length as the longest one.

    Parameters
    ----------
    body : list of list
        Table rows; modified in place.
    """
    row_lengths = Series(lmap(len, body))
    widest = row_lengths.max()
    short_rows = row_lengths[row_lengths != widest]

    for position, width in iteritems(short_rows):
        body[position] += [''] * (widest - width)
|
||||
|
||||
|
||||
def _data_to_frame(**kwargs):
    """Turn one parsed table into a DataFrame via ``TextParser``.

    Parameters
    ----------
    **kwargs
        Must contain ``data`` (a ``(head, body, foot)`` triple), ``header``
        and ``skiprows``; everything else is forwarded to ``TextParser``.

    Returns
    -------
    df : DataFrame
        The result of ``TextParser(...).read()``.
    """
    head, body, foot = kwargs.pop('data')
    header = kwargs.pop('header')
    kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])

    if head:
        body = head + body
        if header is None:  # special case when a table has <th> elements
            header_rows = lrange(len(head))
            header = 0 if header_rows == [0] else header_rows

    if foot:
        body += [foot]

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    return TextParser(body, header=header, **kwargs).read()
|
||||
|
||||
|
||||
# Maps every accepted `flavor` value to its parser class. `None` selects
# the lxml parser; 'html5lib' and 'bs4' both select the BeautifulSoup
# parser.
_valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser,
                  'html5lib': _BeautifulSoupHtml5LibFrameParser,
                  'bs4': _BeautifulSoupHtml5LibFrameParser}
|
||||
|
||||
|
||||
def _parser_dispatch(flavor):
    """Pick the parser class for a flavor and check its dependencies.

    Parameters
    ----------
    flavor : str
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
        * If the installed BeautifulSoup is 4.2.0 or older.
    ImportError
        * If you do not have the requested `flavor`
    """
    known = list(_valid_parsers.keys())
    if flavor not in known:
        raise ValueError('{invalid!r} is not a valid flavor, valid flavors '
                         'are {valid}'
                         .format(invalid=flavor, valid=known))

    wants_bs4 = flavor in ('bs4', 'html5lib')
    if not wants_bs4:
        # every other valid flavor is served by lxml
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
        return _valid_parsers[flavor]

    if not _HAS_HTML5LIB:
        raise ImportError("html5lib not found, please install it")
    if not _HAS_BS4:
        raise ImportError(
            "BeautifulSoup4 (bs4) not found, please install it")

    import bs4
    installed = LooseVersion(bs4.__version__)
    if installed <= LooseVersion('4.2.0'):
        raise ValueError("A minimum version of BeautifulSoup 4.2.1 "
                         "is required")
    return _valid_parsers[flavor]
|
||||
|
||||
|
||||
def _print_as_set(s):
    """Format the elements of `s` like a set literal, e.g. ``{lxml, bs4}``.

    Parameters
    ----------
    s : iterable
        Elements to display; each one is rendered with ``pprint_thing``.

    Returns
    -------
    str
        The rendered elements joined by ``', '`` and wrapped in braces.
    """
    items = ', '.join(pprint_thing(el) for el in s)
    # '{{' and '}}' are literal braces; the inner '{arg}' is the field.
    # The previous '{{arg}}' escaped both braces and always yielded '{arg}'.
    return '{{{arg}}}'.format(arg=items)
|
||||
|
||||
|
||||
def _validate_flavor(flavor):
    """Normalise the ``flavor`` argument into a tuple of flavor names.

    Parameters
    ----------
    flavor : None, str, or iterable of str
        ``None`` means "try lxml, then bs4"; a single string is wrapped
        into a one-element tuple.

    Returns
    -------
    flavor : tuple of str

    Raises
    ------
    TypeError
        If `flavor` is an iterable containing non-string elements.
    ValueError
        If `flavor` is neither ``None``, a string nor an iterable, or if
        none of the given names is a key of ``_valid_parsers``.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` on Python 3 and in ``collections`` on Python 2.
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    if flavor is None:
        flavor = 'lxml', 'bs4'
    elif isinstance(flavor, string_types):
        flavor = flavor,
    elif isinstance(flavor, Iterable):
        type_name = type(flavor).__name__
        # materialise first so one-shot iterators survive the check below
        flavor = tuple(flavor)
        if not all(isinstance(flav, string_types) for flav in flavor):
            raise TypeError('Object of type {typ!r} is not an iterable of '
                            'strings'
                            .format(typ=type_name))
    else:
        # strings were handled above, so plain formatting is enough here
        raise ValueError('{flavor} is not a valid flavor'
                         .format(flavor=flavor))

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    # only complain when *none* of the requested flavors is known
    if not flavor_set & valid_flavors:
        raise ValueError('{invalid} is not a valid set of flavors, valid '
                         'flavors are {valid}'
                         .format(invalid=_print_as_set(flavor_set),
                                 valid=_print_as_set(valid_flavors)))
    return flavor
|
||||
|
||||
|
||||
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    """Parse `io` with the first flavor that works and build DataFrames.

    Each flavor returned by ``_validate_flavor`` is tried in order.  When
    one raises, a seekable `io` is rewound before the next attempt; if all
    flavors fail the last exception is re-raised.  Tables that raise
    ``EmptyDataError`` while being converted are skipped.

    Returns
    -------
    ret : list of DataFrame
    """
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    # hack around python 3 deleting the exception variable
    retained = None
    parsed = False
    for flav in flavor:
        parser_cls = _parser_dispatch(flav)
        frame_parser = parser_cls(io, compiled_match, attrs, encoding,
                                  displayed_only)

        try:
            tables = frame_parser.parse_tables()
        except Exception as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, 'seekable'):
                if io.seekable():
                    io.seek(0)
                else:
                    # if we couldn't rewind it, let the user know
                    raise ValueError('The flavor {} failed to parse your input. '
                                     'Since you passed a non-rewindable file '
                                     'object, we can\'t rewind it to try '
                                     'another parser. Try read_html() with a '
                                     'different flavor.'.format(flav))

            retained = caught
            continue

        parsed = True
        break

    if not parsed:
        raise_with_traceback(retained)

    ret = []
    for table in tables:
        try:
            frame = _data_to_frame(data=table, **kwargs)
        except EmptyDataError:  # empty table
            continue
        ret.append(frame)
    return ret
|
||||
|
||||
|
||||
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
              skiprows=None, attrs=None, parse_dates=False,
              tupleize_cols=None, thousands=',', encoding=None,
              decimal='.', converters=None, na_values=None,
              keep_default_na=True, displayed_only=True):
    r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str or file-like
        A URL, a file-like object, or a raw string containing HTML. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        Only tables containing text that matches this regex or string are
        returned. Defaults to '.+' (match any non-empty string), which
        returns all tables contained on a page; unless the HTML is extremely
        simple you will probably want something more specific. The value is
        compiled to a regular expression so that Beautiful Soup and lxml
        behave consistently.

    flavor : str or None, container of strings
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like or None, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like or None, optional
        The column (or list of columns) to use to create the index.

    skiprows : int or list-like or slice or None, optional
        0-based. Number of rows to skip after parsing the column integer. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'. Negative integers are
        rejected.

    attrs : dict or None, optional
        A dictionary of attributes used to identify the table in the HTML.
        These are not checked for validity before being passed to lxml or
        Beautiful Soup. However, these attributes must be valid HTML table
        attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <http://www.w3.org/TR/html-markup/global-attributes.html>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <http://www.w3.org/TR/html-markup/table.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~pandas.read_csv` for more details.

    tupleize_cols : bool, optional
        If ``False`` try to parse multiple header rows into a
        :class:`~pandas.MultiIndex`, otherwise return raw tuples. Defaults to
        ``False``.

        .. deprecated:: 0.21.0
           This argument will be removed and will always convert to MultiIndex

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str or None, optional
        The encoding used to decode the web page. Defaults to ``None``.
        ``None`` preserves the previous encoding behavior, which depends on
        the underlying parser library (e.g., the parser library will try to
        use the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

        .. versionadded:: 0.19.0

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

        .. versionadded:: 0.19.0

    na_values : iterable, default None
        Custom NA values

        .. versionadded:: 0.19.0

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to

        .. versionadded:: 0.19.0

    displayed_only : bool, default True
        Whether elements with "display: none" should be ignored

        .. versionadded:: 0.23.0

    Returns
    -------
    dfs : list of DataFrames

    Raises
    ------
    ValueError
        If `skiprows` is a negative integer.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data".

    Similar to :func:`~pandas.read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.

    See Also
    --------
    pandas.read_csv
    """
    _importers()

    # Validate up front: there is no point parsing the document only to
    # fail afterwards on an invalid integer `skiprows`.
    negative_skip = isinstance(skiprows, numbers.Integral) and skiprows < 0
    if negative_skip:
        raise ValueError('cannot skip rows starting from the end of the '
                         'data (you passed a negative value)')
    _validate_header_arg(header)

    # options that are forwarded untouched to the text parser
    reader_options = dict(
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        tupleize_cols=tupleize_cols,
        thousands=thousands,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
    )
    return _parse(flavor=flavor, io=io, match=match, attrs=attrs,
                  encoding=encoding, displayed_only=displayed_only,
                  **reader_options)
|
||||
Reference in New Issue
Block a user