Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added 'displayed_only' option to 'read_html' #20047

Merged
merged 10 commits into from
Mar 10, 2018
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,7 @@ Other Enhancements
- :meth:`Timestamp.day_name` and :meth:`DatetimeIndex.day_name` are now available to return day names with a specified locale (:issue:`12806`)
- :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it, rather than inserting row by row.
``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`)
- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether hidden elements are skipped during parsing (``True`` by default, i.e. only displayed elements are parsed) (:issue:`20027`)

.. _whatsnew_0230.api_breaking:

Expand Down
71 changes: 66 additions & 5 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,14 @@ class _HtmlFrameParser(object):
attrs : dict
List of HTML <table> element attributes to match.

encoding : str
Copy link
Member Author

@WillAyd WillAyd Mar 7, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't add encoding as part of this change but it looks to have been undocumented from whenever it was added. Tossed something in there for now, assuming the sprint this weekend may address in more detail.

The docstrings throughout this module I think technically violate the standard by introducing blank space in between each parameter, but figured better left to the sprint than tossing in this change

Encoding to be used by parser

displayed_only : bool
Whether or not items with "display:none" should be ignored
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a versionadded tag


.. versionadded:: 0.23.0

Attributes
----------
io : str or file-like
Expand All @@ -172,6 +180,14 @@ class _HtmlFrameParser(object):
A dictionary of valid table attributes to use to search for table
elements.

encoding : str
Encoding to be used by parser

displayed_only : bool
Whether or not items with "display:none" should be ignored

.. versionadded:: 0.23.0

Notes
-----
To subclass this class effectively you must override the following methods:
Expand All @@ -187,11 +203,12 @@ class _HtmlFrameParser(object):
functionality.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add this to the attributes list

"""

def __init__(self, io, match, attrs, encoding):
def __init__(self, io, match, attrs, encoding, displayed_only):
self.io = io
self.match = match
self.attrs = attrs
self.encoding = encoding
self.displayed_only = displayed_only

def parse_tables(self):
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
Expand Down Expand Up @@ -380,6 +397,27 @@ def _parse_raw_tbody(self, table):
res = self._parse_tr(table)
return self._parse_raw_data(res)

def _handle_hidden_tables(self, tbl_list, attr_name):
    """Return list of tables, potentially removing hidden elements

    When ``self.displayed_only`` is false the input list is returned
    untouched.  Otherwise every table whose inline ``style`` attribute
    contains a ``display: none`` declaration is dropped.  The match
    ignores letter case and any whitespace (spaces, tabs, newlines)
    inside the style string.

    Parameters
    ----------
    tbl_list : list of Tag or list of Element
        Type of list elements will vary depending upon parser used
    attr_name : str
        Name of the accessor for retrieving HTML attributes

    Returns
    -------
    list of Tag or list of Element
        Return type matches `tbl_list`
    """
    if not self.displayed_only:
        return tbl_list

    def _is_hidden(table):
        style = getattr(table, attr_name).get('style', '')
        # ``str.split()`` with no argument splits on *any* whitespace, so
        # joining the pieces removes tabs and newlines as well as spaces;
        # lower-casing covers upper- or mixed-case declarations.
        return "display:none" in "".join(style.split()).lower()

    return [table for table in tbl_list if not _is_hidden(table)]


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
"""HTML to DataFrame parser that uses BeautifulSoup under the hood.
Expand Down Expand Up @@ -431,8 +469,14 @@ def _parse_tables(self, doc, match, attrs):

result = []
unique_tables = set()
tables = self._handle_hidden_tables(tables, "attrs")

for table in tables:
if self.displayed_only:
for elem in table.find_all(
style=re.compile(r"display:\s*none")):
elem.decompose()

if (table not in unique_tables and
table.find(text=match) is not None):
result.append(table)
Expand Down Expand Up @@ -528,6 +572,17 @@ def _parse_tables(self, doc, match, kwargs):

tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

tables = self._handle_hidden_tables(tables, "attrib")
if self.displayed_only:
for table in tables:
# lxml utilizes XPATH 1.0 which does not have regex
# support. As a result, we find all elements with a style
# attribute and iterate them to check for display:none
for elem in table.xpath('.//*[@style]'):
if "display:none" in elem.attrib.get(
"style", "").replace(" ", ""):
elem.getparent().remove(elem)

if not tables:
raise ValueError("No tables found matching regex {patt!r}"
.format(patt=pattern))
Expand Down Expand Up @@ -729,15 +784,15 @@ def _validate_flavor(flavor):
return flavor


def _parse(flavor, io, match, attrs, encoding, **kwargs):
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here

# hack around python 3 deleting the exception variable
retained = None
for flav in flavor:
parser = _parser_dispatch(flav)
p = parser(io, compiled_match, attrs, encoding)
p = parser(io, compiled_match, attrs, encoding, displayed_only)

try:
tables = p.parse_tables()
Expand Down Expand Up @@ -773,7 +828,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
skiprows=None, attrs=None, parse_dates=False,
tupleize_cols=None, thousands=',', encoding=None,
decimal='.', converters=None, na_values=None,
keep_default_na=True):
keep_default_na=True, displayed_only=True):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

Parameters
Expand Down Expand Up @@ -877,6 +932,11 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,

.. versionadded:: 0.19.0

displayed_only : bool, default True
Whether elements with "display: none" should be ignored

.. versionadded:: 0.23.0

Returns
-------
dfs : list of DataFrames
Expand Down Expand Up @@ -924,4 +984,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands, attrs=attrs, encoding=encoding,
decimal=decimal, converters=converters, na_values=na_values,
keep_default_na=keep_default_na)
keep_default_na=keep_default_na,
displayed_only=displayed_only)
66 changes: 66 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,39 @@ def test_wikipedia_states_table(self):
result = self.read_html(data, 'Arizona', header=1)[0]
assert result['sq mi'].dtype == np.dtype('float64')

@pytest.mark.parametrize("displayed_only,exp0,exp1", [
    (True, DataFrame(["foo"]), None),
    (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))])
def test_displayed_only(self, displayed_only, exp0, exp1):
    """Check that ``displayed_only`` toggles parsing of hidden markup.

    GH 20027: the fixture holds one visible table whose single cell mixes
    plain text with three ``display:none`` spans, followed by a second
    table that is itself styled ``display: none``.
    """
    markup = """<html>
<body>
<table>
<tr>
<td>
foo
<span style="display:none;text-align:center">bar</span>
<span style="display:none">baz</span>
<span style="display: none">qux</span>
</td>
</tr>
</table>
<table style="display: none">
<tr>
<td>foo</td>
</tr>
</table>
</body>
</html>"""
    frames = self.read_html(StringIO(markup),
                            displayed_only=displayed_only)

    # The first (visible) table is compared in both parametrizations.
    tm.assert_frame_equal(frames[0], exp0)

    if exp1 is None:
        assert len(frames) == 1  # Should not parse hidden table
    else:
        tm.assert_frame_equal(frames[1], exp1)

def test_decimal_rows(self):

# GH 12907
Expand Down Expand Up @@ -896,6 +929,39 @@ def test_computer_sales_page(self):
data = os.path.join(DATA_PATH, 'computer_sales_page.html')
self.read_html(data, header=[0, 1])

@pytest.mark.parametrize("displayed_only,exp0,exp1", [
Copy link
Member Author

@WillAyd WillAyd Mar 7, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Blatant copy/paste of the method above. I was kind of surprised how few tests were shared between the parsers, so there's opportunity here to consolidate a lot of these tests into a base class but I figured that was better done comprehensively than trying to shimmy into this change

(True, DataFrame(["foo"]), None),
(False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))])
def test_displayed_only(self, displayed_only, exp0, exp1):
# GH 20027
data = StringIO("""<html>
<body>
<table>
<tr>
<td>
foo
<span style="display:none;text-align:center">bar</span>
<span style="display:none">baz</span>
<span style="display: none">qux</span>
</td>
</tr>
</table>
<table style="display: none">
<tr>
<td>foo</td>
</tr>
</table>
</body>
</html>""")

dfs = self.read_html(data, displayed_only=displayed_only)
tm.assert_frame_equal(dfs[0], exp0)

if exp1 is not None:
tm.assert_frame_equal(dfs[1], exp1)
else:
assert len(dfs) == 1 # Should not parse hidden table


def test_invalid_flavor():
url = 'google.com'
Expand Down