Skip to content

Commit

Permalink
Release version 0.7. Better HTML5 support and an important bugfix.
Browse files Browse the repository at this point in the history
  • Loading branch information
buriy committed May 7, 2018
1 parent 537de2b commit 0e50b53
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ env:

before_install:
# work around https://github.com/travis-ci/travis-ci/issues/8363
- pyenv global system 3.5
- pyenv global system 3.6

install:
- travis_retry pip install -U pip wheel tox
Expand Down
22 changes: 11 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
# Makefile to help automate tasks
WD := $(shell pwd)
PY := .env/bin/python
PIP := .env/bin/pip
PEP8 := .env/bin/pep8
NOSE := .env/bin/nosetests

PY := .venv/bin/python
PIP := .venv/bin/pip
PEP8 := .venv/bin/pep8
NOSE := .venv/bin/nosetests

# ###########
# Tests rule!
Expand All @@ -22,16 +21,17 @@ $(NOSE):
.PHONY: all
all: venv develop

venv: bin/python
bin/python:
virtualenv .env
venv: .venv/bin/python

.venv/bin/python:
virtualenv .venv

.PHONY: clean_venv
clean_venv:
rm -rf .env
rm -rf .venv

develop: .env/lib/python*/site-packages/readability-lxml.egg-link
.env/lib/python*/site-packages/readability-lxml.egg-link:
develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
.venv/lib/python*/site-packages/readability-lxml.egg-link:
$(PY) setup.py develop


Expand Down
12 changes: 7 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,15 @@ Usage
Change Log
----------

- 0.3 Added Document.encoding, positive\_keywords and
negative\_keywords
- 0.4 Added Videos loading and allowed more images per paragraph
- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and
3.4
- 0.7 Improved HTML5 tag handling. Heuristics were changed for a lot of sites. Fixed an important
bug with stripping unwanted HTML nodes (previously only the first matching node was removed).
- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3
and 3.4
- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and
3.4
- 0.4 Added Videos loading and allowed more images per paragraph
- 0.3 Added Document.encoding, positive\_keywords and
negative\_keywords

Licensing
=========
Expand Down
10 changes: 5 additions & 5 deletions readability/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,13 +381,13 @@ def class_weight(self, e):
def score_node(self, elem):
content_score = self.class_weight(elem)
name = elem.tag.lower()
if name == "div":
if name in ["div", "article"]:
content_score += 5
elif name in ["pre", "td", "blockquote"]:
content_score += 3
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
content_score -= 3
elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"]:
content_score -= 5
return {
'content_score': content_score,
Expand Down Expand Up @@ -463,7 +463,7 @@ def sanitize(self, node, candidates):

allowed = {}
# Conditionally clean <table>s, <ul>s, <div>s and HTML5 sectioning tags (<aside>, <header>, <footer>, <section>)
for el in self.reverse_tags(node, "table", "ul", "div"):
for el in self.reverse_tags(node, "table", "ul", "div", "aside", "header", "footer", "section"):
if el in allowed:
continue
weight = self.class_weight(el)
Expand Down Expand Up @@ -577,7 +577,7 @@ def sanitize(self, node, candidates):
if siblings and sum(siblings) > 1000:
to_remove = False
log.debug("Allowing %s" % describe(el))
for desnode in self.tags(el, "table", "ul", "div"):
for desnode in self.tags(el, "table", "ul", "div", "section"):
allowed[desnode] = True

if to_remove:
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

setup(
name="readability-lxml",
version="0.6.2",
version="0.7",
author="Yuri Baburov",
author_email="burchik@gmail.com",
description="fast html to text parser (article readability tool) with python3 support",
Expand Down Expand Up @@ -43,6 +43,5 @@
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",

],
)
31 changes: 31 additions & 0 deletions tests/test_article_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,34 @@ def test_best_elem_is_root_and_passing(self):
)
doc = Document(sample)
doc.summary()

def test_correct_cleanup(self):
    """Summary keeps the article text and drops comment/aside nodes.

    The fixture holds one <article> with real text plus several
    <aside>/<div> "comment" nodes; only the article body is expected
    to remain in the summary output.
    """
    html = """
<html>
<body>
<section>test section</section>
<article class="">
<p>Lot of text here.</p>
<div id="advertisement"><a href="link">Ad</a></div>
<p>More text is written here, and contains punctuation and dots.</p>
</article>
<aside id="comment1"/>
<div id="comment2">
<a href="asd">spam</a>
<a href="asd">spam</a>
<a href="asd">spam</a>
</div>
<div id="comment3"/>
<aside id="comment4">A small comment.</aside>
<div id="comment5"><p>The comment is also helpful, but it's
still not the correct item to be extracted.</p>
<p>It's even longer than the article itself!"</p></div>
</body>
</html>
"""
    summary = Document(html).summary()
    # Text from the second article paragraph must be kept.
    assert 'punctuation' in summary
    # No id="comment*" node and no <aside> tag may leak into the output.
    assert 'comment' not in summary
    assert 'aside' not in summary

0 comments on commit 0e50b53

Please sign in to comment.