From 0e50b53d056359fa1020c9279e4f876be90cb484 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Mon, 7 May 2018 12:31:39 +0700 Subject: [PATCH] Release version 0.7. Better HTML5 support and an important bugfix. --- .travis.yml | 2 +- Makefile | 22 +++++++++++----------- README.rst | 12 +++++++----- readability/readability.py | 10 +++++----- setup.py | 3 +-- tests/test_article_only.py | 31 +++++++++++++++++++++++++++++++ 6 files changed, 56 insertions(+), 24 deletions(-) diff --git a/.travis.yml b/.travis.yml index b0ebc5ed..b542c481 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ env: before_install: # work around https://github.com/travis-ci/travis-ci/issues/8363 - - pyenv global system 3.5 + - pyenv global system 3.6 install: - travis_retry pip install -U pip wheel tox diff --git a/Makefile b/Makefile index 0a28f375..3daf2d1d 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,9 @@ # Makefile to help automate tasks WD := $(shell pwd) -PY := .env/bin/python -PIP := .env/bin/pip -PEP8 := .env/bin/pep8 -NOSE := .env/bin/nosetests - +PY := .venv/bin/python +PIP := .venv/bin/pip +PEP8 := .venv/bin/pep8 +NOSE := .venv/bin/nosetests # ########### # Tests rule! 
@@ -22,16 +21,17 @@ $(NOSE): .PHONY: all all: venv develop -venv: bin/python -bin/python: - virtualenv .env +venv: .venv/bin/python + +.venv/bin/python: + virtualenv .venv .PHONY: clean_venv clean_venv: - rm -rf .env + rm -rf .venv -develop: .env/lib/python*/site-packages/readability-lxml.egg-link -.env/lib/python*/site-packages/readability-lxml.egg-link: +develop: .venv/lib/python*/site-packages/readability-lxml.egg-link +.venv/lib/python*/site-packages/readability-lxml.egg-link: $(PY) setup.py develop diff --git a/README.rst b/README.rst index 51eac4af..518c7553 100644 --- a/README.rst +++ b/README.rst @@ -35,13 +35,15 @@ Usage Change Log ---------- -- 0.3 Added Document.encoding, positive\_keywords and - negative\_keywords -- 0.4 Added Videos loading and allowed more images per paragraph -- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and - 3.4 +- 0.7 Improved HTML5 tag handling. Heuristics were changed for a lot of sites. Fixed an important + bug with stripping unwanted HTML nodes (previously only the first matching node was removed). 
- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 and 3.4 +- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and + 3.4 +- 0.4 Added Videos loading and allowed more images per paragraph +- 0.3 Added Document.encoding, positive\_keywords and + negative\_keywords Licensing ========= diff --git a/readability/readability.py b/readability/readability.py index 90fbc138..12f3d959 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -381,13 +381,13 @@ def class_weight(self, e): def score_node(self, elem): content_score = self.class_weight(elem) name = elem.tag.lower() - if name == "div": + if name in ["div", "article"]: content_score += 5 elif name in ["pre", "td", "blockquote"]: content_score += 3 - elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]: + elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]: content_score -= 3 - elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]: + elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"]: content_score -= 5 return { 'content_score': content_score, @@ -463,7 +463,7 @@ def sanitize(self, node, candidates): allowed = {} # Conditionally clean s,