From 2686e3df7cd149b8bbb6b6c632b73c4bbe2366a5 Mon Sep 17 00:00:00 2001 From: Asad Dhamani Date: Sat, 4 Apr 2015 07:31:58 +0530 Subject: [PATCH] Switch to UnicodeDammit for encoding detection --- readability/encoding.py | 63 +++++++++++++---------------------------- setup.py | 3 +- 2 files changed, 21 insertions(+), 45 deletions(-) diff --git a/readability/encoding.py b/readability/encoding.py index fb4761df..5df36eaa 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -1,48 +1,23 @@ -import re -import chardet +from bs4 import UnicodeDammit def get_encoding(page): - # Regex for XML and HTML Meta charset declaration - charset_re = re.compile(r']', flags=re.I) - pragma_re = re.compile(r']', flags=re.I) - xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]') - - declared_encodings = (charset_re.findall(page) + - pragma_re.findall(page) + - xml_re.findall(page)) - - # Try any declared encodings - if len(declared_encodings) > 0: - for declared_encoding in declared_encodings: - try: - page.decode(custom_decode(declared_encoding)) - return custom_decode(declared_encoding) - except UnicodeDecodeError: - pass - - # Fallback to chardet if declared encodings fail - text = re.sub(']*>\s*', ' ', page) - enc = 'utf-8' - if not text.strip() or len(text) < 10: - return enc # can't guess - res = chardet.detect(text) - enc = res['encoding'] or 'utf-8' - #print '->', enc, "%.2f" % res['confidence'] - enc = custom_decode(enc) + # Pass in html to UnicodeDammit for encoding detection + page = UnicodeDammit(page) + enc = page.original_encoding return enc -def custom_decode(encoding): - """Overrides encoding when charset declaration - or charset determination is a subset of a larger - charset. Created because of issues with Chinese websites""" - encoding = encoding.lower() - alternates = { - 'big5': 'big5hkscs', - 'gb2312': 'gb18030', - 'ascii': 'utf-8', - 'MacCyrillic': 'cp1251', - } - if encoding in alternates: - return alternates[encoding] - else: - return encoding +# def custom_decode(encoding): +# """Overrides encoding when charset declaration +# or charset determination is a subset of a larger +# charset. Created because of issues with Chinese websites""" +# encoding = encoding.lower() +# alternates = { +# 'big5': 'big5hkscs', +# 'gb2312': 'gb18030', +# 'ascii': 'utf-8', +# 'MacCyrillic': 'cp1251', +# } +# if encoding in alternates: +# return alternates[encoding] +# else: +# return encoding diff --git a/setup.py b/setup.py index e7bb5884..bfa66983 100755 --- a/setup.py +++ b/setup.py @@ -24,7 +24,8 @@ packages=['readability'], install_requires=[ "chardet", - lxml_requirement + lxml_requirement, + "beautifulsoup4 >= 4.3.2" ], classifiers=[ "Environment :: Web Environment",