Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for 'language' field in scraped recipes #144

Merged
merged 26 commits into from
Apr 29, 2020
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
94d6544
Allow fallback from schema.org data to abstract scraper functions
jayaddison Apr 27, 2020
79c7573
Implement schema.org, HTML tag, and META tag language scraping
jayaddison Apr 27, 2020
1467f2d
Fixup: remove duplicate space
jayaddison Apr 27, 2020
fc223a0
Add test coverage for schema.org inLanguage field
jayaddison Apr 27, 2020
9541cc4
Only accept first language from meta http-equiv content-language
jayaddison Apr 27, 2020
7e7784f
Add BCP47 validation via language-tags library
jayaddison Apr 27, 2020
6b5c0ca
Return formatted language code tags
jayaddison Apr 27, 2020
1d9a1cc
Cleanup
jayaddison Apr 27, 2020
64eb829
Remove 'en' special-casing
jayaddison Apr 27, 2020
3e03be6
Remove 'en' if a more-specific alternative candidate language exists
jayaddison Apr 27, 2020
f86a2ae
Add explanatory comment
jayaddison Apr 27, 2020
b32d18b
Return first element from set iteration (safe for empty sets)
jayaddison Apr 27, 2020
793393a
Refactor schemaorg decorator
jayaddison Apr 28, 2020
b656516
Remove empty newline
jayaddison Apr 28, 2020
18c7e78
Cleanup
jayaddison Apr 28, 2020
96e4cce
Restore support for python3.5
jayaddison Apr 28, 2020
2182f7d
Add missing 'return' statement
jayaddison Apr 28, 2020
a74dc1f
Allow caller to enable meta http-equiv parsing
jayaddison Apr 28, 2020
45a8d71
Merge branch 'master' into language-field
jayaddison Apr 28, 2020
6698e36
Remove kwargs from scrape_me
jayaddison Apr 29, 2020
56214ea
Refactor WebsiteNotImplementedError
jayaddison Apr 29, 2020
4dc845d
wip
jayaddison Apr 29, 2020
39c529b
Introduce experimental 'harvest' method
jayaddison Apr 29, 2020
447e0d7
Argument ordering consistency
jayaddison Apr 29, 2020
666a8d5
Update dependencies in setup.py
jayaddison Apr 29, 2020
51d2352
Merge branch 'master' into language-field
jayaddison Apr 29, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions recipe_scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ class WebsiteNotImplementedError(NotImplementedError):
pass


def scrape_me(url_path):
def scrape_me(url_path, **kwargs):
jayaddison marked this conversation as resolved.
Show resolved Hide resolved

host_name = url_path_to_dict(url_path.replace('://www.', '://'))['host']

Expand All @@ -149,7 +149,7 @@ def scrape_me(url_path):
"Website ({}) is not supported".format(host_name)
)

return scraper(url_path)
return scraper(url_path, **kwargs)


__all__ = ['scrape_me']
Expand Down
74 changes: 64 additions & 10 deletions recipe_scrapers/_abstract.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import requests
from bs4 import BeautifulSoup
from language_tags import tags

from ._schemaorg import (
SchemaOrg,
Expand All @@ -19,30 +20,46 @@ class Decorators:
Define decorators for AbstractScraper methods here.
"""
@staticmethod
def schema_org_priority(decorated):
    """
    Use SchemaOrg parser with priority (if there's data in it)
    On exception raised - continue by default.
    If there's no data (no schema implemented on the site) - continue by default

    The schema method with the same name as ``decorated`` is looked up on
    ``self.schema``. Its result is returned when it is truthy; otherwise
    (no schema data, a SchemaOrgException, or an empty result) the
    decorated method is called instead.

    :param decorated: the scraper method to fall back to.
    :return: a wrapper with the same call signature as ``decorated``.
    :raises SchemaOrgException: when ``self.schema`` has no method named
        like ``decorated``.
    """
    from functools import wraps

    @wraps(decorated)
    def schema_org_priority_wrapper(self, *args, **kwargs):
        # A default of None lets the explicit SchemaOrgException below
        # fire, instead of getattr raising a bare AttributeError first.
        function = getattr(self.schema, decorated.__name__, None)
        if not function:
            raise SchemaOrgException(
                "Function '{}' not found in schema"
                .format(decorated.__name__)
            )

        if not self.schema.data:
            return decorated(self, *args, **kwargs)

        try:
            value = function(*args, **kwargs)
        except SchemaOrgException:
            return decorated(self, *args, **kwargs)
        # An empty schema value also falls back to the decorated method.
        return value or decorated(self, *args, **kwargs)

    return schema_org_priority_wrapper

def __init__(self, url, test=False):
@staticmethod
def bcp47_validate(function):
    """
    Validate the decorated method's return value as a BCP 47 language tag.

    The wrapper returns the tag as formatted by the language-tags library
    when ``tag.valid`` is true, and None when the tag is invalid or when
    the decorated method returned nothing (None or an empty string).

    :param function: the scraper method producing a candidate language.
    :return: a wrapper with the same call signature as ``function``.
    """
    from functools import wraps

    # wraps() keeps the decorated method's __name__, so decorators that
    # look methods up by name still see the original name.
    @wraps(function)
    def bcp47_validate_wrapper(self, *args, **kwargs):
        candidate = function(self, *args, **kwargs)
        # No language was found: there is nothing to validate.
        if not candidate:
            return None
        tag = tags.tag(candidate)
        return str(tag) if tag.valid else None
    return bcp47_validate_wrapper

def __init__(self, url, test=False, **kwargs):
if test: # when testing, we load a file
with url:
page_data = url.read()
else:
page_data = requests.get(url, headers=HEADERS).content

self.meta_http_equiv = kwargs.get('meta_http_equiv', False)
self.soup = BeautifulSoup(page_data, "html.parser")
self.schema = SchemaOrg(page_data)
self.url = url
Expand Down Expand Up @@ -88,6 +105,43 @@ def image(self):
except AttributeError: # if image not found
raise NotImplementedError("This should be implemented.")

@Decorators.bcp47_validate
@Decorators.schema_org_priority
def language(self):
    """
    Human language the recipe is written in.

    May be overridden by individual scrapers.

    Candidates are collected from the ``lang`` attribute of the ``<html>``
    tag and, only when ``self.meta_http_equiv`` is truthy, from the first
    entry of a ``<meta http-equiv="content-language">`` tag.

    :return: one candidate language string, or None when the page declares
        no language. When several candidates remain, which one is returned
        is unspecified (they are held in a set).
    """
    candidate_languages = set()

    # find() returns None when no <html> tag carries a lang attribute;
    # skip it rather than failing with AttributeError on None.
    html = self.soup.find(
        'html',
        {'lang': True}
    )
    if html is not None:
        html_language = html.get('lang').strip()
        if html_language:
            candidate_languages.add(html_language)

    # Deprecated: check for a meta http-equiv header
    # See: https://www.w3.org/International/questions/qa-http-and-lang
    meta_language = self.soup.find(
        'meta',
        {
            'http-equiv': lambda x: x and x.lower() == 'content-language',
            'content': True
        }
    ) if self.meta_http_equiv else None
    if meta_language:
        # Only the first language of a comma-separated list is accepted;
        # strip the whitespace that commonly surrounds list entries.
        first_language = meta_language.get('content').split(',')[0].strip()
        if first_language:
            candidate_languages.add(first_language)

    # If other langs exist, remove 'en' commonly generated by HTML editors
    if len(candidate_languages) > 1:
        candidate_languages.discard('en')

    # Return the first candidate language (None for an empty set)
    return next(iter(candidate_languages), None)

@Decorators.schema_org_priority
def ingredients(self):
    """
    Placeholder that always raises NotImplementedError.

    It is wrapped by ``Decorators.schema_org_priority``.
    """
    message = "This should be implemented."
    raise NotImplementedError(message)
Expand Down
3 changes: 3 additions & 0 deletions recipe_scrapers/_schemaorg.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ def __init__(self, page_data):
self.data = self.data.get('mainEntity')
return

def language(self):
    """
    Return the ``inLanguage`` entry of the schema data.

    Falls back to the ``language`` entry when ``inLanguage`` is missing or
    empty.
    """
    in_language = self.data.get("inLanguage")
    if in_language:
        return in_language
    return self.data.get("language")

def title(self):
    """Return the ``name`` entry of the schema data, or None when absent."""
    recipe_name = self.data.get("name")
    return recipe_name

Expand Down
6 changes: 6 additions & 0 deletions recipe_scrapers/tests/test_inspiralized.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ def test_host(self):
self.harvester_class.host()
)

def test_language(self):
    # The harvester built in setUp must report 'en-US' as its language.
    expected_language = 'en-US'
    actual_language = self.harvester_class.language()
    self.assertEqual(expected_language, actual_language)

def test_title(self):
self.assertEqual(
self.harvester_class.title(),
Expand Down
8 changes: 7 additions & 1 deletion recipe_scrapers/tests/test_mindmegette.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,20 @@ def setUp(self):
'test_data',
'mindmegette.testhtml'
)) as file_opened:
self.harvester_class = Mindmegette(file_opened, test=True)
self.harvester_class = Mindmegette(file_opened, meta_http_equiv=True, test=True)

def test_host(self):
self.assertEqual(
'mindmegette.hu',
self.harvester_class.host()
)

def test_language(self):
    # The harvester built in setUp must report 'hu-HU' as its language.
    expected_language = 'hu-HU'
    actual_language = self.harvester_class.language()
    self.assertEqual(expected_language, actual_language)

def test_title(self):
self.assertEqual(
self.harvester_class.title(),
Expand Down
6 changes: 6 additions & 0 deletions recipe_scrapers/tests/test_przepisy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ def test_host(self):
self.harvester_class.host()
)

def test_language(self):
    # The harvester built in setUp must report 'pl' as its language.
    expected_language = 'pl'
    actual_language = self.harvester_class.language()
    self.assertEqual(expected_language, actual_language)

def test_title(self):
self.assertEqual(
'Placki ziemniaczane',
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
beautifulsoup4>=4.6.0
coverage>=4.5.1
extruct>=0.8.0
language-tags>=1.0.0
requests>=2.19.1