from flask import Flask
from flask import make_response
app = Flask(__name__)
##############################
# IMPORT OTHER PY DEPENDENCIES
##############################
import json
import requests
import feedparser
from bs4 import BeautifulSoup
from collections import defaultdict
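# NOTE: flask, requests, feedparser and beautifulsoup4 are third-party packages
# (pip install flask requests feedparser beautifulsoup4). Everything below the
# imports runs at module level, i.e. the feed is scraped once when the app starts.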
##############################
# 1 / FROM RSS: GET TODAY'S URL
##############################
url = 'https://www.theguardian.com/media/news-photography/rss'
feed = feedparser.parse(url)
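# The Guardian feed lists the newest item first, so entries[0] should be today's gallery.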
latest_url = feed.entries[0]['link']
# latest_date = feed.entries[0]['date'] # add conversion for later usage
response = requests.get(latest_url).text
# double failsafe for entries that are not galleries
if 'is-immersive' not in response:
    latest_url = feed.entries[1]['link']
    response = requests.get(latest_url).text
    if 'is-immersive' not in response:
        latest_url = feed.entries[2]['link']
        response = requests.get(latest_url).text
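# Note: 'is-immersive' appears to be a class used on the Guardian's immersive gallery
# pages; its absence in the fetched HTML is treated above as "this entry is not a gallery".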
##############################
# 2 / FROM HTML: GET DATA
##############################
soup = BeautifulSoup(response, 'html.parser')
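# 'html.parser' is Python's built-in parser; 'lxml' could be swapped in for speed if installed.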
# DEFINE LISTS
photo_urls = []
photo_titles = []
photo_captions = []
photo_credits = []
photos = defaultdict(dict)
photos['items'] = {}
# GET THE photo_urls OUT OF THE SOUP
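# The media query below selects the large-screen <source> rendition of each picture;
# ' 2020w' / ' 3800w' are srcset width descriptors that are stripped to leave bare URLs.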
for photoset in soup.find_all('source', media='(min-width: 1300px) and (-webkit-min-device-pixel-ratio: 1.25), (min-width: 1300px) and (min-resolution: 120dpi)'):
    photo_url = str(photoset.attrs['srcset']).replace(' 2020w', '')
    photo_url = photo_url.replace(' 3800w', '')
    photo_urls.append(photo_url)
# GET THE photo_titles OUT OF THE SOUP
for photo_title in soup.find_all('h2', class_='gallery__caption__title'):
    photo_titles.append(str(photo_title.text))
# GET THE photo_captions OUT OF THE SOUP
for photo_caption in soup.find_all('div', class_='gallery__caption'):
    photo_caption.h2.decompose()
    photo_caption = photo_caption.text
    # collapse newlines and runs of whitespace into single spaces
    photo_caption = ' '.join(photo_caption.split())
    photo_captions.append(photo_caption)
# GET THE photo_credits OUT OF THE SOUP
for photo_credit in soup.find_all('p', class_='gallery__credit'):
    photo_credit = photo_credit.text.replace('Photograph: ', '')
    photo_credits.append(photo_credit)
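# photo_titles, photo_captions and photo_credits are assumed to be parallel lists
# with one entry per gallery photo (the markup provides one of each per figure).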
##############################
# 3 / PREPARE FISH 4 SELLING
##############################
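# The first matched <source> (index 0) is presumably the gallery's lead image, which
# has no matching title/caption, so URL N is paired with title/caption/credit N-1.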
for index, photo_url in enumerate(photo_urls):
    if index > 0:
        i = index - 1
        item = {
            'url': photo_url,
            'title': photo_titles[i],
            'caption': photo_captions[i],
            'copyright': photo_credits[i]
        }
        photos['items'].update({index: item})
##############################
# 4 / CONVERT TO JSON
##############################
json_feed = json.dumps(photos)
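# json.dumps serialises the defaultdict like a plain dict; the integer keys in
# photos['items'] become strings ("1", "2", ...) in the JSON output.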
##############################
# 5 / OUTPUT JSON
##############################
@app.route('/')
def the_output(json_feed=json_feed):
    resp = make_response(json_feed)
    resp.mimetype = 'application/json'
    return resp

if __name__ == '__main__':
    app.run(debug=True, use_reloader=True)
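# Example usage (a sketch; actual values depend on the day's gallery):
#   $ python app.py
#   $ curl http://127.0.0.1:5000/
#   {"items": {"1": {"url": "https://...", "title": "...", "caption": "...", "copyright": "..."}}}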