rss_parser.py
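"""Fetch articles from RSS feeds, enrich them with Newspaper3k, and store
them, along with their tags, keywords, embedded tweets, and outbound links,
in Postgres. Feed definitions come either from the rss_feeds table or from
the State_Media_Matrix sheet of MEDIA_FILE, depending on SOURCE_TYPE.
"""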
import datetime
import traceback

import dateutil.parser
import feedparser
import nltk
import pandas as pd
import psycopg2
import pytz
from newspaper import Article

from config import MEDIA_FILE, SOURCE_TYPE, POSTGRES_STRING

# Newspaper3k's .nlp() step needs NLTK's punkt tokenizer.
nltk.download('punkt')
def make_aware(dt, timezone=pytz.utc):
    """Convert an offset-naive datetime to offset-aware in the given timezone."""
    return dt if dt.tzinfo else timezone.localize(dt)
def extract_embedded_links(article_clean_top_node):
    """Return every URL referenced by an <a> tag inside the article body."""
    # Use a relative XPath so only the article node is searched, not the
    # whole document tree it belongs to.
    return article_clean_top_node.xpath(".//a/@href")
def extract_tweet_details(article_clean_top_node):
    """Extract link, id, screen name, and text for each embedded tweet."""
    # Embedded tweets appear as <blockquote class="twitter-tweet"> elements.
    embedded_tweets = article_clean_top_node.xpath(
        ".//blockquote[contains(@class, 'twitter-tweet')]")
    tweets_data = []
    for tweet in embedded_tweets:
        tweet_data = {}
        # The tweet URL is typically the last <a> tag in the blockquote.
        hrefs = tweet.xpath(".//a/@href")
        tweet_link = hrefs[-1] if hrefs else None
        if tweet_link:
            tweet_data['tweetLink'] = tweet_link
            # The URL looks like https://twitter.com/<screenName>/status/<tweetId>,
            # so the id is the last path segment and the screen name sits three
            # segments from the end.
            parts = tweet_link.split('/')
            tweet_data['tweetId'] = parts[-1]
            tweet_data['screenName'] = parts[-3] if len(parts) > 3 else None
        # The full text of the tweet lives in the <p> tag within the blockquote.
        tweet_text_list = tweet.xpath(".//p//text()")
        tweet_data['tweetText'] = ''.join(tweet_text_list).strip() if tweet_text_list else None
        tweets_data.append(tweet_data)
    return tweets_data
# Parse one feed and insert any new, recent articles into Postgres.
def process_and_insert(known_urls, source, feedName, url, country):
    feed = feedparser.parse(url)
    print(len(feed.entries), feedName, url)
    # Open a dedicated connection so this function is safe to run concurrently.
    with psycopg2.connect(POSTGRES_STRING) as conn:
        cursor = conn.cursor()
        for entry in feed.entries:
            # Skip entries missing a link or date, URLs we already store, and
            # anything published more than two days ago.
            if 'link' not in entry or 'published' not in entry:
                continue
            if (entry.link in known_urls
                    or make_aware(dateutil.parser.parse(entry.published))
                    < make_aware(datetime.datetime.now() - datetime.timedelta(days=2))):
                continue
            try:
                # Use Newspaper3k to download and analyse the article.
                article = Article(entry.link)
                article.download()
                article.parse()
                article.nlp()
                # Map the parsed article onto the articles table columns.
                author = ', '.join(article.authors)
                createdAt = datetime.datetime.now()
                excerpt = article.text
                language = article.meta_lang
                outlet = source
                publishedDate = article.publish_date or entry.published
                siteCountry = ''  # Not yet determined; could be derived from the URL or another source.
                title = article.title
                article_url = entry.link  # Distinct name so the feed's url parameter is not shadowed.
                websiteName = feedName
                featuredImage = article.top_image  # new field
                # Add description?
                # Insert the article and get its generated id back.
                cursor.execute('''
                    INSERT INTO public.articles (
                        author,
                        country,
                        createdAt,
                        excerpt,
                        language,
                        outlet,
                        publishedDate,
                        siteCountry,
                        title,
                        url,
                        websiteName,
                        featuredImage
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    RETURNING articleId
                ''', (author, country, createdAt, excerpt, language, outlet,
                      publishedDate, siteCountry, title, article_url,
                      websiteName, featuredImage))
                # Read the generated article id before committing.
                this_article_id = cursor.fetchone()[0]
                conn.commit()
                if hasattr(article, 'tags') and len(article.tags) > 0:
                    # Insert one row per tag.
                    tag_data = [(this_article_id, tag) for tag in article.tags]
                    cursor.executemany(
                        'INSERT INTO public.tags (article_id, tag) VALUES (%s, %s)',
                        tag_data)
                if hasattr(article, 'keywords') and len(article.keywords) > 0:
                    # Insert one row per keyword.
                    keyword_data = [(this_article_id, keyword) for keyword in article.keywords]
                    cursor.executemany(
                        'INSERT INTO public.keywords (article_id, keyword) VALUES (%s, %s)',
                        keyword_data)
                conn.commit()
                # clean_top_node is the article body as an lxml element; use it
                # to pull out embedded links and tweets.
                if article.clean_top_node is not None:
                    all_urls = extract_embedded_links(article.clean_top_node)
                    embedded_tweets = extract_tweet_details(article.clean_top_node)
                    if len(embedded_tweets) > 0:
                        # .get() tolerates tweets where no link could be parsed.
                        tweet_data = [(this_article_id, tweet.get('tweetLink'),
                                       tweet.get('tweetId'), tweet.get('screenName'),
                                       tweet.get('tweetText'))
                                      for tweet in embedded_tweets]
                        cursor.executemany('''
                            INSERT INTO public.tweets (article_id, tweetLink, tweetId, screenName, tweetText)
                            VALUES (%s, %s, %s, %s, %s)
                        ''', tweet_data)
                    if len(all_urls) > 0:
                        url_data = [(this_article_id, embedded_url) for embedded_url in all_urls]
                        cursor.executemany(
                            'INSERT INTO public.urls (article_id, url) VALUES (%s, %s)',
                            url_data)
            except Exception as e:
                print(e, traceback.format_exc())
                print(url, ' | Error processing article: ', entry)
                continue
            finally:
                # Commit whatever succeeded for this entry.
                conn.commit()
    print('Done processing feed: ', url, feedName)
def process_feed(data, known_urls):
    """Unpack one feed definition and hand it to process_and_insert."""
    source, feedName, url, country = data
    process_and_insert(known_urls, source, feedName, url, country)
# Load the URLs already stored so feeds can be de-duplicated, then process
# every configured feed.
with psycopg2.connect(POSTGRES_STRING) as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT url FROM articles")
    # fetchall() returns 1-tuples; flatten into a set for fast membership tests.
    known_urls = {row[0] for row in cursor.fetchall()}
    if SOURCE_TYPE == 'database':
        # Retrieve the RSS feed definitions from the database.
        cursor.execute("SELECT source, title, url, country FROM rss_feeds")
        rows = cursor.fetchall()
        for row in rows:
            process_feed(row, known_urls)
    else:
        # Retrieve the RSS feed definitions from the media spreadsheet.
        media_df = pd.read_excel(MEDIA_FILE, sheet_name='State_Media_Matrix')
        # Drop rows without RSS feeds, then split comma-separated feed lists
        # into one row per feed.
        media_df = media_df.dropna(subset=['RSS Feeds'])
        media_df['RSS Feeds'] = media_df['RSS Feeds'].str.split(',')
        media_df = media_df.explode('RSS Feeds')
        # Drop empty feed entries and keep only tracked outlets.
        media_df = media_df[media_df['RSS Feeds'] != '']
        media_df = media_df[media_df['Tracked'] == 1]
        # # Optionally seed the rss_feeds table from the spreadsheet (left
        # # disabled; note these placeholders are sqlite-style, not psycopg2's %s):
        # for index, row in media_df.iterrows():
        #     cursor.execute('''
        #         INSERT INTO rss_feeds (source, title, url, country)
        #         VALUES (?, ?, ?, ?)
        #     ''', (row['Media company'], row['Main assets'], row['RSS Feeds'], row['Country']))
        feed_data = [(row['Media company'], row['Main assets'], row['RSS Feeds'], row['Country'])
                     for index, row in media_df.iterrows()]
        for feed in feed_data:
            process_feed(feed, known_urls)
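
# The original imports included ThreadPoolExecutor, which suggests feeds were
# meant to be processed in parallel. A minimal sketch of that, assuming each
# worker can reuse this module's process_feed (process_and_insert already
# opens its own connection per call, so workers do not share a cursor):
#
#     from concurrent.futures import ThreadPoolExecutor, as_completed
#
#     with ThreadPoolExecutor(max_workers=8) as executor:
#         futures = [executor.submit(process_feed, feed, known_urls)
#                    for feed in feed_data]
#         for future in as_completed(futures):
#             future.result()  # Re-raise any exception from the worker.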