forked from YuHsuan87/Taiwan_news_dataset
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_parsing.py
47 lines (41 loc) · 1.44 KB
/
test_parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import news.db
import news.preprocess
from bs4 import BeautifulSoup
from tqdm import tqdm
if __name__ == '__main__':
    # Manual smoke test for the FTV parser: read every record stored in
    # 'ftv.db', parse the FTV ones, write the parsed results into 'test2.db',
    # then print how many parsed records ended up with each field empty.
    dset = news.db.read.AllRecords(db_name='ftv.db')

    res = []
    for record in tqdm(dset):
        try:
            # Only records whose company is '民視' are parsed here.
            # For debugging a single article, this condition can be swapped
            # for a URL-fragment filter, e.g.
            # `'120940/5544528' in record.url`.
            if record.company == '民視':
                res.append(news.preprocess.ftv.parse(record))
        except Exception as err:
            # Best-effort run: report the failure and continue with the
            # remaining records instead of aborting the whole pass.
            print(err.args)

    conn = news.db.util.get_conn(db_name='test2.db')
    try:
        cur = conn.cursor()
        news.db.create.create_table(cur=cur)
        news.db.write.write_new_records(cur=cur, news_list=res)
        # NOTE(review): no explicit commit happens here; presumably
        # `write_new_records` commits on its own -- TODO confirm.
    finally:
        # Always release the connection, even if table creation or the
        # write step raises.
        conn.close()

    # Parsed records whose given field is falsy (None, empty string, ...).
    n_article = [r for r in res if not r.article]
    n_category = [r for r in res if not r.category]
    n_company = [r for r in res if not r.company]
    n_datetime = [r for r in res if not r.datetime]
    n_raw_xml = [r for r in res if not r.raw_xml]
    n_reporter = [r for r in res if not r.reporter]
    n_title = [r for r in res if not r.title]
    n_url = [r for r in res if not r.url]

    # The report lines are intentionally flush-left: any indentation inside
    # the triple-quoted string would become part of the printed output.
    s = f"""empty count
article: {len(n_article)}
category: {len(n_category)}
company: {len(n_company)}
datetime: {len(n_datetime)}
raw_xml: {len(n_raw_xml)}
reporter: {len(n_reporter)}
title: {len(n_title)}
url: {len(n_url)}
"""
    print(s)
def list_reporter(res):
    """Return the distinct ``reporter`` values found in *res*.

    Args:
        res: Iterable of objects exposing a ``reporter`` attribute.

    Returns:
        A ``set`` containing each ``reporter`` value exactly once; an empty
        set when *res* is empty.
    """
    # A set comprehension avoids building an intermediate list first.
    return {r.reporter for r in res}