-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapper.py
122 lines (107 loc) · 4.5 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import asyncio
import logging
from datetime import datetime, timedelta
from sqlalchemy.exc import IntegrityError
from agency import DailynewsAgency, MgronlineAgency, MatichonAgency, BkkbiznewsAgency, \
TheStandardAgency, PrachachatAgency, PostTodayAgency
from model import RawNewsEntity, MatichonRawNewsEntity
from database import db
from config import config
import adapter
# One scraper agency per news outlet, each configured from its own
# section of the application config (config['agency'][<outlet>]).
dailynews_agency = DailynewsAgency(config=config['agency']['dailynews'])
mgronline_agency = MgronlineAgency(config=config['agency']['mgronline'])
matichon_agency = MatichonAgency(config=config['agency']['matichon'])
bkkbiznews_agency = BkkbiznewsAgency(config=config['agency']['bkkbiznews'])
the_standard_agency = TheStandardAgency(config=config['agency']['the_standard'])
prachachat_agency = PrachachatAgency(config=config['agency']['prachachat'])
posttoday_agency = PostTodayAgency(config=config['agency']['posttoday'])
# Root logger at INFO so scraped entities and duplicate notices are visible.
logging.basicConfig(level=logging.INFO)
def insert_raw_news(raw_news_entity: RawNewsEntity) -> None:
    """Persist a scraped news entity to the shared ``db`` session.

    A unique-constraint violation (an article with the same link was
    already stored) is treated as a benign duplicate and only logged.
    Any other database error is rolled back and logged with traceback.

    :param raw_news_entity: entity to store; ``None`` means the scraper
        failed to build one, which is logged and skipped.
    """
    if raw_news_entity is None:
        logging.error('failed to create raw_news_entity')
        return
    logging.info(raw_news_entity)
    try:
        db.add(raw_news_entity)
        db.commit()
    except IntegrityError:
        # Same unique key (link) already in the table — expected on re-runs.
        db.rollback()
        logging.info('Duplicated %s', raw_news_entity.link)
    except Exception:
        db.rollback()
        # logging.exception records the active traceback with the message.
        logging.exception('failed to store raw_news_entity')
def insert_matichon_raw_news(matichon_raw_news_entity: MatichonRawNewsEntity) -> None:
    """Persist a scraped Matichon news entity to the shared ``db`` session.

    Mirrors :func:`insert_raw_news`: duplicates (unique-constraint hits on
    the link) are logged and ignored; other database errors are rolled
    back and logged with traceback.

    :param matichon_raw_news_entity: entity to store; ``None`` means the
        scraper failed to build one, which is logged and skipped.
    """
    if matichon_raw_news_entity is None:
        logging.error('failed to create matichon_raw_news_entity')
        return
    # logging.info(matichon_raw_news_entity)
    try:
        db.add(matichon_raw_news_entity)
        db.commit()
    except IntegrityError:
        # Same unique key (link) already in the table — expected on re-runs.
        db.rollback()
        logging.info('Duplicated %s', matichon_raw_news_entity.link)
    except Exception:
        db.rollback()
        # logging.exception records the active traceback with the message.
        logging.exception('failed to store matichon_raw_news_entity')
async def scrap_dailynews():
    """Scrape Dailynews, store each article locally and publish it downstream."""
    # await adapter.publish_drop_table()
    for raw_news in await dailynews_agency.scrap():
        insert_raw_news(raw_news)
        response = await adapter.publish_raw_news(raw_news)
        logging.info(response)
async def scrap_mgronline():
    """Scrape MGR Online, store each article locally and publish it downstream."""
    # await adapter.publish_drop_table()
    for raw_news in await mgronline_agency.scrap():
        insert_raw_news(raw_news)
        response = await adapter.publish_raw_news(raw_news)
        logging.info(response)
async def scrap_matichon():
    """Scrape Matichon and store results locally (downstream publishing disabled)."""
    # await adapter.publish_drop_table()
    for matichon_news in await matichon_agency.scrap():
        insert_matichon_raw_news(matichon_news)
        # Publishing to the adapter is currently switched off for Matichon.
        # post_news_response = await adapter.publish_matichon_raw_news(entity)
        logging.info(matichon_news)
async def scrap_bkkbiznews():
    """Scrape Bangkok Biz News, store each article locally and publish it downstream."""
    # await adapter.publish_drop_table()
    for raw_news in await bkkbiznews_agency.scrap():
        insert_raw_news(raw_news)
        response = await adapter.publish_raw_news(raw_news)
        logging.info(response)
async def scrap_the_standard():
    """Scrape The Standard, store each article locally and publish it downstream."""
    # await adapter.publish_drop_table()
    for raw_news in await the_standard_agency.scrap():
        insert_raw_news(raw_news)
        response = await adapter.publish_raw_news(raw_news)
        logging.info(response)
async def scrap_prachachat():
    """Scrape Prachachat, store each article locally and publish it downstream."""
    # await adapter.publish_drop_table()
    for raw_news in await prachachat_agency.scrap():
        insert_raw_news(raw_news)
        response = await adapter.publish_raw_news(raw_news)
        logging.info(response)
async def scrap_posttoday():
    """Scrape Post Today, store each article locally and publish it downstream."""
    # await adapter.publish_drop_table()
    for raw_news in await posttoday_agency.scrap():
        insert_raw_news(raw_news)
        response = await adapter.publish_raw_news(raw_news)
        logging.info(response)
async def main():
    """Run the enabled scrapers one after another (they share the db session).

    Currently disabled: scrap_mgronline, scrap_bkkbiznews,
    scrap_prachachat, scrap_posttoday.
    """
    for scraper in (scrap_dailynews, scrap_matichon, scrap_the_standard):
        await scraper()
# Script entry point: start an asyncio event loop and run all enabled scrapers.
if __name__ == '__main__':
    asyncio.run(main())