#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
article.py
~~~~~~~~~~~~~~~~~
Crawl article content
"""
import json
import re

import requests
from bs4 import BeautifulSoup

from config import *
from database import *
from gadget import *

def getArticle(page, latestTimestamp, conn):
    '''Get article content: fetch the article id, then fetch the content with it'''
    # convert timestamp to a readable date
    timeLocal = getDate(latestTimestamp)
    breakCount = 0
    print("===== Crawling Weibo articles posted after [%s] =====" % timeLocal)
    print('Crawling page %s of the article list' % page)
    url = articleListUrlFormat.format(page)
    data = requests.get(url, headers=articleHeaders)
    data.encoding = 'utf-8'
    data = json.loads(data.text)
    if 'ok' in data and data['ok'] == 0:
        print("Crawling finished or failed; reset the Cookie or take a break!")
        exit()
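    # Articles appear to arrive in three card layouts, each handled below:
    # 9 (article on an mblog card), 11 (a card_group wrapping a type-9 card),
    # and 8 (a bare article card with the id embedded in its scheme URL).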
    for content in data['data']['cards']:
        kwArticle = {}
        # card_type 9: article attached directly to an mblog card
        if content['card_type'] == 9:
            kwArticle['add_time'] = getTimestamp(content['mblog']['created_at'])
            kwArticle['title'] = content['mblog']['page_info']['page_title']
            object_id = content['mblog']['page_info']['object_id']
            # object_id has the form "<prefix>:<article_id>"; keep the id part
            kwArticle['article_id'] = object_id.split(":")[1]
            articleTuple = getArticleContent(kwArticle['article_id'], 1)
            if articleTuple[0] == '':
                saveFailId(kwArticle['article_id'], kwArticle['title'])
                continue
            kwArticle['content'] = articleTuple[0]
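        # card_type 11: the same article card nested inside a card_group;
        # unwrap the first group entry and parse it like a type-9 card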
        elif content['card_type'] == 11 and content['card_group'][0]['card_type'] == 9:
            kwArticle['add_time'] = getTimestamp(content['card_group'][0]['mblog']['created_at'])
            kwArticle['title'] = content['card_group'][0]['mblog']['page_info']['page_title']
            object_id = content['card_group'][0]['mblog']['page_info']['object_id']
            kwArticle['article_id'] = object_id.split(":")[1]
            articleTuple = getArticleContent(kwArticle['article_id'], 1)
            if articleTuple[0] == '':
                saveFailId(kwArticle['article_id'], kwArticle['title'])
                continue
            kwArticle['content'] = articleTuple[0]
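        # card_type 8: no mblog; the article id appears to be embedded in the
        # card's scheme URL as a query parameter ("...id=<article_id>&...")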
        elif content['card_type'] == 8:
            regex = re.compile('id=.*?&')
            result = regex.findall(content['scheme'])
            kwArticle['article_id'] = result[0].rstrip("&").split("=")[1]
            kwArticle['title'] = content['title_sub']
            articleTuple = getArticleContent(kwArticle['article_id'], 2)
            if articleTuple[0] == '':
                saveFailId(kwArticle['article_id'], kwArticle['title'])
                continue
            # the API response carries the publish time, unlike the other card types
            kwArticle['add_time'] = getTimestamp(articleTuple[1])
            kwArticle['content'] = articleTuple[0]
        if kwArticle:
            if kwArticle['add_time'] > latestTimestamp:
                print('Saving article: %s' % kwArticle['title'])
                insert_data('wb_mzm_article', conn, **kwArticle)
                print("Saved successfully!\n")
            else:
                print('Already crawled, skipping write')
                breakCount = breakCount + 1
                # stop once enough already-crawled articles have been seen
                if breakCount == 5:
                    # close the database connection before exiting
                    closeConn(conn)
                    print('Crawling is complete, exiting...')
                    exit()

def getArticleContent(article_id, card_type):
    '''Get article content by id; card_type selects the parsing path'''
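    # card_type 1: the content is embedded in the page's second <script> tag;
    # any other value falls back to the articleshow API further down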
    object_id = '1022:' + article_id
    url = articleUrlFormat.format(object_id=object_id, id=article_id)
    response = requests.get(url, headers=articleHeaders)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    # if the article content is carried in a script tag, parse it from there
    if len(soup.findAll("script")) > 1 and card_type == 1:
        jsText = soup.findAll("script")[1].text
        # extract the value of "content"; the parentheses form a capturing
        # group so findall returns only that part
        regex = re.compile(r'"content":(.*)')
        result = regex.findall(jsText)
        # strip the HTML tags
        resultSoup = BeautifulSoup(result[0], 'lxml')
        return resultSoup.body.get_text().strip(',').strip('"'), ''
    # otherwise the article content comes back from the articleshow API
    else:
        urlFormat = r'http://card.weibo.com/article/aj/articleshow?cid={}'
        url = urlFormat.format(article_id)
        response = requests.get(url, headers=articleHeaders)
        if response:
            response.encoding = 'utf-8'
            content = json.loads(response.text)
            parseContent = json.dumps(content).encode('utf-8').decode('unicode_escape')
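            # these markers in the decoded response mean the article is gone
            # ('原文章已被删除' = "the original article has been deleted") or
            # still loading ('正在加载内容' = "content is loading")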
            if isinstance(content, str) or '原文章已被删除' in parseContent or '正在加载内容' in parseContent:
                return '', ''
            articleHTML = content['data']['article']
            articleSoup = BeautifulSoup(articleHTML, 'lxml')
            articleContent = articleSoup.find('div', class_='WBA_content').text
            articleTime = articleSoup.find('span', class_='time').text
            return articleContent.strip(',').strip('"'), articleTime
        else:
            return '', ''

if __name__ == '__main__':
    conn = db_connector()
    latestTimestamp = selectData(conn, 'wb_mzm_article', 4)
    if latestTimestamp is None:
        latestTimestamp = 0
    saveLastTimestamp(latestTimestamp, 'last_article_timestamp.txt')
    print('Last crawl reached: %s' % getDate(latestTimestamp))
    articlePage = 1
    while True:
        getArticle(articlePage, latestTimestamp, conn)
        articlePage += 1
        sleepTimes(3)