-
Notifications
You must be signed in to change notification settings - Fork 5
/
weibo_follow.py
123 lines (105 loc) · 4.98 KB
/
weibo_follow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import random
import sys
import traceback
from time import sleep
import requests
from lxml import etree
from tqdm import tqdm
from weibo import Weibo
class Follow(object):
    """Collect the accounts that a Weibo user follows by scraping weibo.cn.

    After ``get_follow_list()`` has run, ``follow_list`` holds the user's own
    id followed by the ids of the followed accounts that passed the filter,
    and ``follow_name_list`` holds the matching display names in the same
    order.
    """

    # Seconds to wait for weibo.cn before a request is abandoned; without a
    # timeout a stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 10
    # Followed accounts are kept only when they have fewer followers than this.
    MAX_FANS = 1000

    def __init__(self, user_id, cookie):
        """Store the crawl target and the cookies used for every request.

        Args:
            user_id: Numeric Weibo user id; must be an ``int``.
            cookie: Dict passed as ``cookies=`` to ``requests.get``.

        Exits the interpreter via ``sys.exit`` when ``user_id`` is not an int.
        """
        if not isinstance(user_id, int):
            sys.exit(u'user_id值应为一串数字形式,请重新输入')
        self.user_id = user_id
        self.cookie = cookie
        # Ids of every account to crawl; seeded with the user themself.
        self.follow_list = [str(user_id)]
        # Display names collected alongside follow_list.
        self.follow_name_list = []

    def deal_html(self, url):
        """Download ``url`` and return it parsed as an lxml HTML tree.

        Best-effort: any failure is printed with its traceback and ``None``
        is returned, so callers must check the result before using it.
        """
        try:
            html = requests.get(
                url, cookies=self.cookie,
                timeout=self.REQUEST_TIMEOUT).content
            return etree.HTML(html)
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()
            return None

    def get_page_num(self):
        """Return how many pages the user's follow list spans.

        Also appends the user's own display name to ``follow_name_list``.

        Raises:
            RuntimeError: the follow page could not be fetched, or it does
                not contain the expected user-name banner.
        """
        url = "https://weibo.cn/%d/follow" % self.user_id
        selector = self.deal_html(url)
        if selector is None:
            raise RuntimeError(
                u'无法获取用户%d的关注列表页面,请检查网络或cookie是否有效'
                % self.user_id)
        banner = selector.xpath('//div[@class="ut"]/text()')
        if not banner:
            raise RuntimeError(u'关注列表页面中未找到用户名,cookie可能已失效')
        # The banner text is the user name followed by '关注...'; keep only
        # the name.  partition() leaves the text intact if the marker is
        # absent, whereas slicing with find() == -1 would drop a character.
        user_name = banner[0].partition('关注')[0]
        self.follow_name_list.append(user_name)
        # The pager input named 'mp' carries the page count in its value
        # attribute; it is absent when the list fits on one page.
        pager = selector.xpath("//input[@name='mp']")
        if not pager:
            return 1
        return int(pager[0].attrib['value'])

    @staticmethod
    def _parse_fans_count(texts):
        """Return the follower count found in ``texts``, or ``None``.

        Uses the first string that starts with '粉丝', reads the number
        between that prefix and '人', and expands a trailing '万' (x10^4) or
        '亿' (x10^8).  ``None`` means no count was present or it could not be
        parsed.
        """
        units = {'万': 10000, '亿': 100000000}
        for text in texts:
            if not text.startswith("粉丝"):
                continue
            number = text[2:].partition("人")[0].strip()
            multiplier = units.get(number[-1:], 1)
            if multiplier != 1:
                number = number[:-1]
            try:
                return float(number) * multiplier
            except ValueError:
                return None
        return None

    def get_one_page(self, page):
        """Collect the followed accounts listed on page ``page``.

        Each ``<table>`` on the page is treated as one followed account.  An
        account is appended to ``follow_list`` / ``follow_name_list`` only
        when its first link is a numeric ``/u/<uid>`` link, its row has at
        most one image, and its follower count is known and below
        ``MAX_FANS``.  A page that cannot be fetched is skipped.
        """
        url = 'https://weibo.cn/%d/follow?page=%d' % (self.user_id, page)
        selector = self.deal_html(url)
        if selector is None:
            print(u'第%d页获取失败,已跳过' % page)
            return
        for table in selector.xpath('//table'):
            hrefs = table.xpath('.//a/@href')
            names = table.xpath('.//a/text()')
            if not hrefs or not names:
                continue
            # Take the uid from an explicit '/u/<uid>' segment.  Searching
            # for a bare 'u' would also match custom-domain links such as
            # '.../liu5566' and produce a bogus numeric id.
            user_id = hrefs[0].partition('/u/')[2]
            if not user_id.isdigit():
                continue
            # A second image in the row is the verified ("big V") badge.
            if len(table.xpath('.//img/@src')) > 1:
                continue
            # The count is computed per row, so a row without one can never
            # inherit the previous row's value; such rows are skipped because
            # the follower limit cannot be checked for them.
            fans = self._parse_fans_count(table.xpath('.//td/text()'))
            if fans is not None and fans < self.MAX_FANS:
                self.follow_list.append(user_id)
                self.follow_name_list.append(names[0])

    def get_follow_list(self):
        """Walk every page of the follow list, pausing at random intervals.

        After every 1-5 pages (chosen at random) the crawl sleeps for 6-10
        seconds, except after the final page.
        """
        page_num = self.get_page_num()
        print(u'用户关注页数:' + str(page_num))
        last_pause_page = 0
        pages_until_pause = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc=u'关注列表爬取进度'):
            self.get_one_page(page)
            if (page - last_pause_page == pages_until_pause
                    and page < page_num):
                sleep(random.randint(6, 10))
                last_pause_page = page
                pages_until_pause = random.randint(1, 5)
        print(u'用户关注列表爬取完毕')
def main():
    """Crawl the posts of a user and of every account that user follows.

    Builds the follow list with ``Follow``, prints the collected ids and
    names, then runs one ``Weibo`` crawl per id.  Any exception is printed
    together with its traceback instead of propagating.
    """
    try:
        # Crawl settings, handed positionally to Weibo further down.
        # 0: crawl all posts (original + reposts); 1: original posts only.
        original_only = 1
        # Start date (yyyy-mm-dd): crawl posts published from this day to now.
        start_date = '2018-01-01'
        # 0: do not write results to MongoDB; 1: write them.  Writing needs a
        # MongoDB installation plus pymongo (pip install pymongo).
        write_mongodb = 0
        # 0: do not write results to MySQL; 1: write them.  Writing needs a
        # MySQL installation plus pymysql (pip install pymysql).
        write_mysql = 0
        # 0: do not download original post images; 1: download them.
        download_pics = 1
        # 0: do not download post videos; 1: download them.
        download_videos = 0

        # Replace both placeholders with your own numeric user id and cookie.
        uid = int('Your id')
        cookie = {'Cookie': 'Your cookie'}

        # Collect the ids (and names) of the accounts to crawl.
        follow = Follow(uid, cookie)
        follow.get_follow_list()
        print(follow.follow_list)       # ids in the follow list
        print(follow.follow_name_list)  # display names in the follow list

        # Crawl each account's posts separately, one Weibo instance per id.
        for followed_id in follow.follow_list:
            crawler = Weibo(original_only, start_date, write_mongodb,
                            write_mysql, download_pics, download_videos)
            crawler.start([followed_id])
    except Exception as err:
        print('Error: ', err)
        traceback.print_exc()
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()