多线程-爬取京东单个商品所有评论示范Demo.py
import requests
import threading
import json

'''
Automatically crawls every comment for a single product. Because it uses multiple
threads it finishes in roughly one second, storing the comments page by page in
ratejd.json; to parse and inspect the results, run "解析json.py".
Note: this script relies on the third-party requests module, which can be installed via pip:
pip install requests
'''
# ll maps each page number to a list holding the (up to 10) comments on that page.
ll = {}
pid = '967821'
headers1 = {'GET': '',
            'Host': "club.jd.com",
            'User-Agent': "Mozilla/5.0 (Windows NT 6.2; rv:29.0) Gecko/20100101 Firefox/29.0",
            'Referer': 'http://item.jd.com/{}.html'.format(pid)}
r1 = requests.get(
    'http://club.jd.com/productpage/p-{}-s-0-t-3-p-{}.html'.format(pid, 0), headers=headers1)
# Fetch page 0 first to learn the maximum page number.
maxpagenum = r1.json()['productCommentSummary']['commentCount'] // 10
# print(maxpagenum)


def getrate_jd(pid, pagenum):
    '''Fetch the list of comments on page pagenum of the product whose ID is pid.'''
    headers1 = {'GET': '',
                'Host': "club.jd.com",
                'User-Agent': "Mozilla/5.0 (Windows NT 6.2; rv:29.0) Gecko/20100101 Firefox/29.0",
                'Referer': 'http://item.jd.com/{}.html'.format(pid)}
    r = requests.get(
        'http://club.jd.com/productpage/p-{}-s-0-t-3-p-{}.html'.format(pid, pagenum), headers=headers1)
    aa = r.json()
    ss = [x['content'] for x in aa['comments']]
    global ll
    if ss != []:
        ll[pagenum] = ss


threads = []
for i in range(maxpagenum + 1):
    threads.append(threading.Thread(target=getrate_jd, args=(pid, i)))
for t in threads:
    t.start()
# Wait for every worker thread to finish; otherwise ratejd.json could be
# written while some pages are still being fetched.
for t in threads:
    t.join()

with open('ratejd.json', 'w') as f:
    f.write(json.dumps(ll, sort_keys=True, indent=4, separators=(',', ': ')))
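# The docstring above points to "解析json.py" for inspecting the results, but that
# script is not part of this file. The lines below are only a minimal sketch, not
# the original 解析json.py: assuming the {page number: [comment, ...]} layout
# written above, they re-read ratejd.json and print each comment with its page.
with open('ratejd.json') as f:
    pages = json.load(f)
for page in sorted(pages, key=int):
    for comment in pages[page]:
        print(page, comment)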