generated from GT-ZhangAcer/AI-Studio-Template
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcrawlerModule.py
54 lines (48 loc) · 2.43 KB
/
crawlerModule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# 百度爬取'福'的图片
# 参考https://blog.csdn.net/qq_45880822/article/details/109872504
import requests
import re
import os
class pic_crawler():
##
def __init__(self,outputPatn):
pages = 1 ##看我们要下多少页的数据
self.pages_1 = int(pages)+1
self.outputPath=outputPatn
self.crawlNum=1
# 图片爬取函数
def run(self,search_words=''):
search_words=search_words+' 正脸照 半身'
print('search word',search_words)
if len(search_words)==0:return []
result = []
for page in range(1, self.pages_1):
url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={}&pn={}'.format(search_words, page)
# url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1600582955784_R&pv=&ic=0&nc=1&z=&hd=&latest=©right=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid=&word=%E9%81%93%E8%B7%AF%E7%A6%81%E6%AD%A2%E8%B7%AF%E6%A0%87'
# 伪装浏览器请求头
headers = {
'user-agent': 'Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 84.0 .4147.89Safari / 537.36'
}
# 获取网站源码
html = requests.get(url, headers=headers)
# 获取真实的url
urls_real = re.findall('"thumbURL":"(.*?)",', html.text)
print('图片url数量', len(urls_real))
# print(urls_real)
# 筛选资源url,并返回元组
# urls = re.findall(r'<img alt="" src="(http://.*?.jpg)" ',html.text)
# print(urls_real)
for index,i in enumerate(urls_real):
if index==self.crawlNum:break
subname = i.split('/')[-1] # 资源存放名称 eg:'u=4117662774,1210746205&fm=193'
filename = re.findall(r"=(.+?)&", subname)[0].replace(',', '')
path = self.outputPath + filename + '.jpg' # 资源存放点+资源存放名称
# print(path)
image_data = requests.get(i, headers=headers)
with open(path, 'wb')as f: # 将图片以二进制写入
f.write(image_data.content)
result.append(path)
return result
if __name__=='__main__':
pc=pic_crawler('pic/')
print(pc.run('特朗普'))