# app_crawler.py
# coding: utf-8
import os
import json

import requests
from lxml import etree
import pymysql
'''
[Official genre mapping](https://affiliate.itunes.apple.com/resources/documentation/genre-mapping/)
[App search](http://itunes.apple.com/search?term=google&country=us&entity=software)
param 1: term, the search keyword
param 2: country, the storefront (market) to search
param 3: entity, restricted to software
Crawler strategy: fetch all genres, then crawl each genre in alphabetical
order. The listing pages only yield app names, so take care to record
duplicates. App details are then looked up by name; the bundle id is the
main target, and the icon is also worth collecting.
'''
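# The name-to-details lookup described above is not implemented in this
# script. Below is a minimal sketch of that step, using the search API linked
# in the docstring. The function name and the response fields it reads
# (bundleId, artworkUrl60) follow the public iTunes Search API and are
# assumptions, not part of the original crawler.
def lookup_app_by_name(name, country="cn"):
    params = {"term": name, "country": country, "entity": "software"}
    response = requests.get("https://itunes.apple.com/search",
                            params=params, timeout=60)
    results = response.json().get("results", [])
    # Return (bundle id, icon url) pairs for every hit on this name.
    return [(r.get("bundleId"), r.get("artworkUrl60")) for r in results]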
# Local HTTP/HTTPS proxy (e.g. Privoxy's default port); adjust or drop if
# direct access to itunes.apple.com works.
proxies = {
    "http": "http://127.0.0.1:8118",
    "https": "http://127.0.0.1:8118",
}
genre_service_url = "https://itunes.apple.com/WebObjects/MZStoreServices.woa/ws/genres"
# category url: "https://itunes.apple.com/cn/genre/ios/id36?mt=8"
# genre url: "https://itunes.apple.com/cn/genre/id6005?mt=8&letter=A"
base_url = "https://itunes.apple.com/"
cgInfoFile = "./cgInfoFile.txt"
# Before the real crawl, use getCategories() to discover the available
# categories and pick the target. Here we care about the App Store, so we
# need the category id that corresponds to "App Store".
targetCategory = "App Store"
# Two-letter country code; see https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
targetCountry = "cn"
targetPlatform = "ios"
# Iterate through apps by the first letter of the app name; '*' covers names
# that do not start with a letter.
alphabet = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + ["*"]
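# Example of a fully-built listing URL, as assembled piece by piece in
# crawlByCategory() below (genre 6005, letter A, page 2):
#   https://itunes.apple.com/cn/genre/id6005?mt=8&letter=A&page=2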
'''
MySQL related things
'''
db="app_store"
table_name = "app_names_cn"
host = "localhost"
user = ""
pwd = ""
db = pymysql.connect(host, user, pwd, db)
cursor = db.cursor()
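# Assumed schema for the target table (not part of the original script); the
# insert statements below only rely on the app_name and genre_id columns:
#   CREATE TABLE app_names_cn (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       app_name VARCHAR(255) NOT NULL,
#       genre_id INT NOT NULL
#   );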
failed_sqls = []
failed_requests = []
failed_sqls_file = "./failed_sqls.txt"
failed_requests_file = "./failed_requests.txt"
# Escape backslashes and quotes so the name can be embedded in the inline
# SQL statements built in parseAUrl() below.
def clean_name(name):
    name = name.replace('\\', '\\\\')
    name = name.replace('"', '\\"')
    name = name.replace("'", "\\'")
    return name
# cg: categories and genres
def fetch_cgInfo():
    response = requests.get(genre_service_url)
    print('status code: %d\nresponse encoding: %s' % (response.status_code, response.encoding))
    with open(cgInfoFile, 'w') as f:
        f.write(response.text)
# cgInfo here is a JSON string
def read_cgInfo():
    with open(cgInfoFile, 'r') as f:
        return f.read()
# Returns a dict mapping category name -> category id.
def getCategories(cg_json=None):
    category_dict = {}
    if cg_json is None:
        return category_dict
    for key, value in cg_json.items():
        category_dict[value['name']] = key
    return category_dict
# Returns a dict mapping genre name -> genre id within the target category.
def getGenres(cg_json=None, category_dict=None, target=""):
    genre_dict = {}
    if cg_json is None or category_dict is None or target == "":
        return genre_dict
    if target not in category_dict:
        return genre_dict
    subgenres = cg_json[category_dict[target]]['subgenres']
    for key, value in subgenres.items():
        genre_dict[value['name']] = key
    return genre_dict
# Crawl every genre in the target category, letter by letter, page by page.
def crawlByCategory(genre_dict=None):
    if genre_dict is None:
        return None
    for genre_name, genre_id in genre_dict.items():
        print("crawling genre: %s" % genre_name)
        genre_url = base_url + targetCountry + "/genre/id" + genre_id + "?mt=8"
        # Walk the A-Z (plus '*') index for this genre.
        for a in alphabet:
            a_genre_url = genre_url + "&letter=%s" % a
            page = 1
            apd = False  # "all pages done" for this letter
            while not apd:
                pa_genre_url = a_genre_url + "&page=%d" % page
                page += 1
                # parseAUrl fetches and parses one listing page; it returns
                # True once a page is not full, i.e. there are no more pages.
                apd = parseAUrl(pa_genre_url, genre_id)
# Fetch and parse one listing page. Returns True when this was the last page
# for the current letter (the page is not full), False when a full page was
# parsed (more pages may follow) or the request failed.
def parseAUrl(url="", genre_id=0):
    if url == "":
        return None
    print(url)
    try:
        response = requests.get(url, proxies=proxies, timeout=60)
    except Exception:
        failed_requests.append((genre_id, url))
        return False
    print("status_code: %d" % response.status_code)
    html = etree.HTML(response.text)
    # Main content block, split into left / middle / right columns; the
    # columns are filled from left to right. (App page hrefs would be
    # available at the same xpaths with /a/@href instead of /a/text().)
    leftCol_texts = html.xpath('//div[@id="selectedcontent"]/div[@class="column first"]/ul/li/a/text()')
    middleCol_texts = html.xpath('//div[@id="selectedcontent"]/div[@class="column"]/ul/li/a/text()')
    rightCol_texts = html.xpath('//div[@id="selectedcontent"]/div[@class="column last"]/ul/li/a/text()')
    for col in (leftCol_texts, middleCol_texts, rightCol_texts):
        if len(col) == 0:
            # An empty column means the page is not full, so this is the
            # last page for this letter.
            db.commit()
            return True
        for each in col:
            each = clean_name(each)
            sql = 'insert into %s(app_name, genre_id) values("%s", %s)' % (table_name, each, genre_id)
            try:
                cursor.execute(sql)
            except Exception:
                failed_sqls.append(sql)
    db.commit()
    return False
def main():
    print("app crawler of apple app store ...")
    # Fetch the genre listing once and cache it locally.
    if not os.path.exists(cgInfoFile):
        fetch_cgInfo()
    cg_json = json.loads(read_cgInfo())
    category_dict = getCategories(cg_json)
    genre_dict = getGenres(cg_json, category_dict, targetCategory)
    crawlByCategory(genre_dict)
    # Dump anything that failed so it can be retried later.
    if len(failed_sqls) != 0:
        with open(failed_sqls_file, 'w') as f:
            for each in failed_sqls:
                f.write(each + "\n")
    if len(failed_requests) != 0:
        with open(failed_requests_file, 'w') as f:
            for each in failed_requests:
                f.write("%s %s\n" % (each[0], each[1]))

if __name__ == '__main__':
    main()
    db.close()