-
Notifications
You must be signed in to change notification settings - Fork 2
/
PttStatistics.py
149 lines (123 loc) · 4.96 KB
/
PttStatistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# coding=UTF-8
import time
import requests
import sys
import jieba
import operator
from datetime import datetime
from bs4 import BeautifulSoup
# Silence urllib3 warnings; the requests in this file are sent with verify=False.
requests.packages.urllib3.disable_warnings()
# Custom user dictionary: add new words to dictNew.txt to improve word-segmentation accuracy.
jieba.load_userdict("dictNew.txt")
# One HTTP session shared by every request in this file
# (presumably so the over-18 confirmation made in over18() carries over - TODO confirm).
rs = requests.session()
def over18(board):
    """Fetch a board's index page, answering the over-18 prompt when it appears.

    Args:
        board: PTT board name used to build the index URL.

    Returns:
        A BeautifulSoup tree (``html.parser``) built from the final response body.
    """
    index_path = '/bbs/' + board + '/index.html'
    response = rs.get('https://www.ptt.cc' + index_path, verify=False)
    # A response URL containing 'over18' is treated as the age-restricted page.
    if 'over18' in response.url:
        print(u"18禁網頁")
        payload = {'from': index_path, 'yes': 'yes'}
        response = rs.post('https://www.ptt.cc/ask/over18', verify=False, data=payload)
    return BeautifulSoup(response.text, 'html.parser')
def getPageNumber(content):
    """Extract the page number from an index link such as ``/bbs/X/index123.html``.

    The text between the *last* ``index`` marker and the last ``.html`` suffix
    is returned, so an earlier occurrence of "index" in the path (for example
    inside the board name) does not confuse the result.

    Args:
        content: The link (href) text to inspect.

    Returns:
        The page number as a string. An empty string is returned when the link
        has no number (``index.html``) or does not contain the
        ``index...​.html`` pattern at all.
    """
    marker = 'index'
    endIndex = content.rfind('.html')
    if endIndex == -1:
        return ''
    # Search only the part before '.html' so the marker always precedes the suffix.
    startIndex = content.rfind(marker, 0, endIndex)
    if startIndex == -1:
        return ''
    return content[startIndex + len(marker):endIndex]
def title_count(PttName, ParsingPage, ALLpage):
    """Crawl index pages of a board and segment the article titles into words.

    Pages ``ALLpage``, ``ALLpage - 1``, ... are fetched, ``ParsingPage`` pages
    in total. For every ``r-ent`` entry that has a link, the text after the
    first ``]`` is kept (the whole title when there is no ``]``).

    Args:
        PttName: Board name used to build the index URLs.
        ParsingPage: Number of index pages to crawl (int or numeric string).
        ALLpage: Index number of the first (highest) page to crawl.

    Returns:
        The iterator produced by ``jieba.cut`` over all collected titles.
        Titles are joined with a newline so that a word is never formed across
        two different titles; the newline itself shows up as a one-character
        token.
    """
    pages = int(ParsingPage)
    titles = []
    for count, index in enumerate(range(ALLpage, ALLpage - pages, -1), 1):
        # Throttle so the crawl is not mistaken for an attack on the site.
        time.sleep(0.1)
        url = 'https://www.ptt.cc/bbs/' + PttName + '/index' + str(index) + '.html'
        res = rs.get(url, verify=False)
        soup = BeautifulSoup(res.text, 'html.parser')
        # A response without a <title> must not crash the crawl.
        page_title = soup.title.text if soup.title is not None else u''
        if 'Service Temporarily' in page_title:
            print(u'Service Temporarily ' + page_title)
        for r_ent in soup.find_all(class_="r-ent"):
            link = r_ent.find('a')
            if link is None:
                continue
            # partition keeps everything after the first ']' even when the
            # title itself contains further ']' characters.
            head, bracket, rest = link.text.partition(']')
            titles.append((rest if bracket else head).strip())
        print(u"等稍等: " + str(100 * count // pages) + " %.")
    return jieba.cut(u"\n".join(titles))
def push_count(PttName, ParsingPage, ALLpage, max_retries=10):
    """Crawl the articles listed on index pages and segment their push comments.

    First the article URLs are collected from ``ParsingPage`` index pages
    starting at ``ALLpage`` and counting down; then every article is fetched
    and the text of each ``.push-content`` element is gathered.

    Args:
        PttName: Board name used to build the index URLs.
        ParsingPage: Number of index pages to crawl (int or numeric string).
        ALLpage: Index number of the first (highest) page to crawl.
        max_retries: How many times an article that answers with a
            'Service Temporarily' page is re-queued before it is skipped.
            Bounds the retry loop, which could otherwise spin forever.

    Returns:
        The iterator produced by ``jieba.cut`` over the concatenated push text.
    """
    from collections import deque  # local import keeps this block self-contained

    pages = int(ParsingPage)
    pending = deque()  # (article url, failed attempts so far)
    # Step 1: collect the URL of every article on the requested index pages.
    for index in range(ALLpage, ALLpage - pages, -1):
        # Throttle so the crawl is not mistaken for an attack on the site.
        time.sleep(0.1)
        url = 'https://www.ptt.cc/bbs/' + PttName + '/index' + str(index) + '.html'
        res = rs.get(url, verify=False)
        soup = BeautifulSoup(res.text, 'html.parser')
        page_title = soup.title.text if soup.title is not None else u''
        if 'Service Temporarily' in page_title:
            print(u'Service Temporarily ' + page_title)
        for r_ent in soup.find_all(class_="r-ent"):
            link = r_ent.find('a')
            if link is not None:
                pending.append(('https://www.ptt.cc' + link['href'], 0))

    total = len(pending)
    pushes = []
    count = 0
    # Step 2: fetch every article and gather its push comments.
    while pending:
        time.sleep(0.1)
        url, failures = pending.popleft()
        res = rs.get(url, verify=False)
        soup = BeautifulSoup(res.text, 'html.parser')
        page_title = soup.title.text if soup.title is not None else u''
        if 'Service Temporarily' in page_title:
            if failures < max_retries:
                # Re-queue at the back and pause before carrying on.
                pending.append((url, failures + 1))
                time.sleep(1)
                continue
            print(u'Skipped (still unavailable): ' + url)
        else:
            # [1:] drops the first character of each push
            # (presumably the leading ':' - TODO confirm).
            pushes.extend(push.text[1:] for push in soup.select('.push-content'))
        count += 1
        print(u"等稍等: " + str(100 * count // total) + " %.")
    return jieba.cut(u"".join(pushes))
# Usage:   python PttStatistics.py [mode: title|push] [board name] [number of pages to crawl]
# Example: python PttStatistics.py title gossiping 10
def count_words(tokens):
    """Return ``(word, count)`` pairs for *tokens*, most frequent first."""
    from collections import Counter  # local import keeps this block self-contained

    return Counter(tokens).most_common()


def format_word_counts(word_counts):
    """Render ``(word, count)`` pairs as ``word count`` lines.

    Only words longer than one character are written; drop the ``len`` filter
    to include single-character words as well.
    """
    return u''.join(
        word + u' ' + str(times) + u'\n'
        for word, times in word_counts
        if len(word) > 1
    )


def main(argv):
    """Crawl a PTT board and append its word frequencies to a text file.

    Args:
        argv: Command-line arguments: script name, mode (``title`` or
            ``push``), board name, number of index pages to crawl.

    Returns:
        Process exit status: 0 on success, 1 when the arguments are invalid.
    """
    import io  # local import keeps this block self-contained

    usage = u"Usage: python PttStatistics.py [title|push] [board] [pages]"
    if len(argv) < 4:
        print(usage)
        return 1
    search, PttName = str(argv[1]), str(argv[2])
    # Validate the mode before any network traffic is generated.
    if search != 'title' and search != 'push':
        print(u"ERROR !! 請輸入 title or push !!")
        return 1
    try:
        ParsingPage = int(argv[3])
    except ValueError:
        print(usage)
        return 1

    print('Start parsing ' + PttName + '....')
    fileName = search + '-' + PttName + '-' + datetime.now().strftime('%Y%m%d%H%M%S') + '.txt'
    start_time = time.time()
    soup = over18(PttName)
    # The second '.btn.wide' link is read and its page number + 1 is used as
    # the newest index (presumably it is the "previous page" link - TODO confirm).
    ALLpageURL = soup.select('.btn.wide')[1]['href']
    ALLpage = int(getPageNumber(ALLpageURL)) + 1
    print('Start parsing ' + search + ' count....')
    crawl = title_count if search == 'title' else push_count
    dataList = crawl(PttName, ParsingPage, ALLpage)

    print(u"字詞統計中,請稍等......")
    text = format_word_counts(count_words(dataList))
    # io.open with an explicit encoding writes UTF-8 text on both Python 2 and 3.
    with io.open(fileName, 'a', encoding='utf8') as f:
        f.write(text)
    print(u'====================完成====================')
    print(u'execution time:' + str(time.time() - start_time) + 's')
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))