-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_analysis_process.py
95 lines (79 loc) · 2.38 KB
/
data_analysis_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 4 13:30:21 2019
@author: 13115
"""
import csv
import re
from collections import Counter

import jieba
# Multipliers that convert an amount in the given unit (the last three
# characters of a salary string) into 10,000 yuan per year.
_UNIT_TO_WAN_PER_YEAR = {
    '万/月': 12,
    '千/月': 12 / 10,
    '万/年': 1,
    '元/天': 365 / 10000,
}

# (inclusive upper bound in 10,000 yuan per year, level), ascending.
# Anything above the last bound is level 'S'.
_LEVEL_UPPER_BOUNDS = (
    (8, 'G'),
    (12, 'F'),
    (16, 'E'),
    (22, 'D'),
    (30, 'C'),
    (40, 'B'),
    (60, 'A'),
)


def getsallev(s):
    """Standardise a salary description into a one-letter salary level.

    The last three characters of ``s`` are read as the unit and the rest as
    either a single amount (``'150元/天'``) or a ``low-high`` range
    (``'1-1.5万/月'``), in which case the midpoint of the range is used.
    The amount is converted to 10,000 yuan per year and then binned:
    ``<= 8`` -> 'G', ``<= 12`` -> 'F', ``<= 16`` -> 'E', ``<= 22`` -> 'D',
    ``<= 30`` -> 'C', ``<= 40`` -> 'B', ``<= 60`` -> 'A', otherwise 'S'.

    Args:
        s: Salary description string ending in a three-character unit.

    Returns:
        One of 'G', 'F', 'E', 'D', 'C', 'B', 'A', 'S'.

    Raises:
        KeyError: If ``s`` is a range whose unit is not a known unit.
        ValueError: If the numeric part cannot be parsed as a float.
    """
    amount, unit = s[:-3], s[-3:]
    if '-' in s:
        low, _, high = amount.partition('-')
        factor = _UNIT_TO_WAN_PER_YEAR[unit]
        ss = (float(low) + float(high)) / 2 * factor
    else:
        # A single amount used to be treated as yuan/day no matter what unit
        # it carried, which mis-scaled values such as '1万/月'.  Honour the
        # unit when it is known; keep the per-day assumption only as the
        # fallback for unrecognised suffixes.
        factor = _UNIT_TO_WAN_PER_YEAR.get(
            unit, _UNIT_TO_WAN_PER_YEAR['元/天'])
        ss = float(amount) * factor

    for upper, level in _LEVEL_UPPER_BOUNDS:
        if ss <= upper:
            return level
    return 'S'
# Build list0: one 'WORD:LEVEL' entry for every kept token of every row.
# Column 3 of each CSV row is passed to getsallev() as the salary description
# and column 4 is the text that jieba segments into tokens.

# Keep only tokens whose first character is a digit, an ASCII letter or a
# character in the \u4e00-\u9fa5 range (filters out symbols).
pat1 = re.compile('[0-9A-Za-z\u4e00-\u9fa5]')
# Tokens that start with a digit are dropped as well (filters out numbers).
pat2 = re.compile('[0-9]')

list0 = []
# The with-statement closes the input file once all rows have been read.
with open('C:\\Users\\13115\\epy\\delete_none.csv') as fr:
    for each in csv.reader(fr):
        sal_level = getsallev(each[3])
        for token in jieba.lcut(str(each[4])):
            if token == 'qualification':  # Skip the 'qualification' word.
                continue
            if pat1.match(token) is None or pat2.match(token) is not None:
                continue
            list0.append(token.upper() + ':' + sal_level)
# list0 now looks like ['WORD1:A', 'WORD2:B', ...].
# Count how often each 'WORD:LEVEL' entry occurs in list0.
# Counter tallies in a single pass.  Calling list0.count() once per element
# rescans the whole list every time; the original author measured about
# 7 minutes for that approach on this data.
fin_dict = Counter(list0)
# Sizes the original author observed on their data set:
#   len(list0)    -> 171716
#   len(fin_dict) -> 22854
# Write one CSV row (word, level, count) for every counted entry whose word
# is longer than one character and which occurs more than twice.
# newline='' lets the csv module control line endings, as its documentation
# requires for files passed to csv.writer.
with open('C:\\Users\\13115\\epy\\dict_test.csv', 'w', newline='') as xx:
    cw = csv.writer(xx)
    for each, num in fin_dict.items():
        # Keys have the form 'WORD:LEVEL'.  Split the key itself on its last
        # colon; indexing each[0] would only look at the first character and
        # never reach the level.
        word_name, level = each.rsplit(':', 1)
        if len(word_name) > 1 and num > 2:
            cw.writerow([word_name, level, num])
# Re-read the input file, map column 3 of every row to its salary level and
# print how many rows fall into each level.
with open('C:\\Users\\13115\\epy\\delete_none.csv') as fr:
    list1 = [getsallev(row[3]) for row in csv.reader(fr)]

for grade in ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'S'):
    print(grade, list1.count(grade))