-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathresume.py
189 lines (165 loc) · 9.14 KB
/
resume.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
from bs4 import BeautifulSoup
import pytesseract
import base64
import time
from PIL import Image
from io import BytesIO
from collections import defaultdict
import os
import json
import demjson
# Maps the Chinese field labels found on the resume page to the English keys
# used in the parsed output.  ``objdict.__setattr__`` looks names up here;
# labels that are not listed are stored unchanged.
convert_dict = {
    "姓名": 'Name',
    '性别': 'Gender',
    '年龄': 'Age',
    '工作年限': 'ServiceYear',
    '电子邮件': 'E-mail',
    '联系电话': 'Telephone',
    '所在地': 'location',
    '学历': 'EducationalAttainment',
    '所在行业': 'Industry',
    '公司名称': 'CompanyName',
    '所任职位': 'PositionsHeld',
    # Was 'CurrentIncome:' -- the trailing colon was a typo that leaked into
    # the emitted key.
    '目前薪资': 'CurrentIncome',
    '期望行业': 'ExpectedIndustry',
    '期望地点': 'ExpectedLocation',
    '期望薪资': 'ExpectedSalary',
    '期望职位': 'ExpectedPosition',
    '下属人数': 'SubordinatesNum',
    '所在地区': 'Area',
    '工作职责和业绩': 'ResponsibilitiesAchievement',
    '汇报对象': 'ReportingTo',
    '职务类别': 'Function',
}
class objdict(dict):
    """A ``dict`` whose items can also be read, written and deleted as attributes.

    Attribute writes pass the name through the module-level ``convert_dict``
    first, so a name listed there is stored under its mapped key; any other
    name is stored as given.  Reads and deletes use the name unchanged.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails, so this is the
        # fallback that exposes dictionary items as attributes.
        try:
            return self[name]
        except KeyError:
            raise AttributeError("No such attribute: " + name) from None

    def __setattr__(self, name, value):
        key = convert_dict.get(name, name)
        self[key] = value

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError("No such attribute: " + name) from None
class Resume():
    """Parse a saved resume HTML page into plain Python structures.

    The constructor reads the file and populates:

    * ``resume_type``       -- text of the ``<li class="active">`` element.
    * ``resume_id``         -- text of ``<title>`` up to the first ``_``, with
      its first three characters dropped, upper-cased.
    * ``resume_basic_info`` -- ``objdict`` built from the
      ``table.resume-basic-info`` cells, plus an ``'ID'`` entry.
    * ``current_info``      -- ``objdict`` built from the tables whose text
      starts with one of two fixed headings.
    * ``work_info``         -- list of dicts, one per job entry.
    * ``edu_info``          -- list of dicts, one per education entry.
    * ``skill_list``        -- stripped text of every ``span.skillLabel``.
    * ``resume_comments``   -- text of the first row of ``div.resume-comments``.
    * ``resume_language``   -- stripped text of each row of ``div.resume-language``.

    Sections that are absent from the page yield empty values instead of
    raising.
    """

    # The full-width colon is written as an escape so the two variants cannot
    # be confused with each other when the file is viewed or re-encoded.
    _ASCII_COLON = ":"
    _FULLWIDTH_COLON = "\uff1a"

    def __init__(self, html):
        """Load and parse the resume stored at path *html*.

        Raises:
            OSError: if the file cannot be opened.
            AttributeError: if the page has no ``<li class="active">`` or no
                ``<title>`` element.
        """
        self.html = html
        # Relative image paths in the page are resolved against this folder.
        self.file_folder = os.path.split(html)[0]
        with open(html, 'rb') as f:
            html_str = f.read()
        self.soup = BeautifulSoup(html_str, "html.parser", from_encoding='utf-8')
        self.resume_type = self.soup.find('li', attrs={'class': 'active'}).text

        # NOTE(review): presumably Chinese resumes label fields with the
        # full-width colon and other resumes with the ASCII one -- confirm
        # against real pages.  Either way ``_split_pair`` falls back to the
        # other variant, so a wrong guess here is not fatal.
        if self.resume_type == '中文简历':
            self._spliter = self._FULLWIDTH_COLON
        else:
            self._spliter = self._ASCII_COLON
        # Each colon variant maps to the *other* one.  With both entries equal
        # the fallback lookup could never find a different delimiter.
        self._spliters = {
            self._FULLWIDTH_COLON: self._ASCII_COLON,
            self._ASCII_COLON: self._FULLWIDTH_COLON,
        }

        table = self.soup.find('table', attrs={'class': 'resume-basic-info'})
        self.resume_id = self.soup.find('title').text.split('_')[0][3:].upper()
        self.resume_basic_info = self._process_resume_basic(table)
        self.resume_basic_info['ID'] = self.resume_id
        self.current_info = self._process_current_info(self.soup)
        self.work_info = self._process_work_info(self.soup)
        self.edu_info = self._process_edu_info(self.soup)
        self.skill_list = [
            span.text.strip()
            for span in self.soup.find_all('span', attrs={'class': 'skillLabel'})
        ]

        comment_rows = self._rows_of(
            self.soup.find('div', attrs={'class': 'resume-comments'}))
        self.resume_comments = comment_rows[0].text if comment_rows else ''
        language_rows = self._rows_of(
            self.soup.find('div', attrs={'class': 'resume-language'}))
        self.resume_language = [row.text.strip() for row in language_rows]

    def to_json(self):
        """Return the parsed resume as a JSON string.

        The object has the keys ``basic_info``, ``works``, ``edus``,
        ``skills``, ``comments`` and ``languages``.
        """
        # NOTE(review): ``current_info`` is parsed in ``__init__`` but is not
        # part of this payload; left as-is to keep the output shape stable.
        resume = {
            'basic_info': self.resume_basic_info,
            'works': self.work_info,
            'edus': self.edu_info,
            'skills': self.skill_list,
            'comments': self.resume_comments,
            'languages': self.resume_language,
        }
        return json.dumps(resume)

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _rows_of(container):
        """Return the ``<tr>`` rows under *container*'s first ``<tbody>``.

        Falls back to the rows directly under *container* when it has no
        ``<tbody>``, and returns ``[]`` when *container* is ``None``.
        """
        if container is None:
            return []
        body = container.find('tbody')
        if body is None:
            body = container
        return body.find_all('tr')

    @staticmethod
    def _compact(text):
        """Strip *text* and remove every space and newline inside it."""
        return text.strip().replace(" ", '').replace("\n", '')

    def _split_pair(self, text):
        """Split ``label<colon>value`` text at its first colon.

        The resume's own delimiter (``self._spliter``) is tried first, then
        the other colon variant from ``self._spliters``.  Only the first
        occurrence splits, so a value that itself contains a colon is kept
        whole.

        Returns:
            ``(label, value)``, or ``None`` when neither colon is present.
        """
        primary = self._spliter
        for delimiter in (primary, self._spliters.get(primary)):
            if delimiter and delimiter in text:
                label, _, value = text.partition(delimiter)
                return label, value
        return None

    @staticmethod
    def _split_school_period(text):
        """Split an education headline into ``(school, times)``.

        The split happens at the first ``)(`` if there is one, otherwise at
        the first ``(``.  Whatever follows the delimiter -- including any
        closing parenthesis -- is returned untouched as ``times``.  Text
        without a parenthesis yields ``(text, '')``.
        """
        # NOTE(review): full-width parentheses are accepted alongside the
        # ASCII ones on the assumption that the site may emit either; confirm
        # against real pages.
        for delimiter in (')(', '\uff09\uff08', '(', '\uff08'):
            if delimiter in text:
                school, _, times = text.partition(delimiter)
                return school, times
        return text, ''

    # ------------------------------------------------------------------
    # Basic information
    # ------------------------------------------------------------------

    def _read_image_value(self, cell):
        """Run OCR on the first ``<img>`` that is a direct child of *cell*.

        A ``data:image`` source is base64-decoded and opened in memory (GIF
        images are converted to RGB first); any other source is treated as a
        path relative to the folder of the HTML file.

        Returns:
            The text reported by ``pytesseract``, or ``None`` when the cell
            has no ``<img>`` child.
        """
        for node in cell.contents:
            if node.name != "img":
                continue
            src = node.attrs['src']
            if src.startswith("data:image"):
                raw = base64.b64decode(src.split(",")[1])
                img = Image.open(BytesIO(raw))
                if img.format == 'GIF':
                    img = img.convert('RGB')
                return pytesseract.image_to_string(img)
            return pytesseract.image_to_string(os.path.join(self.file_folder, src))
        return None

    def _process_resume_basic(self, table):
        """Build the basic-information ``objdict`` from *table*.

        Each ``<td>`` is read as ``label<colon>value``.  For cells whose text
        starts with the phone or e-mail label the value is obtained by OCR of
        the image in the cell instead.  Cells without a colon are skipped.
        """
        resume_basic = objdict()
        for row in self._rows_of(table):
            for col in row.find_all('td'):
                text = col.text.strip()
                pair = self._split_pair(text)
                if text.startswith(("联系电话", "电子邮件")):
                    label = pair[0] if pair is not None else text
                    value = self._read_image_value(col)
                elif pair is not None:
                    label, value = pair
                else:
                    continue
                setattr(resume_basic, label, value)
        return resume_basic

    # ------------------------------------------------------------------
    # Current position / career intention
    # ------------------------------------------------------------------

    def _process_current_info(self, soup):
        """Collect ``label<colon>value`` cells from the two overview tables.

        A table is used when its stripped text starts with one of the two
        headings below.  Spaces and newlines are removed from each cell before
        it is split; cells without a colon are skipped.
        """
        headings = ("目前职业概况", "职业发展意向")
        current_info = objdict()
        for tab in soup.find_all("table"):
            if not tab.text.strip().startswith(headings):
                continue
            for row in self._rows_of(tab):
                for col in row.find_all('td'):
                    pair = self._split_pair(self._compact(col.text))
                    if pair is not None:
                        setattr(current_info, pair[0], pair[1])
        return current_info

    # ------------------------------------------------------------------
    # Work experience
    # ------------------------------------------------------------------

    def _apply_summary_row(self, work_info, row):
        """Store every ``label<colon>value`` item of a ``|``-separated header row."""
        header = row.find('th').text.replace('\xa0', '').strip()
        for item in header.split('|'):
            pair = self._split_pair(item)
            if pair is not None:
                setattr(work_info, pair[0], pair[1])

    def _apply_detail_row(self, work_info, row):
        """Store a row whose ``<th>`` is the label and whose ``<td>`` is the value."""
        heading = row.find('th').text.strip()
        pair = self._split_pair(heading)
        label = pair[0] if pair is not None else heading
        setattr(work_info, label, row.find('td').text.strip())

    def _process_work_info(self, soup):
        """Return one dict per job found under ``div.resume-work``.

        A ``resume-job-title`` child supplies ``WorkTime`` and ``CompName``;
        the following ``resume-indent`` child supplies ``JobTitle`` plus the
        fields of its detail rows and completes the entry.  Returns ``[]``
        when the section is missing.
        """
        work_info_list = []
        section = soup.find('div', attrs={'class': 'resume-work'})
        if section is None:
            return work_info_list

        work_info = objdict()
        for div in section:
            # Text nodes between the child elements carry no attributes.
            classes = (getattr(div, 'attrs', None) or {}).get('class', ())
            if 'resume-job-title' in classes:
                work_info['WorkTime'] = div.find('span', attrs={'class': 'work-time'}).text
                company_text = div.find('em', attrs={'class': 'compony'}).text
                company_parts = [
                    part
                    for part in (line.replace(" ", '') for line in company_text.split('\n'))
                    if part
                ]
                # The first non-empty line is the company name.
                work_info['CompName'] = company_parts[0] if company_parts else ''
            elif 'resume-indent' in classes:
                rows = self._rows_of(div.find('table', attrs={'class': 'job-list'}))
                if rows:
                    title_row, *detail_rows = rows
                    title = title_row.find("div", attrs={'class': 'job-list-title'})
                    work_info["JobTitle"] = title.text.strip()
                    if len(detail_rows) == 2:
                        self._apply_summary_row(work_info, detail_rows[0])
                        self._apply_detail_row(work_info, detail_rows[1])
                    elif len(detail_rows) == 1:
                        # A lone row without a <td> is the '|'-separated
                        # summary; otherwise it is a label/value detail row.
                        if detail_rows[0].find('td') is None:
                            self._apply_summary_row(work_info, detail_rows[0])
                        else:
                            self._apply_detail_row(work_info, detail_rows[0])
                work_info_list.append(work_info.copy())
                work_info = objdict()
        return work_info_list

    # ------------------------------------------------------------------
    # Education
    # ------------------------------------------------------------------

    def _process_edu_info(self, soup):
        """Return one dict per ``ul.edu-ul`` under ``div.resume-education``.

        Each dict has the keys ``SchoolName``, ``SchoolTime``, ``Degree``,
        ``Major`` and ``SchoolTips``.  Returns ``[]`` when the section is
        missing.
        """
        edu_info_list = []
        section = soup.find('div', attrs={'class': 'resume-education'})
        if section is None:
            return edu_info_list

        for edu in section.find_all('ul', attrs={'class': 'edu-ul'}):
            info = edu.find('div', attrs={'class': 'info'})
            school, times = self._split_school_period(self._compact(info.find('p').text))

            degree_node = info.find('p', attrs={'class': 'degree'})
            degree_text = self._compact(degree_node.text) if degree_node is not None else ''
            # "degree|major"; a missing part comes back as ''.
            degree, _, major = degree_text.partition('|')

            tips = [tip.text for tip in info.find_all('span', attrs={'class': 'tips'})]
            edu_info_list.append({
                'SchoolName': school,
                "SchoolTime": times,
                "Degree": degree,
                "Major": major,
                "SchoolTips": tips,
            })
        return edu_info_list