-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsingle_parser.py
302 lines (252 loc) · 10.7 KB
/
single_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
from pdfminer.high_level import extract_text
import spacy
from spacy.matcher import Matcher
import re
from datetime import datetime, date
class ResumeParser:
    """Heuristic extractor of personal details from resume text.

    Every method is a ``@staticmethod``; the class only serves as a
    namespace. ``parse_resume`` is the entry point and combines the
    individual ``extract_*`` helpers into one result dictionary.
    """

    # spaCy pipeline shared by all calls; loaded lazily by ``_get_nlp``
    # because ``spacy.load`` is expensive and was previously run per call.
    _nlp = None

    # Words that mark a heading ("Curriculum Vitae", "Resume", "CV")
    # rather than a person's name.
    _NAME_EXCLUSION_WORDS = frozenset({"curriculum", "vitae", "resume", "cv"})

    # "First Last" or "First M. Last" on a single line.
    _NAME_FALLBACK_RE = re.compile(
        r"\b[A-Z][a-z]+(?:[ \t]+[A-Z]\.?)?[ \t]+[A-Z][a-z]+\b"
    )

    _STRUCTURED_EMAIL_RE = re.compile(
        r"Email\s*:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",
        re.IGNORECASE,
    )
    _GENERAL_EMAIL_RE = re.compile(
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    )

    _STRUCTURED_PHONE_RE = re.compile(
        r"Phone\s*:\s*(\+?\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{3,4}"
        r"[-.\s]?\d{3,4}[-.\s]?\d{3,4})",
        re.IGNORECASE,
    )
    # The "+91" alternative must NOT sit behind ``\b``: a word boundary
    # never occurs between whitespace and "+", so "+91..." numbers would
    # only match when glued to a preceding word character.
    _GENERAL_PHONE_RE = re.compile(
        r"(?:\+91[-\s]?\d{10}"
        r"|\b(?:\d{11}|\d{10}|\d{3}[-\s]?\d{3}[-\s]?\d{4}|\d{5}[-\s]?\d{5}))\b"
    )

    _EDUCATION_RE = re.compile(
        r"(?i)(?:Diploma|Dip\.\w+|\bB\.\w+|\bM\.\w+|\bPh\.D"
        r"|\bBachelor(?:['’]s)?|\bMaster(?:['’]s|s)?"
        r"|\bB\.Tech|\bM\.Tech|\bB\.E\.|\bM\.E\.|\bB\.Sc|\bM\.Sc"
        r"|\bB\.Com|\bM\.Com)\s(?:\w+\s)*\w+"
    )

    _STRUCTURED_GENDER_RE = re.compile(
        r"Gender\s*:\s*(Male|Female)", re.IGNORECASE
    )
    _GENERAL_GENDER_RE = re.compile(r"\b(Male|Female)\b", re.IGNORECASE)

    _STRUCTURED_EXPERIENCE_RE = re.compile(
        r"Experience\s*:\s*(\d+(?:\.\d+)?)\+?\s*(?:years?|yrs?)",
        re.IGNORECASE,
    )
    _GENERAL_EXPERIENCE_RE = re.compile(
        r"\d+(?:\.\d+)?\+?\s*(?:years?|yrs?|months?)\b"
        r"(?:\s+of\s+experience|\s+experience)?",
        re.IGNORECASE,
    )

    _DOB_LABEL_RE = re.compile(
        r"(?:DOB|Date\s*of\s*Birth|D\.O\.B\.?)\s*:\s*(.*)", re.IGNORECASE
    )
    # ``\b`` keeps "Page: 3" or "Mileage: 40" from being read as an age.
    _AGE_LABEL_RE = re.compile(r"\bAge\s*:\s*(\d+)", re.IGNORECASE)
    _AGE_LOOSE_RE = re.compile(
        r"\b(?:age|aged)\s*[:\-]?\s*(\d{1,2})\b", re.IGNORECASE
    )

    _ORDINAL_SUFFIX_RE = re.compile(r"(\d+)(st|nd|rd|th)", re.IGNORECASE)

    # Tried in order; the first format that parses wins. Day-first numeric
    # formats come before the month-first textual ones.
    _DATE_FORMATS = (
        "%d %B %Y",
        "%d-%b-%Y",
        "%d %b %Y",
        "%d-%B-%Y",
        "%d/%m/%Y",
        "%d-%m-%Y",
        "%Y-%m-%d",
        "%d,%B,%Y",
        "%d %b%Y",
        "%d:%b:%Y",
        "%d:%B:%Y",
        "%d.%m.%Y",
        "%B %d, %Y",
        "%b %d, %Y",
        "%B %d %Y",
        "%b %d %Y",
    )

    @staticmethod
    def _get_nlp():
        """Return the shared spaCy pipeline, loading it on first use."""
        if ResumeParser._nlp is None:
            ResumeParser._nlp = spacy.load("en_core_web_sm")
        return ResumeParser._nlp

    @staticmethod
    def _is_excluded_name(candidate):
        """Return True when *candidate* contains a heading word, not a name."""
        words = (word.strip(".") for word in candidate.lower().split())
        return any(word in ResumeParser._NAME_EXCLUSION_WORDS for word in words)

    @staticmethod
    def _name_from_regex(resume_text):
        """Return the first non-excluded "First [M.] Last" match, or None."""
        pos = 0
        while True:
            match = ResumeParser._NAME_FALLBACK_RE.search(resume_text, pos)
            if match is None:
                return None
            candidate = match.group()
            if not ResumeParser._is_excluded_name(candidate):
                return candidate
            # Restart just past the rejected match's first character so an
            # overlapping candidate ("Resume John Smith") is still found.
            pos = match.start() + 1

    @staticmethod
    def _parse_leading_date(raw):
        """Parse the date at the start of *raw*, ignoring trailing text.

        A labelled line such as ``12/05/1990 Gender: Male`` is captured
        whole; trailing whitespace-separated tokens are dropped one at a
        time until the remainder parses.
        """
        raw = raw.strip()
        parsed = ResumeParser.parse_date(raw)
        if parsed is not None:
            return parsed
        # No supported format spans more than a few tokens.
        tokens = raw.split()[:6]
        for end in range(len(tokens), 0, -1):
            candidate = " ".join(tokens[:end]).rstrip(".,;")
            parsed = ResumeParser.parse_date(candidate)
            if parsed is not None:
                return parsed
        return None

    @staticmethod
    def extract_text_from_pdf(pdf_path):
        """Return the text of the PDF at *pdf_path* via pdfminer's ``extract_text``."""
        return extract_text(pdf_path)

    @staticmethod
    def extract_name(resume_text):
        """Return the first name-like phrase in *resume_text*, or None.

        A spaCy ``Matcher`` looks for two consecutive proper nouns or a
        "Title-case, initial, Title-case" token triple. Matches are taken
        in document order, preferring the longer match when two start on
        the same token. Candidates containing a heading word (see
        ``_NAME_EXCLUSION_WORDS``) are skipped. If the matcher yields
        nothing usable, a plain regex over the raw text is tried.
        """
        if not resume_text:
            return None

        nlp = ResumeParser._get_nlp()
        matcher = Matcher(nlp.vocab)
        matcher.add("TWO_WORD_NAME", [[{"POS": "PROPN"}, {"POS": "PROPN"}]])
        matcher.add(
            "INITIAL_NAME",
            [
                [
                    {"IS_ALPHA": True, "IS_TITLE": True},
                    # Raw string: "\." is an invalid escape in a normal literal.
                    {"TEXT": {"REGEX": r"^[A-Z]\.?$"}},
                    {"IS_ALPHA": True, "IS_TITLE": True},
                ]
            ],
        )

        doc = nlp(resume_text)
        # Sort by start token, then longest first (start - end is more
        # negative for longer spans).
        matches = sorted(matcher(doc), key=lambda m: (m[1], m[1] - m[2]))
        for _match_id, start, end in matches:
            candidate = doc[start:end].text
            if not ResumeParser._is_excluded_name(candidate):
                return candidate

        return ResumeParser._name_from_regex(resume_text)

    @staticmethod
    def extract_email_from_resume(text):
        """Return the first email address in *text*, or None.

        An address following an ``Email:`` label (case-insensitive) takes
        priority over any other address in the text.
        """
        if not text:
            return None
        labelled = ResumeParser._STRUCTURED_EMAIL_RE.search(text)
        if labelled:
            return labelled.group(1)
        found = ResumeParser._GENERAL_EMAIL_RE.search(text)
        return found.group() if found else None

    @staticmethod
    def extract_mobile_number(text):
        """Return the first phone number found in *text*, or None.

        Thin wrapper around ``extract_mobile_numbers``; the number is
        returned in the same normalised form.
        """
        numbers = ResumeParser.extract_mobile_numbers(text, max_numbers=1)
        return numbers[0] if numbers else None

    @staticmethod
    def extract_mobile_numbers(text, max_numbers=3):
        """Return up to *max_numbers* distinct phone numbers from *text*.

        Numbers following a ``Phone:`` label come first; the general
        pattern (10/11-digit runs, 3-3-4 and 5-5 groupings, ``+91``
        prefixed numbers) is consulted only when the labelled matches do
        not already fill the quota.

        Each number is normalised before de-duplication: separators and
        parentheses are removed and ``+91`` is prepended when the number
        has no leading ``+``. The same number written two ways therefore
        appears once, e.g. ``"+919876543210"``.
        """
        if not text or max_numbers <= 0:
            return []

        raw_numbers = ResumeParser._STRUCTURED_PHONE_RE.findall(text)
        if len(raw_numbers) < max_numbers:
            raw_numbers = raw_numbers + ResumeParser._GENERAL_PHONE_RE.findall(text)

        normalised = []
        for raw in raw_numbers:
            number = re.sub(r"[^\d+]", "", raw)
            if not number.startswith("+"):
                number = f"+91{number}"
            normalised.append(number)

        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(normalised))[:max_numbers]

    @staticmethod
    def extract_education_from_resume(text):
        """Return every degree-like phrase in *text* as a list of strings.

        A phrase starts with a degree keyword (Diploma, B.x, M.x, Ph.D,
        Bachelor/Bachelor's, Master/Masters/Master's, ...) and runs across
        the following whitespace-separated words.
        """
        if not text:
            return []
        return ResumeParser._EDUCATION_RE.findall(text)

    @staticmethod
    def extract_gender_from_resume(text):
        """Return ``"Male"``/``"Female"`` if mentioned in *text*, else None.

        A value following a ``Gender:`` label takes priority over a bare
        occurrence of either word.
        """
        if not text:
            return None
        labelled = ResumeParser._STRUCTURED_GENDER_RE.search(text)
        if labelled:
            return labelled.group(1).capitalize()
        found = ResumeParser._GENERAL_GENDER_RE.search(text)
        return found.group(0).capitalize() if found else None

    @staticmethod
    def extract_experience_from_resume(text):
        """Return a description of the experience duration, or None.

        ``Experience: N years`` yields ``"N yrs"``. Otherwise the first
        "N years/yrs/months [of experience]" phrase is returned verbatim.
        Fractional values such as ``2.5`` are kept whole.
        """
        if not text:
            return None
        labelled = ResumeParser._STRUCTURED_EXPERIENCE_RE.search(text)
        if labelled:
            return f"{labelled.group(1)} yrs"
        found = ResumeParser._GENERAL_EXPERIENCE_RE.search(text)
        return found.group(0) if found else None

    @staticmethod
    def extract_dob_age(text):
        """Return ``(dob, age)`` extracted from *text*.

        ``dob`` is a ``datetime.date`` or None; ``age`` is an int or None.
        Labelled values (``DOB:``, ``Date of Birth:``, ``D.O.B:``,
        ``Age:``) are used first. Without a parseable labelled date, the
        first spaCy ``DATE`` entity that ``parse_date`` accepts is used.
        A missing age is derived from ``dob`` when one was found.

        NOTE(review): the spaCy fallback accepts any parseable date in
        the document, which may not be a birth date — confirm this is
        acceptable for callers.
        """
        dob, age = None, None
        if not text:
            return dob, age

        dob_match = ResumeParser._DOB_LABEL_RE.search(text)
        if dob_match:
            dob = ResumeParser._parse_leading_date(dob_match.group(1))

        age_match = ResumeParser._AGE_LABEL_RE.search(text)
        if age_match:
            age = int(age_match.group(1))

        if not dob:
            doc = ResumeParser._get_nlp()(text)
            for ent in doc.ents:
                if ent.label_ == "DATE":
                    dob = ResumeParser.parse_date(ent.text)
                    if dob:
                        break

        # Truthiness on purpose: an extracted age of 0 is treated as missing.
        if not age:
            loose_match = ResumeParser._AGE_LOOSE_RE.search(text)
            if loose_match:
                age = int(loose_match.group(1))

        if dob and not age:
            age = ResumeParser.calculate_age(dob)

        return dob, age

    @staticmethod
    def remove_ordinal_suffix(date_str):
        """Strip ordinal suffixes after digits (``1st`` -> ``1``), any case."""
        return ResumeParser._ORDINAL_SUFFIX_RE.sub(r"\1", date_str)

    @staticmethod
    def parse_date(date_str):
        """Parse *date_str* into a ``datetime.date``, or return None.

        Surrounding whitespace and ordinal suffixes are removed, then each
        entry of ``_DATE_FORMATS`` is tried in order. Non-string or empty
        input yields None.
        """
        if not isinstance(date_str, str):
            return None
        cleaned = ResumeParser.remove_ordinal_suffix(date_str.strip())
        if not cleaned:
            return None
        for fmt in ResumeParser._DATE_FORMATS:
            try:
                return datetime.strptime(cleaned, fmt).date()
            except ValueError:
                continue
        return None

    @staticmethod
    def calculate_age(dob):
        """Return the age in whole years for date of birth *dob* as of today."""
        today = date.today()
        # Subtract one when this year's birthday has not happened yet.
        birthday_pending = (today.month, today.day) < (dob.month, dob.day)
        return today.year - dob.year - birthday_pending

    @staticmethod
    def format_date_to_custom(date_input):
        """Format a date as ``DD-Mon-YYYY`` (e.g. ``12-May-1990``).

        Accepts None (returned unchanged), a ``date``/``datetime``, or a
        string. Strings are read as ``YYYY-MM-DD`` first and then with the
        formats known to ``parse_date``.

        Raises:
            ValueError: if a string matches none of the supported formats.
        """
        if date_input is None:
            return None
        # ``datetime`` is a subclass of ``date``; both support strftime.
        if isinstance(date_input, date):
            return date_input.strftime("%d-%b-%Y")
        try:
            parsed = datetime.strptime(date_input, "%Y-%m-%d")
        except ValueError:
            parsed = ResumeParser.parse_date(date_input)
            if parsed is None:
                raise ValueError(
                    f"Unsupported date format: {date_input!r}"
                ) from None
        return parsed.strftime("%d-%b-%Y")

    @staticmethod
    def parse_resume(pdf_path):
        """Parse the resume PDF at *pdf_path* and return the extracted details.

        Returns a dict with the keys ``name``, ``email``, ``phone`` (list),
        ``education`` (list), ``date of birth `` (formatted string or
        None), ``age``, ``gender`` and ``experience``.
        """
        text = ResumeParser.extract_text_from_pdf(pdf_path)
        dob, age = ResumeParser.extract_dob_age(text)
        return {
            "name": ResumeParser.extract_name(text),
            "email": ResumeParser.extract_email_from_resume(text),
            "phone": ResumeParser.extract_mobile_numbers(text),
            "education": ResumeParser.extract_education_from_resume(text),
            # NOTE: the trailing space in this key is kept on purpose so
            # existing consumers of the result dict keep working.
            "date of birth ": ResumeParser.format_date_to_custom(dob),
            "age": age,
            "gender": ResumeParser.extract_gender_from_resume(text),
            "experience": ResumeParser.extract_experience_from_resume(text),
        }