Resume_Parser.py
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 22 02:48:47 2021
@author: davba
Path to this file: D:\ResumeIT\Resume_Parser.py
Run with: python -m streamlit run D:\ResumeIT\Resume_Parser.py
"""
import streamlit as st  # web app framework
import pdf2image  # converts PDF pages to images
from pdf2image import convert_from_path
import easyocr
import numpy as np
import PIL  # Python Imaging Library
from PIL import ImageDraw  # drawing bounding boxes
import tensorflow as tf
from IPython.display import display, Image
from matplotlib.pyplot import imshow
import xml.dom.minidom
import pandas as pd
import mrcnn
import mrcnn.utils
import mrcnn.config
import mrcnn.model
import mrcnn.visualize
import urllib.request
import os
import xml.etree
import cv2
import Train_MASKRCNN_Script as training
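
# Script overview: this Streamlit app has two modes.
#   "Parse Resumes" - split an uploaded resume-book PDF into page images, detect the
#       resume sections (Personal Info, Education, Skills, Projects, Work Experience,
#       Extra) on each page with a Mask R-CNN model, OCR every detected section with
#       EasyOCR, and offer the collected text as a downloadable CSV.
#   "Train Model" - hand an annotated dataset off to Train_MASKRCNN_Script.train_model.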
@st.cache
def convert_df(df):
    # IMPORTANT: cache the conversion to prevent recomputation on every rerun
    return df.to_csv().encode('utf-8')
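
# run_model(model_path, img_array): loads Mask R-CNN inference weights from model_path,
# then for every page image in img_array detects section bounding boxes, crops each box,
# reads its text with the module-level EasyOCR `reader` (defined below and used here as a
# global), and returns a DataFrame with one row per page and one column per section.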
def run_model(model_path, img_array):
    CLASS_NAMES = ['BG', 'Personal Info', 'Education', 'Skills', 'Projects', 'Work Experience', 'Extra']

    class SimpleConfig(mrcnn.config.Config):
        NAME = "coco_inference"
        GPU_COUNT = 1
        IMAGES_PER_GPU = 1
        NUM_CLASSES = len(CLASS_NAMES)

    model = mrcnn.model.MaskRCNN(mode="inference",
                                 config=SimpleConfig(),
                                 model_dir=os.getcwd())
    model.load_weights(filepath=model_path,
                       by_name=True)

    class_dict = {'bg': 0,
                  'Personal Info': 1,
                  'Education': 2,
                  'Skills': 3,
                  'Projects': 4,
                  'Work Experience': 5,
                  'Extra': 6}
    inv_class_dict = {0: 'bg',
                      1: 'Personal Info',
                      2: 'Education',
                      3: 'Skills',
                      4: 'Projects',
                      5: 'Work Experience',
                      6: 'Extra'}
    sections = ['Personal Info', 'Education', 'Skills', 'Projects', 'Work Experience', 'Extra']
    extracted_info_df = pd.DataFrame(columns=sections)

    # img_array is assumed to be a list of images in cv2 (numpy array) format
    for image in img_array:
        pred = model.detect([image], verbose=0)
        pred = pred[0]
        # Each predicted bounding box is [y1, x1, y2, x2]
        full_info = list()
        class_list = list()
        for i in range(len(pred['rois'])):
            img_pil = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            temp_img = PIL.Image.fromarray(img_pil)
            current_bnd_box = pred['rois'][i]
            current_box_class = pred['class_ids'][i]
            # PIL's crop expects box=[x1, y1, x2, y2]
            crop_box = [current_bnd_box[1], current_bnd_box[0], current_bnd_box[3], current_bnd_box[2]]
            crop_img = temp_img.crop(crop_box)
            # OCR the cropped section and concatenate the recognized text
            section_text = ""
            bounds = reader.readtext(np.array(crop_img), min_size=0, slope_ths=0.2,
                                     ycenter_ths=0.7, height_ths=0.6, width_ths=0.8)
            for b in bounds:
                section_text += " " + b[1]
            full_info.append([section_text])
            class_list.append(inv_class_dict[current_box_class])
        add_to_df = dict(zip(class_list, full_info))
        # Sections that were not detected on this page get an empty string
        for key in sections:
            if key in add_to_df:
                continue
            else:
                add_to_df[key] = [""]
        temp_df = pd.DataFrame.from_dict(add_to_df)
        extracted_info_df = extracted_info_df.append(temp_df, ignore_index=True)
    return extracted_info_df
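
# Streamlit UI: the module-level setup below creates the page title, the EasyOCR reader
# that run_model uses, and the folder path where split resume pages are saved before
# being read back for prediction.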
st.title("Resume Parsing")
os.environ['KMP_DUPLICATE_LIB_OK']='True'
reader= easyocr.Reader(["en"]) # select language
image_path ="C:/ResumeParser/tempDirectory/resume_image"
# Get Resume Book and Split Resume
st.write('<style>div.row-widget.stRadio > div{flex-direction:row;justify-content: center;} </style>', unsafe_allow_html=True)
st.write('<style>div.st-bf{flex-direction:column;} div.st-ag{font-weight:bold;padding-left:2px;}</style>', unsafe_allow_html=True)
choose=st.radio("Current Job",("Train Model","Parse Resumes"))
if choose == "Parse Resumes":
uploaded_file =st.file_uploader("Upload Your Resume Book", type=['pdf'], accept_multiple_files=False, key=None, help=None, on_change=None, args=None, kwargs=None)
if uploaded_file is not None:
if uploaded_file.type == "application/pdf":
images = pdf2image.convert_from_bytes(uploaded_file.read(),size=(1700,2200))
st.subheader("Please select page(s)")
start = st.number_input('Start with page',min_value =1,max_value=len(images),step=1,key="start_page")
end = st.number_input('End with page',min_value=1,max_value=len(images),step=1,key="end_page")
split_button = st.button("Split resume book", key='split_button')
if split_button:
for i in range(start-1,end):
img_index = i
resume = images[img_index]
image_path = "C:/ResumeParser/tempDirectory/resume_image"
image_name = uploaded_file.name.split(".")[0] + str(img_index+1)
image_ext = image_name + ".jpg"
resume.save(f"{image_path}/{image_ext}") #save jpeg
st.success("Finished splitting the resume. Ready to run!")
#Resume is split and saved as images so now we open that to get the resumes for prediction
files = os.listdir(image_path)
img_array = list()
for name in files:
image = cv2.imread(image_path + "/" + name)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
img_array.append(image)
model_path = st.text_input("Model Path", value="", max_chars=None, key="Model_path_input")
run_model_button = st.button("Run the Model", key='run_model_button')
# button "Click here to run the model"
df = pd.DataFrame()
if run_model_button:
df = run_model(model_path, img_array)
st.dataframe(data = df)
csv = convert_df(df)
st.download_button(
label="Download data as CSV",
data=csv,
mime='text/csv')
elif choose == "Train Model":
#def train_model(dataset_path, model_path, num_epochs, final_model_path):
dataset_path = st.text_input("Dataset Path (Folder containing Resumes and Resume Annotations folders)", value="", max_chars=None, key="dataset_path_input")
initial_model_path = st.text_input("Base Model Path", value="", max_chars=None, key="Base_Model_path_input")
num_epochs = st.number_input("Number of Epochs", key="Num_Epochs_input")
final_model_path = st.text_input("Final Model Path", value="", max_chars=None, key="Final_Model_path_input")
train_button = st.button("Train Model", key="Train_Model_Button")
if train_button:
with st.spinner('Training the Model'):
training.train_model(dataset_path,initial_model_path, int(num_epochs), final_model_path)
st.success('Finished Training!')