Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -12,7 +12,6 @@ download("en_core_web_sm")
|
|
12 |
# Load the spaCy model
|
13 |
nlp = spacy.load("en_core_web_sm")
|
14 |
|
15 |
-
|
16 |
# Set of English words
|
17 |
nltk.download('words', quiet=True)
|
18 |
english_words = set(words.words())
|
@@ -40,37 +39,32 @@ def extract_text_from_docx(file):
|
|
40 |
return "\n".join([para.text for para in doc.paragraphs])
|
41 |
|
42 |
def extract_companies(text):
|
43 |
-
# Process the text with the spaCy model
|
44 |
doc = nlp(text)
|
45 |
companies = []
|
46 |
|
47 |
-
# Define a regex pattern for common company name suffixes
|
48 |
company_pattern = re.compile(
|
49 |
r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b', re.IGNORECASE)
|
50 |
|
51 |
-
# Iterate over the identified entities in the text
|
52 |
for ent in doc.ents:
|
53 |
-
if ent.label_ == "ORG":
|
54 |
-
|
55 |
-
if company_pattern.search(ent.text):
|
56 |
-
companies.append(ent.text)
|
57 |
|
58 |
-
|
|
|
59 |
|
60 |
def extract_colleges(text):
|
61 |
doc = nlp(text)
|
62 |
colleges = []
|
63 |
|
64 |
-
# Extended list of education-related keywords
|
65 |
edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]
|
66 |
|
67 |
for sent in doc.sents:
|
68 |
-
# Extract entities labeled as ORG and check if they contain education-related keywords
|
69 |
edu_ents = [ent for ent in sent.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords)]
|
70 |
for edu in edu_ents:
|
71 |
colleges.append(edu.text)
|
72 |
-
|
73 |
-
|
|
|
74 |
|
75 |
def extract_years_of_experience(text):
|
76 |
years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
|
@@ -131,28 +125,27 @@ def parse_resume(file):
|
|
131 |
summary = extract_summary(doc)
|
132 |
linkedin = extract_linkedin(text)
|
133 |
|
134 |
-
|
135 |
-
|
136 |
-
"Colleges Attended": colleges,
|
137 |
-
"Years of Experience": years_of_experience,
|
138 |
-
"Phone Number": phone,
|
139 |
-
"Email ID": email,
|
140 |
-
"Summary": summary,
|
141 |
-
"LinkedIn ID": linkedin
|
142 |
-
}
|
143 |
-
|
144 |
-
return result
|
145 |
except Exception as e:
|
146 |
import traceback
|
147 |
-
return
|
148 |
|
149 |
-
# Create Gradio interface
|
150 |
iface = gr.Interface(
|
151 |
fn=parse_resume,
|
152 |
inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
|
153 |
-
outputs=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
title="Advanced Resume Parser",
|
155 |
description="Upload a resume in PDF or DOCX format to extract key information."
|
156 |
)
|
157 |
|
158 |
-
iface.launch(share=True)
|
|
|
12 |
# Load the spaCy model
|
13 |
nlp = spacy.load("en_core_web_sm")
|
14 |
|
|
|
15 |
# Set of English words
|
16 |
nltk.download('words', quiet=True)
|
17 |
english_words = set(words.words())
|
|
|
39 |
return "\n".join([para.text for para in doc.paragraphs])
|
40 |
|
41 |
# Whole-word, case-insensitive match for common corporate suffixes/keywords.
# Compiled once at import time instead of on every call.
_COMPANY_SUFFIX_RE = re.compile(
    r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b',
    re.IGNORECASE,
)


def extract_companies(text):
    """Return the company names found in ``text``, one per line.

    The text is processed with the module-level spaCy pipeline ``nlp``. An
    entity is kept when it is labelled ``ORG`` and its text contains one of
    the corporate suffixes/keywords matched by ``_COMPANY_SUFFIX_RE``.

    Args:
        text: Plain text to scan (for example, text extracted from a resume).

    Returns:
        A single string with one company per line, in order of first
        appearance and without repeats. Returns ``""`` when ``text`` is
        empty/``None`` or nothing matches.
    """
    if not text:
        return ""

    doc = nlp(text)

    # A dict is used as an insertion-ordered set so repeated mentions of the
    # same company collapse into one line while keeping first-seen order.
    companies = {}
    for ent in doc.ents:
        if ent.label_ != "ORG" or not _COMPANY_SUFFIX_RE.search(ent.text):
            continue
        # Entity text may contain line breaks or runs of spaces; collapse them
        # so every company occupies exactly one line of the joined output.
        name = " ".join(ent.text.split())
        if name:
            companies.setdefault(name, None)

    # Join companies with new lines
    return "\n".join(companies)
|
54 |
|
55 |
def extract_colleges(text):
    """Return the educational institutions found in ``text``, one per line.

    The text is processed with the module-level spaCy pipeline ``nlp``. For
    each sentence, an entity is kept when it is labelled ``ORG`` and its
    lower-cased text contains one of the education-related keywords below
    (plain substring match).

    Args:
        text: Plain text to scan (for example, text extracted from a resume).

    Returns:
        A single string with one institution per line, in order of first
        appearance and without repeats. Returns ``""`` when ``text`` is
        empty/``None`` or nothing matches.
    """
    if not text:
        return ""

    # "institute of technology" is intentionally not listed: any text that
    # contains it also contains "institute", so it could never change a match.
    edu_keywords = (
        "university", "college", "institute", "school", "academy",
        "polytechnic", "faculty", "department", "center", "centre",
        "campus", "educational",
    )

    doc = nlp(text)

    # A dict is used as an insertion-ordered set so repeated mentions of the
    # same institution collapse into one line while keeping first-seen order.
    colleges = {}
    for sent in doc.sents:
        for ent in sent.ents:
            if ent.label_ != "ORG":
                continue
            lowered = ent.text.lower()
            if not any(keyword in lowered for keyword in edu_keywords):
                continue
            # Entity text may contain line breaks or runs of spaces; collapse
            # them so every institution occupies exactly one output line.
            name = " ".join(ent.text.split())
            if name:
                colleges.setdefault(name, None)

    # Join colleges with new lines
    return "\n".join(colleges)
|
68 |
|
69 |
def extract_years_of_experience(text):
|
70 |
years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
|
|
|
125 |
summary = extract_summary(doc)
|
126 |
linkedin = extract_linkedin(text)
|
127 |
|
128 |
+
return companies, colleges, years_of_experience, phone, email, summary, linkedin
|
129 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
except Exception as e:
|
131 |
import traceback
|
132 |
+
return f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
|
133 |
|
134 |
+
# Build the Gradio UI: one file upload in, seven text boxes out.
# The order of the output boxes must match the order of the values returned
# by parse_resume on its success path (companies, colleges, years of
# experience, phone, email, summary, LinkedIn).
iface = gr.Interface(
    fn=parse_resume,
    inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
    outputs=[
        # (label, extra keyword arguments) for each output box, in order.
        gr.Textbox(label=box_label, **box_options)
        for box_label, box_options in (
            ("Companies Worked For", {"lines": 10}),
            ("Colleges Attended", {"lines": 10}),
            ("Years of Experience", {}),
            ("Phone Number", {}),
            ("Email ID", {}),
            ("Summary", {"lines": 3}),
            ("LinkedIn ID", {}),
        )
    ],
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information.",
)

# Start the app; share=True asks Gradio for a public share link.
iface.launch(share=True)
|