Prernas19 committed on
Commit
51593d8
1 Parent(s): 7eb897b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -28
app.py CHANGED
@@ -12,7 +12,6 @@ download("en_core_web_sm")
12
  # Load the spaCy model
13
  nlp = spacy.load("en_core_web_sm")
14
 
15
-
16
  # Set of English words
17
  nltk.download('words', quiet=True)
18
  english_words = set(words.words())
@@ -40,37 +39,32 @@ def extract_text_from_docx(file):
40
  return "\n".join([para.text for para in doc.paragraphs])
41
 
42
  def extract_companies(text):
43
- # Process the text with the spaCy model
44
  doc = nlp(text)
45
  companies = []
46
 
47
- # Define a regex pattern for common company name suffixes
48
  company_pattern = re.compile(
49
  r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b', re.IGNORECASE)
50
 
51
- # Iterate over the identified entities in the text
52
  for ent in doc.ents:
53
- if ent.label_ == "ORG":
54
- # Apply the regex pattern to filter out company names
55
- if company_pattern.search(ent.text):
56
- companies.append(ent.text)
57
 
58
- return companies
 
59
 
60
  def extract_colleges(text):
61
  doc = nlp(text)
62
  colleges = []
63
 
64
- # Extended list of education-related keywords
65
  edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]
66
 
67
  for sent in doc.sents:
68
- # Extract entities labeled as ORG and check if they contain education-related keywords
69
  edu_ents = [ent for ent in sent.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords)]
70
  for edu in edu_ents:
71
  colleges.append(edu.text)
72
-
73
- return colleges
 
74
 
75
  def extract_years_of_experience(text):
76
  years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
@@ -131,28 +125,27 @@ def parse_resume(file):
131
  summary = extract_summary(doc)
132
  linkedin = extract_linkedin(text)
133
 
134
- result = {
135
- "Companies Worked For": companies,
136
- "Colleges Attended": colleges,
137
- "Years of Experience": years_of_experience,
138
- "Phone Number": phone,
139
- "Email ID": email,
140
- "Summary": summary,
141
- "LinkedIn ID": linkedin
142
- }
143
-
144
- return result
145
  except Exception as e:
146
  import traceback
147
- return {"Error": f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"}
148
 
149
- # Create Gradio interface
150
  iface = gr.Interface(
151
  fn=parse_resume,
152
  inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
153
- outputs=gr.JSON(label="Extracted Information"),
 
 
 
 
 
 
 
 
154
  title="Advanced Resume Parser",
155
  description="Upload a resume in PDF or DOCX format to extract key information."
156
  )
157
 
158
- iface.launch(share=True)
 
12
  # Load the spaCy model
13
  nlp = spacy.load("en_core_web_sm")
14
 
 
15
  # Set of English words
16
  nltk.download('words', quiet=True)
17
  english_words = set(words.words())
 
39
  return "\n".join([para.text for para in doc.paragraphs])
40
 
41
  def extract_companies(text):
 
42
  doc = nlp(text)
43
  companies = []
44
 
 
45
  company_pattern = re.compile(
46
  r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b', re.IGNORECASE)
47
 
 
48
  for ent in doc.ents:
49
+ if ent.label_ == "ORG" and company_pattern.search(ent.text):
50
+ companies.append(ent.text)
 
 
51
 
52
+ # Join companies with new lines
53
+ return "\n".join(companies)
54
 
55
  def extract_colleges(text):
56
  doc = nlp(text)
57
  colleges = []
58
 
 
59
  edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]
60
 
61
  for sent in doc.sents:
 
62
  edu_ents = [ent for ent in sent.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords)]
63
  for edu in edu_ents:
64
  colleges.append(edu.text)
65
+
66
+ # Join colleges with new lines
67
+ return "\n".join(colleges)
68
 
69
  def extract_years_of_experience(text):
70
  years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
 
125
  summary = extract_summary(doc)
126
  linkedin = extract_linkedin(text)
127
 
128
+ return companies, colleges, years_of_experience, phone, email, summary, linkedin
129
+
 
 
 
 
 
 
 
 
 
130
  except Exception as e:
131
  import traceback
132
+ return f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
133
 
134
# Gradio UI with one text box per extracted field. The boxes are listed in
# the same order as the values in parse_resume's success-path return tuple.
# NOTE(review): parse_resume's except branch returns a single string while
# seven outputs are declared here — confirm how Gradio handles that case.
resume_upload = gr.File(label="Upload Resume (PDF or DOCX)")
result_boxes = [
    gr.Textbox(label="Companies Worked For", lines=10),
    gr.Textbox(label="Colleges Attended", lines=10),
    gr.Textbox(label="Years of Experience"),
    gr.Textbox(label="Phone Number"),
    gr.Textbox(label="Email ID"),
    gr.Textbox(label="Summary", lines=3),
    gr.Textbox(label="LinkedIn ID"),
]

iface = gr.Interface(
    fn=parse_resume,
    inputs=resume_upload,
    outputs=result_boxes,
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information.",
)

# share=True requests a public share link in addition to the local server.
iface.launch(share=True)