bangaboy committed on
Commit
73601bf
1 Parent(s): 68f7a25

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import google.generativeai as genai
3
+ import fitz # PyMuPDF for PDF text extraction
4
+ import spacy
5
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
6
+ from transformers import AutoModelForSeq2SeqLM
7
+ from docx import Document
8
+ import re
9
+ import dateparser
10
+ from datetime import datetime
11
+ import os
12
+
13
# Load the spaCy English pipeline once, at import time.
nlp_spacy = spacy.load('en_core_web_sm')

# Load the Babelscape NER checkpoint (tokenizer + token-classification model)
# and wrap both in a Hugging Face 'ner' pipeline.
tokenizer_ner = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model_ner = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
nlp_ner = pipeline('ner', model=model_ner, tokenizer=tokenizer_ner, aggregation_strategy="simple")

# Load the GLiNER tokenizer and model through the seq2seq auto class;
# extract_info_with_gliner() drives them with generate()/decode().
# NOTE(review): GLiNER checkpoints are usually span-classification models
# loaded via the separate `gliner` package -- confirm this repo id exists and
# really is a seq2seq checkpoint.
gliner_tokenizer = AutoTokenizer.from_pretrained("DAMO-NLP-SG/gliner-large")
gliner_model = AutoModelForSeq2SeqLM.from_pretrained("DAMO-NLP-SG/gliner-large")
24
+
25
def extract_info_with_gliner(text, info_type):
    """Ask the GLiNER model to pull one kind of information out of ``text``.

    The instruction ``"Extract {info_type} from: {text}"`` is tokenized with
    the module-level ``gliner_tokenizer``, passed to ``gliner_model.generate``
    and the first generated sequence is decoded back to a string.

    Args:
        text: Source text to search (here: the full resume text).
        info_type: Human-readable description of what to extract, e.g.
            ``"company names"`` or ``"phone number"``.

    Returns:
        The decoded model output with special tokens removed.
    """
    input_text = f"Extract {info_type} from: {text}"
    # A whole resume can easily be longer than the model's maximum input
    # length; truncate rather than hand generate() an over-long sequence.
    input_ids = gliner_tokenizer(
        input_text, return_tensors="pt", truncation=True
    ).input_ids
    outputs = gliner_model.generate(input_ids, max_length=100)
    return gliner_tokenizer.decode(outputs[0], skip_special_tokens=True)
30
+
31
class EnhancedNERPipeline:
    """Callable that combines spaCy, Babelscape NER and GLiNER on one Doc.

    Calling an instance with a text returns the spaCy ``Doc`` for that text
    with:

    * ``doc._.gliner_companies`` -- GLiNER output split on ``', '`` (list),
    * ``doc._.gliner_experience`` -- raw GLiNER output (string),
    * ``doc._.gliner_education`` -- GLiNER output split on ``', '`` (list),
    * ``doc.ents`` -- spaCy entities merged with the Babelscape entities,
      with every ``ORG`` entity removed.
    """

    # Names of the custom Doc attributes that carry the GLiNER results.
    _EXTENSIONS = ("gliner_companies", "gliner_experience", "gliner_education")

    def __init__(self, nlp_spacy, nlp_ner, gliner_model, gliner_tokenizer):
        """Store the model handles and register the custom Doc attributes.

        Args:
            nlp_spacy: Loaded spaCy pipeline; called as ``nlp_spacy(text)``.
            nlp_ner: Hugging Face NER pipeline; called as ``nlp_ner(text)``.
            gliner_model: GLiNER model handle (stored on the instance).
            gliner_tokenizer: GLiNER tokenizer handle (stored on the instance).
        """
        from spacy.tokens import Doc

        self.nlp_spacy = nlp_spacy
        self.nlp_ner = nlp_ner
        self.gliner_model = gliner_model
        self.gliner_tokenizer = gliner_tokenizer

        # spaCy raises AttributeError on ``doc._.name = value`` unless the
        # attribute was registered first; guard so re-instantiation is safe.
        for name in self._EXTENSIONS:
            if not Doc.has_extension(name):
                Doc.set_extension(name, default=None)

    def __call__(self, text):
        """Run all three extractors on ``text`` and return the enriched Doc.

        Args:
            text: Raw text to analyse.

        Returns:
            The spaCy ``Doc`` with the GLiNER attributes set and ``doc.ents``
            replaced by the merged, non-ORG, non-overlapping entities.
        """
        # SpaCy processing
        doc = self.nlp_spacy(text)

        # Babelscape NER processing
        ner_results = self.nlp_ner(text)

        # GLiNER processing
        gliner_companies = extract_info_with_gliner(text, "company names")
        gliner_experience = extract_info_with_gliner(text, "years of experience")
        gliner_education = extract_info_with_gliner(text, "educational institutions")

        # Add GLiNER results as custom attributes
        doc._.gliner_companies = gliner_companies.split(', ')
        doc._.gliner_experience = gliner_experience
        doc._.gliner_education = gliner_education.split(', ')

        # The Hugging Face pipeline returns plain dicts, not spaCy spans, so
        # turn each one into a Span over the same character range before
        # mixing it with doc.ents.
        spans = list(doc.ents)
        for result in ner_results:
            start, end = result.get("start"), result.get("end")
            if start is None or end is None:
                continue
            span = doc.char_span(
                start,
                end,
                label=result["entity_group"],
                alignment_mode="expand",
            )
            if span is not None:
                spans.append(span)

        # Organisations are deliberately left to GLiNER.
        # NOTE(review): extract_companies()/extract_education() also filter
        # doc.ents for "ORG"; with this exclusion those filters match nothing.
        spans = [span for span in spans if span.label_ not in ["ORG"]]

        # doc.ents rejects overlapping spans, and the two models often tag
        # the same text; keep the longest span of every overlapping group.
        doc.ents = spacy.util.filter_spans(spans)

        return doc
62
+
63
# Create the enhanced pipeline once at import time from the module-level
# models; main() calls it as enhanced_nlp(resume_text).
enhanced_nlp = EnhancedNERPipeline(nlp_spacy, nlp_ner, gliner_model, gliner_tokenizer)
65
+
66
def extract_companies(doc):
    """Collect the distinct company names found for a processed resume.

    Merges the GLiNER company list stored on ``doc._.gliner_companies`` with
    the text of every ``ORG`` entity in ``doc.ents``.

    Args:
        doc: Processed document exposing ``_.gliner_companies`` (iterable of
            strings, or ``None``) and ``ents`` (objects with ``text`` and
            ``label_``).

    Returns:
        Alphabetically sorted list of unique, whitespace-stripped names.
        Blank entries are dropped.
    """
    candidates = list(doc._.gliner_companies or [])
    candidates.extend(ent.text for ent in doc.ents if ent.label_ == "ORG")
    # Splitting an empty model answer yields [''], so blank entries must be
    # filtered out; sorting makes the displayed order stable between runs.
    return sorted({name.strip() for name in candidates if name and name.strip()})
70
+
71
def extract_experience(doc):
    """Estimate the candidate's years of experience.

    Two estimates are computed and the larger one is returned:

    * the first integer found in ``doc._.gliner_experience`` (0 when the
      value is empty or contains no digits);
    * the largest "current year minus parsed year" over all ``DATE``
      entities in ``doc.ents`` that ``dateparser`` can parse and that do not
      lie in a future year (0 when there are none).

    Args:
        doc: Processed document exposing ``_.gliner_experience`` (string) and
            ``ents`` (objects with ``text`` and ``label_``).

    Returns:
        Non-negative integer number of years.
    """
    gliner_experience = 0
    raw_experience = doc._.gliner_experience
    if raw_experience:
        # The model may answer in words only ("several years"); without this
        # guard .group() would be called on None.
        match = re.search(r'\d+', raw_experience)
        if match:
            gliner_experience = int(match.group())

    current_year = datetime.now().year
    spacy_experience = 0
    for ent in doc.ents:
        if ent.label_ != "DATE":
            continue
        parsed = dateparser.parse(ent.text)
        if parsed and parsed.year <= current_year:
            spacy_experience = max(spacy_experience, current_year - parsed.year)

    return max(gliner_experience, spacy_experience)
75
+
76
def extract_education(doc):
    """Collect the distinct educational institutions for a processed resume.

    Merges the GLiNER list stored on ``doc._.gliner_education`` with the text
    of every ``ORG`` entity in ``doc.ents`` whose lower-cased text contains
    "university", "college", "institute" or "school".

    Args:
        doc: Processed document exposing ``_.gliner_education`` (iterable of
            strings, or ``None``) and ``ents`` (objects with ``text`` and
            ``label_``).

    Returns:
        Alphabetically sorted list of unique, whitespace-stripped names.
        Blank entries are dropped.
    """
    keywords = ("university", "college", "institute", "school")
    candidates = list(doc._.gliner_education or [])
    for ent in doc.ents:
        if ent.label_ != "ORG":
            continue
        lowered = ent.text.lower()
        if any(keyword in lowered for keyword in keywords):
            candidates.append(ent.text)
    # Splitting an empty model answer yields [''], so blank entries must be
    # filtered out; sorting makes the displayed order stable between runs.
    return sorted({name.strip() for name in candidates if name and name.strip()})
80
+
81
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        file: Binary file-like object; its full content is read with
            ``file.read()`` and handed to PyMuPDF as an in-memory stream.

    Returns:
        The page texts joined in page order, with no separator added.
    """
    # The context manager closes the document even if a page fails to
    # extract; the original never closed it.
    with fitz.open(stream=file.read(), filetype="pdf") as pdf:
        return "".join(page.get_text() for page in pdf)
87
+
88
def extract_text_from_doc(file):
    """Return the text of a Word document as one space-separated string.

    Args:
        file: Path or file-like object accepted by ``Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``, joined
        with a single space.
    """
    word_document = Document(file)
    paragraph_texts = [para.text for para in word_document.paragraphs]
    return " ".join(paragraph_texts)
91
+
92
def authenticate_gemini(api_key):
    """Configure the Gemini client and return a 'gemini-pro' model handle.

    Args:
        api_key: Google Gemini API key passed to ``genai.configure``.

    Returns:
        The ``genai.GenerativeModel`` instance, or ``None`` when configuring
        or constructing the model raised; in that case the error is shown in
        the Streamlit UI.
    """
    try:
        genai.configure(api_key=api_key)
        gemini_model = genai.GenerativeModel('gemini-pro')
    except Exception as e:
        st.error(f"Authentication failed: {e}")
        return None
    else:
        return gemini_model
100
+
101
def generate_summary(text, model):
    """Ask the generative model for a short summary of a resume.

    Args:
        text: Resume text embedded verbatim in the prompt.
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``text`` attribute.

    Returns:
        The ``text`` attribute of the model's response.
    """
    prompt = f"Summarize the following resume:\n\n{text}\n\nProvide a brief overview of the candidate's qualifications, experience, and key skills."
    return model.generate_content(prompt).text
105
+
106
def main():
    """Render the Streamlit page and analyse an uploaded resume.

    Asks for a Gemini API key and a PDF/DOCX upload. Once both are present
    it authenticates, extracts the text, runs the enhanced NER pipeline,
    displays the extracted fields and finally a model-generated summary.
    Any exception raised during processing is reported through ``st.error``.
    """
    st.title("Enhanced Resume Analyzer with GLinER Focus")

    api_key = st.text_input("Enter your Google Gemini API key", type="password")
    uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx"])

    # Nothing to do until the user has supplied both inputs.
    if uploaded_file is None or not api_key:
        return

    try:
        model = authenticate_gemini(api_key)
        if model is None:
            return

        # Upload MIME type -> function that turns the upload into text.
        text_extractors = {
            "application/pdf": extract_text_from_pdf,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": extract_text_from_doc,
        }
        extractor = text_extractors.get(uploaded_file.type)
        if extractor is None:
            st.error("Unsupported file format.")
            return
        resume_text = extractor(uploaded_file)

        # Process the resume text with the enhanced pipeline
        doc = enhanced_nlp(resume_text)

        companies = extract_companies(doc)
        experience = extract_experience(doc)
        education = extract_education(doc)

        # Use GLinER for the contact details, queried in display order.
        phone, email, linkedin = [
            extract_info_with_gliner(resume_text, wanted)
            for wanted in ("phone number", "email address", "LinkedIn profile")
        ]

        st.subheader("Extracted Information")
        st.write(f"**Years of Experience:** {experience}")
        st.write("**Companies:**", ", ".join(companies))
        st.write("**Education:**", ", ".join(education))
        st.write(f"**Phone Number:** {phone}")
        st.write(f"**Email:** {email}")
        st.write(f"**LinkedIn:** {linkedin}")

        summary = generate_summary(resume_text, model)
        st.subheader("Resume Summary")
        st.write(summary)

    except Exception as e:
        st.error(f"Error during processing: {e}")
152
+
153
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()