import streamlit as st
import google.generativeai as genai
import fitz  # PyMuPDF for PDF text extraction
import spacy
from spacy.tokens import Doc
from spacy.util import filter_spans
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from transformers import AutoModelForSeq2SeqLM
from docx import Document
import re
import dateparser
from datetime import datetime
import os

# Load SpaCy model
nlp_spacy = spacy.load('en_core_web_sm')

# Load Babelscape NER model
tokenizer_ner = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model_ner = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
nlp_ner = pipeline('ner', model=model_ner, tokenizer=tokenizer_ner, aggregation_strategy="simple")

# Load GLinER model
gliner_tokenizer = AutoTokenizer.from_pretrained("DAMO-NLP-SG/gliner-large")
gliner_model = AutoModelForSeq2SeqLM.from_pretrained("DAMO-NLP-SG/gliner-large")

def extract_info_with_gliner(text, info_type):
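    """Prompt the loaded seq2seq checkpoint to pull `info_type` (e.g. "company names") out of `text` and return the decoded string."""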
    input_text = f"Extract {info_type} from: {text}"
    input_ids = gliner_tokenizer(input_text, return_tensors="pt").input_ids
    outputs = gliner_model.generate(input_ids, max_length=100)
    return gliner_tokenizer.decode(outputs[0], skip_special_tokens=True)

class EnhancedNERPipeline:
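    """Runs spaCy, the Babelscape NER pipeline, and GLinER over the same text and merges their results onto a single spaCy Doc."""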
    def __init__(self, nlp_spacy, nlp_ner, gliner_model, gliner_tokenizer):
        self.nlp_spacy = nlp_spacy
        self.nlp_ner = nlp_ner
        self.gliner_model = gliner_model
        self.gliner_tokenizer = gliner_tokenizer
        # Register the custom Doc extensions used for GLinER output (idempotent).
        for ext in ("gliner_companies", "gliner_experience", "gliner_education"):
            if not Doc.has_extension(ext):
                Doc.set_extension(ext, default=None)

    def __call__(self, text):
        # SpaCy processing
        doc = self.nlp_spacy(text)

        # Babelscape NER processing; with aggregation_strategy="simple" each result
        # is a dict carrying "entity_group" plus "start"/"end" character offsets.
        ner_results = self.nlp_ner(text)

        # GLinER processing
        gliner_companies = extract_info_with_gliner(text, "company names")
        gliner_experience = extract_info_with_gliner(text, "years of experience")
        gliner_education = extract_info_with_gliner(text, "educational institutions")

        # Convert the transformer NER dicts into spaCy spans so they can be merged
        # with doc.ents (dicts cannot be assigned to doc.ents directly).
        transformer_spans = []
        for ent in ner_results:
            span = doc.char_span(ent["start"], ent["end"],
                                 label=ent["entity_group"], alignment_mode="expand")
            if span is not None:
                transformer_spans.append(span)

        # Attach GLinER results as custom attributes
        doc._.gliner_companies = [c.strip() for c in gliner_companies.split(',') if c.strip()]
        doc._.gliner_experience = gliner_experience
        doc._.gliner_education = [e.strip() for e in gliner_education.split(',') if e.strip()]

        # Merge both entity sources, dropping overlaps; ORG entities are kept because
        # extract_companies and extract_education read them from doc.ents.
        doc.ents = filter_spans(list(doc.ents) + transformer_spans)

        return doc

# Create the enhanced pipeline
enhanced_nlp = EnhancedNERPipeline(nlp_spacy, nlp_ner, gliner_model, gliner_tokenizer)

def extract_companies(doc):
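    """Union of GLinER company names and spaCy/Babelscape ORG entities."""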
    gliner_companies = set(doc._.gliner_companies)
    spacy_babelscape_companies = set([ent.text for ent in doc.ents if ent.label_ == "ORG"])
    return list(gliner_companies.union(spacy_babelscape_companies))

def extract_experience(doc):
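    """Best-effort years of experience: the larger of the GLinER estimate and the span implied by DATE entities."""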
    # First number in the GLinER "years of experience" text, guarding against no match.
    match = re.search(r'\d+', doc._.gliner_experience or "")
    gliner_experience = int(match.group()) if match else 0
    # Largest number of years elapsed since a parseable DATE entity.
    current_year = datetime.now().year
    spacy_experience = max([current_year - date.year for ent in doc.ents
                            if ent.label_ == "DATE" and (date := dateparser.parse(ent.text))
                            and date.year <= current_year] or [0])
    return max(gliner_experience, spacy_experience)

def extract_education(doc):
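    """Union of GLinER institutions and ORG entities containing education keywords."""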
    gliner_education = set(doc._.gliner_education)
    spacy_babelscape_education = set([ent.text for ent in doc.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in ["university", "college", "institute", "school"])])
    return list(gliner_education.union(spacy_babelscape_education))

def extract_text_from_pdf(file):
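    """Extract plain text from an uploaded PDF with PyMuPDF."""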
    text = ""
    # Use a context manager so the PDF is closed after extraction.
    with fitz.open(stream=file.read(), filetype="pdf") as pdf:
        for page in pdf:
            text += page.get_text()
    return text

def extract_text_from_doc(file):
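    """Extract plain text from an uploaded DOCX by joining paragraph texts."""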
    doc = Document(file)
    return " ".join([paragraph.text for paragraph in doc.paragraphs])

def authenticate_gemini(api_key):
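    """Configure the Gemini client and return a GenerativeModel.

    Note: configure() does not verify the key; an invalid key typically only
    surfaces later, when generate_content() is called.
    """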
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-pro')
        return model
    except Exception as e:
        st.error(f"Authentication failed: {e}")
        return None

def generate_summary(text, model):
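    """Ask Gemini for a short summary of the candidate's qualifications, experience, and skills."""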
    prompt = f"Summarize the following resume:\n\n{text}\n\nProvide a brief overview of the candidate's qualifications, experience, and key skills."
    response = model.generate_content(prompt)
    return response.text

def main():
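    """Streamlit app: upload a resume (PDF/DOCX), extract structured fields, and summarize it with Gemini."""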
    st.title("Enhanced Resume Analyzer with GLinER Focus")

    api_key = st.text_input("Enter your Google Gemini API key", type="password")
    uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx"])

    if uploaded_file is not None and api_key:
        try:
            model = authenticate_gemini(api_key)
            if model is None:
                return

            if uploaded_file.type == "application/pdf":
                resume_text = extract_text_from_pdf(uploaded_file)
            elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                resume_text = extract_text_from_doc(uploaded_file)
            else:
                st.error("Unsupported file format.")
                return

            # Process the resume text with the enhanced pipeline
            doc = enhanced_nlp(resume_text)

            companies = extract_companies(doc)
            experience = extract_experience(doc)
            education = extract_education(doc)

            # Use GLinER for other extractions
            phone = extract_info_with_gliner(resume_text, "phone number")
            email = extract_info_with_gliner(resume_text, "email address")
            linkedin = extract_info_with_gliner(resume_text, "LinkedIn profile")

            st.subheader("Extracted Information")
            st.write(f"**Years of Experience:** {experience}")
            st.write("**Companies:**", ", ".join(companies))
            st.write("**Education:**", ", ".join(education))
            st.write(f"**Phone Number:** {phone}")
            st.write(f"**Email:** {email}")
            st.write(f"**LinkedIn:** {linkedin}")

            summary = generate_summary(resume_text, model)
            st.subheader("Resume Summary")
            st.write(summary)

        except Exception as e:
            st.error(f"Error during processing: {e}")

if __name__ == "__main__":
    main()