# Hugging Face Space by Manojajj — "Update app.py" (commit 0e4d704, verified)
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pdfplumber
import re
import openpyxl
import os
from huggingface_hub import login
# Log in to the Hugging Face Hub with a user-supplied access token.
def authenticate_hf(token):
    """Attempt a Hub login with *token*; return a human-readable status string."""
    try:
        login(token)
    except Exception as e:
        # Surface any failure to the UI as text instead of raising.
        return f"Error: {e}"
    return "Authentication Successful"
# Initialize the model and tokenizer once at import time.
# NOTE(review): this download runs before any token is entered in the UI —
# a gated model would fail to load unless credentials are already cached.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # Replace with the actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    ``page.extract_text()`` returns ``None`` for pages with no extractable
    text (e.g. scanned images); the original code crashed with a
    ``TypeError`` when concatenating it. Such pages now contribute "".
    """
    with pdfplumber.open(pdf_path) as pdf:
        # str.join avoids the quadratic cost of repeated += concatenation.
        return "".join(page.extract_text() or "" for page in pdf.pages)
# Function to parse the resume text for name, email, phone, and skills
def parse_resume(text):
    """Extract contact fields from resume *text* using the LLM.

    Returns a dict with keys ``name``, ``email``, ``phone`` and ``skills``.
    Email and phone are validated with regexes and set to ``None`` when no
    match is found; name and skills hold the raw model response.
    """
    prompts = {
        "name": "Extract the name from this resume:\n",
        "email": "Extract the email address from this resume:\n",
        "phone": "Extract the phone number from this resume:\n",
        "skills": "Extract the technical skills from this resume:\n"
    }
    results = {}
    for key, prompt in prompts.items():
        # Truncate to the model's context window so generate() cannot overflow.
        inputs = tokenizer(prompt + text, return_tensors="pt", truncation=True)
        # max_new_tokens bounds only the continuation; the original
        # max_length=50000 counted the prompt too and allowed runaway output.
        outputs = model.generate(**inputs, max_new_tokens=64)
        # Decode only the newly generated tokens — the raw output echoes the
        # prompt and resume, which would pollute the regex matches below.
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
        if key == 'email':
            # Use regex to validate email format
            email = re.findall(r'\S+@\S+', response)
            results[key] = email[0] if email else None
        elif key == 'phone':
            # Use regex to validate phone number format
            phone = re.findall(r'\b\d{10,15}\b', response)
            results[key] = phone[0] if phone else None
        else:
            # name / skills: keep the model's answer verbatim.
            results[key] = response
    return results
# Write the parsed resume records to an .xlsx spreadsheet, one row per resume.
def save_to_excel(parsed_data, output_file):
    """Save a list of parsed-resume dicts to *output_file* with a header row."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(["Name", "Email", "Phone", "Skills"])
    for record in parsed_data:
        sheet.append([record["name"], record["email"], record["phone"], record["skills"]])
    workbook.save(output_file)
# Function to process PDF files and output an Excel file
def process_pdfs(pdfs):
    """Parse each uploaded PDF resume and return the path of the Excel output.

    Accepts both plain string paths (what ``gr.File(type="filepath")``
    delivers) and file-like objects exposing ``.name`` — the original read
    ``pdf.name`` unconditionally, which raises AttributeError on ``str``.
    """
    parsed_data = []
    for pdf in pdfs:
        path = pdf if isinstance(pdf, str) else pdf.name
        # Extract text from the PDF, then parse it for the relevant fields.
        text = extract_text_from_pdf(path)
        parsed_data.append(parse_resume(text))
    # Save the parsed data to an Excel file
    output_file = "parsed_resumes.xlsx"
    save_to_excel(parsed_data, output_file)
    return output_file
# ---------------------------------------------------------------------------
# Gradio UI: token authentication section + PDF upload section, wired to the
# handler functions defined above.
with gr.Blocks() as app:
    # Hugging Face branding at the top of the page.
    gr.Image("https://huggingface.co/front/assets/huggingface_logo.svg", label="Hugging Face Logo", width=150)

    gr.Markdown("### Hugging Face Authentication")
    token_box = gr.Textbox(label="Hugging Face API Token", placeholder="Enter your Hugging Face token here", type="password", value="")
    auth_button = gr.Button("Authenticate")
    status_box = gr.Textbox(label="Authentication Status", interactive=False)
    # Run the Hub login when the button is clicked and show the result.
    auth_button.click(authenticate_hf, inputs=token_box, outputs=status_box)

    gr.Markdown("### Upload PDF Resumes")
    uploads = gr.File(file_count="multiple", type="filepath")
    excel_out = gr.File()
    run_button = gr.Button("Process Resumes")
    # Parse the uploaded resumes and offer the Excel file for download.
    run_button.click(process_pdfs, inputs=uploads, outputs=excel_out)

# Launch the app
app.launch()