Spaces:

billusanda007
/

HireGPT

Sleeping

HireGPT / app_pdf_version.py

Jeet Paul

Rename app.py to app_pdf_version.py

fd49583 over 1 year ago

4.1 kB

	import streamlit as st
	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	from nltk.stem import PorterStemmer
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from PyPDF2 import PdfReader
	import os
	from io import BytesIO
	import pickle
	import pdfminer
	from pdfminer.high_level import extract_text
	import re

	# Fetch the NLTK data that preprocess_text relies on.
	# NLTK >= 3.8.2 loads the tokenizer tables from "punkt_tab" rather than the
	# pickled "punkt" models, so both are requested to keep word_tokenize
	# working regardless of which NLTK version is installed.
	nltk.download('punkt')
	nltk.download('punkt_tab')
	nltk.download('stopwords')

	def preprocess_text(text):
	    """Normalise free text into a space-joined string of stemmed tokens.

	    The text is lower-cased and tokenised with NLTK's ``word_tokenize``;
	    English stop words are dropped and every remaining token is reduced
	    with the Porter stemmer.
	    """
	    tokens = word_tokenize(text.lower())
	    english_stops = set(stopwords.words('english'))
	    porter = PorterStemmer()

	    stems = []
	    for token in tokens:
	        if token in english_stops:
	            continue
	        stems.append(porter.stem(token))
	    return ' '.join(stems)

	def extract_text_from_pdf(pdf_content):
	    """Return the text of every page of a PDF supplied as raw bytes.

	    Args:
	        pdf_content: The complete PDF file as a bytes object.

	    Returns:
	        The extracted text of all pages, joined with newlines. A page for
	        which no text is extracted contributes an empty string.
	    """
	    pdf_reader = PdfReader(BytesIO(pdf_content))
	    # Join pages with a newline so the last word of one page is not fused
	    # with the first word of the next (which would corrupt tokenisation).
	    # `or ''` guards against a falsy/None result for a page without text.
	    return '\n'.join(page.extract_text() or '' for page in pdf_reader.pages)

	def clean_pdf_text(text):
	    """Strip URLs, social-media markers, punctuation and non-ASCII characters.

	    Args:
	        text: Raw text extracted from a resume.

	    Returns:
	        The cleaned text with every run of whitespace collapsed to a single
	        space. Leading/trailing spaces are not stripped.
	    """
	    # URLs, together with any whitespace that follows them.
	    text = re.sub(r'http\S+\s*', ' ', text)
	    # Stand-alone "RT" / "cc" markers. Word boundaries keep the pattern from
	    # eating the letters "cc" inside ordinary words such as "success".
	    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)
	    # Hashtags are dropped outright; @mentions are replaced by a space.
	    text = re.sub(r'#\S+', '', text)
	    text = re.sub(r'@\S+', ' ', text)
	    # Every remaining ASCII punctuation character becomes a space.
	    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
	    # Non-ASCII characters become spaces.
	    text = re.sub(r'[^\x00-\x7f]', ' ', text)
	    text = re.sub(r'\s+', ' ', text)
	    return text

	def extract_candidate_name(text):
	    """Return the first "Firstname Lastname" pair found in ``text``.

	    A name is two consecutive capitalised words (an upper-case letter
	    followed by lower-case letters) separated by one whitespace character,
	    optionally preceded by the title "Mr.", "Ms." or "Mrs.". Adjust the
	    pattern if your resumes follow a different naming convention.

	    Args:
	        text: Text to search; ``None`` or an empty string is treated as
	            containing no name.

	    Returns:
	        The matched name (including the title when present), or the string
	        "Candidate Name Not Found" when nothing matches.
	    """
	    # The title alternatives must be separated by an unescaped "|"; the
	    # optional whitespace lives inside the title group so that a match
	    # without a title never starts with a stray space.
	    pattern = r'(?:(?:Mr|Ms|Mrs)\.\s?)?([A-Z][a-z]+)\s([A-Z][a-z]+)'
	    match = re.search(pattern, text or '')
	    if match:
	        return match.group(0)
	    return "Candidate Name Not Found"

	def calculate_similarity(job_description, cvs, cv_file_names):
	    """Rank CVs by TF-IDF cosine similarity to a job description.

	    Args:
	        job_description: Raw job-description text.
	        cvs: CV texts, in the same order as ``cv_file_names``.
	        cv_file_names: Label (file name) for each CV.

	    Returns:
	        A list of ``(file_name, score)`` tuples sorted by score, highest
	        first. An empty list is returned when ``cvs`` is empty.
	    """
	    if not cvs:
	        # Nothing to compare against; avoid fitting a vectorizer for no rows.
	        return []

	    processed_job_desc = preprocess_text(job_description)
	    processed_cvs = [preprocess_text(cv) for cv in cvs]

	    # Fit on the job description and the CVs together so that they share
	    # one vocabulary and one set of IDF weights.
	    vectorizer = TfidfVectorizer()
	    tfidf_matrix = vectorizer.fit_transform([processed_job_desc] + processed_cvs)

	    # Row 0 is the job description. Compare it only against the CV rows
	    # instead of building the full pairwise matrix and discarding all but
	    # its first row.
	    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

	    return sorted(
	        zip(cv_file_names, similarity_scores),
	        key=lambda pair: pair[1],
	        reverse=True,
	    )

	def rank_and_shortlist(job_description, cv_files, threshold=0.15):
	    """Score uploaded PDF resumes against a job description.

	    Each file object in ``cv_files`` is read in full, its text extracted
	    and cleaned, and the result is scored via ``calculate_similarity``.

	    Returns:
	        A ``(ranked_cvs, shortlisted_cvs)`` pair of ``(file_name, score)``
	        lists. ``ranked_cvs`` holds every resume in the order returned by
	        ``calculate_similarity``; ``shortlisted_cvs`` keeps only those
	        whose score is strictly greater than ``threshold``.
	    """
	    raw_texts = []
	    for uploaded in cv_files:
	        raw_texts.append(extract_text_from_pdf(uploaded.read()))
	    names = [uploaded.name for uploaded in cv_files]
	    cleaned_texts = list(map(clean_pdf_text, raw_texts))

	    scored = calculate_similarity(job_description, cleaned_texts, names)

	    ranked_cvs = [(name, score) for name, score in scored]
	    shortlisted_cvs = []
	    for name, score in ranked_cvs:
	        if score > threshold:
	            shortlisted_cvs.append((name, score))
	    return ranked_cvs, shortlisted_cvs

	def main():
	    """Render the Streamlit page: collect inputs, then show the ranking.

	    The page offers a text area for the job description and a multi-file
	    PDF uploader. Once "Submit" is pressed and both inputs are present, the
	    full ranking and the shortlist are written out as markdown.
	    """
	    st.title("Resume Ranking App")

	    st.write("Upload the Job Description:")
	    job_description = st.text_area("Job Description", height=200, key='job_description')

	    st.write("Upload the Resumes (PDFs):")
	    cv_files = st.file_uploader("Choose PDF files", accept_multiple_files=True, type=["pdf"], key='cv_files')

	    # Nothing more to render until the user presses the button.
	    if not st.button("Submit"):
	        return

	    if not (job_description and cv_files):
	        st.write("Please upload both the job description and resumes to proceed.")
	        return

	    ranked_cvs, shortlisted_cvs = rank_and_shortlist(job_description, cv_files)

	    # Full ranking, as returned by rank_and_shortlist.
	    st.markdown("### Ranking of Resumes:")
	    for file_name, similarity in ranked_cvs:
	        st.markdown(f"File Name: {file_name}, Similarity Score: {similarity:.2f}")

	    # Resumes that passed the shortlist threshold, or "None" if there are none.
	    st.markdown("### Shortlisted Candidates:")
	    if shortlisted_cvs:
	        for file_name, similarity in shortlisted_cvs:
	            st.markdown(f"File Name: {file_name}, Similarity Score: {similarity:.2f}")
	    else:
	        st.markdown("None")

	# Script entry point: build the UI only when this file is executed as the
	# main script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()