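# Resume Ranking App (Streamlit): compares uploaded resumes (PDF, DOCX, or TXT)
# against a job description using TF-IDF vectors and cosine similarity, then
# ranks them and shortlists those above a similarity threshold.
# Assuming this file is saved as app.py, it can be launched with:
#   streamlit run app.py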
import os
import re
from io import BytesIO

import streamlit as st
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from PyPDF2 import PdfReader
from docx import Document

# Download the NLTK tokenizer and stopword data used by preprocess_text().
nltk.download('punkt')
nltk.download('stopwords')
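# Third-party dependencies (assumed to be installed, e.g. via pip):
# streamlit, nltk, scikit-learn, PyPDF2, python-docx.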
def preprocess_text(text):
    # Lowercase, tokenize, remove English stopwords, and stem each token.
    words = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)
def extract_text_from_pdf(pdf_content):
    # Read the PDF from raw bytes; guard against extract_text() returning
    # None (e.g. for image-only pages) by falling back to an empty string.
    pdf_reader = PdfReader(BytesIO(pdf_content))
    text = ''
    for page in pdf_reader.pages:
        text += page.extract_text() or ''
    return text
def extract_text_from_docx(docx_content):
    doc = Document(BytesIO(docx_content))
    text = " ".join(paragraph.text for paragraph in doc.paragraphs)
    return text
def extract_text_from_txt(txt_content):
    # textract.process() expects a file path rather than raw bytes, so the
    # uploaded bytes are decoded directly instead.
    return txt_content.decode('utf-8', errors='ignore')
def extract_text_from_resume(file_path):
    # Helper for resumes stored on disk: read the file as bytes and dispatch
    # on the extension. (The Streamlit flow below works on uploaded files
    # directly and does not call this.)
    file_extension = file_path.split('.')[-1].lower()
    with open(file_path, 'rb') as f:
        content = f.read()
    if file_extension == 'pdf':
        return extract_text_from_pdf(content)
    elif file_extension == 'docx':
        return extract_text_from_docx(content)
    elif file_extension == 'txt':
        return extract_text_from_txt(content)
    else:
        raise ValueError(f"Unsupported file format: {file_extension}")
def clean_pdf_text(text):
    # Strip URLs, Twitter-style artifacts, punctuation, non-ASCII characters,
    # and extra whitespace from the extracted text.
    text = re.sub(r'http\S+\s*', ' ', text)
    text = re.sub(r'RT|cc', ' ', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text
def extract_candidate_name(text):
    pattern = r'(?:Mr\.|Ms\.|Mrs\.)?\s?([A-Z][a-z]+)\s([A-Z][a-z]+)'
    match = re.search(pattern, text)
    if match:
        return match.group(0)
    return "Candidate Name Not Found"
def calculate_similarity(job_description, cvs, cv_file_names):
    processed_job_desc = preprocess_text(job_description)
    processed_cvs = [preprocess_text(cv) for cv in cvs]
    # Fit TF-IDF over the job description plus all resumes; row 0 of the
    # cosine-similarity matrix holds the job description's similarity to
    # each resume, so skip its self-similarity entry.
    all_text = [processed_job_desc] + processed_cvs
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_text)
    similarity_scores = cosine_similarity(tfidf_matrix)[0][1:]
    ranked_cvs = list(zip(cv_file_names, similarity_scores))
    ranked_cvs.sort(key=lambda x: x[1], reverse=True)
    return ranked_cvs
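# Illustrative example (not executed by the app): calling
# calculate_similarity("python nlp engineer", ["resume text A", "resume text B"],
# ["a.pdf", "b.pdf"]) returns pairs such as [("a.pdf", 0.31), ("b.pdf", 0.12)],
# sorted from most to least similar; the scores shown here are made up.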
def rank_and_shortlist(job_description, cv_files, threshold=0.15):
    # Extract text from each uploaded resume, rank all resumes by similarity
    # to the job description, and shortlist those scoring at or above threshold.
    cv_texts = []
    cv_file_names = []
    for cv_file in cv_files:
        file_extension = os.path.splitext(cv_file.name)[1].lower()
        try:
            if file_extension == '.pdf':
                cv_text = extract_text_from_pdf(cv_file.read())
            elif file_extension == '.docx':
                cv_text = extract_text_from_docx(cv_file.read())
            elif file_extension == '.txt':
                cv_text = cv_file.read().decode('utf-8', errors='ignore')
            else:
                st.warning(f"Unsupported file format: {file_extension}. Skipping file: {cv_file.name}")
                continue
            cv_texts.append(clean_pdf_text(cv_text))
            cv_file_names.append(cv_file.name)
        except Exception as e:
            st.warning(f"Error processing file '{cv_file.name}': {str(e)}")
            continue
    if not cv_texts:
        st.error("No valid resumes found. Please upload resumes in supported formats (PDF, DOCX, or TXT).")
        return [], []
    similarity_scores = calculate_similarity(job_description, cv_texts, cv_file_names)
    ranked_cvs = [(cv_name, score) for (cv_name, score) in similarity_scores]
    shortlisted_cvs = [(cv_name, score) for (cv_name, score) in ranked_cvs if score >= threshold]
    return ranked_cvs, shortlisted_cvs
def main():
    st.title("Resume Ranking App")
    st.write("Upload the Job Description:")
    job_description = st.text_area("Job Description", height=200, key='job_description')
    st.write("Upload the Resumes:")
    cv_files = st.file_uploader("Choose files", accept_multiple_files=True, key='cv_files')
    if st.button("Submit"):
        if job_description and cv_files:
            # Rank and shortlist candidates
            ranked_cvs, shortlisted_cvs = rank_and_shortlist(job_description, cv_files)
            # Display the ranking of all resumes
            st.markdown("### Ranking of Resumes:")
            for file_name, score in ranked_cvs:
                st.markdown(f"**File Name:** {file_name}, **Similarity Score:** {score:.2f}")
            # Display the shortlisted candidates
            st.markdown("### Shortlisted Candidates:")
            if not shortlisted_cvs:  # No resume met the similarity threshold
                st.markdown("None")
            else:
                for file_name, score in shortlisted_cvs:
                    st.markdown(f"**File Name:** {file_name}, **Similarity Score:** {score:.2f}")
        else:
            st.write("Please upload both the job description and resumes to proceed.")

if __name__ == "__main__":
    main()