cm0805 committed on
Commit
0a510f9
1 Parent(s): 54f4f78

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +135 -0
utils.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from docx import Document
3
+ from pptx import Presentation
4
+ from nlp import get_average_similarity_scores
5
+ import numpy as np
6
+ import plotly.graph_objects as go
7
+ import os
8
+ import tempfile
9
+ import shutil
10
+
11
+ # Langchain document loaders
12
+ from langchain.document_loaders import PyPDFLoader #for pdf files
13
+ from langchain.document_loaders import TextLoader #for text files
14
+ from langchain.document_loaders import Docx2txtLoader #for docx files
15
+ from langchain.document_loaders import UnstructuredPowerPointLoader #for pptx files
16
+
17
+ from constants import StreamlitException
18
+ from PyPDF2.errors import PdfReadError
19
+ from zipfile import BadZipFile
20
+
21
def load_file(st, uploaded_file):
    """Save an uploaded file under ``downloaded_files/`` and extract its text.

    Args:
        st: Accepted for interface compatibility; not used by this function.
        uploaded_file: The output of ``st.sidebar.file_uploader``. Its
            ``type``, ``name`` and byte content are read.

    Returns:
        A ``(resume_text_raw, lang_loader)`` tuple on success, where
        ``resume_text_raw`` is the text produced by the matching
        ``extract_*_text`` helper and ``lang_loader`` is the Langchain
        loader built on the saved copy. When the MIME type is unsupported,
        or the content cannot be parsed (``PdfReadError`` / ``BadZipFile``),
        a ``StreamlitException`` instance is *returned* (not raised).
    """
    file_type = uploaded_file.type

    os.makedirs('downloaded_files', exist_ok=True)

    # Keep only the final path component so a crafted upload name
    # (e.g. "../../x") cannot place the copy outside downloaded_files/.
    safe_name = os.path.basename(uploaded_file.name)
    download_path = os.path.join('downloaded_files', safe_name)

    # Write the upload straight to its destination. The previous version
    # went through NamedTemporaryFile(delete=False) and never removed it,
    # leaking one temporary file per upload.
    with open(download_path, 'wb') as saved_file:
        saved_file.write(uploaded_file.read())

    # read() above left the stream at EOF; rewind so the extractors below
    # always see the content from the beginning.
    uploaded_file.seek(0)

    try:
        if file_type == "application/pdf":
            resume_text_raw = extract_pdf_text(uploaded_file)
            lang_loader = PyPDFLoader(download_path)
        elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            resume_text_raw = extract_word_text(uploaded_file)
            lang_loader = Docx2txtLoader(download_path)
        elif file_type in (
            "application/vnd.ms-powerpoint",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ):
            resume_text_raw = extract_ppt_text(uploaded_file)
            lang_loader = UnstructuredPowerPointLoader(download_path)
        else:
            return StreamlitException("**Error**: Invalid file format. Please upload a Word, PDF, or PowerPoint file.")
    except (PdfReadError, BadZipFile):
        return StreamlitException("**Error**: Invalid file content. Please upload a valid Word, PDF, or PowerPoint file.")

    return resume_text_raw, lang_loader
50
+
51
+
52
+ # Function to extract text from a PDF file
53
def extract_pdf_text(file):
    """Return the text of a PDF with every extracted line ended by ``".\\n"``.

    Args:
        file: Anything ``PyPDF2.PdfReader`` accepts (here, the uploaded
            file object).

    Returns:
        A single string: each line of each page, stripped of surrounding
        whitespace and followed by ``".\\n"``.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    lines = []
    for page in pdf_reader.pages:
        # Fall back to "" so a page that yields no text object cannot crash
        # the split; an empty page still contributes one ".\n" as before.
        page_text = page.extract_text() or ""
        lines.extend(page_text.split('\n'))
    # Build the result in one pass instead of repeated string +=.
    return "".join(line.strip() + ".\n" for line in lines)
61
+
62
+
63
+ # Function to extract text from a Word file
64
def _word_table_text(table):
    """Render one table: cells followed by a tab, rows followed by a newline."""
    return ''.join(
        ''.join(cell.text + '\t' for cell in row.cells) + '\n'
        for row in table.rows
    )


def extract_word_text(file):
    """Return the paragraph and table text of a Word document.

    Output order matches the previous implementation: the i-th table is
    emitted right after the i-th paragraph (this pairing is positional,
    not the true document order). Unlike before, tables that remain once
    the paragraphs are exhausted are appended at the end instead of being
    silently dropped.

    Args:
        file: Anything ``docx.Document`` accepts (here, the uploaded
            file object).

    Returns:
        A single string: each paragraph's text followed by ``".\\n"``, and
        each table rendered with a tab after every cell and a newline
        after every row.
    """
    doc = Document(file)
    parts = []
    pending_tables = iter(doc.tables)

    for paragraph in doc.paragraphs:
        parts.append(paragraph.text + '.\n')
        table = next(pending_tables, None)
        if table is not None:
            parts.append(_word_table_text(table))

    # More tables than paragraphs (or no paragraphs at all): the old loop
    # stopped at the last paragraph and lost these.
    parts.extend(_word_table_text(table) for table in pending_tables)

    return ''.join(parts)
84
+
85
+
86
+ # Function to extract text from a PowerPoint file
87
def extract_ppt_text(file):
    """Return the text of every text-bearing shape in a presentation.

    Shape texts are joined with a newline. Previously they were
    concatenated with no separator, so the last word of one shape ran
    into the first word of the next (e.g. a title fused with its body).

    Args:
        file: Anything ``pptx.Presentation`` accepts (here, the uploaded
            file object).

    Returns:
        A single string with one shape's text per newline-separated chunk,
        in slide order then shape order; ``""`` when no shape has a text
        frame.
    """
    prs = Presentation(file)
    return "\n".join(
        shape.text_frame.text
        for slide in prs.slides
        for shape in slide.shapes
        if shape.has_text_frame
    )
95
+
96
+ # Function to plot the average similarity score for each job description phrase
97
def plot_similarity_scores(job_description_phrases, resume_phrases):
    """Build a horizontal bar chart of the ten best-scoring job phrases.

    Scores come from ``get_average_similarity_scores``; the ten highest
    are kept, ordered from highest to lowest, and labelled with the
    corresponding job-description phrase.

    Args:
        job_description_phrases: Phrases used as bar labels; indexed via
            ``np.array(...)[indices]``.
        resume_phrases: Passed through to ``get_average_similarity_scores``.

    Returns:
        The configured ``plotly.graph_objects.Figure``.
    """
    scores = get_average_similarity_scores(job_description_phrases, resume_phrases)
    top_ten = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:10]
    indices = [pair[0] for pair in top_ten]
    top_scores = [pair[1] for pair in top_ten]

    bar_positions = list(range(len(indices)))

    # Long phrases are cut to 100 characters plus an ellipsis; shorter ones
    # are padded to a width of 75.
    labels = []
    for phrase in np.array(job_description_phrases)[indices]:
        if len(phrase) > 100:
            labels.append(phrase[:100].ljust(100) + '...')
        else:
            labels.append(phrase.ljust(75))

    x_ticks = np.round(np.arange(0, 1.2, 0.2), 2)

    y_axis = {
        "tickmode": "array",
        "tickvals": bar_positions,
        "ticktext": labels,
        "tickfont": {"size": 14},
        # Reversed so the highest score is drawn at the top.
        "autorange": "reversed",
        "side": "right",
        "automargin": True,
    }
    x_axis = {
        "tickmode": "array",
        "tickvals": x_ticks,
        "ticktext": x_ticks,
        "tickfont": {"size": 14},
        "range": [0, 1.05],
    }

    fig = go.Figure()
    fig.add_trace(go.Bar(y=bar_positions, x=top_scores, orientation='h'))
    fig.update_layout(
        yaxis=y_axis,
        xaxis=x_axis,
        showlegend=False,
        margin={"t": 0},
    )
    fig.update_xaxes(title="Average Similarity Score", title_font={"size": 14})

    return fig