Nassiraaa committed on
Commit
c02a423
1 Parent(s): a830d08

Update cv_quality.py

Browse files
Files changed (1) hide show
  1. cv_quality.py +33 -12
cv_quality.py CHANGED
@@ -1,31 +1,52 @@
1
  import os
2
- from langchain_community.document_loaders import CSVLoader, PyMuPDFLoader, Docx2txtLoader
 
 
3
  from dotenv import load_dotenv
4
  from yolo_text_extraction import pdf_to_text
5
 
6
  load_dotenv()
7
 
8
  class CV:
9
def __init__(self, file_path):
    """Remember where the CV lives and which loader handles each extension.

    Args:
        file_path: Path of the CV file; stored unchanged on the instance.
    """
    # Extension (with leading dot) -> document loader class.
    loaders_by_extension = {
        ".csv": CSVLoader,
        ".pdf": PyMuPDFLoader,
        ".docx": Docx2txtLoader,
    }
    self.file_path = file_path
    self.doc_loader = loaders_by_extension
16
 
17
  def get_cv_text(self):
18
- _, ext = os.path.splitext(self.file_path)
19
- if ext in self.doc_loader:
20
- loader = self.doc_loader[ext](self.file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  data = loader.load()
 
 
 
 
22
  if data:
23
  text = "\n".join([page.page_content for page in data])
24
- return text if text.strip() else pdf_to_text(self.file_path)
25
  else:
26
- return pdf_to_text(self.file_path)
27
  else:
28
- return pdf_to_text(self.file_path)
29
 
30
  def analyse_cv_quality(self):
31
  from cv_analyzer import analyze_cv
 
1
  import os
2
+ import requests
3
+ from io import BytesIO
4
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredFileLoader
5
  from dotenv import load_dotenv
6
  from yolo_text_extraction import pdf_to_text
7
 
8
  load_dotenv()
9
 
10
  class CV:
11
def __init__(self, file_url):
    """Remember where the CV lives and which loader handles each extension.

    Args:
        file_url: Location of the CV file; stored unchanged on the instance.
    """
    # Lowercase extension (with leading dot) -> document loader class.
    loaders_by_extension = {
        ".pdf": PyPDFLoader,
        ".docx": Docx2txtLoader,
        ".txt": UnstructuredFileLoader,
    }
    self.file_url = file_url
    self.doc_loader = loaders_by_extension
18
 
19
  def get_cv_text(self):
20
+ # Download the file from Supabase
21
+ response = requests.get(self.file_url)
22
+ file_content = BytesIO(response.content)
23
+
24
+ # Determine file extension
25
+ _, ext = os.path.splitext(self.file_url.split("?")[0]) # Remove query parameters
26
+
27
+ if ext.lower() in self.doc_loader:
28
+ if ext.lower() == '.pdf':
29
+ loader = self.doc_loader[ext.lower()](file_content)
30
+ else:
31
+ # For non-PDF files, save temporarily and use the appropriate loader
32
+ with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
33
+ temp_file.write(file_content.getvalue())
34
+ temp_file_path = temp_file.name
35
+
36
+ loader = self.doc_loader[ext.lower()](temp_file_path)
37
+
38
  data = loader.load()
39
+
40
+ if ext.lower() != '.pdf':
41
+ os.unlink(temp_file_path) # Delete the temporary file
42
+
43
  if data:
44
  text = "\n".join([page.page_content for page in data])
45
+ return text if text.strip() else pdf_to_text(file_content)
46
  else:
47
+ return pdf_to_text(file_content)
48
  else:
49
+ return pdf_to_text(file_content)
50
 
51
  def analyse_cv_quality(self):
52
  from cv_analyzer import analyze_cv