Nassiraaa committed on
Commit
3f18c40
1 Parent(s): cfca1e4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from doctr.models import ocr_predictor
3
+ from doctr.io import DocumentFile
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
5
+
6
# Streamlit re-executes this script from top to bottom on every rerun (each
# widget interaction). The loaders below are wrapped in st.cache_resource so
# the heavy model objects are built once per server process and reused,
# instead of being re-instantiated on every rerun.

LLM_MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"


@st.cache_resource
def _load_ocr_model():
    """Build the pretrained DocTR OCR predictor (cached across reruns)."""
    return ocr_predictor(pretrained=True)


@st.cache_resource
def _load_llm():
    """Load the causal LM, its tokenizer and the text-generation pipeline.

    Returns:
        A ``(model, tokenizer, pipe)`` tuple; cached across reruns.
    """
    llm = AutoModelForCausalLM.from_pretrained(
        LLM_MODEL_ID,
        device_map="auto",
        torch_dtype="auto",
        trust_remote_code=True,
    )
    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
    text_pipe = pipeline(
        "text-generation",
        model=llm,
        tokenizer=llm_tokenizer,
    )
    return llm, llm_tokenizer, text_pipe


# Initialize DocTR OCR predictor
ocr_model = _load_ocr_model()

# Initialize the LLM model, tokenizer and text-generation pipeline
model, tokenizer, pipe = _load_llm()

# Keyword arguments forwarded to every `pipe(...)` call: sampling is disabled
# (do_sample=False) and only the newly generated text is returned, not the
# prompt (return_full_text=False).
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}
31
+
32
+ # Function to check CV completeness using LLM
33
def _parse_detection(response):
    """Map a free-form LLM reply onto 'Detected' / 'Not Detected'.

    The comparison is case-insensitive and tolerant of surrounding
    punctuation, quotes or extra words (e.g. "Detected." or
    "The section is detected"). "Not Detected" is checked first because it
    contains the substring "detected". Anything that cannot be interpreted
    falls back to "Not Detected".
    """
    normalized = " ".join(response.lower().split())
    if "detected" in normalized and "not detected" not in normalized:
        return "Detected"
    return "Not Detected"


def check_cv_completeness(text):
    """Ask the LLM, section by section, whether a CV contains each expected section.

    One prompt is sent through the module-level ``pipe`` for every section
    name, using the module-level ``generation_args``.

    Args:
        text: Plain text of the CV to inspect.

    Returns:
        A dict mapping every section name to either "Detected" or
        "Not Detected". Replies that cannot be interpreted count as
        "Not Detected".
    """
    sections = [
        "Personal Information",
        "Summary and objective",
        "Education",
        "Work Experience",
        "Skills",
        "Languages",
        "Certificates",
        "Interests",
        "References (optional)",
    ]

    detected_sections = {section: "Not Detected" for section in sections}
    for section in sections:
        prompt = f"Does the following text contain the section '{section}'?\n\n{text}\n\nPlease respond with 'Detected' or 'Not Detected'."
        messages = [
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": prompt},
        ]

        output = pipe(messages, **generation_args)
        # The model does not always answer with the bare keyword, so the
        # reply is normalized instead of being compared for exact equality.
        detected_sections[section] = _parse_detection(output[0]['generated_text'])

    return detected_sections
60
+
61
+ # Streamlit application
62
def main():
    """Render the Streamlit page.

    Once a PDF has been uploaded, the page runs OCR on it with the
    module-level ``ocr_model``, displays the recognised text, asks the LLM
    (``pipe``) to extract the e-mail, phone number and location, and finally
    lists the per-section result of ``check_cv_completeness``.
    """
    st.title('Extraction de texte depuis un PDF avec DocTR et détection d\'erreurs')

    # PDF upload widget
    uploaded_file = st.file_uploader("Uploader un fichier PDF", type="pdf")
    if uploaded_file is None:
        # Nothing uploaded yet: only the title and the uploader are shown.
        return

    # Load the uploaded PDF bytes with DocTR and run the OCR predictor
    doc = DocumentFile.from_pdf(uploaded_file.read())
    result = ocr_model(doc)

    # Rebuild the text: each word is followed by a space, each line by "\n"
    line_chunks = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                words_part = "".join(word.value + " " for word in line.words)
                line_chunks.append(words_part + "\n")
    text = "".join(line_chunks)

    # Show the extracted text
    st.header('Texte extrait du PDF')
    st.write(text)

    # Build the chat messages for the contact-details extraction request
    extraction_prompt = f"Extraire l'email, le numéro de téléphone et la localisation à partir du texte suivant :\n{text}"
    messages = [
        {"role": "system", "content": "Vous êtes un assistant IA utile."},
        {"role": "user", "content": extraction_prompt},
    ]

    # Query the LLM and keep the generated text of the first result
    llm_result = pipe(messages, **generation_args)
    generated_text = llm_result[0]['generated_text']

    # Show the extracted details
    st.header('Informations extraites')
    st.write(generated_text)

    # Check which CV sections were found and list them one per line
    cv_completeness = check_cv_completeness(text)

    st.header('CV Completeness')
    for section_name in cv_completeness:
        st.write(f"{section_name}: {cv_completeness[section_name]}")
109
+ if __name__ == '__main__':
110
+ main()