Spaces:
Sleeping
Sleeping
pierreguillou
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -147,7 +147,7 @@ def create_prompt(extracted_text: str) -> str:
|
|
147 |
|
148 |
return prompt
|
149 |
|
150 |
-
def extract_data_with_gemini(text_file_path: str) -> dict:
|
151 |
try:
|
152 |
# Initialize Gemini
|
153 |
model = initialize_gemini()
|
@@ -157,7 +157,7 @@ def extract_data_with_gemini(text_file_path: str) -> dict:
|
|
157 |
extracted_text = f.read()
|
158 |
|
159 |
# Create prompt and get response
|
160 |
-
prompt = create_prompt(extracted_text)
|
161 |
response = model.generate_content(prompt)
|
162 |
|
163 |
# Parse the JSON response
|
@@ -178,6 +178,7 @@ def extract_data_with_gemini(text_file_path: str) -> dict:
|
|
178 |
|
179 |
# Main Processing Function
|
180 |
def process_pdf(pdf_file):
|
|
|
181 |
temp_dir = os.path.join(os.getcwd(), "temp_processing")
|
182 |
output_dir = os.path.join(temp_dir, 'output_images')
|
183 |
|
@@ -185,6 +186,9 @@ def process_pdf(pdf_file):
|
|
185 |
shutil.rmtree(temp_dir)
|
186 |
os.makedirs(output_dir, exist_ok=True)
|
187 |
|
|
|
|
|
|
|
188 |
try:
|
189 |
# Convert PDF to images and process
|
190 |
images = convert_from_path(pdf_file.name)
|
@@ -206,7 +210,7 @@ def process_pdf(pdf_file):
|
|
206 |
text_file_path = os.path.join(output_dir, 'extracted_text.txt')
|
207 |
|
208 |
# Process with Gemini
|
209 |
-
extracted_data = extract_data_with_gemini(text_file_path)
|
210 |
|
211 |
# Save extracted data to JSON file
|
212 |
json_path = os.path.join(temp_dir, "extracted_data.json")
|
|
|
147 |
|
148 |
return prompt
|
149 |
|
150 |
+
def extract_data_with_gemini(text_file_path: str, path_to_data_to_extract: str) -> dict:
|
151 |
try:
|
152 |
# Initialize Gemini
|
153 |
model = initialize_gemini()
|
|
|
157 |
extracted_text = f.read()
|
158 |
|
159 |
# Create prompt and get response
|
160 |
+
prompt = create_prompt(extracted_text, path_to_data_to_extract)
|
161 |
response = model.generate_content(prompt)
|
162 |
|
163 |
# Parse the JSON response
|
|
|
178 |
|
179 |
# Main Processing Function
|
180 |
def process_pdf(pdf_file):
|
181 |
+
template_dir = os.path.join(os.getcwd(), "templates")
|
182 |
temp_dir = os.path.join(os.getcwd(), "temp_processing")
|
183 |
output_dir = os.path.join(temp_dir, 'output_images')
|
184 |
|
|
|
186 |
shutil.rmtree(temp_dir)
|
187 |
os.makedirs(output_dir, exist_ok=True)
|
188 |
|
189 |
+
## JSON of the data to extract with descriptions
|
190 |
+
path_to_data_to_extract = os.path.join(template_dir, "data_to_extract.json")
|
191 |
+
|
192 |
try:
|
193 |
# Convert PDF to images and process
|
194 |
images = convert_from_path(pdf_file.name)
|
|
|
210 |
text_file_path = os.path.join(output_dir, 'extracted_text.txt')
|
211 |
|
212 |
# Process with Gemini
|
213 |
+
extracted_data = extract_data_with_gemini(text_file_path, path_to_data_to_extract)
|
214 |
|
215 |
# Save extracted data to JSON file
|
216 |
json_path = os.path.join(temp_dir, "extracted_data.json")
|