pierreguillou committed on
Commit
40a6f2f
1 Parent(s): 3f57b7a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -0
app.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import shutil
4
+ import fitz
5
+ from PIL import Image
6
+ import numpy as np
7
+ import cv2
8
+ import pytesseract
9
+ from pytesseract import Output
10
+ import zipfile
11
+ from pdf2image import convert_from_path
12
+
13
+ # Helper functions: image conversion, OCR block extraction, annotation, and text export
14
def convert_to_rgb(image_path):
    """Open the image at ``image_path`` and return a copy of it in RGB mode.

    Args:
        image_path: Location of the image, passed straight to ``Image.open``.

    Returns:
        The image produced by ``convert("RGB")``.
    """
    # Use the image as a context manager so the underlying file is closed as
    # soon as the converted image has been produced, instead of leaving the
    # handle open until garbage collection.
    with Image.open(image_path) as img:
        return img.convert("RGB")
18
+
19
def preprocess_image(image):
    """Run the OCR pre-processing chain on ``image``.

    The steps, in order: BGR-to-grayscale conversion, binary thresholding with
    the Otsu flag, non-local-means denoising, and a 2x cubic upscale.

    Args:
        image: Image array handed to ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``.

    Returns:
        The upscaled image returned by ``cv2.resize``.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # cv2.threshold returns a pair; only the thresholded image (second item)
    # is used further on.
    thresholded = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    cleaned = cv2.fastNlMeansDenoising(thresholded, None, 30, 7, 21)

    return cv2.resize(
        cleaned, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC
    )
25
+
26
def extract_vertical_blocks(image):
    """OCR ``image`` (French) and group the recognized words into vertical blocks.

    Words are visited in the order pytesseract returns them, and entries whose
    confidence is not positive are ignored. A new block starts whenever a
    word's top edge lies more than one line height below the bottom edge of the
    previously accepted word. The line height is 1.2 times the height of the
    first accepted word.

    Args:
        image: Image convertible with ``numpy.array`` (for example a PIL image).

    Returns:
        A list of dicts, one per block, in reading order. Each dict has:
            "text": the block's words joined by single spaces, stripped;
            "coords": ``[left, top, right, bottom]`` bounding box of the block.
    """
    image_np = np.array(image)
    data = pytesseract.image_to_data(image_np, lang='fra', output_type=Output.DICT)

    blocks = []
    words = []        # texts of the block currently being built
    box = None        # [left, top, right, bottom] of the block being built
    last_bottom = -1
    line_height = 0

    rows = zip(
        data['conf'], data['text'],
        data['left'], data['top'], data['width'], data['height'],
    )
    for conf, text, x, y, w, h in rows:
        # The confidence may arrive as an int, a float, or a decimal string
        # such as '96.06'; int() alone raises ValueError on the latter, so go
        # through float() first and truncate to keep the original comparison.
        if int(float(conf)) <= 0:
            continue

        if line_height == 0:
            line_height = h * 1.2

        # A vertical gap larger than one line height closes the current block.
        if y > last_bottom + line_height and words:
            blocks.append({"text": " ".join(words).strip(), "coords": box})
            words, box = [], None

        words.append(text)
        if box is None:
            box = [x, y, x + w, y + h]
        else:
            box[0] = min(box[0], x)
            box[1] = min(box[1], y)
            box[2] = max(box[2], x + w)
            box[3] = max(box[3], y + h)

        last_bottom = y + h

    # Flush the block that was still open when the words ran out.
    if words:
        blocks.append({"text": " ".join(words).strip(), "coords": box})

    return blocks
68
+
69
def draw_blocks_on_image(image_path, blocks, output_path):
    """Outline every block on the image and write the result to ``output_path``.

    Args:
        image_path: Path of the image to annotate.
        blocks: Iterable of dicts whose "coords" entry holds
            ``[left, top, right, bottom]``.
        output_path: Destination path for the annotated image.

    Returns:
        ``output_path``.

    Raises:
        OSError: If the source image cannot be read or the annotated image
            cannot be written.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than inside cv2.rectangle.
        raise OSError(f"Could not read image: {image_path}")

    for block in blocks:
        coords = block['coords']
        # Colour is (0, 0, 255) with a line thickness of 2.
        cv2.rectangle(image, (coords[0], coords[1]), (coords[2], coords[3]), (0, 0, 255), 2)

    # cv2.imwrite returns a falsy value when nothing was written.
    if not cv2.imwrite(output_path, image):
        raise OSError(f"Could not write annotated image: {output_path}")
    return output_path
76
+
77
def process_image(image, output_folder, page_number):
    """OCR one page image and produce its annotated counterpart.

    Args:
        image: Value forwarded to ``convert_to_rgb`` to load the page.
        output_folder: Directory receiving ``page_<n>.png`` and
            ``annotated_page_<n>.png``.
        page_number: Zero-based page index; file names use ``page_number + 1``.

    Returns:
        A ``(blocks, annotated_image_path)`` tuple: the blocks returned by
        ``extract_vertical_blocks`` and the path returned by
        ``draw_blocks_on_image``.
    """
    rgb_page = convert_to_rgb(image)
    detected_blocks = extract_vertical_blocks(rgb_page)

    # Save the RGB page first; the annotation step reads it back from disk.
    page_file = f'page_{page_number + 1}.png'
    page_path = os.path.join(output_folder, page_file)
    rgb_page.save(page_path)

    annotated_target = os.path.join(output_folder, f'annotated_{page_file}')
    return detected_blocks, draw_blocks_on_image(page_path, detected_blocks, annotated_target)
86
+
87
def save_extracted_text(blocks, page_number, output_folder):
    """Append one page's block texts to ``extracted_text.txt`` in ``output_folder``.

    The page is written as a ``[PAGE n]`` header line, one line per block, and
    a ``[FIN DE PAGE n]`` footer line followed by a blank line.

    Args:
        blocks: Iterable of dicts carrying a "text" entry.
        page_number: Number printed in the page header and footer.
        output_folder: Directory containing (or receiving) the text file.

    Returns:
        The path of the text file.
    """
    target = os.path.join(output_folder, 'extracted_text.txt')

    # Assemble the whole page section first, then append it in a single write.
    section = [f"[PAGE {page_number}]\n"]
    section.extend(entry['text'] + "\n" for entry in blocks)
    section.append(f"[FIN DE PAGE {page_number}]\n\n")

    with open(target, 'a', encoding='utf-8') as handle:
        handle.write("".join(section))

    return target
95
+
96
+ # Modified process_pdf function with better temp file handling
97
def process_pdf(pdf_file):
    """Convert a PDF to page images, OCR each page, and package the results.

    Args:
        pdf_file: The uploaded PDF, given either as a filesystem path or as an
            object exposing the path through a ``name`` attribute.

    Returns:
        A ``(text_file_path, zip_path)`` tuple: the path of the extracted-text
        file and the path of a ZIP archive holding the annotated page images.

    Raises:
        gr.Error: If no file was supplied, the PDF yields no pages, or any
            processing step fails.
    """
    if pdf_file is None:
        raise gr.Error("Error processing PDF: no file was uploaded")

    # The working directory is a fixed path under the current working
    # directory, so each call wipes the outputs of the previous one. The files
    # are left in place on return because the returned paths must still exist
    # for the caller.
    temp_dir = os.path.join(os.getcwd(), "temp_processing")
    output_dir = os.path.join(temp_dir, 'output_images')

    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)

    os.makedirs(output_dir, exist_ok=True)

    try:
        # The input component is declared with type="filepath", so the upload
        # normally arrives as a plain path; objects with a ``name`` attribute
        # are still accepted for backward compatibility.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = pdf_file.name

        images = convert_from_path(pdf_path)
        if not images:
            # Without pages no text file is produced, so fail explicitly.
            raise ValueError("the PDF contains no pages")

        annotated_images = []
        for i, img in enumerate(images):
            # process_image loads its input from disk, so store the page first.
            temp_img_path = os.path.join(temp_dir, f'temp_page_{i}.png')
            img.save(temp_img_path)

            blocks, annotated_image_path = process_image(temp_img_path, output_dir, i)
            annotated_images.append(annotated_image_path)
            save_extracted_text(blocks, i + 1, output_dir)

        # Bundle the annotated pages, stored under their bare file names.
        zip_path = os.path.join(temp_dir, "annotated_images.zip")
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for img_path in annotated_images:
                zipf.write(img_path, os.path.basename(img_path))

        text_file_path = os.path.join(output_dir, 'extracted_text.txt')
        return (text_file_path, zip_path)

    except Exception as e:
        # Surface every failure as a Gradio error while keeping the original
        # exception attached as the cause.
        raise gr.Error(f"Error processing PDF: {str(e)}") from e
147
+
148
+ # Create Gradio interface with theme and better styling
149
# Stylesheet handed to the Gradio interface.
css = """
.gradio-container {
font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
color: white;
border-radius: 8px;
background: linear-gradient(45deg, #7928CA, #FF0080);
border: none;
}
"""

# Text shown under the title of the interface.
_DESCRIPTION = """
Upload a PDF document to:
1. Extract text content
2. Get annotated images showing detected text blocks

Supports multiple pages and French language text.
"""

# Single input: the PDF to process, restricted to the .pdf extension.
_pdf_upload = gr.File(
    label="Upload PDF Document",
    file_types=[".pdf"],
    type="filepath"
)

# Two outputs, in the order process_pdf returns them: text file, then ZIP.
_result_files = [
    gr.File(label="Extracted Text (TXT)"),
    gr.File(label="Annotated Images (ZIP)"),
]

# Wire process_pdf to the upload widget and the two download widgets.
demo = gr.Interface(
    fn=process_pdf,
    inputs=[_pdf_upload],
    outputs=_result_files,
    title="PDF Text Extraction and Annotation",
    description=_DESCRIPTION,
    article="Created by [Your Name] - [Your GitHub/Profile Link]",
    theme=gr.themes.Soft(),
    css=css,
    examples=[],  # no example PDFs are bundled
    cache_examples=False,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()