ayanika02 committed on
Commit
62eafad
·
verified ·
1 Parent(s): 72b58fd

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # In[7]:
5
+
6
+
7
+ import gradio as gr
8
+ import pytesseract
9
+ from PIL import Image
10
+ import re
11
+ import os
12
+ import sys
13
+
14
# Default install location of the Tesseract binary on Windows.
DEFAULT_WINDOWS_TESSERACT = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Only override pytesseract's command when that binary is really present.
# Forcing the Windows path unconditionally breaks OCR on every other host
# (for example a Linux deployment), where `tesseract` is resolved via PATH.
if os.path.isfile(DEFAULT_WINDOWS_TESSERACT):
    pytesseract.pytesseract.tesseract_cmd = DEFAULT_WINDOWS_TESSERACT
16
+
17
def ocr_image(image, lang='eng'):
    """Extract text from an image using Tesseract OCR.

    Args:
        image: Object exposing a PIL-style ``convert`` method (the Gradio
            input in this file is declared with ``type="pil"``).
        lang: Language code forwarded to ``pytesseract.image_to_string``.
            Defaults to ``'eng'`` (English only), which is what this
            function always used; pass another code to recognise other
            languages, provided the matching Tesseract data is installed.

    Returns:
        The recognised text, or a string beginning with ``"OCR Error: "``
        describing the failure if conversion or recognition raised.
    """
    try:
        # Normalise to RGB before handing the image to Tesseract.
        rgb_image = image.convert('RGB')
        return pytesseract.image_to_string(rgb_image, lang=lang)
    except Exception as e:
        # Failures are reported in-band so the UI can display them instead
        # of the request crashing.
        return f"OCR Error: {str(e)}"
26
+
27
def search_text(text, keyword):
    """Find case-insensitive occurrences of ``keyword`` in ``text``.

    For every occurrence, a snippet of up to 20 characters on each side is
    produced, with each occurrence of the keyword inside the snippet wrapped
    in ``<mark>`` tags. Snippets are wrapped in ``...`` and joined with
    ``<br><br>`` so the result can be rendered as HTML.

    Args:
        text: The text to search (here, the OCR output).
        keyword: The literal string to look for; it is not treated as a
            regular expression.

    Returns:
        An HTML string with the highlighted snippets, ``"No matches found."``
        when the keyword does not occur, or a prompt message when the
        keyword is empty.
    """
    # Local import keeps this function self-contained without touching the
    # file's import block.
    import html

    if not keyword:
        return "Please enter a keyword to search."

    # Escape the keyword so it is matched literally, ignoring case.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)

    def highlight(snippet):
        # Build the snippet piece by piece instead of using pattern.sub with
        # a string template: a template would interpret backslashes in the
        # matched text and would overwrite every occurrence with one casing.
        # Every piece is HTML-escaped because the text comes from an image
        # and is rendered as HTML.
        parts = []
        cursor = 0
        for hit in pattern.finditer(snippet):
            parts.append(html.escape(snippet[cursor:hit.start()]))
            parts.append(f"<mark>{html.escape(hit.group())}</mark>")
            cursor = hit.end()
        parts.append(html.escape(snippet[cursor:]))
        return "".join(parts)

    results = []
    for match in pattern.finditer(text):
        # Clamp the 20-character context window to the bounds of the text.
        start = max(0, match.start() - 20)
        end = min(len(text), match.end() + 20)
        results.append(f"...{highlight(text[start:end])}...")

    if results:
        return "<br><br>".join(results)
    return "No matches found."
48
+
49
def process_image(image, keyword):
    """Run OCR on an uploaded image and search the result for a keyword.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        keyword: The keyword to search for in the extracted text.

    Returns:
        A ``(extracted_text, search_results)`` tuple. When no image is
        supplied the first element is a prompt message; when OCR failed the
        first element is the error message. In both cases the second element
        is an empty string.
    """
    if image is None:
        return "Please upload an image.", ""

    extracted_text = ocr_image(image)

    # ocr_image reports failures as a string starting with "OCR Error: ".
    # Check the prefix rather than a substring so that recognised text which
    # merely mentions "OCR Error" is still searched.
    if extracted_text.startswith("OCR Error: "):
        return extracted_text, ""

    return extracted_text, search_text(extracted_text, keyword)
57
+
58
# Debug information printed once at startup.
print(f"Python version: {sys.version}")
try:
    print(f"Tesseract version: {pytesseract.get_tesseract_version()}")
except Exception as e:
    # The version lookup raises when the Tesseract binary cannot be run.
    # This output is diagnostic only, so report the problem and keep
    # starting up; OCR failures are shown to the user by ocr_image.
    print(f"Tesseract version: unavailable ({e})")
print(f"Tesseract path: {pytesseract.pytesseract.tesseract_cmd}")
62
+
63
# Build the Gradio components first, then wire them into the interface:
# an image and a keyword go in, the extracted text and the highlighted
# search results come out.
image_input = gr.Image(type="pil", label="Upload Image")
keyword_input = gr.Textbox(label="Search Keyword")
text_output = gr.Textbox(label="Extracted Text")
results_output = gr.HTML(label="Search Results")

iface = gr.Interface(
    fn=process_image,
    inputs=[image_input, keyword_input],
    outputs=[text_output, results_output],
    title="OCR and Keyword Search",
    description="Upload an image with English text, and optionally provide a keyword to search within the extracted text.",
)

# Start the web app with a public share link.
iface.launch(share=True)
80
+