# Gradio-OCR / app.py
# Author: Aumkeshchy2003 - commit 6916c84 ("Update app.py") - 1.82 kB
from typing import List
import pytesseract
from PIL import Image
import re
import gradio as gr
def tesseract_ocr(filepath: str) -> str:
    """Extract text from an image file using Tesseract OCR.

    Recognition runs with the English and Hindi language packs combined
    (``eng+hin``).

    Args:
        filepath: Path of the image file to read.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for the image.
    """
    # Set languages to English and Hindi by default
    combined_languages = 'eng+hin'
    # Open the image in a context manager so the underlying file handle is
    # released after OCR instead of lingering until garbage collection.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image=image, lang=combined_languages)
def search_and_highlight(text: str, keyword: str) -> str:
    """Wrap every occurrence of ``keyword`` in ``text`` with ``<mark>`` tags.

    Matching is case-insensitive and literal: the keyword is escaped before
    being used as a pattern, so characters such as ``(``, ``+`` or ``.`` are
    matched as themselves rather than interpreted as regex syntax. The
    matched text keeps its original casing in the output.

    Note that ``text`` itself is inserted verbatim; it is not HTML-escaped.

    Args:
        text: The text to search.
        keyword: The literal string to highlight. If empty (or otherwise
            falsy), ``text`` is returned unchanged.

    Returns:
        ``text`` with each match wrapped as ``<mark>match</mark>``.
    """
    if not keyword:
        return text
    # re.escape keeps user input from being parsed as a regular expression,
    # which previously raised re.error (e.g. "(") or over-matched (e.g. ".").
    pattern = f"({re.escape(keyword)})"
    return re.sub(pattern, r"<mark>\1</mark>", text, flags=re.IGNORECASE)
def ocr_and_search(filepath: str, keyword: str) -> str:
    """Run OCR on an uploaded image and optionally highlight a keyword.

    Args:
        filepath: Path of the uploaded image, or ``None`` when nothing
            was uploaded.
        keyword: Keyword to highlight in the recognised text; when empty,
            the text is returned as-is.

    Returns:
        A prompt asking for an upload when ``filepath`` is ``None``;
        otherwise the OCR text, highlighted if a keyword was given.
    """
    # Nothing to process without an image.
    if filepath is None:
        return "Please upload an image."

    # OCR uses the default English + Hindi languages.
    ocr_text = tesseract_ocr(filepath)

    # Only go through the highlighter when there is something to highlight.
    return search_and_highlight(ocr_text, keyword) if keyword else ocr_text
# Gradio Interface
title = "Tesseract OCR (English + Hindi)"
description = "Gradio demo for Tesseract with multi-language support (English and Hindi)."

# Input components, in the order ocr_and_search takes its arguments:
# the uploaded image (configured with type="filepath") and the keyword.
_image_input = gr.Image(type="filepath", label="Upload Image for OCR")
_keyword_input = gr.Textbox(label="Keyword to Highlight", placeholder="Enter a keyword...")

demo = gr.Interface(
    fn=ocr_and_search,
    inputs=[_image_input, _keyword_input],
    # HTML output is used so the highlighted text can be displayed.
    outputs="html",
    title=title,
    description=description,
)
# Launch the Gradio app only when this file is executed as a script.
if __name__ == '__main__':
    demo.launch()
    # NOTE(review): indentation was lost in this copy of the file; this print
    # is assumed to belong inside the guard - confirm against the original.
    print("Finished running")