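# NOTE: the three lines below are page-banner residue captured with the source, not program code.
# Spaces:
# Sleeping
# Sleeping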
from typing import List | |
import pytesseract | |
from PIL import Image | |
import re | |
import gradio as gr | |
def tesseract_ocr(filepath: str) -> str:
    """Extract text from an image file using Tesseract OCR with English and Hindi.

    Args:
        filepath: Path of the image file to open.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.
    """
    # Both language models are requested in a single Tesseract pass.
    combined_languages = 'eng+hin'
    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes, instead of lingering until garbage collection.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image=image, lang=combined_languages)
def search_and_highlight(text: str, keyword: str) -> str:
    """Wrap every case-insensitive occurrence of ``keyword`` in ``<mark>`` tags.

    The keyword is matched literally: regex metacharacters in it (``.``,
    ``+``, ``(``, ...) are escaped, so they neither change what is matched
    nor raise ``re.error`` on unbalanced input.

    Args:
        text: The text to search. It is inserted into the result unchanged
            apart from the added tags (no HTML escaping is performed here).
        keyword: The literal string to highlight. If empty or ``None``,
            ``text`` is returned as-is.

    Returns:
        ``text`` with each match surrounded by ``<mark>...</mark>``, keeping
        the original casing of the matched characters.
    """
    if not keyword:
        return text
    # re.escape turns the user-supplied keyword into a literal pattern.
    pattern = re.compile(re.escape(keyword), flags=re.IGNORECASE)
    # \g<0> re-inserts the matched text itself, so its casing is preserved.
    return pattern.sub(r"<mark>\g<0></mark>", text)
def ocr_and_search(filepath: str, keyword: str) -> str:
    """Run OCR on an uploaded image and optionally highlight a keyword.

    Args:
        filepath: Path of the image to process, or ``None`` when nothing
            was uploaded.
        keyword: Text to highlight in the OCR result; a falsy value skips
            highlighting.

    Returns:
        A prompt asking for an upload when ``filepath`` is ``None``;
        otherwise the OCR text, passed through ``search_and_highlight``
        when a keyword was given.
    """
    # Guard clause: nothing to process without an image.
    if filepath is None:
        return "Please upload an image."

    # OCR uses the default English + Hindi language set.
    ocr_output = tesseract_ocr(filepath)

    # Highlight only when a keyword was actually supplied.
    return search_and_highlight(ocr_output, keyword) if keyword else ocr_output
# Gradio interface: an image upload plus a keyword box feed ocr_and_search,
# and the returned string is shown through the 'html' output.
title = "Tesseract OCR (English + Hindi)"
description = "Gradio demo for Tesseract with multi-language support (English and Hindi)."

demo = gr.Interface(
    title=title,
    description=description,
    fn=ocr_and_search,
    inputs=[
        gr.Image(type="filepath", label="Upload Image for OCR"),
        gr.Textbox(label="Keyword to Highlight", placeholder="Enter a keyword..."),
    ],
    # 'html' output so the <mark> tags around matches are rendered.
    outputs='html',
)
# Launch the Gradio app only when this file is executed as a script.
if __name__ == '__main__':
    demo.launch()
    # NOTE(review): the original indentation was lost; this print is assumed
    # to belong inside the guard, running after launch() returns — confirm.
    print("Finished running")