Spaces:
Sleeping
Sleeping
from typing import List | |
import pytesseract | |
from PIL import Image | |
import gradio as gr | |
import re | |
def tesseract_ocr_with_search(filepath: str, languages: List[str], keyword: str):
    """Run Tesseract OCR on an image and return the text as an HTML fragment.

    Args:
        filepath: Path of the image file to recognise. An empty or ``None``
            value (no image supplied) yields an empty result.
        languages: Tesseract language codes to recognise with. When empty,
            no ``lang`` is passed and Tesseract uses its default language.
        keyword: Optional search term. Every case-insensitive occurrence in
            the recognised text is wrapped in ``<mark>`` tags.

    Returns:
        The recognised text, HTML-escaped, with keyword matches highlighted.
    """
    import html  # local import: only this function needs it

    # Nothing to recognise without an image; avoid failing inside Image.open.
    if not filepath:
        return ""

    # Tesseract expects several languages joined with "+", e.g. "eng+fra".
    lang = "+".join(languages) if languages else None

    # The context manager releases the underlying file handle after OCR.
    with Image.open(filepath) as image:
        extracted_text = pytesseract.image_to_string(image=image, lang=lang)

    # The result is rendered as HTML, so the raw OCR text must be escaped;
    # otherwise characters such as "<" or "&" would be treated as markup.
    if not keyword:
        return html.escape(extracted_text)

    # Split on the keyword (captured, so matches land at odd indices) and
    # escape each piece separately. Escaping before matching would let a
    # keyword like "amp" match inside an "&amp;" entity.
    pattern = re.compile(f"({re.escape(keyword)})", flags=re.IGNORECASE)
    pieces = pattern.split(extracted_text)
    return "".join(
        f"<mark>{html.escape(piece)}</mark>" if index % 2 else html.escape(piece)
        for index, piece in enumerate(pieces)
    )
# Language codes reported by the local Tesseract installation; these populate
# the language checkbox group below.
language_choices = pytesseract.get_languages()

# Texts shown around the interface. The original script passed `title`,
# `description`, `article` and `examples` to gr.Interface without defining
# them, which raised NameError as soon as the module was loaded.
TITLE = "Tesseract OCR with Keyword Search"
DESCRIPTION = (
    "Upload an image, choose the recognition language(s), and optionally "
    "enter a keyword to highlight in the extracted text."
)
ARTICLE = None  # no footer text
EXAMPLES = None  # no bundled example inputs

# Define the Gradio interface: image + languages + keyword in, HTML out.
demo = gr.Interface(
    fn=tesseract_ocr_with_search,
    inputs=[
        # Image is handed to the callback as a path on disk.
        gr.Image(type="filepath", label="Upload Image"),
        # Language selection; English is pre-selected.
        gr.CheckboxGroup(language_choices, type="value", value=['eng'], label='Language'),
        # Optional keyword to highlight in the recognised text.
        gr.Textbox(placeholder="Enter keyword to search", label="Keyword Search"),
    ],
    outputs=gr.HTML(),  # HTML output so <mark> highlighting is rendered
    title=TITLE,
    description=DESCRIPTION,
    article=ARTICLE,
    examples=EXAMPLES,
)
# Start the Gradio app only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    demo.launch()
    # Reached once launch() has returned.
    print("Finished running")