"""Gradio demo: run Tesseract OCR on an uploaded image and highlight a keyword.

The OCR result is rendered through Gradio's ``html`` output component, so all
text returned by :func:`ocr_and_search` is HTML-escaped and keyword matches
are wrapped in ``<mark>`` tags.
"""

import html
import re
from typing import List, Optional

import gradio as gr
import pytesseract
from PIL import Image

# Tesseract language spec used when the caller does not supply one.
DEFAULT_LANGUAGES = 'eng+hin'


def tesseract_ocr(filepath: str, languages: str = DEFAULT_LANGUAGES) -> str:
    """Extract text from the image at *filepath* using Tesseract OCR.

    Args:
        filepath: Path of the image file to read.
        languages: Tesseract language spec passed as ``lang``; defaults to
            English plus Hindi (``'eng+hin'``).

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    # Use the image as a context manager so the underlying file handle is
    # released as soon as OCR finishes instead of leaking per request.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image=image, lang=languages)


def search_and_highlight(text: str, keyword: str) -> str:
    """Return *text* as HTML with occurrences of *keyword* highlighted.

    Matching is case-insensitive and treats *keyword* as a literal string,
    not as a regular expression. The whole result is HTML-escaped; each match
    is wrapped in ``<mark>...</mark>``.

    Args:
        text: Plain text to search.
        keyword: Literal string to highlight. If empty or ``None``, the text
            is only escaped.

    Returns:
        An HTML-safe string.
    """
    if not keyword:
        return html.escape(text)

    # re.escape keeps characters such as "(", "+" or "." in the keyword from
    # being interpreted as regex syntax (which could raise re.error or match
    # unintended text).
    pattern = re.compile(re.escape(keyword), flags=re.IGNORECASE)

    # Match against the raw text and escape each piece separately; escaping
    # first would let a keyword like "amp" match inside an "&amp;" entity.
    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        pieces.append(html.escape(text[cursor:match.start()]))
        pieces.append(f"<mark>{html.escape(match.group(0))}</mark>")
        cursor = match.end()
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)


def ocr_and_search(filepath: Optional[str], keyword: str) -> str:
    """Perform OCR on the image and highlight the specified keyword.

    Args:
        filepath: Path of the uploaded image, or ``None`` if nothing was
            uploaded.
        keyword: Keyword to highlight; may be empty.

    Returns:
        ``"Please upload an image."`` when *filepath* is ``None``; otherwise
        the OCR text as HTML, with keyword matches highlighted.
    """
    if filepath is None:
        return "Please upload an image."

    # Perform OCR (with default English and Hindi languages)
    extracted_text = tesseract_ocr(filepath)

    # search_and_highlight handles the empty-keyword case itself, and always
    # returns HTML-safe text for the 'html' output component.
    return search_and_highlight(extracted_text, keyword)


# Gradio Interface
title = "Tesseract OCR (English + Hindi)"
description = "Gradio demo for Tesseract with multi-language support (English and Hindi)."

demo = gr.Interface(
    fn=ocr_and_search,
    inputs=[
        gr.Image(type="filepath", label="Upload Image for OCR"),
        gr.Textbox(label="Keyword to Highlight", placeholder="Enter a keyword..."),
    ],
    outputs='html',  # 'html' output so the <mark> highlighting is rendered
    title=title,
    description=description,
)

if __name__ == '__main__':
    demo.launch()
    print("Finished running")