from typing import List
import pytesseract
from PIL import Image
import re
import gradio as gr

def tesseract_ocr(filepath: str) -> str:
    """Extract text from an image file using Tesseract OCR.

    Recognition always uses the combined English and Hindi language
    setting (``eng+hin``).

    Args:
        filepath: Path of the image file to open.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for the image.
    """
    # Languages are fixed to English and Hindi.
    combined_languages = 'eng+hin'
    # Open the image as a context manager so the underlying file handle is
    # released as soon as OCR finishes rather than whenever the object
    # happens to be garbage-collected.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image=image, lang=combined_languages)

def search_and_highlight(text: str, keyword: str) -> str:
    """Wrap every occurrence of *keyword* in *text* with ``<mark>`` tags.

    Matching is case-insensitive and the keyword is treated as literal
    text, not as a regular expression.

    Args:
        text: The text to search.
        keyword: The literal string to highlight. If empty (or otherwise
            falsy), *text* is returned unchanged.

    Returns:
        *text* with each match wrapped as ``<mark>match</mark>``; the
        original casing of each match is preserved.
    """
    if not keyword:
        return text
    # Escape the keyword so regex metacharacters typed by the user
    # (e.g. "(", "+", ".") are matched literally instead of raising
    # re.error or matching unintended text.
    pattern = f"({re.escape(keyword)})"
    return re.sub(pattern, r"<mark>\1</mark>", text, flags=re.IGNORECASE)

def ocr_and_search(filepath: str, keyword: str) -> str:
    """Run OCR on an uploaded image and optionally highlight a keyword.

    Args:
        filepath: Path of the image to process, or ``None`` when no image
            has been supplied.
        keyword: Text to highlight in the OCR result; may be empty.

    Returns:
        A prompt asking for an upload when *filepath* is ``None``;
        otherwise the OCR text, passed through ``search_and_highlight``
        when a keyword is given.
    """
    # Nothing to process until an image has been provided.
    if filepath is None:
        return "Please upload an image."

    # OCR always runs with the default English + Hindi languages.
    ocr_output = tesseract_ocr(filepath)

    # Without a keyword the raw OCR text is returned untouched.
    if not keyword:
        return ocr_output
    return search_and_highlight(ocr_output, keyword)

# Title and description passed to the Gradio interface.
title = "Tesseract OCR (English + Hindi)"
description = "Gradio demo for Tesseract with multi-language support (English and Hindi)."

# Connect ocr_and_search to an image input (delivered as a file path) and a
# keyword text box. The output type is 'html' so the <mark> tags produced by
# highlighting are displayed as markup.
demo = gr.Interface(
    title=title,
    description=description,
    fn=ocr_and_search,
    inputs=[
        gr.Image(type="filepath", label="Upload Image for OCR"),
        gr.Textbox(label="Keyword to Highlight", placeholder="Enter a keyword..."),
    ],
    outputs="html",
)

# Launch the demo only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
    print("Finished running")