"""Gradio demo: run Tesseract OCR on an uploaded image and highlight a keyword."""

import html
import re
from typing import List, Optional

import gradio as gr
import pytesseract
from PIL import Image


def perform_ocr(image, languages: Optional[List[str]] = None) -> str:
    """Return the text Tesseract recognises in *image*.

    Args:
        image: A PIL image (anything ``pytesseract.image_to_string`` accepts).
        languages: Optional Tesseract language codes, e.g. ``["jpn", "eng"]``.
            When empty or ``None``, pytesseract's default language is used.
    """
    # Tesseract expects multiple languages joined with '+', e.g. "jpn+eng".
    lang = "+".join(languages) if languages else None
    return pytesseract.image_to_string(image, lang=lang)


def tesseract_ocr(filepath: str, languages: List[str]):
    """Open the image at *filepath* and return its OCR text.

    Args:
        filepath: Path of the image file to read.
        languages: Tesseract language codes to recognise.
    """
    # The context manager closes the underlying file once OCR is done.
    with Image.open(filepath) as image:
        return perform_ocr(image, languages)


title = "Tesseract OCR"
description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine."
# NOTE(review): this text looks like it once carried HTML links that were
# stripped; the wording is kept as found -- confirm the intended markup.
article = """

Tesseract documentation | Github Repo

"""
examples = [
    ['examples/eurotext.png', ['eng']],
    ['examples/tesseract_sample.png', ['jpn', 'eng']],
    ['examples/chi.jpg', ['HanS', 'HanT']],
]


def search_and_highlight(text, keyword):
    """Return *text* as HTML with every occurrence of *keyword* highlighted.

    Matching is case-insensitive and literal: the keyword is escaped, so
    regex metacharacters typed by the user are matched as plain characters.
    All text is HTML-escaped, and each match is wrapped in ``<mark>``.

    Args:
        text: Plain text to search.
        keyword: Literal string to highlight; an empty keyword highlights
            nothing.
    """
    if not keyword:
        return html.escape(text)
    pattern = re.compile(f"({re.escape(keyword)})", flags=re.IGNORECASE)
    # Splitting on a single capture group yields alternating pieces:
    # even indexes are the text between matches, odd indexes are matches.
    pieces = pattern.split(text)
    return "".join(
        f"<mark>{html.escape(piece)}</mark>" if index % 2 else html.escape(piece)
        for index, piece in enumerate(pieces)
    )


def ocr_and_search(image, keyword):
    """Run OCR on *image* and return HTML, highlighting *keyword* if given.

    Args:
        image: PIL image from the upload component, or ``None`` when nothing
            was uploaded.
        keyword: Optional search term; falsy values skip highlighting.

    Returns:
        An HTML string for the output component, or a prompt message when no
        image was supplied.
    """
    if image is None:
        return "Please upload an image."
    extracted_text = perform_ocr(image)
    if keyword:
        rendered = search_and_highlight(extracted_text, keyword)
    else:
        rendered = html.escape(extracted_text)
    # The output component renders HTML, which collapses raw newlines.
    return rendered.replace("\n", "<br>")


iface = gr.Interface(
    fn=ocr_and_search,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Enter keyword to search (optional)"),
    ],
    outputs=gr.HTML(label="Extracted and Highlighted Text"),
    title="OCR and Keyword Search",
    description="Upload an image for OCR processing and search for keywords in the extracted text.",
)

# Launched at import time, as in the original script.
iface.launch()