Spaces:

Aumkeshchy2003
/

Gradio-OCR

Sleeping

Gradio-OCR / app.py

Update app.py

814690d verified 4 months ago

1.34 kB

	from typing import List
	import pytesseract
	from PIL import Image
	import gradio as gr
	import re

	def tesseract_ocr_with_search(filepath: str, languages: List[str], keyword: str) -> str:
	    """Run Tesseract OCR on an image and return the text as an HTML fragment.

	    Args:
	        filepath: Path of the image to read. A falsy value (no image
	            supplied) yields an empty result instead of an error.
	        languages: Tesseract language codes to recognise with. When empty,
	            Tesseract's own default language is used.
	        keyword: Text to highlight. Matching is case-insensitive and
	            literal (regex metacharacters are not interpreted). A falsy
	            value disables highlighting.

	    Returns:
	        An HTML fragment holding the HTML-escaped OCR text, with every
	        keyword occurrence wrapped in ``<mark>`` tags. Empty string when no
	        image was supplied.
	    """
	    import html  # local import so this block does not depend on file-level edits

	    if not filepath:
	        return ""

	    # Tesseract's -l option expects codes joined with '+', e.g. "eng+fra".
	    lang = "+".join(languages) if languages else None

	    # The context manager releases the image file handle once OCR is done.
	    with Image.open(filepath) as image:
	        extracted_text = pytesseract.image_to_string(image=image, lang=lang)

	    if keyword:
	        # Splitting on a capturing group yields alternating non-match/match
	        # pieces (matches sit at odd indices). Escaping each piece on its
	        # own keeps the <mark> tags intact and prevents a keyword from
	        # matching inside an HTML entity such as "&amp;".
	        pieces = re.split(f"({re.escape(keyword)})", extracted_text, flags=re.IGNORECASE)
	        body = "".join(
	            f"<mark>{html.escape(piece)}</mark>" if index % 2 else html.escape(piece)
	            for index, piece in enumerate(pieces)
	        )
	    else:
	        body = html.escape(extracted_text)

	    # pre-wrap keeps the OCR line breaks, which plain HTML would collapse.
	    return f'<div style="white-space: pre-wrap;">{body}</div>'

	# Fetch available languages for Tesseract
	language_choices = pytesseract.get_languages()

	# Preselect English when it is installed; otherwise fall back to the first
	# available language so the default is always one of the offered choices.
	default_languages = ['eng'] if 'eng' in language_choices else language_choices[:1]

	# Interface texts. The original call referenced title/description/article/
	# examples names that were never defined, which raised NameError at import.
	# No article text or example inputs exist in this file, so those optional
	# arguments are left at their defaults.
	APP_TITLE = "Tesseract OCR with Keyword Search"
	APP_DESCRIPTION = (
	    "Upload an image, choose the OCR language(s), and optionally enter a "
	    "keyword to highlight in the extracted text."
	)

	# Define Gradio Interface
	demo = gr.Interface(
	    fn=tesseract_ocr_with_search,
	    inputs=[
	        gr.Image(type="filepath", label="Upload Image"),  # Input for image upload
	        gr.CheckboxGroup(language_choices, type="value", value=default_languages, label='Language'),  # Language selection
	        gr.Textbox(placeholder="Enter keyword to search", label="Keyword Search"),  # Keyword input
	    ],
	    outputs=gr.HTML(),  # Use HTML output to allow text highlighting
	    title=APP_TITLE,
	    description=APP_DESCRIPTION,
	)

	def _run_app() -> None:
	    """Launch the Gradio demo and print a message after launch() returns."""
	    demo.launch()
	    print("Finished running")


	# Only start the app when this file is executed as a script.
	if __name__ == "__main__":
	    _run_app()