Spaces:

Aumkeshchy2003
/

Gradio-OCR

Sleeping

App Files Files Community

Gradio-OCR / app.py

Aumkeshchy2003

Update app.py

6916c84 verified 4 months ago

raw

history blame

1.82 kB

	from typing import List
	import pytesseract
	from PIL import Image
	import re
	import gradio as gr

	def tesseract_ocr(filepath: str) -> str:
	    """Extract text from an image file using Tesseract OCR.

	    The image is recognised with the English and Hindi language models
	    combined (``eng+hin``).

	    Args:
	        filepath: Path of the image file to run OCR on.

	    Returns:
	        The text returned by ``pytesseract.image_to_string`` for the image.
	    """
	    # English and Hindi are always requested together.
	    combined_languages = 'eng+hin'
	    # Open the image in a context manager so its file handle is released
	    # as soon as OCR finishes rather than whenever the object is collected.
	    with Image.open(filepath) as image:
	        return pytesseract.image_to_string(image=image, lang=combined_languages)

	def search_and_highlight(text: str, keyword: str) -> str:
	    """Wrap every occurrence of ``keyword`` in ``text`` with ``<mark>`` tags.

	    Matching is case-insensitive and literal: the keyword is treated as
	    plain text, never as a regular expression.

	    Args:
	        text: The text to search in.
	        keyword: The phrase to highlight. An empty keyword leaves the text
	            unchanged.

	    Returns:
	        ``text`` with each match surrounded by ``<mark>...</mark>``, keeping
	        the original casing of the matched text.
	    """
	    if not keyword:
	        return text
	    # Escape the keyword so characters such as "+", "." or "(" are matched
	    # literally instead of being interpreted as regex syntax (which could
	    # match the wrong text or raise re.error on an unbalanced pattern).
	    literal_pattern = re.escape(keyword)
	    # \g<0> re-inserts the matched text exactly as it appeared in ``text``.
	    return re.sub(literal_pattern, r"<mark>\g<0></mark>", text, flags=re.IGNORECASE)

	def ocr_and_search(filepath: str, keyword: str) -> str:
	    """Run OCR on an uploaded image and optionally highlight a keyword.

	    Args:
	        filepath: Path of the uploaded image, or ``None`` when nothing has
	            been uploaded.
	        keyword: Text to highlight in the OCR result; when empty, the OCR
	            text is returned as-is.

	    Returns:
	        A prompt asking for an image when ``filepath`` is ``None``;
	        otherwise the OCR text, passed through ``search_and_highlight``
	        when a keyword was given.
	    """
	    # Guard: nothing to process without an image.
	    if filepath is None:
	        return "Please upload an image."
	    ocr_output = tesseract_ocr(filepath)
	    # Only run the highlighter when there is something to look for.
	    return search_and_highlight(ocr_output, keyword) if keyword else ocr_output

	# Gradio interface: an image upload plus a keyword box feed ocr_and_search,
	# and the returned string is rendered as HTML.
	description = "Gradio demo for Tesseract with multi-language support (English and Hindi)."
	title = "Tesseract OCR (English + Hindi)"
	demo = gr.Interface(
	    title=title,
	    description=description,
	    fn=ocr_and_search,
	    inputs=[
	        # The image is handed to the callback as a file path.
	        gr.Image(type="filepath", label="Upload Image for OCR"),
	        gr.Textbox(label="Keyword to Highlight", placeholder="Enter a keyword..."),
	    ],
	    # HTML output so the <mark> tags around matches are displayed as highlights.
	    outputs='html',
	)

	if __name__ == '__main__':
	    # Start the Gradio app only when this file is executed as a script,
	    # not when it is imported as a module.
	    demo.launch()
	    # Reached once demo.launch() has returned.
	    print("Finished running")