Video-to-Multilingual-OCR

Runtime error

App Files Files Community

Video-to-Multilingual-OCR / app.py

stupidog04

Update app.py

32f9f47 over 1 year ago

raw

history blame

6.15 kB

	import numpy as np
	import PIL
	from PIL import Image, ImageDraw
	import gradio as gr
	import torch
	import easyocr
	import os
	from pathlib import Path
	import cv2
	import pandas as pd


	#torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/BeautyIsTruthTruthisBeauty.JPG', 'BeautyIsTruthTruthisBeauty.JPG')
	#torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/PleaseRepeatLouder.jpg', 'PleaseRepeatLouder.jpg')
	#torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/ProhibitedInWhiteHouse.JPG', 'ProhibitedInWhiteHouse.JPG')

	# Sample images fetched at import time, as (source URL, local file name) pairs.
	# The order matches the original sequence of download calls.
	_EXAMPLE_ASSETS = (
	    ('https://raw.githubusercontent.com/AaronCWacker/Yggdrasil/master/images/20-Books.jpg', '20-Books.jpg'),
	    ('https://github.com/JaidedAI/EasyOCR/raw/master/examples/english.png', 'COVID.png'),
	    ('https://github.com/JaidedAI/EasyOCR/raw/master/examples/chinese.jpg', 'chinese.jpg'),
	    ('https://github.com/JaidedAI/EasyOCR/raw/master/examples/japanese.jpg', 'japanese.jpg'),
	    ('https://i.imgur.com/mwQFd7G.jpeg', 'Hindi.jpeg'),
	)
	for _asset_url, _asset_name in _EXAMPLE_ASSETS:
	    torch.hub.download_url_to_file(_asset_url, _asset_name)

	def draw_boxes(image, bounds, color='yellow', width=2):
	    """Outline every detection in ``bounds`` on ``image`` and return the image.

	    Args:
	        image: Image handed to ``ImageDraw.Draw``; it is drawn on in place.
	        bounds: Iterable of detections whose first element unpacks into four
	            corner points, each an ``(x, y)`` pair.
	        color: Line colour passed to ``draw.line`` as ``fill``.
	        width: Line width passed to ``draw.line``.

	    Returns:
	        The same ``image`` object that was passed in.
	    """
	    draw = ImageDraw.Draw(image)
	    for bound in bounds:
	        p0, p1, p2, p3 = bound[0]
	        # Pass one flat [x, y, x, y, ...] sequence. The previous call mixed
	        # point pairs with the bare coordinates of p0, which is neither of the
	        # two coordinate layouts ImageDraw.line documents. Repeating p0 at the
	        # end closes the outline.
	        draw.line([*p0, *p1, *p2, *p3, *p0], fill=color, width=width)
	    return image

	def box_size(box):
	    """Return the area spanned by a detection's first and third corner points.

	    ``box[0]`` holds the corner points. When it does not contain exactly four
	    points the size is reported as 0.
	    """
	    corners = box[0]
	    if len(corners) != 4:
	        return 0
	    # Corners 0 and 2 are treated as the two ends of a diagonal.
	    (left, top), (right, bottom) = corners[0], corners[2]
	    return abs(left - right) * abs(top - bottom)

	def box_position(box):
	    """Return the ``(x, y)`` midpoint between a detection's first and third corners."""
	    corners = box[0]
	    first, third = corners[0], corners[2]
	    center_x = (first[0] + third[0]) / 2
	    center_y = (first[1] + third[1]) / 2
	    return center_x, center_y


	def inference(video, lang, time_step):
	    """OCR sampled frames of a video and log the text seen near anchor boxes.

	    Frames are sampled every ``int(fps * time_step)`` frames (at least every
	    frame). The first sampled frame in which the reader finds any text
	    supplies up to ``max_boxes`` anchors: the centres of its largest boxes as
	    ranked by ``box_size``. Each later sampled frame is read again; a
	    detection whose centre is closer than 50 (in box coordinates) to an anchor
	    is logged against the first such anchor as ``(seconds, text)``. The
	    sampled frames, with all detections outlined, are written to a video.

	    Side effects: writes ``results_temp.mp4`` and ``results.mp4`` in the
	    current directory, runs ``ffmpeg`` twice through the shell, and then
	    removes the temporary video and the ffmpeg two-pass log files.

	    Args:
	        video: Source handed to ``cv2.VideoCapture``.
	        lang: Language list handed to ``easyocr.Reader``.
	        time_step: Sampling interval in seconds.

	    Returns:
	        A tuple ``(output, df)``: ``output`` is the string ``'results.mp4'``
	        and ``df`` is a DataFrame with columns ``Box``, ``Time (s)`` and
	        ``Text``, one row per logged reading.
	    """
	    output = 'results.mp4'
	    max_boxes = 10
	    reader = easyocr.Reader(lang)
	    vidcap = cv2.VideoCapture(video)
	    frame_rate = vidcap.get(cv2.CAP_PROP_FPS)

	    # int(frame_rate * time_step) truncates to 0 when time_step is shorter
	    # than one frame interval, which made the modulo below raise
	    # ZeroDivisionError. Sampling every frame is the closest valid stride.
	    stride = max(1, int(frame_rate * time_step))

	    bounds = []
	    output_frames = []
	    success, frame = vidcap.read()
	    count = 0

	    # Find the anchor boxes. The old `count == 0` guard only ever read the
	    # very first frame; if it held no text the loop drained the whole video
	    # without running OCR again. Keep sampling until some text is found.
	    while success and not bounds:
	        if count % stride == 0:
	            bounds = reader.readtext(frame)
	        success, frame = vidcap.read()
	        count += 1

	    largest_boxes = sorted(bounds, key=box_size, reverse=True)[:max_boxes]
	    positions = [np.array(box_position(b)) for b in largest_boxes]
	    temporal_profiles = [[] for _ in largest_boxes]

	    # Match detections in later sampled frames to the anchors and store the
	    # text read by OCR.
	    # NOTE(review): the indentation of this loop was lost in the reviewed
	    # copy. Annotating and collecting the frame is assumed to happen only for
	    # sampled frames; confirm against the deployed file.
	    while success:
	        if count % stride == 0:
	            bounds = reader.readtext(frame)
	            for box in bounds:
	                center = np.array(box_position(box))
	                for i, position in enumerate(positions):
	                    if np.linalg.norm(center - position) < 50:
	                        temporal_profiles[i].append((count / frame_rate, box[1]))
	                        # Only the first anchor within range gets the reading.
	                        break
	            im = PIL.Image.fromarray(frame)
	            output_frames.append(np.array(draw_boxes(im, bounds)))
	        success, frame = vidcap.read()
	        count += 1

	    # The capture reports its properties as floats; the writer needs an
	    # integer frame size.
	    width = int(vidcap.get(cv2.CAP_PROP_FRAME_WIDTH))
	    height = int(vidcap.get(cv2.CAP_PROP_FRAME_HEIGHT))
	    fps = vidcap.get(cv2.CAP_PROP_FPS)

	    # Write the annotated frames to a temporary file first; ffmpeg re-encodes
	    # it into the final output below.
	    temp = f"{Path(output).stem}_temp{Path(output).suffix}"
	    output_video = cv2.VideoWriter(
	        temp, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
	    )
	    for annotated in output_frames:
	        output_video.write(annotated)
	    output_video.release()
	    vidcap.release()

	    # Compressing the video for smaller size and web compatibility.
	    os.system(
	        f"ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 1 -c:a aac -f mp4 /dev/null && ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 2 -c:a aac -movflags faststart {output}"
	    )
	    os.system(f"rm -rf {temp} ffmpeg2pass-0.log ffmpeg2pass-0.log.mbtree")

	    # Format temporal profiles as a DataFrame. DataFrame.append was removed
	    # in pandas 2.0, so collect the rows and build the frame once.
	    rows = []
	    for i, profile in enumerate(temporal_profiles):
	        for t, text in profile:
	            rows.append({"Box": f"Box {i+1}", "Time (s)": t, "Text": text})
	    df = pd.DataFrame(rows, columns=["Box", "Time (s)", "Text"])

	    return output, df



	# Text shown by the Gradio interface.
	title = '🖼️Video to Multilingual OCR👁️Gradio'
	description = 'Multilingual OCR which works conveniently on all devices in multiple languages.'
	article = "<p style='text-align: center'></p>"

	# Example rows as [file name, language list].
	# Previously listed and disabled:
	# ['PleaseRepeatLouder.jpg',['ja']],['ProhibitedInWhiteHouse.JPG',['en']],['BeautyIsTruthTruthisBeauty.JPG',['en']],
	examples = [
	    ['20-Books.jpg', ['en']],
	    ['COVID.png', ['en']],
	    ['chinese.jpg', ['ch_sim', 'en']],
	    ['japanese.jpg', ['ja', 'en']],
	    ['Hindi.jpeg', ['hi', 'en']],
	]

	# Custom CSS passed to the interface.
	css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"

	# Language codes offered in the language checkbox group.
	choices = ["ch_sim", "ch_tra", "de", "en", "es", "ja", "hi", "ru"]


	# Input widgets, in the order `inference` receives them:
	# video, language list, time step.
	input_components = [
	    # gr.inputs.Image(type='file', label='Input Image'),
	    gr.inputs.Video(label='Input Video'),
	    gr.inputs.CheckboxGroup(choices, type="value", default=['en'], label='Language'),
	    gr.inputs.Number(label='Time Step (in seconds)', default=1.0),
	]

	# Output widgets, in the order `inference` returns them:
	# annotated video, then the table of readings.
	output_components = [
	    gr.outputs.Video(label='Output Video'),
	    gr.outputs.Dataframe(headers=['Box', 'Time (s)', 'Text']),
	]

	demo = gr.Interface(
	    fn=inference,
	    inputs=input_components,
	    outputs=output_components,
	    title=title,
	    description=description,
	    article=article,
	    # examples=examples,
	    css=css,
	    enable_queue=True,
	)
	demo.launch(debug=True)