Spaces:

Vanofuture
/

political_campaign

Runtime error

App Files Files Community

political_campaign / tools /ocr_video.py

unt2tled

init

86756d8 almost 2 years ago

raw

history blame

No virus

1.93 kB

	"""
	This module allows to extract texts from videos using OCR
	"""
	import easyocr
	import os
	import cv2
	import shutil
	import difflib
	import re
	from tools.video_tools import generate_frames

	CONF_THRESH = 0.9
	SIMILARITY_THRESH = 0.8

	def process_text(text):
	result = re.sub(r"[\n\"\[\]~;]", "", text)
	lst = result.split()
	s = ""
	for item in lst:
	item = item.strip()
	if len(item)!=1 or item == "a" or item == "I" or item == "i" or item == "A":
	s += " "+item
	if len(s)<6:
	s = ""
	return s

	def get_formated_text(texts_arr):
	res = ""
	for row in texts_arr:
	k = process_text(row.lower())
	if len(k) > 0:
	res += process_text(row.lower()) + ", "
	return res[:-2]

	def add_text(text_lst, text):
	for t in text_lst:
	similarity = difflib.SequenceMatcher(None, t, text).ratio()
	if similarity > SIMILARITY_THRESH:
	return
	text_lst.append(text)

	def retrieve_text(video_path, rate = 5, frames_path = "tmp_frames", show_print = True):
	texts_lst = []
	generate_frames(video_path, frames_path, rate = rate, show_print = show_print)
	ocr = easyocr.Reader(['en'])
	for i in os.listdir(frames_path):
	text = ocr.readtext(frames_path + "/" + i)
	for txt in text:
	# Threshold for confidence
	if txt[2] > CONF_THRESH:
	# Filter similar texts
	add_text(texts_lst, txt[1])
	# Delete temporary directory
	shutil.rmtree(frames_path)
	return texts_lst

	def retrieve_to_file(dest, video_path):
	text_lst = retrieve_text(video_path, rate = 2, show_print = False)
	file = open(dest, "w")
	file.writelines([line + "\n" for line in text_lst])
	file.close()

	def retrieve_to_files(dest, video_path):
	for file_name in os.listdir(video_path):
	retrieve_to_file(dest + "/" + os.path.splitext(file_name)[0] + "_text.txt", video_path + "/" + file_name)