political_campaign / tools /ocr_video.py
unt2tled
init
86756d8
raw
history blame
No virus
1.93 kB
"""
This module extracts text from videos using OCR.
"""
import easyocr
import os
import cv2
import shutil
import difflib
import re
from tools.video_tools import generate_frames
# Minimum OCR confidence (element 2 of each readtext result) a detection must
# exceed to be kept by retrieve_text.
CONF_THRESH = 0.9
# difflib.SequenceMatcher ratio above which add_text treats a new text as a
# duplicate of one already collected.
SIMILARITY_THRESH = 0.8
def process_text(text):
    """Clean a raw text fragment and drop it if too little remains.

    The characters newline, double quote, square brackets, ``~`` and ``;`` are
    removed (not replaced by a space, so words separated only by a newline are
    merged). The remainder is split on whitespace, and single-character tokens
    are discarded unless they are ``a``, ``A``, ``i`` or ``I``.

    Args:
        text: The raw string to clean.

    Returns:
        The kept tokens, each prefixed with a single space (so a non-empty
        result always starts with a space), or ``""`` when that string is
        shorter than 6 characters.
    """
    cleaned = re.sub(r"[\n\"\[\]~;]", "", text)
    # split() with no argument already strips surrounding whitespace from
    # every token, so no extra strip() is needed.
    kept = [
        word
        for word in cleaned.split()
        if len(word) != 1 or word in ("a", "A", "i", "I")
    ]
    # Each word keeps its leading space to match the historical output format.
    joined = "".join(" " + word for word in kept)
    # The length check includes the leading space.
    return joined if len(joined) >= 6 else ""
def get_formated_text(texts_arr):
    """Join the cleaned, lower-cased rows into one comma-separated string.

    Each row is lower-cased and passed through ``process_text``; rows that
    come back empty are skipped.

    Args:
        texts_arr: Iterable of strings to clean and combine.

    Returns:
        The non-empty cleaned rows joined by ``", "``, or ``""`` when no row
        survives cleaning.
    """
    # Clean each row exactly once and reuse the result for both the emptiness
    # check and the output.
    cleaned_rows = (process_text(row.lower()) for row in texts_arr)
    return ", ".join(cleaned for cleaned in cleaned_rows if cleaned)
def add_text(text_lst, text):
    """Append ``text`` to ``text_lst`` unless a similar entry is already there.

    Two strings count as similar when their ``difflib.SequenceMatcher`` ratio
    is strictly greater than ``SIMILARITY_THRESH``. The list is modified in
    place and nothing is returned.

    Args:
        text_lst: List of already collected strings; mutated in place.
        text: Candidate string to add.
    """
    # any() stops at the first sufficiently similar entry.
    already_present = any(
        difflib.SequenceMatcher(None, existing, text).ratio() > SIMILARITY_THRESH
        for existing in text_lst
    )
    if not already_present:
        text_lst.append(text)
def retrieve_text(video_path, rate=5, frames_path="tmp_frames", show_print=True):
    """Run OCR over frames of a video and collect the distinct texts found.

    ``generate_frames`` is called first and is expected to fill
    ``frames_path`` with frame images (its exact behaviour is defined in
    ``tools.video_tools`` -- confirm there). Every entry of that directory is
    then read with an English EasyOCR reader; detections whose confidence
    exceeds ``CONF_THRESH`` are added through ``add_text``, which skips
    near-duplicates.

    Args:
        video_path: Path of the video, forwarded to ``generate_frames``.
        rate: Forwarded to ``generate_frames`` as ``rate``.
        frames_path: Directory holding the temporary frames. It is deleted
            before this function returns or raises.
        show_print: Forwarded to ``generate_frames`` as ``show_print``.

    Returns:
        List of the distinct high-confidence text strings.
    """
    texts_lst = []
    try:
        generate_frames(video_path, frames_path, rate=rate, show_print=show_print)
        ocr = easyocr.Reader(['en'])
        # os.listdir returns entries in arbitrary order; sorting makes the
        # de-duplication result reproducible between runs.
        # NOTE(review): this is lexicographic order, which matches frame order
        # only if generate_frames zero-pads its file names -- confirm.
        for frame_name in sorted(os.listdir(frames_path)):
            detections = ocr.readtext(os.path.join(frames_path, frame_name))
            for detection in detections:
                # detection[1] is the recognised text, detection[2] its confidence.
                if detection[2] > CONF_THRESH:
                    # Filter similar texts
                    add_text(texts_lst, detection[1])
    finally:
        # Delete the temporary directory even when frame generation or OCR
        # fails. The existence check avoids masking the original error when
        # the directory was never created.
        if os.path.isdir(frames_path):
            shutil.rmtree(frames_path)
    return texts_lst
def retrieve_to_file(dest, video_path):
    """Extract the texts of one video and write them to ``dest``, one per line.

    Texts are obtained from ``retrieve_text`` with ``rate=2`` and
    ``show_print=False``. An existing file at ``dest`` is overwritten.

    Args:
        dest: Path of the text file to write.
        video_path: Path of the video to process.
    """
    text_lst = retrieve_text(video_path, rate=2, show_print=False)
    # The context manager closes the file even if writing raises.
    with open(dest, "w") as out_file:
        out_file.writelines(line + "\n" for line in text_lst)
def retrieve_to_files(dest, video_path):
    """Extract texts for every entry of a directory of videos.

    For each entry ``<name>.<ext>`` in ``video_path`` a file
    ``<name>_text.txt`` is written into ``dest`` via ``retrieve_to_file``.

    Args:
        dest: Output directory; created if it does not exist yet.
        video_path: Directory whose entries are processed as videos.
    """
    # Create the output directory up front so a missing directory cannot
    # discard the OCR work already done for the first video.
    if dest:
        os.makedirs(dest, exist_ok=True)
    for file_name in os.listdir(video_path):
        stem = os.path.splitext(file_name)[0]
        retrieve_to_file(
            os.path.join(dest, stem + "_text.txt"),
            os.path.join(video_path, file_name),
        )