unt2tled committed
Commit
86756d8
1 Parent(s): 6018cfd
.gitattributes CHANGED
@@ -1,31 +1,2 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Auto detect text files and perform LF normalization
+ * text=auto
.gitignore ADDED
The diff for this file is too large to render. See raw diff
 
Demo.py ADDED
@@ -0,0 +1,62 @@
+ """
+ Demo UI page
+ """
+ import streamlit as st
+ #import tools.ocr_video as ocr
+ import os
+ import shutil
+ import uuid
+ from model_loader import HFPretrainedModel
+ from transformers import pipeline
+ import torch
+
+ @st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
+ def load_sentiment_model():
+     return pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")
+
+ @st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
+ def load_campaign_model():
+     return HFPretrainedModel("distilbert-base-uncased", "deano/political-campaign-analysis-110922")
+
+ if "session_id" not in st.session_state:
+     st.session_state["session_id"] = uuid.uuid1()
+
+ # Temporary folder path
+ TMP_PATH = "tmp-{"+str(st.session_state["session_id"])+"}/"
+
+ st.title("Demo page")
+ st.markdown("""Upload the US political campaign video to predict its orientation (base/center).""")
+ video_file = st.file_uploader("Choose the US political campaign video", type=["wmv", "avi", "mov"], disabled=True)
+ text = st.text_input("Transcript of the video", "")
+ b = st.button("Predict")
+ if b:
+     st.markdown("""---""")
+     status_bar = st.progress(0)
+     upload_cap = st.caption("Uploading video...")
+     #if os.path.isdir(TMP_PATH):
+     #    shutil.rmtree(TMP_PATH)
+     #os.mkdir(TMP_PATH)
+     #with open(TMP_PATH+"uploaded_video_tmp", "wb") as f:
+     #    f.write(video_file.getbuffer())
+     status_bar.progress(50)
+     #upload_cap.caption("Extracting text from frames... (can take some time)")
+     #text_ocr = ocr.get_formated_text(ocr.retrieve_text(TMP_PATH+"uploaded_video_tmp", frames_path = "tmp_frames-{"+str(st.session_state["session_id"])+"}", show_print = False))
+     upload_cap.caption("Extracting text sentiment...")
+     sentiment_analysis = load_sentiment_model()
+     text_sentiment = sentiment_analysis(text)[0]["label"]
+     status_bar.progress(80)
+
+     #shutil.rmtree(TMP_PATH)
+     status_bar.progress(90)
+     upload_cap.caption("Prediction...")
+     model = load_campaign_model()
+     #query_dict = {"text": [text], "text_ocr": [text_ocr]}
+     query_dict = {"text": [text], "label_sentiment": [text_sentiment]}
+     # Predicted confidence for each label
+     conf = model.predict(query_dict)
+     col1, col2 = st.columns(2)
+     col1.metric("Base", "{:.2f}".format(conf[1].item()*100)+"%", "")
+     col2.metric("Center", "{:.2f}".format(conf[0].item()*100)+"%", "")
+
+     status_bar.progress(100)
+     upload_cap.caption("Done")
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 unt2tled
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,19 @@
- ---
- title: Political Campaign
- emoji: 📊
- colorFrom: gray
- colorTo: red
- sdk: streamlit
- sdk_version: 1.10.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Political Campaign Project
+ Deep learning pipelines to predict the target audience of political messages.
+ ## About
+ The goal of this project is to present a machine learning approach for classifying US political campaign videos from different years by target audience (base/center). The classification is done by extracting different features from the videos (e.g., speech-to-text, visual data) and training a neural network. More details can be found in the related [paper](https://drive.google.com/file/d/1-o9UVRRV7XRlGGBsYUfOkmch2ai-A2Fg/view?usp=sharing).
+ ## Navigation
+ ### Dataset
+ Datasets, including extracted features, tagging files, and political campaign videos to train on, can be found [here](https://drive.google.com/drive/folders/1-7rkd_SozNGLrNHXnEZ0iTKqO9ztKhiU?usp=sharing).
+ ### Feature extraction
+ All the code used for feature extraction is in the */tools* directory.
+ ### Analysis
+ Code for model analysis is in the */analysis* directory.
+ ### Training the model
+ To train the model, use [this](https://colab.research.google.com/drive/1ceVEWRAkIQJsOGuMxmG2qvPY3huZf8gc?usp=sharing) Google Colab notebook. [This](https://colab.research.google.com/drive/1MH19zWCCqQFTKidT5qq6pIPbmsdyuAIp?usp=sharing) notebook is used to make predictions with the pre-trained model.
+ ### Demo
+ An example UI for a pre-trained model with ~80% test accuracy, using speech-to-text and on-screen text features, can be found [here](https://unt2tled-political-campaign-project-demo-6gbfbd.streamlitapp.com/) or by cloning the repository and running the following from the project's root:
+ ```
+ pip install streamlit
+ streamlit run Demo.py
+ ```
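For quick experimentation without the Streamlit UI, the same prediction flow can also be run from a plain Python script. The sketch below mirrors what Demo.py does, assuming the packages in requirements.txt are installed and the `deano/political-campaign-analysis-110922` checkpoint is accessible; the transcript string is a made-up placeholder.

```
from transformers import pipeline
from model_loader import HFPretrainedModel

# Sentiment of the transcript is one of the features the classifier expects (see Demo.py)
sentiment_analysis = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")
text = "Example transcript of a campaign ad."  # hypothetical input
text_sentiment = sentiment_analysis(text)[0]["label"]

# Load the fine-tuned classifier and predict Base/Center confidences
model = HFPretrainedModel("distilbert-base-uncased", "deano/political-campaign-analysis-110922")
conf = model.predict({"text": [text], "label_sentiment": [text_sentiment]})
print("Base: {:.2f}%".format(conf[1].item() * 100))    # index 1 is shown as "Base" in Demo.py
print("Center: {:.2f}%".format(conf[0].item() * 100))  # index 0 is shown as "Center" in Demo.py
```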
analysis/linguistic_analysis.py ADDED
@@ -0,0 +1,47 @@
+ """
+ This module contains methods for linguistic analysis of texts
+ """
+ import csv
+ import re
+ import matplotlib.pyplot as plt
+
+ def count_avg_questions(path):
+     x = []
+     y = ([], [], [])
+     with open(path, "r") as tags_file:
+         csv_reader = csv.reader(tags_file)
+         next(csv_reader)
+         counter = [0, 0, 0]
+         counter_total = [0, 0, 0]
+         for i, row in enumerate(csv_reader):
+             x.append(i)
+             y[0].append(0)
+             y[1].append(0)
+             y[2].append(0)
+             text = row[1]
+             counter_total[int(row[2])] += 1
+             counter[int(row[2])] += len(re.findall(r"\?", text))
+             y[int(row[2])][-1] = len(re.findall(r"\?", text))
+     plt.plot(x, y[0])
+     #plt.plot(x, y[1])
+     plt.plot(x, y[2])
+     print(y[2])
+     plt.show()
+     return [(counter[i]/counter_total[i]) for i in range(len(counter))]
+
+ def count_pronouns(path):
+     with open(path, "r") as tags_file:
+         csv_reader = csv.reader(tags_file)
+         next(csv_reader)
+         counter = [0, 0, 0]
+         counter_total = [0, 0, 0]
+         for row in csv_reader:
+             text = row[1]
+             counter_total[int(row[2])] += 1
+             #pattern = "(he)|(she)|(her)|(his)|(them)|(they)|(their)"
+             pattern = "(Obama)"
+             counter[int(row[2])] += len(re.findall(pattern, text, re.IGNORECASE))
+     return [(counter[i]/counter_total[i]) for i in range(len(counter))]
+
+ print(count_avg_questions("tags.csv"))
+ print(count_pronouns("tags.csv"))
analysis/words_decision_tree.py ADDED
@@ -0,0 +1,56 @@
+ """
+ This module contains methods for word classification using decision trees
+ """
+ from __future__ import print_function
+ import os
+ import subprocess
+ import pandas as pd
+ import numpy as np
+ from sklearn.tree import DecisionTreeClassifier, plot_tree
+ import graphviz
+
+ # ref: http://chrisstrelioff.ws/sandbox/2015/06/08/decision_trees_in_python_with_scikit_learn_and_pandas.html
+
+ input_file_path = 'text_words_labels.csv'
+
+ def get_data(input_file_path):
+     df = pd.read_csv(input_file_path)
+     return df
+
+ def encode_target(df, target_column):
+     """Add column to df with integers for the target.
+
+     Args
+     ----
+     df -- pandas DataFrame.
+     target_column -- column to map to int, producing
+         new Target column.
+
+     Returns
+     -------
+     df_mod -- modified DataFrame.
+     targets -- list of target names.
+     """
+     df_mod = df.copy()
+     targets = df_mod[target_column].unique()
+     map_to_int = {name: n for n, name in enumerate(targets)}
+     df_mod["target"] = df_mod[target_column].replace(map_to_int)
+
+     return (df_mod, targets)
+
+ df = get_data(input_file_path)
+ df2, targets = encode_target(df, "target")
+ print("* df2.head()", df2[["target", "name"]].head(),
+       sep="\n", end="\n\n")
+ print("* df2.tail()", df2[["target", "name"]].tail(),
+       sep="\n", end="\n\n")
+ print("* targets", targets, sep="\n", end="\n\n")
+
+ features = [c for c in df2.columns.values if c != 'name' and c != 'isdefinite' and c != 'target']
+
+ y = df2["target"]
+ X = df2[features]
+ dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
+ dt.fit(X, y)
+
+ plot_tree(dt, max_depth=3)
analysis/words_distributions.py ADDED
@@ -0,0 +1,71 @@
+ """
+ This module contains classes and methods for words distribution analysis
+ """
+ import pandas as pd
+ import numpy as np
+
+ class WordsDistributionClass:
+     ''' This class is for creating a dataframe with the frequencies
+     of the words in the text column of the input file, in addition
+     to the file's original columns. '''
+     def __init__(self, input_file_path, output_file_path, text_column='text'):
+         self.input_file_path = input_file_path
+         self.output_file_path = output_file_path
+         self.text_column = text_column
+     def initialize_data(self):
+         # read dataframe from the input CSV file path
+         self.df = pd.read_csv(self.input_file_path, encoding='cp1255')
+         # add frequencies of the words in the text column as columns
+         # for the dataframe which was previously read
+         # Impl. note: all_words is a dictionary for the words' frequencies
+         # to be used during the calculation. It's a local variable.
+         # for word in all_words.keys():
+         #     all_words[word] == # videos which contain word
+         #     as part of the text in the text column
+         all_words = {}
+         self.df['freq'] = self.df.apply(lambda x:
+             WordsDistributionClass.get_words_freq_in_text(x[self.text_column], all_words), axis=1)
+         for word in all_words.keys():
+             if all_words[word] >= 10:
+                 self.df['freq_'+word] = self.df.apply(lambda x:
+                     0 if word not in x['freq'].keys() else x['freq'][word], axis=1)
+         del all_words
+         del self.df['freq']
+
+     def get_words_freq_in_text(text, all_words):
+         # static public function
+         freq = {}
+         # our calculation is not sensitive to letter case
+         text = text.lower()
+         # our calculation is not sensitive to the characters: ";", ",", "."
+         # NOTE: we are sensitive to other characters, including question marks
+         # and '"', "'" etc.
+         text = text.replace(";", "")
+         text = text.replace(",", "")
+         text = text.replace(".", "")
+         words = text.split(" ")
+         # algorithm for assigning words distribution
+         # for given all_words dictionary
+         for word in words:
+             if word not in all_words:
+                 all_words[word] = 0
+             if word not in freq.keys():
+                 freq[word] = 1
+                 all_words[word] += 1
+             else:
+                 freq[word] += 1
+         return freq
+
+     def save_output(self):
+         # export dataframe to output CSV file path
+         self.df.to_csv(self.output_file_path, index=False)
+
+ if __name__ == "__main__":
+     # Arguments
+     INPUT_FILE_NAME = 'tagging_MMD_db_with_face_sentiment_extracted.csv'
+     OUTPUT_FILE_NAME = 'tagging_MMD_db_with_face_sentiment_extracted_and_words_distributions.csv'
+
+     # Run WordsDistributionClass on the given input
+     wdc = WordsDistributionClass(INPUT_FILE_NAME, OUTPUT_FILE_NAME)
+     wdc.initialize_data()
+     wdc.save_output()
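A minimal sketch of the bookkeeping above, as a hypothetical interactive session run from the *analysis* directory: `freq` counts occurrences of a word within one text, while `all_words` counts how many texts contain the word at least once.

```
from words_distributions import WordsDistributionClass

all_words = {}
freq1 = WordsDistributionClass.get_words_freq_in_text("Vote early, vote often.", all_words)
freq2 = WordsDistributionClass.get_words_freq_in_text("Vote for jobs.", all_words)
print(freq1)      # {'vote': 2, 'early': 1, 'often': 1}
print(freq2)      # {'vote': 1, 'for': 1, 'jobs': 1}
print(all_words)  # {'vote': 2, 'early': 1, 'often': 1, 'for': 1, 'jobs': 1}
```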
analysis/words_distributions.xlsx ADDED
Binary file (826 kB).
 
model_loader.py ADDED
@@ -0,0 +1,38 @@
+ """
+ This module contains loaders for models that predict political campaign orientation (base/center)
+ """
+ from transformers import AutoTokenizer
+ from datasets import Dataset
+ from transformers import AutoModelForSequenceClassification, Trainer
+ from datasets import load_metric
+ import pandas as pd
+ import numpy as np
+ import torch
+ from torch.nn.functional import softmax
+
+ HF_TOKEN = "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"
+
+ class HFPretrainedModel:
+     def __init__(self, lang_model_name: str, checkpoint: str):
+         self.lang_model_name = lang_model_name
+         self.checkpoint = checkpoint
+         self.init_tokenizer()
+         self.init_config()
+     @staticmethod
+     def compute_metrics(eval_pred):
+         logits, labels = eval_pred
+         metric = load_metric("accuracy")
+         predictions = np.argmax(logits, axis=-1)
+         return metric.compute(predictions=predictions, references=labels)
+     def init_tokenizer(self):
+         self.tokenizer = AutoTokenizer.from_pretrained(self.lang_model_name)
+     def init_config(self):
+         self.model = AutoModelForSequenceClassification.from_pretrained(self.checkpoint, use_auth_token=HF_TOKEN, num_labels=2)
+         self.trainer = Trainer(model=self.model, tokenizer=self.tokenizer, compute_metrics=HFPretrainedModel.compute_metrics)
+     def predict(self, data: dict):
+         # Build dataset with one row
+         data_to_predict = Dataset.from_dict(data)
+         tokenized_ds = data_to_predict.map(lambda examples: self.tokenizer([examples[text_feature] if examples[text_feature] is not None else '' for text_feature in data.keys()], is_split_into_words=True, truncation=True))
+         predictions = self.trainer.predict(tokenized_ds)
+         pred_tensor = torch.tensor(predictions.predictions[0])
+         return softmax(pred_tensor, dim=0)
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ datasets
+ numpy
+ pandas
+ torch
tools/__init__.py ADDED
File without changes
tools/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (171 Bytes).
 
tools/__pycache__/ocr_video.cpython-38.pyc ADDED
Binary file (2.32 kB).
 
tools/__pycache__/video_tools.cpython-38.pyc ADDED
Binary file (748 Bytes).
 
tools/facial_features.py ADDED
@@ -0,0 +1,67 @@
+ """
+ This module extracts facial features from videos
+ """
+ import os
+ import shutil
+ from retinaface import RetinaFace
+ from deepface import DeepFace
+ import json
+ from video_tools import generate_frames
+
+ FRAMES_PATH = "tmp_frames_faces"
+
+ def retrieve_faces_data(video_path, rate = 50, show_print = True):
+     faces_lst = []
+     generate_frames(video_path, FRAMES_PATH, rate = rate, show_print = show_print)
+     for i in sorted([int(s[:-4]) for s in os.listdir(FRAMES_PATH)]):
+         faces = RetinaFace.extract_faces(FRAMES_PATH + "/" + str(i) + ".png")
+         data_lst = []
+         for face in faces:
+             try:
+                 face_dict = DeepFace.analyze(face, actions = ["emotion"], detector_backend = "skip")
+                 data_lst.append(face_dict["emotion"])
+             except ValueError:
+                 # Face was not detected
+                 continue
+         faces_lst.append(data_lst)
+     # Delete temporary directory
+     #shutil.rmtree(FRAMES_PATH)
+     return faces_lst
+
+ def retrieve_to_file(dest, video_path):
+     face_data = retrieve_faces_data(video_path, show_print = False)
+     with open(dest, "w") as output_file:
+         output_file.writelines([json.dumps(item) + "\n" for item in face_data])
+
+ def retrieve_to_files(dest, video_path):
+     for file_name in os.listdir(video_path):
+         retrieve_to_file(dest + "/" + os.path.splitext(file_name)[0] + "_data", video_path + "/" + file_name)
+
+ def restore_from_file(file_path):
+     restored_lst = []
+     with open(file_path, "r") as file:
+         for line in file.readlines():
+             if line != "":
+                 restored_lst.append(eval(line))
+     return restored_lst
+
+ def data_to_vector(data):
+     vec = []
+     for frame in data:
+         avg = [0, 0, 0, 0, 0, 0, 0]
+         for face in frame:
+             avg[0] += face["angry"]
+             avg[1] += face["disgust"]
+             avg[2] += face["fear"]
+             avg[3] += face["happy"]
+             avg[4] += face["sad"]
+             avg[5] += face["surprise"]
+             avg[6] += face["neutral"]
+         if len(frame) != 0:
+             for i in range(7):
+                 avg[i] /= len(frame)
+         vec.append(avg)
+     return vec
+
+ if __name__ == "__main__":
+     retrieve_to_files("x", "result")
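A short sketch of how the saved per-frame emotion data is meant to be consumed, assuming retrieve_to_file has already written a hypothetical "campaign_ad_data" file:

```
from facial_features import restore_from_file, data_to_vector

# "campaign_ad_data" is a hypothetical output of retrieve_to_file()
frames = restore_from_file("campaign_ad_data")
vectors = data_to_vector(frames)
# One averaged 7-dim emotion vector per sampled frame:
# [angry, disgust, fear, happy, sad, surprise, neutral]
for vec in vectors[:3]:
    print(vec)
```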
tools/ocr_video.py ADDED
@@ -0,0 +1,65 @@
+ """
+ This module extracts text from videos using OCR
+ """
+ import easyocr
+ import os
+ import cv2
+ import shutil
+ import difflib
+ import re
+ from tools.video_tools import generate_frames
+
+ CONF_THRESH = 0.9
+ SIMILARITY_THRESH = 0.8
+
+ def process_text(text):
+     result = re.sub(r"[\n\"\[\]~;]", "", text)
+     lst = result.split()
+     s = ""
+     for item in lst:
+         item = item.strip()
+         if len(item) != 1 or item == "a" or item == "I" or item == "i" or item == "A":
+             s += " " + item
+     if len(s) < 6:
+         s = ""
+     return s
+
+ def get_formated_text(texts_arr):
+     res = ""
+     for row in texts_arr:
+         k = process_text(row.lower())
+         if len(k) > 0:
+             res += process_text(row.lower()) + ", "
+     return res[:-2]
+
+ def add_text(text_lst, text):
+     for t in text_lst:
+         similarity = difflib.SequenceMatcher(None, t, text).ratio()
+         if similarity > SIMILARITY_THRESH:
+             return
+     text_lst.append(text)
+
+ def retrieve_text(video_path, rate = 5, frames_path = "tmp_frames", show_print = True):
+     texts_lst = []
+     generate_frames(video_path, frames_path, rate = rate, show_print = show_print)
+     ocr = easyocr.Reader(['en'])
+     for i in os.listdir(frames_path):
+         text = ocr.readtext(frames_path + "/" + i)
+         for txt in text:
+             # Threshold for confidence
+             if txt[2] > CONF_THRESH:
+                 # Filter similar texts
+                 add_text(texts_lst, txt[1])
+     # Delete temporary directory
+     shutil.rmtree(frames_path)
+     return texts_lst
+
+ def retrieve_to_file(dest, video_path):
+     text_lst = retrieve_text(video_path, rate = 2, show_print = False)
+     file = open(dest, "w")
+     file.writelines([line + "\n" for line in text_lst])
+     file.close()
+
+ def retrieve_to_files(dest, video_path):
+     for file_name in os.listdir(video_path):
+         retrieve_to_file(dest + "/" + os.path.splitext(file_name)[0] + "_text.txt", video_path + "/" + file_name)
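The OCR path is currently commented out in Demo.py; when enabled, it is used roughly as in the sketch below (the video path is a hypothetical placeholder, and easyocr must be installed):

```
import tools.ocr_video as ocr

# Sample frames from the video, read on-screen text, and join the unique snippets
texts = ocr.retrieve_text("uploaded_video.mov", frames_path="tmp_frames", show_print=False)
text_ocr = ocr.get_formated_text(texts)
print(text_ocr)
```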
tools/text_sentiment.py ADDED
@@ -0,0 +1,64 @@
+ """
+ This module contains methods for extracting sentiment from texts
+ """
+ import torch
+ import pandas as pd
+ import numpy as np
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
+ # ref: https://colab.research.google.com/github/chrsiebert/sentiment-roberta-large-english/blob/main/sentiment_roberta_prediction_example.ipynb
+ # Create class for data preparation
+ class SimpleDataset:
+     def __init__(self, tokenized_texts):
+         self.tokenized_texts = tokenized_texts
+
+     def __len__(self):
+         return len(self.tokenized_texts["input_ids"])
+
+     def __getitem__(self, idx):
+         return {k: v[idx] for k, v in self.tokenized_texts.items()}
+
+ class Sentiment_Extractor:
+     def __init__(self, input_file_name, text_column, output_file_name):
+         self.input_file_name = input_file_name
+         self.text_column = text_column
+         self.output_file_name = output_file_name
+     def run(self):
+         # Load tokenizer and model, create trainer
+         model_name = "siebert/sentiment-roberta-large-english"
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         model = AutoModelForSequenceClassification.from_pretrained(model_name)
+         trainer = Trainer(model=model)
+
+         df_pred = pd.read_csv(self.input_file_name, encoding='cp1255')
+         pred_texts = df_pred[self.text_column].dropna().astype('str').tolist()
+
+         # Tokenize texts and create prediction data set
+         tokenized_texts = tokenizer(pred_texts, truncation=True, padding=True)
+         pred_dataset = SimpleDataset(tokenized_texts)
+
+         # Run predictions
+         predictions = trainer.predict(pred_dataset)
+
+         # Transform predictions to labels
+         preds = predictions.predictions.argmax(-1)
+         labels = pd.Series(preds).map(model.config.id2label)
+         scores = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1, keepdims=True)).max(1)
+
+         # Create DataFrame with texts, predictions, labels, and scores
+         df = pd.DataFrame(list(zip(pred_texts, preds, labels, scores)), columns=['text_sentiment', 'pred_sentiment', 'label_sentiment', 'score_sentiment'])
+         df_output = df_pred.merge(df, left_on=self.text_column, right_on='text_sentiment')
+         del df_output['text_sentiment']
+         df_output.to_csv(self.output_file_name, encoding='cp1255', index=False)
+
+ if __name__ == "__main__":
+     # Arguments
+     # INPUT_FILE_NAME is the name of the input file
+     INPUT_FILE_NAME = "tagging_MMD_db_with_summarized.csv"
+     # TEXT_COLUMN is the name of the text column in the input file
+     # from which we extract the positive / negative sentiment by the 🤗 model.
+     TEXT_COLUMN = "text"
+     OUTPUT_FILE_NAME = 'tagging_MMD_db_with_sentiment.csv'
+
+     # Run Sentiment_Extractor on the given arguments
+     obj = Sentiment_Extractor(INPUT_FILE_NAME, TEXT_COLUMN, OUTPUT_FILE_NAME)
+     obj.run()
tools/text_summarization.py ADDED
@@ -0,0 +1,118 @@
+ """
+ This module is for text summarization
+ """
+ # ref: https://towardsdatascience.com/understand-text-summarization-and-create-your-own-summarizer-in-python-b26a9f09fc70
+ import nltk
+ nltk.download('stopwords')
+ from nltk.corpus import stopwords
+ from nltk.cluster.util import cosine_distance
+ import pandas as pd
+ import numpy as np
+ import networkx as nx
+
+ class SummarizationClass:
+     def read_text(text):
+         text = text.replace("\"", "")
+         article = text.split(". ")
+         sentences = []
+
+         for sentence in article:
+             #print(sentence)
+             sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
+         #sentences.pop()
+
+         return sentences
+
+     def sentence_similarity(sent1, sent2, stopwords=None):
+         if stopwords is None:
+             stopwords = []
+
+         sent1 = [w.lower() for w in sent1]
+         sent2 = [w.lower() for w in sent2]
+
+         all_words = list(set(sent1 + sent2))
+
+         vector1 = [0] * len(all_words)
+         vector2 = [0] * len(all_words)
+
+         # build the vector for the first sentence
+         for w in sent1:
+             if w in stopwords:
+                 continue
+             vector1[all_words.index(w)] += 1
+
+         # build the vector for the second sentence
+         for w in sent2:
+             if w in stopwords:
+                 continue
+             vector2[all_words.index(w)] += 1
+
+         return 1 - cosine_distance(vector1, vector2)
+
+     def build_similarity_matrix(sentences, stop_words):
+         # Create an empty similarity matrix
+         similarity_matrix = np.zeros((len(sentences), len(sentences)))
+
+         for idx1 in range(len(sentences)):
+             for idx2 in range(len(sentences)):
+                 if idx1 == idx2: # ignore if both are the same sentence
+                     continue
+                 similarity_matrix[idx1][idx2] = SummarizationClass.sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
+
+         return similarity_matrix
+
+
+     def generate_summary(text, top_n=5):
+         stop_words = stopwords.words('english')
+         summarize_text = []
+
+         # Step 1 - Read the text and split it
+         sentences = SummarizationClass.read_text(text)
+         # Step 2 - Generate the similarity matrix across sentences
+         sentence_similarity_martix = SummarizationClass.build_similarity_matrix(sentences, stop_words)
+
+         # Step 3 - Rank sentences in the similarity matrix
+         sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
+         #print(sentence_similarity_graph)
+         try:
+             scores = nx.pagerank(sentence_similarity_graph)
+
+             # Step 4 - Sort the ranks and pick the top sentences
+             ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
+
+             for i in range(top_n):
+                 summarize_text.append(" ".join(ranked_sentence[i][1]))
+         except nx.exception.PowerIterationFailedConvergence:
+             print(f'text={text} was bad for nx')
+             return ''
+         # Step 5 - Finally, output the summarized text
+         return ". ".join(summarize_text)
+
+ class SummarizationClassRun:
+     ''' class for running the summarization class algorithm with given parameters '''
+     def __init__(self, input_file_path, text_column, output_file_path_keep_original_text_column, output_file_path_override_text_column):
+         self.input_file_path = input_file_path
+         self.text_column = text_column
+         self.output_file_path_keep_original_text_column = output_file_path_keep_original_text_column
+         self.output_file_path_override_text_column = output_file_path_override_text_column
+     def run(self):
+         # read input file as a dataframe
+         df = pd.read_csv(self.input_file_path, encoding='cp1255')
+         # add column with summarization of the text in the text column
+         df['summarized_text'] = df[self.text_column].apply(lambda x: SummarizationClass.generate_summary(x, 1))
+         # export output with the original text column to CSV file
+         df.to_csv(self.output_file_path_keep_original_text_column, encoding='cp1255', index=False)
+         # override original text column
+         df[self.text_column] = df['summarized_text']
+         del df['summarized_text']
+         # export output with the overridden text column to CSV file
+         df.to_csv(self.output_file_path_override_text_column, encoding='cp1255', index=False)
+
+ if __name__ == '__main__':
+     # Arguments
+     INPUT_FILE_PATH = 'tagging_MMD_db.csv'
+     TEXT_COLUMN = 'text'
+     OUTPUT_FILE_PATH_KEEP_ORIGINAL_TEXT_COLUMN = 'tagging_MMD_db_with_summarized.csv'
+     OUTPUT_FILE_PATH_OVERRIDE_TEXT_COLUMN = 'summarized_tagging_MMD_db.csv'
+     obj = SummarizationClassRun(INPUT_FILE_PATH, TEXT_COLUMN, OUTPUT_FILE_PATH_KEEP_ORIGINAL_TEXT_COLUMN, OUTPUT_FILE_PATH_OVERRIDE_TEXT_COLUMN)
+     obj.run()
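A quick way to sanity-check the summarizer above, assuming it is run from the *tools* directory (the sample text is made up):

```
from text_summarization import SummarizationClass

text = ("Our plan cuts taxes for working families. "
        "Our opponent voted against working families three times. "
        "We will protect jobs and grow the economy. "
        "Join us and vote for real change.")
# top_n=1 keeps only the highest-ranked sentence, as SummarizationClassRun does
print(SummarizationClass.generate_summary(text, top_n=1))
```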
tools/video_tools.py ADDED
@@ -0,0 +1,24 @@
+ """
+ This module contains methods for video processing
+ """
+ import os
+ import cv2
+
+ def generate_frames(video_path, frames_path, rate, show_print = True):
+     # Create a new temporary folder
+     if not os.path.exists(frames_path):
+         os.makedirs(frames_path)
+     # Capture video
+     src_vid = cv2.VideoCapture(video_path)
+     index = 0
+     while src_vid.isOpened():
+         ret, frame = src_vid.read()
+         if not ret:
+             break
+         name = frames_path + "/" + str(index) + ".png"
+         if index % rate == 0:
+             if show_print:
+                 print("Frame: " + name)
+             cv2.imwrite(name, frame)
+         index = index + 1
+     src_vid.release()