Spaces:
Running
Running
move the sentiment detection to OCR file
Browse files
- modules/OCR.py +29 -0
- modules/streamlit_utils.py +0 -4
- modules/toWizard.py +3 -32
modules/OCR.py
CHANGED
@@ -20,6 +20,35 @@ VISION_ENDPOINT = os.getenv("VISION_ENDPOINT")
|
|
20 |
|
21 |
VISION_KEY = json_data["VISION_KEY"]
|
22 |
VISION_ENDPOINT = json_data["VISION_ENDPOINT"]"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
def sample_ocr_image_file(image_data):
|
25 |
# Set the values of your computer vision endpoint and computer vision key
|
|
|
20 |
|
21 |
VISION_KEY = json_data["VISION_KEY"]
|
22 |
VISION_ENDPOINT = json_data["VISION_ENDPOINT"]"""
|
23 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
24 |
+
import torch
|
25 |
+
import logging
|
26 |
+
|
27 |
+
# Suppress specific warnings from transformers
|
28 |
+
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
|
29 |
+
|
30 |
+
# Function to initialize the model and tokenizer
|
31 |
+
def initialize_model():
|
32 |
+
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
|
33 |
+
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
|
34 |
+
return tokenizer, model
|
35 |
+
|
36 |
+
# Initialize model and tokenizer
|
37 |
+
tokenizer, emotion_model = initialize_model()
|
38 |
+
|
39 |
+
# Function to perform sentiment analysis and return the highest scoring emotion and its score between positive and negative
|
40 |
+
def analyze_sentiment(sentence, tokenizer=tokenizer, model=emotion_model):
|
41 |
+
inputs = tokenizer(sentence, return_tensors="pt")
|
42 |
+
outputs = model(**inputs)
|
43 |
+
probs = torch.nn.functional.softmax(outputs.logits, dim=-1).squeeze().tolist()
|
44 |
+
labels = ["negative", "neutral", "positive"]
|
45 |
+
results = dict(zip(labels, probs))
|
46 |
+
|
47 |
+
# Filter out the neutral score and get the highest score between positive and negative
|
48 |
+
relevant_results = {k: results[k] for k in ["positive", "negative"]}
|
49 |
+
highest_emotion = max(relevant_results, key=relevant_results.get)
|
50 |
+
highest_score = relevant_results[highest_emotion]
|
51 |
+
return highest_emotion, highest_score
|
52 |
|
53 |
def sample_ocr_image_file(image_data):
|
54 |
# Set the values of your computer vision endpoint and computer vision key
|
modules/streamlit_utils.py
CHANGED
@@ -33,8 +33,6 @@ import time
|
|
33 |
from modules.toXML import get_size_elements
|
34 |
|
35 |
|
36 |
-
|
37 |
-
|
38 |
def get_memory_usage():
|
39 |
process = psutil.Process()
|
40 |
mem_info = process.memory_info()
|
@@ -52,8 +50,6 @@ def read_xml_file(filepath):
|
|
52 |
return file.read()
|
53 |
|
54 |
|
55 |
-
|
56 |
-
|
57 |
# Suppress the symlink warning
|
58 |
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
|
59 |
|
|
|
33 |
from modules.toXML import get_size_elements
|
34 |
|
35 |
|
|
|
|
|
36 |
def get_memory_usage():
|
37 |
process = psutil.Process()
|
38 |
mem_info = process.memory_info()
|
|
|
50 |
return file.read()
|
51 |
|
52 |
|
|
|
|
|
53 |
# Suppress the symlink warning
|
54 |
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
|
55 |
|
modules/toWizard.py
CHANGED
@@ -2,37 +2,8 @@ import xml.etree.ElementTree as ET
|
|
2 |
from modules.utils import class_dict
|
3 |
from xml.dom import minidom
|
4 |
from modules.utils import error
|
5 |
-
from
|
6 |
|
7 |
-
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
8 |
-
import torch
|
9 |
-
import logging
|
10 |
-
|
11 |
-
# Suppress specific warnings from transformers
|
12 |
-
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
|
13 |
-
|
14 |
-
# Function to initialize the model and tokenizer
|
15 |
-
def initialize_model():
|
16 |
-
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
|
17 |
-
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
|
18 |
-
return tokenizer, model
|
19 |
-
|
20 |
-
# Function to perform sentiment analysis and return the highest scoring emotion and its score between positive and negative
|
21 |
-
def analyze_sentiment(sentence, tokenizer, model):
|
22 |
-
inputs = tokenizer(sentence, return_tensors="pt")
|
23 |
-
outputs = model(**inputs)
|
24 |
-
probs = torch.nn.functional.softmax(outputs.logits, dim=-1).squeeze().tolist()
|
25 |
-
labels = ["negative", "neutral", "positive"]
|
26 |
-
results = dict(zip(labels, probs))
|
27 |
-
|
28 |
-
# Filter out the neutral score and get the highest score between positive and negative
|
29 |
-
relevant_results = {k: results[k] for k in ["positive", "negative"]}
|
30 |
-
highest_emotion = max(relevant_results, key=relevant_results.get)
|
31 |
-
highest_score = relevant_results[highest_emotion]
|
32 |
-
return highest_emotion, highest_score
|
33 |
-
|
34 |
-
# Initialize model and tokenizer
|
35 |
-
tokenizer, model = initialize_model()
|
36 |
|
37 |
def rescale(scale, boxes):
|
38 |
for i in range(len(boxes)):
|
@@ -149,7 +120,7 @@ def find_positive_end(bpmn_ids, links, text_mapping):
|
|
149 |
continue
|
150 |
if check_end(links[idx]) and (bpmn_id.split('_')[0] in ['event', 'message']):
|
151 |
# Perform sentiment analysis and get the highest scoring emotion and its score between positive and negative
|
152 |
-
highest_emotion, highest_score = analyze_sentiment(text_mapping[bpmn_id], tokenizer, model)
|
153 |
emotion_data.append((bpmn_id, highest_emotion, highest_score))
|
154 |
|
155 |
# Sort by emotion label with 'positive' first and 'negative' second,
|
@@ -161,7 +132,7 @@ def find_positive_end(bpmn_ids, links, text_mapping):
|
|
161 |
def find_best_direction(texts_list):
|
162 |
emotion_data = []
|
163 |
for text in texts_list:
|
164 |
-
highest_emotion, highest_score = analyze_sentiment(text, tokenizer, model)
|
165 |
emotion_data.append((text, highest_emotion, highest_score))
|
166 |
|
167 |
# Sort by emotion label with 'positive' first and 'negative' second,
|
|
|
2 |
from modules.utils import class_dict
|
3 |
from xml.dom import minidom
|
4 |
from modules.utils import error
|
5 |
+
from modules.OCR import analyze_sentiment
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
def rescale(scale, boxes):
|
9 |
for i in range(len(boxes)):
|
|
|
120 |
continue
|
121 |
if check_end(links[idx]) and (bpmn_id.split('_')[0] in ['event', 'message']):
|
122 |
# Perform sentiment analysis and get the highest scoring emotion and its score between positive and negative
|
123 |
+
highest_emotion, highest_score = analyze_sentiment(text_mapping[bpmn_id])
|
124 |
emotion_data.append((bpmn_id, highest_emotion, highest_score))
|
125 |
|
126 |
# Sort by emotion label with 'positive' first and 'negative' second,
|
|
|
132 |
def find_best_direction(texts_list):
|
133 |
emotion_data = []
|
134 |
for text in texts_list:
|
135 |
+
highest_emotion, highest_score = analyze_sentiment(text)
|
136 |
emotion_data.append((text, highest_emotion, highest_score))
|
137 |
|
138 |
# Sort by emotion label with 'positive' first and 'negative' second,
|