dperales committed on
Commit
65ffad7
·
1 Parent(s): 5152081

Upload 2 files

Browse files
Files changed (2) hide show
  1. detect_language.py +29 -0
  2. sentiment_analysis_v2.py +93 -0
detect_language.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
2
+
3
class LanguageDetector:
    """Predict the language of a text with a Hugging Face sequence classifier.

    Attributes:
        tokenizer: Tokenizer loaded from the
            ``papluca/xlm-roberta-base-language-detection`` checkpoint.
        model: Sequence-classification model loaded from the same checkpoint.
    """

    def __init__(self):
        """Load the tokenizer and the model from the Hugging Face hub."""
        hub_location = "papluca/xlm-roberta-base-language-detection"
        self.tokenizer = AutoTokenizer.from_pretrained(hub_location)
        self.model = AutoModelForSequenceClassification.from_pretrained(hub_location)

    def predict_language(self, text):
        """Return the label of the most likely language for ``text``.

        Parameters:
            text (str): A single input string.
        Returns:
            The entry of ``self.model.config.id2label`` that corresponds to
            the highest logit (presumably a language code such as ``"es"``;
            the exact label set is defined by the checkpoint's config).
        """
        # Imported here because this module's top-level imports only bring in
        # ``transformers``; torch is needed solely to disable autograd.
        import torch

        # ``truncation=True`` caps the input at the tokenizer's maximum
        # length, so very long texts do not overflow the model's position
        # embeddings and crash the forward pass.
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)

        # Pure inference: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Index of the highest score; ``.item()`` requires a single input text.
        prediction_idx = outputs.logits.argmax(dim=-1).item()

        # Map the class index back to its label via the model config.
        return self.model.config.id2label[prediction_idx]
29
+
sentiment_analysis_v2.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
2
+ from transformers_interpret import SequenceClassificationExplainer
3
+ import torch
4
+ import pandas as pd
5
+
6
+
7
class SentimentAnalysis:
    """
    Sentiment classification and justification on text data.

    Two model/tokenizer/explainer triples are kept: a Spanish one (used when
    ``lang == 'es'``) and a default one (used for every other ``lang`` value).

    Attributes:
        tokenizer: Hugging Face tokenizer for the default model
        model: Hugging Face sequence-classification model (default)
        explainer: SequenceClassificationExplainer for the default model
        tokenizer_sp: Hugging Face tokenizer for the Spanish model
        model_sp: Hugging Face sequence-classification model (Spanish)
        explainer_sp: SequenceClassificationExplainer for the Spanish model
    """

    # Class names in logit order; used both to relabel the default model's
    # config and as the index of the Series returned by ``classify``.
    LABELS = ("Negative", "Neutral", "Positive")

    def __init__(self):
        """Load both tokenizer/model pairs and build their explainers."""
        # Default tokenizer & model
        hub_location = 'cardiffnlp/twitter-roberta-base-sentiment'
        self.tokenizer = AutoTokenizer.from_pretrained(hub_location)
        self.model = AutoModelForSequenceClassification.from_pretrained(hub_location)

        # Spanish tokenizer & model
        hub_location_sp = 'finiteautomata/beto-sentiment-analysis'
        self.tokenizer_sp = AutoTokenizer.from_pretrained(hub_location_sp)
        self.model_sp = AutoModelForSequenceClassification.from_pretrained(hub_location_sp)

        # Give the default model readable labels. The mappings are rebuilt
        # from LABELS instead of popping "LABEL_0".."LABEL_2", so this does
        # not raise KeyError if the hub config ships different label names.
        self.model.config.id2label = dict(enumerate(self.LABELS))
        self.model.config.label2id = {
            label: idx for idx, label in enumerate(self.LABELS)
        }

        # Instantiate explainers
        self.explainer = SequenceClassificationExplainer(self.model, self.tokenizer)
        self.explainer_sp = SequenceClassificationExplainer(self.model_sp, self.tokenizer_sp)

    def justify(self, text, lang):
        """
        Get html annotation for displaying sentiment justification over text.

        Side effect: ``visualize`` is called with the path "example.html",
        as in the original code, so that file is presumably (re)written on
        every call.

        Parameters:
            text (str): The user input string to justify the sentiment of
            lang (str): Language code; 'es' selects the Spanish explainer,
                any other value selects the default one
        Returns:
            html (html): html object for plotting sentiment prediction justification
        """
        explainer = self.explainer_sp if lang == 'es' else self.explainer

        # The explainer must be called on the text before visualize();
        # the returned word attributions themselves are not needed here.
        explainer(text)
        return explainer.visualize("example.html")

    def classify(self, text, lang):
        """
        Recognize Sentiment in text.

        Parameters:
            text (str): The user input string to perform sentiment classification on
            lang (str): Language code; 'es' selects the Spanish model,
                any other value selects the default one
        Returns:
            predictions (pd.Series): The predicted probabilities, indexed by
                "Negative", "Neutral", "Positive" and named
                'Predicted Probability'
        """
        if lang == 'es':
            tokenizer, model = self.tokenizer_sp, self.model_sp
        else:
            tokenizer, model = self.tokenizer, self.model

        # Keep the tokenizer's default special tokens: sequence-classification
        # heads are fine-tuned on inputs that include them, and the explainer
        # feeds the same models that way. ``truncation=True`` stops over-long
        # texts from exceeding the model's maximum sequence length.
        tokens = tokenizer(text, truncation=True, return_tensors='pt')

        # Pure inference: no autograd graph needed.
        with torch.no_grad():
            outputs = model(**tokens)

        # outputs[0] holds the logits; softmax over classes, then average
        # over the batch dimension (a single text gives a batch of one).
        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
        probs = probs.mean(dim=0).detach().numpy()

        return pd.Series(probs, index=list(self.LABELS), name='Predicted Probability')

    def run(self, text, lang):
        """
        Classify and Justify Sentiment in text.

        Parameters:
            text (str): The user input string to perform sentiment classification on
            lang (str): Language code forwarded to ``classify`` and ``justify``
        Returns:
            predictions (pd.Series): The predicted probabilities for sentiment classes
            html (html): html object for plotting sentiment prediction justification
        """
        predictions = self.classify(text, lang)
        html = self.justify(text, lang)

        return predictions, html