Spaces:
Runtime error
Runtime error
Upload sentimentAnalysis.py
Browse files- sentimentAnalysis.py +91 -0
sentimentAnalysis.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
+
import pandas as pd
|
3 |
+
from nltk.corpus import stopwords
|
4 |
+
import re
|
5 |
+
import numpy as np
|
6 |
+
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
7 |
+
from scipy.special import softmax
|
8 |
+
|
9 |
+
class sentimentAnalysis():
    """Print per-line sentiment scores for an English or Turkish text file.

    Typical use is ``sentimentAnalysis(lang, path).downloadModels()``. That
    single call loads the model matching ``lang``, cleans every line of the
    file (lower-casing, replacing characters outside the allowed set with
    spaces, dropping NLTK stop words) and prints the scores of each line.
    None of the methods return a value; all results go to stdout.
    """

    # Every character that is NOT an ASCII letter/digit or one of the listed
    # Turkish letters is replaced by a single space during cleaning.
    _CLEAN_RE = re.compile(r"[^a-zA-Z0-9ğüşöçıİĞÜŞÖÇ]")

    # Accepted spellings of the two supported languages.
    _ENG_NAMES = ("English", "İngilizce", "ingilizce", "english")
    _TR_NAMES = ("Turkish", "Türkçe", "türkçe", "turkish")

    # The English model is cached in a local directory; the hub id is only
    # used when that directory does not exist yet.
    _ENG_LOCAL_DIR = "sentimentModels/cardiffnlp/twitter-roberta-base-sentiment"
    _ENG_HUB_ID = "cardiffnlp/twitter-roberta-base-sentiment"
    _TR_HUB_ID = "savasy/bert-base-turkish-sentiment-cased"

    def __init__(self, lang, text2analysePath):
        """Store the settings and fetch the NLTK stop-word corpus.

        Args:
            lang: Language name; must be one of the spellings accepted by
                ``downloadModels`` ("English"/"english"/"İngilizce"/
                "ingilizce" or "Turkish"/"turkish"/"Türkçe"/"türkçe").
            text2analysePath: Path of the UTF-8 text file to analyse, one
                text per line.
        """
        self.lang = lang
        self.text2analysePath = text2analysePath
        # Index i of the English model's output corresponds to engLabels[i].
        self.engLabels = ["negative", "neutral", "positive"]
        nltk.download("stopwords")

    def downloadModels(self):
        """Load the model for ``self.lang`` and analyse the configured file.

        The file is opened before the language is checked, so a missing file
        raises even for an unsupported language. An unsupported language
        only prints a message. The file is closed when the analysis ends,
        including when it ends with an exception.

        Raises:
            OSError: If ``self.text2analysePath`` cannot be opened.
        """
        import os  # local import: the file's top-level imports lack `os`

        with open(self.text2analysePath, 'r', encoding="utf-8") as txtt:
            if self.lang in self._ENG_NAMES:
                # Prefer the local copy; on a fresh checkout the directory is
                # absent and its path is not a valid hub id, so fall back to
                # the hub id in that case.
                if os.path.isdir(self._ENG_LOCAL_DIR):
                    source = self._ENG_LOCAL_DIR
                else:
                    source = self._ENG_HUB_ID
                self.tokenizer = AutoTokenizer.from_pretrained(source)
                self.model = AutoModelForSequenceClassification.from_pretrained(source)
                self.model.save_pretrained(self._ENG_LOCAL_DIR)
                self.tokenizer.save_pretrained(self._ENG_LOCAL_DIR)
                self.engPrepareText(txtt)

            elif self.lang in self._TR_NAMES:
                self.model = AutoModelForSequenceClassification.from_pretrained(self._TR_HUB_ID)
                self.tokenizer = AutoTokenizer.from_pretrained(self._TR_HUB_ID)
                self.sa = pipeline("sentiment-analysis", tokenizer=self.tokenizer, model=self.model)
                self.trPrepareText(txtt)

            else:
                print("Dil bulunamadı!------The language has not been found!")

    @classmethod
    def _cleanLine(cls, line, stopWords, turkish=False):
        """Return ``line`` lower-cased, stripped of symbols and stop words.

        Args:
            line: One raw line of text (may include the trailing newline).
            stopWords: Container of lower-case words to drop.
            turkish: When true, apply Turkish casing rules for ``İ``/``I``
                before lower-casing.

        Returns:
            The remaining tokens joined by single spaces. Empty tokens
            produced by consecutive separators are kept, so runs of spaces
            (and a trailing space for the newline) survive.
        """
        if turkish:
            # str.lower() maps "İ" to "i" + U+0307 (which the regex below
            # would turn into a space, splitting the word) and "I" to "i"
            # instead of the Turkish dotless "ı".
            line = line.replace("İ", "i").replace("I", "ı")
        line = cls._CLEAN_RE.sub(' ', line.lower())
        return ' '.join(word for word in line.split(' ') if word not in stopWords)

    def _prepareLines(self, txtt, language, turkish=False):
        """Clean every line of ``txtt`` using the NLTK stop words of ``language``."""
        # Build the stop-word set once instead of once per word.
        stopWords = set(stopwords.words(language))
        return [self._cleanLine(line, stopWords, turkish) for line in txtt.readlines()]

    def engPrepareText(self, txtt):
        """Clean the lines of the open file ``txtt`` and run the English analysis.

        Args:
            txtt: Object with a ``readlines()`` method yielding text lines.
        """
        dFen = pd.DataFrame(self._prepareLines(txtt, "english"), columns=["texts"])
        self.engAnalyse(dFen)

    def trPrepareText(self, txtt):
        """Clean the lines of the open file ``txtt`` and run the Turkish analysis.

        Args:
            txtt: Object with a ``readlines()`` method yielding text lines.
        """
        dFtr = pd.DataFrame(self._prepareLines(txtt, "turkish", turkish=True), columns=["metinler"])
        self.trAnalyse(dFtr)

    def engAnalyse(self, dFen):
        """Print the ranked label scores for every row of ``dFen["texts"]``.

        Requires ``self.tokenizer`` and ``self.model`` (set by
        ``downloadModels``). For each text the model output is passed
        through softmax and the labels are printed from highest to lowest
        score.

        Args:
            dFen: DataFrame with a ``"texts"`` column of cleaned strings.
        """
        for text in dFen["texts"]:
            encoded_input = self.tokenizer(text, return_tensors='pt')
            output = self.model(**encoded_input)
            scores = softmax(output[0][0].detach().numpy())
            # argsort is ascending; reverse it to list the best label first.
            ranking = np.argsort(scores)[::-1]
            print(f"text: {text}")
            for rank, labelIdx in enumerate(ranking, start=1):
                l = self.engLabels[labelIdx]
                s = scores[labelIdx]
                print(f"{rank}) {l + ':'} {np.round(float(s), 4)}")

    def trAnalyse(self, dFtr):
        """Print positive/negative scores for every row of ``dFtr["metinler"]``.

        Requires ``self.sa`` (set by ``downloadModels``). Only the first
        result of the pipeline is used; its score is reported for its own
        label and ``1 - score`` for the other one. Any label other than
        ``"positive"`` is treated as negative.

        Args:
            dFtr: DataFrame with a ``"metinler"`` column of cleaned strings.
        """
        for text in dFtr["metinler"]:
            p = self.sa(text)[0]
            if p["label"] == "positive":
                positive, negative = p['score'], 1 - p['score']
            else:
                positive, negative = 1 - p['score'], p['score']
            print(f"text: {text}")
            print(f"1-) positive: {np.round(float(positive), 4)}")
            print(f"2-) negative: {np.round(float(negative), 4)}")
|
84 |
+
|
85 |
+
|
86 |
+
|
87 |
+
# Driver: analyse the sample file with the English model as soon as this
# module is executed or imported.
lang, path = "ingilizce", "texts/denemeler/text.txt"

# downloadModels() has no return statement, so sA is bound to None; the
# analysis results are printed to stdout.
sA = sentimentAnalysis(lang=lang, text2analysePath=path).downloadModels()
|
91 |
+
|