Bedirhan committed on
Commit
a1e75ef
·
1 Parent(s): 37ed8b3

Upload sentimentAnalysis.py

Browse files
Files changed (1) hide show
  1. sentimentAnalysis.py +91 -0
sentimentAnalysis.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import pandas as pd
3
+ from nltk.corpus import stopwords
4
+ import re
5
+ import numpy as np
6
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
7
+ from scipy.special import softmax
8
+
9
class sentimentAnalysis():
    """Run line-by-line sentiment analysis on a UTF-8 text file.

    English text is scored with a RoBERTa sequence-classification model
    loaded from a local directory; Turkish text is scored with the
    ``savasy/bert-base-turkish-sentiment-cased`` model through a
    ``transformers`` pipeline. Results are printed to stdout.
    """

    # Language names accepted by ``downloadModels`` (exact, case-sensitive).
    _ENGLISH_NAMES = frozenset({"English", "İngilizce", "ingilizce", "english"})
    _TURKISH_NAMES = frozenset({"Turkish", "Türkçe", "türkçe", "turkish"})

    # Every character outside this class is replaced with a single space.
    _NON_WORD_PATTERN = "[^a-zA-Z0-9ğüşöçıİĞÜŞÖÇ]"

    def __init__(self, lang, text2analysePath):
        """Store the configuration and fetch the NLTK stopword corpus.

        Args:
            lang: Language name; see ``downloadModels`` for accepted values.
            text2analysePath: Path of the UTF-8 text file to analyse, one
                text per line.
        """
        self.lang = lang
        self.text2analysePath = text2analysePath
        # Index order matches the class indices used in ``engAnalyse``.
        self.engLabels = ["negative", "neutral", "positive"]
        nltk.download("stopwords")

    def downloadModels(self):
        """Load the model for ``self.lang`` and analyse the input file.

        Prints a bilingual message and does nothing else when the language
        is not recognised. Returns ``None`` in every case.

        Raises:
            OSError: If the input file cannot be opened.
        """
        # The context manager guarantees the file handle is released, even
        # when model loading or analysis raises.
        with open(self.text2analysePath, 'r', encoding="utf-8") as txtt:
            if self.lang in self._ENGLISH_NAMES:
                MODEL = "sentimentModels/cardiffnlp/twitter-roberta-base-sentiment"
                self.tokenizer = AutoTokenizer.from_pretrained(MODEL)
                self.model = AutoModelForSequenceClassification.from_pretrained(MODEL)
                self.model.save_pretrained(MODEL)
                self.tokenizer.save_pretrained(MODEL)
                self.engPrepareText(txtt)

            elif self.lang in self._TURKISH_NAMES:
                self.model = AutoModelForSequenceClassification.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
                self.tokenizer = AutoTokenizer.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
                self.sa = pipeline("sentiment-analysis", tokenizer=self.tokenizer, model=self.model)
                self.trPrepareText(txtt)

            else:
                print("Dil bulunamadı!------The language has not been found!")

    @staticmethod
    def _cleanLine(line, stopWords):
        """Lower-case ``line``, blank out non-word characters, drop stopwords.

        Args:
            line: One raw line of text.
            stopWords: Container of words to remove (membership-tested).

        Returns:
            The surviving tokens joined with single spaces. Empty tokens
            produced by consecutive separators are kept, so the result may
            contain runs of spaces.
        """
        # ``"İ".lower()`` yields "i" followed by U+0307 (combining dot above);
        # the pattern below would turn that mark into a space and split the
        # word, so the dotted capital is mapped to a plain "i" first.
        lowered = line.replace("İ", "i").lower()
        cleaned = re.sub(sentimentAnalysis._NON_WORD_PATTERN, ' ', lowered)
        return ' '.join(word for word in cleaned.split(' ') if word not in stopWords)

    def _cleanLines(self, txtt, language):
        """Return the cleaned form of every line readable from ``txtt``.

        Args:
            txtt: Object exposing ``readlines()``.
            language: NLTK stopword corpus name ("english" or "turkish").
        """
        # Build the stopword set once per file instead of once per word.
        stopWords = set(stopwords.words(language))
        return [self._cleanLine(line, stopWords) for line in txtt.readlines()]

    def engPrepareText(self, txtt):
        """Clean the English lines of ``txtt`` and pass them to ``engAnalyse``."""
        dFen = pd.DataFrame(self._cleanLines(txtt, "english"), columns=["texts"])
        self.engAnalyse(dFen)

    def trPrepareText(self, txtt):
        """Clean the Turkish lines of ``txtt`` and pass them to ``trAnalyse``."""
        dFtr = pd.DataFrame(self._cleanLines(txtt, "turkish"), columns=["metinler"])
        self.trAnalyse(dFtr)

    def engAnalyse(self, dFen):
        """Print ranked label probabilities for each row of ``dFen["texts"]``.

        Requires ``self.tokenizer`` and ``self.model`` to have been set by
        ``downloadModels``.
        """
        for text in dFen["texts"]:
            encoded_input = self.tokenizer(text, return_tensors='pt')
            output = self.model(**encoded_input)
            scores = softmax(output[0][0].detach().numpy())
            # Class indices ordered from most to least probable.
            ranking = np.argsort(scores)[::-1]
            print(f"text: {text}")
            for rank, labelIndex in enumerate(ranking, start=1):
                label = self.engLabels[labelIndex]
                score = scores[labelIndex]
                print(f"{rank}) {label + ':'} {np.round(float(score), 4)}")

    def trAnalyse(self, dFtr):
        """Print positive/negative scores for each row of ``dFtr["metinler"]``.

        Requires ``self.sa`` to have been set by ``downloadModels``. The
        pipeline reports only the winning label and its score; the other
        label's score is taken as ``1 - score``.
        """
        for text in dFtr["metinler"]:
            p = self.sa(text)[0]
            if p["label"] == "positive":
                positive, negative = p['score'], 1 - p['score']
            else:
                positive, negative = 1 - p['score'], p['score']
            print(f"text: {text}")
            print(f"1-) positive: {np.round(float(positive), 4)}")
            print(f"2-) negative: {np.round(float(negative), 4)}")
84
+
85
+
86
+
87
# Demo run: analyse the sample file as English text. The alias "ingilizce"
# (Turkish for "English") selects the English model.
lang = "ingilizce"
path = "texts/denemeler/text.txt"

# downloadModels() has no return statement, so sA is bound to None; the
# analysis results are printed to stdout as a side effect.
sA = sentimentAnalysis(lang=lang, text2analysePath=path).downloadModels()
91
+