zeynepgulhan committed on
Commit
79bbdf9
·
verified ·
1 Parent(s): cbd2f6b

app file created

Browse files
Files changed (1) hide show
  1. app.py +101 -0
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
+ import torch
4
+ import numpy as np
5
+ import re
6
+
7
+ from turkish.deasciifier import Deasciifier
8
+
9
# Tokenizer and classifier are loaded once, when the module is first run.
MODEL_NAME = "TURKCELL/bert-offensive-lang-detection-tr"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
14
+
15
+
16
def deasciifier(text):
    """Return ``text`` after ``Deasciifier.convert_to_turkish()`` has run on it.

    Presumably this restores Turkish diacritics in text typed with plain
    ASCII letters -- confirm against the ``turkish.deasciifier`` package.
    """
    converter = Deasciifier(text)
    turkish_text = converter.convert_to_turkish()
    return turkish_text
19
+
20
+
21
# Built once at import time: each circumflexed vowel maps to its plain vowel.
_CIRCUMFLEX_TABLE = str.maketrans({
    'â': 'a',
    'î': 'i',
    'û': 'u',
    'ô': 'o',
    'Â': 'A',
    # NOTE(review): 'Î' becomes the dotless capital 'I', which turkish_lower()
    # then lowers to 'ı' rather than 'i'. Kept as-is so preprocessing output
    # does not change; confirm whether 'İ' was intended.
    'Î': 'I',
    'Û': 'U',
    'Ô': 'O',
})


def remove_circumflex(text):
    """Replace circumflexed vowels (â, î, û, ô and capitals) with plain vowels.

    All other characters are returned unchanged.

    Args:
        text: The string to normalize.

    Returns:
        A new string of the same length with the circumflex removed from
        every precomposed â/î/û/ô/Â/Î/Û/Ô.
    """
    # One C-level pass instead of a per-character dict lookup and join.
    return text.translate(_CIRCUMFLEX_TABLE)
34
+
35
+
36
# Only the two capital I's need an override: str.lower() would turn 'I' into
# 'i' and 'İ' into 'i' + U+0307. The other Turkish capitals (Ç, Ş, Ğ, Ü, Ö)
# are already lowered correctly by str.lower().
_TURKISH_LOWER_TABLE = str.maketrans({
    'I': 'ı',
    'İ': 'i',
})


def turkish_lower(text):
    """Lowercase ``text`` using the Turkish dotted/dotless I rules.

    'I' becomes 'ı' and 'İ' becomes 'i'; every other character is lowered
    with ``str.lower()``.

    Args:
        text: The string to lowercase.

    Returns:
        The lowercased string.
    """
    # Apply the I/İ overrides first so str.lower() never sees those letters.
    return text.translate(_TURKISH_LOWER_TABLE).lower()
47
+
48
+
49
# Patterns are compiled once at import time instead of on every call.
_MENTION_RE = re.compile(r"@\S*")
_HASHTAG_RE = re.compile(r'#\S+')
_URL_RE = re.compile(r"http\S+|www\S+|https\S+", flags=re.MULTILINE)
# NOTE(review): the emoticon alternatives can never match. '[^\w\s]' is tried
# first and already consumes the leading ':' or ';', so the letter of ":D",
# ":P" or ":o" is left behind as a stray token. Pattern kept unchanged so the
# cleaned output stays identical.
_PUNCT_RE = re.compile(r'[^\w\s]|(:\)|:\(|:D|:P|:o|:O|;\))')
# NOTE(review): the last range spans U+24C2..U+1F251, which is far wider than
# emoji and covers many non-Latin letters. Kept unchanged on purpose.
_EMOJI_RE = re.compile("["
                       "\U0001F600-\U0001F64F"  # emoticons
                       "\U0001F300-\U0001F5FF"  # symbols & pictographs
                       "\U0001F680-\U0001F6FF"  # transport & map symbols
                       "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       "\U00002702-\U000027B0"
                       "\U000024C2-\U0001F251"
                       "]+", flags=re.UNICODE)
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize raw text before it is tokenized.

    Steps, in order: strip circumflexes, Turkish-aware lowercasing,
    deasciification, then replace @mentions, #hashtags, URLs, punctuation
    and characters in the emoji ranges with spaces, and finally collapse
    whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string with single spaces between tokens and no
        leading or trailing whitespace.
    """
    # Character-level normalization.
    text = remove_circumflex(text)
    text = turkish_lower(text)
    text = deasciifier(text)

    # Each removed span becomes a space so neighbouring words do not merge.
    text = _MENTION_RE.sub(" ", text)    # user names
    text = _HASHTAG_RE.sub(' ', text)    # hashtags
    text = _URL_RE.sub(' ', text)        # URLs
    text = _PUNCT_RE.sub(' ', text)      # punctuation and other non-word characters
    text = _EMOJI_RE.sub(r' ', text)     # characters in the emoji ranges

    # Collapse runs of whitespace into one space and trim both ends.
    text = _WHITESPACE_RE.sub(' ', text).strip()
    return text
78
+
79
+
80
def is_offensive(sentence):
    """Classify ``sentence`` as ``'offensive'`` or ``'non-offensive'``.

    The sentence is cleaned with ``clean_text``, tokenized (truncated to
    256 tokens), run through the module-level ``model`` on ``device``, and
    the highest-scoring class index is mapped to a label.

    Args:
        sentence: The raw input text.

    Returns:
        ``'non-offensive'`` for class 0, ``'offensive'`` for class 1.
    """
    normalized = clean_text(sentence)

    encoded = tokenizer(normalized, padding=True, truncation=True, max_length=256, return_tensors='pt')
    encoded = {key: tensor.to(device) for key, tensor in encoded.items()}

    # Inference only: skip autograd bookkeeping so no graph is built or kept.
    with torch.no_grad():
        output = model(**encoded)

    # The batch holds a single sentence, so take the first prediction.
    predicted_class = int(output.logits.argmax(dim=1)[0])

    labels = {0: 'non-offensive', 1: 'offensive'}
    return labels[predicted_class]
91
+
92
+
93
# Two-line text box for the sentence; the predicted label is shown as text.
sentence_box = gr.Textbox(lines=2, placeholder="Enter sentence here...")

iface = gr.Interface(
    fn=is_offensive,
    inputs=sentence_box,
    outputs="text",
    title="Offensive Language Detection",
    description="Offensive language detection for Turkish",
)

# Start the Gradio app as soon as the module is run.
iface.launch()