hm-auch committed on
Commit
072c906
1 Parent(s): b642a67

update classifier and demonstrator-code

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Hatespeech
3
  emoji: 😻
4
  colorFrom: yellow
5
  colorTo: red
 
1
  ---
2
+ title: Hassrede
3
  emoji: 😻
4
  colorFrom: yellow
5
  colorTo: red
app.py CHANGED
@@ -1,13 +1,9 @@
1
  import transformers
2
 
3
  import gradio as gr
 
4
  import tensorflow as tf
5
 
6
- MODEL_DIRECTORY = './result/model'
7
- PRETRAINED_MODEL_NAME = 'dbmdz/bert-base-german-cased'
8
- TOKENIZER = transformers.BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
9
- MAX_SEQUENCE_LENGTH = 300
10
-
11
  def encode(sentences, tokenizer, sequence_length):
12
  return tokenizer.batch_encode_plus(
13
  sentences,
@@ -19,39 +15,62 @@ def encode(sentences, tokenizer, sequence_length):
19
  return_tensors='tf'
20
  )
21
 
22
- hs_detection_model = tf.keras.models.load_model(MODEL_DIRECTORY, compile=True)
23
-
24
- def inference(sentence):
25
- encoded_sentence = encode([sentence], TOKENIZER, MAX_SEQUENCE_LENGTH)
26
- return hs_detection_model.predict(encoded_sentence.values())
27
 
 
 
 
 
 
 
28
 
29
- title = "HS-Detector Demonstrator"
30
  description = """
 
 
31
  <center>
32
- <p>Dataset: germeval18_hasoc19_rp21_combi_dataset (17,7% HS)</p>
33
- <p>Das bisher beste Modell basierend auf Bert nach 2 Epochen und max. 300 Token pro Eintrag fine-tuning mit folgenden Evaluationsergebnissen:</p>
34
-
35
- Accuracy: 0.8794712286158631<br/>
36
- Balanced Accuracy: 0.7561891312100413<br/>
37
- Binary F1-Score: 0.6249999999999999<br/>
38
- Binary Precision: 0.6994584837545126<br/>
39
- Binary Recall: 0.564868804664723<br/>
40
- Weighted F1-Score: 0.8742843536656945<br/>
41
- Weighted Precision: 0.8722794361456155<br/>
42
- Weighted Recall: 0.8794712286158631<br/>
43
- Macro F1-Score: 0.7765982087708463<br/>
44
- Macro Precision: 0.80455672371745<br/>
45
- Macro Recall: 0.7561891312100413<br/>
46
- MCC score: 0.558655967312084<br/>
47
- AUROC score: 0.7561891312100413<br/>
48
-
49
- <img src="https://huggingface.co/spaces/course-demos/Rick_and_Morty_QA/resolve/main/rick.png" width=200px>
 
 
 
 
 
 
 
 
 
 
 
 
50
  </center>
 
 
51
  """
 
 
 
 
52
 
53
- article = "Die Eingaben werden nicht geloggt. Klassifikator einfach ausprobieren."
54
-
55
- input_sentence_text = gr.inputs.Textbox(placeholder="Hier den Satz eingeben, der Hassrede enthalten kann.")
56
- ui = gr.Interface(fn=inference, inputs=input_sentence_text, outputs="text", title = title, description = description, article = article)
57
- ui.launch()
 
1
  import transformers
2
 
3
  import gradio as gr
4
+ import numpy as np
5
  import tensorflow as tf
6
 
 
 
 
 
 
7
  def encode(sentences, tokenizer, sequence_length):
8
  return tokenizer.batch_encode_plus(
9
  sentences,
 
15
  return_tensors='tf'
16
  )
17
 
18
+ hs_detection_model_1 = tf.keras.models.load_model('./model_1', compile=True)
19
+ hs_detection_model_2 = tf.keras.models.load_model('./model_2', compile=True)
 
 
 
20
 
21
+ def model_inference(sentence):
22
+ encoded_model1_sentence = encode([sentence], transformers.BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased'), 300)
23
+ encoded_model2_sentence = encode([sentence], transformers.BertTokenizer.from_pretrained('dbmdz/bert-base-german-uncased'), 512)
24
+ predictions_1 = hs_detection_model_1.predict(encoded_model1_sentence.values()).flatten()
25
+ predictions_2 = hs_detection_model_2.predict(encoded_model2_sentence.values()).flatten()
26
+ return {'Hassrede': float(predictions_1[0])}, {'Hassrede': float(predictions_2[0])}
27
 
28
+ title = "HS-Detector Demonstrator (deutsch)"
29
  description = """
30
+ <div style="float: none; overflow: hidden;">
31
+ <div style="display:block; width:100%;">
32
  <center>
33
+ <div style="width:50%; float: left; display: inline-block;">
34
+ <h2>Ausgangsmodell</h2>
35
+ <p>Modell: Bert ('dbmdz/bert-base-german-cased')</p>
36
+ <p>Dataset: germeval18_hasoc19_rp21_combi_dataset <br/> (77.161 Einträge mit einem Hassrede-Anteil von 17,7%)</p>
37
+ <p>Fine-Tuning Parameter: 2 Epochen, 300 Token pro Eintrag, 2e-5 LR</p>
38
+
39
+ Evaluationsergebnisse:
40
+ Balanced Accuracy: 0.756
41
+ (Accuracy: 0.880)
42
+ Binary F1-Score: 0.625
43
+ Binary Precision: 0.699
44
+ Binary Recall: 0.565
45
+ MCC score: 0.559
46
+ AUROC score: 0.756
47
+ </div>
48
+ <div style="width:50%; float: left; display: inline-block;">
49
+ <h2>Challenger-Modell</h2>
50
+ <p>Modell: Bert ('dbmdz/bert-base-german-uncased')</p>
51
+ <p>Dataset: germeval18_hasoc19_rp21_combi_dataset_no-url_no-address <br/> (~77.161 Einträge mit einem Hassrede-Anteil von 17,7%)</p>
52
+ <p>Fine-Tuning Parameter: 2 Epochen, 512 Token pro Eintrag, 2e-5 LR</p>
53
+
54
+ Evaluationsergebnisse:
55
+ Balanced Accuracy: 0.749
56
+ (Accuracy: 0.867)
57
+ Binary F1-Score: 0.602
58
+ Binary Precision: 0.642
59
+ Binary Recall: 0.567
60
+ MCC score: 0.524
61
+ AUROC score: 0.749
62
+ </div>
63
  </center>
64
+ </div>
65
+ </div>
66
  """
67
+ # <p>Dataset: germeval18_hasoc19_rp21_glasebach22_combi_dataset_no-addr.csv <br/> (84.239 Einträge mit einem Hassrede-Anteil von 18,2%)</p>
68
+ article = """Die Eingaben werden nicht geloggt. Klassifikator einfach ausprobieren.
69
+ Unter dem Button 'Ersteller' kann inspiziert werden, welche Satz-Bestandteile für die Modelle vermutlich entscheidend waren.
70
+ Dabei werden automatisiert Satzteile verändert und die Auswirkungen auf die jeweils abgefragten Predictions beobachtet."""
71
 
72
+ input_sentence_text = gr.inputs.Textbox(lines=5, placeholder="Geben Sie hier den Satz ein, der von den Modellen auf Hassrede geprüft werden soll.")
73
+ output_predictions = [gr.outputs.Label(label="Prediction of initial model", num_top_classes=1), gr.outputs.Label(label="Prediction of challenging model", num_top_classes=1)]
74
+ ui = gr.Interface(fn=model_inference, inputs=input_sentence_text, outputs=output_predictions, title=title, article=article, description=description, interpretation="default",
75
+ flagging_options=["incorrect", "ambiguous", "other"])
76
+ ui.launch(enable_queue=True)
gradio_queue.db ADDED
File without changes
gradio_queue.db-journal ADDED
Binary file (512 Bytes). View file
 
{result/model → model_1}/keras_metadata.pb RENAMED
File without changes
{result/model → model_1}/saved_model.pb RENAMED
File without changes
{result/model → model_1}/variables/variables.data-00000-of-00001 RENAMED
File without changes
{result/model → model_1}/variables/variables.index RENAMED
File without changes
model_2/keras_metadata.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8af43660950e7ee4747371bb148060c46c696cf2141dcccfa48b02fe15d51f6
3
+ size 154814
model_2/saved_model.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f32916473730c6bcf6302fc51828059364bc2d8ccc5d19fac4f633ac47f15073
3
+ size 6564579
model_2/variables/variables.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:338d6f22e3fa2f63a391c300d0ba96774d08ff685986f1006c55177d11656004
3
+ size 1319386304
model_2/variables/variables.index ADDED
Binary file (40.7 kB). View file