klebendes darkproger committed on
Commit
ef24f98
·
0 Parent(s):

Duplicate from darkproger/propaganda

Browse files

Co-authored-by: Volodymyr Ky <darkproger@users.noreply.huggingface.co>

Files changed (5) hide show
  1. .gitattributes +27 -0
  2. README.md +38 -0
  3. app.py +72 -0
  4. model.py +128 -0
  5. requirements.txt +6 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Propaganda
3
+ emoji: 📊
4
+ colorFrom: green
5
+ colorTo: red
6
+ sdk: streamlit
7
+ app_file: app.py
8
+ pinned: false
9
+ duplicated_from: darkproger/propaganda
10
+ ---
11
+
12
+ # Configuration
13
+
14
+ `title`: _string_
15
+ Display title for the Space
16
+
17
+ `emoji`: _string_
18
+ Space emoji (emoji-only character allowed)
19
+
20
+ `colorFrom`: _string_
21
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
22
+
23
+ `colorTo`: _string_
24
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
25
+
26
+ `sdk`: _string_
27
+ Can be either `gradio` or `streamlit`
28
+
29
+ `sdk_version` : _string_
30
+ Only applicable for `streamlit` SDK.
31
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
32
+
33
+ `app_file`: _string_
34
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
35
+ Path is relative to the root of the repository.
36
+
37
+ `pinned`: _boolean_
38
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from spacy import displacy
3
+ from spacy.tokens import Doc
4
+ from spacy.vocab import Vocab
5
+ from spacy_streamlit.util import get_html
6
+ import streamlit as st
7
+ import torch
8
+ from transformers import BertTokenizerFast
9
+
10
+ from model import BertForTokenAndSequenceJointClassification
11
+
12
+
13
# NOTE(review): `st.cache(allow_output_mutation=True)` is the legacy caching
# API (deprecated in newer Streamlit; `st.cache_resource` is the modern
# equivalent for models) — confirm the Space's pinned Streamlit version
# before migrating.
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the tokenizer and the joint propaganda model once per session.

    Returns:
        tuple: ``(tokenizer, model)`` — a ``BertTokenizerFast`` and a
        ``BertForTokenAndSequenceJointClassification`` loaded from the
        QCRI hub checkpoint (pinned to revision ``v0.1.0``).
    """
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
    model = BertForTokenAndSequenceJointClassification.from_pretrained(
        "QCRI/PropagandaTechniquesAnalysis-en-BERT",
        revision="v0.1.0")
    return tokenizer, model
# NOTE(review): the original file's indentation was lost in this diff view;
# everything below is assumed to run inside `torch.inference_mode` (the model
# call must, and the pattern of wrapping the whole script is common) —
# confirm against the deployed Space.
with torch.inference_mode(True):
    tokenizer, model = load_model()

    st.write("[Propaganda Techniques Analysis BERT](https://huggingface.co/QCRI/PropagandaTechniquesAnalysis-en-BERT) Tagger")

    # Renamed from `input` so the Python builtin is not shadowed.
    text = st.text_area('Input', """\
In some instances, it can be highly dangerous to use a medicine for the prevention or treatment of COVID-19 that has not been approved by or has not received emergency use authorization from the FDA.
""")

    # Run the joint token/sequence classifier on the raw text.
    inputs = tokenizer.encode_plus(text, return_tensors="pt")
    outputs = model(**inputs)
    token_class_index = torch.argmax(outputs.token_logits, dim=-1)
    # Drop the [CLS]/[SEP] special tokens at positions 0 and -1.
    tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0][1:-1])
    tags = [model.token_tags[i] for i in token_class_index[0].tolist()[1:-1]]

    # One metric column per sequence-level class, showing its raw logit.
    columns = st.columns(len(outputs.sequence_logits.flatten()))
    for col, sequence_tag, logit in zip(columns, model.sequence_tags, outputs.sequence_logits.flatten()):
        col.metric(sequence_tag, '%.2f' % logit.item())

    # A wordpiece starting with "##" glues onto the previous token, so the
    # previous entry's `spaces` flag must be False — hence the shift by one
    # and the trailing False for the last token.
    spaces = [not tok.startswith('##') for tok in tokens][1:] + [False]

    doc = Doc(Vocab(strings=set(tokens)),
              words=tokens,
              spaces=spaces,
              ents=[tag if tag == "O" else f"B-{tag}" for tag in tags])

    # Skip "<PAD>" and "O"; only real technique tags are selectable.
    labels = model.token_tags[2:]

    label_select = st.multiselect(
        "Tags",
        options=labels,
        default=labels,
        key="tags_ner_label_select",  # plain string; was an f-string with no placeholders
    )
    html = displacy.render(
        doc, style="ent", options={"ents": label_select, "colors": {}}
    )
    # Force entity marks inline so displacy's HTML flows with the text.
    style = "<style>mark.entity { display: inline-block }</style>"
    st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)

    # Tabular view of the selected entities.
    attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
    data = [
        [str(getattr(ent, attr)) for attr in attrs]
        for ent in doc.ents
        if ent.label_ in label_select
    ]
    if data:
        df = pd.DataFrame(data, columns=attrs)
        st.dataframe(df)
model.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __author__ = "Yifan Zhang (yzhang@hbku.edu.qa)"
2
+ __copyright__ = "Copyright (C) 2021, Qatar Computing Research Institute, HBKU, Doha"
3
+
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Optional, Tuple
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn.functional import sigmoid
10
+ from transformers import BertPreTrainedModel, BertModel
11
+ from transformers.file_utils import ModelOutput
12
+
13
+
14
# The 20 token-level labels. Index 0 is the padding label ("<PAD>",
# ignored by the training loss via ignore_index=0), index 1 is "outside"
# ("O"); the remaining 18 are propaganda-technique names.
TOKEN_TAGS = (
    "<PAD>", "O",
    "Name_Calling,Labeling", "Repetition", "Slogans", "Appeal_to_fear-prejudice", "Doubt",
    "Exaggeration,Minimisation", "Flag-Waving", "Loaded_Language",
    "Reductio_ad_hitlerum", "Bandwagon",
    "Causal_Oversimplification", "Obfuscation,Intentional_Vagueness,Confusion", "Appeal_to_Authority", "Black-and-White_Fallacy",
    "Thought-terminating_Cliches", "Red_Herring", "Straw_Men", "Whataboutism"
)


# Sentence-level labels for the binary propaganda/non-propaganda head.
SEQUENCE_TAGS = ("Non-prop", "Prop")
25
+
26
+
27
@dataclass
class TokenAndSequenceJointClassifierOutput(ModelOutput):
    """Output container for the joint token/sequence classifier.

    Mirrors the layout of transformers' ``*ClassifierOutput`` classes but
    carries the logits of both heads.
    """

    loss: Optional[torch.FloatTensor] = None  # combined joint loss; only set when labels are given
    token_logits: torch.FloatTensor = None  # gate-weighted per-token logits, (batch, seq_len, num_token_labels)
    sequence_logits: torch.FloatTensor = None  # sentence-level logits, (batch, num_sequence_labels)
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # from BERT, when requested
    attentions: Optional[Tuple[torch.FloatTensor]] = None  # from BERT, when requested
34
+
35
+
36
class BertForTokenAndSequenceJointClassification(BertPreTrainedModel):
    """BERT with two jointly trained heads.

    * a token-level head tagging each wordpiece with one of the 20
      ``TOKEN_TAGS`` (propaganda techniques), and
    * a sequence-level head classifying the whole input against
      ``SEQUENCE_TAGS`` ("Non-prop" / "Prop").

    The sequence logits drive a learned scalar gate (``masking_gate``)
    that scales the token logits, so token predictions are damped when
    the sentence-level head sees no propaganda.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_token_labels = 20
        self.num_sequence_labels = 2

        self.token_tags = TOKEN_TAGS
        self.sequence_tags = SEQUENCE_TAGS

        # Weight of the token loss vs. the sequence loss in the joint loss.
        self.alpha = 0.9

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # classifier[0]: token head; classifier[1]: sequence head.
        self.classifier = nn.ModuleList([
            nn.Linear(config.hidden_size, self.num_token_labels),
            nn.Linear(config.hidden_size, self.num_sequence_labels),
        ])
        # Maps the 2 sequence logits to one scalar gate value.
        self.masking_gate = nn.Linear(2, 1)

        self.init_weights()
        # Kept for checkpoint compatibility; not used in forward().
        self.merge_classifier_1 = nn.Linear(self.num_token_labels + self.num_sequence_labels, self.num_token_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=True,
    ):
        """Run BERT, both heads, and the gating; optionally compute the loss.

        Args:
            labels: per-token label ids (0 == "<PAD>" is ignored); when
                given, the combined loss is returned.
            return_dict: when falsy, return the legacy tuple instead of a
                ``TokenAndSequenceJointClassifierOutput``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        sequence_output = outputs[0]  # (batch, seq_len, hidden)
        pooler_output = outputs[1]    # (batch, hidden)

        sequence_output = self.dropout(sequence_output)
        token_logits = self.classifier[0](sequence_output)

        pooler_output = self.dropout(pooler_output)
        sequence_logits = self.classifier[1](pooler_output)

        # Learned scalar gate in (0, 1), broadcast over all token logits.
        gate = torch.sigmoid(self.masking_gate(sequence_logits))
        gates = gate.unsqueeze(1).repeat(1, token_logits.size()[1], token_logits.size()[2])
        weighted_token_logits = torch.mul(gates, token_logits)

        logits = [weighted_token_logits, sequence_logits]

        loss = None
        if labels is not None:
            criterion = nn.CrossEntropyLoss(ignore_index=0)  # 0 == "<PAD>"
            # BUG FIX: pos_weight was hard-coded to .cuda(), which crashed on
            # CPU-only machines; allocate it on the logits' device instead.
            binary_criterion = nn.BCEWithLogitsLoss(
                pos_weight=torch.tensor([3932 / 14263], device=sequence_logits.device))
            # (Removed `loss_fct = CrossEntropyLoss()`: the name was never
            # imported — a guaranteed NameError — and the result was unused.)
            weighted_token_logits = weighted_token_logits.view(-1, weighted_token_logits.shape[-1])
            sequence_logits = sequence_logits.view(-1, sequence_logits.shape[-1])

            token_loss = criterion(weighted_token_logits, labels)
            # Sequence target: "Prop" iff any token carries a non-pad,
            # non-"O"... NOTE(review): `label > 0` also counts "O" (index 1)
            # as propaganda, and BCEWithLogitsLoss expects a float target on
            # the logits' device — both look off; confirm against the
            # original training code before relying on this path.
            sequence_label = torch.LongTensor([1] if any(label > 0 for label in labels) else [0])
            sequence_loss = binary_criterion(sequence_logits, sequence_label)

            # BUG FIX: the original computed
            # `self.alpha*loss[0] + (1-self.alpha)*loss[1]`, indexing into
            # `loss = None` (TypeError). Combine the two computed losses.
            loss = self.alpha * token_loss + (1 - self.alpha) * sequence_loss

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenAndSequenceJointClassifierOutput(
            loss=loss,
            token_logits=weighted_token_logits,
            sequence_logits=sequence_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pandas
2
+ spacy
3
+ spacy_streamlit
4
+ streamlit
5
+ transformers
6
+ torch