# Multilingual Joint Fine-tuning of Transformer models for identifying Trolling, Aggression and Cyberbullying at TRAC 2020

Models and predictions for our submission to TRAC 2020, the Second Workshop on Trolling, Aggression and Cyberbullying.

Our approach is described in our paper:

> Mishra, Sudhanshu, Shivangi Prasad, and Shubhanshu Mishra. 2020. "Multilingual Joint Fine-Tuning of Transformer Models for Identifying Trolling, Aggression and Cyberbullying at TRAC 2020." In Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying (TRAC-2020).

The source code for training this model and more details can be found in our code repository: https://github.com/socialmediaie/TRAC2020

NOTE: These models were retrained for uploading here after our submission, so the evaluation measures may differ slightly from those reported in the paper.

If you plan to use the dataset, please cite the following resources:

* Mishra, Sudhanshu, Shivangi Prasad, and Shubhanshu Mishra. 2020. "Multilingual Joint Fine-Tuning of Transformer Models for Identifying Trolling, Aggression and Cyberbullying at TRAC 2020." In Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying (TRAC-2020).
* Mishra, Sudhanshu, Shivangi Prasad, and Shubhanshu Mishra. 2020. "Trained Models for Multilingual Joint Fine-Tuning of Transformer Models for Identifying Trolling, Aggression and Cyberbullying at TRAC 2020." University of Illinois at Urbana-Champaign. https://doi.org/10.13012/B2IDB-8882752_V1.

```bibtex
@inproceedings{Mishra2020TRAC,
    author = {Mishra, Sudhanshu and Prasad, Shivangi and Mishra, Shubhanshu},
    booktitle = {Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying (TRAC-2020)},
    title = {{Multilingual Joint Fine-tuning of Transformer models for identifying Trolling, Aggression and Cyberbullying at TRAC 2020}},
    year = {2020}
}

@data{illinoisdatabankIDB-8882752,
    author = {Mishra, Sudhanshu and Prasad, Shivangi and Mishra, Shubhanshu},
    doi = {10.13012/B2IDB-8882752_V1},
    publisher = {University of Illinois at Urbana-Champaign},
    title = {{Trained models for Multilingual Joint Fine-tuning of Transformer models for identifying Trolling, Aggression and Cyberbullying at TRAC 2020}},
    url = {https://doi.org/10.13012/B2IDB-8882752{\_}V1},
    year = {2020}
}
```

## Usage

The models can be used via the following code:

```python
from pathlib import Path

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Label sets for each TRAC 2020 sub-task:
# OAG = Overtly Aggressive, CAG = Covertly Aggressive, NAG = Non-Aggressive;
# GEN = Gendered, NGEN = Non-Gendered.
TASK_LABEL_IDS = {
    "Sub-task A": ["OAG", "NAG", "CAG"],
    "Sub-task B": ["GEN", "NGEN"],
    "Sub-task C": ["OAG-GEN", "OAG-NGEN", "NAG-GEN", "NAG-NGEN", "CAG-GEN", "CAG-NGEN"],
}

model_version = "databank"  # the other option is to load from the Hugging Face model hub

if model_version == "databank":
    # Make sure you have downloaded the required model file from
    # https://databank.illinois.edu/datasets/IDB-8882752 and unzipped it
    # at some model_path (we are using: "databank_model").
    # We assume the following directory structure inside "databank_model":
    # 'databank_model/ALL/Sub-task C/output/bert-base-multilingual-uncased/model'
    model_path = next(Path("databank_model").glob("*/*/output/*/model"))
    _, lang, task, _, base_model, _ = model_path.parts
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForSequenceClassification.from_pretrained(str(model_path))
else:
    lang, task, base_model = "ALL", "Sub-task C", "bert-base-multilingual-uncased"
    # Repository IDs follow the pattern <lang>_<sub-task letter>_<base model>;
    # check the socialmediaie hub page for the exact repository name.
    base_model = f"socialmediaie/{lang}_{task.split()[-1]}_{base_model}"
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForSequenceClassification.from_pretrained(base_model)

# Put the model in eval mode for inference
model.eval()

task_labels = TASK_LABEL_IDS[task]

# Prepend the [CLS] token manually, since tokenize() does not add special tokens
sentence = "This is a good cat and this is a bad dog."
processed_sentence = f"{tokenizer.cls_token} {sentence}"
tokens = tokenizer.tokenize(processed_sentence)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
tokens_tensor = torch.tensor([indexed_tokens])

with torch.no_grad():
    # Index 0 of the model output holds the classification logits,
    # for both tuple and ModelOutput return types.
    logits = model(tokens_tensor)[0]

preds = logits.detach().cpu().numpy()
preds_probs = softmax(preds, axis=1)
preds = np.argmax(preds_probs, axis=1)
preds_labels = np.array(task_labels)[preds]
print(dict(zip(task_labels, preds_probs[0])), preds_labels)
"""You should get an output similar to the following:

({'CAG-GEN': 0.06762535,
  'CAG-NGEN': 0.03244293,
  'NAG-GEN': 0.6897794,
  'NAG-NGEN': 0.15498641,
  'OAG-GEN': 0.034373745,
  'OAG-NGEN': 0.020792078},
 array(['NAG-GEN'], dtype='<U8'))
"""
```
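
For scoring several sentences at once, the same steps can be wrapped in a small batched helper. The sketch below is our own addition, not part of the original release: the `predict_batch` name, the padding/truncation settings, and the second example sentence are assumptions, and it relies on a transformers version whose tokenizers can be called directly on a list of sentences (which also adds the [CLS]/[SEP] special tokens, so no manual prepending is needed).

```python
def predict_batch(sentences, model, tokenizer, task_labels):
    """Return a (label, probability) pair for each input sentence."""
    model.eval()
    # Tokenize the whole batch; padding/truncation make the tensors rectangular,
    # and special tokens ([CLS], [SEP]) are added automatically.
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**encoded)[0]
    probs = softmax(logits.detach().cpu().numpy(), axis=1)
    idxs = np.argmax(probs, axis=1)
    return [(task_labels[i], float(row[i])) for i, row in zip(idxs, probs)]

# Hypothetical usage with the model, tokenizer, and task_labels loaded above
print(predict_batch(
    ["This is a good cat and this is a bad dog.", "You are all wonderful people."],
    model, tokenizer, task_labels,
))
```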