luisespinosa commited on
Commit
7a0cada
โ€ข
1 Parent(s): 1f8684f

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +29 -19
README.md CHANGED
@@ -1,3 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  # twitter-XLM-roBERTa-base for Sentiment Analysis
2
 
3
 This is an XLM-roBERTa-base model trained on ~198M tweets and finetuned for sentiment analysis in
@@ -5,7 +17,16 @@ This is a XLM-roBERTa-base model trained on ~198M tweets and finetuned for senti
5
  - Paper: [XLM-T: A Multilingual Language Model Toolkit for Twitter](https://...).
6
  - Git Repo: [Tweeteval official repository](https://github.com/cardiffnlp/xlm-t).
7
 
8
- ## Example of classification
 
 
 
 
 
 
 
 
 
9
 
10
  ```python
11
  from transformers import AutoModelForSequenceClassification
@@ -13,32 +34,20 @@ from transformers import TFAutoModelForSequenceClassification
13
  from transformers import AutoTokenizer
14
  import numpy as np
15
  from scipy.special import softmax
16
- import csv
17
- import urllib.request
18
 
19
  # Preprocess text (username and link placeholders)
20
  def preprocess(text):
21
  new_text = []
22
-
23
-
24
  for t in text.split(" "):
25
  t = '@user' if t.startswith('@') and len(t) > 1 else t
26
  t = 'http' if t.startswith('http') else t
27
  new_text.append(t)
28
  return " ".join(new_text)
29
 
 
30
 
31
- MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"
32
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
33
-
34
- # download label mapping
35
- labels=[]
36
- mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
37
- with urllib.request.urlopen(mapping_link) as f:
38
- html = f.read().decode('utf-8').split("\n")
40
- csvreader = csv.reader(html, delimiter='\t')
41
- labels = [row[1] for row in csvreader if len(row) > 1]
42
 
43
  # PT
44
  model = AutoModelForSequenceClassification.from_pretrained(MODEL)
@@ -61,10 +70,11 @@ scores = softmax(scores)
61
  # scores = output[0][0].numpy()
62
  # scores = softmax(scores)
63
 
 
64
  ranking = np.argsort(scores)
65
  ranking = ranking[::-1]
66
  for i in range(scores.shape[0]):
67
- l = labels[ranking[i]]
68
  s = scores[ranking[i]]
69
  print(f"{i+1}) {l} {np.round(float(s), 4)}")
70
 
@@ -73,8 +83,8 @@ for i in range(scores.shape[0]):
73
  Output:
74
 
75
  ```
76
- 1) positive 0.76726073
77
- 2) neutral 0.201
78
- 3) negative 0.0312
79
  ```
80
 
 
1
+ ---
2
+ language: multilingual
3
+ widget:
4
+ - text: "T'estimo!"
5
+ - text: "I love you!"
6
+ - text: "I hate you"
7
+ - text: "Mahal kita!"
8
+ - text: "์‚ฌ๋ž‘ํ•ด!"
9
+ - text: "๋‚œ ๋„ˆ๊ฐ€ ์‹ซ์–ด"
10
+ ---
11
+
12
+
13
  # twitter-XLM-roBERTa-base for Sentiment Analysis
14
 
15
 This is an XLM-roBERTa-base model trained on ~198M tweets and finetuned for sentiment analysis in
 
17
  - Paper: [XLM-T: A Multilingual Language Model Toolkit for Twitter](https://...).
18
  - Git Repo: [Tweeteval official repository](https://github.com/cardiffnlp/xlm-t).
19
 
20
+ ## Example Pipeline
21
+ ```python
22
+ from transformers import pipeline
23
+ model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
24
+ sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)
25
+
26
+ sentiment_task("T'estimo!")
27
+ ```
28
+
29
+ ## Full classification example
30
 
31
  ```python
32
  from transformers import AutoModelForSequenceClassification
 
34
 from transformers import AutoTokenizer
+ from transformers import AutoConfig
35
  import numpy as np
36
  from scipy.special import softmax
 
 
37
 
38
  # Preprocess text (username and link placeholders)
39
  def preprocess(text):
40
  new_text = []
 
 
41
  for t in text.split(" "):
42
  t = '@user' if t.startswith('@') and len(t) > 1 else t
43
  t = 'http' if t.startswith('http') else t
44
  new_text.append(t)
45
  return " ".join(new_text)
46
 
47
+ MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
48
 
 
49
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
50
+ config = AutoConfig.from_pretrained(MODEL)
 
 
 
 
 
 
 
 
51
 
52
  # PT
53
  model = AutoModelForSequenceClassification.from_pretrained(MODEL)
 
70
  # scores = output[0][0].numpy()
71
  # scores = softmax(scores)
72
 
73
+ # Print labels and scores
74
  ranking = np.argsort(scores)
75
  ranking = ranking[::-1]
76
  for i in range(scores.shape[0]):
77
+ l = config.id2label[ranking[i]]
78
  s = scores[ranking[i]]
79
  print(f"{i+1}) {l} {np.round(float(s), 4)}")
80
 
 
83
  Output:
84
 
85
  ```
86
+ 1) Positive 0.7673
87
+ 2) Neutral 0.2015
88
+ 3) Negative 0.0313
89
  ```
90