Update README.md
README.md
CHANGED
@@ -3,4 +3,59 @@ license: apache-2.0
---

Korean Pre-Trained Crypto RoBERTa model, fine-tuned on a BTC sentiment classification dataset.

For more details, see our paper [CBITS: Crypto BERT Incorporated Trading System](https://ieeexplore.ieee.org/document/10014986) in IEEE Access.

## Example Use Case: BTC Sentiment Classification
```python
import torch
import torch.nn as nn

from tokenization_roberta_spm import FairSeqRobertaSentencePieceTokenizer
from transformers import XLMRobertaForSequenceClassification

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned 3-class sentiment classifier.
model = XLMRobertaForSequenceClassification.from_pretrained("axiomlabs/KR-cryptoroberta-base", num_labels=3)
model.eval()
model.to(device)

# SentencePiece tokenizer converted from the fairseq RoBERTa checkpoint.
tokenizer = FairSeqRobertaSentencePieceTokenizer.from_pretrained("fairseq-roberta-all-model")

# Sample Korean news item: "Uzbekistan allows foreign companies to deposit crypto trading funds into domestic accounts."
title = "우즈벡, 외국기업의 암호화폐 거래자금 국내계좌 입금 허용"
content = "비트코인닷컴에 따르면 우즈베키스탄 중앙은행이 외국기업의 국내 은행 계좌 개설 및 암호화폐 거래 자금 입금을 허용했다. 앞서 우즈베키스탄은 외국기업의 은행 계좌 개설 등을 제한 및 금지한 바 있다. 개정안에 따라 이러한 자금은 암호화폐 매입을 위해 거래소로 이체, 혹은 자금이 유입된 관할권 내 등록된 법인 계좌로 이체할 수 있다. 다만 그 외 다른 목적을 위한 사용은 금지된다. 해당 개정안은 지난 2월 9일 발효됐다."

# Encode the news title and body as a single (title, body) sequence pair.
encoded_input = tokenizer(str(title), str(content), max_length=512, padding="max_length", truncation=True, return_tensors="pt").to(device)

with torch.no_grad():
    output = model(**encoded_input).logits
    output = nn.Softmax(dim=1)(output)
    output = output.detach().cpu().numpy()[0]
    # 호재 = positive, 악재 = negative, 중립 = neutral
    print("호재: {:.2f}% | 악재: {:.2f}% | 중립: {:.2f}%".format(output[0]*100, output[1]*100, output[2]*100))
```
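
If you only need the discrete class rather than the probability breakdown, a minimal follow-up sketch (assuming the index-to-label order 호재/악재/중립 implied by the print statement above) is:

```python
import numpy as np

# Hypothetical label mapping, inferred from the print order in the example above.
labels = ["호재 (positive)", "악재 (negative)", "중립 (neutral)"]
print("predicted sentiment:", labels[int(np.argmax(output))])
```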

## Example Use Case: Crypto Embedding Similarity
```python
import torch
from scipy.spatial.distance import cdist

from tokenization_roberta_spm import FairSeqRobertaSentencePieceTokenizer
from transformers import AutoModel

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the base encoder (no classification head) to extract document embeddings.
model = AutoModel.from_pretrained("axiomlabs/KR-cryptoroberta-base")
model.eval()
model.to(device)

tokenizer = FairSeqRobertaSentencePieceTokenizer.from_pretrained("fairseq-roberta-all-model")

# Two related Korean news items about the USDN stablecoin on the Waves ecosystem.
title1 = "USDN 다중담보 자산 전환 제안 통과"
content1 = "웨이브 생태계 스테이블코인 USDN을 다중담보 자산으로 전환하는 제안 투표가 찬성 99%로 오늘 통과됐다. 앞서 코인니스는 웨이브가 $WX,$SWOP,$VIRES,$EGG,$WEST를 담보로 해 USDN을 웨이브 생태계 인덱스 자산으로 만들어 USDN 디페깅 이슈를 해결할 플랜을 공개했다고 전한 바 있다."

title2 = "웨이브, USDN 고래 청산안 투표 통과로 30%↑"
content2 = "유투데이에 따르면 웨이브(WAVES) 기반 알고리즘 스테이블코인 뉴트리노(USDN)의 디페그 발생 없이 대규모 USDN 포지션 청산을 가능하게 하는 투표가 만장일치로 통과 됨에 따라 WAVES가 몇시간 안에 30%나 상승폭을 나타냈다. 지난 28일 웨이브 팀이 발표한 USDN의 달러 페그 회복 계획은 다음과 같다.- 커브 및 CRV 토큰으로 USDN 유동성 공급.- 고래 계좌를 청산시켜 Vires 유동성 복구.- USDN 담보물을 두달에 걸쳐 천천히 판매.- 뉴트리노 프로토콜 자본 조달을 위한 새로운 토큰 발행."

encoded_input1 = tokenizer(str(title1), str(content1), max_length=512, padding="max_length", truncation=True, return_tensors="pt").to(device)
encoded_input2 = tokenizer(str(title2), str(content2), max_length=512, padding="max_length", truncation=True, return_tensors="pt").to(device)

with torch.no_grad():
    # Use the final hidden state of the first ([CLS]-style) token as the document embedding.
    emb1 = model(**encoded_input1)[0][:, 0, :].detach().cpu().numpy()
    emb2 = model(**encoded_input2)[0][:, 0, :].detach().cpu().numpy()

# cdist with the "cosine" metric returns cosine distance (1 - cosine similarity).
sim_scores = cdist(emb1, emb2, "cosine")[0]
print(f"cosine distance = {sim_scores[0]}")
```
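
Note that `cdist` with the `"cosine"` metric reports a distance rather than a similarity, so a short follow-up (a sketch reusing the `sim_scores` array from the block above) can convert it:

```python
# Cosine similarity = 1 - cosine distance; values close to 1 mean the two articles sit very near each other in embedding space.
cosine_similarity = 1.0 - sim_scores[0]
print(f"cosine similarity = {cosine_similarity:.4f}")
```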