---
license: apache-2.0
---

A Korean pre-trained crypto RoBERTa model, fine-tuned on a BTC sentiment classification dataset.

For more details, see our paper *CBITS: Crypto BERT Incorporated Trading System*, published in IEEE Access.

## Example Use Case: BTC Sentiment Classification

```python
import torch
import torch.nn as nn

from tokenization_roberta_spm import FairSeqRobertaSentencePieceTokenizer
from transformers import XLMRobertaForSequenceClassification

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned classifier (3 labels) and put it in inference mode.
model = XLMRobertaForSequenceClassification.from_pretrained("axiomlabs/KR-cryptoroberta-base", num_labels=3)
model.eval()
model.to(device)

tokenizer = FairSeqRobertaSentencePieceTokenizer.from_pretrained("fairseq-roberta-all-model")

# Example Korean crypto news headline and article body.
title = "์šฐ์ฆˆ๋ฒก, ์™ธ๊ตญ๊ธฐ์—…์˜ ์•”ํ˜ธํ™”ํ ๊ฑฐ๋ž˜์ž๊ธˆ ๊ตญ๋‚ด๊ณ„์ขŒ ์ž…๊ธˆ ํ—ˆ์šฉ"
content = "๋น„ํŠธ์ฝ”์ธ๋‹ท์ปด์— ๋”ฐ๋ฅด๋ฉด ์šฐ์ฆˆ๋ฒ ํ‚ค์Šคํƒ„ ์ค‘์•™์€ํ–‰์ด ์™ธ๊ตญ๊ธฐ์—…์˜ ๊ตญ๋‚ด ์€ํ–‰ ๊ณ„์ขŒ ๊ฐœ์„ค ๋ฐ ์•”ํ˜ธํ™”ํ ๊ฑฐ๋ž˜ ์ž๊ธˆ ์ž…๊ธˆ์„ ํ—ˆ์šฉํ–ˆ๋‹ค. ์•ž์„œ ์šฐ์ฆˆ๋ฒ ํ‚ค์Šคํƒ„์€ ์™ธ๊ตญ๊ธฐ์—…์˜ ์€ํ–‰ ๊ณ„์ขŒ ๊ฐœ์„ค ๋“ฑ์„ ์ œํ•œ ๋ฐ ๊ธˆ์ง€ํ•œ ๋ฐ” ์žˆ๋‹ค. ๊ฐœ์ •์•ˆ์— ๋”ฐ๋ผ ์ด๋Ÿฌํ•œ ์ž๊ธˆ์€ ์•”ํ˜ธํ™”ํ ๋งค์ž…์„ ์œ„ํ•ด ๊ฑฐ๋ž˜์†Œ๋กœ ์ด์ฒด, ํ˜น์€ ์ž๊ธˆ์ด ์œ ์ž…๋œ ๊ด€ํ• ๊ถŒ ๋‚ด ๋“ฑ๋ก๋œ ๋ฒ•์ธ ๊ณ„์ขŒ๋กœ ์ด์ฒดํ•  ์ˆ˜ ์žˆ๋‹ค. ๋‹ค๋งŒ ๊ทธ ์™ธ ๋‹ค๋ฅธ ๋ชฉ์ ์„ ์œ„ํ•œ ์‚ฌ์šฉ์€ ๊ธˆ์ง€๋œ๋‹ค. ํ•ด๋‹น ๊ฐœ์ •์•ˆ์€ ์ง€๋‚œ 2์›” 9์ผ ๋ฐœํšจ๋๋‹ค."

# Encode title and content as a sentence pair.
encoded_input = tokenizer(str(title), str(content), max_length=512, padding="max_length", truncation=True, return_tensors="pt").to(device)

with torch.no_grad():
    output = model(**encoded_input).logits
    output = nn.Softmax(dim=1)(output)
    output = output.detach().cpu().numpy()[0]
    # Label order: ํ˜ธ์žฌ = positive, ์•…์žฌ = negative, ์ค‘๋ฆฝ = neutral
    print("ํ˜ธ์žฌ: {:.2f}% | ์•…์žฌ: {:.2f}% | ์ค‘๋ฆฝ: {:.2f}%".format(output[0]*100, output[1]*100, output[2]*100))
```

## Example Use Case: Crypto Embedding Similarity

```python
import torch
from scipy.spatial.distance import cdist

from tokenization_roberta_spm import FairSeqRobertaSentencePieceTokenizer
from transformers import AutoModel

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the base encoder (no classification head) to extract document embeddings.
model = AutoModel.from_pretrained("axiomlabs/KR-cryptoroberta-base")
model.eval()
model.to(device)

tokenizer = FairSeqRobertaSentencePieceTokenizer.from_pretrained("fairseq-roberta-all-model")

# Two related Korean crypto news articles (title + body).
title1 = "USDN ๋‹ค์ค‘๋‹ด๋ณด ์ž์‚ฐ ์ „ํ™˜ ์ œ์•ˆ ํ†ต๊ณผ"
content1 = "์›จ์ด๋ธŒ ์ƒํƒœ๊ณ„ ์Šคํ…Œ์ด๋ธ”์ฝ”์ธ USDN์„ ๋‹ค์ค‘๋‹ด๋ณด ์ž์‚ฐ์œผ๋กœ ์ „ํ™˜ํ•˜๋Š” ์ œ์•ˆ ํˆฌํ‘œ๊ฐ€ ์ฐฌ์„ฑ 99%๋กœ ์˜ค๋Š˜ ํ†ต๊ณผ๋๋‹ค. ์•ž์„œ ์ฝ”์ธ๋‹ˆ์Šค๋Š” ์›จ๋ธŒ๊ฐ€ $WX,$SWOP,$VIRES,$EGG,$WEST๋ฅผ ๋‹ด๋ณด๋กœ ํ•ด USDN์„ ์›จ์ด๋ธŒ ์ƒํƒœ๊ณ„ ์ธ๋ฑ์Šค ์ž์‚ฐ์œผ๋กœ ๋งŒ๋“ค์–ด USDN ๋””ํŽ˜๊น… ์ด์Šˆ๋ฅผ ํ•ด๊ฒฐํ•  ํ”Œ๋žœ์„ ๊ณต๊ฐœํ–ˆ๋‹ค๊ณ  ์ „ํ•œ ๋ฐ” ์žˆ๋‹ค."

title2 = "์›จ์ด๋ธŒ, USDN ๊ณ ๋ž˜ ์ฒญ์‚ฐ์•ˆ ํˆฌํ‘œ ํ†ต๊ณผ๋กœ 30%โ†‘"
content2 = "์œ ํˆฌ๋ฐ์ด์— ๋”ฐ๋ฅด๋ฉด ์›จ์ด๋ธŒ(WAVES) ๊ธฐ๋ฐ˜ ์•Œ๊ณ ๋ฆฌ์ฆ˜ ์Šคํ…Œ์ด๋ธ”์ฝ”์ธ ๋‰ดํŠธ๋ฆฌ๋…ธ(USDN)์˜ ๋””ํŽ˜๊ทธ ๋ฐœ์ƒ ์—†์ด ๋Œ€๊ทœ๋ชจ USDN ํฌ์ง€์…˜ ์ฒญ์‚ฐ์„ ๊ฐ€๋Šฅํ•˜๊ฒŒ ํ•˜๋Š” ํˆฌํ‘œ๊ฐ€ ๋งŒ์žฅ์ผ์น˜๋กœ ํ†ต๊ณผ ๋จ์— ๋”ฐ๋ผ WAVES๊ฐ€ ๋ช‡์‹œ๊ฐ„ ์•ˆ์— 30%๋Œ€ ์ƒ์Šนํญ์„ ๋‚˜ํƒ€๋ƒˆ๋‹ค. ์ง€๋‚œ 28์ผ ์›จ์ด๋ธŒ ํŒ€์ด ๋ฐœํ‘œํ•œ USDN์˜ ๋‹ฌ๋Ÿฌ ํŽ˜๊ทธ ํšŒ๋ณต ๊ณ„ํš์€ ๋‹ค์Œ๊ณผ ๊ฐ™๋‹ค.- ์ปค๋ธŒ ๋ฐ CRV ํ† ํฐ์œผ๋กœ USDN ์œ ๋™์„ฑ ๊ณต๊ธ‰.- ๊ณ ๋ž˜ ๊ณ„์ขŒ๋ฅผ ์ฒญ์‚ฐ์‹œ์ผœ Vires ์œ ๋™์„ฑ ๋ณต๊ตฌ.- USDN ๋‹ด๋ณด๋ฌผ์„ ๋‘๋‹ฌ์— ๊ฑธ์ณ ์ฒœ์ฒœํžˆ ํŒ๋งค.- ๋‰ดํŠธ๋ฆฌ๋…ธ ํ”„๋กœํ† ์ฝœ ์ž๋ณธ ์กฐ๋‹ฌ์„ ์œ„ํ•œ ์ƒˆ๋กœ์šด ํ† ํฐ ๋ฐœํ–‰."

encoded_input1 = tokenizer(str(title1), str(content1), max_length=512, padding="max_length", truncation=True, return_tensors="pt").to(device)
encoded_input2 = tokenizer(str(title2), str(content2), max_length=512, padding="max_length", truncation=True, return_tensors="pt").to(device)

with torch.no_grad():
    # Use the final hidden state of the first ([CLS]) token as the document embedding.
    emb1 = model(**encoded_input1)[0][:, 0, :].detach().cpu().numpy()
    emb2 = model(**encoded_input2)[0][:, 0, :].detach().cpu().numpy()
    sim_scores = cdist(emb1, emb2, "cosine")[0]
print(f"cosine distance = {sim_scores[0]}")
```