sonoisa committed
Commit 54967e5 · 1 Parent(s): ee8fecd

Create README.md

Files changed (1)
  1. README.md +67 -0
README.md ADDED
@@ -0,0 +1,67 @@
---
language: ja
license: cc-by-sa-4.0
tags:
- sentence-transformers
- sentence-t5
- feature-extraction
- sentence-similarity
---

This is a Japanese sentence-T5 model.

It was built on the pretrained model [sonoisa/t5-base-japanese](https://huggingface.co/sonoisa/t5-base-japanese).
Running inference requires sentencepiece (`pip install sentencepiece`).
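
As a quick sanity check of the sentencepiece requirement, the sketch below (an illustrative addition, not part of the original README) loads only the tokenizer and tokenizes a sample sentence; `T5Tokenizer` raises an error at load time if sentencepiece is missing.

```python
# Minimal check, assuming this model repository and an arbitrary sample sentence:
# loading T5Tokenizer fails if the sentencepiece package is not installed.
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("sonoisa/sentence-t5-base-ja-mean-tokens")
print(tokenizer.tokenize("暴走したAI"))  # SentencePiece subword tokens
```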

# Usage

```python
from transformers import T5Tokenizer, T5Model
import torch


class SentenceT5:
    def __init__(self, model_name_or_path, device=None):
        self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)
        # Only the T5 encoder is needed to compute sentence embeddings.
        self.model = T5Model.from_pretrained(model_name_or_path).encoder
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        # First element of model_output contains all token embeddings.
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    @torch.no_grad()
    def encode(self, sentences, batch_size=8):
        all_embeddings = []
        iterator = range(0, len(sentences), batch_size)
        for batch_idx in iterator:
            batch = sentences[batch_idx:batch_idx + batch_size]

            encoded_input = self.tokenizer.batch_encode_plus(batch, padding="longest",
                                                             truncation=True, return_tensors="pt").to(self.device)
            model_output = self.model(**encoded_input)
            sentence_embeddings = self._mean_pooling(model_output, encoded_input["attention_mask"]).to('cpu')

            all_embeddings.extend(sentence_embeddings)

        return torch.stack(all_embeddings)


MODEL_NAME = "sonoisa/sentence-t5-base-ja-mean-tokens"
model = SentenceT5(MODEL_NAME)

sentences = ["暴走したAI", "暴走した人工知能"]  # "runaway AI", "runaway artificial intelligence"
sentence_embeddings = model.encode(sentences, batch_size=8)

print("Sentence embeddings:", sentence_embeddings)
```
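
Since the model is tagged for sentence-similarity, the embeddings returned by `encode` can be compared with cosine similarity. The sketch below is not part of the original example; it assumes the `sentence_embeddings` tensor computed above.

```python
import torch.nn.functional as F

# Cosine similarity between the two example sentences
# ("暴走したAI" vs. "暴走した人工知能"); values closer to 1.0 mean more similar.
similarity = F.cosine_similarity(sentence_embeddings[0], sentence_embeddings[1], dim=0)
print("Cosine similarity:", similarity.item())
```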