Start-GPT committed on
Commit
017380c
·
verified ·
1 Parent(s): 41aeee5

Create token_processing.py

Browse files
Files changed (1) hide show
  1. server/utils/token_processing.py +48 -0
server/utils/token_processing.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from transformers.tokenization_bert import BertTokenizer
3
+ from .f import flatten_, assoc, memoize, GetAttr
4
+
5
+ from typing import List
6
+
7
def fix_byte_spaces(toks: List[str]) -> List[str]:
    """Replace byte-level marker characters in each token.

    Every U+0120 becomes a plain space and every U+010A becomes the
    two-character sequence backslash + "n" (not a real newline).

    Args:
        toks: token strings to clean.

    Returns:
        A new list with the substitutions applied to each token.
    """
    # One translation table handles both substitutions in a single pass;
    # neither replacement contains a source character, so order is irrelevant.
    marker_table = str.maketrans({"\u0120": " ", "\u010A": "\\n"})
    return [tok.translate(marker_table) for tok in toks]
9
+
10
@memoize
def get_bpe(bpe_pretrained_name_or_path):
    """Load a ``BertTokenizer`` for the given pretrained name or path.

    The function is wrapped in ``memoize`` from the local ``f`` module, which
    presumably caches the tokenizer per argument -- confirm in that module.
    """
    tokenizer = BertTokenizer.from_pretrained(bpe_pretrained_name_or_path)
    return tokenizer
13
+
14
+ # [String] -> [String]
15
def remove_CLS_SEP(toks):
    """Return the tokens with every "[CLS]" and "[SEP]" entry removed.

    Args:
        toks: iterable of token strings.

    Returns:
        A new list holding the remaining tokens in their original order.
    """
    # Build the lookup set once instead of once per token.
    special_tokens = {"[CLS]", "[SEP]"}
    return [t for t in toks if t not in special_tokens]
17
+
18
+ # torch.Tensor -> np.Array
19
def process_hidden_tensors(t):
    """Trim a hidden-state tensor and return it as a NumPy array.

    Steps, in order:
      1. ``squeeze(0)`` removes the leading axis when it has size 1
         (presumably the batch axis -- confirm against the caller).
      2. ``[:-1]`` drops the last entry along the resulting leading axis.
      3. ``[1:-1]`` drops the first entry and the new last entry.

    Net effect: the first entry and the last two entries along the leading
    axis are removed. NOTE(review): the original comments attributed these to
    the special tokens around the first sentence but were themselves unsure
    ("??"); verify against the tokenizer output layout.

    Args:
        t: a ``torch.Tensor``; it may require grad and may live on any device.

    Returns:
        ``np.ndarray`` holding the trimmed values.
    """
    # Drop the size-1 leading axis, then the trailing entry.
    trimmed = t.squeeze(0)[:-1]

    # Drop the first entry and the new trailing entry.
    trimmed = trimmed[1:-1]

    # detach() replaces the legacy ``.data`` accessor; cpu() lets the
    # conversion succeed for tensors that are not already in host memory.
    return trimmed.detach().cpu().numpy()
34
+
35
+
36
+ # np.Array -> np.Array
37
def normalize(a):
    """Scale each vector along the last axis to unit Euclidean norm.

    All-zero vectors are returned unchanged (as zeros) rather than producing
    NaN through a division by zero.

    Args:
        a: array-like whose last axis holds the vectors to normalize.

    Returns:
        ``np.ndarray`` with the same shape as ``a``.
    """
    norms = np.linalg.norm(a, axis=-1, keepdims=True)
    # Swap zero norms for 1 so zero vectors divide cleanly and stay zero.
    safe_norms = np.where(norms == 0, 1, norms)
    return a / safe_norms
41
+
42
+
43
+ # np.Array:<a,b,c,d> -> np.Array<a,b,c*d>
44
def reshape(a):
    """Merge the final two axes of a NumPy array into a single axis.

    An array of shape ``(..., c, d)`` comes back with shape ``(..., c * d)``;
    all leading axes are left as they are.
    """
    dims = a.shape
    leading = dims[:-2]
    merged = dims[-2] * dims[-1]
    return np.reshape(a, (*leading, merged))