Fix no white space when using stream_chat with fast tokenizer (#9)
Fix no white space when using stream_chat with fast tokenizer (065ad5414c08dd04c70abaf3f8f58968cdf84dcb)
Co-authored-by: Shuhao Xing <x54-729@users.noreply.huggingface.co>
tokenization_internlm2_fast.py CHANGED
@@ -56,14 +56,14 @@ class InternLM2Converter(SpmConverter):
         return unk_id
 
     def decoder(self, replacement, add_prefix_space):
-        return decoders.Sequence(
-            [
-                decoders.Replace("▁", " "),
-                decoders.ByteFallback(),
-                decoders.Fuse(),
-                decoders.Strip(content=" ", left=1),
-            ]
-        )
+        decoders_sequence = [
+            decoders.Replace("▁", " "),
+            decoders.ByteFallback(),
+            decoders.Fuse(),
+        ]
+        if self.proto.normalizer_spec.add_dummy_prefix:
+            decoders_sequence.append(decoders.Strip(content=" ", left=1))
+        return decoders.Sequence(decoders_sequence)
 
     def tokenizer(self, proto):
         model_type = proto.trainer_spec.model_type