Doron Adler committed
Commit • b6c25db
1 Parent(s): c8ddd98

Updated model

Files changed:
- app.py +7 -5
- model/added_tokens.json +0 -0
- model/config.json +1 -1
- model/merges.txt +0 -0
- model/pytorch_model.bin +1 -1
- model/special_tokens_map.json +0 -0
- model/tokenizer.json +0 -0
- model/tokenizer_config.json +3 -2
- model/vocab.json +0 -0
app.py
CHANGED
@@ -1,8 +1,10 @@
 # -*- coding: utf-8 -*-
 
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
 import argparse
 import re
-import os
 
 import streamlit as st
 import random
@@ -11,8 +13,6 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import tokenizers
 
-#os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
 random.seed(None)
 suggested_text_list = ['ืืฉื ืืืคืืข ืืื','ืงืืื ืฉืืคื ืืช','ืคืขื ืืืช ืืคื ื ืฉื ืื ืจืืืช', 'ืืืจื ืคืืืจ ืืืื ืืืื ื ืืื', 'ืืื ืืคืจืชื ืืช ืื ืืืื ืืืงืก ืืฉ']
 
@@ -24,7 +24,9 @@ def load_model(model_name):
 
 def extend(input_text, max_size=20, top_k=50, top_p=0.95, temperature=0.7):
     if len(input_text) == 0:
-        input_text = ""
+        input_text = "<|startoftext|>"
+    else:
+        input_text = "<|startoftext|>" + input_text
 
     encoded_prompt = tokenizer.encode(
         input_text, add_special_tokens=False, return_tensors="pt")
@@ -83,7 +85,7 @@ if __name__ == "__main__":
     model, tokenizer = load_model("./model")
 
     stop_token = "<|endoftext|>"
-    new_lines = "
+    new_lines = "<|pad|>"
 
     np.random.seed(None)
     random_seed = np.random.randint(10000,size=1)
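For reference, here is a minimal sketch of how the updated extend() path is driven end to end. It is not part of the commit: the generate() arguments, the trim at the stop token, and the replacement of the <|pad|> marker stored in new_lines with line breaks are assumptions about the parts of app.py the diff does not show.

# Hedged sketch of the updated prompt handling in app.py; the generation
# settings and post-processing are assumptions, not taken from this commit.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("./model")
model = AutoModelForCausalLM.from_pretrained("./model")

stop_token = "<|endoftext|>"
new_lines = "<|pad|>"

def extend(input_text, max_size=20, top_k=50, top_p=0.95, temperature=0.7):
    # As in the diff: always prefix the prompt with <|startoftext|>
    if len(input_text) == 0:
        input_text = "<|startoftext|>"
    else:
        input_text = "<|startoftext|>" + input_text

    encoded_prompt = tokenizer.encode(
        input_text, add_special_tokens=False, return_tensors="pt")
    output = model.generate(                      # assumed sampling call
        encoded_prompt,
        max_length=encoded_prompt.shape[-1] + max_size,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        pad_token_id=tokenizer.pad_token_id,
    )
    text = tokenizer.decode(output[0], skip_special_tokens=False)
    if stop_token in text:                        # trim at <|endoftext|>
        text = text[: text.index(stop_token)]
    # Assumed post-processing: drop the start marker, map <|pad|> to newlines
    return text.replace("<|startoftext|>", "").replace(new_lines, "\n")

print(extend("שלום"))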
model/added_tokens.json
CHANGED
File without changes
model/config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "Norod78/hebrew-gpt_neo-small",
   "activation_function": "gelu_new",
   "architectures": [
     "GPTNeoForCausalLM"
model/merges.txt
CHANGED
File without changes
model/pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:25bdb066c638e0f8e2da703f4ecd9223448fcf282fd46e354a47e817656af8e5
 size 551197393
model/special_tokens_map.json
CHANGED
File without changes
model/tokenizer.json
CHANGED
File without changes
model/tokenizer_config.json
CHANGED
@@ -7,7 +7,7 @@
   "errors": "replace",
   "full_tokenizer_file": null,
   "max_len": 1024,
-  "name_or_path": "
+  "name_or_path": "Norod78/hebrew-gpt_neo-small",
   "pad_token": "<|pad|>",
   "special_tokens_map_file": "special_tokens_map.json",
   "tokenizer_class": "GPT2Tokenizer",
@@ -18,5 +18,6 @@
     "normalized": true,
     "rstrip": false,
     "single_word": false
-  }
+  },
+  "unknown_token": "<|unknown|>"
 }
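As a quick sanity check of the tokenizer-side changes above, the short hedged sketch below loads the updated files and prints the special tokens that app.py relies on; the local "./model" path and the expectation that each marker maps to a single id are assumptions, not something this commit states.

# Hedged sketch: inspect the special tokens after this tokenizer_config.json update.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./model")  # assumed local layout

print(tokenizer.pad_token)  # "<|pad|>" per the updated tokenizer_config.json
for marker in ("<|startoftext|>", "<|endoftext|>", "<|pad|>"):
    ids = tokenizer.encode(marker, add_special_tokens=False)
    # If the marker is registered (added_tokens.json / special_tokens_map.json),
    # it should encode to a single id and decode back unchanged.
    print(marker, ids, tokenizer.decode(ids))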
model/vocab.json
CHANGED
File without changes