Doron Adler committed on
Commit
363236f
β€’
1 Parent(s): 3437d8c

Hebrew text generator: Science Fiction and Fantasy (GPT-Neo)

Browse files
README.md CHANGED
@@ -1,10 +1,9 @@
1
  ---
2
- title: FantasyChildrenScifi-hebrew-gpt Neo-small
3
- emoji: πŸ’»
4
- colorFrom: red
5
- colorTo: purple
6
  sdk: streamlit
7
- sdk_version: 1.10.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
1
  ---
2
+ title: Hebrew GPT Neo - Science Fiction and Fantasy
3
+ emoji: πŸ§™β€β™€οΈ
4
+ colorFrom: yellow
5
+ colorTo: blue
6
  sdk: streamlit
 
7
  app_file: app.py
8
  pinned: false
9
  license: mit
app.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import argparse
4
+ import re
5
+ import os
6
+
7
+ import streamlit as st
8
+ import random
9
+ import numpy as np
10
+ import torch
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM
12
+ import tokenizers
13
+
14
+ #os.environ["TOKENIZERS_PARALLELISM"] = "false"
15
+
16
+ random.seed(None)
17
+ suggested_text_list = ['Χ”Χ©Χ“ Χ”Χ•Χ€Χ™Χ’ ΧžΧ•Χœ','Χ§ΧΧœΧ™ Χ©ΧœΧ€Χ” אΧͺ','׀גם אחΧͺ ΧœΧ€Χ Χ™ שנים Χ¨Χ‘Χ•Χͺ', 'הארי Χ€Χ•Χ˜Χ¨ Χ—Χ™Χ™Χš Χ—Χ™Χ•Χš Χ Χ‘Χ•Χš', 'ואז Χ”Χ€Χ¨ΧͺΧ™ אΧͺ Χ›Χœ Χ›ΧœΧœΧ™ Χ”Χ˜Χ§Χ‘ Χ›Χ©']
18
+
19
@st.cache(hash_funcs={tokenizers.Tokenizer: id, tokenizers.AddedToken: id})
def load_model(model_name):
    """Load and cache the causal-LM model and its tokenizer from *model_name*.

    Tokenizer objects are not hashable by Streamlit's cache, so they are
    hashed by object identity via ``hash_funcs``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return model, tokenizer
24
+
25
def extend(input_text, max_size=20, top_k=50, top_p=0.95):
    """Generate a text continuation of *input_text* with the loaded model.

    Relies on the module-level ``model``, ``tokenizer``, ``device``,
    ``stop_token`` and ``new_lines`` globals that are set up in ``__main__``.

    Args:
        input_text: Prompt to continue. May be empty (or None), in which
            case the model generates from scratch.
        max_size: Number of additional tokens to generate beyond the prompt.
        top_k: Top-k sampling cutoff.
        top_p: Nucleus (top-p) sampling cutoff.

    Returns:
        The prompt plus generated text, lightly cleaned for display, or a
        Hebrew error string if generation produced nothing.
    """
    # Guard against a None prompt; the original `if len(...) == 0` check was
    # a no-op (it reassigned "" to an already-empty string) and would have
    # crashed on None before reaching tokenizer.encode.
    if not input_text:
        input_text = ""

    encoded_prompt = tokenizer.encode(
        input_text, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(device)

    # An empty prompt encodes to zero tokens; pass None so the model
    # free-generates instead of conditioning on an empty tensor.
    if encoded_prompt.size()[-1] == 0:
        input_ids = None
    else:
        input_ids = encoded_prompt

    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_size + len(encoded_prompt[0]),
        top_k=top_k,
        top_p=top_p,
        do_sample=True,
        repetition_penalty=5.0,
        num_return_sequences=1)

    # Remove the batch dimension when returning multiple sequences
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []
    total_sequence = ""  # pre-bind so the post-loop cleanup is always defined

    for generated_sequence in output_sequences:
        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

        # Remove all text after the stop token
        text = text[: text.find(stop_token) if stop_token else None]

        # Remove all text after 3 newlines
        text = text[: text.find(new_lines) if new_lines else None]

        # Add the prompt at the beginning of the sequence, dropping the
        # decoded prompt prefix from the model output.
        total_sequence = (
            input_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
        )
        generated_sequences.append(total_sequence)

    # Strip special tokens and collapse blank lines for display.
    parsed_text = total_sequence.replace("<|startoftext|>", "").replace("\r", "").replace("\n\n", "\n")
    if len(parsed_text) == 0:
        parsed_text = "שגיאה"
    return parsed_text
77
+
78
+
79
+
80
if __name__ == "__main__":
    st.title("Hebrew text generator: Science Fiction and Fantasy (GPT-Neo)")
    model, tokenizer = load_model("./model")

    # Markers used by extend() to truncate generated text.
    stop_token = "<|endoftext|>"
    new_lines = "\n\n\n"

    np.random.seed(None)
    # torch.manual_seed expects a plain int; np.random.randint with size=1
    # returns a length-1 array (and rendered as "[1234]" in the UI), so
    # draw a scalar and convert explicitly.
    random_seed = int(np.random.randint(10000))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count() if torch.cuda.is_available() else 0

    torch.manual_seed(random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(random_seed)

    model.to(device)

    text_area = st.text_area("Enter the first few words (or leave blank), tap on \"Generate Text\" below. Tapping again will produce a different result.", 'האדם האחרון עלי אדמות ישב לבד בחדרו כשלפתע נשמעה דפיקה')

    st.sidebar.subheader("Configurable parameters")

    max_len = st.sidebar.slider("Max-Length", 0, 512, 160, help="The maximum length of the sequence to be generated.")
    top_k = st.sidebar.slider("Top-K", 0, 100, 40, help="The number of highest probability vocabulary tokens to keep for top-k-filtering.")
    top_p = st.sidebar.slider("Top-P", 0.0, 1.0, 0.92, help="If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.")

    if st.button("Generate Text"):
        with st.spinner(text="Generating results..."):
            st.subheader("Result")
            # Log the actual configured max_len (the original hard-coded
            # a stale "maxlen:19,648" value here).
            print(f"device:{device}, n_gpu:{n_gpu}, random_seed:{random_seed}, max_len:{max_len}, top_k:{top_k}, top_p:{top_p}")
            if len(text_area.strip()) == 0:
                text_area = random.choice(suggested_text_list)
            result = extend(input_text=text_area,
                            max_size=int(max_len),
                            top_k=int(top_k),
                            top_p=float(top_p))

            print("Done length: " + str(len(result)) + " bytes")
            # Render right-to-left so the Hebrew output displays correctly.
            st.markdown(f"<p dir=\"rtl\" style=\"text-align:right;\"> {result} </p>", unsafe_allow_html=True)
            st.write("\n\nResult length: " + str(len(result)) + " bytes\n Random seed: " + str(random_seed) + "\ntop_k: " + str(top_k) + "\ntop_p: " + str(top_p) + "\nmax_len: " + str(max_len) + "\ndevice: " + str(device) + "\nn_gpu: " + str(n_gpu))
            print(f"\"{result}\"")

    st.markdown(
        """Hebrew text generation model based on EleutherAI's gpt-neo architecture. Originally trained on a TPUv3-8 which was made available to me via the [TPU Research Cloud Program](https://sites.research.google/trc/). The model was then slightly fine-tuned upon science fiction and fantasy text."""
    )

    st.markdown("<footer><hr><p style=\"font-size:14px\">The site is fan made and is not affiliated with any author in any way.</p><p style=\"font-size:12px\">By <a href=\"https://linktr.ee/Norod78\">Doron Adler</a></p></footer> ", unsafe_allow_html=True)
129
+
model/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
model/added_tokens.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 50257,
3
+ "<|pad|>": 50260,
4
+ "<|startoftext|>": 50258,
5
+ "<|unknown|>": 50259
6
+ }
model/config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Norod78/hebrew-gpt_neo-small",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPTNeoForCausalLM"
6
+ ],
7
+ "attention_dropout": 0,
8
+ "attention_layers": [
9
+ "global",
10
+ "global",
11
+ "global",
12
+ "global",
13
+ "global",
14
+ "global",
15
+ "global",
16
+ "global",
17
+ "global",
18
+ "global",
19
+ "global",
20
+ "global"
21
+ ],
22
+ "attention_types": [
23
+ [
24
+ [
25
+ "global"
26
+ ],
27
+ 12
28
+ ]
29
+ ],
30
+ "bos_token_id": 50256,
31
+ "embed_dropout": 0,
32
+ "eos_token_id": 50256,
33
+ "gradient_checkpointing": false,
34
+ "hidden_size": 768,
35
+ "initializer_range": 0.02,
36
+ "intermediate_size": null,
37
+ "layer_norm_epsilon": 1e-05,
38
+ "max_position_embeddings": 2048,
39
+ "model_type": "gpt_neo",
40
+ "num_heads": 12,
41
+ "num_layers": 12,
42
+ "pad_token_id": 50256,
43
+ "resid_dropout": 0,
44
+ "summary_activation": null,
45
+ "summary_first_dropout": 0.1,
46
+ "summary_proj_to_labels": true,
47
+ "summary_type": "cls_index",
48
+ "summary_use_proj": true,
49
+ "torch_dtype": "float32",
50
+ "transformers_version": "4.21.0",
51
+ "use_cache": true,
52
+ "vocab_size": 50261,
53
+ "window_size": 256
54
+ }
model/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25e83f166977308069becae45ac59d48a2c08c0de8b3135a9acb63455fc0aec9
3
+ size 551197393
model/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|startoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|pad|>",
5
+ "unk_token": "<unk>"
6
+ }
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "bos_token": "<|startoftext|>",
5
+ "do_lower_case": false,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "full_tokenizer_file": null,
9
+ "max_len": 1024,
10
+ "name_or_path": "Norod78/hebrew-gpt_neo-small",
11
+ "pad_token": "<|pad|>",
12
+ "special_tokens_map_file": "special_tokens_map.json",
13
+ "tokenizer_class": "GPT2Tokenizer",
14
+ "unk_token": {
15
+ "__type": "AddedToken",
16
+ "content": "<|endoftext|>",
17
+ "lstrip": false,
18
+ "normalized": true,
19
+ "rstrip": false,
20
+ "single_word": false
21
+ }
22
+ }
model/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ transformers
3
+ tokenizers
4
+ torch
start.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch the Streamlit app. When DEBUG=true is set in the environment,
# run under nodemon so the server auto-restarts on file changes.
set -e

if [ "$DEBUG" = true ] ; then
    echo 'Debugging - ON'
    nodemon --exec streamlit run app.py
else
    echo 'Debugging - OFF'
    streamlit run app.py
fi