mariagrandury committed on
Commit
c068c68
1 Parent(s): b0cc4e8

Initial commit

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. README.md +2 -2
  3. app.py +168 -0
  4. prompts.py +5 -0
  5. requirements.txt +4 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv/
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Spanish Gpt2
3
  emoji: 🔥
4
  colorFrom: yellow
5
- colorTo: yellow
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
 
1
  ---
2
+ title: Spanish GPT-2
3
  emoji: 🔥
4
  colorFrom: yellow
5
+ colorTo: red
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ import requests
4
+
5
+ from mtranslate import translate
6
+ import streamlit as st
7
+
8
+ from prompts import PROMPT_LIST
9
+
10
+
11
# HTTP headers sent with every inference request (empty: no auth token,
# so requests hit the public rate-limited Inference API).
headers = {}

# Project logo shown as the page icon and in the sidebar.
LOGO = "https://raw.githubusercontent.com/nlp-en-es/assets/main/logo.png"

# Human-readable model labels mapped to their HF Inference API endpoints.
MODELS = {
    "Model trained on OSCAR": {
        "url": "https://api-inference.huggingface.co/models/flax-community/gpt-2-spanish"
    },
    "Model trained on the Large Spanish Corpus": {
        "url": "https://api-inference.huggingface.co/models/mrm8488/spanish-gpt2"
    },
}
23
+
24
+
25
def query(payload, model_name):
    """POST *payload* to the Inference API endpoint selected by
    *model_name* and return the decoded JSON response.
    """
    endpoint = MODELS[model_name]["url"]
    print("model url:", endpoint)
    body = json.dumps(payload)
    response = requests.post(endpoint, headers=headers, data=body)
    return json.loads(response.content.decode("utf-8"))
32
+
33
+
34
def process(
    text: str,
    model_name: str,
    max_len: int,
    temp: float,
    top_k: int,
    top_p: float,
    repetition_penalty: float = 2.0,
):
    """Build a text-generation payload and send it to the selected model.

    Args:
        text: Prompt text to continue.
        model_name: Key into ``MODELS`` selecting the inference endpoint.
        max_len: Maximum number of new tokens to generate.
        temp: Sampling temperature.
        top_k: Top-k filtering cutoff.
        top_p: Nucleus-sampling cumulative-probability cutoff.
        repetition_penalty: Penalty applied to repeated tokens. Previously
            hard-coded to 2.0; the default preserves that behavior while
            letting callers tune it.

    Returns:
        The decoded JSON response from the Inference API (a list of
        generations on success, or a dict with an "error" key).
    """
    payload = {
        "inputs": text,
        "parameters": {
            "max_new_tokens": max_len,
            "top_k": top_k,
            "top_p": top_p,
            "temperature": temp,
            "repetition_penalty": repetition_penalty,
        },
        "options": {
            # Let the API serve cached results for identical requests.
            "use_cache": True,
        },
    }
    return query(payload, model_name)
51
+
52
+
53
st.set_page_config(page_title="Spanish GPT-2 Demo", page_icon=LOGO)
st.title("Spanish GPT-2")

# Sidebar: user-configurable generation parameters.
st.sidebar.image(LOGO)
st.sidebar.subheader("Configurable parameters")

max_len = st.sidebar.number_input(
    "Maximum length",
    value=100,
    help="The maximum length of the sequence to be generated.",
)

temp = st.sidebar.slider(
    "Temperature",
    value=1.0,
    min_value=0.1,
    max_value=100.0,
    help="The value used to module the next token probabilities.",
)

top_k = st.sidebar.number_input(
    "Top k",
    value=10,
    help="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
)

top_p = st.sidebar.number_input(
    "Top p",
    value=0.95,
    help=" If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
)

do_sample = st.sidebar.selectbox(
    "Sampling?",
    (True, False),
    help="Whether or not to use sampling; use greedy decoding otherwise.",
)
# NOTE(review): `do_sample` is collected here but never forwarded to the
# API payload built in process(), so this widget currently has no effect.

st.markdown(
    """
Spanish GPT-2 models trained from scratch on two different datasets. One
model is trained on the Spanish portion of
[OSCAR](https://huggingface.co/datasets/viewer/?dataset=oscar)
and the other on the
[large_spanish_corpus](https://huggingface.co/datasets/viewer/?dataset=large_spanish_corpus)
aka BETO's corpus.

The models are trained with Flax and using TPUs sponsored by Google since this is part of the
[Flax/Jax Community Week](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104)
organised by HuggingFace.
"""
)

model_name = st.selectbox(
    "Model",
    (["Model trained on OSCAR", "Model trained on the Large Spanish Corpus"]),
)

# Prompt picker: the canned categories from PROMPT_LIST plus a free-text
# "Custom" entry, which is pre-selected (index = last).
ALL_PROMPTS = list(PROMPT_LIST.keys()) + ["Custom"]
prompt = st.selectbox("Prompt", ALL_PROMPTS, index=len(ALL_PROMPTS) - 1)
if prompt == "Custom":
    prompt_box = "Enter your text here"
else:
    # Pick one of the seed texts for the chosen category at random.
    prompt_box = random.choice(PROMPT_LIST[prompt])

text = st.text_area("Enter text", prompt_box)

if st.button("Run"):
    with st.spinner(text="Getting results..."):
        st.subheader("Result")
        # Fixed: this debug print previously interpolated a garbled literal
        # ("13,440") instead of the actual max_len value.
        print(f"maxlen:{max_len}, temp:{temp}, top_k:{top_k}, top_p:{top_p}")
        result = process(
            text=text,
            model_name=model_name,
            max_len=int(max_len),
            temp=temp,
            top_k=int(top_k),
            top_p=float(top_p),
        )
        print("result:", result)
        if "error" in result:
            if type(result["error"]) is str:
                # Fixed: dropped the print-style `end=" "` kwarg, which
                # st.write does not support (it was silently ignored).
                st.write(f'{result["error"]}.')
                if "estimated_time" in result:
                    st.write(
                        f'Please try again in about {result["estimated_time"]:.0f} seconds.'
                    )
            else:
                if type(result["error"]) is list:
                    for error in result["error"]:
                        st.write(f"{error}")
        else:
            result = result[0]["generated_text"]
            st.write(result.replace("\n", " \n"))
            st.text("English translation")
            st.write(translate(result, "en", "es").replace("\n", " \n"))

# Credits and model-card links. Fixed: the OSCAR model-card link previously
# pointed at huggingface.co/models/... which is not a valid model page path.
st.markdown(
    """
### Team members
- Manuel Romero ([mrm8488](https://huggingface.co/mrm8488))
- María Grandury ([mariagrandury](https://huggingface.co/mariagrandury))
- Pablo González de Prado ([Pablogps](https://huggingface.co/Pablogps))
- Daniel Vera ([daveni](https://huggingface.co/daveni))
- Sri Lakshmi ([srisweet](https://huggingface.co/srisweet))
- José Posada ([jdposa](https://huggingface.co/jdposa))
- Santiago Hincapie ([shpotes](https://huggingface.co/shpotes))
- Jorge ([jorgealro](https://huggingface.co/jorgealro))

### More information
You can find more information about these models in their cards:
- [Model trained on OSCAR](https://huggingface.co/flax-community/gpt-2-spanish)
- [Model trained on the Large Spanish Corpus](https://huggingface.co/mrm8488/spanish-gpt2)
"""
)
prompts.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Canned prompt categories shown in the app's prompt selector; each label
# maps to a list of candidate seed texts (one is chosen at random).
PROMPT_LIST = {
    "Érase una vez...": ["Érase una vez "],
    "¡Hola!": ["¡Hola! Me llamo "],
    "¿Ser o no ser?": ["En mi opinión, 'ser' es "],
}
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ mtranslate
3
+ requests==2.24.0
4
+ requests-toolbelt==0.9.1