Annorita committed on
Commit
35996ec
·
1 Parent(s): 1459d42

add process function

Browse files
Files changed (2) hide show
  1. app.py +12 -4
  2. utils.py +24 -0
app.py CHANGED
@@ -1,9 +1,10 @@
1
  import streamlit as st
 
2
 
3
- st.title('Tokenizers demo!!')
4
 
5
- x = st.slider('Select a value')
6
- st.write(x, 'squared is', x * x)
7
 
8
 
9
  option = st.selectbox(
@@ -14,4 +15,11 @@ option = st.selectbox(
14
 
15
  model_name = st.text_input('Model Name', 'deepseek-ai/deepseek-coder-1.3b-instruct')
16
 
17
- 'Your choice:', model_name
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from utils import get_res
3
 
4
+ st.title('Tokenizers demo')
5
 
6
+ #x = st.slider('Select a value')
7
+ #st.write(x, 'squared is', x * x)
8
 
9
 
10
  option = st.selectbox(
 
15
 
16
  model_name = st.text_input('Model Name', 'deepseek-ai/deepseek-coder-1.3b-instruct')
17
 
18
+ 'Your choice:', model_name
19
+
20
+ input_data = st.text_input('Input Sentence', 'Hello world!!!')
21
+
22
+
23
+ res = get_res(model_name=model_name, input_sentence=input_data, single_print=False)
24
+
25
+ st.write(res)
utils.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from transformers import AutoTokenizer
3
+ import itertools
4
+
5
+
6
+
7
def get_color():
    """Return an endless iterator over the ANSI SGR codes 41 through 47.

    The codes are yielded in ascending order and wrap back to 41 after 47.
    Inside an ANSI escape sequence these values select the standard
    background colors (red, green, yellow, blue, magenta, cyan, white).
    """
    # cycle() stores its own copy of the items it has seen, so the range can
    # be handed over directly instead of being copied into a list first.
    return itertools.cycle(range(41, 48))
10
+
11
def get_res(model_name, input_sentence, single_print=True, *, trust_remote_code=True):
    """Tokenize a sentence and render each token on its own ANSI background color.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        input_sentence: Text to encode. Special tokens are not added.
        single_print: When true, print the rendered string and return
            ``None``; when false, return the rendered string instead.
        trust_remote_code: Forwarded to ``AutoTokenizer.from_pretrained``.
            Defaults to ``True`` to preserve the existing behavior.

    Returns:
        ``None`` if ``single_print`` is true. Otherwise a string made of every
        decoded token wrapped in a bold, background-colored ANSI escape
        sequence, followed by a space and the number of tokens.
    """
    # SECURITY NOTE(review): with trust_remote_code enabled, loading a
    # tokenizer may run Python code shipped in the model repository. If
    # model_name can come from untrusted input (e.g. a free-text field in a
    # UI), callers should pass trust_remote_code=False.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code)
    color_iterator = get_color()
    token_ids = tokenizer.encode(input_sentence, add_special_tokens=False)

    # Each piece is: ESC[1;<color>m, a space, the decoded token, then ESC[m to
    # reset attributes. Colors advance by one per token.
    # NOTE(review): these are terminal escape sequences; a non-terminal
    # consumer (such as a web page) will presumably show them as raw
    # characters -- confirm against the caller.
    pieces = [
        f'\033[1;{next(color_iterator)}m {tokenizer.decode(token_id)}\033[m'
        for token_id in token_ids
    ]
    res = ''.join(pieces) + f' {len(token_ids)}'

    if single_print:
        print(res)
        return None
    return res
24
+