import html
import random

import gradio as gr
import tiktoken
# License Information
# This application uses the following open-source libraries:
#
# 1. Gradio:
# - License: Apache License 2.0
# - Copyright: 2020-2023, Gradio contributors
# - Full License: http://www.apache.org/licenses/LICENSE-2.0
#
# 2. tiktoken:
# - License: MIT License
# - Copyright: 2022, OpenAI, Shantanu Jain
# - Full License: https://opensource.org/licenses/MIT
# Build both encodings once, at import time, so every request reuses them
# instead of looking the tokenizer up again.
enc_gpt4o = tiktoken.encoding_for_model(model_name="gpt-4o")
enc_gpt4 = tiktoken.encoding_for_model(model_name="gpt-4")
def get_color_mapping(tokens):
    """Assign a random hex colour to every distinct token.

    Args:
        tokens: Iterable of hashable tokens (strings in this file);
            duplicates are allowed and collapse to one entry.

    Returns:
        Dict mapping each distinct token to a ``#RRGGBB`` string made of
        uppercase hex digits, with keys in first-occurrence order. Colours
        come from the module-level ``random`` generator and are drawn
        independently, so two tokens can end up with the same colour.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order. A
    # plain set() iterates in hash order, which differs between interpreter
    # runs, so even a seeded generator would pair tokens with different
    # colours from one run to the next.
    unique_tokens = dict.fromkeys(tokens)
    return {
        token: "#" + "".join(random.choices("0123456789ABCDEF", k=6))
        for token in unique_tokens
    }
def process_model(text, encoder, model_name):
    """Tokenize *text* with *encoder* and render the result as HTML.

    Args:
        text: The string to tokenize.
        encoder: Tokenizer object providing ``encode(text)`` (returns a
            sequence of token ids) and ``decode(ids)`` (returns text); the
            module passes tiktoken encodings here.
        model_name: Display name used in the section headings.

    Returns:
        A single HTML string containing, in order: the model heading, the
        token count, the decoded tokens and the token ids. A token and its
        id are rendered in the same colour.
    """
    token_ids = encoder.encode(text)
    # Decode one id at a time so each id is paired with its own text piece.
    tokens = [encoder.decode([token_id]) for token_id in token_ids]
    num_tokens = len(tokens)
    color_mapping = get_color_mapping(tokens)

    # Token text and the heading end up inside HTML, so escape them;
    # otherwise typing "<b>" or "&" into the textbox corrupts the page.
    safe_name = html.escape(str(model_name))

    # NOTE(review): the tag names and inline styles below are a best-effort
    # choice; only the visible text is known for certain. Confirm against
    # the intended design.
    modelname_html = f'<h2>{safe_name}</h2>'
    tokens_colored = [
        f'<span style="color:{color_mapping[token]}; font-weight: bold;">'
        f'{html.escape(token)}</span>'
        for token in tokens
    ]
    token_ids_colored = [
        f'<span style="color:{color_mapping[token]}; font-weight: bold;">'
        f'{token_id}</span>'
        for token, token_id in zip(tokens, token_ids)
    ]
    tokens_html = f'<h3>{safe_name} Tokens</h3>' + ' '.join(tokens_colored)
    num_tokens_html = f'<h3>Number of Tokens: {num_tokens}</h3>'
    token_ids_html = (
        f'<h3>{safe_name} Token IDs</h3>' + ' '.join(token_ids_colored)
    )
    return modelname_html + num_tokens_html + tokens_html + token_ids_html
def tokenize_input(text):
    """Produce the HTML fragments for the current textbox content.

    Args:
        text: The text entered by the user; ``None`` is treated as empty.

    Returns:
        A 3-tuple ``(num_chars_html, gpt4o_html, gpt4_html)``: the
        character-count fragment followed by the ``process_model`` output
        for the GPT-4o and GPT-4 encoders, in the order the event handlers
        list their output components.
    """
    # Treat a missing value as empty text so neither len() nor the encoders
    # are handed a non-string.
    text = text or ""
    gpt4o_result = process_model(text, enc_gpt4o, "GPT-4o")
    gpt4_result = process_model(text, enc_gpt4, "GPT-4")
    num_chars = len(text)
    # NOTE(review): heading tag is a best-effort choice — confirm.
    num_chars_html = f'<h3>Number of Characters: {num_chars}</h3>'
    return num_chars_html, gpt4o_result, gpt4_result
# Build the page. Everything created inside the ``with`` block becomes part
# of ``demo``.
with gr.Blocks() as demo:
    gr.Markdown("## GPT4o vs GPT4 Token Comparison")
    with gr.Row():
        input_text = gr.Textbox(
            lines=2,
            placeholder="Enter text here...",
            label="Enter text to tokenize and compare results between GPT-4o and GPT-4 tokenizers.",
        )
    # NOTE(review): placed below the input row rather than inside it —
    # confirm the intended layout.
    num_chars_output = gr.HTML()
    # The two tokenizer results sit side by side.
    with gr.Row():
        gpt4o_output = gr.HTML(label="GPT-4o")
        gpt4_output = gr.HTML(label="GPT-4")
    # Refresh the three output panels on every edit and on explicit submit.
    input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt4_output])
    input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt4_output])
    gr.Markdown("""
### License Information
This application uses the following open-source libraries:
1. **Gradio**:
- License: Apache License 2.0
- Copyright: 2020-2023, Gradio contributors
- Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
- Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)
2. **tiktoken**:
- License: MIT License
- Copyright: 2022, OpenAI, Shantanu Jain
- Full License: [MIT License](https://opensource.org/licenses/MIT)
- Repository: [tiktoken GitHub](https://github.com/openai/tiktoken)
""")
# Launch the app
demo.launch()