GPT4vsGPT4oTokenComparison

Sleeping

App Files Files Community

GPT4vsGPT4oTokenComparison / app.py

gojiteji

Update app.py

deb8701 verified 2 months ago

raw

history blame

3.47 kB

	import html
	import random

	import gradio as gr
	import tiktoken

	# License Information
	# This application uses the following open-source libraries:
	#
	# 1. Gradio:
	# - License: Apache License 2.0
	# - Copyright: 2020-2023, Gradio contributors
	# - Full License: http://www.apache.org/licenses/LICENSE-2.0
	#
	# 2. tiktoken:
	# - License: MIT License
	# - Copyright: 2022, OpenAI, Shantanu Jain
	# - Full License: https://opensource.org/licenses/MIT


	# Build one tiktoken encoding per model at import time; the handlers below
	# reuse these module-level objects on every call.
	enc_gpt4o, enc_gpt4 = map(tiktoken.encoding_for_model, ("gpt-4o", "gpt-4"))

	def get_color_mapping(tokens):
	    """Assign a random hex color to every distinct token.

	    Args:
	        tokens: Iterable of hashable token values; duplicates are collapsed.

	    Returns:
	        A dict mapping each distinct token to a ``"#RRGGBB"`` string made of
	        six uppercase hex digits drawn with ``random.choice``.
	    """
	    hex_digits = '0123456789ABCDEF'
	    palette = {}
	    # Repeated tokens share one entry, so identical tokens get one color.
	    for token in set(tokens):
	        digits = [random.choice(hex_digits) for _ in range(6)]
	        palette[token] = "#" + ''.join(digits)
	    return palette

	def process_model(text, encoder, model_name):
	    """Tokenize *text* with *encoder* and render the result as colored HTML.

	    Args:
	        text: The user-supplied string to tokenize.
	        encoder: Object exposing ``encode(text)`` (returns token ids) and
	            ``decode(list_of_ids)`` (returns a string).
	        model_name: Label used in the headings. It is inserted verbatim, so
	            callers must pass trusted text.

	    Returns:
	        One HTML string containing, in order: the model heading, the token
	        count, the colored tokens, and the colored token ids.
	    """
	    token_ids = encoder.encode(text)
	    # Decode ids one at a time so each token can be shown as its own span.
	    tokens = [encoder.decode([token_id]) for token_id in token_ids]
	    num_tokens = len(tokens)

	    color_mapping = get_color_mapping(tokens)

	    modelname_html = f'<h2>{model_name}</h2>'

	    # Token text comes straight from user input: escape it so characters such
	    # as "<" and "&" are displayed literally instead of being parsed as markup.
	    tokens_colored = [
	        f'<span style="color:{color_mapping[token]}; font-weight: bold;">{html.escape(token)}</span>'
	        for token in tokens
	    ]
	    # Each id reuses the color of its token so the two listings line up visually.
	    token_ids_colored = [
	        f'<span style="color:{color_mapping[token]}; font-weight: bold;">{token_id}</span>'
	        for token, token_id in zip(tokens, token_ids)
	    ]

	    tokens_html = f'<h3>{model_name} Tokens</h3>' + ' '.join(tokens_colored)
	    num_tokens_html = f'<h3>Number of Tokens: <span style="font-size: 20px; font-weight: bold;">{num_tokens}</span></h3>'
	    token_ids_html = f'<h3>{model_name} Token IDs</h3>' + ' '.join(token_ids_colored)

	    return modelname_html + num_tokens_html + tokens_html + token_ids_html

	def tokenize_input(text):
	    """Produce the three HTML fragments the UI displays for *text*.

	    Returns:
	        A 3-tuple: the character-count heading, the GPT-4o report, and the
	        GPT-4 report, in that order.
	    """
	    # GPT-4o is processed before GPT-4, matching the output order.
	    model_reports = tuple(
	        process_model(text, encoder, label)
	        for encoder, label in ((enc_gpt4o, "GPT-4o"), (enc_gpt4, "GPT-4"))
	    )
	    char_count_html = f'<h2>Number of Characters: <span style="font-size: 20px; font-weight: bold;">{len(text)}</span></h2>'
	    return (char_count_html, *model_reports)


	# Assemble the UI: a title, an input row, a row with one result pane per
	# tokenizer, the event wiring, and a license footer.
	with gr.Blocks() as demo:
	    gr.Markdown("## GPT4o vs GPT4 Token Comparison")
	    # NOTE(review): num_chars_output is assumed to share this row with the
	    # textbox -- confirm against the intended layout.
	    with gr.Row():
	        input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize and compare results between GPT-4o and GPT-4 tokenizers.")
	        num_chars_output = gr.HTML()
	    # Result panes for the two tokenizers.
	    with gr.Row():
	        gpt4o_output = gr.HTML(label="GPT-4o")
	        gpt4_output = gr.HTML(label="GPT-4")

	    # Both events run the same handler and fill the same three outputs.
	    input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt4_output])
	    input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt4_output])

	    # Footer: license notice for the third-party libraries used by the app.
	    gr.Markdown("""
	<hr>

	### License Information
	This application uses the following open-source libraries:

	1. Gradio:
	- License: Apache License 2.0
	- Copyright: 2020-2023, Gradio contributors
	- Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
	- Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)

	2. tiktoken:
	- License: MIT License
	- Copyright: 2022, OpenAI, Shantanu Jain
	- Full License: [MIT License](https://opensource.org/licenses/MIT)
	- Repository: [tiktoken GitHub](https://github.com/openai/tiktoken)
	""")


	# Launch the app: start serving the `demo` Blocks UI with Gradio's default
	# launch settings. This runs whenever the module is executed.
	demo.launch()