import json
import os

import gradio as gr
import tiktoken


def count_tokens(json_file, encoding_name):
    """Count tokens per conversation in a chat-style JSON/JSONL dataset.

    Each dataset entry is expected to carry a ``"messages"`` list of
    ``{"role": ..., "content": ...}`` dicts (OpenAI fine-tuning format).
    Entries without a ``"messages"`` key contribute a zero-token row.

    Args:
        json_file: Uploaded file object (Gradio ``File``); ``.name`` is the
            path on disk. ``.json`` files must contain a list of entries;
            anything else is treated as JSONL (one entry per line).
        encoding_name: A tiktoken encoding name, e.g. ``"o200k_base"``.

    Returns:
        A tuple ``(token_counts, total_token_count)`` where ``token_counts``
        is a list of ``{'conversation': str, 'token_count': int}`` dicts and
        ``total_token_count`` is the sum over all conversations.
    """
    encoding = tiktoken.get_encoding(encoding_name)

    # Explicit UTF-8: without it, open() falls back to the locale encoding
    # (e.g. cp1252 on Windows) and corrupts non-ASCII datasets.
    with open(json_file.name, 'r', encoding='utf-8') as f:
        if json_file.name.endswith('.json'):
            data = json.load(f)
        else:
            # JSONL: skip blank lines so a trailing newline doesn't crash
            # json.loads with "Expecting value".
            data = [json.loads(line) for line in f if line.strip()]

    total_token_count = 0
    token_counts = []

    for entry in data:
        conversation_token_count = 0
        conversation_texts = []

        if "messages" in entry:
            for message in entry["messages"]:
                content = message.get("content", "")
                # A present-but-null "content" key yields None, which would
                # crash both encoding.encode() and ' '.join() below.
                if not isinstance(content, str):
                    content = "" if content is None else str(content)
                conversation_texts.append(content)
                conversation_token_count += len(encoding.encode(content))

        total_token_count += conversation_token_count
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })

    return token_counts, total_token_count


def token_counter(json_file, encoding_with_model):
    """Gradio callback: resolve the encoding choice and count tokens.

    Args:
        json_file: Uploaded file from the ``File`` component, or ``None``
            when the user clicks Submit without uploading anything.
        encoding_with_model: Dropdown value of the form
            ``"<encoding_name> (model, ...)"``.

    Returns:
        ``(token_counts, total_token_count)`` as produced by
        :func:`count_tokens`, or ``([], 0)`` when no file was provided.
    """
    # Guard against Submit with no upload — json_file.name would otherwise
    # raise AttributeError on None.
    if json_file is None:
        return [], 0

    # The dropdown label is "<encoding> (models...)"; the first whitespace-
    # separated word is the actual tiktoken encoding name.
    encoding_name = encoding_with_model.split()[0]

    return count_tokens(json_file, encoding_name)


# Dropdown choices: tiktoken encoding name followed by the models it serves.
encoding_options = [
    "o200k_base (gpt-4o, gpt-4o-mini)",
    "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
    "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
    "r50k_base (GPT-3 models like davinci)"
]

# Gradio UI setup
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSON/JSONL Datasets")

    with gr.Row():
        json_input = gr.File(label="Upload JSON/JSONL File")
        encoding_dropdown = gr.Dropdown(
            choices=encoding_options,
            label="Select Encoding",
            value="o200k_base (gpt-4o, gpt-4o-mini)"
        )

    # Example file (this will automatically upload when clicked); only shown
    # when the file actually exists next to the script.
    example_file_path = "keivalyaMedQuad-MedicalQnADataset_valid.jsonl"
    if os.path.exists(example_file_path):
        example = gr.Examples(
            examples=[example_file_path],
            inputs=json_input,
            label="Click here to load the example file"
        )

    # Display credits for the dataset author
    gr.Markdown("### Dataset Credits")
    gr.Markdown(
        """
        This dataset is provided by the [MedQuad-MedicalQnADataset](https://huggingface.co/datasets/keivalya/MedQuad-MedicalQnADataset) on Hugging Face.
        All credit goes to the original creator, [keivalya](https://huggingface.co/keivalya).
        """
    )

    # Output for individual conversation token counts
    conversation_output = gr.JSON(label="Token Counts per Conversation")

    # Output for total token count
    total_output = gr.Number(label="Total Token Count", interactive=False)

    # Add a submit button to trigger token counting
    submit_button = gr.Button("Submit")

    # Link the button click event to the token counting function
    submit_button.click(
        token_counter,
        [json_input, encoding_dropdown],
        [conversation_output, total_output]
    )

# Launch only when run as a script, so importing this module (e.g. for
# testing) doesn't start a server.
if __name__ == "__main__":
    app.launch()