Spaces:

luminoussg
/

token_counter

Sleeping

App Files Files Community

luminoussg committed on Oct 16, 2024

Commit

f825e7c

verified ·

1 Parent(s): 1476a99

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -14

app.py CHANGED Viewed

@@ -30,21 +30,42 @@ def count_tokens(json_file, encoding_name):
             'token_count': conversation_token_count
         })
-    return {"conversations": token_counts, "total_token_count": total_token_count}
 # Gradio interface function
-def token_counter(json_file, encoding_name):
-    token_data = count_tokens(json_file, encoding_name)
-    return token_data
 # Gradio UI setup
-gr.Interface(
-    fn=token_counter,
-    inputs=[
-        gr.File(label="Upload JSON/JSONL File"),
-        gr.Dropdown(["r50k_base", "p50k_base", "cl100k_base", "o200k_base"], label="Select Encoding", value="cl100k_base")
-    ],
-    outputs=[
-        gr.JSON(label="Token Counts per Conversation and Total"),
-    ]
-).launch()

             'token_count': conversation_token_count
         })
+    return token_counts, total_token_count
 # Gradio interface function
+def token_counter(json_file, encoding_with_model):
+    # Keep only the encoding name: the first whitespace-delimited token of the dropdown label
+    encoding_name = encoding_with_model.split()[0]
+    # Get token counts
+    token_data, total_token_count = count_tokens(json_file, encoding_name)
+    return token_data, total_token_count
+# Define the encoding choices with model information
+encoding_options = [
+    "o200k_base (gpt-4o, gpt-4o-mini)",
+    "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
+    "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
+    "r50k_base (GPT-3 models like davinci)"
+]
 # Gradio UI setup
+with gr.Blocks() as app:
+    gr.Markdown("# Token Counter for JSON/JSONL Datasets")
+    with gr.Row():
+        json_input = gr.File(label="Upload JSON/JSONL File")
+        encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
+    # Output for individual conversation token counts
+    conversation_output = gr.JSON(label="Token Counts per Conversation")
+    # Output for total token count
+    total_output = gr.Number(label="Total Token Count", interactive=False)
+    # Link the inputs and outputs to the function
+    json_input.change(token_counter, [json_input, encoding_dropdown], [conversation_output, total_output])
+# Launch the app
+app.launch()