luminoussg committed on
Commit
f825e7c
·
verified ·
1 Parent(s): 1476a99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -14
app.py CHANGED
@@ -30,21 +30,42 @@ def count_tokens(json_file, encoding_name):
30
  'token_count': conversation_token_count
31
  })
32
 
33
- return {"conversations": token_counts, "total_token_count": total_token_count}
34
 
35
  # Gradio interface function
36
- def token_counter(json_file, encoding_name):
37
- token_data = count_tokens(json_file, encoding_name)
38
- return token_data
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  # Gradio UI setup
41
- gr.Interface(
42
- fn=token_counter,
43
- inputs=[
44
- gr.File(label="Upload JSON/JSONL File"),
45
- gr.Dropdown(["r50k_base", "p50k_base", "cl100k_base", "o200k_base"], label="Select Encoding", value="cl100k_base")
46
- ],
47
- outputs=[
48
- gr.JSON(label="Token Counts per Conversation and Total"),
49
- ]
50
- ).launch()
 
 
 
 
 
 
 
 
 
30
  'token_count': conversation_token_count
31
  })
32
 
33
+ return token_counts, total_token_count
34
 
35
  # Gradio interface function
36
+ def token_counter(json_file, encoding_with_model):
37
+ # Split encoding name and model type from the dropdown input
38
+ encoding_name = encoding_with_model.split()[0]
39
+
40
+ # Get token counts
41
+ token_data, total_token_count = count_tokens(json_file, encoding_name)
42
+
43
+ return token_data, total_token_count
44
+
45
+ # Define the encoding choices with model information
46
+ encoding_options = [
47
+ "o200k_base (gpt-4o, gpt-4o-mini)",
48
+ "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
49
+ "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
50
+ "r50k_base (GPT-3 models like davinci)"
51
+ ]
52
 
53
  # Gradio UI setup
54
+ with gr.Blocks() as app:
55
+ gr.Markdown("# Token Counter for JSON/JSONL Datasets")
56
+
57
+ with gr.Row():
58
+ json_input = gr.File(label="Upload JSON/JSONL File")
59
+ encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
60
+
61
+ # Output for individual conversation token counts
62
+ conversation_output = gr.JSON(label="Token Counts per Conversation")
63
+
64
+ # Output for total token count
65
+ total_output = gr.Number(label="Total Token Count", interactive=False)
66
+
67
+ # Link the inputs and outputs to the function
68
+ json_input.change(token_counter, [json_input, encoding_dropdown], [conversation_output, total_output])
69
+
70
+ # Launch the app
71
+ app.launch()