Spaces:

luminoussg
/

token_counter

Sleeping

luminoussg committed on Oct 16, 2024

Commit

e7d39a8

verified ·

1 Parent(s): e30fc8a

Create app.py

Files changed (1) hide show

app.py ADDED Viewed

+import gradio as gr
+import tiktoken
+import json
+# Function to load JSON/JSONL file and count tokens for each entry
+def count_tokens(json_file, encoding_name):
+    encoding = tiktoken.get_encoding(encoding_name)
+    # Load the JSON or JSONL data
+    with open(json_file.name, 'r') as f:
+        data = [json.loads(line) for line in f.readlines()] if json_file.name.endswith('.jsonl') else json.load(f)
+    # Token counting for each item in the dataset
+    token_counts = []
+    for item in data:
+        if isinstance(item, dict):
+            text = ' '.join([str(v) for v in item.values() if isinstance(v, str)])
+        else:
+            text = str(item)
+        num_tokens = len(encoding.encode(text))
+        token_counts.append({
+            'text': text,
+            'token_count': num_tokens
+        })
+    return token_counts
+# Gradio interface
+def token_counter(json_file, encoding_name):
+    token_data = count_tokens(json_file, encoding_name)
+    return token_data
+# Gradio UI
+gr.Interface(
+    fn=token_counter,
+    inputs=[
+        gr.File(label="Upload JSON/JSONL File"),
+        gr.Dropdown(["r50k_base", "p50k_base", "cl100k_base", "o200k_base"], label="Select Encoding", value="cl100k_base")
+    ],
+    outputs=gr.JSON(label="Token Counts")
+).launch()