Spaces:
Sleeping
Sleeping
luminoussg
committed on
Commit
•
e7d39a8
1
Parent(s):
e30fc8a
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import tiktoken
|
3 |
+
import json
|
4 |
+
|
5 |
+
# Function to load JSON/JSONL file and count tokens for each entry
|
6 |
+
def count_tokens(json_file, encoding_name):
    """Count tiktoken tokens for every entry of a JSON or JSONL file.

    Args:
        json_file: Either a path string, or an object whose ``name``
            attribute holds the path of the file to read.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``.

    Returns:
        A list with one ``{'text': ..., 'token_count': ...}`` dict per
        entry. For a dict entry, ``text`` is the space-joined string values
        of that dict; for any other entry it is ``str(entry)``.

    Raises:
        ValueError: If ``json_file`` is ``None`` (nothing was uploaded).
        json.JSONDecodeError: If the file content is not valid JSON/JSONL.
    """
    if json_file is None:
        raise ValueError("No file provided")

    # Plain path strings are used as-is; anything else is expected to expose
    # its path through ``.name`` (the only form the original code accepted).
    path = json_file if isinstance(json_file, str) else json_file.name

    encoding = tiktoken.get_encoding(encoding_name)

    # "utf-8-sig" reads ordinary UTF-8 and also strips a leading BOM, which
    # the json module would otherwise reject.
    with open(path, "r", encoding="utf-8-sig") as f:
        if path.lower().endswith(".jsonl"):
            # One JSON document per line; blank lines are not entries.
            data = [json.loads(line) for line in f if line.strip()]
        else:
            data = json.load(f)

    # A file holding a single top-level value (e.g. one object) is one entry.
    # Iterating it directly would walk a dict's keys or a string's characters.
    if not isinstance(data, list):
        data = [data]

    token_counts = []
    for item in data:
        if isinstance(item, dict):
            text = ' '.join(v for v in item.values() if isinstance(v, str))
        else:
            text = str(item)

        # disallowed_special=() makes special-token strings such as
        # "<|endoftext|>" encode as ordinary text instead of raising.
        num_tokens = len(encoding.encode(text, disallowed_special=()))
        token_counts.append({
            'text': text,
            'token_count': num_tokens,
        })

    return token_counts
|
28 |
+
|
29 |
+
# Gradio interface
|
30 |
+
def token_counter(json_file, encoding_name):
    """Return the result of ``count_tokens`` for the given file and encoding."""
    return count_tokens(json_file, encoding_name)
|
33 |
+
|
34 |
+
# Gradio UI
|
35 |
+
# Input widgets: the file to analyse and the tiktoken encoding to apply.
file_input = gr.File(label="Upload JSON/JSONL File")
encoding_input = gr.Dropdown(
    ["r50k_base", "p50k_base", "cl100k_base", "o200k_base"],
    label="Select Encoding",
    value="cl100k_base",
)

# Output widget: the list of per-entry token counts rendered as JSON.
result_output = gr.JSON(label="Token Counts")

demo = gr.Interface(
    fn=token_counter,
    inputs=[file_input, encoding_input],
    outputs=result_output,
)
demo.launch()
|