Evan73 committed on
Commit
fcb6ffd
1 Parent(s): 9a80e8e

Add app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -28
app.py CHANGED
@@ -1,32 +1,87 @@
1
  import streamlit as st
 
2
  import json
 
 
 
3
  import zipfile
4
- import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
def process_json(json_file):
    """Validate an uploaded JSON file and report that it was processed.

    Args:
        json_file: A readable file-like object containing JSON text; it must
            expose a ``name`` attribute, which is used in the returned message.

    Returns:
        The string ``"Processed JSON file: <name>"``.

    Raises:
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Parse only to validate the content; the parsed value is not used yet.
    # Process the JSON data here.
    json.load(json_file)
    return f"Processed JSON file: {json_file.name}"
10
-
11
def process_zip(zip_file):
    """Validate every JSON member of a ZIP archive and report each one.

    Members are read straight from the archive instead of being extracted to
    a shared ``temp`` directory, so files left behind by an earlier upload can
    never leak into the result and nothing is written to disk.

    Args:
        zip_file: A path or binary file-like object accepted by
            ``zipfile.ZipFile``.

    Returns:
        A list with one ``"Processed JSON file from ZIP: <basename>"`` string
        per archive member whose name ends in ``.json``, in archive order.

    Raises:
        zipfile.BadZipFile: If ``zip_file`` is not a valid ZIP archive.
        ValueError: If a ``.json`` member does not contain valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    results = []
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for member in zip_ref.namelist():
            # Directory entries end with "/" and are skipped by this check too.
            if not member.endswith('.json'):
                continue
            with zip_ref.open(member) as f:
                # Parse only to validate; process the extracted JSON here.
                json.load(f)
            # Archive names always use "/" as separator; report the basename.
            filename = member.rsplit('/', 1)[-1]
            results.append(f"Processed JSON file from ZIP: {filename}")
    return results
23
-
24
# Page title and a multi-file uploader restricted to JSON and ZIP files.
st.title("JSON and ZIP File Processor")
uploaded_files = st.file_uploader(
    "Upload JSON or ZIP files",
    type=["json", "zip"],
    accept_multiple_files=True,
)

# Dispatch each upload on its file-name suffix and show the outcome.
for upload in uploaded_files or []:
    upload_name = upload.name
    if upload_name.endswith(".json"):
        outcome = process_json(upload)
    elif upload_name.endswith(".zip"):
        outcome = process_zip(upload)
    else:
        continue
    st.write(outcome)
 
1
  import streamlit as st
2
+ import os
3
  import json
4
+ import re
5
+ import datasets
6
+ import tiktoken
7
  import zipfile
8
+ from pathlib import Path
9
+
10
+ # 定义 tiktoken 编码器
11
+ encoding = tiktoken.get_encoding("cl100k_base")
12
+
13
+ # MGTHuman 类
14
+ class MGTHuman(datasets.GeneratorBasedBuilder):
15
+ VERSION = datasets.Version("1.0.0")
16
+ BUILDER_CONFIGS = [
17
+ datasets.BuilderConfig(name="human", version=VERSION, description="This part of human data"),
18
+ datasets.BuilderConfig(name="Moonshot", version=VERSION, description="Data from the Moonshot model"),
19
+ datasets.BuilderConfig(name="gpt35", version=VERSION, description="Data from the gpt-3.5-turbo model"),
20
+ datasets.BuilderConfig(name="Llama3", version=VERSION, description="Data from the Llama3 model"),
21
+ datasets.BuilderConfig(name="Mixtral", version=VERSION, description="Data from the Mixtral model"),
22
+ datasets.BuilderConfig(name="Qwen", version=VERSION, description="Data from the Qwen model"),
23
+ ]
24
+ DEFAULT_CONFIG_NAME = "human"
25
+
26
+ def truncate_text(self, text, max_tokens=2048):
27
+ tokens = encoding.encode(text, allowed_special={'<|endoftext|>'})
28
+ if len(tokens) > max_tokens:
29
+ tokens = tokens[:max_tokens]
30
+ truncated_text = encoding.decode(tokens)
31
+ last_period_idx = truncated_text.rfind('。')
32
+ if last_period_idx == -1:
33
+ last_period_idx = truncated_text.rfind('.')
34
+ if last_period_idx != -1:
35
+ truncated_text = truncated_text[:last_period_idx + 1]
36
+ return truncated_text
37
+ else:
38
+ return text
39
+
40
+ def get_text_by_index(self, filepath, index):
41
+ count = 0
42
+ for file in filepath:
43
+ with open(file, 'r') as f:
44
+ data = json.load(f)
45
+
46
+ for row in data:
47
+ if not row["text"].strip():
48
+ continue
49
+ if count == index:
50
+ text = self.truncate_text(row["text"], max_tokens=2048)
51
+ return text
52
+ count += 1
53
+ return "Index 超出范围,请输入有效的数字。"
54
+
55
# Streamlit UI
import shutil

st.title("MGTHuman Dataset Viewer")

# Folder upload: the user supplies a ZIP archive containing JSON files.
uploaded_folder = st.file_uploader("上传包含 JSON 文件的 ZIP 文件夹", type=["zip"])
if uploaded_folder:
    folder_path = Path("temp")
    # Start from an empty directory so files from an earlier upload are not
    # mixed into the index of the current one.
    shutil.rmtree(folder_path, ignore_errors=True)
    folder_path.mkdir(exist_ok=True)
    # Keep only the base name so the upload cannot be written outside temp.
    zip_path = folder_path / Path(uploaded_folder.name).name
    with open(zip_path, "wb") as f:
        f.write(uploaded_folder.getbuffer())

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(folder_path)

    # Collect every extracted JSON file. A zipped folder extracts into a
    # subdirectory, so search recursively; sort so that a given index always
    # refers to the same text.
    json_files = sorted(folder_path.rglob("*.json"))

    # Choose the data configuration.
    config_name = st.selectbox("选择数据配置", ["human", "Moonshot", "gpt35", "Llama3", "Mixtral", "Qwen"])
    mgt_human = MGTHuman(name=config_name)

    # Enter an index to view the corresponding text.
    index_to_view = st.number_input("输入要查看的文本序号", min_value=0, step=1)
    if st.button("显示文本"):
        text = mgt_human.get_text_by_index(json_files, index=int(index_to_view))
        st.write("对应的文本内容为:", text)

# Remove the temporary directory holding the uploaded files.
if st.button("清除文件"):
    # The directory may not exist if nothing has been uploaded yet.
    shutil.rmtree("temp", ignore_errors=True)
    st.write("临时文件已清除。")