Evan73 committed on
Commit
732150f
1 Parent(s): 2cb39e4

modify app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -19
app.py CHANGED
@@ -39,23 +39,21 @@ class MGTHuman(datasets.GeneratorBasedBuilder):
39
 
40
  def get_text_by_index(self, filepath, index):
41
  count = 0
42
- for file in filepath:
43
- with open(file, 'r') as f:
44
- data = json.load(f)
45
-
46
- for row in data:
47
- if not row["text"].strip():
48
- continue
49
- if count == index:
50
- text = self.truncate_text(row["text"], max_tokens=2048)
51
- return text
52
- count += 1
53
  return "Index 超出范围,请输入有效的数字。"
54
 
55
  # Streamlit UI
56
  st.title("MGTHuman Dataset Viewer")
57
 
58
- # 文件夹上传
59
  uploaded_folder = st.file_uploader("上传包含 JSON 文件的 ZIP 文件夹", type=["zip"])
60
  if uploaded_folder:
61
  folder_path = Path("temp")
@@ -67,17 +65,26 @@ if uploaded_folder:
67
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
68
  zip_ref.extractall(folder_path)
69
 
70
- # 获取解压后的所有 JSON 文件路径
71
- json_files = list(folder_path.glob("*.json"))
72
-
73
- # 选择数据配置
74
- config_name = st.selectbox("选择数据配置", ["human", "Moonshot", "gpt35", "Llama3", "Mixtral", "Qwen"])
75
- mgt_human = MGTHuman(name=config_name)
 
 
76
 
 
 
 
77
  # 输入序号查看文本
78
  index_to_view = st.number_input("输入要查看的文本序号", min_value=0, step=1)
 
79
  if st.button("显示文本"):
80
- text = mgt_human.get_text_by_index(json_files, index=index_to_view)
 
 
 
81
  st.write("对应的文本内容为:", text)
82
 
83
  # 清理上传文件的临时目录
 
39
 
40
  def get_text_by_index(self, filepath, index):
41
  count = 0
42
+ with open(filepath, 'r') as f:
43
+ data = json.load(f)
44
+ for row in data:
45
+ if not row["text"].strip():
46
+ continue
47
+ if count == index:
48
+ text = self.truncate_text(row["text"], max_tokens=2048)
49
+ return text
50
+ count += 1
 
 
51
  return "Index 超出范围,请输入有效的数字。"
52
 
53
  # Streamlit UI
54
  st.title("MGTHuman Dataset Viewer")
55
 
56
+ # 上传包含 JSON 文件的 ZIP 文件
57
  uploaded_folder = st.file_uploader("上传包含 JSON 文件的 ZIP 文件夹", type=["zip"])
58
  if uploaded_folder:
59
  folder_path = Path("temp")
 
65
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
66
  zip_ref.extractall(folder_path)
67
 
68
+ # 获取所有 JSON 文件并分类到不同的 domain
69
+ category = {}
70
+ for json_file in folder_path.glob("*.json"):
71
+ domain = json_file.stem.split('_task3')[0]
72
+ category.setdefault(domain, []).append(str(json_file))
73
+
74
+ # 显示可用的 domain
75
+ st.write("可用的数据种类:", list(category.keys()))
76
 
77
+ # 用户选择 domain
78
+ selected_domain = st.selectbox("选择数据种类", options=list(category.keys()))
79
+
80
  # 输入序号查看文本
81
  index_to_view = st.number_input("输入要查看的文本序号", min_value=0, step=1)
82
+
83
  if st.button("显示文本"):
84
+ # 选择第一个文件进行展示
85
+ file_to_display = category[selected_domain][0]
86
+ mgt_human = MGTHuman(name=selected_domain)
87
+ text = mgt_human.get_text_by_index(file_to_display, index=index_to_view)
88
  st.write("对应的文本内容为:", text)
89
 
90
  # 清理上传文件的临时目录