MGTbenchmark / app.py
Evan73's picture
modify app.py
296f63c
raw
history blame
4.25 kB
import streamlit as st
import os
import json
import re
import datasets
import tiktoken
import zipfile
from pathlib import Path
# Shared tiktoken encoder used for token-level truncation below
# (cl100k_base is the GPT-3.5/GPT-4 vocabulary).
encoding = tiktoken.get_encoding("cl100k_base")
# MGTHuman 类
class MGTHuman(datasets.GeneratorBasedBuilder):
    """Dataset builder for the MGTHuman corpus.

    Exposes helpers used by the Streamlit UI to truncate, fetch and count
    non-empty text entries stored in JSON files (a list of objects, each
    with a "text" field).
    """

    VERSION = datasets.Version("1.0.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="human", version=VERSION, description="This part of human data"),
        datasets.BuilderConfig(name="Moonshot", version=VERSION, description="Data from the Moonshot model"),
        datasets.BuilderConfig(name="gpt35", version=VERSION, description="Data from the gpt-3.5-turbo model"),
        datasets.BuilderConfig(name="Llama3", version=VERSION, description="Data from the Llama3 model"),
        datasets.BuilderConfig(name="Mixtral", version=VERSION, description="Data from the Mixtral model"),
        datasets.BuilderConfig(name="Qwen", version=VERSION, description="Data from the Qwen model"),
    ]
    DEFAULT_CONFIG_NAME = "human"

    def truncate_text(self, text, max_tokens=2048):
        """Truncate *text* to at most *max_tokens* tiktoken tokens.

        When truncation happens, the cut is moved back to the last sentence
        terminator (Chinese full stop first, then ASCII period) so the
        returned snippet ends on a complete sentence where possible.
        """
        tokens = encoding.encode(text, allowed_special={'<|endoftext|>'})
        if len(tokens) <= max_tokens:
            return text
        truncated_text = encoding.decode(tokens[:max_tokens])
        # Prefer ending on a full sentence: Chinese '。' first, '.' as fallback.
        last_period_idx = truncated_text.rfind('。')
        if last_period_idx == -1:
            last_period_idx = truncated_text.rfind('.')
        if last_period_idx != -1:
            truncated_text = truncated_text[:last_period_idx + 1]
        return truncated_text

    def get_text_by_index(self, filepath, index):
        """Return the truncated text of the *index*-th non-empty entry.

        Blank entries are skipped so indices line up with count_entries().
        Returns a Chinese error message when *index* is out of range.
        """
        # Explicit UTF-8: the data contains Chinese text, so relying on the
        # platform default encoding (e.g. cp936/cp1252 on Windows) would
        # raise or mangle characters.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        count = 0
        for row in data:
            if not row["text"].strip():
                continue  # skip blank entries, keeping indices consistent
            if count == index:
                return self.truncate_text(row["text"], max_tokens=2048)
            count += 1
        return "Index 超出范围,请输入有效的数字。"

    def count_entries(self, filepath):
        """Return the number of non-empty text entries in *filepath*.

        Drives the valid index range shown in the UI.
        """
        # Same explicit UTF-8 rationale as get_text_by_index.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return sum(1 for row in data if row["text"].strip())
# --- Streamlit UI ---
st.title("MGTHuman Dataset Viewer")

# Upload a ZIP archive containing the JSON data files.
uploaded_folder = st.file_uploader("上传包含 JSON 文件的 ZIP 文件夹", type=["zip"])

if uploaded_folder:
    # Persist the upload into a local temp directory, then unpack it.
    folder_path = Path("temp")
    folder_path.mkdir(exist_ok=True)
    zip_path = folder_path / uploaded_folder.name
    with open(zip_path, "wb") as f:
        f.write(uploaded_folder.getbuffer())
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # NOTE(review): extractall() on an untrusted archive is vulnerable to
        # zip-slip (member names containing "../"); consider validating
        # member paths before extraction.
        zip_ref.extractall(folder_path)

    # Recursively collect all JSON files, grouped into domains by the
    # filename prefix before "_task3".
    category = {}
    for json_file in folder_path.rglob("*.json"):
        domain = json_file.stem.split('_task3')[0]
        category.setdefault(domain, []).append(str(json_file))

    if category:
        # Let the user pick a domain; inspect the first file of that domain.
        selected_domain = st.selectbox("选择数据种类", options=list(category.keys()))
        file_to_display = category[selected_domain][0]
        mgt_human = MGTHuman(name=selected_domain)
        total_entries = mgt_human.count_entries(file_to_display)
        if total_entries > 0:
            st.write(f"可用的索引范围: 0 到 {total_entries - 1}")
            # Let the user pick an entry index and display its text.
            index_to_view = st.number_input("输入要查看的文本序号", min_value=0, max_value=total_entries - 1, step=1)
            if st.button("显示文本"):
                text = mgt_human.get_text_by_index(file_to_display, index=index_to_view)
                st.write("对应的文本内容为:", text)
        else:
            # Guard: number_input would raise if max_value (-1) < min_value (0).
            st.write("该文件中没有有效的文本条目。")
    else:
        st.write("未找到任何 JSON 文件,请检查 ZIP 文件结构。")

# Clean up the temp directory; ignore_errors makes the button safe to click
# even when nothing has been uploaded yet (no FileNotFoundError).
if st.button("清除文件"):
    import shutil
    shutil.rmtree("temp", ignore_errors=True)
    st.write("临时文件已清除。")