Spaces:
Running
Running
modify app.py
Browse files
app.py
CHANGED
@@ -68,7 +68,7 @@ class MGTHuman(datasets.GeneratorBasedBuilder):
|
|
68 |
else:
|
69 |
return text
|
70 |
|
71 |
-
def get_text_by_index(self, filepath, index):
|
72 |
count = 0
|
73 |
with open(filepath, 'r') as f:
|
74 |
data = json.load(f)
|
@@ -76,7 +76,9 @@ class MGTHuman(datasets.GeneratorBasedBuilder):
|
|
76 |
if not row["text"].strip():
|
77 |
continue
|
78 |
if count == index:
|
79 |
-
text =
|
|
|
|
|
80 |
return text
|
81 |
count += 1
|
82 |
return "Index 超出范围,请输入有效的数字。"
|
@@ -124,9 +126,12 @@ if uploaded_folder:
|
|
124 |
|
125 |
# 输入序号查看文本
|
126 |
index_to_view = st.number_input("输入要查看的文本序号", min_value=0, max_value=total_entries - 1, step=1)
|
|
|
|
|
|
|
127 |
|
128 |
if st.button("显示文本"):
|
129 |
-
text = mgt_human.get_text_by_index(file_to_display, index=index_to_view)
|
130 |
st.write("对应的文本内容为:", text)
|
131 |
else:
|
132 |
st.write("未找到任何 JSON 文件,请检查 ZIP 文件结构。")
|
|
|
68 |
else:
|
69 |
return text
|
70 |
|
71 |
+
def get_text_by_index(self, filepath, index, cut_tokens=False, max_tokens=2048):
|
72 |
count = 0
|
73 |
with open(filepath, 'r') as f:
|
74 |
data = json.load(f)
|
|
|
76 |
if not row["text"].strip():
|
77 |
continue
|
78 |
if count == index:
|
79 |
+
text = row["text"]
|
80 |
+
if cut_tokens:
|
81 |
+
text = self.truncate_text(text, max_tokens)
|
82 |
return text
|
83 |
count += 1
|
84 |
return "Index 超出范围,请输入有效的数字。"
|
|
|
126 |
|
127 |
# 输入序号查看文本
|
128 |
index_to_view = st.number_input("输入要查看的文本序号", min_value=0, max_value=total_entries - 1, step=1)
|
129 |
+
|
130 |
+
# 添加复选框以选择是否切割文本
|
131 |
+
cut_tokens = st.checkbox("是否对文本进行token切割", value=False)
|
132 |
|
133 |
if st.button("显示文本"):
|
134 |
+
text = mgt_human.get_text_by_index(file_to_display, index=index_to_view, cut_tokens=cut_tokens)
|
135 |
st.write("对应的文本内容为:", text)
|
136 |
else:
|
137 |
st.write("未找到任何 JSON 文件,请检查 ZIP 文件结构。")
|