Evan73 committed on
Commit
479384b
1 Parent(s): 36f0c73

modify app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -3
app.py CHANGED
@@ -68,7 +68,7 @@ class MGTHuman(datasets.GeneratorBasedBuilder):
68
  else:
69
  return text
70
 
71
- def get_text_by_index(self, filepath, index):
72
  count = 0
73
  with open(filepath, 'r') as f:
74
  data = json.load(f)
@@ -76,7 +76,9 @@ class MGTHuman(datasets.GeneratorBasedBuilder):
76
  if not row["text"].strip():
77
  continue
78
  if count == index:
79
- text = self.truncate_text(row["text"], max_tokens=2048)
 
 
80
  return text
81
  count += 1
82
  return "Index 超出范围,请输入有效的数字。"
@@ -124,9 +126,12 @@ if uploaded_folder:
124
 
125
  # 输入序号查看文本
126
  index_to_view = st.number_input("输入要查看的文本序号", min_value=0, max_value=total_entries - 1, step=1)
 
 
 
127
 
128
  if st.button("显示文本"):
129
- text = mgt_human.get_text_by_index(file_to_display, index=index_to_view)
130
  st.write("对应的文本内容为:", text)
131
  else:
132
  st.write("未找到任何 JSON 文件,请检查 ZIP 文件结构。")
 
68
  else:
69
  return text
70
 
71
+ def get_text_by_index(self, filepath, index, cut_tokens=False, max_tokens=2048):
72
  count = 0
73
  with open(filepath, 'r') as f:
74
  data = json.load(f)
 
76
  if not row["text"].strip():
77
  continue
78
  if count == index:
79
+ text = row["text"]
80
+ if cut_tokens:
81
+ text = self.truncate_text(text, max_tokens)
82
  return text
83
  count += 1
84
  return "Index 超出范围,请输入有效的数字。"
 
126
 
127
  # 输入序号查看文本
128
  index_to_view = st.number_input("输入要查看的文本序号", min_value=0, max_value=total_entries - 1, step=1)
129
+
130
+ # 添加复选框以选择是否切割文本
131
+ cut_tokens = st.checkbox("是否对文本进行token切割", value=False)
132
 
133
  if st.button("显示文本"):
134
+ text = mgt_human.get_text_by_index(file_to_display, index=index_to_view, cut_tokens=cut_tokens)
135
  st.write("对应的文本内容为:", text)
136
  else:
137
  st.write("未找到任何 JSON 文件,请检查 ZIP 文件结构。")