m-ric HF staff committed on
Commit
afb37e6
•
1 Parent(s): 6d80cb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -10
app.py CHANGED
@@ -25,19 +25,19 @@ def extract_separators_from_string(separators_str):
25
  Please type it in the correct format: "['separator_1', 'separator_2', etc]"
26
  """)
27
 
28
- def change_split_selection(text, slider_count, split_selection, separator_selection, length_unit_selection):
29
  return (
30
  gr.Textbox.update(visible=(split_selection==LABEL_RECURSIVE)),
31
- chunk(text, slider_count, split_selection, separator_selection, length_unit_selection)
32
  )
33
 
34
- def chunk(text, length, splitter_selection, separators_str, length_unit_selection):
35
  separators = extract_separators_from_string(separators_str)
36
  length_function = (length_tokens if "token" in length_unit_selection.lower() else len)
37
  if splitter_selection == LABEL_TEXTSPLITTER:
38
  text_splitter = CharacterTextSplitter(
39
  chunk_size=length,
40
- chunk_overlap=0,
41
  length_function=length_function,
42
  strip_whitespace=False,
43
  is_separator_regex=False,
@@ -46,7 +46,7 @@ def chunk(text, length, splitter_selection, separators_str, length_unit_selectio
46
  elif splitter_selection == LABEL_RECURSIVE:
47
  text_splitter = RecursiveCharacterTextSplitter(
48
  chunk_size=length,
49
- chunk_overlap=0,
50
  length_function=length_function,
51
  strip_whitespace=False,
52
  separators=separators,
@@ -54,7 +54,9 @@ def chunk(text, length, splitter_selection, separators_str, length_unit_selectio
54
  splits = text_splitter.create_documents([text])
55
  text_splits = [split.page_content for split in splits]
56
 
57
- output = [(split, str(i)) for i, split in enumerate(text_splits)]
 
 
58
  return output
59
 
60
 
@@ -134,6 +136,9 @@ with gr.Blocks(theme=gr.themes.Soft(text_size='lg', font=["monospace"], primary_
134
  slider_count = gr.Slider(
135
  20, 500, value=200, label="Chunk length 📏", info="In the chosen unit."
136
  )
 
 
 
137
  out = gr.HighlightedText(
138
  label="Output",
139
  show_legend=True,
@@ -141,22 +146,27 @@ with gr.Blocks(theme=gr.themes.Soft(text_size='lg', font=["monospace"], primary_
141
  )
142
  text.change(
143
  fn=chunk,
144
- inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
145
  outputs=out,
146
  )
147
  length_unit_selection.change(
148
  fn=chunk,
149
- inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
150
  outputs=out,
151
  )
152
  split_selection.change(
153
  fn=change_split_selection,
154
- inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
155
  outputs=[separator_selection, out],
156
  )
157
  slider_count.change(
158
  fn=chunk,
159
- inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
 
 
 
 
 
160
  outputs=out,
161
  )
162
  demo.launch()
 
25
  Please type it in the correct format: "['separator_1', 'separator_2', etc]"
26
  """)
27
 
28
+ def change_split_selection(text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap):
29
  return (
30
  gr.Textbox.update(visible=(split_selection==LABEL_RECURSIVE)),
31
+ chunk(text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap)
32
  )
33
 
34
+ def chunk(text, length, splitter_selection, separators_str, length_unit_selection, chunk_overlap):
35
  separators = extract_separators_from_string(separators_str)
36
  length_function = (length_tokens if "token" in length_unit_selection.lower() else len)
37
  if splitter_selection == LABEL_TEXTSPLITTER:
38
  text_splitter = CharacterTextSplitter(
39
  chunk_size=length,
40
+ chunk_overlap=chunk_overlap,
41
  length_function=length_function,
42
  strip_whitespace=False,
43
  is_separator_regex=False,
 
46
  elif splitter_selection == LABEL_RECURSIVE:
47
  text_splitter = RecursiveCharacterTextSplitter(
48
  chunk_size=length,
49
+ chunk_overlap=chunk_overlap,
50
  length_function=length_function,
51
  strip_whitespace=False,
52
  separators=separators,
 
54
  splits = text_splitter.create_documents([text])
55
  text_splits = [split.page_content for split in splits]
56
 
57
+ unoverlapped_text_splits = unoverlap_list(text_splits)
58
+
59
+ output = [((split[0], 0) if split[1] else (split[0], str(i+1))) for i, split in enumerate(unoverlapped_text_splits)]
60
  return output
61
 
62
 
 
136
  slider_count = gr.Slider(
137
  20, 500, value=200, label="Chunk length 📏", info="In the chosen unit."
138
  )
139
+ chunk_overlap = gr.Slider(
140
+ 0, 30, value=10, label="Overlap between chunks", info="In the chosen unit."
141
+ )
142
  out = gr.HighlightedText(
143
  label="Output",
144
  show_legend=True,
 
146
  )
147
  text.change(
148
  fn=chunk,
149
+ inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
150
  outputs=out,
151
  )
152
  length_unit_selection.change(
153
  fn=chunk,
154
+ inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
155
  outputs=out,
156
  )
157
  split_selection.change(
158
  fn=change_split_selection,
159
+ inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
160
  outputs=[separator_selection, out],
161
  )
162
  slider_count.change(
163
  fn=chunk,
164
+ inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
165
+ outputs=out,
166
+ )
167
+ chunk_overlap.change(
168
+ fn=chunk,
169
+ inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection, chunk_overlap],
170
  outputs=out,
171
  )
172
  demo.launch()