Aymeric Roucher committed on
Commit
cb842ed
·
verified ·
1 Parent(s): fadcc20

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -46
app.py CHANGED
@@ -13,41 +13,14 @@ preprocessor = PreProcessor(
13
  remove_substrings=None,
14
  max_chars_check=10_000,
15
  )
16
- import difflib
17
 
18
 
19
- def separate_overlap(s1, s2):
20
- for i in range(len(s1) - len(s2), len(s1)):
21
- if s1[i:] == s2[: len(s1) - i]:
22
- overlap = s1[i:]
23
- return [s1[:i], overlap, s2[len(s1) - i :]]
24
- # if no overlap is found, return the strings
25
- return [s1, s2]
26
-
27
-
28
- def extract_overlaps(list):
29
- i = 0
30
- annotated_list = [[el, i] for i, el in enumerate(list)]
31
-
32
- while i < len(annotated_list) - 1:
33
- separated = separate_overlap(annotated_list[i][0], annotated_list[i + 1][0])
34
- if len(separated) == 2:
35
- i += 1
36
- elif len(separated) == 3:
37
- annotated_list[i][0] = separated[0]
38
- annotated_list.insert(i + 1, [separated[1], "overlap"])
39
- annotated_list[i + 2][0] = separated[2]
40
- i += 2
41
- return annotated_list
42
-
43
-
44
- def chunk(text, words, splitter_selection, slider_overlap):
45
  if "Word" in splitter_selection:
46
  splits = preprocessor.split(
47
  Document(text),
48
  split_length=words,
49
  split_by="word",
50
- split_overlap=slider_overlap,
51
  split_respect_sentence_boundary=(
52
  "respect sentence boundaries" in splitter_selection
53
  ),
@@ -57,7 +30,6 @@ def chunk(text, words, splitter_selection, slider_overlap):
57
  text_splitter = CharacterTextSplitter(
58
  separator="",
59
  chunk_size=words,
60
- chunk_overlap=slider_overlap,
61
  length_function=len,
62
  is_separator_regex=False,
63
  )
@@ -66,7 +38,6 @@ def chunk(text, words, splitter_selection, slider_overlap):
66
  elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - vanilla":
67
  text_splitter = RecursiveCharacterTextSplitter(
68
  chunk_size=words,
69
- chunk_overlap=slider_overlap,
70
  length_function=len,
71
  add_start_index=True,
72
  )
@@ -75,7 +46,6 @@ def chunk(text, words, splitter_selection, slider_overlap):
75
  elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - with '.'":
76
  text_splitter = RecursiveCharacterTextSplitter(
77
  chunk_size=words,
78
- chunk_overlap=slider_overlap,
79
  length_function=len,
80
  add_start_index=True,
81
  separators=["\n\n", "\n", ".", " ", ""],
@@ -151,13 +121,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
151
  slider_count = gr.Slider(
152
  20, 500, value=50, label="Count 🧮", info="Chunk size, in the chosen unit."
153
  )
154
- slider_overlap = gr.Slider(
155
- 0,
156
- 100,
157
- value=0,
158
- label="Overlap 🔀",
159
- info="Size of overlap between adjacent chunks.",
160
- )
161
  out = gr.HighlightedText(
162
  label="Output",
163
  show_legend=True,
@@ -165,22 +128,17 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
165
  )
166
  text.change(
167
  fn=chunk,
168
- inputs=[text, slider_count, split_selection, slider_overlap],
169
  outputs=out,
170
  )
171
  split_selection.change(
172
  fn=chunk,
173
- inputs=[text, slider_count, split_selection, slider_overlap],
174
  outputs=out,
175
  )
176
  slider_count.change(
177
  fn=chunk,
178
- inputs=[text, slider_count, split_selection, slider_overlap],
179
- outputs=out,
180
- )
181
- slider_overlap.change(
182
- fn=chunk,
183
- inputs=[text, slider_count, split_selection, slider_overlap],
184
  outputs=out,
185
  )
186
  demo.launch()
 
13
  remove_substrings=None,
14
  max_chars_check=10_000,
15
  )
 
16
 
17
 
18
+ def chunk(text, words, splitter_selection):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  if "Word" in splitter_selection:
20
  splits = preprocessor.split(
21
  Document(text),
22
  split_length=words,
23
  split_by="word",
 
24
  split_respect_sentence_boundary=(
25
  "respect sentence boundaries" in splitter_selection
26
  ),
 
30
  text_splitter = CharacterTextSplitter(
31
  separator="",
32
  chunk_size=words,
 
33
  length_function=len,
34
  is_separator_regex=False,
35
  )
 
38
  elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - vanilla":
39
  text_splitter = RecursiveCharacterTextSplitter(
40
  chunk_size=words,
 
41
  length_function=len,
42
  add_start_index=True,
43
  )
 
46
  elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - with '.'":
47
  text_splitter = RecursiveCharacterTextSplitter(
48
  chunk_size=words,
 
49
  length_function=len,
50
  add_start_index=True,
51
  separators=["\n\n", "\n", ".", " ", ""],
 
121
  slider_count = gr.Slider(
122
  20, 500, value=50, label="Count 🧮", info="Chunk size, in the chosen unit."
123
  )
 
 
 
 
 
 
 
124
  out = gr.HighlightedText(
125
  label="Output",
126
  show_legend=True,
 
128
  )
129
  text.change(
130
  fn=chunk,
131
+ inputs=[text, slider_count, split_selection],
132
  outputs=out,
133
  )
134
  split_selection.change(
135
  fn=chunk,
136
+ inputs=[text, slider_count, split_selection],
137
  outputs=out,
138
  )
139
  slider_count.change(
140
  fn=chunk,
141
+ inputs=[text, slider_count, split_selection],
 
 
 
 
 
142
  outputs=out,
143
  )
144
  demo.launch()