m-ric HF staff committed on
Commit
af9e1dd
·
verified ·
1 Parent(s): 047048a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -19
app.py CHANGED
@@ -1,6 +1,4 @@
1
  import gradio as gr
2
- from haystack import Document
3
- from haystack.nodes import PreProcessor
4
  from langchain.text_splitter import (
5
  CharacterTextSplitter,
6
  RecursiveCharacterTextSplitter,
@@ -15,18 +13,7 @@ preprocessor = PreProcessor(
15
  )
16
 
17
 
18
- def chunk(text, words, splitter_selection):
19
- if "Word" in splitter_selection:
20
- splits = preprocessor.split(
21
- Document(text),
22
- split_length=words,
23
- split_by="word",
24
- split_overlap=0,
25
- split_respect_sentence_boundary=(
26
- "respect sentence boundaries" in splitter_selection
27
- ),
28
- )
29
- text_splits = [split.content for split in splits]
30
  elif splitter_selection == "LangChain's CharacterTextSplitter":
31
  text_splitter = CharacterTextSplitter(
32
  separator="",
@@ -39,7 +26,7 @@ def chunk(text, words, splitter_selection):
39
  text_splits = [split.page_content for split in splits]
40
  elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - vanilla":
41
  text_splitter = RecursiveCharacterTextSplitter(
42
- chunk_size=words,
43
  chunk_overlap=0,
44
  length_function=len,
45
  add_start_index=True,
@@ -48,7 +35,7 @@ def chunk(text, words, splitter_selection):
48
  text_splits = [split.page_content for split in splits]
49
  elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - with '.'":
50
  text_splitter = RecursiveCharacterTextSplitter(
51
- chunk_size=words,
52
  chunk_overlap=0,
53
  length_function=len,
54
  add_start_index=True,
@@ -107,12 +94,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
107
  "LangChain's CharacterTextSplitter",
108
  "Langchain's RecursiveCharacterTextSplitter - vanilla",
109
  "Langchain's RecursiveCharacterTextSplitter - with '.'",
110
- "Haystack's PreProcessor - Word level, no sentence boundaries",
111
- "Haystack's PreProcessor - Word level, respect sentence boundaries",
112
  ],
113
  value="LangChain's CharacterTextSplitter",
114
  label="Chunking method ",
115
- info="How should we split our chunks",
 
 
 
 
 
 
 
 
 
 
 
116
  )
117
  slider_count = gr.Slider(
118
  20, 500, value=50, label="Count 🧮", info="Chunk size, in the chosen unit."
@@ -127,6 +123,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
127
  inputs=[text, slider_count, split_selection],
128
  outputs=out,
129
  )
 
 
 
 
 
130
  split_selection.change(
131
  fn=chunk,
132
  inputs=[text, slider_count, split_selection],
 
1
  import gradio as gr
 
 
2
  from langchain.text_splitter import (
3
  CharacterTextSplitter,
4
  RecursiveCharacterTextSplitter,
 
13
  )
14
 
15
 
16
+ def chunk(text, length, splitter_selection):
 
 
 
 
 
 
 
 
 
 
 
17
  elif splitter_selection == "LangChain's CharacterTextSplitter":
18
  text_splitter = CharacterTextSplitter(
19
  separator="",
 
26
  text_splits = [split.page_content for split in splits]
27
  elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - vanilla":
28
  text_splitter = RecursiveCharacterTextSplitter(
29
+ chunk_size=length,
30
  chunk_overlap=0,
31
  length_function=len,
32
  add_start_index=True,
 
35
  text_splits = [split.page_content for split in splits]
36
  elif splitter_selection == "Langchain's RecursiveCharacterTextSplitter - with '.'":
37
  text_splitter = RecursiveCharacterTextSplitter(
38
+ chunk_size=length,
39
  chunk_overlap=0,
40
  length_function=len,
41
  add_start_index=True,
 
94
  "LangChain's CharacterTextSplitter",
95
  "Langchain's RecursiveCharacterTextSplitter - vanilla",
96
  "Langchain's RecursiveCharacterTextSplitter - with '.'",
 
 
97
  ],
98
  value="LangChain's CharacterTextSplitter",
99
  label="Chunking method ",
100
+ info="How should we split our chunks?",
101
+ )
102
+ demo = gr.Interface(fn=greet, inputs="textbox", outputs="textbox")
103
+ separator_selection = gradio.Textbox(value=["\n\n", "\n", ".", " ", ""], label="Separators used in RecursiveCharacterTextSplitter")
104
+ length_unit_selection = gr.Dropdown(
105
+ choices=[
106
+ "Character count",
107
+ "Token count",
108
+ ],
109
+ value="Token count",
110
+ label="Length count",
111
+ info="How should we count our chunk lengths?",
112
  )
113
  slider_count = gr.Slider(
114
  20, 500, value=50, label="Count 🧮", info="Chunk size, in the chosen unit."
 
123
  inputs=[text, slider_count, split_selection],
124
  outputs=out,
125
  )
126
+ length_unit_selection.change(
127
+ fn=chunk,
128
+ inputs=[text, slider_count, split_selection],
129
+ outputs=out,
130
+ )
131
  split_selection.change(
132
  fn=chunk,
133
  inputs=[text, slider_count, split_selection],