Aymeric Roucher committed on
Commit
2c8408f
•
1 Parent(s): 2728386

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -23
app.py CHANGED
@@ -1,47 +1,93 @@
1
  import gradio as gr
2
  from haystack import Document
3
  from haystack.nodes import PreProcessor
4
- from langchain.text_splitter import CharacterTextSplitter
 
 
 
5
 
6
  preprocessor = PreProcessor(
7
  clean_empty_lines=True,
8
  clean_whitespace=True,
9
  clean_header_footer=True,
10
- remove_substrings=None,
11
- max_chars_check=10_000
12
  )
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- def chunk(text, words, splitter_selection):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  if "Word" in splitter_selection:
16
  splits = preprocessor.split(
17
  Document(text),
18
  split_length=words,
19
  split_by="word",
20
- split_overlap=0,
21
- split_respect_sentence_boundary=("respect sentence boundaries" in splitter_selection),
 
 
22
  )
23
  text_splits = [split.content for split in splits]
24
- if splitter_selection == "Character":
25
- text_splitter=CharacterTextSplitter(
26
  separator="",
27
  chunk_size=words,
28
- chunk_overlap=0,
29
  length_function=len,
30
  is_separator_regex=False,
31
  )
32
  splits = text_splitter.create_documents([text])
33
  text_splits = [split.page_content for split in splits]
 
 
 
 
 
 
 
 
 
 
 
34
 
 
 
35
 
36
-
37
- output = [(split, str(i)) for i, split in enumerate(text_splits)]
38
- return(output)
39
 
40
 
41
  ESSAY = """
42
  Chapter 6
43
  WHAT SORT OF DESPOTISM DEMOCRATIC NATIONS HAVE TO FEAR
44
- I HAD remarked during my stay in the United States that a democratic state of society, similar to that of the Americans, might offer singular facilities for the establishment of despotism; and I perceived, upon my return to Europe, how much use had already been made, by most of our rulers, of the notions, the sentiments, and the wants created by this same social condition, for the purpose of extending the circle of their power. This led me to think that the nations of Christendom would perhaps eventually undergo some oppression like that which hung over several of the nations of the ancient world. .
 
45
 
46
  A more accurate examination of the subject, and five years of further meditation, have not diminished my fears, but have changed their object.
47
 
@@ -49,23 +95,79 @@ No sovereign ever lived in former ages so absolute or so powerful as to undertak
49
 
50
  When the Roman emperors were at the height of their power, the different nations of the empire still preserved usages and customs of great diversity; although they were subject to the same monarch, most of the provinces were separately administered; they abounded in powerful and active municipalities; and although the whole government of the empire was centered in the hands of the Emperor alone and he always remained, in case of need, the supreme arbiter in all matters, yet the details of social life and private occupations lay for the most part beyond his control. The emperors possessed, it is true, an immense and unchecked power, which allowed them to gratify all their whimsical tastes and to employ for that purpose the whole strength of the state. They frequently abused that power arbitrarily to deprive their subjects of property or of life; their tyranny was extremely onerous to the few, but it did not reach the many; it was confined to some few main objects and neglected the rest; it was violent, but its range was limited.
51
 
52
- It would seem that if despotism were to be established among the democratic nations of our days, it might assume a different character; it would be more extensive and more mild; it would degrade men without tormenting them. I do not question that, in an age of instruction and equality like our own, sovereigns might more easily succeed in collecting all political power into their own hands and might interfere more habitually and decidedly with the circle of private interests than any sovereign of antiquity could ever do. But this same principle of equality which facilitates despotism tempers its rigor. We have seen how the customs of society become more humane and gentle in proportion as men become more equal and alike. When no member of the community has much power or much wealth, tyranny is, as it were, without opportunities and a field of action. As all fortunes are scanty, the passions of men are naturally circumscribed, their imagination limited, their pleasures simple. This universal moderation moderates the sovereign himself and checks within certain limits the inordinate stretch of his desires.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- Independently of these reasons, drawn from the nature of the state of society itself, I might add many others arising from causes beyond my subject; but I shall keep within the limits I have laid down.
55
 
56
- Democratic governments may become violent and even cruel at certain periods of extreme effervescence or of great danger, but these crises will be rare and brief. When I consider the petty passions of our contemporaries, the mildness of their manners, the extent of their education, the purity of their religion, the gentleness of their morality, their regular and industrious habits, and the restraint which they almost all observe in their vices no less than in their virtues, I have no fear that they will meet with tyrants in their rulers, but rather with guardians.1
57
  """
 
 
58
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
59
  text = gr.Textbox(label="Your text ๐Ÿชถ", value=ESSAY)
60
- split_selection = gr.Radio(["Word - respect sentence boundaries", "Word - no sentence boundaries", "Character"], value='Character', label='Chunking method ๐Ÿž', info='How should we split our chunks?')
61
- slider = gr.Slider(20, 500, value=50, label="Count ๐Ÿงฎ", info="Chunk size, in the chosen unit.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  out = gr.HighlightedText(
63
  label="Output",
64
  show_legend=True,
65
  show_label=False,
66
  )
67
- text.change(fn=chunk, inputs=[text, slider, split_selection], outputs=out)
68
- split_selection.change(fn=chunk, inputs=[text, slider, split_selection], outputs=out)
69
- slider.change(fn=chunk, inputs=[text, slider, split_selection], outputs=out)
70
-
71
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import difflib

import gradio as gr
from haystack import Document
from haystack.nodes import PreProcessor
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)

# Module-level Haystack preprocessor, built once at import time and reused by
# `chunk` for the word-based splitting strategies.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    remove_substrings=None,
    max_chars_check=10_000,
)
17
+
18
+
19
def separate_overlap(s1, s2):
    """Split two consecutive chunks around the text they share.

    Searches for the longest non-empty suffix of ``s1`` that is also a
    prefix of ``s2``.

    Args:
        s1: The earlier chunk.
        s2: The chunk that follows ``s1``.

    Returns:
        ``[head, overlap, tail]`` when an overlap exists, such that
        ``head + overlap == s1`` and ``overlap + tail == s2``; otherwise
        ``[s1, s2]`` unchanged.
    """
    # The overlap can never be longer than s2, so start at the first index
    # whose suffix fits inside s2. Clamp at 0: when s2 is longer than s1 the
    # raw difference is negative and would be read as a wrap-around index.
    start = max(0, len(s1) - len(s2))
    for i in range(start, len(s1)):
        suffix = s1[i:]
        # Scanning left to right means the first hit is the longest overlap.
        if s2.startswith(suffix):
            return [s1[:i], suffix, s2[len(suffix):]]
    # No shared text: hand both strings back untouched.
    return [s1, s2]
26
+
27
 
28
def extract_overlaps(list):
    """Annotate chunks and pull shared text out into separate entries.

    Each chunk starts as a ``[text, index]`` pair. Whenever two neighbouring
    chunks overlap (as reported by ``separate_overlap``), the shared text is
    removed from both and inserted between them as ``[text, "overlap"]``.

    Args:
        list: The chunk strings, in document order. (The name shadows the
            builtin; it is kept because it is part of the signature.)

    Returns:
        A list of two-element lists ``[text, label]`` where ``label`` is the
        chunk's original integer index or the string ``"overlap"``.
    """
    annotated_list = [[text, position] for position, text in enumerate(list)]

    cursor = 0
    while cursor < len(annotated_list) - 1:
        current = annotated_list[cursor]
        following = annotated_list[cursor + 1]
        pieces = separate_overlap(current[0], following[0])

        if len(pieces) == 2:
            # Nothing shared with the next chunk; move on to it.
            cursor += 1
            continue

        if len(pieces) == 3:
            head, shared, tail = pieces
            current[0] = head
            following[0] = tail
            annotated_list.insert(cursor + 1, [shared, "overlap"])
            # Skip the freshly inserted overlap entry and resume at the
            # trimmed following chunk.
            cursor += 2

    return annotated_list
42
+
43
+
44
def chunk(text, words, splitter_selection, slider_overlap):
    """Split ``text`` with the chosen strategy and label the pieces for display.

    Args:
        text: The text to split.
        words: Chunk size. Passed as ``split_length`` to the Haystack
            preprocessor for the "Word ..." strategies and as ``chunk_size``
            (measured with ``len``) to the LangChain splitters.
        splitter_selection: Name of the chunking method. Any value containing
            ``"Word"`` uses the Haystack preprocessor (sentence boundaries are
            respected when it also contains ``"respect sentence boundaries"``);
            ``"Character"`` and ``"Recursive Character Text Splitter"`` use the
            corresponding LangChain splitter.
        slider_overlap: Overlap between adjacent chunks, in the same unit as
            ``words``. When greater than zero, shared text is separated out
            and labelled ``"overlap"``.

    Returns:
        A list of ``(text, label)`` tuples. ``label`` is always a string:
        the chunk index, or ``"overlap"`` for text shared by two chunks.

    Raises:
        ValueError: If ``splitter_selection`` names no known strategy.
    """
    if "Word" in splitter_selection:
        splits = preprocessor.split(
            Document(text),
            split_length=words,
            split_by="word",
            split_overlap=slider_overlap,
            split_respect_sentence_boundary=(
                "respect sentence boundaries" in splitter_selection
            ),
        )
        text_splits = [split.content for split in splits]
    elif splitter_selection == "Character":
        text_splitter = CharacterTextSplitter(
            separator="",
            chunk_size=words,
            chunk_overlap=slider_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        splits = text_splitter.create_documents([text])
        text_splits = [split.page_content for split in splits]
    elif splitter_selection == "Recursive Character Text Splitter":
        # The recursive splitter is configured through `separators` (plural)
        # only; the singular `separator` keyword belongs to
        # CharacterTextSplitter and is rejected here.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=words,
            chunk_overlap=slider_overlap,
            length_function=len,
            add_start_index=True,
            separators=["\n\n", "\n", ".", " ", ""],
        )
        splits = text_splitter.create_documents([text])
        text_splits = [split.page_content for split in splits]
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(f"Unknown chunking method: {splitter_selection!r}")

    if slider_overlap > 0:
        # extract_overlaps yields [text, label] lists with integer chunk
        # indices; normalise to the same (text, str) shape as the
        # no-overlap path so the output component sees one label type.
        return [
            (piece, str(label)) for piece, label in extract_overlaps(text_splits)
        ]

    return [(split, str(i)) for i, split in enumerate(text_splits)]
84
 
85
 
86
  ESSAY = """
87
  Chapter 6
88
  WHAT SORT OF DESPOTISM DEMOCRATIC NATIONS HAVE TO FEAR
89
+
90
+ I had remarked during my stay in the United States that a democratic state of society, similar to that of the Americans, might offer singular facilities for the establishment of despotism; and I perceived, upon my return to Europe, how much use had already been made, by most of our rulers, of the notions, the sentiments, and the wants created by this same social condition, for the purpose of extending the circle of their power. This led me to think that the nations of Christendom would perhaps eventually undergo some oppression like that which hung over several of the nations of the ancient world. .
91
 
92
  A more accurate examination of the subject, and five years of further meditation, have not diminished my fears, but have changed their object.
93
 
 
95
 
96
  When the Roman emperors were at the height of their power, the different nations of the empire still preserved usages and customs of great diversity; although they were subject to the same monarch, most of the provinces were separately administered; they abounded in powerful and active municipalities; and although the whole government of the empire was centered in the hands of the Emperor alone and he always remained, in case of need, the supreme arbiter in all matters, yet the details of social life and private occupations lay for the most part beyond his control. The emperors possessed, it is true, an immense and unchecked power, which allowed them to gratify all their whimsical tastes and to employ for that purpose the whole strength of the state. They frequently abused that power arbitrarily to deprive their subjects of property or of life; their tyranny was extremely onerous to the few, but it did not reach the many; it was confined to some few main objects and neglected the rest; it was violent, but its range was limited.
97
 
98
+ ---
99
+
100
+ Then you can [Create a dataset repository](../huggingface_hub/quick-start#create-a-repository), for example using:
101
+
102
+ ```python
103
+ from huggingface_hub import HfApi
104
+
105
+ HfApi().create_repo(repo_id="username/my_dataset", repo_type="dataset")
106
+ ```
107
+
108
+ Finally, you can use [Hugging Face paths]([Hugging Face paths](https://huggingface.co/docs/huggingface_hub/guides/hf_file_system#integrations)) in Pandas:
109
+
110
+ ```python
111
+ import pandas as pd
112
+
113
+ df.to_parquet("hf://datasets/username/my_dataset/data.parquet")
114
+
115
+ # or write in separate files if the dataset has train/validation/test splits
116
+ df_train.to_parquet("hf://datasets/username/my_dataset/train.parquet")
117
+ df_valid.to_parquet("hf://datasets/username/my_dataset/validation.parquet")
118
+ df_test .to_parquet("hf://datasets/username/my_dataset/test.parquet")
119
+ ```
120
 
 
121
 
 
122
  """
123
+
124
+
125
# Demo UI: a text box, a chunking-method selector and two sliders feed
# `chunk`; the result is rendered as highlighted text.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    text = gr.Textbox(label="Your text 🪶", value=ESSAY)
    split_selection = gr.Radio(
        [
            "Word - respect sentence boundaries",
            "Word - no sentence boundaries",
            "Recursive Character Text Splitter",
            "Character",
        ],
        value="Character",
        label="Chunking method ",
        info="How should we split our chunks",
    )
    slider_count = gr.Slider(
        20, 500, value=50, label="Count 🧮", info="Chunk size, in the chosen unit."
    )
    slider_overlap = gr.Slider(
        0,
        100,
        value=0,
        label="Overlap 🔀",
        info="Size of overlap between adjacent chunks.",
    )
    out = gr.HighlightedText(
        label="Output",
        show_legend=True,
        show_label=False,
    )

    # Any input change re-runs the chunker on the full current state, so all
    # four components share the same handler, inputs and output.
    chunk_inputs = [text, slider_count, split_selection, slider_overlap]
    for component in (text, split_selection, slider_count, slider_overlap):
        component.change(fn=chunk, inputs=chunk_inputs, outputs=out)

demo.launch()