Spaces:
Running
Running
File size: 7,425 Bytes
e7cb6de db61c57 bb88228 2c8408f e7cb6de db61c57 2c8408f db61c57 2c8408f db61c57 2c8408f bb88228 2c8408f bb88228 2c8408f bb88228 2c8408f bb88228 2c8408f bb88228 2c8408f bb88228 2c8408f bb88228 2c8408f db61c57 bb88228 e7cb6de bb88228 2c8408f bb88228 2c8408f bb88228 2c8408f bb88228 37c61d6 bb88228 2c8408f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import gradio as gr
from haystack import Document
from haystack.nodes import PreProcessor
from langchain.text_splitter import (
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
)
# Module-level Haystack PreProcessor instance. Only the cleaning options are
# fixed here; split parameters are passed to preprocessor.split() per call.
_preprocessor_settings = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": True,
    "remove_substrings": None,
    "max_chars_check": 10_000,
}
preprocessor = PreProcessor(**_preprocessor_settings)
import difflib
def separate_overlap(s1, s2):
    """Split two consecutive chunks around the text they share.

    Looks for the longest non-empty suffix of ``s1`` that is also a prefix
    of ``s2``.

    Args:
        s1: The earlier chunk.
        s2: The chunk that follows ``s1``.

    Returns:
        ``[s1_without_overlap, overlap, s2_without_overlap]`` when a
        non-empty overlap exists, otherwise ``[s1, s2]`` unchanged. Callers
        tell the two cases apart by the length of the returned list.
    """
    # An overlap can never be longer than the shorter string, so start there
    # rather than at a negative index when s2 is longer than s1.
    longest = min(len(s1), len(s2))
    for size in range(longest, 0, -1):
        if s1.endswith(s2[:size]):
            cut = len(s1) - size
            return [s1[:cut], s1[cut:], s2[size:]]
    # No common suffix/prefix: hand the chunks back untouched.
    return [s1, s2]
def extract_overlaps(list):
    """Label chunks and pull text shared by neighbours into its own entry.

    Args:
        list: Chunk strings in document order. (The name shadows the
            builtin; it is kept so existing callers are unaffected.)

    Returns:
        A list of ``[text, label]`` pairs. Each original chunk is labelled
        with its position as a string ("0", "1", ...). Text shared by two
        consecutive chunks is removed from both and inserted between them
        with the label ``"overlap"``.
    """
    # Labels are strings throughout so chunk indices and the "overlap"
    # marker are the same kind of value in the result.
    annotated = [[text, str(position)] for position, text in enumerate(list)]
    cursor = 0
    while cursor < len(annotated) - 1:
        parts = separate_overlap(annotated[cursor][0], annotated[cursor + 1][0])
        if len(parts) == 3:
            before, shared, after = parts
            annotated[cursor][0] = before
            annotated.insert(cursor + 1, [shared, "overlap"])
            annotated[cursor + 2][0] = after
            # Step past the inserted overlap entry to land on the next chunk.
            cursor += 2
        else:
            cursor += 1
    return annotated
def chunk(text, words, splitter_selection, slider_overlap):
    """Split ``text`` with the selected strategy and label the pieces.

    Args:
        text: The text to split.
        words: Chunk size. Passed as the word count for the "Word ..."
            methods and as the ``len``-measured chunk size for the
            character-based methods.
        splitter_selection: Name of the chunking method: any name containing
            "Word" (sentence boundaries are respected when it also contains
            "respect sentence boundaries"), "Character", or
            "Recursive Character Text Splitter".
        slider_overlap: Overlap between adjacent chunks, in the same unit
            as ``words``.

    Returns:
        A list of ``(text, label)`` pairs. With a positive overlap the
        result comes from ``extract_overlaps``; otherwise each chunk is
        labelled with its index as a string. Empty ``text`` yields ``[]``.

    Raises:
        ValueError: If ``splitter_selection`` is not a known method.
    """
    # Nothing to split; avoid calling the splitters on empty input.
    if not text:
        return []

    # UI sliders may deliver floats; the splitters expect whole numbers.
    words = int(words)
    slider_overlap = int(slider_overlap)
    # NOTE(review): the UI sliders allow an overlap larger than the chunk
    # size; the underlying splitters may reject that combination -- confirm
    # and consider validating here.

    selection = splitter_selection or ""
    if "Word" in selection:
        splits = preprocessor.split(
            Document(text),
            split_length=words,
            split_by="word",
            split_overlap=slider_overlap,
            split_respect_sentence_boundary=(
                "respect sentence boundaries" in selection
            ),
        )
        text_splits = [split.content for split in splits]
    elif selection == "Character":
        text_splitter = CharacterTextSplitter(
            separator="",
            chunk_size=words,
            chunk_overlap=slider_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        splits = text_splitter.create_documents([text])
        text_splits = [split.page_content for split in splits]
    elif selection == "Recursive Character Text Splitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=words,
            chunk_overlap=slider_overlap,
            length_function=len,
            add_start_index=True,
            separators=["\n\n", "\n", ".", " ", ""],
        )
        splits = text_splitter.create_documents([text])
        text_splits = [split.page_content for split in splits]
    else:
        # Previously this fell through and failed on an unbound local.
        raise ValueError(f"Unknown chunking method: {splitter_selection!r}")

    if slider_overlap > 0:
        return extract_overlaps(text_splits)
    return [(split, str(i)) for i, split in enumerate(text_splits)]
# Default sample text pre-filled in the input textbox: a prose excerpt
# followed by a markdown snippet with fenced code, so the different chunking
# methods can be compared on both kinds of content.
ESSAY = """
Chapter 6
WHAT SORT OF DESPOTISM DEMOCRATIC NATIONS HAVE TO FEAR
I had remarked during my stay in the United States that a democratic state of society, similar to that of the Americans, might offer singular facilities for the establishment of despotism; and I perceived, upon my return to Europe, how much use had already been made, by most of our rulers, of the notions, the sentiments, and the wants created by this same social condition, for the purpose of extending the circle of their power. This led me to think that the nations of Christendom would perhaps eventually undergo some oppression like that which hung over several of the nations of the ancient world. .
A more accurate examination of the subject, and five years of further meditation, have not diminished my fears, but have changed their object.
No sovereign ever lived in former ages so absolute or so powerful as to undertake to administer by his own agency, and without the assistance of intermediate powers, all the parts of a great empire; none ever attempted to subject all his subjects indiscriminately to strict uniformity of regulation and personally to tutor and direct every member of the community. The notion of such an undertaking never occurred to the human mind; and if any man had conceived it, the want of information, the imperfection of the administrative system, and, above all, the natural obstacles caused by the inequality of conditions would speedily have checked the execution of so vast a design.
When the Roman emperors were at the height of their power, the different nations of the empire still preserved usages and customs of great diversity; although they were subject to the same monarch, most of the provinces were separately administered; they abounded in powerful and active municipalities; and although the whole government of the empire was centered in the hands of the Emperor alone and he always remained, in case of need, the supreme arbiter in all matters, yet the details of social life and private occupations lay for the most part beyond his control. The emperors possessed, it is true, an immense and unchecked power, which allowed them to gratify all their whimsical tastes and to employ for that purpose the whole strength of the state. They frequently abused that power arbitrarily to deprive their subjects of property or of life; their tyranny was extremely onerous to the few, but it did not reach the many; it was confined to some few main objects and neglected the rest; it was violent, but its range was limited.
---
Then you can [Create a dataset repository](../huggingface_hub/quick-start#create-a-repository), for example using:
```python
from huggingface_hub import HfApi
HfApi().create_repo(repo_id="username/my_dataset", repo_type="dataset")
```
Finally, you can use [Hugging Face paths]([Hugging Face paths](https://huggingface.co/docs/huggingface_hub/guides/hf_file_system#integrations)) in Pandas:
```python
import pandas as pd
df.to_parquet("hf://datasets/username/my_dataset/data.parquet")
# or write in separate files if the dataset has train/validation/test splits
df_train.to_parquet("hf://datasets/username/my_dataset/train.parquet")
df_valid.to_parquet("hf://datasets/username/my_dataset/validation.parquet")
df_test .to_parquet("hf://datasets/username/my_dataset/test.parquet")
```
"""
# Build the Gradio interface: changing any input re-runs chunk() and the
# labelled chunks are rendered as highlighted text.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # The emoji in this label and in the "Count" label were mis-decoded
    # UTF-8 bytes; they are restored to the characters those bytes encode.
    text = gr.Textbox(label="Your text 🪶", value=ESSAY)
    split_selection = gr.Radio(
        [
            "Word - respect sentence boundaries",
            "Word - no sentence boundaries",
            "Recursive Character Text Splitter",
            "Character",
        ],
        value="Character",
        label="Chunking method ",
        info="How should we split our chunks",
    )
    slider_count = gr.Slider(
        20, 500, value=50, label="Count 🧮", info="Chunk size, in the chosen unit."
    )
    slider_overlap = gr.Slider(
        0,
        100,
        value=0,
        # NOTE(review): the trailing character looks like the remains of a
        # mis-decoded emoji that cannot be recovered from what is left;
        # confirm the intended label.
        label="Overlap ๐",
        info="Size of overlap between adjacent chunks.",
    )
    out = gr.HighlightedText(
        label="Output",
        show_legend=True,
        show_label=False,
    )

    # Every input triggers the same recomputation, so register the handler
    # once per component instead of repeating the call four times.
    chunk_inputs = [text, slider_count, split_selection, slider_overlap]
    for component in (text, split_selection, slider_count, slider_overlap):
        component.change(fn=chunk, inputs=chunk_inputs, outputs=out)

demo.launch()
|