Spaces:

m-ric
/

chunk_visualizer

Running

File size: 8,963 Bytes

e7cb6de
2c8408f
 
 
67543ef
2c8408f
6334a1e
5de0055
e7cb6de
8d72163
 
eb84d7e
40a40cb
 
1fa958e
 
 
 
74c0a8b
133d9a7
9501bef
 
 
eb84d7e
9501bef
5de0055
2a1c060
9501bef
eb84d7e
2dd0559
51c0840
 
67543ef
51c0840
b8e290e
afb37e6
9501bef
1fa958e
eb84d7e
1fa958e
 
de45fdc
1fa958e
207ec3b
1fa958e
1b3d905
1fa958e
eb84d7e
a917825
1fa958e
 
de45fdc
1fa958e
 
 
 
5e658e7
40a40cb
 
d3a0161
 
bb88228
afb37e6
 
3244268
2c8408f
bb88228
d9f26aa
 
262dfde
d9f26aa
 
 
 
 
262dfde
 
d9f26aa
bb88228
2a1c060
3265b22
bb88228
3265b22
9b2e5ac
bb88228
 
3265b22
d9f26aa
 
5622e1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3265b22
2c8408f
5622e1c
 
 
 
 
 
 
 
 
2c8408f
bb88228
2c8408f
2a1c060
e8e9bc6
b88152c
5933b14
e304b98
 
eb84d7e
 
e304b98
1c92e81
e8e9bc6
e304b98
e80eb77
1115dfa
d9f26aa
a8ce2d7
384ad14
1c92e81
e304b98
67543ef
 
b88152c
67543ef
1c92e81
b88152c
60ca977
6a12a73
 
 
02271b4
6a12a73
02271b4
 
c38e426
6a12a73
 
333777e
6a12a73
afb37e6
333777e
afb37e6
bb88228
 
37c61d6
bb88228
3244268
bb88228
b88152c
2c8408f
731bcbf
2dd0559
 
2c8408f
67543ef
d9f26aa
67543ef
2dd0559
d9f26aa
b88152c
2dd0559
b88152c
e644d1d
b88152c
2c8408f
d2adb70
93f7595

import gradio as gr
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    Language,
)
from transformers import AutoTokenizer
from overlap import unoverlap_list

# Display labels of the two splitter options. They are offered as dropdown
# choices and compared against the selected value to pick the splitter.
LABEL_TEXTSPLITTER = "🦜🔗 LangChain's CharacterTextSplitter"
LABEL_RECURSIVE = "🦜🔗 LangChain's RecursiveCharacterTextSplitter"

# Tokenizer created once at import time; length_tokens() uses it to measure
# chunk lengths in tokens instead of characters.
bert_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')

def length_tokens(txt):
    """Return how many tokens the module-level BERT tokenizer yields for ``txt``."""
    tokens = bert_tokenizer.tokenize(txt)
    return len(tokens)


def extract_separators_from_string(separators_str):
    r"""Parse the content of the separators textbox into a list of strings.

    The text is first read as a Python literal, so ``['\n\n', '\n', ' ', '']``
    yields real newline characters and a separator may itself contain
    ``", "``. When the text is not a non-empty list/tuple of string literals
    (for example ``[a, b]`` typed without quotes), the function falls back to
    the simpler parsing: drop the first and last character, split on ``", "``
    and strip every quote character from each piece.

    Args:
        separators_str: Text such as ``"['separator_1', 'separator_2']"``.

    Returns:
        The list of separator strings, in the order they were written.

    Raises:
        gr.Error: If the value cannot be parsed at all (e.g. it is not a string).
    """
    # Function-scope import keeps the module's import block untouched.
    import ast

    print('Received:', type(separators_str), 'with value', separators_str)

    try:
        parsed = ast.literal_eval(separators_str.strip())
    except (AttributeError, ValueError, SyntaxError, TypeError,
            MemoryError, RecursionError):
        # Not a valid Python literal (or not a string): use the fallback below.
        parsed = None
    if (
        parsed
        and isinstance(parsed, (list, tuple))
        and all(isinstance(item, str) for item in parsed)
    ):
        return list(parsed)

    try:
        separators = separators_str[1:-1].split(", ")
        return [separator.replace('"', "").replace("'", "") for separator in separators]
    except Exception as e:
        # The message must reference `separators_str`; a misspelled name here
        # would turn the user-facing error into a NameError.
        raise gr.Error(f"""
        Did not succeed in extracting seperators from string: {separators_str} due to: {str(e)}.
        Please type it in the correct format: "['separator_1', 'separator_2', ...]"
        """) from e

def change_split_selection(split_selection):
    """Toggle the recursive-splitter controls when the splitter choice changes.

    Args:
        split_selection: The label currently selected in the splitter dropdown.

    Returns:
        A pair of visibility updates, one per output component wired to this
        callback; both are visible only when ``LABEL_RECURSIVE`` is selected.
    """
    show_recursive_options = split_selection == LABEL_RECURSIVE
    # gr.update() is the component-agnostic update helper. The per-component
    # `.update()` classmethods are not available in Gradio 4, while
    # gr.update() exists in both the 3.x and 4.x lines.
    return (
        gr.update(visible=show_recursive_options),
        gr.update(visible=show_recursive_options),
    )

def chunk(text, length, splitter_selection, separators_str, length_unit_selection, chunk_overlap):
    """Split ``text`` with the selected splitter and label the resulting pieces.

    Args:
        text: The text to split.
        length: Target chunk length, in the selected unit.
        splitter_selection: ``LABEL_TEXTSPLITTER`` or ``LABEL_RECURSIVE``.
        separators_str: Textual list of separators; only parsed and used for
            the recursive splitter.
        length_unit_selection: Unit label; lengths are measured in BERT tokens
            when it contains "token" (case-insensitive), in characters otherwise.
        chunk_overlap: Overlap between consecutive chunks, in the selected unit.

    Returns:
        A list of ``(text, label)`` tuples where ``label`` is ``'Overlap'`` for
        segments flagged as overlap and ``"Chunk <i>"`` otherwise. ``i`` is the
        position of the segment in the whole list, overlap segments included.

    Raises:
        gr.Error: If the overlap exceeds the chunk length, if the splitter
            selection is not recognized, or if the separators cannot be parsed.
    """
    length_function = length_tokens if "token" in length_unit_selection.lower() else len
    # Normalize both sizes to int (the original only converted the overlap).
    chunk_size = int(length)
    overlap = int(chunk_overlap)
    if overlap > chunk_size:
        # The UI ranges allow this combination; report it as a readable UI
        # error instead of letting the splitter constructor fail.
        raise gr.Error(
            f"Chunk overlap ({overlap}) cannot be larger than chunk length ({chunk_size})."
        )

    if splitter_selection == LABEL_TEXTSPLITTER:
        text_splitter = CharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            length_function=length_function,
            strip_whitespace=False,
            is_separator_regex=False,
            separator=" ",
        )
    elif splitter_selection == LABEL_RECURSIVE:
        # Parse separators only here: the other splitter ignores them, so a
        # malformed separators box must not block it.
        separators = extract_separators_from_string(separators_str)
        print('Splitting with separators:', ',,'.join(separators), f'and chunk length {length} and chunk overlap {chunk_overlap}')
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            length_function=length_function,
            strip_whitespace=False,
            separators=separators,
        )
        print(text_splitter._separators)
    else:
        # Without this branch `text_splitter` would be unbound below.
        raise gr.Error("Choice of splitter not recognized.")

    splits = text_splitter.create_documents([text])
    text_splits = [split.page_content for split in splits]
    print('I did splits:')
    print(text_splits)

    # NOTE(review): unoverlap_list comes from the local `overlap` module (not
    # shown here); each item is used as (text, is_overlap) - confirm.
    unoverlapped_text_splits = unoverlap_list(text_splits)

    return [
        (split[0], 'Overlap' if split[1] else f"Chunk {i}")
        for i, split in enumerate(unoverlapped_text_splits)
    ]

def change_preset_separators(choice):
    """Return the separator list associated with the chosen preset.

    Args:
        choice: Preset name: "Default recursive", "Markdown" or "Python".

    Returns:
        The list of separators for that preset.

    Raises:
        gr.Error: If ``choice`` is none of the known presets.
    """
    splitter = RecursiveCharacterTextSplitter()

    if choice == "Default recursive":
        return ["\n\n", "\n", " ", ""]

    # Remaining presets are language-specific: resolve the language first,
    # then ask the splitter for its separators.
    if choice == "Markdown":
        language = Language.MARKDOWN
    elif choice == "Python":
        language = Language.PYTHON
    else:
        raise gr.Error("Choice of preset not recognized.")
    return splitter.get_separators_for_language(language)


# Sample text used as the initial value of the input textbox. It mixes plain
# prose, markdown headings, a numbered list and a fenced code block.
EXAMPLE_TEXT = """### Chapter 6

WHAT SORT OF DESPOTISM DEMOCRATIC NATIONS HAVE TO FEAR

I had remarked during my stay in the United States that a democratic state of society, similar to that of the Americans, might offer singular facilities for the establishment of despotism; and I perceived, upon my return to Europe, how much use had already been made, by most of our rulers, of the notions, the sentiments, and the wants created by this same social condition, for the purpose of extending the circle of their power. This led me to think that the nations of Christendom would perhaps eventually undergo some oppression like that which hung over several of the nations of the ancient world.
A more accurate examination of the subject, and five years of further meditation, have not diminished my fears, but have changed their object.
No sovereign ever lived in former ages so absolute or so powerful as to undertake to administer by his own agency, and without the assistance of intermediate powers, all the parts of a great empire; none ever attempted to subject all his subjects indiscriminately to strict uniformity of regulation and personally to tutor and direct every member of the community. The notion of such an undertaking never occurred to the human mind; and if any man had conceived it, the want of information, the imperfection of the administrative system, and, above all, the natural obstacles caused by the inequality of conditions would speedily have checked the execution of so vast a design.

---

### Challenges of agent systems

Generally, the difficult parts of running an agent system for the LLM engine are:

1. From supplied tools, choose the one that will help advance to a desired goal: e.g. when asked `"What is the smallest prime number greater than 30,000?"`, the agent could call the `Search` tool with `"What is he height of K2"` but it won't help.
2. Call tools with a rigorous argument formatting: for instance when trying to calculate the speed of a car that went 3 km in 10 minutes, you have to call tool `Calculator` to divide `distance` by `time` : even if your Calculator tool accepts calls in the JSON format: `{”tool”: “Calculator”, “args”: “3km/10min”}` , there are many pitfalls, for instance:
    - Misspelling the tool name: `“calculator”` or `“Compute”` wouldn’t work
    - Giving the name of the arguments instead of their values: `“args”: “distance/time”`
    - Non-standardized formatting: `“args": "3km in 10minutes”`
3. Efficiently ingesting and using the information gathered in the past observations, be it the initial context or the observations returned after using tool uses.


So, how would a complete Agent setup look like?

## Running agents with LangChain

We have just integrated a `ChatHuggingFace` wrapper that lets you create agents based on open-source models in [🦜🔗LangChain](https://www.langchain.com/).

The code to create the ChatModel and give it tools is really simple, you can check it all in the [Langchain doc](https://python.langchain.com/docs/integrations/chat/huggingface). 

```python
from langchain_community.llms import HuggingFaceHub
from langchain_community.chat_models.huggingface import ChatHuggingFace

llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
)

chat_model = ChatHuggingFace(llm=llm)
```
"""

    
# Build the UI: the input text, a row of splitter/separator controls, a row of
# length controls, and the highlighted output; then wire the callbacks.
with gr.Blocks(theme=gr.themes.Soft(text_size='lg', font=["monospace"], primary_hue=gr.themes.colors.green)) as demo:
    text = gr.Textbox(label="Your text 🪶", value=EXAMPLE_TEXT)
    with gr.Row():
        split_selection = gr.Dropdown(
            choices=[
                LABEL_TEXTSPLITTER,
                LABEL_RECURSIVE,
            ],
            value=LABEL_RECURSIVE,
            label="Method to split chunks 🍞",
        )
        # NOTE(review): a list (not a string) is passed as the textbox value;
        # presumably Gradio renders its string form - confirm. The callback
        # chunk() receives this textbox's content as `separators_str`.
        separators_selection = gr.Textbox(
            elem_id="textbox_id",
            value=["\n\n", "\n", " ", ""],
            info="Separators used in RecursiveCharacterTextSplitter",
            show_label=False, # or set label to an empty string if you want to keep its space
            visible=True,
        )
        separator_preset_selection = gr.Radio(
            ['Default recursive', 'Python', 'Markdown'],
            label="Choose a preset",
            info="This will apply a specific set of separators to RecursiveCharacterTextSplitter.",
            visible=True,
        )
    with gr.Row():
        # chunk() switches to token counting when the selected label contains
        # "token" (case-insensitive); otherwise it counts characters.
        length_unit_selection = gr.Dropdown(
            choices=[
                "Character count",
                "Token count (BERT tokens)",
            ],
            value="Character count",
            label="Length function",
            info="How should we measure our chunk lengths?",
        )
        slider_count = gr.Slider(
            20, 500, value=200, step=1, label="Chunk length 📏", info="In the chosen unit."
        )
        chunk_overlap = gr.Slider(
            0, 30, value=10, step=1, label="Overlap between chunks", info="In the chosen unit."
        )
    # chunk() labels overlap segments 'Overlap'; only that label gets a fixed color here.
    out = gr.HighlightedText(
        label="Output",
        show_legend=True,
        show_label=False,
        color_map={'Overlap': '#DADADA'}
    )

    # Show or hide the separator controls depending on the selected splitter.
    split_selection.change(
        fn=change_split_selection,
        inputs=split_selection,
        outputs=[separators_selection, separator_preset_selection],
    )
    # Picking a preset overwrites the separators textbox.
    separator_preset_selection.change(
        fn=change_preset_separators,
        inputs=separator_preset_selection,
        outputs=separators_selection,
    )
    # Re-run the chunking whenever any of these inputs changes. The inputs list
    # follows the positional parameter order of chunk().
    gr.on(
        [text.change, length_unit_selection.change, separators_selection.change, split_selection.change, slider_count.change, chunk_overlap.change],
        chunk,
        [text, slider_count, split_selection, separators_selection, length_unit_selection, chunk_overlap],
        outputs=out
    )

# Start the app as soon as the module is executed.
demo.launch()