julien-c (HF staff) committed
Commit a00452d • 1 Parent(s): 8c141ca

Hook full-text search

Files changed (4):
  1. .gitignore +1 -0
  2. README.md +5 -3
  3. app.py +66 -0
  4. requirements.txt +1 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ .env/
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: Duckdb Full Text Search
- emoji: 🦀
+ title: DuckDB Full Text Search
+ emoji: 🐀
  colorFrom: indigo
- colorTo: blue
+ colorTo: indigo
  sdk: gradio
  sdk_version: 3.32.0
  app_file: app.py
@@ -10,3 +10,5 @@ pinned: false
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ Inspired by https://huggingface.co/spaces/asoria/duckdb-parquet-demo
app.py ADDED
@@ -0,0 +1,66 @@
+ # Inspired by https://huggingface.co/spaces/asoria/duckdb-parquet-demo
+
+ import gradio as gr
+ import duckdb
+ import pandas as pd
+ import requests
+
+ DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co"
+ PARQUET_REVISION = "refs/convert/parquet"  # revision where the Hub stores auto-converted Parquet files (unused below)
+
+ EXAMPLE_DATASET_NAME = "LLMs/Alpaca-ShareGPT"
+
+
+ def get_parquet_urls(dataset: str) -> list[str]:
+     # List the dataset's splits, then fetch the Parquet files for the first split's config.
+     splits = requests.get(f"{DATASETS_SERVER_ENDPOINT}/splits?dataset={dataset}", timeout=60).json().get("splits")
+     split = splits[0]
+     response = requests.get(f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={split['config']}", timeout=60)
+     if response.status_code != 200:
+         raise Exception(response)
+
+     parquet_files = response.json()["parquet_files"]
+     urls = [content["url"] for content in parquet_files if content["split"] == split["split"]]
+     if len(urls) == 0:
+         raise Exception("No parquet files found for dataset")
+     return urls
+
+
+ def run_command(query: str) -> pd.DataFrame:
+     try:
+         # BM25 scores from the FTS index; rows with no match score NULL and are filtered out.
+         result = duckdb.execute(
+             "SELECT fts_main_data.match_bm25(id, ?) AS score, id, instruction, input, output FROM data WHERE score IS NOT NULL ORDER BY score;",
+             [query],
+         )
+     except Exception as error:
+         print(f"Error: {str(error)}")
+         return pd.DataFrame({"Error": [f"❌ {str(error)}"]})
+     return result.df()
+
+
+ def import_data():
+     # Import the data and build the full-text search index.
+     parquet_url = get_parquet_urls(EXAMPLE_DATASET_NAME)[0]
+     print("parquet_url", parquet_url)
+     # We need a sequential id column for full-text search.
+     # I'm very rusty in SQL, so there may well be simpler ways.
+     duckdb.sql("CREATE SEQUENCE serial START 1;")
+     duckdb.sql(f"CREATE TABLE data AS SELECT nextval('serial') AS id, * FROM '{parquet_url}';")
+     duckdb.sql("PRAGMA create_fts_index('data', 'id', '*');")
+
+     duckdb.sql("DESCRIBE SELECT * FROM data").show()
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("## Full-text search using DuckDB on top of datasets-server Parquet files 🐀")
+     gr.CheckboxGroup(label="Dataset", choices=["LLMs/Alpaca-ShareGPT"], value="LLMs/Alpaca-ShareGPT", info="Dataset to query")
+     query = gr.Textbox(label="query", placeholder="Full-text search...")
+     run_button = gr.Button("Run")
+     gr.Markdown("### Result")
+     cached_responses_table = gr.DataFrame()
+     run_button.click(run_command, inputs=[query], outputs=cached_responses_table)
+
+
+ if __name__ == "__main__":
+     import_data()
+     demo.launch()
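
For context on what this commit wires together: DuckDB's fts extension builds an index over a table keyed by a unique id column (PRAGMA create_fts_index), and fts_main_<table>.match_bm25 then scores rows by BM25, where a higher score means a more relevant match (app.py sorts ascending; descending is used below to put the best matches first). A minimal standalone sketch of that flow, with a hypothetical local example.parquet standing in for a datasets-server URL and an arbitrary query string:

import duckdb

# Hypothetical local file standing in for a datasets-server Parquet URL.
PARQUET_PATH = "example.parquet"

con = duckdb.connect()

# The FTS index needs a unique document id, hence the sequence column.
con.execute("CREATE SEQUENCE serial START 1;")
con.execute(f"CREATE TABLE data AS SELECT nextval('serial') AS id, * FROM '{PARQUET_PATH}';")

# Index every column ('*'); this creates the fts_main_data schema queried below.
con.execute("PRAGMA create_fts_index('data', 'id', '*');")

# match_bm25 returns NULL for rows with no match; higher scores are more
# relevant, so DESC puts the best matches first.
results = con.execute(
    "SELECT fts_main_data.match_bm25(id, ?) AS score, * "
    "FROM data WHERE score IS NOT NULL ORDER BY score DESC LIMIT 10;",
    ["duck"],
).df()
print(results)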
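And a quick smoke test of the two functions the commit adds, assuming app.py is importable from the working directory and the example dataset is still reachable (the query string is arbitrary):

from app import import_data, run_command

import_data()                         # fetch the first Parquet file and build the FTS index
df = run_command("machine learning")  # free-text query against the indexed table
print(df.head())
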
requirements.txt ADDED
@@ -0,0 +1 @@
+ duckdb==0.8.0