madoss committed on
Commit
19b90a7
1 Parent(s): 2a4d7df

Update query_index.py

Browse files
Files changed (1) hide show
  1. query_index.py +39 -28
query_index.py CHANGED
@@ -1,18 +1,10 @@
1
- import argparse
2
  import logging
3
-
4
  import datasets
5
  import sentence_transformers
6
 
7
- import utils
8
-
9
  logging.disable(logging.CRITICAL)
10
 
11
- parser = argparse.ArgumentParser()
12
- parser.add_argument("--query", type=str, required=True)
13
- parser.add_argument("--k", type=int, default=5)
14
- args = parser.parse_args()
15
-
16
  model = sentence_transformers.SentenceTransformer(
17
  "dangvantuan/sentence-camembert-large", device="cuda"
18
  )
@@ -20,24 +12,43 @@ model = sentence_transformers.SentenceTransformer(
20
  dataset = datasets.load_dataset("json", data_files=["./data/dataset.json"], split="train")
21
  dataset.load_faiss_index("embeddings", "index.faiss")
22
 
23
- query_embedding = model.encode(args.query)
24
- _, retrieved_examples = dataset.get_nearest_examples(
25
- "embeddings",
26
- query_embedding,
27
- k=args.k,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  )
29
 
30
-
31
- for text, start, end, title, url in zip(
32
- retrieved_examples["text"],
33
- retrieved_examples["start"],
34
- retrieved_examples["end"],
35
- retrieved_examples["title"],
36
- retrieved_examples["url"],
37
- ):
38
- start = start
39
- end = end
40
- print(f"title: {title}")
41
- print(f"transcript: [{str(start)+' ====> '+str(end)}] {text}")
42
- print(f"link: {url}")
43
- print("*" * 10)
 
 
1
  import logging
2
+ import gradio as gr
3
  import datasets
4
  import sentence_transformers
5
 
 
 
6
  logging.disable(logging.CRITICAL)
7
 
 
 
 
 
 
8
  model = sentence_transformers.SentenceTransformer(
9
  "dangvantuan/sentence-camembert-large", device="cuda"
10
  )
 
12
  dataset = datasets.load_dataset("json", data_files=["./data/dataset.json"], split="train")
13
  dataset.load_faiss_index("embeddings", "index.faiss")
14
 
15
+ def search(query: str, k: int):
16
+ query_embedding = model.encode(query)
17
+ _, retrieved_examples = dataset.get_nearest_examples(
18
+ "embeddings",
19
+ query_embedding,
20
+ k=k,
21
+ )
22
+
23
+ results = []
24
+ for text, start, end, title, url in zip(
25
+ retrieved_examples["text"],
26
+ retrieved_examples["start"],
27
+ retrieved_examples["end"],
28
+ retrieved_examples["title"],
29
+ retrieved_examples["url"],
30
+ ):
31
+ start = start
32
+ end = end
33
+ result = {
34
+ "title": title,
35
+ "transcript": f"[{str(start)+' ====> '+str(end)}] {text}",
36
+ "link": url
37
+ }
38
+ results.append(result)
39
+ return results
40
+
41
+ iface = gr.Interface(
42
+ fn=search,
43
+ inputs=["text", "number"],
44
+ outputs=gr.outputs.JSON(),
45
+ title="Search Dataset",
46
+ description="Search a dataset using Camembert and Faiss.",
47
+ example=[
48
+ "Enter a query to search for.",
49
+ 5
50
+ ]
51
  )
52
 
53
+ if __name__ == "__main__":
54
+ iface.launch()