Spaces:

ccm
/

chat-with-publications

Running on Zero

App Files Files Community

ccm committed on Jul 21, 2024

Commit

d7a54c3

verified ·

1 Parent(s): 50d362b

Update main.py

Browse files

Files changed (1) hide show

main.py +82 -73

main.py CHANGED Viewed

@@ -1,13 +1,11 @@
-import json  # For stringifying a dict
-import random # For selecting a search hint
-import gradio  # GUI framework
-import datasets  # Used to load publication dataset
-import numpy  # For a few simple matrix operations
-import pandas  # Needed for operating on dataset
-import sentence_transformers  # Needed for query embedding
-import faiss  # Needed for fast similarity search
 # Load the dataset and convert to pandas
 full_data = datasets.load_dataset("ccm/publications")["train"].to_pandas()
@@ -24,77 +22,88 @@ data = full_data[~pandas.Series(filter)]
 data.reset_index(inplace=True)
 # Create a FAISS index for fast similarity search
-indices = []
-metrics = [faiss.METRIC_INNER_PRODUCT ,faiss.METRIC_L2]
-normalization = [True, False]
 vectors = numpy.stack(data["embedding"].tolist(), axis=0)
-for  metric in metrics:
-    for normal in normalization:
-        index = faiss.IndexFlatL2(len(data["embedding"][0]))
-        index.metric_type = metric
-        if normal:
-            faiss.normalize_L2(vectors)
-        index.train(vectors)
-        index.add(vectors)
-        indices.append(index)
 # Load the model for later use in embeddings
 model = sentence_transformers.SentenceTransformer("allenai-specter")
 # Define the search function
-def search(query: str, k: int, n: int):
     query = numpy.expand_dims(model.encode(query), axis=0)
     faiss.normalize_L2(query)
-    D, I = indices[n].search(query, k)
     top_five = data.loc[I[0]]
-    search_results = ""
     for i in range(k):
-        search_results += "### " + top_five["bib_dict"].values[i]["title"] + "\n\n"
-        search_results += str(int(100*D[0][i])) + "% relevant  "
-        if top_five["author_pub_id"].values[i] is not None:
-            search_results += "/  [Full Text](https://scholar.google.com/citations?view_op=view_citation&citation_for_view=" + top_five["author_pub_id"].values[i] + ")  "
-        if top_five["citedby_url"].values[i] is not None:
-            search_results += (
-                "/  [Cited By](" + SCHOLAR_URL + top_five["citedby_url"].values[i] + ")  "
-            )
-        if top_five["url_related_articles"].values[i] is not None:
-            search_results += (
-                "/  [Related Articles]("
-                + SCHOLAR_URL
-                + top_five["url_related_articles"].values[i]
-                + ")  "
-            )
-        search_results += "\n\n```bibtex\n"
-        search_results += (
-            json.dumps(top_five["bibtex"].values[i], indent=4)
-            .replace("\\n", "\n")
-            .replace("\\t", "\t")
-            .strip('"')
-        )
-        search_results += "```\n"
-    return search_results
-with gradio.Blocks() as demo:
-    with gradio.Group():
-        query = gradio.Textbox(
-            placeholder = random.choice([
-                "design for additive manufacturing",
-                "best practices for agent based modeling",
-                "arctic environmental science",
-                "analysis of student teamwork"
-            ]),
-            show_label=False,
-            lines=1,
-            max_lines=1
-        )
-        with gradio.Accordion("Settings", open=False):
-            k = gradio.Number(10.0, label="Number of results", precision=0)
-            n = gradio.Radio([True, False], value=True, label="Normalized")
-    results = gradio.Markdown()
-    query.change(fn=search, inputs=[query, k, n], outputs=results)
-    k.change(fn=search, inputs=[query, k, n], outputs=results)
-    n.change(fn=search, inputs=[query, k, n], outputs=results)
-demo.launch(debug=True)

+import gradio  # for the interface
+import transformers  # to load an LLM
+import sentence_transformers  # to load an embedding model
+import faiss  # to create an index
+import numpy  # to work with vectors
+import pandas  # to work with pandas
+import json  # to work with JSON
+import datasets  # to load the dataset
 # Load the dataset and convert to pandas
 full_data = datasets.load_dataset("ccm/publications")["train"].to_pandas()
 data.reset_index(inplace=True)
 # Create a FAISS index for fast similarity search
+metric = faiss.METRIC_INNER_PRODUCT
 vectors = numpy.stack(data["embedding"].tolist(), axis=0)
+gpu_index = faiss.IndexFlatL2(len(data["embedding"][0]))
+# res = faiss.StandardGpuResources()  # use a single GPU
+# gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
+gpu_index.metric_type = metric
+faiss.normalize_L2(vectors)
+gpu_index.train(vectors)
+gpu_index.add(vectors)
 # Load the model for later use in embeddings
 model = sentence_transformers.SentenceTransformer("allenai-specter")
 # Define the search function
def search(query: str, k: int) -> tuple[str, str]:
    """Retrieve the publications most similar to ``query`` and build LLM context.

    The query is embedded with the module-level ``model``, L2-normalized, and
    looked up in ``gpu_index``; the matching rows are read from ``data``.

    Args:
        query: Free-text search string.
        k: Maximum number of publications to retrieve.

    Returns:
        A ``(prompt, references)`` pair. ``prompt`` is an instruction block
        containing the retrieved abstracts and ending right before the spot
        where the user's query should be appended. ``references`` is a
        Markdown "References" section with one numbered entry per hit.
    """
    citation_url = (
        "https://scholar.google.com/citations"
        "?view_op=view_citation&citation_for_view="
    )

    embedding = numpy.expand_dims(model.encode(query), axis=0)
    faiss.normalize_L2(embedding)
    _, neighbors = gpu_index.search(embedding, k)

    # FAISS pads the result with -1 when the index holds fewer than k
    # vectors; drop those so the row lookup cannot fail.
    hits = data.loc[[int(idx) for idx in neighbors[0] if idx >= 0]]

    abstracts = []
    reference_lines = []
    # Iterate over the rows actually returned rather than range(k).
    for bib, pub_id in zip(hits["bib_dict"].values, hits["author_pub_id"].values):
        abstract = bib["abstract"]
        # NOTE(review): assumes a missing abstract is stored as None/empty;
        # such entries are left out of the prompt instead of raising.
        if abstract:
            abstracts.append(abstract + "\n")

        number = len(reference_lines) + 1
        title = bib["title"]
        if isinstance(pub_id, str):
            reference_lines.append(f"{number}. [{title}]({citation_url}{pub_id})\n")
        else:
            # No Scholar id available: list the title without a link.
            reference_lines.append(f"{number}. {title}\n")

    # Adjacent literals carry explicit trailing spaces so the sentences do
    # not run together, and the prompt ends with a blank line so the user's
    # query starts on its own line.
    search_results = (
        "You are an AI assistant who delights in helping people "
        "learn about research from the Design Research Collective. Here are "
        "several really cool abstracts:\n\n"
        + "".join(abstracts)
        + "\nSummarize the above abstracts as you respond to the following query:\n\n"
    )
    references = "\n\n## References\n\n" + "".join(reference_lines)

    print(search_results)
    return search_results, references
# Text-generation pipeline that ``predict`` sends chat conversations to.
# Alternative checkpoints noted by the author:
#   microsoft/Phi-3-medium-128k-instruct-onnx-cuda
#   microsoft/Phi-3-medium-128k-instruct
pipe = transformers.pipeline(
    "text-generation",
    model="Qwen/Qwen2-0.5B-Instruct",
    device="cuda:0",
    max_new_tokens=512,
    trust_remote_code=True,
)
def preprocess(message: str) -> tuple[str]:
    """Augment the user's message with retrieved context before the LLM sees it.

    Calls ``search`` for five publications and prepends the prompt text it
    returns to ``message``.

    Returns:
        A pair: the augmented message, and the formatted reference list,
        which is handed back unchanged so it can be appended to the reply.
    """
    prompt_prefix, reference_list = search(message, 5)
    augmented_message = prompt_prefix + message
    return augmented_message, reference_list
def postprocess(response: str, bypass_from_preprocessing: str) -> str:
    """Finalize the LLM's response before it is shown to the user.

    Args:
        response: Text produced by the LLM.
        bypass_from_preprocessing: Text produced during preprocessing that
            skipped the LLM and is appended verbatim.

    Returns:
        ``response`` followed directly by ``bypass_from_preprocessing``.
    """
    final_text = response + bypass_from_preprocessing
    return final_text
def predict(message: str, history: list[str]) -> str:
    """Craft the chatbot's reply to ``message``.

    The message is first expanded by ``preprocess``, then sent to ``pipe``
    together with a short conversation context, and the generated reply is
    passed through ``postprocess``.

    Args:
        message: The user's latest message.
        history: Prior conversation. When it is a non-empty list, only its
            last element is used; that element's items are labelled
            alternately "user" and "assistant".
            NOTE(review): presumably a list of [user, assistant] exchanges
            as supplied by ``gradio.ChatInterface`` -- confirm.

    Returns:
        The reply text followed by whatever ``preprocess`` handed over for
        ``postprocess`` to append.
    """
    augmented_message, references = preprocess(message)

    # Keep only the most recent entry of the history as context.
    if isinstance(history, list) and len(history) > 0:
        history = history[-1]

    # Even positions are labelled as the user, odd ones as the assistant.
    conversation = []
    for position, text in enumerate(history):
        role = "assistant" if position & 1 else "user"
        conversation.append({"role": role, "content": text})
    conversation.append({"role": "user", "content": augmented_message})

    # The content of the last message in the generated conversation is
    # taken as the reply.
    generated = pipe(conversation)
    reply = generated[0]["generated_text"][-1]["content"]

    return postprocess(reply, references)
# Wrap ``predict`` in a Gradio chat interface and start serving it
gradio.ChatInterface(fn=predict).launch(debug=True)