Spaces:
Build error
Build error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
import logging
|
4 |
+
import sys
|
5 |
+
|
6 |
+
# Send INFO-and-above log records to stdout.
# basicConfig() already attaches a stdout StreamHandler to the root logger, so
# no second StreamHandler is added here: a duplicate handler on the same stream
# would make every log record print twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
8 |
+
|
9 |
+
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
|
10 |
+
from llama_index.llms import HuggingFaceLLM
|
11 |
+
from langchain.document_loaders import PyPDFLoader
|
12 |
+
|
13 |
+
|
14 |
+
# Mount Google Drive to access data (you may need to authenticate)
|
15 |
+
|
16 |
+
import pandas as pd
|
17 |
+
from datasets import load_dataset, concatenate_datasets
|
18 |
+
|
19 |
+
# Hugging Face Hub repository ids of the datasets that are loaded and merged
# into the corpus. "pubmed_causal" was previously misspelled "pubmed_casual",
# which is not the published dataset id.
dataset_names = [
    "medalpaca/medical_meadow_mediqa",
    "medalpaca/medical_meadow_medical_flashcards",
    "medalpaca/medical_meadow_wikidoc_patient_information",
    "medalpaca/medical_meadow_wikidoc",
    "medalpaca/medical_meadow_pubmed_causal",
    "medalpaca/medical_meadow_medqa",
    "medalpaca/medical_meadow_health_advice",
    "medalpaca/medical_meadow_cord19",
]
|
30 |
+
|
31 |
+
# Fetch the "train" split of every listed dataset, then stack the splits into
# one dataset object.
datasets = [
    load_dataset(repo_id, split="train")
    for repo_id in dataset_names
]
combined_dataset = concatenate_datasets(datasets)
|
33 |
+
|
34 |
+
#from google.colab import drive
|
35 |
+
#drive.mount('/content/drive')
|
36 |
+
|
37 |
+
# Reading the data from the saved path in Google Drive
|
38 |
+
|
39 |
+
#documents = SimpleDirectoryReader("/content/drive/MyDrive/Data").load_data()
|
40 |
+
|
41 |
+
from langchain.text_splitter import CharacterTextSplitter
|
42 |
+
from langchain import OpenAI
|
43 |
+
from langchain.document_loaders import PyPDFLoader
|
44 |
+
|
45 |
+
# Define a system prompt for the Q&A assistant
|
46 |
+
|
47 |
+
|
48 |
+
from llama_index.prompts.prompts import SimpleInputPrompt
|
49 |
+
|
50 |
+
|
51 |
+
# Instruction text handed to the LLM wrapper as its system prompt. The pieces
# below are joined by implicit string concatenation into one literal.
system_prompt = (
    "You are a medical AI chatbot. "
    "Your goal is to answer questions as accurately as possible "
    "based on the instructions and context provided."
    "Use only information from the previous context information. "
    "Do not invent stuff or give false information"
)
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
# This will wrap the default prompts that are internal to llama-index
|
56 |
+
# Template with a single {query_str} placeholder (presumably filled with the
# user's question by llama-index).
# NOTE(review): the <|USER|>/<|ASSISTANT|> tags look like a StableLM-style chat
# format, while the model configured in this file is Llama-2 — confirm that
# this template matches the model's expected prompt format.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")
|
57 |
+
|
58 |
+
# Log in to Hugging Face
|
59 |
+
|
60 |
+
#!huggingface-cli login
|
61 |
+
|
62 |
+
# Configure the HuggingFaceLLM (Language Model)
|
63 |
+
|
64 |
+
import torch
|
65 |
+
|
66 |
+
# Llama-2 chat model wrapped for llama-index; tokenizer and weights come from
# the same Hub repository.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    # Greedy decoding. The previous '"temperature": 0.5' entry was dropped:
    # with do_sample=False the temperature is not used for generation and only
    # produces a warning, so the generated output is unchanged.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="NousResearch/Llama-2-7b-chat-hf",
    model_name="NousResearch/Llama-2-7b-chat-hf",
    device_map="auto",
    # Half-precision dtype plus 8-bit weight loading to reduce memory usage.
    # NOTE(review): 8-bit loading presumably requires bitsandbytes and a CUDA
    # GPU — confirm this is available on the deployment hardware.
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
)
|
77 |
+
|
78 |
+
# Configure embeddings using a Hugging Face model
|
79 |
+
|
80 |
+
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
81 |
+
from llama_index import LangchainEmbedding, ServiceContext
|
82 |
+
|
83 |
+
# Build the LangChain Hugging Face embedding object first, then wrap it in
# LangchainEmbedding so it can be handed to llama-index as the embed model.
_hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)
embed_model = LangchainEmbedding(_hf_embeddings)
|
86 |
+
|
87 |
+
|
88 |
+
# Configure the service context
|
89 |
+
|
90 |
+
# Bundle the LLM, the embedding model and the chunk size into one
# ServiceContext that is passed along when the index is built.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=1024,
)
|
95 |
+
|
96 |
+
# Create a vector store index from the loaded documents
|
97 |
+
|
98 |
+
# VectorStoreIndex.from_documents expects llama-index Document objects, but
# combined_dataset is a Hugging Face dataset whose rows are plain dicts.
# Convert each row into one Document before indexing.
from llama_index import Document

# Assumes each row exposes "instruction", "input" and "output" text columns —
# TODO confirm against the loaded datasets. Missing or empty columns are skipped.
documents = [
    Document(
        text="\n".join(
            str(row[column])
            for column in ("instruction", "input", "output")
            if row.get(column)
        )
    )
    for row in combined_dataset
]

index = VectorStoreIndex.from_documents(documents, service_context=service_context)
|
99 |
+
|
100 |
+
# Create a query engine for the index
|
101 |
+
|
102 |
+
# Query engine built from the index with default settings; it is reused by
# the chatbot callback defined further down in this file.
query_engine = index.as_query_engine()

# One-off sample query executed when the script starts; its answer is printed
# to stdout.
response = query_engine.query("What is gross profit?")
print(response)
|
106 |
+
|
107 |
+
# To interact with the chatbot
|
108 |
+
|
109 |
+
import gradio as gr
|
110 |
+
|
111 |
+
# Define your chatbot function
|
112 |
+
def chatbot_interface(query):
    """Answer one question by delegating to the module-level ``query_engine``.

    Args:
        query: Value forwarded unchanged to ``query_engine.query``.

    Returns:
        Whatever ``query_engine.query`` returns, passed through as-is.
    """
    return query_engine.query(query)
|
115 |
+
|
116 |
+
# Create a Gradio interface
|
117 |
+
# Text-in / text-out Gradio UI around chatbot_interface.
# The title previously read "Insurance Chatbot Demo", which did not match the
# medical system prompt and medical datasets used by this app.
iface = gr.Interface(
    fn=chatbot_interface,
    inputs=gr.Textbox(placeholder="Enter your question here..."),
    outputs=gr.Textbox(),
    live=False,
    title="Medical Chatbot Demo",
    description="Ask questions, and the chatbot will provide answers based on the provided context.",
)

# Launch the Gradio interface
iface.launch(debug=True)
|