In [68]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone

import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain.llms import CTransformers
from tqdm.autonotebook import tqdm

In [2]:
PINECONE_API_KEY = "1bae0d8e-019e-4e87-8080-ecf523e5f25f"

In [3]:
def load_pdf(data):
 loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
 documents = loader.load()
 return documents

In [4]:
extracted_data = load_pdf("data/")

In [5]:
# Data is extracted from the PDFs
# Now form chunks out of it

In [6]:
def text_split(extracted_data):
 splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
 chunks = splitter.split_documents(extracted_data)
 return chunks


In [7]:
chunks = text_split(extracted_data)

In [8]:
# len(chunks)
chunks[0]
# Chunks formation is done
# Now, convert the chunks into embeddings
# Then Store this embeddings to VectorDB pinecone

Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\Medical_book.pdf', 'page': 1})

In [9]:
def download_hugging_face_embeddings():
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 return embeddings

In [10]:
embeddings = download_hugging_face_embeddings()

 warn_deprecated(


In [12]:

chunk_embeddings = []
for i in range(0, len(chunks)):
 chunk_embeddings.append(embeddings.embed_query(chunks[i].page_content))



In [13]:
len(chunk_embeddings)
chunks[0]

Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\Medical_book.pdf', 'page': 1})

In [14]:
chunk_content = [chunks[i].page_content for i in range(0 , len(chunks))]

In [15]:

chunk_ids = [str(i+1) for i in range(0, len(chunks))]


In [16]:
print(chunk_ids[0])
print(chunk_embeddings[0])
chunk_content[0]

1
[0.0017460489179939032, -0.033502884209156036, -0.03290388733148575, 0.007168094161897898, -0.01460327859967947, 0.010261928662657738, -0.01151528861373663, 0.22930213809013367, -0.023232396692037582, 0.004120402969419956, -0.036560822278261185, 0.08592110127210617, 0.012972140684723854, 0.05221788212656975, -0.10232618451118469, -0.003139043692499399, -0.012686969712376595, 0.000471863109851256, -0.02848585695028305, -0.050259195268154144, 0.01155101228505373, 0.0778065174818039, 0.09282823652029037, -0.0137972766533494, -0.016935130581259727, -0.025955867022275925, -0.04956510663032532, -0.046131301671266556, 0.00729052210226655, -0.013553328812122345, 0.038439445197582245, 0.06280472129583359, 0.018353812396526337, 0.008242843672633171, 0.0017155527602881193, -0.039861857891082764, -0.011638614349067211, 0.016446180641651154, 0.025595590472221375, 0.09104609489440918, 0.029672738164663315, -0.05416030064225197, -0.04576560854911804, -0.013853926211595535, 0.02577359229326248, 0.01

'TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION'

In [41]:
index_name = "medical-chatbot"
index=pinecone.Index(api_key=PINECONE_API_KEY, host="https://medical-chatbot-pv4ded8.svc.aped-4627-b74a.pinecone.io")
# from http import client
pc = Pinecone(embedding=embeddings, text_key=chunk_content, index=index)
# Replace with your index name


In [42]:
upsert_vectors = [
 {
 "id": chunk_id,
 "values": embedding,
 "metadata": {"text": content} # Replace with actual metadata if available
 }
 for chunk_id, embedding, content in zip(chunk_ids, chunk_embeddings, chunk_content)
]


In [43]:
upsert_vectors[0]

{'id': '1',
 'values': [0.0017460489179939032,
 -0.033502884209156036,
 -0.03290388733148575,
 0.007168094161897898,
 -0.01460327859967947,
 0.010261928662657738,
 -0.01151528861373663,
 0.22930213809013367,
 -0.023232396692037582,
 0.004120402969419956,
 -0.036560822278261185,
 0.08592110127210617,
 0.012972140684723854,
 0.05221788212656975,
 -0.10232618451118469,
 -0.003139043692499399,
 -0.012686969712376595,
 0.000471863109851256,
 -0.02848585695028305,
 -0.050259195268154144,
 0.01155101228505373,
 0.0778065174818039,
 0.09282823652029037,
 -0.0137972766533494,
 -0.016935130581259727,
 -0.025955867022275925,
 -0.04956510663032532,
 -0.046131301671266556,
 0.00729052210226655,
 -0.013553328812122345,
 0.038439445197582245,
 0.06280472129583359,
 0.018353812396526337,
 0.008242843672633171,
 0.0017155527602881193,
 -0.039861857891082764,
 -0.011638614349067211,
 0.016446180641651154,
 0.025595590472221375,
 0.09104609489440918,
 0.029672738164663315,
 -0.05416030064225197,
 -0.0457

In [45]:
# docsearch = Pinecone.from_texts([t.page_content for t in chunks], embeddings, index_name)

# Upsert the chunks into Pinecone
# index.upsert(vectors=upsert_vectors)
batch_size = 500 # Adjust as necessary based on your data size and Pinecone limits
for i in range(0, len(upsert_vectors), batch_size):
 batch_vectors = upsert_vectors[i:i + batch_size]
 index.upsert(vectors=batch_vectors)



In [46]:
query_embedding = embeddings.embed_query("What are allergies")
print(len(query_embedding))
# Perform query to retrieve similar vectors
results = index.query(vector=[query_embedding], top_k=3, include_values=False, include_metadata=True)


384


In [47]:
print(results)

{'matches': [{'id': '1373',
 'metadata': {'text': 'GALE ENCYCLOPEDIA OF MEDICINE 2 '
 '117Allergies\n'
 'Allergic rhinitis is commonly triggered '
 'by\n'
 'exposure to household dust, animal fur,or '
 'pollen. The foreign substance thattriggers '
 'an allergic reaction is calledan '
 'allergen.\n'
 'The presence of an allergen causes the\n'
 "body's lymphocytes to begin producingIgE "
 'antibodies. The lymphocytes of an allergy '
 'sufferer produce an unusuallylarge amount '
 'of IgE.\n'
 'IgE molecules attach to mast\n'
 'cells, which contain '
 'histamine.HistaminePollen grains\n'
 'Lymphocyte\n'
 'FIRST EXPOSURE'},
 'score': 0.682266653,
 'values': []},
 {'id': '1356',
 'metadata': {'text': 'allergens are the following:\n'
 '• plant pollens\n'
 '• animal fur and dander\n'
 '• body parts from house mites (microscopic '
 'creatures\n'
 'found in all houses)\n'
 '• house dust• mold spores• cigarette '
 'smoke• solvents• cleaners\n'
 'Common food allergens include the '
 'following:\n'

In [48]:
matched_ids = [match['id'] for match in results['matches']]
print(matched_ids)

['1373', '1356', '1306']


In [49]:
chunks[1306].page_content
# Now, based on these top results, I will send it to the llm and it will return the appropriate answer

'mous. Seasonal AR is most commonly caused by grassand tree pollens, since their pollen is produced in largeamounts and is dispersed by the wind. Showy flowers,like roses or lilacs, that attract insects produce a stickypollen which is less likely to become airborne. Differentplants release their pollen at different times of the year,so seasonal AR sufferers may be most affected in spring,summer, or fall, depending on which plants provoke aresponse. The amount of pollen in the air is reflected'

In [50]:
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below nothing else.
Helpful Answer: 
"""

In [51]:
PROMPT = PromptTemplate(template = prompt_template, input_variables=["context", "question"])
chain_type_kwargs = {"prompt":PROMPT}

In [52]:
llm = CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin", model_type="llama", config={'max_new_tokens': 512, 'temperature': 1})

In [61]:
# Create Pinecone retriever
vector_store = Pinecone(index, embeddings, text_key="text")


In [76]:

# Example query
query = "How to Strengthen Hairs?"
answer = vector_store.similarity_search(query, k=3)
print(answer)

[Document(page_content='tively or by altering the skin of the scalp. One exampleis thyroid disorders. Hyperthyroidism (too much thy-\nroid hormone) causes hair to become thin and fine.\nGALE ENCYCLOPEDIA OF MEDICINE 2 125Alopecia\nTop of balding male’s head. (Photograph by Kelly A. Quin.\nReproduced by permission.)GEM - 0001 to 0432 - A 10/22/03 1:42 PM Page 125'), Document(page_content='plugs of skin, each containing one to several hairs,from the back side of the scalp. The bald sections arethen implanted with the plugs. Research completed in2000 looked at the new technique of hair grafting, andfound that micrografts (one to two hairs transplantedper follicle) resulted in fewer complications and thebest results\nAnother surgical procedure used to treat androgenic'), Document(page_content='multitude of hair replacement methods performed byboth physicians and non-physicians. They range fromsimply weaving someone else’s hair in with the remainsof your own to surgically transplanting thou

In [63]:

qa = RetrievalQA.from_chain_type(llm, chain_type="stuff",retriever = vector_store.as_retriever(search_kwargs={"k": 2}), chain_type_kwargs=chain_type_kwargs)

In [77]:
# Example query
query = "How to cure AIDS?"
answer = qa.invoke({"query":query})
print(answer)


{'query': 'How to cure AIDS?', 'result': 'Unfortunately, there is no known cure for HIV or AIDS at this time. While advances have been made in treating the symptoms and slowing the progression of the disease, a cure has not yet been discovered. Research continues to be conducted on new treatments and potential cures, but as of now, there is no known way to completely eliminate the virus from the body or to restore the immune system to its full function.'}
