In [2]:
import sys
import os
sys.path.append("..")

import re
import json
import fire
import string

from tqdm.autonotebook import tqdm
from medalpaca.inferer import Inferer


greedy_search = {
    "num_beams" : 1, 
    "do_sample" : False,
    "max_new_tokens" : 128, 
    "early_stopping" : False
}

beam_serach = {
    "num_beams" : 4, 
    "do_sample" : False,
    "max_new_tokens" : 128, 
    "early_stopping" : True,
}

sampling_top_k = {
    "do_sample" : True,
    "num_beams": 1,
    "max_new_tokens": 128, 
    "early_stopping": True,
    "temperature": 0.7,
    "top_k": 50
}

sampling_top_p = {
    "do_sample" : True,
    "top_k": 0, 
    "num_beams": 1,
    "max_new_tokens": 128, 
    "early_stopping": True,
    "temperature": 0.7,
    "top_p": 0.9
}

sampling = {
    "do_sample" : True,
    "top_k": 50, 
    "num_beams": 1,
    "max_new_tokens": 128, 
    "early_stopping": True,
    "temperature": 0.4,
    "top_p": 0.9
}


def format_question(d): 
    question = d["question"]
    options = d["options"]
    for k, v in options.items(): 
        question += f"\n{k}: {v}"
    return question


def strip_special_chars(input_str):
    "Remove special characters from string start/end"
    if not input_str:
        return input_str
    
    start_index = 0
    end_index = len(input_str) - 1

    while start_index < len(input_str) and input_str[start_index] not in string.ascii_letters + string.digits:
        start_index += 1

    while end_index >= 0 and input_str[end_index] not in string.ascii_letters + string.digits:
        end_index -= 1

    if start_index <= end_index:
        return input_str[start_index:end_index + 1]
    else:
        return ""

def starts_with_capital_letter(input_str):
    """
    The answers should start like this: 
        'A: '
        'A. '
        'A '
    """
    pattern = r'^[A-Z](:|\.|) .+'
    return bool(re.match(pattern, input_str))


  from tqdm.autonotebook import tqdm


In [None]:
# model_name: str, # "medalpaca/medalpaca-lora-13b-8bit", 
# prompt_template: str, # "../medalpaca/prompt_templates/medalpaca.json", 
# base_model: str, # "decapoda-research/llama-13b-hf",
# peft: bool, # True,
# load_in_8bit: bool, # True
# path_to_exams: str, # eval/data/test/
# ntries: int = 5, 
# skip_if_exists: bool = True,


# model = Inferer(
#     model_name='medalpaca/medalpaca-7b',
#     prompt_template="../medalpaca/prompt_templates/medalpaca.json",
#     base_model='decapoda-research/llama-7b-hf',
#     peft=True,
#     load_in_8bit=False,
# ) 
    

from transformers import pipeline

pl = pipeline("text-generation", model="medalpaca/medalpaca-7b", tokenizer="medalpaca/medalpaca-7b")


In [1]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="Ali-C137/Llama-2-7b-chat-hf-tuned-medical-chat",
    task="text-generation",
    model_kwargs={"temperature": 0, "max_length": 64}
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
Downloading adapter_model.bin: 100%|██████████| 33.6M/33.6M [00:01<00:00, 23.9MB/s]
Device has 1 GPUs available. Provide device={deviceId} to `from_model_id` to use availableGPUs for execution. deviceId is -1 (default) for CPU and can be a positive integer associated with CUDA device id.


In [2]:
llm("hello")

In [None]:
question = "What are the symptoms of diabetes?"
# context = "Diabetes is a metabolic disease that causes high blood sugar. The symptoms include increased thirst, frequent urination, and unexplained weight loss."
answer = pl(question,max_length=200)
print(answer[0]['generated_text'])

In [None]:
answer

[{'generated_text': 'What are the symptoms of diabetes?\nDiabetes is a disease in which your'}]

In [3]:
path_to_exams = '/home/ubuntu/LLM/.conda/om/medAlpaca/data_clean/questions/US'

In [4]:
with open(os.path.join(path_to_exams, f"test.jsonl")) as f:
    questions = [json.loads(line) for line in f]
    # print(questions)

outname = os.path.join(path_to_exams, f"ouput.json")
if os.path.exists(outname): 
    with open(outname, "r") as fp:
        answers = json.load(fp)
else: 
    answers = []
    
print(len(questions))

100


In [5]:
for question in tqdm(questions):
    print(format_question(question))
    n = 0
    response = model(
        instruction="Answer this multiple choice question.", 
        input=format_question(question), 
        output="The Answer to the question is:",
        **sampling
    )
    response = strip_special_chars(response)
    print(response[:100])
    if starts_with_capital_letter(response): 
        n += 1
        break
    else: 
        print(f"Output not satisfactoy, retrying {n} times")
    question["answer"] = response
    answers.append(response)

  0%|          | 0/100 [01:29<?, ?it/s]

C: Tell the attending that he cannot fail to disclose this mistake.

### Discussion:
This is an ethi





In [6]:
answers

[]

In [None]:
   with open(outname, "w+") as fp:
        json.dump(answers, fp)

In [7]:
response = model(
        instruction="hello.", 
        input=format_question(question), 
        output="The Answer to the question is:",
        **sampling
    )



In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
tokenizer = AutoTokenizer.from_pretrained("/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM", use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM", device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
model.generation_config = GenerationConfig.from_pretrained("Flmc/DISC-MedLLM")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:27<00:00,  9.22s/it]
Downloading (…)neration_config.json: 100%|██████████| 284/284 [00:00<00:00, 1.46MB/s]


In [None]:
messages = []
messages.append({"role": "user", "content": "Hello the patient will provide you with the reports & other information regarding the paitent. You have to answer the questions based on the information provided and your knowledge. Next you will talk with the paitent"})
response = model.chat(tokenizer, messages)


In [39]:
messages.append({"role": "user", "content": f" Hello I am Om, can i ask you questions"})
response = model.chat(tokenizer, messages)
print(response)

Yes of course! Please feel free to tell me about yourself so that I may better assist you


In [40]:
# report_data = "my recent cholesterol levels from a lab report. Their total cholesterol is 200 mg/dL, HDL cholesterol is 50 mg/dL, and LDL cholesterol is 130 mg/dL."
report_data = "None"
question = "i am really worried about my high cholesterol levels. what should i do and what does it indicate?"

In [41]:
messages.append({"role": "user", "content": f" Detials {report_data} : & User Question {question}"})
response = model.chat(tokenizer, messages)

In [42]:
response

"Cholesterol is a fatty substance that circulates throughout our bloodstreams as part of cell membranes within cells or lipoproteins (plasma proteins) called LDL-cholesterol which are found floating freely around inside plasma membrane bounded by an apo B100 protein molecule attached at one end via its phospholipid bilayer structure . It's essential for maintaining healthy brain function , nerve conduction pathways between neurons along nerves from sensory organs such as taste receptors located primarily underneath tongue epithelium into ganglia situated deep beneath dura mater covering cranial bones forming pa of basilar papilla whereby signals travel downwards towards spinal cord terminating eventually upon synapse connections formed amongst axonal branches projecting outwardly onto muscle fibers resulting ultimately leading upstairs back again all over body so we don't get tired when standing still but instead continue moving forward like this indefinitely without any fatigue feelin

In [1]:
import os
import langchain
import sqlite3
from langchain.document_loaders import PyPDFLoader  
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain,RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader
import openai
import os
import PyPDF2
from langchain.document_loaders.csv_loader import CSVLoader
from langchain import OpenAI, PromptTemplate
from langchain.document_loaders import TextLoader, Docx2txtLoader, PyPDFLoader, UnstructuredExcelLoader, CSVLoader
import logging
from tqdm import tqdm
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
import uuid
from PIL import Image

# from utils import get_completion,model_info,model_load

import pytesseract

def get_text_img(path):
    return pytesseract.image_to_string(Image.open(path)).replace("\n", " ")

logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

base_path = os.path.join(os.getcwd(),"db")
key_openai ="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"
embedding = OpenAIEmbeddings(openai_api_key =key_openai)

# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from transformers.generation.utils import GenerationConfig
# tokenizer = AutoTokenizer.from_pretrained("/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM", use_fast=False, trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained("/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM", device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
# model.generation_config = GenerationConfig.from_pretrained("Flmc/DISC-MedLLM")

data_llm_16k = ChatOpenAI(
        model_name="gpt-3.5-turbo-16k",
        temperature = 0,
        openai_api_key=key_openai,
    )

data_llm = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature = 0,
        openai_api_key=key_openai,
    )

chain = load_summarize_chain(data_llm_16k, chain_type="stuff")

def get_qa_chain_answers_llm(question,email):
    title = str(email)
    persist_directory = os.path.join(base_path,title)
    db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
    k_tops = db.similarity_search(question, k=3)
    print(k_tops)
    #question_new = f" 'context' {k_tops}: '{question}'"
    #res = get_completion(question_new, 300, 0)
    print("LLM MODEL------------------------------")
    messages = []
    messages.append({"role": "user", "content": "Hello the patient will provide you with the reports & other information regarding the paitent. You have to answer the questions based on the information provided and your knowledge. Next you will talk with the paitent"})
    model.chat(tokenizer, messages)
    messages.append({"role": "user", "content": f" Detials {k_tops} : & User Question {question}"})
    return model.chat(tokenizer, messages)

# def get_qa_chain_answers(question,email,history=[]):
#     title = str(email)
#     persist_directory = os.path.join(base_path,title)
#     db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
    
#     # retriever_from_llm = MultiQueryRetriever.from_llm(retriever=db.as_retriever(),llm=data_llm)
#     # unique_docs = retriever_from_llm.get_relevant_documents(query=question)

#     qa_chain = RetrievalQA.from_chain_type(data_llm_16k,retriever=db.as_retriever())
#     question_updated = "Act Like a Medical doctor and give suggestions based on the context given or your own knwoelege and question asked" + question
#     answers = qa_chain({"query": question_updated})
#     return answers['result']
  
def get_text(doc,file_name):
    file_extension = os.path.splitext(file_name)[1].lower()
    print(file_extension)
    if file_extension == ".pdf":
        pdf = PyPDF2.PdfReader(doc)
        pdf_text = ""
        for page in pdf.pages:
            pdf_text += page.extract_text()
        return pdf_text
        
    elif file_extension == ".md" or file_extension == ".txt":
        loader = TextLoader(doc)
    elif file_extension in [".docx", ".doc"]:
        loader = Docx2txtLoader(doc)
    elif file_extension == ".csv":
        loader = CSVLoader(file_path=doc)
    elif file_extension in [".xls", ".xlsx"]:
        try:
            df = pd.read_excel(doc, engine='openpyxl')
            file_name = f"{str(uuid.uuid1())}.csv"
            df.to_csv(file_name)
            loader = CSVLoader(file_path=file_name)
        except Exception as e:
            print(e)
            loader = UnstructuredExcelLoader(doc, mode="elements")
        documents = loader.load()
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        texts = text_splitter.split_documents(documents)
        return texts
    
    elif file_extension == ".png" or file_extension == ".jpg" or file_extension == ".jpeg":
        texts = get_text_img(doc)
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        texts = text_splitter.create_documents(texts)
        print(texts)
        return texts
        
    else:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    return texts
  
embedding = OpenAIEmbeddings(openai_api_key = "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")

def upload_chroma(book_file,filename,email):
    pbar = tqdm(total=100)
    final_texts = get_text(book_file,filename)
    pbar.update(40)
    title = str(email)
    persist_directory = os.path.join(base_path,title)
    db = Chroma.from_documents(final_texts, embedding , persist_directory=persist_directory)
    pbar.update(40)
    db.persist()
    logging.info(f"Successfully uploaded the PDF of the book: {title}")
    print(f"Successfully uploaded the PDF of the book: {title}")
    pbar.update(20)
    pbar.close()

In [2]:
doc = "/home/ubuntu/LLM/.conda/om/medAlpaca/eval/section4_mobile_screen.png"

In [3]:
file_extension = ".png"
print(file_extension)
if file_extension == ".pdf":
    pdf = PyPDF2.PdfReader(doc)
    pdf_text = ""
    for page in pdf.pages:
        pdf_text += page.extract_text()
    
elif file_extension == ".md" or file_extension == ".txt":
    loader = TextLoader(doc)
elif file_extension in [".docx", ".doc"]:
    loader = Docx2txtLoader(doc)
elif file_extension == ".csv":
    loader = CSVLoader(file_path=doc)
elif file_extension in [".xls", ".xlsx"]:
    try:
        df = pd.read_excel(doc, engine='openpyxl')
        file_name = f"{str(uuid.uuid1())}.csv"
        df.to_csv(file_name)
        loader = CSVLoader(file_path=file_name)
    except Exception as e:
        print(e)
        loader = UnstructuredExcelLoader(doc, mode="elements")
    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)

elif file_extension == ".png" or file_extension == ".jpg" or file_extension == ".jpeg":
    texts = get_text_img(doc)
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.create_documents(texts)

.png


In [11]:
texts = get_text_img(doc)
texts

'Profile details  Payal Tandon  Female  etd Seen  Patient details  Name Surname Date of Birth city  Country  Shared profile  Rekha Singhn Tviews  Ey  Payal Tandon Luly 16, 1990 (30y) Mumbai  India    \x0c'

In [12]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema.document import Document

def get_text_chunks_langchain(text):
   text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
   docs = [Document(page_content=x) for x in text_splitter.split_text(text)]
   return docs

In [13]:
get_text_chunks_langchain(texts)

[Document(page_content='Profile details  Payal Tandon  Female  etd Seen  Patient details  Name Surname Date of Birth city  Country  Shared profile  Rekha Singhn Tviews  Ey  Payal Tandon Luly 16, 1990 (30y) Mumbai  India', metadata={})]