Spaces:

PingAndPasquale
/

med-rag

Sleeping

App Files Files Community

pminervini committed on Mar 2, 2024

Commit

5f90f73

•

1 Parent(s): 1e5558f

update

Browse files

Files changed (2) hide show

app.py +49 -46
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import os
 import gradio as gr
 import torch
 from transformers import pipeline, StoppingCriteria, StoppingCriteriaList, MaxTimeCriteria, AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer, BitsAndBytesConfig
 from openai import OpenAI
@@ -56,8 +59,52 @@ def search(query, index="pubmed", num_docs=3):
     return docs
 def analyse(reference: str, passage: str) -> str:
-    import vllm
     fava_input = "Read the following references:\n{evidence}\nPlease identify all the errors in the following text using the information in the references provided and suggest edits if necessary:\n[Text] {output}\n[Edited] "
     prompt = [fava_input.format_map({"evidence": reference, "output": passage})]
@@ -105,51 +152,7 @@ def rag_pipeline(prompt, index="pubmed", num_docs=3, model_name="HuggingFaceH4/z
         }
     ]
-    for message in messages:
-        print('MSG', message)
-    max_new_tokens = 1024
-    if model_name.startswith('openai/'):
-        openai_model_name = model_name.split('/')[1]
-        client = OpenAI()
-        openai_res = client.chat.completions.create(model=openai_model_name,
-                                                    messages=messages,
-                                                    max_tokens=max_new_tokens,
-                                                    temperature=0)
-        print('OAI_RESPONSE', openai_res)
-        response = openai_res.choices[0].message.content.strip()
-    else:
-        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
-        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", low_cpu_mem_usage=True, quantization_config=quantization_config)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        # Load your language model from HuggingFace Transformers
-        generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
-        tokenized_prompt = tokenizer.apply_chat_template(messages, tokenize=True)
-        # Define the stopping criteria using MaxTimeCriteria
-        stopping_criteria = StoppingCriteriaList([
-            # MaxTimeCriteria(32),
-            MultiTokenEOSCriteria("\n", tokenizer, len(tokenized_prompt))
-        ])
-        # Define the generation_kwargs with stopping criteria
-        generation_kwargs = {
-            "max_new_tokens": max_new_tokens,
-            "generation_kwargs": {"stopping_criteria": stopping_criteria},
-            "return_full_text": False
-        }
-        # Generate response using the HF LLM
-        hf_response = generator(messages, **generation_kwargs)
-        print('HF_RESPONSE', hf_response)
-        response = hf_response[0]['generated_text']
-    model = tokenizer = None
     # analysed_response = analyse(joined_docs, response)

 import os
 import gradio as gr
+import ray
+import vllm
 import torch
 from transformers import pipeline, StoppingCriteria, StoppingCriteriaList, MaxTimeCriteria, AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer, BitsAndBytesConfig
 from openai import OpenAI
     return docs
+@ray.remote(num_gpus=1, max_calls=1)
+def generate(model_name: str, messages):
+    """Generate a reply to ``messages`` with an OpenAI or a local HF model.
+
+    This is a Ray remote function: callers must dispatch it with
+    ``generate.remote(...)`` and fetch the result with ``ray.get``.
+    ``max_calls=1`` asks Ray to retire the worker process after a single
+    call (see the Ray documentation for ``max_calls``).
+
+    Args:
+        model_name: Model identifier. Names starting with ``'openai/'`` are
+            sent to the OpenAI chat-completions API with that prefix
+            stripped; any other name is loaded through
+            ``AutoModelForCausalLM.from_pretrained`` with 4-bit quantization.
+        messages: Chat messages, passed unchanged to the OpenAI client or to
+            the tokenizer chat template / HF pipeline (presumably a list of
+            ``{"role": ..., "content": ...}`` dicts -- confirm with caller).
+
+    Returns:
+        The generated text as a string.
+    """
+    max_new_tokens = 1024
+    if model_name.startswith('openai/'):
+        # Strip only the leading 'openai/' so a model id that itself
+        # contains '/' is not truncated.
+        openai_model_name = model_name.split('/', 1)[1]
+        client = OpenAI()
+        openai_res = client.chat.completions.create(
+            model=openai_model_name,
+            messages=messages,
+            max_tokens=max_new_tokens,
+            temperature=0,
+        )
+        print('OAI_RESPONSE', openai_res)
+        # `content` is optional in the API response; fall back to an empty
+        # string instead of failing with AttributeError on None.
+        content = openai_res.choices[0].message.content
+        response = (content or '').strip()
+    else:
+        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
+        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", low_cpu_mem_usage=True, quantization_config=quantization_config)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        # Wrap the loaded model and tokenizer in a text-generation pipeline
+        generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+        # Tokenize the chat prompt only to learn its length, which the
+        # stopping criterion below receives as its third argument.
+        tokenized_prompt = tokenizer.apply_chat_template(messages, tokenize=True)
+        # Stopping criteria: MaxTimeCriteria is currently disabled; only the
+        # "\n"-based MultiTokenEOSCriteria is active (defined elsewhere in
+        # this file -- presumably stops once "\n" is generated; confirm).
+        stopping_criteria = StoppingCriteriaList([
+            # MaxTimeCriteria(32),
+            MultiTokenEOSCriteria("\n", tokenizer, len(tokenized_prompt))
+        ])
+        # Keyword arguments for the pipeline call, including the stopping criteria
+        generation_kwargs = {
+            "max_new_tokens": max_new_tokens,
+            "generation_kwargs": {"stopping_criteria": stopping_criteria},
+            "return_full_text": False
+        }
+        # Generate response using the HF LLM
+        hf_response = generator(messages, **generation_kwargs)
+        print('HF_RESPONSE', hf_response)
+        response = hf_response[0]['generated_text']
+    return response
+@ray.remote(num_gpus=1, max_calls=1)
 def analyse(reference: str, passage: str) -> str:
     fava_input = "Read the following references:\n{evidence}\nPlease identify all the errors in the following text using the information in the references provided and suggest edits if necessary:\n[Text] {output}\n[Edited] "
     prompt = [fava_input.format_map({"evidence": reference, "output": passage})]
         }
     ]
+    # `generate` is decorated with @ray.remote, so it cannot be called
+    # directly (Ray raises TypeError). Dispatch it with `.remote(...)` and
+    # block on the result with `ray.get`.
+    response = ray.get(generate.remote(model_name, messages))
     # analysed_response = analyse(joined_docs, response)

requirements.txt CHANGED Viewed

@@ -5,3 +5,4 @@ transformers
 elasticsearch
 openai
 vllm

 elasticsearch
 openai
 vllm
+ray