# Epitomea-demo-V2

# Runtime error

# File size: 15,898 Bytes

import os
import re
import json
import getpass
import logging
import openai
import asyncio
from typing import Any, List, Tuple, Dict
import gradio as gr
import llama_index
from llama_index import Document
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index.llms import HuggingFaceLLM
import requests

from RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, HybridRetriever, MixtralLLM, KeywordSearch, base_utils, ConfigManager


# ---------------------------------------------------------------------------
# Module-level setup: logging, environment, and configuration values.
# Everything below runs once at import time and is read as module globals by
# the functions and the UI further down in this file.
# ---------------------------------------------------------------------------

# Configure basic logging (INFO level, timestamp - logger name - level - message)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Module-level logger used by the error handlers in process_pdf
logger = logging.getLogger(__name__)

# NOTE(review): presumably set to avoid Hugging Face tokenizers parallelism
# warnings/deadlocks after forking -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Only the "model" configuration is loaded from file; the "api" configuration
# load is disabled (API credentials come from environment variables instead).
config_manager = ConfigManager()
#config_manager.load_config("api", "Config/api_config.json")
config_manager.load_config("model", "model_config.json")

# Credentials are read from the environment; a missing variable raises
# KeyError at import time.
openai.api_key = os.environ['OPENAI_API_KEY'] #config_manager.get_config_value("api", "OPENAI_API_KEY")
hf_token = os.environ['HF_TOKEN']#config_manager.get_config_value("api", "HF_TOKEN")

# PDF rendering and chunking parameters (passed to PDFProcessor_Unstructured)
pdf_processing_config = config_manager.get_config_value("model", "pdf_processing")


# Extensions accepted by allowed_file(); embedding backend selector
# ("openai" or "huggingface" are the values handled in process_pdf) and the
# model name used for the Hugging Face backend.
ALLOWED_EXTENSIONS = config_manager.get_config_value("model", "allowed_extensions")
embed = config_manager.get_config_value("model", "embeddings")
embed_model_name = config_manager.get_config_value("model", "embeddings_model")


# LLM generation settings. The model itself is chosen in the UI, so the
# configured model name is not read here.
#llm_model = config_manager.get_config_value("model", "llm_model")
model_temperature = config_manager.get_config_value("model", "model_temp")
output_token_size = config_manager.get_config_value("model", "max_tokens")
model_context_window = config_manager.get_config_value("model", "context_window")

# Paths of the prompt files read in process_pdf
gpt_prompt_path = config_manager.get_config_value("model","GPT_PROMPT_PATH")
mistral_prompt_path = config_manager.get_config_value("model","MISTRAL_PROMPT_PATH")
info_prompt_path = config_manager.get_config_value("model", "INFO_PROMPT_PATH")

# Paths of the journal list files (read and split on newlines in process_pdf)
peer_review_journals_path = config_manager.get_config_value("model", "peer_review_journals_path")
eq_network_journals_path = config_manager.get_config_value("model", "eq_network_journals_path")

# `queries` are passed to the criteria evaluation; `criteria` are the labels
# shown next to each reasoning entry in the UI. `num_criteria` is the default
# denominator used to scale the total score to a percentage.
queries = config_manager.get_config_value("model", "queries")
criteria = config_manager.get_config_value("model", "criteria")
num_criteria = len(queries)

# Queries used to extract paper metadata (author, journal, title)
author_query = config_manager.get_config_value("model", "author_query")
journal_query = config_manager.get_config_value("model", "journal_query")
title_query = config_manager.get_config_value("model", "title_query")


# Helper function to check if the file extension is allowed
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared in lower case.
    A name that contains no dot at all is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        # No dot anywhere in the name, so there is no extension to check.
        return False
    return extension.lower() in ALLOWED_EXTENSIONS

def generate_score_bar(score, num_criteria):
    """Build the HTML for the coloured score bar and its verdict label.

    Args:
        score: Raw score awarded to the paper (int or float).
        num_criteria: Maximum attainable score; used to scale the bar width.

    Returns:
        str: HTML containing a bar filled to ``score / num_criteria`` percent
        (clamped to 0-100) and a verdict paragraph in the same colour.

    Raises:
        ZeroDivisionError: If ``num_criteria`` is 0.
    """
    # Convert and round the score to a 100-point scale. The result is clamped
    # so an out-of-range score cannot overflow or invert the bar.
    score_out_of_100 = max(0, min(100, round((score / num_criteria) * 100)))

    # Verdict bands are defined on the raw score (9-point scale). Lower bounds
    # are used rather than exact membership so that fractional scores
    # (e.g. 8.5) and scores above 9 land in the proper band instead of
    # silently falling through to "Very bad".
    bands = (
        (9, "#4CAF50", "Very good"),  # green
        (7, "#FFEB3B", "Good"),       # yellow
        (5, "#FF9800", "Ok"),         # orange
        (3, "#F44336", "Bad"),        # red
    )
    color, text = "#800000", "Very bad"  # maroon; applies when score < 3
    for lower_bound, band_color, band_text in bands:
        if score >= lower_bound:
            color, text = band_color, band_text
            break

    # Create the HTML for the score bar
    score_bar_html = f"""
        <div style="background-color: #ddd; border-radius: 10px; position: relative; height: 20px; width: 100%;">
            <div style="background-color: {color}; height: 100%; border-radius: 10px; width: {score_out_of_100}%;"></div>
        </div>
        <p style="color: {color};">{text}</p>  <!-- Display the text -->
    """
    return score_bar_html
def format_example(example):
    """
    Render one few-shot example as prompt text.

    Args:
        example (dict): Mapping that provides the 'query' and 'Answer' entries.

    Returns:
        str: An "Example:" header followed by the query line and the direct-answer line.
    """
    query_text = example['query']
    answer_text = example['Answer']
    return f"Example:\nQuery: {query_text}\n Direct Answer: {answer_text}\n"
    
def _select_llm(llm_model, utils):
    """Return the ``(llm, general_prompt)`` pair for the UI model label *llm_model*."""
    try:
        if llm_model == "Model 1":
            llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=output_token_size)
            general_prompt = utils.read_from_file(gpt_prompt_path)
        elif llm_model == "Model 2":
            required = (model_context_window, output_token_size, model_temperature, hf_token)
            if any(param is None for param in required):
                raise ValueError("All parameters are required for Mistral LLM.")
            llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
                             temperature=model_temperature, model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
                             api_key=hf_token)
            general_prompt = utils.read_from_file(mistral_prompt_path)
        else:
            raise ValueError(f"Unsupported language model: {llm_model}")
    except Exception as e:
        logger.error(f"Error initializing language model '{llm_model}': {e}", exc_info=True)
        raise
    return llm, general_prompt


def _select_embed_model():
    """Return the embedding model for the configured ``embed`` backend."""
    try:
        if embed == "openai":
            return OpenAIEmbedding(model="text-embedding-3-large")
        if embed == "huggingface":
            # Use the configured model name
            return HuggingFaceEmbedding(embed_model_name)
        # Report the configured backend name: no embedding model object exists
        # on this path, so formatting one would raise UnboundLocalError and
        # hide the real problem.
        raise ValueError(f"Unsupported embedding model: {embed}")
    except Exception as e:
        logger.error(f"Error initializing embedding model: {e}", exc_info=True)
        raise


def _render_reasoning_html(labels, reasons):
    """Return an HTML ``<ul>`` pairing each criterion label with its reasoning."""
    # NOTE(review): the reasoning text is model output inserted into HTML
    # unescaped; consider html.escape() if markup in it is not intended.
    items = "".join(
        f"<li style='font-size: 18px;'><strong style='color: forestgreen;'>{label}</strong> <br> Reasoning: {reason}</li>"
        for label, reason in zip(labels, reasons)
    )
    return "<ul>" + items + "</ul>"


def process_pdf(uploaded_file, llm_model, n_criteria=num_criteria):
    """Evaluate an uploaded paper and return the values shown in the UI.

    The PDF is chunked, paper metadata (journal, author, title) is queried
    with a GPT-4 info engine, and the criteria are then evaluated with the
    model selected in the UI.

    Args:
        uploaded_file: The file handed over by the upload widget.
        llm_model: UI model label, either "Model 1" (OpenAI GPT-4) or
            "Model 2" (Mixtral).
        n_criteria: Maximum attainable score used to scale the total score to
            a percentage; defaults to the number of configured queries.

    Returns:
        tuple: ``(score_text, score_bar_html, reasoning_html, author_result,
        title_result)`` where ``score_text`` has the form ``"NN/100"``.

    Raises:
        ValueError: If no file was provided, the model label is not supported,
            a Mixtral parameter is missing, or the configured embedding
            backend is not supported.
    """
    if uploaded_file is None:
        # The upload widget yields None when "Evaluate" is pressed without a file.
        raise ValueError("No PDF file was provided.")

    # Process the PDF file
    pdf_processor = PDFProcessor_Unstructured(pdf_processing_config)
    merged_chunks, _tables = pdf_processor.process_pdf_file(uploaded_file)
    documents = [Document(text=t) for t in merged_chunks]

    # Prompts and Queries
    utils = base_utils()
    info_prompt = utils.read_from_file(info_prompt_path)

    # LLM and embedding model choice
    llm, general_prompt = _select_llm(llm_model, utils)
    embed_model = _select_embed_model()

    peer_review_journals_list = utils.read_from_file(peer_review_journals_path).split('\n')
    eq_network_journals_list = utils.read_from_file(eq_network_journals_path).split('\n')

    modified_journal_query = (
        "Is the given research paper published in any of the following journals: "
        + ", ".join(peer_review_journals_list) + "?"
    )

    example_journal = {"query": modified_journal_query,
                       "Answer": "The article is published in the Lancet."}
    example_author = {"query": author_query,
                      "Answer": "Corresponding author. Stephanie J. Sohl, Ph.D., Department of Social Sciences & Health Policy, Wake Forest School of Medicine, Medical Center Boulevard, Winston-Salem, NC 27157, USA, ssohl@wakehealth.edu"}

    # NOTE(review): these few-shot prompts are built but not passed to any
    # query engine below (the engine receives the raw info_prompt). Kept as in
    # the original; confirm whether they were meant to be used.
    qa_author_prompt_with_example = info_prompt.replace("{few_shot_examples}", format_example(example_author))
    qa_journal_prompt_with_example = info_prompt.replace("{few_shot_examples}", format_example(example_journal))

    # Metadata extraction always uses GPT-4 with a short output budget,
    # independent of the model chosen for the criteria evaluation.
    info_llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=100)
    pdf_info_query = PDFQueryEngine(documents, info_llm, embed_model, (info_prompt))
    info_query_engine = pdf_info_query.setup_query_engine()
    journal_result = info_query_engine.query(modified_journal_query).response
    author_result = info_query_engine.query(author_query).response
    title_result = info_query_engine.query(title_query).response

    pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, (general_prompt))

    # Journal matching and check for prior registration
    nlp_methods = KeywordSearch(merged_chunks)
    eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
    peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
    registration_result = nlp_methods.check_registration()

    # Evaluate the criteria with the selected model; only the total score and
    # the per-criterion reasoning are used here.
    total_score, _criteria_met, _score_percentage, reasoning = pdf_criteria_query.evaluate_with_llm(
        registration_result, peer_journal_result, eq_journal_result, queries)

    reasoning_html = _render_reasoning_html(criteria, reasoning)

    # Generate the score bar HTML
    score_bar_html = generate_score_bar(total_score, n_criteria)

    # Return the score as a string and the reasoning as HTML
    score_text = str(round((total_score / n_criteria) * 100)) + "/100"
    return score_text, score_bar_html, reasoning_html, author_result, title_result


# Gradio UI definition. Components are laid out in the order they are created
# inside the Blocks context: upload row, model/submit row, score outputs, then
# the paper title and author sections.
with gr.Blocks(theme=gr.themes.Glass(
    text_size="sm", 
    font=[gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"], 
    primary_hue="neutral", 
    secondary_hue="gray")) as demo:
    
    gr.Markdown("## Med Library")
    
    with gr.Row():
        # Upload widget restricted to PDF files
        file_upload = gr.File(label="Choose a paper", file_types=['.pdf'])
    
    with gr.Row():
        # These labels are matched by process_pdf:
        # "Model 1" -> OpenAI GPT-4, "Model 2" -> Mixtral.
        models = ["Model 1", "Model 2"]
        model_choice = gr.Dropdown(models, label="Choose a model", value="Model 1")
        submit_button = gr.Button("Evaluate")

    # Outputs for the score text, the coloured score bar and the reasoning list
    score_output = gr.Textbox(label="Final Score:", interactive=False)
    score_bar_output = gr.HTML()
    reasoning_output = gr.HTML()

    # Heading for the paper title
    gr.Markdown("## Title of the paper")

    # Output for the dynamically generated paper title
    title_info_output = gr.Markdown()

        
    # Heading for Author Information
    gr.Markdown("## Author Information")

    # Output for dynamically generated author information
    author_info_output = gr.Markdown()
        
    # Set the click event for the button. The order of `outputs` must match
    # the order of the tuple returned by process_pdf:
    # (score, score bar, reasoning, author, title).
    submit_button.click(
        fn=process_pdf,
        inputs=[file_upload, model_choice],
        outputs=[score_output, score_bar_output, reasoning_output, author_info_output, title_info_output]
    )


# Launch the app. This runs at module top level (there is no
# `if __name__ == "__main__"` guard), so importing this file starts the server.
# NOTE(review): server_name="0.0.0.0" presumably exposes the app on all
# network interfaces and share=True requests a public share link -- confirm
# both are intended for the deployment target.
demo.launch(share=True, server_name="0.0.0.0", server_port=7860)

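# NOTE: everything below is a disabled, Flask-style version of the upload/evaluate
# flow, kept for reference only. It is entirely commented out and never runs.
# Main route for file upload and display results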
# @app.route('/', methods=['GET', 'POST'])
# def upload_and_display_results():
#     total_score = 0
#     score_percentage = 0
#     reasoning = []
#     criteria_met = 0

#     if request.method == 'POST':
#         # Check if the post request has the file part
#         if 'file' not in request.files:
#             flash('No file part')
#             return redirect(request.url)
#         file = request.files['file']
#         # If user does not select file, browser also submits an empty part without filename
#         if file.filename == '':
#             flash('No selected file')
#             return redirect(request.url)
#         if file and allowed_file(file.filename):
#             try:
#                 # Process the PDF file
#                 pdf_processor = PDFProcessor_Unstructured(pdf_processing_config)
#                 merged_chunks, tables = pdf_processor.process_pdf_file(file)
#                 documents = [Document(text=t) for t in merged_chunks]

#                 # LLM Model choice
#                 try:
#                     if llm_model == "gpt-4" or llm_model == "gpt-3.5-turbo":
#                         llm = OpenAI(model=llm_model, temperature=model_temperature, max_tokens=output_token_size)

#                     elif llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
#                         if any(param is None for param in [model_context_window, output_token_size, model_temperature, hf_token]):
#                             raise ValueError("All parameters are required for Mistral LLM.")

#                         llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
#                             temperature=model_temperature, model_name=llm_model, api_key=hf_token)
#                     else:
#                         raise ValueError(f"Unsupported language model: {llm_model}")

#                 except Exception as e:
#                     logger.error(f"Error initializing language model '{llm_model}': {e}", exc_info=True)
#                     raise  # Or handle the exception as needed

#                 # Embedding model choice for RAG
#                 try:
#                     if embed == "openai":
#                         embed_model = OpenAIEmbedding()

#                     elif embed == "huggingface":
#                         if embed_model_name is None:
#                             # Set to default model if name not provided
#                             embed_model_name = "BAAI/bge-small-en-v1.5"
#                             embed_model = HuggingFaceEmbedding(embed_model_name)
#                         else:
#                             # Use the specified model name
#                             embed_model = HuggingFaceEmbedding(embed_model_name)
#                     else:
#                         raise ValueError(f"Unsupported embedding model: {embed_model}")


#                 except Exception as e:
#                     logger.error(f"Error initializing embedding model: {e}", exc_info=True)
#                     raise



#                 # Prompts and Queries
#                 utils = base_utils()
#                 general_prompt = utils.read_from_file(general_prompt_path)
#                 info_prompt = utils.read_from_file(info_prompt_path)

#                 peer_review_journals = utils.read_from_file(peer_review_journals_path)
#                 eq_network_journals = utils.read_from_file(eq_network_journals_path)

#                 peer_review_journals_list = peer_review_journals.split('\n')
#                 eq_network_journals_list = eq_network_journals.split('\n')


#                 modified_journal_query = "Is the given research paper published in any of the following journals: " + ", ".join(peer_review_journals_list) + "?"

#                 pdf_info_query = PDFQueryEngine(documents, llm, embed_model, (info_prompt))
#                 info_query_engine = pdf_info_query.setup_query_engine()
#                 journal_result = info_query_engine.query(modified_journal_query).response


#                 pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, (general_prompt))

#                 # Check for prior registration
#                 nlp_methods = KeywordSearch(merged_chunks)
#                 eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
#                 peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
#                 registration_result = nlp_methods.check_registration()


#                 # Evaluate with OpenAI model
#                 total_score, criteria_met, score_percentage, reasoning = pdf_criteria_query.evaluate_with_llm(registration_result, peer_journal_result, eq_journal_result, queries)


#             except Exception as e:
#                 logging.exception("An error occurred while processing the file.")
#                 # Consider adding a user-friendly message or redirect
#                 flash('An error occurred while processing the file.')
#                 return redirect(request.url)

#     return render_template('index.html',
#                        total_score = total_score,
#                        score_percentage = score_percentage,
#                        criteria_met = criteria_met,
#                        reasoning = reasoning)


# if __name__ == '__main__':
#     app.run(debug=True)