"""Fetch ClinicalTrials.gov study records and summarise them with an LLM.

The module downloads study records from the ClinicalTrials.gov v2 REST API,
reduces each record to the handful of fields that matter for summarisation,
and feeds the reduced records to an OpenAI chat model that returns a
structured ``Classification`` object.

NOTE: importing this module loads ``.env`` and builds the LLM chain, so
``OPENAI_API_KEY`` must be available at import time.
"""

import ast
import json
import os
from typing import Any, Callable, Dict, List, Sequence, Tuple

import langchain
import openai
import pandas as pd
import requests
from dotenv import load_dotenv
from langchain import OpenAI
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import JSONLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

load_dotenv()

# Base URL of the ClinicalTrials.gov v2 "single study" endpoint.
CLINICAL_TRIALS_API_URL = "https://clinicaltrials.gov/api/v2/studies"

# Upper bound (seconds) for one HTTP request; without it a stalled
# connection would block the caller forever.
REQUEST_TIMEOUT_SECONDS = 30

# (output key, path below "protocolSection", factory for the fallback value).
# A factory is used instead of a literal so every record gets its own
# empty list/dict rather than sharing one mutable object.
_PROTOCOL_FIELDS: Tuple[Tuple[str, Tuple[str, ...], Callable[[], Any]], ...] = (
    ("organization_name", ("identificationModule", "organization", "fullName"), str),
    ("project_title", ("identificationModule", "officialTitle"), str),
    ("status", ("statusModule", "overallStatus"), str),
    ("brief_description", ("descriptionModule", "briefSummary"), str),
    ("detailed_description", ("descriptionModule", "detailedDescription"), str),
    ("conditions", ("conditionsModule", "conditions"), list),
    ("keywords", ("conditionsModule", "keywords"), list),
    ("interventions", ("armsInterventionsModule", "interventions"), list),
    ("primary_outcomes", ("outcomesModule", "primaryOutcomes"), list),
    ("secondary_outcomes", ("outcomesModule", "secondaryOutcomes"), list),
    ("eligibility", ("eligibilityModule",), dict),
)


# getting the json files
def get_clinical_record_info(clinical_record_id: str) -> Dict[str, Any]:
    """Download one study record from ClinicalTrials.gov as a dictionary.

    Equivalent request::

        curl -X GET "https://clinicaltrials.gov/api/v2/studies/NCT00841061" \
             -H "accept: application/json"

    Args:
        clinical_record_id: Study identifier, e.g. ``"NCT00841061"``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    request_url = f"{CLINICAL_TRIALS_API_URL}/{clinical_record_id}"
    response = requests.get(
        request_url,
        headers={"accept": "application/json"},
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail loudly on an error status instead of trying to decode an
    # error page as if it were a study record.
    response.raise_for_status()
    return response.json()


def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str, Any]]:
    """Download several study records, preserving the order of the ids.

    Args:
        clinical_record_ids: Study identifiers to fetch. Duplicates are
            fetched (and returned) once per occurrence.

    Returns:
        One decoded record per requested id.
    """
    return [get_clinical_record_info(record_id) for record_id in clinical_record_ids]


def _get_nested(record: Any, path: Sequence[str], default: Any) -> Any:
    """Follow ``path`` through nested containers, or return ``default``."""
    current = record
    for key in path:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            # Missing key, or an intermediate value that is not subscriptable
            # by this key: the field is simply absent from this record.
            return default
    return current


def process_json_data_for_llm(data):
    """Reduce raw study records to the fields handed to the LLM.

    Every field is looked up under the record's ``"protocolSection"``; a
    field that is missing is replaced by an empty string, list or dict
    (depending on the field) so all output items share the same keys.

    Args:
        data: Iterable of study records as returned by
            ``get_clinical_records_by_ids``. ``None`` or an empty
            iterable yields an empty list.

    Returns:
        A list with one dictionary per input record, holding the keys
        ``organization_name``, ``project_title``, ``status``,
        ``brief_description``, ``detailed_description``, ``conditions``,
        ``keywords``, ``interventions``, ``primary_outcomes``,
        ``secondary_outcomes`` and ``eligibility``.
    """
    if not data:
        return []

    filtered_data = []
    for item in data:
        protocol_section = _get_nested(item, ("protocolSection",), None)
        filtered_data.append(
            {
                output_key: _get_nested(protocol_section, path, make_default())
                for output_key, path, make_default in _PROTOCOL_FIELDS
            }
        )
    return filtered_data


def llm_config():
    """Build the prompt-plus-model chain that summarises clinical trials.

    The chain expects a mapping with an ``"input"`` entry and pipes the
    formatted prompt into a ``gpt-4`` chat model configured with
    ``with_structured_output(Classification)``.

    Returns:
        The composed runnable (``tagging_prompt | llm``).

    Raises:
        KeyError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    tagging_prompt = ChatPromptTemplate.from_template(
        """
Extract the desired information from the following list of JSON clinical trials.

Only extract the properties mentioned in the 'Classification' function.

Passage:
{input}
"""
    )

    class Classification(BaseModel):
        """Schema the model is asked to fill in for a batch of trials."""

        description: str = Field(
            description="text description grouping all the clinical trials using brief_description and detailed_description keys"
        )
        project_title: list = Field(
            description="Extract the project title of all the clinical trials"
        )
        status: list = Field(
            description="Extract the status of all the clinical trials"
        )
        keywords: list = Field(
            description="Extract the most relevant keywords regrouping all the clinical trials"
        )
        interventions: list = Field(
            description="describe the interventions for each clinical trial using title, name and description"
        )
        primary_outcomes: list = Field(
            description="get the primary outcomes of each clinical trial"
        )
        # Disabled field, kept for reference:
        # secondary_outcomes: list= Field(description= "get the secondary outcomes of each clinical trial")
        eligibility: list = Field(
            description="get the eligibilityCriteria grouping all the clinical trials"
        )
        # Disabled field, kept for reference:
        # healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
        minimum_age: list = Field(
            description="get the minimum age from each experiment"
        )
        maximum_age: list = Field(
            description="get the maximum age from each experiment"
        )
        gender: list = Field(description="get the gender from each experiment")

        def get_dict(self):
            """Return the fields as a plain dict (``description`` -> ``summary``)."""
            return {
                "summary": self.description,
                "project_title": self.project_title,
                "status": self.status,
                "keywords": self.keywords,
                "interventions": self.interventions,
                "primary_outcomes": self.primary_outcomes,
                # "secondary_outcomes": self.secondary_outcomes,
                "eligibility": self.eligibility,
                # "healthy_volunteers": self.healthy_volunteers,
                "minimum_age": self.minimum_age,
                "maximum_age": self.maximum_age,
                "gender": self.gender,
            }

    # LLM
    llm = ChatOpenAI(
        temperature=0.6,
        model="gpt-4",
        openai_api_key=os.environ["OPENAI_API_KEY"],
    ).with_structured_output(Classification)

    tagging_chain = tagging_prompt | llm
    return tagging_chain


# Example: download a few records and store them locally.
# clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
# print(clinical_record_info)
# with open('data.json', 'w') as f:
#     json.dump(clinical_record_info, f, indent=4)

# Built once at import time and shared by every call below.
tagging_chain = llm_config()


def process_dictionaty_with_llm_to_generate_response(json_contents):
    """Summarise raw study records with the shared tagging chain.

    Args:
        json_contents: Iterable of study records as returned by
            ``get_clinical_records_by_ids``.

    Returns:
        Whatever ``tagging_chain.invoke`` returns; with the chain built by
        ``llm_config`` this should be a ``Classification`` instance
        (NOTE(review): confirm against the installed LangChain version).
    """
    processed_data = process_json_data_for_llm(json_contents)
    # The prompt announces "JSON clinical trials", so send real JSON text
    # rather than the Python repr of the list.
    passage = json.dumps(processed_data, ensure_ascii=False)
    return tagging_chain.invoke({"input": passage})


# Correctly spelled alias; the original (misspelled) name stays available
# for existing callers.
process_dictionary_with_llm_to_generate_response = (
    process_dictionaty_with_llm_to_generate_response
)