Spaces:
Sleeping
Sleeping
1-ARIjitS
committed on
Commit
•
0b336c0
1
Parent(s):
27d40b9
llm for getting the output added
Browse files- llm_res.py +197 -0
llm_res.py
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from langchain_community.document_loaders.csv_loader import CSVLoader
|
3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
+
import pandas as pd
|
5 |
+
import langchain
|
6 |
+
import os
|
7 |
+
import openai
|
8 |
+
import ast
|
9 |
+
from langchain import OpenAI
|
10 |
+
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
|
11 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
12 |
+
from langchain_community.document_loaders import JSONLoader
|
13 |
+
from langchain.document_loaders import UnstructuredURLLoader
|
14 |
+
from langchain.embeddings import OpenAIEmbeddings
|
15 |
+
from langchain.vectorstores import FAISS
|
16 |
+
from langchain_core.prompts import ChatPromptTemplate
|
17 |
+
from langchain_core.pydantic_v1 import BaseModel, Field
|
18 |
+
from langchain_openai import ChatOpenAI
|
19 |
+
from langchain_core.prompts import ChatPromptTemplate
|
20 |
+
from langchain_core.pydantic_v1 import BaseModel, Field
|
21 |
+
from langchain_openai import ChatOpenAI
|
22 |
+
from typing import List, Dict, Any
|
23 |
+
import requests
|
24 |
+
|
25 |
+
# getting the json files
|
26 |
+
def get_clinical_record_info(clinical_record_id: str, timeout: float = 30.0) -> Dict[str, Any]:
    """Fetch a single study record from the ClinicalTrials.gov v2 API.

    Equivalent request:
        curl -X GET "https://clinicaltrials.gov/api/v2/studies/NCT00841061" -H "accept: application/json"

    Args:
        clinical_record_id: Study identifier inserted into the request URL
            (e.g. "NCT00841061").
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error payload is never mistaken for a study record.
        requests.RequestException: On connection failures or timeouts.
    """
    request_url = f"https://clinicaltrials.gov/api/v2/studies/{clinical_record_id}"
    response = requests.get(
        request_url,
        headers={"accept": "application/json"},
        timeout=timeout,
    )
    # Fail loudly on unknown IDs / server errors instead of parsing the error body.
    response.raise_for_status()
    return response.json()
|
33 |
+
|
34 |
+
def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str, Any]]:
    """Fetch the record for every given ID, one request per ID, in input order.

    Duplicate IDs are fetched (and returned) once per occurrence.
    """
    return [get_clinical_record_info(record_id) for record_id in clinical_record_ids]
|
40 |
+
|
41 |
+
def _dig(record, path, default):
    """Return the value found by following the keys in *path* through *record*, or *default*."""
    current = record
    for key in path:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            # Missing key, or an intermediate value that is not subscriptable
            # (e.g. None or a string where a dict was expected).
            return default
    return current


def process_json(json_file, output_file='output.json'):
    """Reduce raw study records to the fields of interest and write them as JSON.

    Reads the JSON document at *json_file*, iterates over its items, and for
    each one pulls a fixed set of nested values out of ``protocolSection``.
    Any value that cannot be reached is replaced by an empty default
    (``""``, ``[]`` or ``{}``) rather than aborting the run.

    Args:
        json_file: Path of the input JSON file (an iterable of study records).
        output_file: Path the filtered list is written to. Defaults to
            ``'output.json'`` in the current working directory.

    Returns:
        None. The result is written to *output_file* with 4-space indentation.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # NOTE(review): the previous version also looked up
    # protocolSection.conditionsModule.conditions but never wrote it to the
    # output; the output schema is kept unchanged here. Confirm whether
    # "conditions" should be emitted as well.
    filtered_data = []
    for item in data:
        filtered_data.append({
            "organization_name": _dig(
                item, ('protocolSection', 'identificationModule', 'organization', 'fullName'), ""),
            "project_title": _dig(
                item, ('protocolSection', 'identificationModule', 'officialTitle'), ""),
            "status": _dig(
                item, ('protocolSection', 'statusModule', 'overallStatus'), ""),
            "brief_description": _dig(
                item, ('protocolSection', 'descriptionModule', 'briefSummary'), ""),
            "detailed_description": _dig(
                item, ('protocolSection', 'descriptionModule', 'detailedDescription'), ""),
            "keywords": _dig(
                item, ('protocolSection', 'conditionsModule', 'keywords'), []),
            "interventions": _dig(
                item, ('protocolSection', 'armsInterventionsModule', 'interventions'), []),
            "primary_outcomes": _dig(
                item, ('protocolSection', 'outcomesModule', 'primaryOutcomes'), []),
            "secondary_outcomes": _dig(
                item, ('protocolSection', 'outcomesModule', 'secondaryOutcomes'), []),
            "eligibility": _dig(
                item, ('protocolSection', 'eligibilityModule'), {}),
        })

    # Write the filtered data to a new JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, indent=4)
|
115 |
+
|
116 |
+
def llm_config():
    """Build the prompt -> LLM chain that extracts structured trial information.

    The chain formats its ``input`` variable into a tagging prompt and sends it
    to a ``gpt-4`` chat model configured to return a ``Classification`` object.

    Returns:
        The runnable produced by piping the prompt into the structured-output
        model; call ``.invoke({"input": ...})`` on it.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset
            or empty.
    """
    tagging_prompt = ChatPromptTemplate.from_template(
        """
    Extract the desired information from the following list of JSON clinical trials.

    Only extract the properties mentioned in the 'Classification' function.

    Passage:
    {input}

    """
    )

    class Classification(BaseModel):
        """Schema the model is asked to fill in for the supplied trials."""

        description: str = Field(description= "text description grouping all the clinical trials using brief_description and detailed_description keys")
        project_title: list = Field(description="Extract the project title of all the clinical trials")
        status: list= Field(description="Extract the status of all the clinical trials")
        keywords: list= Field(description="Extract the most relevant keywords regrouping all the clinical trials")
        interventions: list= Field(description="describe the interventions for each clinical trial using title, name and description")
        primary_outcomes: list= Field(description= "get the primary outcomes of each clinical trial")
        # secondary_outcomes: list= Field(description= "get the secondary outcomes of each clinical trial")
        eligibility: list= Field(description= "get the eligibilityCriteria grouping all the clinical trials")
        # healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
        minimum_age: list = Field(description="get the minimum age from each experiment")
        maximum_age: list = Field(description="get the maximum age from each experiment")
        gender: list = Field(description="get the gender from each experiment")

        def get_dict(self):
            """Return the extracted fields as a plain dict.

            Note that ``description`` is exposed under the key ``"summary"``.
            """
            return {
                "summary": self.description,
                "project_title": self.project_title,
                "status": self.status,
                "keywords": self.keywords,
                "interventions": self.interventions,
                "primary_outcomes": self.primary_outcomes,
                # "secondary_outcomes": self.secondary_outcomes,
                "eligibility": self.eligibility,
                # "healthy_volunteers": self.healthy_volunteers,
                "minimum_age": self.minimum_age,
                "maximum_age": self.maximum_age,
                "gender": self.gender
            }

    # SECURITY: the API key must never live in source control. It is read from
    # the environment instead; any key that was previously committed to this
    # file has to be treated as compromised and revoked.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable is not set; "
            "export it (or configure it as a secret) before calling llm_config()."
        )

    # LLM
    llm = ChatOpenAI(
        temperature=0.6,
        model="gpt-4",
        openai_api_key=api_key,
    ).with_structured_output(
        Classification
    )

    tagging_chain = tagging_prompt | llm

    return tagging_chain
|
171 |
+
|
172 |
+
def get_llm_results(results):
    """Convert the chain's structured result into a plain dict via its ``get_dict()`` method."""
    return results.get_dict()
|
175 |
+
|
176 |
+
def save_llm_results(results_json, output_file='llm_results.json'):
    """Write the extracted results to disk as indented JSON.

    Args:
        results_json: JSON-serializable object to persist.
        output_file: Destination path. Defaults to ``'llm_results.json'`` in
            the current working directory.

    Returns:
        None.
    """
    # Explicit encoding so the file is the same regardless of platform locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results_json, f, indent=4)
|
179 |
+
|
180 |
+
# clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
|
181 |
+
# print(clinical_record_info)
|
182 |
+
|
183 |
+
# with open('data.json', 'w') as f:
|
184 |
+
# json.dump(clinical_record_info, f, indent=4)
|
185 |
+
|
186 |
+
# change the json file here and run it to get the output
|
187 |
+
# Path of the raw study dump to process.
json_file = "D:/HACKUPC/hupc/klinic/data.json"

# Stage 1: reduce every record to the fields of interest (written to 'output.json').
process_json(json_file)

# Stage 2: load the reduced records back from disk.
with open('output.json', 'r') as file:
    data = json.load(file)

# Stage 3: build the extraction chain and run it over all records at once.
tagging_chain = llm_config()
res = tagging_chain.invoke({"input": data})

# Stage 4: convert the structured answer to a dict, persist it, and echo it.
result_json = get_llm_results(res)
save_llm_results(result_json)
print(result_json)
|