Satoc commited on
Commit
6c69822
·
1 Parent(s): 30c02a6
.gitignore ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Add any directories, files, or patterns you don't want to be tracked by version control
2
+
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ #*.py[cod]
7
+ #*$py.class
8
+ #*.txt
9
+ #*.tsv
10
+ #*.csv
11
+ *.xlsx
12
+ *.pdf
13
+ *.nii
14
+ #*.nii.gz
15
+ *.DS_Store
16
+ #*.png
17
+ #*.pyn
18
+ *.jpg
19
+ *.nii.gz
20
+ *.pkl
21
+ *-checkpoint.ipynb
22
+ *.pkls
23
+ *.pth
24
+ *.yaml
25
+ *.ckpt
26
+ # C extensions
27
+ #*.so
28
+
29
+ # Distribution / packaging
30
+ #.Python
31
+ #build/
32
+ #develop-eggs/
33
+ #dist/
34
+ #downloads/
35
+ #eggs/
36
+ #.eggs/
37
+ #lib/
38
+ #lib64/
39
+ #parts/
40
+ #sdist/
41
+ #var/
42
+ #wheels/
43
+ #*.egg-info/
44
+ #.installed.cfg
45
+ #*.egg
46
+ #MANIFEST
47
+
48
+ # PyInstaller
49
+ # Usually these files are written by a python script from a template
50
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
51
+ #*.manifest
52
+ #*.spec
53
+
54
+ # Installer logs
55
+ #pip-log.txt
56
+ #pip-delete-this-directory.txt
57
+
58
+ # Unit test / coverage reports
59
+ #htmlcov/
60
+ #.tox/
61
+ #.coverage
62
+ #.coverage.*
63
+ #.cache
64
+ #nosetests.xml
65
+ #coverage.xml
66
+ #*.cover
67
+ #.hypothesis/
68
+ #.pytest_cache/
69
+
70
+ # Translations
71
+ #*.mo
72
+ #*.pot
73
+
74
+ # Django stuff:
75
+ #*.log
76
+ #.static_storage/
77
+ #.media/
78
+ #local_settings.py
79
+
80
+ # Flask stuff:
81
+ #instance/
82
+ #.webassets-cache
83
+
84
+ # Scrapy stuff:
85
+ #.scrapy
86
+
87
+ # Sphinx documentation
88
+ #docs/_build/
89
+
90
+ # PyBuilder
91
+ #target/
92
+
93
+ # Jupyter Notebook
94
+ .ipynb_checkpoint/*
95
+
96
+ # pyenv
97
+ #.python-version
98
+
99
+ # celery beat schedule file
100
+ #celerybeat-schedule
101
+
102
+ # SageMath parsed files
103
+ #*.sage.py
104
+
105
+ # Environments
106
+ #.env
107
+ #.venv
108
+ #env/
109
+ #venv/
110
+ #ENV/
111
+ #env.bak/
112
+ #venv.bak/
113
+
114
+ # Spyder project settings
115
+ #.spyderproject
116
+ #.spyproject
117
+
118
+ # Rope project settings
119
+ #.ropeproject
120
+
121
+ # mkdocs documentation
122
+ #/site
123
+ /models/
124
+ # mypy
125
+ #.mypy_cache/
126
+ #over 100MB
127
+
128
+ # Add any directories, files, or patterns you don't want to be tracked by version control
129
+
130
+
131
+ #deep settings
132
+ *.h5
133
+
134
+ .OpenAITools/chromedriver
135
+ /OpenAITools/chromedriver
OpenAITools/.ipynb_checkpoints/ECarteTools-checkpoint.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import time
3
+ import wikipedia
4
+ import random
5
+ import re
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ import os
9
+ import glob
10
+ from natsort import natsorted
11
+ import requests
12
+ from bs4 import BeautifulSoup
13
+ import xml.etree.ElementTree as ET
14
+ import pandas as pd
15
+
16
+ wikipedia.set_lang("ja")
17
+ # APIキーの設定
18
+ openai.api_key = os.environ['OPENAI_API_KEY']
19
+ engine="gpt-3.5-turbo"
20
+
21
+
22
+ def generate(system_template,prompt,engine="gpt-3.5-turbo"):
23
+ while True: #OpenAI APIが落ちてる時に無限リトライするので注意
24
+ try:
25
+ response = openai.ChatCompletion.create(
26
+ model=engine,
27
+ messages=[
28
+ {"role": "system", "content": system_template},
29
+ {"role": "user", "content":prompt},
30
+ ]
31
+ )
32
+ result=response["choices"][0]["message"]["content"]
33
+ return result
34
+ except:
35
+ print("リトライ")
36
+ time.sleep(30)
37
+ pass
38
+
39
+ def generate_carte(prompt,engine="gpt-3.5-turbo"):
40
+ while True: #OpenAI APIが落ちてる時に無限リトライするので注意
41
+ try:
42
+ response = openai.ChatCompletion.create(
43
+ model=engine,
44
+ messages=[
45
+ {"role": "system", "content": "You are useful assistant"},
46
+ {"role": "user", "content":"%s\n・・・という患者と医師の会話をSOAP形式のカルテとして日本語で端的にまとめて下さい。各セクションはS),O), A),P)として下さい "%prompt},
47
+ ]
48
+ )
49
+ result=response["choices"][0]["message"]["content"]
50
+ return result
51
+ except:
52
+ print("リトライ")
53
+ time.sleep(30)
54
+ pass
55
+
56
+ def get_selected_fileds(texts):
57
+ input_name = texts.replace(' ' , "+")
58
+ corona_fields = ct.get_study_fields(
59
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
60
+ fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
61
+ max_studies=500,
62
+ fmt="csv")
63
+ return corona_fields
64
+
65
+ def get_retriever_str(fields):
66
+ retriever_str=''
67
+ for i in range(1,len(fields)):
68
+ colnames = fields[0]
69
+ targetCol = fields[i]
70
+ for f in range(len(fields[0])):
71
+ retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
72
+ retriever_str+='\n'
73
+ return retriever_str
OpenAITools/.ipynb_checkpoints/ExpertTools-checkpoint.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import openai
3
+ import time
4
+ import wikipedia
5
+ import random
6
+ import re
7
+ import requests
8
+ from bs4 import BeautifulSoup
9
+ import os
10
+ import glob
11
+ from natsort import natsorted
12
+ import requests
13
+ from bs4 import BeautifulSoup
14
+ import xml.etree.ElementTree as ET
15
+ from pytrials.client import ClinicalTrials
16
+ from Bio import Entrez
17
+ import pandas as pd
18
+ import numpy as np
19
+ import time
20
+ #from langchain.agents import create_pandas_dataframe_agent
21
+ from langchain_experimental.agents import create_pandas_dataframe_agent
22
+ from langchain.llms import OpenAI
23
+
24
+ # APIキーの設定
25
+ openai.api_key = os.environ['OPENAI_API_KEY']
26
+ gptengine="gpt-3.5-turbo"
27
+
28
+
29
+ """def get_selected_fileds(texts):
30
+ ct = ClinicalTrials()
31
+ input_name = texts.replace(' ' , "+")
32
+ corona_fields = ct.get_study_fields(
33
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
34
+ fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
35
+ max_studies=500,
36
+ fmt="csv")
37
+ return corona_fields"""
38
+
39
+ def get_retriever_str(fields):
40
+ retriever_str=''
41
+ for i in range(1,len(fields)):
42
+ colnames = fields[0]
43
+ targetCol = fields[i]
44
+ for f in range(len(fields[0])):
45
+ retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
46
+ retriever_str+='\n'
47
+ return retriever_str
48
+
49
+ def get_chanked_retriever(fields):
50
+ retriever_list =[]
51
+ for i in range(1,len(fields)):
52
+ retriever_str=''
53
+ colnames = fields[0]
54
+ targetCol = fields[i]
55
+ for f in range(len(fields[0])):
56
+ retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
57
+ retriever_list.append(retriever_str)
58
+ return retriever_list
59
+
60
+ from pytrials.client import ClinicalTrials
61
+ def get_selected_fields(texts, split_criteria=False,
62
+ split_word_number = False, split_number=700):
63
+ ct = ClinicalTrials()
64
+ input_name = texts.replace(' ', "+")
65
+ corona_fields = ct.get_study_fields(
66
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
67
+ fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
68
+ max_studies=500,
69
+ fmt="csv")
70
+
71
+ if split_criteria:
72
+ new_fields = []
73
+
74
+ # 検索対象の文字列
75
+ target_string1 = 'Exclusion Criteria'
76
+ target_string2 = 'Exclusion criteria'
77
+
78
+ # 各要素で検索対象の文字列を探し、直前で分割して新しいリストに格納
79
+ for corona_field in corona_fields:
80
+ new_list = []
81
+ for item in corona_field:
82
+ if target_string1 in item:
83
+ split_position = item.index(target_string1)
84
+ new_list.append(item[:split_position])
85
+ new_list.append(item[split_position:])
86
+ elif target_string2 in item:
87
+ split_position = item.index(target_string2)
88
+ new_list.append(item[:split_position])
89
+ new_list.append(item[split_position:])
90
+ else:
91
+ new_list.append(item)
92
+ new_fields.append(new_list)
93
+ else:
94
+ new_fields = corona_fields
95
+
96
+ if split_word_number:
97
+ split_fields = []
98
+ for new_field in new_fields:
99
+ new_list= []
100
+
101
+ # 各要素を調べて、700文字以上であれば分割し、新しいリストに格納
102
+ for item in new_field:
103
+ item_length = len(item)
104
+ if item_length > split_number:
105
+ num_parts = -(-item_length // split_number) # 向上の除算を用いて分割数を計算
106
+ for i in range(num_parts):
107
+ start_index = i * split_number
108
+ end_index = min((i + 1) * split_number, item_length) # 文字列の終わりを超えないように調整
109
+ new_list.append(item[start_index:end_index])
110
+ else:
111
+ new_list.append(item)
112
+
113
+ split_fields.append(new_list)
114
+ new_fields = split_fields
115
+
116
+ return new_fields
117
+
118
+
119
+ def print_agent_results(df, Ids,
120
+ interesteds = ['Condition', 'BriefTitle', 'BriefSummary', 'EligibilityCriteria'],
121
+ translater=None):
122
+ results = ""
123
+ for Id in Ids:
124
+ print("%s\n"%Id)
125
+ sdf = df[df['NCTId'] == Id]
126
+ for interested in interesteds:
127
+ # 最初の要素を取得
128
+ results += '%s: \n %s \n' % (interested, sdf[interested].iloc[0])
129
+ #print('%s: \n %s \n' % (interested, sdf[interested].iloc[0]))
130
+ if translater:
131
+ to_be_printed = translater.translate(results)
132
+ else:
133
+ to_be_printed =results
134
+ print(to_be_printed)
135
+
136
+ def search(query):
137
+ Entrez.email = os.getenv('MAIL_ADRESS')
138
+ #Entrez.email='sing.monotonyflower@gmail.com'
139
+ handle = Entrez.esearch(db='pubmed',
140
+ sort = 'relevance',
141
+ retmax = '20',
142
+ retmode = 'xml',
143
+ term = query)
144
+ results = Entrez.read(handle)
145
+ return results
146
+
147
+ def fetch_details(id_list):
148
+ ids = ','.join(id_list)
149
+ Entrez.email = os.getenv('MAIL_ADRESS')
150
+ #Entrez.email = 'sing.monotonyflower@gmail.com'
151
+ handle = Entrez.efetch(db = 'pubmed',
152
+ retmode = 'xml',
153
+ id = ids)
154
+ results = Entrez.read(handle)
155
+ return results
156
+ '''def generate(prompt,engine=None):
157
+ if engine is None:
158
+ engine=gptengine
159
+ while True: #OpenAI APIが落ちてる時に無限リトライするので注意
160
+ try:
161
+ response = openai.ChatCompletion.create(
162
+ model=engine,
163
+ messages=[
164
+ {"role": "system", "content": "You are useful assistant"},
165
+ {"role": "user", "content":prompt},
166
+ ]
167
+ )
168
+ result=response["choices"][0]["message"]["content"]
169
+ return result
170
+ except Exception as e:
171
+ print(e)
172
+ print("リトライ")
173
+ time.sleep(30)
174
+ pass
175
+ '''
176
+
177
+ def generate(prompt,engine=None):
178
+ if engine is None:
179
+ engine=gptengine
180
+ while True: #OpenAI APIが落ちてる時に無限リトライするので注意
181
+ try:
182
+ response = openai.chat.completions.create(
183
+ model=engine,
184
+ messages=[
185
+ {"role": "system", "content": "You are useful assistant"},
186
+ {"role": "user", "content":prompt},
187
+ ]
188
+ )
189
+ #result=response["choices"][0]["message"]["content"]
190
+ result=response.choices[0].message.content
191
+ return result
192
+ except Exception as e:
193
+ print(e)
194
+ print("リトライ")
195
+ time.sleep(30)
196
+ pass
197
+
198
+ def GetPubmedSummaryDf(studies):
199
+ title_list= []
200
+ abstract_list=[]
201
+ journal_list = []
202
+ language_list =[]
203
+ pubdate_year_list = []
204
+ pubdate_month_list = []
205
+ studiesIdList = studies['IdList']
206
+ chunk_size = 10000
207
+ for chunk_i in range(0, len(studiesIdList), chunk_size):
208
+ chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
209
+
210
+ try:
211
+ papers = fetch_details(chunk)
212
+ for i, paper in enumerate(papers['PubmedArticle']):
213
+ title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
214
+ try:
215
+ abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
216
+ except:
217
+ abstract_list.append('No Abstract')
218
+ journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
219
+ language_list.append(paper['MedlineCitation']['Article']['Language'][0])
220
+ try:
221
+ pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
222
+ except:
223
+ pubdate_year_list.append('No Data')
224
+ try:
225
+ pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
226
+ except:
227
+ pubdate_month_list.append('No Data')
228
+ except: # occasionally a chunk might annoy your parser
229
+ pass
230
+ df = pd.DataFrame(list(zip(
231
+ title_list, abstract_list, journal_list, language_list, pubdate_year_list,
232
+ pubdate_month_list)),
233
+ columns=['Title', 'Abstract', 'Journal', 'Language', 'Year','Month'])
234
+ return df, abstract_list
235
+
236
+ def ClinicalAgent(fileds, verbose=False):
237
+ df = pd.DataFrame.from_records(fileds[1:], columns=fileds[0])
238
+ return create_pandas_dataframe_agent(OpenAI(temperature=0, model='gpt-3.5-turbo-16k'), df, verbose=verbose)
239
+
240
+ def GetNCTID(results):
241
+ # NCTで始まる単語を検索する正規表現
242
+ pattern = r'\bNCT\d+\b'
243
+ # 正規表現を使って単語を抽出
244
+ nct_words = re.findall(pattern,results)
245
+ return nct_words
OpenAITools/.ipynb_checkpoints/FetchTools-checkpoint.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ #from llama_index.llms.replicate import Replicate
4
+ import requests
5
+ import re
6
+
7
+
8
+ def extract_japan_cities(text):
9
+ # 正規表現を使用して " - Japan" で終わる都市名を抽出
10
+ pattern = r'(\b\w+\s*\w*\b) - Japan'
11
+ cities = re.findall(pattern, text)
12
+ unique_cities = list(set(cities))
13
+ # ユニークな都市名をソートしてカンマで区切られた文字列に変換
14
+ unique_cities.sort()
15
+ return ', '.join(unique_cities)
16
+
17
+ def fetch_clinical_trials(cancer_name):
18
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
19
+ # Initial URL for the first API call
20
+ base_url = "https://clinicaltrials.gov/api/v2/studies"
21
+ params = {
22
+ "query.titles": search_expr,
23
+ "pageSize": 100
24
+ }
25
+
26
+ # Initialize an empty list to store the data
27
+ data_list = []
28
+ # Loop until there is no nextPageToken
29
+ while True:
30
+ # Print the current URL (for debugging purposes)
31
+ print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
32
+
33
+ # Send a GET request to the API
34
+ response = requests.get(base_url, params=params)
35
+
36
+ # Check if the request was successful
37
+ if response.status_code == 200:
38
+ data = response.json() # Parse JSON response
39
+ studies = data.get('studies', []) # Extract the list of studies
40
+
41
+ # Loop through each study and extract specific information
42
+ for study in studies:
43
+ # Safely access nested keys
44
+ nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
45
+ startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
46
+ conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
47
+ title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
48
+ summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
49
+
50
+ # Extract locations safely
51
+ locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
52
+ locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
53
+
54
+ JapanesLocations = extract_japan_cities(locations)
55
+ # Extract dates and phases
56
+ primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
57
+
58
+ phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
59
+ eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
60
+
61
+ # Append the data to the list as a dictionary
62
+ data_list.append({
63
+ "NCTID": nctId,
64
+ "Title": title,
65
+ #"Start Date": startDate,
66
+ "Primary Completion Date": primaryCompletionDate,
67
+ #"Conditions": conditions,
68
+ "Cancer": conditions,
69
+ "Summary": summary,
70
+ "Japanes Locations": JapanesLocations,
71
+ #"Phases": phases,
72
+ "Eligibility Criteria": eligibilityCriteria
73
+ })
74
+
75
+ # Check for nextPageToken and update the params or break the loop
76
+ nextPageToken = data.get('nextPageToken')
77
+ if nextPageToken:
78
+ params['pageToken'] = nextPageToken # Set the pageToken for the next request
79
+ else:
80
+ break # Exit the loop if no nextPageToken is present
81
+ else:
82
+ print("Failed to fetch data. Status code:", response.status_code)
83
+ break
84
+
85
+ # Create a DataFrame from the list of dictionaries
86
+ df = pd.DataFrame(data_list)
87
+ return df
88
+
89
+ def fetch_clinical_trials_jp(cancer_name):
90
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
91
+ # Initial URL for the first API call
92
+ base_url = "https://clinicaltrials.gov/api/v2/studies"
93
+ params = {
94
+ "query.titles": search_expr,
95
+ "pageSize": 100
96
+ }
97
+
98
+ # Initialize an empty list to store the data
99
+ data_list = []
100
+ # Loop until there is no nextPageToken
101
+ while True:
102
+ # Print the current URL (for debugging purposes)
103
+ print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
104
+
105
+ # Send a GET request to the API
106
+ response = requests.get(base_url, params=params)
107
+
108
+ # Check if the request was successful
109
+ if response.status_code == 200:
110
+ data = response.json() # Parse JSON response
111
+ studies = data.get('studies', []) # Extract the list of studies
112
+
113
+ # Loop through each study and extract specific information
114
+ for study in studies:
115
+ # Safely access nested keys
116
+ nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
117
+ startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
118
+ conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
119
+ title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
120
+ summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
121
+
122
+ # Extract locations safely
123
+ locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
124
+ locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
125
+
126
+ JapanesLocations = extract_japan_cities(locations)
127
+ # Extract dates and phases
128
+ primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
129
+
130
+ phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
131
+ eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
132
+
133
+ # Append the data to the list as a dictionary
134
+ data_list.append({
135
+ "NCTID": nctId,
136
+ "タイトル": title,
137
+ #"Start Date": startDate,
138
+ #"Primary Completion Date": primaryCompletionDate,
139
+ "対象となる癌": conditions,
140
+ "サマリー": summary,
141
+ "場所": JapanesLocations,
142
+ #"Phases": phases,
143
+ "クライテリア": eligibilityCriteria
144
+ })
145
+
146
+ # Check for nextPageToken and update the params or break the loop
147
+ nextPageToken = data.get('nextPageToken')
148
+ if nextPageToken:
149
+ params['pageToken'] = nextPageToken # Set the pageToken for the next request
150
+ else:
151
+ break # Exit the loop if no nextPageToken is present
152
+ else:
153
+ print("Failed to fetch data. Status code:", response.status_code)
154
+ break
155
+
156
+ # Create a DataFrame from the list of dictionaries
157
+ df = pd.DataFrame(data_list)
158
+ return df
OpenAITools/.ipynb_checkpoints/scrapeThisData-checkpoint.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from selenium import webdriver
2
+ from selenium.webdriver.support.ui import Select
3
+ from selenium.webdriver.common.by import By
4
+
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
+ import re
8
+
9
+ import os
10
+ import time
11
+
12
+ from selenium.webdriver.support.ui import WebDriverWait
13
+ from selenium.webdriver.common.by import By
14
+ from selenium.webdriver.support import expected_conditions as EC
15
+ from selenium.webdriver.common.action_chains import ActionChains
16
+ import chromedriver_autoinstaller
17
+
18
+ class ScrapeThatData:
19
+
20
+ def __init__(self, time_threshold = 10):
21
+
22
+ try:
23
+ chrome_options = webdriver.ChromeOptions()
24
+ chrome_options.add_argument('--no-sandbox')
25
+ self.driver = webdriver.Chrome(options=chrome_options)
26
+
27
+ except:
28
+ chromedriver_autoinstaller.install()
29
+ chrome_options = webdriver.ChromeOptions()
30
+ chrome_options.add_argument('--no-sandbox')
31
+ self.driver = webdriver.Chrome(options=chrome_options)
32
+
33
+
34
+
35
+ self.wait = WebDriverWait(self.driver,time_threshold)
36
+ self.attribute_dict = {'status':1 ,'conditions':2, 'interventions': 3, 'study type':4,
37
+ 'phase':5, 'sponsor':6, 'funder type':7 , 'study design': 8,
38
+ 'outcome measures':9, 'number enrolled':10, 'sex':11, 'age':12,
39
+ 'nct number': 13, 'other ids':14, 'title acronym': 15 , 'study start': 16,
40
+ 'primary completion': 17, 'study completion': 18 , 'first posted': 19,
41
+ 'last update posted': 20 , 'results first posted': 21 , 'locations':22, 'study documents': 23}
42
+
43
+ self.status_dict = {'not yet recruiting' : 'notYetRecrCB',
44
+ 'recruiting' : 'recruitingCB',
45
+ 'enrolling by invitation':'enrollingByInvCB',
46
+ 'active, not recruiting': 'activeCB',
47
+ 'suspended': 'suspendedCB',
48
+ 'terminated':'terminatedCB',
49
+ 'completed':'completedCB',
50
+ 'withdrawn': 'withdrawnCB',
51
+ 'unknown status': 'unknownCB'}
52
+
53
+ def clicking_show_hide_cols(self, driver):
54
+ columns = driver.find_element(By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/button')
55
+ action_chain = ActionChains(driver)
56
+ action_chain.move_to_element(columns).click()
57
+ action_chain.perform()
58
+
59
+ def select_attributes_to_show(self, listed_attributes, attribute_dict):
60
+ ll = [value.lower() for value in listed_attributes if value.lower() in ['status', 'conditions', 'interventions', 'locations']]
61
+ if ll:
62
+ to_show = [value.lower() for value in listed_attributes if value.lower() not in ll]
63
+ to_hide = [value for value in ['status', 'conditions', 'interventions', 'locations'] if value not in ll]
64
+ to_click = to_hide + to_show
65
+ for att in to_click:
66
+ self.clicking_show_hide_cols(self.driver)
67
+ time.sleep(1)
68
+ self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att]) + ']'))).click()
69
+ time.sleep(1)
70
+ else:
71
+ for att in listed_attributes:
72
+ self.clicking_show_hide_cols(self.driver)
73
+ time.sleep(1)
74
+ self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att.lower()]) + ']'))).click()
75
+ time.sleep(1)
76
+
77
+ def select_by_status(self, listed_states, status_dict):
78
+ if listed_states:
79
+ for status in listed_states:
80
+ self.driver.find_element(By.ID,status_dict[status.lower()]).click()
81
+
82
+ self.driver.find_element(By.XPATH,'//*[@id="FiltersBody"]/div[1]/input[1]').click()
83
+ time.sleep(3)
84
+
85
+
86
+ select = Select(self.driver.find_element_by_name('theDataTable_length'))
87
+ select.select_by_value('100')
88
+
89
+ def collect_data_search_page(self,l_ordered, amount_of_data = None):
90
+
91
+ class_name = ''
92
+ page_index = 1
93
+
94
+ elements = [l_ordered]
95
+
96
+ while 'disabled' not in class_name :
97
+
98
+
99
+
100
+ time.sleep(10)
101
+
102
+ print('Getting data from page {}'.format(page_index))
103
+
104
+ #Counting how many rows of the table appear
105
+ table = self.driver.find_element(By.ID,'theDataTable')
106
+ row_count = len(table.find_elements(By.TAG_NAME,"tr"))
107
+
108
+ #Looping table page
109
+ for index in range(1, row_count):
110
+ row = []
111
+ if 'status' in l_ordered:
112
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3)')))
113
+ status_element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3) > span')
114
+ row.append(status_element.text.strip())
115
+ for i, val in enumerate(l_ordered):
116
+ if val == 'status':
117
+ continue
118
+
119
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')))
120
+ element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')
121
+ try:
122
+ row.append(element.text.strip())
123
+ except:
124
+ print(i, element)
125
+ else:
126
+ for i, val in enumerate(l_ordered):
127
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')))
128
+ element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')
129
+ try:
130
+ row.append(element.text.strip())
131
+ except:
132
+ print(i, element)
133
+ elements.append(row)
134
+
135
+
136
+
137
+
138
+ #Getting next page button
139
+ next_page= self.driver.find_element(By.ID,"theDataTable_next")
140
+
141
+ #Getting the class attribute of the next page button
142
+ class_name = next_page.get_attribute('class')
143
+
144
+ #Going to the next page
145
+ next_page.click()
146
+ page_index += 1
147
+
148
+ if amount_of_data:
149
+ if len(elements) >= amount_of_data or row_count < amount_of_data :
150
+ break
151
+ else:
152
+ continue
153
+
154
+ return elements
155
+
156
+ def get_criteria(self, NCTnumber):
157
+
158
+ url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
159
+ ClinicalTrialpage = requests.get(url)
160
+ soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
161
+
162
+ wrapping_crit_class = soup.find_all("div", {"class": "tr-indent2"})
163
+ list_elements = wrapping_crit_class[1].find_all(re.compile("(ul|ol)"))
164
+ inclusion, exclusion = ('','')
165
+
166
+
167
+ if not list_elements:
168
+ print ("WARNING: Study number " + NCTnumber + " doesn't have eligibility criteria or HTML tag format is not a list")
169
+ else:
170
+
171
+ if len(list_elements) == 1:
172
+ try:
173
+ if wrapping_crit_class[1].find(text = 'Inclusion Criteria:'):
174
+ inclusion = list_elements[0].find_all("li")
175
+
176
+ elif wrapping_crit_class[1].find(text = 'Exclusion Criteria:'):
177
+ exclusion = list_elements[0].find_all("li")
178
+ except:
179
+ print('criteria doesnt exist')
180
+ else:
181
+ inclusion = list_elements[0].find_all("li")
182
+ exclusion = list_elements[1].find_all("li")
183
+
184
+
185
+ inclusion = ' '.join([t.text.strip() for t in inclusion ])
186
+ exclusion = ' '.join([t.text.strip() for t in exclusion ])
187
+
188
+ return(inclusion, exclusion)
189
+
190
+ #function that gets number of patients enrolled in a study
191
+ def get_enrollment (self, NCTnumber):
192
+ url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
193
+ ClinicalTrialpage = requests.get(url)
194
+ soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
195
+ enrollment = ''
196
+ wrapping_enrol_class = soup.find_all('td', {'headers':'studyInfoColData','style':"padding-left:1em"})
197
+ if not wrapping_enrol_class:
198
+ print('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
199
+ else:
200
+ enrollment = wrapping_enrol_class[1]
201
+ enrollment = enrollment.text.split()[0]
202
+ if enrollment.isdigit() == False:
203
+ print ('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
204
+ else:
205
+ return(enrollment)
206
+
207
+
208
+
209
+ def __call__(self, condition, listed_attributes, listed_states, amount_of_data):
210
+
211
+ self.driver.get('https://clinicaltrials.gov/ct2/results?cond=' + condition + '&rank=1&view=record#rowId0')
212
+ self.select_attributes_to_show(listed_attributes, self.attribute_dict)
213
+
214
+ try:
215
+ self.select_by_status(listed_states, self.status_dict)
216
+ except:
217
+ print('select by status is a problem')
218
+ n = []
219
+ for i in listed_attributes:
220
+ n.append(self.attribute_dict[i.lower()])
221
+ attribute_ordered = [list(self.attribute_dict.keys())[list(self.attribute_dict.values()).index(i)]for i in sorted(n)]
222
+
223
+ search_data = self.collect_data_search_page(attribute_ordered, amount_of_data=amount_of_data)
224
+ nct_numbers = [e[search_data[0].index('nct number')] for e in search_data[1:]]
225
+ search_data[0].extend(['inclusion', 'exclusion', 'enrollment'])
226
+ for index, nct in enumerate(nct_numbers):
227
+ if index % 100 == 0 and index!= 0:
228
+ print("Collected Data from {} Studies: ".format(index))
229
+
230
+ inc, exc = self.get_criteria(nct)
231
+ enrol = self.get_enrollment(nct)
232
+ search_data[index + 1].extend([inc, exc, enrol])
233
+ return search_data
234
+ # except:
235
+ # print('no data available with the specified status')
236
+
237
+
OpenAITools/ECarteTools.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import time
3
+ import wikipedia
4
+ import random
5
+ import re
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ import os
9
+ import glob
10
+ from natsort import natsorted
11
+ import requests
12
+ from bs4 import BeautifulSoup
13
+ import xml.etree.ElementTree as ET
14
+ import pandas as pd
15
+
16
+ wikipedia.set_lang("ja")
17
+ # APIキーの設定
18
+ openai.api_key = os.environ['OPENAI_API_KEY']
19
+ engine="gpt-3.5-turbo"
20
+
21
+
22
+ def generate(system_template,prompt,engine="gpt-3.5-turbo"):
23
+ while True: #OpenAI APIが落ちてる時に無限リトライするので注意
24
+ try:
25
+ response = openai.ChatCompletion.create(
26
+ model=engine,
27
+ messages=[
28
+ {"role": "system", "content": system_template},
29
+ {"role": "user", "content":prompt},
30
+ ]
31
+ )
32
+ result=response["choices"][0]["message"]["content"]
33
+ return result
34
+ except:
35
+ print("リトライ")
36
+ time.sleep(30)
37
+ pass
38
+
39
+ def generate_carte(prompt,engine="gpt-3.5-turbo"):
40
+ while True: #OpenAI APIが落ちてる時に無限リトライするので注意
41
+ try:
42
+ response = openai.ChatCompletion.create(
43
+ model=engine,
44
+ messages=[
45
+ {"role": "system", "content": "You are useful assistant"},
46
+ {"role": "user", "content":"%s\n・・・という患者と医師の会話をSOAP形式のカルテとして日本語で端的にまとめて下さい。各セクションはS),O), A),P)として下さい "%prompt},
47
+ ]
48
+ )
49
+ result=response["choices"][0]["message"]["content"]
50
+ return result
51
+ except:
52
+ print("リトライ")
53
+ time.sleep(30)
54
+ pass
55
+
56
+ def get_selected_fileds(texts):
57
+ input_name = texts.replace(' ' , "+")
58
+ corona_fields = ct.get_study_fields(
59
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
60
+ fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
61
+ max_studies=500,
62
+ fmt="csv")
63
+ return corona_fields
64
+
65
+ def get_retriever_str(fields):
66
+ retriever_str=''
67
+ for i in range(1,len(fields)):
68
+ colnames = fields[0]
69
+ targetCol = fields[i]
70
+ for f in range(len(fields[0])):
71
+ retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
72
+ retriever_str+='\n'
73
+ return retriever_str
OpenAITools/ExpertTools.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import openai
3
+ import time
4
+ import wikipedia
5
+ import random
6
+ import re
7
+ import requests
8
+ from bs4 import BeautifulSoup
9
+ import os
10
+ import glob
11
+ from natsort import natsorted
12
+ import requests
13
+ from bs4 import BeautifulSoup
14
+ import xml.etree.ElementTree as ET
15
+ from pytrials.client import ClinicalTrials
16
+ from Bio import Entrez
17
+ import pandas as pd
18
+ import numpy as np
19
+ import time
20
+ #from langchain.agents import create_pandas_dataframe_agent
21
+ from langchain_experimental.agents import create_pandas_dataframe_agent
22
+ #from langchain.llms import OpenAI
23
+ from langchain_community.llms import OpenAI
24
+
25
+ # APIキーの設定
26
+ openai.api_key = os.environ['OPENAI_API_KEY']
27
+ gptengine="gpt-3.5-turbo"
28
+
29
+
30
+ """def get_selected_fileds(texts):
31
+ ct = ClinicalTrials()
32
+ input_name = texts.replace(' ' , "+")
33
+ corona_fields = ct.get_study_fields(
34
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
35
+ fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
36
+ max_studies=500,
37
+ fmt="csv")
38
+ return corona_fields"""
39
+
40
+ def get_retriever_str(fields):
41
+ retriever_str=''
42
+ for i in range(1,len(fields)):
43
+ colnames = fields[0]
44
+ targetCol = fields[i]
45
+ for f in range(len(fields[0])):
46
+ retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
47
+ retriever_str+='\n'
48
+ return retriever_str
49
+
50
+ def get_chanked_retriever(fields):
51
+ retriever_list =[]
52
+ for i in range(1,len(fields)):
53
+ retriever_str=''
54
+ colnames = fields[0]
55
+ targetCol = fields[i]
56
+ for f in range(len(fields[0])):
57
+ retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
58
+ retriever_list.append(retriever_str)
59
+ return retriever_list
60
+
61
+ from pytrials.client import ClinicalTrials
62
+ def get_selected_fields(texts, split_criteria=False,
63
+ split_word_number = False, split_number=700):
64
+ ct = ClinicalTrials()
65
+ input_name = texts.replace(' ', "+")
66
+ corona_fields = ct.get_study_fields(
67
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
68
+ fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
69
+ max_studies=500,
70
+ fmt="csv")
71
+
72
+ if split_criteria:
73
+ new_fields = []
74
+
75
+ # 検索対象の文字列
76
+ target_string1 = 'Exclusion Criteria'
77
+ target_string2 = 'Exclusion criteria'
78
+
79
+ # 各要素で検索対象の文字列を探し、直前で分割して新しいリストに格納
80
+ for corona_field in corona_fields:
81
+ new_list = []
82
+ for item in corona_field:
83
+ if target_string1 in item:
84
+ split_position = item.index(target_string1)
85
+ new_list.append(item[:split_position])
86
+ new_list.append(item[split_position:])
87
+ elif target_string2 in item:
88
+ split_position = item.index(target_string2)
89
+ new_list.append(item[:split_position])
90
+ new_list.append(item[split_position:])
91
+ else:
92
+ new_list.append(item)
93
+ new_fields.append(new_list)
94
+ else:
95
+ new_fields = corona_fields
96
+
97
+ if split_word_number:
98
+ split_fields = []
99
+ for new_field in new_fields:
100
+ new_list= []
101
+
102
+ # 各要素を調べて、700文字以上であれば分割し、新しいリストに格納
103
+ for item in new_field:
104
+ item_length = len(item)
105
+ if item_length > split_number:
106
+ num_parts = -(-item_length // split_number) # 向上の除算を用いて分割数を計算
107
+ for i in range(num_parts):
108
+ start_index = i * split_number
109
+ end_index = min((i + 1) * split_number, item_length) # 文字列の終わりを超えないように調整
110
+ new_list.append(item[start_index:end_index])
111
+ else:
112
+ new_list.append(item)
113
+
114
+ split_fields.append(new_list)
115
+ new_fields = split_fields
116
+
117
+ return new_fields
118
+
119
+
120
+ def print_agent_results(df, Ids,
121
+ interesteds = ['Condition', 'BriefTitle', 'BriefSummary', 'EligibilityCriteria'],
122
+ translater=None):
123
+ results = ""
124
+ for Id in Ids:
125
+ print("%s\n"%Id)
126
+ sdf = df[df['NCTId'] == Id]
127
+ for interested in interesteds:
128
+ # 最初の要素を取得
129
+ results += '%s: \n %s \n' % (interested, sdf[interested].iloc[0])
130
+ #print('%s: \n %s \n' % (interested, sdf[interested].iloc[0]))
131
+ if translater:
132
+ to_be_printed = translater.translate(results)
133
+ else:
134
+ to_be_printed =results
135
+ print(to_be_printed)
136
+
137
+ def search(query):
138
+ Entrez.email = os.getenv('MAIL_ADRESS')
139
+ #Entrez.email='sing.monotonyflower@gmail.com'
140
+ handle = Entrez.esearch(db='pubmed',
141
+ sort = 'relevance',
142
+ retmax = '20',
143
+ retmode = 'xml',
144
+ term = query)
145
+ results = Entrez.read(handle)
146
+ return results
147
+
148
+ def fetch_details(id_list):
149
+ ids = ','.join(id_list)
150
+ Entrez.email = os.getenv('MAIL_ADRESS')
151
+ #Entrez.email = 'sing.monotonyflower@gmail.com'
152
+ handle = Entrez.efetch(db = 'pubmed',
153
+ retmode = 'xml',
154
+ id = ids)
155
+ results = Entrez.read(handle)
156
+ return results
157
+ '''def generate(prompt,engine=None):
158
+ if engine is None:
159
+ engine=gptengine
160
+ while True: #OpenAI APIが落ちてる時に無限リトライするので注意
161
+ try:
162
+ response = openai.ChatCompletion.create(
163
+ model=engine,
164
+ messages=[
165
+ {"role": "system", "content": "You are useful assistant"},
166
+ {"role": "user", "content":prompt},
167
+ ]
168
+ )
169
+ result=response["choices"][0]["message"]["content"]
170
+ return result
171
+ except Exception as e:
172
+ print(e)
173
+ print("リトライ")
174
+ time.sleep(30)
175
+ pass
176
+ '''
177
+
178
+ def generate(prompt,engine=None):
179
+ if engine is None:
180
+ engine=gptengine
181
+ while True: #OpenAI APIが落ちてる時に無限リトライするので注意
182
+ try:
183
+ response = openai.chat.completions.create(
184
+ model=engine,
185
+ messages=[
186
+ {"role": "system", "content": "You are useful assistant"},
187
+ {"role": "user", "content":prompt},
188
+ ]
189
+ )
190
+ #result=response["choices"][0]["message"]["content"]
191
+ result=response.choices[0].message.content
192
+ return result
193
+ except Exception as e:
194
+ print(e)
195
+ print("リトライ")
196
+ time.sleep(30)
197
+ pass
198
+
199
+ def GetPubmedSummaryDf(studies):
200
+ title_list= []
201
+ abstract_list=[]
202
+ journal_list = []
203
+ language_list =[]
204
+ pubdate_year_list = []
205
+ pubdate_month_list = []
206
+ studiesIdList = studies['IdList']
207
+ chunk_size = 10000
208
+ for chunk_i in range(0, len(studiesIdList), chunk_size):
209
+ chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
210
+
211
+ try:
212
+ papers = fetch_details(chunk)
213
+ for i, paper in enumerate(papers['PubmedArticle']):
214
+ title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
215
+ try:
216
+ abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
217
+ except:
218
+ abstract_list.append('No Abstract')
219
+ journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
220
+ language_list.append(paper['MedlineCitation']['Article']['Language'][0])
221
+ try:
222
+ pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
223
+ except:
224
+ pubdate_year_list.append('No Data')
225
+ try:
226
+ pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
227
+ except:
228
+ pubdate_month_list.append('No Data')
229
+ except: # occasionally a chunk might annoy your parser
230
+ pass
231
+ df = pd.DataFrame(list(zip(
232
+ title_list, abstract_list, journal_list, language_list, pubdate_year_list,
233
+ pubdate_month_list)),
234
+ columns=['Title', 'Abstract', 'Journal', 'Language', 'Year','Month'])
235
+ return df, abstract_list
236
+
237
+ def ClinicalAgent(fileds, verbose=False):
238
+ df = pd.DataFrame.from_records(fileds[1:], columns=fileds[0])
239
+ return create_pandas_dataframe_agent(OpenAI(temperature=0, model='gpt-3.5-turbo-16k'), df, verbose=verbose)
240
+
241
+ def GetNCTID(results):
242
+ # NCTで始まる単語を検索する正規表現
243
+ pattern = r'\bNCT\d+\b'
244
+ # 正規表現を使って単語を抽出
245
+ nct_words = re.findall(pattern,results)
246
+ return nct_words
OpenAITools/FetchTools.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ #from llama_index.llms.replicate import Replicate
4
+ import requests
5
+ import re
6
+
7
+
8
+ def extract_japan_cities(text):
9
+ # 正規表現を使用して " - Japan" で終わる都市名を抽出
10
+ pattern = r'(\b\w+\s*\w*\b) - Japan'
11
+ cities = re.findall(pattern, text)
12
+ unique_cities = list(set(cities))
13
+ # ユニークな都市名をソートしてカンマで区切られた文字列に変換
14
+ unique_cities.sort()
15
+ return ', '.join(unique_cities)
16
+
17
+ def fetch_clinical_trials(cancer_name):
18
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
19
+ # Initial URL for the first API call
20
+ base_url = "https://clinicaltrials.gov/api/v2/studies"
21
+ params = {
22
+ "query.titles": search_expr,
23
+ "pageSize": 100
24
+ }
25
+
26
+ # Initialize an empty list to store the data
27
+ data_list = []
28
+ # Loop until there is no nextPageToken
29
+ while True:
30
+ # Print the current URL (for debugging purposes)
31
+ print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
32
+
33
+ # Send a GET request to the API
34
+ response = requests.get(base_url, params=params)
35
+
36
+ # Check if the request was successful
37
+ if response.status_code == 200:
38
+ data = response.json() # Parse JSON response
39
+ studies = data.get('studies', []) # Extract the list of studies
40
+
41
+ # Loop through each study and extract specific information
42
+ for study in studies:
43
+ # Safely access nested keys
44
+ nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
45
+ startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
46
+ conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
47
+ title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
48
+ summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
49
+
50
+ # Extract locations safely
51
+ locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
52
+ locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
53
+
54
+ JapanesLocations = extract_japan_cities(locations)
55
+ # Extract dates and phases
56
+ primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
57
+
58
+ phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
59
+ eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
60
+
61
+ # Append the data to the list as a dictionary
62
+ data_list.append({
63
+ "NCTID": nctId,
64
+ "Title": title,
65
+ #"Start Date": startDate,
66
+ "Primary Completion Date": primaryCompletionDate,
67
+ #"Conditions": conditions,
68
+ "Cancer": conditions,
69
+ "Summary": summary,
70
+ "Japanes Locations": JapanesLocations,
71
+ #"Phases": phases,
72
+ "Eligibility Criteria": eligibilityCriteria
73
+ })
74
+
75
+ # Check for nextPageToken and update the params or break the loop
76
+ nextPageToken = data.get('nextPageToken')
77
+ if nextPageToken:
78
+ params['pageToken'] = nextPageToken # Set the pageToken for the next request
79
+ else:
80
+ break # Exit the loop if no nextPageToken is present
81
+ else:
82
+ print("Failed to fetch data. Status code:", response.status_code)
83
+ break
84
+
85
+ # Create a DataFrame from the list of dictionaries
86
+ df = pd.DataFrame(data_list)
87
+ return df
88
+
89
+ def fetch_clinical_trials_jp(cancer_name):
90
+ search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
91
+ # Initial URL for the first API call
92
+ base_url = "https://clinicaltrials.gov/api/v2/studies"
93
+ params = {
94
+ "query.titles": search_expr,
95
+ "pageSize": 100
96
+ }
97
+
98
+ # Initialize an empty list to store the data
99
+ data_list = []
100
+ # Loop until there is no nextPageToken
101
+ while True:
102
+ # Print the current URL (for debugging purposes)
103
+ print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
104
+
105
+ # Send a GET request to the API
106
+ response = requests.get(base_url, params=params)
107
+
108
+ # Check if the request was successful
109
+ if response.status_code == 200:
110
+ data = response.json() # Parse JSON response
111
+ studies = data.get('studies', []) # Extract the list of studies
112
+
113
+ # Loop through each study and extract specific information
114
+ for study in studies:
115
+ # Safely access nested keys
116
+ nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
117
+ startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
118
+ conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
119
+ title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
120
+ summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
121
+
122
+ # Extract locations safely
123
+ locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
124
+ locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
125
+
126
+ JapanesLocations = extract_japan_cities(locations)
127
+ # Extract dates and phases
128
+ primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
129
+
130
+ phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
131
+ eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
132
+
133
+ # Append the data to the list as a dictionary
134
+ data_list.append({
135
+ "NCTID": nctId,
136
+ "タイトル": title,
137
+ #"Start Date": startDate,
138
+ #"Primary Completion Date": primaryCompletionDate,
139
+ "対象となる癌": conditions,
140
+ "サマリー": summary,
141
+ "場所": JapanesLocations,
142
+ #"Phases": phases,
143
+ "クライテリア": eligibilityCriteria
144
+ })
145
+
146
+ # Check for nextPageToken and update the params or break the loop
147
+ nextPageToken = data.get('nextPageToken')
148
+ if nextPageToken:
149
+ params['pageToken'] = nextPageToken # Set the pageToken for the next request
150
+ else:
151
+ break # Exit the loop if no nextPageToken is present
152
+ else:
153
+ print("Failed to fetch data. Status code:", response.status_code)
154
+ break
155
+
156
+ # Create a DataFrame from the list of dictionaries
157
+ df = pd.DataFrame(data_list)
158
+ return df
OpenAITools/ReviewPaperTools.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+
4
+ def parse_text_file(text):
5
+ # セクションを分割するための正規表現パターンを定義
6
+ # \d+ は1つ以上の数字にマッチします
7
+ pattern = re.compile(r'\n\n\n\d+\.')
8
+
9
+ # テキストをセクションごとに分割
10
+ sections = pattern.split(text)[1:] # 最初の空のセクションを除外
11
+
12
+ # 各セクションの前後の空白を削除
13
+ sections = [section.strip() for section in sections]
14
+
15
+ return sections
16
+
17
+ def split_sections(text):
18
+ contents = text.split('\n\n')
19
+ contents = [section.strip() for section in contents if section.strip()]
20
+ if len(contents) == 8 :
21
+ keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI', 'COI']
22
+ elif len(contents) == 7 :
23
+ keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI']
24
+ elif len(contents) == 6:
25
+ keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'DOI']
26
+ elif len(contents) == 5:
27
+ keys = ['PublishInfo', 'Title', 'AuthorName', 'Abstract', 'DOI']
28
+
29
+ # 辞書を作成し、キーが存在しない場合は空の文字列を設定
30
+ section_dict = {key: contents[i] if i < len(contents) else "" for i, key in enumerate(keys)}
31
+ return section_dict
32
+
33
+
34
+ def GetSummaryDf(textdir):
35
+ with open(textdir, 'r', encoding='utf-8') as f:
36
+ content = f.read()
37
+ sections = parse_text_file(content)
38
+ dicts = []
39
+ for section in sections:
40
+ splited_dic = split_sections(section)
41
+ dicts.append(splited_dic)
42
+ return pd.DataFrame(dicts)
OpenAITools/scrapeThisData.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from selenium import webdriver
2
+ from selenium.webdriver.support.ui import Select
3
+ from selenium.webdriver.common.by import By
4
+
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
+ import re
8
+
9
+ import os
10
+ import time
11
+
12
+ from selenium.webdriver.support.ui import WebDriverWait
13
+ from selenium.webdriver.common.by import By
14
+ from selenium.webdriver.support import expected_conditions as EC
15
+ from selenium.webdriver.common.action_chains import ActionChains
16
+ import chromedriver_autoinstaller
17
+
18
+ class ScrapeThatData:
19
+
20
+ def __init__(self, time_threshold = 10):
21
+
22
+ try:
23
+ chrome_options = webdriver.ChromeOptions()
24
+ chrome_options.add_argument('--no-sandbox')
25
+ self.driver = webdriver.Chrome(options=chrome_options)
26
+
27
+ except:
28
+ chromedriver_autoinstaller.install()
29
+ chrome_options = webdriver.ChromeOptions()
30
+ chrome_options.add_argument('--no-sandbox')
31
+ self.driver = webdriver.Chrome(options=chrome_options)
32
+
33
+
34
+
35
+ self.wait = WebDriverWait(self.driver,time_threshold)
36
+ self.attribute_dict = {'status':1 ,'conditions':2, 'interventions': 3, 'study type':4,
37
+ 'phase':5, 'sponsor':6, 'funder type':7 , 'study design': 8,
38
+ 'outcome measures':9, 'number enrolled':10, 'sex':11, 'age':12,
39
+ 'nct number': 13, 'other ids':14, 'title acronym': 15 , 'study start': 16,
40
+ 'primary completion': 17, 'study completion': 18 , 'first posted': 19,
41
+ 'last update posted': 20 , 'results first posted': 21 , 'locations':22, 'study documents': 23}
42
+
43
+ self.status_dict = {'not yet recruiting' : 'notYetRecrCB',
44
+ 'recruiting' : 'recruitingCB',
45
+ 'enrolling by invitation':'enrollingByInvCB',
46
+ 'active, not recruiting': 'activeCB',
47
+ 'suspended': 'suspendedCB',
48
+ 'terminated':'terminatedCB',
49
+ 'completed':'completedCB',
50
+ 'withdrawn': 'withdrawnCB',
51
+ 'unknown status': 'unknownCB'}
52
+
53
+ def clicking_show_hide_cols(self, driver):
54
+ columns = driver.find_element(By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/button')
55
+ action_chain = ActionChains(driver)
56
+ action_chain.move_to_element(columns).click()
57
+ action_chain.perform()
58
+
59
+ def select_attributes_to_show(self, listed_attributes, attribute_dict):
60
+ ll = [value.lower() for value in listed_attributes if value.lower() in ['status', 'conditions', 'interventions', 'locations']]
61
+ if ll:
62
+ to_show = [value.lower() for value in listed_attributes if value.lower() not in ll]
63
+ to_hide = [value for value in ['status', 'conditions', 'interventions', 'locations'] if value not in ll]
64
+ to_click = to_hide + to_show
65
+ for att in to_click:
66
+ self.clicking_show_hide_cols(self.driver)
67
+ time.sleep(1)
68
+ self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att]) + ']'))).click()
69
+ time.sleep(1)
70
+ else:
71
+ for att in listed_attributes:
72
+ self.clicking_show_hide_cols(self.driver)
73
+ time.sleep(1)
74
+ self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att.lower()]) + ']'))).click()
75
+ time.sleep(1)
76
+
77
+ def select_by_status(self, listed_states, status_dict):
78
+ if listed_states:
79
+ for status in listed_states:
80
+ self.driver.find_element(By.ID,status_dict[status.lower()]).click()
81
+
82
+ self.driver.find_element(By.XPATH,'//*[@id="FiltersBody"]/div[1]/input[1]').click()
83
+ time.sleep(3)
84
+
85
+
86
+ select = Select(self.driver.find_element_by_name('theDataTable_length'))
87
+ select.select_by_value('100')
88
+
89
+ def collect_data_search_page(self,l_ordered, amount_of_data = None):
90
+
91
+ class_name = ''
92
+ page_index = 1
93
+
94
+ elements = [l_ordered]
95
+
96
+ while 'disabled' not in class_name :
97
+
98
+
99
+
100
+ time.sleep(10)
101
+
102
+ print('Getting data from page {}'.format(page_index))
103
+
104
+ #Counting how many rows of the table appear
105
+ table = self.driver.find_element(By.ID,'theDataTable')
106
+ row_count = len(table.find_elements(By.TAG_NAME,"tr"))
107
+
108
+ #Looping table page
109
+ for index in range(1, row_count):
110
+ row = []
111
+ if 'status' in l_ordered:
112
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3)')))
113
+ status_element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3) > span')
114
+ row.append(status_element.text.strip())
115
+ for i, val in enumerate(l_ordered):
116
+ if val == 'status':
117
+ continue
118
+
119
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')))
120
+ element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')
121
+ try:
122
+ row.append(element.text.strip())
123
+ except:
124
+ print(i, element)
125
+ else:
126
+ for i, val in enumerate(l_ordered):
127
+ self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')))
128
+ element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')
129
+ try:
130
+ row.append(element.text.strip())
131
+ except:
132
+ print(i, element)
133
+ elements.append(row)
134
+
135
+
136
+
137
+
138
+ #Getting next page button
139
+ next_page= self.driver.find_element(By.ID,"theDataTable_next")
140
+
141
+ #Getting the class attribute of the next page button
142
+ class_name = next_page.get_attribute('class')
143
+
144
+ #Going to the next page
145
+ next_page.click()
146
+ page_index += 1
147
+
148
+ if amount_of_data:
149
+ if len(elements) >= amount_of_data or row_count < amount_of_data:
150
+ break
151
+ else:
152
+ continue
153
+
154
+ return elements
155
+
156
+ def get_criteria(self, NCTnumber):
157
+
158
+ url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
159
+ ClinicalTrialpage = requests.get(url)
160
+ soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
161
+
162
+ wrapping_crit_class = soup.find_all("div", {"class": "tr-indent2"})
163
+ list_elements = wrapping_crit_class[1].find_all(re.compile("(ul|ol)"))
164
+ inclusion, exclusion = ('','')
165
+
166
+
167
+ if not list_elements:
168
+ print ("WARNING: Study number " + NCTnumber + " doesn't have eligibility criteria or HTML tag format is not a list")
169
+ else:
170
+
171
+ if len(list_elements) == 1:
172
+ try:
173
+ if wrapping_crit_class[1].find(text = 'Inclusion Criteria:'):
174
+ inclusion = list_elements[0].find_all("li")
175
+
176
+ elif wrapping_crit_class[1].find(text = 'Exclusion Criteria:'):
177
+ exclusion = list_elements[0].find_all("li")
178
+ except:
179
+ print('criteria do not exist')
180
+ else:
181
+ inclusion = list_elements[0].find_all("li")
182
+ exclusion = list_elements[1].find_all("li")
183
+
184
+
185
+ inclusion = ' '.join([t.text.strip() for t in inclusion ])
186
+ exclusion = ' '.join([t.text.strip() for t in exclusion ])
187
+
188
+ return(inclusion, exclusion)
189
+
190
+ #function that gets number of patients enrolled in a study
191
+ def get_enrollment (self, NCTnumber):
192
+ url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
193
+ ClinicalTrialpage = requests.get(url)
194
+ soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
195
+ enrollment = ''
196
+ wrapping_enrol_class = soup.find_all('td', {'headers':'studyInfoColData','style':"padding-left:1em"})
197
+ if not wrapping_enrol_class:
198
+ print('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
199
+ else:
200
+ enrollment = wrapping_enrol_class[1]
201
+ enrollment = enrollment.text.split()[0]
202
+ if not enrollment.isdigit():
203
+ print ('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
204
+ else:
205
+ return(enrollment)
206
+
207
+
208
+
209
+ def __call__(self, condition, listed_attributes, listed_states, amount_of_data):
210
+
211
+ self.driver.get('https://clinicaltrials.gov/ct2/results?cond=' + condition + '&rank=1&view=record#rowId0')
212
+ self.select_attributes_to_show(listed_attributes, self.attribute_dict)
213
+
214
+ try:
215
+ self.select_by_status(listed_states, self.status_dict)
216
+ except:
217
+ print('select by status is a problem')
218
+ n = []
219
+ for i in listed_attributes:
220
+ n.append(self.attribute_dict[i.lower()])
221
+ attribute_ordered = [list(self.attribute_dict.keys())[list(self.attribute_dict.values()).index(i)] for i in sorted(n)]
222
+
223
+ search_data = self.collect_data_search_page(attribute_ordered, amount_of_data=amount_of_data)
224
+ nct_numbers = [e[search_data[0].index('nct number')] for e in search_data[1:]]
225
+ search_data[0].extend(['inclusion', 'exclusion', 'enrollment'])
226
+ for index, nct in enumerate(nct_numbers):
227
+ if index % 100 == 0 and index != 0:
228
+ print("Collected Data from {} Studies: ".format(index))
229
+
230
+ inc, exc = self.get_criteria(nct)
231
+ enrol = self.get_enrollment(nct)
232
+ search_data[index + 1].extend([inc, exc, enrol])
233
+ return search_data
234
+ # except:
235
+ # print('no data available with the specified status')
236
+
237
+
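A minimal usage sketch for the scraper methods added above. Assumptions: the wrapping class name ScrapeThisData and the import path are illustrative only (this checkpoint file shows the methods but not the class header), and the 'recruiting' status key is assumed, since only 'withdrawn' and 'unknown status' appear in this hunk. The __call__ entry point returns a header row followed by one row per study, extended with inclusion, exclusion, and enrollment.

# Hypothetical usage sketch -- the class name ScrapeThisData and the import path
# are assumptions; this commit only shows the methods, not the class definition.
from OpenAITools.scrapeThisData import ScrapeThisData

scraper = ScrapeThisData()  # assumed to set up self.driver, self.wait, attribute_dict, status_dict

rows = scraper(
    condition="lung cancer",
    listed_attributes=["NCT Number", "Status", "Conditions", "Interventions", "Locations"],
    listed_states=["recruiting"],   # must be keys of status_dict; only 'withdrawn' and 'unknown status' are visible above
    amount_of_data=50,
)

header, studies = rows[0], rows[1:]   # header ends with 'inclusion', 'exclusion', 'enrollment'
print(header)
print(studies[0])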
app.py CHANGED
@@ -1,63 +1,66 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
- """
43
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
44
- """
45
- demo = gr.ChatInterface(
46
- respond,
47
- additional_inputs=[
48
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
49
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
- gr.Slider(
52
- minimum=0.1,
53
- maximum=1.0,
54
- value=0.95,
55
- step=0.05,
56
- label="Top-p (nucleus sampling)",
57
  ),
58
- ],
59
- )
 
60
 
 
 
 
 
 
 
 
61
 
62
  if __name__ == "__main__":
63
  demo.launch()
 
1
+ import os
2
+ from llama_index.core.query_engine import NLSQLTableQueryEngine
3
+ import pickle
4
+ import pandas as pd
5
+ import sqlalchemy as sa
6
+ #from llama_index.llms.replicate import Replicate
7
+ from llama_index.llms.groq import Groq
8
+ from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
9
+ from llama_index.core import SQLDatabase
10
+ import requests
11
+ import re
12
+ from llama_index.llms.openai import OpenAI
13
+ from OpenAITools.FetchTools import fetch_clinical_trials, fetch_clinical_trials_jp
14
  import gradio as gr
15
+
16
+
17
+ def custom_completion_to_prompt(completion: str) -> str:
18
+ return completion_to_prompt(
19
+ completion, system_prompt=(
20
+ "You are a sophisticated AI model designed to assist users in finding relevant clinical trials efficiently. **Accurate Retrieval**: Utilize the RAG mechanism to retrieve the most relevant and current clinical trial information from reliable databases. Ensure the information is accurate and cite the sources when necessary. Make decisions based solely on the information provided by the retriever."
21
  ),
22
+ )
23
+
24
+ def RetriviralClinicalTrrial_multi(message,history,cancer_name):
25
+ df = fetch_clinical_trials(cancer_name)
26
+ engine = sa.create_engine("sqlite:///:memory:")
27
+ df.to_sql(name='clinical_study', con=engine, if_exists='replace', index=False)
28
+ database = SQLDatabase(engine)
29
+ # Groq model names (the Replicate endpoint import above is commented out)
30
+ LLAMA3_8B = "Llama3-8b-8192"
31
+ LLAMA3_70b = "Llama3-70b-8192"
32
+ Mixtral = "mixtral-8x7b-32768"
33
+
34
+ '''llm = OpenAI(
35
+ model='gpt-3.5-turbo',
36
+ temperature=0.01,
37
+ context_window=4096,
38
+ completion_to_prompt=custom_completion_to_prompt,
39
+ messages_to_prompt=messages_to_prompt,)'''
40
+
41
+ llm = Groq(
42
+ model=LLAMA3_70b,
43
+ temperature=0.01,
44
+ context_window=4096,
45
+ completion_to_prompt=custom_completion_to_prompt,
46
+ messages_to_prompt=messages_to_prompt,
47
+ )
48
+ query_engine = NLSQLTableQueryEngine(
49
+ sql_database = database,
50
+ tables=["clinical_study"],llm=llm)
51
+ response = query_engine.query(message)
52
+ return response.response
53
+
54
+ with gr.Blocks() as demo:
55
+ cancer_name = gr.Textbox("lung cancer", label="cancer name")
56
 
57
+ gr.ChatInterface(
58
+ RetriviralClinicalTrrial_multi, additional_inputs=[cancer_name],
59
+ title="Assistant for Clinical traial Search",
60
+ description='リアルタイムで日本で行われているClinical trialの情報を入手して、治験を探すお手伝いをします。対象となるがんの名前を上に入力して、まずNCTIDを聞いてください。その後、それぞれのNCTIDについて興味のある内容（Title, Cancer, Summary, Japanes Locations, Eligibility Criteriaなど）を聞いてみてください（スペルの打ち間違いに注意）。例えば「NCT04270591のTitleを教えて」のように尋ねてください。',
61
+ theme="soft",
62
+ examples=[["NCTIDを全て教えて"], ["のTitleを教えて"], ["の対象Cancerを教えて"], ["のSummaryを教えて"], ["のJapanes Locationsを教えて"], ["のEligibility Criteriaを教えて"], ["のPrimary Completion Dateを教えて"]]
63
+ )
64
 
65
  if __name__ == "__main__":
66
  demo.launch()
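For reference, a minimal sketch of exercising the natural-language-to-SQL pipeline above without the Gradio UI. Assumptions: a valid GROQ_API_KEY is set in the environment for the Groq LLM, and fetch_clinical_trials("lung cancer") returns a pandas DataFrame; the question string simply mirrors one of the UI examples.

# Hypothetical direct call -- bypasses gr.ChatInterface; assumes GROQ_API_KEY is set
# and that OpenAITools.FetchTools.fetch_clinical_trials returns a pandas DataFrame.
import os
assert os.environ.get("GROQ_API_KEY"), "set GROQ_API_KEY before running this sketch"

answer = RetriviralClinicalTrrial_multi(
    message="NCTIDを全て教えて",   # "list every NCT ID", same phrasing as the UI examples
    history=[],                    # the chat history is not used inside the function
    cancer_name="lung cancer",
)
print(answer)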
requirements.txt CHANGED
@@ -1 +1,72 @@
1
- huggingface_hub==0.22.2
1
+ aiohttp==3.9.5
2
+ aiosignal==1.3.1
3
+ bio==1.7.1
4
+ biopython==1.83
5
+ biothings-client==0.3.1
6
+ dataclasses-json==0.6.6
7
+ Deprecated==1.2.14
8
+ dirtyjson==1.0.8
9
+ diskcache==5.6.3
10
+ distro==1.9.0
11
+ frozenlist==1.4.1
12
+ gprofiler-official==1.0.0
13
+ greenlet==3.0.3
14
+ hpack==4.0.0
15
+ jsonpatch==1.33
16
+ langchain==0.2.2
17
+ langchain-community==0.2.4
18
+ langchain-core==0.2.4
19
+ langchain-experimental==0.0.60
20
+ langchain-openai==0.1.8
21
+ langchain-text-splitters==0.2.1
22
+ langsmith==0.1.71
23
+ llama-index==0.10.43
24
+ llama-index-agent-openai==0.2.7
25
+ llama-index-cli==0.1.12
26
+ llama-index-core==0.10.43
27
+ llama-index-embeddings-openai==0.1.10
28
+ llama-index-indices-managed-llama-cloud==0.1.6
29
+ llama-index-legacy==0.9.48
30
+ llama-index-llms-groq==0.1.4
31
+ llama-index-llms-llama-cpp==0.1.3
32
+ llama-index-llms-openai==0.1.22
33
+ llama-index-llms-openai-like==0.1.3
34
+ llama-index-llms-replicate==0.1.3
35
+ llama-index-multi-modal-llms-openai==0.1.6
36
+ llama-index-program-openai==0.1.6
37
+ llama-index-question-gen-openai==0.1.3
38
+ llama-index-readers-file==0.1.23
39
+ llama-index-readers-llama-parse==0.1.4
40
+ llama-parse==0.4.4
41
+ llama_cpp_python==0.2.77
42
+ llamaindex-py-client==0.1.19
43
+ marshmallow==3.21.2
44
+ multidict==6.0.5
45
+ munkres==1.1.4
46
+ mygene==3.2.2
47
+ mypy-extensions==1.0.0
48
+ natsort==8.4.0
49
+ networkx==3.3
50
+ nltk
51
+ openai
52
+ packaging==23.2
53
+ pooch==1.8.1
54
+ pypdf==4.2.0
55
+ pytrials==1.0.0
56
+ regex==2024.5.15
57
+ replicate==0.26.0
58
+ safetensors
59
+ setuptools==70.0.0
60
+ SQLAlchemy==2.0.30
61
+ striprtf==0.0.26
62
+ tenacity==8.3.0
63
+ tiktoken==0.7.0
64
+ tokenizers==0.19.1
65
+ transformers==4.41.2
66
+ typer==0.12.3
67
+ typer-slim==0.12.3
68
+ typing-inspect==0.9.0
69
+ wheel==0.43.0
70
+ wikipedia==1.4.0
71
+ wrapt==1.16.0
72
+ yarl==1.9.4