gegegege
- .gitignore +135 -0
- OpenAITools/.ipynb_checkpoints/ECarteTools-checkpoint.py +73 -0
- OpenAITools/.ipynb_checkpoints/ExpertTools-checkpoint.py +245 -0
- OpenAITools/.ipynb_checkpoints/FetchTools-checkpoint.py +158 -0
- OpenAITools/.ipynb_checkpoints/scrapeThisData-checkpoint.py +237 -0
- OpenAITools/ECarteTools.py +73 -0
- OpenAITools/ExpertTools.py +246 -0
- OpenAITools/FetchTools.py +158 -0
- OpenAITools/ReviewPaperTools.py +42 -0
- OpenAITools/scrapeThisData.py +237 -0
- app.py +60 -57
- requirements.txt +72 -1
.gitignore
ADDED
@@ -0,0 +1,135 @@
# Add any directories, files, or patterns you don't want to be tracked by version control


# Byte-compiled / optimized / DLL files
__pycache__/
#*.py[cod]
#*$py.class
#*.txt
#*.tsv
#*.csv
*.xlsx
*.pdf
*.nii
#*.nii.gz
*.DS_Store
#*.png
#*.pyn
*.jpg
*.nii.gz
*.pkl
*-checkpoint.ipynb
*.pkls
*.pth
*.yaml
*.ckpt
# C extensions
#*.so

# Distribution / packaging
#.Python
#build/
#develop-eggs/
#dist/
#downloads/
#eggs/
#.eggs/
#lib/
#lib64/
#parts/
#sdist/
#var/
#wheels/
#*.egg-info/
#.installed.cfg
#*.egg
#MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
#*.manifest
#*.spec

# Installer logs
#pip-log.txt
#pip-delete-this-directory.txt

# Unit test / coverage reports
#htmlcov/
#.tox/
#.coverage
#.coverage.*
#.cache
#nosetests.xml
#coverage.xml
#*.cover
#.hypothesis/
#.pytest_cache/

# Translations
#*.mo
#*.pot

# Django stuff:
#*.log
#.static_storage/
#.media/
#local_settings.py

# Flask stuff:
#instance/
#.webassets-cache

# Scrapy stuff:
#.scrapy

# Sphinx documentation
#docs/_build/

# PyBuilder
#target/

# Jupyter Notebook
.ipynb_checkpoint/*

# pyenv
#.python-version

# celery beat schedule file
#celerybeat-schedule

# SageMath parsed files
#*.sage.py

# Environments
#.env
#.venv
#env/
#venv/
#ENV/
#env.bak/
#venv.bak/

# Spyder project settings
#.spyderproject
#.spyproject

# Rope project settings
#.ropeproject

# mkdocs documentation
#/site
/models/
# mypy
#.mypy_cache/
#over 100MB

# Add any directories, files, or patterns you don't want to be tracked by version control


#deep settings
*.h5

.OpenAITools/chromedriver
/OpenAITools/chromedriver
OpenAITools/.ipynb_checkpoints/ECarteTools-checkpoint.py
ADDED
@@ -0,0 +1,73 @@
import openai
import time
import wikipedia
import random
import re
import requests
from bs4 import BeautifulSoup
import os
import glob
from natsort import natsorted
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import pandas as pd

wikipedia.set_lang("ja")
# Set the OpenAI API key
openai.api_key = os.environ['OPENAI_API_KEY']
engine="gpt-3.5-turbo"


def generate(system_template,prompt,engine="gpt-3.5-turbo"):
    while True: # Note: retries indefinitely while the OpenAI API is down
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": system_template},
                    {"role": "user", "content":prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except:
            print("リトライ")
            time.sleep(30)
            pass

def generate_carte(prompt,engine="gpt-3.5-turbo"):
    while True: # Note: retries indefinitely while the OpenAI API is down
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":"%s\n・・・という患者と医師の会話をSOAP形式のカルテとして日本語で端的にまとめて下さい。各セクションはS),O), A),P)として下さい "%prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except:
            print("リトライ")
            time.sleep(30)
            pass

def get_selected_fileds(texts):
    input_name = texts.replace(' ' , "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
        fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
        max_studies=500,
        fmt="csv")
    return corona_fields

def get_retriever_str(fields):
    retriever_str=''
    for i in range(1,len(fields)):
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_str+='\n'
    return retriever_str
OpenAITools/.ipynb_checkpoints/ExpertTools-checkpoint.py
ADDED
@@ -0,0 +1,245 @@
import os
import openai
import time
import wikipedia
import random
import re
import requests
from bs4 import BeautifulSoup
import os
import glob
from natsort import natsorted
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from pytrials.client import ClinicalTrials
from Bio import Entrez
import pandas as pd
import numpy as np
import time
#from langchain.agents import create_pandas_dataframe_agent
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain.llms import OpenAI

# Set the OpenAI API key
openai.api_key = os.environ['OPENAI_API_KEY']
gptengine="gpt-3.5-turbo"


"""def get_selected_fileds(texts):
    ct = ClinicalTrials()
    input_name = texts.replace(' ' , "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
        fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
        max_studies=500,
        fmt="csv")
    return corona_fields"""

def get_retriever_str(fields):
    retriever_str=''
    for i in range(1,len(fields)):
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_str+='\n'
    return retriever_str

def get_chanked_retriever(fields):
    retriever_list =[]
    for i in range(1,len(fields)):
        retriever_str=''
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_list.append(retriever_str)
    return retriever_list

from pytrials.client import ClinicalTrials
def get_selected_fields(texts, split_criteria=False,
                        split_word_number = False, split_number=700):
    ct = ClinicalTrials()
    input_name = texts.replace(' ', "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
        fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
        max_studies=500,
        fmt="csv")

    if split_criteria:
        new_fields = []

        # Strings to search for
        target_string1 = 'Exclusion Criteria'
        target_string2 = 'Exclusion criteria'

        # Look for the target string in each element, split just before it, and store the pieces in a new list
        for corona_field in corona_fields:
            new_list = []
            for item in corona_field:
                if target_string1 in item:
                    split_position = item.index(target_string1)
                    new_list.append(item[:split_position])
                    new_list.append(item[split_position:])
                elif target_string2 in item:
                    split_position = item.index(target_string2)
                    new_list.append(item[:split_position])
                    new_list.append(item[split_position:])
                else:
                    new_list.append(item)
            new_fields.append(new_list)
    else:
        new_fields = corona_fields

    if split_word_number:
        split_fields = []
        for new_field in new_fields:
            new_list= []

            # Check each element; if it is longer than split_number characters, split it and store the pieces in a new list
            for item in new_field:
                item_length = len(item)
                if item_length > split_number:
                    num_parts = -(-item_length // split_number) # compute the number of parts with ceiling division
                    for i in range(num_parts):
                        start_index = i * split_number
                        end_index = min((i + 1) * split_number, item_length) # clamp so we do not run past the end of the string
                        new_list.append(item[start_index:end_index])
                else:
                    new_list.append(item)

            split_fields.append(new_list)
        new_fields = split_fields

    return new_fields


def print_agent_results(df, Ids,
                        interesteds = ['Condition', 'BriefTitle', 'BriefSummary', 'EligibilityCriteria'],
                        translater=None):
    results = ""
    for Id in Ids:
        print("%s\n"%Id)
        sdf = df[df['NCTId'] == Id]
        for interested in interesteds:
            # Take the first element
            results += '%s: \n %s \n' % (interested, sdf[interested].iloc[0])
            #print('%s: \n %s \n' % (interested, sdf[interested].iloc[0]))
    if translater:
        to_be_printed = translater.translate(results)
    else:
        to_be_printed =results
    print(to_be_printed)

def search(query):
    Entrez.email = os.getenv('MAIL_ADRESS')
    #Entrez.email='sing.monotonyflower@gmail.com'
    handle = Entrez.esearch(db='pubmed',
                            sort = 'relevance',
                            retmax = '20',
                            retmode = 'xml',
                            term = query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = os.getenv('MAIL_ADRESS')
    #Entrez.email = 'sing.monotonyflower@gmail.com'
    handle = Entrez.efetch(db = 'pubmed',
                           retmode = 'xml',
                           id = ids)
    results = Entrez.read(handle)
    return results
'''def generate(prompt,engine=None):
    if engine is None:
        engine=gptengine
    while True: # Note: retries indefinitely while the OpenAI API is down
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except Exception as e:
            print(e)
            print("リトライ")
            time.sleep(30)
            pass
'''

def generate(prompt,engine=None):
    if engine is None:
        engine=gptengine
    while True: # Note: retries indefinitely while the OpenAI API is down
        try:
            response = openai.chat.completions.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":prompt},
                ]
            )
            #result=response["choices"][0]["message"]["content"]
            result=response.choices[0].message.content
            return result
        except Exception as e:
            print(e)
            print("リトライ")
            time.sleep(30)
            pass

def GetPubmedSummaryDf(studies):
    title_list= []
    abstract_list=[]
    journal_list = []
    language_list =[]
    pubdate_year_list = []
    pubdate_month_list = []
    studiesIdList = studies['IdList']
    chunk_size = 10000
    for chunk_i in range(0, len(studiesIdList), chunk_size):
        chunk = studiesIdList[chunk_i:chunk_i + chunk_size]

        try:
            papers = fetch_details(chunk)
            for i, paper in enumerate(papers['PubmedArticle']):
                title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
                try:
                    abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
                except:
                    abstract_list.append('No Abstract')
                journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
                language_list.append(paper['MedlineCitation']['Article']['Language'][0])
                try:
                    pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
                except:
                    pubdate_year_list.append('No Data')
                try:
                    pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
                except:
                    pubdate_month_list.append('No Data')
        except: # occasionally a chunk might annoy your parser
            pass
    df = pd.DataFrame(list(zip(
        title_list, abstract_list, journal_list, language_list, pubdate_year_list,
        pubdate_month_list)),
        columns=['Title', 'Abstract', 'Journal', 'Language', 'Year','Month'])
    return df, abstract_list

def ClinicalAgent(fileds, verbose=False):
    df = pd.DataFrame.from_records(fileds[1:], columns=fileds[0])
    return create_pandas_dataframe_agent(OpenAI(temperature=0, model='gpt-3.5-turbo-16k'), df, verbose=verbose)

def GetNCTID(results):
    # Regular expression that matches tokens starting with NCT
    pattern = r'\bNCT\d+\b'
    # Extract the matching tokens
    nct_words = re.findall(pattern,results)
    return nct_words
OpenAITools/.ipynb_checkpoints/FetchTools-checkpoint.py
ADDED
@@ -0,0 +1,158 @@
import os
import pandas as pd
#from llama_index.llms.replicate import Replicate
import requests
import re


def extract_japan_cities(text):
    # Use a regular expression to extract city names ending in " - Japan"
    pattern = r'(\b\w+\s*\w*\b) - Japan'
    cities = re.findall(pattern, text)
    unique_cities = list(set(cities))
    # Sort the unique city names and join them into a comma-separated string
    unique_cities.sort()
    return ', '.join(unique_cities)

def fetch_clinical_trials(cancer_name):
    search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
    # Initial URL for the first API call
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.titles": search_expr,
        "pageSize": 100
    }

    # Initialize an empty list to store the data
    data_list = []
    # Loop until there is no nextPageToken
    while True:
        # Print the current URL (for debugging purposes)
        print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))

        # Send a GET request to the API
        response = requests.get(base_url, params=params)

        # Check if the request was successful
        if response.status_code == 200:
            data = response.json()  # Parse JSON response
            studies = data.get('studies', [])  # Extract the list of studies

            # Loop through each study and extract specific information
            for study in studies:
                # Safely access nested keys
                nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
                startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
                conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
                title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
                summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')

                # Extract locations safely
                locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
                locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"

                JapanesLocations = extract_japan_cities(locations)
                # Extract dates and phases
                primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')

                phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
                eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')

                # Append the data to the list as a dictionary
                data_list.append({
                    "NCTID": nctId,
                    "Title": title,
                    #"Start Date": startDate,
                    "Primary Completion Date": primaryCompletionDate,
                    #"Conditions": conditions,
                    "Cancer": conditions,
                    "Summary": summary,
                    "Japanes Locations": JapanesLocations,
                    #"Phases": phases,
                    "Eligibility Criteria": eligibilityCriteria
                })

            # Check for nextPageToken and update the params or break the loop
            nextPageToken = data.get('nextPageToken')
            if nextPageToken:
                params['pageToken'] = nextPageToken  # Set the pageToken for the next request
            else:
                break  # Exit the loop if no nextPageToken is present
        else:
            print("Failed to fetch data. Status code:", response.status_code)
            break

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(data_list)
    return df

def fetch_clinical_trials_jp(cancer_name):
    search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
    # Initial URL for the first API call
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.titles": search_expr,
        "pageSize": 100
    }

    # Initialize an empty list to store the data
    data_list = []
    # Loop until there is no nextPageToken
    while True:
        # Print the current URL (for debugging purposes)
        print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))

        # Send a GET request to the API
        response = requests.get(base_url, params=params)

        # Check if the request was successful
        if response.status_code == 200:
            data = response.json()  # Parse JSON response
            studies = data.get('studies', [])  # Extract the list of studies

            # Loop through each study and extract specific information
            for study in studies:
                # Safely access nested keys
                nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
                startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
                conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
                title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
                summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')

                # Extract locations safely
                locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
                locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"

                JapanesLocations = extract_japan_cities(locations)
                # Extract dates and phases
                primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')

                phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
                eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')

                # Append the data to the list as a dictionary
                data_list.append({
                    "NCTID": nctId,
                    "タイトル": title,
                    #"Start Date": startDate,
                    #"Primary Completion Date": primaryCompletionDate,
                    "対象となる癌": conditions,
                    "サマリー": summary,
                    "場所": JapanesLocations,
                    #"Phases": phases,
                    "クライテリア": eligibilityCriteria
                })

            # Check for nextPageToken and update the params or break the loop
            nextPageToken = data.get('nextPageToken')
            if nextPageToken:
                params['pageToken'] = nextPageToken  # Set the pageToken for the next request
            else:
                break  # Exit the loop if no nextPageToken is present
        else:
            print("Failed to fetch data. Status code:", response.status_code)
            break

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(data_list)
    return df
OpenAITools/.ipynb_checkpoints/scrapeThisData-checkpoint.py
ADDED
@@ -0,0 +1,237 @@
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By

import requests
from bs4 import BeautifulSoup
import re

import os
import time

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import chromedriver_autoinstaller

class ScrapeThatData:

    def __init__(self, time_threshold = 10):

        try:
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            self.driver = webdriver.Chrome(options=chrome_options)

        except:
            chromedriver_autoinstaller.install()
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            self.driver = webdriver.Chrome(options=chrome_options)

        self.wait = WebDriverWait(self.driver,time_threshold)
        self.attribute_dict = {'status':1 ,'conditions':2, 'interventions': 3, 'study type':4,
                               'phase':5, 'sponsor':6, 'funder type':7 , 'study design': 8,
                               'outcome measures':9, 'number enrolled':10, 'sex':11, 'age':12,
                               'nct number': 13, 'other ids':14, 'title acronym': 15 , 'study start': 16,
                               'primary completion': 17, 'study completion': 18 , 'first posted': 19,
                               'last update posted': 20 , 'results first posted': 21 , 'locations':22, 'study documents': 23}

        self.status_dict = {'not yet recruiting' : 'notYetRecrCB',
                            'recruiting' : 'recruitingCB',
                            'enrolling by invitation':'enrollingByInvCB',
                            'active, not recruiting': 'activeCB',
                            'suspended': 'suspendedCB',
                            'terminated':'terminatedCB',
                            'completed':'completedCB',
                            'withdrawn': 'withdrawnCB',
                            'unknown status': 'unknownCB'}

    def clicking_show_hide_cols(self, driver):
        columns = driver.find_element(By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/button')
        action_chain = ActionChains(driver)
        action_chain.move_to_element(columns).click()
        action_chain.perform()

    def select_attributes_to_show(self, listed_attributes, attribute_dict):
        ll = [value.lower() for value in listed_attributes if value.lower() in ['status', 'conditions', 'interventions', 'locations']]
        if ll:
            to_show = [value.lower() for value in listed_attributes if value.lower() not in ll]
            to_hide = [value for value in ['status', 'conditions', 'interventions', 'locations'] if value not in ll]
            to_click = to_hide + to_show
            for att in to_click:
                self.clicking_show_hide_cols(self.driver)
                time.sleep(1)
                self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att]) + ']'))).click()
                time.sleep(1)
        else:
            for att in listed_attributes:
                self.clicking_show_hide_cols(self.driver)
                time.sleep(1)
                self.wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button['+ str(attribute_dict[att.lower()]) + ']'))).click()
                time.sleep(1)

    def select_by_status(self, listed_states, status_dict):
        if listed_states:
            for status in listed_states:
                self.driver.find_element(By.ID,status_dict[status.lower()]).click()

            self.driver.find_element(By.XPATH,'//*[@id="FiltersBody"]/div[1]/input[1]').click()
            time.sleep(3)

        select = Select(self.driver.find_element_by_name('theDataTable_length'))
        select.select_by_value('100')

    def collect_data_search_page(self,l_ordered, amount_of_data = None):

        class_name = ''
        page_index = 1

        elements = [l_ordered]

        while 'disabled' not in class_name :

            time.sleep(10)

            print('Getting data from page {}'.format(page_index))

            #Counting how many rows of the table appear
            table = self.driver.find_element(By.ID,'theDataTable')
            row_count = len(table.find_elements(By.TAG_NAME,"tr"))

            #Looping table page
            for index in range(1, row_count):
                row = []
                if 'status' in l_ordered:
                    self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3)')))
                    status_element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child(3) > span')
                    row.append(status_element.text.strip())
                    for i, val in enumerate(l_ordered):
                        if val == 'status':
                            continue

                        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')))
                        element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(4+i)+')')
                        try:
                            row.append(element.text.strip())
                        except:
                            print(i, element)
                else:
                    for i, val in enumerate(l_ordered):
                        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')))
                        element = self.driver.find_elements(By.CLASS_NAME,'#theDataTable > tbody > tr:nth-child('+str(index)+') > td:nth-child('+str(3+i)+')')
                        try:
                            row.append(element.text.strip())
                        except:
                            print(i, element)
                elements.append(row)

            #Getting next page button
            next_page= self.driver.find_element(By.ID,"theDataTable_next")

            #Getting the class attribute of the next page button
            class_name = next_page.get_attribute('class')

            #Going to the next page
            next_page.click()
            page_index += 1

            if amount_of_data:
                if len(elements) >= amount_of_data or row_count < amount_of_data :
                    break
                else:
                    continue

        return elements

    def get_criteria(self, NCTnumber):

        url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
        ClinicalTrialpage = requests.get(url)
        soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')

        wrapping_crit_class = soup.find_all("div", {"class": "tr-indent2"})
        list_elements = wrapping_crit_class[1].find_all(re.compile("(ul|ol)"))
        inclusion, exclusion = ('','')

        if not list_elements:
            print ("WARNING: Study number " + NCTnumber + " doesn't have eligibility criteria or HTML tag format is not a list")
        else:

            if len(list_elements) == 1:
                try:
                    if wrapping_crit_class[1].find(text = 'Inclusion Criteria:'):
                        inclusion = list_elements[0].find_all("li")

                    elif wrapping_crit_class[1].find(text = 'Exclusion Criteria:'):
                        exclusion = list_elements[0].find_all("li")
                except:
                    print('criteria doesnt exist')
            else:
                inclusion = list_elements[0].find_all("li")
                exclusion = list_elements[1].find_all("li")

        inclusion = ' '.join([t.text.strip() for t in inclusion ])
        exclusion = ' '.join([t.text.strip() for t in exclusion ])

        return(inclusion, exclusion)

    #function that gets number of patients enrolled in a study
    def get_enrollment (self, NCTnumber):
        url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
        ClinicalTrialpage = requests.get(url)
        soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
        enrollment = ''
        wrapping_enrol_class = soup.find_all('td', {'headers':'studyInfoColData','style':"padding-left:1em"})
        if not wrapping_enrol_class:
            print('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
        else:
            enrollment = wrapping_enrol_class[1]
            enrollment = enrollment.text.split()[0]
            if enrollment.isdigit() == False:
                print ('WARNING: Number of Participants in Study number '+ NCTnumber +' is unavailable')
            else:
                return(enrollment)

    def __call__(self, condition, listed_attributes, listed_states, amount_of_data):

        self.driver.get('https://clinicaltrials.gov/ct2/results?cond=' + condition + '&rank=1&view=record#rowId0')
        self.select_attributes_to_show(listed_attributes, self.attribute_dict)

        try:
            self.select_by_status(listed_states, self.status_dict)
        except:
            print('select by status is a problem')
        n = []
        for i in listed_attributes:
            n.append(self.attribute_dict[i.lower()])
        attribute_ordered = [list(self.attribute_dict.keys())[list(self.attribute_dict.values()).index(i)]for i in sorted(n)]

        search_data = self.collect_data_search_page(attribute_ordered, amount_of_data=amount_of_data)
        nct_numbers = [e[search_data[0].index('nct number')] for e in search_data[1:]]
        search_data[0].extend(['inclusion', 'exclusion', 'enrollment'])
        for index, nct in enumerate(nct_numbers):
            if index % 100 == 0 and index!= 0:
                print("Collected Data from {} Studies: ".format(index))

            inc, exc = self.get_criteria(nct)
            enrol = self.get_enrollment(nct)
            search_data[index + 1].extend([inc, exc, enrol])
        return search_data
# except:
#     print('no data available with the specified status')
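A minimal usage sketch for the scraper class above. Assumptions not in the original: Chrome is installed locally, the legacy clinicaltrials.gov "ct2" pages it targets still respond, and the condition, attribute, and status values shown are illustrative inputs chosen from the dictionaries defined in __init__.

# Hypothetical driver script; all argument values below are invented examples.
from OpenAITools.scrapeThisData import ScrapeThatData

scraper = ScrapeThatData(time_threshold=15)
rows = scraper(
    condition='lung cancer',                                        # search term for ct2/results
    listed_attributes=['NCT Number', 'Conditions', 'Status', 'Locations'],
    listed_states=['Recruiting'],                                   # status checkboxes to tick
    amount_of_data=50,                                              # stop after roughly 50 rows
)
# rows[0] is the header (extended with 'inclusion', 'exclusion', 'enrollment'),
# rows[1:] are the scraped table rows.
print(len(rows) - 1, "studies collected")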
OpenAITools/ECarteTools.py
ADDED
@@ -0,0 +1,73 @@
import openai
import time
import wikipedia
import random
import re
import requests
from bs4 import BeautifulSoup
import os
import glob
from natsort import natsorted
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import pandas as pd

wikipedia.set_lang("ja")
# Set the OpenAI API key
openai.api_key = os.environ['OPENAI_API_KEY']
engine="gpt-3.5-turbo"


def generate(system_template,prompt,engine="gpt-3.5-turbo"):
    while True: # Note: retries indefinitely while the OpenAI API is down
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": system_template},
                    {"role": "user", "content":prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except:
            print("リトライ")
            time.sleep(30)
            pass

def generate_carte(prompt,engine="gpt-3.5-turbo"):
    while True: # Note: retries indefinitely while the OpenAI API is down
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":"%s\n・・・という患者と医師の会話をSOAP形式のカルテとして日本語で端的にまとめて下さい。各セクションはS),O), A),P)として下さい "%prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except:
            print("リトライ")
            time.sleep(30)
            pass

def get_selected_fileds(texts):
    input_name = texts.replace(' ' , "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
        fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
        max_studies=500,
        fmt="csv")
    return corona_fields

def get_retriever_str(fields):
    retriever_str=''
    for i in range(1,len(fields)):
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_str+='\n'
    return retriever_str
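A short usage sketch for the helpers above, assuming OPENAI_API_KEY is set and an openai SDK version that still exposes openai.ChatCompletion (the pre-1.0 interface these helpers call); the dialogue string is an invented example, not data from the repository.

# Hypothetical example; the conversation text is made up for illustration.
from OpenAITools import ECarteTools

dialogue = "患者: 昨日から咳が出ます。医師: 熱はありますか。患者: 37.8度です。"

summary = ECarteTools.generate(
    system_template="You are a helpful medical scribe.",
    prompt=dialogue,
)
carte = ECarteTools.generate_carte(prompt=dialogue)
print(carte)  # SOAP-format chart summary in Japanese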
OpenAITools/ExpertTools.py
ADDED
@@ -0,0 +1,246 @@
import os
import openai
import time
import wikipedia
import random
import re
import requests
from bs4 import BeautifulSoup
import os
import glob
from natsort import natsorted
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from pytrials.client import ClinicalTrials
from Bio import Entrez
import pandas as pd
import numpy as np
import time
#from langchain.agents import create_pandas_dataframe_agent
from langchain_experimental.agents import create_pandas_dataframe_agent
#from langchain.llms import OpenAI
from langchain_community.llms import OpenAI

# Set the OpenAI API key
openai.api_key = os.environ['OPENAI_API_KEY']
gptengine="gpt-3.5-turbo"


"""def get_selected_fileds(texts):
    ct = ClinicalTrials()
    input_name = texts.replace(' ' , "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)"%(input_name),
        fields=["NCTId", "Condition", "BriefTitle",'BriefSummary','EligibilityCriteria'],
        max_studies=500,
        fmt="csv")
    return corona_fields"""

def get_retriever_str(fields):
    retriever_str=''
    for i in range(1,len(fields)):
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_str+='\n'
    return retriever_str

def get_chanked_retriever(fields):
    retriever_list =[]
    for i in range(1,len(fields)):
        retriever_str=''
        colnames = fields[0]
        targetCol = fields[i]
        for f in range(len(fields[0])):
            retriever_str+=colnames[f] + ":" + targetCol[f] +"\n"
        retriever_list.append(retriever_str)
    return retriever_list

from pytrials.client import ClinicalTrials
def get_selected_fields(texts, split_criteria=False,
                        split_word_number = False, split_number=700):
    ct = ClinicalTrials()
    input_name = texts.replace(' ', "+")
    corona_fields = ct.get_study_fields(
        search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (input_name),
        fields=["NCTId", "Condition", "BriefTitle", 'BriefSummary', 'EligibilityCriteria'],
        max_studies=500,
        fmt="csv")

    if split_criteria:
        new_fields = []

        # Strings to search for
        target_string1 = 'Exclusion Criteria'
        target_string2 = 'Exclusion criteria'

        # Look for the target string in each element, split just before it, and store the pieces in a new list
        for corona_field in corona_fields:
            new_list = []
            for item in corona_field:
                if target_string1 in item:
                    split_position = item.index(target_string1)
                    new_list.append(item[:split_position])
                    new_list.append(item[split_position:])
                elif target_string2 in item:
                    split_position = item.index(target_string2)
                    new_list.append(item[:split_position])
                    new_list.append(item[split_position:])
                else:
                    new_list.append(item)
            new_fields.append(new_list)
    else:
        new_fields = corona_fields

    if split_word_number:
        split_fields = []
        for new_field in new_fields:
            new_list= []

            # Check each element; if it is longer than split_number characters, split it and store the pieces in a new list
            for item in new_field:
                item_length = len(item)
                if item_length > split_number:
                    num_parts = -(-item_length // split_number) # compute the number of parts with ceiling division
                    for i in range(num_parts):
                        start_index = i * split_number
                        end_index = min((i + 1) * split_number, item_length) # clamp so we do not run past the end of the string
                        new_list.append(item[start_index:end_index])
                else:
                    new_list.append(item)

            split_fields.append(new_list)
        new_fields = split_fields

    return new_fields


def print_agent_results(df, Ids,
                        interesteds = ['Condition', 'BriefTitle', 'BriefSummary', 'EligibilityCriteria'],
                        translater=None):
    results = ""
    for Id in Ids:
        print("%s\n"%Id)
        sdf = df[df['NCTId'] == Id]
        for interested in interesteds:
            # Take the first element
            results += '%s: \n %s \n' % (interested, sdf[interested].iloc[0])
            #print('%s: \n %s \n' % (interested, sdf[interested].iloc[0]))
    if translater:
        to_be_printed = translater.translate(results)
    else:
        to_be_printed =results
    print(to_be_printed)

def search(query):
    Entrez.email = os.getenv('MAIL_ADRESS')
    #Entrez.email='sing.monotonyflower@gmail.com'
    handle = Entrez.esearch(db='pubmed',
                            sort = 'relevance',
                            retmax = '20',
                            retmode = 'xml',
                            term = query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = os.getenv('MAIL_ADRESS')
    #Entrez.email = 'sing.monotonyflower@gmail.com'
    handle = Entrez.efetch(db = 'pubmed',
                           retmode = 'xml',
                           id = ids)
    results = Entrez.read(handle)
    return results
'''def generate(prompt,engine=None):
    if engine is None:
        engine=gptengine
    while True: # Note: retries indefinitely while the OpenAI API is down
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":prompt},
                ]
            )
            result=response["choices"][0]["message"]["content"]
            return result
        except Exception as e:
            print(e)
            print("リトライ")
            time.sleep(30)
            pass
'''

def generate(prompt,engine=None):
    if engine is None:
        engine=gptengine
    while True: # Note: retries indefinitely while the OpenAI API is down
        try:
            response = openai.chat.completions.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are useful assistant"},
                    {"role": "user", "content":prompt},
                ]
            )
            #result=response["choices"][0]["message"]["content"]
            result=response.choices[0].message.content
            return result
        except Exception as e:
            print(e)
            print("リトライ")
            time.sleep(30)
            pass

def GetPubmedSummaryDf(studies):
    title_list= []
    abstract_list=[]
    journal_list = []
    language_list =[]
    pubdate_year_list = []
    pubdate_month_list = []
    studiesIdList = studies['IdList']
    chunk_size = 10000
    for chunk_i in range(0, len(studiesIdList), chunk_size):
        chunk = studiesIdList[chunk_i:chunk_i + chunk_size]

        try:
            papers = fetch_details(chunk)
            for i, paper in enumerate(papers['PubmedArticle']):
                title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
                try:
                    abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
                except:
                    abstract_list.append('No Abstract')
                journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
                language_list.append(paper['MedlineCitation']['Article']['Language'][0])
                try:
                    pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
                except:
                    pubdate_year_list.append('No Data')
                try:
                    pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
                except:
                    pubdate_month_list.append('No Data')
        except: # occasionally a chunk might annoy your parser
            pass
    df = pd.DataFrame(list(zip(
        title_list, abstract_list, journal_list, language_list, pubdate_year_list,
        pubdate_month_list)),
        columns=['Title', 'Abstract', 'Journal', 'Language', 'Year','Month'])
    return df, abstract_list

def ClinicalAgent(fileds, verbose=False):
    df = pd.DataFrame.from_records(fileds[1:], columns=fileds[0])
    return create_pandas_dataframe_agent(OpenAI(temperature=0, model='gpt-3.5-turbo-16k'), df, verbose=verbose)

def GetNCTID(results):
    # Regular expression that matches tokens starting with NCT
    pattern = r'\bNCT\d+\b'
    # Extract the matching tokens
    nct_words = re.findall(pattern,results)
    return nct_words
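A sketch of how the PubMed helpers above compose, assuming Biopython is installed, the MAIL_ADRESS environment variable is set for Entrez, and network access is available; the query string is an arbitrary example.

# Hypothetical example wiring search -> GetPubmedSummaryDf -> generate.
from OpenAITools import ExpertTools

studies = ExpertTools.search("non small cell lung cancer immunotherapy")
df, abstracts = ExpertTools.GetPubmedSummaryDf(studies)
print(df[['Title', 'Journal', 'Year']].head())

# Ask the model about one abstract and pull any NCT identifiers it mentions.
answer = ExpertTools.generate("Summarize this abstract in two sentences:\n" + abstracts[0])
print(ExpertTools.GetNCTID(answer))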
OpenAITools/FetchTools.py
ADDED
@@ -0,0 +1,158 @@
1 |
+
import os
|
2 |
+
import pandas as pd
|
3 |
+
#from llama_index.llms.replicate import Replicate
|
4 |
+
import requests
|
5 |
+
import re
|
6 |
+
|
7 |
+
|
8 |
+
def extract_japan_cities(text):
|
9 |
+
# 正規表現を使用して " - Japan" で終わる都市名を抽出
|
10 |
+
pattern = r'(\b\w+\s*\w*\b) - Japan'
|
11 |
+
cities = re.findall(pattern, text)
|
12 |
+
unique_cities = list(set(cities))
|
13 |
+
# ユニークな都市名をソートしてカンマで区切られた文字列に変換
|
14 |
+
unique_cities.sort()
|
15 |
+
return ', '.join(unique_cities)
|
16 |
+
|
17 |
+
def fetch_clinical_trials(cancer_name):
|
18 |
+
search_expr="%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
|
19 |
+
# Initial URL for the first API call
|
20 |
+
base_url = "https://clinicaltrials.gov/api/v2/studies"
|
21 |
+
params = {
|
22 |
+
"query.titles": search_expr,
|
23 |
+
"pageSize": 100
|
24 |
+
}
|
25 |
+
|
26 |
+
# Initialize an empty list to store the data
|
27 |
+
data_list = []
|
28 |
+
# Loop until there is no nextPageToken
|
29 |
+
while True:
|
30 |
+
# Print the current URL (for debugging purposes)
|
31 |
+
print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
|
32 |
+
|
33 |
+
# Send a GET request to the API
|
34 |
+
response = requests.get(base_url, params=params)
|
35 |
+
|
36 |
+
# Check if the request was successful
|
37 |
+
if response.status_code == 200:
|
38 |
+
data = response.json() # Parse JSON response
|
39 |
+
studies = data.get('studies', []) # Extract the list of studies
|
40 |
+
|
41 |
+
# Loop through each study and extract specific information
|
42 |
+
for study in studies:
|
43 |
+
# Safely access nested keys
|
44 |
+
nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
|
45 |
+
startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
|
46 |
+
conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
|
47 |
+
title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
|
48 |
+
summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
|
49 |
+
|
50 |
+
# Extract locations safely
|
51 |
+
locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
|
52 |
+
locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
|
53 |
+
|
54 |
+
JapanesLocations = extract_japan_cities(locations)
|
55 |
+
# Extract dates and phases
|
56 |
+
primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
|
57 |
+
|
58 |
+
phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
|
59 |
+
eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
|
60 |
+
|
61 |
+
+                # Append the data to the list as a dictionary
+                data_list.append({
+                    "NCTID": nctId,
+                    "Title": title,
+                    #"Start Date": startDate,
+                    "Primary Completion Date": primaryCompletionDate,
+                    #"Conditions": conditions,
+                    "Cancer": conditions,
+                    "Summary": summary,
+                    "Japanes Locations": JapanesLocations,
+                    #"Phases": phases,
+                    "Eligibility Criteria": eligibilityCriteria
+                })
+
+            # Check for nextPageToken and update the params or break the loop
+            nextPageToken = data.get('nextPageToken')
+            if nextPageToken:
+                params['pageToken'] = nextPageToken  # Set the pageToken for the next request
+            else:
+                break  # Exit the loop if no nextPageToken is present
+        else:
+            print("Failed to fetch data. Status code:", response.status_code)
+            break
+
+    # Create a DataFrame from the list of dictionaries
+    df = pd.DataFrame(data_list)
+    return df
+
+def fetch_clinical_trials_jp(cancer_name):
+    search_expr = "%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
+    # Initial URL for the first API call
+    base_url = "https://clinicaltrials.gov/api/v2/studies"
+    params = {
+        "query.titles": search_expr,
+        "pageSize": 100
+    }
+
+    # Initialize an empty list to store the data
+    data_list = []
+    # Loop until there is no nextPageToken
+    while True:
+        # Print the current URL (for debugging purposes)
+        print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
+
+        # Send a GET request to the API
+        response = requests.get(base_url, params=params)
+
+        # Check if the request was successful
+        if response.status_code == 200:
+            data = response.json()  # Parse JSON response
+            studies = data.get('studies', [])  # Extract the list of studies
+
+            # Loop through each study and extract specific information
+            for study in studies:
+                # Safely access nested keys
+                nctId = study['protocolSection']['identificationModule'].get('nctId', 'Unknown')
+                startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
+                conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['No conditions listed']))
+                title = study['protocolSection']['identificationModule'].get('briefTitle', 'no title')
+                summary = study['protocolSection']['descriptionModule'].get('briefSummary', 'no summary')
+
+                # Extract locations safely
+                locations_list = study['protocolSection'].get('contactsLocationsModule', {}).get('locations', [])
+                locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"
+
+                JapanesLocations = extract_japan_cities(locations)
+                # Extract dates and phases
+                primaryCompletionDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
+
+                phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Not Available']))
+                eligibilityCriteria = study['protocolSection']['eligibilityModule'].get('eligibilityCriteria', 'Unknown')
+
+                # Append the data to the list as a dictionary
+                data_list.append({
+                    "NCTID": nctId,
+                    "タイトル": title,
+                    #"Start Date": startDate,
+                    #"Primary Completion Date": primaryCompletionDate,
+                    "対象となる癌": conditions,
+                    "サマリー": summary,
+                    "場所": JapanesLocations,
+                    #"Phases": phases,
+                    "クライテリア": eligibilityCriteria
+                })
+
+            # Check for nextPageToken and update the params or break the loop
+            nextPageToken = data.get('nextPageToken')
+            if nextPageToken:
+                params['pageToken'] = nextPageToken  # Set the pageToken for the next request
+            else:
+                break  # Exit the loop if no nextPageToken is present
+        else:
+            print("Failed to fetch data. Status code:", response.status_code)
+            break
+
+    # Create a DataFrame from the list of dictionaries
+    df = pd.DataFrame(data_list)
+    return df
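For reference, a minimal usage sketch of the two fetch helpers added above (illustrative only, not part of the commit; it assumes the OpenAITools package is on the import path and that the clinicaltrials.gov v2 API is reachable):

from OpenAITools.FetchTools import fetch_clinical_trials, fetch_clinical_trials_jp

# English-keyed and Japanese-keyed result tables for the same search term
df_en = fetch_clinical_trials("lung cancer")
df_jp = fetch_clinical_trials_jp("lung cancer")

print(df_en[["NCTID", "Title"]].head())
df_jp.to_csv("clinical_trials_jp.csv", index=False)  # hypothetical output path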
OpenAITools/ReviewPaperTools.py
ADDED
@@ -0,0 +1,42 @@
+import re
+import pandas as pd
+
+def parse_text_file(text):
+    # Define a regular expression pattern for splitting the text into sections
+    # (\d+ matches one or more digits)
+    pattern = re.compile(r'\n\n\n\d+\.')
+
+    # Split the text into sections
+    sections = pattern.split(text)[1:]  # drop the leading empty section
+
+    # Strip leading and trailing whitespace from each section
+    sections = [section.strip() for section in sections]
+
+    return sections
+
+def split_sections(text):
+    contents = text.split('\n\n')
+    contents = [section.strip() for section in contents if section.strip()]
+    if len(contents) == 8:
+        keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI', 'COI']
+    elif len(contents) == 7:
+        keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI']
+    elif len(contents) == 6:
+        keys = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'DOI']
+    elif len(contents) == 5:
+        keys = ['PublishInfo', 'Title', 'AuthorName', 'Abstract', 'DOI']
+
+    # Build the dictionary; use an empty string when a key has no matching content
+    section_dict = {key: contents[i] if i < len(contents) else "" for i, key in enumerate(keys)}
+    return section_dict
+
+
+def GetSummaryDf(textdir):
+    with open(textdir, 'r', encoding='utf-8') as f:
+        content = f.read()
+    sections = parse_text_file(content)
+    dicts = []
+    for section in sections:
+        splited_dic = split_sections(section)
+        dicts.append(splited_dic)
+    return pd.DataFrame(dicts)
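A short usage sketch for the parser above (illustrative only; "pubmed_export.txt" is a hypothetical PubMed-format text export whose records start with a numbered header and are separated by blank lines, which is what the regular expression assumes):

from OpenAITools.ReviewPaperTools import GetSummaryDf

summary_df = GetSummaryDf("pubmed_export.txt")  # hypothetical input file
print(summary_df.columns.tolist())              # e.g. PublishInfo, Title, AuthorName, ...
print(summary_df.head())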
OpenAITools/scrapeThisData.py
ADDED
@@ -0,0 +1,237 @@
+from selenium import webdriver
+from selenium.webdriver.support.ui import Select
+from selenium.webdriver.common.by import By
+
+import requests
+from bs4 import BeautifulSoup
+import re
+
+import os
+import time
+
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.action_chains import ActionChains
+import chromedriver_autoinstaller
+
+class ScrapeThatData:
+
+    def __init__(self, time_threshold=10):
+
+        try:
+            chrome_options = webdriver.ChromeOptions()
+            chrome_options.add_argument('--no-sandbox')
+            self.driver = webdriver.Chrome(options=chrome_options)
+
+        except:
+            chromedriver_autoinstaller.install()
+            chrome_options = webdriver.ChromeOptions()
+            chrome_options.add_argument('--no-sandbox')
+            self.driver = webdriver.Chrome(options=chrome_options)
+
+
+
+        self.wait = WebDriverWait(self.driver, time_threshold)
+        self.attribute_dict = {'status': 1, 'conditions': 2, 'interventions': 3, 'study type': 4,
+                               'phase': 5, 'sponsor': 6, 'funder type': 7, 'study design': 8,
+                               'outcome measures': 9, 'number enrolled': 10, 'sex': 11, 'age': 12,
+                               'nct number': 13, 'other ids': 14, 'title acronym': 15, 'study start': 16,
+                               'primary completion': 17, 'study completion': 18, 'first posted': 19,
+                               'last update posted': 20, 'results first posted': 21, 'locations': 22, 'study documents': 23}
+
+        self.status_dict = {'not yet recruiting': 'notYetRecrCB',
+                            'recruiting': 'recruitingCB',
+                            'enrolling by invitation': 'enrollingByInvCB',
+                            'active, not recruiting': 'activeCB',
+                            'suspended': 'suspendedCB',
+                            'terminated': 'terminatedCB',
+                            'completed': 'completedCB',
+                            'withdrawn': 'withdrawnCB',
+                            'unknown status': 'unknownCB'}
+
+    def clicking_show_hide_cols(self, driver):
+        columns = driver.find_element(By.XPATH, '//*[@id="theDataTable_wrapper"]/div[3]/button')
+        action_chain = ActionChains(driver)
+        action_chain.move_to_element(columns).click()
+        action_chain.perform()
+
+    def select_attributes_to_show(self, listed_attributes, attribute_dict):
+        ll = [value.lower() for value in listed_attributes if value.lower() in ['status', 'conditions', 'interventions', 'locations']]
+        if ll:
+            to_show = [value.lower() for value in listed_attributes if value.lower() not in ll]
+            to_hide = [value for value in ['status', 'conditions', 'interventions', 'locations'] if value not in ll]
+            to_click = to_hide + to_show
+            for att in to_click:
+                self.clicking_show_hide_cols(self.driver)
+                time.sleep(1)
+                self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button[' + str(attribute_dict[att]) + ']'))).click()
+                time.sleep(1)
+        else:
+            for att in listed_attributes:
+                self.clicking_show_hide_cols(self.driver)
+                time.sleep(1)
+                self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="theDataTable_wrapper"]/div[3]/div[2]/button[' + str(attribute_dict[att.lower()]) + ']'))).click()
+                time.sleep(1)
+
+    def select_by_status(self, listed_states, status_dict):
+        if listed_states:
+            for status in listed_states:
+                self.driver.find_element(By.ID, status_dict[status.lower()]).click()
+
+            self.driver.find_element(By.XPATH, '//*[@id="FiltersBody"]/div[1]/input[1]').click()
+            time.sleep(3)
+
+
+        select = Select(self.driver.find_element(By.NAME, 'theDataTable_length'))
+        select.select_by_value('100')
+
+    def collect_data_search_page(self, l_ordered, amount_of_data=None):
+
+        class_name = ''
+        page_index = 1
+
+        elements = [l_ordered]
+
+        while 'disabled' not in class_name:
+
+
+
+            time.sleep(10)
+
+            print('Getting data from page {}'.format(page_index))
+
+            # Counting how many rows of the table appear
+            table = self.driver.find_element(By.ID, 'theDataTable')
+            row_count = len(table.find_elements(By.TAG_NAME, "tr"))
+
+            # Looping table page
+            for index in range(1, row_count):
+                row = []
+                if 'status' in l_ordered:
+                    self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#theDataTable > tbody > tr:nth-child(' + str(index) + ') > td:nth-child(3)')))
+                    status_element = self.driver.find_element(By.CSS_SELECTOR, '#theDataTable > tbody > tr:nth-child(' + str(index) + ') > td:nth-child(3) > span')
+                    row.append(status_element.text.strip())
+                    for i, val in enumerate(l_ordered):
+                        if val == 'status':
+                            continue
+
+                        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#theDataTable > tbody > tr:nth-child(' + str(index) + ') > td:nth-child(' + str(4 + i) + ')')))
+                        element = self.driver.find_element(By.CSS_SELECTOR, '#theDataTable > tbody > tr:nth-child(' + str(index) + ') > td:nth-child(' + str(4 + i) + ')')
+                        try:
+                            row.append(element.text.strip())
+                        except:
+                            print(i, element)
+                else:
+                    for i, val in enumerate(l_ordered):
+                        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#theDataTable > tbody > tr:nth-child(' + str(index) + ') > td:nth-child(' + str(3 + i) + ')')))
+                        element = self.driver.find_element(By.CSS_SELECTOR, '#theDataTable > tbody > tr:nth-child(' + str(index) + ') > td:nth-child(' + str(3 + i) + ')')
+                        try:
+                            row.append(element.text.strip())
+                        except:
+                            print(i, element)
+                elements.append(row)
+
+
+
+
+            # Getting next page button
+            next_page = self.driver.find_element(By.ID, "theDataTable_next")
+
+            # Getting the class attribute of the next page button
+            class_name = next_page.get_attribute('class')
+
+            # Going to the next page
+            next_page.click()
+            page_index += 1
+
+            if amount_of_data:
+                if len(elements) >= amount_of_data or row_count < amount_of_data:
+                    break
+                else:
+                    continue
+
+        return elements
+
+    def get_criteria(self, NCTnumber):
+
+        url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
+        ClinicalTrialpage = requests.get(url)
+        soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
+
+        wrapping_crit_class = soup.find_all("div", {"class": "tr-indent2"})
+        list_elements = wrapping_crit_class[1].find_all(re.compile("(ul|ol)"))
+        inclusion, exclusion = ('', '')
+
+
+        if not list_elements:
+            print("WARNING: Study number " + NCTnumber + " doesn't have eligibility criteria or HTML tag format is not a list")
+        else:
+
+            if len(list_elements) == 1:
+                try:
+                    if wrapping_crit_class[1].find(text='Inclusion Criteria:'):
+                        inclusion = list_elements[0].find_all("li")
+
+                    elif wrapping_crit_class[1].find(text='Exclusion Criteria:'):
+                        exclusion = list_elements[0].find_all("li")
+                except:
+                    print('criteria doesnt exist')
+            else:
+                inclusion = list_elements[0].find_all("li")
+                exclusion = list_elements[1].find_all("li")
+
+
+        inclusion = ' '.join([t.text.strip() for t in inclusion])
+        exclusion = ' '.join([t.text.strip() for t in exclusion])
+
+        return (inclusion, exclusion)
+
+    # function that gets number of patients enrolled in a study
+    def get_enrollment(self, NCTnumber):
+        url = 'https://clinicaltrials.gov/ct2/show/' + NCTnumber
+        ClinicalTrialpage = requests.get(url)
+        soup = BeautifulSoup(ClinicalTrialpage.text, 'html.parser')
+        enrollment = ''
+        wrapping_enrol_class = soup.find_all('td', {'headers': 'studyInfoColData', 'style': "padding-left:1em"})
+        if not wrapping_enrol_class:
+            print('WARNING: Number of Participants in Study number ' + NCTnumber + ' is unavailable')
+        else:
+            enrollment = wrapping_enrol_class[1]
+            enrollment = enrollment.text.split()[0]
+            if enrollment.isdigit() == False:
+                print('WARNING: Number of Participants in Study number ' + NCTnumber + ' is unavailable')
+            else:
+                return (enrollment)
+
+
+
+    def __call__(self, condition, listed_attributes, listed_states, amount_of_data):
+
+        self.driver.get('https://clinicaltrials.gov/ct2/results?cond=' + condition + '&rank=1&view=record#rowId0')
+        self.select_attributes_to_show(listed_attributes, self.attribute_dict)
+
+        try:
+            self.select_by_status(listed_states, self.status_dict)
+        except:
+            print('select by status is a problem')
+        n = []
+        for i in listed_attributes:
+            n.append(self.attribute_dict[i.lower()])
+        attribute_ordered = [list(self.attribute_dict.keys())[list(self.attribute_dict.values()).index(i)] for i in sorted(n)]
+
+        search_data = self.collect_data_search_page(attribute_ordered, amount_of_data=amount_of_data)
+        nct_numbers = [e[search_data[0].index('nct number')] for e in search_data[1:]]
+        search_data[0].extend(['inclusion', 'exclusion', 'enrollment'])
+        for index, nct in enumerate(nct_numbers):
+            if index % 100 == 0 and index != 0:
+                print("Collected Data from {} Studies: ".format(index))
+
+            inc, exc = self.get_criteria(nct)
+            enrol = self.get_enrollment(nct)
+            search_data[index + 1].extend([inc, exc, enrol])
+        return search_data
+        # except:
+        #     print('no data available with the specified status')
+
+
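A usage sketch for the scraper class above (illustrative only; it requires a local Chrome/chromedriver install, and it targets the legacy ct2 search UI, so the hard-coded selectors may no longer resolve on the current clinicaltrials.gov site):

from OpenAITools.scrapeThisData import ScrapeThatData

scraper = ScrapeThatData(time_threshold=10)
rows = scraper("glioblastoma",
               ["nct number", "status", "conditions"],  # listed_attributes
               ["recruiting"],                          # listed_states
               50)                                      # amount_of_data
header, records = rows[0], rows[1:]  # first element is the ordered header row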
app.py
CHANGED
@@ -1,63 +1,66 @@
 import gradio as gr
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
+import os
+from llama_index.core.query_engine import NLSQLTableQueryEngine
+import pickle
+import pandas as pd
+import sqlalchemy as sa
+#from llama_index.llms.replicate import Replicate
+from llama_index.llms.groq import Groq
+from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
+from llama_index.core import SQLDatabase
+import requests
+import re
+from llama_index.llms.openai import OpenAI
+from OpenAITools.FetchTools import fetch_clinical_trials, fetch_clinical_trials_jp
 import gradio as gr
+
+
+def custom_completion_to_prompt(completion: str) -> str:
+    return completion_to_prompt(
+        completion, system_prompt=(
+            "You are a sophisticated AI model designed to assist users in finding relevant clinical trials efficiently. **Accurate Retrieval**: Utilize the RAG mechanism to retrieve the most relevant and current clinical trial information from reliable databases. Ensure the information is accurate and cite the sources when necessary. Make decisions based solely on the information provided by the retriever."
+        ),
+    )
+
+def RetriviralClinicalTrrial_multi(message, history, cancer_name):
+    df = fetch_clinical_trials(cancer_name)
+    engine = sa.create_engine("sqlite:///:memory:")
+    sql_database = df.to_sql(name='clinical_study', con=engine, if_exists='replace', index=False)
+    database = SQLDatabase(engine)
+    # The replicate endpoint
+    LLAMA3_8B = "Llama3-8b-8192"
+    LLAMA3_70b = "Llama3-70b-8192"
+    Mixtral = "mixtral-8x7b-32768"
+
+    '''llm = OpenAI(
+        model='gpt-3.5-turbo',
+        temperature=0.01,
+        context_window=4096,
+        completion_to_prompt=custom_completion_to_prompt,
+        messages_to_prompt=messages_to_prompt,)'''
+
+    llm = Groq(
+        model=LLAMA3_70b,
+        temperature=0.01,
+        context_window=4096,
+        completion_to_prompt=custom_completion_to_prompt,
+        messages_to_prompt=messages_to_prompt,
+    )
+    query_engine = NLSQLTableQueryEngine(
+        sql_database=database,
+        tables=["clinical_study"], llm=llm)
+    response = query_engine.query(message)
+    return response.response
+
+with gr.Blocks() as demo:
+    cancer_name = gr.Textbox("lung cancer", label="cancer name")
+
+    gr.ChatInterface(
+        RetriviralClinicalTrrial_multi, additional_inputs=[cancer_name],
+        title="Assistant for Clinical Trial Search",
+        description='リアルタイムで日本で行われているClinical trialの情報を入手して治験を探すお手伝いをします。対象となるがんの名前を上に入力して、まずNCTIDを聞いてください。その後にそれぞれのNCTIDのを聴いてください。その後興味のある内容 Title,Cancer,Summary,Japanes Locations,Eligibility Criteriaなどを聞いてみてください(スペルの打ち間違いに注意)例えば「NCT04270591のTitleを教えて』のように尋ねてください',
+        theme="soft",
+        examples=[["NCTIDを全て教えて"], ["のTitleを教えて"], ["の対象Cancerを教えて"], ["のSummaryを教えて"], ["のJapanes Locationsを教えて"], ["のEligibility Criteriaを教えて"], ["のPrimary Completion Dateを教えて"]]
+    )

 if __name__ == "__main__":
     demo.launch()
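As a rough illustration of what NLSQLTableQueryEngine is asked to do here, the same in-memory table can be queried with plain SQL (a sketch under the assumption that fetch_clinical_trials returns the DataFrame defined in FetchTools.py; the Groq call in the app itself additionally presumes an API key is available to the Space, e.g. via a GROQ_API_KEY secret):

import sqlalchemy as sa
from OpenAITools.FetchTools import fetch_clinical_trials

df = fetch_clinical_trials("lung cancer")
engine = sa.create_engine("sqlite:///:memory:")
df.to_sql(name="clinical_study", con=engine, if_exists="replace", index=False)

# The query engine translates a natural-language question into SQL along these lines:
with engine.connect() as conn:
    for row in conn.execute(sa.text("SELECT NCTID, Title FROM clinical_study LIMIT 5")):
        print(row)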
requirements.txt
CHANGED
@@ -1 +1,72 @@
+aiohttp==3.9.5
+aiosignal==1.3.1
+bio==1.7.1
+biopython==1.83
+biothings-client==0.3.1
+dataclasses-json==0.6.6
+Deprecated==1.2.14
+dirtyjson==1.0.8
+diskcache==5.6.3
+distro==1.9.0
+frozenlist==1.4.1
+gprofiler-official==1.0.0
+greenlet==3.0.3
+hpack==4.0.0
+jsonpatch==1.33
+langchain==0.2.2
+langchain-community==0.2.4
+langchain-core==0.2.4
+langchain-experimental==0.0.60
+langchain-openai==0.1.8
+langchain-text-splitters==0.2.1
+langsmith==0.1.71
+llama-index==0.10.43
+llama-index-agent-openai==0.2.7
+llama-index-cli==0.1.12
+llama-index-core==0.10.43
+llama-index-embeddings-openai==0.1.10
+llama-index-indices-managed-llama-cloud==0.1.6
+llama-index-legacy==0.9.48
+llama-index-llms-groq==0.1.4
+llama-index-llms-llama-cpp==0.1.3
+llama-index-llms-openai==0.1.22
+llama-index-llms-openai-like==0.1.3
+llama-index-llms-replicate==0.1.3
+llama-index-multi-modal-llms-openai==0.1.6
+llama-index-program-openai==0.1.6
+llama-index-question-gen-openai==0.1.3
+llama-index-readers-file==0.1.23
+llama-index-readers-llama-parse==0.1.4
+llama-parse==0.4.4
+llama_cpp_python==0.2.77
+llamaindex-py-client==0.1.19
+marshmallow==3.21.2
+multidict==6.0.5
+munkres==1.1.4
+mygene==3.2.2
+mypy-extensions==1.0.0
+natsort==8.4.0
+networkx==3.3
+nltk
+openai
+packaging==23.2
+pooch==1.8.1
+pypdf==4.2.0
+pytrials==1.0.0
+regex==2024.5.15
+replicate==0.26.0
+safetensors
+setuptools==70.0.0
+SQLAlchemy==2.0.30
+striprtf==0.0.26
+tenacity==8.3.0
+tiktoken==0.7.0
+tokenizers==0.19.1
+transformers==4.41.2
+typer==0.12.3
+typer-slim==0.12.3
+typing-inspect==0.9.0
+wheel==0.43.0
+wikipedia==1.4.0
+wrapt==1.16.0
+yarl==1.9.4