# ClinicalTrialV3 / OpenAITools / ReviewPaperTools.py
# Satoc's picture
# Add application file
# 92df76e
import re
import pandas as pd
def parse_text_file(text):
    """Split *text* into numbered records.

    A record starts at a marker made of three consecutive newlines followed
    by one or more digits and a period (e.g. ``"\\n\\n\\n12."``). The marker
    itself is removed from the output.

    Args:
        text: The full text to split.

    Returns:
        A list of record strings with surrounding whitespace stripped.
        Everything before the first marker is discarded, so text without
        any marker yields an empty list.
    """
    # The first piece is whatever precedes the first marker (normally an
    # empty string), so it is skipped.
    pieces = re.split(r'\n\n\n\d+\.', text)
    return [piece.strip() for piece in pieces[1:]]
def split_sections(text):
    """Map the blank-line-separated blocks of one record to named fields.

    The record is split on double newlines; blocks that are empty after
    stripping are ignored. The number of remaining blocks selects which
    field names are assigned, in order:

    * 8 blocks: PublishInfo, Title, AuthorName, AuthorInfo, Abstract,
      Copyrights, DOI, COI
    * 7 blocks: the same without COI
    * 6 blocks: additionally without Copyrights
    * 5 blocks: additionally without AuthorInfo

    Any other block count falls back to the full 8-field layout on a
    best-effort, positional basis: missing fields are set to ``""`` and
    blocks beyond the eighth are ignored. (Previously such records raised
    ``UnboundLocalError`` because no field list was selected.)

    Args:
        text: The text of a single record.

    Returns:
        A dict mapping field names to the stripped block text.
    """
    full_layout = ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo',
                   'Abstract', 'Copyrights', 'DOI', 'COI']
    layouts = {
        8: full_layout,
        7: ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract',
            'Copyrights', 'DOI'],
        6: ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract',
            'DOI'],
        5: ['PublishInfo', 'Title', 'AuthorName', 'Abstract', 'DOI'],
    }

    contents = [part.strip() for part in text.split('\n\n') if part.strip()]

    # Unexpected block counts use the full layout instead of failing.
    keys = layouts.get(len(contents), full_layout)

    # Build the dict; a key without a corresponding block gets an empty
    # string (only possible on the fallback path).
    return {
        key: contents[i] if i < len(contents) else ""
        for i, key in enumerate(keys)
    }
def GetSummaryDf(textdir):
    """Read a text file and return its records as a DataFrame.

    The file is read as UTF-8, split into records with ``parse_text_file``,
    and each record is converted to a field dict with ``split_sections``.

    Args:
        textdir: Path of the text file to read (passed directly to
            ``open``).

    Returns:
        A ``pandas.DataFrame`` with one row per record, built from the
        field dicts.
    """
    with open(textdir, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    records = [split_sections(entry) for entry in parse_text_file(raw_text)]
    return pd.DataFrame(records)