import re

import pandas as pd

# Every field a record can carry, in the order the fields appear in a record.
_ALL_KEYS = (
    'PublishInfo', 'Title', 'AuthorName', 'AuthorInfo',
    'Abstract', 'Copyrights', 'DOI', 'COI',
)

# Known record layouts, selected by the number of blank-line separated
# sections found in the record.
_KEYS_BY_COUNT = {
    8: _ALL_KEYS,
    7: ('PublishInfo', 'Title', 'AuthorName', 'AuthorInfo',
        'Abstract', 'Copyrights', 'DOI'),
    6: ('PublishInfo', 'Title', 'AuthorName', 'AuthorInfo',
        'Abstract', 'DOI'),
    5: ('PublishInfo', 'Title', 'AuthorName', 'Abstract', 'DOI'),
}

# A record starts after three newlines followed by "<number>.".
_RECORD_SEPARATOR = re.compile(r'\n\n\n\d+\.')

# "<number>." prefix of a record that sits at the very start of the text,
# where no three-newline separator precedes it.
_LEADING_NUMBER = re.compile(r'\s*\d+\.')


def parse_text_file(text: str) -> list:
    """Split *text* into numbered records.

    Records are delimited by three newlines followed by ``<number>.``; the
    delimiter (including the number) is removed from each record.

    The piece before the first delimiter is normally empty or a preamble and
    is dropped.  If it begins with ``<number>.`` itself -- i.e. the text
    starts directly with the first record -- it is kept (minus the number)
    instead of being silently lost.

    Args:
        text: Full text containing zero or more numbered records.

    Returns:
        The records, each stripped of surrounding whitespace.
    """
    pieces = _RECORD_SEPARATOR.split(text)
    head, records = pieces[0], pieces[1:]

    numbered = _LEADING_NUMBER.match(head)
    if numbered:
        # The text opened with a record rather than a separator.
        records.insert(0, head[numbered.end():])

    return [record.strip() for record in records]


def split_sections(text: str) -> dict:
    """Map the blank-line separated sections of one record to field names.

    Sections are obtained by splitting on a blank line (two newlines);
    whitespace-only sections are ignored.  Records with 5, 6, 7 or 8
    sections use the matching layout in ``_KEYS_BY_COUNT`` and the result
    holds exactly that layout's keys.

    Any other section count is handled on a best-effort basis rather than
    failing: sections are assigned positionally to the full field list,
    missing fields are set to ``""``, and surplus sections are appended to
    the last field (joined by blank lines) so that no text is discarded.

    Args:
        text: The text of a single record.

    Returns:
        A dict mapping field names to section text.
    """
    contents = [part.strip() for part in text.split('\n\n') if part.strip()]

    keys = _KEYS_BY_COUNT.get(len(contents))
    if keys is not None:
        return dict(zip(keys, contents))

    # Unknown layout: fall back to the full field list.
    if len(contents) > len(_ALL_KEYS):
        last = len(_ALL_KEYS) - 1
        contents = contents[:last] + ['\n\n'.join(contents[last:])]
    padded = contents + [''] * (len(_ALL_KEYS) - len(contents))
    return dict(zip(_ALL_KEYS, padded))


def GetSummaryDf(textdir):
    """Read the text file at *textdir* and return its records as a DataFrame.

    The file is read as UTF-8, split into records with ``parse_text_file``
    and each record is converted to a row with ``split_sections``.

    Args:
        textdir: Path of the text file to read.

    Returns:
        A ``pandas.DataFrame`` with one row per record.
    """
    with open(textdir, 'r', encoding='utf-8') as f:
        content = f.read()

    rows = [split_sections(section) for section in parse_text_file(content)]
    return pd.DataFrame(rows)