Spaces:
Running
Running
import re | |
import pandas as pd | |
def parse_text_file(text):
    """Split *text* into its numbered records.

    A record starts at a marker made of one or more digits followed by a
    period (``1.``, ``2.``, ...). The marker is recognized either after
    three consecutive newlines or at the very beginning of the text
    (optionally preceded by whitespace).

    Args:
        text: Full contents of the input file as a single string.

    Returns:
        A list of record bodies with the numeric marker removed and
        surrounding whitespace stripped. Anything that precedes the first
        marker is discarded. An empty list is returned when no marker is
        found.
    """
    # Also accepting the marker at the start of the text keeps the first
    # record when the input does not begin with three newlines; a plain
    # r'\n\n\n\d+\.' split would drop that record together with the
    # leading piece.
    pattern = re.compile(r'(?:\A\s*|\n\n\n)\d+\.')
    # The first piece is whatever precedes the first marker (normally an
    # empty string), so it is skipped.
    pieces = pattern.split(text)[1:]
    return [piece.strip() for piece in pieces]
def split_sections(text):
    """Map the blank-line-separated chunks of one record to named fields.

    The record is split on blank lines (``'\\n\\n'``); empty chunks are
    dropped and the remaining chunks are assigned to field names purely by
    position. Which field names are used depends on how many chunks the
    record has (5 to 8).

    Records with an unexpected number of chunks are handled on a
    best-effort basis instead of failing:

    * fewer than 5 chunks: the 5-field layout is used and the fields that
      have no chunk are set to ``""``;
    * more than 8 chunks: the 8-field layout is used and the surplus
      chunks are ignored.

    Args:
        text: The text of a single record.

    Returns:
        A dict mapping field name to the stripped chunk text.
    """
    # Field layout for each supported chunk count.
    layouts = {
        8: ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI', 'COI'],
        7: ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI'],
        6: ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'DOI'],
        5: ['PublishInfo', 'Title', 'AuthorName', 'Abstract', 'DOI'],
    }
    shortest, longest = min(layouts), max(layouts)

    contents = [chunk.strip() for chunk in text.split('\n\n') if chunk.strip()]

    # Clamp the count so that a record with too few or too many chunks
    # still selects a layout; previously `keys` was left unassigned and the
    # function raised UnboundLocalError for such records.
    keys = layouts[min(max(len(contents), shortest), longest)]

    # Fields beyond the available chunks are filled with an empty string.
    return {key: contents[i] if i < len(contents) else "" for i, key in enumerate(keys)}
def GetSummaryDf(textdir):
    """Build a DataFrame with one row per record found in a text file.

    Args:
        textdir: Path of the text file to read; it is opened as UTF-8.

    Returns:
        A pandas DataFrame built from the per-record dicts produced by
        ``split_sections`` for every record that ``parse_text_file``
        extracts from the file.
    """
    with open(textdir, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # One dict per record; pandas turns the list of dicts into rows.
    rows = [split_sections(record) for record in parse_text_file(raw_text)]
    return pd.DataFrame(rows)