# Mathieu Lai-King
# first commit
# 1a3b3aa
import re
from typing import Any, Dict, List, Optional, Tuple
# Regex fragments used to recognise sections that talk about study outcomes,
# methods, sample size or the abstract. They are written as raw strings so that
# `\s` reaches the regex engine as-is instead of being an invalid string escape.
STRICT_OUTCOME_REGEX = r'(outcome|end(\s)?point)'
OUTCOME_REGEX = r'(outcome|end(\s)?point|measure|assessment)'
METHOD_REGEX = r'(method|approach|strategy|design|protocol)'
SAMPLE_SIZE_REGEX = r'sample\s(size|number)'
ABSTRACT_REGEX = r'(abstract|summary)'

# NOTE(review): the trailing empty alternative in `(primary|secondary|main|)`
# makes the qualifier optional, but the `\s` right after it is still mandatory,
# so these patterns also match e.g. " outcome" preceded by any whitespace.
# Kept as-is to preserve current matching behaviour - confirm this is intended.
STRICT_PRIM_SEC_REGEX = rf'(primary|secondary|main|)\s([a-z]+\s)?{STRICT_OUTCOME_REGEX}'
PRIM_SEC_REGEX = rf'(primary|secondary|main|)\s([a-z]+\s)?{OUTCOME_REGEX}'

# A method keyword followed, later on the same line, by an outcome mention.
STRICT_METHOD_AND_PRIM_SEC_REGEX = rf'{METHOD_REGEX}.+{STRICT_PRIM_SEC_REGEX}'
METHOD_AND_PRIM_SEC_REGEX = rf'{METHOD_REGEX}.+{PRIM_SEC_REGEX}'

# Ordered list of checks as (priority name, target, pattern) tuples, where the
# target is "title" (match against the section title) or "content" (match
# against the section text). The position in this list is the priority index
# reported by filter_sections, so entries must not be reordered casually.
CHECK_PRIORITY = [
    ("strict_method_and_prim_sec", "title", STRICT_METHOD_AND_PRIM_SEC_REGEX),
    ("strict_prim_sec", "title", STRICT_PRIM_SEC_REGEX),
    ("prim_sec", "title", PRIM_SEC_REGEX),
    ("outcome", "title", OUTCOME_REGEX),
    ("strict_prim_sec", "content", STRICT_PRIM_SEC_REGEX),
    ("prim_sec", "content", PRIM_SEC_REGEX),
    # NOTE(review): any title matching this pattern also matches PRIM_SEC_REGEX,
    # which is already checked against titles above, so this entry looks
    # unreachable - confirm its intended position before moving it.
    ("method_and_prim_sec", "title", METHOD_AND_PRIM_SEC_REGEX),
    ("outcome", "content", OUTCOME_REGEX),
    ("method", "title", METHOD_REGEX),
    ("sample_size", "title", SAMPLE_SIZE_REGEX),
    ("abstract", "title", ABSTRACT_REGEX),
]
def filter_sections(sections_dict: Dict[str, List[str]]) -> Dict[str, Any]:
    """Keep only the sections selected by the highest-priority matching check.

    The checks in CHECK_PRIORITY are tried in order. For each check, every
    section is tested (on its title, or on its content joined with newlines)
    with a case-insensitive regex search. The first check that matches at
    least one section wins: all sections matching that check are kept and
    no further checks are tried.

    Args:
        sections_dict (Dict[str,List[str]]): dictionary containing all sections titles (keys) and their corresponding text content (values)

    Returns:
        Dict[str,Any]: dictionary containing the following keys:
            - filtered_sections: the matching sections (title -> content list); None when
              sections_dict is empty, an empty dict when no check matched
            - regex_priority_index: index in CHECK_PRIORITY of the check that matched, or None
            - regex_priority_name: name of the check that matched, or None
            - check_type: target of the check that matched ("title" or "content"), or None
    """
    result: Dict[str, Any] = {
        "filtered_sections": None,
        "regex_priority_index": None,
        "regex_priority_name": None,
        "check_type": None,
    }
    # Nothing to filter: report every field as None.
    if not sections_dict:
        return result

    result["filtered_sections"] = {}
    for index, (name, target, raw_pattern) in enumerate(CHECK_PRIORITY):
        pattern = re.compile(raw_pattern, re.IGNORECASE)
        # Collect every section that satisfies the current check.
        kept = {
            title: paragraphs
            for title, paragraphs in sections_dict.items()
            if pattern.search(title if target == "title" else '\n'.join(paragraphs))
        }
        if kept:
            # First check with at least one hit wins; lower priorities are skipped.
            result["filtered_sections"] = kept
            result["regex_priority_index"] = index
            result["regex_priority_name"] = name
            result["check_type"] = target
            return result
    return result
def filter_outcomes(entities: List[Dict[str, Any]]) -> List[Tuple[str,str]]:
    """Extract primary and secondary outcomes from a list of entities.

    Args:
        entities (List[Dict[str,Any]]): entities, each read through its
            "entity_group" key and, for outcome entities, its "word" key

    Returns:
        List[Tuple[str,str]]: one ("primary", word) or ("secondary", word) pair per
            entity whose group is "PrimaryOutcome" or "SecondaryOutcome", in input
            order; entities of any other group (including "O") are skipped
    """
    # Entity groups of interest and the label each one is reported under.
    group_labels = (
        ("PrimaryOutcome", "primary"),
        ("SecondaryOutcome", "secondary"),
    )
    collected = []
    for item in entities:
        group = item["entity_group"]
        for known_group, label in group_labels:
            if group == known_group:
                collected.append((label, item["word"]))
                break
    return collected
def get_sections_text(sections: Dict[str, List[str]]) -> Optional[str]:
    """Concatenate sections into a single text block.

    Each section contributes its title on one line, followed on the next line
    by its content items joined with single spaces.

    Args:
        sections (Dict[str,List[str]]): dictionary containing sections titles (keys) and their corresponding text content (values)

    Returns:
        Optional[str]: the concatenated text, or None when sections is empty or None
    """
    if not sections:
        return None
    # Build the text in one pass instead of growing a string with += in a loop.
    return "".join(
        title + '\n' + " ".join(content) + '\n'
        for title, content in sections.items()
    )