Spaces:
Sleeping
Sleeping
import re
from typing import Any, Dict, List, Optional, Tuple
STRICT_OUTCOME_REGEX = '(outcome|end(\s)?point)' | |
OUTCOME_REGEX = '(outcome|end(\s)?point|measure|assessment)' | |
METHOD_REGEX = '(method|approach|strategy|design|protocol)' | |
SAMPLE_SIZE_REGEX = 'sample\s(size|number)' | |
ABSTRACT_REGEX = '(abstract|summary)' | |
STRICT_PRIM_SEC_REGEX = f'(primary|secondary|main|)\s([a-z]+\s)?{STRICT_OUTCOME_REGEX}' | |
PRIM_SEC_REGEX = f'(primary|secondary|main|)\s([a-z]+\s)?{OUTCOME_REGEX}' | |
STRICT_METHOD_AND_PRIM_SEC_REGEX = f'{METHOD_REGEX}.+{STRICT_PRIM_SEC_REGEX}' | |
METHOD_AND_PRIM_SEC_REGEX = f'{METHOD_REGEX}.+{PRIM_SEC_REGEX}' | |
CHECK_PRIORITY = [ | |
("strict_method_and_prim_sec","title",STRICT_METHOD_AND_PRIM_SEC_REGEX), | |
("strict_prim_sec","title",STRICT_PRIM_SEC_REGEX), | |
("prim_sec","title",PRIM_SEC_REGEX), | |
("outcome","title",OUTCOME_REGEX), | |
("strict_prim_sec","content",STRICT_PRIM_SEC_REGEX), | |
("prim_sec","content",PRIM_SEC_REGEX), | |
("method_and_prim_sec","title",METHOD_AND_PRIM_SEC_REGEX), | |
("outcome","content",OUTCOME_REGEX), | |
("method","title",METHOD_REGEX), | |
("sample_size","title",SAMPLE_SIZE_REGEX), | |
("abstract","title",ABSTRACT_REGEX), | |
] | |
def filter_sections(sections_dict: Dict[str, List[str]]) -> Dict[str, Any] :
    """Keep only the sections selected by the first CHECK_PRIORITY rule that matches.

    The rules in CHECK_PRIORITY are tried in order. Each rule is searched,
    case-insensitively, either in every section title or in every section
    content (its list of strings joined with newlines). As soon as a rule
    matches at least one section, all sections matching that rule are kept
    and the remaining rules are ignored.

    Args:
        sections_dict (Dict[str,List[str]]): dictionary containing all sections titles (keys)
            and their corresponding text content (values)

    Returns:
        Dict[str,Any]: dictionary containing the following keys:
            - filtered_sections: the retained sections (title -> content list);
              an empty dict when no rule matched, None when the input was empty
            - regex_priority_index: index of the winning rule in CHECK_PRIORITY (None if no match)
            - regex_priority_name: name of the winning rule (None if no match)
            - check_type: "title" or "content", depending on what the winning rule searched
              (None if no match)
    """
    # Nothing to filter: every field stays unset.
    if not sections_dict:
        return {
            "filtered_sections" : None,
            "regex_priority_index" : None,
            "regex_priority_name" : None,
            "check_type" : None,
        }

    for rank, (rule_name, target, pattern) in enumerate(CHECK_PRIORITY):
        matcher = re.compile(pattern, re.IGNORECASE)
        kept = {}
        for heading, paragraphs in sections_dict.items():
            # Title rules look at the heading only; content rules look at the body text.
            haystack = heading if target == "title" else '\n'.join(paragraphs)
            if matcher.search(haystack):
                kept[heading] = paragraphs
        # The first rule that selects anything wins; lower-priority rules are skipped.
        if kept:
            return {
                "filtered_sections" : kept,
                "regex_priority_index" : rank,
                "regex_priority_name" : rule_name,
                "check_type" : target,
            }

    # No rule matched any section.
    return {
        "filtered_sections" : {},
        "regex_priority_index" : None,
        "regex_priority_name" : None,
        "check_type" : None,
    }
def filter_outcomes(entities: List[Dict[str, Any]]) -> List[Tuple[str,str]]:
    """Extract primary and secondary outcomes from a list of entities.

    Entities whose "entity_group" is "PrimaryOutcome" or "SecondaryOutcome"
    are turned into ("primary", word) / ("secondary", word) tuples, in input
    order. Entities of any other group (including "O") are dropped.
    """
    label_by_group = {
        "PrimaryOutcome": "primary",
        "SecondaryOutcome": "secondary",
    }
    return [
        (label_by_group[item["entity_group"]], item["word"])
        for item in entities
        if item["entity_group"] in label_by_group
    ]
def get_sections_text(sections: Dict[str, List[str]]) -> Optional[str]:
    """Flatten a sections dictionary into a single text string.

    Each section contributes its title on one line, followed on the next
    line by its content items joined with single spaces.

    Args:
        sections (Dict[str,List[str]]): dictionary of section titles (keys)
            and their list of text items (values)

    Returns:
        Optional[str]: the concatenated text, or None when ``sections`` is
        empty or None.
    """
    if not sections :
        return None
    # Build the text in one pass instead of repeated string concatenation.
    return "".join(
        title + '\n' + " ".join(content) + '\n'
        for title, content in sections.items()
    )