File size: 4,079 Bytes
1a3b3aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import re
from typing import Any, Dict, List, Optional, Tuple

# Building-block patterns. They are written as raw strings so that `\s` reaches
# the regex engine untouched instead of being an invalid string escape.
STRICT_OUTCOME_REGEX = r'(outcome|end(\s)?point)'
OUTCOME_REGEX = r'(outcome|end(\s)?point|measure|assessment)'

METHOD_REGEX = '(method|approach|strategy|design|protocol)'
SAMPLE_SIZE_REGEX = r'sample\s(size|number)'
ABSTRACT_REGEX = '(abstract|summary)'

# NOTE: the first group ends with an empty alternative, so the
# "primary/secondary/main" prefix may be absent. The pattern therefore also
# matches any outcome keyword that is merely preceded by a whitespace
# character (optionally with one lowercase word in between).
STRICT_PRIM_SEC_REGEX = rf'(primary|secondary|main|)\s([a-z]+\s)?{STRICT_OUTCOME_REGEX}'
PRIM_SEC_REGEX = rf'(primary|secondary|main|)\s([a-z]+\s)?{OUTCOME_REGEX}'
STRICT_METHOD_AND_PRIM_SEC_REGEX = f'{METHOD_REGEX}.+{STRICT_PRIM_SEC_REGEX}'
METHOD_AND_PRIM_SEC_REGEX = f'{METHOD_REGEX}.+{PRIM_SEC_REGEX}'

# Ordered list of checks, most specific first. Each entry is
# (priority name, what to search: "title" or "content", regex pattern).
# The position of an entry in this list is reported by filter_sections as
# "regex_priority_index", so entries must not be reordered or removed casually.
CHECK_PRIORITY = [
    ("strict_method_and_prim_sec", "title", STRICT_METHOD_AND_PRIM_SEC_REGEX),
    ("strict_prim_sec", "title", STRICT_PRIM_SEC_REGEX),
    ("prim_sec", "title", PRIM_SEC_REGEX),
    ("outcome", "title", OUTCOME_REGEX),
    ("strict_prim_sec", "content", STRICT_PRIM_SEC_REGEX),
    ("prim_sec", "content", PRIM_SEC_REGEX),
    # NOTE(review): any title matched by this pattern is also matched by the
    # earlier ("prim_sec", "title") entry, so this entry can never be the one
    # selected. It is kept so the indices of the following entries stay stable.
    ("method_and_prim_sec", "title", METHOD_AND_PRIM_SEC_REGEX),
    ("outcome", "content", OUTCOME_REGEX),
    ("method", "title", METHOD_REGEX),
    ("sample_size", "title", SAMPLE_SIZE_REGEX),
    ("abstract", "title", ABSTRACT_REGEX),
]

def filter_sections(sections_dict: Dict[str, List[str]]) -> Dict[str, Any]:
    """Keep the sections selected by the highest-priority matching check.

    The checks in CHECK_PRIORITY are tried in order. Each check searches its
    regex (case-insensitively) either in the section title or in the section
    content joined with newlines. The first check that matches at least one
    section wins: every section it matches is kept and no later check is run.

    Args:
        sections_dict (Dict[str,List[str]]): section titles (keys) mapped to
            their text content (values).

    Returns:
        Dict[str,Any]: dictionary containing the following keys:
            - filtered_sections: the kept sections (title -> content). None
              when sections_dict is empty or None, an empty dict when no
              check matched.
            - regex_priority_index: index of the winning check in CHECK_PRIORITY
            - regex_priority_name: name of the winning check
            - check_type: what the winning check searched ("title" or "content")
          The last three stay None when nothing matched.
    """
    result = {
        "filtered_sections": None,
        "regex_priority_index": None,
        "regex_priority_name": None,
        "check_type": None,
    }
    if not sections_dict:
        return result

    # Input is non-empty: from here on "filtered_sections" is always a dict.
    result["filtered_sections"] = {}
    for rank, (name, target, pattern) in enumerate(CHECK_PRIORITY):
        matcher = re.compile(pattern, re.IGNORECASE)
        search_title = target == "title"
        kept = {
            title: paragraphs
            for title, paragraphs in sections_dict.items()
            if matcher.search(title if search_title else '\n'.join(paragraphs))
        }
        if kept:
            # First check with at least one hit decides the output.
            result["check_type"] = target
            result["regex_priority_name"] = name
            result["regex_priority_index"] = rank
            result["filtered_sections"] = kept
            break
    return result


def filter_outcomes(entities: List[Dict[str, Any]]) -> List[Tuple[str,str]]:
    """Extract primary and secondary outcomes from a list of entities.

    Each entity is read through its "entity_group" and "word" keys. Entities
    whose group is "PrimaryOutcome" or "SecondaryOutcome" are returned, in
    input order, as ("primary", word) / ("secondary", word) tuples. Every
    other group (including "O") is ignored.
    """
    group_labels = {
        "PrimaryOutcome": "primary",
        "SecondaryOutcome": "secondary",
    }
    return [
        (group_labels[item["entity_group"]], item["word"])
        for item in entities
        if item["entity_group"] in group_labels
    ]

def get_sections_text(sections: Dict[str, List[str]]) -> Optional[str]:
    """Flatten sections into a single string.

    Each section contributes its title on one line, followed by its content
    items joined with single spaces on the next line; every line ends with a
    newline.

    Args:
        sections (Dict[str,List[str]]): section titles (keys) mapped to their
            text content (values).

    Returns:
        Optional[str]: the concatenated text, or None when sections is empty
        or None.
    """
    if not sections:
        return None
    # Build all pieces first and join once instead of growing a string with +=.
    return "".join(
        title + '\n' + " ".join(content) + '\n'
        for title, content in sections.items()
    )