Spaces:

laiking
/

outcome-switch

Sleeping

outcome-switch / outcome_switch /filter.py

Mathieu Lai-King

first commit

1a3b3aa 7 months ago

4.08 kB

	import re
	from typing import Any, Dict, List, Optional, Tuple

	# Patterns used to recognise relevant sections. They are written in lower case
	# because filter_sections compiles them with re.IGNORECASE. Raw strings are used
	# so that `\s` reaches the regex engine untouched and `|` is a real alternation
	# (an escaped `\|` would only match a literal pipe character).
	STRICT_OUTCOME_REGEX = r'(outcome|end(\s)?point)'
	OUTCOME_REGEX = r'(outcome|end(\s)?point|measure|assessment)'

	METHOD_REGEX = r'(method|approach|strategy|design|protocol)'
	SAMPLE_SIZE_REGEX = r'sample\s(size|number)'
	ABSTRACT_REGEX = r'(abstract|summary)'

	# The trailing empty alternative in `(primary|secondary|main|)` makes the
	# qualifier optional; the whitespace that follows it is still required.
	STRICT_PRIM_SEC_REGEX = rf'(primary|secondary|main|)\s([a-z]+\s)?{STRICT_OUTCOME_REGEX}'
	PRIM_SEC_REGEX = rf'(primary|secondary|main|)\s([a-z]+\s)?{OUTCOME_REGEX}'
	STRICT_METHOD_AND_PRIM_SEC_REGEX = rf'{METHOD_REGEX}.+{STRICT_PRIM_SEC_REGEX}'
	METHOD_AND_PRIM_SEC_REGEX = rf'{METHOD_REGEX}.+{PRIM_SEC_REGEX}'

	# Ordered list of checks: (priority name, what to search: "title" or "content",
	# pattern). Earlier entries win; the position in this list is reported to
	# callers as the priority index, so the order must stay stable.
	# NOTE(review): any title matching METHOD_AND_PRIM_SEC_REGEX also matches
	# PRIM_SEC_REGEX, which is checked on titles earlier, so the
	# "method_and_prim_sec" entry looks unreachable -- confirm before reordering.
	CHECK_PRIORITY = [
	    ("strict_method_and_prim_sec", "title", STRICT_METHOD_AND_PRIM_SEC_REGEX),
	    ("strict_prim_sec", "title", STRICT_PRIM_SEC_REGEX),
	    ("prim_sec", "title", PRIM_SEC_REGEX),
	    ("outcome", "title", OUTCOME_REGEX),
	    ("strict_prim_sec", "content", STRICT_PRIM_SEC_REGEX),
	    ("prim_sec", "content", PRIM_SEC_REGEX),
	    ("method_and_prim_sec", "title", METHOD_AND_PRIM_SEC_REGEX),
	    ("outcome", "content", OUTCOME_REGEX),
	    ("method", "title", METHOD_REGEX),
	    ("sample_size", "title", SAMPLE_SIZE_REGEX),
	    ("abstract", "title", ABSTRACT_REGEX),
	]

	def filter_sections(sections_dict: Dict[str, List[str]]) -> Dict[str, Any] :
	    """Keep only the sections selected by the first successful check in CHECK_PRIORITY.

	    The checks are tried in order. Each check searches either the section
	    title or the section content (its text chunks joined with newlines),
	    case-insensitively. As soon as one check matches at least one section,
	    every section matching that check is kept and later checks are skipped.

	    Args:
	        sections_dict (Dict[str,List[str]]): section titles (keys) mapped to
	            their text content (values).

	    Returns:
	        Dict[str,Any]: dictionary with the following keys:
	            - filtered_sections: the kept sections (title -> content). None
	              when sections_dict is empty, an empty dict when no check matched.
	            - regex_priority_index: position in CHECK_PRIORITY of the check
	              that matched, or None.
	            - regex_priority_name: name of the check that matched, or None.
	            - check_type: "title" or "content" depending on what the matching
	              check searched, or None.
	    """
	    result = {
	        "filtered_sections" : None,
	        "regex_priority_index" : None,
	        "regex_priority_name" : None,
	        "check_type" : None,
	    }
	    if not sections_dict:
	        return result

	    # From here on "no match" is reported as an empty dict rather than None.
	    result["filtered_sections"] = {}
	    for rank, (check_name, target, pattern) in enumerate(CHECK_PRIORITY) :
	        matcher = re.compile(pattern, re.IGNORECASE)
	        kept = {}
	        for title, chunks in sections_dict.items() :
	            searched_text = title if target == "title" else '\n'.join(chunks)
	            if matcher.search(searched_text) :
	                kept[title] = chunks
	        if kept :
	            # First check with at least one hit wins; lower priorities are ignored.
	            result["check_type"] = target
	            result["regex_priority_name"] = check_name
	            result["regex_priority_index"] = rank
	            result["filtered_sections"] = kept
	            break
	    return result


	def filter_outcomes(entities: List[Dict[str, Any]]) -> List[Tuple[str,str]]:
	    """Extract primary and secondary outcomes from a list of entities.

	    Each entity is read through its "entity_group" and "word" keys. Entities
	    whose group is "PrimaryOutcome" or "SecondaryOutcome" yield a
	    ("primary", word) or ("secondary", word) pair respectively, in input
	    order; entities of any other group (including "O") are dropped.
	    """
	    group_labels = {
	        "PrimaryOutcome": "primary",
	        "SecondaryOutcome": "secondary",
	    }
	    return [
	        (group_labels[item["entity_group"]], item["word"])
	        for item in entities
	        if item["entity_group"] in group_labels
	    ]

	def get_sections_text(sections: Dict[str, List[str]]) -> Optional[str]:
	    """Flatten sections into a single string.

	    Each section contributes its title on one line, followed by its content
	    chunks joined with single spaces on the next line.

	    Args:
	        sections (Dict[str,List[str]]): section titles (keys) mapped to their
	            text content (values).

	    Returns:
	        Optional[str]: the concatenated text, or None when sections is empty
	        or None.
	    """
	    if not sections :
	        return None
	    # Build all pieces first and join once instead of growing a string in a loop.
	    return "".join(
	        title + '\n' + " ".join(content) + '\n'
	        for title, content in sections.items()
	    )