mlo0ollm committed on
Commit
0c98584
1 Parent(s): c983ef1

keyword annotation module

Browse files
Files changed (1) hide show
  1. text_annotator.py +45 -0
text_annotator.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def generate_annotated_text(text, keyw_list):
    """Return the source text of an ``annotated_text(...)`` call highlighting keywords.

    The text is split at every literal occurrence of a keyword. Plain
    stretches become string arguments and keyword occurrences become
    ``("<keyword>", "")`` tuples. Keywords that follow each other directly,
    or are separated by exactly one space, are merged into a single
    annotation whose label joins them with one space.

    :param text: str, the text to annotate
    :param keyw_list: list of str, keywords matched literally (not as regex);
        empty strings and duplicates are ignored
    :return: str, e.g. ``'annotated_text("I ", ("like", ""), " it")'``

    e.g. input1 = 'I like an apple. Do you like apples too?'
         input2 = ["like", "apple"]
         output = 'annotated_text("I ", ("like", ""), " an ", ("apple", ""), '
                  '". Do you ", ("like apple", ""), "s too?")'

    NOTE(review): the result looks like it is meant to be evaluated as Python
    by the caller - confirm. String arguments are therefore emitted as
    properly escaped double-quoted literals.
    """
    # Function-scope import keeps this block self-contained.
    import json

    def quote(value):
        # JSON string syntax is also a valid Python literal; it escapes
        # quotes, backslashes and control characters but leaves ordinary
        # text (including non-ASCII) exactly as "<text>".
        return json.dumps(value, ensure_ascii=False)

    # 1. Collect (start, keyword) for every literal occurrence.
    matches = []
    for keyword in dict.fromkeys(keyw_list or ()):
        if not keyword:
            # An empty pattern would match at every position.
            continue
        pattern = re.escape(keyword)
        matches.extend((m.start(), keyword) for m in re.finditer(pattern, text))

    # Ascending by position; at the same position prefer the longest keyword.
    matches.sort(key=lambda item: (item[0], -len(item[1])))

    # 2. Split the text using the keyword positions.
    #    `items` holds plain str segments and (label, "") annotation tuples.
    items = []
    last_index = 0
    for start_idx, word in matches:
        if start_idx < last_index:
            # Overlaps a keyword that was already emitted.
            continue
        gap = text[last_index:start_idx]
        if items and gap in ("", " "):
            # 4. Consecutive keywords: the last item is always an annotation
            #    here, so extend its label instead of starting a new one.
            items[-1] = (f"{items[-1][0]} {word}", "")
        else:
            items.append(gap)
            items.append((word, ""))
        last_index = start_idx + len(word)
    items.append(text[last_index:])

    # 3. Render the call expression.
    rendered = [
        quote(item) if isinstance(item, str) else f'({quote(item[0])}, "")'
        for item in items
    ]
    return "annotated_text(" + ", ".join(rendered) + ")"
45
+