Spaces:
Runtime error
Runtime error
keyword annotation module
Browse files- text_annotator.py +45 -0
text_annotator.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
def generate_annotated_text(text, keyw_list):
    '''
    Build the source text of an ``annotated_text(...)`` call that marks keywords.

    Every literal occurrence of each keyword in ``text`` becomes a
    ``("keyword", "")`` tuple; the text between occurrences becomes plain
    string arguments. Keywords are matched literally (regex metacharacters
    such as ``+`` or ``.`` have no special meaning). When matches overlap,
    the leftmost one wins and, for matches starting at the same position,
    the longest one wins. Empty keywords are ignored.

    Annotated keywords separated by exactly one space, or by nothing at all,
    are merged into a single tuple joined by one space.

    :param text: str, the text to annotate
    :param keyw_list: list of str, the keywords to mark
    :return: str, the text of an ``annotated_text(...)`` call

    e.g. text      = 'I like an apple. Do you like apples too?'
         keyw_list = ["like", "apple"]
         returns   = annotated_text("I ", ("like", ""), " an ", ("apple", ""), ". Do you ", ("like apple", ""), "s too?")

    NOTE(review): ``text`` and the keywords are inserted verbatim between
    double quotes, so a double quote inside either one makes the generated
    call text malformed -- confirm whether callers need escaping.
    '''
    def find_keyword_index(text, keyw_list):
        # Collect (keyword, start offset) for every literal occurrence.
        indices = []

        for keyword in keyw_list:
            # An empty pattern matches at every position; nothing to annotate.
            if not keyword:
                continue
            # re.escape: treat the keyword as plain text, not as a regex.
            for match in re.finditer(re.escape(keyword), text):
                indices.append((keyword, match.start()))

        return indices

    indices = find_keyword_index(text, keyw_list)

    # 1. Sort matches by start offset (ascending); on ties, longest keyword first.
    sorted_indices = sorted(indices, key=lambda x: (x[1], -len(x[0])))

    parts = ['annotated_text(']
    last_index = 0

    # 2. Split the text at each keyword position and emit the pieces.
    for word, start_idx in sorted_indices:
        # Skip matches that begin inside an already annotated span
        # (overlapping keywords, or a keyword listed more than once).
        if start_idx < last_index:
            continue
        parts.append(f'"{text[last_index:start_idx]}", ("{word}", ""), ')
        last_index = start_idx + len(word)

    # 3. Append the remaining tail and close the call.
    parts.append(f'"{text[last_index:]}"' + ')')
    output = ''.join(parts)

    # 4. Merge consecutive annotations: two tuples separated by a single
    #    space (type1) or by an empty string (type2) become one tuple whose
    #    keywords are joined with a space.
    string_cont_type1 = re.compile(r'", ""\), " ", \("')
    string_cont_type2 = re.compile(r'", ""\), "", \("')
    output = string_cont_type1.sub(' ', output)
    output = string_cont_type2.sub(' ', output)

    return output
|
45 |
+
|