File size: 14,621 Bytes
fe41391 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 |
import itertools
import os
import re
from string import Template
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
from tokenizers import Encoding, Tokenizer
dirname = os.path.dirname(__file__)
css_filename = os.path.join(dirname, "visualizer-styles.css")
with open(css_filename) as f:
css = f.read()
class Annotation:
start: int
end: int
label: int
def __init__(self, start: int, end: int, label: str):
self.start = start
self.end = end
self.label = label
AnnotationList = List[Annotation]
PartialIntList = List[Optional[int]]
class CharStateKey(NamedTuple):
token_ix: Optional[int]
anno_ix: Optional[int]
class CharState:
char_ix: Optional[int]
def __init__(self, char_ix):
self.char_ix = char_ix
self.anno_ix: Optional[int] = None
self.tokens: List[int] = []
@property
def token_ix(self):
return self.tokens[0] if len(self.tokens) > 0 else None
@property
def is_multitoken(self):
"""
BPE tokenizers can output more than one token for a char
"""
return len(self.tokens) > 1
def partition_key(self) -> CharStateKey:
return CharStateKey(
token_ix=self.token_ix,
anno_ix=self.anno_ix,
)
class Aligned:
pass
class EncodingVisualizer:
"""
Build an EncodingVisualizer
Args:
tokenizer (:class:`~tokenizers.Tokenizer`):
A tokenizer instance
default_to_notebook (:obj:`bool`):
Whether to render html output in a notebook by default
annotation_converter (:obj:`Callable`, `optional`):
An optional (lambda) function that takes an annotation in any format and returns
an Annotation object
"""
unk_token_regex = re.compile("(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)
def __init__(
self,
tokenizer: Tokenizer,
default_to_notebook: bool = True,
annotation_converter: Optional[Callable[[Any], Annotation]] = None,
):
if default_to_notebook:
try:
from IPython.core.display import HTML, display
except ImportError as e:
raise Exception(
"""We couldn't import IPython utils for html display.
Are you running in a notebook?
You can also pass `default_to_notebook=False` to get back raw HTML
"""
)
self.tokenizer = tokenizer
self.default_to_notebook = default_to_notebook
self.annotation_coverter = annotation_converter
pass
def __call__(
self,
text: str,
annotations: AnnotationList = [],
default_to_notebook: Optional[bool] = None,
) -> Optional[str]:
"""
Build a visualization of the given text
Args:
text (:obj:`str`):
The text to tokenize
annotations (:obj:`List[Annotation]`, `optional`):
An optional list of annotations of the text. The can either be an annotation class
or anything else if you instantiated the visualizer with a converter function
default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
If True, will render the html in a notebook. Otherwise returns an html string.
Returns:
The HTML string if default_to_notebook is False, otherwise (default) returns None and
renders the HTML in the notebook
"""
final_default_to_notebook = self.default_to_notebook
if default_to_notebook is not None:
final_default_to_notebook = default_to_notebook
if final_default_to_notebook:
try:
from IPython.core.display import HTML, display
except ImportError as e:
raise Exception(
"""We couldn't import IPython utils for html display.
Are you running in a notebook?"""
)
if self.annotation_coverter is not None:
annotations = list(map(self.annotation_coverter, annotations))
encoding = self.tokenizer.encode(text)
html = EncodingVisualizer.__make_html(text, encoding, annotations)
if final_default_to_notebook:
display(HTML(html))
else:
return html
@staticmethod
def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
"""
Generates a color palette for all the labels in a given set of annotations
Args:
annotations (:obj:`Annotation`):
A list of annotations
Returns:
:obj:`dict`: A dictionary mapping labels to colors in HSL format
"""
if len(annotations) == 0:
return {}
labels = set(map(lambda x: x.label, annotations))
num_labels = len(labels)
h_step = int(255 / num_labels)
if h_step < 20:
h_step = 20
s = 32
l = 64
h = 10
colors = {}
for label in sorted(labels): # sort so we always get the same colors for a given set of labels
colors[label] = f"hsl({h},{s}%,{l}%"
h += h_step
return colors
@staticmethod
def consecutive_chars_to_html(
consecutive_chars_list: List[CharState],
text: str,
encoding: Encoding,
):
"""
Converts a list of "consecutive chars" into a single HTML element.
Chars are consecutive if they fall under the same word, token and annotation.
The CharState class is a named tuple with a "partition_key" method that makes it easy to
compare if two chars are consecutive.
Args:
consecutive_chars_list (:obj:`List[CharState]`):
A list of CharStates that have been grouped together
text (:obj:`str`):
The original text being processed
encoding (:class:`~tokenizers.Encoding`):
The encoding returned from the tokenizer
Returns:
:obj:`str`: The HTML span for a set of consecutive chars
"""
first = consecutive_chars_list[0]
if first.char_ix is None:
# its a special token
stoken = encoding.tokens[first.token_ix]
# special tokens are represented as empty spans. We use the data attribute and css
# magic to display it
return f'<span class="special-token" data-stoken={stoken}></span>'
# We're not in a special token so this group has a start and end.
last = consecutive_chars_list[-1]
start = first.char_ix
end = last.char_ix + 1
span_text = text[start:end]
css_classes = [] # What css classes will we apply on the resulting span
data_items = {} # What data attributes will we apply on the result span
if first.token_ix is not None:
# We can either be in a token or not (e.g. in white space)
css_classes.append("token")
if first.is_multitoken:
css_classes.append("multi-token")
if first.token_ix % 2:
# We use this to color alternating tokens.
# A token might be split by an annotation that ends in the middle of it, so this
# lets us visually indicate a consecutive token despite its possible splitting in
# the html markup
css_classes.append("odd-token")
else:
# Like above, but a different color so we can see the tokens alternate
css_classes.append("even-token")
if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
# This is a special token that is in the text. probably UNK
css_classes.append("special-token")
# TODO is this the right name for the data attribute ?
data_items["stok"] = encoding.tokens[first.token_ix]
else:
# In this case we are looking at a group/single char that is not tokenized.
# e.g. white space
css_classes.append("non-token")
css = f'''class="{' '.join(css_classes)}"'''
data = ""
for key, val in data_items.items():
data += f' data-{key}="{val}"'
return f"<span {css} {data} >{span_text}</span>"
@staticmethod
def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
current_consecutive_chars = [char_states[0]]
prev_anno_ix = char_states[0].anno_ix
spans = []
label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
cur_anno_ix = char_states[0].anno_ix
if cur_anno_ix is not None:
# If we started in an annotation make a span for it
anno = annotations[cur_anno_ix]
label = anno.label
color = label_colors_dict[label]
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
for cs in char_states[1:]:
cur_anno_ix = cs.anno_ix
if cur_anno_ix != prev_anno_ix:
# If we've transitioned in or out of an annotation
spans.append(
# Create a span from the current consecutive characters
EncodingVisualizer.consecutive_chars_to_html(
current_consecutive_chars,
text=text,
encoding=encoding,
)
)
current_consecutive_chars = [cs]
if prev_anno_ix is not None:
# if we transitioned out of an annotation close it's span
spans.append("</span>")
if cur_anno_ix is not None:
# If we entered a new annotation make a span for it
anno = annotations[cur_anno_ix]
label = anno.label
color = label_colors_dict[label]
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
prev_anno_ix = cur_anno_ix
if cs.partition_key() == current_consecutive_chars[0].partition_key():
# If the current charchter is in the same "group" as the previous one
current_consecutive_chars.append(cs)
else:
# Otherwise we make a span for the previous group
spans.append(
EncodingVisualizer.consecutive_chars_to_html(
current_consecutive_chars,
text=text,
encoding=encoding,
)
)
# An reset the consecutive_char_list to form a new group
current_consecutive_chars = [cs]
# All that's left is to fill out the final span
# TODO I think there is an edge case here where an annotation's span might not close
spans.append(
EncodingVisualizer.consecutive_chars_to_html(
current_consecutive_chars,
text=text,
encoding=encoding,
)
)
res = HTMLBody(spans) # Send the list of spans to the body of our html
return res
@staticmethod
def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
"""
Args:
text (:obj:`str`):
The raw text we want to align to
annotations (:obj:`AnnotationList`):
A (possibly empty) list of annotations
Returns:
A list of length len(text) whose entry at index i is None if there is no annotation on
charachter i or k, the index of the annotation that covers index i where k is with
respect to the list of annotations
"""
annotation_map = [None] * len(text)
for anno_ix, a in enumerate(annotations):
for i in range(a.start, a.end):
annotation_map[i] = anno_ix
return annotation_map
@staticmethod
def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
"""
For each character in the original text, we emit a tuple representing it's "state":
* which token_ix it corresponds to
* which word_ix it corresponds to
* which annotation_ix it corresponds to
Args:
text (:obj:`str`):
The raw text we want to align to
annotations (:obj:`List[Annotation]`):
A (possibly empty) list of annotations
encoding: (:class:`~tokenizers.Encoding`):
The encoding returned from the tokenizer
Returns:
:obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
it's state is
"""
annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
# Todo make this a dataclass or named tuple
char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
for token_ix, token in enumerate(encoding.tokens):
offsets = encoding.token_to_chars(token_ix)
if offsets is not None:
start, end = offsets
for i in range(start, end):
char_states[i].tokens.append(token_ix)
for char_ix, anno_ix in enumerate(annotation_map):
char_states[char_ix].anno_ix = anno_ix
return char_states
def HTMLBody(children: List[str], css_styles=css) -> str:
"""
Generates the full html with css from a list of html spans
Args:
children (:obj:`List[str]`):
A list of strings, assumed to be html elements
css_styles (:obj:`str`, `optional`):
Optional alternative implementation of the css
Returns:
:obj:`str`: An HTML string with style markup
"""
children_text = "".join(children)
return f"""
<html>
<head>
<style>
{css_styles}
</style>
</head>
<body>
<div class="tokenized-text" dir=auto>
{children_text}
</div>
</body>
</html>
"""
|