import codecs
import html
import re
from dataclasses import dataclass
from typing import List

PLOT_ENCODINGS = ["image", "html", "df"]


@dataclass
class PlotData:
    """
    Data to plot.

    Args:
        data: the data to plot:
            - a base64 encoded PNG if `encoding` is `image`.
            - a string in HTML if `encoding` is `html`.
            - a path to a parquet file if `encoding` is `df`.
        encoding: the encoding of the data, one of PLOT_ENCODINGS.

    Raises:
        AssertionError: if `encoding` is not one of PLOT_ENCODINGS.
    """

    data: str
    encoding: str

    def __post_init__(self):
        # Raised explicitly rather than through `assert` so the check still
        # runs under `python -O`; the exception type is unchanged.
        if self.encoding not in PLOT_ENCODINGS:
            raise AssertionError(f"Unknown plot encoding {self.encoding}")


def get_line_separator_html():
    """Return an HTML snippet used as a horizontal separator line."""
    # NOTE(review): the markup inside this literal was stripped from the copy
    # of the file under review. This thin full-width rule is a reconstruction;
    # confirm the exact markup against the upstream file.
    return (
        "<div style='height: 1px; width: 100%; margin: 1em 0; "
        "background-color: white; background-color: var(--text);'></div>"
    )


def _ends_with_partial_utf8(data: bytes) -> bool:
    """Return True if `data` decodes cleanly except for a truncated tail."""
    try:
        # With final=False an incomplete trailing sequence is buffered by the
        # decoder instead of raising; genuinely invalid bytes still raise.
        codecs.getincrementaldecoder("utf-8")().decode(data, final=False)
    except UnicodeDecodeError:
        return False
    return True


def decode_bytes(chunks: List[bytes]) -> List[str]:
    """Decodes bytes to string

    Chunks are decoded as UTF-8 one at a time. A chunk that ends in the
    middle of a multi-byte character is held back and prepended to the
    following chunk(s) until the character is complete.

    Args:
        chunks: byte chunks

    Returns:
        list of decoded strings. Held-back chunks produce no entry of their
        own, and whatever is left in the buffer after the last chunk is
        decoded and appended if possible. When nothing is left this appends
        an empty string, which is kept for backward compatibility.
    """
    decoded_tokens = []
    buffer = b""

    for chunk in chunks:
        combined = buffer + chunk
        try:
            decoded_tokens.append(combined.decode("utf-8"))
        except UnicodeDecodeError:
            if _ends_with_partial_utf8(combined):
                # Only the tail is incomplete: keep everything so characters
                # split over three or more chunks can still be completed.
                buffer = combined
            else:
                # The pending bytes are invalid, not merely truncated: drop
                # them and retry from the current chunk so one bad chunk
                # cannot block all later decoding.
                buffer = chunk
        else:
            buffer = b""

    # Best effort for leftovers: undecodable trailing bytes are dropped.
    try:
        decoded_tokens.append(buffer.decode("utf-8"))
    except UnicodeDecodeError:
        pass

    return decoded_tokens


def format_for_markdown_visualization(text: str) -> str:
    """
    Convert newlines to <br /> tags, except for those inside code blocks.
    This is needed because the markdown_table_cell_type() function does not
    convert newlines to <br /> tags, so we have to do it ourselves.

    The break tags are only an intermediate marker: surrounding whitespace of
    each non-code part is stripped while they are in place, then they are
    turned back into newlines and the whole text is HTML-escaped.

    This function is rather simple and may fail on text that uses `
    in some other context than marking code cells or uses ` within
    the code itself (as this function).

    Args:
        text: the raw text to prepare.

    Returns:
        the HTML-escaped text.
    """
    # NOTE(review): the tag literal was stripped from the copy of the file
    # under review; "<br />" is a reconstruction. Confirm against upstream.
    line_break = "<br />"

    code_block_regex = r"(```.*?```|``.*?``)"
    parts = re.split(code_block_regex, text, flags=re.DOTALL)
    # Only substitute for text outside matched code blocks
    parts = [
        part if "`" in part else part.replace("\n", line_break).strip()
        for part in parts
    ]
    text = "".join(parts)

    # Restore newlines around code blocks, needed for correct rendering
    for fence in ["```", "``", "`"]:
        text = text.replace(f"{line_break}{fence}", f"\n{fence}")
        text = text.replace(f"{fence}{line_break}", f"{fence}\n")
    return html.escape(text.replace(line_break, "\n"))


def list_to_markdown_representation(
    tokens: List[str], masks: List[bool], pad_token: int, num_chars: int = 65
):
    """
    Creates a markdown representation string from a list of tokens,
    with HTML line breaks after 'num_chars' characters.
    Masked tokens will be emphasized in HTML representation.

    Args:
        tokens: the tokens to show; each one is HTML-escaped.
        masks: one flag per token; flagged tokens are wrapped in `***`.
        pad_token: tokens equal to this value (and not masked) are dimmed.
            NOTE(review): annotated as int but compared against the string
            tokens; confirm the intended type with the callers.
        num_chars: maximum length of the comma-joined raw tokens on one line
            before a line break is inserted.

    Returns:
        the representation, wrapped in square brackets and newlines.
    """
    lines: List[str] = []
    current: List[str] = []
    # Length of ", ".join(<raw tokens on the current line>), tracked
    # incrementally instead of re-joining the line for every token.
    current_len = 0

    for token, mask in zip(tokens, masks):
        # `current` must be non-empty: a first token longer than num_chars
        # would otherwise flush an empty line and add a stray leading break.
        if current and len(token) + current_len > num_chars:
            lines.append(", ".join(current))
            current = []
            current_len = 0

        current_len += len(token) + (2 if current else 0)

        token_formatted = html.escape(token)
        if mask:
            token_formatted = f"""***{token_formatted}***"""
        elif token == pad_token:
            # NOTE(review): the wrapping markup was stripped from the copy of
            # the file under review, leaving this branch a no-op. The dimming
            # span is a reconstruction; confirm against upstream.
            token_formatted = (
                f"""<span style="color: rgba(70, 70, 70, 0.5);">"""
                f"""{token_formatted}</span>"""
            )
        current.append(token_formatted)

    if current:  # add any remaining items
        lines.append(", ".join(current))

    list_representation = "\n[" + "<br>".join(lines) + "]\n"
    return list_representation