File size: 3,327 Bytes
2d4811a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from typing import Optional

import pandas as pd
import streamlit as st
from datasets import Dataset  # type: ignore

from src.data import encode_dataset, get_collator, get_data, predict
from src.model import get_encoder, get_model, get_tokenizer
from src.subpages import Context
from src.utils import align_sample, device, explode_df

# Tokenizer checkpoint that load_context hands to _load_models_and_tokenizer
# when the model name contains "comma". The tuple lists the candidate
# checkpoints; the trailing index selects the active one (currently the first
# entry), so switching tokenizers only requires changing that index.
_TOKENIZER_NAME = (
    "xlm-roberta-base",
    "gagan3012/bert-tiny-finetuned-ner",
    "distilbert-base-german-cased",
)[0]


def _load_models_and_tokenizer(
    encoder_model_name: str,
    model_name: str,
    tokenizer_name: Optional[str],
    device: str = "cpu",
):
    """Load the sentence encoder, the model and the tokenizer in one go.

    Args:
        encoder_model_name (str): Name passed to ``get_encoder``.
        model_name (str): Name passed to ``get_model``; also used as the
            tokenizer name when ``tokenizer_name`` is falsy.
        tokenizer_name (Optional[str]): Tokenizer to load. ``None`` (or an
            empty string) falls back to ``model_name``.
        device (str): Device forwarded to ``get_encoder``. Defaults to "cpu".

    Returns:
        tuple: ``(sentence_encoder, model, tokenizer)`` in that order.
    """
    encoder = get_encoder(encoder_model_name, device=device)
    tok = get_tokenizer(tokenizer_name or model_name)

    # Models with "comma" in their name get an explicit two-label set; every
    # other model is loaded with ``labels=None``.
    if "comma" in model_name:
        label_names = "O B-COMMA".split()
    else:
        label_names = None

    loaded_model = get_model(model_name, labels=label_names)
    return encoder, loaded_model, tok


@st.cache(allow_output_mutation=True)
def load_context(
    encoder_model_name: str,
    model_name: str,
    ds_name: str,
    ds_config_name: str,
    ds_split_name: str,
    split_sample_size: int,
    randomize_sample: bool,
    **kw_args,
) -> Context:
    """Load models, data and predictions and bundle them into a ``Context``.

    The whole pipeline lives in a single function so that its result can be
    cached by Streamlit as one unit.

    Args:
        encoder_model_name (str): Name of the sentence encoder to load.
        model_name (str): Name of the NER model to load. If it contains
            "comma", the module-level ``_TOKENIZER_NAME`` is used as tokenizer;
            otherwise the tokenizer is derived from the model name.
        ds_name (str): Dataset name or path.
        ds_config_name (str): Dataset config name.
        ds_split_name (str): Dataset split name.
        split_sample_size (int): Number of examples to load from the split.
        randomize_sample (bool): Forwarded to ``get_data`` alongside the
            sample size.
        **kw_args: Accepted but never read in the body.
            NOTE(review): presumably only present so callers can pass extra
            values that influence the cache key - confirm.

    Returns:
        Context: Holds the model, tokenizer and sentence encoder, the
        prediction dataframe and its token-level variants, the tag feature
        and its label names, the dataset identifiers and the loaded split.
    """

    # Only "comma" models get an explicit tokenizer override.
    tokenizer_override = _TOKENIZER_NAME if "comma" in model_name else None
    sentence_encoder, model, tokenizer = _load_models_and_tokenizer(
        encoder_model_name=encoder_model_name,
        model_name=model_name,
        tokenizer_name=tokenizer_override,
        device=str(device),
    )
    collator = get_collator(tokenizer)

    # Fetch the requested sample and encode it for the model.
    sample: Dataset = get_data(
        ds_name,
        ds_config_name,
        ds_split_name,
        split_sample_size,
        randomize_sample,
    )
    tag_feature = sample.features["ner_tags"].feature
    encoded_sample, word_ids, ids = encode_dataset(sample, tokenizer)

    # Run the model; attach word ids and sample ids as extra columns.
    predictions = predict(encoded_sample, model, tokenizer, collator, tag_feature)
    predictions["word_ids"] = word_ids
    predictions["ids"] = ids

    # Token-level view, plus the same view without rows labelled 'IGN'.
    tokens = explode_df(predictions)
    tokens_without_ignored = tokens.query("labels != 'IGN'")

    # Row-wise alignment via align_sample, then exploded like the frame above.
    aligned_rows = predictions.apply(align_sample, axis=1).tolist()
    merged = pd.DataFrame(aligned_rows)
    tokens_merged = explode_df(merged)

    return Context(
        model=model,
        tokenizer=tokenizer,
        sentence_encoder=sentence_encoder,
        df=predictions,
        df_tokens=tokens,
        df_tokens_cleaned=tokens_without_ignored,
        df_tokens_merged=tokens_merged,
        tags=tag_feature,
        labels=tag_feature.names,
        split_sample_size=split_sample_size,
        ds_name=ds_name,
        ds_config_name=ds_config_name,
        ds_split_name=ds_split_name,
        split=sample,
    )