Add functionality to app
.gitignore
ADDED
@@ -0,0 +1,5 @@
+.vscode/
+__pycache__/
+text/
+misc/
+bert-base-uncased-finetuned-cola/
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-title:
+title: GP UQ Tester
 emoji: π
-colorFrom:
-colorTo:
+colorFrom: green
+colorTo: purple
 sdk: gradio
 sdk_version: 3.44.4
 app_file: app.py
app.py
CHANGED
@@ -1,7 +1,70 @@
 import gradio as gr
+from transformers import pipeline, set_seed, AutoTokenizer
+from uq import BertForUQSequenceClassification
 
-def greet(name):
-    return "Hello " + name + "!!"
-
-
-
+
+def predict(sentence):
+    model_path = "tombm/bert-base-uncased-finetuned-cola"
+    classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)
+    label = classifier(sentence)
+    return label
+
+
+def uncertainty(sentence):
+    model_path = "tombm/bert-base-uncased-finetuned-cola"
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = BertForUQSequenceClassification.from_pretrained(model_path)
+
+    test_input = tokenizer(sentence, return_tensors="pt")
+    model.return_gp_cov = True
+    _, gp_cov = model(**test_input)
+
+    return str(gp_cov.item())
+
+
+with gr.Blocks() as demo:
+    set_seed(12)
+    intro_str = """The *cola* dataset focuses on determining whether sentences are grammatically correct.
+    Firstly, let's see how our finetuned model classifies two sentences,
+    the first of which is correct (i.e. valid) and the second is not (i.e. invalid):"""
+    gr.Markdown(value=intro_str)
+
+    gr.Interface(
+        fn=predict,
+        inputs=gr.Textbox(value="Good morning.", label="Input"),
+        outputs="label",
+    )
+    gr.Interface(
+        fn=predict,
+        inputs=gr.Textbox(
+            value="This sentence is sentence, this is a correct sentence!",
+            label="Input",
+        ),
+        outputs="label",
+    )
+
+    explain_str = """As we can see, our model correctly classifies the first sentence, but misclassifies the second.
+    Let's now inspect the uncertainties associated with each prediction generated by our GP head:"""
+    gr.Markdown(value=explain_str)
+
+    gr.Interface(
+        fn=uncertainty,
+        inputs=gr.Textbox(value="Good morning.", label="Input"),
+        outputs="text",
+    )  # should have low uncertainty
+    gr.Interface(
+        fn=uncertainty,
+        inputs=gr.Textbox(
+            value="This sentence is sentence, this is a correct sentence!",
+            label="Input",
+        ),
+        outputs="text",
+    )  # should have high uncertainty
+
+    final_str = """We can see here that the variance for the misclassified example is much higher than for the correctly
+    classified example. This is great, as now we have some indication of when our model might be uncertain!"""
+    gr.Markdown(value=final_str)
+
+demo.launch()
+# iface = gr.Interface(fn=predict, inputs="text", outputs="text")
+# iface.launch()
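The demo above drives everything through the Gradio UI. For a quick check outside the UI, the same two code paths can be exercised directly from Python. The following is an editorial sketch only (not part of the commit): it assumes transformers and torch are installed, that this Space's uq.py and gp.py are on the Python path, and that the tombm/bert-base-uncased-finetuned-cola checkpoint referenced above can be downloaded.

# Editor's sketch: mirror predict() and uncertainty() without launching Gradio.
from transformers import pipeline, AutoTokenizer
from uq import BertForUQSequenceClassification

model_path = "tombm/bert-base-uncased-finetuned-cola"
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = BertForUQSequenceClassification.from_pretrained(model_path)
model.return_gp_cov = True  # ask the GP head for its predictive covariance as well

for sentence in ["Good morning.", "This sentence is sentence, this is a correct sentence!"]:
    label = classifier(sentence)[0]  # e.g. {"label": ..., "score": ...}
    _, gp_cov = model(**tokenizer(sentence, return_tensors="pt"))
    print(sentence, label, gp_cov.item())  # the second sentence should show the larger variance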
gp.py
ADDED
@@ -0,0 +1,132 @@
+# Code for GP final layer adapted from this great repo:
+# https://github.com/kimjeyoung/SNGP-BERT-Pytorch .
+# We simplify things here a bit by removing the spectral
+# normalisation as the authors of the Plex paper say that this
+# isn't strictly necessary, so we just have a GP classification head on the model.
+
+import torch
+import math
+import copy
+from torch import nn
+
+
+def RandomFeatureLinear(i_dim, o_dim, bias=True, require_grad=False):
+    m = nn.Linear(i_dim, o_dim, bias)
+    nn.init.normal_(m.weight, mean=0.0, std=0.05)
+    m.weight.requires_grad = require_grad  # Freeze weights
+    if bias:
+        nn.init.uniform_(m.bias, a=0.0, b=2.0 * math.pi)  # Freeze bias
+        m.bias.requires_grad = require_grad
+    return m
+
+
+class GPClassificationHead(nn.Module):
+    def __init__(
+        self,
+        hidden_size=768,
+        gp_kernel_scale=1.0,
+        num_inducing=1024,
+        gp_output_bias=0.0,
+        layer_norm_eps=1e-12,
+        scale_random_features=True,
+        normalize_input=True,
+        gp_cov_momentum=0.999,
+        gp_cov_ridge_penalty=1e-3,
+        epochs=40,
+        num_classes=3,
+        device="cpu",
+    ):
+        super(GPClassificationHead, self).__init__()
+        self.final_epochs = epochs - 1
+        self.gp_cov_ridge_penalty = gp_cov_ridge_penalty
+        self.gp_cov_momentum = gp_cov_momentum
+
+        self.pooled_output_dim = hidden_size
+        self.gp_input_scale = 1.0 / math.sqrt(gp_kernel_scale)
+        self.gp_feature_scale = math.sqrt(2.0 / float(num_inducing))
+        self.gp_output_bias = gp_output_bias
+        self.scale_random_features = scale_random_features
+        self.normalize_input = normalize_input
+        self.device = device
+
+        self._gp_input_normalize_layer = torch.nn.LayerNorm(
+            hidden_size, eps=layer_norm_eps
+        )
+        self._gp_output_layer = nn.Linear(
+            num_inducing, num_classes, bias=False
+        )  # gp_output_bias set to not trainable
+        self._gp_output_bias = torch.tensor([self.gp_output_bias] * num_classes).to(
+            device
+        )
+        self._random_feature = RandomFeatureLinear(self.pooled_output_dim, num_inducing)
+
+        # Inverse covariance matrix corresponding to RFF-GP posterior
+        self.initial_precision_matrix = self.gp_cov_ridge_penalty * torch.eye(
+            num_inducing
+        ).to(device)
+        self.precision_matrix = torch.nn.Parameter(
+            copy.deepcopy(self.initial_precision_matrix), requires_grad=False
+        )
+
+    def gp_layer(self, gp_inputs, update_cov=True):
+        if self.normalize_input:
+            gp_inputs = self._gp_input_normalize_layer(gp_inputs)
+
+        gp_feature = self._random_feature(gp_inputs)
+        gp_feature = torch.cos(gp_feature)
+
+        if self.scale_random_features:
+            gp_feature = gp_feature * self.gp_input_scale
+
+        gp_output = self._gp_output_layer(gp_feature).to(
+            self.device
+        ) + self._gp_output_bias.to(self.device)
+
+        if update_cov:
+            self.update_cov(gp_feature)
+        return gp_feature, gp_output
+
+    def reset_cov(self):
+        self.precision_matrix = torch.nn.Parameter(
+            copy.deepcopy(self.initial_precision_matrix), requires_grad=False
+        )
+
+    def update_cov(self, gp_feature):
+        # https://github.com/google/edward2/blob/main/edward2/tensorflow/layers/random_feature.py#L346
+        batch_size = gp_feature.size()[0]
+        precision_matrix_minibatch = torch.matmul(gp_feature.t(), gp_feature)
+
+        # Moving average updates to precision matrix
+        precision_matrix_minibatch = precision_matrix_minibatch / batch_size
+        precision_matrix_new = (
+            self.gp_cov_momentum * self.precision_matrix
+            + (1.0 - self.gp_cov_momentum) * precision_matrix_minibatch
+        )
+
+        self.precision_matrix = torch.nn.Parameter(
+            precision_matrix_new, requires_grad=False
+        )
+
+    def compute_predictive_covariance(self, gp_feature):
+        # https://github.com/google/edward2/blob/main/edward2/tensorflow/layers/random_feature.py#L403
+        # Covariance matrix of feature coefficient
+        feature_cov_matrix = torch.linalg.inv(self.precision_matrix)
+
+        # Predictive covariance matrix for the GP
+        cov_feature_product = (
+            torch.matmul(feature_cov_matrix, gp_feature.t()) * self.gp_cov_ridge_penalty
+        )
+        gp_cov_matrix = torch.matmul(gp_feature, cov_feature_product)
+        return gp_cov_matrix
+
+    def forward(
+        self,
+        input_features,
+        return_gp_cov: bool = False,
+        update_cov: bool = True,
+    ):
+        gp_feature, gp_output = self.gp_layer(input_features, update_cov=update_cov)
+        if return_gp_cov:
+            gp_cov_matrix = self.compute_predictive_covariance(gp_feature)
+            return gp_output, gp_cov_matrix
+        return gp_output
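To see how the pieces of GPClassificationHead fit together in isolation, here is a small standalone sketch (an editorial illustration, not part of the commit; the shapes and random inputs are made up). During training-style passes the head folds each minibatch's random-feature outer products into its moving-average precision matrix; at inference time the covariance is frozen and the predictive covariance is requested explicitly, which is the path app.py's uncertainty() relies on.

# Editor's sketch of training-time vs. inference-time use of the GP head.
import torch
from gp import GPClassificationHead

head = GPClassificationHead(hidden_size=768, num_classes=2, num_inducing=512)

# Training-style passes: update_cov=True accumulates the feature outer products
# into the moving-average precision matrix.
for _ in range(10):
    pooled = torch.randn(16, 768)            # stand-in for BERT pooled outputs
    logits = head(pooled, update_cov=True)   # returns logits only

# Inference-style pass: freeze the covariance and also return the predictive covariance.
pooled = torch.randn(4, 768)
logits, gp_cov = head(pooled, return_gp_cov=True, update_cov=False)
print(logits.shape)        # torch.Size([4, 2])
print(torch.diag(gp_cov))  # per-example predictive variances (app.py calls .item() on a 1x1 matrix)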
train.py
ADDED
@@ -0,0 +1,106 @@
+# This is a heavily adapted version of this notebook:
+# https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb ,
+# where we show on a simple text classification problem how we can integrate
+# components for uncertainty quantification into large pretrained models.
+
+import evaluate
+import numpy as np
+from datasets import load_dataset
+from transformers import (
+    AutoTokenizer,
+    TrainingArguments,
+    Trainer,
+    TrainerCallback,
+)
+from uq import BertForUQSequenceClassification
+
+BATCH_SIZE = 16
+EVAL_BATCH_SIZE = 128
+DEVICE = "cpu"
+
+# cola dataset for determining whether sentences are grammatically correct
+task = "cola"
+model_checkpoint = "bert-base-uncased"
+dataset = load_dataset("glue", task)
+metric = evaluate.load("glue", task)
+
+# Load our tokenizer and tokenize our data as it streams in
+tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
+
+
+def tokenize_data(data):
+    # Will add input ID and attention mask columns to dataset
+    return tokenizer(data["sentence"], truncation=True)
+
+
+encoded_dataset = dataset.map(tokenize_data, batched=True)
+
+# Now we can load our pretrained model and introduce our uncertainty quantification component,
+# which in this case is a GP final layer without any spectral normalization of the transformer weights
+num_labels = 2
+id2label = {0: "Invalid", 1: "Valid"}
+label2id = {val: key for key, val in id2label.items()}
+model = BertForUQSequenceClassification.from_pretrained(
+    model_checkpoint, num_labels=num_labels, id2label=id2label, label2id=label2id
+)
+
+
+# Specify training arguments
+metric_name = "matthews_correlation"
+model_name = model_checkpoint.split("/")[-1]
+
+args = TrainingArguments(
+    f"{model_name}-finetuned-{task}",
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=BATCH_SIZE,
+    per_device_eval_batch_size=EVAL_BATCH_SIZE,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    load_best_model_at_end=True,
+    metric_for_best_model=metric_name,
+    push_to_hub=True,
+    use_mps_device=False,
+    no_cuda=True,
+)
+
+
+# Set up metric tracking
+def compute_metrics(eval_predictions):
+    predictions, labels = eval_predictions
+    predictions = np.argmax(predictions, axis=1)
+    return metric.compute(predictions=predictions, references=labels)
+
+
+# Finally, set up trainer for finetuning the model
+model.to(DEVICE)
+trainer = Trainer(
+    model,
+    args,
+    train_dataset=encoded_dataset["train"],
+    eval_dataset=encoded_dataset["validation"],
+    tokenizer=tokenizer,
+    compute_metrics=compute_metrics,
+)
+
+
+# Add in a callback to reset the covariance matrix after each epoch, as we only need
+# to do this once at the final epoch, so we don't double count any of the data. We
+# could use a more elegant solution, but the covariance computation is very cheap
+# so doing it ~5 times rather than once isn't a big deal.
+class ResetCovarianceCallback(TrainerCallback):
+    def __init__(self, trainer) -> None:
+        super().__init__()
+        self._trainer = trainer
+
+    def on_epoch_end(self, args, state, control, **kwargs):
+        if control.should_evaluate:
+            self._trainer.model.classifier.reset_cov()
+
+
+trainer.add_callback(ResetCovarianceCallback(trainer))
+
+trainer.train()
+
+trainer.push_to_hub()
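Note that the output directory f"{model_name}-finetuned-{task}" resolves to bert-base-uncased-finetuned-cola, so with push_to_hub=True the finetuned weights presumably land in the tombm/bert-base-uncased-finetuned-cola repo that app.py loads. A quick post-training check could look like the following editorial sketch (not part of the commit); trainer.evaluate() reuses compute_metrics above, so the returned dict includes the Matthews correlation used to select the best checkpoint.

# Editor's sketch: inspect validation metrics once trainer.train() has finished.
eval_metrics = trainer.evaluate()  # runs compute_metrics on the validation split
print(eval_metrics["eval_matthews_correlation"])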
uq.py
ADDED
@@ -0,0 +1,102 @@
+from torch import nn
+from torch.nn import CrossEntropyLoss, MSELoss
+from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel
+from gp import GPClassificationHead
+
+
+class BertForUQSequenceClassification(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = GPClassificationHead(
+            hidden_size=config.hidden_size,
+            num_classes=config.num_labels,
+            num_inducing=512,
+        )
+
+        self.return_gp_cov = False
+
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns:
+            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+                Classification (or regression if config.num_labels==1) loss.
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+
+                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+                heads.
+        """
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        if self.return_gp_cov:
+            logits, gp_cov = self.classifier(
+                pooled_output,
+                return_gp_cov=True,
+                update_cov=False,
+            )
+        else:
+            logits = self.classifier(pooled_output)
+
+        outputs = (logits,) + outputs[
+            2:
+        ]  # add hidden states and attention if they are here
+
+        if labels is not None:
+            if self.num_labels == 1:
+                # We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        if self.return_gp_cov:
+            return outputs, gp_cov
+        else:
+            return outputs  # (loss), logits, (hidden_states), (attentions)
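Finally, the two return modes of BertForUQSequenceClassification can be seen with a short editorial sketch (not part of the commit). It loads the plain bert-base-uncased backbone, so the GP head here is randomly initialised; with return_gp_cov left at False the model behaves like an ordinary sequence classifier, and with it set to True the forward pass additionally returns the GP head's predictive covariance, which is what app.py expects.

# Editor's sketch of the default vs. uncertainty-aware return values.
from transformers import AutoTokenizer
from uq import BertForUQSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForUQSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.eval()  # turn off dropout for repeatable outputs

batch = tokenizer(["Good morning."], return_tensors="pt")

outputs = model(**batch)  # default mode: (logits,) like a standard classifier
logits = outputs[0]       # shape (1, 2)

model.return_gp_cov = True
outputs, gp_cov = model(**batch)       # UQ mode: ((logits,), gp_cov)
print(outputs[0].shape, gp_cov.shape)  # torch.Size([1, 2]) torch.Size([1, 1])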