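"""Gradio demo for BiasProbe.

Type a comma-separated list of words to see how Mistral-7B-Instruct associates
them with positive and negative emotions: a trained binary probe scores pairs of
hidden states, and each word's win rate is the fraction of pairwise comparisons
it wins. Assumes a trained probe checkpoint at ./probe.pt next to this script.
"""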
import argparse

import numpy as np
from matplotlib import pyplot as plt
import gradio as gr
import torch
import pandas as pd

from biasprobe import BinaryProbe, PairwiseExtractionRunner, SimplePairPromptBuilder, ProbeConfig


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', '-s', type=int, default=0, help='the random seed')
    parser.add_argument('--port', '-p', type=int, default=8080, help='the port on which to serve the demo')
    parser.add_argument('--no-cuda', action='store_true', help='use the CPU instead of the GPU')
    return parser.parse_args()
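
# Example invocation (the script name app.py is an assumption; adjust to your filename):
#   python app.py --seed 0 --port 8080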


def main():
    args = get_args()
    # Seed torch and numpy for reproducibility, per the --seed flag.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    device = 'cpu' if args.no_cuda else 'cuda'
    plt.switch_backend('agg')  # headless matplotlib backend; no display needed
    dmap = 'auto'
    mdict = {0: '24GiB'}  # cap GPU 0 at 24 GiB when sharding the model
    config = ProbeConfig.create_for_model('mistralai/Mistral-7B-Instruct-v0.1')
    probe = BinaryProbe(config).to(device)
    probe.load_state_dict(torch.load('probe.pt', map_location=device))

    runner = PairwiseExtractionRunner.from_pretrained(
        'mistralai/Mistral-7B-Instruct-v0.1',
        optimize=False,
        torch_dtype=torch.float16,
        max_memory=mdict,
        device_map=dmap,
        low_cpu_mem_usage=True,
    )

    @torch.no_grad()
    def run_extraction(prompt):
        builder = SimplePairPromptBuilder(criterion='more positive')
        # Normalize the input: lowercase, cap at 300 characters and 100 comma-separated words.
        lst = [x.strip() for x in prompt.lower()[:300].split(',')][:100]
        # Compare every pair of words and collect hidden states at layer 15.
        exp = runner.run_extraction(lst, lst, layers=[15], num_repeat=50, builder=builder, parallel=False,
                                    run_inference=True, debug=True, max_new_tokens=2)
        test_ds = exp.make_dataset(15)

        raw_scores = []
        preds_list = []

        for idx, (tensor, labels) in enumerate(test_ds):
            labels = labels - 1  # labels are 1-indexed; shift to 0-indexed

            if tensor.shape[0] != 2:  # skip anything that isn't a clean pair
                continue

            try:
                x = probe(tensor.unsqueeze(0).to(device).float()).squeeze()
            except IndexError:
                continue

            # Reorder the pair according to the sign of the probe score.
            pred = np.array([0, 1] if x.item() > 0 else [1, 0])

            if test_ds.original_examples is not None:
                items = [hit.content for hit in test_ds.original_examples[idx].hits]
                preds_list.append(np.array(items, dtype=object)[labels][pred].tolist())

            raw_scores.append(x.item())

        # Aggregate per-word win rates: the fraction of comparisons each word wins.
        df = pd.DataFrame({'Win Rate': np.array(raw_scores) > 0, 'Word': [x[0] for x in preds_list]})
        win_df = df.groupby('Word')['Win Rate'].mean()
        win_df = win_df.reset_index().sort_values('Win Rate')
        win_df['Win Rate'] = [f'{x}%' for x in (win_df['Win Rate'] * 100).round(2).tolist()]

        return win_df

    # UI: a textbox for comma-separated words, a submit button, and a results table.
    with gr.Blocks(css='scrollbar.css') as demo:
        md = '''# BiasProbe: Revealing Preference Biases in Language Model Representations

What do llamas really "think" about controversial words? Type some words below to see how Mistral-7B-Instruct associates them with positive and negative emotions. Higher win rates indicate that a word is more likely than the others in the list to be associated with positive emotions.

Check out our paper, [What Do Llamas Really Think? Revealing Preference Biases in Language Model Representations](http://arxiv.org/abs/2311.18812), and see our [codebase](https://github.com/castorini/biasprobe) on GitHub.
'''
        gr.Markdown(md)

        with gr.Row():
            with gr.Column():
                text = gr.Textbox(label='Words', value='republican, democrat, libertarian, authoritarian')
                submit_btn = gr.Button('Submit', elem_id='submit-btn')
            # Sample output shown before the first query runs.
            output = gr.DataFrame(pd.DataFrame({'Word': ['authoritarian', 'republican', 'democrat', 'libertarian'],
                                                'Win Rate': ['44.44%', '81.82%', '100%', '100%']}))

            submit_btn.click(
                fn=run_extraction,
                inputs=[text],
                outputs=[output])

    while True:
        try:
            demo.launch(server_name='0.0.0.0', server_port=args.port)
        except OSError:
            gr.close_all()  # stale server or port conflict; close and retry
        except KeyboardInterrupt:
            gr.close_all()
            break


if __name__ == '__main__':
    main()