File size: 1,297 Bytes
bac893c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import os

import numpy as np
import pandas as pd
import torch

from sklearn.manifold import TSNE
from tqdm import tqdm


def load_feats(path='./feats'):
    """Load every numerically named ``.pt`` file found in *path*.

    A file is picked up when the last dot-separated component of its name is
    ``pt`` and the first dot-separated component consists only of digits
    (for example ``12.pt``).

    Args:
        path: Directory that is listed for feature files and from which the
            matching files are loaded.

    Returns:
        A dict mapping the integer parsed from each file name to whatever
        ``torch.load`` returns for that file.
    """
    print('==> loading feats')
    feats = {}
    for fname in os.listdir(path):
        parts = fname.split('.')
        if parts[-1] == 'pt' and parts[0].isdigit():
            # Read from the same directory that was listed so that the
            # `path` argument is honoured.
            feats[int(parts[0])] = torch.load(os.path.join(path, fname))
    return feats


def calc_tsne(feat):
    """Project ``feat['all']`` to two dimensions with t-SNE.

    Args:
        feat: Mapping whose ``'all'`` entry exposes ``.numpy()``
            (presumably a torch tensor of stacked feature vectors -- confirm
            against the code that writes the feature files).

    Returns:
        The result of ``TSNE.fit_transform`` with ``n_components=2``, i.e. a
        two-column embedding with one row per input sample.
    """
    # The iteration count is left at scikit-learn's default of 1000. The
    # keyword for it was renamed from `n_iter` to `max_iter` in 1.5, so
    # passing neither keeps the same setting on old and new releases.
    tsne = TSNE(n_components=2, random_state=0, perplexity=30)
    return tsne.fit_transform(feat['all'].numpy())


def test_open(fp='./feats_tsne.parquet'):
    """Read the parquet file at *fp* and print its first rows."""
    preview = pd.read_parquet(fp).head()
    print(preview)


if __name__ == '__main__':
    # Script entry point: run t-SNE on each loaded feature file and write all
    # 2-D points, tagged with their prompt and model-version ids, to one
    # parquet file.
    feats = load_feats()
    columns = ['x', 'y', 'prompt_id', 'modelVersion_id']

    print('==> applying t-SNE')
    frames = []
    for prompt_id, feat in tqdm(feats.items()):
        # Every key other than the 'all' / 'tsne' entries is treated as a
        # model-version id. The keys expose .item() (presumably tensor or
        # numpy scalars -- confirm against the code that writes the files).
        model_version_ids = [
            int(key.item()) for key in feat if key != 'all' and key != 'tsne'
        ]

        res = calc_tsne(feat)

        tmp = pd.DataFrame(res, columns=['x', 'y'])
        tmp['prompt_id'] = prompt_id
        # Raises if the number of ids differs from the number of embedded rows.
        tmp['modelVersion_id'] = model_version_ids
        frames.append(tmp)

    # Concatenate once at the end: this avoids re-copying the accumulated
    # frame on every iteration and avoids concatenating with an empty frame,
    # whose effect on the result dtypes is deprecated in recent pandas.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)

    df.to_parquet('./feats_tsne.parquet')

    # To inspect the written file, call test_open().