File size: 1,823 Bytes
2c5347a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import json
import numpy as np
import pandas as pd

def load_huggingface_tokenizer(tokenizer_path: str):
    with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
            config = json.load(f)
    tokenizer_type = config['tokenizer_type']
    tokenizer = {'BPE': BPETokenizer,
                 'BBPE': ByteLevelBPETokenizer,
                 'BERT': BertWordPieceTokenizer}[tokenizer_type]
    if tokenizer_type in ['BPE', 'BBPE']:
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
        merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
        tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file),
                            merges_file=os.path.join(tokenizer_path, merges_file))
    else:
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
        tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file))
    return tokenizer


def jackknife(data, num_partitions=5):
    data = data.sample(frac=1)
    splits = np.split(data, range(0, data.shape[0], int(data.shape[0]/num_partitions) )[1:])
    for i, split in enumerate(splits):
        train_parts = list(range(0, num_partitions))
        try:
            train_parts.remove(i)
            yield pd.concat([splits[ix] for ix in train_parts], 0), split
        except ValueError:
            continue
    

def stratified_sample(df, col, n_samples):
    n = min(n_samples, df[col].value_counts().min())
    rand_int = np.random.randint(1, 10000)
    df_ = df.groupby(col).apply(lambda x: x.sample(n, random_state=rand_int))
    df_.index = df_.index.droplevel(0)
    return df_


def replace_bool(x):
    if x == 'true':
        return 1
    elif x == 'false':
        return 0
    else:
        return x