roberta_zinc_decoder / prepare_data_script.py
import pandas as pd
import os
import torch
from transformers import RobertaTokenizerFast, RobertaForMaskedLM, DataCollatorWithPadding
import datasets
from datasets import disable_caching
disable_caching()
DEVICE = 'cuda:0' # model device
ENCODER_MODEL_NAME = "entropy/roberta_zinc_480m" # encoder name
ENCODER_BATCH_SIZE = 1024 # batch size for computing embeddings
TOKENIZER_MAX_LEN = 256 # max_length param on tokenizer
TOKENIZATION_NUM_PROC = 32 # number of processes for tokenization
'''
Data source is expected to be a CSV file with a column of SMILES strings
denoted by `SMILES_COLUMN`. The CSV is processed in chunks of size `PROCESS_CHUNKSIZE`.
Processed chunks are saved to `SAVE_PATH` with the format `SAVE_PATH/processed_shard_{i}.hf`
'''
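# Hypothetical example of the expected CSV layout (the header name must match SMILES_COLUMN):
#
#   smiles
#   CCO
#   c1ccccc1
#   CC(=O)Oc1ccccc1C(=O)O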
DATASET_CSV_FILENAME = None # path to data csv
PROCESS_CHUNKSIZE = 1000000 # how many rows to process/save for each dataset shard
SMILES_COLUMN = 'smiles' # csv column holding smiles strings
MAX_CHUNKS = None # total number of chunks to process (if None, all chunks are processed)
MAX_SMILES_LENGTH = 90 # max smiles string length (exclusive)
MIN_SMILES_LENGTH = 5 # min smiles string length (exclusive)
FILTER_NUM_PROC = 32 # number of processes for filtering
SAVE_PATH = None # directory to save data shards to
assert DATASET_CSV_FILENAME is not None, "must specify dataset filename"
assert SAVE_PATH is not None, "must specify save path"

def tokenization(example):
    # tokenize a batch of SMILES strings, truncating to TOKENIZER_MAX_LEN tokens
    return tokenizer(example[SMILES_COLUMN], add_special_tokens=True,
                     truncation=True, max_length=TOKENIZER_MAX_LEN)

def embed(inputs):
    # pad the tokenized batch and move it to the encoder device
    inputs = {k: inputs[k] for k in ['input_ids', 'attention_mask']}
    inputs = collator(inputs)
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # mean-pool the final-layer hidden states over non-padding tokens
    full_embeddings = outputs.hidden_states[-1]
    mask = inputs['attention_mask']
    mean_embeddings = (full_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1)
    return {'encoder_hidden_states': mean_embeddings}
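# Shape sketch for the pooling above: full_embeddings is (batch, seq_len, hidden_dim) and the
# attention mask is (batch, seq_len); multiplying by the unsqueezed mask zeros out padding
# positions before the sum, so each pooled vector averages only real tokens.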

def length_filter_smiles(example):
    # drop non-string entries first, then apply the exclusive min/max length bounds (when set)
    smiles = example[SMILES_COLUMN]
    if not isinstance(smiles, str):
        return False
    min_check = (len(smiles) > MIN_SMILES_LENGTH) if (MIN_SMILES_LENGTH is not None) else True
    max_check = (len(smiles) < MAX_SMILES_LENGTH) if (MAX_SMILES_LENGTH is not None) else True
    return min_check and max_check
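# Example with the defaults above: 'CCO' (length 3) fails the exclusive minimum of 5,
# and a 90-character SMILES fails the exclusive maximum of 90, so both are dropped.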

# load the tokenizer, padding collator, and pretrained encoder
tokenizer = RobertaTokenizerFast.from_pretrained(ENCODER_MODEL_NAME, model_max_length=TOKENIZER_MAX_LEN)
collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')
model = RobertaForMaskedLM.from_pretrained(ENCODER_MODEL_NAME)
model.to(DEVICE)
model.eval()
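# Note: the masked-LM head itself is never used below; only the hidden states requested via
# output_hidden_states=True in embed() contribute to the pooled embeddings.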

# stream the CSV in chunks; filter, tokenize, embed, and save each chunk as a dataset shard
df_iter = pd.read_csv(DATASET_CSV_FILENAME, chunksize=PROCESS_CHUNKSIZE, usecols=[SMILES_COLUMN])
for i, df in enumerate(df_iter):
    print(f'processing dataset chunk {i}')
    dataset = datasets.Dataset.from_pandas(df)
    dataset = dataset.filter(length_filter_smiles, num_proc=FILTER_NUM_PROC)
    dataset = dataset.map(tokenization, batched=True, num_proc=TOKENIZATION_NUM_PROC)
    dataset = dataset.map(embed, batched=True, batch_size=ENCODER_BATCH_SIZE)
    dataset.save_to_disk(f'{SAVE_PATH}/processed_shard_{i}.hf')
    if (MAX_CHUNKS is not None) and (i >= MAX_CHUNKS - 1):
        break
print('finished data processing')
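
# Minimal sketch for reloading the saved shards afterwards (n_shards is hypothetical and
# should match the number of chunks written above):
#
#   from datasets import load_from_disk, concatenate_datasets
#   shards = [load_from_disk(f'{SAVE_PATH}/processed_shard_{i}.hf') for i in range(n_shards)]
#   full_dataset = concatenate_datasets(shards)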