from transformers import CamembertTokenizer


def get_tokenizer(model_name='camembert-base'):
    """Load and return a CamemBERT tokenizer.

    Args:
        model_name: Hugging Face model id or local path to load the
            tokenizer from (defaults to the base CamemBERT checkpoint).

    Returns:
        A ``CamembertTokenizer`` instance.
    """
    return CamembertTokenizer.from_pretrained(model_name)


def tokenize_encode_corpus(tokenizer, descriptions, max_len):
    """Tokenize and encode a corpus of text descriptions.

    Args:
        tokenizer: A Hugging Face tokenizer (callable) such as the one
            returned by :func:`get_tokenizer`.
        descriptions: A string or list of strings to encode.
        max_len: Sequence length every encoding is padded/truncated to.

    Returns:
        The tokenizer's encoding output containing at least
        ``input_ids`` and ``attention_mask``.
    """
    # Pad every sequence to exactly max_len and truncate the longer of a
    # sentence pair first, so downstream tensors have a uniform shape.
    encoded_corpus = tokenizer(
        text=descriptions,
        add_special_tokens=True,
        padding='max_length',
        truncation='longest_first',
        max_length=max_len,
        return_attention_mask=True,
    )
    return encoded_corpus


def extract_inputs_masks(encoded_corpus):
    """Pull ``input_ids`` and ``attention_mask`` out of an encoding.

    Args:
        encoded_corpus: Mapping produced by the tokenizer (e.g. the
            return value of :func:`tokenize_encode_corpus`).

    Returns:
        Tuple ``(input_ids, attention_mask)``, or ``None`` if either key
        is missing (the available keys are printed to aid debugging).
    """
    try:
        input_ids = encoded_corpus['input_ids']
        attention_mask = encoded_corpus['attention_mask']
    except KeyError:
        # Narrowed from a bare ``except:`` — only a missing key is an
        # expected failure here; anything else should propagate.
        print('Available keys are = ', encoded_corpus.keys())
        return None
    return input_ids, attention_mask