import pandas as pd
import torch


def preparing_data(text: str, domain: int) -> pd.DataFrame:
    """Build the two-row DataFrame used for inference on a single input.

    Args:
        text: Input text from the user.
        domain: Output domain from the domain identification pipeline.

    Returns:
        A DataFrame with columns ``text`` and ``domain``. Row 0 is a fixed
        dummy example (``'hello world'``, domain ``0``); row 1 holds the
        supplied ``text`` and ``domain``.
    """
    # The model can't do inference with only one example, so a dummy example
    # is always placed ahead of the real input.
    return pd.DataFrame(
        {
            'text': ['hello world', text],
            'domain': [0, domain],
        }
    )


def loading_data(tokenizer, df: pd.DataFrame):
    """Tokenize every row of ``df`` and collect the results as tensors.

    Rows are read by position, so ``df`` may carry any index (for example
    after filtering or slicing); the index labels are ignored.

    Args:
        tokenizer: Callable invoked once per row as ``tokenizer(text)``. Its
            result is indexed with the keys ``"token_id"`` and ``"mask"``;
            both values must be tensors that ``torch.cat`` can join along
            dimension 0 (presumably shaped ``(1, seq_len)`` -- confirm
            against the tokenizer in use).
        df: DataFrame with a ``text`` column and a ``domain`` column.

    Returns:
        A tuple ``(input_ids, input_masks, input_domains)``: the per-row
        ``"token_id"`` tensors concatenated along dimension 0, the per-row
        ``"mask"`` tensors concatenated along dimension 0, and a tensor built
        from the ``domain`` column in row order.

    Note:
        ``df`` must contain at least one row, because ``torch.cat`` cannot
        concatenate an empty list of tensors.
    """
    # Iterate over the column values rather than indexing with 0..n-1:
    # integer lookup on a Series is label-based and breaks (or mismatches
    # rows) when the DataFrame index is not the default RangeIndex.
    encodings = [tokenizer(text) for text in df["text"]]

    input_ids = torch.cat([enc["token_id"] for enc in encodings], dim=0)
    input_masks = torch.cat([enc["mask"] for enc in encodings], dim=0)

    # to_numpy() yields the column values positionally as numpy scalars,
    # independent of the index labels.
    input_domains = torch.tensor(list(df["domain"].to_numpy()))

    return input_ids, input_masks, input_domains