Spaces:
Sleeping
Sleeping
File size: 3,798 Bytes
e1aa577 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
from utils.llm_chain import ChainWrapper, get_chain_metadata
from pathlib import Path
from dataset.base_dataset import DatasetBase
import pandas as pd
class LLMEstimator:
"""
A wrapper for an estimator using LLM
"""
def __init__(self, opt):
"""
Initialize a new instance of the LLMEstimator class.
:param opt: The configuration file (EasyDict)
"""
self.opt = opt
self.chain = None
self.mini_batch_size = opt.mini_batch_size
self.mode = opt.mode
self.num_workers = opt.num_workers
if 'instruction' in opt.keys():
self.cur_instruct = opt.instruction
else:
self.cur_instruct = None
@staticmethod
def generate_sample_text(sample_id: int, text: str) -> str:
"""
Generate a sample text for the chain prompt
:param sample_id: The sample id
:param text: The text of the sample
:return: The sample text for the prompt
"""
return f"ID: {sample_id}; Sample: {text}\n"
def calc_usage(self) -> float:
""""
Calculate the usage of the estimator
"""
return self.chain.accumulate_usage
def init_chain(self, label_schema: set[str]):
"""
Initialize the chain
:param label_schema: The label schema
"""
chain_metadata = get_chain_metadata(Path(self.opt.prompt), retrieve_module=True)
if hasattr(chain_metadata['module'], 'update_classification_prediction_schema'):
chain_metadata['json_schema'] = chain_metadata['module'].update_classification_prediction_schema(
chain_metadata['json_schema'],
label_schema
)
self.chain = ChainWrapper(self.opt.llm, self.opt.prompt, chain_metadata['json_schema'],
chain_metadata['parser_func'])
def apply_dataframe(self, record: pd.DataFrame):
"""
Apply the estimator on a dataframe
:param record: The record
"""
chain_input = ''
mini_batch_inputs = []
record[self.mode] = 'Discarded'
# prepare all the inputs for the chains
for i, row in record.iterrows():
chain_input += self.generate_sample_text(i, row['text'])
if ((i + 1) % self.mini_batch_size) == 0:
mini_batch_inputs.append({'batch_size': self.mini_batch_size, 'task_instruction': self.cur_instruct,
'samples': chain_input})
chain_input = ''
if not (chain_input == ''):
mini_batch_inputs.append({'batch_size': self.mini_batch_size, 'task_instruction': self.cur_instruct,
'samples': chain_input})
all_results = self.chain.batch_invoke(mini_batch_inputs, self.num_workers)
union_results = [element for sublist in all_results for element in sublist['results']]
for res in union_results:
record.loc[res['id'], self.mode] = res['prediction']
return record
def apply(self, dataset: DatasetBase, idx: int, leq: bool = False):
"""
Apply the estimator on the batches up to idx (includes), it then updates the annotation field
if self.mode is 'annotation', otherwise it update the prediction field.
:param dataset: The dataset
:param idx: The current batch index
:param leq: If True, apply on all the batches up to idx (includes), otherwise apply only on idx
"""
if self.chain is None:
self.init_chain(dataset.label_schema)
if leq:
batch_records = dataset.get_leq(idx)
else:
batch_records = dataset[idx]
return self.apply_dataframe(batch_records)
|