---
license: apache-2.0
language:
- he
datasets:
- HeTree/MevakerConcSen
---
# Hebrew Conclusion Extraction Model (based on sequence plus context classification)
## How to use
from transformers import RobertaTokenizerFast, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset
from functools import partial
from tqdm.auto import tqdm
# Empty tqdm's private `_instances` collection before any progress bar is created below.
# NOTE(review): presumably clears bars left over from an earlier (e.g. notebook) run — confirm.
tqdm._instances.clear()
def tokenize_function(example):
    """Tokenize sentence/context pairs with the module-level ``tokenizer``.

    Args:
        example: Mapping with ``"sentence"`` and ``"context"`` entries. This
            file calls the function through ``map(..., batched=True)``, so the
            values are presumably lists of strings - confirm against caller.

    Returns:
        Whatever ``tokenizer`` returns for the pair, truncated and padded to a
        fixed length of 512.
    """
    # The sentence is the first segment and its surrounding context the second.
    first_segment = example["sentence"]
    second_segment = example["context"]
    return tokenizer(
        first_segment,
        second_segment,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
def create_windowed_context_ds(context_l, example, idx):
    """Attach the pre-computed context for row ``idx`` to ``example``.

    Args:
        context_l: Sequence of context strings, indexable by row position.
        example: Mutable mapping for one row; it is modified in place.
        idx: Position of the row, used to look up its context.

    Returns:
        The same ``example`` object with its ``"context"`` entry set.
    """
    row_context = context_l[idx]
    example["context"] = row_context
    return example
def create_windowed_context(raw_dataset, window_size):
    """Build one space-joined context string per sentence of the ``train`` split.

    For the sentence at position ``i``:

    * if ``i < window_size`` the context is the first ``window_size``
      sentences of the split;
    * otherwise it is the sentences from ``i - window_size`` up to (but not
      including) ``i + window_size``, clamped to the end of the split.

    Args:
        raw_dataset: Mapping whose ``'train'`` entry provides ``to_pandas()``
            and has a ``sentence`` column.
        window_size: Number of neighbouring sentences taken on each side.

    Returns:
        A list with one context string per row of the ``train`` split, in row
        order.
    """
    sentences = raw_dataset['train'].to_pandas()['sentence'].tolist()
    total = len(sentences)
    context_l = []
    for i in tqdm(range(total)):
        if i - window_size < 0:
            # NOTE(review): the head of the split uses a fixed window of the
            # first `window_size` sentences rather than [0, i + window_size);
            # kept as published - confirm this asymmetry is intended.
            start, stop = 0, window_size
        else:
            # Clamp to the split length. The previous `[i - window_size:-1]`
            # slice silently dropped the final sentence, so the last row's
            # context did not even contain the sentence being classified.
            start, stop = i - window_size, min(i + window_size, total)
        context_l.append(' '.join(sentences[start:stop]))
    return context_l
# Number of neighbouring sentences passed to create_windowed_context.
window_size = 5

# Load the tokenizer and the two-label sequence classifier from the same checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained('HeTree/HeConEspc')
model = AutoModelForSequenceClassification.from_pretrained('HeTree/HeConEspc', num_labels=2)

# Load the dataset and pre-compute one context string per row of its 'train' split.
raw_dataset = load_dataset('HeTree/MevakerConcSen')
context_l = create_windowed_context(raw_dataset, window_size)

# Add a "context" column row by row; with_indices supplies the row position.
# NOTE(review): context_l is built from 'train' only, while map runs on the whole
# raw_dataset object - confirm the dataset has no other splits.
raw_dataset_window = raw_dataset.map(
    partial(create_windowed_context_ds, context_l),
    with_indices=True,
    batched=False,
)

# Tokenize the (sentence, context) pairs in batches.
tokenized_data = raw_dataset_window.map(tokenize_function, batched=True)
## Citing
If you use HeConEspc in your research, please cite [Mevaker: Conclusion Extraction and Allocation Resources for the Hebrew Language](https://arxiv.org/abs/2403.09719).
```bibtex
@article{shalumov2024mevaker,
  title={Mevaker: Conclusion Extraction and Allocation Resources for the Hebrew Language},
  author={Vitaly Shalumov and Harel Haskey and Yuval Solaz},
  year={2024},
  eprint={2403.09719},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}
```