vitvit committed on
Commit
f02d636
·
1 Parent(s): b5447b4

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +47 -0
README.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ```python
2
+ from transformers import RobertaTokenizerFast, AutoModelForSequenceClassification
3
+ from datasets import load_dataset, Dataset
4
+ from functools import partial
5
+ from tqdm.auto import tqdm
6
+ tqdm._instances.clear()
7
+
8
def tokenize_function(example):
    """Tokenize sentence/context pairs with the module-level ``tokenizer``.

    Args:
        example: Mapping providing "sentence" and "context" entries; they
            are passed as the first and second tokenizer inputs.

    Returns:
        The tokenizer's output for the pair, requested with
        ``max_length=512``, truncation enabled and padding to the maximum
        length.
    """
    encode_kwargs = {
        "max_length": 512,
        "truncation": True,
        "padding": "max_length",
    }
    return tokenizer(example["sentence"], example["context"], **encode_kwargs)
17
+
18
def create_windowed_context_ds(context_l, example, idx):
    """Attach the pre-computed context for row ``idx`` to ``example``.

    Args:
        context_l: Indexable collection of context values, one per row.
        example: Mutable mapping for a single row; modified in place.
        idx: Position of the row, used to look up its context.

    Returns:
        The same ``example`` object with its "context" entry set.
    """
    window_text = context_l[idx]
    example["context"] = window_text
    return example
21
+
22
def create_windowed_context(raw_dataset, window_size):
    """Build one context string per sentence of ``raw_dataset['train']``.

    The context of sentence ``i`` is the space-joined run of sentences at
    positions ``[i - window_size, i + window_size)``, clamped at both ends:

    * for ``i < window_size`` the first ``window_size`` sentences are used;
    * when ``i + window_size`` runs past the data, the window extends from
      ``i - window_size`` through the final sentence (inclusive).

    Args:
        raw_dataset: Mapping whose ``'train'`` entry provides
            ``to_pandas()`` with a ``'sentence'`` column of strings.
        window_size: Number of sentences taken on each side of ``i``.

    Returns:
        A list of context strings, one per row, in row order.
    """
    # Materialize the column once; plain list slicing is positional and
    # avoids re-slicing a pandas Series on every iteration.
    sentences = raw_dataset['train'].to_pandas()['sentence'].tolist()
    total = len(sentences)
    context_l = []
    for i in tqdm(range(total)):
        if i < window_size:
            start, stop = 0, window_size
        else:
            # Clamp the stop index to the data length so the final sentence
            # stays in the window (a stop of -1 would silently drop it).
            start, stop = i - window_size, min(i + window_size, total)
        context_l.append(' '.join(sentences[start:stop]))
    return context_l
35
+
36
# Load the sequence-classification model (two labels) and its tokenizer
# from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    'HeTree/HeConEspc',
    num_labels=2,
)
tokenizer = RobertaTokenizerFast.from_pretrained('HeTree/HeConEspc')

# Load the sentences and pre-compute a windowed context for each row.
raw_dataset = load_dataset('HeTree/MevakerConcSen')
window_size = 5
context_l = create_windowed_context(raw_dataset, window_size)

# Attach each row's context by index, then tokenize sentence/context pairs.
# NOTE(review): context_l is built from the 'train' split only, while map()
# is applied to the whole raw_dataset -- confirm no other split is present.
raw_dataset_window = raw_dataset.map(
    partial(create_windowed_context_ds, context_l),
    batched=False,
    with_indices=True,
)
tokenized_data = raw_dataset_window.map(tokenize_function, batched=True)
43
+
44
+
45
+
46
+
47
+ ```