alexkueck committed
Commit 50b8512 · 1 Parent(s): 8949dd7

Update app.py

Files changed (1):
  1. app.py +25 -24
app.py CHANGED
@@ -8,6 +8,31 @@ from utils import *
 from presets import *
 from transformers import Trainer, TrainingArguments
 
+#####################################################
+# Helper functions for training
+#####################################################
+# Feed the datasets into the tokenizer...
+def tokenize_function(examples):
+    return tokenizer(examples["text"])
+
+
+# Function that groups the given text from the dataset
+def group_texts(examples):
+    # Concatenate all texts.
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder; we could add padding instead if the model
+    # supported it. You can customize this part to your needs.
+    total_length = (total_length // block_size) * block_size
+    # Split into chunks of block_size.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+
 
 ######################################################################
 # Models and tokenizer
@@ -80,30 +105,6 @@ def trainieren_neu():
 
 
 
-#####################################################
-# Helper functions for training
-#####################################################
-# Feed the datasets into the tokenizer...
-def tokenize_function(examples):
-    return tokenizer(examples["text"])
-
-
-# Function that groups the given text from the dataset
-def group_texts(examples):
-    # Concatenate all texts.
-    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
-    total_length = len(concatenated_examples[list(examples.keys())[0]])
-    # We drop the small remainder; we could add padding instead if the model
-    # supported it. You can customize this part to your needs.
-    total_length = (total_length // block_size) * block_size
-    # Split into chunks of block_size.
-    result = {
-        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
-        for k, t in concatenated_examples.items()
-    }
-    result["labels"] = result["input_ids"].copy()
-    return result
-
 
 
 
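For context, a minimal sketch of how these relocated helpers are typically consumed, not taken from this commit: it assumes a Hugging Face `datasets` object named `raw_dataset` with a "text" column, plus `tokenizer`, `model`, and `block_size` defined elsewhere in app.py, with illustrative hyperparameter values.

# Hypothetical usage sketch -- not code from this commit. `raw_dataset`,
# `tokenizer`, `model`, and `block_size` are assumed to exist elsewhere
# in app.py; hyperparameter values are illustrative.
from transformers import Trainer, TrainingArguments

# Tokenize each "text" entry; drop the raw column so only token columns
# (input_ids, attention_mask) reach group_texts.
tokenized = raw_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Concatenate and re-chunk the token stream into block_size pieces;
# group_texts also copies input_ids to labels for causal LM training.
lm_dataset = tokenized.map(group_texts, batched=True)

training_args = TrainingArguments(
    output_dir="training_out",      # hypothetical output path
    per_device_train_batch_size=4,  # illustrative value
    num_train_epochs=1,             # illustrative value
)
trainer = Trainer(
    model=model,                    # assumed causal LM loaded in app.py
    args=training_args,
    train_dataset=lm_dataset["train"],
)
trainer.train()

Moving the helpers to module scope, which is what this commit does, lets both `map` calls reference them directly instead of redefining them inside `trainieren_neu()`.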