Update generic_ner.py
generic_ner.py CHANGED (+2 -23)
@@ -1,16 +1,14 @@
 from transformers import Pipeline
 import numpy as np
 import torch
+import nltk
+nltk.download('averaged_perceptron_tagger')
 from nltk.chunk import conlltags2tree
 from nltk import pos_tag
 from nltk.tree import Tree
 import string
 import torch.nn.functional as F
-from langdetect import detect
-
-
 import re, string
-import pysbd
 
 
 def tokenize(text):
@@ -202,27 +200,8 @@ class MultitaskTokenClassificationPipeline(Pipeline):
         }
         return preprocess_kwargs, {}, {}
 
-    # def preprocess(self, text, **kwargs):
-    #
-    #     language = detect(text)
-    #     sentences = segment_and_trim_sentences(text, language, 512)
-    #
-    #     tokenized_inputs = self.tokenizer(
-    #         text,
-    #         padding="max_length",
-    #         truncation=True,
-    #         max_length=512,
-    #         return_tensors="pt",
-    #     )
-    #
-    #     text_sentences = [
-    #         tokenize(add_spaces_around_punctuation(sentence)) for sentence in sentences
-    #     ]
-    #     return tokenized_inputs, text_sentences, text
     def preprocess(self, text, **kwargs):
 
-        # sentences = segment_and_trim_sentences(text, language, 512)
-
         tokenized_inputs = self.tokenizer(
             text, padding="max_length", truncation=True, max_length=512
         )
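
The first hunk drops the langdetect and pysbd imports (whose only consumers were the commented-out preprocess removed below) and fetches NLTK's perceptron tagger at import time, which the pos_tag import a few lines later depends on. One caveat: nltk.download(...) runs the downloader on every import of the module. A minimal sketch of a guarded variant, an alternative pattern rather than what this commit ships, that only downloads when the resource is missing locally:

import nltk

try:
    # nltk.data.find raises LookupError when the resource is absent locally.
    nltk.data.find("taggers/averaged_perceptron_tagger")
except LookupError:
    nltk.download("averaged_perceptron_tagger")

from nltk import pos_tag

# pos_tag is the consumer of the tagger model resolved above.
print(pos_tag(["Albert", "Einstein", "was", "born", "in", "Ulm"]))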
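
The second hunk deletes the commented-out language-aware draft of preprocess (detect plus segment_and_trim_sentences) and keeps a single tokenizer call. Unlike the removed draft, the surviving call omits return_tensors="pt", so it yields plain Python lists rather than tensors, with any conversion presumably happening later in the pipeline. A small sketch of what that call returns, assuming an ordinary Hugging Face tokenizer (bert-base-cased here is an arbitrary stand-in for the pipeline's own tokenizer):

from transformers import AutoTokenizer

# Stand-in tokenizer; the pipeline itself would use self.tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

enc = tokenizer(
    "Albert Einstein was born in Ulm.",
    padding="max_length",  # pad out to exactly max_length
    truncation=True,       # cut anything longer than max_length
    max_length=512,
)

# With no return_tensors argument, the values are plain lists of ints.
print(type(enc["input_ids"]))      # <class 'list'>
print(len(enc["input_ids"]))       # 512
print(sum(enc["attention_mask"]))  # number of real (non-pad) tokens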