emanuelaboros committed
Commit 61675e4 · verified · 1 Parent(s): d0242b2

Update generic_ner.py

Files changed (1)
  1. generic_ner.py +2 -23
generic_ner.py CHANGED
@@ -1,16 +1,14 @@
 from transformers import Pipeline
 import numpy as np
 import torch
+import nltk
+nltk.download('averaged_perceptron_tagger')
 from nltk.chunk import conlltags2tree
 from nltk import pos_tag
 from nltk.tree import Tree
 import string
 import torch.nn.functional as F
-from langdetect import detect
-
-
 import re, string
-import pysbd
 
 
 def tokenize(text):
@@ -202,27 +200,8 @@ class MultitaskTokenClassificationPipeline(Pipeline):
         }
         return preprocess_kwargs, {}, {}
 
-    # def preprocess(self, text, **kwargs):
-    #
-    #     language = detect(text)
-    #     sentences = segment_and_trim_sentences(text, language, 512)
-    #
-    #     tokenized_inputs = self.tokenizer(
-    #         text,
-    #         padding="max_length",
-    #         truncation=True,
-    #         max_length=512,
-    #         return_tensors="pt",
-    #     )
-    #
-    #     text_sentences = [
-    #         tokenize(add_spaces_around_punctuation(sentence)) for sentence in sentences
-    #     ]
-    #     return tokenized_inputs, text_sentences, text
     def preprocess(self, text, **kwargs):
 
-        # sentences = segment_and_trim_sentences(text, language, 512)
-
         tokenized_inputs = self.tokenizer(
             text, padding="max_length", truncation=True, max_length=512
         )
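
For context, here is a minimal standalone sketch of what this change amounts to; the tokenizer checkpoint and sample text below are assumptions for illustration only and are not part of the commit. The module now downloads the NLTK perceptron-tagger data that pos_tag depends on at import time, and preprocess tokenizes the raw input directly with padding and truncation to 512 tokens, instead of first running language detection and sentence segmentation with langdetect/pysbd.

import nltk
from nltk import pos_tag
from transformers import AutoTokenizer

# Download the tagger data once so pos_tag() can run, mirroring the import-time
# call added by this commit (newer NLTK releases may instead look for
# 'averaged_perceptron_tagger_eng').
nltk.download("averaged_perceptron_tagger")

# Hypothetical checkpoint, chosen only for illustration.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

text = "Albert Einstein was born in Ulm."

# Equivalent of the simplified preprocess(): tokenize the raw text directly,
# padded and truncated to 512 tokens, with no prior sentence segmentation.
tokenized_inputs = tokenizer(text, padding="max_length", truncation=True, max_length=512)
print(len(tokenized_inputs["input_ids"]))  # 512

# pos_tag() works because the tagger resource was downloaded above.
print(pos_tag(text.split()))

Since this file no longer imports langdetect or pysbd, those packages are only needed if other modules in the repository still use them.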