emanuelaboros committed on
Commit
97a98d1
·
verified ·
1 Parent(s): 3db4bd4

Update generic_ner.py

Browse files
Files changed (1) hide show
  1. generic_ner.py +41 -30
generic_ner.py CHANGED
@@ -540,6 +540,7 @@ def remove_trailing_stopwords(entities):
540
  rOffset = entity.get("rOffset", original_len)
541
 
542
  # Remove stopwords and punctuation from the beginning
 
543
  while entity_text and (
544
  entity_text.split()[0].lower() in stop_words
545
  or entity_text[0] in punctuation
@@ -561,36 +562,48 @@ def remove_trailing_stopwords(entities):
561
  print(
562
  f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
563
  )
 
564
 
 
565
  # Remove stopwords and punctuation from the end
566
- if len(entity_text.strip()) > 1:
567
- while entity_text and (
568
- entity_text.split()[-1].lower() in stop_words
569
- or entity_text[-1] in punctuation
570
- ):
571
- if entity_text.split()[-1].lower() in stop_words:
572
- stopword_len = (
573
- len(entity_text.split()[-1]) + 1
574
- ) # Adjust length for stopword and preceding space
575
- entity_text = entity_text[
576
- :-stopword_len
577
- ] # Remove trailing stopword
578
- rOffset -= stopword_len # Adjust the right offset
579
- if DEBUG:
580
- print(
581
- f"Removed trailing stopword from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
582
- )
583
- if entity_text:
584
- if entity_text[-1] in punctuation:
585
- entity_text = entity_text[
586
- :-1
587
- ] # Remove trailing punctuation
588
- rOffset -= 1 # Adjust the right offset
589
- if DEBUG:
590
- print(
591
- f"Removed trailing punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
592
- )
 
 
593
 
 
 
 
 
 
 
 
 
594
  # Skip certain entities based on rules
595
  if entity_text in string.punctuation:
596
  if DEBUG:
@@ -669,13 +682,11 @@ def remove_trailing_stopwords(entities):
669
  entities.remove(entity)
670
  else:
671
  new_entities.append(entity)
672
- else:
673
- new_entities.append(entity)
674
  if DEBUG:
675
  print(f"Remained entities: {len(new_entities)}")
676
  return new_entities
677
 
678
-
679
  class MultitaskTokenClassificationPipeline(Pipeline):
680
 
681
  def _sanitize_parameters(self, **kwargs):
 
540
  rOffset = entity.get("rOffset", original_len)
541
 
542
  # Remove stopwords and punctuation from the beginning
543
+ i = 0
544
  while entity_text and (
545
  entity_text.split()[0].lower() in stop_words
546
  or entity_text[0] in punctuation
 
562
  print(
563
  f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
564
  )
565
+ i += 1
566
 
567
+ i = 0
568
  # Remove stopwords and punctuation from the end
569
+ iteration = 0
570
+ max_iterations = len(entity_text) # Prevent infinite loops
571
+
572
+ while entity_text and iteration < max_iterations:
573
+ # Check if the last word is a stopword or the last character is punctuation
574
+ last_word = entity_text.split()[-1] if entity_text.split() else ""
575
+ last_char = entity_text[-1]
576
+
577
+ if last_word.lower() in stop_words:
578
+ # Remove trailing stopword and adjust rOffset
579
+ stopword_len = len(last_word) + 1 # Include space before stopword
580
+ entity_text = entity_text[:-stopword_len].rstrip()
581
+ rOffset -= stopword_len
582
+ if DEBUG:
583
+ print(
584
+ f"Removed trailing stopword from entity: {entity_text} (rOffset={rOffset})"
585
+ )
586
+
587
+ elif last_char in punctuation:
588
+ # Remove trailing punctuation and adjust rOffset
589
+ entity_text = entity_text[:-1].rstrip()
590
+ rOffset -= 1
591
+ if DEBUG:
592
+ print(
593
+ f"Removed trailing punctuation from entity: {entity_text} (rOffset={rOffset})"
594
+ )
595
+ else:
596
+ # Exit loop if neither stopwords nor punctuation are found
597
+ break
598
 
599
+ iteration += 1
600
+ # print(f"ITERATION: {iteration} [{entity['surface']}] for {entity_text}")
601
+
602
+ if len(entity_text.strip()) == 1:
603
+ entities.remove(entity)
604
+ if DEBUG:
605
+ print(f"Skipping entity: {entity_text}")
606
+ continue
607
  # Skip certain entities based on rules
608
  if entity_text in string.punctuation:
609
  if DEBUG:
 
682
  entities.remove(entity)
683
  else:
684
  new_entities.append(entity)
685
+
 
686
  if DEBUG:
687
  print(f"Remained entities: {len(new_entities)}")
688
  return new_entities
689
 
 
690
  class MultitaskTokenClassificationPipeline(Pipeline):
691
 
692
  def _sanitize_parameters(self, **kwargs):