Spaces:

tomsoderlund
/

swedish-entity-recognition

Sleeping

tomsoderlund committed on Dec 17, 2022

Commit

e746d10

•

1 Parent(s): 2e6b605

merge_split_tokens

Files changed (2) hide show

README.md CHANGED Viewed

@@ -9,28 +9,20 @@ python_version: 3.9.13
 app_file: app.py
 pinned: false
 license: openrail
-models: ["KBLab/bert-base-swedish-cased-ner"]
 ---
 # Swedish Entity Recognition
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
-## Preparing a Gradio app for Huggingface Spaces
 Setup:
-    # Create a “safe” virtual Python environment
-    python3 -m venv env
-    # Install Gradio
-    pip3 install gradio
-    # Install optional packages for your specific app: pip3 install torch transformers
-    # Update the list of required packages
-    pip3 freeze > requirements.txt
-    # Create a blank app.py
-    touch app.py
-Edit `app.py`, then run:
     python3 app.py

 app_file: app.py
 pinned: false
 license: openrail
+models:
+    - KBLab/bert-base-swedish-cased-ner
 ---
 # Swedish Entity Recognition
+## Installing locally
 Setup:
+    source env/bin/activate
+    pip3 install -r requirements.txt
+Then run:
     python3 app.py

app.py CHANGED Viewed

@@ -1,6 +1,16 @@
 import gradio
 from transformers import pipeline
 def process_swedish_text(text):
   # Models from https://huggingface.co/models
   # https://huggingface.co/KBLab/bert-base-swedish-cased-ner
@@ -8,8 +18,9 @@ def process_swedish_text(text):
   # Run NER
   nlp_results = nlp(text)
   print('nlp_results:', nlp_results)
   # Fix TypeError("'numpy.float32' object is not iterable")
-  nlp_results_adjusted = map(lambda entity: dict(entity, **{ 'score': float(entity['score']) }), nlp_results)
   print('nlp_results_adjusted:', nlp_results_adjusted)
   # Return values
   return {'entities': list(nlp_results_adjusted)}
@@ -21,6 +32,9 @@ gradio_interface = gradio.Interface(
   examples=[
     ["Jag heter Tom och bor i Stockholm."],
     ["Groens malmgård är en av Stockholms malmgårdar, belägen vid Malmgårdsvägen 53 på Södermalm i Stockholm."]
-  ]
 )
 gradio_interface.launch()

 import gradio
 from transformers import pipeline
# Merge split tokens starting with '##'
def merge_split_tokens(tokens):
  """Merge word-piece continuation tokens into the token that precedes them.

  A token whose "word" starts with '##' is treated as a continuation: its
  text (without the '##' prefix) is appended to the "word" of the previous
  token in the result. All other fields of the merged token (for example
  "score") are those of the first piece.

  Args:
    tokens: Iterable of dicts, each with at least a "word" key. Optional
      "start"/"end" keys are kept in sync when present.

  Returns:
    A new list of new dicts; the input dicts are not modified.
  """
  merged_tokens = []
  for token in tokens:
    word = token["word"]
    # A continuation can only be merged if there is something to merge into;
    # a leading '##' token is kept unchanged instead of raising IndexError.
    if word.startswith('##') and merged_tokens:
      previous = merged_tokens[-1]
      previous["word"] += word[2:]
      # Extend the span only when the pieces are contiguous, so the
      # "end" offset keeps matching the merged "word".
      if "start" in token and "end" in token and previous.get("end") == token["start"]:
        previous["end"] = token["end"]
    else:
      # Copy so that merging never mutates the caller's dicts.
      merged_tokens.append(dict(token))
  return merged_tokens
 def process_swedish_text(text):
   # Models from https://huggingface.co/models
   # https://huggingface.co/KBLab/bert-base-swedish-cased-ner
   # Run NER
   nlp_results = nlp(text)
   print('nlp_results:', nlp_results)
+  nlp_results_merged = merge_split_tokens(nlp_results)
   # Fix TypeError("'numpy.float32' object is not iterable")
+  nlp_results_adjusted = map(lambda entity: dict(entity, **{ 'score': float(entity['score']) }), nlp_results_merged)
   print('nlp_results_adjusted:', nlp_results_adjusted)
   # Return values
   return {'entities': list(nlp_results_adjusted)}
   examples=[
     ["Jag heter Tom och bor i Stockholm."],
     ["Groens malmgård är en av Stockholms malmgårdar, belägen vid Malmgårdsvägen 53 på Södermalm i Stockholm."]
+  ],
+  title="Swedish Entity Recognition",
+  description="Recognizing Swedish tokens e.g. locations and person names.",
+  article="© Tom Söderlund 2022"
 )
 gradio_interface.launch()