Fix token aggregation
app.py CHANGED
@@ -31,8 +31,14 @@ def tag(text, lang_index):
     loaded_model_id = model_id
     pipe = pipeline("token-classification", model_id, aggregation_strategy="first")
 
-
-
+    # Aggregate words:
+    # split on whitespace and PUNCT, but merge other subtokens (keep first tag)
+    out = []
+    for g in pipe(text):
+        if g["word"][0] == "▁" or g["entity"] == "PUNCT":
+            out.append((g["word"].lstrip("▁"), g["entity"]))
+        else:
+            out[-1] = (out[-1][0] + g["word"], out[-1][1])
 
     return out, model_link(model_id)
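
Illustrative note: the loop added above re-groups the pipeline's output into whole words, starting a new (word, tag) pair whenever a group begins with the SentencePiece word-start marker "▁" or is tagged PUNCT, and otherwise gluing the subtoken onto the previous word while keeping that word's first tag. The sketch below re-runs the same logic on hand-written mock data in the shape the loop expects (dicts with "word" and "entity" keys); the helper name merge_subtokens and the sample groups are hypothetical, not taken from the app or from a real model run.

def merge_subtokens(groups):
    # Same aggregation as in tag(): split on "▁"-prefixed words and PUNCT,
    # merge any other subtoken into the previous word, keep its first tag.
    out = []
    for g in groups:
        if g["word"][0] == "▁" or g["entity"] == "PUNCT":
            out.append((g["word"].lstrip("▁"), g["entity"]))
        else:
            out[-1] = (out[-1][0] + g["word"], out[-1][1])
    return out

# Hypothetical pipeline-style output for the text "Unbelievable."
groups = [
    {"word": "▁Un", "entity": "ADJ"},
    {"word": "believable", "entity": "ADJ"},
    {"word": ".", "entity": "PUNCT"},
]

print(merge_subtokens(groups))
# [('Unbelievable', 'ADJ'), ('.', 'PUNCT')]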