Update spaCy pipeline

Files changed (7) hide show

README.md CHANGED Viewed

@@ -24,7 +24,7 @@ model-index:
 | Feature | Description |
 | --- | --- |
 | **Name** | `en_setec_mk_tv` |
-| **Version** | `0.0.1` |
 | **spaCy** | `>=3.7.5,<3.8.0` |
 | **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
 | **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
@@ -51,4 +51,6 @@ model-index:
 | --- | --- |
 | `ENTS_F` | 99.18 |
 | `ENTS_P` | 99.20 |
-| `ENTS_R` | 99.16 |

 | Feature | Description |
 | --- | --- |
 | **Name** | `en_setec_mk_tv` |
+| **Version** | `0.0.2` |
 | **spaCy** | `>=3.7.5,<3.8.0` |
 | **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
 | **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
 | --- | --- |
 | `ENTS_F` | 99.18 |
 | `ENTS_P` | 99.20 |
+| `ENTS_R` | 99.16 |
+| `TOK2VEC_LOSS` | 49774.20 |
+| `NER_LOSS` | 66917.02 |

config.cfg CHANGED Viewed

@@ -31,7 +31,7 @@ factory = "feature_aggregator_component"
 [components.feature_aggregator_component.config]
 [components.feature_aggregator_component.config.AUDIO_FEATURE]
-method = "first"
 [components.feature_aggregator_component.config.COLOR]
 method = "join"

 [components.feature_aggregator_component.config]
 [components.feature_aggregator_component.config.AUDIO_FEATURE]
+method = "expand"
 [components.feature_aggregator_component.config.COLOR]
 method = "join"

count_extraction_component.py CHANGED Viewed

@@ -7,6 +7,7 @@ import re
 # https://spacy.io/usage/processing-pipelines#custom-components
 @Language.factory("count_extraction_component")
 class CountExtractorComponent(object):
     def __init__(self, nlp, name, label="CONNECTION"):
         self.label = label
         self.reg_left           = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
@@ -16,8 +17,6 @@ class CountExtractorComponent(object):
         # set extensions to tokens, spans and docs
         Span.set_extension("count", default=None, force=True)
         Span.set_extension("text",  default=None, force=True)
-        Doc.set_extension("connections", getter=self.connections, force=True)
     def __call__(self, doc):
         for ent in doc.ents:
@@ -50,15 +49,3 @@ class CountExtractorComponent(object):
             ent._.text = text
             ent._.count = 1
         return doc
-    def connections(self, doc):
-        connections = {}
-        for ent in doc.ents:
-            if ent._.count is None:
-                continue
-            if ent._.text not in connections:
-                connections[ent._.text] = ent._.count
-                continue
-            connections[ent._.text] += ent._.count
-        return connections

 # https://spacy.io/usage/processing-pipelines#custom-components
 @Language.factory("count_extraction_component")
 class CountExtractorComponent(object):
+    # By default, counts are extracted only from the CONNECTION label, but this can be changed via `label`.
     def __init__(self, nlp, name, label="CONNECTION"):
         self.label = label
         self.reg_left           = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
         # set extensions to tokens, spans and docs
         Span.set_extension("count", default=None, force=True)
         Span.set_extension("text",  default=None, force=True)
     def __call__(self, doc):
         for ent in doc.ents:
             ent._.text = text
             ent._.count = 1
         return doc

en_setec_mk_tv-any-py3-none-any.whl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9dab2aad0666f0c1ff0c0fe028dd6821004e75a6ad991cafe84241a639bcf4eb
-size 5709065

 version https://git-lfs.github.com/spec/v1
+oid sha256:3ed6b65aefd826e2c8c35c2e60bf17d929e5eaf7f450d965b260c1e1a5e1ea7f
+size 5709306

feature_aggregator_component.py CHANGED Viewed

@@ -2,13 +2,15 @@ from spacy.tokens import Doc, Span, Token
 from spacy.language import Language
 import pandas as pd
 default_feature_aggregation_config = {
-    'AUDIO_FEATURE': {
-        'method': 'first',
-    },
-    'COLOR': {
-        'method': 'join',
-    },
     'INCH': {
         'method': 'first',
     },
@@ -30,6 +32,12 @@ default_feature_aggregation_config = {
     'VIDEO_FEATURE': {
         'method': 'expand',
     },
     'WIRELESS_FEATURE': {
         'method': 'expand',
     },

 from spacy.language import Language
 import pandas as pd
+# Default modes for feature extraction for the labels
+#
+# There are three methods:
+# - `first`: Takes the first occurrence and stops. This is useful for features like resolution:
+#            if the resolution is repeated, we only want the first occurrence.
+# - `expand`: This effectively does OneHot encoding where the feature value names
+#             become columns and 1 is put if the feature is there.
+# - `join`: This concatenates the feature values under the feature label.
 default_feature_aggregation_config = {
     'INCH': {
         'method': 'first',
     },
     'VIDEO_FEATURE': {
         'method': 'expand',
     },
+    'AUDIO_FEATURE': {
+        'method': 'expand',
+    },
+    'COLOR': {
+        'method': 'join',
+    },
     'WIRELESS_FEATURE': {
         'method': 'expand',
     },

meta.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "lang":"en",
   "name":"setec_mk_tv",
-  "version":"0.0.1",
   "description":"",
   "author":"",
   "email":"",

 {
   "lang":"en",
   "name":"setec_mk_tv",
+  "version":"0.0.2",
   "description":"",
   "author":"",
   "email":"",

normalizer_component.py CHANGED Viewed

@@ -3,6 +3,7 @@ import json
 from spacy.language import Language
 from spacy.matcher import PhraseMatcher
 default_normalization_table = {
     "Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
     "Ethernet": [

 from spacy.language import Language
 from spacy.matcher import PhraseMatcher
+# Default normalization table; it can be customized by passing a different table to the component as a parameter.
 default_normalization_table = {
     "Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
     "Ethernet": [