Update spaCy pipeline
Browse files- README.md +4 -2
- config.cfg +1 -1
- count_extraction_component.py +1 -14
- en_setec_mk_tv-any-py3-none-any.whl +2 -2
- feature_aggregator_component.py +14 -6
- meta.json +1 -1
- normalizer_component.py +1 -0
README.md
CHANGED
@@ -24,7 +24,7 @@ model-index:
|
|
24 |
| Feature | Description |
|
25 |
| --- | --- |
|
26 |
| **Name** | `en_setec_mk_tv` |
|
27 |
-
| **Version** | `0.0.
|
28 |
| **spaCy** | `>=3.7.5,<3.8.0` |
|
29 |
| **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
30 |
| **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
@@ -51,4 +51,6 @@ model-index:
|
|
51 |
| --- | --- |
|
52 |
| `ENTS_F` | 99.18 |
|
53 |
| `ENTS_P` | 99.20 |
|
54 |
-
| `ENTS_R` | 99.16 |
|
|
|
|
|
|
24 |
| Feature | Description |
|
25 |
| --- | --- |
|
26 |
| **Name** | `en_setec_mk_tv` |
|
27 |
+
| **Version** | `0.0.2` |
|
28 |
| **spaCy** | `>=3.7.5,<3.8.0` |
|
29 |
| **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
30 |
| **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
|
|
51 |
| --- | --- |
|
52 |
| `ENTS_F` | 99.18 |
|
53 |
| `ENTS_P` | 99.20 |
|
54 |
+
| `ENTS_R` | 99.16 |
|
55 |
+
| `TOK2VEC_LOSS` | 49774.20 |
|
56 |
+
| `NER_LOSS` | 66917.02 |
|
config.cfg
CHANGED
@@ -31,7 +31,7 @@ factory = "feature_aggregator_component"
|
|
31 |
[components.feature_aggregator_component.config]
|
32 |
|
33 |
[components.feature_aggregator_component.config.AUDIO_FEATURE]
|
34 |
-
method = "
|
35 |
|
36 |
[components.feature_aggregator_component.config.COLOR]
|
37 |
method = "join"
|
|
|
31 |
[components.feature_aggregator_component.config]
|
32 |
|
33 |
[components.feature_aggregator_component.config.AUDIO_FEATURE]
|
34 |
+
method = "expand"
|
35 |
|
36 |
[components.feature_aggregator_component.config.COLOR]
|
37 |
method = "join"
|
count_extraction_component.py
CHANGED
@@ -7,6 +7,7 @@ import re
|
|
7 |
# https://spacy.io/usage/processing-pipelines#custom-components
|
8 |
@Language.factory("count_extraction_component")
|
9 |
class CountExtractorComponent(object):
|
|
|
10 |
def __init__(self, nlp, name, label="CONNECTION"):
|
11 |
self.label = label
|
12 |
self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
|
@@ -16,8 +17,6 @@ class CountExtractorComponent(object):
|
|
16 |
# set extensions to tokens, spans and docs
|
17 |
Span.set_extension("count", default=None, force=True)
|
18 |
Span.set_extension("text", default=None, force=True)
|
19 |
-
|
20 |
-
Doc.set_extension("connections", getter=self.connections, force=True)
|
21 |
|
22 |
def __call__(self, doc):
|
23 |
for ent in doc.ents:
|
@@ -50,15 +49,3 @@ class CountExtractorComponent(object):
|
|
50 |
ent._.text = text
|
51 |
ent._.count = 1
|
52 |
return doc
|
53 |
-
|
54 |
-
def connections(self, doc):
|
55 |
-
connections = {}
|
56 |
-
for ent in doc.ents:
|
57 |
-
if ent._.count is None:
|
58 |
-
continue
|
59 |
-
|
60 |
-
if ent._.text not in connections:
|
61 |
-
connections[ent._.text] = ent._.count
|
62 |
-
continue
|
63 |
-
connections[ent._.text] += ent._.count
|
64 |
-
return connections
|
|
|
7 |
# https://spacy.io/usage/processing-pipelines#custom-components
|
8 |
@Language.factory("count_extraction_component")
|
9 |
class CountExtractorComponent(object):
|
10 |
+
# By default it only extracts the count from the CONNECTION type, but this can be changed via `label`.
|
11 |
def __init__(self, nlp, name, label="CONNECTION"):
|
12 |
self.label = label
|
13 |
self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
|
|
|
17 |
# set extensions to tokens, spans and docs
|
18 |
Span.set_extension("count", default=None, force=True)
|
19 |
Span.set_extension("text", default=None, force=True)
|
|
|
|
|
20 |
|
21 |
def __call__(self, doc):
|
22 |
for ent in doc.ents:
|
|
|
49 |
ent._.text = text
|
50 |
ent._.count = 1
|
51 |
return doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
en_setec_mk_tv-any-py3-none-any.whl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3ed6b65aefd826e2c8c35c2e60bf17d929e5eaf7f450d965b260c1e1a5e1ea7f
|
3 |
+
size 5709306
|
feature_aggregator_component.py
CHANGED
@@ -2,13 +2,15 @@ from spacy.tokens import Doc, Span, Token
|
|
2 |
from spacy.language import Language
|
3 |
import pandas as pd
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
default_feature_aggregation_config = {
|
6 |
-
'AUDIO_FEATURE': {
|
7 |
-
'method': 'first',
|
8 |
-
},
|
9 |
-
'COLOR': {
|
10 |
-
'method': 'join',
|
11 |
-
},
|
12 |
'INCH': {
|
13 |
'method': 'first',
|
14 |
},
|
@@ -30,6 +32,12 @@ default_feature_aggregation_config = {
|
|
30 |
'VIDEO_FEATURE': {
|
31 |
'method': 'expand',
|
32 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
'WIRELESS_FEATURE': {
|
34 |
'method': 'expand',
|
35 |
},
|
|
|
2 |
from spacy.language import Language
|
3 |
import pandas as pd
|
4 |
|
5 |
+
# Default modes for feature extraction for the labels
|
6 |
+
#
|
7 |
+
# There are three methods:
|
8 |
+
# - `first`: Gets the first occurrence and stops; this is nice for features like resolution,
|
9 |
+
# if the resolution is repeated we just want the first occurrence.
|
10 |
+
# - `expand`: This effectively does OneHot encoding where the feature value names
|
11 |
+
# become columns and 1 is put if the feature is there.
|
12 |
+
# - `join`: This concatenates the feature values under the feature label.
|
13 |
default_feature_aggregation_config = {
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
'INCH': {
|
15 |
'method': 'first',
|
16 |
},
|
|
|
32 |
'VIDEO_FEATURE': {
|
33 |
'method': 'expand',
|
34 |
},
|
35 |
+
'AUDIO_FEATURE': {
|
36 |
+
'method': 'expand',
|
37 |
+
},
|
38 |
+
'COLOR': {
|
39 |
+
'method': 'join',
|
40 |
+
},
|
41 |
'WIRELESS_FEATURE': {
|
42 |
'method': 'expand',
|
43 |
},
|
meta.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
{
|
2 |
"lang":"en",
|
3 |
"name":"setec_mk_tv",
|
4 |
-
"version":"0.0.
|
5 |
"description":"",
|
6 |
"author":"",
|
7 |
"email":"",
|
|
|
1 |
{
|
2 |
"lang":"en",
|
3 |
"name":"setec_mk_tv",
|
4 |
+
"version":"0.0.2",
|
5 |
"description":"",
|
6 |
"author":"",
|
7 |
"email":"",
|
normalizer_component.py
CHANGED
@@ -3,6 +3,7 @@ import json
|
|
3 |
from spacy.language import Language
|
4 |
from spacy.matcher import PhraseMatcher
|
5 |
|
|
|
6 |
default_normalization_table = {
|
7 |
"Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
|
8 |
"Ethernet": [
|
|
|
3 |
from spacy.language import Language
|
4 |
from spacy.matcher import PhraseMatcher
|
5 |
|
6 |
+
# Default normalization table; it can be customized by passing a different table to the component as a parameter.
|
7 |
default_normalization_table = {
|
8 |
"Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
|
9 |
"Ethernet": [
|