Update spaCy pipeline
Browse files- README.md +2 -2
- config.cfg +37 -2
- en_setec_mk_tv-any-py3-none-any.whl +2 -2
- feature_aggregator_component.py +101 -0
- meta.json +4 -2
- normalizer_component.py +8 -8
README.md
CHANGED
@@ -26,8 +26,8 @@ model-index:
|
|
26 |
| **Name** | `en_setec_mk_tv` |
|
27 |
| **Version** | `0.0.0` |
|
28 |
| **spaCy** | `>=3.7.5,<3.8.0` |
|
29 |
-
| **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component` |
|
30 |
-
| **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component` |
|
31 |
| **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
|
32 |
| **Sources** | n/a |
|
33 |
| **License** | n/a |
|
|
|
26 |
| **Name** | `en_setec_mk_tv` |
|
27 |
| **Version** | `0.0.0` |
|
28 |
| **spaCy** | `>=3.7.5,<3.8.0` |
|
29 |
+
| **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
30 |
+
| **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
|
31 |
| **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
|
32 |
| **Sources** | n/a |
|
33 |
| **License** | n/a |
|
config.cfg
CHANGED
@@ -10,7 +10,7 @@ seed = 0
|
|
10 |
|
11 |
[nlp]
|
12 |
lang = "en"
|
13 |
-
pipeline = ["tok2vec","ner","count_extraction_component","normalizer_component"]
|
14 |
batch_size = 1000
|
15 |
disabled = []
|
16 |
before_creation = null
|
@@ -25,6 +25,41 @@ vectors = {"@vectors":"spacy.Vectors.v1"}
|
|
25 |
factory = "count_extraction_component"
|
26 |
label = "CONNECTION"
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
[components.ner]
|
29 |
factory = "ner"
|
30 |
incorrect_spans_key = null
|
@@ -48,7 +83,7 @@ upstream = "*"
|
|
48 |
|
49 |
[components.normalizer_component]
|
50 |
factory = "normalizer_component"
|
51 |
-
|
52 |
|
53 |
[components.tok2vec]
|
54 |
factory = "tok2vec"
|
|
|
10 |
|
11 |
[nlp]
|
12 |
lang = "en"
|
13 |
+
pipeline = ["tok2vec","ner","count_extraction_component","normalizer_component","feature_aggregator_component"]
|
14 |
batch_size = 1000
|
15 |
disabled = []
|
16 |
before_creation = null
|
|
|
25 |
factory = "count_extraction_component"
|
26 |
label = "CONNECTION"
|
27 |
|
28 |
+
[components.feature_aggregator_component]
|
29 |
+
factory = "feature_aggregator_component"
|
30 |
+
|
31 |
+
[components.feature_aggregator_component.config]
|
32 |
+
|
33 |
+
[components.feature_aggregator_component.config.AUDIO_FEATURE]
|
34 |
+
method = "first"
|
35 |
+
|
36 |
+
[components.feature_aggregator_component.config.COLOR]
|
37 |
+
method = "join"
|
38 |
+
|
39 |
+
[components.feature_aggregator_component.config.INCH]
|
40 |
+
method = "first"
|
41 |
+
|
42 |
+
[components.feature_aggregator_component.config.MOUNTING_FEATURE]
|
43 |
+
method = "join"
|
44 |
+
|
45 |
+
[components.feature_aggregator_component.config.OS]
|
46 |
+
method = "first"
|
47 |
+
|
48 |
+
[components.feature_aggregator_component.config.REFRESH_RATE]
|
49 |
+
method = "first"
|
50 |
+
|
51 |
+
[components.feature_aggregator_component.config.RESOLUTION]
|
52 |
+
method = "first"
|
53 |
+
|
54 |
+
[components.feature_aggregator_component.config.SOFTWARE_FEATURE]
|
55 |
+
method = "expand"
|
56 |
+
|
57 |
+
[components.feature_aggregator_component.config.VIDEO_FEATURE]
|
58 |
+
method = "expand"
|
59 |
+
|
60 |
+
[components.feature_aggregator_component.config.WIRELESS_FEATURE]
|
61 |
+
method = "expand"
|
62 |
+
|
63 |
[components.ner]
|
64 |
factory = "ner"
|
65 |
incorrect_spans_key = null
|
|
|
83 |
|
84 |
[components.normalizer_component]
|
85 |
factory = "normalizer_component"
|
86 |
+
norms = null
|
87 |
|
88 |
[components.tok2vec]
|
89 |
factory = "tok2vec"
|
en_setec_mk_tv-any-py3-none-any.whl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:95caa99fc72a38765852ae3e92072e5c28bf4357f166a16a2cfff6969b5c03e9
|
3 |
+
size 5709056
|
feature_aggregator_component.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from spacy.tokens import Doc, Span, Token
|
2 |
+
from spacy.language import Language
|
3 |
+
|
4 |
+
# Default aggregation strategy per entity label, used by
# FeatureAggregatorComponent when no explicit config is supplied.
# Each label maps to a settings dict with a single 'method' entry:
#   'first'  - keep a single value for the label
#   'join'   - comma-join all values into one string
#   'expand' - emit one feature per value, each set to 1
default_feature_aggregation_config = {
    label: {'method': method}
    for label, method in (
        ('AUDIO_FEATURE', 'first'),
        ('COLOR', 'join'),
        ('INCH', 'first'),
        ('MOUNTING_FEATURE', 'join'),
        ('OS', 'first'),
        ('REFRESH_RATE', 'first'),
        ('RESOLUTION', 'first'),
        ('SOFTWARE_FEATURE', 'expand'),
        ('VIDEO_FEATURE', 'expand'),
        ('WIRELESS_FEATURE', 'expand'),
    )
}
|
36 |
+
|
37 |
+
@Language.factory("feature_aggregator_component")
class FeatureAggregatorComponent(object):
    """Pipeline component exposing aggregated entity features on ``Doc``.

    Calling the component returns the document unchanged. Its work happens
    through the ``Doc`` extensions registered in ``__init__``:
    ``raw_features``, ``features``, ``add_to_dataframe`` and
    ``feature_aggregation_config``, all backed by this instance.

    NOTE(review): relies on the ``ent._.count`` and ``ent._.text`` span
    extensions, presumably registered by the count-extraction and normalizer
    components earlier in the pipeline -- confirm.
    """

    def __init__(self, nlp, name, config=default_feature_aggregation_config):
        """Store the aggregation config and register the ``Doc`` extensions.

        Args:
            nlp: Pipeline object supplied by the factory (unused here).
            name: Component name supplied by the factory (unused here).
            config: Mapping of feature name to a settings dict whose optional
                ``'method'`` entry is ``'first'``, ``'join'`` or ``'expand'``.
                ``None`` selects the module default.
        """
        import copy

        if config is None:
            config = default_feature_aggregation_config
        # Work on a private copy so that edits made through
        # ``doc._.feature_aggregation_config`` never leak into the shared
        # module-level default (or into the caller's dict).
        self.config = copy.deepcopy(config)
        Doc.set_extension("raw_features", getter=self.raw_features, force=True)
        Doc.set_extension("features", getter=self.features, force=True)
        Doc.set_extension("add_to_dataframe", method=self.add_to_dataframe, force=True)
        Doc.set_extension(
            "feature_aggregation_config",
            getter=self.get_feature_aggregation_config,
            setter=self.set_feature_aggregation_config,
            force=True,
        )

    def __call__(self, doc):
        """Return ``doc`` unchanged; features are computed lazily by getters."""
        return doc

    def get_feature_aggregation_config(self, doc):
        """Return the component's current aggregation config."""
        return self.config

    def set_feature_aggregation_config(self, doc, config):
        """Replace the aggregation config.

        The config lives on the component, not on ``doc``, so the new value
        applies to every document read through this component afterwards.
        """
        self.config = config

    def _collect(self, doc):
        """Group entity values per feature, preserving document order.

        Entities without a count are grouped under their label in a dict used
        as an insertion-ordered set (value text -> None). Entities with a
        count are keyed by their text and the counts are summed.
        """
        collected = {}
        for ent in doc.ents:
            if ent._.count is None:
                collected.setdefault(ent.label_, {})[ent._.text] = None
            else:
                # If it has a count we put it in a separate column and
                # accumulate the counts.
                collected[ent._.text] = collected.get(ent._.text, 0) + ent._.count
        return collected

    def raw_features(self, doc):
        """Return the un-aggregated features of ``doc``.

        Returns:
            dict mapping each entity label to the set of its values, plus one
            entry per counted entity text mapping to its summed count.
        """
        return {
            name: set(values) if isinstance(values, dict) else values
            for name, values in self._collect(doc).items()
        }

    def features(self, doc):
        """Return the features of ``doc`` aggregated per the current config.

        For each configured feature name the ``'method'`` setting decides:

        * ``'first'``  -- the first value in document order (NaN if none);
        * ``'join'``   -- all distinct values, comma-joined in document order;
        * ``'expand'`` -- one entry per value, each set to ``1``.

        Features without a config entry or without a ``'method'``, features
        with an unknown method, and accumulated counts are passed through
        unchanged.
        """
        aggregated = {}
        for name, values in self._collect(doc).items():
            if not isinstance(values, dict):
                # Accumulated count: there is nothing to aggregate, and the
                # set-oriented methods below would raise on an int.
                aggregated[name] = values
                continue

            if name not in self.config or 'method' not in self.config[name]:
                aggregated[name] = set(values)
                continue

            method = self.config[name]["method"]
            if method == 'first':
                # Ordered keys make this reproducible, unlike popping from a
                # set, whose order depends on string hashing.
                aggregated[name] = next(iter(values), float('nan'))
            elif method == 'join':
                aggregated[name] = ','.join(values)
            elif method == 'expand':
                for value in values:
                    aggregated[value] = 1
            else:
                print(f"unknown feature aggregation method: {method}, skipping...")
                aggregated[name] = set(values)
        return aggregated

    def add_to_dataframe(self, doc, df):
        """Return ``df`` with one row appended holding the features of ``doc``.

        ``df`` itself is not modified. The appended row keeps its own index
        (the concatenation does not renumber the index).
        """
        # Imported here because the module does not import pandas and only
        # this method needs it.
        import pandas as pd

        row = {name: [value] for name, value in self.features(doc).items()}
        return pd.concat([df, pd.DataFrame(row)])
|
meta.json
CHANGED
@@ -37,13 +37,15 @@
|
|
37 |
"tok2vec",
|
38 |
"ner",
|
39 |
"count_extraction_component",
|
40 |
-
"normalizer_component"
|
|
|
41 |
],
|
42 |
"components":[
|
43 |
"tok2vec",
|
44 |
"ner",
|
45 |
"count_extraction_component",
|
46 |
-
"normalizer_component"
|
|
|
47 |
],
|
48 |
"disabled":[
|
49 |
|
|
|
37 |
"tok2vec",
|
38 |
"ner",
|
39 |
"count_extraction_component",
|
40 |
+
"normalizer_component",
|
41 |
+
"feature_aggregator_component"
|
42 |
],
|
43 |
"components":[
|
44 |
"tok2vec",
|
45 |
"ner",
|
46 |
"count_extraction_component",
|
47 |
+
"normalizer_component",
|
48 |
+
"feature_aggregator_component"
|
49 |
],
|
50 |
"disabled":[
|
51 |
|
normalizer_component.py
CHANGED
@@ -50,19 +50,19 @@ default_normalization_table = {
|
|
50 |
"1280x720": ["HD"],
|
51 |
"640x480": ["SD"],
|
52 |
"Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
|
53 |
-
"
|
54 |
}
|
55 |
|
56 |
|
57 |
@Language.factory("normalizer_component")
|
58 |
class NormalizerComponent(object):
|
59 |
-
def __init__(self, nlp, name,
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
|
67 |
self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
|
68 |
self.nlp = nlp
|
|
|
50 |
"1280x720": ["HD"],
|
51 |
"640x480": ["SD"],
|
52 |
"Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
|
53 |
+
"Blutooth": ["BLUETOOTH"],
|
54 |
}
|
55 |
|
56 |
|
57 |
@Language.factory("normalizer_component")
|
58 |
class NormalizerComponent(object):
|
59 |
+
def __init__(self, nlp, name, norms=None):
|
60 |
+
if norms is None:
|
61 |
+
self.norm_table = default_normalization_table
|
62 |
+
elif isinstance(norms, str):
|
63 |
+
self.norm_table = json.load(open(norms))
|
64 |
+
else:
|
65 |
+
self.norm_table = norms
|
66 |
|
67 |
self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
|
68 |
self.nlp = nlp
|