oh201516 committed on
Commit
0044d08
1 Parent(s): 864cb91

Update spaCy pipeline

Browse files
README.md CHANGED
@@ -26,8 +26,8 @@ model-index:
26
  | **Name** | `en_setec_mk_tv` |
27
  | **Version** | `0.0.0` |
28
  | **spaCy** | `>=3.7.5,<3.8.0` |
29
- | **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component` |
30
- | **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component` |
31
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
32
  | **Sources** | n/a |
33
  | **License** | n/a |
 
26
  | **Name** | `en_setec_mk_tv` |
27
  | **Version** | `0.0.0` |
28
  | **spaCy** | `>=3.7.5,<3.8.0` |
29
+ | **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
30
+ | **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
31
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
32
  | **Sources** | n/a |
33
  | **License** | n/a |
config.cfg CHANGED
@@ -10,7 +10,7 @@ seed = 0
10
 
11
  [nlp]
12
  lang = "en"
13
- pipeline = ["tok2vec","ner","count_extraction_component","normalizer_component"]
14
  batch_size = 1000
15
  disabled = []
16
  before_creation = null
@@ -25,6 +25,41 @@ vectors = {"@vectors":"spacy.Vectors.v1"}
25
  factory = "count_extraction_component"
26
  label = "CONNECTION"
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  [components.ner]
29
  factory = "ner"
30
  incorrect_spans_key = null
@@ -48,7 +83,7 @@ upstream = "*"
48
 
49
  [components.normalizer_component]
50
  factory = "normalizer_component"
51
- norm_file = null
52
 
53
  [components.tok2vec]
54
  factory = "tok2vec"
 
10
 
11
  [nlp]
12
  lang = "en"
13
+ pipeline = ["tok2vec","ner","count_extraction_component","normalizer_component","feature_aggregator_component"]
14
  batch_size = 1000
15
  disabled = []
16
  before_creation = null
 
25
  factory = "count_extraction_component"
26
  label = "CONNECTION"
27
 
28
+ [components.feature_aggregator_component]
29
+ factory = "feature_aggregator_component"
30
+
31
+ [components.feature_aggregator_component.config]
32
+
33
+ [components.feature_aggregator_component.config.AUDIO_FEATURE]
34
+ method = "first"
35
+
36
+ [components.feature_aggregator_component.config.COLOR]
37
+ method = "join"
38
+
39
+ [components.feature_aggregator_component.config.INCH]
40
+ method = "first"
41
+
42
+ [components.feature_aggregator_component.config.MOUNTING_FEATURE]
43
+ method = "join"
44
+
45
+ [components.feature_aggregator_component.config.OS]
46
+ method = "first"
47
+
48
+ [components.feature_aggregator_component.config.REFRESH_RATE]
49
+ method = "first"
50
+
51
+ [components.feature_aggregator_component.config.RESOLUTION]
52
+ method = "first"
53
+
54
+ [components.feature_aggregator_component.config.SOFTWARE_FEATURE]
55
+ method = "expand"
56
+
57
+ [components.feature_aggregator_component.config.VIDEO_FEATURE]
58
+ method = "expand"
59
+
60
+ [components.feature_aggregator_component.config.WIRELESS_FEATURE]
61
+ method = "expand"
62
+
63
  [components.ner]
64
  factory = "ner"
65
  incorrect_spans_key = null
 
83
 
84
  [components.normalizer_component]
85
  factory = "normalizer_component"
86
+ norms = null
87
 
88
  [components.tok2vec]
89
  factory = "tok2vec"
en_setec_mk_tv-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa2fe9279a4fe64e62863259d1b3a8412df014f391706cd345415ea9776b3d27
3
- size 5707735
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95caa99fc72a38765852ae3e92072e5c28bf4357f166a16a2cfff6969b5c03e9
3
+ size 5709056
feature_aggregator_component.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from spacy.tokens import Doc, Span, Token
2
+ from spacy.language import Language
3
+
4
# Default aggregation strategy per entity label. Recognised methods (see
# FeatureAggregatorComponent.features):
#   'first'  - keep a single value for the label
#   'join'   - comma-join all values for the label into one string
#   'expand' - emit one indicator feature (value 1) per distinct value
default_feature_aggregation_config = {
    'AUDIO_FEATURE': {
        'method': 'first',
    },
    'COLOR': {
        'method': 'join',
    },
    'INCH': {
        'method': 'first',
    },
    'MOUNTING_FEATURE': {
        'method': 'join',
    },
    'OS': {
        'method': 'first',
    },
    'REFRESH_RATE': {
        'method': 'first',
    },
    'RESOLUTION': {
        'method': 'first',
    },
    'SOFTWARE_FEATURE': {
        'method': 'expand',
    },
    'VIDEO_FEATURE': {
        'method': 'expand',
    },
    'WIRELESS_FEATURE': {
        'method': 'expand',
    },
}


@Language.factory("feature_aggregator_component")
class FeatureAggregatorComponent(object):
    """Pipeline component that exposes a doc's entities as a feature dict.

    The component leaves the ``Doc`` untouched when called; all of its work
    happens lazily through the custom ``Doc`` extensions registered in
    ``__init__`` (``raw_features``, ``features``, ``add_to_dataframe`` and
    ``feature_aggregation_config``).

    NOTE(review): the code reads ``ent._.count`` and ``ent._.text``, which are
    not registered in this file - presumably they come from the upstream
    ``count_extraction_component`` / ``normalizer_component``; verify.
    """

    def __init__(self, nlp, name, config=default_feature_aggregation_config):
        """Store the aggregation config and register the ``Doc`` extensions.

        Args:
            nlp: passed in by the factory mechanism; not used here.
            name: passed in by the factory mechanism; not used here.
            config: mapping of entity label -> ``{'method': ...}``.
        """
        self.config = config
        # force=True overwrites any earlier registration of the same name, so
        # the extensions are bound to the most recently created instance.
        Doc.set_extension("raw_features", getter=self.raw_features, force=True)
        Doc.set_extension("features", getter=self.features, force=True)
        Doc.set_extension("add_to_dataframe", method=self.add_to_dataframe, force=True)
        Doc.set_extension(
            "feature_aggregation_config",
            getter=self.get_feature_aggregation_config,
            setter=self.set_feature_aggregation_config,
            force=True,
        )

    def __call__(self, doc):
        """Return ``doc`` unchanged; features are computed on attribute access."""
        return doc

    def get_feature_aggregation_config(self, doc):
        """Getter for ``doc._.feature_aggregation_config``."""
        return self.config

    def set_feature_aggregation_config(self, doc, config):
        """Setter for ``doc._.feature_aggregation_config``.

        Replaces the config on the component itself, so the change applies to
        every doc served by this instance, not only to ``doc``.
        """
        self.config = config

    def _collect_features(self, doc):
        """Gather entity values in document order.

        Returns a dict in which
          * an entity without a count is stored under its label as a dict used
            as an insertion-ordered set (keys are the values, in order of first
            appearance), and
          * an entity with a count is stored under its own text as the running
            sum of its counts.
        """
        collected = {}
        for ent in doc.ents:
            if ent._.count is None:
                collected.setdefault(ent.label_, {})[ent._.text] = None
            else:
                # If it has a count we put it in a separate column and
                # accumulate the counts.
                collected[ent._.text] = collected.get(ent._.text, 0) + ent._.count
        return collected

    def raw_features(self, doc):
        """Return un-aggregated features: ``label -> set of values`` for plain
        entities and ``text -> summed count`` for counted entities."""
        return {
            name: set(values) if isinstance(values, dict) else values
            for name, values in self._collect_features(doc).items()
        }

    def features(self, doc):
        """Return the features of ``doc`` aggregated according to ``self.config``.

        Labels without a config entry (or without a ``'method'`` key), and
        labels whose method is unknown, are passed through as a set. Counted
        features are always passed through as their integer total.
        """
        features = {}
        for name, values in self._collect_features(doc).items():
            if not isinstance(values, dict):
                # Counted feature: a plain number, nothing to aggregate. Guarding
                # here avoids a TypeError if its text collides with a label key.
                features[name] = values
                continue

            if name not in self.config or 'method' not in self.config[name]:
                features[name] = set(values)
                continue

            method = self.config[name]["method"]
            if method == 'first':
                # First value in document order (deterministic, unlike
                # popping an arbitrary element from a set).
                features[name] = next(iter(values)) if values else float('nan')
            elif method == 'join':
                # Joined in document order so the string is reproducible.
                features[name] = ','.join(values)
            elif method == 'expand':
                for value in values:
                    features[value] = 1
            else:
                print(f"unknown feature aggregation method: {method}, skipping...")
                features[name] = set(values)
        return features

    def add_to_dataframe(self, doc, df):
        """Append the features of ``doc`` to ``df`` as one row.

        Returns a new DataFrame built with ``pd.concat``; ``df`` itself is not
        modified.
        """
        # Imported here because the module never imported pandas at top level.
        import pandas as pd

        # Wrap every value in a list so each feature becomes a one-row column.
        row = {name: [feature] for name, feature in self.features(doc).items()}
        return pd.concat([df, pd.DataFrame(row)])
meta.json CHANGED
@@ -37,13 +37,15 @@
37
  "tok2vec",
38
  "ner",
39
  "count_extraction_component",
40
- "normalizer_component"
 
41
  ],
42
  "components":[
43
  "tok2vec",
44
  "ner",
45
  "count_extraction_component",
46
- "normalizer_component"
 
47
  ],
48
  "disabled":[
49
 
 
37
  "tok2vec",
38
  "ner",
39
  "count_extraction_component",
40
+ "normalizer_component",
41
+ "feature_aggregator_component"
42
  ],
43
  "components":[
44
  "tok2vec",
45
  "ner",
46
  "count_extraction_component",
47
+ "normalizer_component",
48
+ "feature_aggregator_component"
49
  ],
50
  "disabled":[
51
 
normalizer_component.py CHANGED
@@ -50,19 +50,19 @@ default_normalization_table = {
50
  "1280x720": ["HD"],
51
  "640x480": ["SD"],
52
  "Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
53
- "BLUETOOTH": ["BLUETOOTH", "Blutooth"],
54
  }
55
 
56
 
57
  @Language.factory("normalizer_component")
58
  class NormalizerComponent(object):
59
- def __init__(self, nlp, name, norm_file=None):
60
- # if norm_file is None:
61
- self.norm_table = default_normalization_table
62
- # elif isinstance(norm_file, object):
63
- # self.norm_table = norm_file
64
- # else:
65
- # self.norm_table = json.load(open(norm_file))
66
 
67
  self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
68
  self.nlp = nlp
 
50
  "1280x720": ["HD"],
51
  "640x480": ["SD"],
52
  "Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
53
+ "Blutooth": ["BLUETOOTH"],
54
  }
55
 
56
 
57
  @Language.factory("normalizer_component")
58
  class NormalizerComponent(object):
59
+ def __init__(self, nlp, name, norms=None):
60
+ if norms is None:
61
+ self.norm_table = default_normalization_table
62
+ elif isinstance(norms, str):
63
+ self.norm_table = json.load(open(norms))
64
+ else:
65
+ self.norm_table = norms
66
 
67
  self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
68
  self.nlp = nlp