oh201516 committed on
Commit
ecea783
·
verified ·
1 Parent(s): a0572d5

Update spaCy pipeline

Browse files
README.md CHANGED
@@ -24,7 +24,7 @@ model-index:
24
  | Feature | Description |
25
  | --- | --- |
26
  | **Name** | `en_setec_mk_tv` |
27
- | **Version** | `0.0.1` |
28
  | **spaCy** | `>=3.7.5,<3.8.0` |
29
  | **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
30
  | **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
@@ -51,4 +51,6 @@ model-index:
51
  | --- | --- |
52
  | `ENTS_F` | 99.18 |
53
  | `ENTS_P` | 99.20 |
54
- | `ENTS_R` | 99.16 |
 
 
 
24
  | Feature | Description |
25
  | --- | --- |
26
  | **Name** | `en_setec_mk_tv` |
27
+ | **Version** | `0.0.2` |
28
  | **spaCy** | `>=3.7.5,<3.8.0` |
29
  | **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
30
  | **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component`, `feature_aggregator_component` |
 
51
  | --- | --- |
52
  | `ENTS_F` | 99.18 |
53
  | `ENTS_P` | 99.20 |
54
+ | `ENTS_R` | 99.16 |
55
+ | `TOK2VEC_LOSS` | 49774.20 |
56
+ | `NER_LOSS` | 66917.02 |
config.cfg CHANGED
@@ -31,7 +31,7 @@ factory = "feature_aggregator_component"
31
  [components.feature_aggregator_component.config]
32
 
33
  [components.feature_aggregator_component.config.AUDIO_FEATURE]
34
- method = "first"
35
 
36
  [components.feature_aggregator_component.config.COLOR]
37
  method = "join"
 
31
  [components.feature_aggregator_component.config]
32
 
33
  [components.feature_aggregator_component.config.AUDIO_FEATURE]
34
+ method = "expand"
35
 
36
  [components.feature_aggregator_component.config.COLOR]
37
  method = "join"
count_extraction_component.py CHANGED
@@ -7,6 +7,7 @@ import re
7
  # https://spacy.io/usage/processing-pipelines#custom-components
8
  @Language.factory("count_extraction_component")
9
  class CountExtractorComponent(object):
 
10
  def __init__(self, nlp, name, label="CONNECTION"):
11
  self.label = label
12
  self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
@@ -16,8 +17,6 @@ class CountExtractorComponent(object):
16
  # set extensions to tokens, spans and docs
17
  Span.set_extension("count", default=None, force=True)
18
  Span.set_extension("text", default=None, force=True)
19
-
20
- Doc.set_extension("connections", getter=self.connections, force=True)
21
 
22
  def __call__(self, doc):
23
  for ent in doc.ents:
@@ -50,15 +49,3 @@ class CountExtractorComponent(object):
50
  ent._.text = text
51
  ent._.count = 1
52
  return doc
53
-
54
- def connections(self, doc):
55
- connections = {}
56
- for ent in doc.ents:
57
- if ent._.count is None:
58
- continue
59
-
60
- if ent._.text not in connections:
61
- connections[ent._.text] = ent._.count
62
- continue
63
- connections[ent._.text] += ent._.count
64
- return connections
 
7
  # https://spacy.io/usage/processing-pipelines#custom-components
8
  @Language.factory("count_extraction_component")
9
  class CountExtractorComponent(object):
10
+ # By default it only extracts count from CONNECTION type but this can be changed.
11
  def __init__(self, nlp, name, label="CONNECTION"):
12
  self.label = label
13
  self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
 
17
  # set extensions to tokens, spans and docs
18
  Span.set_extension("count", default=None, force=True)
19
  Span.set_extension("text", default=None, force=True)
 
 
20
 
21
  def __call__(self, doc):
22
  for ent in doc.ents:
 
49
  ent._.text = text
50
  ent._.count = 1
51
  return doc
 
 
 
 
 
 
 
 
 
 
 
 
en_setec_mk_tv-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9dab2aad0666f0c1ff0c0fe028dd6821004e75a6ad991cafe84241a639bcf4eb
3
- size 5709065
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ed6b65aefd826e2c8c35c2e60bf17d929e5eaf7f450d965b260c1e1a5e1ea7f
3
+ size 5709306
feature_aggregator_component.py CHANGED
@@ -2,13 +2,15 @@ from spacy.tokens import Doc, Span, Token
2
  from spacy.language import Language
3
  import pandas as pd
4
 
 
 
 
 
 
 
 
 
5
  default_feature_aggregation_config = {
6
- 'AUDIO_FEATURE': {
7
- 'method': 'first',
8
- },
9
- 'COLOR': {
10
- 'method': 'join',
11
- },
12
  'INCH': {
13
  'method': 'first',
14
  },
@@ -30,6 +32,12 @@ default_feature_aggregation_config = {
30
  'VIDEO_FEATURE': {
31
  'method': 'expand',
32
  },
 
 
 
 
 
 
33
  'WIRELESS_FEATURE': {
34
  'method': 'expand',
35
  },
 
2
  from spacy.language import Language
3
  import pandas as pd
4
 
5
+ # Default modes for feature extraction for the labels
6
+ #
7
+ # There are three methods:
8
+ # - `first`: Takes the first occurrence and stops; this is nice for features like resolution,
9
+ # if the resolution is repeated we just want the first occurrence.
10
+ # - `expand`: This effectively does OneHot encoding where the feature value names
11
+ # become columns and 1 is put if the feature is there.
12
+ # - `join`: This concatenates the feature values under the feature label.
13
  default_feature_aggregation_config = {
 
 
 
 
 
 
14
  'INCH': {
15
  'method': 'first',
16
  },
 
32
  'VIDEO_FEATURE': {
33
  'method': 'expand',
34
  },
35
+ 'AUDIO_FEATURE': {
36
+ 'method': 'expand',
37
+ },
38
+ 'COLOR': {
39
+ 'method': 'join',
40
+ },
41
  'WIRELESS_FEATURE': {
42
  'method': 'expand',
43
  },
meta.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "lang":"en",
3
  "name":"setec_mk_tv",
4
- "version":"0.0.1",
5
  "description":"",
6
  "author":"",
7
  "email":"",
 
1
  {
2
  "lang":"en",
3
  "name":"setec_mk_tv",
4
+ "version":"0.0.2",
5
  "description":"",
6
  "author":"",
7
  "email":"",
normalizer_component.py CHANGED
@@ -3,6 +3,7 @@ import json
3
  from spacy.language import Language
4
  from spacy.matcher import PhraseMatcher
5
 
 
6
  default_normalization_table = {
7
  "Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
8
  "Ethernet": [
 
3
  from spacy.language import Language
4
  from spacy.matcher import PhraseMatcher
5
 
6
+ # Default normalization table; it can be customized by passing a different table to the component as a parameter.
7
  default_normalization_table = {
8
  "Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
9
  "Ethernet": [