lterriel committed on
Commit
56f7cac
·
1 Parent(s): 4291284

update batch size + endpoint EF

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +24 -17
  3. config.json +3 -0
  4. requirements.txt +5 -116
.gitignore CHANGED
@@ -2,3 +2,4 @@ Legacy.py
2
  .idea
3
  standoffconverter
4
  venv/
 
 
2
  .idea
3
  standoffconverter
4
  venv/
5
+ test.py
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import re
 
2
 
3
  import streamlit
4
  import spacy_streamlit
@@ -10,6 +11,11 @@ streamlit.set_page_config(layout="wide")
10
 
11
  samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}
12
 
 
 
 
 
 
13
  # TITLE APP
14
  streamlit.title("NER4Archives visualizer")
15
  streamlit.sidebar.title("NER4Archives visualizer")
@@ -119,13 +125,12 @@ if flag_model:
119
  nlp = spacy.load(model)
120
  nlp.max_length = 5000000
121
  if linking:
122
- nlp.add_pipe('entityfishing', config={"language": "fr"})
123
 
124
  with streamlit.spinner('NER processing...'):
125
  if linking:
126
  start_sentence = 0
127
- docs = nlp.pipe(sentences, batch_size=150)
128
- for doc in docs:
129
  end_sentence = start_sentence + len(doc.text) + 1
130
  for ent in doc.ents:
131
  start_tok = start_sentence + ent.start_char
@@ -141,16 +146,21 @@ if flag_model:
141
  ))
142
  start_sentence = end_sentence
143
  else:
144
- entities = [
145
- (ent.start_char,
146
- ent.end_char,
147
- ent.text,
148
- ent.label_,
149
- "",
150
- "",
151
- ""
152
- ) for ent in nlp(plain).ents
153
- ]
 
 
 
 
 
154
 
155
 
156
  streamlit.success('😃 NER applied with success!')
@@ -194,7 +204,4 @@ if flag_model:
194
  "ORG": "#f39c12",
195
  "PER": "#3498db"
196
  }
197
- })
198
-
199
-
200
-
 
1
  import re
2
+ import json
3
 
4
  import streamlit
5
  import spacy_streamlit
 
11
 
12
  samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}
13
 
14
+ with open('config.json', mode="r") as json_file:
15
+ CONFIGURATION = json.loads(json_file.read())
16
+
17
+
18
+
19
  # TITLE APP
20
  streamlit.title("NER4Archives visualizer")
21
  streamlit.sidebar.title("NER4Archives visualizer")
 
125
  nlp = spacy.load(model)
126
  nlp.max_length = 5000000
127
  if linking:
128
+ nlp.add_pipe('entityfishing', config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})
129
 
130
  with streamlit.spinner('NER processing...'):
131
  if linking:
132
  start_sentence = 0
133
+ for doc in nlp.pipe(sentences, batch_size=250):
 
134
  end_sentence = start_sentence + len(doc.text) + 1
135
  for ent in doc.ents:
136
  start_tok = start_sentence + ent.start_char
 
146
  ))
147
  start_sentence = end_sentence
148
  else:
149
+ start_sentence = 0
150
+ for doc in nlp.pipe(sentences):
151
+ end_sentence = start_sentence + len(doc.text) + 1
152
+ for ent in doc.ents:
153
+ start_tok = start_sentence + ent.start_char
154
+ end_tok = start_tok + len(ent.text)
155
+ entities.append((start_tok,
156
+ end_tok,
157
+ ent.text,
158
+ ent.label_,
159
+ "",
160
+ "",
161
+ ""
162
+ ))
163
+ start_sentence = end_sentence
164
 
165
 
166
  streamlit.success('😃 NER applied with success!')
 
204
  "ORG": "#f39c12",
205
  "PER": "#3498db"
206
  }
207
+ })
 
 
 
config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "ef_endpoint": "http://nerd.huma-num.fr/nerd/service"
3
+ }
requirements.txt CHANGED
@@ -1,119 +1,8 @@
1
- altair==4.2.0
2
- argon2-cffi==21.3.0
3
- argon2-cffi-bindings==21.2.0
4
- asttokens==2.0.5
5
- attrs==22.1.0
6
- backcall==0.2.0
7
- backports.zoneinfo==0.2.1
8
- beautifulsoup4==4.11.1
9
- bleach==5.0.1
10
- blinker==1.5
11
- blis==0.7.8
12
- cachetools==5.2.0
13
- catalogue==2.0.8
14
- certifi==2022.6.15
15
- cffi==1.15.1
16
- charset-normalizer==2.1.0
17
- click==8.1.3
18
- commonmark==0.9.1
19
- cymem==2.0.6
20
- debugpy==1.6.2
21
- decorator==5.1.1
22
- defusedxml==0.7.1
23
- entrypoints==0.4
24
- executing==0.9.1
25
- fastjsonschema==2.16.1
26
- gitdb==4.0.9
27
- GitPython==3.1.27
28
- idna==3.3
29
- importlib-metadata==4.12.0
30
- importlib-resources==5.9.0
31
- ipykernel==6.15.1
32
- ipython==8.4.0
33
- ipython-genutils==0.2.0
34
- ipywidgets==7.7.1
35
- jedi==0.18.1
36
- Jinja2==3.1.2
37
- jsonschema==4.8.0
38
- jupyter-client==7.3.4
39
- jupyter-core==4.11.1
40
- jupyterlab-pygments==0.2.2
41
- jupyterlab-widgets==1.1.1
42
- langcodes==3.3.0
43
- lxml==4.9.1
44
- MarkupSafe==2.1.1
45
- matplotlib-inline==0.1.3
46
- mistune==0.8.4
47
- murmurhash==1.0.7
48
- nbclient==0.6.6
49
- nbconvert==6.5.0
50
- nbformat==5.4.0
51
- nest-asyncio==1.5.5
52
- notebook==6.4.12
53
- numpy==1.23.1
54
- packaging==21.3
55
- pandas==1.4.3
56
- pandocfilters==1.5.0
57
- parso==0.8.3
58
- pathy==0.6.2
59
- pexpect==4.8.0
60
- pickleshare==0.7.5
61
- Pillow==9.2.0
62
- preshed==3.0.6
63
- prometheus-client==0.14.1
64
- prompt-toolkit==3.0.30
65
- protobuf==3.20.1
66
- psutil==5.9.1
67
- ptyprocess==0.7.0
68
- pure-eval==0.2.2
69
- pyarrow==8.0.0
70
- pycparser==2.21
71
- pydantic==1.8.2
72
- pydeck==0.7.1
73
- Pygments==2.12.0
74
- Pympler==1.0.1
75
- pyparsing==3.0.9
76
- pyrsistent==0.18.1
77
- python-dateutil==2.8.2
78
- pytz==2022.1
79
- pytz-deprecation-shim==0.1.0.post0
80
- pyzmq==23.2.0
81
- requests==2.28.1
82
- rich==12.5.1
83
- semver==2.13.0
84
- Send2Trash==1.8.0
85
- six==1.16.0
86
- smart-open==5.2.1
87
- smmap==5.0.0
88
- soupsieve==2.3.2.post1
89
- spacy==3.4.0
90
- spacy-legacy==3.0.9
91
- spacy-loggers==1.0.3
92
  spacy-streamlit==1.0.4
93
  spacyfishing==0.1.8
94
- srsly==2.4.4
95
- stack-data==0.3.0
96
  streamlit==1.11.1
97
- terminado==0.15.0
98
- thinc==8.1.2
99
- tinycss2==1.1.1
100
- toml==0.10.2
101
- toolz==0.12.0
102
- tornado==6.2
103
- tqdm==4.64.0
104
- traitlets==5.3.0
105
- typer==0.4.2
106
- typing_extensions==4.3.0
107
- tzdata==2022.1
108
- tzlocal==4.2
109
- urllib3==1.26.11
110
- validators==0.20.0
111
- wasabi==0.10.1
112
- watchdog==2.1.9
113
- wcwidth==0.2.5
114
- webencodings==0.5.1
115
- widgetsnbextension==3.6.1
116
- zipp==3.8.1
117
- fr_core_ner4archives_v3_default @ https://huggingface.co/ner4archives/fr_core_ner4archives_v3_default/resolve/main/fr_core_ner4archives_v3_default-any-py3-none-any.whl
118
- fr_core_ner4archives_v3_with_vectors @ https://huggingface.co/ner4archives/fr_core_ner4archives_v3_with_vectors/resolve/main/fr_core_ner4archives_v3_with_vectors-any-py3-none-any.whl
119
- fr_core_ner4archives_V3_camembert_base @ https://huggingface.co/ner4archives/fr_core_ner4archives_V3_camembert_base/resolve/main/fr_core_ner4archives_V3_camembert_base-any-py3-none-any.whl
 
1
+ spacy==3.4.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  spacy-streamlit==1.0.4
3
  spacyfishing==0.1.8
 
 
4
  streamlit==1.11.1
5
+ lxml==4.9.1
6
+ spacy-transformers==1.1.8
7
+ fr_ner4archives_v3_default @ https://huggingface.co/ner4archives/fr_ner4archives_v3_default/resolve/main/fr_ner4archives_v3_default-any-py3-none-any.whl
8
+ fr_ner4archives_v3_with_vectors @ https://huggingface.co/ner4archives/fr_ner4archives_v3_with_vectors/resolve/main/fr_ner4archives_v3_with_vectors-any-py3-none-any.whl