Spaces:
Runtime error
Runtime error
new: First version
Browse files- .gitignore +2 -0
- Makefile +42 -0
- app.py +40 -0
- data/.gitignore +1 -0
- data/processed/.gitignore +1 -0
- data/raw.dvc +5 -0
- dvc.lock +15 -0
- dvc.yaml +8 -0
- requirements.txt +79 -0
- requirements_dev.txt +2 -0
- src/subset_data.py +25 -0
- unpinned_requirements.txt +6 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
images/
|
Makefile
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#################################################################################
# GLOBALS                                                                       #
#################################################################################

# Interpreter handed to `virtualenv --python`.
PYTHON_VERSION = python3.8
# Default location for the virtualenv to be stored.
VIRTUALENV := .venv

#################################################################################
# COMMANDS                                                                      #
#################################################################################

# Create the virtualenv by installing the requirements and the development
# requirements. `.installed` is a stamp file: it is rebuilt (and the virtualenv
# recreated from scratch) whenever either requirements file changes.

$(VIRTUALENV)/.installed: requirements.txt requirements_dev.txt
	@if [ -d $(VIRTUALENV) ]; then rm -rf $(VIRTUALENV); fi
	@mkdir -p $(VIRTUALENV)
	virtualenv --python $(PYTHON_VERSION) $(VIRTUALENV)
	$(VIRTUALENV)/bin/pip3 install -r requirements.txt
	$(VIRTUALENV)/bin/pip3 install -r requirements_dev.txt
	#${VIRTUALENV}/bin/pre-commit install --hook-type pre-push --hook-type post-checkout --hook-type pre-commit
	touch $@

# Update the requirements to latest. This is required because typically we won't
# want to include test requirements in the requirements of the application, and
# because it makes life much easier when we want to keep our dependencies up to
# date.
#
# The target-specific VIRTUALENV below points the recipe at a throwaway
# virtualenv, so the project's own virtualenv is left untouched.

.PHONY: update-requirements-txt
update-requirements-txt: unpinned_requirements.txt
update-requirements-txt: VIRTUALENV := /tmp/update-requirements-virtualenv
update-requirements-txt:
	@if [ -d $(VIRTUALENV) ]; then rm -rf $(VIRTUALENV); fi
	@mkdir -p $(VIRTUALENV)
	virtualenv --python $(PYTHON_VERSION) $(VIRTUALENV)
	$(VIRTUALENV)/bin/pip3 install --upgrade -r unpinned_requirements.txt
	echo "# Created by 'make update-requirements-txt'. DO NOT EDIT!" > requirements.txt
	$(VIRTUALENV)/bin/pip3 freeze | grep -v pkg_resources==0.0.0 >> requirements.txt

# Convenience alias: `make virtualenv` builds (or refreshes) the virtualenv.
.PHONY: virtualenv
virtualenv: $(VIRTUALENV)/.installed
|
42 |
+
|
app.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spacy
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
# from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
|
5 |
+
|
6 |
+
|
7 |
+
def render_entities(entities):
    """Render manually specified entity spans as one line of displaCy HTML.

    ``entities`` is handed unchanged to ``spacy.displacy.render`` with
    ``manual=True`` and the ``"ent"`` style; only the ``"LOCATION"`` label
    is enabled and given a highlight colour. Newlines in the generated
    markup are replaced by spaces before it is returned.
    """
    location_colour = "#5cff84"
    markup = spacy.displacy.render(
        entities,
        style="ent",
        manual=True,
        options={"ents": ["LOCATION"], "colors": {"LOCATION": location_colour}},
    )
    # Collapse the markup onto a single line.
    return " ".join(markup.split("\n"))
|
14 |
+
|
15 |
+
|
16 |
+
# Container markup the rendered entity HTML is dropped into.
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""

# spaCy entity labels treated as locations (geo-political entities, other
# locations, facilities). Every other entity type is ignored.
LOCATION_LABELS = frozenset({"GPE", "LOC", "FAC"})

# Streamlit re-executes this script on every widget interaction, so the
# pipeline is cached to avoid reloading the transformer model each time.
# Newer Streamlit releases expose ``cache_resource``; older ones (such as the
# pinned 1.13) expose ``experimental_singleton`` for the same purpose.
_cache_model = getattr(st, "cache_resource", None) or st.experimental_singleton


@_cache_model
def load_model(name="en_core_web_trf"):
    """Load the spaCy pipeline called ``name`` and return it."""
    return spacy.load(name)


st.header("Location Entity Recognition Demo 🔎🌆🌍")
# NOTE(review): these two sidebar controls are rendered but their values are
# not used by anything below yet.
threshold = st.sidebar.slider("Threshold", value=0.5, min_value=0.0, max_value=1.0)
display_probabilities = st.sidebar.checkbox("Display probabilities")

text = st.text_area("Text input", value="This text is about Malaria", height=400)

nlp = load_model()
doc = nlp(text)

# Keep only location-like entities; previously every entity (people, dates,
# organisations, ...) was highlighted as a LOCATION.
ents = [
    {"start": ent.start_char, "end": ent.end_char, "label": "LOCATION"}
    for ent in doc.ents
    if ent.label_ in LOCATION_LABELS
]

html = render_entities({"text": text, "ents": ents})
st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/raw
|
data/processed/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/wellcome_grant_descriptions.csv
|
data/raw.dvc
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
outs:
|
2 |
+
- md5: b50ad44187720e74dddd6ec443649e27.dir
|
3 |
+
size: 42921143
|
4 |
+
nfiles: 2
|
5 |
+
path: raw
|
dvc.lock
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
schema: '2.0'
|
2 |
+
stages:
|
3 |
+
subset:
|
4 |
+
cmd: python src/subset_data.py
|
5 |
+
deps:
|
6 |
+
- path: data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv
|
7 |
+
md5: 5c0d0e532709648b61625e7e130dfaa4
|
8 |
+
size: 31028261
|
9 |
+
- path: src/subset_data.py
|
10 |
+
md5: 3b6059867baea4de020776bcfdc9c2a4
|
11 |
+
size: 604
|
12 |
+
outs:
|
13 |
+
- path: data/processed/wellcome_grant_descriptions.csv
|
14 |
+
md5: bb28282adc17ccd209ed370bc4557e40
|
15 |
+
size: 1307583
|
dvc.yaml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
stages:
|
2 |
+
subset:
|
3 |
+
cmd: python src/subset_data.py
|
4 |
+
deps:
|
5 |
+
- src/subset_data.py
|
6 |
+
- data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv
|
7 |
+
outs:
|
8 |
+
- data/processed/wellcome_grant_descriptions.csv
|
requirements.txt
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Created by 'make update-requirements-txt'. DO NOT EDIT!
|
2 |
+
altair==4.2.0
|
3 |
+
attrs==22.1.0
|
4 |
+
backports.zoneinfo==0.2.1
|
5 |
+
blinker==1.5
|
6 |
+
blis==0.7.8
|
7 |
+
cachetools==5.2.0
|
8 |
+
catalogue==2.0.8
|
9 |
+
certifi==2022.9.24
|
10 |
+
charset-normalizer==2.1.1
|
11 |
+
click==8.1.3
|
12 |
+
commonmark==0.9.1
|
13 |
+
confection==0.0.3
|
14 |
+
cymem==2.0.6
|
15 |
+
decorator==5.1.1
|
16 |
+
en-core-web-trf==3.4.0
|
17 |
+
entrypoints==0.4
|
18 |
+
filelock==3.8.0
|
19 |
+
gitdb==4.0.9
|
20 |
+
GitPython==3.1.29
|
21 |
+
huggingface-hub==0.10.1
|
22 |
+
idna==3.4
|
23 |
+
importlib-metadata==5.0.0
|
24 |
+
importlib-resources==5.10.0
|
25 |
+
Jinja2==3.1.2
|
26 |
+
jsonschema==4.16.0
|
27 |
+
langcodes==3.3.0
|
28 |
+
MarkupSafe==2.1.1
|
29 |
+
murmurhash==1.0.8
|
30 |
+
numpy==1.23.3
|
31 |
+
packaging==21.3
|
32 |
+
pandas==1.5.0
|
33 |
+
pathy==0.6.2
|
34 |
+
Pillow==9.2.0
|
35 |
+
pkgutil-resolve-name==1.3.10
|
36 |
+
preshed==3.0.7
|
37 |
+
protobuf==3.20.3
|
38 |
+
pyarrow==9.0.0
|
39 |
+
pydantic==1.9.2
|
40 |
+
pydeck==0.8.0b3
|
41 |
+
Pygments==2.13.0
|
42 |
+
Pympler==1.0.1
|
43 |
+
pyparsing==3.0.9
|
44 |
+
pyrsistent==0.18.1
|
45 |
+
python-dateutil==2.8.2
|
46 |
+
pytz==2022.4
|
47 |
+
pytz-deprecation-shim==0.1.0.post0
|
48 |
+
PyYAML==6.0
|
49 |
+
regex==2022.9.13
|
50 |
+
requests==2.28.1
|
51 |
+
rich==12.6.0
|
52 |
+
semver==2.13.0
|
53 |
+
six==1.16.0
|
54 |
+
smart-open==5.2.1
|
55 |
+
smmap==5.0.0
|
56 |
+
spacy==3.4.1
|
57 |
+
spacy-alignments==0.8.5
|
58 |
+
spacy-legacy==3.0.10
|
59 |
+
spacy-loggers==1.0.3
|
60 |
+
spacy-transformers==1.1.8
|
61 |
+
srsly==2.4.4
|
62 |
+
streamlit==1.13.0
|
63 |
+
thinc==8.1.3
|
64 |
+
tokenizers==0.13.1
|
65 |
+
toml==0.10.2
|
66 |
+
toolz==0.12.0
|
67 |
+
torch==1.12.1
|
68 |
+
tornado==6.2
|
69 |
+
tqdm==4.64.1
|
70 |
+
transformers==4.23.1
|
71 |
+
typer==0.4.2
|
72 |
+
typing-extensions==4.4.0
|
73 |
+
tzdata==2022.4
|
74 |
+
tzlocal==4.2
|
75 |
+
urllib3==1.26.12
|
76 |
+
validators==0.20.0
|
77 |
+
wasabi==0.10.1
|
78 |
+
watchdog==2.1.9
|
79 |
+
zipp==3.9.0
|
requirements_dev.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
dvc[s3]
|
2 |
+
pre-commit
|
src/subset_data.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"

# Upper bound on the number of descriptions kept in the subset.
SAMPLE_SIZE = 1000
# Fixed seed so that re-running this script on the same input produces the
# same subset.
RANDOM_SEED = 42

print(f"Reading data from {INPUT_FILE}")

data = pd.read_csv(INPUT_FILE)

# Keep the description column only, treat the "Not available" placeholder as
# missing, then drop missing and duplicated descriptions.
descriptions = (
    data[["Description"]]
    .replace("Not available", np.nan)
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Cap the sample size at the number of available rows: DataFrame.sample
# raises when asked for more rows than exist (without replacement).
data = descriptions.sample(
    n=min(SAMPLE_SIZE, len(descriptions)),
    random_state=RANDOM_SEED,
)

print(f"Number of rows: {data.shape[0]}")
print(f"Number of unique rows: {data['Description'].nunique()}")

print(f"Saving file to {OUTPUT_FILE}")
data.to_csv(OUTPUT_FILE, index=False)
|
unpinned_requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
streamlit
|
3 |
+
transformers
|
4 |
+
torch
|
5 |
+
spacy
|
6 |
+
https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.0/en_core_web_trf-3.4.0-py3-none-any.whl
|