mattupson committed on
Commit
65e9efa
1 Parent(s): 339c1c6

new: First version

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__/
2
+ images/
Makefile ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################
2
+ # GLOBALS #
3
+ #################################################################################
4
+
5
+ PYTHON_VERSION = python3.8
6
+ VIRTUALENV := .venv
7
+
8
+ #################################################################################
9
+ # COMMANDS #
10
+ #################################################################################
11
+
12
+ # Set the default location for the virtualenv to be stored
13
+ # Create the virtualenv by installing the requirements and test requirements
14
+
15
+ $(VIRTUALENV)/.installed: requirements.txt
16
+ @if [ -d $(VIRTUALENV) ]; then rm -rf $(VIRTUALENV); fi
17
+ @mkdir -p $(VIRTUALENV)
18
+ virtualenv --python $(PYTHON_VERSION) $(VIRTUALENV)
19
+ $(VIRTUALENV)/bin/pip3 install -r requirements.txt
20
+ $(VIRTUALENV)/bin/pip3 install -r requirements_dev.txt
21
+ #${VIRTUALENV}/bin/pre-commit install --hook-type pre-push --hook-type post-checkout --hook-type pre-commit
22
+ touch $@
23
+
24
+ # Update the requirements to latest. This is required because typically we won't
25
+ # want to include test requirements in the requirements of the application, and
26
+ # because it makes life much easier when we want to keep our dependencies up to
27
+ # date.
28
+
29
+ .PHONY: update-requirements-txt
30
+ update-requirements-txt: unpinned_requirements.txt
31
+ update-requirements-txt: VIRTUALENV := /tmp/update-requirements-virtualenv
32
+ update-requirements-txt:
33
+ @if [ -d $(VIRTUALENV) ]; then rm -rf $(VIRTUALENV); fi
34
+ @mkdir -p $(VIRTUALENV)
35
+ virtualenv --python $(PYTHON_VERSION) $(VIRTUALENV)
36
+ $(VIRTUALENV)/bin/pip3 install --upgrade -r unpinned_requirements.txt
37
+ echo "# Created by 'make update-requirements-txt'. DO NOT EDIT!" > requirements.txt
38
+ $(VIRTUALENV)/bin/pip freeze | grep -v pkg_resources==0.0.0 >> requirements.txt
39
+
40
+ .PHONY: virtualenv
41
+ virtualenv: $(VIRTUALENV)/.installed
42
+
app.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import streamlit as st
3
+
4
+ # from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
5
+
6
+
7
+ def render_entities(entities):
8
+ colors = {"LOCATION": "#5cff84"}
9
+ options = {"ents": ["LOCATION"], "colors": colors}
10
+ html = spacy.displacy.render(entities, style="ent", options=options, manual=True)
11
+ html = html.replace("\n", " ")
12
+
13
+ return html
14
+
15
+
16
+ HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
17
+
18
+ st.header("Location Entity Recognition Demo 🔎🌆🌍")
19
+ threshold = st.sidebar.slider("Threshold", value=0.5, min_value=0.0, max_value=1.0)
20
+ display_probabilities = st.sidebar.checkbox("Display probabilities")
21
+
22
+
23
+ text = st.text_area("Text input", value="This text is about Malaria", height=400)
24
+
25
+ nlp = spacy.load("en_core_web_trf")
26
+
27
+ doc = nlp(text)
28
+
29
+ ents = [
30
+ {"start": ent.start_char, "end": ent.end_char, "label": "LOCATION"}
31
+ for ent in doc.ents
32
+ ]
33
+ foo = {"text": text, "ents": ents}
34
+
35
+
36
+ print(ents)
37
+ print(doc.ents)
38
+
39
+ html = render_entities(foo)
40
+ st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
data/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ /raw
data/processed/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ /wellcome_grant_descriptions.csv
data/raw.dvc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ outs:
2
+ - md5: b50ad44187720e74dddd6ec443649e27.dir
3
+ size: 42921143
4
+ nfiles: 2
5
+ path: raw
dvc.lock ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ schema: '2.0'
2
+ stages:
3
+ subset:
4
+ cmd: python src/subset_data.py
5
+ deps:
6
+ - path: data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv
7
+ md5: 5c0d0e532709648b61625e7e130dfaa4
8
+ size: 31028261
9
+ - path: src/subset_data.py
10
+ md5: 3b6059867baea4de020776bcfdc9c2a4
11
+ size: 604
12
+ outs:
13
+ - path: data/processed/wellcome_grant_descriptions.csv
14
+ md5: bb28282adc17ccd209ed370bc4557e40
15
+ size: 1307583
dvc.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ stages:
2
+ subset:
3
+ cmd: python src/subset_data.py
4
+ deps:
5
+ - src/subset_data.py
6
+ - data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv
7
+ outs:
8
+ - data/processed/wellcome_grant_descriptions.csv
requirements.txt ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by 'make update-requirements-txt'. DO NOT EDIT!
2
+ altair==4.2.0
3
+ attrs==22.1.0
4
+ backports.zoneinfo==0.2.1
5
+ blinker==1.5
6
+ blis==0.7.8
7
+ cachetools==5.2.0
8
+ catalogue==2.0.8
9
+ certifi==2022.9.24
10
+ charset-normalizer==2.1.1
11
+ click==8.1.3
12
+ commonmark==0.9.1
13
+ confection==0.0.3
14
+ cymem==2.0.6
15
+ decorator==5.1.1
16
+ en-core-web-trf==3.4.0
17
+ entrypoints==0.4
18
+ filelock==3.8.0
19
+ gitdb==4.0.9
20
+ GitPython==3.1.29
21
+ huggingface-hub==0.10.1
22
+ idna==3.4
23
+ importlib-metadata==5.0.0
24
+ importlib-resources==5.10.0
25
+ Jinja2==3.1.2
26
+ jsonschema==4.16.0
27
+ langcodes==3.3.0
28
+ MarkupSafe==2.1.1
29
+ murmurhash==1.0.8
30
+ numpy==1.23.3
31
+ packaging==21.3
32
+ pandas==1.5.0
33
+ pathy==0.6.2
34
+ Pillow==9.2.0
35
+ pkgutil-resolve-name==1.3.10
36
+ preshed==3.0.7
37
+ protobuf==3.20.3
38
+ pyarrow==9.0.0
39
+ pydantic==1.9.2
40
+ pydeck==0.8.0b3
41
+ Pygments==2.13.0
42
+ Pympler==1.0.1
43
+ pyparsing==3.0.9
44
+ pyrsistent==0.18.1
45
+ python-dateutil==2.8.2
46
+ pytz==2022.4
47
+ pytz-deprecation-shim==0.1.0.post0
48
+ PyYAML==6.0
49
+ regex==2022.9.13
50
+ requests==2.28.1
51
+ rich==12.6.0
52
+ semver==2.13.0
53
+ six==1.16.0
54
+ smart-open==5.2.1
55
+ smmap==5.0.0
56
+ spacy==3.4.1
57
+ spacy-alignments==0.8.5
58
+ spacy-legacy==3.0.10
59
+ spacy-loggers==1.0.3
60
+ spacy-transformers==1.1.8
61
+ srsly==2.4.4
62
+ streamlit==1.13.0
63
+ thinc==8.1.3
64
+ tokenizers==0.13.1
65
+ toml==0.10.2
66
+ toolz==0.12.0
67
+ torch==1.12.1
68
+ tornado==6.2
69
+ tqdm==4.64.1
70
+ transformers==4.23.1
71
+ typer==0.4.2
72
+ typing-extensions==4.4.0
73
+ tzdata==2022.4
74
+ tzlocal==4.2
75
+ urllib3==1.26.12
76
+ validators==0.20.0
77
+ wasabi==0.10.1
78
+ watchdog==2.1.9
79
+ zipp==3.9.0
requirements_dev.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ dvc[s3]
2
+ pre-commit
src/subset_data.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
5
+ OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
6
+
7
+ print(f"Reading data from {INPUT_FILE}")
8
+
9
+
10
+ data = pd.read_csv(INPUT_FILE)
11
+
12
+ data = (
13
+ data[["Description"]]
14
+ .replace("Not available", np.nan)
15
+ .dropna()
16
+ .drop_duplicates()
17
+ .reset_index(drop=True)
18
+ .sample(1000)
19
+ )
20
+
21
+ print(f"Number of rows: {data.shape[0]}")
22
+ print(f"Number of unique rows: {data['Description'].nunique()}")
23
+
24
+ print(f"Saving file to {OUTPUT_FILE}")
25
+ data.to_csv(OUTPUT_FILE, index=False)
unpinned_requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pandas
2
+ streamlit
3
+ transformers
4
+ torch
5
+ spacy
6
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.0/en_core_web_trf-3.4.0-py3-none-any.whl