Upload 35 files
Browse files- app/app.py +116 -0
- docs/Makefile +153 -0
- docs/commands.rst +10 -0
- docs/conf.py +244 -0
- docs/getting-started.rst +6 -0
- docs/index.rst +24 -0
- docs/make.bat +190 -0
- models/.gitkeep +0 -0
- notebooks/.gitkeep +0 -0
- references/.gitkeep +0 -0
- reports/.gitkeep +0 -0
- reports/figures/.gitkeep +0 -0
- src/__init__.py +0 -0
- src/data/.gitkeep +0 -0
- src/data/__init__.py +0 -0
- src/data/__pycache__/wiki_scrape.cpython-311.pyc +0 -0
- src/data/entities.txt +15 -0
- src/data/make_dataset.py +30 -0
- src/data/wiki_scrape.py +56 -0
- src/features/.gitkeep +0 -0
- src/features/__init__.py +0 -0
- src/features/build_features.py +0 -0
- src/models/.gitkeep +0 -0
- src/models/__init__.py +0 -0
- src/models/__pycache__/extractive_qa.cpython-311.pyc +0 -0
- src/models/__pycache__/search_engine.cpython-311.pyc +0 -0
- src/models/__pycache__/visual_qa.cpython-311.pyc +0 -0
- src/models/extractive_qa.py +50 -0
- src/models/predict_model.py +0 -0
- src/models/search_engine.py +118 -0
- src/models/train_model.py +0 -0
- src/models/visual_qa.py +14 -0
- src/visualization/.gitkeep +0 -0
- src/visualization/__init__.py +0 -0
- src/visualization/visualize.py +0 -0
app/app.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python3 -m streamlit run app.py
|
2 |
+
|
3 |
+
import streamlit as st
|
4 |
+
from PIL import Image
|
5 |
+
import numpy as np
|
6 |
+
from pathlib import Path
|
7 |
+
import shutil
|
8 |
+
import sys
|
9 |
+
sys.path.insert(1, "src/models")
|
10 |
+
from extractive_qa import QA
|
11 |
+
from visual_qa import VisualQA
|
12 |
+
from search_engine import IR
|
13 |
+
# from src.models.extractive_qa import QA
|
14 |
+
# from src.models.search_engine import IR
|
15 |
+
|
16 |
+
|
17 |
+
@st.cache_resource
def load_visual_qa_module():
    """Build and return the visual question-answering module.

    Decorated with ``st.cache_resource`` so the underlying model is
    instantiated once per server process instead of on every Streamlit
    rerun.
    """
    return VisualQA()
|
24 |
+
|
25 |
+
@st.cache_resource
def load_qa_module():
    """Build and return the extractive question-answering module.

    Cached with ``st.cache_resource`` so the (expensive) model load
    happens only once per server process.
    """
    return QA()
|
32 |
+
|
33 |
+
@st.cache_resource
def load_search_engine():
    """
    Loads the document search engine (IR module).

    Fixed docstring: the previous text said "extractive QA module",
    a copy-paste error — this loads the information-retrieval engine
    used to fetch candidate articles, not the QA model.
    Cached with st.cache_resource so it is built once per process.
    """
    search_engine = IR()
    return search_engine
|
40 |
+
|
41 |
+
def get_metadata_from_question(question):
    """Map a free-text question to the painting metadata field it asks about.

    The match is a case-insensitive substring test, checked in priority
    order: 'artist', then 'style', then 'genre'. (The original version
    was case-sensitive, so e.g. "Who is the Artist?" matched nothing.)

    Parameters
    ----------
    question : str
        The user's question.

    Returns
    -------
    str or None
        One of 'artist', 'style', 'genre', or None (explicitly) when the
        question mentions no known metadata field.
    """
    lowered = question.lower()
    for field in ('artist', 'style', 'genre'):
        if field in lowered:
            return field
    return None
|
48 |
+
|
49 |
+
# Defining session variables.
# extractive_qa: False until the first successful VQA answer, after which
# follow-up questions are routed to the extractive-QA pipeline.
if 'extractive_qa' not in st.session_state:
    st.session_state.extractive_qa = False

# vqa_prediction: the attribute value predicted by VQA, used to scope
# later document retrieval.
if 'vqa_prediction' not in st.session_state:
    st.session_state.vqa_prediction = None

# Remove any stale results directory left over from a previous run.
dirpath = Path.cwd() / 'results'
model_path = Path.cwd() / 'models'
if dirpath.exists() and dirpath.is_dir():
    shutil.rmtree(dirpath)

# Cached singletons — loaded once per server process (see loaders above).
vqa_module = load_visual_qa_module()
qa_module = load_qa_module()
search_engine = load_search_engine()

st.title("VQArt")

st.markdown("""Hello, please take a picture of the painting and ask a question about it. \
I can answer questions about the style, artist and genre of the painting, \
and then questions about these topics. \
""")

# Take a picture
imgbuffer = st.camera_input('')

# Upload a file
uploaded_file = st.file_uploader('Upload a photo of a painting')

# Prompt for a question
question = st.text_input(label="What is your question (e.g. Who's the artist of this painting?)")

if question:
    print(f'Received question: {question}')

    if st.session_state.extractive_qa:
        # Doing Extractive QA: prefix the question with the previously
        # predicted attribute so retrieval is scoped to that entity.
        full_question = f'[{st.session_state.vqa_prediction}] {question}'

        articles, scores = search_engine.retrieve_documents(full_question, 5)
        print(f'Found {len(articles)} search results')

        if len(articles) == 0:
            st.markdown("Sorry, I don't know the answer to that question :(")
        else:
            # Answer from the highest-ranked article only.
            best_result = articles[0]
            answer = qa_module.answer_question(full_question, best_result)
            st.markdown(f'Answer: {answer}')
    else:
        # Doing VQA — needs an image from either input widget.
        img = None
        if imgbuffer:
            # Camera
            img = Image.open(imgbuffer)
        elif uploaded_file:
            # Uploaded file
            img = Image.open(uploaded_file)

        if img is None:
            # BUG FIX: previously `img` was left unbound when no image was
            # supplied, so answering raised NameError. Prompt instead.
            st.markdown("Please take or upload a photo of the painting first.")
        else:
            result = vqa_module.answer_question(question, img)
            meta_data = get_metadata_from_question(question)
            st.markdown(f"Answer: The {meta_data} of this painting is {result}")

            # Switching to extractive QA
            st.session_state.extractive_qa = True

            # Saving the predicted VQA answer
            st.session_state.vqa_prediction = result
|
docs/Makefile
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Makefile for Sphinx documentation
|
2 |
+
#
|
3 |
+
|
4 |
+
# You can set these variables from the command line.
|
5 |
+
SPHINXOPTS =
|
6 |
+
SPHINXBUILD = sphinx-build
|
7 |
+
PAPER =
|
8 |
+
BUILDDIR = _build
|
9 |
+
|
10 |
+
# Internal variables.
|
11 |
+
PAPEROPT_a4 = -D latex_paper_size=a4
|
12 |
+
PAPEROPT_letter = -D latex_paper_size=letter
|
13 |
+
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
14 |
+
# the i18n builder cannot share the environment and doctrees with the others
|
15 |
+
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
16 |
+
|
17 |
+
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
|
18 |
+
|
19 |
+
help:
|
20 |
+
@echo "Please use \`make <target>' where <target> is one of"
|
21 |
+
@echo " html to make standalone HTML files"
|
22 |
+
@echo " dirhtml to make HTML files named index.html in directories"
|
23 |
+
@echo " singlehtml to make a single large HTML file"
|
24 |
+
@echo " pickle to make pickle files"
|
25 |
+
@echo " json to make JSON files"
|
26 |
+
@echo " htmlhelp to make HTML files and a HTML help project"
|
27 |
+
@echo " qthelp to make HTML files and a qthelp project"
|
28 |
+
@echo " devhelp to make HTML files and a Devhelp project"
|
29 |
+
@echo " epub to make an epub"
|
30 |
+
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
|
31 |
+
@echo " latexpdf to make LaTeX files and run them through pdflatex"
|
32 |
+
@echo " text to make text files"
|
33 |
+
@echo " man to make manual pages"
|
34 |
+
@echo " texinfo to make Texinfo files"
|
35 |
+
@echo " info to make Texinfo files and run them through makeinfo"
|
36 |
+
@echo " gettext to make PO message catalogs"
|
37 |
+
@echo " changes to make an overview of all changed/added/deprecated items"
|
38 |
+
@echo " linkcheck to check all external links for integrity"
|
39 |
+
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
|
40 |
+
|
41 |
+
clean:
|
42 |
+
-rm -rf $(BUILDDIR)/*
|
43 |
+
|
44 |
+
html:
|
45 |
+
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
46 |
+
@echo
|
47 |
+
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
|
48 |
+
|
49 |
+
dirhtml:
|
50 |
+
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
|
51 |
+
@echo
|
52 |
+
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
|
53 |
+
|
54 |
+
singlehtml:
|
55 |
+
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
|
56 |
+
@echo
|
57 |
+
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
|
58 |
+
|
59 |
+
pickle:
|
60 |
+
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
|
61 |
+
@echo
|
62 |
+
@echo "Build finished; now you can process the pickle files."
|
63 |
+
|
64 |
+
json:
|
65 |
+
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
|
66 |
+
@echo
|
67 |
+
@echo "Build finished; now you can process the JSON files."
|
68 |
+
|
69 |
+
htmlhelp:
|
70 |
+
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
|
71 |
+
@echo
|
72 |
+
@echo "Build finished; now you can run HTML Help Workshop with the" \
|
73 |
+
".hhp project file in $(BUILDDIR)/htmlhelp."
|
74 |
+
|
75 |
+
qthelp:
|
76 |
+
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
|
77 |
+
@echo
|
78 |
+
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
|
79 |
+
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
|
80 |
+
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/art_chatbot.qhcp"
|
81 |
+
@echo "To view the help file:"
|
82 |
+
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/art_chatbot.qhc"
|
83 |
+
|
84 |
+
devhelp:
|
85 |
+
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
|
86 |
+
@echo
|
87 |
+
@echo "Build finished."
|
88 |
+
@echo "To view the help file:"
|
89 |
+
@echo "# mkdir -p $$HOME/.local/share/devhelp/art_chatbot"
|
90 |
+
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/art_chatbot"
|
91 |
+
@echo "# devhelp"
|
92 |
+
|
93 |
+
epub:
|
94 |
+
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
|
95 |
+
@echo
|
96 |
+
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
|
97 |
+
|
98 |
+
latex:
|
99 |
+
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
100 |
+
@echo
|
101 |
+
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
|
102 |
+
@echo "Run \`make' in that directory to run these through (pdf)latex" \
|
103 |
+
"(use \`make latexpdf' here to do that automatically)."
|
104 |
+
|
105 |
+
latexpdf:
|
106 |
+
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
107 |
+
@echo "Running LaTeX files through pdflatex..."
|
108 |
+
$(MAKE) -C $(BUILDDIR)/latex all-pdf
|
109 |
+
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
110 |
+
|
111 |
+
text:
|
112 |
+
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
|
113 |
+
@echo
|
114 |
+
@echo "Build finished. The text files are in $(BUILDDIR)/text."
|
115 |
+
|
116 |
+
man:
|
117 |
+
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
|
118 |
+
@echo
|
119 |
+
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
|
120 |
+
|
121 |
+
texinfo:
|
122 |
+
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
123 |
+
@echo
|
124 |
+
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
|
125 |
+
@echo "Run \`make' in that directory to run these through makeinfo" \
|
126 |
+
"(use \`make info' here to do that automatically)."
|
127 |
+
|
128 |
+
info:
|
129 |
+
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
130 |
+
@echo "Running Texinfo files through makeinfo..."
|
131 |
+
make -C $(BUILDDIR)/texinfo info
|
132 |
+
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
|
133 |
+
|
134 |
+
gettext:
|
135 |
+
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
|
136 |
+
@echo
|
137 |
+
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
|
138 |
+
|
139 |
+
changes:
|
140 |
+
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
|
141 |
+
@echo
|
142 |
+
@echo "The overview file is in $(BUILDDIR)/changes."
|
143 |
+
|
144 |
+
linkcheck:
|
145 |
+
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
|
146 |
+
@echo
|
147 |
+
@echo "Link check complete; look for any errors in the above output " \
|
148 |
+
"or in $(BUILDDIR)/linkcheck/output.txt."
|
149 |
+
|
150 |
+
doctest:
|
151 |
+
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
|
152 |
+
@echo "Testing of doctests in the sources finished, look at the " \
|
153 |
+
"results in $(BUILDDIR)/doctest/output.txt."
|
docs/commands.rst
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Commands
|
2 |
+
========
|
3 |
+
|
4 |
+
The Makefile contains the central entry points for common tasks related to this project.
|
5 |
+
|
6 |
+
Syncing data to S3
|
7 |
+
^^^^^^^^^^^^^^^^^^
|
8 |
+
|
9 |
+
* `make sync_data_to_s3` will use `aws s3 sync` to recursively sync files in `data/` up to `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/`.
|
10 |
+
* `make sync_data_from_s3` will use `aws s3 sync` to recursively sync files from `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/` to `data/`.
|
docs/conf.py
ADDED
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
#
|
3 |
+
# art_chatbot documentation build configuration file, created by
|
4 |
+
# sphinx-quickstart.
|
5 |
+
#
|
6 |
+
# This file is execfile()d with the current directory set to its containing dir.
|
7 |
+
#
|
8 |
+
# Note that not all possible configuration values are present in this
|
9 |
+
# autogenerated file.
|
10 |
+
#
|
11 |
+
# All configuration values have a default; values that are commented out
|
12 |
+
# serve to show the default.
|
13 |
+
|
14 |
+
import os
|
15 |
+
import sys
|
16 |
+
|
17 |
+
# If extensions (or modules to document with autodoc) are in another directory,
|
18 |
+
# add these directories to sys.path here. If the directory is relative to the
|
19 |
+
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
20 |
+
# sys.path.insert(0, os.path.abspath('.'))
|
21 |
+
|
22 |
+
# -- General configuration -----------------------------------------------------
|
23 |
+
|
24 |
+
# If your documentation needs a minimal Sphinx version, state it here.
|
25 |
+
# needs_sphinx = '1.0'
|
26 |
+
|
27 |
+
# Add any Sphinx extension module names here, as strings. They can be extensions
|
28 |
+
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
29 |
+
extensions = []
|
30 |
+
|
31 |
+
# Add any paths that contain templates here, relative to this directory.
|
32 |
+
templates_path = ['_templates']
|
33 |
+
|
34 |
+
# The suffix of source filenames.
|
35 |
+
source_suffix = '.rst'
|
36 |
+
|
37 |
+
# The encoding of source files.
|
38 |
+
# source_encoding = 'utf-8-sig'
|
39 |
+
|
40 |
+
# The master toctree document.
|
41 |
+
master_doc = 'index'
|
42 |
+
|
43 |
+
# General information about the project.
|
44 |
+
project = u'art_chatbot'
|
45 |
+
|
46 |
+
# The version info for the project you're documenting, acts as replacement for
|
47 |
+
# |version| and |release|, also used in various other places throughout the
|
48 |
+
# built documents.
|
49 |
+
#
|
50 |
+
# The short X.Y version.
|
51 |
+
version = '0.1'
|
52 |
+
# The full version, including alpha/beta/rc tags.
|
53 |
+
release = '0.1'
|
54 |
+
|
55 |
+
# The language for content autogenerated by Sphinx. Refer to documentation
|
56 |
+
# for a list of supported languages.
|
57 |
+
# language = None
|
58 |
+
|
59 |
+
# There are two options for replacing |today|: either, you set today to some
|
60 |
+
# non-false value, then it is used:
|
61 |
+
# today = ''
|
62 |
+
# Else, today_fmt is used as the format for a strftime call.
|
63 |
+
# today_fmt = '%B %d, %Y'
|
64 |
+
|
65 |
+
# List of patterns, relative to source directory, that match files and
|
66 |
+
# directories to ignore when looking for source files.
|
67 |
+
exclude_patterns = ['_build']
|
68 |
+
|
69 |
+
# The reST default role (used for this markup: `text`) to use for all documents.
|
70 |
+
# default_role = None
|
71 |
+
|
72 |
+
# If true, '()' will be appended to :func: etc. cross-reference text.
|
73 |
+
# add_function_parentheses = True
|
74 |
+
|
75 |
+
# If true, the current module name will be prepended to all description
|
76 |
+
# unit titles (such as .. function::).
|
77 |
+
# add_module_names = True
|
78 |
+
|
79 |
+
# If true, sectionauthor and moduleauthor directives will be shown in the
|
80 |
+
# output. They are ignored by default.
|
81 |
+
# show_authors = False
|
82 |
+
|
83 |
+
# The name of the Pygments (syntax highlighting) style to use.
|
84 |
+
pygments_style = 'sphinx'
|
85 |
+
|
86 |
+
# A list of ignored prefixes for module index sorting.
|
87 |
+
# modindex_common_prefix = []
|
88 |
+
|
89 |
+
|
90 |
+
# -- Options for HTML output ---------------------------------------------------
|
91 |
+
|
92 |
+
# The theme to use for HTML and HTML Help pages. See the documentation for
|
93 |
+
# a list of builtin themes.
|
94 |
+
html_theme = 'default'
|
95 |
+
|
96 |
+
# Theme options are theme-specific and customize the look and feel of a theme
|
97 |
+
# further. For a list of options available for each theme, see the
|
98 |
+
# documentation.
|
99 |
+
# html_theme_options = {}
|
100 |
+
|
101 |
+
# Add any paths that contain custom themes here, relative to this directory.
|
102 |
+
# html_theme_path = []
|
103 |
+
|
104 |
+
# The name for this set of Sphinx documents. If None, it defaults to
|
105 |
+
# "<project> v<release> documentation".
|
106 |
+
# html_title = None
|
107 |
+
|
108 |
+
# A shorter title for the navigation bar. Default is the same as html_title.
|
109 |
+
# html_short_title = None
|
110 |
+
|
111 |
+
# The name of an image file (relative to this directory) to place at the top
|
112 |
+
# of the sidebar.
|
113 |
+
# html_logo = None
|
114 |
+
|
115 |
+
# The name of an image file (within the static path) to use as favicon of the
|
116 |
+
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
117 |
+
# pixels large.
|
118 |
+
# html_favicon = None
|
119 |
+
|
120 |
+
# Add any paths that contain custom static files (such as style sheets) here,
|
121 |
+
# relative to this directory. They are copied after the builtin static files,
|
122 |
+
# so a file named "default.css" will overwrite the builtin "default.css".
|
123 |
+
html_static_path = ['_static']
|
124 |
+
|
125 |
+
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
126 |
+
# using the given strftime format.
|
127 |
+
# html_last_updated_fmt = '%b %d, %Y'
|
128 |
+
|
129 |
+
# If true, SmartyPants will be used to convert quotes and dashes to
|
130 |
+
# typographically correct entities.
|
131 |
+
# html_use_smartypants = True
|
132 |
+
|
133 |
+
# Custom sidebar templates, maps document names to template names.
|
134 |
+
# html_sidebars = {}
|
135 |
+
|
136 |
+
# Additional templates that should be rendered to pages, maps page names to
|
137 |
+
# template names.
|
138 |
+
# html_additional_pages = {}
|
139 |
+
|
140 |
+
# If false, no module index is generated.
|
141 |
+
# html_domain_indices = True
|
142 |
+
|
143 |
+
# If false, no index is generated.
|
144 |
+
# html_use_index = True
|
145 |
+
|
146 |
+
# If true, the index is split into individual pages for each letter.
|
147 |
+
# html_split_index = False
|
148 |
+
|
149 |
+
# If true, links to the reST sources are added to the pages.
|
150 |
+
# html_show_sourcelink = True
|
151 |
+
|
152 |
+
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
|
153 |
+
# html_show_sphinx = True
|
154 |
+
|
155 |
+
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
|
156 |
+
# html_show_copyright = True
|
157 |
+
|
158 |
+
# If true, an OpenSearch description file will be output, and all pages will
|
159 |
+
# contain a <link> tag referring to it. The value of this option must be the
|
160 |
+
# base URL from which the finished HTML is served.
|
161 |
+
# html_use_opensearch = ''
|
162 |
+
|
163 |
+
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
164 |
+
# html_file_suffix = None
|
165 |
+
|
166 |
+
# Output file base name for HTML help builder.
|
167 |
+
htmlhelp_basename = 'art_chatbotdoc'
|
168 |
+
|
169 |
+
|
170 |
+
# -- Options for LaTeX output --------------------------------------------------
|
171 |
+
|
172 |
+
latex_elements = {
|
173 |
+
# The paper size ('letterpaper' or 'a4paper').
|
174 |
+
# 'papersize': 'letterpaper',
|
175 |
+
|
176 |
+
# The font size ('10pt', '11pt' or '12pt').
|
177 |
+
# 'pointsize': '10pt',
|
178 |
+
|
179 |
+
# Additional stuff for the LaTeX preamble.
|
180 |
+
# 'preamble': '',
|
181 |
+
}
|
182 |
+
|
183 |
+
# Grouping the document tree into LaTeX files. List of tuples
|
184 |
+
# (source start file, target name, title, author, documentclass [howto/manual]).
|
185 |
+
latex_documents = [
|
186 |
+
('index',
|
187 |
+
'art_chatbot.tex',
|
188 |
+
u'art_chatbot Documentation',
|
189 |
+
u"Your name (or your organization/company/team)", 'manual'),
|
190 |
+
]
|
191 |
+
|
192 |
+
# The name of an image file (relative to this directory) to place at the top of
|
193 |
+
# the title page.
|
194 |
+
# latex_logo = None
|
195 |
+
|
196 |
+
# For "manual" documents, if this is true, then toplevel headings are parts,
|
197 |
+
# not chapters.
|
198 |
+
# latex_use_parts = False
|
199 |
+
|
200 |
+
# If true, show page references after internal links.
|
201 |
+
# latex_show_pagerefs = False
|
202 |
+
|
203 |
+
# If true, show URL addresses after external links.
|
204 |
+
# latex_show_urls = False
|
205 |
+
|
206 |
+
# Documents to append as an appendix to all manuals.
|
207 |
+
# latex_appendices = []
|
208 |
+
|
209 |
+
# If false, no module index is generated.
|
210 |
+
# latex_domain_indices = True
|
211 |
+
|
212 |
+
|
213 |
+
# -- Options for manual page output --------------------------------------------
|
214 |
+
|
215 |
+
# One entry per manual page. List of tuples
|
216 |
+
# (source start file, name, description, authors, manual section).
|
217 |
+
man_pages = [
|
218 |
+
('index', 'art_chatbot', u'art_chatbot Documentation',
|
219 |
+
[u"Your name (or your organization/company/team)"], 1)
|
220 |
+
]
|
221 |
+
|
222 |
+
# If true, show URL addresses after external links.
|
223 |
+
# man_show_urls = False
|
224 |
+
|
225 |
+
|
226 |
+
# -- Options for Texinfo output ------------------------------------------------
|
227 |
+
|
228 |
+
# Grouping the document tree into Texinfo files. List of tuples
|
229 |
+
# (source start file, target name, title, author,
|
230 |
+
# dir menu entry, description, category)
|
231 |
+
texinfo_documents = [
|
232 |
+
('index', 'art_chatbot', u'art_chatbot Documentation',
|
233 |
+
u"Your name (or your organization/company/team)", 'art_chatbot',
|
234 |
+
'A short description of the project.', 'Miscellaneous'),
|
235 |
+
]
|
236 |
+
|
237 |
+
# Documents to append as an appendix to all manuals.
|
238 |
+
# texinfo_appendices = []
|
239 |
+
|
240 |
+
# If false, no module index is generated.
|
241 |
+
# texinfo_domain_indices = True
|
242 |
+
|
243 |
+
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
244 |
+
# texinfo_show_urls = 'footnote'
|
docs/getting-started.rst
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Getting started
|
2 |
+
===============
|
3 |
+
|
4 |
+
This is where you describe how to get set up on a clean install, including the
|
5 |
+
commands necessary to get the raw data (using the `sync_data_from_s3` command,
|
6 |
+
for example), and then how to make the cleaned, final data sets.
|
docs/index.rst
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.. art_chatbot documentation master file, created by
|
2 |
+
sphinx-quickstart.
|
3 |
+
You can adapt this file completely to your liking, but it should at least
|
4 |
+
contain the root `toctree` directive.
|
5 |
+
|
6 |
+
art_chatbot documentation!
|
7 |
+
==============================================
|
8 |
+
|
9 |
+
Contents:
|
10 |
+
|
11 |
+
.. toctree::
|
12 |
+
:maxdepth: 2
|
13 |
+
|
14 |
+
getting-started
|
15 |
+
commands
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
Indices and tables
|
20 |
+
==================
|
21 |
+
|
22 |
+
* :ref:`genindex`
|
23 |
+
* :ref:`modindex`
|
24 |
+
* :ref:`search`
|
docs/make.bat
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@ECHO OFF
|
2 |
+
|
3 |
+
REM Command file for Sphinx documentation
|
4 |
+
|
5 |
+
if "%SPHINXBUILD%" == "" (
|
6 |
+
set SPHINXBUILD=sphinx-build
|
7 |
+
)
|
8 |
+
set BUILDDIR=_build
|
9 |
+
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
|
10 |
+
set I18NSPHINXOPTS=%SPHINXOPTS% .
|
11 |
+
if NOT "%PAPER%" == "" (
|
12 |
+
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
|
13 |
+
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
|
14 |
+
)
|
15 |
+
|
16 |
+
if "%1" == "" goto help
|
17 |
+
|
18 |
+
if "%1" == "help" (
|
19 |
+
:help
|
20 |
+
echo.Please use `make ^<target^>` where ^<target^> is one of
|
21 |
+
echo. html to make standalone HTML files
|
22 |
+
echo. dirhtml to make HTML files named index.html in directories
|
23 |
+
echo. singlehtml to make a single large HTML file
|
24 |
+
echo. pickle to make pickle files
|
25 |
+
echo. json to make JSON files
|
26 |
+
echo. htmlhelp to make HTML files and a HTML help project
|
27 |
+
echo. qthelp to make HTML files and a qthelp project
|
28 |
+
echo. devhelp to make HTML files and a Devhelp project
|
29 |
+
echo. epub to make an epub
|
30 |
+
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
|
31 |
+
echo. text to make text files
|
32 |
+
echo. man to make manual pages
|
33 |
+
echo. texinfo to make Texinfo files
|
34 |
+
echo. gettext to make PO message catalogs
|
35 |
+
echo. changes to make an overview over all changed/added/deprecated items
|
36 |
+
echo. linkcheck to check all external links for integrity
|
37 |
+
echo. doctest to run all doctests embedded in the documentation if enabled
|
38 |
+
goto end
|
39 |
+
)
|
40 |
+
|
41 |
+
if "%1" == "clean" (
|
42 |
+
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
|
43 |
+
del /q /s %BUILDDIR%\*
|
44 |
+
goto end
|
45 |
+
)
|
46 |
+
|
47 |
+
if "%1" == "html" (
|
48 |
+
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
|
49 |
+
if errorlevel 1 exit /b 1
|
50 |
+
echo.
|
51 |
+
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
|
52 |
+
goto end
|
53 |
+
)
|
54 |
+
|
55 |
+
if "%1" == "dirhtml" (
|
56 |
+
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
|
57 |
+
if errorlevel 1 exit /b 1
|
58 |
+
echo.
|
59 |
+
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
|
60 |
+
goto end
|
61 |
+
)
|
62 |
+
|
63 |
+
if "%1" == "singlehtml" (
|
64 |
+
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
|
65 |
+
if errorlevel 1 exit /b 1
|
66 |
+
echo.
|
67 |
+
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
|
68 |
+
goto end
|
69 |
+
)
|
70 |
+
|
71 |
+
if "%1" == "pickle" (
|
72 |
+
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
|
73 |
+
if errorlevel 1 exit /b 1
|
74 |
+
echo.
|
75 |
+
echo.Build finished; now you can process the pickle files.
|
76 |
+
goto end
|
77 |
+
)
|
78 |
+
|
79 |
+
if "%1" == "json" (
|
80 |
+
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
|
81 |
+
if errorlevel 1 exit /b 1
|
82 |
+
echo.
|
83 |
+
echo.Build finished; now you can process the JSON files.
|
84 |
+
goto end
|
85 |
+
)
|
86 |
+
|
87 |
+
if "%1" == "htmlhelp" (
|
88 |
+
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
|
89 |
+
if errorlevel 1 exit /b 1
|
90 |
+
echo.
|
91 |
+
echo.Build finished; now you can run HTML Help Workshop with the ^
|
92 |
+
.hhp project file in %BUILDDIR%/htmlhelp.
|
93 |
+
goto end
|
94 |
+
)
|
95 |
+
|
96 |
+
if "%1" == "qthelp" (
|
97 |
+
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
|
98 |
+
if errorlevel 1 exit /b 1
|
99 |
+
echo.
|
100 |
+
echo.Build finished; now you can run "qcollectiongenerator" with the ^
|
101 |
+
.qhcp project file in %BUILDDIR%/qthelp, like this:
|
102 |
+
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\art_chatbot.qhcp
|
103 |
+
echo.To view the help file:
|
104 |
+
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\art_chatbot.ghc
|
105 |
+
goto end
|
106 |
+
)
|
107 |
+
|
108 |
+
if "%1" == "devhelp" (
|
109 |
+
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
|
110 |
+
if errorlevel 1 exit /b 1
|
111 |
+
echo.
|
112 |
+
echo.Build finished.
|
113 |
+
goto end
|
114 |
+
)
|
115 |
+
|
116 |
+
if "%1" == "epub" (
|
117 |
+
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
|
118 |
+
if errorlevel 1 exit /b 1
|
119 |
+
echo.
|
120 |
+
echo.Build finished. The epub file is in %BUILDDIR%/epub.
|
121 |
+
goto end
|
122 |
+
)
|
123 |
+
|
124 |
+
if "%1" == "latex" (
|
125 |
+
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
|
126 |
+
if errorlevel 1 exit /b 1
|
127 |
+
echo.
|
128 |
+
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
|
129 |
+
goto end
|
130 |
+
)
|
131 |
+
|
132 |
+
if "%1" == "text" (
|
133 |
+
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
|
134 |
+
if errorlevel 1 exit /b 1
|
135 |
+
echo.
|
136 |
+
echo.Build finished. The text files are in %BUILDDIR%/text.
|
137 |
+
goto end
|
138 |
+
)
|
139 |
+
|
140 |
+
if "%1" == "man" (
|
141 |
+
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
|
142 |
+
if errorlevel 1 exit /b 1
|
143 |
+
echo.
|
144 |
+
echo.Build finished. The manual pages are in %BUILDDIR%/man.
|
145 |
+
goto end
|
146 |
+
)
|
147 |
+
|
148 |
+
if "%1" == "texinfo" (
|
149 |
+
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
|
150 |
+
if errorlevel 1 exit /b 1
|
151 |
+
echo.
|
152 |
+
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
|
153 |
+
goto end
|
154 |
+
)
|
155 |
+
|
156 |
+
if "%1" == "gettext" (
|
157 |
+
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
|
158 |
+
if errorlevel 1 exit /b 1
|
159 |
+
echo.
|
160 |
+
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
|
161 |
+
goto end
|
162 |
+
)
|
163 |
+
|
164 |
+
if "%1" == "changes" (
|
165 |
+
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
|
166 |
+
if errorlevel 1 exit /b 1
|
167 |
+
echo.
|
168 |
+
echo.The overview file is in %BUILDDIR%/changes.
|
169 |
+
goto end
|
170 |
+
)
|
171 |
+
|
172 |
+
if "%1" == "linkcheck" (
|
173 |
+
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
|
174 |
+
if errorlevel 1 exit /b 1
|
175 |
+
echo.
|
176 |
+
echo.Link check complete; look for any errors in the above output ^
|
177 |
+
or in %BUILDDIR%/linkcheck/output.txt.
|
178 |
+
goto end
|
179 |
+
)
|
180 |
+
|
181 |
+
if "%1" == "doctest" (
|
182 |
+
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
|
183 |
+
if errorlevel 1 exit /b 1
|
184 |
+
echo.
|
185 |
+
echo.Testing of doctests in the sources finished, look at the ^
|
186 |
+
results in %BUILDDIR%/doctest/output.txt.
|
187 |
+
goto end
|
188 |
+
)
|
189 |
+
|
190 |
+
:end
|
models/.gitkeep
ADDED
File without changes
|
notebooks/.gitkeep
ADDED
File without changes
|
references/.gitkeep
ADDED
File without changes
|
reports/.gitkeep
ADDED
File without changes
|
reports/figures/.gitkeep
ADDED
File without changes
|
src/__init__.py
ADDED
File without changes
|
src/data/.gitkeep
ADDED
File without changes
|
src/data/__init__.py
ADDED
File without changes
|
src/data/__pycache__/wiki_scrape.cpython-311.pyc
ADDED
Binary file (3.78 kB). View file
|
|
src/data/entities.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Realism
|
2 |
+
portrait
|
3 |
+
Romanticism
|
4 |
+
landscape
|
5 |
+
Surrealism
|
6 |
+
Impressionism
|
7 |
+
genre painting
|
8 |
+
religious painting
|
9 |
+
Neoclassicism
|
10 |
+
Symbolist painting
|
11 |
+
Ivan Aivazovsky
|
12 |
+
Marc Chagall
|
13 |
+
John Singer Sargent
|
14 |
+
Gustave Dore
|
15 |
+
Zdzisław Beksiński
|
src/data/make_dataset.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import click
|
3 |
+
import logging
|
4 |
+
from pathlib import Path
|
5 |
+
from dotenv import find_dotenv, load_dotenv
|
6 |
+
|
7 |
+
|
8 |
+
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """Turn raw data from (../raw) into cleaned data ready to be
    analyzed (saved in ../processed).

    Currently a stub: it only logs that processing has started.
    """
    log = logging.getLogger(__name__)
    log.info('making final data set from raw data')
|
17 |
+
|
18 |
+
|
19 |
+
if __name__ == '__main__':
    # Basic console logging for the CLI entry point.
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found, then
    # load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    main()
|
src/data/wiki_scrape.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import wikipedia
|
2 |
+
import os
|
3 |
+
|
4 |
+
def get_raw_wikipedia_article(entity, max_retries=1):
    """Return the plain-text content of the best-matching Wikipedia page.

    Parameters
    ----------
    entity : str
        Search term (artist name, art movement, genre, ...).
    max_retries : int, optional
        How many times to retry with a more specific ' (arts)' suffix
        when the term is ambiguous. Bounds the recursion that was
        previously unlimited.

    Returns
    -------
    str or None
        The article content, or None when no page can be found.
    """
    try:
        results = wikipedia.search(entity)
        if not results:
            # The search returned nothing at all; previously this
            # raised an uncaught IndexError on results[0].
            return None
        page = wikipedia.page(results[0], auto_suggest=False)
        return page.content
    except wikipedia.exceptions.DisambiguationError:
        # Search term can't be disambiguated, so we try again with a
        # more specific search term adding ' (arts)' — but only a
        # bounded number of times to avoid unbounded recursion.
        if max_retries > 0:
            return get_raw_wikipedia_article(entity + ' (arts)', max_retries - 1)
        return None

    except wikipedia.exceptions.PageError:
        # If the page doesn't exist, handle the PageError here.
        print("The requested page does not exist on Wikipedia.")
        return None
|
19 |
+
|
20 |
+
def clean_article(raw_article):
    """Strip Wikipedia markup noise from a raw article.

    Cuts the article off at the 'See also' / 'References' sections,
    drops section headings and blank lines, and returns the remaining
    lines stripped and joined with newlines.
    """
    kept = []
    for line in raw_article.split('\n'):
        # Everything from 'See also' / 'References' onward is boilerplate.
        if line.startswith('== See also') or line.startswith('== References'):
            break
        stripped = line.strip()
        # Skip section headings and empty lines.
        if line.startswith('=') or not stripped:
            continue
        kept.append(stripped)
    return '\n'.join(kept)
|
35 |
+
|
36 |
+
def save_article(content, path):
    """Write *content* to *path* as UTF-8 text, overwriting any existing file."""
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(content)
|
39 |
+
|
40 |
+
def load_entities(entities_path):
    """Read the entity list (one entity per line) from *entities_path*.

    Whitespace-only lines are skipped, so a trailing newline in the file
    no longer yields an empty entity that would later be searched on
    Wikipedia.

    Returns
    -------
    list[str]
        Stripped, non-empty entity names in file order.
    """
    with open(entities_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]
|
43 |
+
|
44 |
+
def scrape(entities_path, save_path):
    """Scrape a cleaned Wikipedia article for every entity in the list.

    Parameters
    ----------
    entities_path : str
        Path to a text file with one entity per line.
    save_path : str
        Directory where '<entity>.txt' files are written (created if
        it does not exist).
    """
    # Make sure the output directory exists before writing into it.
    os.makedirs(save_path, exist_ok=True)

    for entity in load_entities(entities_path):
        raw_article = get_raw_wikipedia_article(entity)
        if raw_article is None:  # 'is None', not '== None'
            print(f'Article on Wikipedia not found for entity {entity} :(')
            continue

        cleaned_article = clean_article(raw_article)
        save_article(cleaned_article, os.path.join(save_path, f'{entity}.txt'))
|
54 |
+
|
55 |
+
if __name__ == '__main__':
    # Scrape the default entity list into the default articles directory.
    scrape('src/data/entities.txt', 'data/wiki_articles')
|
src/features/.gitkeep
ADDED
File without changes
|
src/features/__init__.py
ADDED
File without changes
|
src/features/build_features.py
ADDED
File without changes
|
src/models/.gitkeep
ADDED
File without changes
|
src/models/__init__.py
ADDED
File without changes
|
src/models/__pycache__/extractive_qa.cpython-311.pyc
ADDED
Binary file (2.74 kB). View file
|
|
src/models/__pycache__/search_engine.cpython-311.pyc
ADDED
Binary file (6.58 kB). View file
|
|
src/models/__pycache__/visual_qa.cpython-311.pyc
ADDED
Binary file (1.41 kB). View file
|
|
src/models/extractive_qa.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import BertTokenizer, BertForQuestionAnswering
|
2 |
+
import torch
|
3 |
+
|
4 |
+
class QA(object):
    """Extractive question answering with a SQuAD-finetuned BERT model.

    Given a question and a passage, returns the passage span the model
    scores highest as the answer.
    """

    def __init__(self,
                 model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'):

        self.model_name = model_name

        self.__load_model_and_tokenizer()

    def __load_model_and_tokenizer(self):
        # Weights and vocabulary both come from the same checkpoint.
        self.model = BertForQuestionAnswering.from_pretrained(self.model_name)
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)

    def __get_segment_ids(self, input_ids):
        """Build the 0/1 token-type vector separating question from passage."""
        # Everything up to and including the first [SEP] belongs to
        # segment A (the question); the remainder is segment B.
        boundary = input_ids.index(self.tokenizer.sep_token_id) + 1
        segment_ids = [0] * boundary + [1] * (len(input_ids) - boundary)

        # There should be a segment_id for every input token.
        assert len(segment_ids) == len(input_ids)

        return segment_ids

    def answer_question(self, query, passage):
        """Return the highest-scoring answer span for *query* in *passage*."""
        input_ids = self.tokenizer.encode(query, passage)
        token_types = torch.tensor([self.__get_segment_ids(input_ids)])

        # Forward pass: tokens plus segment IDs that distinguish the
        # question from the passage.
        outputs = self.model(torch.tensor([input_ids]),
                             token_type_ids=token_types,
                             return_dict=True)

        # Token positions with the highest start and end scores.
        first = torch.argmax(outputs.start_logits)
        last = torch.argmax(outputs.end_logits)

        return self.tokenizer.decode(input_ids[first:last + 1])
|
src/models/predict_model.py
ADDED
File without changes
|
src/models/search_engine.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from tqdm import tqdm
|
3 |
+
from whoosh.index import * # whoosh: full-text indexing and searching
|
4 |
+
from whoosh.fields import *
|
5 |
+
from whoosh import qparser
|
6 |
+
import sys
|
7 |
+
sys.path.insert(1, "src/data")
|
8 |
+
#import src.data.wiki_scrape as wiki_scrape
|
9 |
+
import wiki_scrape as wiki_scrape
|
10 |
+
|
11 |
+
class IR(object):
    """BM25 passage retrieval over scraped Wikipedia articles using Whoosh.

    Articles are split into overlapping fixed-size character passages,
    indexed once on disk, and searched with Whoosh's default BM25 scorer.
    """

    def __init__(self,
                 max_passage_length = 800,
                 overlap = 0.4,
                 passages_limit = 10000,
                 data_path = 'data/wiki_articles',
                 index_path = 'index'):
        # max_passage_length: passage size in characters.
        # overlap: fraction of each passage shared with the next one.
        # passages_limit: cap on the number of article files read.
        self.max_passage_length = max_passage_length
        self.overlap = overlap
        self.passages_limit = passages_limit
        self.data_path = data_path
        self.index_path = index_path
        self.ix = None

        # Cache the passages so retrieve_documents() does not have to
        # re-read every article file on each query (the original
        # reloaded all passages per call).
        self.passages = self.__load_passages()

        if os.path.exists(self.index_path):
            print(f'Index already exists in the directory {self.index_path}')
            print('Skipping building the index...')
            self.ix = open_dir(self.index_path)
        else:
            self.__create_index(self.passages)

    def __create_passages_from_article(self, content):
        """Split one article into overlapping character windows."""
        passages = []
        passage_diff = int(self.max_passage_length * (1-self.overlap))

        for i in range(0, len(content), passage_diff):
            passages.append(content[i: i + self.max_passage_length])
        return passages

    def __scrape_wiki_if_not_exists(self):
        """Scrape Wikipedia articles into data_path if none are there yet."""
        os.makedirs(self.data_path, exist_ok=True)

        if len(os.listdir(self.data_path)) == 0:
            print('No Wiki articles. Scraping...')
            # Scrape into the configured directory (the original
            # hard-coded 'data/wiki_articles' here, breaking any
            # non-default data_path).
            wiki_scrape.scrape('src/data/entities.txt', self.data_path)

    def __load_passages(self):
        """Read every .txt article in data_path and split it into passages."""
        self.__scrape_wiki_if_not_exists()

        passages = []
        files_read = 0

        directory = os.fsencode(self.data_path)

        for file in os.listdir(directory):
            filename = os.fsdecode(file)
            if not filename.endswith(".txt"):
                continue

            with open(os.path.join(self.data_path, filename), 'r', encoding='utf-8') as f:
                content = f.read()
                passages.extend(self.__create_passages_from_article(content))

            # NOTE: despite its name, passages_limit caps the number of
            # article *files* read, preserving the original behaviour.
            files_read += 1
            if files_read == self.passages_limit:
                break
        return passages

    def __create_index(self, passages):
        """Build a new on-disk Whoosh index over *passages* at index_path."""
        # Create the index directory
        os.mkdir(self.index_path)

        # Schema definition:
        # - id: type ID, unique, stored; doc id in order given the passages
        # - text: type TEXT processed by StemmingAnalyzer; not stored
        schema = Schema(id = ID(stored=True, unique=True),
                        text = TEXT(analyzer=analysis.StemmingAnalyzer()))

        # Create the index at the configured path (the original
        # hard-coded "index" here, ignoring index_path).
        self.ix = create_in(self.index_path, schema)
        writer = self.ix.writer()

        # Add every passage; the running position doubles as its id.
        for doc_id, passage_text in enumerate(tqdm(passages, desc='Building index')):
            writer.add_document(id=str(doc_id), text=passage_text)

        # Save the added documents
        writer.commit()
        print("Index successfully created")

    def retrieve_documents(self, query, topk):
        """Return (texts, scores) of the topk BM25 hits for *query*.

        Parameters
        ----------
        query : str
            Free-text query, parsed with OR-semantics over the 'text' field.
        topk : int
            Maximum number of passages to return.
        """
        scores = []
        text = []
        # Open the searcher for reading the index; the default BM25
        # algorithm is used for scoring. The context manager closes it
        # (the original leaked a second, never-closed searcher here).
        with self.ix.searcher() as searcher:
            q = qparser.QueryParser("text", self.ix.schema,
                                    group=qparser.OrGroup).parse(query)

            # Highest-scoring documents first.
            results = searcher.search(q, limit=topk)

            for hit in results:
                scores.append(hit.score)
                # Map the stored id back to the cached passage text.
                text.append(self.passages[int(hit['id'])])
        return text, scores
|
src/models/train_model.py
ADDED
File without changes
|
src/models/visual_qa.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
|
3 |
+
|
4 |
+
class VisualQA(object):
    """Visual question answering over images via a ViLT-based pipeline."""

    def __init__(self, model_name='nflechas/VQArt', tokenizer_name='dandelin/vilt-b32-finetuned-vqa'):
        self.model_name = model_name
        self.tokenizer_name = tokenizer_name
        self.__load_model()

    def __load_model(self):
        # Fine-tuned weights paired with the base ViLT tokenizer.
        self.model = pipeline('vqa', model=self.model_name, tokenizer=self.tokenizer_name)

    def answer_question(self, query, image):
        """Return the single best answer string for *query* about *image*."""
        predictions = self.model(question=query, image=image, top_k=1)
        return predictions[0]['answer']
|
src/visualization/.gitkeep
ADDED
File without changes
|
src/visualization/__init__.py
ADDED
File without changes
|
src/visualization/visualize.py
ADDED
File without changes
|