Upload 35 files
Browse files- app/app.py +116 -0
- docs/Makefile +153 -0
- docs/commands.rst +10 -0
- docs/conf.py +244 -0
- docs/getting-started.rst +6 -0
- docs/index.rst +24 -0
- docs/make.bat +190 -0
- models/.gitkeep +0 -0
- notebooks/.gitkeep +0 -0
- references/.gitkeep +0 -0
- reports/.gitkeep +0 -0
- reports/figures/.gitkeep +0 -0
- src/__init__.py +0 -0
- src/data/.gitkeep +0 -0
- src/data/__init__.py +0 -0
- src/data/__pycache__/wiki_scrape.cpython-311.pyc +0 -0
- src/data/entities.txt +15 -0
- src/data/make_dataset.py +30 -0
- src/data/wiki_scrape.py +56 -0
- src/features/.gitkeep +0 -0
- src/features/__init__.py +0 -0
- src/features/build_features.py +0 -0
- src/models/.gitkeep +0 -0
- src/models/__init__.py +0 -0
- src/models/__pycache__/extractive_qa.cpython-311.pyc +0 -0
- src/models/__pycache__/search_engine.cpython-311.pyc +0 -0
- src/models/__pycache__/visual_qa.cpython-311.pyc +0 -0
- src/models/extractive_qa.py +50 -0
- src/models/predict_model.py +0 -0
- src/models/search_engine.py +118 -0
- src/models/train_model.py +0 -0
- src/models/visual_qa.py +14 -0
- src/visualization/.gitkeep +0 -0
- src/visualization/__init__.py +0 -0
- src/visualization/visualize.py +0 -0
app/app.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python3 -m streamlit run app.py
|
2 |
+
|
3 |
+
import streamlit as st
|
4 |
+
from PIL import Image
|
5 |
+
import numpy as np
|
6 |
+
from pathlib import Path
|
7 |
+
import shutil
|
8 |
+
import sys
|
9 |
+
sys.path.insert(1, "src/models")
|
10 |
+
from extractive_qa import QA
|
11 |
+
from visual_qa import VisualQA
|
12 |
+
from search_engine import IR
|
13 |
+
# from src.models.extractive_qa import QA
|
14 |
+
# from src.models.search_engine import IR
|
15 |
+
|
16 |
+
|
17 |
+
@st.cache_resource
def load_visual_qa_module():
    """Build and return the visual question-answering module.

    Decorated with ``st.cache_resource`` so the underlying model is
    instantiated once per server process instead of on every Streamlit
    rerun.
    """
    return VisualQA()
|
24 |
+
|
25 |
+
@st.cache_resource
def load_qa_module():
    """Build and return the extractive question-answering module.

    Cached with ``st.cache_resource`` so the (expensive) model load
    happens only once per server process.
    """
    return QA()
|
32 |
+
|
33 |
+
@st.cache_resource
def load_search_engine():
    """
    Loads the document search engine (IR module).

    Fixed docstring: the previous text said "extractive QA module",
    a copy-paste error — this loads the information-retrieval engine
    used to fetch candidate articles, not the QA model.
    Cached with st.cache_resource so it is built once per process.
    """
    search_engine = IR()
    return search_engine
|
40 |
+
|
41 |
+
def get_metadata_from_question(question):
    """Map a free-text question to the painting metadata field it asks about.

    The match is a case-insensitive substring test, checked in priority
    order: 'artist', then 'style', then 'genre'. (The original version
    was case-sensitive, so e.g. "Who is the Artist?" matched nothing.)

    Parameters
    ----------
    question : str
        The user's question.

    Returns
    -------
    str or None
        One of 'artist', 'style', 'genre', or None (explicitly) when the
        question mentions no known metadata field.
    """
    lowered = question.lower()
    for field in ('artist', 'style', 'genre'):
        if field in lowered:
            return field
    return None
|
48 |
+
|
49 |
+
# Defining session variables.
# extractive_qa: False until the first successful VQA answer, after which
# follow-up questions are routed to the extractive-QA pipeline.
if 'extractive_qa' not in st.session_state:
    st.session_state.extractive_qa = False

# vqa_prediction: the attribute value predicted by VQA, used to scope
# later document retrieval.
if 'vqa_prediction' not in st.session_state:
    st.session_state.vqa_prediction = None

# Remove any stale results directory left over from a previous run.
dirpath = Path.cwd() / 'results'
model_path = Path.cwd() / 'models'
if dirpath.exists() and dirpath.is_dir():
    shutil.rmtree(dirpath)

# Cached singletons — loaded once per server process (see loaders above).
vqa_module = load_visual_qa_module()
qa_module = load_qa_module()
search_engine = load_search_engine()

st.title("VQArt")

st.markdown("""Hello, please take a picture of the painting and ask a question about it. \
I can answer questions about the style, artist and genre of the painting, \
and then questions about these topics. \
""")

# Take a picture
imgbuffer = st.camera_input('')

# Upload a file
uploaded_file = st.file_uploader('Upload a photo of a painting')

# Prompt for a question
question = st.text_input(label="What is your question (e.g. Who's the artist of this painting?)")

if question:
    print(f'Received question: {question}')

    if st.session_state.extractive_qa:
        # Doing Extractive QA: prefix the question with the previously
        # predicted attribute so retrieval is scoped to that entity.
        full_question = f'[{st.session_state.vqa_prediction}] {question}'

        articles, scores = search_engine.retrieve_documents(full_question, 5)
        print(f'Found {len(articles)} search results')

        if len(articles) == 0:
            st.markdown("Sorry, I don't know the answer to that question :(")
        else:
            # Answer from the highest-ranked article only.
            best_result = articles[0]
            answer = qa_module.answer_question(full_question, best_result)
            st.markdown(f'Answer: {answer}')
    else:
        # Doing VQA — needs an image from either input widget.
        img = None
        if imgbuffer:
            # Camera
            img = Image.open(imgbuffer)
        elif uploaded_file:
            # Uploaded file
            img = Image.open(uploaded_file)

        if img is None:
            # BUG FIX: previously `img` was left unbound when no image was
            # supplied, so answering raised NameError. Prompt instead.
            st.markdown("Please take or upload a photo of the painting first.")
        else:
            result = vqa_module.answer_question(question, img)
            meta_data = get_metadata_from_question(question)
            st.markdown(f"Answer: The {meta_data} of this painting is {result}")

            # Switching to extractive QA
            st.session_state.extractive_qa = True

            # Saving the predicted VQA answer
            st.session_state.vqa_prediction = result
|
docs/Makefile
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Makefile for Sphinx documentation
|
2 |
+
#
|
3 |
+
|
4 |
+
# You can set these variables from the command line.
|
5 |
+
SPHINXOPTS =
|
6 |
+
SPHINXBUILD = sphinx-build
|
7 |
+
PAPER =
|
8 |
+
BUILDDIR = _build
|
9 |
+
|
10 |
+
# Internal variables.
|
11 |
+
PAPEROPT_a4 = -D latex_paper_size=a4
|
12 |
+
PAPEROPT_letter = -D latex_paper_size=letter
|
13 |
+
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
14 |
+
# the i18n builder cannot share the environment and doctrees with the others
|
15 |
+
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
16 |
+
|
17 |
+
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
|
18 |
+
|
19 |
+
help:
|
20 |
+
@echo "Please use \`make <target>' where <target> is one of"
|
21 |
+
@echo " html to make standalone HTML files"
|
22 |
+
@echo " dirhtml to make HTML files named index.html in directories"
|
23 |
+
@echo " singlehtml to make a single large HTML file"
|
24 |
+
@echo " pickle to make pickle files"
|
25 |
+
@echo " json to make JSON files"
|
26 |
+
@echo " htmlhelp to make HTML files and a HTML help project"
|
27 |
+
@echo " qthelp to make HTML files and a qthelp project"
|
28 |
+
@echo " devhelp to make HTML files and a Devhelp project"
|
29 |
+
@echo " epub to make an epub"
|
30 |
+
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
|
31 |
+
@echo " latexpdf to make LaTeX files and run them through pdflatex"
|
32 |
+
@echo " text to make text files"
|
33 |
+
@echo " man to make manual pages"
|
34 |
+
@echo " texinfo to make Texinfo files"
|
35 |
+
@echo " info to make Texinfo files and run them through makeinfo"
|
36 |
+
@echo " gettext to make PO message catalogs"
|
37 |
+
@echo " changes to make an overview of all changed/added/deprecated items"
|
38 |
+
@echo " linkcheck to check all external links for integrity"
|
39 |
+
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
|
40 |
+
|
41 |
+
clean:
|
42 |
+
-rm -rf $(BUILDDIR)/*
|
43 |
+
|
44 |
+
html:
|
45 |
+
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
46 |
+
@echo
|
47 |
+
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
|
48 |
+
|
49 |
+
dirhtml:
|
50 |
+
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
|
51 |
+
@echo
|
52 |
+
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
|
53 |
+
|
54 |
+
singlehtml:
|
55 |
+
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
|
56 |
+
@echo
|
57 |
+
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
|
58 |
+
|
59 |
+
pickle:
|
60 |
+
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
|
61 |
+
@echo
|
62 |
+
@echo "Build finished; now you can process the pickle files."
|
63 |
+
|
64 |
+
json:
|
65 |
+
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
|
66 |
+
@echo
|
67 |
+
@echo "Build finished; now you can process the JSON files."
|
68 |
+
|
69 |
+
htmlhelp:
|
70 |
+
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
|
71 |
+
@echo
|
72 |
+
@echo "Build finished; now you can run HTML Help Workshop with the" \
|
73 |
+
".hhp project file in $(BUILDDIR)/htmlhelp."
|
74 |
+
|
75 |
+
qthelp:
|
76 |
+
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
|
77 |
+
@echo
|
78 |
+
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
|
79 |
+
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
|
80 |
+
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/art_chatbot.qhcp"
|
81 |
+
@echo "To view the help file:"
|
82 |
+
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/art_chatbot.qhc"
|
83 |
+
|
84 |
+
devhelp:
|
85 |
+
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
|
86 |
+
@echo
|
87 |
+
@echo "Build finished."
|
88 |
+
@echo "To view the help file:"
|
89 |
+
@echo "# mkdir -p $$HOME/.local/share/devhelp/art_chatbot"
|
90 |
+
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/art_chatbot"
|
91 |
+
@echo "# devhelp"
|
92 |
+
|
93 |
+
epub:
|
94 |
+
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
|
95 |
+
@echo
|
96 |
+
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
|
97 |
+
|
98 |
+
latex:
|
99 |
+
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
100 |
+
@echo
|
101 |
+
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
|
102 |
+
@echo "Run \`make' in that directory to run these through (pdf)latex" \
|
103 |
+
"(use \`make latexpdf' here to do that automatically)."
|
104 |
+
|
105 |
+
latexpdf:
|
106 |
+
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
107 |
+
@echo "Running LaTeX files through pdflatex..."
|
108 |
+
$(MAKE) -C $(BUILDDIR)/latex all-pdf
|
109 |
+
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
110 |
+
|
111 |
+
text:
|
112 |
+
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
|
113 |
+
@echo
|
114 |
+
@echo "Build finished. The text files are in $(BUILDDIR)/text."
|
115 |
+
|
116 |
+
man:
|
117 |
+
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
|
118 |
+
@echo
|
119 |
+
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
|
120 |
+
|
121 |
+
texinfo:
|
122 |
+
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
123 |
+
@echo
|
124 |
+
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
|
125 |
+
@echo "Run \`make' in that directory to run these through makeinfo" \
|
126 |
+
"(use \`make info' here to do that automatically)."
|
127 |
+
|
128 |
+
info:
|
129 |
+
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
130 |
+
@echo "Running Texinfo files through makeinfo..."
|
131 |
+
make -C $(BUILDDIR)/texinfo info
|
132 |
+
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
|
133 |
+
|
134 |
+
gettext:
|
135 |
+
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
|
136 |
+
@echo
|
137 |
+
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
|
138 |
+
|
139 |
+
changes:
|
140 |
+
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
|
141 |
+
@echo
|
142 |
+
@echo "The overview file is in $(BUILDDIR)/changes."
|
143 |
+
|
144 |
+
linkcheck:
|
145 |
+
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
|
146 |
+
@echo
|
147 |
+
@echo "Link check complete; look for any errors in the above output " \
|
148 |
+
"or in $(BUILDDIR)/linkcheck/output.txt."
|
149 |
+
|
150 |
+
doctest:
|
151 |
+
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
|
152 |
+
@echo "Testing of doctests in the sources finished, look at the " \
|
153 |
+
"results in $(BUILDDIR)/doctest/output.txt."
|
docs/commands.rst
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Commands
|
2 |
+
========
|
3 |
+
|
4 |
+
The Makefile contains the central entry points for common tasks related to this project.
|
5 |
+
|
6 |
+
Syncing data to S3
|
7 |
+
^^^^^^^^^^^^^^^^^^
|
8 |
+
|
9 |
+
* `make sync_data_to_s3` will use `aws s3 sync` to recursively sync files in `data/` up to `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/`.
|
10 |
+
* `make sync_data_from_s3` will use `aws s3 sync` to recursively sync files from `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/` to `data/`.
|
docs/conf.py
ADDED
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
#
|
3 |
+
# art_chatbot documentation build configuration file, created by
|
4 |
+
# sphinx-quickstart.
|
5 |
+
#
|
6 |
+
# This file is execfile()d with the current directory set to its containing dir.
|
7 |
+
#
|
8 |
+
# Note that not all possible configuration values are present in this
|
9 |
+
# autogenerated file.
|
10 |
+
#
|
11 |
+
# All configuration values have a default; values that are commented out
|
12 |
+
# serve to show the default.
|
13 |
+
|
14 |
+
import os
|
15 |
+
import sys
|
16 |
+
|
17 |
+
# If extensions (or modules to document with autodoc) are in another directory,
|
18 |
+
# add these directories to sys.path here. If the directory is relative to the
|
19 |
+
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
20 |
+
# sys.path.insert(0, os.path.abspath('.'))
|
21 |
+
|
22 |
+
# -- General configuration -----------------------------------------------------
|
23 |
+
|
24 |
+
# If your documentation needs a minimal Sphinx version, state it here.
|
25 |
+
# needs_sphinx = '1.0'
|
26 |
+
|
27 |
+
# Add any Sphinx extension module names here, as strings. They can be extensions
|
28 |
+
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
29 |
+
extensions = []
|
30 |
+
|
31 |
+
# Add any paths that contain templates here, relative to this directory.
|
32 |
+
templates_path = ['_templates']
|
33 |
+
|
34 |
+
# The suffix of source filenames.
|
35 |
+
source_suffix = '.rst'
|
36 |
+
|
37 |
+
# The encoding of source files.
|
38 |
+
# source_encoding = 'utf-8-sig'
|
39 |
+
|
40 |
+
# The master toctree document.
|
41 |
+
master_doc = 'index'
|
42 |
+
|
43 |
+
# General information about the project.
|
44 |
+
project = u'art_chatbot'
|
45 |
+
|
46 |
+
# The version info for the project you're documenting, acts as replacement for
|
47 |
+
# |version| and |release|, also used in various other places throughout the
|
48 |
+
# built documents.
|
49 |
+
#
|
50 |
+
# The short X.Y version.
|
51 |
+
version = '0.1'
|
52 |
+
# The full version, including alpha/beta/rc tags.
|
53 |
+
release = '0.1'
|
54 |
+
|
55 |
+
# The language for content autogenerated by Sphinx. Refer to documentation
|
56 |
+
# for a list of supported languages.
|
57 |
+
# language = None
|
58 |
+
|
59 |
+
# There are two options for replacing |today|: either, you set today to some
|
60 |
+
# non-false value, then it is used:
|
61 |
+
# today = ''
|
62 |
+
# Else, today_fmt is used as the format for a strftime call.
|
63 |
+
# today_fmt = '%B %d, %Y'
|
64 |
+
|
65 |
+
# List of patterns, relative to source directory, that match files and
|
66 |
+
# directories to ignore when looking for source files.
|
67 |
+
exclude_patterns = ['_build']
|
68 |
+
|
69 |
+
# The reST default role (used for this markup: `text`) to use for all documents.
|
70 |
+
# default_role = None
|
71 |
+
|
72 |
+
# If true, '()' will be appended to :func: etc. cross-reference text.
|
73 |
+
# add_function_parentheses = True
|
74 |
+
|
75 |
+
# If true, the current module name will be prepended to all description
|
76 |
+
# unit titles (such as .. function::).
|
77 |
+
# add_module_names = True
|
78 |
+
|
79 |
+
# If true, sectionauthor and moduleauthor directives will be shown in the
|
80 |
+
# output. They are ignored by default.
|
81 |
+
# show_authors = False
|
82 |
+
|
83 |
+
# The name of the Pygments (syntax highlighting) style to use.
|
84 |
+
pygments_style = 'sphinx'
|
85 |
+
|
86 |
+
# A list of ignored prefixes for module index sorting.
|
87 |
+
# modindex_common_prefix = []
|
88 |
+
|
89 |
+
|
90 |
+
# -- Options for HTML output ---------------------------------------------------
|
91 |
+
|
92 |
+
# The theme to use for HTML and HTML Help pages. See the documentation for
|
93 |
+
# a list of builtin themes.
|
94 |
+
html_theme = 'default'
|
95 |
+
|
96 |
+
# Theme options are theme-specific and customize the look and feel of a theme
|
97 |
+
# further. For a list of options available for each theme, see the
|
98 |
+
# documentation.
|
99 |
+
# html_theme_options = {}
|
100 |
+
|
101 |
+
# Add any paths that contain custom themes here, relative to this directory.
|
102 |
+
# html_theme_path = []
|
103 |
+
|
104 |
+
# The name for this set of Sphinx documents. If None, it defaults to
|
105 |
+
# "<project> v<release> documentation".
|
106 |
+
# html_title = None
|
107 |
+
|
108 |
+
# A shorter title for the navigation bar. Default is the same as html_title.
|
109 |
+
# html_short_title = None
|
110 |
+
|
111 |
+
# The name of an image file (relative to this directory) to place at the top
|
112 |
+
# of the sidebar.
|
113 |
+
# html_logo = None
|
114 |
+
|
115 |
+
# The name of an image file (within the static path) to use as favicon of the
|
116 |
+
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
117 |
+
# pixels large.
|
118 |
+
# html_favicon = None
|
119 |
+
|
120 |
+
# Add any paths that contain custom static files (such as style sheets) here,
|
121 |
+
# relative to this directory. They are copied after the builtin static files,
|
122 |
+
# so a file named "default.css" will overwrite the builtin "default.css".
|
123 |
+
html_static_path = ['_static']
|
124 |
+
|
125 |
+
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
126 |
+
# using the given strftime format.
|
127 |
+
# html_last_updated_fmt = '%b %d, %Y'
|
128 |
+
|
129 |
+
# If true, SmartyPants will be used to convert quotes and dashes to
|
130 |
+
# typographically correct entities.
|
131 |
+
# html_use_smartypants = True
|
132 |
+
|
133 |
+
# Custom sidebar templates, maps document names to template names.
|
134 |
+
# html_sidebars = {}
|
135 |
+
|
136 |
+
# Additional templates that should be rendered to pages, maps page names to
|
137 |
+
# template names.
|
138 |
+
# html_additional_pages = {}
|
139 |
+
|
140 |
+
# If false, no module index is generated.
|
141 |
+
# html_domain_indices = True
|
142 |
+
|
143 |
+
# If false, no index is generated.
|
144 |
+
# html_use_index = True
|
145 |
+
|
146 |
+
# If true, the index is split into individual pages for each letter.
|
147 |
+
# html_split_index = False
|
148 |
+
|
149 |
+
# If true, links to the reST sources are added to the pages.
|
150 |
+
# html_show_sourcelink = True
|
151 |
+
|
152 |
+
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
|
153 |
+
# html_show_sphinx = True
|
154 |
+
|
155 |
+
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
|
156 |
+
# html_show_copyright = True
|
157 |
+
|
158 |
+
# If true, an OpenSearch description file will be output, and all pages will
|
159 |
+
# contain a <link> tag referring to it. The value of this option must be the
|
160 |
+
# base URL from which the finished HTML is served.
|
161 |
+
# html_use_opensearch = ''
|
162 |
+
|
163 |
+
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
164 |
+
# html_file_suffix = None
|
165 |
+
|
166 |
+
# Output file base name for HTML help builder.
|
167 |
+
htmlhelp_basename = 'art_chatbotdoc'
|
168 |
+
|
169 |
+
|
170 |
+
# -- Options for LaTeX output --------------------------------------------------
|
171 |
+
|
172 |
+
latex_elements = {
|
173 |
+
# The paper size ('letterpaper' or 'a4paper').
|
174 |
+
# 'papersize': 'letterpaper',
|
175 |
+
|
176 |
+
# The font size ('10pt', '11pt' or '12pt').
|
177 |
+
# 'pointsize': '10pt',
|
178 |
+
|
179 |
+
# Additional stuff for the LaTeX preamble.
|
180 |
+
# 'preamble': '',
|
181 |
+
}
|
182 |
+
|
183 |
+
# Grouping the document tree into LaTeX files. List of tuples
|
184 |
+
# (source start file, target name, title, author, documentclass [howto/manual]).
|
185 |
+
latex_documents = [
|
186 |
+
('index',
|
187 |
+
'art_chatbot.tex',
|
188 |
+
u'art_chatbot Documentation',
|
189 |
+
u"Your name (or your organization/company/team)", 'manual'),
|
190 |
+
]
|
191 |
+
|
192 |
+
# The name of an image file (relative to this directory) to place at the top of
|
193 |
+
# the title page.
|
194 |
+
# latex_logo = None
|
195 |
+
|
196 |
+
# For "manual" documents, if this is true, then toplevel headings are parts,
|
197 |
+
# not chapters.
|
198 |
+
# latex_use_parts = False
|
199 |
+
|
200 |
+
# If true, show page references after internal links.
|
201 |
+
# latex_show_pagerefs = False
|
202 |
+
|
203 |
+
# If true, show URL addresses after external links.
|
204 |
+
# latex_show_urls = False
|
205 |
+
|
206 |
+
# Documents to append as an appendix to all manuals.
|
207 |
+
# latex_appendices = []
|
208 |
+
|
209 |
+
# If false, no module index is generated.
|
210 |
+
# latex_domain_indices = True
|
211 |
+
|
212 |
+
|
213 |
+
# -- Options for manual page output --------------------------------------------
|
214 |
+
|
215 |
+
# One entry per manual page. List of tuples
|
216 |
+
# (source start file, name, description, authors, manual section).
|
217 |
+
man_pages = [
|
218 |
+
('index', 'art_chatbot', u'art_chatbot Documentation',
|
219 |
+
[u"Your name (or your organization/company/team)"], 1)
|
220 |
+
]
|
221 |
+
|
222 |
+
# If true, show URL addresses after external links.
|
223 |
+
# man_show_urls = False
|
224 |
+
|
225 |
+
|
226 |
+
# -- Options for Texinfo output ------------------------------------------------
|
227 |
+
|
228 |
+
# Grouping the document tree into Texinfo files. List of tuples
|
229 |
+
# (source start file, target name, title, author,
|
230 |
+
# dir menu entry, description, category)
|
231 |
+
texinfo_documents = [
|
232 |
+
('index', 'art_chatbot', u'art_chatbot Documentation',
|
233 |
+
u"Your name (or your organization/company/team)", 'art_chatbot',
|
234 |
+
'A short description of the project.', 'Miscellaneous'),
|
235 |
+
]
|
236 |
+
|
237 |
+
# Documents to append as an appendix to all manuals.
|
238 |
+
# texinfo_appendices = []
|
239 |
+
|
240 |
+
# If false, no module index is generated.
|
241 |
+
# texinfo_domain_indices = True
|
242 |
+
|
243 |
+
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
244 |
+
# texinfo_show_urls = 'footnote'
|
docs/getting-started.rst
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Getting started
|
2 |
+
===============
|
3 |
+
|
4 |
+
This is where you describe how to get set up on a clean install, including the
|
5 |
+
commands necessary to get the raw data (using the `sync_data_from_s3` command,
|
6 |
+
for example), and then how to make the cleaned, final data sets.
|
docs/index.rst
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.. art_chatbot documentation master file, created by
|
2 |
+
sphinx-quickstart.
|
3 |
+
You can adapt this file completely to your liking, but it should at least
|
4 |
+
contain the root `toctree` directive.
|
5 |
+
|
6 |
+
art_chatbot documentation!
|
7 |
+
==============================================
|
8 |
+
|
9 |
+
Contents:
|
10 |
+
|
11 |
+
.. toctree::
|
12 |
+
:maxdepth: 2
|
13 |
+
|
14 |
+
getting-started
|
15 |
+
commands
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
Indices and tables
|
20 |
+
==================
|
21 |
+
|
22 |
+
* :ref:`genindex`
|
23 |
+
* :ref:`modindex`
|
24 |
+
* :ref:`search`
|
docs/make.bat
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@ECHO OFF
|
2 |
+
|
3 |
+
REM Command file for Sphinx documentation
|
4 |
+
|
5 |
+
if "%SPHINXBUILD%" == "" (
|
6 |
+
set SPHINXBUILD=sphinx-build
|
7 |
+
)
|
8 |
+
set BUILDDIR=_build
|
9 |
+
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
|
10 |
+
set I18NSPHINXOPTS=%SPHINXOPTS% .
|
11 |
+
if NOT "%PAPER%" == "" (
|
12 |
+
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
|
13 |
+
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
|
14 |
+
)
|
15 |
+
|
16 |
+
if "%1" == "" goto help
|
17 |
+
|
18 |
+
if "%1" == "help" (
|
19 |
+
:help
|
20 |
+
echo.Please use `make ^<target^>` where ^<target^> is one of
|
21 |
+
echo. html to make standalone HTML files
|
22 |
+
echo. dirhtml to make HTML files named index.html in directories
|
23 |
+
echo. singlehtml to make a single large HTML file
|
24 |
+
echo. pickle to make pickle files
|
25 |
+
echo. json to make JSON files
|
26 |
+
echo. htmlhelp to make HTML files and a HTML help project
|
27 |
+
echo. qthelp to make HTML files and a qthelp project
|
28 |
+
echo. devhelp to make HTML files and a Devhelp project
|
29 |
+
echo. epub to make an epub
|
30 |
+
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
|
31 |
+
echo. text to make text files
|
32 |
+
echo. man to make manual pages
|
33 |
+
echo. texinfo to make Texinfo files
|
34 |
+
echo. gettext to make PO message catalogs
|
35 |
+
echo. changes to make an overview over all changed/added/deprecated items
|
36 |
+
echo. linkcheck to check all external links for integrity
|
37 |
+
echo. doctest to run all doctests embedded in the documentation if enabled
|
38 |
+
goto end
|
39 |
+
)
|
40 |
+
|
41 |
+
if "%1" == "clean" (
|
42 |
+
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
|
43 |
+
del /q /s %BUILDDIR%\*
|
44 |
+
goto end
|
45 |
+
)
|
46 |
+
|
47 |
+
if "%1" == "html" (
|
48 |
+
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
|
49 |
+
if errorlevel 1 exit /b 1
|
50 |
+
echo.
|
51 |
+
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
|
52 |
+
goto end
|
53 |
+
)
|
54 |
+
|
55 |
+
if "%1" == "dirhtml" (
|
56 |
+
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
|
57 |
+
if errorlevel 1 exit /b 1
|
58 |
+
echo.
|
59 |
+
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
|
60 |
+
goto end
|
61 |
+
)
|
62 |
+
|
63 |
+
if "%1" == "singlehtml" (
|
64 |
+
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
|
65 |
+
if errorlevel 1 exit /b 1
|
66 |
+
echo.
|
67 |
+
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
|
68 |
+
goto end
|
69 |
+
)
|
70 |
+
|
71 |
+
if "%1" == "pickle" (
|
72 |
+
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
|
73 |
+
if errorlevel 1 exit /b 1
|
74 |
+
echo.
|
75 |
+
echo.Build finished; now you can process the pickle files.
|
76 |
+
goto end
|
77 |
+
)
|
78 |
+
|
79 |
+
if "%1" == "json" (
|
80 |
+
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
|
81 |
+
if errorlevel 1 exit /b 1
|
82 |
+
echo.
|
83 |
+
echo.Build finished; now you can process the JSON files.
|
84 |
+
goto end
|
85 |
+
)
|
86 |
+
|
87 |
+
if "%1" == "htmlhelp" (
|
88 |
+
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
|
89 |
+
if errorlevel 1 exit /b 1
|
90 |
+
echo.
|
91 |
+
echo.Build finished; now you can run HTML Help Workshop with the ^
|
92 |
+
.hhp project file in %BUILDDIR%/htmlhelp.
|
93 |
+
goto end
|
94 |
+
)
|
95 |
+
|
96 |
+
if "%1" == "qthelp" (
|
97 |
+
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
|
98 |
+
if errorlevel 1 exit /b 1
|
99 |
+
echo.
|
100 |
+
echo.Build finished; now you can run "qcollectiongenerator" with the ^
|
101 |
+
.qhcp project file in %BUILDDIR%/qthelp, like this:
|
102 |
+
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\art_chatbot.qhcp
|
103 |
+
echo.To view the help file:
|
104 |
+
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\art_chatbot.ghc
|
105 |
+
goto end
|
106 |
+
)
|
107 |
+
|
108 |
+
if "%1" == "devhelp" (
|
109 |
+
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
|
110 |
+
if errorlevel 1 exit /b 1
|
111 |
+
echo.
|
112 |
+
echo.Build finished.
|
113 |
+
goto end
|
114 |
+
)
|
115 |
+
|
116 |
+
if "%1" == "epub" (
|
117 |
+
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
|
118 |
+
if errorlevel 1 exit /b 1
|
119 |
+
echo.
|
120 |
+
echo.Build finished. The epub file is in %BUILDDIR%/epub.
|
121 |
+
goto end
|
122 |
+
)
|
123 |
+
|
124 |
+
if "%1" == "latex" (
|
125 |
+
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
|
126 |
+
if errorlevel 1 exit /b 1
|
127 |
+
echo.
|
128 |
+
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
|
129 |
+
goto end
|
130 |
+
)
|
131 |
+
|
132 |
+
if "%1" == "text" (
|
133 |
+
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
|
134 |
+
if errorlevel 1 exit /b 1
|
135 |
+
echo.
|
136 |
+
echo.Build finished. The text files are in %BUILDDIR%/text.
|
137 |
+
goto end
|
138 |
+
)
|
139 |
+
|
140 |
+
if "%1" == "man" (
|
141 |
+
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
|
142 |
+
if errorlevel 1 exit /b 1
|
143 |
+
echo.
|
144 |
+
echo.Build finished. The manual pages are in %BUILDDIR%/man.
|
145 |
+
goto end
|
146 |
+
)
|
147 |
+
|
148 |
+
if "%1" == "texinfo" (
|
149 |
+
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
|
150 |
+
if errorlevel 1 exit /b 1
|
151 |
+
echo.
|
152 |
+
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
|
153 |
+
goto end
|
154 |
+
)
|
155 |
+
|
156 |
+
if "%1" == "gettext" (
|
157 |
+
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
|
158 |
+
if errorlevel 1 exit /b 1
|
159 |
+
echo.
|
160 |
+
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
|
161 |
+
goto end
|
162 |
+
)
|
163 |
+
|
164 |
+
if "%1" == "changes" (
|
165 |
+
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
|
166 |
+
if errorlevel 1 exit /b 1
|
167 |
+
echo.
|
168 |
+
echo.The overview file is in %BUILDDIR%/changes.
|
169 |
+
goto end
|
170 |
+
)
|
171 |
+
|
172 |
+
if "%1" == "linkcheck" (
|
173 |
+
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
|
174 |
+
if errorlevel 1 exit /b 1
|
175 |
+
echo.
|
176 |
+
echo.Link check complete; look for any errors in the above output ^
|
177 |
+
or in %BUILDDIR%/linkcheck/output.txt.
|
178 |
+
goto end
|
179 |
+
)
|
180 |
+
|
181 |
+
if "%1" == "doctest" (
|
182 |
+
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
|
183 |
+
if errorlevel 1 exit /b 1
|
184 |
+
echo.
|
185 |
+
echo.Testing of doctests in the sources finished, look at the ^
|
186 |
+
results in %BUILDDIR%/doctest/output.txt.
|
187 |
+
goto end
|
188 |
+
)
|
189 |
+
|
190 |
+
:end
|
models/.gitkeep
ADDED
File without changes
|
notebooks/.gitkeep
ADDED
File without changes
|
references/.gitkeep
ADDED
File without changes
|
reports/.gitkeep
ADDED
File without changes
|
reports/figures/.gitkeep
ADDED
File without changes
|
src/__init__.py
ADDED
File without changes
|
src/data/.gitkeep
ADDED
File without changes
|
src/data/__init__.py
ADDED
File without changes
|
src/data/__pycache__/wiki_scrape.cpython-311.pyc
ADDED
Binary file (3.78 kB). View file
|
|
src/data/entities.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Realism
|
2 |
+
portrait
|
3 |
+
Romanticism
|
4 |
+
landscape
|
5 |
+
Surrealism
|
6 |
+
Impressionism
|
7 |
+
genre painting
|
8 |
+
religious painting
|
9 |
+
Neoclassicism
|
10 |
+
Symbolist painting
|
11 |
+
Ivan Aivazovsky
|
12 |
+
Marc Chagall
|
13 |
+
John Singer Sargent
|
14 |
+
Gustave Dore
|
15 |
+
Zdzisław Beksiński
|
src/data/make_dataset.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import click
|
3 |
+
import logging
|
4 |
+
from pathlib import Path
|
5 |
+
from dotenv import find_dotenv, load_dotenv
|
6 |
+
|
7 |
+
|
8 |
+
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """Turn raw data from (../raw) into cleaned data ready to be
    analyzed (saved in ../processed).

    Currently a stub: it only logs that processing has started.
    """
    log = logging.getLogger(__name__)
    log.info('making final data set from raw data')
|
17 |
+
|
18 |
+
|
19 |
+
if __name__ == '__main__':
    # Basic console logging for the CLI entry point.
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found, then
    # load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    main()
|
src/data/wiki_scrape.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import wikipedia
|
2 |
+
import os
|
3 |
+
|
4 |
+
def get_raw_wikipedia_article(entity, max_retries=1):
    """Return the plain-text content of the best-matching Wikipedia page.

    Parameters
    ----------
    entity : str
        Search term (artist name, art movement, genre, ...).
    max_retries : int, optional
        How many times to retry with a more specific ' (arts)' suffix
        when the term is ambiguous. Bounds the recursion that was
        previously unlimited.

    Returns
    -------
    str or None
        The article content, or None when no page can be found.
    """
    try:
        results = wikipedia.search(entity)
        if not results:
            # The search returned nothing at all; previously this
            # raised an uncaught IndexError on results[0].
            return None
        page = wikipedia.page(results[0], auto_suggest=False)
        return page.content
    except wikipedia.exceptions.DisambiguationError:
        # Search term can't be disambiguated, so we try again with a
        # more specific search term adding ' (arts)' — but only a
        # bounded number of times to avoid unbounded recursion.
        if max_retries > 0:
            return get_raw_wikipedia_article(entity + ' (arts)', max_retries - 1)
        return None

    except wikipedia.exceptions.PageError:
        # If the page doesn't exist, handle the PageError here.
        print("The requested page does not exist on Wikipedia.")
        return None
|
19 |
+
|
20 |
+
def clean_article(raw_article):
    """Strip Wikipedia markup noise from a raw article.

    Cuts the article off at the 'See also' / 'References' sections,
    drops section headings and blank lines, and returns the remaining
    lines stripped and joined with newlines.
    """
    kept = []
    for line in raw_article.split('\n'):
        # Everything from 'See also' / 'References' onward is boilerplate.
        if line.startswith('== See also') or line.startswith('== References'):
            break
        stripped = line.strip()
        # Skip section headings and empty lines.
        if line.startswith('=') or not stripped:
            continue
        kept.append(stripped)
    return '\n'.join(kept)
|
35 |
+
|
36 |
+
def save_article(content, path):
    """Write *content* to *path* as UTF-8 text, overwriting any existing file."""
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(content)
|
39 |
+
|
40 |
+
def load_entities(entities_path):
    """Read the entity list (one entity per line) from *entities_path*.

    Whitespace-only lines are skipped, so a trailing newline in the file
    no longer yields an empty entity that would later be searched on
    Wikipedia.

    Returns
    -------
    list[str]
        Stripped, non-empty entity names in file order.
    """
    with open(entities_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]
|
43 |
+
|
44 |
+
def scrape(entities_path, save_path):
    """Scrape a cleaned Wikipedia article for every entity in the list.

    Parameters
    ----------
    entities_path : str
        Path to a text file with one entity per line.
    save_path : str
        Directory where '<entity>.txt' files are written (created if
        it does not exist).
    """
    # Make sure the output directory exists before writing into it.
    os.makedirs(save_path, exist_ok=True)

    for entity in load_entities(entities_path):
        raw_article = get_raw_wikipedia_article(entity)
        if raw_article is None:  # 'is None', not '== None'
            print(f'Article on Wikipedia not found for entity {entity} :(')
            continue

        cleaned_article = clean_article(raw_article)
        save_article(cleaned_article, os.path.join(save_path, f'{entity}.txt'))
|
54 |
+
|
55 |
+
if __name__ == '__main__':
    # Scrape the default entity list into the default articles directory.
    scrape('src/data/entities.txt', 'data/wiki_articles')
|
src/features/.gitkeep
ADDED
File without changes
|
src/features/__init__.py
ADDED
File without changes
|
src/features/build_features.py
ADDED
File without changes
|
src/models/.gitkeep
ADDED
File without changes
|
src/models/__init__.py
ADDED
File without changes
|
src/models/__pycache__/extractive_qa.cpython-311.pyc
ADDED
Binary file (2.74 kB). View file
|
|
src/models/__pycache__/search_engine.cpython-311.pyc
ADDED
Binary file (6.58 kB). View file
|
|
src/models/__pycache__/visual_qa.cpython-311.pyc
ADDED
Binary file (1.41 kB). View file
|
|
src/models/extractive_qa.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import BertTokenizer, BertForQuestionAnswering
|
2 |
+
import torch
|
3 |
+
|
4 |
+
class QA(object):
    """Extractive question answering with a SQuAD-finetuned BERT model.

    Given a question and a passage, returns the passage span the model
    scores highest as the answer.
    """

    def __init__(self,
                 model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'):

        self.model_name = model_name

        self.__load_model_and_tokenizer()

    def __load_model_and_tokenizer(self):
        # Weights and vocabulary both come from the same checkpoint.
        self.model = BertForQuestionAnswering.from_pretrained(self.model_name)
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)

    def __get_segment_ids(self, input_ids):
        """Build the 0/1 token-type vector separating question from passage."""
        # Everything up to and including the first [SEP] belongs to
        # segment A (the question); the remainder is segment B.
        boundary = input_ids.index(self.tokenizer.sep_token_id) + 1
        segment_ids = [0] * boundary + [1] * (len(input_ids) - boundary)

        # There should be a segment_id for every input token.
        assert len(segment_ids) == len(input_ids)

        return segment_ids

    def answer_question(self, query, passage):
        """Return the highest-scoring answer span for *query* in *passage*."""
        input_ids = self.tokenizer.encode(query, passage)
        token_types = torch.tensor([self.__get_segment_ids(input_ids)])

        # Forward pass: tokens plus segment IDs that distinguish the
        # question from the passage.
        outputs = self.model(torch.tensor([input_ids]),
                             token_type_ids=token_types,
                             return_dict=True)

        # Token positions with the highest start and end scores.
        first = torch.argmax(outputs.start_logits)
        last = torch.argmax(outputs.end_logits)

        return self.tokenizer.decode(input_ids[first:last + 1])
|
src/models/predict_model.py
ADDED
File without changes
|
src/models/search_engine.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from tqdm import tqdm
|
3 |
+
from whoosh.index import * # whoosh: full-text indexing and searching
|
4 |
+
from whoosh.fields import *
|
5 |
+
from whoosh import qparser
|
6 |
+
import sys
|
7 |
+
sys.path.insert(1, "src/data")
|
8 |
+
#import src.data.wiki_scrape as wiki_scrape
|
9 |
+
import wiki_scrape as wiki_scrape
|
10 |
+
|
11 |
+
class IR(object):
    """BM25 passage retrieval over scraped Wikipedia articles using Whoosh.

    Articles are split into overlapping fixed-size character passages,
    indexed once on disk, and searched with Whoosh's default BM25 scorer.
    """

    def __init__(self,
                 max_passage_length = 800,
                 overlap = 0.4,
                 passages_limit = 10000,
                 data_path = 'data/wiki_articles',
                 index_path = 'index'):
        # max_passage_length: passage size in characters.
        # overlap: fraction of each passage shared with the next one.
        # passages_limit: cap on the number of article files read.
        self.max_passage_length = max_passage_length
        self.overlap = overlap
        self.passages_limit = passages_limit
        self.data_path = data_path
        self.index_path = index_path
        self.ix = None

        # Cache the passages so retrieve_documents() does not have to
        # re-read every article file on each query (the original
        # reloaded all passages per call).
        self.passages = self.__load_passages()

        if os.path.exists(self.index_path):
            print(f'Index already exists in the directory {self.index_path}')
            print('Skipping building the index...')
            self.ix = open_dir(self.index_path)
        else:
            self.__create_index(self.passages)

    def __create_passages_from_article(self, content):
        """Split one article into overlapping character windows."""
        passages = []
        passage_diff = int(self.max_passage_length * (1-self.overlap))

        for i in range(0, len(content), passage_diff):
            passages.append(content[i: i + self.max_passage_length])
        return passages

    def __scrape_wiki_if_not_exists(self):
        """Scrape Wikipedia articles into data_path if none are there yet."""
        os.makedirs(self.data_path, exist_ok=True)

        if len(os.listdir(self.data_path)) == 0:
            print('No Wiki articles. Scraping...')
            # Scrape into the configured directory (the original
            # hard-coded 'data/wiki_articles' here, breaking any
            # non-default data_path).
            wiki_scrape.scrape('src/data/entities.txt', self.data_path)

    def __load_passages(self):
        """Read every .txt article in data_path and split it into passages."""
        self.__scrape_wiki_if_not_exists()

        passages = []
        files_read = 0

        directory = os.fsencode(self.data_path)

        for file in os.listdir(directory):
            filename = os.fsdecode(file)
            if not filename.endswith(".txt"):
                continue

            with open(os.path.join(self.data_path, filename), 'r', encoding='utf-8') as f:
                content = f.read()
                passages.extend(self.__create_passages_from_article(content))

            # NOTE: despite its name, passages_limit caps the number of
            # article *files* read, preserving the original behaviour.
            files_read += 1
            if files_read == self.passages_limit:
                break
        return passages

    def __create_index(self, passages):
        """Build a new on-disk Whoosh index over *passages* at index_path."""
        # Create the index directory
        os.mkdir(self.index_path)

        # Schema definition:
        # - id: type ID, unique, stored; doc id in order given the passages
        # - text: type TEXT processed by StemmingAnalyzer; not stored
        schema = Schema(id = ID(stored=True, unique=True),
                        text = TEXT(analyzer=analysis.StemmingAnalyzer()))

        # Create the index at the configured path (the original
        # hard-coded "index" here, ignoring index_path).
        self.ix = create_in(self.index_path, schema)
        writer = self.ix.writer()

        # Add every passage; the running position doubles as its id.
        for doc_id, passage_text in enumerate(tqdm(passages, desc='Building index')):
            writer.add_document(id=str(doc_id), text=passage_text)

        # Save the added documents
        writer.commit()
        print("Index successfully created")

    def retrieve_documents(self, query, topk):
        """Return (texts, scores) of the topk BM25 hits for *query*.

        Parameters
        ----------
        query : str
            Free-text query, parsed with OR-semantics over the 'text' field.
        topk : int
            Maximum number of passages to return.
        """
        scores = []
        text = []
        # Open the searcher for reading the index; the default BM25
        # algorithm is used for scoring. The context manager closes it
        # (the original leaked a second, never-closed searcher here).
        with self.ix.searcher() as searcher:
            q = qparser.QueryParser("text", self.ix.schema,
                                    group=qparser.OrGroup).parse(query)

            # Highest-scoring documents first.
            results = searcher.search(q, limit=topk)

            for hit in results:
                scores.append(hit.score)
                # Map the stored id back to the cached passage text.
                text.append(self.passages[int(hit['id'])])
        return text, scores
|
src/models/train_model.py
ADDED
File without changes
|
src/models/visual_qa.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
|
3 |
+
|
4 |
+
class VisualQA(object):
    """Visual question answering over images via a ViLT-based pipeline."""

    def __init__(self, model_name='nflechas/VQArt', tokenizer_name='dandelin/vilt-b32-finetuned-vqa'):
        self.model_name = model_name
        self.tokenizer_name = tokenizer_name
        self.__load_model()

    def __load_model(self):
        # Fine-tuned weights paired with the base ViLT tokenizer.
        self.model = pipeline('vqa', model=self.model_name, tokenizer=self.tokenizer_name)

    def answer_question(self, query, image):
        """Return the single best answer string for *query* about *image*."""
        predictions = self.model(question=query, image=image, top_k=1)
        return predictions[0]['answer']
|
src/visualization/.gitkeep
ADDED
File without changes
|
src/visualization/__init__.py
ADDED
File without changes
|
src/visualization/visualize.py
ADDED
File without changes
|