gchhablani committed • Commit fb3c77c • 1 Parent(s): 36c3aaa

Update layout
Files changed:
- app.py +1 -13
- apps/mlm.py +5 -1
- apps/utils.py +6 -1
- apps/vqa.py +5 -1
- multiapp.py +1 -0
- sections/mlm_intro.md +5 -0
- sections/mlm_usage.md +7 -0
- sections/{intro.md → vqa_intro.md} +2 -2
- sections/{usage.md → vqa_usage.md} +1 -1
app.py CHANGED
@@ -1,14 +1,8 @@
 from apps import mlm, vqa
-import os
 import streamlit as st
 from session import _get_state
 from multiapp import MultiApp
-
-
-def read_markdown(path, parent="./sections/"):
-    with open(os.path.join(parent, path)) as f:
-        return f.read()
-
+from apps.utils import read_markdown
 
 def main():
     state = _get_state()
@@ -24,12 +18,6 @@ def main():
         "[Gunjan Chhablani](https://huggingface.co/gchhablani), [Bhavitvya Malik](https://huggingface.co/bhavitvyamalik)"
     )
 
-    image_col, intro_col = st.beta_columns([3, 8])
-    image_col.image("./misc/mvqa-logo-3-white.png", use_column_width="always")
-    intro_col.write(read_markdown("intro.md"))
-    with st.beta_expander("Usage"):
-        st.write(read_markdown("usage.md"))
-
     with st.beta_expander("Article"):
         st.write(read_markdown("abstract.md"))
         st.write(read_markdown("caveats.md"))
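The rest of main() is unchanged and not shown in this hunk. For orientation, a hedged sketch of how the two task pages are presumably registered with MultiApp, inferred only from the imports above and from multiapp.py storing {"title", "function"} pairs; the method name add_app, the page titles, and the constructor argument are assumptions, not code from this commit:

    # Hypothetical wiring, not part of this diff; the names flagged below are assumptions.
    from apps import mlm, vqa
    from multiapp import MultiApp
    from session import _get_state

    state = _get_state()
    multi_app = MultiApp(state)                               # assumed constructor argument
    multi_app.add_app("Visual Question Answering", vqa.app)   # assumed method name and title
    multi_app.add_app("Mask Filling", mlm.app)                # assumed title
    multi_app.run()                                           # run() is shown in multiapp.py below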
apps/mlm.py CHANGED
@@ -12,7 +12,7 @@ import pandas as pd
 import os
 import matplotlib.pyplot as plt
 from mtranslate import translate
-
+from .utils import read_markdown
 
 from .model.flax_clip_vision_bert.modeling_clip_vision_bert import (
     FlaxCLIPVisionBertForMaskedLM,
@@ -25,6 +25,10 @@ def softmax(logits):
 def app(state):
     mlm_state = state
 
+    with st.beta_expander("Usage"):
+        st.write(read_markdown("mlm_usage.md"))
+    st.write(read_markdown("mlm_intro.md"))
+
     # @st.cache(persist=False) # TODO: Make this work with mlm_state. Currently not supported.
     def predict(transformed_image, caption_inputs):
         outputs = mlm_state.mlm_model(pixel_values=transformed_image, **caption_inputs)
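The layout change in both task apps relies on Streamlit's pre-1.0 expander API (st.beta_expander, later renamed st.expander). A minimal standalone sketch of the pattern, with placeholder strings instead of the markdown files:

    # Minimal sketch of the expander layout used above; requires a Streamlit
    # version that still ships st.beta_expander.
    import streamlit as st

    with st.beta_expander("Usage"):      # collapsed section the user can open
        st.write("Usage notes rendered as Markdown go here.")
    st.write("Intro text shown unconditionally below the expander.")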
apps/utils.py CHANGED
@@ -1,5 +1,5 @@
 import json
-
+import os
 import numpy as np
 import plotly.express as px
 import torch
@@ -81,3 +81,8 @@ def plotly_express_horizontal_bar_plot(values, labels):
         orientation="h",
     )
     return fig
+
+
+def read_markdown(path, parent="./sections/"):
+    with open(os.path.join(parent, path)) as f:
+        return f.read()
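read_markdown moves here so that app.py and both task apps can share it. A small usage sketch, assuming the app is launched from the repository root so the default ./sections/ parent resolves:

    # Usage sketch for the relocated helper; the file names are ones added or
    # renamed in this commit.
    from apps.utils import read_markdown

    intro = read_markdown("vqa_intro.md")                        # reads ./sections/vqa_intro.md
    usage = read_markdown("mlm_usage.md", parent="./sections/")  # explicit parent directory
    print(intro.splitlines()[0])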
apps/vqa.py CHANGED
@@ -14,7 +14,7 @@ import matplotlib.pyplot as plt
 import json
 
 from mtranslate import translate
-
+from .utils import read_markdown
 
 from .model.flax_clip_vision_bert.modeling_clip_vision_bert import (
     FlaxCLIPVisionBertForSequenceClassification,
@@ -28,6 +28,10 @@ def softmax(logits):
 def app(state):
     vqa_state = state
 
+    with st.beta_expander("Usage"):
+        st.write(read_markdown("vqa_usage.md"))
+    st.write(read_markdown("vqa_intro.md"))
+
     # @st.cache(persist=False)
     def predict(transformed_image, question_inputs):
         return np.array(
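Both task apps define a softmax helper that is visible only in the hunk headers. A conventional NumPy version consistent with that name, plus the top-5 selection over the 3129 VQA answer logits described in vqa_intro.md; this is a sketch, not the file's exact code:

    # Sketch only: a standard numerically stable softmax and top-5 selection;
    # the repository's own implementation is not shown in this diff.
    import numpy as np

    def softmax(logits):
        exps = np.exp(logits - np.max(logits))   # subtract the max for stability
        return exps / exps.sum()

    logits = np.random.randn(3129)               # dummy logits for the 3129 answer classes
    probs = softmax(logits)
    top5 = np.argsort(probs)[::-1][:5]           # indices of the five highest-scoring answers
    print(top5, probs[top5])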
multiapp.py CHANGED
@@ -10,6 +10,7 @@ class MultiApp:
         self.apps.append({"title": title, "function": func})
 
     def run(self):
+        logo = st.image("./misc/mvqa-logo-3-white.png")
         st.sidebar.header("Tasks")
         app = st.sidebar.radio(
             "", self.apps, format_func=lambda app: app["title"]
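The only functional change here is the logo rendered at the top of every task page. The surrounding context lines show the task selector; a generic, self-contained sketch of that pattern (the page titles and the final dispatch line are assumptions, not the repository's exact run() body):

    # Generic sketch of the sidebar pattern in the context lines: a radio over a
    # list of dicts, with format_func controlling the label shown to the user.
    import streamlit as st

    apps = [
        {"title": "Visual Question Answering", "function": lambda: st.write("VQA page")},
        {"title": "Mask Filling", "function": lambda: st.write("MLM page")},
    ]

    st.image("./misc/mvqa-logo-3-white.png")      # logo line added by this commit
    st.sidebar.header("Tasks")
    chosen = st.sidebar.radio("", apps, format_func=lambda app: app["title"])
    chosen["function"]()                          # assumed dispatch to the selected page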
sections/mlm_intro.md ADDED
@@ -0,0 +1,5 @@
+This demo uses a [CLIP-Vision-Bert model checkpoint](https://huggingface.co/flax-community/clip-vision-bert-cc12m-70k) pre-trained using text-only Masked LM on approximately 10 million image-text pairs taken from the [Conceptual 12M dataset](https://github.com/google-research-datasets/conceptual-12m) translated using [MBart](https://huggingface.co/transformers/model_doc/mbart.html). The translations are performed in the following four languages: English, French, German and Spanish, giving 2.5M examples in each language.
+
+The model can be used for mask-filling as shown in this demo. The caption can be present or written in any of the following: English, French, German and Spanish.
+
+For more details, click on `Usage` or `Article` 🤗 above.
sections/mlm_usage.md ADDED
@@ -0,0 +1,7 @@
+- This demo loads the `FlaxCLIPVisionBertForMaskedLM` present in the `model` directory of this repository. The checkpoint is loaded from [`flax-community/clip-vision-bert-cc12m-70k`](https://huggingface.co/flax-community/clip-vision-bert-cc12m-70k) which is pre-trained checkpoint with 70k steps. 100 random validation set examples are present in the `cc12m_data/vqa_val.tsv` with respective images in the `cc12m_data/images_data` directory.
+
+- We provide `English Translation` of the caption for users who are not well-acquainted with the other languages. This is done using `mtranslate` to keep things flexible enough and needs internet connection as it uses the Google Translate API.
+
+- The model predicts the scores for tokens from the `bert-base-multilingual-uncased` checkpoint.
+
+- The top-5 predictions are displayed below and their respective confidence scores are shown in form of a bar plot.
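The last bullet mentions a bar plot of confidence scores; apps/utils.py exposes plotly_express_horizontal_bar_plot(values, labels) for this. A plain plotly.express equivalent, with made-up tokens and scores rather than real model output:

    # Hedged sketch: horizontal bar plot of top-5 token confidences with
    # plotly.express, analogous to the helper the app uses.
    import plotly.express as px

    tokens = ["chien", "chat", "oiseau", "cheval", "lapin"]   # dummy top-5 predictions
    scores = [0.41, 0.22, 0.15, 0.12, 0.10]                   # dummy softmax confidences

    fig = px.bar(x=scores, y=tokens, orientation="h",
                 labels={"x": "confidence", "y": "token"})
    fig.show()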
sections/{intro.md → vqa_intro.md} RENAMED
@@ -1,5 +1,5 @@
-This demo uses a [
+This demo uses a [CLIP-Vision-Bert model checkpoint](https://huggingface.co/flax-community/clip-vision-bert-vqa-ft-6k) fine-tuned on a [MarianMT](https://huggingface.co/transformers/model_doc/marian.html)-translated version of the [VQA v2 dataset](https://visualqa.org/challenge.html). The fine-tuning is performed after pre-training using text-only Masked LM on approximately 10 million image-text pairs taken from the [Conceptual 12M dataset](https://github.com/google-research-datasets/conceptual-12m) translated using [MBart](https://huggingface.co/transformers/model_doc/mbart.html). The translations are performed in the following four languages: English, French, German and Spanish.
 
 The model predicts one out of 3129 classes in English which can be found [here](https://huggingface.co/spaces/flax-community/Multilingual-VQA/blob/main/answer_reverse_mapping.json), and then the translated versions are provided based on the language chosen as `Answer Language`. The question can be present or written in any of the following: English, French, German and Spanish.
 
-For more details, click on `Usage` or `Article` 🤗
+For more details, click on `Usage` or `Article` 🤗 above.
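The second paragraph describes mapping the predicted class to an answer string and then translating it into the chosen Answer Language. A hedged sketch of that flow, assuming answer_reverse_mapping.json maps stringified class indices to English answers (the key format is an assumption):

    # Sketch of the class-index -> answer -> translation flow described above;
    # the JSON key format is assumed, not confirmed by this diff.
    import json
    from mtranslate import translate

    with open("answer_reverse_mapping.json") as f:
        answer_reverse_mapping = json.load(f)

    predicted_class = 42                                  # dummy argmax over the 3129 logits
    answer_en = answer_reverse_mapping[str(predicted_class)]
    answer_fr = translate(answer_en, "fr", "en")          # needs an internet connection
    print(answer_en, "->", answer_fr)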
sections/{usage.md → vqa_usage.md} RENAMED
@@ -1,4 +1,4 @@
-- This demo loads the `
+- This demo loads the `FlaxCLIPVisionBertForSequenceClassification` present in the `model` directory of this repository. The checkpoint is loaded from [`flax-community/clip-vision-bert-vqa-ft-6k`](https://huggingface.co/flax-community/clip-vision-bert-vqa-ft-6k) which is pre-trained checkpoint with 60k steps and 6k fine-tuning steps. 100 random validation set examples are present in the `dummy_vqa_multilingual.tsv` with respective images in the `images/val2014` directory.
 
 - We provide `English Translation` of the question for users who are not well-acquainted with the other languages. This is done using `mtranslate` to keep things flexible enough and needs internet connection as it uses the Google Translate API.
 
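The first bullet describes loading the fine-tuned checkpoint into the custom FlaxCLIPVisionBertForSequenceClassification class. A loading sketch, assuming the class follows the standard Hugging Face from_pretrained interface and that the apps package is importable from the repository root; neither is shown in this commit:

    # Hedged loading sketch; the absolute import path mirrors the relative import
    # in apps/vqa.py, and the from_pretrained interface is assumed.
    from apps.model.flax_clip_vision_bert.modeling_clip_vision_bert import (
        FlaxCLIPVisionBertForSequenceClassification,
    )

    model = FlaxCLIPVisionBertForSequenceClassification.from_pretrained(
        "flax-community/clip-vision-bert-vqa-ft-6k"
    )
    print(type(model).__name__)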