gchhablani committed
Commit 0808df5 · 1 Parent(s): 74cb830
Update app
Files changed:
- app.py +28 -23
- hf_logo.png +0 -0
- misc/mvqa-logo-2.png +0 -0
- misc/mvqa-logo-white.png +0 -0
- misc/mvqa-logo.png +0 -0
- sections/acknowledgements.md +3 -1
- sections/intro.md +5 -0
- sections/usage.md +1 -5
app.py
CHANGED
@@ -66,7 +66,7 @@ st.set_page_config(
     page_title="Multilingual VQA",
     layout="wide",
     initial_sidebar_state="collapsed",
-    page_icon="./misc/mvqa-logo.png",
+    page_icon="./misc/mvqa-logo-white.png",
 )
 
 st.title("Multilingual Visual Question Answering")
@@ -74,8 +74,26 @@ st.write(
     "[Gunjan Chhablani](https://huggingface.co/gchhablani), [Bhavitvya Malik](https://huggingface.co/bhavitvyamalik)"
 )
 
+image_col, intro_col = st.beta_columns([2,8])
+image_col.image("./misc/mvqa-logo-white.png", use_column_width='always')
+intro_col.write(read_markdown('intro.md'))
 with st.beta_expander("Usage"):
-    st.
+    st.write(read_markdown("usage.md"))
+
+with st.beta_expander("Article"):
+    st.write(read_markdown("abstract.md"))
+    st.write(read_markdown("caveats.md"))
+    st.write("# Methodology")
+    st.image(
+        "./misc/Multilingual-VQA.png", caption="Masked LM model for Image-text Pretraining."
+    )
+    st.markdown(read_markdown("pretraining.md"))
+    st.markdown(read_markdown("finetuning.md"))
+    st.write(read_markdown("challenges.md"))
+    st.write(read_markdown("social_impact.md"))
+    st.write(read_markdown("references.md"))
+    st.write(read_markdown("checkpoints.md"))
+    st.write(read_markdown("acknowledgements.md"))
 
 first_index = 20
 # Init Session State
@@ -92,7 +110,7 @@ if state.image_file is None:
 
 col1, col2 = st.beta_columns([6, 4])
 
-if col2.button("Get a random example"):
+if col2.button("Get a random example", help="Get a random example from the 100 "):
     sample = dummy_data.sample(1).reset_index()
     state.image_file = sample.loc[0, "image_file"]
     state.question = sample.loc[0, "question"].strip("- ")
@@ -116,24 +134,26 @@ transformed_image = get_transformed_image(state.image)
 # Display Image
 col1.image(state.image, use_column_width="auto")
 
+new_col1, new_col2 = st.beta_columns([5,5])
 # Display Question
-question = col2.text_input(label="Question", value=state.question)
-col2.markdown(
+question = new_col1.text_input(label="Question", value=state.question)
+new_col1.markdown(
     f"""**English Translation**: {question if state.question_lang_id == "en" else translate(question, 'en')}"""
 )
 
-col2.markdown("**Actual Answer in English**: " + answer_reverse_mapping[str(state.answer_label)])
-
 question_inputs = get_text_attributes(question)
 
 # Select Language
 options = ["en", "de", "es", "fr"]
-state.answer_lang_id = col2.selectbox(
+state.answer_lang_id = new_col2.selectbox(
     "Answer Language",
     index=options.index(state.answer_lang_id),
     options=options,
     format_func=lambda x: code_to_name[x],
 )
+
+new_col2.markdown("**Actual Answer in English**: " + answer_reverse_mapping[str(state.answer_label)])
+
 # Display Top-5 Predictions
 with st.spinner("Loading model..."):
     model = load_model(checkpoints[0])
@@ -144,18 +164,3 @@ labels, values = get_top_5_predictions(logits, answer_reverse_mapping)
 translated_labels = translate_labels(labels, state.answer_lang_id)
 fig = plotly_express_horizontal_bar_plot(values, translated_labels)
 st.plotly_chart(fig, use_container_width=True)
-
-
-st.write(read_markdown("abstract.md"))
-st.write(read_markdown("caveats.md"))
-st.write("# Methodology")
-st.image(
-    "./misc/Multilingual-VQA.png", caption="Masked LM model for Image-text Pretraining."
-)
-st.markdown(read_markdown("pretraining.md"))
-st.markdown(read_markdown("finetuning.md"))
-st.write(read_markdown("challenges.md"))
-st.write(read_markdown("social_impact.md"))
-st.write(read_markdown("references.md"))
-st.write(read_markdown("checkpoints.md"))
-st.write(read_markdown("acknowledgements.md"))
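For readers unfamiliar with the Streamlit calls this commit leans on, here is a minimal, self-contained sketch of the new layout pattern. The `read_markdown` helper below is an assumption (a plain file reader; the app defines its own elsewhere), and note that `st.beta_columns` / `st.beta_expander` were the pre-1.0 names for what later became `st.columns` / `st.expander`.

```python
# Minimal sketch of the layout this commit introduces. Hedged: read_markdown
# here is a hypothetical stand-in; the app's own helper may differ.
import streamlit as st

def read_markdown(name: str) -> str:
    # Read one markdown section from the sections/ directory.
    with open(f"sections/{name}", encoding="utf-8") as f:
        return f.read()

# Logo on the left (20% width), intro text on the right (80% width).
image_col, intro_col = st.beta_columns([2, 8])
image_col.image("./misc/mvqa-logo-white.png", use_column_width="always")
intro_col.write(read_markdown("intro.md"))

# Collapsible sections keep the long article out of the way by default.
with st.beta_expander("Usage"):
    st.write(read_markdown("usage.md"))
with st.beta_expander("Article"):
    st.write(read_markdown("abstract.md"))
```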
hf_logo.png
DELETED
Binary file (5.64 kB)
misc/mvqa-logo-2.png
ADDED
misc/mvqa-logo-white.png
ADDED
misc/mvqa-logo.png
CHANGED
sections/acknowledgements.md
CHANGED
@@ -1,4 +1,6 @@
 # Acknowledgements
 We thank [Nilakshan Kunananthaseelan](https://huggingface.co/knilakshan20) for helping us whenever he could get a chance. We also thank [Abheesht Sharma](https://huggingface.co/abheesht) for helping in the discussions in the initial phases. [Luke Melas](https://github.com/lukemelas) helped us get the CC-12M data on our TPU-VMs and we are very grateful to him.
 
-This project would not be possible without the help of [Patrick](https://huggingface.co/patrickvonplaten) and [Suraj](https://huggingface.co/valhalla) who met with us and helped review our approach and guided us throughout the project.
+This project would not be possible without the help of [Patrick](https://huggingface.co/patrickvonplaten) and [Suraj](https://huggingface.co/valhalla) who met with us and helped review our approach and guided us throughout the project.
+
+Lastly, we thank the Google Team for helping answer our queries on the Slack channel, and for providing us TPU-VMs.
sections/intro.md
ADDED
@@ -0,0 +1,5 @@
+This demo uses a [ViTBert model checkpoint](https://huggingface.co/flax-community/multilingual-vqa-pt-60k-ft/tree/main/ckpt-5999) fine-tuned on a [MarianMT](https://huggingface.co/transformers/model_doc/marian.html)-translated version of the [VQA v2 dataset](https://visualqa.org/challenge.html). The fine-tuning is performed after pre-training with a text-only Masked LM objective on approximately 10 million image-text pairs taken from the [Conceptual 12M dataset](https://github.com/google-research-datasets/conceptual-12m) translated using [MBart](https://huggingface.co/transformers/model_doc/mbart.html). The translations cover four languages: English, French, German and Spanish.
+
+The model predicts one of 3129 English answer classes, listed [here](https://huggingface.co/spaces/flax-community/Multilingual-VQA/blob/main/answer_reverse_mapping.json); translated versions are then provided based on the language chosen as `Answer Language`. The question can be asked in any of the following languages: English, French, German and Spanish.
+
+For more details, click on `Usage` or `Article` 🤗 below.
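As a concrete illustration of the classify-then-translate flow intro.md describes, here is a hedged sketch. The string-indexed lookup mirrors `answer_reverse_mapping[str(state.answer_label)]` in app.py; the random logits are a stand-in for real ViTBert outputs, and translating a single answer with `mtranslate` at runtime is an assumption (the app uses a pre-saved translation dictionary).

```python
# Sketch of the classify-then-translate flow. Assumptions: random logits
# stand in for model outputs; JSON keys are string indices, matching
# answer_reverse_mapping[str(...)] in app.py.
import json

import numpy as np
from mtranslate import translate  # the translation library the app relies on

with open("answer_reverse_mapping.json", encoding="utf-8") as f:
    answer_reverse_mapping = json.load(f)  # e.g. {"0": "yes", "1": "no", ...}

logits = np.random.rand(3129)              # one score per answer class
label = int(np.argmax(logits))             # best-scoring class index
english_answer = answer_reverse_mapping[str(label)]
# Translate on demand when Answer Language is not English, e.g. French:
french_answer = translate(english_answer, "fr")
print(english_answer, "->", french_answer)
```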
sections/usage.md
CHANGED
@@ -8,8 +8,4 @@
 
 - Lastly, one can choose the `Answer Language`, which also uses a saved dictionary created with the `mtranslate` library for the 3129 answer options.
 
-- The top-5 predictions are displayed below and their respective confidence scores are shown in the form of a bar plot.
-
-For more info, scroll to the end of this app.
-
-
+- The top-5 predictions are displayed below and their respective confidence scores are shown in the form of a bar plot.
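Since usage.md's surviving line describes the top-5 bar plot, here is a hedged sketch of how such a plot can be produced. The app's `plotly_express_horizontal_bar_plot` is not shown in this diff, so the function below is a hypothetical stand-in, not the app's actual helper.

```python
# Hypothetical stand-in for plotly_express_horizontal_bar_plot: turn logits
# into confidences, keep the five best, and draw a horizontal bar chart.
import numpy as np
import plotly.express as px

def top_5_bar_plot(logits, labels):
    exp = np.exp(logits - np.max(logits))   # numerically stable softmax
    probs = exp / exp.sum()
    top = np.argsort(probs)[-5:]            # indices of the top-5, ascending
    return px.bar(
        x=probs[top],
        y=[labels[i] for i in top],
        orientation="h",                    # horizontal bars, as in the app
        labels={"x": "Confidence", "y": "Answer"},
    )

# Example usage (all_answer_labels is hypothetical):
#   fig = top_5_bar_plot(np.random.rand(3129), all_answer_labels)
#   st.plotly_chart(fig, use_container_width=True)  # renders in Streamlit
```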