Spaces:

Gurnam-AI
/

Multimodal

Sleeping

App Files Files Community

Gurnam-AI committed on May 31, 2024

Commit

fef2faa

verified ·

1 Parent(s): b5e8fa6

Upload 2 files

Browse files

Files changed (2) hide show

app.py +144 -0
requirements.txt +1 -0

app.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import base64
+import vertexai
+from vertexai.generative_models import GenerativeModel, Part
+import vertexai.preview.generative_models as generative_models
+import os
+import mimetypes
+import gradio as gr
+import mimetypes
+import tempfile
+def get_credentials():
+    """Write the service-account key from the environment to a temporary file.
+
+    The key is read from the ``GOOGLE_APPLICATION_CREDENTIALS_JSON``
+    environment variable, checked to be well-formed JSON, and written to a
+    temporary ``.json`` file. The file is removed when the interpreter exits
+    so the secret does not linger on disk.
+
+    Returns:
+        str: Path of the temporary file holding the key.
+
+    Raises:
+        ValueError: If the variable is unset or empty, or if its value is
+            not valid JSON.
+    """
+    import atexit
+    import contextlib
+    import json
+
+    creds_json_str = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
+    if not creds_json_str:
+        raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
+
+    # Fail fast on a malformed secret instead of surfacing an obscure
+    # authentication error on the first model call.
+    try:
+        json.loads(creds_json_str)
+    except json.JSONDecodeError as err:
+        raise ValueError(
+            "GOOGLE_APPLICATION_CREDENTIALS_JSON is not valid JSON"
+        ) from err
+
+    # delete=False: the file must outlive this function so it can be read
+    # later through the path that is returned.
+    with tempfile.NamedTemporaryFile(
+        mode="w", delete=False, suffix=".json", encoding="utf-8"
+    ) as temp:
+        temp.write(creds_json_str)
+        temp_filename = temp.name
+
+    def _remove_key_file():
+        # Best-effort cleanup; the file may already be gone.
+        with contextlib.suppress(OSError):
+            os.remove(temp_filename)
+
+    atexit.register(_remove_key_file)
+    return temp_filename
+# Runs at import time: stores the path returned by get_credentials() in
+# GOOGLE_APPLICATION_CREDENTIALS (presumably so the Google client libraries
+# can locate the key file). get_credentials() raises ValueError when the
+# GOOGLE_APPLICATION_CREDENTIALS_JSON secret is not configured.
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= get_credentials()
+# MIME types this app accepts for upload.
+_SUPPORTED_MIME_TYPES = frozenset({
+    "image/png",
+    "image/jpeg",
+    "audio/aac",
+    "audio/flac",
+    "audio/mp3",
+    "audio/m4a",
+    "audio/mpeg",
+    "audio/mpga",
+    "audio/mp4",
+    "audio/opus",
+    "audio/pcm",
+    "audio/wav",
+    "audio/webm",
+    "video/x-flv",
+    "video/mov",
+    "video/mpeg",
+    "video/mpegps",
+    "video/mpg",
+    "video/mp4",
+    "video/webm",
+    "video/wmv",
+    "video/3gpp",
+    "application/pdf",
+})
+
+# Fallback for extensions that ``mimetypes`` either does not know or reports
+# under a different name (e.g. ".wav" -> "audio/x-wav", ".mov" ->
+# "video/quicktime"), which would otherwise fail the whitelist check.
+_EXTENSION_FALLBACKS = {
+    ".aac": "audio/aac",
+    ".flac": "audio/flac",
+    ".m4a": "audio/m4a",
+    ".mp3": "audio/mp3",
+    ".opus": "audio/opus",
+    ".pcm": "audio/pcm",
+    ".wav": "audio/wav",
+    ".flv": "video/x-flv",
+    ".mov": "video/mov",
+    ".wmv": "video/wmv",
+    ".3gp": "video/3gpp",
+}
+
+
+def get_matching_format(file_path):
+    """Return the supported MIME type for *file_path*, or ``None``.
+
+    The type guessed by ``mimetypes`` is used when it is one of the supported
+    types; otherwise the file extension is looked up (case-insensitively) in
+    a small fallback table.
+
+    Args:
+        file_path: Path or file name of the upload. ``None`` or an empty
+            string (nothing uploaded) yields ``None``.
+
+    Returns:
+        The matching MIME type string, or ``None`` when the file type is not
+        supported.
+    """
+    if not file_path:
+        return None
+    mime_type, _ = mimetypes.guess_type(file_path)
+    if mime_type in _SUPPORTED_MIME_TYPES:
+        return mime_type
+    extension = os.path.splitext(file_path)[1].lower()
+    return _EXTENSION_FALLBACKS.get(extension)
+def encode_file(file_path):
+    """Return the contents of *file_path* as base64 text, or ``None``.
+
+    Args:
+        file_path: Path of the file to encode.
+
+    Returns:
+        The base64 encoding of the file's bytes as a ``str``, or ``None``
+        when ``get_matching_format`` does not recognise the file type.
+    """
+    # Files with an unsupported type are never read from disk.
+    if not get_matching_format(file_path):
+        return None
+    with open(file_path, 'rb') as source:
+        raw_bytes = source.read()
+    return base64.b64encode(raw_bytes).decode('utf-8')
+def multiturn_generate_content(file_path, user_query):
+    """Send an uploaded file and a text query to Gemini and return the reply.
+
+    Args:
+        file_path: Path of the uploaded file, or ``None`` when nothing has
+            been uploaded.
+        user_query: The user's question about the file.
+
+    Returns:
+        str: The model's text response, or a short explanatory message when
+        no file was uploaded, the file type is unsupported, or the file is
+        empty. The model is not called in those cases.
+    """
+    if not file_path:
+        return "Please upload a file before submitting your query."
+    mime_type = get_matching_format(file_path)
+    if not mime_type:
+        return "Unsupported file type. Please upload an image, audio, video or PDF file."
+
+    # Read the raw bytes directly; base64-encoding them only to decode them
+    # again would just copy the whole file twice.
+    with open(file_path, "rb") as file:
+        file_bytes = file.read()
+    if not file_bytes:
+        return "The uploaded file is empty."
+
+    vertexai.init(project="imgcp-ff81e7053b072ce5", location="us-central1")
+    model = GenerativeModel(
+        "gemini-1.5-flash-001",
+    )
+    chat = model.start_chat()
+    doc = Part.from_data(
+        mime_type=mime_type,
+        data=file_bytes,
+    )
+    response = chat.send_message(
+        [doc, user_query],
+        generation_config={
+            "max_output_tokens": 8192,
+            "temperature": 1,
+            "top_p": 0.95,
+        },
+    )
+    return response.text
+# Three-tab UI: a static description, a file upload, and a query/response pane.
+# The theme is passed to the Blocks that is actually launched; a separate
+# gr.Blocks(theme=...) created inside the context is discarded and has no effect.
+demo = gr.Blocks(theme="base")
+with demo:
+    with gr.Tabs():
+        with gr.TabItem("Use Cases"):
+            gr.Markdown("""<h1>Gemini Multimodal</h1>""")
+            gr.Markdown("""<b>This Model performs well at a variety of multimodal tasks such as visual understanding, classification, summarization, and creating content from image, audio and video. It's adept at processing visual and text inputs such as photographs, documents, infographics, and screenshots.</b>""")
+            gr.Markdown("""<ul>
+                        <li><b>Visual Information Seeking:</b> Use external knowledge combined with information extracted from the input image or video to answer questions.</li>
+                        <li><b>Object Recognition:</b> Answer questions related to fine-grained identification of the objects in images and videos.</li>
+                        <li><b>Digital Content Understanding:</b> Answer questions and extract information from visual content like infographics, charts, figures, tables, and web pages.</li>
+                        <li><b>Structured Content Generation:</b> Generate responses based on multimodal inputs in formats like HTML and JSON.</li>
+                        <li><b>Captioning and Description:</b> Generate descriptions of images and videos with varying levels of detail.</li>
+                        <li><b>Reasoning:</b> Compositionally infer new information without memorization or retrieval.</li>
+                        <li><b>Audio:</b> Analyze speech files for summarization, transcription, and Q&A.</li>
+                        <li><b>Multimodal Processing:</b> Process multiple types of input media at the same time, such as video and audio input.</li>
+                        </ul>""")
+        with gr.TabItem("Upload"):
+            gr.Markdown("""<b>Note: Please upload the file and submit your query in the next tab.</b>""")
+            with gr.Row():
+                filepath = gr.File(type='filepath')
+        with gr.TabItem("Chat"):
+            with gr.Column():
+                # Large box on top shows the model's reply; the smaller box
+                # below it is where the user types the query.
+                response_box = gr.Textbox(lines=15, show_label=False, container=True)
+                query_box = gr.Textbox(show_label=False, min_width=120)
+            submit_button = gr.Button("Submit")
+    submit_button.click(multiturn_generate_content, inputs=[filepath, query_box], outputs=response_box)
+
+# Only start the server when run as a script, so importing this module
+# (e.g. for tooling) does not block on launch().
+if __name__ == "__main__":
+    demo.launch(debug=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ google-cloud-aiplatform