Spaces:

awacke1
/

GPT-4o-omni-text-audio-image-video

Running on CPU Upgrade

App Files Files Community

awacke1 committed on May 14

Commit

878ab12

•

1 Parent(s): f9040a4

Create app.py

Browse files

Files changed (1) hide show

app.py +122 -0

app.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import streamlit as st
+from openai import OpenAI
+import os
+import base64
+import cv2
+from moviepy.editor import VideoFileClip
+# Set the API key and model name
+MODEL = "gpt-4o"
+client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "<your OpenAI API key if not set as an env var>"))
+def process_text():
+    text_input = st.text_input("Enter your text:")
+    if text_input:
+        completion = client.chat.completions.create(
+            model=MODEL,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant. Help me with my math homework!"},
+                {"role": "user", "content": f"Hello! Could you solve {text_input}?"}
+            ]
+        )
+        st.write("Assistant: " + completion.choices[0].message.content)
+def process_image(image_input):
+    if image_input:
+        base64_image = base64.b64encode(image_input.read()).decode("utf-8")
+        response = client.chat.completions.create(
+            model=MODEL,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that responds in Markdown. Help me with my math homework!"},
+                {"role": "user", "content": [
+                    {"type": "text", "text": "What's the area of the triangle?"},
+                    {"type": "image_url", "image_url": {
+                        "url": f"data:image/png;base64,{base64_image}"}
+                    }
+                ]}
+            ],
+            temperature=0.0,
+        )
+        st.markdown(response.choices[0].message.content)
+def process_audio(audio_input):
+    if audio_input:
+        transcription = client.audio.transcriptions.create(
+            model="whisper-1",
+            file=audio_input,
+        )
+        response = client.chat.completions.create(
+            model=MODEL,
+            messages=[
+                {"role": "system", "content": "You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown."},
+                {"role": "user", "content": [
+                    {"type": "text", "text": f"The audio transcription is: {transcription.text}"}
+                ]},
+            ],
+            temperature=0,
+        )
+        st.markdown(response.choices[0].message.content)
+def process_video(video_input):
+    if video_input:
+        base64Frames, audio_path = process_video_frames(video_input)
+        transcription = client.audio.transcriptions.create(
+            model="whisper-1",
+            file=open(audio_path, "rb"),
+        )
+        response = client.chat.completions.create(
+            model=MODEL,
+            messages=[
+                {"role": "system", "content": "You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"},
+                {"role": "user", "content": [
+                    "These are the frames from the video.",
+                    *map(lambda x: {"type": "image_url",
+                                    "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
+                    {"type": "text", "text": f"The audio transcription is: {transcription.text}"}
+                ]},
+            ],
+            temperature=0,
+        )
+        st.markdown(response.choices[0].message.content)
+def process_video_frames(video_path, seconds_per_frame=2):
+    base64Frames = []
+    base_video_path, _ = os.path.splitext(video_path.name)
+    video = cv2.VideoCapture(video_path.name)
+    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = video.get(cv2.CAP_PROP_FPS)
+    frames_to_skip = int(fps * seconds_per_frame)
+    curr_frame = 0
+    while curr_frame < total_frames - 1:
+        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
+        success, frame = video.read()
+        if not success:
+            break
+        _, buffer = cv2.imencode(".jpg", frame)
+        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
+        curr_frame += frames_to_skip
+    video.release()
+    audio_path = f"{base_video_path}.mp3"
+    clip = VideoFileClip(video_path.name)
+    clip.audio.write_audiofile(audio_path, bitrate="32k")
+    clip.audio.close()
+    clip.close()
+    return base64Frames, audio_path
+def main():
+    st.title("Omni Demo")
+    option = st.selectbox("Select an option", ("Text", "Image", "Audio", "Video"))
+    if option == "Text":
+        process_text()
+    elif option == "Image":
+        image_input = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
+        process_image(image_input)
+    elif option == "Audio":
+        audio_input = st.file_uploader("Upload an audio file", type=["mp3", "wav"])
+        process_audio(audio_input)
+    elif option == "Video":
+        video_input = st.file_uploader("Upload a video file", type=["mp4"])
+        process_video(video_input)
+# Build the page only when this file is executed as the main script, not when
+# it is imported as a module.
+if __name__ == "__main__":
+    main()