import streamlit as st
from together import Together
import base64
from PIL import Image
from io import BytesIO
import os
from elevenlabs.client import ElevenLabs
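
# API keys are read from Streamlit secrets (typically .streamlit/secrets.toml),
# under the keys together_api and elevenlabs_api referenced below.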
os.environ["TOGETHER_API_KEY"] = st.secrets['together_api']
os.environ["ELEVENLABS_API_KEY"] = st.secrets['elevenlabs_api']

together_client = Together(api_key=os.environ["TOGETHER_API_KEY"])
elevenlabs_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])


def encode_image(image):
    """Encode a PIL image as a base64 JPEG string."""
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    image_bytes = buffered.getvalue()
    return base64.b64encode(image_bytes).decode('utf-8')
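

# The chat completions call below uses the OpenAI-style multimodal message
# format, passing the captured frame as a base64 "data:" URL inside an
# "image_url" content part.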
def get_image_description(image):
    """Ask the Llama Vision model for a short (max 20 words) description of the image."""
    get_description_prompt = "Describe the given image in detail in only 20 words max."

    base64_image = encode_image(image)

    response = together_client.chat.completions.create(
        model="meta-llama/Llama-Vision-Free",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": get_description_prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                    },
                },
            ],
        }],
        stream=False,
    )

    return response.choices[0].message.content
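

# The ElevenLabs SDK returns the generated audio as a stream of byte chunks.
# voice_id selects which voice is used; any voice ID from your ElevenLabs
# voice library can be substituted.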
def tts(text):
    """Convert text to speech with ElevenLabs and play it in the app."""
    try:
        audio_generator = elevenlabs_client.text_to_speech.convert(
            text=text,
            voice_id="JBFqnCBsd6RMkjVDRZzb",
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )

        # Buffer the streamed chunks into a temporary MP3 file.
        audio_file_path = "temp_audio.mp3"
        with open(audio_file_path, "wb") as f:
            for chunk in audio_generator:
                f.write(chunk)

        st.audio(audio_file_path, format="audio/mp3", autoplay=True)
    except Exception as e:
        st.error(f"Error generating speech: {e}")
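

# Custom CSS: dark gradient background, gradient buttons, rounded images,
# and coloured headings/spinner.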
st.markdown(
    """
    <style>
    .stApp {
        background: linear-gradient(135deg, #1e1e2f, #2a2a40);
        color: #ffffff;
        font-family: 'Arial', sans-serif;
    }
    .stButton>button {
        background: linear-gradient(135deg, #6a11cb, #2575fc);
        color: white;
        border: none;
        border-radius: 12px;
        padding: 10px 20px;
        font-size: 16px;
        font-weight: bold;
    }
    .stButton>button:hover {
        background: linear-gradient(135deg, #2575fc, #6a11cb);
    }
    .stImage {
        border-radius: 12px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
    }
    .stMarkdown h1 {
        color: #6a11cb;
        text-align: center;
        font-size: 36px;
        font-weight: bold;
    }
    .stMarkdown h2 {
        color: #2575fc;
        font-size: 24px;
        font-weight: bold;
    }
    .stSpinner>div {
        color: #6a11cb;
    }
    </style>
    """,
    unsafe_allow_html=True,
)


st.title("🔮 Visox | Koshur AI")
st.markdown("### See the world through AI's eyes!")

st.sidebar.markdown("## About")
st.sidebar.markdown("This app uses advanced AI to describe what it sees through your camera in real time.")
st.sidebar.markdown("Powered by [Together AI](https://together.ai) and Streamlit.")
img_file_buffer = st.camera_input("Take a picture")
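
# Main flow: show the captured photo, describe it with the vision model,
# then read the description aloud.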
if img_file_buffer is not None:
    try:
        img = Image.open(img_file_buffer)
        st.image(img, caption='Captured Image', width=300)

        with st.spinner('🔍 Analyzing the image...'):
            description = get_image_description(img)

        st.success('✅ Analysis complete!')
        st.markdown("### AI Description:")
        st.write(description)

        tts(description)
    except Exception as e:
        # Surface any capture or analysis failure in the UI instead of crashing.
        st.error(f"Error processing the image: {e}")