# Hugging Face Space: adil9858/visox (status: Sleeping)
# File size: 4,579 bytes

import streamlit as st
from together import Together
import base64
from PIL import Image
from io import BytesIO
import os
from elevenlabs.client import ElevenLabs

# Copy each Streamlit secret into the environment variable named after its
# provider, then build both API clients from those variables.
for env_var, secret_key in (
    ("TOGETHER_API_KEY", "together_api"),
    ("ELEVENLABS_API_KEY", "elevenlabs_api"),
):
    os.environ[env_var] = st.secrets[secret_key]

# Client for the Together chat-completions (vision) API
together_client = Together(api_key=os.environ["TOGETHER_API_KEY"])

# Client for the ElevenLabs text-to-speech API
elevenlabs_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])

# Function to encode image to base64
def encode_image(image):
    """Return *image* serialized as JPEG and encoded as a base64 string.

    Args:
        image: An image object exposing ``mode``, ``convert(mode)`` and
            ``save(fp, format=...)`` (presumably a ``PIL.Image.Image``).

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded to ``str``
        using UTF-8.
    """
    # JPEG cannot store alpha or palette data; saving e.g. an RGBA or P image
    # raises an error. Convert only the modes the JPEG writer rejects so that
    # images which already saved correctly are encoded exactly as before.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if image.mode not in jpeg_modes:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

# Function to get image description from Together API
def get_image_description(
    image,
    *,
    prompt="Describe the given image in detail in only 20 words max.",
    model="meta-llama/Llama-Vision-Free",
):
    """Ask the Together chat-completions API to describe *image*.

    The image is base64-encoded with ``encode_image`` and sent as a
    ``data:image/jpeg;base64,...`` URL together with the text prompt in a
    single user message.

    Args:
        image: Image object accepted by ``encode_image``.
        prompt: Instruction text sent alongside the image. Defaults to the
            20-word description prompt.
        model: Together model identifier used for the request.

    Returns:
        The ``message.content`` of the first choice in the API response.

    Raises:
        RuntimeError: If the response contains no choices.
    """
    # Encode the image to base64
    base64_image = encode_image(image)

    # Create the request to Together API
    response = together_client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                    },
                },
            ],
        }],
        stream=False,
    )

    # An empty choices list would otherwise surface as an opaque IndexError.
    if not response.choices:
        raise RuntimeError("Together API returned no choices for the image description request.")

    # Return the result from the API
    return response.choices[0].message.content

# Function to convert text to speech using ElevenLabs
def tts(text):
    """Convert *text* to speech with ElevenLabs and play it in the Streamlit page.

    The audio is buffered in memory and handed to ``st.audio`` as bytes, so no
    file is written to disk. Any exception raised while generating or
    rendering the audio is reported with ``st.error`` instead of propagating.

    Args:
        text: The text to read aloud. Empty or whitespace-only text is
            reported with ``st.warning`` and no API request is made.

    Returns:
        None.
    """
    try:
        if not text or not text.strip():
            st.warning("There is no text to read aloud.")
            return

        # Generate the audio (returns an iterable of byte chunks)
        audio_chunks = elevenlabs_client.text_to_speech.convert(
            text=text,
            voice_id="JBFqnCBsd6RMkjVDRZzb",  # Replace with your preferred voice ID
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )

        # Keep the audio in memory rather than in a fixed "temp_audio.mp3":
        # a shared file name lets concurrent sessions overwrite each other's
        # audio and leaves a stray file behind in the working directory.
        audio_bytes = b"".join(audio_chunks)

        # Play the audio in Streamlit
        st.audio(audio_bytes, format="audio/mp3")
    except Exception as e:
        st.error(f"Error generating speech: {e}")

# Stylesheet giving the app its dark gradient theme; injected as raw HTML below.
_FUTURISTIC_CSS = """
    <style>
    .stApp {
        background: linear-gradient(135deg, #1e1e2f, #2a2a40);
        color: #ffffff;
        font-family: 'Arial', sans-serif;
    }
    .stButton>button {
        background: linear-gradient(135deg, #6a11cb, #2575fc);
        color: white;
        border: none;
        border-radius: 12px;
        padding: 10px 20px;
        font-size: 16px;
        font-weight: bold;
    }
    .stButton>button:hover {
        background: linear-gradient(135deg, #2575fc, #6a11cb);
    }
    .stImage {
        border-radius: 12px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
    }
    .stMarkdown h1 {
        color: #6a11cb;
        text-align: center;
        font-size: 36px;
        font-weight: bold;
    }
    .stMarkdown h2 {
        color: #2575fc;
        font-size: 24px;
        font-weight: bold;
    }
    .stSpinner>div {
        color: #6a11cb;
    }
    </style>
    """

# unsafe_allow_html is required so the <style> tag is emitted as markup.
st.markdown(_FUTURISTIC_CSS, unsafe_allow_html=True)

# Page header: title plus tagline
st.title("🔮 Visox | Koshur AI")
st.markdown("### See the world through AI's eyes!")

# Sidebar "About" section, rendered one markdown entry at a time
for sidebar_entry in (
    "## About",
    "This app uses advanced AI to describe what it sees through your camera in real-time.",
    "Powered by [Together AI](https://together.ai) and Streamlit.",
):
    st.sidebar.markdown(sidebar_entry)

# Access the camera
img_file_buffer = st.camera_input("Take a picture")

if img_file_buffer is not None:
    try:
        # The raw bytes of the capture identify the photo across reruns.
        img_bytes = img_file_buffer.getvalue()

        # Convert the image file buffer to a PIL Image
        img = Image.open(img_file_buffer)

        # Display the captured image
        st.image(img, caption='Captured Image', width=300)

        # Streamlit re-executes this script on every widget interaction,
        # including a click on "Read Aloud". Cache the description for the
        # current photo in session state so the rerun neither repeats the
        # vision API call nor speaks text different from what was displayed.
        cached = st.session_state.get("visox_description_cache")
        if cached is not None and cached["image"] == img_bytes:
            description = cached["text"]
        else:
            with st.spinner('🔍 Analyzing the image...'):
                description = get_image_description(img)
            st.session_state["visox_description_cache"] = {
                "image": img_bytes,
                "text": description,
            }

        # Display the description
        st.success('✅ Analysis complete!')
        st.markdown("### AI Description:")
        st.write(description)

        # Convert description to speech and play it
        if st.button("🔊 Read Aloud"):
            tts(description)
    except Exception as e:
        st.error(f"An error occurred: {e}")