"""Streamlit app: capture a camera photo, describe it with a vision model, and read it aloud."""

import base64
import os
from io import BytesIO

import streamlit as st
from elevenlabs.client import ElevenLabs
from PIL import Image
from together import Together

# Set API keys as environment variables
os.environ["TOGETHER_API_KEY"] = st.secrets['together_api']
os.environ["ELEVENLABS_API_KEY"] = st.secrets['elevenlabs_api']

# Initialize the Together client
together_client = Together(api_key=os.environ["TOGETHER_API_KEY"])

# Initialize ElevenLabs client
elevenlabs_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])

# Image modes Pillow's JPEG writer accepts; any other mode (e.g. RGBA, P, LA)
# has to be converted first or ``Image.save(..., format="JPEG")`` raises OSError.
_JPEG_WRITABLE_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})


def encode_image(image):
    """Return *image* serialized as JPEG and encoded as a base64 text string.

    Args:
        image: A PIL image.

    Returns:
        The base64 encoding (as ``str``) of the JPEG bytes.
    """
    # JPEG cannot store alpha or palette data, so fall back to plain RGB for
    # those modes instead of failing inside ``save``.
    if image.mode not in _JPEG_WRITABLE_MODES:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


def get_image_description(image):
    """Ask the Together vision model for a short description of *image*.

    Args:
        image: A PIL image; it is sent inline as a base64 JPEG data URL.

    Returns:
        The content of the first choice's message in the API response.

    Raises:
        Whatever the Together client raises on a failed request; the caller
        is expected to handle it.
    """
    get_description_prompt = "Describe the given image in detail in only 20 words max."

    # Encode the image to base64
    base64_image = encode_image(image)

    # Create the request to Together API
    response = together_client.chat.completions.create(
        model="meta-llama/Llama-Vision-Free",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": get_description_prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                    },
                },
            ],
        }],
        stream=False,
    )

    # Return the result from the API
    return response.choices[0].message.content


def tts(text):
    """Convert *text* to speech with ElevenLabs and autoplay it in the page.

    Does nothing when *text* is empty or ``None``. Any failure while
    generating or rendering the audio is reported with ``st.error`` instead
    of being raised.

    Args:
        text: The text to speak.
    """
    # Nothing to say: skip the API round trip entirely.
    if not text or not text.strip():
        return

    try:
        # Generate the audio (returns a generator of byte chunks)
        audio_generator = elevenlabs_client.text_to_speech.convert(
            text=text,
            voice_id="JBFqnCBsd6RMkjVDRZzb",  # Replace with your preferred voice ID
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )

        # Keep the audio in memory rather than in a fixed-name file: a shared
        # "temp_audio.mp3" would be overwritten by concurrent sessions.
        audio_bytes = b"".join(audio_generator)

        # Play the audio in Streamlit
        st.audio(audio_bytes, format="audio/mp3", autoplay=True)
    except Exception as e:
        st.error(f"Error generating speech: {e}")


# Custom CSS for a futuristic look
# NOTE(review): the stylesheet body is empty here — presumably it was lost
# somewhere upstream; confirm and restore if so.
st.markdown(
    """ """,
    unsafe_allow_html=True,
)

# Streamlit app layout
st.title("🔮 Visox | Koshur AI")
st.markdown("### See the world through AI's eyes!")

# Sidebar for additional info
st.sidebar.markdown("## About")
st.sidebar.markdown("This app uses advanced AI to describe what it sees through your camera in real-time.")
st.sidebar.markdown("Powered by [Together AI](https://together.ai) and Streamlit.")

# Access the camera
img_file_buffer = st.camera_input("Take a picture")

if img_file_buffer is not None:
    # Convert the image file buffer to a PIL Image
    img = Image.open(img_file_buffer)

    # Display the captured image
    st.image(img, caption='Captured Image', width=300)

    # Get and display the description
    try:
        with st.spinner('🔍 Analyzing the image...'):
            description = get_image_description(img)
    except Exception as e:
        # Top-level boundary: show the failure in the page, matching how
        # tts() reports its own errors, rather than a raw traceback.
        st.error(f"Error analyzing image: {e}")
    else:
        st.success('✅ Analysis complete!')
        st.markdown("### AI Description:")
        st.write(description)

        # Convert description to speech and play it
        tts(description)