Spaces:

adil9858
/

visox

Sleeping

App Files Files Community

visox / app.py

adil9858

Update app.py

c2d4854 verified 16 days ago

raw

history blame

4.49 kB

	import streamlit as st
	from together import Together
	import base64
	from PIL import Image
	from io import BytesIO
	import os
	from elevenlabs.client import ElevenLabs

	# Publish the API keys kept in Streamlit secrets as environment variables,
	# Together first and ElevenLabs second.
	for _env_name, _secret_name in (
	    ("TOGETHER_API_KEY", "together_api"),
	    ("ELEVENLABS_API_KEY", "elevenlabs_api"),
	):
	    os.environ[_env_name] = st.secrets[_secret_name]

	# Build both service clients from the keys that were just exported.
	together_client = Together(api_key=os.getenv("TOGETHER_API_KEY"))
	elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))

	# Function to encode image to base64
	def encode_image(image):
	    """Return *image* serialized as JPEG and base64-encoded as text.

	    JPEG cannot store an alpha channel or a palette, so an image whose mode
	    is neither ``"RGB"`` nor ``"L"`` (for example RGBA or P) is converted to
	    RGB first; saving such an image directly as JPEG raises ``OSError``.

	    Args:
	        image: A PIL image (any object exposing ``mode``, ``convert`` and
	            ``save``).

	    Returns:
	        The base64 encoding of the JPEG bytes, decoded as a UTF-8 string.
	    """
	    if image.mode not in ("RGB", "L"):
	        image = image.convert("RGB")

	    buffered = BytesIO()
	    image.save(buffered, format="JPEG")
	    return base64.b64encode(buffered.getvalue()).decode("utf-8")

	# Function to get image description from Together API
	def get_image_description(
	    image,
	    prompt="Describe the given image in detail in only 20 words max.",
	    model="meta-llama/Llama-Vision-Free",
	):
	    """Ask the Together chat-completions API to describe *image*.

	    The image is base64-encoded with ``encode_image`` and sent as a
	    ``data:image/jpeg`` URL next to the text prompt in a single user message.

	    Args:
	        image: The image to describe; passed unchanged to ``encode_image``.
	        prompt: Instruction text sent alongside the image. Defaults to the
	            short 20-word description request.
	        model: Identifier of the Together model to query.

	    Returns:
	        The message content of the first choice in the API response.
	    """
	    base64_image = encode_image(image)

	    user_message = {
	        "role": "user",
	        "content": [
	            {"type": "text", "text": prompt},
	            {
	                "type": "image_url",
	                "image_url": {
	                    "url": f"data:image/jpeg;base64,{base64_image}",
	                },
	            },
	        ],
	    }

	    # Non-streaming request: the full completion is returned in one response.
	    response = together_client.chat.completions.create(
	        model=model,
	        messages=[user_message],
	        stream=False,
	    )

	    return response.choices[0].message.content

	# Function to convert text to speech using ElevenLabs
	def tts(text):
	    """Convert *text* to speech with ElevenLabs and autoplay it in the page.

	    The MP3 chunks returned by the ElevenLabs client are joined in memory and
	    handed to ``st.audio`` as bytes. No file is written, so concurrent
	    sessions cannot overwrite each other's audio through a shared path.

	    Args:
	        text: The text to speak. Empty or whitespace-only text is reported
	            with a warning and no request is made.

	    Returns:
	        None. Failures are shown to the user via ``st.error`` rather than
	        raised.
	    """
	    if not text or not text.strip():
	        st.warning("No text to convert to speech.")
	        return

	    try:
	        # convert() yields the audio as an iterable of byte chunks.
	        audio_chunks = elevenlabs_client.text_to_speech.convert(
	            text=text,
	            voice_id="JBFqnCBsd6RMkjVDRZzb",  # Replace with your preferred voice ID
	            model_id="eleven_multilingual_v2",
	            output_format="mp3_44100_128",
	        )
	        audio_bytes = b"".join(audio_chunks)

	        # Play the audio in Streamlit directly from memory.
	        st.audio(audio_bytes, format="audio/mp3", autoplay=True)
	    except Exception as e:
	        # UI boundary: show any API/network failure instead of crashing the app.
	        st.error(f"Error generating speech: {e}")

	# Stylesheet that gives the app its futuristic look: dark gradient background,
	# gradient buttons, rounded images and coloured headings.
	_FUTURISTIC_CSS = """
	<style>
	.stApp {
	background: linear-gradient(135deg, #1e1e2f, #2a2a40);
	color: #ffffff;
	font-family: 'Arial', sans-serif;
	}
	.stButton>button {
	background: linear-gradient(135deg, #6a11cb, #2575fc);
	color: white;
	border: none;
	border-radius: 12px;
	padding: 10px 20px;
	font-size: 16px;
	font-weight: bold;
	}
	.stButton>button:hover {
	background: linear-gradient(135deg, #2575fc, #6a11cb);
	}
	.stImage {
	border-radius: 12px;
	box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
	}
	.stMarkdown h1 {
	color: #6a11cb;
	text-align: center;
	font-size: 36px;
	font-weight: bold;
	}
	.stMarkdown h2 {
	color: #2575fc;
	font-size: 24px;
	font-weight: bold;
	}
	.stSpinner>div {
	color: #6a11cb;
	}
	</style>
	"""

	# Inject the raw <style> block; HTML must be explicitly allowed for it to apply.
	st.markdown(_FUTURISTIC_CSS, unsafe_allow_html=True)

	# Streamlit app layout
	# The backslash before the pipe is written as an explicit "\\" escape: a bare
	# "\|" is an invalid escape sequence (a SyntaxWarning on Python 3.12+), while
	# this spelling produces exactly the same runtime string.
	st.title("🔮 Visox \\| Koshur AI")
	st.markdown("### See the world through AI's eyes!")

	# Sidebar for additional info
	st.sidebar.markdown("## About")
	st.sidebar.markdown("This app uses advanced AI to describe what it sees through your camera in real-time.")
	st.sidebar.markdown("Powered by [Together AI](https://together.ai) and Streamlit.")

	# Access the camera; the widget returns None until a picture has been taken.
	img_file_buffer = st.camera_input("Take a picture")

	if img_file_buffer is not None:
	try:
	# Convert the image file buffer to a PIL Image
	img = Image.open(img_file_buffer)

	# Display the captured image
	st.image(img, caption='Captured Image', width=300)

	# Get and display the description
	with st.spinner('🔍 Analyzing the image...'):
	description = get_image_description(img)
	st.success('✅ Analysis complete!')
	st.markdown("### AI Description:")
	st.write(description)

	# Convert description to speech and play it

	tts(description)