import streamlit as st from transformers import pipeline import torch from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline from transformers import AutoProcessor, AutoModel import edge_tts import asyncio import os import io import tempfile # Initialize session state for storing data if 'scenario' not in st.session_state: st.session_state.scenario = None if 'scenario_zh' not in st.session_state: st.session_state.scenario_zh = None if 'story' not in st.session_state: st.session_state.story = None if 'story_zh' not in st.session_state: st.session_state.story_zh = None if 'audio_generated_zh' not in st.session_state: st.session_state.audio_generated_zh = False if 'audio_path_zh' not in st.session_state: st.session_state.audio_path_zh = None if 'audio_generated_en' not in st.session_state: st.session_state.audio_generated_en = False if 'audio_path_en' not in st.session_state: st.session_state.audio_path_en = None # function part # img2text def img2text(url): image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base") text = image_to_text_model(url)[0]["generated_text"] return text # Translation function EN to ZH def translate_to_chinese(text): translator = pipeline("translation", model="steve-tong/opus-mt-en-zh-hk") translation = translator(text)[0]["translation_text"] return translation # text2story - using mosaicml/mpt-7b-storywriter model for better stories def text2story(text): try: # Initialize the improved story generation pipeline generator = pipeline("text-generation", model="2173ars/llama-3-8b-Instruct-bnb-4bit-personal-shortstory") # Create a prompt for the story prompt = f"{text}" # Generate the story - limit to a smaller max_length due to model size story = generator(prompt, min_length=100, max_length=800, num_return_sequences=1, top_k=50, top_p=0.92, no_repeat_ngram_size=3, temperature=0.8, repetition_penalty=1.3, do_sample=True)[0]['generated_text'] # Clean up the story by removing the prompt story = story.replace(prompt, "").strip() # Trim to a reasonable length if needed if len(story) > 500: sentences = story.split('.') trimmed_story = '.'.join(sentences[:5]) + '.' return trimmed_story return story except Exception as e: # Fallback to simpler model if the advanced one fails fallback_generator = pipeline('text-generation', model='gpt2') fallback_prompt = f"Create a short story about this scene: {text}\n\nStory:" fallback_story = fallback_generator(fallback_prompt, min_length=100, num_return_sequences=1)[0]['generated_text'] return fallback_story.replace(fallback_prompt, "").strip() def load_css(css_file): with open(css_file) as f: st.markdown(f'', unsafe_allow_html=True) # Text to audio using edge_tts for Cantonese audio async def text2audio_cantonese(text): try: # Use Cantonese voice from edge-tts voice = "zh-HK-HiuMaanNeural" # Female Cantonese voice # Alternative: "zh-HK-WanLungNeural" for male voice # Create a temporary file temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") temp_file.close() # Configure edge-tts to save to the file path communicate = edge_tts.Communicate(text, voice) await communicate.save(temp_file.name) # Return the path to the audio file return { 'path': temp_file.name, 'success': True } except Exception as e: st.error(f"中文音頻製作出左問題: {str(e)}") return { 'path': None, 'success': False } # Text to audio using edge_tts for English audio async def text2audio_english(text): try: # Use English voice from edge-tts voice = "en-US-AriaNeural" # Female English voice # Alternative: "en-US-GuyNeural" for male voice # Create a temporary file temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") temp_file.close() # Configure edge-tts to save to the file path communicate = edge_tts.Communicate(text, voice) await communicate.save(temp_file.name) # Return the path to the audio file return { 'path': temp_file.name, 'success': True } except Exception as e: st.error(f"English audio generation error: {str(e)}") return { 'path': None, 'success': False } # Apply custom CSS for modern, stylish kid-friendly UI st.set_page_config(page_title="歡迎嚟到 ISOM 5240 - 故事魔法師！", page_icon="✨", layout="wide") load_css('styles.css') # App header with Cantonese st.title("") st.markdown("

上載一張你鍾意嘅相片，我哋嘅魔法師會幫你變出一個好好玩嘅故事！

🚀🦄 🔮🌈

", unsafe_allow_html=True) # Add a progress indicator for model loading progress_placeholder = st.empty() # File uploader with Cantonese with st.container(): st.subheader("請選擇一張畫作上載，格式必須係 JPG、JPEG 或 PNG！最大 200 MB !") uploaded_file = st.file_uploader( "", type=["jpg", "jpeg", "png"], # Limit file types key="upload" ) if uploaded_file is not None: st.success("上載成功！") else: st.info("請選擇一張畫作上載，格式必須係 JPG、JPEG 或 PNG！最大 200 MB !") if uploaded_file is not None: # Save uploaded file bytes_data = uploaded_file.getvalue() temp_file_path = uploaded_file.name with open(temp_file_path, "wb") as file: file.write(bytes_data) # Display image st.image(uploaded_file, use_container_width=True) # Reset session state if a new file is uploaded (detect by checking if there's no scenario yet) if st.session_state.scenario is None: # Stage 1: Image to Text with st.container(): st.markdown("

🔍 畫作解讀中

", unsafe_allow_html=True) with progress_placeholder.container(): st.write("正在分析圖片...") progress_bar = st.progress(0) # Generate caption if not already done st.session_state.scenario = img2text(temp_file_path) progress_bar.progress(33) # Display English caption st.text("英文描述: " + st.session_state.scenario) # Translate the caption to Chinese with progress_placeholder.container(): st.write("正在翻譯...") st.session_state.scenario_zh = translate_to_chinese(st.session_state.scenario) progress_bar.progress(66) # Display Chinese caption st.text("中文描述: " + st.session_state.scenario_zh) # Stage 2: Text to Story with st.container(): st.markdown("

📝 故事創作中

", unsafe_allow_html=True) with progress_placeholder.container(): st.write("正在創作故事...") # Generate story if not already done st.session_state.story = text2story(st.session_state.scenario) progress_bar.progress(85) # Display English story st.text("英文故事: " + st.session_state.story) # Translate the story to Chinese with progress_placeholder.container(): st.write("正在翻譯故事...") st.session_state.story_zh = translate_to_chinese(st.session_state.story) progress_bar.progress(100) # Display Chinese story st.text("中文故事: " + st.session_state.story_zh) # Clear progress indicator progress_placeholder.empty() else: # Display saved results from session state with st.container(): st.markdown("

🔍 以下係我哋嘅魔術師覺得你嘅畫作係關於 :

", unsafe_allow_html=True) st.text("英文描述: " + st.session_state.scenario) st.text("中文描述: " + st.session_state.scenario_zh) with st.container(): st.markdown("

📝 以下係我哋嘅魔術師正根據你嘅畫作嘅故事:

", unsafe_allow_html=True) st.text("英文故事: " + st.session_state.story) st.text("中文故事: " + st.session_state.story_zh) # Stage 3: Story to Audio data with st.container(): st.markdown("

🔊 你嘅故事準備好俾我哋嘅魔術師讀出嚟喇！

", unsafe_allow_html=True) # Create two columns for English and Cantonese buttons col1, col2 = st.columns(2) # English audio button with col1: if st.button("🔊 Play Story in English by our teacher"): # Only generate audio if not already done if not st.session_state.audio_generated_en: # Need to run async function with asyncio audio_result = asyncio.run(text2audio_english(st.session_state.story)) st.session_state.audio_path_en = audio_result['path'] st.session_state.audio_generated_en = audio_result['success'] # Play the audio if st.session_state.audio_path_en and os.path.exists(st.session_state.audio_path_en): with open(st.session_state.audio_path_en, "rb") as audio_file: audio_bytes = audio_file.read() st.audio(audio_bytes, format="audio/mp3") else: st.error("Sorry! Please try again.") # Cantonese audio button with col2: if st.button("🔊 你嘅故事已經準備好俾我哋嘅魔術師用廣東話讀出嚟喇！"): # Only generate audio if not already done if not st.session_state.audio_generated_zh: # Need to run async function with asyncio audio_result = asyncio.run(text2audio_cantonese(st.session_state.story_zh)) st.session_state.audio_path_zh = audio_result['path'] st.session_state.audio_generated_zh = audio_result['success'] # Play the audio if st.session_state.audio_path_zh and os.path.exists(st.session_state.audio_path_zh): with open(st.session_state.audio_path_zh, "rb") as audio_file: audio_bytes = audio_file.read() st.audio(audio_bytes, format="audio/mp3") else: st.error("哎呀！再試多次啦！") # Cleanup: Remove the temporary file when the user is done if os.path.exists(temp_file_path): os.remove(temp_file_path) else: # Clear session state when no file is uploaded # Also clean up any temporary audio files if st.session_state.audio_path_zh and os.path.exists(st.session_state.audio_path_zh): try: os.remove(st.session_state.audio_path_zh) except: pass if st.session_state.audio_path_en and os.path.exists(st.session_state.audio_path_en): try: os.remove(st.session_state.audio_path_en) except: pass st.session_state.scenario = None st.session_state.scenario_zh = None st.session_state.story = None st.session_state.story_zh = None st.session_state.audio_generated_zh = False st.session_state.audio_path_zh = None st.session_state.audio_generated_en = False st.session_state.audio_path_en = None