import streamlit as st
from transformers import pipeline
import edge_tts
import asyncio
import os
import tempfile

# Initialize session state for storing data
if 'scenario' not in st.session_state:
    st.session_state.scenario = None
if 'scenario_zh' not in st.session_state:
    st.session_state.scenario_zh = None
if 'story' not in st.session_state:
    st.session_state.story = None
if 'story_zh' not in st.session_state:
    st.session_state.story_zh = None
if 'audio_generated_zh' not in st.session_state:
    st.session_state.audio_generated_zh = False
if 'audio_path_zh' not in st.session_state:
    st.session_state.audio_path_zh = None
if 'audio_generated_en' not in st.session_state:
    st.session_state.audio_generated_en = False
if 'audio_path_en' not in st.session_state:
    st.session_state.audio_path_en = None


# img2text: caption the uploaded image with BLIP
def img2text(url):
    image_to_text_model = pipeline("image-to-text",
                                   model="Salesforce/blip-image-captioning-base")
    text = image_to_text_model(url)[0]["generated_text"]
    return text


# Translation function, English to Chinese (Hong Kong)
def translate_to_chinese(text):
    translator = pipeline("translation", model="steve-tong/opus-mt-en-zh-hk")
    translation = translator(text)[0]["translation_text"]
    return translation


# text2story: generate a short story from the caption with a fine-tuned
# Llama-3 model, falling back to GPT-2 if the primary model fails
def text2story(text):
    try:
        generator = pipeline("text-generation",
                             model="2173ars/llama-3-8b-Instruct-bnb-4bit-personal-shortstory")

        # The caption itself serves as the prompt
        prompt = text

        # Generate the story; max_length is kept modest because of the model size
        story = generator(prompt,
                          min_length=100,
                          max_length=800,
                          num_return_sequences=1,
                          top_k=50,
                          top_p=0.92,
                          no_repeat_ngram_size=3,
                          temperature=0.8,
                          repetition_penalty=1.3,
                          do_sample=True)[0]['generated_text']

        # Clean up the story by removing the prompt
        story = story.replace(prompt, "").strip()

        # Trim to roughly five sentences if the story runs long
        if len(story) > 500:
            sentences = story.split('.')
            return '.'.join(sentences[:5]) + '.'
        return story
    except Exception:
        # Fallback to a simpler model if the advanced one fails
        fallback_generator = pipeline('text-generation', model='gpt2')
        fallback_prompt = f"Create a short story about this scene: {text}\n\nStory:"
        # max_length is set explicitly: GPT-2's default of 50 is below min_length
        fallback_story = fallback_generator(fallback_prompt,
                                            min_length=100,
                                            max_length=300,
                                            num_return_sequences=1)[0]['generated_text']
        return fallback_story.replace(fallback_prompt, "").strip()
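# Usage sketch for the helpers above (illustrative only; the actual calls
# happen in the staged UI flow further below):
#
#   caption = img2text("drawing.jpg")            # e.g. "a child flying a kite"
#   caption_zh = translate_to_chinese(caption)   # caption rendered in Chinese
#   story = text2story(caption)                  # short English story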
def load_css(css_file):
    # Inject the stylesheet; unsafe_allow_html lets the raw <style> block through
    with open(css_file) as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)


# Text to audio using edge_tts for Cantonese audio
async def text2audio_cantonese(text):
    try:
        # Female Cantonese voice; "zh-HK-WanLungNeural" is a male alternative
        voice = "zh-HK-HiuMaanNeural"

        # Create a temporary file for the MP3 output
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        temp_file.close()

        # Synthesize the speech and save it to the file path
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(temp_file.name)

        # Return the path to the audio file
        return {'path': temp_file.name, 'success': True}
    except Exception as e:
        # "There was a problem generating the Chinese audio"
        st.error(f"中文音頻製作出左問題: {str(e)}")
        return {'path': None, 'success': False}


# Text to audio using edge_tts for English audio
async def text2audio_english(text):
    try:
        # Female English voice; "en-US-GuyNeural" is a male alternative
        voice = "en-US-AriaNeural"

        # Create a temporary file for the MP3 output
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        temp_file.close()

        # Synthesize the speech and save it to the file path
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(temp_file.name)

        # Return the path to the audio file
        return {'path': temp_file.name, 'success': True}
    except Exception as e:
        st.error(f"English audio generation error: {str(e)}")
        return {'path': None, 'success': False}
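# The Streamlit script itself runs synchronously, so (as an assumption about
# how the stages below consume these coroutines) the async TTS helpers would
# be driven with asyncio.run, e.g.:
#
#   result = asyncio.run(text2audio_cantonese(st.session_state.story_zh))
#   if result['success']:
#       st.audio(result['path'], format="audio/mp3")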
", unsafe_allow_html=True) # Add a progress indicator for model loading progress_placeholder = st.empty() # File uploader with Cantonese with st.container(): st.subheader("請選擇一張畫作上載,格式必須係 JPG、JPEG 或 PNG! 最大 200 MB !") uploaded_file = st.file_uploader( "", type=["jpg", "jpeg", "png"], # Limit file types key="upload" ) if uploaded_file is not None: st.success("上載成功!") else: st.info("請選擇一張畫作上載,格式必須係 JPG、JPEG 或 PNG! 最大 200 MB !") if uploaded_file is not None: # Save uploaded file bytes_data = uploaded_file.getvalue() temp_file_path = uploaded_file.name with open(temp_file_path, "wb") as file: file.write(bytes_data) # Display image st.image(uploaded_file, use_container_width=True) # Reset session state if a new file is uploaded (detect by checking if there's no scenario yet) if st.session_state.scenario is None: # Stage 1: Image to Text with st.container(): st.markdown("