import streamlit as st from transformers import pipeline import torch from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline , AutoModelForCausalLM , AutoProcessor, AutoModel , AutoModelForSequenceClassification import edge_tts import asyncio import os import io import tempfile # Initialize session state for storing data if 'scenario' not in st.session_state: st.session_state.scenario = None if 'scenario_zh' not in st.session_state: st.session_state.scenario_zh = None if 'story' not in st.session_state: st.session_state.story = None if 'story_zh' not in st.session_state: st.session_state.story_zh = None if 'audio_generated_zh' not in st.session_state: st.session_state.audio_generated_zh = False if 'audio_path_zh' not in st.session_state: st.session_state.audio_path_zh = None if 'audio_generated_en' not in st.session_state: st.session_state.audio_generated_en = False if 'audio_path_en' not in st.session_state: st.session_state.audio_path_en = None # function part # img2text def img2text(url): image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base") text = image_to_text_model(url)[0]["generated_text"] return text # Translation function EN to ZH def translate_to_chinese(text): translator = pipeline("translation", model="steve-tong/opus-mt-en-zh-hk") translation = translator(text)[0]["translation_text"] return translation # text2story - using sert121/bert_finetuned_shortstories model for better stories def text2story(text): try: model = AutoTokenizer.from_pretrained("sert121/bert_finetuned_shortstories") prompt = f"{text}" tokenizer = AutoModelForSequenceClassification.from_pretrained("sert121/bert_finetuned_shortstories") inputs = tokenizer(prompt, return_tensors="pt") outputs = model.generate( inputs.input_ids, min_length=50, max_new_tokens=100, temperature=0.7, top_p=0.9, top_k=40, repetition_penalty=1.2, do_sample=True, pad_token_id=tokenizer.eos_token_id ) story = tokenizer.decode(outputs[0], skip_special_tokens=True) return story except Exception as e: # Fallback to openai-community/gpt2 if the advanced one fails fallback_generator = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") fallback_prompt = f"{text}" tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") inputs = tokenizer(fallback_prompt, return_tensors="pt") fallback_story = fallback_generator.generate( inputs.input_ids, min_length=50, max_new_tokens=100, temperature=0.7, top_p=0.9, top_k=40, repetition_penalty=1.2, do_sample=True, pad_token_id=tokenizer.eos_token_id ) fallback_story = tokenizer.decode(fallback_story[0], skip_special_tokens=True) return fallback_story def load_css(css_file): with open(css_file) as f: st.markdown(f'', unsafe_allow_html=True) # Text to audio using edge_tts for Cantonese audio async def text2audio_cantonese(text): try: # Use Cantonese voice from edge-tts voice = "zh-HK-HiuMaanNeural" # Female Cantonese voice # Alternative: "zh-HK-WanLungNeural" for male voice # Create a temporary file temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") temp_file.close() # Configure edge-tts to save to the file path communicate = edge_tts.Communicate(text, voice) await communicate.save(temp_file.name) # Return the path to the audio file return { 'path': temp_file.name, 'success': True } except Exception as e: st.error(f"中文音頻製作出左問題: {str(e)}") return { 'path': None, 'success': False } # Text to audio using edge_tts for English audio async def text2audio_english(text): try: # Use English voice from edge-tts voice = "en-US-AriaNeural" # Female English voice # Alternative: "en-US-GuyNeural" for male voice # Create a temporary file temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") temp_file.close() # Configure edge-tts to save to the file path communicate = edge_tts.Communicate(text, voice) await communicate.save(temp_file.name) # Return the path to the audio file return { 'path': temp_file.name, 'success': True } except Exception as e: st.error(f"English audio generation error: {str(e)}") return { 'path': None, 'success': False } # Apply custom CSS for modern, stylish kid-friendly UI st.set_page_config(page_title="歡迎嚟到 ISOM 5240 - 故事魔法師!", page_icon="✨", layout="wide") load_css('styles.css') # App header with Cantonese st.title("") st.markdown("
", unsafe_allow_html=True) # Add a progress indicator for model loading progress_placeholder = st.empty() # File uploader with Cantonese with st.container(): uploaded_file = st.file_uploader( "", type=["jpg", "jpeg", "png"], # Limit file types key="upload" ) if uploaded_file is not None: st.success("上載成功!") else: st.info("請選擇一張畫作上載,格式必須係 JPG、JPEG 或 PNG! 最大 200 MB !") if uploaded_file is not None: # Save uploaded file bytes_data = uploaded_file.getvalue() temp_file_path = uploaded_file.name with open(temp_file_path, "wb") as file: file.write(bytes_data) # Display image st.image(uploaded_file, use_container_width=True) # Reset session state if a new file is uploaded (detect by checking if there's no scenario yet) if st.session_state.scenario is None: # Stage 1: Image to Text with st.container(): st.markdown("