from transformers import (
    MBartForConditionalGeneration,
    MBart50Tokenizer,
    MarianMTModel,
    MarianTokenizer,
)
import streamlit as st

# Load the multilingual summarization model and tokenizer.
# Note: facebook/mbart-large-50 pairs with MBart50Tokenizer, which covers all
# 50 mBART language codes (MBartTokenizer belongs to the 25-language
# mbart-large-cc25 checkpoint).
multilingual_summarization_model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50"
)
multilingual_summarization_tokenizer = MBart50Tokenizer.from_pretrained(
    "facebook/mbart-large-50", src_lang="en_XX"
)

# Languages and their mBART-style codes; only the ISO prefix before the
# underscore (e.g. "fr" from "fr_XX") is used to build Marian model names.
LANGUAGES = {
    "English": "en_XX", "French": "fr_XX", "Spanish": "es_XX", "German": "de_DE",
    "Chinese": "zh_CN", "Russian": "ru_RU", "Arabic": "ar_AR", "Portuguese": "pt_PT",
    "Hindi": "hi_IN", "Italian": "it_IT", "Japanese": "ja_XX", "Korean": "ko_KR",
    "Dutch": "nl_NL", "Polish": "pl_PL", "Turkish": "tr_TR", "Swedish": "sv_SE",
    "Greek": "el_EL", "Finnish": "fi_FI", "Hungarian": "hu_HU", "Danish": "da_DK",
    "Norwegian": "no_NO", "Czech": "cs_CZ", "Romanian": "ro_RO", "Thai": "th_TH",
    "Hebrew": "he_IL", "Vietnamese": "vi_VN", "Indonesian": "id_ID", "Malay": "ms_MY",
    "Bengali": "bn_BD", "Ukrainian": "uk_UA", "Urdu": "ur_PK", "Swahili": "sw_KE",
    "Serbian": "sr_SR", "Croatian": "hr_HR", "Slovak": "sk_SK", "Lithuanian": "lt_LT",
    "Latvian": "lv_LV", "Estonian": "et_EE", "Bulgarian": "bg_BG", "Macedonian": "mk_MK",
    "Albanian": "sq_AL", "Georgian": "ka_GE", "Armenian": "hy_AM", "Kazakh": "kk_KZ",
    "Uzbek": "uz_UZ", "Tajik": "tg_TJ", "Kyrgyz": "ky_KG", "Turkmen": "tk_TM",
}


# Load the Marian translation model and tokenizer for a language pair.
# Marian model names use bare ISO codes (e.g. "en"), so the mBART-style
# codes above are trimmed at the underscore before building the model name.
def get_translation_model(source_lang, target_lang):
    src = source_lang.split("_")[0]
    tgt = target_lang.split("_")[0]
    model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"
    try:
        model = MarianMTModel.from_pretrained(model_name)
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        print(f"Loaded translation model for {src} to {tgt}")
        return model, tokenizer
    except Exception as e:
        print(f"Error loading translation model for {src} to {tgt}: {e}")
        return None, None
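
# --- Optional sketch: get_translation_model reloads the Marian weights on
# every call, and Streamlit re-runs this whole script on each interaction.
# A minimal caching wrapper, assuming Streamlit >= 1.18 (st.cache_resource);
# the name `get_translation_model_cached` is illustrative and not part of the
# original app. translate_text below could call this instead. ---
@st.cache_resource
def get_translation_model_cached(source_lang, target_lang):
    """Cache one (model, tokenizer) pair per language direction across reruns."""
    return get_translation_model(source_lang, target_lang)
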
# Translate text between two languages (mBART-style codes accepted).
def translate_text(text, source_lang, target_lang):
    model, tokenizer = get_translation_model(source_lang, target_lang)
    if model is None or tokenizer is None:
        return "Translation model error."
    inputs = tokenizer([text], return_tensors="pt", truncation=True)
    translated_ids = model.generate(inputs["input_ids"], max_length=1024)
    return tokenizer.decode(translated_ids[0], skip_special_tokens=True)


# Summarization function with multi-language support
def summarize_text(text, target_language="English"):
    # Summarize the text with mBART (the input is assumed to be English)
    inputs = multilingual_summarization_tokenizer(
        text, return_tensors="pt", padding=True, truncation=True
    )
    summary_ids = multilingual_summarization_model.generate(
        inputs["input_ids"],
        num_beams=6,          # more beams for better quality
        max_length=1024,      # capped at mBART's 1024-token position limit
        min_length=400,       # force a reasonably detailed summary
        length_penalty=1.5,   # values > 1.0 nudge beam search toward longer outputs
        early_stopping=True,
    )
    summary = multilingual_summarization_tokenizer.decode(
        summary_ids[0], skip_special_tokens=True
    )
    print(f"Generated summary in English: {summary}")

    # Translate the summary to the target language if it is not English
    target_lang_code = LANGUAGES.get(target_language, "en_XX")
    if target_lang_code != "en_XX":
        summary = translate_text(summary, "en_XX", target_lang_code)
        print(f"Translated summary to {target_language}: {summary}")
    return summary


# Streamlit interface
st.title("Multi-Language Text Summarization Tool")
text = st.text_area("Input Text (in English)")
target_language = st.selectbox(
    "Target Language for Summary",
    options=list(LANGUAGES.keys()),
    index=list(LANGUAGES.keys()).index("English"),
)
if st.button("Summarize"):
    if text:
        summary = summarize_text(text, target_language)
        st.subheader("Summary")
        st.write(summary)
    else:
        st.warning("Please enter text to summarize.")
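
# --- Optional sketch: the tokenizer above silently truncates input at mBART's
# 1024-token window, so very long documents lose their tail. A minimal
# chunk-and-merge helper under the assumption that plain fixed-size token
# windows are acceptable; `summarize_long_text` and `chunk_tokens` are
# illustrative names, not part of the original app. ---
def summarize_long_text(text, target_language="English", chunk_tokens=900):
    """Summarize each ~chunk_tokens-token slice separately, then join the pieces."""
    token_ids = multilingual_summarization_tokenizer.encode(
        text, add_special_tokens=False
    )
    chunks = [
        multilingual_summarization_tokenizer.decode(token_ids[i:i + chunk_tokens])
        for i in range(0, len(token_ids), chunk_tokens)
    ]
    return " ".join(summarize_text(chunk, target_language) for chunk in chunks)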