Spaces:

naofunyannn
/

LLaMAX_Translator

Sleeping

App Files Files Community

naofunyannn committed on Dec 13, 2024

Commit

cc69e73

verified ·

1 Parent(s): 0e848ab

Upload 5 files

Browse files

Files changed (5) hide show

LICENSE +21 -0
README.md +1 -12
app.py +241 -0
gitattributes +35 -0
requirements.txt +11 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Thái Trương
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,12 +1 @@
----
-title: LLaMAX Translator
-emoji: 📈
-colorFrom: blue
-colorTo: pink
-sdk: gradio
-sdk_version: 5.8.0
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference


1	+

app.py ADDED Viewed

	@@ -0,0 +1,241 @@

+import torch
+import gradio as gr
+import spaces
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import os
+import re
+from polyglot.detect import Detector
+from nltk.translate.bleu_score import sentence_bleu
+# Optional Hugging Face access token from the environment (None when unset).
+# It is not referenced again in the visible code.
+HF_TOKEN = os.environ.get("HF_TOKEN")
+# Repo id of the checkpoint loaded below, plus a second repo id that the
+# visible code never uses.
+MODEL = "LLaMAX/LLaMAX3-8B-Alpaca"
+RELATIVE_MODEL = "LLaMAX/LLaMAX3-8B"
+# HTML heading shown at the top of the page.
+TITLE = "<h1><center>LLaMAX Translator</center></h1>"
+# Load the causal LM in float16 and let the loader pick device placement.
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL,
+    torch_dtype=torch.float16,
+    device_map="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+def lang_detector(text):
+    """Return the name of the language polyglot detects in ``text``.
+
+    Empty input, ``None`` and anything shorter than five characters is
+    rejected with ``"Input text too short"``.  Failures are reported as an
+    ``"ERROR：..."`` string rather than raised, so the function can be bound
+    directly to a UI event.
+    """
+    min_chars = 5
+    # `not text` also covers None, which would otherwise make len() raise.
+    if not text or len(text) < min_chars:
+        return "Input text too short"
+    try:
+        lang_info = str(Detector(text).language)
+    except Exception as e:  # UI boundary: surface the failure as text
+        return f"ERROR：{str(e)}"
+    # The language name is the first word after "name: " in the textual form.
+    match = re.search(r"name: (\w+)", lang_info)
+    if match is None:
+        # Without this check a missing match surfaced as an opaque
+        # "'NoneType' object has no attribute 'group'" message.
+        return f"ERROR：could not read a language name from {lang_info!r}"
+    return match.group(1)
+def Prompt_template(inst, prompt, query, src_language, trg_language):
+    """Build the full model prompt for one translation request.
+
+    ``inst`` is formatted with ``src_language`` / ``trg_language`` and wrapped
+    in backticks.  ``prompt`` may take two forms:
+
+    * a complete template containing both ``{instruction}`` and ``{query}``
+      (the default value in the UI): the placeholders are filled in;
+    * a plain preamble: the Instruction / Input / Response sections are
+      appended to it.
+
+    Raises whatever ``str.format`` raises when ``inst`` contains fields other
+    than ``{src_language}`` and ``{trg_language}``.
+    """
+    inst = inst.format(src_language=src_language, trg_language=trg_language)
+    instruction = f"`{inst}`"
+    if "{instruction}" in prompt and "{query}" in prompt:
+        # Appending the sections to a complete template would duplicate them
+        # and leave the placeholders in the text literally.  A single-pass
+        # substitution avoids str.format (user text may contain braces) and
+        # never re-expands placeholder-looking text that came from `query`.
+        fields = {"instruction": instruction, "query": query}
+        return re.sub(
+            r"\{(instruction|query)\}",
+            lambda m: fields[m.group(1)],
+            prompt,
+        )
+    return (
+        f'{prompt}'
+        f'### Instruction:\n{instruction}\n'
+        f'### Input:\n{query}\n### Response:'
+    )
+# Placeholder: splitting long inputs into chunks is not implemented yet.
+def chunk_text():
+    """Do nothing and return None (unfinished stub)."""
+    return None
+# Sentence-level BLEU helper built on NLTK's sentence_bleu.
+# NOTE(review): nltk is imported by this file but is not listed in the
+# requirements.txt shown with it — confirm it is installed.
+def calculate_bleu_score(candidate: str, references: list):
+    """Score ``candidate`` against ``references`` with ``sentence_bleu``.
+
+    ``candidate`` is split on whitespace before scoring.  ``references`` is
+    passed through unchanged; the call site in this file supplies a list of
+    token lists.
+    """
+    hypothesis = candidate.split()
+    return sentence_bleu(references, hypothesis)
+@spaces.GPU(duration=60)
+def translate(
+    source_text: str,
+    source_lang: str,
+    target_lang: str,
+    inst: str,
+    prompt: str,
+    max_length: int,
+    temperature: float,
+    top_p: float,
+    rp: float):
+    """Translate ``source_text`` with the loaded model.
+
+    The prompt is assembled by ``Prompt_template`` from ``inst``, ``prompt``
+    and the language names.  ``max_length``, ``temperature``, ``top_p`` and
+    ``rp`` (repetition penalty) are forwarded to ``model.generate``; a
+    temperature of 0 selects greedy decoding.
+
+    Yields a single ``(translation, bleu_score)`` tuple.
+    """
+    print(f'Text is - {source_text}')
+    prompt = Prompt_template(inst, prompt, source_text, source_lang, target_lang)
+    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+    # The UI slider allows temperature 0, which sampling rejects (it needs a
+    # strictly positive float).  Treat 0 as a request for greedy decoding and
+    # only pass the sampling knobs when sampling is on.
+    temperature = float(temperature)
+    use_sampling = temperature > 0
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        max_length=int(max_length),
+        do_sample=use_sampling,
+        repetition_penalty=float(rp),
+    )
+    if use_sampling:
+        generate_kwargs.update(temperature=temperature, top_p=float(top_p))
+    outputs = model.generate(**generate_kwargs)
+    # Drop the prompt by token count instead of slicing the decoded string by
+    # len(prompt): the character offset is only right when decoding reproduces
+    # the prompt text exactly.
+    new_tokens = outputs[0][input_ids.shape[-1]:]
+    translation = tokenizer.decode(
+        new_tokens,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
+    )
+    # NOTE(review): no reference translation is available here, so the output
+    # is scored against itself.  The value is therefore not a measure of
+    # translation quality; supply real references to make it meaningful.
+    references = [translation.split()]
+    bleu_score = calculate_bleu_score(translation, references)
+    yield translation, bleu_score
+# Stylesheet handed to gr.Blocks: styles the <h1> title and hides the footer.
+# NOTE(review): the final `font-family` declaration sits outside any selector
+# block — confirm which element it was meant to style.
+CSS = """
+    h1 {
+        text-align: center;
+        display: block;
+        height: 10vh;
+        align-content: center;
+        font-family: Arial, Helvetica, sans-serif;
+    }
+    footer {
+        visibility: hidden;
+    }
+    font-family: Arial, Helvetica, sans-serif;
+"""
+# HTML snippet linking to the model card; rendered with gr.Markdown below the buttons.
+LICENSE = """
+Model: <a href="https://huggingface.co/LLaMAX/LLaMAX3-8B-Alpaca">LLaMAX3-8B-Alpaca</a>
+"""
+# Language names offered in the source/target dropdowns.  The selected name is
+# inserted verbatim into the instruction sent to the model, so spelling matters.
+# NOTE(review): 'Northern' looks like a truncated name — confirm the intended language.
+LANG_LIST = [
+    'Afrikaans', 'Amharic', 'Arabic', 'Armenian', 'Assamese', 'Asturian', 'Azerbaijani',
+    'Belarusian', 'Bengali', 'Bosnian', 'Bulgarian', 'Burmese',
+    'Catalan', 'Cebuano', 'Simplified Chinese', 'Traditional Chinese', 'Croatian', 'Czech',
+    'Danish', 'Dutch', 'English', 'Estonian', 'Filipino', 'Finnish', 'French', 'Fulah',
+    'Galician', 'Ganda', 'Georgian', 'German', 'Greek', 'Gujarati',
+    'Hausa', 'Hebrew', 'Hindi', 'Hungarian',
+    'Icelandic', 'Igbo', 'Indonesian', 'Irish', 'Italian',
+    'Japanese', 'Javanese',
+    'Kabuverdianu', 'Kamba', 'Kannada', 'Kazakh', 'Khmer', 'Korean', 'Kyrgyz',
+    'Lao', 'Latvian', 'Lingala', 'Lithuanian', 'Luo', 'Luxembourgish',
+    'Macedonian', 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Mongolian',
+    'Nepali', 'Northern', 'Norwegian', 'Nyanja',
+    'Occitan', 'Oriya', 'Oromo',
+    'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi',
+    'Romanian', 'Russian',
+    'Serbian', 'Shona', 'Sindhi', 'Slovak', 'Slovenian', 'Somali', 'Sorani', 'Spanish', 'Swahili', 'Swedish',
+    'Tajik', 'Tamil', 'Telugu', 'Thai', 'Turkish',
+    'Ukrainian', 'Umbundu', 'Urdu', 'Uzbek',
+    'Vietnamese', 'Welsh', 'Wolof', 'Xhosa', 'Yoruba', 'Zulu',
+]
+# NOTE(review): this Chatbot is created but never placed in the layout below.
+chatbot = gr.Chatbot(height=600)
+# Page layout: text boxes on the left, language and generation controls on the right.
+with gr.Blocks(theme="soft", css=CSS) as demo:
+    gr.Markdown(TITLE)
+    with gr.Row():
+        # Left column: source text, translated text and BLEU read-out.
+        with gr.Column(scale=4):
+            source_text = gr.Textbox(
+                label="Văn bản gốc",
+                value=(
+                    "LLaMAX is a language model with powerful multilingual capabilities without loss instruction-following capabilities. "
+                    "LLaMAX supports translation between more than 100 languages, "
+                    "surpassing the performance of similarly scaled LLMs."
+                ),
+                lines=10,
+            )
+            output_text = gr.Textbox(
+                label="Văn bản đã được dịch",
+                lines=10,
+                show_copy_button=True,
+            )
+            bleu_score_output = gr.Textbox(  # read-only holder for the BLEU score
+                label="BLEU Score",
+                lines=10,
+                interactive=False,
+            )
+        # Right column: language selection and generation parameters.
+        with gr.Column(scale=1):
+            source_lang = gr.Dropdown(
+                label="Ngôn ngữ nguồn",
+                value="English",
+                choices=LANG_LIST,
+            )
+            target_lang = gr.Dropdown(
+                label="Ngôn ngữ đích",
+                value="Vietnamese",
+                choices=LANG_LIST,
+            )
+            max_length = gr.Slider(
+                label="Độ dài tối đa",
+                minimum=512,
+                maximum=8192,
+                value=4000,
+                step=8,
+            )
+            temperature = gr.Slider(
+                label="Temperature",
+                minimum=0,
+                maximum=1,
+                value=0.3,
+                step=0.1,
+            )
+            top_p = gr.Slider(
+                label="top_p",
+                minimum=0.0,
+                maximum=1.0,
+                step=0.1,
+                value=1.0,
+            )
+            rp = gr.Slider(
+                label="Repetition penalty",
+                minimum=1.0,
+                maximum=2.0,
+                step=0.1,
+                value=1.2,
+            )
+            with gr.Accordion("Tùy chọn nâng cao", open=False):
+                inst = gr.Textbox(
+                    label="Instruction",
+                    value="Translate the following sentences from {src_language} to {trg_language}.",
+                    lines=3,
+                )
+                # The template's {instruction} / {query} fields are consumed
+                # by Prompt_template when the request is built.
+                prompt = gr.Textbox(
+                    label="Prompt",
+                    value="""Below is an instruction that describes a task, paired with an input that provides further context.
+Write a response that ensuring accuracy and maintaining the tone and style of the original text.
+### Instruction:
+{instruction}
+### Input:
+{query}
+### Response:""",
+                    lines=8,
+                )
+    with gr.Row():
+        submit = gr.Button(value="Submit")
+        # Also reset the BLEU box so a stale score does not outlive its text.
+        clear = gr.ClearButton([source_text, output_text, bleu_score_output])
+    gr.Markdown(LICENSE)
+    # Automatic source-language detection via lang_detector is not wired up.
+    submit.click(
+        fn=translate,
+        inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp],
+        outputs=[output_text, bleu_score_output],
+    )
+if __name__ == "__main__":
+    demo.launch()

gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+accelerate
+timm
+einops
+torch
+Pillow
+transformers
+polyglot
+pyicu
+pycld2
+gradio
+spaces