prithivMLmods committed on
Commit 6db128e · verified · 1 Parent(s): e8c0c12

Update app.py

Files changed (1)
  1. app.py +26 -90
app.py CHANGED
@@ -19,15 +19,15 @@ from transformers import (
     TextIteratorStreamer,
     Qwen2VLForConditionalGeneration,
     AutoProcessor,
-    AutoModelForImageTextToText,
 )
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 
-# Application description and CSS
+
 DESCRIPTION = """
 # QwQ Edge 💬
 """
+
 css = '''
 h1 {
   text-align: center;
@@ -48,9 +48,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# -------------------------
-# Load Text-only Model
-# -------------------------
+# Load text-only model and tokenizer
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -60,14 +58,19 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 
-# -------------------------
-# TTS Settings
-# -------------------------
 TTS_VOICES = [
     "en-US-JennyNeural", # @tts1
     "en-US-GuyNeural", # @tts2
 ]
 
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model_m = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to("cuda").eval()
+
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """Convert text to speech using Edge TTS and save as MP3"""
     communicate = edge_tts.Communicate(text, voice)
@@ -85,36 +88,14 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
 
-# -------------------------
-# Load Multimodal Model (Qwen2-VL)
-# -------------------------
-MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-model_m = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to("cuda").eval()
-
-# -------------------------
-# Load Aya-Vision Model (New Feature)
-# -------------------------
-AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
-aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
-aya_model = AutoModelForImageTextToText.from_pretrained(
-    AYA_MODEL_ID, device_map="auto", torch_dtype=torch.float16
-)
-aya_tokenizer = AutoTokenizer.from_pretrained(AYA_MODEL_ID)
-
-# -------------------------
-# Stable Diffusion XL Settings & Pipeline
-# -------------------------
+# Environment variables and parameters for Stable Diffusion XL
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH") # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
 BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1")) # For batched image generation
 
+# Load the SDXL pipeline
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
@@ -123,12 +104,15 @@ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
 ).to(device)
 sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
 
+# Ensure that the text encoder is in half-precision if using CUDA.
 if torch.cuda.is_available():
     sd_pipe.text_encoder = sd_pipe.text_encoder.half()
 
+# Optional: compile the model for speedup if enabled
 if USE_TORCH_COMPILE:
     sd_pipe.compile()
 
+# Optional: offload parts of the model to CPU if needed
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()
 
@@ -184,6 +168,7 @@ def generate_image_fn(
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
         if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
             batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
+        # Wrap the pipeline call in autocast if using CUDA
         if device.type == "cuda":
            with torch.autocast("cuda", dtype=torch.float16):
                outputs = sd_pipe(**batch_options)
@@ -208,55 +193,12 @@ def generate(
     Special commands:
     - "@tts1" or "@tts2": triggers text-to-speech.
     - "@image": triggers image generation using the SDXL pipeline.
-    - "@aya-vision": triggers image-text-to-text generation using the Aya-Vision model.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
 
-    # -------------------------
-    # Aya-Vision Feature
-    # -------------------------
-    if text.strip().lower().startswith("@aya-vision"):
-        prompt = text[len("@aya-vision"):].strip()
-        if files:
-            if len(files) > 1:
-                images = [load_image(file) for file in files]
-            elif len(files) == 1:
-                images = [load_image(files[0])]
-            messages = [{
-                "role": "user",
-                "content": [
-                    *[{"type": "image", "image": image} for image in images],
-                    {"type": "text", "text": prompt},
-                ]
-            }]
-        else:
-            messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
-        yield "Processing with Aya-Vision..."
-        inputs = aya_processor.apply_chat_template(
-            messages,
-            padding=True,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt"
-        ).to(aya_model.device)
-        # Remove deprecated parameter if present to avoid conflicts.
-        inputs.pop("num_logits_to_keep", None)
-        gen_tokens = aya_model.generate(
-            **inputs,
-            max_new_tokens=300,
-            do_sample=True,
-            temperature=0.3,
-        )
-        gen_text = aya_tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
-        yield gen_text
-        return # Exit early after processing with Aya-Vision
-
-    # -------------------------
-    # Image Generation Feature (@image)
-    # -------------------------
     if text.strip().lower().startswith("@image"):
+        # Remove the "@image" tag and use the rest as prompt
         prompt = text[len("@image"):].strip()
         yield "Generating image..."
         image_paths, used_seed = generate_image_fn(
@@ -272,12 +214,10 @@ def generate(
             use_resolution_binning=True,
             num_images=1,
         )
+        # Yield the generated image so that the chat interface displays it.
         yield gr.Image(image_paths[0])
         return # Exit early
 
-    # -------------------------
-    # TTS Feature (@tts1 or @tts2)
-    # -------------------------
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
@@ -294,9 +234,6 @@ def generate(
     conversation = clean_chat_history(chat_history)
     conversation.append({"role": "user", "content": text})
 
-    # -------------------------
-    # Multimodal Input (with files) using Qwen2-VL
-    # -------------------------
     if files:
         if len(files) > 1:
             images = [load_image(image) for image in files]
@@ -326,9 +263,7 @@ def generate(
             time.sleep(0.01)
             yield buffer
     else:
-        # -------------------------
-        # Text-only Generation
-        # -------------------------
+
        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
@@ -356,7 +291,8 @@ def generate(
 
         final_response = "".join(outputs)
         yield final_response
-
+
+        # If TTS was requested, convert the final response to speech.
         if is_tts and voice:
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
@@ -371,12 +307,13 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
-        [{"text": "@aya-vision Extract JSON from the image", "files": ["examples/document.jpg"]}],
-        [{"text": "@aya-vision Summarize the letter", "files": ["examples/1.png"]}],
         ["@tts1 Who is Nikola Tesla, and why did he die?"],
+        [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
+        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
         ["Write a Python function to check if a number is prime."],
         ["@tts2 What causes rainbows to form?"],
+
     ],
     cache_examples=False,
     type="messages",
@@ -389,5 +326,4 @@ demo = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-    # To create a public link, set share=True in launch().
-    demo.queue(max_size=20).launch(share=True)
+    demo.queue(max_size=20).launch(share=True)