"""Gradio app: caption an image via a remote Space, then prompt Zephyr-7B with it.

The caption comes from either the Kosmos-2 Space ("Literal") or the
moondream1 Space ("Fictional"). The caption is appended to a Zephyr-style
prompt and the generated text is shown in the "Suggested System" textbox.
"""

import json
import re

import gradio as gr
import spaces
import torch
from gradio_client import Client
from transformers import pipeline

# Kosmos-2 echoes its instruction before the caption; group 1 is the caption.
_KOSMOS_PROMPT_RE = re.compile(r'^Describe this image in detail:\s*(.*)$')

# Matches the echoed prompt (from the system tag up to the assistant tag) in
# the generated text so that it can be removed before display.
_PROMPT_ECHO_RE = re.compile(r'\<\|system\|\>(.*?)\<\|assistant\|\>', re.DOTALL)


def _extract_description(full_sentence):
    """Return the caption with the Kosmos-2 instruction prefix removed.

    Falls back to the unmodified sentence when the expected prefix is not
    found, so callers always receive a string.
    """
    match = _KOSMOS_PROMPT_RE.search(full_sentence)
    if match:
        description = match.group(1)
        print(description)
        return description
    print("Unable to locate valid description.")
    return full_sentence


def get_caption_from_kosmos(image_in):
    """Return a detailed caption for ``image_in`` from the Kosmos-2 Space.

    Args:
        image_in: Filepath or URL of the image to caption.

    Returns:
        The caption text. If the response does not start with the expected
        "Describe this image in detail:" prefix, the full reconstructed
        sentence is returned instead of raising.
    """
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
    kosmos2_result = kosmos2_client.predict(
        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4,
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # The second element of the result is a path to a JSON file; the first
    # item of each entry in that file is a text fragment of the caption.
    with open(kosmos2_result[1], 'r', encoding="utf-8") as f:
        data = json.load(f)

    full_sentence = ' '.join([sublist[0] for sublist in data])
    return _extract_description(full_sentence)


def get_caption_from_MD(image_in):
    """Return the moondream1 Space's answer describing ``image_in`` as a character.

    Args:
        image_in: Filepath of the image to describe.

    Returns:
        Whatever the Space's ``/answer_question`` endpoint returns
        (presumably the answer text -- confirm against the Space's API).
    """
    client = Client("https://vikhyatk-moondream1.hf.space/")
    result = client.predict(
        image_in,                                       # filepath in 'image' Image component
        "Describe character like if it was fictional",  # str in 'Question' Textbox component
        api_name="/answer_question",
    )
    print(result)
    return result


# Loaded once at import time and shared by every request.
pipe = pipeline(
    "text-generation",
    model="HuggingFaceH4/zephyr-7b-beta",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


@spaces.GPU(enable_queue=True)
def get_llm_idea(user_prompt):
    """Run the text-generation pipeline on a prompt built from ``user_prompt``.

    Args:
        user_prompt: Text placed in the user turn of the prompt.

    Returns:
        The raw pipeline output; ``infer`` reads ``[0]["generated_text"]``.
    """
    # The system prompt is currently blank (whitespace only).
    agent_maker_sys = " "

    # Role tags sit on their own lines, as in the Zephyr chat template.
    # NOTE(review): the prompt contains no "<|assistant|>" tag, so the
    # cleanup in infer() only strips the echoed prompt when the model emits
    # that tag itself -- confirm this is intended.
    instruction = f"""
<|system|>
{agent_maker_sys}
<|user|>
"""

    prompt = f"{instruction.strip()}\n{user_prompt}"
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )
    return outputs


def infer(image_in, cap_type):
    """Caption the image, then generate a suggestion from that caption.

    Args:
        image_in: Filepath of the uploaded image.
        cap_type: "Fictional" (moondream1) or "Literal" (Kosmos-2).

    Returns:
        A ``(caption, generated_text)`` tuple for the two output textboxes.

    Raises:
        gr.Error: If no image was provided or ``cap_type`` is not recognised.
    """
    if image_in is None:
        raise gr.Error("Please provide an image first.")

    gr.Info("Getting image description...")

    if cap_type == "Fictional":
        user_prompt = get_caption_from_MD(image_in)
    elif cap_type == "Literal":
        user_prompt = get_caption_from_kosmos(image_in)
    else:
        # Without this branch user_prompt would be unbound below.
        raise gr.Error(f"Unknown caption type: {cap_type!r}")

    gr.Info("Building a system according to the image caption ...")
    outputs = get_llm_idea(user_prompt)

    # Drop the echoed prompt so only the newly generated text remains.
    cleaned_text = _PROMPT_ECHO_RE.sub('', outputs[0]["generated_text"])

    print(f"SUGGESTED LLM: {cleaned_text}")

    return user_prompt, cleaned_text.lstrip("\n")


# A trailing comma previously turned this into a 1-tuple.
title = "Magic Card Generator"
description = ""

css = """
#col-container{
    margin: 0 auto;
    max-width: 780px;
    text-align: left;
}
/* fix examples gallery width on mobile */
div#component-14 > .gallery > .gallery-item > .container > img {
    width: auto!important;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(f"""

LLM Agent from a Picture

{description}

""")

        with gr.Row():
            with gr.Column():
                image_in = gr.Image(
                    label="Image reference",
                    type="filepath",
                    elem_id="image-in",
                )
                cap_type = gr.Radio(
                    label="Caption type",
                    choices=[
                        "Literal",
                        "Fictional",
                    ],
                    value="Fictional",
                )
                submit_btn = gr.Button("Make LLM system from my pic !")
            with gr.Column():
                caption = gr.Textbox(
                    label="Image caption",
                    elem_id="image-caption",
                )
                result = gr.Textbox(
                    label="Suggested System",
                    lines=6,
                    max_lines=30,
                    elem_id="suggested-system-prompt",
                )

    submit_btn.click(
        fn=infer,
        inputs=[
            image_in,
            cap_type,
        ],
        outputs=[
            caption,
            result,
        ],
    )

demo.queue().launch(show_api=False, show_error=True)