import spaces
import gradio as gr
from joycaption import stream_chat_mod, get_text_model, change_text_model, get_repo_gguf
# Title text handed to gr.HTML at the top of the page.
# Triple-quoted because the text spans several lines; a single-quoted literal
# cannot contain raw newlines. The leading/trailing newlines are kept as-is.
# NOTE(review): no HTML markup is visible around the title here; if the
# upstream title was wrapped in heading/centering tags, restore them — TODO confirm.
JC_TITLE_MD = """
JoyCaption Alpha Two Mod
"""

# Credits text rendered with gr.Markdown (styled through the "info" CSS class).
JC_DESC_MD = """This space is mod of [fancyfeast/joy-caption-alpha-two](https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two),
[Wi-zz/joy-caption-pre-alpha](https://huggingface.co/Wi-zz/joy-caption-pre-alpha).
Thanks to [dominic1021](https://huggingface.co/dominic1021), [IceHibiki](https://huggingface.co/IceHibiki)."""

# Custom stylesheet passed to gr.Blocks(css=...).
# `!important` must sit inside the declaration, before the semicolon;
# written after the semicolon it is an invalid declaration and is ignored.
css = """
.info {text-align:center !important;}
"""
# Build the captioning UI.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from the order of the `with` statements — confirm against the intended layout.
# delete_cache=(60, 3600): per the Gradio docs this is (frequency, age) in seconds.
with gr.Blocks(fill_width=True, css=css, delete_cache=(60, 3600)) as demo:
    gr.HTML(JC_TITLE_MD)
    with gr.Row():
        # Left column: image input, options, and the run button.
        with gr.Column():
            with gr.Group():
                jc_input_image = gr.Image(
                    type="pil",
                    label="Input Image",
                    sources=["upload", "clipboard"],
                    height=384,
                )
                with gr.Accordion("Options", open=False):
                    with gr.Row():
                        jc_caption_type = gr.Dropdown(
                            choices=[
                                "Descriptive",
                                "Descriptive (Informal)",
                                "Training Prompt",
                                "MidJourney",
                                "Booru tag list",
                                "Booru-like tag list",
                                "Art Critic",
                                "Product Listing",
                                "Social Media Post",
                            ],
                            label="Caption Type",
                            value="Descriptive",
                        )
                        # Named lengths followed by numeric choices "20", "30", ..., "260".
                        jc_caption_length = gr.Dropdown(
                            choices=["any", "very short", "short", "medium-length", "long", "very long"]
                            + [str(i) for i in range(20, 261, 10)],
                            label="Caption Length",
                            value="long",
                        )
                    jc_extra_options = gr.CheckboxGroup(
                        choices=[
                            "If there is a person/character in the image you must refer to them as {name}.",
                            "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).",
                            "Include information about lighting.",
                            "Include information about camera angle.",
                            "Include information about whether there is a watermark or not.",
                            "Include information about whether there are JPEG artifacts or not.",
                            "If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc.",
                            "Do NOT include anything sexual; keep it PG.",
                            "Do NOT mention the image's resolution.",
                            "You MUST include information about the subjective aesthetic quality of the image from low to very high.",
                            "Include information on the image's composition style, such as leading lines, rule of thirds, or symmetry.",
                            "Do NOT mention any text that is in the image.",
                            "Specify the depth of field and whether the background is in focus or blurred.",
                            "If applicable, mention the likely use of artificial or natural lighting sources.",
                            "Do NOT use any ambiguous language.",
                            "Include whether the image is sfw, suggestive, or nsfw.",
                            "ONLY describe the most important elements of the image.",
                        ],
                        label="Extra Options",
                    )
                    with gr.Row():
                        jc_name_input = gr.Textbox(label="Person/Character Name (if applicable)")
                        gr.Markdown("**Note:** Name input is only used if an Extra Option is selected that requires it.")
                    jc_custom_prompt = gr.Textbox(label="Custom Prompt (optional, will override all other settings)")
                    gr.Markdown("**Note:** Alpha Two is not a general instruction follower and will not follow prompts outside its training data well. Use this feature with caution.")
                with gr.Accordion("Advanced", open=False):
                    with gr.Row():
                        # Query the model list once; fall back to no preselection
                        # instead of raising IndexError when the list is empty.
                        jc_text_models = get_text_model()
                        jc_text_model = gr.Dropdown(
                            label="LLM Model",
                            info="You can enter a huggingface model repo_id to want to use.",
                            choices=jc_text_models,
                            value=jc_text_models[0] if jc_text_models else None,
                            allow_custom_value=True,
                            interactive=True,
                            min_width=320,
                        )
                        # Hidden (visible=False) but still passed to change_text_model below.
                        jc_gguf = gr.Dropdown(
                            label="GGUF Filename",
                            choices=[],
                            value="",
                            allow_custom_value=True,
                            min_width=320,
                            visible=False,
                        )
                        jc_nf4 = gr.Checkbox(label="Use NF4 quantization", value=True)
                        jc_lora = gr.Checkbox(label="Use Custom VLM", info="Llama 3 BF16 only", value=True)
                        # Hidden; only referenced by the disabled wiring at the bottom.
                        jc_text_model_button = gr.Button("Load Model", variant="secondary", visible=False)
                        # Hidden but still passed to change_text_model below.
                        jc_use_inference_client = gr.Checkbox(label="Use Inference Client", value=False, visible=False)
                    with gr.Row():
                        jc_tokens = gr.Slider(minimum=1, maximum=4096, value=300, step=1, label="Max tokens")
                        jc_temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.6, step=0.1, label="Temperature")
                        # NOTE(review): top-p is normally within [0, 1]; the 2.0 upper
                        # bound is kept unchanged — confirm it is intentional.
                        jc_topp = gr.Slider(minimum=0, maximum=2.0, value=0.9, step=0.01, label="Top-P")
                jc_run_button = gr.Button("Caption", variant="primary")
        # Right column: the prompt that was sent and the resulting caption.
        with gr.Column():
            jc_output_prompt = gr.Textbox(label="Prompt that was used")
            jc_output_caption = gr.Textbox(label="Caption", show_copy_button=True)
    gr.Markdown(JC_DESC_MD, elem_classes="info")
    gr.LoginButton()
    gr.DuplicateButton(value="Duplicate Space for private use (This demo does not work on CPU. Requires GPU Space)")

    # Event listeners are registered inside the Blocks context.
    # Input order is tokens, top-p, temperature (not the on-screen slider order);
    # presumably this matches stream_chat_mod's positional parameters — verify.
    jc_run_button.click(
        fn=stream_chat_mod,
        inputs=[
            jc_input_image,
            jc_caption_type,
            jc_caption_length,
            jc_extra_options,
            jc_name_input,
            jc_custom_prompt,
            jc_tokens,
            jc_topp,
            jc_temperature,
            jc_text_model,
        ],
        outputs=[jc_output_prompt, jc_output_caption],
    )
    # Changing the model dropdown calls change_text_model and writes its result
    # back into the same dropdown; excluded from the API listing.
    jc_text_model.change(
        change_text_model,
        [jc_text_model, jc_use_inference_client, jc_gguf, jc_nf4, jc_lora],
        [jc_text_model],
        show_api=False,
    )
    # Disabled alternative wirings, kept for reference:
    #jc_text_model_button.click(change_text_model, [jc_text_model, jc_use_inference_client, jc_gguf, jc_nf4], [jc_text_model], show_api=False)
    #jc_text_model.change(get_repo_gguf, [jc_text_model], [jc_gguf], show_api=False)
    #jc_use_inference_client.change(change_text_model, [jc_text_model, jc_use_inference_client], [jc_text_model], show_api=False)
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    # Explicit queue configuration is currently disabled:
    #demo.queue()
    demo.launch()