---
license: apache-2.0
datasets:
- Jonathan-Zhou/GameLabel-10k
base_model:
- black-forest-labs/FLUX.1-schnell
pipeline_tag: text-to-image
---
# Flux GameLabel Lora

This model is intended purely for research purposes, as a demonstration of the quality of data labeled by random video game players. It achieves its purpose of higher prompt adherence, but suffers from a variety of issues due to being fine-tuned on synthetic outputs.

Inference code that runs on a 24 GB consumer GPU is below. More details are in the paper at [https://arxiv.org/abs/2409.19830](https://arxiv.org/abs/2409.19830).
```python
from diffusers import FlowMatchEulerDiscreteScheduler, AutoencoderKL
from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
import torch
from huggingface_hub import hf_hub_download
from torchao.quantization.quant_api import (
    quantize_,
    int8_weight_only,
)

dtype = torch.bfloat16
flux_repo = "black-forest-labs/FLUX.1-schnell"
revision = "refs/pr/1"
# Load the Flux components individually so they can be quantized before use.
# (Tokenizers take no torch_dtype argument, so none is passed to them.)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
tokenizer_2 = T5TokenizerFast.from_pretrained(flux_repo, subfolder="tokenizer_2", revision=revision)
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(flux_repo, subfolder="scheduler", revision=revision)
transformer = FluxTransformer2DModel.from_pretrained(flux_repo, subfolder="transformer", torch_dtype=dtype, revision=revision)
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=dtype)
text_encoder_2 = T5EncoderModel.from_pretrained(flux_repo, subfolder="text_encoder_2", torch_dtype=dtype, revision=revision)
vae = AutoencoderKL.from_pretrained(flux_repo, subfolder="vae", torch_dtype=dtype, revision=revision)

# Fetch the LoRA weights from this repository
lora_file_path = hf_hub_download(repo_id="Jonathan-Zhou/Flux-GameLabel-Lora", filename="lora.safetensors")
# Assemble the pipeline from the components loaded above.
pipe = FluxPipeline(
    scheduler=scheduler,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    text_encoder_2=text_encoder_2,
    tokenizer_2=tokenizer_2,
    vae=vae,
    transformer=transformer,
)
# If you want to compare the LoRA with the base model, comment out these two lines
pipe.load_lora_weights(lora_file_path, adapter_name="lora1")
pipe.fuse_lora()
# Int8 weight-only quantization is needed to fit everything on a GPU with 24 GB of VRAM
quantize_(transformer, int8_weight_only())
quantize_(text_encoder, int8_weight_only())
quantize_(text_encoder_2, int8_weight_only())
quantize_(vae, int8_weight_only())

pipe.to("cuda")
torch.cuda.empty_cache()
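
# A possible alternative to quantization (an assumption, not part of the
# original recipe): diffusers' CPU offloading also reduces peak VRAM, at the
# cost of slower inference. If you try it, skip the quantize_ calls and the
# pipe.to("cuda") line above and instead call:
#   pipe.enable_sequential_cpu_offload()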

# Fixed seed so results are reproducible.
generator = torch.Generator().manual_seed(12345)
output = pipe(
    prompt="a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background",
    width=1024,
    height=1024,
    num_inference_steps=6,
    num_images_per_prompt=1,
    generator=generator,
    guidance_scale=3.5,
)
image = output.images[0]
image.show()
```
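
To compare against the base FLUX.1-schnell model, the simplest route is to re-run the script with the two `load_lora_weights`/`fuse_lora` lines commented out, as noted in the code. The LoRA can also be removed in place; a minimal sketch, assuming the pipeline above is still in scope and that this runs *before* the `quantize_` calls (unfusing int8-quantized weights may not restore the base weights exactly):

```python
# Subtract the fused LoRA delta and drop the adapter, restoring base weights.
pipe.unfuse_lora()
pipe.unload_lora_weights()

# Re-run with the same seed so the two outputs are directly comparable.
generator = torch.Generator().manual_seed(12345)
base_image = pipe(
    prompt="a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background",
    width=1024,
    height=1024,
    num_inference_steps=6,
    generator=generator,
    guidance_scale=3.5,
).images[0]
base_image.show()
```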