Commit 6f21460 · 1 parent: 9586f4c

deploy mock

Files changed:
- .gitignore +3 -0
- README.md +1 -0
- app.py +138 -2
- models/__init__.py +0 -0
- models/diffusion.py +196 -0
- models/unet.py +187 -0
- requirements.txt +3 -0
- utils.py +39 -0
.gitignore
CHANGED
@@ -1,2 +1,5 @@
 .venv*
 __pycache__*
+
+# large files
+models/model.pth
README.md
CHANGED
@@ -5,6 +5,7 @@ colorFrom: red
 colorTo: purple
 sdk: streamlit
 sdk_version: 1.40.1
+python_version: 3.10.15
 app_file: app.py
 pinned: false
 ---
app.py
CHANGED
@@ -1,4 +1,140 @@
+import time
+import traceback
+
 import streamlit as st
 
-
-st.
+import torch
+from PIL import Image
+from streamlit_drawable_canvas import st_canvas
+
+from models.diffusion import Diffusion
+from utils import initialize_data_dir, save_image
+
+# Page configuration
+st.set_page_config(page_title="Handwritten Character Generation App", layout="wide")
+
+# Title
+st.title("Handwritten Character Generation App")
+
+# Description
+st.markdown(
+    """
+    In this app, draw each of the handwritten characters あ, い, う, え, and お five times.
+    Pressing the "Generate" button fine-tunes the model and displays the generated characters.
+    The learning rate, number of epochs, and optimizer can also be adjusted.
+    """
+)
+
+# Character list
+characters = ["あ", "い", "う", "え", "お"]
+num_samples = 5  # number of samples per character
+
+# Layout of the drawing areas
+st.header("Draw the handwritten characters")
+
+# Directory for saving the drawings
+data_dir = initialize_data_dir()
+
+# Create the drawing canvases
+for char in characters:
+    st.subheader(f"Draw the character '{char}' {num_samples} times")
+    cols = st.columns(num_samples)
+    for i in range(num_samples):
+        with cols[i]:
+            canvas = st_canvas(
+                fill_color="white",
+                stroke_width=3,
+                stroke_color="black",
+                background_color="white",
+                width=150,
+                height=150,
+                drawing_mode="freedraw",
+                key=f"{char}_{i}",
+            )
+            if canvas.image_data is not None:
+                img = Image.fromarray(
+                    canvas.image_data.astype("uint8"), "RGBA"
+                ).convert("L")
+                # Binarize
+                img = img.point(lambda x: 0 if x < 128 else 255, "1")
+                save_image(img, char, i, data_dir)
+
+# Hyperparameter inputs
+st.sidebar.header("Hyperparameter Settings")
+learning_rate = st.sidebar.number_input(
+    "Learning rate", min_value=0.0001, max_value=1.0, value=0.001, step=0.0001, format="%.4f"
+)
+epochs = st.sidebar.number_input(
+    "Epochs", min_value=1, max_value=100, value=10, step=1
+)
+optimizer_name = st.sidebar.selectbox("Optimizer", ["SGD", "Adam", "RMSprop"])
+
+
+# Sampling settings
+st.sidebar.header("Sampling Settings")
+noise_steps = st.sidebar.number_input(
+    "Noise steps", min_value=1, max_value=1000, value=1000, step=1
+)
+beta_start = st.sidebar.number_input(
+    "Initial beta", min_value=0.0, max_value=1.0, value=0.0001, step=0.0001, format="%.4f"
+)
+beta_end = st.sidebar.number_input(
+    "Final beta", min_value=0.0, max_value=1.0, value=0.02, step=0.0001, format="%.4f"
+)
+
+
+# Generate button
+if st.button("Generate"):
+    try:
+        # Device setup
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        # Instantiate the model (sampling settings come from the sidebar)
+        diffusion_model = Diffusion(
+            noise_steps=noise_steps,
+            beta_start=beta_start,
+            beta_end=beta_end,
+            img_size=32,
+            num_classes=46,
+            c_in=3,
+            c_out=3,
+            device=device,
+        )
+
+        save_path = "models/model.pth"
+
+        # Load the trained weights
+        diffusion_model.model.load_state_dict(
+            torch.load(save_path, weights_only=True, map_location=device)
+        )
+
+        # Generate and display the font images
+        def chunked(iterable, n):
+            for i in range(0, len(iterable), n):
+                yield iterable[i : i + n]
+
+        labels = list(range(46))
+        columns_per_row = 5
+        start_time = time.time()
+        with st.spinner("Generating font images..."):
+            labels_tensor = torch.tensor(labels).to(device)
+            font_image = diffusion_model.sample(
+                diffusion_model.model, labels_tensor
+            )
+        elapsed_time = time.time() - start_time
+        st.success(f"Font images generated successfully ({elapsed_time:.2f} s)")
+        for label_row in chunked(labels, columns_per_row):
+            cols = st.columns(columns_per_row)
+            for col, label in zip(cols, label_row):
+                col.image(
+                    font_image[label].permute(1, 2, 0).cpu().numpy(),
+                    caption=f"{label}",
+                    use_container_width=True,
+                )
+
+    except FileNotFoundError as e:
+        st.error(str(e))
+    except ValueError as e:
+        st.error(str(e))
+    except Exception as e:
+        st.error(f"An unexpected error occurred: {e}")
+        st.error(traceback.format_exc())
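Note: a minimal standalone sketch of the generation path that app.py exercises, runnable outside Streamlit. It assumes the checkpoint models/model.pth exists and matches the Diffusion defaults used above; it is an illustration, not part of this commit.

import torch
from models.diffusion import Diffusion

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
diffusion = Diffusion(img_size=32, num_classes=46, device=device)
# Load the weights the Space ships with (same path as in app.py).
diffusion.model.load_state_dict(
    torch.load("models/model.pth", weights_only=True, map_location=device)
)
labels = torch.tensor([0, 1, 2]).to(device)  # any class ids in [0, 46)
images = diffusion.sample(diffusion.model, labels)  # uint8 tensor, (3, 3, 32, 32)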
models/__init__.py
ADDED
(empty file)
models/diffusion.py
ADDED
@@ -0,0 +1,196 @@
+import numpy as np
+import torch
+from .unet import UNet_conditional
+
+
+class Diffusion:
+    def __init__(
+        self,
+        noise_steps: int = 1000,
+        beta_start: float = 1e-4,
+        beta_end: float = 0.02,
+        img_size: int = 32,
+        num_classes: int = 46,
+        c_in: int = 3,
+        c_out: int = 3,
+        device: torch.device = torch.device("cuda"),
+        time_dim: int = 256,
+        **kwargs,
+    ):
+
+        self.noise_steps = noise_steps
+        self.beta_start = beta_start
+        self.beta_end = beta_end
+        self.img_size = img_size
+        self.device = device
+        self.time_dim = time_dim
+        self.num_classes = num_classes
+        self.c_in = c_in
+        self.c_out = c_out
+        self.model = UNet_conditional(
+            c_in, c_out, time_dim, num_classes=num_classes, **kwargs
+        ).to(device)
+
+        self.beta = self.prepare_noise_schedule().to(device)
+        self.alpha = 1.0 - self.beta
+        self.alpha_hat = torch.cumprod(self.alpha, dim=0)
+
+    def __call__(self, x, t, labels):
+        return self.model(x, t, labels)
+
+    def prepare_noise_schedule(self) -> torch.Tensor:
+        """
+        Build the linear noise schedule.
+        """
+        return torch.linspace(self.beta_start, self.beta_end, self.noise_steps)
+
+    def noise_images(self, x: torch.Tensor, t: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Produce the noised images and the corresponding noise.
+        """
+        sqrt_alpha_hat = torch.sqrt(self.alpha_hat[t])[:, None, None, None]
+        sqrt_one_minus_alpha_hat = torch.sqrt(1 - self.alpha_hat[t])[:, None, None, None]
+        noise = torch.randn_like(x)
+        return sqrt_alpha_hat * x + sqrt_one_minus_alpha_hat * noise, noise
+
+    def sample_timesteps(self, n: int) -> torch.Tensor:
+        """
+        Sample random timesteps.
+        """
+        return torch.randint(low=1, high=self.noise_steps, size=(n,))
+
+    def sample(self, model: torch.nn.Module, labels: torch.Tensor) -> torch.Tensor:
+        """
+        Generate images for the given class labels.
+        """
+        self.model = model
+        n = len(labels)
+        print(f"Sampling {n} new images....")
+        model.eval()
+
+        with torch.no_grad():
+            x = torch.randn((n, self.c_in, self.img_size, self.img_size)).to(
+                self.device
+            )
+            for i in reversed(range(1, self.noise_steps)):
+                t = (torch.ones(n) * i).long().to(self.device)
+                predicted_noise = model(x, t, labels)
+                alpha = self.alpha[t][:, None, None, None]
+                alpha_hat = self.alpha_hat[t][:, None, None, None]
+                beta = self.beta[t][:, None, None, None]
+                if i > 1:
+                    noise = torch.randn_like(x)
+                else:
+                    noise = torch.zeros_like(x)
+                x = (
+                    1
+                    / torch.sqrt(alpha)
+                    * (
+                        x
+                        - ((1 - alpha) / (torch.sqrt(1 - alpha_hat))) * predicted_noise
+                    )
+                    + torch.sqrt(beta) * noise
+                )
+        model.train()
+        x = (x.clamp(-1, 1) + 1) / 2
+        x = (x * 255).type(torch.uint8)
+        return x
+
+    def fit(
+        self,
+        optimizer: torch.optim.Optimizer,
+        criterion: torch.nn.Module,
+        num_epochs: int,
+        train_loader: torch.utils.data.DataLoader,
+        test_loader: torch.utils.data.DataLoader,
+        model: torch.nn.Module,
+        device: torch.device,
+        history: np.ndarray,
+        save_path: str,
+    ) -> np.ndarray:
+        """
+        Train the model.
+        """
+        base_epochs = len(history)
+        # Initialize the minimum loss
+        min_test_loss = 9e9
+
+        for epoch in range(base_epochs, base_epochs + num_epochs):
+            # Accumulated loss per epoch (before averaging)
+            train_loss, test_loss = 0, 0
+            # Accumulated sample counts per epoch
+            n_train, n_test = 0, 0
+
+            # Training phase
+            self.model.train()
+            for x, labels in train_loader:
+                # Number of samples in this batch
+                train_batch_size = len(labels)
+                # Accumulated sample count for the epoch
+                n_train += train_batch_size
+
+                # Move to the device
+                x = x.to(device)
+                labels = labels.to(device)
+
+                # Sample noise timesteps
+                t = self.sample_timesteps(x.size(0)).to(device)
+                # Produce the noised images and the noise
+                xt, noise = self.noise_images(x, t)
+
+                # Reset the gradients
+                optimizer.zero_grad()
+                # Predict the noise
+                predicted_noise = model(xt, t, labels)
+                # Compute the loss
+                loss = criterion(predicted_noise, noise)
+                # Backpropagate
+                loss.backward()
+                # Update the parameters
+                optimizer.step()
+
+                # Accumulate the (pre-averaging) loss
+                train_loss += loss.item() * train_batch_size
+
+            # Evaluation phase
+            self.model.eval()
+            for x, labels in test_loader:
+                # Number of samples in this batch
+                test_batch_size = len(labels)
+                # Accumulated sample count for the epoch
+                n_test += test_batch_size
+
+                # Move to the device
+                x = x.to(device)
+                labels = labels.to(device)
+
+                # Sample noise timesteps
+                t = self.sample_timesteps(x.size(0)).to(device)
+                # Produce the noised images and the noise
+                xt, noise = self.noise_images(x, t)
+                # Predict the noise
+                predicted_noise = model(xt, t, labels)
+                # Compute the loss
+                loss = criterion(predicted_noise, noise)
+
+                # Accumulate the (pre-averaging) loss
+                test_loss += loss.item() * test_batch_size
+
+            # Average the losses
+            avg_train_loss = train_loss / n_train
+            avg_test_loss = test_loss / n_test
+
+            # Update the minimum loss and save the best model
+            if avg_test_loss < min_test_loss:
+                min_test_loss = avg_test_loss
+                torch.save(self.model.state_dict(), save_path)
+
+            # Report the results
+            print(
+                f"Epoch {epoch + 1}, Train loss: {avg_train_loss:.3f}, Test loss: {avg_test_loss:.3f}"
+            )
+            # Record the history
+            item = np.array([epoch + 1, avg_train_loss, avg_test_loss])
+            history = np.vstack([history, item])
+
+        return history
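Note: the sampling loop above is the standard DDPM reverse update, x_{t-1} = (1/sqrt(alpha_t)) * (x_t - ((1 - alpha_t)/sqrt(1 - alpha_hat_t)) * predicted_noise) + sqrt(beta_t) * z. Below is a hedged sketch of how fit() might be driven; the dummy tensors and the Adam/MSELoss choices are assumptions for illustration, not part of this commit.

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from models.diffusion import Diffusion

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
diffusion = Diffusion(img_size=32, num_classes=46, device=device)

# Dummy data: 8 RGB 32x32 images with class labels in [0, 46).
data = TensorDataset(torch.randn(8, 3, 32, 32), torch.randint(0, 46, (8,)))
loader = DataLoader(data, batch_size=4)

history = diffusion.fit(
    optimizer=torch.optim.Adam(diffusion.model.parameters(), lr=1e-3),
    criterion=torch.nn.MSELoss(),
    num_epochs=1,
    train_loader=loader,
    test_loader=loader,
    model=diffusion.model,
    device=device,
    history=np.empty((0, 3)),  # columns: epoch, train loss, test loss
    save_path="models/model.pth",
)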
models/unet.py
ADDED
@@ -0,0 +1,187 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def one_param(m: nn.Module) -> nn.Parameter:
+    """
+    Get the model's first parameter.
+    """
+    return next(iter(m.parameters()))
+
+
+class SelfAttention(nn.Module):
+    def __init__(self, channels):
+        super(SelfAttention, self).__init__()
+        self.channels = channels
+        self.mha = nn.MultiheadAttention(channels, 4, batch_first=True)
+        self.ln = nn.LayerNorm([channels])
+        self.ff_self = nn.Sequential(
+            nn.LayerNorm([channels]),
+            nn.Linear(channels, channels),
+            nn.GELU(),
+            nn.Linear(channels, channels),
+        )
+
+    def forward(self, x):
+        size = x.shape[-1]
+        x = x.view(-1, self.channels, size * size).swapaxes(1, 2)
+        x_ln = self.ln(x)
+        attention_value, _ = self.mha(x_ln, x_ln, x_ln)
+        attention_value = attention_value + x
+        attention_value = self.ff_self(attention_value) + attention_value
+        return attention_value.swapaxes(2, 1).view(-1, self.channels, size, size)
+
+
+class DoubleConv(nn.Module):
+    def __init__(self, in_channels, out_channels, mid_channels=None, residual=False):
+        super().__init__()
+        self.residual = residual
+        if not mid_channels:
+            mid_channels = out_channels
+        self.double_conv = nn.Sequential(
+            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
+            nn.GroupNorm(1, mid_channels),
+            nn.GELU(),
+            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
+            nn.GroupNorm(1, out_channels),
+        )
+
+    def forward(self, x):
+        if self.residual:
+            return F.gelu(x + self.double_conv(x))
+        else:
+            return self.double_conv(x)
+
+
+class Down(nn.Module):
+    def __init__(self, in_channels, out_channels, emb_dim=256):
+        super().__init__()
+        self.maxpool_conv = nn.Sequential(
+            nn.MaxPool2d(2),
+            DoubleConv(in_channels, in_channels, residual=True),
+            DoubleConv(in_channels, out_channels),
+        )
+
+        self.emb_layer = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(emb_dim, out_channels),
+        )
+
+    def forward(self, x, t):
+        x = self.maxpool_conv(x)
+        emb = self.emb_layer(t)[:, :, None, None].repeat(1, 1, x.shape[-2], x.shape[-1])
+        return x + emb
+
+
+class Up(nn.Module):
+    def __init__(self, in_channels, out_channels, emb_dim=256):
+        super().__init__()
+
+        self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
+        self.conv = nn.Sequential(
+            DoubleConv(in_channels, in_channels, residual=True),
+            DoubleConv(in_channels, out_channels, in_channels // 2),
+        )
+
+        self.emb_layer = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(emb_dim, out_channels),
+        )
+
+    def forward(self, x, skip_x, t):
+        x = self.up(x)
+        x = torch.cat([skip_x, x], dim=1)
+        x = self.conv(x)
+        emb = self.emb_layer(t)[:, :, None, None].repeat(1, 1, x.shape[-2], x.shape[-1])
+        return x + emb
+
+
+# U-Net definition
+class UNet(nn.Module):
+    def __init__(self, c_in=3, c_out=3, time_dim=256, remove_deep_conv=False):
+        super().__init__()
+        self.time_dim = time_dim
+        self.remove_deep_conv = remove_deep_conv
+        self.inc = DoubleConv(c_in, 64)
+        self.down1 = Down(64, 128)
+        self.sa1 = SelfAttention(128)
+        self.down2 = Down(128, 256)
+        self.sa2 = SelfAttention(256)
+        self.down3 = Down(256, 256)
+        self.sa3 = SelfAttention(256)
+
+        if remove_deep_conv:
+            self.bot1 = DoubleConv(256, 256)
+            self.bot3 = DoubleConv(256, 256)
+        else:
+            self.bot1 = DoubleConv(256, 512)
+            self.bot2 = DoubleConv(512, 512)
+            self.bot3 = DoubleConv(512, 256)
+
+        self.up1 = Up(512, 128)
+        self.sa4 = SelfAttention(128)
+        self.up2 = Up(256, 64)
+        self.sa5 = SelfAttention(64)
+        self.up3 = Up(128, 64)
+        self.sa6 = SelfAttention(64)
+        self.outc = nn.Conv2d(64, c_out, kernel_size=1)
+
+    def pos_encoding(self, t, channels):
+        inv_freq = 1.0 / (
+            10000
+            ** (
+                torch.arange(0, channels, 2, device=one_param(self).device).float()
+                / channels
+            )
+        )
+        pos_enc_a = torch.sin(t.repeat(1, channels // 2) * inv_freq)
+        pos_enc_b = torch.cos(t.repeat(1, channels // 2) * inv_freq)
+        pos_enc = torch.cat([pos_enc_a, pos_enc_b], dim=-1)
+        return pos_enc
+
+    def unet_forward(self, x, t):
+        x1 = self.inc(x)
+        x2 = self.down1(x1, t)
+        x2 = self.sa1(x2)
+        x3 = self.down2(x2, t)
+        x3 = self.sa2(x3)
+        x4 = self.down3(x3, t)
+        x4 = self.sa3(x4)
+
+        x4 = self.bot1(x4)
+        if not self.remove_deep_conv:
+            x4 = self.bot2(x4)
+        x4 = self.bot3(x4)
+
+        x = self.up1(x4, x3, t)
+        x = self.sa4(x)
+        x = self.up2(x, x2, t)
+        x = self.sa5(x)
+        x = self.up3(x, x1, t)
+        x = self.sa6(x)
+        output = self.outc(x)
+        return output
+
+    def forward(self, x, t):
+        t = t.unsqueeze(-1)
+        t = self.pos_encoding(t, self.time_dim)
+        return self.unet_forward(x, t)
+
+
+class UNet_conditional(UNet):
+    def __init__(self, c_in=3, c_out=3, time_dim=256, num_classes=46, **kwargs):
+        super().__init__(c_in, c_out, time_dim, **kwargs)
+        self.label_emb = nn.Embedding(num_classes, time_dim)
+
+    def forward(self, x, t, y):
+        """
+        Add the label embedding to the timestep embedding.
+        """
+        t = t.unsqueeze(-1)
+        t = self.pos_encoding(t, self.time_dim)
+
+        if y is not None:
+            t += self.label_emb(y)
+
+        return self.unet_forward(x, t)
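Note: a quick shape sanity check for the conditional U-Net, using the 32x32 RGB configuration the app assumes; the batch size and random inputs are arbitrary illustration values.

import torch
from models.unet import UNet_conditional

net = UNet_conditional(c_in=3, c_out=3, time_dim=256, num_classes=46)
x = torch.randn(2, 3, 32, 32)          # batch of two 32x32 RGB images
t = torch.randint(1, 1000, (2,))       # one timestep per image
y = torch.randint(0, 46, (2,))         # one class label per image
print(net(x, t, y).shape)              # torch.Size([2, 3, 32, 32])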
requirements.txt
CHANGED
@@ -1 +1,4 @@
 streamlit==1.40.1
+streamlit-drawable-canvas==0.9.3
+torch==2.5.1
+torchvision==0.20.1
utils.py
ADDED
@@ -0,0 +1,39 @@
+import os
+import tempfile
+from PIL import Image
+
+
+def initialize_data_dir() -> str:
+    if "data_dir" not in os.environ:
+        data_dir = tempfile.mkdtemp()
+        os.environ["data_dir"] = data_dir
+    return os.environ["data_dir"]
+
+
+def save_image(img, char, idx, data_dir):
+    char_dir = os.path.join(data_dir, char)
+    os.makedirs(char_dir, exist_ok=True)
+    img_path = os.path.join(char_dir, f"{idx}.png")
+    img.save(img_path)
+
+
+def load_images(characters, num_samples, data_dir, transform):
+    X = []
+    y = []
+    for label, char in enumerate(characters):
+        char_dir = os.path.join(data_dir, char)
+        if not os.path.exists(char_dir):
+            raise FileNotFoundError(
+                f"Images for character '{char}' are missing. Please draw all of the samples."
+            )
+        for i in range(num_samples):
+            img_path = os.path.join(char_dir, f"{i}.png")
+            if not os.path.exists(img_path):
+                raise FileNotFoundError(
+                    f"Sample {i + 1} for character '{char}' does not exist."
+                )
+            img = Image.open(img_path).convert("L")
+            img = transform(img)
+            X.append(img)
+            y.append(label)
+    return X, y
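Note: a hedged usage sketch for load_images. The torchvision transform is an assumption (app.py saves binarized 1-bit PNGs, so a resize plus ToTensor is one plausible preprocessing); torchvision itself is pinned in requirements.txt.

from torchvision import transforms
from utils import initialize_data_dir, load_images

data_dir = initialize_data_dir()
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
])
# Raises FileNotFoundError until all 5 x 5 samples have been drawn and saved.
X, y = load_images(["あ", "い", "う", "え", "お"], 5, data_dir, transform)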