dsgt-kaggle-clef
/

dsgt-snakeclef

Anthony Miyaguchi committed on May 24

Commit

8e899a8

•

1 Parent(s): 2759b86

Use mode, updated model, and larger batch size

Files changed (3) hide show

fetch_model.sh CHANGED Viewed

@@ -1,4 +1,4 @@
 #!/usr/bin/bash
-path=gs://dsgt-clef-snakeclef-2024/models/torch-petastorm-v1-cls-token/checkpoints/last.ckpt
 gcloud storage cp $path .

 #!/usr/bin/bash
+path=gs://dsgt-clef-snakeclef-2024/models/torch-petastorm-v2-cls-token/checkpoints/last.ckpt
 gcloud storage cp $path .

last.ckpt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:28eae8eefd438883dd21903deee85ecb5355ade592d207cbdd6b946e83ec8da0
-size 16145720

 version https://git-lfs.github.com/spec/v1
+oid sha256:93e5258ca3f1b82cd84882e435a064c463cc4e69e015b4ea91be23f7f43195b9
+size 16145784

script.py CHANGED Viewed

@@ -10,6 +10,7 @@ from PIL import Image
 from torch import nn
 from torch.utils.data import DataLoader, Dataset
 from transformers import AutoImageProcessor, AutoModel
 class ImageDataset(Dataset):
@@ -61,19 +62,28 @@ def make_submission(
     hparams = checkpoint["hyper_parameters"]
     model = LinearClassifier(hparams["num_features"], hparams["num_classes"])
     model.load_state_dict(checkpoint["state_dict"])
     dataloader = DataLoader(
-        ImageDataset(test_metadata, images_root_path), batch_size=32
     )
     rows = []
-    for batch in dataloader:
-        observation_ids = batch["observation_id"]
-        logits = model(batch["features"])
         class_ids = torch.argmax(logits, dim=1)
         for observation_id, class_id in zip(observation_ids, class_ids):
             row = {"observation_id": int(observation_id), "class_id": int(class_id)}
             rows.append(row)
-    submission_df = pd.DataFrame(rows).drop_duplicates("observation_id", keep="first")
     submission_df.to_csv(output_csv_path, index=False)

 from torch import nn
 from torch.utils.data import DataLoader, Dataset
 from transformers import AutoImageProcessor, AutoModel
+import tqdm
 class ImageDataset(Dataset):
     hparams = checkpoint["hyper_parameters"]
     model = LinearClassifier(hparams["num_features"], hparams["num_classes"])
     model.load_state_dict(checkpoint["state_dict"])
+    # to gpu
+    model = model.cuda()
     dataloader = DataLoader(
+        ImageDataset(test_metadata, images_root_path), batch_size=250
     )
     rows = []
+    for batch in tqdm.tqdm(dataloader):
+        observation_ids = batch["observation_id"].cuda()
+        logits = model(batch["features"].cuda())
         class_ids = torch.argmax(logits, dim=1)
         for observation_id, class_id in zip(observation_ids, class_ids):
             row = {"observation_id": int(observation_id), "class_id": int(class_id)}
             rows.append(row)
+    # group by observation take the mode of the class_id
+    # make sure to keep the observation id
+    submission_df = (
+        pd.DataFrame(rows)
+        .groupby("observation_id")
+        .agg(lambda x: pd.Series.mode(x)[0])
+        .reset_index()
+    )
     submission_df.to_csv(output_csv_path, index=False)