Anthony Miyaguchi committed on
Commit
8e899a8
1 Parent(s): 2759b86

Use mode, updated model, and larger batch size

Browse files
Files changed (3) hide show
  1. fetch_model.sh +1 -1
  2. last.ckpt +2 -2
  3. script.py +15 -5
fetch_model.sh CHANGED
@@ -1,4 +1,4 @@
1
  #!/usr/bin/bash
2
 
3
- path=gs://dsgt-clef-snakeclef-2024/models/torch-petastorm-v1-cls-token/checkpoints/last.ckpt
4
  gcloud storage cp $path .
 
1
  #!/usr/bin/bash
2
 
3
+ path=gs://dsgt-clef-snakeclef-2024/models/torch-petastorm-v2-cls-token/checkpoints/last.ckpt
4
  gcloud storage cp $path .
last.ckpt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28eae8eefd438883dd21903deee85ecb5355ade592d207cbdd6b946e83ec8da0
3
- size 16145720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93e5258ca3f1b82cd84882e435a064c463cc4e69e015b4ea91be23f7f43195b9
3
+ size 16145784
script.py CHANGED
@@ -10,6 +10,7 @@ from PIL import Image
10
  from torch import nn
11
  from torch.utils.data import DataLoader, Dataset
12
  from transformers import AutoImageProcessor, AutoModel
 
13
 
14
 
15
  class ImageDataset(Dataset):
@@ -61,19 +62,28 @@ def make_submission(
61
  hparams = checkpoint["hyper_parameters"]
62
  model = LinearClassifier(hparams["num_features"], hparams["num_classes"])
63
  model.load_state_dict(checkpoint["state_dict"])
 
 
64
 
65
  dataloader = DataLoader(
66
- ImageDataset(test_metadata, images_root_path), batch_size=32
67
  )
68
  rows = []
69
- for batch in dataloader:
70
- observation_ids = batch["observation_id"]
71
- logits = model(batch["features"])
72
  class_ids = torch.argmax(logits, dim=1)
73
  for observation_id, class_id in zip(observation_ids, class_ids):
74
  row = {"observation_id": int(observation_id), "class_id": int(class_id)}
75
  rows.append(row)
76
- submission_df = pd.DataFrame(rows).drop_duplicates("observation_id", keep="first")
 
 
 
 
 
 
 
77
  submission_df.to_csv(output_csv_path, index=False)
78
 
79
 
 
10
  from torch import nn
11
  from torch.utils.data import DataLoader, Dataset
12
  from transformers import AutoImageProcessor, AutoModel
13
+ import tqdm
14
 
15
 
16
  class ImageDataset(Dataset):
 
62
  hparams = checkpoint["hyper_parameters"]
63
  model = LinearClassifier(hparams["num_features"], hparams["num_classes"])
64
  model.load_state_dict(checkpoint["state_dict"])
65
+ # to gpu
66
+ model = model.cuda()
67
 
68
  dataloader = DataLoader(
69
+ ImageDataset(test_metadata, images_root_path), batch_size=250
70
  )
71
  rows = []
72
+ for batch in tqdm.tqdm(dataloader):
73
+ observation_ids = batch["observation_id"].cuda()
74
+ logits = model(batch["features"].cuda())
75
  class_ids = torch.argmax(logits, dim=1)
76
  for observation_id, class_id in zip(observation_ids, class_ids):
77
  row = {"observation_id": int(observation_id), "class_id": int(class_id)}
78
  rows.append(row)
79
+ # group by observation take the mode of the class_id
80
+ # make sure to keep the observation id
81
+ submission_df = (
82
+ pd.DataFrame(rows)
83
+ .groupby("observation_id")
84
+ .agg(lambda x: pd.Series.mode(x)[0])
85
+ .reset_index()
86
+ )
87
  submission_df.to_csv(output_csv_path, index=False)
88
 
89