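"""Prepare the fathomnet-out-of-sample-detection Kaggle competition as a benchmark task:
download the competition files, fetch the images, carve a hidden test split out of the
official train data, and rewrite the COCO-style train/eval jsons to match."""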
import subprocess
import pandas as pd
import json
taskname = "fathomnet-out-of-sample-detection"
download_dir = f"benchmarks/{taskname}/env"
input(f"Consent to the competition at https://www.kaggle.com/competitions/{taskname}/data; Press any key after you have accepted the rules online.")
subprocess.run(["kaggle", "competitions", "download", "-c", taskname], cwd=download_dir)
subprocess.run(["unzip", "-n", f"{taskname}.zip"], cwd=download_dir)
subprocess.run(["rm", f"{taskname}.zip"], cwd=download_dir)
### download images
input("""Download the images (this is a large download) by running:
conda create -n fgvc_test python=3.9 pip
conda activate fgvc_test
pip install -r requirements.txt
python download_images.py ../env/object_detection/train.json --outpath ../env/images
Press Enter when done.""")
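# remove the download helper files so they don't end up in the benchmark env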
subprocess.run(["rm", "download_images.py"], cwd=download_dir)
subprocess.run(["rm", "demo_download.ipynb"], cwd=download_dir)
subprocess.run(["rm", "requirements.txt"], cwd=download_dir)
## split train into train and test in env
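# shuffle with a fixed seed, keep the first 98% as train and hold out the last 2% as test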
trainset = pd.read_csv(f"{download_dir}/multilabel_classification/train.csv")
trainset = trainset.sample(frac=1, random_state=42)
trainset = trainset.reset_index(drop=True)
trainset.iloc[:int(len(trainset)*0.98)].to_csv(f"{download_dir}/multilabel_classification/train.csv", index=False)
testset = trainset.iloc[int(len(trainset)*0.98):]
# save the held-out split (ids and labels) as the answer key, outside env
testset.to_csv("answer.csv", index=False)
# split train json
with open(f"{download_dir}/object_detection/train.json") as f:
    orig_train_json = json.load(f)
with open(f"{download_dir}/object_detection/eval.json") as f:
    test_json = json.load(f)
# split train_json according to trainset / testset membership;
# file_name carries a 4-character extension (e.g. ".png"), so [:-4] recovers the id used in the csv
test_ids = set(testset["id"])
train_json = orig_train_json.copy()
train_json["images"] = [x for x in orig_train_json["images"] if x["file_name"][:-4] not in test_ids]
image_ids = {x["id"] for x in train_json["images"]}
train_json["annotations"] = [x for x in orig_train_json["annotations"] if x["image_id"] in image_ids]
test_json["images"] = [x for x in orig_train_json["images"] if x["file_name"][:-4] in test_ids]
# relabel image ids to be consecutive; build the old->new map first and apply it
# in a second pass, since relabelling in place while iterating can double-relabel
# annotations when old and new ids overlap
id_map = {x["id"]: i + 1 for i, x in enumerate(train_json["images"])}
for x in train_json["images"]:
    x["id"] = id_map[x["id"]]
for y in train_json["annotations"]:
    y["image_id"] = id_map[y["image_id"]]
for i, x in enumerate(train_json["annotations"]):
    x["id"] = i + 1
for i, x in enumerate(test_json["images"]):
    x["id"] = i + 1
# write train_json and test_json back into env
with open(f"{download_dir}/object_detection/train.json", "w") as f:
    json.dump(train_json, f, indent=4)
with open(f"{download_dir}/object_detection/eval.json", "w") as f:
    json.dump(test_json, f, indent=4)