Spaces:
Running
Running
Merge pull request #27 from alecrem/feature/issue-26/sequential-script
Browse files- .gitignore +1 -0
- list_scripts/1_download_mtgjson.sh +10 -0
- list_scripts/2_per_set_json_files.py +57 -0
- list_scripts/4_compile_from_legal_sets.py +30 -0
- list_scripts/5_remove_wrong_names.py +20 -0
- list_scripts/6_find_remaining_japanese_names.py +46 -0
- list_scripts/7_remove_banned_cards.py +51 -0
- output/middleschool.csv +0 -0
- output/middleschool.json +0 -0
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
data/*
|
|
|
|
1 |
data/*
|
2 |
+
list_scripts/3_separate_json_files_per_set.sh
|
list_scripts/1_download_mtgjson.sh
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Download the MTGJSON data and extract it into the `data` directory.
# Feel free to make the file available in any other way.

# Important: run this script from the parent directory
# (the root directory in this repository)

# The contents of `data` are git-ignored, so the directory may be missing;
# create it so the download cannot end up in the repository root
mkdir -p data
# Stop if the directory cannot be entered, rather than downloading elsewhere
cd data || exit 1
# -O overwrites an earlier download instead of saving a numbered copy
wget -O AllPrintings.json.bz2 "https://mtgjson.com/api/v5/AllPrintings.json.bz2" || exit 1
# -f replaces an AllPrintings.json left over from a previous run
bunzip2 -f AllPrintings.json.bz2 || exit 1
cd -
|
list_scripts/2_per_set_json_files.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Important: run this script from the parent directory
## (the root directory in this repository)
#
# python3 list_scripts/2_per_set_json_files.py

# The raw data is very large, so let's make JSON files for all relevant sets.
# This script only writes list_scripts/3_separate_json_files_per_set.sh,
# a shell script holding one jq command per set plus a final merge command.
# Note: this can take a couple minutes to run

setlist = [
    "4ED",
    "ICE",
    "CHR",
    "HML",
    "ALL",
    "MIR",
    "VIS",
    "5ED",
    "WTH",
    "POR",
    "TMP",
    "STH",
    "EXO",
    "P02",
    "USG",
    "ULG",
    "6ED",
    "UDS",
    "PTK",
    "S99",
    "MMQ",
    "NEM",
    "PCY",
    "S00",
    "INV",
    "PLS",
    "7ED",
    "APC",
    "ODY",
    "TOR",
    "JUD",
    "ONS",
    "LGN",
    "SCG",
    "PDRC",
    "PHPR",
    "ATH",
    "BRB",
    "BTD",
    "DKM",
]

# Write a separate JSON document for each Middle School legal set.
# The loop variable is named set_code so the builtin `set` is not shadowed.
commands = [
    f"cat data/AllPrintings.json | jq '.data.\"{set_code}\".cards' > data/set_{set_code}.json"
    for set_code in setlist
]
# Last command: merge every per-set document into a single JSON array
commands.append("jq -s add data/set_* > data/middleschool.json")

with open("list_scripts/3_separate_json_files_per_set.sh", "w") as f:
    f.writelines(command + "\n" for command in commands)
|
list_scripts/4_compile_from_legal_sets.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
# The card data contains Japanese text, so read it as UTF-8 explicitly
# instead of relying on the platform default encoding
with open("data/middleschool.json", encoding="utf-8") as json_data:
    cards = json.load(json_data)

# Collect one record per card entry, then build the DataFrame in a single
# step; concatenating a one-row DataFrame per card copies the whole table
# on every iteration
records = []
for card in cards:
    # Some cards do not have a Japanese name (or any foreign data at all)
    name_ja = next(
        (
            foreign["name"]
            for foreign in card.get("foreignData", [])
            if foreign["language"] == "Japanese"
        ),
        None,
    )
    records.append(
        {
            "oracle_id": card["identifiers"]["scryfallOracleId"],
            "name": card["name"],
            "name_ja": name_ja,
        }
    )

# Create a pandas DataFrame with all cards from all legal sets
column_names = ["oracle_id", "name", "name_ja"]
middleschool_df = pd.DataFrame(records, columns=column_names)

# For cards with multiple occurrences, put the rows that have the Japanese name on top
# (missing values are sorted last)
middleschool_df = middleschool_df.sort_values(
    by=["name", "name_ja"], na_position="last"
)
# For cards with multiple occurrences, delete all rows except for the top one
middleschool_df = middleschool_df.drop_duplicates(subset=["oracle_id"])

# Write a CSV file
middleschool_df.to_csv("data/middleschool_all_sets.csv")
|
list_scripts/5_remove_wrong_names.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
# Remove Japanese card names that are wrong on MTGJSON
wrongnames = [
    "Aether Barrier",
    "Aether Burst",
    "Aether Charge",
    "Aether Flash",
    "Aether Mutation",
    "Aether Sting",
    "Aether Storm",
    "Aether Tide",
    "Tainted Aether",
    "Tar Pit Warrior",
]

middleschool_df = pd.read_csv("data/middleschool_all_sets.csv")

# Blank out the Japanese name of every listed card so the wrong value is
# not carried forward; the card itself stays in the list
middleschool_df.loc[middleschool_df["name"].isin(wrongnames), "name_ja"] = None

# Write a CSV file
middleschool_df.to_csv("data/middleschool_all_sets_removed_wrong_names.csv")
|
list_scripts/6_find_remaining_japanese_names.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import time
|
3 |
+
from requests_html import HTMLSession
|
4 |
+
|
5 |
+
# Card list written by 5_remove_wrong_names.py
middleschool_df = pd.read_csv("data/middleschool_all_sets_removed_wrong_names.csv")

# Single HTTP session used by find_japanese_name for every lookup
session = HTMLSession()
|
8 |
+
|
9 |
+
|
10 |
+
def find_japanese_name(name):
    """Look up a card's Japanese name on the Wisdom Guild card pages.

    Fetches the page for the English ``name`` with the module-level
    ``session`` and parses its <title>, where the Japanese name is expected
    to come before the English one.

    Args:
        name: English card name; used to build the URL and to locate the
            end of the Japanese name inside the page title.

    Returns:
        The Japanese name taken from the title, or None when the page has
        no <title> or the title holds no Japanese name.
    """
    url = "http://whisper.wisdom-guild.net/card/" + name + "/"
    r = session.get(url)
    # first=True yields None when nothing matches, so a page without a
    # <title> is reported as "no Japanese name" instead of raising IndexError
    title_element = r.html.find("title", first=True)
    if title_element is None:
        print(f"{name} ->")
        return None
    title = title_element.text
    # Find the position of the English card name within the title
    idx = title.find(name)
    # The Japanese name should be before the English name,
    # so if idx is 0, there is no Japanese name
    if idx == 0:
        print(f"{name} ->")
        return None
    if idx == -1:
        # If the exact English card name can't be found, we look for a '/'
        idx = title.find("/")
        # No '/' means no Japanese name
        if idx == -1:
            return None
        # Everything before the '/' is the Japanese name
        name_ja = title[:idx]
    else:
        # Drop the single separator character right before the English name
        name_ja = title[: idx - 1]
    print(f"{name} -> {name_ja}")
    return name_ja
|
35 |
+
|
36 |
+
|
37 |
+
# Cards whose Japanese name is still missing
english_only_cards = middleschool_df[middleschool_df["name_ja"].isnull()]
# Look each distinct name up only once; the assignment below already fills
# every row that shares the name
name_list = english_only_cards["name"].drop_duplicates().to_list()
for name in name_list:
    middleschool_df.loc[
        middleschool_df["name"] == name, "name_ja"
    ] = find_japanese_name(name)
    # Pause between requests (presumably to go easy on the remote site)
    time.sleep(1)

# Write a CSV file
middleschool_df.to_csv("data/middleschool_all_sets_added_japanese_names.csv")
|
list_scripts/7_remove_banned_cards.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
# Remove cards that are banned in the format
banlist = [
    "Amulet of Quoz",
    "Balance",
    "Brainstorm",
    "Bronze Tablet",
    "Channel",
    "Dark Ritual",
    "Demonic Consultation",
    "Flash",
    "Goblin Recruiter",
    "Imperial Seal",
    "Jeweled Bird",
    "Mana Crypt",
    "Mana Vault",
    "Memory Jar",
    "Mind's Desire",
    "Mind Twist",
    "Rebirth",
    "Strip Mine",
    "Tempest Efreet",
    "Timmerian Fiends",
    "Tolarian Academy",
    "Vampiric Tutor",
    "Windfall",
    "Yawgmoth's Bargain",
    "Yawgmoth's Will",
]

middleschool_df = pd.read_csv("data/middleschool_all_sets_added_japanese_names.csv")

print("Cards legal by set:", middleschool_df.shape[0])
# Flag the rows whose card name is on the banlist
is_banned = middleschool_df["name"].isin(banlist)
banned_df = middleschool_df[is_banned]
print("Banned cards:", banned_df.shape[0])
# Keep only the rows that are not banned. Filtering with the mask removes
# exactly the banned rows and leaves every other row untouched, in order.
middleschool_df = middleschool_df[~is_banned]
print("Cards legal by set and not banned:", middleschool_df.shape[0])
middleschool_df = middleschool_df.reset_index(drop=True)
# Drop any extra columns picked up from earlier CSV round trips
middleschool_df = middleschool_df[["oracle_id", "name", "name_ja"]]
middleschool_df = middleschool_df.sort_values(by=["name", "name_ja"])

# Write a CSV file and a JSON file with the final card list
middleschool_df.to_csv("output/middleschool.csv")
middleschool_df.to_json("output/middleschool.json")
|
output/middleschool.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
output/middleschool.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|