Alejandro Cremades committed on
Commit
f645f21
2 Parent(s): b2d9d69 17dcef2

Merge pull request #27 from alecrem/feature/issue-26/sequential-script

Browse files
.gitignore CHANGED
@@ -1 +1,2 @@
1
  data/*
 
 
1
  data/*
2
+ list_scripts/3_separate_json_files_per_set.sh
list_scripts/1_download_mtgjson.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Download the MTGJSON data and extract it into the `data` directory.
# Feel free to make the file available in any other way.
#
# Important: run this script from the parent directory
# (the root directory of this repository).

# The steps run inside a subshell so the caller's working directory is never
# changed, and they are chained with `&&` so that a failed step (for example
# a failed download) stops the sequence instead of letting the next command
# run against the wrong directory or a missing file.
(
  mkdir -p data &&
    cd data &&
    wget "https://mtgjson.com/api/v5/AllPrintings.json.bz2" &&
    bunzip2 AllPrintings.json.bz2
)
list_scripts/2_per_set_json_files.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
## Important: run this script from the parent directory
## (the root directory in this repository)
#
# python3 list_scripts/2_per_set_json_files.py

# The raw data is very large, so this script generates a shell script that
# makes JSON files for all relevant sets.
# Note: processing the raw data can take a couple of minutes.

# Set codes whose cards are extracted from AllPrintings.json
setlist = [
    "4ED",
    "ICE",
    "CHR",
    "HML",
    "ALL",
    "MIR",
    "VIS",
    "5ED",
    "WTH",
    "POR",
    "TMP",
    "STH",
    "EXO",
    "P02",
    "USG",
    "ULG",
    "6ED",
    "UDS",
    "PTK",
    "S99",
    "MMQ",
    "NEM",
    "PCY",
    "S00",
    "INV",
    "PLS",
    "7ED",
    "APC",
    "ODY",
    "TOR",
    "JUD",
    "ONS",
    "LGN",
    "SCG",
    "PDRC",
    "PHPR",
    "ATH",
    "BRB",
    "BTD",
    "DKM",
]


def build_shell_lines(set_codes):
    """Return the shell commands of the generated script, one per list item.

    For every code in *set_codes* there is one command that writes a separate
    JSON document for that Middle School legal set, followed by a single final
    command that merges all per-set files into ``data/middleschool.json``.
    """
    # `code` rather than `set`, so the builtin `set` is not shadowed
    lines = [
        f"cat data/AllPrintings.json | jq '.data.\"{code}\".cards' > data/set_{code}.json"
        for code in set_codes
    ]
    lines.append("jq -s add data/set_* > data/middleschool.json")
    return lines


def main():
    """Write the generated commands to the step 3 shell script."""
    with open("list_scripts/3_separate_json_files_per_set.sh", "w") as f:
        f.writelines(line + "\n" for line in build_shell_lines(setlist))


if __name__ == "__main__":
    main()
list_scripts/4_compile_from_legal_sets.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+
4
# Columns of the DataFrame (and CSV file) holding all cards from all legal sets
column_names = ["oracle_id", "name", "name_ja"]


def card_to_row(card):
    """Return the ``oracle_id`` / ``name`` / ``name_ja`` row for one card.

    Reads ``card["identifiers"]["scryfallOracleId"]``, ``card["name"]`` and
    the first entry of ``card["foreignData"]`` whose ``"language"`` is
    ``"Japanese"``. Some cards do not have a Japanese name; ``name_ja`` is
    ``None`` for those.
    """
    name_ja = next(
        (
            entry["name"]
            for entry in card["foreignData"]
            if entry["language"] == "Japanese"
        ),
        None,
    )
    return {
        "oracle_id": card["identifiers"]["scryfallOracleId"],
        "name": card["name"],
        "name_ja": name_ja,
    }


def build_card_table(cards):
    """Return a DataFrame with one row per distinct ``oracle_id`` in *cards*.

    All rows are collected first and the DataFrame is built once; growing it
    with ``pd.concat`` inside the loop copies the whole table for every card.
    """
    table = pd.DataFrame([card_to_row(card) for card in cards], columns=column_names)
    # For cards with multiple occurrences, put the rows that have the Japanese
    # name on top (missing values sort last)
    table = table.sort_values(by=["name", "name_ja"])
    # For cards with multiple occurrences, delete all rows except for the top one
    return table.drop_duplicates(subset=["oracle_id"])


def main():
    """Read ``data/middleschool.json`` and write the all-sets CSV file."""
    # The file contains Japanese text, so do not rely on the platform's
    # default encoding
    with open("data/middleschool.json", encoding="utf-8") as json_data:
        cards = json.load(json_data)

    middleschool_df = build_card_table(cards)

    # Write a CSV file. The index is still written as the leading unnamed
    # column, so the column layout of the file is unchanged.
    middleschool_df.to_csv("data/middleschool_all_sets.csv")


if __name__ == "__main__":
    main()
list_scripts/5_remove_wrong_names.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
# Remove Japanese card names that are wrong on MTGJSON
wrongnames = [
    "Aether Barrier",
    "Aether Burst",
    "Aether Charge",
    "Aether Flash",
    "Aether Mutation",
    "Aether Sting",
    "Aether Storm",
    "Aether Tide",
    "Tainted Aether",
    "Tar Pit Warrior",
]


def remove_wrong_names(cards_df, names=None):
    """Return a copy of *cards_df* with ``name_ja`` blanked for *names*.

    *names* defaults to the module-level ``wrongnames`` list. Rows are matched
    on the English ``name`` column; only their ``name_ja`` value is cleared,
    the rows themselves are kept. The input DataFrame is not modified.
    """
    if names is None:
        names = wrongnames
    cleaned = cards_df.copy()
    cleaned.loc[cleaned["name"].isin(names), "name_ja"] = None
    return cleaned


def main():
    """Read the all-sets CSV, clear the wrong Japanese names, write the result."""
    middleschool_df = pd.read_csv("data/middleschool_all_sets.csv")

    # Previously the list above was never applied and the CSV file was
    # copied through unchanged
    middleschool_df = remove_wrong_names(middleschool_df)

    # Write a CSV file
    middleschool_df.to_csv("data/middleschool_all_sets_removed_wrong_names.csv")


if __name__ == "__main__":
    main()
list_scripts/6_find_remaining_japanese_names.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import time
3
+ from requests_html import HTMLSession
4
+
5
# Card table written by the previous step of the pipeline
middleschool_df = pd.read_csv(
    "data/middleschool_all_sets_removed_wrong_names.csv"
)

# A single HTML session shared by every page request made by this script
session = HTMLSession()
8
+
9
+
10
def extract_japanese_name(title, name):
    """Return the Japanese card name found in a page *title*, or ``None``.

    The Japanese name is expected to come before the English *name* in the
    title, with a single separator character between the two. When the exact
    English name does not occur in the title, everything before the first
    ``/`` is used instead.

    ``None`` is returned when the title starts with the English name, when
    neither the English name nor a ``/`` is present, or when the text in
    front of the separator is empty.
    """
    idx = title.find(name)
    if idx == 0:
        # The title starts with the English name: there is no Japanese name
        return None
    if idx == -1:
        # If the exact English card name can't be found, we look for a '/'
        idx = title.find("/")
        if idx == -1:
            # No '/' means no Japanese name
            return None
        name_ja = title[:idx]
    else:
        # Drop the separator character right before the English name
        name_ja = title[: idx - 1]
    # An empty slice is not a name
    return name_ja or None


def find_japanese_name(name):
    """Look up the Japanese name of the card called *name*.

    Requests ``http://whisper.wisdom-guild.net/card/<name>/`` through the
    module-level ``session`` and parses the text of the page's ``<title>``
    element. Prints ``<name> -> <japanese name>`` (or ``<name> ->`` when
    nothing was found) and returns the Japanese name, or ``None``.
    """
    url = "http://whisper.wisdom-guild.net/card/" + name + "/"
    r = session.get(url)
    # Find the text on the <title> element in the HTML document; a page
    # without a <title> is treated as "no Japanese name" instead of raising
    # IndexError
    titles = r.html.find("title")
    name_ja = extract_japanese_name(titles[0].text, name) if titles else None
    if name_ja is None:
        print(f"{name} ->")
    else:
        print(f"{name} -> {name_ja}")
    return name_ja
35
+
36
+
37
# Cards that still have no Japanese name
english_only_cards = middleschool_df[middleschool_df["name_ja"].isnull()]
# Look every distinct English name up once, in order of first appearance
name_list = english_only_cards["name"].unique().tolist()
for name in name_list:
    # Fill in every row that carries this English name
    middleschool_df.loc[
        middleschool_df["name"] == name, "name_ja"
    ] = find_japanese_name(name)
    # Wait a second before the next request (presumably to go easy on the
    # remote site)
    time.sleep(1)

# Write a CSV file
middleschool_df.to_csv("data/middleschool_all_sets_added_japanese_names.csv")
list_scripts/7_remove_banned_cards.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
# Remove cards that are banned in the format
banlist = [
    "Amulet of Quoz",
    "Balance",
    "Brainstorm",
    "Bronze Tablet",
    "Channel",
    "Dark Ritual",
    "Demonic Consultation",
    "Flash",
    "Goblin Recruiter",
    "Imperial Seal",
    "Jeweled Bird",
    "Mana Crypt",
    "Mana Vault",
    "Memory Jar",
    "Mind's Desire",
    "Mind Twist",
    "Rebirth",
    "Strip Mine",
    "Tempest Efreet",
    "Timmerian Fiends",
    "Tolarian Academy",
    "Vampiric Tutor",
    "Windfall",
    "Yawgmoth's Bargain",
    "Yawgmoth's Will",
]


def remove_banned(cards_df, banned_names=None):
    """Return the rows of *cards_df* whose ``name`` is not banned.

    *banned_names* defaults to the module-level ``banlist``. The result has a
    fresh 0-based index (assigned before sorting), only the ``oracle_id``,
    ``name`` and ``name_ja`` columns, and is sorted by ``name`` and then
    ``name_ja``. The input DataFrame is not modified.

    Rows are filtered directly with a boolean mask. The earlier approach
    (append the banned rows, then drop every row that appears twice) would
    also have dropped legal rows that happened to be exact duplicates.
    """
    if banned_names is None:
        banned_names = banlist
    is_banned = cards_df["name"].isin(banned_names)
    legal_df = cards_df[~is_banned].reset_index(drop=True)
    legal_df = legal_df[["oracle_id", "name", "name_ja"]]
    return legal_df.sort_values(by=["name", "name_ja"])


def main():
    """Read the card table, drop the banned cards and write the final files."""
    middleschool_df = pd.read_csv("data/middleschool_all_sets_added_japanese_names.csv")

    print("Cards legal by set:", middleschool_df.shape[0])
    # Count the rows with the banned cards
    banned_count = int(middleschool_df["name"].isin(banlist).sum())
    print("Banned cards:", banned_count)
    middleschool_df = remove_banned(middleschool_df)
    print("Cards legal by set and not banned:", middleschool_df.shape[0])

    # Write a CSV file and a JSON file
    middleschool_df.to_csv("output/middleschool.csv")
    middleschool_df.to_json("output/middleschool.json")


if __name__ == "__main__":
    main()
output/middleschool.csv CHANGED
The diff for this file is too large to render. See raw diff
 
output/middleschool.json CHANGED
The diff for this file is too large to render. See raw diff