# politics/process_bignews.py
# (Hugging Face blob-view chrome removed from this header: uploader
# "jacobthebanana", commit "Saving weights and logs of step 143142",
# revision cc8e143, ~3.27 kB.)
import argparse
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

import ujson as json
from tqdm.auto import tqdm
# A single news article record keyed by field name (at least "source" and
# "text" are read below). NOTE(review): "text" is later passed to
# " ".join(...), which suggests it is actually a list of strings rather
# than a str — confirm against the corpus and tighten this alias.
Entry = Dict[str, str]
# The corpus: a list of events, each event being the list of articles
# covering it.
BigNews = List[List[Entry]]
# CLI: two positional arguments (input corpus path, output path prefix) and
# an optional --ratio in (0, 1] limiting processing to the first fraction
# of events.
parser = argparse.ArgumentParser()
parser.add_argument("big_news_path")
parser.add_argument("output_prefix")
parser.add_argument("--ratio", type=float, default=1.0, required=False)
args = parser.parse_args()
big_news_path: str = args.big_news_path
output_prefix: str = args.output_prefix
ratio: float = args.ratio
print("Loading input file.")
# Loads the entire corpus into memory at once; ujson is used for speed.
with open(big_news_path, "r") as big_news_file:
    big_news: BigNews = json.load(big_news_file)
# Political side -> outlets publishing on that side.
side_map_transposed = {
    "L": ["dailykos", "hpo", "cnn", "wpo", "nyt"],
    "R": ["wat", "fox", "breitbart"],
}
# Invert into outlet -> side for O(1) lookup per article.
side_map = {
    outlet: side
    for side, outlets in side_map_transposed.items()
    for outlet in outlets
}
def get_entry_side(entry: Entry) -> Optional[str]:
    """Return the political side ("L" or "R") of an article's outlet.

    Args:
        entry: Article record; its "source" field is the outlet key.

    Returns:
        "L" or "R" from side_map, or None for outlets not listed there.
        (The original annotation claimed str, but dict.get returns None on
        a miss — callers rely on that falsy result to skip unknown outlets.)
    """
    return side_map.get(entry["source"])
# Flattened article texts, one newline-terminated string per article.
big_news_flattened: List[str] = []
# Pair records (label, index_a, index_b) into big_news_flattened, where
# label is +1 for same-side (positive) pairs and -1 for cross-side
# (negative) pairs. The elements are 3-tuples, so the annotation is
# Tuple[int, int, int] (the original Tuple[int, int] was wrong).
# The never-used big_news_event_side_lookup_keys list was removed.
event_lookup_info: List[Tuple[int, int, int]] = []
# Running index of the next article in big_news_flattened.
entry_index = 0
# Optionally restrict processing to the first `ratio` fraction of events.
num_events = int(len(big_news) * ratio) if ratio < 1.0 else len(big_news)
for event in tqdm(big_news[:num_events], desc="Flattening"):
    event: List[Entry]
    # side label -> flattened indices of this event's articles on that side.
    index_by_sides = defaultdict(list)
    for entry in event:
        # NOTE(review): assumes entry["text"] is a list of strings
        # (sentences/tokens); if it were a plain str this would space-join
        # individual characters — confirm against the corpus format.
        # Append directly instead of the original append("")-then-assign.
        big_news_flattened.append(" ".join(entry["text"]) + "\n")
        entry_side = get_entry_side(entry)
        if entry_side:
            index_by_sides[entry_side].append(entry_index)
        entry_index += 1

    # Sort so output is deterministic: the original iterated a set, whose
    # order depends on the process hash seed. The separate `sides` set was
    # redundant with index_by_sides' keys and was dropped.
    sides = sorted(index_by_sides)
    num_sides = len(sides)
    for side_a_index in range(num_sides):
        side_a = sides[side_a_index]
        side_a_indices = index_by_sides[side_a]

        # Maximize distance over negative (non-matching) examples:
        # every cross-side pair of articles in this event.
        for side_b_index in range(side_a_index + 1, num_sides):
            for side_b_entry_index in index_by_sides[sides[side_b_index]]:
                for side_a_entry_index in side_a_indices:
                    event_lookup_info.append(
                        (-1, side_a_entry_index, side_b_entry_index)
                    )

        # Minimize distance over positive (matching) examples: every
        # same-side pair, emitted exactly once per side. (As written, the
        # original assigned side_a inside the side_b loop but consumed it
        # in this section, which could skip or duplicate a side's
        # positives depending on indentation lost in extraction.)
        num_side_a_indices = len(side_a_indices)
        for x_index in range(num_side_a_indices):
            for y_index in range(x_index + 1, num_side_a_indices):
                event_lookup_info.append(
                    (1, side_a_indices[x_index], side_a_indices[y_index])
                )
# Release the large parsed corpus before writing the outputs.
del big_news

# Output paths derived from the user-supplied prefix.
flattened_text_path = output_prefix + "_text.txt"
lookup_path = output_prefix + "_lookup.json"

print("Writing flattened text.")
# Each flattened entry already ends in "\n", so a plain concatenation
# yields one article per line.
with open(flattened_text_path, "w") as text_file:
    text_file.write("".join(big_news_flattened))

print("Writing lookup json.")
with open(lookup_path, "w") as lookup_file:
    json.dump(event_lookup_info, lookup_file, indent=2)