"""Flatten a BigNews dump into a text file plus a JSON list of labelled pairs.

Input is a JSON file holding a list of events, each event being a list of
article entries with at least a ``"source"`` and a ``"text"`` field.

Two files are written:

* ``<output_prefix>_text.txt`` -- one line per article, in input order.  The
  0-based position of an article in this file is its "entry index".
  (Assumes article text contains no embedded newlines -- TODO confirm.)
* ``<output_prefix>_lookup.json`` -- a list of ``[label, index_a, index_b]``
  rows.  ``label`` is ``-1`` for two articles of the same event published by
  outlets on different sides, and ``1`` for two articles of the same event
  published by outlets on the same side.

Articles from outlets missing from ``side_map`` are still written to the text
file (so indices stay aligned) but take part in no pair.
"""
from typing import Dict, Iterable, List, Optional, Tuple
import argparse
from collections import defaultdict
from itertools import combinations, product

try:
    import ujson as json
except ImportError:  # ujson is only used as a drop-in for the stdlib module.
    import json

try:
    from tqdm.auto import tqdm
except ImportError:

    def tqdm(iterable, **_kwargs):
        """Stand-in used when tqdm is not installed: no progress bar."""
        return iterable


# NOTE(review): the "text" value is passed to " ".join(...), so it is
# presumably a list of strings rather than a single str -- confirm against
# the dataset before tightening this alias.
Entry = Dict[str, str]
BigNews = List[List[Entry]]
# (label, entry index a, entry index b)
LookupRow = Tuple[int, int, int]

side_map_transposed = {
    "L": ["dailykos", "hpo", "cnn", "wpo", "nyt"],
    "R": ["wat", "fox", "breitbart"],
}

# outlet name -> side label
side_map: Dict[str, str] = {
    outlet: side
    for side, outlets in side_map_transposed.items()
    for outlet in outlets
}


def get_entry_side(entry: Entry) -> Optional[str]:
    """Return the side label of the entry's outlet, or None if it is unknown.

    Raises:
        KeyError: if the entry has no ``"source"`` field.
    """
    return side_map.get(entry["source"])


def _pairs_for_event(index_by_sides: Dict[str, List[int]]) -> List[LookupRow]:
    """Build the labelled pairs for one event from its per-side entry indices."""
    rows: List[LookupRow] = []
    # Sorted so the output order does not depend on hash randomisation.
    sides = sorted(index_by_sides)
    for position, side_a in enumerate(sides):
        side_a_indices = index_by_sides[side_a]
        for side_b in sides[position + 1:]:
            # Maximize distance over negative (non-matching) examples
            rows.extend(
                (-1, index_a, index_b)
                for index_a, index_b in product(
                    side_a_indices, index_by_sides[side_b]
                )
            )
        # Minimize distance over positive (matching) examples.  This runs for
        # every side, including the last one, which has no later side to be
        # paired against.
        rows.extend(
            (1, index_x, index_y)
            for index_x, index_y in combinations(side_a_indices, 2)
        )
    return rows


def flatten_events(
    events: Iterable[List[Entry]],
) -> Tuple[List[str], List[LookupRow]]:
    """Flatten events into text lines and labelled index pairs.

    Args:
        events: iterable of events; each event is a list of entries.

    Returns:
        A ``(lines, rows)`` tuple.  ``lines[i]`` is the space-joined text of
        the i-th entry seen, terminated by a newline.  ``rows`` holds
        ``(label, index_a, index_b)`` tuples indexing into ``lines``; pairs
        never span two events.

    Raises:
        KeyError: if an entry lacks the ``"text"`` or ``"source"`` field.
    """
    lines: List[str] = []
    rows: List[LookupRow] = []
    for event in events:
        index_by_sides: Dict[str, List[int]] = defaultdict(list)
        for entry in event:
            entry_index = len(lines)
            lines.append(" ".join(entry["text"]) + "\n")
            entry_side = get_entry_side(entry)
            if entry_side:
                index_by_sides[entry_side].append(entry_index)
        rows.extend(_pairs_for_event(index_by_sides))
    return lines, rows


def main() -> None:
    """Parse the command line, flatten the input file and write both outputs."""
    parser = argparse.ArgumentParser()
    parser.add_argument("big_news_path")
    parser.add_argument("output_prefix")
    parser.add_argument("--ratio", type=float, default=1.0, required=False)
    args = parser.parse_args()

    big_news_path: str = args.big_news_path
    output_prefix: str = args.output_prefix
    ratio: float = args.ratio
    if ratio < 0:
        # A negative value would otherwise slice events off the tail.
        parser.error("--ratio must be non-negative")

    print("Loading input file.")
    with open(big_news_path, "r", encoding="utf-8") as big_news_file:
        big_news: BigNews = json.load(big_news_file)

    # Ratios of 1.0 or more keep every event; smaller ones keep a prefix.
    num_events = int(len(big_news) * ratio) if ratio < 1.0 else len(big_news)

    big_news_flattened, event_lookup_info = flatten_events(
        tqdm(big_news[:num_events], desc="Flattening")
    )
    # Release the parsed input before serialising the outputs.
    del big_news

    big_news_flattened_path = output_prefix + "_text.txt"
    lookup_json_path = output_prefix + "_lookup.json"

    print("Writing flattened text.")
    with open(
        big_news_flattened_path, "w", encoding="utf-8"
    ) as big_news_flattened_file:
        big_news_flattened_file.writelines(big_news_flattened)

    print("Writing lookup json.")
    with open(lookup_json_path, "w", encoding="utf-8") as lookup_json_file:
        json.dump(event_lookup_info, lookup_json_file, indent=2)


if __name__ == "__main__":
    main()