|
import argparse
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

import ujson as json
from tqdm.auto import tqdm
|
|
|
Entry = Dict[str, str] |
|
BigNews = List[List[Entry]] |
|
|
|
parser = argparse.ArgumentParser() |
|
parser.add_argument("big_news_path") |
|
parser.add_argument("output_prefix") |
|
parser.add_argument("--ratio", type=float, default=1.0, required=False) |
|
args = parser.parse_args() |
|
|
|
big_news_path: str = args.big_news_path |
|
output_prefix: str = args.output_prefix |
|
ratio: float = args.ratio |
|
|
|
print("Loading input file.") |
|
with open(big_news_path, "r") as big_news_file: |
|
big_news: BigNews = json.load(big_news_file) |
|
|
|
|
|
side_map_transposed = { |
|
"L": ["dailykos", "hpo", "cnn", "wpo", "nyt"], |
|
"R": ["wat", "fox", "breitbart"], |
|
} |
|
|
|
side_map = {} |
|
|
|
for key, values in side_map_transposed.items(): |
|
for value in values: |
|
side_map[value] = key |
|
|
|
|
|
def get_entry_side(entry: Entry) -> str: |
|
outlet = entry["source"] |
|
side = side_map.get(outlet) |
|
return side |
|
|
|
|
|
big_news_flattened: List[str] = list() |
|
|
|
|
|
big_news_event_side_lookup_keys: List[List[int]] = list() |
|
|
|
entry_index = 0 |
|
|
|
event_lookup_info: List[Tuple[int, int]] = [] |
|
if ratio < 1.0: |
|
num_events = int(len(big_news) * ratio) |
|
else: |
|
num_events = len(big_news) |
|
|
|
for event in tqdm(big_news[:num_events], desc="Flattening"): |
|
event: List[Entry] |
|
index_by_sides = defaultdict(list) |
|
sides = set() |
|
|
|
for entry in event: |
|
entry_text = entry["text"] |
|
entry_text = " ".join(entry_text) + "\n" |
|
big_news_flattened.append("") |
|
big_news_flattened[entry_index] = entry_text |
|
|
|
entry_side = get_entry_side(entry) |
|
|
|
if entry_side: |
|
index_by_sides[entry_side].append(entry_index) |
|
sides.add(entry_side) |
|
|
|
entry_index += 1 |
|
|
|
sides = list(sides) |
|
|
|
num_sides = len(index_by_sides.keys()) |
|
|
|
for side_a_index in range(num_sides): |
|
for side_b_index in range(side_a_index + 1, num_sides): |
|
side_a = sides[side_a_index] |
|
side_b = sides[side_b_index] |
|
|
|
for side_a_entry_index in index_by_sides[side_a]: |
|
for side_b_entry_index in index_by_sides[side_b]: |
|
|
|
event_lookup_info.append( |
|
(-1, side_a_entry_index, side_b_entry_index) |
|
) |
|
|
|
side_a_indices = index_by_sides[side_a] |
|
num_side_a_indices = len(side_a_indices) |
|
for x_index in range(num_side_a_indices): |
|
|
|
for y_index in range(x_index + 1, num_side_a_indices): |
|
event_lookup_info.append( |
|
(1, side_a_indices[x_index], side_a_indices[y_index]) |
|
) |
|
|
|
del big_news |
|
|
|
big_news_flattened_path = output_prefix + "_text.txt" |
|
lookup_json_path = output_prefix + "_lookup.json" |
|
|
|
print("Writing flattened text.") |
|
with open(big_news_flattened_path, "w") as big_news_flattened_file: |
|
big_news_flattened_file.writelines(big_news_flattened) |
|
|
|
print("Writing lookup json.") |
|
with open(lookup_json_path, "w") as lookup_json_file: |
|
json.dump(event_lookup_info, lookup_json_file, indent=2) |
|
|