File size: 3,270 Bytes
cc8e143 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import argparse
from collections import defaultdict
from itertools import combinations, product
from typing import Dict, List, Optional, Tuple

import ujson as json
from tqdm.auto import tqdm
# A single news article; a BigNews corpus is a list of events, each a
# list of articles covering that event.
Entry = Dict[str, str]
BigNews = List[List[Entry]]

# CLI: input corpus path, output path prefix, optional subsampling ratio.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("big_news_path")
arg_parser.add_argument("output_prefix")
arg_parser.add_argument("--ratio", type=float, default=1.0, required=False)
cli_args = arg_parser.parse_args()
big_news_path: str = cli_args.big_news_path
output_prefix: str = cli_args.output_prefix
ratio: float = cli_args.ratio

print("Loading input file.")
with open(big_news_path, "r") as big_news_file:
    big_news: BigNews = json.load(big_news_file)

# Political side -> outlets covering that side.
side_map_transposed = {
    "L": ["dailykos", "hpo", "cnn", "wpo", "nyt"],
    "R": ["wat", "fox", "breitbart"],
}
# Inverted lookup: outlet -> side ("L" or "R").
side_map = {
    outlet: side
    for side, outlets in side_map_transposed.items()
    for outlet in outlets
}
def get_entry_side(entry: Entry) -> str:
outlet = entry["source"]
side = side_map.get(outlet)
return side
big_news_flattened: List[str] = []
# (label, entry_index_a, entry_index_b): label is 1 for a same-side
# (positive) pair, -1 for a cross-side (negative) pair.  The original
# annotation said Tuple[int, int] but 3-tuples are appended; fixed.
event_lookup_info: List[Tuple[int, int, int]] = []
entry_index = 0
# Optionally keep only a leading fraction of the events.
if ratio < 1.0:
    num_events = int(len(big_news) * ratio)
else:
    num_events = len(big_news)
for event in tqdm(big_news[:num_events], desc="Flattening"):
    event: List[Entry]
    # side -> flat indices of this event's entries on that side.
    index_by_sides: Dict[str, List[int]] = defaultdict(list)
    for entry in event:
        # entry["text"] appears to be an iterable of tokens/sentences
        # (despite Entry = Dict[str, str]) that " ".join flattens into
        # one line — TODO confirm against the corpus format.
        big_news_flattened.append(" ".join(entry["text"]) + "\n")
        entry_side = get_entry_side(entry)
        if entry_side:
            index_by_sides[entry_side].append(entry_index)
        entry_index += 1
    # Sorted for reproducibility: iterating a raw set here made the
    # pair ordering depend on hash randomization.
    sides = sorted(index_by_sides)
    # Negative examples: every cross-side entry pair within the event.
    # Maximize distance over negative (non-matching) examples.
    for side_a, side_b in combinations(sides, 2):
        for index_a, index_b in product(
            index_by_sides[side_a], index_by_sides[side_b]
        ):
            event_lookup_info.append((-1, index_a, index_b))
    # Positive examples: every same-side entry pair within the event.
    # Minimize distance over positive (matching) examples.
    # BUG FIX: the original emitted these inside the side_b loop, so
    # positives were repeated for earlier sides and never generated for
    # the last side at all; each side's pairs are now emitted exactly once.
    for side in sides:
        for index_a, index_b in combinations(index_by_sides[side], 2):
            event_lookup_info.append((1, index_a, index_b))
# The nested corpus is no longer needed; drop it before writing output
# to reduce peak memory.
del big_news

flattened_text_path = output_prefix + "_text.txt"
lookup_path = output_prefix + "_lookup.json"

print("Writing flattened text.")
with open(flattened_text_path, "w") as flattened_text_file:
    # Each element already ends in "\n", so a single joined write
    # produces the same bytes as writelines().
    flattened_text_file.write("".join(big_news_flattened))

print("Writing lookup json.")
with open(lookup_path, "w") as lookup_file:
    json.dump(event_lookup_info, lookup_file, indent=2)
|