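# Flattens a BigNews-style corpus (a JSON list of events, each event a list of
# article entries) into a one-article-per-line text file plus a JSON lookup of
# contrastive pairs. Each lookup item is (label, index_a, index_b), where the
# indices point at lines of the text file, label -1 marks a cross-side (L/R)
# pair from the same event, and label +1 marks a same-side pair.
#
# Example invocation (the script and file names here are illustrative):
#   python flatten_big_news.py big_news.json out --ratio 0.1
# which writes out_text.txt and out_lookup.json.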
from typing import Dict, List, Optional, Tuple, Union
import ujson as json
import argparse
from collections import defaultdict

from tqdm.auto import tqdm

# The "text" field appears to hold a list of sentence strings (it is
# space-joined when flattening), so entry values may be strings or lists.
Entry = Dict[str, Union[str, List[str]]]
BigNews = List[List[Entry]]

parser = argparse.ArgumentParser()
parser.add_argument("big_news_path")
parser.add_argument("output_prefix")
parser.add_argument(
    "--ratio",
    type=float,
    default=1.0,
    help="Fraction of events to keep; values >= 1.0 keep every event.",
)
args = parser.parse_args()

big_news_path: str = args.big_news_path
output_prefix: str = args.output_prefix
ratio: float = args.ratio

print("Loading input file.")
with open(big_news_path, "r") as big_news_file:
    big_news: BigNews = json.load(big_news_file)


side_map_transposed = {
    "L": ["dailykos", "hpo", "cnn", "wpo", "nyt"],
    "R": ["wat", "fox", "breitbart"],
}

# Invert the side -> outlets mapping into outlet -> side for direct lookup.
side_map = {}

for key, values in side_map_transposed.items():
    for value in values:
        side_map[value] = key
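# e.g. side_map == {"dailykos": "L", ..., "breitbart": "R"} after this loop.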


def get_entry_side(entry: Entry) -> Optional[str]:
    """Return "L" or "R" for a known outlet, or None for unmapped sources."""
    return side_map.get(entry["source"])


big_news_flattened: List[str] = []

entry_index = 0

# Each item is (label, entry_index_a, entry_index_b): label -1 for cross-side
# pairs, +1 for same-side pairs.
event_lookup_info: List[Tuple[int, int, int]] = []

if ratio < 1.0:
    num_events = int(len(big_news) * ratio)
else:
    num_events = len(big_news)

for event in tqdm(big_news[:num_events], desc="Flattening"):
    index_by_sides: Dict[str, List[int]] = defaultdict(list)

    for entry in event:
        # Join the entry's sentences into a single line of text.
        entry_text = " ".join(entry["text"]) + "\n"
        big_news_flattened.append(entry_text)

        entry_side = get_entry_side(entry)
        if entry_side:
            index_by_sides[entry_side].append(entry_index)

        entry_index += 1

    # Sort for a deterministic side ordering.
    sides = sorted(index_by_sides)
    num_sides = len(sides)

    for side_a_index in range(num_sides):
        for side_b_index in range(side_a_index + 1, num_sides):
            side_a = sides[side_a_index]
            side_b = sides[side_b_index]

            for side_a_entry_index in index_by_sides[side_a]:
                for side_b_entry_index in index_by_sides[side_b]:
                    # Maximize distance over negative (non-matching) examples
                    event_lookup_info.append(
                        (-1, side_a_entry_index, side_b_entry_index)
                    )

            side_a_indices = index_by_sides[side_a]
            num_side_a_indices = len(side_a_indices)
            for x_index in range(num_side_a_indices):
                # Minimize distance over positive (matching) examples
                for y_index in range(x_index + 1, num_side_a_indices):
                    event_lookup_info.append(
                        (1, side_a_indices[x_index], side_a_indices[y_index])
                    )

del big_news  # free the parsed corpus before writing the outputs

big_news_flattened_path = output_prefix + "_text.txt"
lookup_json_path = output_prefix + "_lookup.json"

print("Writing flattened text.")
with open(big_news_flattened_path, "w") as big_news_flattened_file:
    big_news_flattened_file.writelines(big_news_flattened)

print("Writing lookup json.")
with open(lookup_json_path, "w") as lookup_json_file:
    json.dump(event_lookup_info, lookup_json_file, indent=2)
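
# A minimal sketch of how a downstream consumer might pair the two outputs
# (variable names are illustrative; assumes each entry's joined text contains
# no internal newlines, so lookup indices align with line numbers):
#
#   with open(output_prefix + "_text.txt") as text_file:
#       lines = text_file.readlines()
#   with open(output_prefix + "_lookup.json") as lookup_file:
#       pairs = json.load(lookup_file)
#   label, index_a, index_b = pairs[0]  # -1: cross-side, +1: same-side
#   text_a, text_b = lines[index_a], lines[index_b]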