|
import csv |
|
import json |
|
import sys |
|
import time |
|
import traceback |
|
from datetime import datetime |
|
|
|
import requests |
|
|
|
# --- Configuration -----------------------------------------------------------
# Fill in at least one of the three filters below; the script exits at startup
# if all of them are empty.

# Reddit username to filter by (without the "u/" prefix); empty = no filter.
username = ""

# Subreddit to filter by (without the "r/" prefix); empty = no filter.
subreddit = "BestofRedditorUpdates"

# Base-36 id of a single thread (the short id from the post URL);
# empty = no thread filter.
thread_id = ""

# Output format: one of "human", "csv" or "json" (validated in __main__).
output_format = "csv"

# Results are fetched in descending order starting from this timestamp, i.e.
# only objects created BEFORE start_time are downloaded.
start_time = datetime.strptime("04/02/2023", "%m/%d/%Y")

# Optional lower bound: stop paging once objects older than end_time are
# reached. None = download all the way back.
end_time = None

# Write human/json output files as ASCII instead of UTF-8.
convert_to_ascii = False

# The API's link_id filter is sent as a base-10 integer when True, otherwise
# as the "t3_<base36>" fullname form.
convert_thread_id_to_base_ten = True
|
|
|
|
|
def write_csv_line(writer, obj, is_submission):
    """Write one submission or comment as a single CSV row.

    Columns: score, creation date (YYYY-MM-DD), title (submissions only),
    author ("u/..."), full reddit permalink, then the content — selftext or
    link url for submissions, comment body otherwise.
    """
    row = [
        str(obj['score']),
        datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"),
    ]
    if is_submission:
        row.append(obj['title'])
    row.append(f"u/{obj['author']}")
    row.append(f"https://www.reddit.com{obj['permalink']}")
    if not is_submission:
        row.append(obj['body'])
    elif obj['is_self']:
        # Self posts occasionally lack 'selftext'; fall back to empty string.
        row.append(obj.get('selftext', ""))
    else:
        row.append(obj['url'])
    writer.writerow(row)
|
|
|
|
|
def write_json_line(handle, obj):
    """Serialize *obj* as one newline-terminated JSON object (JSON Lines)."""
    handle.write(f"{json.dumps(obj)}\n")
|
|
|
|
|
def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
    """Page backwards through the Pushshift API, saving objects to *filename*.

    Repeatedly appends a decreasing `before=` epoch to *url_base* and writes
    each returned object in *output_format* ("csv" or "json"; "human" opens
    the file but this function writes nothing for it). Paging starts at
    *start_datetime* and stops when a batch is empty, the response lacks a
    'data' key, or an object older than *end_datetime* (if given) is seen.
    """
    # BUG FIX: the original printed the literal "(unknown)" instead of the
    # actual output filename.
    print(f"Saving to {filename}")

    count = 0
    if output_format == "human" or output_format == "json":
        # Text formats honor the optional ASCII transcoding flag.
        encoding = 'ascii' if convert_to_ascii else 'UTF-8'
        handle = open(filename, 'w', encoding=encoding)
    else:
        # csv requires newline='' per the csv module documentation.
        handle = open(filename, 'w', encoding='UTF-8', newline='')
        writer = csv.writer(handle)

    # BUG FIX: close the file even if a request or write raises mid-loop.
    try:
        previous_epoch = int(start_datetime.timestamp())
        break_out = False
        while True:
            new_url = url_base + str(previous_epoch)
            json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
            time.sleep(1)  # stay under the API's rate limit
            try:
                json_data = json_text.json()
            except json.decoder.JSONDecodeError:
                # Transient API hiccup: wait and retry the same epoch.
                time.sleep(1)
                continue

            if 'data' not in json_data:
                break
            objects = json_data['data']
            if len(objects) == 0:
                break

            for obj in objects:
                # Objects arrive newest-first; step the cursor below this one.
                previous_epoch = obj['created_utc'] - 1
                if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
                    break_out = True
                    break
                count += 1
                try:
                    if output_format == "csv":
                        write_csv_line(writer, obj, is_submission)
                    elif output_format == "json":
                        write_json_line(handle, obj)
                except Exception as err:
                    # Log and keep going; one bad object shouldn't abort the run.
                    if 'permalink' in obj:
                        print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
                    else:
                        print(f"Couldn't print object, missing permalink: {obj['id']}")
                    print(err)
                    print(traceback.format_exc())

            if break_out:
                break

            print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")

        print(f"Saved {count}")
    finally:
        handle.close()
|
|
|
|
|
if __name__ == "__main__":
    filter_string = None
    # Validate configuration before doing any network work.
    if username == "" and subreddit == "" and thread_id == "":
        print("Fill in username, subreddit or thread id")
        sys.exit(0)
    if output_format not in ("human", "csv", "json"):
        print("Output format must be one of human, csv, json")
        sys.exit(0)

    filters = []
    if username:
        filters.append(f"author={username}")
    if subreddit:
        filters.append(f"subreddit={subreddit}")
    if thread_id:
        # The API accepts link_id either as a base-10 integer or as the
        # "t3_" prefixed base-36 fullname.
        if convert_thread_id_to_base_ten:
            filters.append(f"link_id={int(thread_id, 36)}")
        else:
            filters.append(f"link_id=t3_{thread_id}")
    filter_string = '&'.join(filters)

    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="

    # When downloading a single thread, only its comments are needed — the
    # submission search would not match the link_id filter anyway.
    if not thread_id:
        download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time,
            end_time, True, convert_to_ascii)
    # BUG FIX: comments were never downloaded, so a thread_id-only run did
    # nothing at all; fetch comments with the same filters.
    download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time,
        end_time, False, convert_to_ascii)
|
|
|
|
|
|
|
|