# archive/subreddit_downloader.py
import csv
import json
import sys
import time
import traceback
from datetime import datetime

import requests

username = "" # put the username you want to download in the quotes
subreddit = "BestofRedditorUpdates" # put the subreddit you want to download in the quotes
thread_id = "" # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
# leave either one blank to download an entire user's or subreddit's history
# or fill in both to download a specific users history from a specific subreddit
# change this to one of "human", "csv" or "json"
# - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
# - csv: a comma seperated value file with the fields score, date, title, author, link and then body or url
# - json: the full json object
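# e.g. a single csv row for a submission might look like this (hypothetical values):
#   1234,2023-04-01,Some post title,u/someuser,https://www.reddit.com/r/BestofRedditorUpdates/comments/107xayi/some_post_title/,post body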
output_format = "csv"
# the default start time is the current time and the default end time is all of history
# swap in the commented-out values below to set a custom start and end date. The script works backwards, so the end date has to be before the start date
# start_time = datetime.utcnow()  # datetime.strptime("10/05/2021", "%m/%d/%Y")
start_time = datetime.strptime("04/02/2023", "%m/%d/%Y")
end_time = None  # datetime.strptime("09/25/2021", "%m/%d/%Y")

convert_to_ascii = False  # don't touch this unless you know what you're doing
convert_thread_id_to_base_ten = True  # don't touch this unless you know what you're doing
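
# reddit ids are base 36, so converting the example id from above gives
# int("107xayi", 36) == 2190093498, the base-10 form that the link_id filter
# below sends when convert_thread_id_to_base_ten is True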

def write_csv_line(writer, obj, is_submission):
    # columns: score, date, [title only for submissions], author, link, body/url
    output_list = []
    output_list.append(str(obj['score']))
    output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
    if is_submission:
        output_list.append(obj['title'])
    output_list.append(f"u/{obj['author']}")
    output_list.append(f"https://www.reddit.com{obj['permalink']}")
    if is_submission:
        if obj['is_self']:
            # self posts carry their text in selftext, which can be absent on some objects
            if 'selftext' in obj:
                output_list.append(obj['selftext'])
            else:
                output_list.append("")
        else:
            # link posts carry an external url instead of a body
            output_list.append(obj['url'])
    else:
        output_list.append(obj['body'])
    writer.writerow(output_list)

def write_json_line(handle, obj):
    # write the full object as one json document per line
    handle.write(json.dumps(obj))
    handle.write("\n")

def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
    print(f"Saving to {filename}")
    count = 0
    if output_format == "human" or output_format == "json":
        if convert_to_ascii:
            handle = open(filename, 'w', encoding='ascii')
        else:
            handle = open(filename, 'w', encoding='UTF-8')
    else:
        # the csv module wants newline='' so it can control line endings itself
        handle = open(filename, 'w', encoding='UTF-8', newline='')
        writer = csv.writer(handle)

    # pushshift results are paged by timestamp: ask for everything before
    # previous_epoch, then move the cursor back past the oldest object returned
    previous_epoch = int(start_datetime.timestamp())
    break_out = False
break_out = False
while True:
new_url = url_base + str(previous_epoch)
json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
time.sleep(1) # pushshift has a rate limit, if we send requests too fast it will start returning error messages
try:
json_data = json_text.json()
except json.decoder.JSONDecodeError:
time.sleep(1)
continue
if 'data' not in json_data:
break
objects = json_data['data']
if len(objects) == 0:
break

        for obj in objects:
            previous_epoch = obj['created_utc'] - 1
            if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
                break_out = True
                break
            count += 1
            try:
                if output_format == "csv":
                    write_csv_line(writer, obj, is_submission)
                elif output_format == "json":
                    write_json_line(handle, obj)
                elif output_format == "human":
                    # dispatch to the human-readable writer sketched above
                    write_human_line(handle, obj, is_submission)
            except Exception as err:
                if 'permalink' in obj:
                    print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
                else:
                    print(f"Couldn't print object, missing permalink: {obj['id']}")
                print(err)
                print(traceback.format_exc())

        if break_out:
            break
        print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")

    print(f"Saved {count}")
    handle.close()
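
# example usage (hypothetical values): fetch all r/BestofRedditorUpdates
# submissions as csv, working backwards from now with no lower bound
# download_from_url("posts.txt",
#                   "https://api.pushshift.io/reddit/submission/search?limit=1000&order=desc&subreddit=BestofRedditorUpdates&before=",
#                   "csv", datetime.utcnow(), None, True, False)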
if __name__ == "__main__":
filter_string = None
if username == "" and subreddit == "" and thread_id == "":
print("Fill in username, subreddit or thread id")
sys.exit(0)
if output_format not in ("human", "csv", "json"):
print("Output format must be one of human, csv, json")
sys.exit(0)
filters = []
if username:
filters.append(f"author={username}")
if subreddit:
filters.append(f"subreddit={subreddit}")
if thread_id:
if convert_thread_id_to_base_ten:
filters.append(f"link_id={int(thread_id, 36)}")
else:
filters.append(f"link_id=t3_{thread_id}")
filter_string = '&'.join(filters)
    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="
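    # with the defaults above, the first request looks something like this
    # (the epoch is an example; the exact value depends on the local timezone):
    # https://api.pushshift.io/reddit/submission/search?limit=1000&order=desc&subreddit=BestofRedditorUpdates&before=1680393600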
    if not thread_id:
        download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time,
                          end_time, True, convert_to_ascii)
    # uncomment to also download comments
    # download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time,
    #                   end_time, False, convert_to_ascii)