"""Fetch paginated search results from the Digital Commonwealth API.

Usage: python this_script.py START_PAGE END_PAGE
"""

import json
import os
import sys
import time

import requests

BASE_URL = "https://www.digitalcommonwealth.org/search.json?search_field=all_fields&per_page=100&q="

# Hardcoded from an old version to keep us from requesting past the last page.
# Better: determine the final page from data['meta']['pages']['total_pages'].
FINAL_PAGE = 13038


def fetch_digital_commonwealth():
    start = time.time()
    page = int(sys.argv[1])
    end_page = int(sys.argv[2])
    file_path = f"./out{page}_{end_page}.json"

    output = []
    # Resume from an existing output file for this page range: the saved list
    # starts at the original START_PAGE, so skip the pages already fetched.
    if os.path.exists(file_path):
        with open(file_path, "r") as f:
            output = json.load(f)
        page += len(output)
        if page >= end_page:
            return

    print(f"Reading page {page} up to page {end_page}")
    retries = 0
    while True:
        try:
            response = requests.get(f"{BASE_URL}&page={page}")
            response.raise_for_status()
            data = response.json()

            # Append the current page and rewrite the whole output file after
            # every request, so an interrupted run can be resumed.
            output.append(data)
            with open(file_path, "w") as f:
                json.dump(output, f)

            # Stop when there is no next page or we have reached END_PAGE.
            next_page = data["meta"]["pages"]["next_page"]
            if not next_page or next_page == end_page:
                print(f"Processed and saved page {page}. Total pages saved: {len(output)}")
                break
            print(f"Finished page {page}")
            # Clamp to FINAL_PAGE so we never request past the last page.
            page = min(next_page, FINAL_PAGE)
            retries = 0
            # time.sleep(0.5)  # uncomment if rate limiting becomes a concern
        except requests.exceptions.RequestException as e:
            print(f"An error occurred on page {page}: {e}")
            retries += 1
            if retries >= 5:
                print(f"Giving up after {retries} consecutive failures.")
                break

    print(f"Timer: {time.time() - start}")
    print(f"Finished processing all pages. Total pages saved: {len(output)}")


if __name__ == "__main__":
    fetch_digital_commonwealth()