Spaces:
Running
Running
File size: 2,615 Bytes
b296661 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import json
import time
import os
import sys
import requests
def _save_checkpoint(file_path, pages):
    """Atomically rewrite the checkpoint file with every page fetched so far.

    The data is written to a sibling temporary file first and then moved over
    the real file, so an interruption mid-write cannot leave a truncated JSON
    file that would break the next resume.
    """
    tmp_path = f"{file_path}.tmp"
    with open(tmp_path, 'w', encoding="utf-8") as f:
        json.dump(pages, f)
    os.replace(tmp_path, file_path)


def fetch_digital_commonwealth():
    """Download a range of Digital Commonwealth search-result pages to JSON.

    Command-line arguments (read from ``sys.argv``):
        1. START_PAGE -- first page to fetch (integer).
        2. END_PAGE   -- page at which to stop; it is *not* fetched (integer).

    Each fetched page (the decoded JSON response) is appended to a list that
    is saved to ``./out{START_PAGE}_{END_PAGE}.json`` after every page. If
    that file already exists, the run resumes after the pages it contains.

    Returns:
        None. Returns early, without any network access, when there is
        nothing left to fetch for the requested range.

    Raises:
        SystemExit: if the two page arguments are missing or not integers.
    """
    start = time.time()
    BASE_URL = "https://www.digitalcommonwealth.org/search.json?search_field=all_fields&per_page=100&q="
    MAX_RETRIES = 5
    # (connect, read) timeouts in seconds; without them a stalled connection
    # would block the script forever.
    REQUEST_TIMEOUT = (10, 60)

    if len(sys.argv) < 3:
        raise SystemExit("usage: START_PAGE END_PAGE (both integers)")
    start_arg, end_arg = sys.argv[1], sys.argv[2]
    try:
        start_page = int(start_arg)
        end_page = int(end_arg)
    except ValueError:
        raise SystemExit("usage: START_PAGE END_PAGE (both integers)") from None

    # The file name keeps the arguments exactly as typed so existing
    # checkpoint files are still found.
    file_name = f"out{start_arg}_{end_arg}.json"
    file_path = f"./{file_name}"

    output = []
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding="utf-8") as file:
            output = json.load(file)

    # The checkpoint holds consecutive pages beginning at start_page, so the
    # next page to fetch is start_page + (pages already saved). Using
    # len(output) + 1 is only correct when start_page == 1 and otherwise
    # re-downloads (and duplicates) pages on resume.
    page = start_page + len(output)
    if page >= end_page:
        return None

    print(f'Reading page {page} up to page {end_page}')
    retries = 0
    while True:
        try:
            response = requests.get(f"{BASE_URL}&page={page}", timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            print(f"An error occurred: {e}")
            print(f"Processed and saved page {page}. Total pages saved: {len(output)}")
            retries += 1
            if retries >= MAX_RETRIES:
                break
            # Back off before retrying the same page instead of hammering a
            # server that is already failing.
            time.sleep(min(2 ** retries, 30))
            continue

        retries = 0
        # Append current page data and checkpoint after every page so an
        # interrupted run loses at most the page in flight.
        output.append(data)
        _save_checkpoint(file_path, output)

        next_page = data['meta']['pages']['next_page']
        if not next_page or next_page == end_page:
            # Either the API reports no further pages or the requested
            # (exclusive) end page has been reached.
            print(f"Processed and saved page {page}. Total pages saved: {len(output)}")
            break
        print(f"finished page {page}")
        page = next_page

    end = time.time()
    print(f"Timer: {end - start}")
    print(f"Finished processing all pages. Total pages saved: {len(output)}")
# Script entry point: run the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    fetch_digital_commonwealth()
|