import json
import time
import os
import sys

import requests
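
# Scrapes paginated search results from the Digital Commonwealth search API.
# Usage (invoke with whatever filename this script is saved under):
#   python <this_script>.py START_PAGE END_PAGE
# Pages START_PAGE through END_PAGE - 1 are fetched and appended to
# out<START_PAGE>_<END_PAGE>.json; if that file already exists, previously
# saved pages are reloaded and the run resumes instead of starting over.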


def fetch_digital_commonwealth():
    start = time.time()
    BASE_URL = "https://www.digitalcommonwealth.org/search.json?search_field=all_fields&per_page=100&q="
    PAGE = sys.argv[1]      # first page to fetch
    END_PAGE = sys.argv[2]  # stop before fetching this page (exclusive bound)
    file_name = f"out{PAGE}_{END_PAGE}.json"
    FINAL_PAGE = 13038      # hard-coded last page of the full result set
    output = []
    file_path = f"./{file_name}"
    # file_path = './output.json'

    # Resume from an existing output file: reload the pages saved so far and
    # skip ahead to the first page that has not been fetched yet.
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            output = json.load(file)
            if int(PAGE) < (len(output) + 1):
                PAGE = len(output) + 1

    # Nothing left to fetch for this page range.
    if int(PAGE) >= int(END_PAGE):
        return None

    print(f'Reading page {PAGE} up to page {END_PAGE}')

    # Walk through the results one page at a time, following the API's
    # next_page pointer until END_PAGE (or the final page) is reached.
    retries = 0
    while True:
        try:
            response = requests.get(f"{BASE_URL}&page={PAGE}")
            response.raise_for_status()
            data = response.json()

            # Append the current page's data to the output list
            output.append(data)

            # Save the entire output to a JSON file after each iteration
            with open(file_path, 'w') as f:
                json.dump(output, f)

            # Check whether there is a next page
            if data['meta']['pages']['next_page']:
                if data['meta']['pages']['next_page'] == int(END_PAGE):
                    print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                    break
                elif data['meta']['pages']['next_page'] == FINAL_PAGE:
                    print(f"finished page {PAGE}")
                    PAGE = FINAL_PAGE
                else:
                    print(f"finished page {PAGE}")
                    PAGE = data['meta']['pages']['next_page']
            else:
                print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                break

            # Reset the retry counter after a successful request
            retries = 0
            # Optional: add a small delay to avoid overwhelming the API
            # time.sleep(0.5)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
            retries += 1
            # Give up after five consecutive failed requests
            if retries >= 5:
                break

    end = time.time()
    print(f"Timer: {end - start}")
    print(f"Finished processing all pages. Total pages saved: {len(output)}")


if __name__ == "__main__":
    fetch_digital_commonwealth()