File size: 2,669 Bytes
b296661
 
 
 
 
 
 
 
 
 
 
 
a9e136f
b296661
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9e136f
b296661
 
 
 
 
 
 
 
 
 
 
 
 
 
a9e136f
b296661
 
 
 
 
 
 
 
 
 
a9e136f
b296661
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import json
import time
import os
import sys
import requests

def fetch_digital_commonwealth(page=None, end_page=None):
    """Scrape paginated search results from digitalcommonwealth.org and persist them.

    Fetches pages [page, end_page) of the search API, appending each page's JSON
    payload to ``out{page}_{end_page}.json`` after every request so progress
    survives a crash. If the output file already exists, resumes from the page
    after the last one saved.

    Parameters
    ----------
    page : int | str | None
        First page to fetch. Defaults to ``sys.argv[1]`` for CLI use.
    end_page : int | str | None
        Page to stop before (exclusive). Defaults to ``sys.argv[2]``.

    Returns
    -------
    None. Results are written to disk as a side effect.
    """
    start = time.time()
    BASE_URL = "https://www.digitalcommonwealth.org/search.json?search_field=all_fields&per_page=100&q="
    # Fall back to CLI arguments so the original zero-arg invocation still works.
    PAGE = int(sys.argv[1] if page is None else page)
    END_PAGE = int(sys.argv[2] if end_page is None else end_page)
    file_name = f"out{PAGE}_{END_PAGE}.json"
    # Hardcoded from old version; kept to avoid running past the last index.
    # TODO: derive the final page from the API's meta block instead.
    FINAL_PAGE = 13038
    output = []
    file_path = f"./{file_name}"
    # Resume support: each saved list entry is one completed page, so the next
    # page to fetch is len(output) + 1 (assumes the run started at page 1).
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            output = json.load(file)
            if PAGE < len(output) + 1:
                PAGE = len(output) + 1

    if PAGE >= END_PAGE:
        return None
    print(f'Reading page {PAGE} up to page {END_PAGE}')

    retries = 0

    while True:
        try:
            response = requests.get(f"{BASE_URL}&page={PAGE}")
            response.raise_for_status()
            data = response.json()

            # Append current page data to the output list
            output.append(data)

            # Save the entire output to a JSON file after each iteration so a
            # crash loses at most one page of work.
            with open(file_path, 'w') as f:
                json.dump(output, f)

            # Advance using the API's own pagination metadata.
            next_page = data['meta']['pages']['next_page']
            if next_page:
                if next_page == END_PAGE:
                    print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                    break
                elif next_page == FINAL_PAGE:  # hardcoded from an old version
                    print(f"finished page {PAGE}")
                    PAGE = FINAL_PAGE
                else:
                    print(f"finished page {PAGE}")
                    PAGE = next_page
            else:
                # No next page: the API says we've reached the end.
                print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                break

            # A successful fetch resets the failure counter.
            retries = 0
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            retries += 1
            if retries >= 5:
                print(f"Giving up after {retries} failed attempts on page {PAGE}. "
                      f"Total pages saved: {len(output)}")
                break
            # Exponential backoff so repeated failures don't hammer the server.
            time.sleep(min(2 ** retries, 30))

    end = time.time()
    print(f"Timer: {end - start}")
    print(f"Finished processing all pages. Total pages saved: {len(output)}")
# CLI entry point: expects sys.argv[1] = start page and sys.argv[2] = end page
# (read inside fetch_digital_commonwealth).
if __name__ == "__main__":
    fetch_digital_commonwealth()