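"""Scrape an entire documentation site into a single Markdown file.

Uses Firecrawl to map every URL under BASE_URL, scrapes each page to Markdown,
and appends the results to scraped_documentation/<domain>.md.
"""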
from firecrawl import FirecrawlApp
import os
import time
from dotenv import load_dotenv
from urllib.parse import urlparse
load_dotenv()
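# Expected .env entries (the values shown are placeholders, not real credentials):
#   FIRECRAWL_API_KEY=fc-xxxxxxxxxxxxxxxx
#   BASE_URL=https://docs.example.com
#   LIMIT_RATE=True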
base_url = os.getenv('BASE_URL')

def map_website(url):
    # Initialize the Firecrawl application with the API key
    app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))

    # Use the /map endpoint to get all URLs from the website
    map_status = app.map_url(url)

    # Check if the mapping was successful
    if isinstance(map_status, list):
        return map_status
    else:
        print("Failed to map the website:", map_status)
        return []

def scrape_url(url):
    # Initialize the Firecrawl application with the API key
    app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))

    # Use the /scrape endpoint to scrape the URL
    scrape_status = app.scrape_url(url)

    # Print the scrape_status to understand its structure
    print(f"Scrape status for {url}: {scrape_status}")

    # Check if the scraping was successful
    if 'markdown' in scrape_status:
        return scrape_status['markdown']
    else:
        print(f"Failed to scrape {url}: {scrape_status}")
        return ""

def scrape_all_urls(base_url):
    # Map the URLs
    urls = map_website(base_url)

    # Parse the base URL to get the domain without 'www' and scheme
    parsed_url = urlparse(base_url)
    domain = parsed_url.netloc.replace("www.", "")

    # Create the directory if it doesn't exist
    os.makedirs('scraped_documentation', exist_ok=True)

    # Generate the output file name and save location
    output_file = os.path.join('scraped_documentation', f"{domain}.md")

    # Open the output file in write mode
    with open(output_file, 'w', encoding='utf-8') as md_file:
        # Iterate over the URLs
        for i, url in enumerate(urls):
            # Print the URL being scraped
            print(f"Scraping {url} ({i+1}/{len(urls)})")

            # Scrape the URL
            markdown_content = scrape_url(url)

            # Write the scraped content to the file
            md_file.write(f"# {url}\n\n")
            md_file.write(markdown_content)
            md_file.write("\n\n---\n\n")

            # Rate limiting: 10 scrapes per minute
            if os.getenv('LIMIT_RATE') == 'True':
                if (i + 1) % 10 == 0:
                    print("Rate limit reached, waiting for 60 seconds...")
                    time.sleep(60)

if __name__ == "__main__":
    # Fail early with a clear message if BASE_URL is missing from the .env file
    if not base_url:
        raise SystemExit("BASE_URL is not set in the .env file.")
    scrape_all_urls(base_url)