import json
import time
import logging
from datetime import datetime
from typing import Dict, List

from serpapi import GoogleSearch

from config import (
    SERP_API_KEY,
    SERP_MONTHLY_LIMIT,
    SEARCH_QUERIES,
    RAW_DIR,
    LOG_DIR,
)


class BloomingtonScraper:
    def __init__(self):
        self.search_count = 0
        self.results_by_category = {}

        # Set up logging
        log_file = LOG_DIR / f"scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            filename=log_file
        )

    def _make_serp_request(self, query: str, category: str) -> List[Dict]:
        """Make a single SERP API request."""
        if self.search_count >= SERP_MONTHLY_LIMIT:
            logging.warning("Monthly SERP API limit reached")
            return []

        params = {
            "api_key": SERP_API_KEY,
            "engine": "google",
            "q": query,
            "location": "Bloomington, Indiana, United States",
            "google_domain": "google.com",
            "num": 100,  # Get maximum results per query
            "start": 0
        }

        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            self.search_count += 1

            # Save raw results
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            raw_file = RAW_DIR / f"raw_results_{category}_{timestamp}.json"
            with open(raw_file, 'w') as f:
                json.dump(results, f, indent=2)

            logging.info(f"SERP API calls used: {self.search_count}/{SERP_MONTHLY_LIMIT}")
            return results.get('organic_results', [])
        except Exception as e:
            logging.error(f"SERP API error for query '{query}': {e}")
            return []

    def scrape_all_categories(self) -> Dict[str, List[Dict]]:
        """Scrape data for all categories."""
        for category, queries in SEARCH_QUERIES.items():
            logging.info(f"Starting scraping for category: {category}")
            category_results = []

            for query in queries:
                if self.search_count >= SERP_MONTHLY_LIMIT:
                    logging.warning(f"Monthly limit reached during {category} scraping")
                    break

                results = self._make_serp_request(query, category)
                category_results.extend(results)
                time.sleep(2)  # Polite delay between requests

            self.results_by_category[category] = category_results

            # Save category results
            category_file = RAW_DIR / f"{category}_results.json"
            with open(category_file, 'w') as f:
                json.dump(category_results, f, indent=2)

            logging.info(f"Completed scraping for {category}: {len(category_results)} results")

        return self.results_by_category

    def get_search_stats(self) -> Dict:
        """Get statistics about the search results."""
        stats = {
            "total_searches": self.search_count,
            "remaining_searches": SERP_MONTHLY_LIMIT - self.search_count,
            "results_per_category": {
                category: len(results)
                for category, results in self.results_by_category.items()
            }
        }

        # Save stats
        stats_file = RAW_DIR / "search_stats.json"
        with open(stats_file, 'w') as f:
            json.dump(stats, f, indent=2)

        return stats
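

# Example usage sketch -- not part of the original module. It assumes config.py
# defines SERP_API_KEY, SERP_MONTHLY_LIMIT, and SEARCH_QUERIES, and that RAW_DIR
# and LOG_DIR are pathlib.Path directories that already exist on disk. Running
# this scrapes every configured category and prints the final usage stats.
if __name__ == "__main__":
    scraper = BloomingtonScraper()
    scraper.scrape_all_categories()
    stats = scraper.get_search_stats()
    print(json.dumps(stats, indent=2))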