import os
import google.generativeai as genai
from playwright.async_api import async_playwright
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
import asyncio
import json
import requests
from bs4 import BeautifulSoup

# Load environment variables
load_dotenv()

# Configure Google Generative AI API key (read from the API_KEY environment variable,
# e.g. set in the .env file loaded above)
genai.configure(api_key=os.environ["API_KEY"])

# FastAPI app initialization
app = FastAPI()

# Function to scrape webpage and extract visible text
async def scrape_visible_text(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # Launch browser in headless mode
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 800},
            extra_http_headers={
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                "accept-encoding": "gzip, deflate, br, zstd",
                "accept-language": "en-US,en;q=0.9,hi;q=0.8",
                "cache-control": "max-age=0",
                "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": '"Windows"',
                "sec-fetch-dest": "document",
                "sec-fetch-mode": "navigate",
                "sec-fetch-site": "none",
                "sec-fetch-user": "?1",
                "upgrade-insecure-requests": "1"
            }
        )
        page = await context.new_page()
        await page.goto(url, wait_until="networkidle")
        visible_text = await page.evaluate("document.body.innerText")
        await browser.close()
        return visible_text

# Function to structure data using Google's Gemini model
def structure_data(text, college_name):
    prompt = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content containing all relevant data. The response should be a detailed paragraph mentioning everything about the college named '{college_name}', ensuring no important information is missed. Include details such as connectivity, placement, nearby colleges, infrastructure, courses, branches, students, festivals, clubs, reviews, Q&A, and any other college-related parameters available in the text. Provide the response text with no formatting!\n{text}"
    model = genai.GenerativeModel("gemini-1.5-pro")
    response = model.generate_content(prompt)
    return response.text.strip()

# Pydantic model for request body
class URLRequest(BaseModel):
    url: str
    college_name: str

# Pydantic model for Crawler request
class CrawlerRequest(BaseModel):
    topic_title: str

# Function to perform Google search and return top N links
def google_search(query, num_results=5):
    search_url = f"https://www.google.com/search?q={query}&num={num_results}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    links = []
    for a in soup.find_all('a', href=True, attrs={'jsname': True}):
        link = a['href']
        if link.startswith("https://"):
            links.append(link)
    return links[:num_results]

# Function to perform advanced search on specific sites
def advanced_search_on_site(site, topic, num_results=10):
    query = f"site:{site} {topic}"
    return google_search(query, num_results)
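
# Example: advanced_search_on_site("quora.com", "Example College") issues the Google
# query 'site:quora.com Example College' and returns up to 10 result links (the topic
# string here is purely illustrative).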

# FastAPI endpoint to scrape and structure data
@app.post("/scrape")
async def scrape_and_structure_data(request: URLRequest):
    try:
        # Scrape visible text from the webpage
        visible_text = await scrape_visible_text(request.url)
        
        # Structure the data using Google's Gemini model
        structured_data = structure_data(visible_text, request.college_name)
        
        # Return the structured data
        return {"structured_data": structured_data}
    except Exception as e:
        print(f"Error occurred while processing the request: {e}")
        raise HTTPException(status_code=500, detail=str(e))
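
# Example request to the /scrape endpoint (assuming the server is running locally on
# port 7860 as configured at the bottom of this file; the URL and college name are
# placeholders):
#   curl -X POST http://localhost:7860/scrape \
#     -H "Content-Type: application/json" \
#     -d '{"url": "https://www.example.edu", "college_name": "Example College"}'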
    
# FastAPI endpoint to perform web crawling
@app.post("/crawl")
async def crawl_web(request: CrawlerRequest):
    try:
        topic_title = request.topic_title

        # Get top 5 links from Google search
        google_links = google_search(topic_title, num_results=5)

        # Get links from Quora
        quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10)

        # Additional sites can be added similarly
        other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10)

        # Combine all links
        all_links = google_links + quora_links + other_links

        # Use Gemini to filter and list relevant URLs
        prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}"
        model = genai.GenerativeModel("gemini-1.5-pro")
        response = model.generate_content(prompt)
        filtered_links = response.text.strip().split('\n')

        # Return the filtered links
        return {"filtered_links": filtered_links}
    except Exception as e:
        print(f"Error occurred while processing the request: {e}")
        raise HTTPException(status_code=500, detail=str(e))
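
# Example request to the /crawl endpoint (same local-server assumption; the topic is a
# placeholder):
#   curl -X POST http://localhost:7860/crawl \
#     -H "Content-Type: application/json" \
#     -d '{"topic_title": "Example College reviews"}'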
    
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
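
# To run locally (assuming this file is saved as, e.g., app.py; the actual filename may differ):
#   python app.py
# FastAPI then serves interactive API docs at http://localhost:7860/docs.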