"""FastAPI service that scrapes college web pages with Playwright, structures the
extracted text with Google Gemini, and exposes a Google/Quora/Reddit link crawler."""
import asyncio
import json
import os
from urllib.parse import quote_plus

import google.generativeai as genai
import requests
import uvicorn
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from playwright.async_api import async_playwright
from pydantic import BaseModel
# Load environment variables | |
load_dotenv() | |
# Configure Google Generative AI API key | |
genai.configure(api_key=os.environ["API_KEY"]) | |
# FastAPI app initialization | |
app = FastAPI() | |
# Function to scrape webpage and extract visible text | |
async def scrape_visible_text(url): | |
async with async_playwright() as p: | |
browser = await p.chromium.launch(headless=True) # Launch browser in headless mode | |
context = await browser.new_context( | |
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36", | |
viewport={"width": 1280, "height": 800}, | |
extra_http_headers={ | |
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", | |
"accept-encoding": "gzip, deflate, br, zstd", | |
"accept-language": "en-US,en;q=0.9,hi;q=0.8", | |
"cache-control": "max-age=0", | |
"sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"', | |
"sec-ch-ua-mobile": "?0", | |
"sec-ch-ua-platform": '"Windows"', | |
"sec-fetch-dest": "document", | |
"sec-fetch-mode": "navigate", | |
"sec-fetch-site": "none", | |
"sec-fetch-user": "?1", | |
"upgrade-insecure-requests": "1" | |
} | |
) | |
page = await context.new_page() | |
await page.goto(url, wait_until="networkidle") | |
visible_text = await page.evaluate("document.body.innerText") | |
await browser.close() | |
return visible_text | |
# Function to structure data using Google's Gemini model | |
def structure_data(text, college_name): | |
prompt = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content containing all relevant data. The response should be a detailed paragraph mentioning everything about the college named '{college_name}', ensuring no important information is missed. Include details such as connectivity, placement, nearby colleges, infrastructure, courses, branches, students, festivals, clubs, reviews, Q&A, and any other college-related parameters available in the text. Provide the response text with no fromatting!\n{text}" | |
model = genai.GenerativeModel("gemini-1.5-pro") | |
response = model.generate_content(prompt) | |
return response.text.strip() | |
# Pydantic model for request body | |
class URLRequest(BaseModel): | |
url: str | |
college_name: str | |
# Pydantic model for Crawler request | |
class CrawlerRequest(BaseModel): | |
topic_title: str | |
# Function to perform Google search and return top N links | |
def google_search(query, num_results=5): | |
search_url = f"https://www.google.com/search?q={query}&num={num_results}" | |
headers = { | |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36" | |
} | |
response = requests.get(search_url, headers=headers) | |
soup = BeautifulSoup(response.text, "html.parser") | |
links = [] | |
for a in soup.find_all('a', href=True, attrs={'jsname': True}): | |
link = a['href'] | |
if link.startswith("https://"): | |
links.append(link) | |
return links[:num_results] | |
# Function to perform advanced search on specific sites | |
def advanced_search_on_site(site, topic, num_results=10): | |
query = f"site:{site} {topic}" | |
return google_search(query, num_results) | |
# FastAPI endpoint to scrape and structure data | |
async def scrape_and_structure_data(request: URLRequest): | |
try: | |
# Scrape visible text from the webpage | |
visible_text = await scrape_visible_text(request.url) | |
# Structure the data using Google's Gemini model | |
structured_data = structure_data(visible_text, request.college_name) | |
# Return the structured data | |
return {"structured_data": structured_data} | |
except Exception as e: | |
print(f"Error occurred while processing the request: {e}") | |
raise HTTPException(status_code=500, detail=str(e)) | |
# FastAPI endpoint to perform web crawling | |
async def crawl_web(request: CrawlerRequest): | |
try: | |
topic_title = request.topic_title | |
# Get top 5 links from Google search | |
google_links = google_search(topic_title, num_results=5) | |
# Get links from Quora | |
quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10) | |
# Additional sites can be added similarly | |
other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10) | |
# Combine all links | |
all_links = google_links + quora_links + other_links | |
# Use Gemini to filter and list relevant URLs | |
prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}" | |
model = genai.GenerativeModel("gemini-1.5-pro") | |
response = model.generate_content(prompt) | |
filtered_links = response.text.strip().split('\n') | |
# Return the filtered links | |
return {"filtered_links": filtered_links} | |
except Exception as e: | |
print(f"Error occurred while processing the request: {e}") | |
raise HTTPException(status_code=500, detail=str(e)) | |
if __name__ == "__main__": | |
uvicorn.run(app, host="0.0.0.0", port=7860) |