"""FastAPI service that scrapes college web pages with Playwright, structures the
extracted text with Google Gemini, and exposes a Google/Quora/Reddit link crawler."""
import asyncio
import json
import os
from urllib.parse import quote_plus

import google.generativeai as genai
import requests
import uvicorn
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from playwright.async_api import async_playwright
from pydantic import BaseModel
# Load environment variables | |
load_dotenv() | |
# Configure Google Generative AI API key | |
genai.configure(api_key=os.environ["API_KEY"]) | |
# FastAPI app initialization | |
app = FastAPI() | |
# Function to scrape webpage and extract visible text | |
async def scrape_visible_text(url): | |
async with async_playwright() as p: | |
browser = await p.chromium.launch(headless=True) # Launch browser in headless mode | |
context = await browser.new_context( | |
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36", | |
viewport={"width": 1280, "height": 800}, | |
extra_http_headers={ | |
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", | |
"accept-encoding": "gzip, deflate, br, zstd", | |
"accept-language": "en-US,en;q=0.9,hi;q=0.8", | |
"cache-control": "max-age=0", | |
"sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"', | |
"sec-ch-ua-mobile": "?0", | |
"sec-ch-ua-platform": '"Windows"', | |
"sec-fetch-dest": "document", | |
"sec-fetch-mode": "navigate", | |
"sec-fetch-site": "none", | |
"sec-fetch-user": "?1", | |
"upgrade-insecure-requests": "1" | |
} | |
) | |
page = await context.new_page() | |
await page.goto(url, wait_until="networkidle") | |
visible_text = await page.evaluate("document.body.innerText") | |
await browser.close() | |
return visible_text | |
# Function to structure data using Google's Gemini model | |
def structure_data(text, college_name): | |
prompt = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content containing all relevant data. The response should be a detailed paragraph mentioning everything about the college named '{college_name}', ensuring no important information is missed. Include details such as connectivity, placement, nearby colleges, infrastructure, courses, branches, students, festivals, clubs, reviews, Q&A, and any other college-related parameters available in the text. Provide the response text with no fromatting!\n{text}" | |
model = genai.GenerativeModel("gemini-1.5-pro") | |
response = model.generate_content(prompt) | |
return response.text.strip() | |
# Pydantic model for request body | |
class URLRequest(BaseModel): | |
url: str | |
college_name: str | |
# Pydantic model for Crawler request | |
class CrawlerRequest(BaseModel): | |
topic_title: str | |
# Function to perform Google search and return top N links | |
def google_search(query, num_results=5): | |
search_url = f"https://www.google.com/search?q={query}&num={num_results}" | |
headers = { | |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36" | |
} | |
response = requests.get(search_url, headers=headers) | |
soup = BeautifulSoup(response.text, "html.parser") | |
links = [] | |
for a in soup.find_all('a', href=True, attrs={'jsname': True}): | |
link = a['href'] | |
if link.startswith("https://"): | |
links.append(link) | |
return links[:num_results] | |
# Function to perform advanced search on specific sites | |
def advanced_search_on_site(site, topic, num_results=10): | |
query = f"site:{site} {topic}" | |
return google_search(query, num_results) | |
# FastAPI endpoint to scrape and structure data | |
async def scrape_and_structure_data(request: URLRequest): | |
try: | |
# Scrape visible text from the webpage | |
visible_text = await scrape_visible_text(request.url) | |
# Structure the data using Google's Gemini model | |
structured_data = structure_data(visible_text, request.college_name) | |
# Return the structured data | |
return {"structured_data": structured_data} | |
except Exception as e: | |
print(f"Error occurred while processing the request: {e}") | |
raise HTTPException(status_code=500, detail=str(e)) | |
# FastAPI endpoint to perform web crawling | |
async def crawl_web(request: CrawlerRequest): | |
try: | |
topic_title = request.topic_title | |
# Get top 5 links from Google search | |
google_links = google_search(topic_title, num_results=5) | |
# Get links from Quora | |
quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10) | |
# Additional sites can be added similarly | |
other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10) | |
# Combine all links | |
all_links = google_links + quora_links + other_links | |
# Use Gemini to filter and list relevant URLs | |
prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}" | |
model = genai.GenerativeModel("gemini-1.5-pro") | |
response = model.generate_content(prompt) | |
filtered_links = response.text.strip().split('\n') | |
# Return the filtered links | |
return {"filtered_links": filtered_links} | |
except Exception as e: | |
print(f"Error occurred while processing the request: {e}") | |
raise HTTPException(status_code=500, detail=str(e)) | |
if __name__ == "__main__": | |
uvicorn.run(app, host="0.0.0.0", port=7860) |