import os
import google.generativeai as genai
from playwright.async_api import async_playwright
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
import asyncio
import json
import requests
from bs4 import BeautifulSoup

# Load environment variables
load_dotenv()

# Configure Google Generative AI API key
genai.configure(api_key=os.environ["API_KEY"])
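# Note: API_KEY is read from the environment or a local .env file, e.g. (placeholder value):
#   API_KEY=your-gemini-api-key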

# FastAPI app initialization
app = FastAPI()


# Function to scrape webpage and extract visible text
async def scrape_visible_text(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # Launch browser in headless mode
        # Use a realistic desktop Chrome fingerprint to reduce the chance of bot blocking
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 800},
            extra_http_headers={
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                "accept-encoding": "gzip, deflate, br, zstd",
                "accept-language": "en-US,en;q=0.9,hi;q=0.8",
                "cache-control": "max-age=0",
                "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": '"Windows"',
                "sec-fetch-dest": "document",
                "sec-fetch-mode": "navigate",
                "sec-fetch-site": "none",
                "sec-fetch-user": "?1",
                "upgrade-insecure-requests": "1"
            }
        )
        page = await context.new_page()
        # Wait for the network to go idle so client-rendered content has a chance to load
        await page.goto(url, wait_until="networkidle")
        visible_text = await page.evaluate("document.body.innerText")
        await browser.close()
        return visible_text
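
# Standalone sanity check (illustrative sketch only; example.com is a placeholder URL):
#   print(asyncio.run(scrape_visible_text("https://example.com")))
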
# Function to structure data using Google's Gemini model
def structure_data(text, college_name):
    prompt = (
        "Convert the following unstructured text into a well-written and comprehensive structured form "
        "with titles and content containing all relevant data. The response should be a detailed paragraph "
        f"mentioning everything about the college named '{college_name}', ensuring no important information "
        "is missed. Include details such as connectivity, placement, nearby colleges, infrastructure, "
        "courses, branches, students, festivals, clubs, reviews, Q&A, and any other college-related "
        "parameters available in the text. Provide the response text with no formatting!\n"
        f"{text}"
    )
    model = genai.GenerativeModel("gemini-1.5-pro")
    response = model.generate_content(prompt)
    return response.text.strip()
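
# Illustrative call (assumes API_KEY is configured; the inputs below are placeholders):
#   summary = structure_data("Raw page text about the campus...", "Example College")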

# Pydantic model for request body
class URLRequest(BaseModel):
    url: str
    college_name: str


# Pydantic model for Crawler request
class CrawlerRequest(BaseModel):
    topic_title: str


# Function to perform Google search and return top N links
def google_search(query, num_results=5):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    # Let requests URL-encode the query; the timeout keeps the call from hanging indefinitely
    response = requests.get(
        "https://www.google.com/search",
        params={"q": query, "num": num_results},
        headers=headers,
        timeout=10,
    )
    soup = BeautifulSoup(response.text, "html.parser")
    links = []
    # Organic result anchors on the results page carry a jsname attribute
    for a in soup.find_all('a', href=True, attrs={'jsname': True}):
        link = a['href']
        if link.startswith("https://"):
            links.append(link)
    return links[:num_results]


# Function to perform advanced search on specific sites
def advanced_search_on_site(site, topic, num_results=10):
    # Restrict results to one domain via Google's site: operator
    query = f"site:{site} {topic}"
    return google_search(query, num_results)
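
# e.g. advanced_search_on_site("quora.com", "IIT Bombay placements") issues the Google
# query 'site:quora.com IIT Bombay placements' (the topic here is a made-up example)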


# FastAPI endpoint to scrape and structure data
@app.post("/scrape")
async def scrape_and_structure_data(request: URLRequest):
    try:
        # Scrape visible text from the webpage
        visible_text = await scrape_visible_text(request.url)
        # Structure the data with Gemini; the SDK call is blocking, so run it in a
        # worker thread to keep the event loop responsive
        structured_data = await asyncio.to_thread(structure_data, visible_text, request.college_name)
        # Return the structured data
        return {"structured_data": structured_data}
    except Exception as e:
        print(f"Error occurred while processing the request: {e}")
        raise HTTPException(status_code=500, detail=str(e))
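
# Example request (sketch; assumes the server is running locally on port 7860):
#   curl -X POST http://localhost:7860/scrape \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com", "college_name": "Example College"}'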


# FastAPI endpoint to perform web crawling
@app.post("/crawl")
async def crawl_web(request: CrawlerRequest):
    try:
        topic_title = request.topic_title
        # The search helpers make blocking HTTP calls, so run them in worker threads
        # Get top 5 links from Google search
        google_links = await asyncio.to_thread(google_search, topic_title, num_results=5)
        # Get links from Quora
        quora_links = await asyncio.to_thread(advanced_search_on_site, "quora.com", topic_title, num_results=10)
        # Additional sites can be added similarly
        other_links = await asyncio.to_thread(advanced_search_on_site, "reddit.com", topic_title, num_results=10)
        # Combine all links
        all_links = google_links + quora_links + other_links
        # Use Gemini to filter and list relevant URLs
        prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}"
        model = genai.GenerativeModel("gemini-1.5-pro")
        response = await asyncio.to_thread(model.generate_content, prompt)
        filtered_links = response.text.strip().split('\n')
        # Return the filtered links
        return {"filtered_links": filtered_links}
    except Exception as e:
        print(f"Error occurred while processing the request: {e}")
        raise HTTPException(status_code=500, detail=str(e))
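
# Example request (sketch; the topic value is a made-up placeholder):
#   curl -X POST http://localhost:7860/crawl \
#        -H "Content-Type: application/json" \
#        -d '{"topic_title": "best engineering colleges in Pune"}'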

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)