import os
import google.generativeai as genai
from playwright.async_api import async_playwright
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
import asyncio
import json
import requests
from bs4 import BeautifulSoup

# Load environment variables
load_dotenv()

# Configure Google Generative AI API key (read from the API_KEY environment variable,
# e.g. set in the .env file loaded above)
genai.configure(api_key=os.environ["API_KEY"])

# FastAPI app initialization
app = FastAPI()

# Function to scrape webpage and extract visible text
async def scrape_visible_text(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # Launch browser in headless mode
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 800},
            extra_http_headers={
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                "accept-encoding": "gzip, deflate, br, zstd",
                "accept-language": "en-US,en;q=0.9,hi;q=0.8",
                "cache-control": "max-age=0",
                "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": '"Windows"',
                "sec-fetch-dest": "document",
                "sec-fetch-mode": "navigate",
                "sec-fetch-site": "none",
                "sec-fetch-user": "?1",
                "upgrade-insecure-requests": "1"
            }
        )
        page = await context.new_page()
        await page.goto(url, wait_until="networkidle")
        visible_text = await page.evaluate("document.body.innerText")
        await browser.close()
        return visible_text

# Function to structure data using Google's Gemini model
def structure_data(text, college_name):
    prompt = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content containing all relevant data. The response should be a detailed paragraph mentioning everything about the college named '{college_name}', ensuring no important information is missed. Include details such as connectivity, placement, nearby colleges, infrastructure, courses, branches, students, festivals, clubs, reviews, Q&A, and any other college-related parameters available in the text. Provide the response text with no formatting!\n{text}"
    model = genai.GenerativeModel("gemini-1.5-pro")
    response = model.generate_content(prompt)
    return response.text.strip()

# Pydantic model for request body
class URLRequest(BaseModel):
    url: str
    college_name: str

# Pydantic model for Crawler request
class CrawlerRequest(BaseModel):
    topic_title: str

# Function to perform Google search and return top N links
def google_search(query, num_results=5):
    search_url = f"https://www.google.com/search?q={query}&num={num_results}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    links = []
    for a in soup.find_all('a', href=True, attrs={'jsname': True}):
        link = a['href']
        if link.startswith("https://"):
            links.append(link)
    return links[:num_results]

# Function to perform advanced search on specific sites
def advanced_search_on_site(site, topic, num_results=10):
    query = f"site:{site} {topic}"
    return google_search(query, num_results)
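
# Example: advanced_search_on_site("quora.com", "Example College") issues the Google
# query 'site:quora.com Example College' and returns up to 10 result links (the topic
# string here is purely illustrative).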

# FastAPI endpoint to scrape and structure data
@app.post("/scrape")
async def scrape_and_structure_data(request: URLRequest):
    try:
        # Scrape visible text from the webpage
        visible_text = await scrape_visible_text(request.url)
        
        # Structure the data using Google's Gemini model
        structured_data = structure_data(visible_text, request.college_name)
        
        # Return the structured data
        return {"structured_data": structured_data}
    except Exception as e:
        print(f"Error occurred while processing the request: {e}")
        raise HTTPException(status_code=500, detail=str(e))
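
# Example request to the /scrape endpoint (assuming the server is running locally on
# port 7860 as configured at the bottom of this file; the URL and college name are
# placeholders):
#   curl -X POST http://localhost:7860/scrape \
#     -H "Content-Type: application/json" \
#     -d '{"url": "https://www.example.edu", "college_name": "Example College"}'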
    
# FastAPI endpoint to perform web crawling
@app.post("/crawl")
async def crawl_web(request: CrawlerRequest):
    try:
        topic_title = request.topic_title

        # Get top 5 links from Google search
        google_links = google_search(topic_title, num_results=5)

        # Get links from Quora
        quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10)

        # Additional sites can be added similarly
        other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10)

        # Combine all links
        all_links = google_links + quora_links + other_links

        # Use Gemini to filter and list relevant URLs
        prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}"
        model = genai.GenerativeModel("gemini-1.5-pro")
        response = model.generate_content(prompt)
        filtered_links = response.text.strip().split('\n')

        # Return the filtered links
        return {"filtered_links": filtered_links}
    except Exception as e:
        print(f"Error occurred while processing the request: {e}")
        raise HTTPException(status_code=500, detail=str(e))
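
# Example request to the /crawl endpoint (same local-server assumption; the topic is a
# placeholder):
#   curl -X POST http://localhost:7860/crawl \
#     -H "Content-Type: application/json" \
#     -d '{"topic_title": "Example College reviews"}'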
    
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
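
# To run locally (assuming this file is saved as, e.g., app.py; the actual filename may differ):
#   python app.py
# FastAPI then serves interactive API docs at http://localhost:7860/docs.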