precollege_scraper

Sleeping

App Files Files Community

adityaiiitr commited on Oct 10, 2024

Commit

0c89398

verified ·

1 Parent(s): 4c75363

initial setup

Browse files

Files changed (3) hide show

Dockerfile +19 -0
main.py +63 -0
requirements.txt +5 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,19 @@

+# Use an official Python runtime as a parent image
+FROM python:3.9
+# Set the working directory in the container
+WORKDIR /app
+# Copy the current directory contents into the container at /app
+COPY . /app
+# Install any needed packages specified in requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+RUN playwright install
+# Expose the port the app runs on
+EXPOSE 8000
+# Run the command to start the FastAPI server
+CMD ["python", "main.py"]

main.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import os
+import google.generativeai as genai
+from playwright.async_api import async_playwright
+from dotenv import load_dotenv
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import uvicorn
+import asyncio
+import json
+# Load environment variables
+load_dotenv()
+# Configure Google Generative AI API key
+genai.configure(api_key=os.environ["API_KEY"])
+# FastAPI app initialization
+app = FastAPI()
+# Function to scrape webpage and extract visible text
+async def scrape_visible_text(url):
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)  # Launch browser in headless mode
+        context = await browser.new_context(
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
+            viewport={"width": 1280, "height": 800}
+        )
+        page = await context.new_page()
+        await page.goto(url, wait_until="networkidle")
+        visible_text = await page.evaluate("document.body.innerText")
+        await browser.close()
+        return visible_text
+# Function to structure data using Google's Gemini model
+def structure_data(text, college_name):
+    prompt = f"Convert the following unstructured text into a structured format with the titles and content containing the data. Properly structure tables and general text. The structured data should contain details only about the college named '{college_name}':\n{text}"
+    model = genai.GenerativeModel("gemini-1.5-flash")
+    response = model.generate_content(prompt)
+    return response.text.strip()
+# Pydantic model for request body
+class URLRequest(BaseModel):
+    url: str
+    college_name: str
+# FastAPI endpoint to scrape and structure data
+@app.post("/scrape")
+async def scrape_and_structure_data(request: URLRequest):
+    try:
+        # Scrape visible text from the webpage
+        visible_text = await scrape_visible_text(request.url)
+        # Structure the data using Google's Gemini model
+        structured_data = structure_data(visible_text, request.college_name)
+        # Return the structured data
+        return {"structured_data": structured_data}
+    except Exception as e:
+        print(f"Error occurred while processing the request: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+fastapi
+uvicorn
+playwright
+google-generativeai
+python-dotenv