adityaiiitr committed on
Commit
0c89398
·
verified ·
1 Parent(s): 4c75363

initial setup

Browse files
Files changed (3) hide show
  1. Dockerfile +19 -0
  2. main.py +63 -0
  3. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image
FROM python:3.9

# Set the working directory in the container
WORKDIR /app

# Copy only the dependency manifest first so the install layers below are
# reused from cache until requirements.txt itself changes
COPY requirements.txt /app/requirements.txt

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Install Chromium (the only browser main.py launches) together with the
# system libraries it needs; the base image does not ship them
RUN playwright install --with-deps chromium

# Copy the rest of the current directory contents into the container at /app
COPY . /app

# Expose the port the app runs on
EXPOSE 8000

# Run the command to start the FastAPI server
CMD ["python", "main.py"]
main.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import google.generativeai as genai
3
+ from playwright.async_api import async_playwright
4
+ from dotenv import load_dotenv
5
+ from fastapi import FastAPI, HTTPException
6
+ from pydantic import BaseModel
7
+ import uvicorn
8
+ import asyncio
9
+ import json
10
+
11
# Load environment variables (a local .env file is read if one exists)
load_dotenv()

# Configure Google Generative AI API key.
# Fail fast with an actionable message instead of a bare KeyError when the
# key is missing or empty.
_api_key = os.environ.get("API_KEY")
if not _api_key:
    raise RuntimeError(
        "API_KEY environment variable is not set; "
        "define it in the environment or in a .env file."
    )
genai.configure(api_key=_api_key)

# FastAPI app initialization
app = FastAPI()
19
+
20
+ # Function to scrape webpage and extract visible text
21
async def scrape_visible_text(url):
    """Load *url* in headless Chromium and return the page's visible text.

    Args:
        url: Absolute http(s) URL of the page to scrape.

    Returns:
        The value of ``document.body.innerText`` evaluated after navigation
        has reached the "networkidle" state.

    Raises:
        ValueError: If *url* is not an absolute http or https URL.
    """
    from urllib.parse import urlsplit

    # The URL comes from the request body. Reject anything that is not a
    # plain web address so schemes such as file:// never reach the browser.
    parts = urlsplit(url)
    if parts.scheme not in ("http", "https") or not parts.netloc:
        raise ValueError(
            f"Unsupported URL (only absolute http/https URLs are allowed): {url!r}"
        )

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # Launch browser in headless mode
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
                viewport={"width": 1280, "height": 800},
            )
            page = await context.new_page()
            await page.goto(url, wait_until="networkidle")
            return await page.evaluate("document.body.innerText")
        finally:
            # Close the browser even when navigation or evaluation fails.
            await browser.close()
33
+
34
+ # Function to structure data using Google's Gemini model
35
def structure_data(text, college_name):
    """Ask the Gemini model to turn scraped text into structured content.

    Args:
        text: Unstructured text to be organised.
        college_name: Name of the college the output should be limited to.

    Returns:
        The model's text reply with surrounding whitespace stripped.
    """
    # Instruction followed by the raw text, sent as a single prompt.
    request_prompt = f"Convert the following unstructured text into a structured format with the titles and content containing the data. Properly structure tables and general text. The structured data should contain details only about the college named '{college_name}':\n{text}"
    gemini = genai.GenerativeModel("gemini-1.5-flash")
    reply = gemini.generate_content(request_prompt)
    structured = reply.text
    return structured.strip()
40
+
41
+ # Pydantic model for request body
42
class URLRequest(BaseModel):
    """Request body accepted by the POST /scrape endpoint."""

    # Address of the page to scrape
    url: str
    # College the structured output should be restricted to
    college_name: str
45
+
46
+ # FastAPI endpoint to scrape and structure data
47
@app.post("/scrape")
async def scrape_and_structure_data(request: URLRequest):
    """Scrape the requested page and return its content structured by Gemini.

    Args:
        request: Body carrying the page ``url`` and the ``college_name`` the
            output should focus on.

    Returns:
        A dict of the form ``{"structured_data": <str>}``.

    Raises:
        HTTPException: Status 500 with the error text as detail when scraping
            or structuring fails.
    """
    try:
        # Scrape visible text from the webpage
        visible_text = await scrape_visible_text(request.url)

        # Structure the data using Google's Gemini model.
        # structure_data is a synchronous call; run it in a worker thread so
        # the event loop keeps serving other requests while the model responds.
        structured_data = await asyncio.to_thread(
            structure_data, visible_text, request.college_name
        )

        # Return the structured data
        return {"structured_data": structured_data}
    except Exception as e:
        print(f"Error occurred while processing the request: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
61
+
62
if __name__ == "__main__":
    # Listen on all interfaces. The port defaults to 8000 and can be
    # overridden through the optional PORT environment variable.
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "8000")))
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ playwright
4
+ google-generativeai
5
+ python-dotenv