pvanand committed
Commit 600b195 · verified · 1 Parent(s): 43545a2

Update main.py

Files changed (1)
  1. main.py +17 -195
main.py CHANGED
@@ -1,195 +1,17 @@
- from fastapi import FastAPI, HTTPException, Query, Depends
- from fastapi.responses import Response
- from pydantic import BaseModel
- import hrequests
- import trafilatura
- from fastapi.middleware.cors import CORSMiddleware
- from typing import Optional
- from pytrends.request import TrendReq
- from datetime import datetime, timedelta
- from fastapi_cache import FastAPICache
- from fastapi_cache.backends.inmemory import InMemoryBackend
- from fastapi_cache.decorator import cache
- import pdfkit
-
- app = FastAPI()
-
- class URLRequest(BaseModel):
-     url: str
-
- @app.post("/scrape")
- async def scrape(url_request: URLRequest):
-     try:
-         response = hrequests.get(url_request.url, browser='chrome')
-
-
-         return {"content": response.text}
-     except Exception as e:
-         raise e
-
- @app.get("/extract-article")
- def extract_article(
-     url: str,
-     record_id: Optional[str] = Query(None, description="Add an ID to the metadata."),
-     no_fallback: Optional[bool] = Query(False, description="Skip the backup extraction with readability-lxml and justext."),
-     favor_precision: Optional[bool] = Query(False, description="Prefer less text but correct extraction."),
-     favor_recall: Optional[bool] = Query(False, description="When unsure, prefer more text."),
-     include_comments: Optional[bool] = Query(True, description="Extract comments along with the main text."),
-     output_format: Optional[str] = Query('txt', description="Define an output format: 'csv', 'json', 'markdown', 'txt', 'xml', 'xmltei'.", enum=["csv", "json", "markdown", "txt", "xml", "xmltei"]),
-     target_language: Optional[str] = Query(None, description="Define a language to discard invalid documents (ISO 639-1 format)."),
-     include_tables: Optional[bool] = Query(True, description="Take into account information within the HTML <table> element."),
-     include_images: Optional[bool] = Query(False, description="Take images into account (experimental)."),
-     include_links: Optional[bool] = Query(False, description="Keep links along with their targets (experimental)."),
-     deduplicate: Optional[bool] = Query(False, description="Remove duplicate segments and documents."),
-     max_tree_size: Optional[int] = Query(None, description="Discard documents with too many elements.")
- ):
-     response = hrequests.get(url)
-     filecontent = response.text
-     extracted = trafilatura.extract(
-         filecontent,
-         url=url,
-         record_id=record_id,
-         no_fallback=no_fallback,
-         favor_precision=favor_precision,
-         favor_recall=favor_recall,
-         include_comments=include_comments,
-         output_format=output_format,
-         target_language=target_language,
-         include_tables=include_tables,
-         include_images=include_images,
-         include_links=include_links,
-         deduplicate=deduplicate,
-         max_tree_size=max_tree_size
-     )
-
-     if extracted:
-         return {"article": trafilatura.utils.sanitize(extracted)}
-     else:
-         return {"error": "Could not extract the article"}
-
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],
-     allow_credentials=True,
-     allow_methods=["*"],
-     allow_headers=["*"],
- )
-
- pytrends = TrendReq()
-
- @app.on_event("startup")
- async def startup():
-     FastAPICache.init(InMemoryBackend(), prefix="fastapi-cache")
-
- @app.get("/realtime_trending_searches")
- @cache(expire=3600)
- async def get_realtime_trending_searches(pn: str = Query('US', description="Country code for trending searches")):
-     trending_searches = pytrends.realtime_trending_searches(pn=pn)
-     return trending_searches.to_dict(orient='records')
-
- @app.get("/", tags=["Home"])
- def api_home():
-     return {'detail': 'Welcome to Web-Scraping API! Visit https://pvanand-web-scraping.hf.space/docs to test'}
-
- class HTMLRequest(BaseModel):
-     html_content: str
-
- @app.post("/html_to_pdf")
- async def convert_to_pdf(request: HTMLRequest):
-     try:
-         options = {
-             'page-size': 'A4',
-             'margin-top': '0.75in',
-             'margin-right': '0.75in',
-             'margin-bottom': '0.75in',
-             'margin-left': '0.75in',
-             'encoding': "UTF-8",
-         }
-
-         pdf = pdfkit.from_string(request.html_content, False, options=options)
-         return Response(content=pdf, media_type="application/pdf")
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=str(e))
-
-
- from fastapi import FastAPI, HTTPException, Response
- from pydantic import BaseModel
- from html4docx import HtmlToDocx
- import os
-
-
- class HTMLInput(BaseModel):
-     html: str
-
- # Define the path to the temporary folder
- TEMP_FOLDER = "/app/temp"
-
- @app.post("/convert")
- async def convert_html_to_docx(input_data: HTMLInput):
-     temp_filename = None
-     try:
-         # Create a new HtmlToDocx parser
-         parser = HtmlToDocx()
-
-         # Parse the HTML string to DOCX
-         docx = parser.parse_html_string(input_data.html)
-
-         # Create a unique filename in the temporary folder
-         temp_filename = os.path.join(TEMP_FOLDER, f"temp_{os.urandom(8).hex()}.docx")
-
-         # Save the DOCX to the temporary file
-         docx.save(temp_filename)
-
-         # Open the file and read its contents
-         with open(temp_filename, 'rb') as file:
-             file_contents = file.read()
-
-         # Return the DOCX file as a response
-         return Response(
-             content=file_contents,
-             media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-             headers={"Content-Disposition": "attachment; filename=converted.docx"}
-         )
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=str(e))
-     finally:
-         # Clean up: remove the temporary file
-         if temp_filename and os.path.exists(temp_filename):
-             os.remove(temp_filename)
-
- @app.post("/html_to_docx")
- async def convert_html_to_docx(input_data: HTMLRequest):
-     temp_filename = None
-     try:
-         # Create a new HtmlToDocx parser
-         parser = HtmlToDocx()
-
-         # Parse the HTML string to DOCX
-         docx = parser.parse_html_string(input_data.html_content)
-
-         # Create a unique filename in the temporary folder
-         temp_filename = os.path.join(TEMP_FOLDER, f"temp_{os.urandom(8).hex()}.docx")
-
-         # Save the DOCX to the temporary file
-         docx.save(temp_filename)
-
-         # Open the file and read its contents
-         with open(temp_filename, 'rb') as file:
-             file_contents = file.read()
-
-         # Return the DOCX file as a response
-         return Response(
-             content=file_contents,
-             media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-             headers={"Content-Disposition": "attachment; filename=converted.docx"}
-         )
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=str(e))
-     finally:
-         # Clean up: remove the temporary file
-         if temp_filename and os.path.exists(temp_filename):
-             os.remove(temp_filename)
-
- if __name__ == "__main__":
-     import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=7860)
+ import asyncio
+ import nest_asyncio
+ from crawl4ai import AsyncWebCrawler
+ from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
+ import json
+ import time
+ from pydantic import BaseModel, Field
+
+ nest_asyncio.apply()
+
+ async def simple_crawl():
+     async with AsyncWebCrawler(verbose=True) as crawler:
+         result = await crawler.arun(url="https://www.nbcnews.com/business")
+         print(len(result.markdown))
+         return result
+ result = await simple_crawl()
+ print(result.markdown)
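
Note: the new main.py ends with a top-level `await simple_crawl()`, which is only valid in environments that support top-level await (such as a Jupyter notebook, which is also what the nest_asyncio patch suggests); as a plain script it would raise a SyntaxError. A minimal sketch of an equivalent standalone script, assuming only the AsyncWebCrawler usage shown in the diff above, could look like this:

import asyncio

from crawl4ai import AsyncWebCrawler


async def simple_crawl():
    # Crawl a single page and return the crawl4ai result object,
    # mirroring the simple_crawl() coroutine added in this commit.
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        print(len(result.markdown))
        return result


if __name__ == "__main__":
    # asyncio.run drives the coroutine from synchronous code,
    # replacing the notebook-style top-level await.
    result = asyncio.run(simple_crawl())
    print(result.markdown)

This keeps the crawl logic from the commit unchanged while making the file runnable with a plain `python main.py`.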