Update main.py
main.py CHANGED
@@ -8,6 +8,7 @@ from crawl4ai import AsyncWebCrawler
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
 import json
 import logging
+import trafilatura
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -77,6 +78,47 @@ async def test_url(api_key: str = Depends(verify_api_key), url: str = Query(...,
     result = await simple_crawl(url=url)
     return {"markdown": result.markdown}
 
+
+@app.get("/basic-crawl-article")
+async def extract_article(
+    url: str,
+    record_id: Optional[str] = Query(None, description="Add an ID to the metadata."),
+    no_fallback: Optional[bool] = Query(False, description="Skip the backup extraction with readability-lxml and justext."),
+    favor_precision: Optional[bool] = Query(False, description="Prefer less text but correct extraction."),
+    favor_recall: Optional[bool] = Query(False, description="When unsure, prefer more text."),
+    include_comments: Optional[bool] = Query(True, description="Extract comments along with the main text."),
+    output_format: Optional[str] = Query('txt', description="Define an output format: 'csv', 'json', 'markdown', 'txt', 'xml', 'xmltei'.", enum=["csv", "json", "markdown", "txt", "xml", "xmltei"]),
+    target_language: Optional[str] = Query(None, description="Define a language to discard invalid documents (ISO 639-1 format)."),
+    include_tables: Optional[bool] = Query(True, description="Take into account information within the HTML <table> element."),
+    include_images: Optional[bool] = Query(False, description="Take images into account (experimental)."),
+    include_links: Optional[bool] = Query(False, description="Keep links along with their targets (experimental)."),
+    deduplicate: Optional[bool] = Query(False, description="Remove duplicate segments and documents."),
+    max_tree_size: Optional[int] = Query(None, description="Discard documents with too many elements.")
+):
+    response = await simple_crawl(url=url)  # fetch the page with crawl4ai
+    filecontent = response.html  # raw HTML handed to trafilatura
+    extracted = trafilatura.extract(
+        filecontent,
+        url=url,
+        record_id=record_id,
+        no_fallback=no_fallback,
+        favor_precision=favor_precision,
+        favor_recall=favor_recall,
+        include_comments=include_comments,
+        output_format=output_format,
+        target_language=target_language,
+        include_tables=include_tables,
+        include_images=include_images,
+        include_links=include_links,
+        deduplicate=deduplicate,
+        max_tree_size=max_tree_size
+    )
+
+    if extracted:
+        return {"article": trafilatura.utils.sanitize(extracted)}
+    else:
+        return {"error": "Could not extract the article"}
+
 @app.get("/test")
 async def test(api_key: str = Depends(verify_api_key)):
     result = await simple_crawl("https://www.nbcnews.com/business")