pvanand committed
Commit 7faacd1 · verified · 1 Parent(s): 954d1a4

Update main.py

Files changed (1)
main.py +42 -0
main.py CHANGED
@@ -8,6 +8,7 @@ from crawl4ai import AsyncWebCrawler
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
 import json
 import logging
+import trafilatura
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -77,6 +78,47 @@ async def test_url(api_key: str = Depends(verify_api_key), url: str = Query(...,
     result = await simple_crawl(url=url)
     return {"markdown": result.markdown}
 
+
+@app.get("/basic-crawl-article")
+async def extract_article(
+    url: str,
+    record_id: Optional[str] = Query(None, description="Add an ID to the metadata."),
+    no_fallback: Optional[bool] = Query(False, description="Skip the backup extraction with readability-lxml and justext."),
+    favor_precision: Optional[bool] = Query(False, description="Prefer less text but correct extraction."),
+    favor_recall: Optional[bool] = Query(False, description="When unsure, prefer more text."),
+    include_comments: Optional[bool] = Query(True, description="Extract comments along with the main text."),
+    output_format: Optional[str] = Query('txt', description="Define an output format: 'csv', 'json', 'markdown', 'txt', 'xml', 'xmltei'.", enum=["csv", "json", "markdown", "txt", "xml", "xmltei"]),
+    target_language: Optional[str] = Query(None, description="Define a language to discard invalid documents (ISO 639-1 format)."),
+    include_tables: Optional[bool] = Query(True, description="Take into account information within the HTML <table> element."),
+    include_images: Optional[bool] = Query(False, description="Take images into account (experimental)."),
+    include_links: Optional[bool] = Query(False, description="Keep links along with their targets (experimental)."),
+    deduplicate: Optional[bool] = Query(False, description="Remove duplicate segments and documents."),
+    max_tree_size: Optional[int] = Query(None, description="Discard documents with too many elements.")
+):
+    response = await simple_crawl(url=url)
+    filecontent = response.html
+    extracted = trafilatura.extract(
+        filecontent,
+        url=url,
+        record_id=record_id,
+        no_fallback=no_fallback,
+        favor_precision=favor_precision,
+        favor_recall=favor_recall,
+        include_comments=include_comments,
+        output_format=output_format,
+        target_language=target_language,
+        include_tables=include_tables,
+        include_images=include_images,
+        include_links=include_links,
+        deduplicate=deduplicate,
+        max_tree_size=max_tree_size
+    )
+
+    if extracted:
+        return {"article": trafilatura.utils.sanitize(extracted)}
+    else:
+        return {"error": "Could not extract the article"}
+
 @app.get("/test")
 async def test(api_key: str = Depends(verify_api_key)):
     result = await simple_crawl("https://www.nbcnews.com/business")
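
The new /basic-crawl-article route can be exercised like any other GET endpoint once main.py is running. A minimal sketch using the requests library, assuming a local deployment on port 8000 (the host, port, and parameter choices are illustrative, not part of the commit):

import requests  # third-party HTTP client, assumed installed for this sketch

BASE_URL = "http://localhost:8000"  # hypothetical local deployment; adjust to your setup

resp = requests.get(
    f"{BASE_URL}/basic-crawl-article",
    params={
        "url": "https://www.nbcnews.com/business",  # same sample page the test route uses
        "output_format": "markdown",                # one of the formats declared on the route
        "favor_precision": True,                    # prefer less text but correct extraction
    },
)
print(resp.json())  # {"article": "..."} on success, {"error": "..."} otherwise

Note that, as committed, extract_article does not declare the verify_api_key dependency that the test and test_url handlers use, so the request above needs no API key.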