pvanand committed on
Commit
ffa393c
·
verified ·
1 Parent(s): 9300ae8

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +42 -1
main.py CHANGED
@@ -1,6 +1,7 @@
1
- from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
  import hrequests
 
4
  from fastapi.middleware.cors import CORSMiddleware
5
 
6
  app = FastAPI()
@@ -18,6 +19,46 @@ async def scrape(url_request: URLRequest):
18
  except Exception as e:
19
  raise e
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  app.add_middleware(
22
  CORSMiddleware,
23
  allow_origins=["*"],
 
1
+ from fastapi import FastAPI, HTTPException, Query
2
  from pydantic import BaseModel
3
  import hrequests
4
+ import trafilatura
5
  from fastapi.middleware.cors import CORSMiddleware
6
 
7
  app = FastAPI()
 
19
  except Exception as e:
20
  raise e
21
 
22
# ``Optional`` is needed by the annotations below, which are evaluated when the
# function is defined; it is imported here because the module's visible
# top-level imports do not provide it.
from typing import Optional


@app.get("/extract-article")
def extract_article(
    url: str,
    record_id: Optional[str] = Query(None, description="Add an ID to the metadata."),
    no_fallback: Optional[bool] = Query(False, description="Skip the backup extraction with readability-lxml and justext."),
    favor_precision: Optional[bool] = Query(False, description="Prefer less text but correct extraction."),
    favor_recall: Optional[bool] = Query(False, description="When unsure, prefer more text."),
    include_comments: Optional[bool] = Query(True, description="Extract comments along with the main text."),
    output_format: Optional[str] = Query('txt', description="Define an output format: 'csv', 'json', 'markdown', 'txt', 'xml', 'xmltei'.", enum=["csv", "json", "markdown", "txt", "xml", "xmltei"]),
    target_language: Optional[str] = Query(None, description="Define a language to discard invalid documents (ISO 639-1 format)."),
    include_tables: Optional[bool] = Query(True, description="Take into account information within the HTML <table> element."),
    include_images: Optional[bool] = Query(False, description="Take images into account (experimental)."),
    include_links: Optional[bool] = Query(False, description="Keep links along with their targets (experimental)."),
    deduplicate: Optional[bool] = Query(False, description="Remove duplicate segments and documents."),
    max_tree_size: Optional[int] = Query(None, description="Discard documents with too many elements.")
):
    """Download the page at ``url`` and run ``trafilatura.extract`` on its text.

    Every query parameter other than ``url`` is forwarded unchanged to
    ``trafilatura.extract`` under the same keyword name; ``url`` is used both
    as the address to fetch and as the ``url`` argument of the extractor.

    Returns:
        ``{"article": <extracted>}`` when the extractor returns a truthy
        value, otherwise ``{"error": "Could not extract the article"}``.

    Raises:
        HTTPException: with status 502 when fetching ``url`` or reading the
            response text fails.
    """
    # NOTE(review): ``url`` comes straight from the query string and is fetched
    # server-side; consider restricting the allowed schemes/hosts if this
    # endpoint is reachable by untrusted callers.
    try:
        response = hrequests.get(url)
        filecontent = response.text
    except Exception as exc:
        # Report an upstream fetch problem explicitly instead of letting it
        # surface as an opaque internal server error.
        raise HTTPException(
            status_code=502,
            detail=f"Could not fetch {url}: {exc}",
        ) from exc

    extracted = trafilatura.extract(
        filecontent,
        url=url,
        record_id=record_id,
        no_fallback=no_fallback,
        favor_precision=favor_precision,
        favor_recall=favor_recall,
        include_comments=include_comments,
        output_format=output_format,
        target_language=target_language,
        include_tables=include_tables,
        include_images=include_images,
        include_links=include_links,
        deduplicate=deduplicate,
        max_tree_size=max_tree_size,
    )

    if extracted:
        return {"article": extracted}
    # Kept as a regular JSON body (not an HTTP error) so existing clients that
    # look for the "error" key continue to work.
    return {"error": "Could not extract the article"}
61
+
62
  app.add_middleware(
63
  CORSMiddleware,
64
  allow_origins=["*"],