Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -18,7 +18,10 @@ except ImportError:
|
|
18 |
except ImportError:
|
19 |
try:
|
20 |
import trafilatura
|
|
|
21 |
EXTRACTOR_NET = 'trafilatura'
|
|
|
|
|
22 |
except ImportError:
|
23 |
raise ImportError
|
24 |
|
@@ -301,7 +304,7 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
|
|
301 |
extracted = extract_content(requests.get(url).content)
|
302 |
input_batch_content.append(extracted)
|
303 |
elif(EXTRACTOR_NET == 'trafilatura'):
|
304 |
-
extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False)
|
305 |
input_batch_content.append(extracted)
|
306 |
else:
|
307 |
print("[i] Data is news contents")
|
|
|
18 |
except ImportError:
|
19 |
try:
|
20 |
import trafilatura
|
21 |
+
from trafilatura.settings import use_config
|
22 |
EXTRACTOR_NET = 'trafilatura'
|
23 |
+
trafilatura_config = use_config()
|
24 |
+
trafilatura_config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0") # Disable the signal-based extraction timeout so it does not clash with Gradio's worker threads
|
25 |
except ImportError:
|
26 |
raise ImportError
|
27 |
|
|
|
304 |
extracted = extract_content(requests.get(url).content)
|
305 |
input_batch_content.append(extracted)
|
306 |
elif(EXTRACTOR_NET == 'trafilatura'):
|
307 |
+
extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False, config=trafilatura_config, include_tables=False)
|
308 |
input_batch_content.append(extracted)
|
309 |
else:
|
310 |
print("[i] Data is news contents")
|