rdose committed on
Commit
f606246
·
1 Parent(s): cbfe1e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -2
app.py CHANGED
@@ -306,7 +306,16 @@ def inference(input_batch,isurl,use_archive,filt_companies_topic,limit_companies
306
  extracted = extract_content(requests.get(url).content)
307
  input_batch_content.append(extracted)
308
  elif(EXTRACTOR_NET == 'trafilatura'):
309
- extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False, config=trafilatura_config, include_tables=False)
 
 
 
 
 
 
 
 
 
310
  input_batch_content.append(extracted)
311
  else:
312
  print("[i] Data is news contents")
@@ -349,7 +358,7 @@ def inference(input_batch,isurl,use_archive,filt_companies_topic,limit_companies
349
  if ner_labels[idx]: #not empty
350
  for ner in ner_labels[idx]:
351
  if filt_companies_topic:
352
- if news_sectors[idx] != ner[1]:
353
  continue
354
  dfo = pd.concat( [dfo, df.loc[[idx]].assign(company=ner[0], sector=ner[1], symbol=ner[2])], join='outer', ignore_index=True) #axis=0
355
  print("[i] Pandas output shape:",dfo.shape)
 
306
  extracted = extract_content(requests.get(url).content)
307
  input_batch_content.append(extracted)
308
  elif(EXTRACTOR_NET == 'trafilatura'):
309
+ try:
310
+ extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False, config=trafilatura_config, include_tables=False)
311
+ except:
312
+ archive = is_in_archive(url)
313
+ if archive['archived']:
314
+ print("[W] Using archive.org version of",url)
315
+ url = archive['url']
316
+ extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False, config=trafilatura_config, include_tables=False)
317
+ else:
318
+ print("[E] URL=",url,"not found")
319
  input_batch_content.append(extracted)
320
  else:
321
  print("[i] Data is news contents")
 
358
  if ner_labels[idx]: #not empty
359
  for ner in ner_labels[idx]:
360
  if filt_companies_topic:
361
+ if news_sectors[idx][0] not in ner[1]:
362
  continue
363
  dfo = pd.concat( [dfo, df.loc[[idx]].assign(company=ner[0], sector=ner[1], symbol=ner[2])], join='outer', ignore_index=True) #axis=0
364
  print("[i] Pandas output shape:",dfo.shape)