heymenn committed on
Commit
cb965ad
1 Parent(s): b5ddf28

Update scrape_3gpp.py

Browse files
Files changed (1) hide show
  1. scrape_3gpp.py +30 -0
scrape_3gpp.py CHANGED
@@ -8,6 +8,7 @@ import zipfile
8
  import textract
9
  import gradio as gr
10
  import shutil
 
11
 
12
  def browse_folder(url):
13
  if url.lower().endswith(('docs', 'docs/')):
@@ -297,6 +298,8 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
297
  if file.endswith((".pptx", ".ppt", ".pdf", ".docx", ".doc", ".DOCX")):
298
  try:
299
  text = textract.process(file_path).decode('utf-8')
 
 
300
  except Exception as e:
301
  print(f"Error processing {file_path}: {e}")
302
  errors_count += 1
@@ -419,6 +422,33 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
419
  # Here's a simplified example
420
  discussion_details = Discussion
421
  extracted_content.append(discussion_details)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
  # Add more categories as needed
423
  contenu = "\n".join(extracted_content)
424
 
 
8
  import textract
9
  import gradio as gr
10
  import shutil
11
+ from pypdf import PdfReader
12
 
13
  def browse_folder(url):
14
  if url.lower().endswith(('docs', 'docs/')):
 
298
  if file.endswith((".pptx", ".ppt", ".pdf", ".docx", ".doc", ".DOCX")):
299
  try:
300
  text = textract.process(file_path).decode('utf-8')
301
+ if file.endswith((".pdf")):
302
+ pdfReader = PdfReader(file_path)
303
  except Exception as e:
304
  print(f"Error processing {file_path}: {e}")
305
  errors_count += 1
 
422
  # Here's a simplified example
423
  discussion_details = Discussion
424
  extracted_content.append(discussion_details)
425
+
426
+ elif category == "pdf":
427
+ tabLine = []
428
+ file = pdfReader
429
+ pdfNumberPages = len(file.pages)
430
+ for pdfPage in range(0, pdfNumberPages):
431
+
432
+ load_page = file.get_page(pdfPage)
433
+ text = load_page.extract_text()
434
+ lines = text.split("\n")
435
+
436
+ keyword = ["objective", "introduction", "summary", "scope"]
437
+ for line in lines:
438
+ print(line)
439
+ if len(line) < 20:
440
+ for key in keyword:
441
+ line = line.lower()
442
+ if key in line:
443
+ start_index = line.find(key)
444
+ selectedText = lines[start_index:]
445
+
446
+ tabLine.append([pdfPage,selectedText,key])
447
+ print(f"Selected line in keywords is: {line}")
448
+ for r in tabLine:
449
+ extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
450
+ extracted_content.append(' '.join(r[1]))
451
+
452
  # Add more categories as needed
453
  contenu = "\n".join(extracted_content)
454