MaksG committed on
Commit
92701ab
1 Parent(s): 507ff3f

Update scrape_3gpp.py

Browse files
Files changed (1) hide show
  1. scrape_3gpp.py +7 -35
scrape_3gpp.py CHANGED
@@ -228,7 +228,7 @@ def update_excel(data, excel_file, url):
228
  temp_df = pd.DataFrame(data, columns=new_df_columns)
229
 
230
  try:
231
- # Load the existing Excel file if it exists, else create a new one
232
  if os.path.exists(excel_file):
233
  old_df = pd.read_excel(excel_file)
234
  df = pd.concat([old_df, temp_df], axis=0, ignore_index=True)
@@ -242,7 +242,7 @@ def update_excel(data, excel_file, url):
242
 
243
  def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
244
  folder_name = 'nom provisoire'
245
- temp_excel = '/content/temporaire.xlsx'
246
  progress(0.0,desc='Telechargement')
247
  result, message = scrape(url, excel_file, folder_name, status_list)
248
  if result:
@@ -279,7 +279,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
279
  pre_title_section = None
280
 
281
  try:
282
- df = pd.read_excel(excel_file)
283
  except Exception as e:
284
  print(f"Initializing a new DataFrame because: {e}")
285
  df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Content"])
@@ -450,39 +450,11 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
450
  print(f"Updated after processing {processed_count} files.")
451
  data = [] # Clear the data list after updating
452
 
453
-
454
-
455
- new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"] # Create a DataFrame with the updated data
456
- new_df = pd.DataFrame(data, columns=new_df_columns)
457
- try:
458
- old_df = pd.read_excel(excel_file)
459
-
460
- # Check if 'Actions' column exists in the old DataFrame
461
- if 'Actions' in old_df.columns:
462
- # Assuming you want to update 'Content' in old_df for matching 'TDoc' values in 'File'
463
- for index, new_row in new_df.iterrows():
464
- # Find matching rows in old_df where 'TDoc' matches 'File' from new_df
465
- match_indices = old_df[old_df['TDoc'] == new_row['File']].index
466
- # Update 'Content' in old_df for matching rows
467
- for i in match_indices:
468
- old_df.at[i, 'Content'] = new_row['Content']
469
- old_df.at[i, 'URL'] = new_row['URL']
470
-
471
- df = old_df
472
- ###placer la colonne content en 4eme position
473
- # current_columns = df.columns.tolist()
474
- # current_columns.remove('URL')
475
- # # Insert 'Content' at the desired position
476
- # new_columns_order = current_columns[:1] + ['URL'] + current_columns[3:]
477
- # df = df[new_columns_order]
478
- else:
479
- # If 'Actions' column doesn't exist, simply concatenate the DataFrames
480
- df = pd.concat([old_df, new_df], axis=0, ignore_index=True)
481
- except Exception as e:
482
- print("The provided excel file seems invalid:", e)
483
- df = new_df
484
 
485
  file_name = url.split("/")[-2] + ".xlsx"
486
  # Save the updated DataFrame to Excel
487
- df.to_excel(file_name, index=False)
488
  return file_name, "Téléchargement réussi"
 
228
  temp_df = pd.DataFrame(data, columns=new_df_columns)
229
 
230
  try:
231
+ # Check if the Excel file already exists and append data to it
232
  if os.path.exists(excel_file):
233
  old_df = pd.read_excel(excel_file)
234
  df = pd.concat([old_df, temp_df], axis=0, ignore_index=True)
 
242
 
243
  def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
244
  folder_name = 'nom provisoire'
245
+ temp_excel = 'temporaire.xlsx'
246
  progress(0.0,desc='Telechargement')
247
  result, message = scrape(url, excel_file, folder_name, status_list)
248
  if result:
 
279
  pre_title_section = None
280
 
281
  try:
282
+ df = pd.read_excel(temp_excel)
283
  except Exception as e:
284
  print(f"Initializing a new DataFrame because: {e}")
285
  df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Content"])
 
450
  print(f"Updated after processing {processed_count} files.")
451
  data = [] # Clear the data list after updating
452
 
453
+ if data:
454
+ # This final call ensures that any remaining data is processed and saved.
455
+ update_excel(data, temp_excel, url)
456
+ print(f"Final update after processing all files.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
  file_name = url.split("/")[-2] + ".xlsx"
459
  # Save the updated DataFrame to Excel
 
460
  return file_name, "Téléchargement réussi"