MaksG committed on
Commit 529f969
Parent: 31f53ef

Update scrape_3gpp.py

Files changed (1)
  1. scrape_3gpp.py +7 -7
scrape_3gpp.py CHANGED
@@ -67,7 +67,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     status_filenames = []
     df = pd.DataFrame() # Initialize df to ensure it's always defined
 
-    if os.path.exists(excel_file):
+    # Only proceed if excel_file is not None and it exists
+    if excel_file and os.path.exists(excel_file):
         try:
             df = pd.read_excel(excel_file)
             print(f"Initial DataFrame size: {len(df)}")
@@ -76,7 +77,6 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
                 df = df[df['TDoc Status'].isin(status_list)]
                 print(f"Filtered DataFrame size: {len(df)}")
             else:
-                # If status_list is empty, consider all statuses
                 print("No filtering applied based on TDoc Status")
 
             if not df.empty:
@@ -86,13 +86,15 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
                     status_filenames = df['URL'].tolist()
                 else:
                     print("No valid 'TDoc' or 'URL' entries found.")
-
+
                 print(f"Filenames: {status_filenames}")
             else:
                 print("DataFrame is empty after filtering.")
 
         except Exception as e:
             print(f"Error reading Excel file: {e}")
+    else:
+        print("No valid excel_file path provided.")
 
     download_directory = folder_name
     if not os.path.exists(download_directory):
@@ -109,8 +111,7 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
         filename = os.path.basename(file_url)
         save_path = os.path.join(download_directory, filename)
         progress(pourcentss, desc='Downloading')
-        # Adjust progress calculation based on actual number of files
-        pourcentss += 0.4 / len(status_filenames)
+        pourcentss += 0.4 / len(status_filenames) if status_filenames else 1 # Adjust to prevent division by zero
        try:
             with requests.get(file_url, stream=True) as r:
                 r.raise_for_status()
@@ -119,14 +120,13 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
                         f.write(chunk)
         except requests.exceptions.HTTPError as e:
             print(f"HTTP error occurred: {file_url}: {e}")
-            # Decide how you want to handle HTTP errors (e.g., skip this file, stop the process, etc.)
 
-    # Ensure correct return value, especially if the function should indicate success/failure and the number of processed files
     return True, len(status_filenames)
 
 
 
 
+
 def extractZip(url):
     # Directory where the zip files are already downloaded
     nom_extract = url.split("/")[-3] + "_extraction"
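
Note on the first hunk: os.path.exists(None) raises TypeError rather than returning False, so the added truthiness check is what makes an empty file input safe before the existence test. A minimal standalone sketch of the pattern; load_tdoc_urls is a hypothetical helper for illustration, not a function in this file:

import os
import pandas as pd

def load_tdoc_urls(excel_file, status_list):
    # excel_file may be None when no file was uploaded; check truthiness
    # first, because os.path.exists raises TypeError on None.
    if excel_file and os.path.exists(excel_file):
        df = pd.read_excel(excel_file)
        if status_list:
            df = df[df['TDoc Status'].isin(status_list)]
        return df['URL'].tolist() if 'URL' in df.columns else []
    print("No valid excel_file path provided.")
    return []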
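Note on the progress hunk: the conditional expression binds looser than division, so the new line parses as pourcentss += ((0.4 / len(status_filenames)) if status_filenames else 1). With an empty list the increment is 1 instead of a ZeroDivisionError; inside the loop over status_filenames the else branch can never actually fire, since the loop body only runs when the list is non-empty. A standalone sketch using the diff's own names:

status_filenames = []
pourcentss = 0.0
# Empty list: the else branch supplies 1, so no ZeroDivisionError.
pourcentss += 0.4 / len(status_filenames) if status_filenames else 1
assert pourcentss == 1.0

status_filenames = ["a.zip", "b.zip"]
pourcentss = 0.0
for _ in status_filenames:
    # Non-empty list: 0.4 is spread evenly across the files.
    pourcentss += 0.4 / len(status_filenames) if status_filenames else 1
assert abs(pourcentss - 0.4) < 1e-9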
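The download step itself (the context lines around the try block) follows the usual requests streaming pattern. A hedged, self-contained version; download_file is a hypothetical standalone helper, and the chunk_size is a common default rather than anything visible in the diff:

import os
import requests

def download_file(file_url, download_directory):
    filename = os.path.basename(file_url)
    save_path = os.path.join(download_directory, filename)
    try:
        # stream=True avoids holding the whole file in memory at once.
        with requests.get(file_url, stream=True) as r:
            r.raise_for_status()  # surface 4xx/5xx responses as HTTPError
            with open(save_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True
    except requests.exceptions.HTTPError as e:
        # Same policy as the commit: log the error and skip this file.
        print(f"HTTP error occurred: {file_url}: {e}")
        return False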