Commit 130902a
1 Parent(s): ffea6b6

Updating log levels

Files changed:
- main.py +8 -8
- utilities/pushshift_data.py +1 -1
main.py CHANGED

@@ -68,23 +68,23 @@ def main(date_to_fetch):
     # Load the existing dataset from the Hugging Face hub or create a new one
     try:
         dataset = load_dataset(dataset_name, download_mode="reuse_cache_if_exists", ignore_verifications=True)
-        logger.
+        logger.debug("Loading existing dataset")
         if "__index_level_0__" in dataset["all_days"].column_names:
             dataset = dataset.remove_columns(["__index_level_0__"])
     except FileNotFoundError:
-        logger.
+        logger.warning("Creating new dataset")
         dataset = DatasetDict()

     # Call get_subreddit_day with the calculated date
     logger.info(f"Fetching data for {str(date_to_fetch)}")
     submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
     df = submissions_to_dataframe(submissions)
-    logger.
+    logger.debug(f"Data fetched for {str(date_to_fetch)}")
     most_recent_date = date_to_fetch

     # Append DataFrame to split 'all_days' or create new split
     if "all_days" in dataset:
-        logger.
+        logger.debug("Appending data to split 'all_days'")
         # Merge the new submissions
         old_data = dataset['all_days'].to_pandas()
         new_data = pd.concat([old_data, df], ignore_index=True)
@@ -105,13 +105,13 @@ def main(date_to_fetch):
         # Convert back to dataset
         dataset["all_days"] = Dataset.from_pandas(new_data)
     else:
-        logger.
+        logger.debug("Creating new split 'all_days'")
         dataset["all_days"] = Dataset.from_pandas(df)
     # Log appending or creating split 'all'
-    logger.
+    logger.debug("Appended or created split 'all_days'")

     # Push the augmented dataset to the Hugging Face hub
-    logger.
+    logger.debug(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
     readme_text = update_readme(dataset_name, subreddit, date_to_fetch)
     dataset.description = readme_text
     dataset.push_to_hub(dataset_name, token=auth_token)
@@ -136,7 +136,7 @@ def run_main_continuously():
     two_days_ago = today - timedelta(days=2)

     if start_date <= two_days_ago:
-        logger.
+        logger.warning(f"Running main function for date: {start_date}")
         most_recent_date = main(start_date)
         start_date = most_recent_date + timedelta(days=1)
     else:
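Note on the new levels in main.py: they only surface if the underlying logger is configured to emit them, and this diff does not show how `logger` is created. A minimal sketch, assuming the standard library `logging` module (the project may configure logging elsewhere or use a different library):

import logging

# Hypothetical setup; the repository's actual handler and format may differ.
logging.basicConfig(
    level=logging.DEBUG,  # let the new logger.debug(...) calls through
    format="%(asctime)s %(name)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)

With the default level of WARNING, only the logger.warning(...) calls added in this commit would be visible.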
utilities/pushshift_data.py CHANGED

@@ -115,7 +115,7 @@ def scrape_submissions_by_day(subreddit_to_scrape: str, day_to_scrape: str) -> L
     actual_requests = 0
     while after < before:
         after_str, before_str = convert_timestamp_to_datetime(after), convert_timestamp_to_datetime(before)
-        logger.
+        logger.debug(f"Fetching data between timestamps {after_str} and {before_str}")
         data = get_pushshift_data(subreddit_to_scrape, before=before, after=after)
         if data is None or len(data["data"]) == 0:
             break
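The new debug line in scrape_submissions_by_day fires once per request inside the while loop, so it can be chatty on busy days. A minimal sketch for quieting just this module, assuming it obtains its logger via logging.getLogger(__name__) under the package path utilities.pushshift_data (an assumption, not shown in this diff):

import logging

# Hypothetical: raise only the scraper module's threshold so its per-request
# debug messages are suppressed while the rest of the app stays at DEBUG.
logging.getLogger("utilities.pushshift_data").setLevel(logging.INFO)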