alvanli committed
Commit: 99ec3d4
Parent(s): d08f251

use subsets

Changed files:
- main.py (+6 -11)
- utilities/praw_downloader.py (+1 -1)
- utilities/user_defined_functions.py (+8 -7)
main.py
CHANGED
@@ -22,6 +22,7 @@ subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
 dataset_name = f"{username}/reddit-{subreddit}"
 comment_dataset_name = f"{username}/reddit-comments-{subreddit}"
+subset = f"year_{datetime.now().year}"
 
 dataset_readme_path = "README.md"
 
@@ -38,16 +39,10 @@ def upload(new_df, dataset, hf_dataset_name):
     date = datetime.now().strftime('%Y-%m-%d')
 
     # Using dataset from hub
-    if 'train' in dataset.keys():
-
-
-
-    # New dataset
-    else:
-        df = new_df
-        df['new'] = True
-        df['updated'] = False
-        new_rows = len(new_df)
+    old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
+    df = merge_data(old_df=old_df, new_df=new_df)
+    new_rows = len(df) - len(old_df)
+
     df = remove_filtered_rows(df)
     dataset['train'] = Dataset.from_pandas(df, preserve_index=False)
 
@@ -56,7 +51,7 @@ def upload(new_df, dataset, hf_dataset_name):
 
     # Push the augmented dataset to the Hugging Face hub
     logger.debug(f"Pushing data for {date} to {hf_dataset_name}")
-    dataset.push_to_hub(hf_dataset_name, token=auth_token)
+    dataset.push_to_hub(hf_dataset_name, subset, token=auth_token)
     logger.info(f"Processed and pushed data for {date} to {hf_dataset_name}")
     update_dataset_readme(dataset_name=hf_dataset_name, subreddit=subreddit, new_rows=new_rows)
     logger.info(f"Updated README.")
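
Note: a minimal sketch of the subset (config) pattern adopted above, assuming a recent `datasets` release in which `push_to_hub` takes the config name as its second argument and `load_dataset` takes it as `name`. The repo id below is hypothetical and authentication is omitted.

# Minimal sketch of per-year subsets (configs); "user/reddit-demo" is hypothetical.
from datetime import datetime

from datasets import Dataset, DatasetDict, load_dataset

subset = f"year_{datetime.now().year}"  # e.g. "year_2025"

dataset = DatasetDict({"train": Dataset.from_dict({"id": ["abc"], "score": [1]})})

# Push this year's data into its own config instead of overwriting a single one.
dataset.push_to_hub("user/reddit-demo", subset)  # pass token=... for private repos

# Later runs load only the current year's subset back down.
current_year = load_dataset("user/reddit-demo", subset)
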
utilities/praw_downloader.py
CHANGED
@@ -49,7 +49,7 @@ def extract_comment_data(comment: praw.models.Comment) -> Dict[str, Any]:
         'permalink': comment.permalink,
         'depth': comment.depth,
         'link_id': comment.link_id,
-        '
+        'parent_id': comment.parent_id,
         'id': comment.id
     }
 
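
The newly stored `parent_id` is a Reddit fullname (PRAW returns it prefixed with "t1_" for a parent comment or "t3_" for the submission), while `id` is unprefixed. Purely as an illustration, not code from this repo, that is enough to rebuild comment threads later:

# Illustrative only: index stored comments by parent using the captured parent_id.
from collections import defaultdict
from typing import Any, Dict, List

def build_children_index(comments: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    children: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for comment in comments:
        # Strip the "t1_"/"t3_" prefix so keys line up with the unprefixed 'id' field.
        parent_key = comment["parent_id"].split("_", 1)[-1]
        children[parent_key].append(comment)
    return dict(children)
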
utilities/user_defined_functions.py
CHANGED
@@ -16,6 +16,7 @@ subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
 dataset_name = f"{username}/reddit-{subreddit}"
 comment_dataset_name = f"{username}/reddit-comments-{subreddit}"
+subset = f"year_{datetime.now().year}"
 
 frequency = os.environ.get("FREQUENCY", '').lower()
 if frequency not in ["daily", "hourly"]:
@@ -56,7 +57,7 @@ dummy_comment_data = {
     "new": [False],
     "depth": [2],
     "link_id": ["eqrkhgbjeh"],
-    "
+    "parent_id": ["eqrkhgbjeh"]
 }
 
 
@@ -77,7 +78,7 @@ def load_or_create_dataset():
     # Load the existing dataset from the Hugging Face hub or create a new one
     try:
         logger.debug(f"Trying to download {dataset_name}")
-        dataset = load_dataset(dataset_name, download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        dataset = load_dataset(dataset_name, subset, download_mode=DownloadMode.FORCE_REDOWNLOAD)
         logger.debug("Loading existing dataset")
     except FileNotFoundError:
         logger.warning("Creating new dataset")
@@ -85,10 +86,10 @@ def load_or_create_dataset():
         # Creating Initial Repo
         dataset = DatasetDict()
         dataset['train'] = Dataset.from_dict(dummy_data)
-        dataset.push_to_hub(
+        dataset.push_to_hub(dataset_name, subset, token=auth_token)
 
         # Pulling from Initial Repo
-        dataset = load_dataset(dataset_name)
+        dataset = load_dataset(dataset_name, subset)
 
         # Remove dummy data
         del dataset['train']
@@ -99,7 +100,7 @@ def load_or_create_comment_dataset():
     # Load the existing dataset from the Hugging Face hub or create a new one
     try:
         logger.debug(f"Trying to download {comment_dataset_name}")
-        dataset = load_dataset(comment_dataset_name, download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        dataset = load_dataset(comment_dataset_name, subset, download_mode=DownloadMode.FORCE_REDOWNLOAD)
         logger.debug("Loading existing comment dataset")
     except FileNotFoundError:
         logger.warning("Creating new comment dataset")
@@ -107,10 +108,10 @@ def load_or_create_comment_dataset():
         # Creating Initial Repo
         dataset = DatasetDict()
        dataset['train'] = Dataset.from_dict(dummy_comment_data)
-        dataset.push_to_hub(
+        dataset.push_to_hub(comment_dataset_name, subset, token=auth_token)
 
         # Pulling from Initial Repo
-        dataset = load_dataset(comment_dataset_name)
+        dataset = load_dataset(comment_dataset_name, subset)
 
         # Remove dummy data
         del dataset['train']
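
Because every year now lives in its own subset, older years remain addressable by config name. A sketch (not from the repo, with a hypothetical repo id) of enumerating and loading them via `datasets.get_dataset_config_names`:

# Sketch: list the per-year subsets of a dataset repo and load each one.
from datasets import get_dataset_config_names, load_dataset

configs = get_dataset_config_names("user/reddit-demo")  # e.g. ["year_2024", "year_2025"]

yearly = {
    name: load_dataset("user/reddit-demo", name, split="train")
    for name in configs
    if name.startswith("year_")
}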