alvanli committed
Commit 99ec3d4
1 Parent(s): d08f251

use subsets

main.py CHANGED
@@ -22,6 +22,7 @@ subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
 dataset_name = f"{username}/reddit-{subreddit}"
 comment_dataset_name = f"{username}/reddit-comments-{subreddit}"
+subset = f"year_{datetime.now().year}"
 
 dataset_readme_path = "README.md"
 
@@ -38,16 +39,10 @@ def upload(new_df, dataset, hf_dataset_name):
     date = datetime.now().strftime('%Y-%m-%d')
 
     # Using dataset from hub
-    if 'train' in dataset.keys():
-        old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
-        df = merge_data(old_df=old_df, new_df=new_df)
-        new_rows = len(df) - len(old_df)
-    # New dataset
-    else:
-        df = new_df
-        df['new'] = True
-        df['updated'] = False
-        new_rows = len(new_df)
+    old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
+    df = merge_data(old_df=old_df, new_df=new_df)
+    new_rows = len(df) - len(old_df)
+
     df = remove_filtered_rows(df)
     dataset['train'] = Dataset.from_pandas(df, preserve_index=False)
 
@@ -56,7 +51,7 @@ def upload(new_df, dataset, hf_dataset_name):
 
     # Push the augmented dataset to the Hugging Face hub
     logger.debug(f"Pushing data for {date} to {hf_dataset_name}")
-    dataset.push_to_hub(hf_dataset_name, token=auth_token)
+    dataset.push_to_hub(hf_dataset_name, subset, token=auth_token)
     logger.info(f"Processed and pushed data for {date} to {hf_dataset_name}")
     update_dataset_readme(dataset_name=hf_dataset_name, subreddit=subreddit, new_rows=new_rows)
     logger.info(f"Updated README.")
utilities/praw_downloader.py CHANGED
@@ -49,7 +49,7 @@ def extract_comment_data(comment: praw.models.Comment) -> Dict[str, Any]:
         'permalink': comment.permalink,
         'depth': comment.depth,
         'link_id': comment.link_id,
-        'submission_id': comment._submission.id,
+        'parent_id': comment.parent_id,
         'id': comment.id
     }
 
 
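Note: the comment schema swaps the private `comment._submission.id` for PRAW's public `parent_id` attribute (the dummy data in `user_defined_functions.py` below follows suit). `parent_id` is a prefixed fullname, `t3_...` when the parent is the submission itself and `t1_...` when it is another comment, so together with `link_id` the thread structure stays recoverable. A small sketch of reading those fields, assuming PRAW is installed; the credentials are placeholders:

```python
import praw

# Placeholder credentials; a real collector would read these from the environment.
reddit = praw.Reddit(
    client_id="...",
    client_secret="...",
    user_agent="reddit-dataset-collector (sketch)",
)

submission = next(reddit.subreddit("askreddit").hot(limit=1))
submission.comments.replace_more(limit=0)  # flatten "load more comments" stubs

for comment in submission.comments.list():
    # parent_id is a prefixed fullname: "t3_<submission id>" for top-level
    # comments, "t1_<comment id>" for replies; link_id always names the submission.
    print(comment.id, comment.link_id, comment.parent_id, comment.depth)
```
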
utilities/user_defined_functions.py CHANGED
@@ -16,6 +16,7 @@ subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
 dataset_name = f"{username}/reddit-{subreddit}"
 comment_dataset_name = f"{username}/reddit-comments-{subreddit}"
+subset = f"year_{datetime.now().year}"
 
 frequency = os.environ.get("FREQUENCY", '').lower()
 if frequency not in ["daily", "hourly"]:
@@ -56,7 +57,7 @@ dummy_comment_data = {
     "new": [False],
     "depth": [2],
     "link_id": ["eqrkhgbjeh"],
-    "submission_id": ["eqrkhgbjeh"]
+    "parent_id": ["eqrkhgbjeh"]
 }
 
 
@@ -77,7 +78,7 @@ def load_or_create_dataset():
     # Load the existing dataset from the Hugging Face hub or create a new one
     try:
         logger.debug(f"Trying to download {dataset_name}")
-        dataset = load_dataset(dataset_name, download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        dataset = load_dataset(dataset_name, subset, download_mode=DownloadMode.FORCE_REDOWNLOAD)
         logger.debug("Loading existing dataset")
     except FileNotFoundError:
         logger.warning("Creating new dataset")
@@ -85,10 +86,10 @@
         # Creating Initial Repo
         dataset = DatasetDict()
         dataset['train'] = Dataset.from_dict(dummy_data)
-        dataset.push_to_hub(repo_id=dataset_name, token=auth_token)
+        dataset.push_to_hub(dataset_name, subset, token=auth_token)
 
         # Pulling from Initial Repo
-        dataset = load_dataset(dataset_name)
+        dataset = load_dataset(dataset_name, subset)
 
         # Remove dummy data
         del dataset['train']
@@ -99,7 +100,7 @@ def load_or_create_comment_dataset():
     # Load the existing dataset from the Hugging Face hub or create a new one
     try:
         logger.debug(f"Trying to download {comment_dataset_name}")
-        dataset = load_dataset(comment_dataset_name, download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        dataset = load_dataset(comment_dataset_name, subset, download_mode=DownloadMode.FORCE_REDOWNLOAD)
         logger.debug("Loading existing comment dataset")
     except FileNotFoundError:
         logger.warning("Creating new comment dataset")
@@ -107,10 +108,10 @@
         # Creating Initial Repo
         dataset = DatasetDict()
         dataset['train'] = Dataset.from_dict(dummy_comment_data)
-        dataset.push_to_hub(repo_id=comment_dataset_name, token=auth_token)
+        dataset.push_to_hub(comment_dataset_name, subset, token=auth_token)
 
         # Pulling from Initial Repo
-        dataset = load_dataset(comment_dataset_name)
+        dataset = load_dataset(comment_dataset_name, subset)
 
         # Remove dummy data
         del dataset['train']
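
Note: `load_or_create_dataset()` and `load_or_create_comment_dataset()` keep their bootstrap pattern, now scoped to the yearly subset: force-redownload the subset if it exists, otherwise create it by pushing a dummy row, reload it, and delete the dummy `train` split before real data is appended. A condensed sketch of that pattern, mirroring the code above; the repo id, token, and trimmed dummy schema are placeholders:

```python
from datetime import datetime

from datasets import Dataset, DatasetDict, DownloadMode, load_dataset

repo_id = "someuser/reddit-comments-askreddit"   # hypothetical repo id
subset = f"year_{datetime.now().year}"
dummy_comment_data = {"id": ["eqrkhgbjeh"], "parent_id": ["eqrkhgbjeh"]}  # trimmed schema


def load_or_create_comment_dataset() -> DatasetDict:
    try:
        # Force a fresh download so later merges see the most recently pushed rows.
        return load_dataset(repo_id, subset, download_mode=DownloadMode.FORCE_REDOWNLOAD)
    except FileNotFoundError:
        # First run for this subset: create the config by pushing a dummy row...
        dataset = DatasetDict({"train": Dataset.from_dict(dummy_comment_data)})
        dataset.push_to_hub(repo_id, subset, token="hf_...")  # placeholder token
        # ...then reload the subset and drop the dummy split before real data is added.
        dataset = load_dataset(repo_id, subset)
        del dataset["train"]
        return dataset
```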