Commit
•
61f9cd0
1
Parent(s):
04fde16
Refactoring updates and minor fixes
Browse files
- main.py +3 -5
- utilities/user_defined_functions.py +6 -8
main.py
CHANGED
@@ -4,10 +4,9 @@ from datetime import datetime
|
|
4 |
|
5 |
import pandas as pd
|
6 |
import schedule
|
7 |
-
from datasets import Dataset
|
8 |
-
from huggingface_hub import login
|
9 |
|
10 |
-
from utilities.user_defined_functions import get_latest_data, merge_data
|
11 |
from utilities.my_logger import setup_logger
|
12 |
from utilities.readme_update import update_readme
|
13 |
|
@@ -30,9 +29,8 @@ logger = setup_logger(__name__)
|
|
30 |
def main():
|
31 |
date = datetime.now().strftime('%Y-%m-%d')
|
32 |
logger.warning(f"Running main function for date: {date}")
|
33 |
-
dataset =
|
34 |
|
35 |
-
# Get Latest Data and merge with historic data
|
36 |
new_df = get_latest_data()
|
37 |
|
38 |
# Using dataset from hub
|
|
|
4 |
|
5 |
import pandas as pd
|
6 |
import schedule
|
7 |
+
from datasets import Dataset
|
|
|
8 |
|
9 |
+
from utilities.user_defined_functions import get_latest_data, merge_data, load_or_create_dataset
|
10 |
from utilities.my_logger import setup_logger
|
11 |
from utilities.readme_update import update_readme
|
12 |
|
|
|
29 |
def main():
|
30 |
date = datetime.now().strftime('%Y-%m-%d')
|
31 |
logger.warning(f"Running main function for date: {date}")
|
32 |
+
dataset = load_or_create_dataset()
|
33 |
|
|
|
34 |
new_df = get_latest_data()
|
35 |
|
36 |
# Using dataset from hub
|
utilities/user_defined_functions.py
CHANGED
@@ -24,7 +24,8 @@ auth_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
|
|
24 |
login(auth_token, add_to_git_credential=True)
|
25 |
|
26 |
logger = setup_logger(__name__)
|
27 |
-
|
|
|
28 |
dummy_data = {
|
29 |
"id": ['id'],
|
30 |
"content": ["This is a sample post content. Just for demonstration purposes!"],
|
@@ -34,8 +35,8 @@ dummy_data = {
|
|
34 |
"title": ["Sample Post Title: How to Use Hugging Face?"],
|
35 |
"score": [457],
|
36 |
"permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
|
37 |
-
"updated": False,
|
38 |
-
"new": False,
|
39 |
}
|
40 |
|
41 |
|
@@ -89,10 +90,7 @@ def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
|
|
89 |
- pd.DataFrame: The merged, sorted, and marked dataframe.
|
90 |
"""
|
91 |
|
92 |
-
|
93 |
-
old_df['new'] = False
|
94 |
-
new_df['new'] = True
|
95 |
-
|
96 |
# Concatenate old and new dataframes, sort by 'date_utc', and reset index
|
97 |
df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
|
98 |
|
@@ -100,7 +98,7 @@ def merge_data(old_df: pd.DataFrame, new_df: pd.DataFrame) -> pd.DataFrame:
|
|
100 |
df = data_processing(df)
|
101 |
|
102 |
# Identify new rows (present in new_df but not in old_df)
|
103 |
-
df['new'] = df['
|
104 |
|
105 |
return df
|
106 |
|
|
|
24 |
login(auth_token, add_to_git_credential=True)
|
25 |
|
26 |
logger = setup_logger(__name__)
|
27 |
+
|
28 |
+
# Dummy row used when we create a new repo; make sure every value is wrapped in a list
|
29 |
dummy_data = {
|
30 |
"id": ['id'],
|
31 |
"content": ["This is a sample post content. Just for demonstration purposes!"],
|
|
|
35 |
"title": ["Sample Post Title: How to Use Hugging Face?"],
|
36 |
"score": [457],
|
37 |
"permalink": ["/r/sampleSubreddit/comments/sampleID/sample_post_title_how_to_use_hugging_face/"],
|
38 |
+
"updated": [False],
|
39 |
+
"new": [False],
|
40 |
}
|
41 |
|
42 |
|
|
|
90 |
- pd.DataFrame: The merged, sorted, and marked dataframe.
|
91 |
"""
|
92 |
|
93 |
+
old_df.drop(columns=['new', 'updated'], inplace=True)
|
|
|
|
|
|
|
94 |
# Concatenate old and new dataframes, sort by 'date_utc', and reset index
|
95 |
df = pd.concat([old_df, new_df], ignore_index=True).sort_values(by='date_utc').reset_index(drop=True)
|
96 |
|
|
|
98 |
df = data_processing(df)
|
99 |
|
100 |
# Identify new rows (present in new_df but not in old_df)
|
101 |
+
df['new'] = df['id'].apply(lambda x: x in set(new_df['id']) - set(old_df['id']))
|
102 |
|
103 |
return df
|
104 |
|