davanstrien HF staff committed on
Commit
e35e532
1 Parent(s): a8bda03

chore: Filter out unmodified template cards in load_cards()

Browse files
Files changed (1) hide show
  1. load_data.py +45 -9
load_data.py CHANGED
@@ -12,7 +12,10 @@ from chromadb.utils import embedding_functions
12
  from dotenv import load_dotenv
13
  from huggingface_hub import InferenceClient
14
  from tqdm.contrib.concurrent import thread_map
 
 
15
 
 
16
  # Set up logging
17
  logging.basicConfig(
18
  level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
@@ -67,15 +70,20 @@ def get_collection(chroma_client, embedding_function):
67
 
68
  def get_last_modified_in_collection(collection) -> datetime | None:
69
  logger.info("Fetching last modified date from collection")
70
- all_items = collection.get(include=["metadatas"])
71
- if last_modified := [
72
- datetime.fromisoformat(item["last_modified"]) for item in all_items["metadatas"]
73
- ]:
74
- last_mod = max(last_modified)
75
- logger.info(f"Last modified date: {last_mod}")
76
- return last_mod
77
- else:
78
- logger.info("No last modified date found")
 
 
 
 
 
79
  return None
80
 
81
 
@@ -106,6 +114,26 @@ def parse_markdown_column(
106
  )
107
 
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  def load_cards(
110
  min_len: int = 50,
111
  min_likes: int | None = None,
@@ -122,6 +150,14 @@ def load_cards(
122
  df = df.filter(pl.col("likes") > min_likes)
123
  if last_modified:
124
  df = df.filter(pl.col("last_modified") > last_modified)
 
 
 
 
 
 
 
 
125
  if len(df) == 0:
126
  logger.info("No cards found matching criteria")
127
  return None
 
12
  from dotenv import load_dotenv
13
  from huggingface_hub import InferenceClient
14
  from tqdm.contrib.concurrent import thread_map
15
+ import os
16
+
17
 
18
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
19
  # Set up logging
20
  logging.basicConfig(
21
  level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 
70
 
def get_last_modified_in_collection(collection) -> datetime | None:
    """Return the newest ``last_modified`` timestamp stored in *collection*.

    The collection's metadata entries are fetched and every usable
    ``last_modified`` value is parsed with ``datetime.fromisoformat``.

    Args:
        collection: Object exposing ``get(include=["metadatas"])`` that returns
            a mapping with a ``"metadatas"`` entry (presumably a Chroma
            collection -- confirm against the caller).

    Returns:
        The maximum parsed timestamp, or ``None`` when the collection holds no
        usable timestamp or the lookup fails. Failures are logged, not raised,
        so callers can treat ``None`` as "no known last-modified date".
    """
    logger.info("Fetching last modified date from collection")
    try:
        all_items = collection.get(include=["metadatas"])
        # Skip entries that are missing (None) or carry no "last_modified"
        # value, so one incomplete record does not discard the timestamps of
        # every other record.
        timestamps = [
            datetime.fromisoformat(metadata["last_modified"])
            for metadata in (all_items["metadatas"] or [])
            if metadata and metadata.get("last_modified")
        ]
        if not timestamps:
            logger.info("No last modified date found")
            return None
        last_mod = max(timestamps)
    except Exception as e:
        # Deliberate best-effort boundary: log with traceback and fall back to
        # "unknown" rather than aborting the caller.
        logger.exception(f"Error fetching last modified date: {str(e)}")
        return None
    logger.info(f"Last modified date: {last_mod}")
    return last_mod
88
 
89
 
 
114
  )
115
 
116
 
def is_unmodified_template(
    card: str,
    *,
    min_indicators: int = 3,
    min_placeholders: int = 7,
) -> bool:
    """Return True when *card* looks like an unedited dataset-card template.

    A card is flagged when EITHER condition holds:

    * at least ``min_indicators`` of the template-specific phrases below occur
      in the text, or
    * the placeholder ``[More Information Needed]`` occurs at least
      ``min_placeholders`` times.

    Args:
        card: Markdown text of the card. Non-string values (e.g. ``None``)
            are treated as "not a template" instead of raising.
        min_indicators: Number of distinct template phrases that must be
            present for the first condition.
        min_placeholders: Number of placeholder occurrences required for the
            second condition.

    Returns:
        ``True`` if the card matches either condition, ``False`` otherwise.
    """
    if not isinstance(card, str):
        return False

    placeholder = "[More Information Needed]"
    # Phrases that appear verbatim in the stock template.
    template_indicators = (
        "# Dataset Card for Dataset Name",
        "<!-- Provide a quick summary of the dataset. -->",
        "This dataset card aims to be a base template for new datasets",
        placeholder,
    )

    # Each phrase counts once no matter how often it repeats.
    indicator_count = sum(indicator in card for indicator in template_indicators)

    # Repeated placeholders alone are enough to flag a card.
    more_info_needed_count = card.count(placeholder)

    return (
        indicator_count >= min_indicators
        or more_info_needed_count >= min_placeholders
    )
135
+
136
+
137
  def load_cards(
138
  min_len: int = 50,
139
  min_likes: int | None = None,
 
150
  df = df.filter(pl.col("likes") > min_likes)
151
  if last_modified:
152
  df = df.filter(pl.col("last_modified") > last_modified)
153
+
154
+ # Filter out unmodified template cards
155
+ df = df.filter(
156
+ ~pl.col("prepended_markdown").map_elements(
157
+ is_unmodified_template, return_dtype=pl.Boolean
158
+ )
159
+ )
160
+
161
  if len(df) == 0:
162
  logger.info("No cards found matching criteria")
163
  return None