yonikremer committed on
Commit
dbafbbf
1 Parent(s): 8d0a0d3

Added a minimum number of likes and downloads for a model to be supported

Browse files
Files changed (1) hide show
  1. supported_models.py +68 -6
supported_models.py CHANGED
@@ -1,19 +1,55 @@
1
- from typing import Generator, Set
2
 
3
  import requests
4
- from bs4 import BeautifulSoup
5
  from concurrent.futures import ThreadPoolExecutor, as_completed
6
 
7
  # Listing URL for text-generation PyTorch models; get_page appends "&p=<page_index>" to it.
  SUPPORTED_MODEL_NAME_PAGES_FORMAT = "https://huggingface.co/models?pipeline_tag=text-generation&library=pytorch"
8
  # NOTE(review): presumably the thread-pool size used when fetching pages - confirm at the call site.
  MAX_WORKERS = 10
 
 
 
9
 
10
 
11
def get_model_name(model_card: BeautifulSoup) -> str:
    """Return the model name shown in a model card.

    The name is read from the card's ``<h4>`` element, which is located by
    its exact CSS class string.

    Args:
        model_card: The parsed element of a single model card.

    Returns:
        The text of the card's ``<h4>`` element.

    Raises:
        ValueError: If the card contains no ``<h4>`` with the expected
            classes (previously this surfaced as an ``AttributeError``
            on ``None``).
    """
    h4_class = "text-md truncate font-mono text-black dark:group-hover:text-yellow-500 group-hover:text-indigo-600"
    h4_tag = model_card.find("h4", class_=h4_class)
    if h4_tag is None:
        # find() returns None when nothing matches; fail with a message that
        # names the actual problem.
        raise ValueError("Could not find the model name (<h4> element) in the model card")
    return h4_tag.text
15
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def get_page(page_index: int):
18
  curr_page_url = f"{SUPPORTED_MODEL_NAME_PAGES_FORMAT}&p={page_index}"
19
  response = requests.get(curr_page_url)
@@ -23,9 +59,29 @@ def get_page(page_index: int):
23
  return None
24
 
25
 
26
def get_model_names(soup):
    """Return the names of all model cards found in *soup*, in document order."""
    names = []
    # Every model card is an <article> element with these exact classes.
    for card in soup.find_all("article", class_="overview-card-wrapper group", recursive=True):
        names.append(get_model_name(card))
    return names
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
 
31
  def generate_supported_model_names() -> Generator[str, None, None]:
@@ -39,3 +95,9 @@ def generate_supported_model_names() -> Generator[str, None, None]:
39
 
40
  def get_supported_model_names() -> Set[str]:
41
  return set(generate_supported_model_names())
 
 
 
 
 
 
 
1
+ from typing import Generator, Set, Union, List
2
 
3
  import requests
4
+ from bs4 import BeautifulSoup, Tag, NavigableString, PageElement
5
  from concurrent.futures import ThreadPoolExecutor, as_completed
6
 
7
  # Listing URL for text-generation PyTorch models; get_page appends "&p=<page_index>" to it.
  SUPPORTED_MODEL_NAME_PAGES_FORMAT = "https://huggingface.co/models?pipeline_tag=text-generation&library=pytorch"
8
  # NOTE(review): presumably the thread-pool size used when fetching pages - confirm at the call site.
  MAX_WORKERS = 10
9
  # Model names that card_filter always rejects, regardless of their counts.
+ BLACKLISTED_MODEL_NAMES = {"ykilcher/gpt-4chan"}
10
  # card_filter rejects a card whose download count is below this value.
+ MIN_NUMBER_OF_DOWNLOADS = 100
11
  # card_filter rejects a card whose like count is below this value.
+ MIN_NUMBER_OF_LIKES = 20
12
 
13
 
14
def get_model_name(model_card: Tag) -> str:
    """Return the model name shown in a model card.

    The name is read from the card's ``<h4>`` element, which is located by
    its exact CSS class string.

    Args:
        model_card: The parsed element of a single model card.

    Returns:
        The text of the card's ``<h4>`` element.

    Raises:
        ValueError: If the card contains no ``<h4>`` with the expected
            classes (previously this surfaced as an ``AttributeError``
            on ``None``).
    """
    h4_class = "text-md truncate font-mono text-black dark:group-hover:text-yellow-500 group-hover:text-indigo-600"
    h4_tag = model_card.find("h4", class_=h4_class)
    if h4_tag is None:
        # find() returns None when nothing matches; fail with a message that
        # names the actual problem.
        raise ValueError("Could not find the model name (<h4> element) in the model card")
    return h4_tag.text
18
 
19
 
20
def is_a_number(s: PageElement) -> bool:
    """Tell whether the text of *s* is a count such as ``57``, ``1,234`` or ``1.2k``.

    Accepted forms, after trimming surrounding whitespace and ignoring case:
    decimal digits with optional thousands commas, at most one decimal point,
    and at most one trailing ``k``, ``m`` or ``b`` magnitude suffix.

    Unlike stripping those characters anywhere and calling ``float()``, this
    rejects texts such as ``"nan"``, ``"inf"``, ``"1k2"`` or ``"1.2.3"`` that
    cannot be converted to a count afterwards.

    Args:
        s: An element exposing its text through ``.text``.

    Returns:
        ``True`` if the text has one of the accepted forms, else ``False``.
    """
    text = s.text.strip().lower()
    # Only a single magnitude suffix at the very end is meaningful.
    if text.endswith(("k", "m", "b")):
        text = text[:-1]
    # partition() splits on the first dot only, so a second dot stays in
    # `fraction` and makes the isdecimal() check fail.
    whole, _, fraction = text.replace(",", "").partition(".")
    # isdecimal() is False for the empty string, so bare "k" or "." is rejected.
    return (whole + fraction).isdecimal()
27
+
28
+
29
def get_numeric_contents(model_card):
    """Collect the bare-text children of a model card's counts ``<div>`` that look numeric.

    The ``<div>`` is located by its exact CSS class string. Child ``Tag``
    elements are skipped; the remaining children are kept when
    ``is_a_number`` accepts them.

    Args:
        model_card: The parsed element of a single model card.

    Returns:
        The matching children in document order. The list is empty when the
        card has no such ``<div>`` (previously this raised ``AttributeError``
        on ``None``).
    """
    div: Union[Tag, NavigableString, None] = model_card.find(
        "div",
        class_="mr-1 flex items-center overflow-hidden whitespace-nowrap text-sm leading-tight text-gray-400",
        recursive=True,
    )
    if div is None:
        # No counts block at all: report "no numbers" so the caller can
        # reject the card instead of crashing.
        return []
    return [
        content
        for content in div.contents
        if not isinstance(content, Tag) and is_a_number(content)
    ]
39
+
40
+
41
def convert_to_int(element: PageElement) -> int:
    """Convert an abbreviated count such as ``57``, ``1,234``, ``1.2k`` or ``3M`` to an int.

    Surrounding whitespace, letter case and thousands commas are ignored.
    A trailing ``k``, ``m`` or ``b`` multiplies the number by a thousand,
    a million or a billion respectively.

    Args:
        element: An element exposing its text through ``.text``.

    Returns:
        The count as an integer, rounded to the nearest whole number.

    Raises:
        ValueError: If the text is not a number in one of the forms above.
    """
    multipliers = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}
    # Thousands separators carry no value and would make int()/float() fail.
    text = element.text.strip().lower().replace(",", "")
    suffix = text[-1:]
    if suffix in multipliers:
        number, multiplier = text[:-1], multipliers[suffix]
    else:
        number, multiplier = text, 1
    try:
        # Exact path for whole numbers, with no float rounding involved.
        return int(number) * multiplier
    except ValueError:
        # round() rather than int(): a product like 1.005 * 1000 evaluates to
        # 1004.9999999999999 and truncation would report one too few.
        return round(float(number) * multiplier)
51
+
52
+
53
  def get_page(page_index: int):
54
  curr_page_url = f"{SUPPORTED_MODEL_NAME_PAGES_FORMAT}&p={page_index}"
55
  response = requests.get(curr_page_url)
 
59
  return None
60
 
61
 
62
def card_filter(model_card: Tag, model_name: str) -> bool:
    """Decide whether a model card describes a supported model.

    A card is rejected when its name is blacklisted, when it exposes fewer
    than two numeric contents, or when either count is below its minimum.
    The first numeric content is treated as the number of downloads and the
    second as the number of likes.

    Args:
        model_card: The parsed element of a single model card.
        model_name: The name of the model the card belongs to.

    Returns:
        ``True`` if the model passes every check, otherwise ``False``.
    """
    if model_name in BLACKLISTED_MODEL_NAMES:
        return False
    counts = get_numeric_contents(model_card)
    if len(counts) < 2:
        # Fewer than two numbers means the card is missing its downloads
        # and/or likes, so it is not a valid model card.
        return False
    downloads_element, likes_element = counts[0], counts[1]
    # `and` short-circuits: likes are only parsed once downloads have passed.
    return (
        convert_to_int(downloads_element) >= MIN_NUMBER_OF_DOWNLOADS
        and convert_to_int(likes_element) >= MIN_NUMBER_OF_LIKES
    )
77
+
78
+
79
def get_model_names(soup: BeautifulSoup):
    """Yield the names of the model cards in *soup* that pass ``card_filter``.

    Args:
        soup: The parsed listing page.

    Yields:
        The name of each accepted model card, in document order.
    """
    cards: List[Tag] = soup.find_all("article", class_="overview-card-wrapper group", recursive=True)
    # Both generators are lazy, so each card is named and then filtered
    # before the next card is touched.
    named_cards = ((get_model_name(card), card) for card in cards)
    yield from (name for name, card in named_cards if card_filter(card, name))
85
 
86
 
87
  def generate_supported_model_names() -> Generator[str, None, None]:
 
95
 
96
  def get_supported_model_names() -> Set[str]:
97
  return set(generate_supported_model_names())
98
+
99
+
100
if __name__ == "__main__":
    # Script entry point: collect the supported model names and report how
    # many there are, followed by the names themselves.
    supported_model_names = get_supported_model_names()
    print(
        f"Number of supported model names: {len(supported_model_names)}",
        f"Supported model names: {supported_model_names}",
        sep="\n",
    )