"""Scrape the names of text-generation PyTorch models listed on huggingface.co."""
from typing import List, Generator

from bs4 import BeautifulSoup, Tag
import urllib3

# Base listing URL; the page index is appended as the ``p`` query parameter.
SUPPORTED_MODEL_NAME_PAGES_FORMAT: str = "https://huggingface.co/models?pipeline_tag=text-generation&library=pytorch"


def get_model_name(model_card: Tag) -> str:
    """
    Gets the model name from the model card.

    The name is read from the text of the card's ``<h4>`` element whose
    ``class`` attribute matches the exact class string used below.

    :param model_card: The model card to get the model name from.
    :return: The model name.
    :raises AttributeError: If the card has no matching ``<h4>`` element
        (``find`` returns ``None`` in that case).
    """
    h4_class = "text-md truncate font-mono text-black dark:group-hover:text-yellow-500 group-hover:text-indigo-600"
    h4_tag: Tag = model_card.find("h4", class_=h4_class)
    return h4_tag.text


def get_soups() -> Generator[BeautifulSoup, None, None]:
    """
    Lazily fetches and parses the listing pages to scrape.

    Pages are requested one at a time, starting at index 0, and the
    generator stops at the first response whose status is not 200.

    :return: A generator yielding one parsed page per successful request.
    """
    # One pool for the whole crawl so connections to the host are reused
    # instead of being re-established for every page.
    http = urllib3.PoolManager()
    curr_page_index = 0
    while True:
        curr_page_url = f"{SUPPORTED_MODEL_NAME_PAGES_FORMAT}&p={curr_page_index}"
        response = http.request("GET", curr_page_url)
        if response.status != 200:
            return
        yield BeautifulSoup(response.data, "html.parser")
        curr_page_index += 1


def get_supported_model_names() -> Generator[str, None, None]:
    """
    Scrapes the supported model names from the hugging face website.

    :return: A generator yielding the supported model names, in page order.
    """
    for soup in get_soups():
        model_cards: List[Tag] = soup.find_all(
            "article",
            class_="overview-card-wrapper group",
            recursive=True,
        )
        if not model_cards:
            # A page without any model cards is treated as the end of the
            # listing. Without this guard the crawl would never terminate if
            # the server keeps answering 200 for out-of-range page indices
            # (NOTE(review): presumed server behaviour - confirm).
            return
        for model_card in model_cards:
            yield get_model_name(model_card)


if __name__ == "__main__":
    for model_name in get_supported_model_names():
        print(model_name)