SearchingFace / helpers.py
nkasmanoff's picture
Update helpers.py
fb6da00
raw
history blame
1.98 kB
import requests
def clean_up_tags(tags_list):
    """Turn a list of raw tags into one comma-separated string.

    Tags written as ``prefix:value`` are reduced to ``value``; tags without a
    colon are kept as they are.

    Args:
        tags_list (list[str]): Raw tag strings, e.g. ``["license:mit", "nlp"]``.

    Returns:
        str: The cleaned tags joined with ``", "`` (empty string for no tags).
    """
    # partition() splits on the FIRST colon only, so a value that itself
    # contains a colon (e.g. "doi:10.1:abc") is kept whole instead of being
    # truncated at the second colon.
    return ", ".join(
        tag.partition(':')[2] if ':' in tag else tag
        for tag in tags_list
    )
def check_api_url(url):
    """Ensure "/api" sits between ".co" and "/datasets" in a dataset URL.

    If the text between the first ".co" and the first following "/datasets"
    does not already contain "/api", it is inserted directly after ".co".
    A URL that lacks either marker is returned unchanged.

    Args:
        url (str): A URL string.

    Returns:
        str: The URL with "/api" inserted if necessary.
    """
    # partition() cuts at the first occurrence only and keeps the remainder
    # intact, so later ".co" or "/datasets" substrings (for example a dataset
    # named "data.conll") are not dropped from the result.
    host, dot_co, remainder = url.partition(".co")
    between, datasets_marker, tail = remainder.partition("/datasets")

    # Without both markers there is no place to insert "/api".
    if not dot_co or not datasets_marker:
        return url

    api_segment = "" if "/api" in between else "/api"
    return host + dot_co + api_segment + between + datasets_marker + tail
def get_dataset_metadata(dataset_url, timeout=10):
    """Fetch selected metadata fields for a dataset.

    The URL is first passed through ``check_api_url`` and then requested with
    an HTTP GET. From a 200 response, the JSON keys ``id``, ``description``
    and ``tags`` are copied when present.

    Args:
        dataset_url (str): URL of the dataset page or its API endpoint.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: The subset of ``id``/``description``/``tags`` found in the
        response, or an empty dict when the status code is not 200.
    """
    keys_to_retrieve = ['id', 'description', 'tags']

    # An explicit timeout keeps an unresponsive server from blocking the
    # caller indefinitely; requests applies no timeout by default.
    response = requests.get(check_api_url(dataset_url), timeout=timeout)
    if response.status_code != 200:
        return {}

    response_json = response.json()
    return {
        key: response_json[key]
        for key in keys_to_retrieve
        if key in response_json
    }
def get_dataset_readme(dataset_url, timeout=10):
    """Fetch a dataset's README text together with its id.

    The README is requested from ``<dataset_url>/raw/main/README.md``; the id
    is read from the JSON returned by the URL produced by ``check_api_url``.

    Args:
        dataset_url (str): URL of the dataset page.
        timeout (float): Seconds to wait for each request before giving up.

    Returns:
        dict: ``{'id': ..., 'README': ...}`` when both requests succeed,
        otherwise an empty dict.
    """
    # Drop trailing slashes so the README path does not contain "//raw".
    readme_url = dataset_url.rstrip('/') + '/raw/main/README.md'
    readme_response = requests.get(readme_url, timeout=timeout)
    if readme_response.status_code != 200:
        # No README: skip the metadata request entirely.
        return {}

    metadata_response = requests.get(check_api_url(dataset_url), timeout=timeout)
    if metadata_response.status_code != 200:
        # Do not try to parse an error page as metadata.
        return {}

    try:
        dataset_id = metadata_response.json()['id']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or the JSON carries no 'id' entry.
        return {}

    return {'id': dataset_id, 'README': readme_response.text}