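# Residue from the hosting site's file viewer (not part of the program),
# kept verbatim as comments so the module remains valid Python:
#   LysandreJik's picture
#   Decimate
#   9ac5ea2
#   raw
#   history blame
#   5.18 kB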
import collections
import os
from datetime import datetime, timedelta
import json
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import parse_qs, urlparse
from huggingface_hub import list_datasets, set_access_token, HfFolder
from datasets import load_dataset
import numpy as np
# The Hugging Face token is mandatory: a missing HF_TOKEN environment variable
# raises KeyError at import time.
HF_TOKEN = os.environ['HF_TOKEN']

# Register the token with huggingface_hub before any dataset is fetched.
set_access_token(HF_TOKEN)
HfFolder.save_token(HF_TOKEN)

# Datasets loaded once at start-up and shared by every request handler.
# "stars" is intentionally absent here: the /retrieveStars endpoint loads
# "open-source-metrics/stars" itself on each request.
datasets = dict(
    issues=load_dataset("open-source-metrics/issues"),
    # Rows are ordered by their 'day' column up front.
    pip=load_dataset("open-source-metrics/pip").sort('day'),
)
def running_mean(x, N, total_length=-1):
    """Return the moving average of ``x`` over windows of ``N`` samples.

    The averages are computed from a prefix-sum array, so a 1-D input of
    length ``len(x)`` yields ``len(x) - N + 1`` window means.

    Args:
        x: Sequence of numbers (anything ``numpy.insert`` accepts).
        N: Window size; must be a positive integer.
        total_length: When larger than ``len(x) + 1``, the result is
            left-padded with ``total_length - (len(x) + 1)`` zeros.
            The default of ``-1`` disables padding.

    Returns:
        A NumPy float array holding the (optionally zero-padded) window means.
    """
    # prefix_sums[i] is the sum of the first i samples (prefix_sums[0] == 0).
    prefix_sums = np.cumsum(np.insert(x, 0, 0))

    # The difference of two prefix sums N apart is the sum of one window.
    window_sums = prefix_sums[N:] - prefix_sums[:-N]

    # NOTE(review): padding is measured against the prefix-sum length
    # (len(x) + 1), not against the number of window means.
    left_padding = max(total_length - len(prefix_sums), 0)
    padded_sums = np.pad(window_sums, (left_padding, 0))

    return padded_sums / float(N)
class RequestHandler(SimpleHTTPRequestHandler):
    """Serve the static front-end and the JSON endpoints that feed it.

    Routes handled by ``do_GET``:

    * ``/`` -- rewritten to ``index.html`` and served as a static file.
    * ``/initialize`` -- sorted JSON list of the split names of the
      module-level dataset that has the most splits.
    * ``/retrievePipInstalls?input=a,b`` -- per-day ``num_downloads`` for
      each requested library.
    * ``/retrieveStars?input=a,b`` -- cumulative star counts per date for
      each requested library, newest date first.

    Any other path falls through to the static file handler.
    """

    def do_GET(self):
        """Dispatch a GET request to the matching endpoint or static file."""
        print(self.path)

        if self.path == "/":
            self.path = "index.html"
            return SimpleHTTPRequestHandler.do_GET(self)

        if self.path.startswith("/initialize"):
            return self._handle_initialize()

        if self.path.startswith("/retrievePipInstalls"):
            return self._handle_pip_installs()

        if self.path.startswith("/retrieveStars"):
            return self._handle_stars()

        return SimpleHTTPRequestHandler.do_GET(self)

    def _handle_initialize(self):
        """Reply with the sorted split names of the dataset with most splits."""
        dataset_keys = {name: set(splits.keys()) for name, splits in datasets.items()}
        # NOTE(review): the original code drops 'transformers' from the issues
        # splits; the reason is not visible in this file. `discard` keeps that
        # behaviour without failing the request when the split is absent.
        dataset_keys['issues'].discard('transformers')

        dataset_with_most_splits = max(dataset_keys.values(), key=len)

        warnings = []
        for name, splits in dataset_keys.items():
            if len(splits) < len(dataset_with_most_splits):
                # append (not extend): extend would add the message one
                # character at a time.
                warnings.append(
                    f"The {name} dataset does not contain all splits. "
                    f"Missing: {dataset_with_most_splits - splits}"
                )

        # TODO: Send and display warnings
        self._send_json(sorted(dataset_with_most_splits))

    def _handle_pip_installs(self):
        """Reply with per-day download counts for the requested libraries."""
        library_names = self._requested_libraries()
        if library_names is None:
            return

        pip_splits = datasets['pip']
        if not self._all_libraries_known(library_names, pip_splits):
            return

        downloads_by_day = {}
        for library_name in library_names:
            for row in pip_splits[library_name]:
                downloads_by_day.setdefault(row['day'], {})[library_name] = row['num_downloads']

        self._send_json(self._columns_by_key(downloads_by_day, library_names))

    def _handle_stars(self):
        """Reply with cumulative star counts for the requested libraries."""
        # Validate the query before paying for the dataset load below.
        library_names = self._requested_libraries()
        if library_names is None:
            return

        # The stars dataset is loaded on every request rather than cached at
        # module level (unchanged from the original behaviour).
        dataset_dict = load_dataset("open-source-metrics/stars", use_auth_token=True).sort('dates')
        if not self._all_libraries_known(library_names, dataset_dict):
            return

        stars_by_date = {}
        for library_name in library_names:
            dataset = dataset_dict[library_name]
            total_rows = len(dataset)
            # Decimate values if there are too many: keep every `step`-th row.
            step = total_rows // 1000 if total_rows > 1000 else 1
            for index, row in enumerate(dataset):
                # Always keep the final row so the series ends on the true total.
                if index % step != 0 and index != total_rows - 1:
                    continue
                # Rows are sorted by date, so the cumulative count is the row's
                # position. Counting only the rows that survive decimation
                # would under-report every split larger than 1000 rows.
                stars_by_date.setdefault(row['dates'], {})[library_name] = index + 1

        self._send_json(self._columns_by_key(stars_by_date, library_names, reverse=True))

    def _requested_libraries(self):
        """Return the library names from the ``input`` query parameter.

        Sends a 400 response and returns ``None`` when the parameter is
        missing or empty, instead of failing with a ``TypeError``.
        """
        query = parse_qs(urlparse(self.path).query)
        values = query.get("input")
        if not values:
            self.send_error(400, "Missing required 'input' query parameter")
            return None
        return values[0].split(',')

    def _all_libraries_known(self, library_names, splits):
        """Return True if every name is a key of ``splits``; else send a 404."""
        if all(name in splits for name in library_names):
            return True
        # The requested names are not echoed into the status line: they come
        # straight from the URL and are untrusted.
        self.send_error(404, "Unknown library name")
        return False

    @staticmethod
    def _columns_by_key(values_by_key, library_names, reverse=False):
        """Pivot ``{key: {library: value}}`` into one column per library.

        Keys are sorted ascending (descending when ``reverse`` is true) and
        returned under ``'day'``; a library with no value for a key gets
        ``None`` in that position.
        """
        ordered_keys = sorted(values_by_key, reverse=reverse)
        output = {
            name: [values_by_key[key].get(name) for key in ordered_keys]
            for name in library_names
        }
        output['day'] = ordered_keys
        return output

    def _send_json(self, payload):
        """Send ``payload`` as a 200 ``application/json`` response."""
        # Serialise first so a failure cannot leave a half-written response.
        body = json.dumps(payload).encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)
# Bind to every interface on port 7860; each request is served by
# RequestHandler on its own thread.
server = ThreadingHTTPServer(
    server_address=("", 7860),
    RequestHandlerClass=RequestHandler,
)
print("Running on port 7860")

# Blocks for the lifetime of the process.
server.serve_forever()