import os
import time
import zipfile

import pandas as pd
import requests

from buster.docparser import get_all_documents
from buster.documents_manager import DeepLakeDocumentsManager
from buster.parser import HuggingfaceParser

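# Zip archive of the pre-built HTML documentation for the transformers library,
# published on the Hugging Face Hub under the hf-doc-build/doc-build dataset repo.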
hf_transformers_zip_url = "https://huggingface.co/datasets/hf-doc-build/doc-build/resolve/main/transformers/main.zip"


def download_and_unzip(zip_url, target_dir, overwrite=False):
    """Download a zip file from zip_url and unzip it to target_dir.

    Example usage:
        zip_url = "https://example.com/example.zip"
        target_dir = "downloaded_files"
        download_and_unzip(zip_url, target_dir, overwrite=True)

    ChatGPT generated.
    """
    # Create the target directory if it doesn't already exist
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Derive the local file path from the zip url
    zip_filename = os.path.basename(zip_url)
    target_path = os.path.join(target_dir, zip_filename)

    # Skip the download if the archive is already present and overwrite is disabled
    if os.path.exists(target_path) and not overwrite:
        print(f"{zip_filename} already exists in the target directory.")
        return

    # Stream the zip file to disk in 8 KB chunks
    response = requests.get(zip_url, stream=True)
    if response.status_code == 200:
        with open(target_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"{zip_filename} downloaded successfully.")

        # Extract the archive into the target directory
        with zipfile.ZipFile(target_path, "r") as zip_ref:
            zip_ref.extractall(target_dir)
        print(f"{zip_filename} extracted successfully.")
    else:
        print(f"Failed to download {zip_filename}. Status code: {response.status_code}")


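# Download the documentation zip and extract it into the current directory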
download_and_unzip(zip_url=hf_transformers_zip_url, target_dir=".")

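# Parse the extracted html pages into sections and collect them in a DataFrame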
df = get_all_documents(
    root_dir="transformers/main/en/",
    base_url="https://huggingface.co/docs/transformers/main/en/",
    parser_cls=HuggingfaceParser,
    min_section_length=100,
    max_section_length=1000,
)

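# Label every section with the source it came from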
df["source"] = "hf_transformers" |
|
|
|
|
|
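# Keep a csv copy of the parsed sections on disk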
df.to_csv("hf_transformers.csv")

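# Set up the DeepLake vector store that will hold the sections and their embeddings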
dm = DeepLakeDocumentsManager(
    vector_store_path="deeplake_store",
    overwrite=True,
    required_columns=["url", "content", "source", "title"],
)

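# Add all sections to the vector store in batches, computing embeddings along the way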
dm.batch_add(
    df=df,
    batch_size=3000,
    min_time_interval=60,
    num_workers=32,
    csv_filename="embeddings.csv",
    csv_overwrite=False,
)
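
# Optional sanity check: reload the csv written above and confirm that sections were
# parsed with the columns the vector store requires (url, content, source, title).
# Assumes to_csv wrote the default pandas index as the first column.
chunks = pd.read_csv("hf_transformers.csv", index_col=0)
print(f"Parsed {len(chunks)} sections")
print(chunks[["url", "title", "source"]].head())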