Spaces:

jerpint
/

buster

Running

File size: 2,634 Bytes

d60948f

import os
import zipfile
import requests
import pandas as pd
import time

from buster.documents_manager import DeepLakeDocumentsManager

from buster.docparser import get_all_documents
from buster.parser import HuggingfaceParser

# URL of the zip archive fetched by the download step below. The pipeline
# later reads documents from "transformers/main/en/", so the archive is
# presumably the built transformers docs for the main branch (verify upstream).
hf_transformers_zip_url = "https://huggingface.co/datasets/hf-doc-build/doc-build/resolve/main/transformers/main.zip"


def download_and_unzip(zip_url, target_dir, overwrite=False, timeout=60):
    """Download a zip file from zip_url and unzip it into target_dir.

    The archive is stored as ``<target_dir>/<basename of zip_url>``. When that
    file already exists and ``overwrite`` is false, nothing is downloaded or
    extracted. A response with a status code other than 200 is reported with a
    printed message and the function returns without raising.

    Args:
        zip_url: URL of the zip archive to fetch.
        target_dir: Directory that receives the archive and its extracted
            contents. Created if it does not exist.
        overwrite: If true, download and extract even when the archive is
            already present in ``target_dir``.
        timeout: Timeout forwarded to ``requests.get`` so that a stalled
            server cannot block the call indefinitely.

    Returns:
        None.

    Example:
        zip_url = "https://example.com/example.zip"
        target_dir = "downloaded_files"
        download_and_unzip(zip_url, target_dir, overwrite=True)

    The first version of this function was ChatGPT generated.
    """
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(target_dir, exist_ok=True)

    # The local file name is the last path component of the URL.
    zip_filename = os.path.basename(zip_url)
    target_path = os.path.join(target_dir, zip_filename)

    # Skip all work when the archive is already there and no refresh was requested.
    if os.path.exists(target_path) and not overwrite:
        print(f"{zip_filename} already exists in the target directory.")
        return

    # Stream into a sibling ".part" file first: an interrupted download must
    # not leave a truncated archive at target_path, because a later run would
    # treat it as "already exists" and never fetch it again.
    partial_path = target_path + ".part"

    # Using the response as a context manager releases the streamed connection
    # on every exit path.
    with requests.get(zip_url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download {zip_filename}. Status code: {response.status_code}")
            return

        try:
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            # Move the completed download into place in a single step.
            os.replace(partial_path, target_path)
        finally:
            # Only present here if the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print(f"{zip_filename} downloaded successfully.")

    # Unzip the archive next to where it was saved.
    with zipfile.ZipFile(target_path, "r") as zip_ref:
        zip_ref.extractall(target_dir)
    print(f"{zip_filename} extracted successfully.")


# Download the transformers HTML pages archive and unzip it into the current
# directory. download_and_unzip does nothing when the archive file is already
# present here, so re-running the script does not re-download it.
download_and_unzip(zip_url=hf_transformers_zip_url, target_dir=".")

# Extract all documents from the HTML into a dataframe.
# root_dir is where the unzipped pages are expected to live (presumably the
# archive extracts to transformers/main/en/ -- verify against the zip layout).
# NOTE(review): min/max_section_length look like bounds on chunk size; their
# units are not visible from this file -- confirm in buster.docparser.
df = get_all_documents(
    root_dir="transformers/main/en/",
    base_url="https://huggingface.co/docs/transformers/main/en/",
    parser_cls=HuggingfaceParser,
    min_section_length=100,
    max_section_length=1000,
)

# Add the source column: every row gets the same label. "source" is one of the
# columns listed in required_columns for the documents manager below.
df["source"] = "hf_transformers"

# Save the .csv with chunks to disk (written to the working directory).
df.to_csv("hf_transformers.csv")

# Initialize the vector store at ./deeplake_store.
# NOTE(review): overwrite=True presumably replaces any existing store at that
# path -- confirm in buster.documents_manager before running against real data.
dm = DeepLakeDocumentsManager(
    vector_store_path="deeplake_store",
    overwrite=True,
    required_columns=["url", "content", "source", "title"],
)

# Add all embeddings to the vector store.
# NOTE(review): batch_size, min_time_interval and num_workers look like
# batching/throttling controls, and csv_filename/csv_overwrite like a
# checkpoint file for the embeddings; exact semantics and units are defined by
# DeepLakeDocumentsManager.batch_add and are not visible here -- confirm.
dm.batch_add(
    df=df,
    batch_size=3000,
    min_time_interval=60,
    num_workers=32,
    csv_filename="embeddings.csv",
    csv_overwrite=False,
)