use upload file/folder and dataloader
utils.py CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
 from typing import Union, Dict, List
 
 import torch
+from torch.utils.data import DataLoader
 import datasets
 from datasets import load_dataset, Dataset
 from transformers import AutoTokenizer, PreTrainedTokenizer

@@ -274,17 +275,15 @@ def batch_embed(
 
     repo = init_git_repo(new_dataset_id)
 
-    [9 removed lines not shown in the rendered diff]
-        },
-    )
+    ds = ds.map(
+        tokenize,
+        batched=True,
+        batch_size=map_batch_size,
+        fn_kwargs={
+            "tokenizer": tokenizer,
+            "column_name": column_name,
+            "padding": "max_length" if opt_level == "O4" else True,
+        },
     )
 
     embeds = []
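
The map call assumes a `tokenize` helper defined elsewhere in utils.py; only the keyword names come from the `fn_kwargs` above, so the following is a hypothetical sketch of its shape, not the file's actual code:

def tokenize(examples, tokenizer, column_name, padding):
    # Batched map: `examples` is a dict of columns. The body here is an
    # assumption; "max_length" padding under O4 presumably keeps batch
    # shapes fixed for the optimized model.
    return tokenizer(examples[column_name], padding=padding, truncation=True)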

@@ -299,23 +298,20 @@ def batch_embed(
 
     inference_bs = get_batch_size(torch.cuda.get_device_name(0), model_name, opt_level)
 
-    [2 removed lines not shown]
-    # skip through some examples
+    # skip through some examples if specified
     if num2skip > 0:
-        [1 removed line not shown]
+        ds = ds.skip(num2skip)
 
     start_time = time.time()
-    while loop:
-        batch = [next(iterator, None) for _ in range(inference_bs)]
-
-        # batch will have None values when iterator runs out
-        if batch[-1] is None:
-            batch = [x for x in batch if x is not None]
-            loop = False
-            if len(batch) == 0:
-                break
 
+    for batch in DataLoader(
+        ds,
+        batch_size=inference_bs,
+        shuffle=False,
+        num_workers=2,
+        pin_memory=True,
+        drop_last=False,
+    ):
         ids = torch.tensor([b["input_ids"] for b in batch], device=device)
         mask = torch.tensor([b["attention_mask"] for b in batch], device=device)
         t_ids = torch.zeros_like(ids)
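
One caveat with the new loop (an observation, not part of the commit): with PyTorch's default collate_fn, a DataLoader over examples that are dicts yields each batch as a single dict of stacked columns, so `for b in batch` iterates over key names. The per-example comprehensions above expect a list of dicts, which an identity collate_fn would preserve; a minimal sketch of that assumption:

from torch.utils.data import DataLoader

# collate_fn=lambda x: x keeps each batch as a list of example dicts, so
# expressions like [b["input_ids"] for b in batch] work as written.
loader = DataLoader(ds, batch_size=inference_bs, shuffle=False, collate_fn=lambda x: x)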

@@ -325,7 +321,7 @@ def batch_embed(
         embeds.extend(mean_pooling(outputs[0], mask).cpu().tolist())
         texts.extend([b[column_name] for b in batch])
 
-        current_count += [rest of line not shown]
+        current_count += ids.shape[0]
 
         # Check if we have embedded enough examples
         if current_count >= num2embed:
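
`mean_pooling` is defined elsewhere in utils.py and is not shown in this diff. A sketch of the standard formulation it presumably follows, averaging token embeddings with padding masked out:

import torch

def mean_pooling(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # Zero out padded positions, then average over the sequence dimension;
    # the clamp avoids dividing by zero on fully padded rows.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)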

@@ -405,18 +401,19 @@ def init_git_repo(repo_id: str):
 
 
 def push_to_repo(
-    [1 removed line not shown]
+    repo_id: str,
     last_count: int,
     current_count: int,
     embeds: List[List[float]],
     texts: List[str],
+    api: HfApi,
 ):
     """
     Push embeddings to the repo.
 
     Args:
-        [2 removed lines not shown]
+        repo_id (`str`):
+            id of the new dataset to create. Should include username or organization.
         last_count (`int`):
             last count of embeddings.
             This is the number of embeddings that have already been pushed.

@@ -427,9 +424,10 @@ def push_to_repo(
             list of embeddings to push to the repo
         texts (`List[str]`):
             list of texts to push to the repo
+        api (`huggingface_hub.HfApi`):
+            api to use to push to the repo
     """
 
-    # TODO: write dataset loading script as well
 
     temp_ds = Dataset.from_dict(
         {
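
A hypothetical call site consistent with the new signature (the count and list variables are assumed to come from the embedding loop in `batch_embed`; they are not defined in this hunk):

from huggingface_hub import HfApi

api = HfApi()
push_to_repo(new_dataset_id, last_count, current_count, embeds, texts, api)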

@@ -438,24 +436,46 @@ def push_to_repo(
         }
     )
 
-    [1 removed line not shown]
+    local_dir = repo_id.replace("/", "_")
+
+    data_dir = Path(local_dir) / "data"
     data_dir.mkdir(exist_ok=True, parents=True)
 
-    [2 removed lines not shown]
-    )
+    # use zfill so sorting puts the files in order
+    filename = f"embeddings_{str(last_count).zfill(8)}_{current_count}.parquet"
+    filepath = str(data_dir / filename)
 
-    [1 removed line not shown]
-        commit_message=f"Embedded examples {last_count} thru {current_count}",
-        blocking=False,
-        auto_lfs_prune=True,
-    )
+    temp_ds.to_parquet(filepath)
 
-    [1 removed line not shown]
+
+    files = sorted(list(data_dir.glob("*.parquet")))
+
+
+    if len(files) == 1:
+        api.upload_folder(
+            folder_path=str(data_dir),
+            repo_id=repo_id,
+            repo_type="dataset",
+            run_as_future=True,
+            token=os.environ["HF_TOKEN"],
+            commit_message=f"Embedded examples {last_count} thru {current_count} with folder",
+        )
+
+    else:
+
+        api.upload_file(
+            path_or_fileobj=filepath,
+            path_in_repo=f"data/{filename}",
+            repo_id=repo_id,
+            repo_type="dataset",
+            run_as_future=True,
+            token=os.environ["HF_TOKEN"],
+            commit_message=f"Embedded examples {last_count} thru {current_count}",
+        )
 
-    # most_recent_file = f"embeddings_{last_count}_{current_count}.parquet"
 
     # Delete old files
-    [3 removed lines not shown]
+
+    if len(files) > 4:
+        for file in files[:2]:
+            file.unlink()
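
Both uploads pass run_as_future=True, which queues the commit on a background thread and returns a concurrent.futures.Future instead of blocking, replacing the old non-blocking git push (`blocking=False`). If the caller ever needs to be sure a commit landed, for example before pruning local parquet files, it could hold on to the returned future; a sketch reusing the variables from `push_to_repo`:

future = api.upload_file(
    path_or_fileobj=filepath,
    path_in_repo=f"data/{filename}",
    repo_id=repo_id,
    repo_type="dataset",
    run_as_future=True,
    token=os.environ["HF_TOKEN"],
)
future.result()  # blocks until the upload commit has been pushed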