legalkit-retrieval / dataset.py
louisbrulenaudet's picture
Upload 11 files
6b2dcd4 verified
# -*- coding: utf-8 -*-
# Copyright (c) Louis Brulé Naudet. All Rights Reserved.
# This software may be used and distributed according to the terms of the License Agreement.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datasets
import polars as pl
class Dataset:
    """
    Static helpers for loading, saving and converting Hugging Face datasets.

    The class holds no state; every method is a ``@staticmethod`` and it only
    serves as a namespace around the `datasets` and `polars` libraries.
    """

    @staticmethod
    def load(
        dataset_path: str
    ) -> "datasets.Dataset | datasets.DatasetDict":
        """
        Load a dataset from disk.

        Parameters
        ----------
        dataset_path : str
            The path to the dataset on disk.

        Returns
        -------
        datasets.Dataset or datasets.DatasetDict
            The loaded object, exactly as returned by
            `datasets.load_from_disk`.

        Notes
        -----
        This method delegates to the `load_from_disk` function of the
        `datasets` module, so the directory is expected to have been written
        by the matching `save_to_disk` call.

        Example
        -------
        >>> dataset_path = "/path/to/dataset"
        >>> dataset = Dataset.load(dataset_path)
        """
        return datasets.load_from_disk(
            dataset_path=dataset_path
        )

    @staticmethod
    def save(
        dataset: datasets.Dataset,
        dataset_path: str
    ) -> None:
        """
        Save a dataset to disk.

        Parameters
        ----------
        dataset : datasets.Dataset
            The dataset to be saved.

        dataset_path : str
            The path where the dataset will be saved on disk.

        Returns
        -------
        None

        Notes
        -----
        `save_to_disk` is a method of the dataset object, not a function of
        the `datasets` module, so it is invoked on `dataset` itself.

        Example
        -------
        >>> dataset = datasets.load_dataset("my_dataset", split="train")
        >>> dataset_path = "/path/to/save/dataset"
        >>> Dataset.save(dataset, dataset_path)
        """
        # The `datasets` package exposes no module-level `save_to_disk`;
        # calling it there raises AttributeError.
        dataset.save_to_disk(dataset_path)

        return None

    @staticmethod
    def convert_to_polars(
        dataset: datasets.Dataset
    ) -> pl.DataFrame:
        """
        Convert a dataset to a Polars DataFrame with a leading row index.

        Parameters
        ----------
        dataset : datasets.Dataset
            The dataset to be converted to a Polars DataFrame.

        Returns
        -------
        pl.DataFrame
            A Polars DataFrame built from the dataset's Arrow table, with an
            additional ``index`` column holding the row number.

        Notes
        -----
        The conversion reads the Arrow table behind `dataset.data` and hands
        it to `pl.from_arrow`. Polars is a DataFrame library implemented in
        Rust.

        NOTE(review): `dataset.data` appears to be the underlying table
        without any indices mapping applied (e.g. after `select`, `shuffle`
        or `filter`) -- confirm callers pass a flattened dataset.

        Raises
        ------
        Exception
            Any error raised by Polars during the Arrow conversion is
            propagated unchanged.

        Examples
        --------
        >>> dataset = Dataset.load("/path/to/dataset")
        >>> dataframe = Dataset.convert_to_polars(dataset)
        """
        # Convert once; only the row-index call differs between versions.
        frame = pl.from_arrow(dataset.data.table)

        try:
            return frame.with_row_index()

        except AttributeError:
            # Older Polars releases do not provide `with_row_index`; fall
            # back to `with_row_count` and keep the same column name.
            return frame.with_row_count(
                name="index"
            )