Spaces:

elineve
/

H2OTest

Runtime error

File size: 8,426 Bytes

07423df

import os
from abc import abstractmethod
from dataclasses import dataclass
from typing import Any, Callable, List, Optional, Sequence, Set, Tuple, Union

from llm_studio.src.nesting import Dependency


def _scan_dirs(dirname) -> List[str]:
    """Scans a directory for subfolders

    Args:
        dirname: directory name

    Returns:
        List of subfolders

    """

    subfolders = [f.path for f in os.scandir(dirname) if f.is_dir()]
    for dirname in list(subfolders):
        subfolders.extend(_scan_dirs(dirname))
    subfolders = [x + "/" if x[-1] != "/" else x for x in subfolders]
    return subfolders


def _scan_files(
    dirname, extensions: Tuple[str, ...] = (".csv", ".pq", ".parquet", ".json")
) -> List[str]:
    """Scans a directory for files with given extension

    Args:
        dirname: directory name
        extensions: extensions to consider

    Returns:
        List of files

    """
    path_list = [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(dirname)
        for filename in filenames
        if any(map(filename.__contains__, extensions))
        and not filename.startswith("__meta_info__")
    ]
    return sorted(path_list)


def strip_prefix(
    paths: Sequence[str], ignore_set: Optional[Set[str]] = None
) -> Tuple[str, ...]:
    """
    Strips the common prefix of all the given paths.

    The prefix is the common path of the parent directories of all paths that
    are not ignored; every non-ignored path is returned relative to it.

    Args:
        paths: the paths to strip
        ignore_set: set of path names to ignore when computing the prefix.
            Ignored entries are returned unchanged. Defaults to no entries.

    Returns:
        Tuple with the same length as `paths` without common prefixes.

    Raises:
        ValueError: propagated from `os.path.commonpath`, e.g. when `paths`
            mixes absolute and relative path names.
    """

    # None instead of a mutable `set()` default, which would be shared by all calls
    ignored = set() if ignore_set is None else ignore_set

    # Only the parent directories take part in the prefix computation, so the
    # last component of every path always survives the stripping.
    paths_to_check = [
        os.path.split(os.path.normpath(path))[0]
        for path in paths
        if path not in ignored
    ]

    if len(paths_to_check) == 0:
        return tuple(paths)

    prefix = os.path.commonpath(paths_to_check)
    return tuple(
        path if path in ignored else os.path.relpath(path, prefix) for path in paths
    )


class Value:
    """Empty placeholder class; it declares no attributes or methods."""


@dataclass
class Number:
    """Plain data holder for a numeric setting with `min`, `max` and `step` fields.

    NOTE(review): presumably consumed elsewhere to describe a bounded numeric
    input; the consumer is not visible in this module - confirm.
    """

    # Lower limit; None by default (presumably meaning "unbounded" - confirm).
    min: Optional[float] = None
    # Upper limit; None by default (presumably meaning "unbounded" - confirm).
    max: Optional[float] = None
    # Increment; annotated as either a float or a string, 1.0 by default.
    step: Union[str, float] = 1.0


@dataclass
class String:
    """Plain data holder describing the choices available for a string setting."""

    # Each element of the tuple can be either:
    # - a tuple of (value, name)
    # - a string. In that case the same value will be used for name and value
    values: Any = None
    # NOTE(review): presumably allows a value outside of `values` - confirm.
    allow_custom: bool = False
    # NOTE(review): presumably hint text shown while no value is set - confirm.
    placeholder: Optional[str] = None


class DatasetValue:
    """Base class for settings whose possible values are derived from a dataset.

    Note that the class does not derive from `abc.ABC`, so `abstractmethod` is
    not enforced when instantiating; subclasses are expected to override
    `get_value`.
    """

    @abstractmethod
    def get_value(
        self, dataset: Any, value: Any, type_annotation: type, mode: str
    ) -> "Tuple[String, Any]":
        """
        Compute the possible values and the current value for a dataset.

        Args:
            dataset: The dataset to derive the possible values from.
            value: The preliminary current value(s).
            type_annotation: Type annotation of the setting.
            mode: The mode the value is requested for.

        Returns:
            Tuple of the possible values (as `String`) and the current value.
        """
        pass

    @staticmethod
    def _compute_current_values(
        current_values: List[str],
        possible_values: List[str],
        prefer_with: Optional[Callable[[str], bool]] = None,
    ) -> List[str]:
        """
        Compute current values.

        Args:
            current_values: The preliminary current values. `None` is treated
                as "no values" and a single string as a one-element list.
            possible_values: All possible values.
            prefer_with: Function determining which values to prefer as default.

        Returns:
            A non-empty list: the current values that are possible; otherwise
            the preferred possible values; otherwise the first possible value.
            `[""]` if there are no possible values at all.
        """
        if len(possible_values) == 0:
            return [""]

        if current_values is None:
            current_values = []
        elif isinstance(current_values, str):
            # a bare string would otherwise be filtered character by character
            current_values = [current_values]

        # allow only values which are in the possible values
        current_values = [
            value for value in current_values if value in possible_values
        ]

        # if the values are empty, take all the values where `prefer_with` is true
        if len(current_values) == 0 and prefer_with is not None:
            current_values = [c for c in possible_values if prefer_with(c)]

        # if they are still empty, just take the first possible value
        if len(current_values) == 0:
            current_values = [possible_values[0]]

        return current_values


@dataclass
class Directories(DatasetValue):
    """Offers the subfolders found below the dataset path as possible values."""

    # Whether to add a "None" entry: either a flag or a predicate on `mode`
    add_none: Union[bool, Callable[[str], bool]] = False
    # Predicate selecting the preferred defaults when no current value is valid
    prefer_with: Optional[Callable[[str], bool]] = None
    # Whether the "None" entry is placed first (True) or last (False)
    prefer_none: bool = True

    def get_value(self, dataset, value, type_annotation, mode) -> Tuple[String, Any]:
        """
        Compute the selectable directories and the current selection.

        Args:
            dataset: Mapping with a "path" entry, or None if there is no dataset.
            value: The preliminary current value: a string, a list of strings
                or None.
            type_annotation: If equal to `Tuple[str, ...]`, all current values
                are returned, otherwise only the first one.
            mode: Passed to `add_none` if that is a callable.

        Returns:
            Tuple of the possible values, as `String` of (path, stripped path)
            pairs, and the current value. Without a dataset, an empty `String`
            and the unchanged `value`.
        """
        if dataset is None:
            return String(tuple()), value

        available_dirs = _scan_dirs(dataset["path"])

        if (isinstance(self.add_none, bool) and self.add_none) or (
            callable(self.add_none) and self.add_none(mode)
        ):
            if self.prefer_none:
                available_dirs.insert(0, "None")
            else:
                available_dirs.append("None")

        if value is None:
            # no selection yet: let the defaults below decide instead of failing
            value = []
        elif isinstance(value, str):
            value = [value]

        value = DatasetValue._compute_current_values(
            value, available_dirs, self.prefer_with
        )

        return (
            String(
                tuple(
                    zip(
                        available_dirs,
                        strip_prefix(available_dirs, ignore_set={"None"}),
                    )
                )
            ),
            value if type_annotation == Tuple[str, ...] else value[0],
        )


@dataclass
class Files(DatasetValue):
    """Offers the data files found below the dataset path as possible values."""

    # Whether to add a "None" entry: either a flag or a predicate on `mode`
    add_none: Union[bool, Callable[[str], bool]] = False
    # Predicate selecting the preferred defaults when no current value is valid
    prefer_with: Optional[Callable[[str], bool]] = None
    # For the case where no match found, whether to prioritize
    # selecting any file or selecting no file
    prefer_none: bool = True

    def get_value(self, dataset, value, type_annotation, mode) -> Tuple[String, Any]:
        """
        Compute the selectable files and the current selection.

        Args:
            dataset: Mapping with a "path" entry, or None if there is no dataset.
            value: The preliminary current value: a string, a list of strings
                or None.
            type_annotation: If equal to `Tuple[str, ...]`, all current values
                are returned, otherwise only the first one.
            mode: Passed to `add_none` if that is a callable.

        Returns:
            Tuple of the possible values, as `String` of (path, stripped path)
            pairs, and the current value. Without a dataset, an empty `String`
            and the unchanged `value`.
        """
        if dataset is None:
            return String(tuple()), value

        available_files = _scan_files(dataset["path"])

        if (isinstance(self.add_none, bool) and self.add_none) or (
            callable(self.add_none) and self.add_none(mode)
        ):
            # The first entry is the fallback selection, so the position of
            # "None" decides between "no file" and "any file" as default.
            if self.prefer_none:
                available_files.insert(0, "None")
            else:
                available_files.append("None")

        if value is None:
            # no selection yet: let the defaults below decide instead of failing
            value = []
        elif isinstance(value, str):
            value = [value]

        value = DatasetValue._compute_current_values(
            value, available_files, self.prefer_with
        )

        return (
            String(
                tuple(
                    zip(
                        available_files,
                        strip_prefix(available_files, ignore_set={"None"}),
                    )
                )
            ),
            value if type_annotation == Tuple[str, ...] else value[0],
        )


@dataclass
class Columns(DatasetValue):
    """Offers the columns of the dataset's dataframe as possible values."""

    # Whether to add a "None" entry: either a flag or a predicate on `mode`
    add_none: Union[bool, Callable[[str], bool]] = False
    # Predicate selecting the preferred defaults when no current value is valid
    prefer_with: Optional[Callable[[str], bool]] = None

    def get_value(self, dataset, value, type_annotation, mode) -> Tuple[String, Any]:
        """
        Compute the selectable columns and the current selection.

        Args:
            dataset: Mapping that may hold a "dataframe" entry, or None if
                there is no dataset.
            value: The preliminary current value: a string, a list of strings
                or None.
            type_annotation: If equal to `Tuple[str, ...]`, all current values
                are returned, otherwise only the first one.
            mode: Passed to `add_none` if that is a callable.

        Returns:
            Tuple of the possible columns (as `String`) and the current value.
            Without a dataset, an empty `String` and the unchanged `value`.
        """
        if dataset is None:
            return String(tuple()), value

        try:
            columns = list(dataset["dataframe"].columns)
        except KeyError:
            columns = []

        if (isinstance(self.add_none, bool) and self.add_none) or (
            callable(self.add_none) and self.add_none(mode)
        ):
            columns.insert(0, "None")

        if isinstance(value, str):
            value = [value]
        if value is None:
            # Slice instead of `columns[0]`: there may be no columns at all
            # (no dataframe in the dataset and no "None" entry added).
            value = columns[:1]

        value = DatasetValue._compute_current_values(value, columns, self.prefer_with)

        return (
            String(tuple(columns)),
            value if type_annotation == Tuple[str, ...] else value[0],
        )


@dataclass
class ColumnValue(DatasetValue):
    """Offers the unique values of one dataframe column as possible values."""

    # Name of the dataframe column the possible values are read from
    column: str
    # Possible values to use when the column cannot be used
    default: List[str]
    # Predicate selecting the preferred defaults when no current value is valid
    prefer_with: Optional[Callable[[str], bool]] = None
    # Optional condition on a dataset entry; if it fails, `default` is used
    dependency: Optional[Dependency] = None

    def get_value(self, dataset, value, type_annotation, mode) -> Tuple[String, Any]:
        """
        Compute the selectable column values and the current selection.

        Args:
            dataset: Mapping that may hold a "dataframe" entry, or None if
                there is no dataset.
            value: The preliminary current value: a string, a list of strings
                or None.
            type_annotation: If equal to `Tuple[str, ...]`, all current values
                are returned, otherwise only the first one.
            mode: Unused here.

        Returns:
            Tuple of the possible values (as `String`) and the current value.
            Without a dataset, an empty `String` and the unchanged `value`.
        """
        if dataset is None:
            return String(tuple()), value

        try:
            df = dataset["dataframe"]
        except KeyError:
            df = None

        if df is not None:
            if self.dependency is not None and not self.dependency.check(
                [dataset[self.dependency.key]]
            ):
                values = self.default
            elif self.column in df:
                unique_values = list(df[self.column].unique())
                try:
                    unique_values = sorted(unique_values)
                except TypeError:
                    # Mixed types (e.g. strings next to NaN) have no natural
                    # order; fall back to ordering by the string form.
                    unique_values = sorted(unique_values, key=str)
                values = [str(v) for v in unique_values]
            else:
                values = self.default
        else:
            values = self.default

        # Normalize like the sibling classes: a bare string would otherwise be
        # filtered character by character and None would not be iterable.
        if value is None:
            value = []
        elif isinstance(value, str):
            value = [value]

        value = DatasetValue._compute_current_values(value, values, self.prefer_with)

        return (
            String(tuple(values)),
            value if type_annotation == Tuple[str, ...] else value[0],
        )