Source code for dwrappr.dataset

from typing import Tuple, List, Optional
import random
import os

import pandas as pd
import numpy as np
from dataclasses import dataclass, field, asdict

from .filehandler import save_file, load_file, get_file_extension, del_file_extension, get_folder_files
from .utils import check_any

import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


@dataclass
class DataSetMeta:
    """
    Represents metadata information for a dataset, including details about its attributes,
    usage, and related files.

    This class is designed to store and manipulate metadata for datasets, providing an
    interface for converting metadata to a pandas DataFrame, loading metadata from JSON
    files and scanning directories for metadata files.

    Attributes:
        name: str
            Name of the dataset.
        time_series: bool
            Indicates whether the dataset contains time series data.
        synthetic_data: bool
            Indicates whether the dataset contains synthetic data.
        feature_names: List[str]
            List of feature names in the dataset.
        target_names: List[str]
            List of target names in the dataset.
        auxiliary_names: List[str]
            List of auxiliary names in the dataset.
        origin: str
            Source or origin of the dataset.
        year: str
            Year associated with the dataset.
        url: str
            URL to access further information about the dataset.
        sector: str
            Sector to which the dataset belongs.
        target_type: str
            Type of the target variable (e.g., 'classification', 'regression').
        description: str
            Description or additional details about the dataset.
    """
    name: str
    feature_names: List[str]
    target_names: List[str] = field(default_factory=list)
    auxiliary_names: List[str] = field(default_factory=list)
    time_series: bool = field(init=True, default=False)
    synthetic_data: bool = field(init=True, default=False)
    origin: str = field(init=True, default=None)
    year: str = field(init=True, default=None)
    url: str = field(init=True, default=None)
    sector: str = field(init=True, default=None)
    target_type: str = field(init=True, default=None)
    description: str = field(init=True, default=None)

    def __str__(self):
        # Collect available metadata
        meta_info = [
            f"Name: {self.name}",
            f"Time Series: {self.time_series}",
            f"Synthetic Data: {self.synthetic_data}",
            f"Feature Names: {', '.join(self.feature_names)}",
        ]

        # Add optional metadata if available
        if self.target_names:
            meta_info.append(f"Target Names: {', '.join(self.target_names)}")
        if self.auxiliary_names:
            meta_info.append(f"Auxiliary Names: {', '.join(self.auxiliary_names)}")
        if self.origin:
            meta_info.append(f"Origin: {self.origin}")
        if self.year:
            meta_info.append(f"Year: {self.year}")
        if self.url:
            meta_info.append(f"URL: {self.url}")
        if self.sector:
            meta_info.append(f"Sector: {self.sector}")
        if self.target_type:
            meta_info.append(f"Target Type: {self.target_type}")
        if self.description:
            meta_info.append(f"Description: {self.description}")

        # Join all information into a single string
        return "\n".join(meta_info)

    @property
    def as_df(self) -> pd.DataFrame:
        """
        Returns the available metadata as a DataFrame.

        This property converts the object's attributes into a pandas DataFrame. Lists in
        attributes are transformed into comma-separated strings for better readability.

        Returns:
            pd.DataFrame: A single-row DataFrame representing the object's metadata and attributes.
        """
        # Start with dataclass fields
        meta_dict = asdict(self)

        # Add/overwrite with any additional attributes
        meta_dict.update({
            k: v for k, v in self.__dict__.items() if k not in meta_dict
        })

        # Convert lists to comma-separated strings for readability
        for key, value in meta_dict.items():
            if isinstance(value, list):
                meta_dict[key] = ', '.join(str(x) for x in value)

        # Convert the dictionary to a DataFrame
        df = pd.DataFrame([meta_dict])
        return df

    @classmethod
    def load(cls, filepath: str) -> 'DataSetMeta':
        r"""
        Loads an instance of DataSetMeta from a JSON file.

        This class method reads a JSON file and initializes an instance of DataSetMeta using
        the contents of the file. If the file provided does not have a .json extension, a
        ValueError is raised.

        Args:
            filepath (str): The path to the JSON file that contains the data needed to
                initialize a DataSetMeta instance.

        Returns:
            DataSetMeta: An instance of DataSetMeta initialized with the data from the JSON file.

        Raises:
            ValueError: If the file specified by 'filepath' does not have a ".json" extension.

        Example:
            >>> file_path_meta = r"dwrappr/examples/data/example_dataset_meta.json"
            >>> meta = DataSetMeta.load(file_path_meta)
            >>> meta
            DataSetMeta(name='example_data', time_series='False', synthetic_data='True', feature_names=['feature'], target_names=['target'], origin=None, year=None, url=None, sector=None, target_type=None, description=None)
        """
        if not get_file_extension(filepath) == ".json":
            raise ValueError(f"File {filepath} should have extension '.json'")
        load_dict = load_file(filepath)
        return DataSetMeta(**load_dict)

    @classmethod
    def scan_for_meta(cls, path: str, recursive: bool = True) -> List['DataSetMeta']:
        """
        Scans the directory for metadata and corresponding dataset objects.

        This function scans a specified directory for metadata and associated dataset object
        files and returns a list of DataSetMeta instances. Files with extensions '.joblib'
        and corresponding '_meta.json' are paired, with unpaired files logged.

        Args:
            path (str): The root directory path to scan for metadata and dataset object files.
            recursive (bool, optional): Indicates whether subdirectories should also be scanned.
                Defaults to True.

        Returns:
            List[DataSetMeta]: A list containing DataSetMeta objects where both '.joblib'
                dataset files and matching '_meta.json' metadata files are found. If any of
                these files are missing a counterpart, a warning is logged.

        Example:
            >>> DataSetMeta.scan_for_meta(r"dwrappr/examples/data")
            [DataSetMeta(name='example_data', time_series='False', synthetic_data='True', feature_names=['feature'], target_names=['target'], origin=None, year=None, url=None, sector=None, target_type=None, description=None)]
        """
        # Use the utility function to get all file paths
        all_files = get_folder_files(path, recursive)

        # Filtered list to store the matching DataSetMeta objects
        meta_data = []

        # Dictionaries to map base names to their respective paths
        dataset_object_files = {}
        meta_json_files = {}

        # Populate the dictionaries
        for file_path in all_files:
            file_name = os.path.basename(file_path)
            base_name, ext = os.path.splitext(file_name)

            if ext == '.joblib':
                dataset_object_files[base_name] = file_path
            elif file_name.endswith('_meta.json'):
                base_name = base_name[:-5]  # Remove '_meta' from the base name
                meta_json_files[base_name] = file_path

        # Check for corresponding files and handle cases
        for base_name in dataset_object_files:
            if base_name in meta_json_files:
                # Both files exist, load the meta
                meta = cls.load(meta_json_files[base_name])
                meta.local_filepath = dataset_object_files[base_name]
                meta_data.append(meta)
            else:
                # .joblib exists without _meta.json
                logger.warning(f"Missing _meta.json-file for {dataset_object_files[base_name]}")

        for base_name in meta_json_files:
            if base_name not in dataset_object_files:
                # _meta.json exists without .joblib
                logger.warning(f"Missing dataset_object-file for {meta_json_files[base_name]}")

        return meta_data

    def save(self, filepath: str) -> None:
        """
        Saves the instance data to a specified JSON file.

        The method ensures that the file has a '.json' extension before attempting to save
        the instance data. If the extension is incorrect, a ValueError is raised. The
        instance is first converted to a dictionary representation and then written to the
        specified file path.

        Args:
            filepath (str): The path to the file where the instance data will be saved.
                The file must have a '.json' extension.

        Returns:
            None

        Raises:
            ValueError: Raised if the file does not have a '.json' extension.
        """
        if not get_file_extension(filepath) == ".json":
            raise ValueError(f"File {filepath} should have extension '.json'")
        save_file(asdict(self), filepath)

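
# --- Illustrative usage sketch (not part of the dwrappr API) ----------------
# A hedged, minimal example of the metadata round trip defined above: build a
# DataSetMeta, save it to a temporary '*_meta.json' file and load it back.
# The dataset name, feature/target names and file name are made up, and the
# sketch assumes that the filehandler helpers read and write JSON exactly as
# they are used elsewhere in this module.
def _example_metadata_roundtrip() -> None:
    import tempfile

    meta = DataSetMeta(
        name="demo_data",  # hypothetical dataset name
        feature_names=["pressure", "temperature"],
        target_names=["failure"],
        description="Sketch-only metadata object.",
    )
    with tempfile.TemporaryDirectory() as tmp_dir:
        json_path = os.path.join(tmp_dir, "demo_data_meta.json")
        meta.save(json_path)  # must end in '.json'
        reloaded = DataSetMeta.load(json_path)
        # The single-row DataFrame view flattens list fields to strings.
        print(reloaded.as_df)
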
@dataclass
class DataPoint:
    """
    Represents a data point with associated x and optional y and z data arrays.

    This class is designed to encapsulate a data point represented by NumPy arrays and
    optionally associated data. It validates inputs during initialization to ensure they are
    NumPy arrays. The class also supports saving itself to and loading from joblib files,
    with helper methods for these tasks.

    Attributes:
        x: np.ndarray
            The primary data array, must be a NumPy array.
        y: Optional[np.ndarray]
            The secondary or optional data array, can be a NumPy array or None.
        z: Optional[np.ndarray]
            The optional auxiliary data array, can be a NumPy array or None.
    """
    x: np.ndarray
    y: Optional[np.ndarray] = field(default=None)
    z: Optional[np.ndarray] = field(default=None)

    def __post_init__(self):
        """
        Ensures proper types for the instance variables during object initialization.

        Validates that 'x' is a NumPy array and that 'y' and 'z' are either NumPy arrays or None.

        Raises:
            TypeError: If 'x' is not a numpy.ndarray.
            TypeError: If 'y' is neither a numpy.ndarray nor None.
            TypeError: If 'z' is neither a numpy.ndarray nor None.
        """
        if not isinstance(self.x, np.ndarray):
            raise TypeError(f"x should be a numpy.ndarray, got {type(self.x).__name__} instead.")
        if self.y is not None and not isinstance(self.y, np.ndarray):
            raise TypeError(f"y should be a numpy.ndarray or None, got {type(self.y).__name__} instead.")
        if self.z is not None and not isinstance(self.z, np.ndarray):
            raise TypeError(f"z should be a numpy.ndarray or None, got {type(self.z).__name__} instead.")

    @classmethod
    def load(cls, filepath: str) -> 'DataPoint':
        """
        Load a DataPoint object from a .joblib file.

        This method reads a .joblib file from the given filepath, validates its extension,
        and loads the data to instantiate a DataPoint object.

        Args:
            filepath (str): Path to the .joblib file to be loaded. The file must have a
                '.joblib' extension.

        Returns:
            DataPoint: An instance of DataPoint created using the data in the file.

        Raises:
            ValueError: If the file does not have a '.joblib' extension.
        """
        if not get_file_extension(filepath) == ".joblib":
            raise ValueError(f"File {filepath} should have extension '.joblib'")
        load_dict = load_file(filepath)
        return DataPoint(**load_dict)

    def save(self, filepath: str) -> None:
        """
        Saves the object's data to a specified file in joblib format.

        Raises an error if the specified file does not have a '.joblib' extension. Uses the
        internal representation of the object's data converted to a dictionary.

        Args:
            filepath (str): The path to the file where the object's data will be saved.

        Raises:
            ValueError: If the specified filepath does not end with '.joblib'.
        """
        if not get_file_extension(filepath) == ".joblib":
            raise ValueError(f"File {filepath} should have extension '.joblib'")
        save_file(asdict(self), filepath)

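
# --- Illustrative usage sketch (not part of the dwrappr API) ----------------
# A hedged sketch of creating a DataPoint and persisting it with the joblib
# helpers above. It assumes save_file/load_file round-trip plain dictionaries
# containing NumPy arrays (as DataPoint.save/DataPoint.load expect); all values
# are made up.
def _example_datapoint_roundtrip() -> None:
    import tempfile

    point = DataPoint(x=np.array([1.0, 2.0, 3.0]), y=np.array([0]))  # z stays None
    with tempfile.TemporaryDirectory() as tmp_dir:
        joblib_path = os.path.join(tmp_dir, "demo_point.joblib")
        point.save(joblib_path)  # must end in '.joblib'
        reloaded = DataPoint.load(joblib_path)
        print(reloaded.x, reloaded.y, reloaded.z)
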
@dataclass
class DataSet:
    """
    Represents a dataset consisting of data points, metadata, and associated attributes.

    The DataSet class is designed to store and manipulate a collection of data points, along
    with metadata and data types for features and targets. It provides multiple methods and
    properties for retrieving subsets of the dataset, accessing features and targets in
    various formats (e.g., numpy array, pandas DataFrame, PyTorch tensor), and
    loading/saving datasets.

    Attributes:
        datapoints: List[DataPoint]
            A list of data point objects that make up the dataset.
        dtypes: dict[str, str]
            A dictionary mapping column names to their data types.
        meta: DataSetMeta
            Metadata object that contains information such as feature names, target names,
            and dataset name.
    """
    datapoints: list[DataPoint] = field(default_factory=list)
    dtypes: dict[str, str] = field(default_factory=dict)  # Dictionary to store dtypes
    meta: DataSetMeta = field(default_factory=DataSetMeta)

    def __getitem__(self, idx):
        """
        Retrieve a subset of data points from the dataset based on the specified index.

        If the index is an integer, retrieves a single data point as a subset. If the index
        is a slice, retrieves a subset of data points based on the range specified by the
        slice. The returned subset is encapsulated within a new DataSet instance.

        Args:
            idx (int or slice): The index or range to access. Must be an integer for
                retrieving a single data point, or a slice for retrieving a subset.

        Returns:
            DataSet: A new DataSet instance containing the selected subset of data points.

        Raises:
            TypeError: If the provided index is neither an integer nor a slice.

        Example:
            >>> ds[0]
            DataSet(datapoints=[DataPoint(x=array([12]), y=array([0]))], dtypes={'feature': dtype('int64'), 'target': dtype('int64')}, meta=DataSetMeta(name='example_data', time_series='False', synthetic_data='True', feature_names=['feature'], target_names=['target'], origin=None, year=None, url=None, sector=None, target_type=None, description=None))
            >>> ds[1:3]
            DataSet(datapoints=[DataPoint(x=array([7]), y=array([1])), DataPoint(x=array([15]), y=array([0]))], dtypes={'feature': dtype('int64'), 'target': dtype('int64')}, meta=DataSetMeta(name='example_data', time_series='False', synthetic_data='True', feature_names=['feature'], target_names=['target'], origin=None, year=None, url=None, sector=None, target_type=None, description=None))
        """
        if isinstance(idx, slice):
            subset_datapoints = self.datapoints[idx]
        elif isinstance(idx, int):
            subset_datapoints = [self.datapoints[idx]]
        else:
            raise TypeError("Index must be an int or slice")

        # Create a new DataSet instance with the subset of datapoints
        subset = DataSet(datapoints=subset_datapoints,
                         dtypes=self.dtypes,
                         meta=self.meta,
                         )
        return subset

    def __str__(self) -> str:
        """
        Provides a string representation of the object.

        This function returns a string representation of the dataset object by combining its
        key attributes in a human-readable format. This method formats the details of the
        object's name, data types, metadata, number of data points, and how to access data
        points. The aim is to offer a concise summary of the object's state.

        Returns:
            str: A string representation of the object containing its essential attributes.

        Example:
            >>> ds
            DataSet(datapoints=[DataPoint(x=array([12]), y=array([0])), DataPoint(x=array([7]), y=array([1])), DataPoint(x=array([15]), y=array([0])), DataPoint(x=array([9]), y=array([1]))], dtypes={'feature': dtype('int64'), 'target': dtype('int64')}, meta=DataSetMeta(name='example_data', time_series='False', synthetic_data='True', feature_names=['feature'], target_names=['target'], origin=None, year=None, url=None, sector=None, target_type=None, description=None))
        """
        return (f"name: {self.name}\n"
                f"meta: {self.meta}\n"
                f"num_datapoints: {self.num_datapoints}\n"
                f"dtypes: access dtypes with <{self.__class__.__name__}_instance>.dtypes\n"
                f"datapoints[list]: access datapoints with <{self.__class__.__name__}_instance>.datapoints")

    def __len__(self) -> int:
        """
        Returns the number of data points in the object.

        This method allows determining the size or length of the dataset or collection
        represented by the object. It is often used where an object defines a
        collection-like interface.

        Returns:
            int: The total number of data points contained in the object.

        Example:
            >>> len(ds)
            4
        """
        return self.num_datapoints

    @property
    def name(self) -> str:
        """
        Returns the name attribute of the meta property.

        This property retrieves the name stored in the meta attribute. It does not accept
        any arguments and directly returns the name as a string.

        Returns:
            str: The name value associated with the meta attribute.

        Example:
            >>> ds.name
            'example_data'
        """
        return self.meta.name

    @property
    def feature_names(self) -> list[str]:
        """
        Returns the names of features used in the metadata.

        This property provides access to the feature names attribute present in the metadata
        object. It retrieves and returns the list of feature names.

        Returns:
            List[str]: The list of feature names.

        Example:
            >>> ds.feature_names
            ['feature']
        """
        return self.meta.feature_names

    @property
    def target_names(self) -> list[str]:
        """
        Returns the list of target names specified in the metadata.

        The method fetches and provides a list containing the target names which are stored
        in the meta attribute. The list represents the names or labels that correspond to
        target values in a dataset or similar context.

        Returns:
            list[str]: A list of target names.

        Example:
            >>> ds.target_names
            ['target']
        """
        return self.meta.target_names

    @property
    def auxiliary_names(self) -> list[str]:
        """
        Returns the list of auxiliary names specified in the metadata.

        The method fetches and provides a list containing the auxiliary names which are
        stored in the meta attribute. The list represents the names or labels that
        correspond to auxiliary values in a dataset or similar context.

        Returns:
            list[str]: A list of auxiliary names.

        Example:
            >>> ds.auxiliary_names
            []
        """
        return self.meta.auxiliary_names

    @property
    def num_datapoints(self) -> int:
        """
        Returns the number of datapoints in the dataset.

        This property calculates the total count of datapoints currently present and
        provides this information as an integer.

        Returns:
            int: The total number of datapoints in the dataset.
        """
        return len(self.datapoints)

    @property
    def x(self) -> np.ndarray:
        """
        Returns the x-coordinates of all datapoints in the current object.

        This property compiles a list of the x-values from all elements in the 'datapoints'
        attribute and returns them as a NumPy array. The returned array provides a
        structured format of the x-coordinates for further computations or manipulations.

        Returns:
            np.ndarray: A NumPy array containing the x-coordinates of the datapoints in the
                object.

        Example:
            >>> ds.x
            array([[12],
                   [ 7],
                   [15],
                   [ 9]])
        """
        return np.array([datapoint.x for datapoint in self.datapoints])

    @property
    def x_as_df(self) -> pd.DataFrame:
        """
        Returns the `x` attribute as a pandas DataFrame.

        Provides a property method to process and return the `x` attribute formatted as a
        pandas DataFrame with updated data types. The output DataFrame's schema is adjusted
        according to the stored metadata and type definitions.

        Returns:
            pd.DataFrame: A pandas DataFrame created from the `x` attribute, with columns
                named according to `meta.feature_names` and updated data types based on the
                metadata settings.
        """
        df = pd.DataFrame(self.x, columns=self.meta.feature_names)
        # Apply the stored dtypes to the DataFrame
        df = self._update_df_dtypes(df)
        return df

    @property
    def x_as_tensor(self) -> 'torch.Tensor':
        """
        Returns the `x` attribute of the instance as a PyTorch tensor.

        This property converts the `x` attribute to a PyTorch tensor of type torch.float32.
        It requires PyTorch to be installed in the environment.

        Returns:
            torch.Tensor: The x attribute of the instance converted to a tensor.

        Raises:
            ImportError: If PyTorch is not installed in the environment.
        """
        try:
            import torch
        except ImportError:
            raise ImportError("PyTorch is required to use x_as_tensor. Please install it via 'pip install torch'.")
        return torch.tensor(self.x, dtype=torch.float32)

    @property
    def y(self) -> Optional[np.ndarray]:
        """
        Returns the y values extracted from all datapoints as a NumPy array.

        Y values correspond to the 'y' attribute of each datapoint in the list of
        datapoints. If no datapoints are present, an empty array is returned.

        Returns:
            Optional[np.ndarray]: A NumPy array of y values from the datapoints.

        Example:
            >>> ds.y
            array([[0],
                   [1],
                   [0],
                   [1]])
        """
        return np.array([datapoint.y for datapoint in self.datapoints])

    @property
    def y_as_df(self) -> pd.DataFrame:
        """
        Returns the target variables as a pandas DataFrame.

        This property provides a DataFrame representation of the target variables with
        column names corresponding to the `target_names` attribute. It also ensures that the
        DataFrame's data types are updated consistent with any pre-defined data type
        information.

        Returns:
            pd.DataFrame: The target variables represented as a pandas DataFrame with
                appropriately updated data types.
        """
        if not self.target_names:
            return pd.DataFrame()
        df = pd.DataFrame(self.y, columns=self.target_names)
        # Apply the stored dtypes to the DataFrame
        df = self._update_df_dtypes(df)
        return df

    @property
    def y_as_tensor(self) -> 'torch.Tensor':
        """
        Returns the attribute 'y' as a PyTorch tensor.

        This property converts the 'y' attribute of the object into a PyTorch tensor with a
        data type of float32. It requires PyTorch to be installed, and will raise an
        ImportError if it is not available.

        Returns:
            torch.Tensor: The attribute 'y' represented as a tensor of type torch.float32.

        Raises:
            ImportError: If the PyTorch library is not installed.
        """
        try:
            import torch
        except ImportError:
            raise ImportError("PyTorch is required to use y_as_tensor. Please install it via 'pip install torch'.")
        return torch.tensor(self.y, dtype=torch.float32)

    @property
    def z(self) -> np.ndarray:
        """
        Returns the z values extracted from all datapoints as a NumPy array.

        Z values correspond to the 'z' attribute of each datapoint in the list of
        datapoints. If no datapoints are present, an empty array is returned.

        Returns:
            np.ndarray: A NumPy array of z values from the datapoints.
        """
        return np.array([datapoint.z for datapoint in self.datapoints])

    @property
    def z_as_df(self) -> pd.DataFrame:
        """
        Returns the auxiliary variables as a pandas DataFrame.

        This property provides a DataFrame representation of the auxiliary variables with
        column names corresponding to the `auxiliary_names` attribute. It also ensures that
        the DataFrame's data types are updated consistent with any pre-defined data type
        information.

        Returns:
            pd.DataFrame: The auxiliary variables represented as a pandas DataFrame with
                appropriately updated data types.
        """
        if not self.auxiliary_names:
            return pd.DataFrame()
        df = pd.DataFrame(self.z, columns=self.auxiliary_names)
        # Apply the stored dtypes to the DataFrame
        df = self._update_df_dtypes(df)
        return df

    @property
    def as_df(self) -> pd.DataFrame:
        """
        Returns the dataset object as a pandas DataFrame.

        This property converts the stored DataPoints into a DataFrame. It concatenates the
        x, y and z DataFrames along the columns axis and applies the stored data types to
        the resulting DataFrame before returning it.

        Returns:
            pd.DataFrame: A DataFrame representation of the stored DataPoints, with the
                stored data types applied.
        """
        # Convert DataPoints back into a DataFrame
        df = pd.concat([self.x_as_df, self.y_as_df, self.z_as_df], axis=1)
        # Apply the stored dtypes to the DataFrame
        df = self._update_df_dtypes(df)
        return df

    def _update_df_dtypes(self, df) -> pd.DataFrame:
        """
        Updates the data types of a DataFrame's columns based on the stored dtype mapping.

        This method takes a DataFrame and applies the column data types specified in the
        'dtypes' attribute of the class to the corresponding columns in the DataFrame. Only
        the columns that exist in both the DataFrame and the dtype mapping will have their
        data types updated.

        Args:
            df (pd.DataFrame): The DataFrame whose column data types are to be updated.

        Returns:
            pd.DataFrame: The DataFrame with the stored dtypes applied.
        """
        # Filter the dtype dictionary to only use keys that exist in the DataFrame
        filtered_dtypes = {col: dtype for col, dtype in self.dtypes.items() if col in df.columns}
        # Apply the dtypes to the DataFrame using astype
        df = df.astype(filtered_dtypes)
        return df

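    # Illustrative access patterns (a hedged sketch, not executed code): given a
    # populated DataSet `ds`, the properties above expose the same data in
    # several shapes, e.g.
    #
    #     >>> ds.x            # features as a NumPy array
    #     >>> ds.y_as_df      # targets as a DataFrame with the stored dtypes
    #     >>> ds.as_df        # features, targets and auxiliaries combined
    #
    # x_as_tensor / y_as_tensor additionally require PyTorch to be installed.
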
    @classmethod
    def load(cls, filepath: str) -> 'DataSet':
        """
        Loads a dataset object.

        This function loads a DataSet object from a file in `.joblib` format while
        reconstructing necessary components such as `DataPoint` and `DataSetMeta` objects.
        Assumes the file contains serialized elements suitable for creating a DataSet.

        Args:
            filepath (str): The path to the `.joblib` file from which the DataSet will be
                loaded.

        Returns:
            DataSet: A fully reconstructed DataSet instance based on the data provided in
                the file.

        Raises:
            ValueError: If the provided file does not have the `.joblib` extension.
        """
        if not get_file_extension(filepath) == '.joblib':
            raise ValueError(f"File {filepath} should have extension '.joblib'")
        load_dict = load_file(filepath)

        # Reconstruct DataPoint objects from the loaded dictionary
        datapoints = [
            DataPoint(x=np.array(dp['x'], dtype=object),
                      y=np.array(dp['y'], dtype=object) if dp['y'] is not None else None,
                      z=np.array(dp['z'], dtype=object) if dp['z'] is not None else None)
            for dp in load_dict['datapoints']
        ]

        # Reconstruct the DataSetMeta object from the loaded dictionary
        meta = DataSetMeta(**load_dict['meta'])

        # Use all keys except 'datapoints' and 'meta' to create the DataSet instance
        dataset_args = {key: value for key, value in load_dict.items() if key not in ['datapoints', 'meta']}
        dataset_args['datapoints'] = datapoints
        dataset_args['meta'] = meta

        return cls(**dataset_args)

    @classmethod
    def from_dataframe(cls, df: pd.DataFrame, meta: DataSetMeta, check_df: bool = True) -> 'DataSet':
        """
        Create a new DataSet instance from a given pandas DataFrame and metadata.

        This method constructs a DataSet object from a DataFrame by extracting features and
        target values based on the provided metadata. It also ensures that the DataFrame
        aligns with the metadata specifications and performs a check if enabled.
        Additionally, the method captures data types of the specified feature and target
        columns for later retransformation from the DataSet to a DataFrame.

        Args:
            df (pd.DataFrame): The input DataFrame containing data structured according to
                the provided metadata.
            meta (DataSetMeta): Metadata object that specifies feature and target column
                names, among other dataset properties.
            check_df (bool, optional): A flag that determines whether to validate the
                DataFrame against the metadata. Default is True.

        Returns:
            DataSet: A new DataSet instance populated with DataPoint objects derived from
                the input DataFrame.

        Examples:
            >>> import pandas as pd
            >>> from dwrappr import DataSet, DataSetMeta
            >>> file_path_meta = r"dwrappr/examples/data/example_dataset_meta.json"
            >>> meta = DataSetMeta.load(file_path_meta)
            >>> file_path_data = r"dwrappr/examples/data/example_data.csv"
            >>> df = pd.read_csv(file_path_data)
            >>> ds = DataSet.from_dataframe(df=df, meta=meta)
            >>> ds
            DataSet(datapoints=[DataPoint(x=array([12]), y=array([0])), DataPoint(x=array([7]), y=array([1])), DataPoint(x=array([15]), y=array([0])), DataPoint(x=array([9]), y=array([1]))], dtypes={'feature': dtype('int64'), 'target': dtype('int64')}, meta=DataSetMeta(name='example_data', time_series='False', synthetic_data='True', feature_names=['feature'], target_names=['target'], origin=None, year=None, url=None, sector=None, target_type=None, description=None))

        todo (jacob): Add example without json (see the sketch below this method).
        """
        dataset = cls(meta=meta)
        if check_df:
            dataset._check_df(df)

        # Save dtypes for the specified feature and target columns for retransformation
        # from DataSet to DataFrame later.
        # Combine feature columns with target columns (if provided) and auxiliary columns (if provided).
        columns = (meta.feature_names
                   + (meta.target_names if meta.target_names else [])
                   + (meta.auxiliary_names if meta.auxiliary_names else []))

        # Create the dtypes dictionary using the combined columns list
        dataset.dtypes = {col: df.dtypes[col] for col in columns}

        # Add DataPoints from DataFrame
        for _, row in df.iterrows():
            # Extract features and targets from the row
            x = row[meta.feature_names].values
            y = row[meta.target_names].values if meta.target_names else None
            z = row[meta.auxiliary_names].values if meta.auxiliary_names else None

            # Create a DataPoint and add it to the dataset (keep y/z as None when absent)
            datapoint = DataPoint(x=np.array(x),
                                  y=np.array(y) if y is not None else None,
                                  z=np.array(z) if z is not None else None)
            dataset.datapoints.append(datapoint)

        return dataset

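    # Illustrative sketch without a JSON metadata file (cf. the todo above); the
    # DataSetMeta can also be built inline before calling from_dataframe. The
    # column names below are made up for the sketch.
    #
    #     >>> df = pd.DataFrame({"feature": [12, 7], "target": [0, 1]})
    #     >>> meta = DataSetMeta(name="inline_demo",
    #     ...                    feature_names=["feature"],
    #     ...                    target_names=["target"])
    #     >>> ds = DataSet.from_dataframe(df=df, meta=meta)
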
    @classmethod
    def from_list(cls,
                  features: list,
                  meta: DataSetMeta,
                  targets: list = None,
                  ) -> 'DataSet':
        """
        Creates a DataSet object from given lists of features and targets along with a
        DataSetMeta instance.

        Args:
            features (list): A list containing the feature data, where each sub-list
                represents a row of feature values.
            meta (DataSetMeta): The metadata associated with the dataset, including feature
                and target names.
            targets (list, optional): A list containing the target data, where each sub-list
                represents a row of target values. Defaults to None.

        Returns:
            DataSet: Returns an instance of the DataSet object created from the provided
                features, targets, and metadata.
        """
        # Create a DataFrame from features and targets
        feature_df = pd.DataFrame(features, columns=meta.feature_names)
        df = feature_df
        if check_any(targets):
            target_df = pd.DataFrame(targets, columns=meta.target_names)
            # Concatenate the feature and target DataFrames
            df = pd.concat([feature_df, target_df], axis=1)

        return cls.from_dataframe(
            df=df,
            meta=meta,
        )

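    # Illustrative sketch for from_list (hedged; names and values are made up):
    #
    #     >>> meta = DataSetMeta(name="list_demo",
    #     ...                    feature_names=["a", "b"],
    #     ...                    target_names=["y"])
    #     >>> ds = DataSet.from_list(features=[[1, 2], [3, 4]],
    #     ...                        targets=[[0], [1]],
    #     ...                        meta=meta)
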
    @staticmethod
    def get_available_datasets_in_folder(path: str) -> pd.DataFrame:
        """
        Gets the available datasets from a specified folder and combines their metadata into
        a single DataFrame.

        Scans the folder to identify dataset metadata, retrieves the metadata objects, and
        concatenates their DataFrame representations into one DataFrame.

        Args:
            path (str): The file path to the folder containing datasets.

        Returns:
            pd.DataFrame: A DataFrame containing the combined metadata of all datasets found
                in the folder.
        """
        datasets = DataSetMeta.scan_for_meta(path)
        dataframes = [dataset.as_df for dataset in datasets]
        df = pd.concat(dataframes, axis=0, ignore_index=True)
        return df

    # Method to validate input DataFrame
    def _check_df(self, df: pd.DataFrame) -> None:
        """
        Validates the structure and content of a given pandas DataFrame against pre-defined
        feature and target column requirements.

        This method checks for the existence of required feature and target names in the
        DataFrame's columns, as well as ensuring the DataFrame does not contain any NaN
        values. If any issue is found, an appropriate error is logged and raised.

        Args:
            df (pd.DataFrame): The pandas DataFrame that needs to be validated.

        Raises:
            ValueError: Raised if one or more required features are missing from the DataFrame.
            ValueError: Raised if one or more required targets are missing from the DataFrame.
            ValueError: Raised if the DataFrame contains NaN values.
        """
        # Check if all the feature columns exist in the dataframe, if not raise an error
        if not set(self.feature_names).issubset(df.columns):
            missing_features = set(self.meta.feature_names) - set(df.columns)
            if missing_features:
                logger.error(
                    f"The following feature/s are missing in the dataframe: {', '.join(missing_features)}")
                raise ValueError("Feature/s missing in the dataset")

        # Check if all the target columns exist in the dataframe, if not raise an error
        if self.target_names:
            if not set(self.target_names).issubset(df.columns):
                missing_targets = set(self.target_names) - set(df.columns)
                if missing_targets:
                    logger.error(
                        f"The following target/s are missing in the dataframe: {', '.join(missing_targets)}")
                    raise ValueError("Target/s missing in the dataset")

        if df.isnull().values.any():
            # Raise error if dataframe contains NaN values
            logger.error("The dataset contains NaN values")
            raise ValueError("The dataset contains NaN values")

        logger.info("Data checked successfully.")

    def save(self, filepath: str, drop_meta_json: bool = True) -> None:
        """
        Saves the current object state to a specified file path, optionally writing the
        metadata to a separate JSON file as well. Ensures the file has the correct extension
        before saving.

        Args:
            filepath (str): The file path to save the object to. Must end with '.joblib'.
            drop_meta_json (bool): Whether to additionally save the metadata to a
                '_meta.json' file with the same base name as the dataset. Defaults to True.

        Raises:
            ValueError: If the provided file path does not have a '.joblib' extension.

        Returns:
            None
        """
        if not get_file_extension(filepath) == '.joblib':
            raise ValueError(f"File {filepath} should have extension '.joblib'")
        save_file(asdict(self), filepath)

        if drop_meta_json:
            self.meta.save(f"{del_file_extension(filepath)}_meta.json")

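    # Illustrative sketch of saving (hedged; the file name is made up): with the
    # default drop_meta_json=True a single call writes two files with the same
    # base name, e.g. 'demo.joblib' plus 'demo_meta.json', which is the pairing
    # that DataSetMeta.scan_for_meta looks for.
    #
    #     >>> ds.save("demo.joblib")
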
    def split_dataset(
            self,
            first_ds_size: float,
            shuffle: bool = True,
            random_state: int = 42,
            group_by_features: Optional[List[str]] = None
    ) -> Tuple['DataSet', 'DataSet']:
        """
        Splits the dataset into two subsets based on a specified ratio. The split can
        optionally group data points by specific feature values to ensure grouped subsets
        stay intact.

        Args:
            first_ds_size (float): Proportion of the dataset to assign to the first subset.
                Should be a value between 0 and 1.
            shuffle (bool, optional): Whether to shuffle the dataset or groups before
                splitting. Defaults to True.
            random_state (int, optional): Random seed for reproducibility of shuffling.
                Defaults to 42.
            group_by_features (List[str], optional): List of feature names to group data
                points by before splitting. If None, no grouping is applied. Defaults to None.

        Returns:
            Tuple['DataSet', 'DataSet']: A tuple containing the two resulting datasets after
                the split.

        Example:
            >>> ds.split_dataset(0.5)
            (DataSet(datapoints=[DataPoint(x=array([12]), y=array([0])), DataPoint(x=array([9]), y=array([1]))], dtypes={'feature': dtype('int64'), 'target': dtype('int64')}, meta=DataSetMeta(name='example_data', time_series='False', synthetic_data='True', feature_names=['feature'], target_names=['target'], origin=None, year=None, url=None, sector=None, target_type=None, description=None)), DataSet(datapoints=[DataPoint(x=array([7]), y=array([1])), DataPoint(x=array([15]), y=array([0]))], dtypes={'feature': dtype('int64'), 'target': dtype('int64')}, meta=DataSetMeta(name='example_data', time_series='False', synthetic_data='True', feature_names=['feature'], target_names=['target'], origin=None, year=None, url=None, sector=None, target_type=None, description=None)))
        """
        random.seed(random_state)

        if group_by_features:
            # Create a mapping from feature values to data points
            grouped_datapoints = {}
            for datapoint in self.datapoints:
                # Create a key based on the feature values
                key = tuple(datapoint.x[self.feature_names.index(f)] for f in group_by_features)
                if key not in grouped_datapoints:
                    grouped_datapoints[key] = []
                grouped_datapoints[key].append(datapoint)

            # Convert dictionary to a list of groups
            groups = list(grouped_datapoints.values())

            # Shuffle the groups if shuffle is True
            if shuffle:
                random.shuffle(groups)

            # Calculate the number of groups to include in the first dataset
            num_groups = len(groups)
            split_index = max(1, min(num_groups - 1, int(num_groups * first_ds_size)))

            # Flatten the groups for each dataset
            ds1_datapoints = [dp for group in groups[:split_index] for dp in group]
            random.shuffle(ds1_datapoints)
            ds2_datapoints = [dp for group in groups[split_index:] for dp in group]
            random.shuffle(ds2_datapoints)

        else:
            # If no grouping, simply shuffle and split
            all_datapoints = self.datapoints
            if shuffle:
                all_datapoints = random.sample(all_datapoints, len(all_datapoints))

            split_index = max(1, min(len(all_datapoints) - 1, int(len(all_datapoints) * first_ds_size)))
            ds1_datapoints = all_datapoints[:split_index]
            ds2_datapoints = all_datapoints[split_index:]

        ds1 = DataSet(
            meta=self.meta,
            dtypes=self.dtypes,
            datapoints=ds1_datapoints
        )
        ds2 = DataSet(
            meta=self.meta,
            dtypes=self.dtypes,
            datapoints=ds2_datapoints
        )

        return ds1, ds2

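
# --- Illustrative usage sketch (not part of the dwrappr API) ----------------
# A hedged, self-contained sketch that builds a DataSet from an in-memory
# DataFrame and splits it so that rows sharing the same 'machine' value never
# end up in different subsets. All column names and values are made up.
def _example_grouped_split() -> None:
    df = pd.DataFrame({
        "machine": ["A", "A", "B", "B", "C", "C"],
        "sensor": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
        "target": [0, 1, 0, 1, 0, 1],
    })
    meta = DataSetMeta(
        name="demo_split",
        feature_names=["machine", "sensor"],
        target_names=["target"],
    )
    ds = DataSet.from_dataframe(df=df, meta=meta)
    train, test = ds.split_dataset(
        first_ds_size=0.5,
        shuffle=True,
        random_state=42,
        group_by_features=["machine"],  # keep each machine's rows together
    )
    # With three groups and first_ds_size=0.5 the split index is 1, so the
    # first subset holds one whole group (2 datapoints) and the second the
    # remaining two groups (4 datapoints).
    print(len(train), len(test))
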
if __name__ == '__main__':
    pass