Source code for dwrappr.utils

# separated due to circular imports

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
from typing import Union

import logging

logger = logging.getLogger(__name__)


def shuffle_split_dataframe(df: pd.DataFrame,
                            train_size: float,
                            group: str = None,
                            rnd_state: int = 42
                            ) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Shuffles and splits a pandas DataFrame into training and testing sets.

    This function splits a DataFrame into two subsets, optionally grouping by a
    specified column. If a group parameter is provided, rows belonging to the
    same group are kept together, so that no group spans both subsets. Otherwise,
    it performs a straightforward random split. The split sizes are determined by
    the train_size parameter, and the rnd_state parameter makes the shuffling and
    splitting reproducible.

    Args:
        df (pd.DataFrame): The input pandas DataFrame to be shuffled and split.
        train_size (float): The fraction of the dataset to include in the training
            set. Must be a float between 0.0 and 1.0.
        group (str): Optional. The column name in the DataFrame to be used for
            grouping. If specified, rows belonging to the same group will remain
            together in either the training or the testing set. Defaults to None.
        rnd_state (int): A random seed for reproducibility of the shuffling and
            splitting process. Defaults to 42.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing two pandas DataFrames,
        where the first DataFrame is the training set and the second is the testing set.

    Example:
        >>> from dwrappr.utils import shuffle_split_dataframe
        >>> df = pd.DataFrame({
        ...     'feature': [1, 2, 3, 4, 5, 6],
        ...     'target': [0, 1, 0, 1, 0, 1],
        ...     'group_id': ['A', 'A', 'B', 'B', 'C', 'C']
        ... })
        >>> train_df, test_df = shuffle_split_dataframe(df, train_size=0.5)
        >>> train_df
        2025-05-18 16:33:17 [INFO] df1_size: 3, df2_size: 3
           feature  target group_id
        0        3       0        B
        1        5       0        C
        2        4       1        B

        todo (jacob): add test_df and group parameter
    """
    if not group:
        df1, df2 = train_test_split(df, train_size=train_size, random_state=rnd_state)
    else:
        # Define GroupShuffleSplit
        gss = GroupShuffleSplit(n_splits=1, train_size=train_size, random_state=rnd_state)
        # Split the DataFrame into train and test sets based on 'group'
        for df1_idx, df2_idx in gss.split(df, groups=df[group]):
            df1 = df.iloc[df1_idx]
            df2 = df.iloc[df2_idx]

    df1.reset_index(drop=True, inplace=True)
    df2.reset_index(drop=True, inplace=True)

    logger.info(f"df1_size: {len(df1)}, df2_size: {len(df2)}")
    return df1, df2

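The docstring above leaves the grouped case as a todo. The following is a minimal sketch of how a group-based split could be invoked, using the same toy DataFrame as in the docstring; the exact rows that land in each subset depend on the random state, so only the group-disjointness property is asserted.

# Sketch (not part of the module): group-based splitting via the 'group_id' column.
import pandas as pd
from dwrappr.utils import shuffle_split_dataframe

df = pd.DataFrame({
    'feature': [1, 2, 3, 4, 5, 6],
    'target': [0, 1, 0, 1, 0, 1],
    'group_id': ['A', 'A', 'B', 'B', 'C', 'C']
})

train_df, test_df = shuffle_split_dataframe(df, train_size=0.5, group='group_id')

# Rows sharing a group_id never span both subsets.
assert set(train_df['group_id']).isdisjoint(test_df['group_id'])
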
def deep_update(original, updates) -> None:
    """
    Recursively updates a dictionary with values from another dictionary.

    This function merges two dictionaries, with the second dictionary's values
    overwriting or updating the first dictionary's values. If both dictionaries
    contain nested dictionaries for the same key, the function performs a deep
    update by recursing into the nested dictionaries.

    Args:
        original (dict): The dictionary to be updated.
        updates (dict): The dictionary containing updates to be applied to the
            original dictionary.

    Example:
        >>> from dwrappr.utils import deep_update
        >>> original = {'a': 1, 'b': {'x': 10, 'y': 20}}
        >>> updates = {'b': {'y': 99, 'z': 42}, 'c': 3}
        >>> deep_update(original, updates)
        >>> original
        {'a': 1, 'b': {'x': 10, 'y': 99, 'z': 42}, 'c': 3}
    """
    for key, value in updates.items():
        if isinstance(value, dict) and key in original:
            # If the value is a dictionary and the key exists in the original, recurse
            deep_update(original[key], value)
        else:
            # Otherwise, update the original dictionary with the new value
            original[key] = value

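A small supplementary example of the update semantics described above: only nested dictionaries are merged, while non-dict values (including lists) are replaced wholesale, and keys that exist only in updates are added. The config dictionary here is purely illustrative.

# Illustrative example: lists and scalars are overwritten, nested dicts are merged.
from dwrappr.utils import deep_update

config = {'model': {'layers': [64, 64], 'dropout': 0.1}, 'epochs': 10}
deep_update(config, {'model': {'layers': [128]}, 'epochs': 20})
print(config)
# {'model': {'layers': [128], 'dropout': 0.1}, 'epochs': 20}
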
def convert_to_native_types(d):
    """
    Converts NumPy data types in a nested dictionary to native Python data types.

    This function iterates through a nested dictionary and converts any detected
    NumPy data types (e.g., np.float64, np.int64, np.bool_) to their respective
    native Python types (e.g., float, int, bool). For nested dictionaries, the
    function applies itself recursively.

    Args:
        d (dict): The input dictionary, which may contain nested dictionaries and
            values of NumPy data types.

    Returns:
        None: The function modifies the input dictionary in place, replacing NumPy
        data types with native Python equivalents.

    Examples:
        >>> import numpy as np
        >>> from dwrappr.utils import convert_to_native_types
        >>> data = {
        ...     'a': np.float64(1.5),
        ...     'b': {'x': np.int64(3), 'y': np.bool_(True)}
        ... }
        >>> convert_to_native_types(data)
        >>> data
        {'a': 1.5, 'b': {'x': 3, 'y': True}}
    """
    for key, value in d.items():
        if isinstance(value, dict):
            convert_to_native_types(value)
        elif isinstance(value, np.float64):
            d[key] = float(value)
        elif isinstance(value, np.int64):
            d[key] = int(value)
        elif isinstance(value, np.bool_):
            d[key] = bool(value)

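One plausible use of this helper, assumed here rather than documented in the module, is preparing NumPy-typed results for JSON serialization: the standard json encoder rejects NumPy integer and boolean scalars, so converting the dictionary in place first avoids a TypeError.

# Hypothetical usage sketch: make a NumPy-typed metrics dict JSON-serializable.
import json
import numpy as np
from dwrappr.utils import convert_to_native_types

metrics = {
    'mse': np.float64(0.25),
    'n_samples': np.int64(100),
    'flags': {'converged': np.bool_(True)},
}
convert_to_native_types(metrics)  # in-place replacement of NumPy scalars
print(json.dumps(metrics))        # {"mse": 0.25, "n_samples": 100, "flags": {"converged": true}}
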
def df_row_to_nested_dict(row) -> dict:
    """
    Converts a row of a DataFrame into a nested dictionary.

    The function processes a row from a DataFrame whose keys are paths separated
    by slashes ('/'), indicating a hierarchical structure. It builds a nested
    dictionary from these keys, creating dictionaries for the intermediate path
    segments. The values in the resulting dictionary are converted to their
    native Python types.

    Args:
        row (Mapping[str, Any]): A row from a DataFrame, i.e. a map-like object
            where keys are string paths (e.g., 'a/b/c') and values can be of any type.

    Returns:
        dict: A nested dictionary in which keys are the hierarchical paths broken
        down at the slash ('/') delimiter.

    Examples:
        >>> import pandas as pd
        >>> from dwrappr.utils import df_row_to_nested_dict
        >>> row = pd.Series({
        ...     'a/b/c': 1,
        ...     'a/b/d': 2,
        ...     'x/y': 3
        ... })
        >>> df_row_to_nested_dict(row)
        {'a': {'b': {'c': 1, 'd': 2}}, 'x': {'y': 3}}
    """
    nested_dict = {}
    for col, value in row.items():
        keys = col.split('/')
        d = nested_dict
        for key in keys[:-1]:
            d = d.setdefault(key, {})
        d[keys[-1]] = value
    convert_to_native_types(nested_dict)
    return nested_dict

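Since the function operates on a single row, converting an entire DataFrame with slash-separated column names takes one call per row, for example via iterrows(). The DataFrame below is purely illustrative.

# Sketch (assumed usage): convert every row of a slash-keyed DataFrame.
import pandas as pd
from dwrappr.utils import df_row_to_nested_dict

flat = pd.DataFrame({
    'model/name': ['svm', 'rf'],
    'model/params/C': [1.0, 10.0],
    'score': [0.81, 0.78],
})
nested = [df_row_to_nested_dict(row) for _, row in flat.iterrows()]
print(nested[0])
# {'model': {'name': 'svm', 'params': {'C': 1.0}}, 'score': 0.81}
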
def check_any(x: Union[np.ndarray, list]) -> bool:
    """
    Returns True if any element in the input is True/nonzero.

    Accepts numpy arrays and Python lists only.

    Args:
        x (np.ndarray or list): Input array or list.

    Returns:
        bool: True if any element is True/nonzero, else False.

    Raises:
        TypeError: If input is not a numpy array or Python list.

    Examples:
        >>> import numpy as np
        >>> from dwrappr.utils import check_any
        >>> check_any([0, 0, 1])
        True
        >>> check_any(np.array([0, 0, 0]))
        False
        >>> check_any([])
        False
    """
    if isinstance(x, np.ndarray):
        # Cast to a plain bool so the annotated return type holds
        return bool(x.any())
    elif isinstance(x, list):
        return any(x)
    else:
        raise TypeError(
            f"Input must be a numpy.ndarray or a Python list, got {type(x)}."
        )


def ensure_list_of_lists(data: list[np.ndarray]) -> list[list]:
    """Converts a list of numpy arrays into a list of plain Python lists."""
    if isinstance(data, list) and all(isinstance(item, np.ndarray) for item in data):
        # Convert each ndarray in the list to a normal list
        return [item.tolist() for item in data]
    else:
        raise NotImplementedError("Can only handle list of numpy arrays.")
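
A brief usage sketch of the helper above, assuming a list of NumPy arrays (any other input raises NotImplementedError):

# Example (assumed usage): convert a list of NumPy arrays to plain Python lists.
import numpy as np
from dwrappr.utils import ensure_list_of_lists

arrays = [np.array([1, 2, 3]), np.array([[4.0, 5.0]])]
print(ensure_list_of_lists(arrays))
# [[1, 2, 3], [[4.0, 5.0]]]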