# Source code for pandas_cat.preparation

import re
import warnings

import numpy as np
import pandas as pd
from cleverminer import cleverminer
from packaging.version import Version


def _is_continuous(series: pd.Series) -> bool:
    """Return True if *series* holds numeric non-categorical data."""
    if isinstance(series.dtype, pd.CategoricalDtype):
        return False
    return bool(pd.api.types.is_numeric_dtype(series))


def _to_float_codes(series: pd.Series) -> np.ndarray:
    """Encode a Series as float category codes; -1 (missing) converts to NaN."""
    codes = series.astype("category").cat.codes.to_numpy(dtype=float)
    codes[codes == -1] = np.nan
    return codes


def _automatic_data_conversions(
    df: pd.DataFrame, cat_limit: int = 20, verbose: bool = True
) -> pd.DataFrame:
    """Internal fallback for numeric-category ordering (no CleverMiner).

    Works on a copy of *df*. For every column that is not skipped:

    1. If all values parse as numbers, the column becomes an ordered
       categorical whose categories are the sorted numbers (nullable integers
       when every value is whole, floats otherwise).
    2. Otherwise the values are stringified and ordered by the first numeric
       token they contain; values without any digit follow, sorted
       case-insensitively.

    :param df: DataFrame to convert; it is never modified.
    :param cat_limit: Numeric, non-categorical columns with more unique values
        than this are left untouched (treated as continuous).
    :param verbose: Print start/finish progress messages.
    :returns: New DataFrame with the converted columns.
    """
    df = df.copy()
    if verbose:
        print("Automatically reordering numeric categories...")
    number_token = re.compile(r"\d+(?:\.\d+)?")
    for col in df.columns:
        # Plain numeric columns with more unique values than cat_limit are
        # continuous variables — converting them to categorical would prevent
        # histogram profiling and trigger cat_limit exclusion.
        if _is_continuous(df[col]) and df[col].nunique() > cat_limit:
            continue
        try:
            converted = df[col].astype(str).astype(float)
            uniq = [v for v in pd.unique(converted) if pd.notna(v)]
            if all(v % 1 == 0 for v in uniq):
                converted = converted.astype("Int64")  # nullable int preserves NA
                uniq = [int(v) for v in uniq]
            df[col] = converted.astype(
                pd.CategoricalDtype(categories=sorted(uniq), ordered=True)
            )
        except Exception:
            # Deliberate best-effort: anything that is not purely numeric goes
            # through the string-based ordering below.
            try:
                # Normalise to str first (NA stays NA) and derive the
                # categories from that very column. This merges 1 and '1',
                # and guarantees every non-null value has a category: values
                # differing only by case or padding ("Yes" / "yes") stay
                # distinct instead of being silently turned into NaN.
                text = df[col].where(df[col].isna(), df[col].astype(str))
                labels = list(dict.fromkeys(str(v) for v in text.dropna().unique()))

                # partition: values with a numeric token sort first, rest alphabetically
                numeric_keyed, non_numeric = [], []
                for label in labels:
                    stripped = label.strip()
                    match = number_token.search(stripped)
                    if match:
                        # Sort key holds only floats/ints/strings, so ties can
                        # never hit a mixed-type comparison.
                        numeric_keyed.append(
                            (
                                float(match.group()),
                                int(stripped[0].isdigit()),
                                stripped.lower(),
                                label,
                            )
                        )
                    else:
                        non_numeric.append(label)
                ordered_labels = [label for *_, label in sorted(numeric_keyed)]
                ordered_labels += sorted(non_numeric, key=lambda v: v.strip().lower())

                # Single assignment: on failure the column is left untouched.
                df[col] = text.astype(
                    pd.CategoricalDtype(categories=ordered_labels, ordered=True)
                )
            except Exception:
                pass  # column cannot be converted; leave as-is
    if verbose:
        print("Automatically reordering numeric categories...done")
    return df


[docs] def prepare( df: pd.DataFrame | None = None, opts: dict | None = None, auto_data_prep: str = "default", verbose: bool = True, ) -> pd.DataFrame: """Prepare a categorical dataset by converting numeric-like columns to ordered ``pandas.Categorical``. :param df: DataFrame to prepare. :param opts: Options forwarded to the underlying engine. :param auto_data_prep: ``'CLM'`` to use CleverMiner; any other value (default ``'default'``) uses the built-in conversion, which respects ``cat_limit`` and leaves high-cardinality numeric columns as continuous. :returns: New DataFrame with eligible columns as ordered ``CategoricalDtype``. """ if not isinstance(df, pd.DataFrame): raise TypeError(f"df must be a pandas DataFrame, got {type(df).__name__}") _pandas_cat_keys = { "auto_prepare", "cat_limit", "na_values", "na_ignore", "keep_default_na" } opts2 = {k: v for k, v in opts.items() if k not in _pandas_cat_keys} if opts else {} opts2["keep_df"] = True if auto_data_prep == "CLM": clm = cleverminer(df=df.copy(), opts=opts2) if Version(cleverminer.version_string) < Version("1.0.7"): return df return clm.df cat_limit = opts.get("cat_limit", 20) if opts else 20 return _automatic_data_conversions(df, cat_limit=cat_limit, verbose=verbose)
[docs] def handle_missing_values( df: pd.DataFrame, na_values: list | None = None, na_ignore: list | None = None, keep_default_na: bool = True, ): """Replace sentinel string values with ``pd.NA``. Returns a new DataFrame — the input is never modified. :param df: DataFrame to process. :param na_values: Additional strings to treat as missing. :param na_ignore: Built-in sentinel strings to exclude. :param keep_default_na: When ``False``, only ``na_values`` are used. :returns: ``(df, detected, counts)`` tuple. """ na_values = na_values or [] na_ignore = na_ignore or [] df = df.copy() default_missing_values = [ "-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A N/A", "#N/A", "N/A", "n/a", "NA", "na", "<NA>", "#NA", "NULL", "null", "Null", "NAN", "NaN", "-NaN", "nan", "-nan", "NONE", "None", "none", "UNKNOWN", "Unknown", "unknown", "UNKNOWN/INVALID", "Unknown/Invalid", "Unknown/invalid", "unknown/invalid", "INVALID", "Invalid", "invalid", "UNAVAILABLE", "Unavailable", "unavailable", "MISSING", "Missing", "missing", "UNSPECIFIED", "Unspecified", "unspecified", "IGNORE", "Ignore", "ignore", "NO INFO", "NO_INFO", "No Info", "No info", "no info", "no_info", "UNDETERMINED", "Undetermined", "undetermined", "NOT GIVEN", "UNDEFINED", "Undefined", "undefined", "NOT DEFINED", "Not Defined", "Not defined", "not_defined", "NOT_GIVEN", "Not Given", "Not given", "not given", "not_given", "UNSURE", "Unsure", "unsure", "I WOULD RATHER NOT SAY", "I would rather not say", "i would rather not say", "NO DEFINIDO", "No Definido", "No definido", "no definido", "no_definido", "NO COLOR", "No Color", "No color", "no color", "no_color", "NOT RATED", "NR", "Not Rated", "Not rated", "not rated", "not_rated", "nr", '""', "?", "–", "-", "", ] if na_ignore: default_missing_values = [v for v in default_missing_values if v not in na_ignore] missing_values = default_missing_values if keep_default_na else [] if na_values: missing_values = list(missing_values) + list(na_values) detected_missing_values: 
dict = {} replaced_counts: dict = {} for column in df.columns: # Coerce object columns with mixed int/str types to all-str so that # e.g. 1 (int) and '1' (str) don't appear as duplicate categories. if df[column].dtype == object and not hasattr(df[column], "cat"): types_present = {type(v) for v in df[column].dropna()} if len(types_present) > 1: df[column] = df[column].where(df[column].isna(), df[column].astype(str)) cats = df[column].value_counts() hits = cats[cats.index.isin(missing_values)] detected_missing_values[column] = hits.index.tolist() replaced_counts[column] = hits.values.tolist() na_already = df[column].isna().sum() if na_already > 0: detected_missing_values[column].insert(0, "pandas.NAN") replaced_counts[column].insert(0, int(na_already)) if hasattr(df[column], "cat"): to_remove = [v for v in missing_values if v in df[column].cat.categories] if to_remove: df[column] = df[column].cat.remove_categories(to_remove) else: df[column] = df[column].replace(missing_values, pd.NA) return df, detected_missing_values, replaced_counts