import re
import warnings
import numpy as np
import pandas as pd
from cleverminer import cleverminer
from packaging.version import Version
def _is_continuous(series: pd.Series) -> bool:
"""Return True if *series* holds numeric non-categorical data."""
if isinstance(series.dtype, pd.CategoricalDtype):
return False
return bool(pd.api.types.is_numeric_dtype(series))
def _to_float_codes(series: pd.Series) -> np.ndarray:
"""Encode a Series as float category codes; -1 (missing) converts to NaN."""
codes = series.astype("category").cat.codes.to_numpy(dtype=float)
codes[codes == -1] = np.nan
return codes
def _automatic_data_conversions(
    df: pd.DataFrame, cat_limit: int = 20, verbose: bool = True
) -> pd.DataFrame:
    """Internal fallback for numeric-category ordering (no CleverMiner).

    Works on a copy of *df*. Each column is handled independently:

    * Numeric, non-categorical columns with more than *cat_limit* distinct
      values are left untouched.
    * Columns whose values all parse as numbers become an ordered categorical
      sorted numerically (nullable integers when every value is whole).
    * Any other column becomes an ordered categorical of string labels:
      labels containing a number sort first by that number, the remaining
      labels sort alphabetically (case-insensitive).
    * A column for which both attempts fail is left as-is.

    :param df: DataFrame to convert; never modified.
    :param cat_limit: Distinct-value threshold above which a numeric column
        is skipped.
    :param verbose: Print progress messages when ``True``.
    :returns: New DataFrame with the converted columns.
    """
    df = df.copy()
    if verbose:
        print("Automatically reordering numeric categories...")
    for col in df.columns:
        # Plain numeric columns with more unique values than cat_limit are
        # continuous variables — converting them to categorical would prevent
        # histogram profiling and trigger cat_limit exclusion.
        if _is_continuous(df[col]) and df[col].nunique() > cat_limit:
            continue
        try:
            converted = df[col].astype(str).astype(float)
            uniq = [v for v in pd.unique(converted) if pd.notna(v)]
            if all(v % 1 == 0 for v in uniq):
                converted = converted.astype("Int64")  # nullable int preserves NA
                uniq = [int(v) for v in uniq]
            sorted_cats = sorted(uniq)
            df[col] = converted.astype(
                pd.CategoricalDtype(categories=sorted_cats, ordered=True)
            )
        except Exception:
            # Not purely numeric: fall back to ordered string labels.
            try:
                # De-duplicate on the exact string form. This merges 1 and
                # '1' (the column is normalised with astype(str) below) but
                # keeps labels that differ only by case or whitespace apart;
                # merging those would leave some values without a category
                # and turn them into NaN on the categorical cast.
                labels = list(
                    dict.fromkeys(str(v) for v in df[col].dropna().unique())
                )
                # partition: labels with a numeric token sort first, rest alphabetically
                numeric_pairs, non_numeric = [], []
                for label in labels:
                    s = label.strip()
                    res = re.findall(r"\d+(?:\.\d+)?", s)
                    if res:
                        # The label itself is the last tie-breaker, so the
                        # tuples are always comparable (all strings).
                        numeric_pairs.append(
                            (float(res[0]), int(s[0].isdigit()), s.lower(), label)
                        )
                    else:
                        non_numeric.append(label)
                sorted_list = (
                    [label for *_, label in sorted(numeric_pairs)]
                    + sorted(non_numeric, key=lambda v: v.strip().lower())
                )
                # normalise column to str so mixed-type values match string categories
                df[col] = df[col].where(df[col].isna(), df[col].astype(str))
                df[col] = df[col].astype(
                    pd.CategoricalDtype(categories=sorted_list, ordered=True)
                )
            except Exception:
                pass  # column cannot be converted; leave as-is
    if verbose:
        print("Automatically reordering numeric categories...done")
    return df
[docs]
def prepare(
df: pd.DataFrame | None = None,
opts: dict | None = None,
auto_data_prep: str = "default",
verbose: bool = True,
) -> pd.DataFrame:
"""Prepare a categorical dataset by converting numeric-like columns to
ordered ``pandas.Categorical``.
:param df: DataFrame to prepare.
:param opts: Options forwarded to the underlying engine.
:param auto_data_prep: ``'CLM'`` to use CleverMiner; any other value
(default ``'default'``) uses the built-in conversion, which respects
``cat_limit`` and leaves high-cardinality numeric columns as continuous.
:returns: New DataFrame with eligible columns as ordered
``CategoricalDtype``.
"""
if not isinstance(df, pd.DataFrame):
raise TypeError(f"df must be a pandas DataFrame, got {type(df).__name__}")
_pandas_cat_keys = {
"auto_prepare", "cat_limit", "na_values", "na_ignore", "keep_default_na"
}
opts2 = {k: v for k, v in opts.items() if k not in _pandas_cat_keys} if opts else {}
opts2["keep_df"] = True
if auto_data_prep == "CLM":
clm = cleverminer(df=df.copy(), opts=opts2)
if Version(cleverminer.version_string) < Version("1.0.7"):
return df
return clm.df
cat_limit = opts.get("cat_limit", 20) if opts else 20
return _automatic_data_conversions(df, cat_limit=cat_limit, verbose=verbose)
[docs]
def handle_missing_values(
df: pd.DataFrame,
na_values: list | None = None,
na_ignore: list | None = None,
keep_default_na: bool = True,
):
"""Replace sentinel string values with ``pd.NA``.
Returns a new DataFrame — the input is never modified.
:param df: DataFrame to process.
:param na_values: Additional strings to treat as missing.
:param na_ignore: Built-in sentinel strings to exclude.
:param keep_default_na: When ``False``, only ``na_values`` are used.
:returns: ``(df, detected, counts)`` tuple.
"""
na_values = na_values or []
na_ignore = na_ignore or []
df = df.copy()
default_missing_values = [
"-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A N/A", "#N/A",
"N/A", "n/a", "NA", "na", "<NA>", "#NA", "NULL", "null", "Null",
"NAN", "NaN", "-NaN", "nan", "-nan", "NONE", "None", "none",
"UNKNOWN", "Unknown", "unknown", "UNKNOWN/INVALID", "Unknown/Invalid",
"Unknown/invalid", "unknown/invalid", "INVALID", "Invalid", "invalid",
"UNAVAILABLE", "Unavailable", "unavailable", "MISSING", "Missing",
"missing", "UNSPECIFIED", "Unspecified", "unspecified", "IGNORE",
"Ignore", "ignore", "NO INFO", "NO_INFO", "No Info", "No info",
"no info", "no_info", "UNDETERMINED", "Undetermined", "undetermined",
"NOT GIVEN", "UNDEFINED", "Undefined", "undefined", "NOT DEFINED",
"Not Defined", "Not defined", "not_defined", "NOT_GIVEN", "Not Given",
"Not given", "not given", "not_given", "UNSURE", "Unsure", "unsure",
"I WOULD RATHER NOT SAY", "I would rather not say",
"i would rather not say", "NO DEFINIDO", "No Definido", "No definido",
"no definido", "no_definido", "NO COLOR", "No Color", "No color",
"no color", "no_color", "NOT RATED", "NR", "Not Rated", "Not rated",
"not rated", "not_rated", "nr", '""', "?", "–", "-", "",
]
if na_ignore:
default_missing_values = [v for v in default_missing_values if v not in na_ignore]
missing_values = default_missing_values if keep_default_na else []
if na_values:
missing_values = list(missing_values) + list(na_values)
detected_missing_values: dict = {}
replaced_counts: dict = {}
for column in df.columns:
# Coerce object columns with mixed int/str types to all-str so that
# e.g. 1 (int) and '1' (str) don't appear as duplicate categories.
if df[column].dtype == object and not hasattr(df[column], "cat"):
types_present = {type(v) for v in df[column].dropna()}
if len(types_present) > 1:
df[column] = df[column].where(df[column].isna(), df[column].astype(str))
cats = df[column].value_counts()
hits = cats[cats.index.isin(missing_values)]
detected_missing_values[column] = hits.index.tolist()
replaced_counts[column] = hits.values.tolist()
na_already = df[column].isna().sum()
if na_already > 0:
detected_missing_values[column].insert(0, "pandas.NAN")
replaced_counts[column].insert(0, int(na_already))
if hasattr(df[column], "cat"):
to_remove = [v for v in missing_values if v in df[column].cat.categories]
if to_remove:
df[column] = df[column].cat.remove_categories(to_remove)
else:
df[column] = df[column].replace(missing_values, pd.NA)
return df, detected_missing_values, replaced_counts