"""
Models' interface and creation pipeline.
"""
from __future__ import annotations
import logging
import operator as op
import re
import typing as t
from abc import ABCMeta, abstractmethod
from collections import abc
from itertools import chain
from statistics import mean
import numpy as np
import optuna
import pandas as pd
from eBoruta import eBoruta
from more_itertools import unique_everseen
from sklearn.metrics import f1_score, r2_score
from toolz import curry
from tqdm.auto import tqdm
from xgboost import XGBClassifier, XGBRegressor
from kinactive.config import ColNames, DFG_MAP_REV
# Captures the four word characters that directly follow an opening
# parenthesis and precede ``:<word>|`` -- used by `_parse_pdb_id` to pull
# a PDB ID out of an object ID string.
PDB_PATTERN = re.compile(r"\((\w{4}):\w+\|")
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
class ModelBase(metaclass=ABCMeta):
    """
    The abstract interface shared by all model objects of this module.

    A concrete subclass must provide the target names and implement model
    re-initialization, training, prediction, fold generation, scoring, and
    cross-validation.
    """

    @property
    @abstractmethod
    def targets(self) -> abc.Sequence[str]:
        """
        :return: Names of the target variables.
        """

    @abstractmethod
    def reinit_model(self):
        """
        Re-create the underlying model.
        """

    @abstractmethod
    def train(self, df: pd.DataFrame):
        """
        Fit the model using all observations of the supplied data.

        :param df: Data to train on.
        """

    @abstractmethod
    def predict(self, df: pd.DataFrame):
        """
        Produce predictions for the supplied data.

        :param df: Data to predict from.
        """

    @abstractmethod
    def cv(self, df: pd.DataFrame, n: int) -> float:
        """
        Run cross-validation.

        :param df: Data that is split into training and testing parts.
        :param n: How many CV folds to use.
        :return: The performance estimate aggregated over the testing folds.
        """

    @abstractmethod
    def generate_fold_idx(
        self, df: pd.DataFrame, n: int
    ) -> abc.Iterator[tuple[np.ndarray, np.ndarray]]:
        """
        Produce fold indices for the supplied data.

        :param df: DataFrame with predictors.
        :param n: How many folds to generate.
        :return: An iterator over ``(train, test)`` pairs of boolean indices
            selecting the train and test observations from `df`.
        """

    @abstractmethod
    def score(self, df: pd.DataFrame) -> float:
        """
        Evaluate the model.

        :class:`KinactiveClassifier` uses :func:`f1_score`.
        :class:`KinactiveRegressor` uses :func:`r2_score`.

        :param df: Data to predict from.
        :return: A single number estimating the model's performance (the
            higher, the better).
        """
class ObjectiveFn(t.Protocol):
    """
    The structural type of an objective function: a callable accepting an
    optuna trial, a dataset, a model, and the number of CV folds, and
    returning a float score.
    """

    def __call__(
        self,
        trial: optuna.Trial,
        df: pd.DataFrame,
        model: ModelBase,
        n_cv: int,
    ) -> float:
        ...
class ModelT(t.Protocol):
    """
    The minimal interface a wrapped estimator is expected to expose:
    ``fit``, ``predict``, and ``predict_proba``.
    """

    def fit(
        self,
        x: pd.DataFrame | np.ndarray,
        y: np.ndarray | pd.Series,
        **kwargs,
    ) -> ModelT:
        """
        Fit the model to predictors ``x`` and response ``y``.
        """

    def predict(self, x: pd.DataFrame | np.ndarray, **kwargs) -> np.ndarray:
        """Return predictions for ``x``."""

    def predict_proba(self, x: pd.DataFrame | np.ndarray, **kwargs) -> np.ndarray:
        """Return predicted class probabilities for ``x``."""
def xgb_objective(
    trial: optuna.Trial,
    df: pd.DataFrame,
    model: KinactiveModel,
    n_cv: int = 5,
    use_early_stopping: bool = False,
) -> float:
    """
    A default objective function for XGB models. It uses the following setup::

        learning_rate: [0, 1]
        max_depth: [4, 16]
        gamma: [0.0, 10.0]
        reg_lambda: [0.0, 10.0]
        reg_alpha: [0.0, 10.0]
        colsample_bytree: [0.4, 1.0]
        colsample_bylevel: [0.4, 1.0]

    Additionally, for the :class:`KinactiveClassifier` it adds::

        scale_pos_weight: [0.0, 10.0]

    and, if ``use_early_stopping`` is ``False``, it adds::

        n_estimators: [10, 1000]

    After the parameters are sampled, they are combined with the existing model
    parameters via ``{**model.params, **params}``. Then, a new wrapper of the
    same class is instantiated with the combined parameters and
    cross-validated using its ``cv`` method.

    :param trial: A trial instance used dynamically by optuna. Leave as is.
    :param df: A dataset used to fit and test the model.
    :param model: The model to optimize the params for.
    :param n_cv: The number of CV folds to derive the score.
    :param use_early_stopping: Passed to the ``model``. When ``True``, the
        number of trees is not sampled.
    :return: The cross-validated score.
    """
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0, 1),
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        "gamma": trial.suggest_float("gamma", 0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 10.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.4, 1.0),
    }
    if isinstance(model, KinactiveClassifier):
        params["scale_pos_weight"] = trial.suggest_float("scale_pos_weight", 0.0, 10.0)
    if not use_early_stopping:
        # The parameter must be registered under its own name: reusing
        # "max_depth" here clashes with the depth parameter sampled above, so
        # the number of trees would not get its own [10, 1000] search
        # dimension nor be reported among the study's best parameters.
        params["n_estimators"] = trial.suggest_int("n_estimators", 10, 1000)

    # Sampled values take precedence over the model's existing parameters.
    params = {**model.params, **params}
    model = model.__class__(
        model.model, model.targets, model.features, params, use_early_stopping
    )
    return model.cv(df, n_cv)
def lr_objective(
    trial: optuna.Trial,
    df: pd.DataFrame,
    model: KinactiveModel,
    n_cv: int = 5,
    use_early_stopping: bool = False,
) -> float:
    """
    A default objective function for the logistic regression model.

    It samples the following params::

        C: [0.0, 1.0]
        class_weight: [None, "balanced"]
        solver: ["newton-cg", "sag", "saga", "lbfgs"]
        multi_class: ["auto", "ovr", "multinomial"]

    For the "newton-cg", "sag", and "lbfgs" solvers, ``penalty`` is fixed to
    "l2". For any other solver (i.e., "saga"), ``penalty`` is sampled from
    "l1", "l2", and "elasticnet"; if "elasticnet" is drawn, ``l1_ratio`` is
    additionally sampled between zero and one.

    The options ``max_iter`` and ``n_jobs`` are hard-coded to 1000 and -1.

    After sampling, the values are merged over ``model.params``, a new
    wrapper of the same class is created, and it is cross-validated -- the
    same procedure as in :func:`xgb_objective`.

    :param trial: A trial instance used dynamically by optuna. Leave as is.
    :param df: A dataset used to fit and test the model.
    :param model: The model to optimize the params for.
    :param n_cv: The number of CV folds to derive the score.
    :param use_early_stopping: Passed to the ``model``.
    :return: The cross-validated score.
    """
    l2_only_solvers = ["newton-cg", "sag", "lbfgs"]

    # NB: the order of the ``suggest_*`` calls is kept fixed.
    reg_strength = trial.suggest_float("C", 0, 1)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    solver = trial.suggest_categorical(
        "solver", ["newton-cg", "sag", "saga", "lbfgs"]
    )
    multi_class = trial.suggest_categorical(
        "multi_class", ["auto", "ovr", "multinomial"]
    )
    sampled = {
        "C": reg_strength,
        "class_weight": class_weight,
        "solver": solver,
        "multi_class": multi_class,
        "max_iter": 1000,
        "n_jobs": -1,
    }

    if solver not in l2_only_solvers:
        sampled["penalty"] = trial.suggest_categorical(
            "penalty", ["l1", "l2", "elasticnet"]
        )
    else:
        sampled["penalty"] = "l2"

    if sampled["penalty"] == "elasticnet":
        sampled["l1_ratio"] = trial.suggest_float("l1_ratio", 0, 1)

    # Sampled values override the model's existing parameters.
    merged = {**model.params, **sampled}
    candidate = model.__class__(
        model.model, model.targets, model.features, merged, use_early_stopping
    )
    return candidate.cv(df, n_cv)
class KinactiveModel(ModelBase, metaclass=ABCMeta):
    """
    An interface wrapper around the ML algorithm.

    Its methods operate on a ``DataFrame``, applying stored :meth:`features`
    and :meth:`targets` to obtain necessary variables.

    .. seealso::
        :func:`make` -- a model's creation pipeline.
    """

    def __init__(
        self,
        model: ModelT,
        targets: abc.Iterable[str],
        features: abc.Iterable[str] = (),
        params: dict[str, t.Any] | None = None,
        use_early_stopping: bool = False,
        selector: eBoruta | None = None,
    ):
        """
        :param model: A model defining ``fit`` and ``predict`` methods.
            :meth:`select_params` assumes it to be either XGBoost or
            a logistic regression classifier.
        :param targets: Target variables' names.
        :param features: Feature variables' names.
        :param params: Initial parameters for the model.
        :param use_early_stopping: If ``True``, and the model is either
            ``XGBClassifier`` or ``XGBRegressor``, the :meth:`train` will
            split the provided dataset into training and evaluation parts
            and use the evaluation part to monitor the loss function and stop
            adding new trees (thus, finish training), if the loss didn't improve
            for a number of consecutive steps. The number of early stopping
            rounds should be provided in ``params``.
        :param selector: The feature selector to use in
            :meth:`select_features` and :meth:`rank_features`.
        """
        if not isinstance(features, list):
            features = list(features)
        if not isinstance(targets, list):
            targets = list(targets)
        self._features = features
        self._targets = targets
        self._model = model
        #: Model's parameters.
        self.params = params or {}
        #: Use early stopping via eval set.
        self.use_early_stopping = use_early_stopping
        #: eBoruta instance
        self.selector = selector

    @property
    def model(self):
        """
        :return: Current model instance.
        """
        return self._model

    @property
    def features(self) -> list[str]:
        """
        :return: A list of features used to train the model.
        """
        return self._features

    @property
    def targets(self) -> list[str]:
        """
        :return: A list of target variables' names.
        """
        return self._targets

    def reinit_model(self):
        """
        Replace the wrapped model with a fresh instance created from
        :attr:`params`. Works whether the stored model is a class or an
        instance.
        """
        if isinstance(self.model, type):
            self._model = self._model(**self.params)
        else:
            self._model = self._model.__class__(**self.params)

    def train(self, df: pd.DataFrame):
        """
        Fit the wrapped model on ``df``.

        With early stopping enabled and an XGBoost model, the first
        train/eval split produced by ``generate_fold_idx(df, 10)`` is used:
        the model is fit on the train part and monitors the eval part.
        Otherwise, the model is fit on all rows of ``df``.

        :param df: Data with the feature and target columns.
        """
        if self.use_early_stopping and isinstance(
            self.model, (XGBClassifier, XGBRegressor)
        ):
            train_idx, eval_idx = next(self.generate_fold_idx(df, 10))
            train_df, train_ys = _get_xy(df[train_idx], self.features, self.targets)
            eval_df, eval_ys = _get_xy(df[eval_idx], self.features, self.targets)
            self._model.fit(
                train_df,
                np.squeeze(train_ys.values),
                eval_set=[(eval_df, np.squeeze(eval_ys.values))],
                verbose=0,
            )
        else:
            assert (
                "early_stopping_rounds" not in self.params
            ), "Must not have early stopping params if `use_early_stopping` is `False`"
            xs, ys = _get_xy(df, self.features, self.targets)
            assert ys is not None, f"failed finding target variables {self.targets}"
            if isinstance(self.model, (XGBClassifier, XGBRegressor)):
                self._model.fit(xs.values, np.squeeze(ys.values), verbose=0)
            else:
                self._model.fit(xs.values, np.squeeze(ys.values))

    def predict(self, df: pd.DataFrame) -> np.ndarray:
        """
        Predict from the :meth:`features` columns of ``df``.

        :param df: Data with the feature columns.
        :return: The wrapped model's predictions.
        """
        df = _apply_selection(df, self.features)
        return self._model.predict(df.values)

    def cv(self, df: pd.DataFrame, n: int, verbose: bool = False) -> float:
        """
        Cross-validate the model.

        :param df: Input data with features and target columns.
        :param n: The number of CV folds to use.
        :param verbose: Output progress bar.
        :return: The mean score across the testing folds.
        """
        return _cross_validate(self, df, n, verbose)

    def cv_pred(
        self, df: pd.DataFrame, n: int, verbose: bool = False
    ) -> tuple[float, pd.DataFrame]:
        """
        Cross-validate the score and predict the data in test folds.

        :param df: Input data with features and target columns.
        :param n: The number of CV folds to use.
        :param verbose: Output progress bar.
        :return: A tuple with score and a copy of the supplied dataframe with
            fold assignment and model prediction columns added.
        """
        return _cross_validate_and_predict(self, df, n, verbose)

    def select_params(
        self,
        df: pd.DataFrame,
        n_trials: int,
        direction: str = "maximize",
        early_stopping_rounds: int = 0,
    ) -> optuna.Study:
        """
        Optimize hyperparameters and store the best ones in :attr:`params`.

        Uses :func:`xgb_objective` for XGBoost models and
        :func:`lr_objective` otherwise.

        :param df: Input data with features and target columns.
        :param n_trials: The number of optimization rounds.
        :param direction: "maximize" or "minimize" the objective.
        :param early_stopping_rounds: The number of early stopping rounds to use.
            Zero means no early stopping.
        :return: The ``Study`` instance from optuna.
        """
        objective_fn = (
            xgb_objective
            if isinstance(self.model, (XGBClassifier, XGBRegressor))
            else lr_objective
        )
        objective = curry(objective_fn)(
            df=df, model=self, use_early_stopping=self.use_early_stopping
        )
        study = optuna.create_study(direction=direction)
        cb = (
            [EarlyStoppingCallback(early_stopping_rounds, direction=direction)]
            if early_stopping_rounds
            else None
        )
        study.optimize(objective, n_trials=n_trials, callbacks=cb)
        # NOTE(review): this replaces the previous params wholesale. The
        # objectives evaluate ``{**self.params, **sampled}`` plus some
        # hard-coded values, so any parameter that was not sampled via
        # ``trial.suggest_*`` is presumably absent from ``best_params`` and
        # is dropped here -- confirm this is intended.
        self.params = study.best_params
        return study

    def select_features(self, df: pd.DataFrame, **kwargs) -> eBoruta:
        """
        Select important features and store the selection to :meth:`features`.

        :param df: A dataframe with features and targets.
        :param kwargs: Used to create the :attr:`selector` if it is not
            already present; ignored otherwise.
        :return: The ``selector.fit()`` output.
        """
        if self.selector is None:
            self.selector = eBoruta(**kwargs)
        df_x, df_y = _get_xy(df, self.features, self.targets)
        assert df_y is not None
        res = self.selector.fit(
            df_x, np.squeeze(df_y.values), model=self.model, verbose=0
        )
        self._features = list(res.features_.accepted)
        return res

    def rank_features(self, features: abc.Sequence[str] | None = None, **kwargs):
        """
        Rank features using ``selector.rank()``.

        :param features: A sequence of features. If not provided (or empty),
            will use :meth:`features`.
        :param kwargs: Passed to ``selector.rank()``.
        :return: A table with ranked features.
        :raises ValueError: If no selector is present.
        """
        if self.selector is None:
            raise ValueError(
                "No selector instance present. Call `select_features` first"
            )
        features = features or self.features
        return self.selector.rank(features, **kwargs)
class KinactiveClassifier(KinactiveModel):
    """
    A model wrapper for classification objective.
    """

    def generate_fold_idx(
        self, df: pd.DataFrame, n: int
    ) -> abc.Iterator[tuple[np.ndarray, np.ndarray]]:
        """
        Generate fold indices stratified by the target values, splitting on
        the unique values of the "ObjectID" column.

        :param df: Data with the "ObjectID" and target columns.
        :param n: The number of folds.
        :return: An iterator over ``(train, test)`` boolean index pairs.
        """
        return _generate_stratified_fold_idx(
            df["ObjectID"].values, np.squeeze(df[self.targets].values), n
        )

    def predict_proba(self, df: pd.DataFrame) -> np.ndarray:
        """
        Predict classes' probabilities.

        :param df: A tabular dataset with features and target columns.
        :return: The array of predicted probabilities. Its shape depends on the
            number of targets and classes.
        """
        df = _apply_selection(df, self.features)
        return self._model.predict_proba(df.values)

    def score(self, df: pd.DataFrame, **kwargs) -> float:
        """
        Predict and score using the ``f1_score()`` function. For multi-target
        or multiclass problems, the ``average`` is "micro" by default unless
        specified otherwise by kwargs.

        :param df: A tabular dataset with features and target columns.
        :param kwargs: Passed to the scoring function.
        :return: The resulting score.
        """
        y_pred = self.predict(df)
        y_true = np.squeeze(df[self.targets].values)
        # The "average" check must guard the whole multiclass test: `and`
        # binds tighter than `or`, so without the parentheses a
        # caller-supplied `average` was overwritten whenever one of the first
        # two conditions held.
        if "average" not in kwargs and (
            len(self.targets) > 1
            or len(np.bincount(y_true)) > 2
            or len(np.bincount(y_pred)) > 2
        ):
            kwargs["average"] = "micro"
        return f1_score(y_true, y_pred, **kwargs)
class KinactiveRegressor(KinactiveModel):
    """
    A wrapper for models with a regression objective.
    """

    def generate_fold_idx(
        self, df: pd.DataFrame, n: int
    ) -> abc.Iterator[tuple[np.ndarray, np.ndarray]]:
        """
        Generate fold indices, grouping rows by the value derived from their
        "ObjectID" via ``_get_unique_group``.

        :param df: Data with the "ObjectID" column.
        :param n: The number of folds.
        :return: An iterator over ``(train, test)`` boolean index pairs.
        """
        group_ids = df["ObjectID"].map(_get_unique_group).values
        return _generate_fold_idx(group_ids, n)

    def score(self, df: pd.DataFrame, **kwargs) -> float:
        """
        Predict and evaluate with the ``r2_score()`` function.

        :param df: A tabular dataset with features and target columns.
        :param kwargs: Passed to the scoring function.
        :return: The resulting score.
        """
        predicted = self.predict(df)
        observed = np.squeeze(df[self.targets].values)
        return r2_score(observed, predicted, **kwargs)
# A container for the four classifiers making up a `DFGClassifier`: three
# per-conformation models ("in_", "out", "other") followed by the "meta"
# model. The field order matters: `DFGClassifier` slices the first three
# entries to address the per-conformation models.
DFGModels = t.NamedTuple(
    "DFGModels",
    [
        ("in_", KinactiveClassifier),
        ("out", KinactiveClassifier),
        ("other", KinactiveClassifier),
        ("meta", KinactiveClassifier),
    ],
)
class DFGClassifier(ModelBase):
    """
    A composite model encapsulating three binary classifiers each predicting
    its own DFG conformation and a meta-classifier trained on the
    [in, out, other] probabilities they produce.

    Nevertheless, it behaves like a regular model providing interface similar to
    the :class:`KinactiveClassifier`.
    """

    def __init__(
        self,
        in_model: KinactiveClassifier,
        out_model: KinactiveClassifier,
        other_model: KinactiveClassifier,
        meta_model: KinactiveClassifier,
    ):
        """
        :param in_model: A binary classifier for the "in" conformation.
        :param out_model: A binary classifier for the "out" conformation.
        :param other_model: A binary classifier for the "other" conformation.
        :param meta_model: A classifier trained on the probabilities
            predicted by the three binary models.
        """
        self.models: DFGModels = DFGModels(in_model, out_model, other_model, meta_model)

    @property
    def targets(
        self,
    ) -> list[str]:
        """
        :return: Target names of the "meta" model.
        """
        return self.models.meta.targets

    @property
    def dfg_features(self) -> list[str]:
        """
        :return: A list of unique features used by the binary "in", "out",
            and "other" models, in order of first appearance.
        """
        return list(
            unique_everseen(chain.from_iterable(m.features for m in self.models[:3]))
        )

    @property
    def meta_features(self) -> list[str]:
        """
        :return: A list of features used by the "meta" classifier.
        """
        return self.models.meta.features

    @property
    def proba_names(self) -> list[str]:
        """
        :return: A list of column names of [in, out, other] probabilities.
        """
        return [ColNames.dfg_in_proba, ColNames.dfg_out_proba, ColNames.dfg_other_proba]

    def train(self, df: pd.DataFrame):
        """
        1. Train the three binary :attr:`models`.
        2. Use them to predict the positive-class probabilities.
        3. Use predicted probabilities to train the `meta` model.

        :param df: A dataset to train on. Must include all relevant variables.
            It is not modified: probabilities are written to a copy.
        """
        for m in self.models[:3]:
            m.train(df)
        df = df.copy()
        for n, m in zip(self.proba_names, self.models[:3]):
            df[n] = m.predict_proba(df)[:, 1]
        self.models.meta.train(df)

    def predict_full(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Predict all response variables.

        :param df: A dataset to predict on. Must include all relevant
            variables.
        :return: A copy of the ``df`` with predictions: the probabilities of
            the binary models, the meta-model probabilities rounded to two
            decimals, the predicted class index, and its mapped label.
        """
        df = df.copy()
        for n, m in zip(self.proba_names, self.models[:3]):
            df[n] = m.predict_proba(df)[:, 1]
        y_prob = self.models.meta.predict_proba(df)
        # Rounding is for reporting only.
        for i, c in enumerate(ColNames.dfg_meta_proba_cols):
            df[c] = y_prob[:, i].round(2)
        # Pick the class from the unrounded probabilities: after rounding,
        # near-equal values may tie, and `argmax` would then return the
        # lowest index rather than the most probable class.
        df[ColNames.dfg_cls_pred] = np.argmax(y_prob, axis=1)
        df[ColNames.dfg_pred] = df[ColNames.dfg_cls_pred].map(DFG_MAP_REV)
        return df

    def predict(self, df: pd.DataFrame) -> np.ndarray:
        """
        Predict the DFG class. ``0`` stands for DFGin, ``1`` for DFGout, and
        ``2`` for DFGinter.

        .. note::
            This is equivalent to :meth:`predict_full` and selecting the
            relevant column.

        :param df: A dataset to predict from. Must include all relevant
            variables.
        :return: An array of predicted classes.
        """
        return self.predict_full(df)[ColNames.dfg_cls_pred].values

    def reinit_model(self):
        """
        Reinitialize the three binary models. The "meta" model is left as is.
        """
        # NOTE(review): only the first three models are re-created; confirm
        # that skipping the meta model is intentional.
        for m in self.models[:3]:
            m.reinit_model()

    def score(self, df: pd.DataFrame, **kwargs):
        """
        Predict and score using ``f1_score()`` against the
        ``ColNames.dfg_cls`` column. The ``average`` defaults to "micro"
        unless provided via kwargs.

        :param df: A dataset with all relevant variables and true classes.
        :param kwargs: Passed to the scoring function.
        :return: The resulting score.
        """
        y_true = df[ColNames.dfg_cls].values
        y_pred = self.predict(df)
        if "average" not in kwargs:
            kwargs["average"] = "micro"
        return f1_score(y_true, y_pred, **kwargs)

    def generate_fold_idx(
        self, df: pd.DataFrame, n: int
    ) -> abc.Iterator[tuple[np.ndarray, np.ndarray]]:
        """
        Generate fold indices stratified by the target values, splitting on
        the unique values of the "ObjectID" column.

        :param df: Data with the "ObjectID" and target columns.
        :param n: The number of folds.
        :return: An iterator over ``(train, test)`` boolean index pairs.
        """
        return _generate_stratified_fold_idx(
            df["ObjectID"].values, np.squeeze(df[self.targets].values), n
        )

    def cv(self, df: pd.DataFrame, n: int, verbose: bool = True):
        """
        Cross-validate the composite model.

        :param df: Input data with features and target columns.
        :param n: The number of CV folds to use.
        :param verbose: Output progress bar.
        :return: The mean score across the testing folds.
        """
        return _cross_validate(self, df, n, verbose)

    def cv_pred(self, df: pd.DataFrame, n: int, verbose: bool = True):
        """
        Cross-validate the score and predict the data in test folds.

        :param df: Input data with features and target columns.
        :param n: The number of CV folds to use.
        :param verbose: Output progress bar.
        :return: A tuple with score and a copy of the supplied dataframe with
            fold assignment and model prediction columns added.
        """
        return _cross_validate_and_predict(self, df, n, verbose)
class EarlyStoppingCallback:
    """
    An Optuna callback stopping the study once the best value has not
    improved for a given number of consecutive trials.

    See https://github.com/optuna/optuna/issues/1001#issuecomment-862843041
    """

    def __init__(self, early_stopping_rounds: int, direction: str = "minimize") -> None:
        """
        :param early_stopping_rounds: The number of consecutive trials
            without improvement after which the study is stopped.
        :param direction: Either "minimize" or "maximize".
        :raises ValueError: If ``direction`` is neither of the two.
        """
        self.early_stopping_rounds = early_stopping_rounds
        self._iter = 0
        if direction not in ("minimize", "maximize"):
            raise ValueError(f"invalid direction: {direction}")
        minimizing = direction == "minimize"
        # "Improvement" means strictly lower when minimizing and strictly
        # higher when maximizing; the starting score is the worst possible.
        self._operator = op.lt if minimizing else op.gt
        self._score = np.inf if minimizing else -np.inf

    def __call__(self, study: optuna.Study, trial: optuna.Trial) -> None:
        """
        Update the no-improvement counter and stop the study if the counter
        reached the limit.

        :param study: The running study.
        :param trial: The finished trial (unused).
        """
        improved = self._operator(study.best_value, self._score)
        if not improved:
            self._iter += 1
        else:
            self._iter = 0
            self._score = study.best_value

        if self._iter < self.early_stopping_rounds:
            return
        LOGGER.info(
            f"Stopping optimization at max iter {self.early_stopping_rounds}"
        )
        study.stop()
def _apply_selection(df: pd.DataFrame, features: list[str]):
if not features:
return df
return df[features]
def _generate_fold_chunks(
obj_ids: np.ndarray, n_folds: int
) -> abc.Generator[tuple[np.ndarray, np.ndarray], None, None]:
ids = np.unique(obj_ids)
np.random.shuffle(ids)
chunks = np.array_split(ids, n_folds)
for i in range(n_folds):
chunk_test = chunks[i]
chunk_train = np.concatenate([x for j, x in enumerate(chunks) if j != i])
yield chunk_train, chunk_test
def _generate_fold_idx(
    obj_ids: np.ndarray, n_folds: int
) -> abc.Generator[tuple[np.ndarray, np.ndarray], None, None]:
    """
    Convert ID chunks produced by ``_generate_fold_chunks`` into boolean
    indices over ``obj_ids``.

    :param obj_ids: Object IDs, one per observation.
    :param n_folds: The number of folds.
    :return: A generator over ``(train, test)`` pairs of boolean arrays
        marking the observations whose IDs belong to the respective chunk.
    """
    for ids_train, ids_test in _generate_fold_chunks(obj_ids, n_folds):
        test_mask = np.isin(obj_ids, ids_test)
        yield np.isin(obj_ids, ids_train), test_mask
def _generate_stratified_fold_idx(
    obj_ids: abc.Sequence[abc.Hashable],
    target: abc.Sequence[abc.Hashable],
    n_folds: int,
):
    """
    Generate fold indices so that the object IDs of each target value are
    split into folds separately and then merged fold by fold.

    :param obj_ids: Object IDs, one per observation.
    :param target: Target values, one per observation.
    :param n_folds: The number of folds.
    :return: A generator over ``(train, test)`` pairs of boolean arrays
        over ``obj_ids``.
    """
    table = pd.DataFrame({"ObjectID": obj_ids, "Target": target})
    # One chunk generator per distinct target value.
    per_target = [
        _generate_fold_chunks(sub["ObjectID"], n_folds)
        for _, sub in table.groupby("Target")
    ]
    # Advance all generators in lockstep: the i-th fold of every target
    # value contributes to the i-th merged fold.
    for fold in zip(*per_target):
        train_ids = np.concatenate([pair[0] for pair in fold])
        test_ids = np.concatenate([pair[1] for pair in fold])
        yield np.isin(obj_ids, train_ids), np.isin(obj_ids, test_ids)
def _parse_pdb_id(obj_id: str) -> str:
    """
    Extract the single PDB ID matched by ``PDB_PATTERN`` in an object ID.

    :param obj_id: An object ID string.
    :return: The matched PDB ID.
    :raises ValueError: If the pattern matches nowhere or more than once.
    """
    matches = PDB_PATTERN.findall(obj_id)
    if len(matches) == 1:
        return matches[0]
    if matches:
        raise ValueError(f"Found multiple PDB IDs in the same object ID {obj_id}")
    raise ValueError(f"Failed to find any PDB IDs in {obj_id}")
def _get_unique_group(obj_id: str) -> str:
    """
    Derive a grouping key from an object ID.

    :param obj_id: An object ID string.
    :return: The PDB ID parsed from ``obj_id`` or, if parsing fails with
        ``ValueError``, the ``obj_id`` itself.
    """
    try:
        group = _parse_pdb_id(obj_id)
    except ValueError:
        group = obj_id
    return group
def _get_xy(
    df, features: list[str], targets: list[str] | None
) -> tuple[pd.DataFrame, pd.DataFrame | None]:
    """
    Split a table into predictors and responses.

    :param df: A table with feature and (optionally) target columns.
    :param features: Feature columns' names; if empty, all columns are kept.
    :param targets: Target columns' names. If empty or ``None``, no
        responses are extracted.
    :return: A tuple of the predictors table and the responses table
        (``None`` if no ``targets`` were given).
    """
    responses = None
    if targets:
        responses = _apply_selection(df, targets)
    predictors = _apply_selection(df, features)
    return predictors, responses
def _cross_validate(
    model: ModelBase,
    df: pd.DataFrame,
    n: int,
    verbose: bool = False,
) -> float:
    """
    Cross-validate a model: for each fold, reinitialize the model, train it
    on the train part, and score it on the test part.

    :param model: The model to cross-validate.
    :param df: Data with the features and target columns.
    :param n: The number of folds.
    :param verbose: If ``True``, show a progress bar and log the scores at
        the INFO level; otherwise, log them at the DEBUG level.
    :return: The mean of the fold scores.
    """
    folds = model.generate_fold_idx(df, n)
    if verbose:
        folds = tqdm(folds, total=n, desc="Cross-validating")

    fold_scores = []
    for train_mask, test_mask in folds:
        model.reinit_model()
        model.train(df[train_mask])
        fold_scores.append(model.score(df[test_mask]))

    score = float(np.mean(fold_scores))
    msg = f"Scores: {np.array(fold_scores).round(2)}; mean={score}"
    LOGGER.log(logging.INFO if verbose else logging.DEBUG, msg)
    return score
def _cross_validate_and_predict(
    model: ModelBase,
    df: pd.DataFrame,
    n: int,
    verbose: bool = False,
) -> tuple[float, pd.DataFrame]:
    """
    Cross-validate a model and record test-fold predictions.

    For each fold, the model is reinitialized, trained on the train part,
    and scored on the test part. Test-fold predictions and the 1-based fold
    number ("Fold_i") are written into a copy of ``df``. For a
    :class:`DFGClassifier`, the probability columns and the predicted class
    are stored; for other models, predictions go to the
    ``"<first target>_pred"`` column.

    :param model: The model to cross-validate.
    :param df: Data with the features and target columns. Not modified.
    :param n: The number of folds.
    :param verbose: If ``True``, show a progress bar and log the scores at
        the INFO level; otherwise, log them at the DEBUG level.
    :return: A tuple of the mean fold score and the annotated copy of ``df``.
    """
    df = df.copy()
    folds = model.generate_fold_idx(df, n)
    if verbose:
        folds = tqdm(folds, total=n, desc="Cross-validating")

    scores = []
    fold_i = 0
    for train_idx, test_idx in folds:
        fold_i += 1
        model.reinit_model()
        model.train(df[train_idx])
        scores.append(model.score(df[test_idx]))
        if not isinstance(model, DFGClassifier):
            pred_col = f"{model.targets[0]}_pred"
            df.loc[test_idx, pred_col] = model.predict(df[test_idx])
        else:
            df_pred = model.predict_full(df[test_idx])
            for col in model.proba_names:
                df.loc[test_idx, col] = df_pred[col]
            df.loc[test_idx, ColNames.dfg_cls_pred] = df_pred[ColNames.dfg_cls_pred]
        df.loc[test_idx, "Fold_i"] = fold_i

    score = mean(scores)
    msg = f"Scores: {scores}; mean={score}"
    LOGGER.log(logging.INFO if verbose else logging.DEBUG, msg)
    return score, df
def make(
    df: pd.DataFrame,
    targets: list[str],
    features: list[str],
    starting_params: dict[str, t.Any],
    use_early_stopping: bool = False,
    early_stopping_rounds_param_sel: int = 0,
    classifier: bool = True,
    n_trials_sel_1: int = 50,
    n_trials_sel_2: int = 50,
    n_final_cv: int = 10,
    boruta_kwargs: dict[str, t.Any] | None = None,
) -> tuple[KinactiveClassifier | KinactiveRegressor, float, pd.DataFrame]:
    """
    A pipeline creating a new ``KinActive`` model. The steps are:

    #. Initialize the model with the starting params.
    #. Run parameter selection (skipped if ``n_trials_sel_1`` is zero).
    #. Run feature selection.
    #. Run parameter selection again (skipped if ``n_trials_sel_2`` is
       zero).
    #. Cross-validate and predict on test folds.
    #. Train on the full dataset.

    :param df: A table to train on.
    :param targets: The names of the target columns.
    :param features: The names of the feature columns. If empty, all columns
        except "ObjectID" and ``targets`` are used.
    :param starting_params: The starting model's parameters.
    :param use_early_stopping: Use early stopping to cap the number of trees.
        The ``early_stopping_rounds`` param may be provided via
        ``starting_params``.
    :param early_stopping_rounds_param_sel: The number of early stopping rounds
        for the second hyperparameter optimization run. ``0`` indicates no
        early stopping.
    :param classifier: If ``True``, assume classification objective and init
        the :class:`KinactiveClassifier`. Otherwise, assume the regression and
        init the :class:`KinactiveRegressor`.
    :param n_trials_sel_1: The number of parameter selection rounds before the
        feature selection.
    :param n_trials_sel_2: The number of parameter selection rounds after the
        feature selection.
    :param n_final_cv: The number of CV folds for the final CV.
    :param boruta_kwargs: Passed to the ``eBoruta`` feature selector.
    :return: A tuple with the trained model, the final CV score, and a copy
        of ``df`` with test-fold predictions.
    """
    if not features:
        reserved = ["ObjectID", *targets]
        features = [c for c in df.columns if c not in reserved]

    if classifier:
        wrapper_cls, estimator = KinactiveClassifier, XGBClassifier()
    else:
        wrapper_cls, estimator = KinactiveRegressor, XGBRegressor()
    model = wrapper_cls(
        estimator,
        targets,
        features,
        params=starting_params,
        use_early_stopping=use_early_stopping,
    )

    if n_trials_sel_1 > 0:
        LOGGER.info(
            f"Selecting params using full feature set for {n_trials_sel_1} trials"
        )
        model.select_params(df, n_trials_sel_1)
        LOGGER.info(f"Final params: {model.params}")

    LOGGER.info("Selecting features")
    model.select_features(df, **(boruta_kwargs or {}))
    LOGGER.info(f"Selected {len(model.features)} features")

    if n_trials_sel_2 > 0:
        LOGGER.info("Selecting params 2")
        model.select_params(
            df, n_trials_sel_2, early_stopping_rounds=early_stopping_rounds_param_sel
        )
        LOGGER.info(f"Final params: {model.params}")

    cv_score, df_pred = model.cv_pred(df, n_final_cv, verbose=True)
    LOGGER.info(f"Final CV score: {cv_score}")

    model.train(df)
    return model, cv_score, df_pred
# This module is meant to be imported; executing it as a script raises.
if __name__ == "__main__":
    raise RuntimeError