"""
Work with scikit-learn
======================
"""
# %%
# ``abess`` is very easy to work with the famous package ``scikit-learn``, and here is an example.
# We going to illustrate the integration of the ``abess`` with ``scikit-learn``’s pre-processing and model selection modules to
# build a non-linear model for diagnosing malignant tumors.
# Let start with importing necessary dependencies:
import numpy as np
from abess.datasets import make_glm_data
from abess.linear import LinearRegression, LogisticRegression
from sklearn.datasets import fetch_openml, load_breast_cancer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import roc_auc_score, make_scorer, roc_curve, auc
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.feature_selection import SelectFromModel
#%%
# Establish the process
# ---------------------
# Suppose we would like to extend the original variables to their
# interactions, and then do ``LogisticRegression`` on them. This can be
# record with ``Pipeline``:
pipe = Pipeline([
('poly', PolynomialFeatures(include_bias=False)), # without intercept
('standard', StandardScaler()),
('alogistic', LogisticRegression())
])
#%%
# Parameter grid
# --------------
# We can give different parameters to model and let the program choose the
# best. Here we should give parameters for ``PolynomialFeatures``, for
# example:
param_grid = {
# whether the "self-combination" (e.g. X^2, X^3) exists
'poly__interaction_only': [True, False],
'poly__degree': [1, 2, 3] # the degree of polynomial
}
# %%
# Note that the program would try all combinations of what we give, which means that there are :math:`2\times3=6` combinations of parameters will be tried.
#
# Criterion
# ---------
# After giving a grid of parameters, we should define what is a "better"
# result. For example, the AUC (area under ROC curve) can be a criterion
# and the larger, the better.
scorer = make_scorer(roc_auc_score, greater_is_better=True)
#%%
# Cross Validation
# ----------------
# For more accurate results, cross validation (CV) is often formed.
#%%
# Suppose that the data is independent and identically distributed (i.i.d.)
# that all samples stem from the same generative process
# and that the generative process has no memory of past generated samples.
# A typical CV strategy is K-fold and a corresponding grid search procedure
# can be made as follows:
grid_search = GridSearchCV(pipe, param_grid, scoring=scorer, cv=5)
#%%
# However, if there exists correlation between observations (e.g. time-series data),
# K-fold strategy is not appropriate any more. An alternative CV strategy is ``TimeSeriesSplit``.
# It is a variation of K-fold which returns first K folds as train set and the
# (K+1)-th fold as test set.
#%%
# The following example shows a combinatioon of ``abess``
# and ``TimeSeriesSplit`` applied to ``Bike_Sharing_Demand`` dataset and it returns the
# cv score of a specific choice of ``support_size``.
bike_sharing = fetch_openml('Bike_Sharing_Demand', version=2, as_frame=True)
df = bike_sharing.frame
X = df.drop('count', axis='columns')
y = df['count'] / df['count'].max()
ts_cv = TimeSeriesSplit(
n_splits=5,
gap=48,
max_train_size=10000,
test_size=1000,
)
categorical_columns = ['weather', 'season', 'holiday', 'workingday',]
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
one_hot_abess_pipeline = make_pipeline(
ColumnTransformer(
transformers=[
('categorical', one_hot_encoder, categorical_columns),
('one_hot_time', one_hot_encoder, ['hour', 'weekday', 'month']),
],
remainder=MinMaxScaler(),
),
LinearRegression(support_size=5),
)
scores = cross_val_score(one_hot_abess_pipeline, X, y, cv=ts_cv)
print("%0.2f score with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
#%%
# Model fitting
# -------------
# Eveything is prepared now. We can simply load the data and put it into
# ``grid_search``:
X, y = load_breast_cancer(return_X_y=True)
grid_search.fit(X, y)
print([grid_search.best_score_, grid_search.best_params_])
# %%
# The output of the code reports the information of the polynomial features for the selected model among candidates,
# and its corresponding area under the curve (AUC), which is over 0.97,
# indicating the selected model would have an admirable contribution in practice.
#
# Moreover, the best choice of parameter combination is shown above: 2 degree with "self-combination",
# implying the inclusion of the pairwise interactions between any two
# features can lead to a better model generalization.
# %%
# Here is its ROC curve:
import matplotlib.pyplot as plt
proba = grid_search.predict_proba(X)
fpr, tpr, _ = roc_curve(y, proba[:, 1])
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--', label="ROC curve (area = %0.2f)" % auc(fpr, tpr))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("Receiver operating characteristic (ROC) curve")
plt.legend(loc="lower right")
plt.show()
#%%
# Feature selection
# ------------------
#%%
# Besides being used to make prediction explicitly, ``abess`` can be exploited to
# select important features.
# The following example shows how to perform abess-based feature selection
# using ``sklearn.feature_selection.SelectFromModel``.
#%%
np.random.seed(0)
n, p, k = 300, 1000, 5
data = make_glm_data(n=n, p=p, k=k, family='gaussian')
X, y = data.x, data.y
print('Shape of original data: ', X.shape)
model = LinearRegression().fit(X, y)
sfm = SelectFromModel(model, prefit=True)
X_new = sfm.transform(X)
print('Shape of transformed data: ', X_new.shape)
# %%
# sphinx_gallery_thumbnail_path = 'Tutorial/figure/scikit_learn.png'