Source code for abess.datasets

import numpy as np


def sample(p, k):
    full = np.arange(p)
    select = sorted(np.random.choice(full, k, replace=False))
    return select


def sparse_beta_generator(p, Nonzero, k, M):
    Tbeta = np.zeros([p, M])
    beta_value = beta_generator(k, M)
    Tbeta[Nonzero, :] = beta_value
    return Tbeta


def beta_generator(k, M):
    # # strong_num <- 3
    # # moderate_num <- 7
    # # weak_num <- 5
    # # strong_num <- 10
    # # moderate_num <- 10
    # # weak_num <- 10
    strong_num = int(k * 0.3)
    moderate_num = int(k * 0.4)
    weak_num = k - strong_num - moderate_num
    # signal_num = strong_num + moderate_num + weak_num

    strong_signal = np.random.normal(
        0, 10, strong_num * M).reshape(strong_num, M)
    moderate_signal = np.random.normal(
        0, 5, moderate_num * M).reshape(moderate_num, M)
    weak_signal = np.random.normal(0, 2, weak_num * M).reshape(weak_num, M)
    beta_value = np.concatenate((strong_signal, moderate_signal, weak_signal))

    beta_value = beta_value[sample(k, k), :]

    # beta_value = np.random.normal(size=(k, M))
    return beta_value


[docs]class make_glm_data: r""" Generate a dataset with single response. Parameters ---------- n: int The number of observations. p: int The number of predictors of interest. k: int The number of nonzero coefficients in the underlying regression model. family: {gaussian, binomial, poisson, gamma, cox} The distribution of the simulated response. "gaussian" for univariate quantitative response, "binomial" for binary classification response, "poisson" for counting response, "gamma" for positive continuous response, "cox" for left-censored response. rho: float, optional, default=0 A parameter used to characterize the pairwise correlation in predictors. corr_type: string, optional, default="const" The structure of correlation matrix. "const" for constant pairwise correlation, "exp" for pairwise correlation with exponential decay. sigma: float, optional, default=1 The variance of the gaussian noise. It would be unused if snr is not None. coef_: array_like, optional, default=None The coefficient values in the underlying regression model. censoring: bool, optional, default=True For Cox data, it indicates whether censoring is existed. c: int, optional, default=1 For Cox data and censoring=True, it indicates the maximum censoring time. So that all observations have chances to be censored at (0, c). scal: float, optional, default=10 The scale of survival time in Cox data. snr: float, optional, default=None A numerical value controlling the signal-to-noise ratio (SNR) in gaussian data. class_num: int, optional, default=3 The number of possible classes in oridinal dataset, i.e. :math:`y \in \{0, 1, 2, ..., \text{class_num}-1\}` Attributes ---------- x: array-like, shape(n, p) Design matrix of predictors. y: array-like, shape(n,) Response variable. coef_: array-like, shape(p,) The coefficients used in the underlying regression model. It has k nonzero values. Notes ----- The output, whose type is named ``data``, contains three elements: ``x``, ``y`` and ``coef_``, which correspond the variables, responses and coefficients, respectively. Each row of ``x`` or ``y`` indicates a sample and is independent to the other. We denote :math:`x, y, \beta` for one sample in the math formulas below. * Linear Regression * Usage: ``family='gaussian'[, sigma=...]`` * Model: :math:`y \sim N(\mu, \sigma^2),\ \mu = x^T\beta`. * the coefficient :math:`\beta\sim U[m, 100m]`, where :math:`m = 5\sqrt{2\log p/n}`; * the variance :math:`\sigma = 1`. * Logistic Regression * Usage: ``family='binomial'`` * Model: :math:`y \sim \text{Binom}(\pi),\ \text{logit}(\pi) = x^T \beta`. * the coefficient :math:`\beta\sim U[2m, 10m]`, where :math:`m = 5\sqrt{2\log p/n}`. * Poisson Regression * Usage: ``family='poisson'`` * Model: :math:`y \sim \text{Poisson}(\lambda),\ \lambda = \exp(x^T \beta)`. * the coefficient :math:`\beta\sim U[2m, 10m]`, where :math:`m = 5\sqrt{2\log p/n}`. * Gamma Regression * Usage: ``family='gamma'`` * Model: :math:`y \sim \text{Gamma}(k, \theta),\ k\theta = -1/(x^T \beta + \epsilon), k\sim U[0.1, 100.1]` in shape-scale definition. * the coefficient :math:`\beta\sim U[m, 100m]`, where :math:`m = 5\sqrt{2\log p/n}`. * Cox PH Survival Analysis * Usage: ``family='cox'[, scal=..., censoring=..., c=...]`` * Model: :math:`y=\min(t,C)`, where :math:`t = \left[-\dfrac{\log U}{\exp(X \beta)}\right]^s,\ U\sim N(0,1),\ s=\dfrac{1}{\text{scal}}` and censoring time :math:`C\sim U(0, c)`. * the coefficient :math:`\beta\sim U[2m, 10m]`, where :math:`m = 5\sqrt{2\log p/n}`; * the scale of survival time :math:`\text{scal} = 10`; * censoring is enabled, and max censoring time :math:`c=1`. * Ordinal Regression * Usage: ``family='ordinal'[, class_num=...]`` * Model: :math:`y\in \{0, 1, \dots, n_{class}\}`, :math:`\mathbb{P}(y\leq i) = \dfrac{1} {1+\exp(-x^T\beta - \varepsilon_i)}`, where :math:`i\in \{0, 1, \dots, n_{class}\}` and :math:`\forall i<j, \varepsilon_i < \varepsilon_j`. * the coefficient :math:`\beta\sim U[-M, M]`, where :math:`M = 125\sqrt{2\log p/n}`; * the intercept: :math:`\forall i,\varepsilon_i\sim U[-M, M]`; * the number of classes :math:`n_{class}=3`. """
[docs] def __init__(self, n, p, k, family, rho=0, corr_type="const", sigma=1, coef_=None, censoring=True, c=1, scal=10, snr=None, class_num=3): self.n = n self.p = p self.k = k self.family = family if corr_type == "exp": # generate correlation matrix with exponential decay R = np.zeros((p, p)) for i in range(p): for j in range(i, p): R[i, j] = rho ** abs(i - j) R = R + R.T - np.identity(p) elif corr_type == "const": # generate correlation matrix with constant correlation R = np.ones((p, p)) * rho for i in range(p): R[i, i] = 1 else: raise ValueError( "corr_type should be \'const\' or \'exp\'") x = np.random.multivariate_normal(mean=np.zeros(p), cov=R, size=(n,)) nonzero = sample(p, k) Tbeta = np.zeros(p) sign = np.random.choice([1, -1], k) if family == "gaussian": m = 5 * np.sqrt(2 * np.log(p) / n) M = 100 * m if coef_ is None: Tbeta[nonzero] = np.random.uniform(m, M, k) * sign else: Tbeta = coef_ if snr is None: y = np.matmul(x, Tbeta) + sigma * np.random.normal(0, 1, n) else: y = np.matmul(x, Tbeta) power = np.mean(np.square(y)) npower = power / 10 ** (snr / 10) noise = np.random.randn(len(y)) * np.sqrt(npower) y += noise elif family == "binomial": m = 5 * sigma * np.sqrt(2 * np.log(p) / n) if coef_ is None: Tbeta[nonzero] = np.random.uniform(2 * m, 10 * m, k) * sign else: Tbeta = coef_ xbeta = np.matmul(x, Tbeta) xbeta[xbeta > 30] = 30 xbeta[xbeta < -30] = -30 p = np.exp(xbeta) / (1 + np.exp(xbeta)) y = np.random.binomial(1, p) elif family == "poisson": x = x / 16 m = 5 * sigma * np.sqrt(2 * np.log(p) / n) if coef_ is None: Tbeta[nonzero] = np.random.uniform(2 * m, 10 * m, k) * sign # Tbeta[nonzero] = np.random.normal(0, 4*m, k) else: Tbeta = coef_ xbeta = np.matmul(x, Tbeta) xbeta[xbeta > 30] = 30 xbeta[xbeta < -30] = -30 lam = np.exp(xbeta) y = np.random.poisson(lam=lam) elif family == "cox": m = 5 * sigma * np.sqrt(2 * np.log(p) / n) if coef_ is None: Tbeta[nonzero] = np.random.uniform(2 * m, 10 * m, k) * sign else: Tbeta = coef_ time = np.power(-np.log(np.random.uniform(0, 1, n)) / np.exp(np.matmul(x, Tbeta)), 1 / scal) if censoring: ctime = c * np.random.uniform(0, 1, n) status = (time < ctime) * 1 censoringrate = 1 - sum(status) / n print("censoring rate:" + str(censoringrate)) for i in range(n): time[i] = min(time[i], ctime[i]) else: status = np.ones(n) print("no censoring") y = np.hstack((time.reshape((-1, 1)), status.reshape((-1, 1)))) elif family == "gamma": x = x / 16 m = 5 * np.sqrt(2 * np.log(p) / n) if coef_ is None: Tbeta[nonzero] = np.random.uniform(m, 100 * m, k) * sign else: Tbeta = coef_ # add noise eta = x @ Tbeta + np.random.normal(0, sigma, n) # set coef_0 to make eta<0 eta = eta - np.abs(np.max(eta)) - 10 eta = -1 / eta # set the shape para of gamma uniformly in [0.1,100.1] shape_para = 100 * np.random.uniform(0, 1) + 0.1 y = np.random.gamma( shape=shape_para, scale=eta / shape_para, size=n) elif family == "ordinal": M = 125 * np.sqrt(2 * np.log(p) / n) if coef_ is None: Tbeta[nonzero] = np.random.uniform(-M, M, k) else: Tbeta = coef_ intercept = np.sort(np.random.uniform(-M, M, class_num - 1)) eta = x @ Tbeta[:, np.newaxis] + intercept logit = 1 / (1 + np.exp(-eta)) # prob prob = np.zeros((n, class_num)) prob[:, 0] = logit[:, 0] prob[:, 1:class_num - 1] = (logit[:, 1:class_num - 1] - logit[:, 0:class_num - 2]) prob[:, class_num - 1] = 1 - logit[:, class_num - 2] # y y = np.zeros(n) for i in range(n): y[i] = np.random.choice(np.arange(class_num), 1, p=prob[i, :]) else: raise ValueError( "Family should be \'gaussian\', \'binomial\', " "\'poisson\', \'gamma\', \'cox\', or \'ordinal\'.") self.x = x self.y = y self.coef_ = Tbeta
[docs]class make_multivariate_glm_data: r""" Generate a dataset with multi-responses. Parameters ---------- n: int, optional, default=100 The number of observations. p: int, optional, default=100 The number of predictors of interest. family: {multigaussian, multinomial, poisson}, optional default="multigaussian". The distribution of the simulated multi-response. "multigaussian" for multivariate quantitative responses, "multinomial" for multiple classification responses, "poisson" for counting responses. k: int, optional, default=10 The number of nonzero coefficients in the underlying regression model. M: int, optional, default=1 The number of responses. rho: float, optional, default=0.5 A parameter used to characterize the pairwise correlation in predictors. corr_type: string, optional, default="const" The structure of correlation matrix. "const" for constant pairwise correlation, "exp" for pairwise correlation with exponential decay. coef_: array_like, optional, default=None The coefficient values in the underlying regression model. sparse_ratio: float, optional, default=None The sparse ratio of predictor matrix (x). Attributes ---------- x: array-like, shape(n, p) Design matrix of predictors. y: array-like, shape(n, M) Response variable. coef_: array-like, shape(p, M) The coefficients used in the underlying regression model. It is rowwise sparse, with k nonzero rows. Notes ----- The output, whose type is named ``data``, contains three elements: ``x``, ``y`` and ``coef_``, which correspond the variables, responses and coefficients, respectively. Note that the ``y`` and ``coef_`` here are both matrix: 1. each row of ``x`` and ``y`` indicates a sample; 2. each column of ``coef_`` corresponds to the effect on one response. It is rowwise sparsity. Under this setting, a "useful" variable is relevant to all responses. We :math:`x, y, \beta` for one sample in the math formulas below. * Multitask Regression * Usage: ``family='multigaussian'`` * Model: :math:`y \sim MVN(\mu, \Sigma),\ \mu^T=x^T \beta`. * the variance :math:`\Sigma = \text{diag}(1, 1, \cdots, 1)`; * the coefficient :math:`\beta` contains 30% "strong" values, 40% "moderate" values and the rest are "weak". They come from :math:`N(0, 10)`, :math:`N(0, 5)` and :math:`N(0, 2)`, respectively. * Multinomial Regression * Usage: ``family='multinomial'`` * Model: :math:`y` is a "0-1" array with only one "1". Its index is chosed under probabilities :math:`\pi = \exp(x^T \beta)`. * the coefficient :math:`\beta` contains 30% "strong" values, 40% "moderate" values and the rest are "weak". They come from :math:`N(0, 10)`, :math:`N(0, 5)` and :math:`N(0, 2)`, respectively. """
[docs] def __init__(self, n=100, p=100, k=10, family="multigaussian", rho=0.5, corr_type="const", coef_=None, M=1, sparse_ratio=None): if corr_type == "exp": # generate correlation matrix with exponential decay R = np.zeros((p, p)) for i in range(p): for j in range(i, p): R[i, j] = rho ** abs(i - j) R = R + R.T - np.identity(p) elif corr_type == "const": # generate correlation matrix with constant correlation R = np.ones((p, p)) * rho for i in range(p): R[i, i] = 1 else: raise ValueError( "corr_type should be \'const\' or \'exp\'") X = np.random.multivariate_normal(mean=np.zeros(p), cov=R, size=(n,)) if sparse_ratio is not None: sparse_size = int((1 - sparse_ratio) * n * p) position = sample(n * p, sparse_size) print(position) for i in range(sparse_size): X[int(position[i] / p), position[i] % p] = 0 Nonzero = sample(p, k) # Nonzero = np.array([0, 1, 2]) # Nonzero[:k] = 1 if coef_ is None: Tbeta = sparse_beta_generator(p, Nonzero, k, M) else: Tbeta = coef_ if family in ("multigaussian", "gaussian"): eta = np.matmul(X, Tbeta) y = eta + np.random.normal(0, 1, n * M).reshape(n, M) elif family in ("multinomial", "binomial"): for i in range(M): Tbeta[:, i] = Tbeta[:, i] - Tbeta[:, M - 1] eta = np.exp(np.matmul(X, Tbeta)) # y2 = np.zeros(n) y = np.zeros([n, M]) index = np.linspace(0, M - 1, M) for i in range(n): p = eta[i, :] / np.sum(eta[i, :]) j = np.random.choice(index, size=1, replace=True, p=p) # print(j) y[i, int(j[0])] = 1 # y2[i] = j elif family == "poisson": X = X / 16 eta = np.matmul(X, Tbeta) eta[eta > 30] = 30 eta[eta < -30] = -30 lam = np.exp(eta) y = np.random.poisson(lam=lam) else: raise ValueError( "Family should be \'gaussian\', \'multigaussian\', " "or \'multinomial\'.") self.x = X self.y = y self.coef_ = Tbeta