{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\nGroupLasso for linear regression with dummy variables\n=====================================================\n\nA sample script for group lasso with dummy variables\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Setup\n-----\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import matplotlib.pyplot as plt\nimport numpy as np\nfrom sklearn.linear_model import Ridge\nfrom sklearn.metrics import r2_score\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import OneHotEncoder\n\nfrom group_lasso import GroupLasso\nfrom group_lasso.utils import extract_ohe_groups\n\nnp.random.seed(42)\nGroupLasso.LOG_LOSSES = True"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Set dataset parameters\n----------------------\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "num_categories = 30\nmin_options = 2\nmax_options = 10\nnum_datapoints = 10000\nnoise_std = 1"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Generate data matrix\n--------------------\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "X_cat = np.empty((num_datapoints, num_categories))\nfor i in range(num_categories):\n    X_cat[:, i] = np.random.randint(min_options, max_options, num_datapoints)\n\nohe = OneHotEncoder()\nX = ohe.fit_transform(X_cat)\ngroups = extract_ohe_groups(ohe)\ngroup_sizes = [np.sum(groups == g) for g in np.unique(groups)]\nactive_groups = [np.random.randint(0, 2) for _ in np.unique(groups)]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Generate coefficients\n---------------------\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "w = np.concatenate(\n    [\n        np.random.standard_normal(group_size) * is_active\n        for group_size, is_active in zip(group_sizes, active_groups)\n    ]\n)\nw = w.reshape(-1, 1)\ntrue_coefficient_mask = w != 0\nintercept = 2"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Generate regression targets\n---------------------------\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "y_true = X @ w + intercept\ny = y_true + np.random.randn(*y_true.shape) * noise_std"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "View noisy data and compute maximum R^2\n---------------------------------------\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "plt.figure()\nplt.plot(y, y_true, \".\")\nplt.xlabel(\"Noisy targets\")\nplt.ylabel(\"Noise-free targets\")\n# Use noisy y as true because that is what we would have access\n# to in a real-life setting.\nR2_best = r2_score(y, y_true)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Generate pipeline and train it\n------------------------------\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "pipe = pipe = Pipeline(\n    memory=None,\n    steps=[\n        (\n            \"variable_selection\",\n            GroupLasso(\n                groups=groups,\n                group_reg=0.1,\n                l1_reg=0,\n                scale_reg=None,\n                supress_warning=True,\n                n_iter=100000,\n                frobenius_lipschitz=False,\n            ),\n        ),\n        (\"regressor\", Ridge(alpha=1)),\n    ],\n)\npipe.fit(X, y)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Extract results and compute performance metrics\n-----------------------------------------------\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Extract from pipeline\nyhat = pipe.predict(X)\nsparsity_mask = pipe[\"variable_selection\"].sparsity_mask_\ncoef = pipe[\"regressor\"].coef_.T\n\n# Construct full coefficient vector\nw_hat = np.zeros_like(w)\nw_hat[sparsity_mask] = coef\n\nR2 = r2_score(y, yhat)\n\n# Print performance metrics\nprint(f\"Number variables: {len(sparsity_mask)}\")\nprint(f\"Number of chosen variables: {sparsity_mask.sum()}\")\nprint(f\"R^2: {R2}, best possible R^2 = {R2_best}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Visualise regression coefficients\n---------------------------------\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "for i in range(w.shape[1]):\n    plt.figure()\n    plt.plot(w[:, i], \".\", label=\"True weights\")\n    plt.plot(w_hat[:, i], \".\", label=\"Estimated weights\")\n\nplt.figure()\nplt.plot([w.min(), w.max()], [coef.min(), coef.max()], \"gray\")\nplt.scatter(w, w_hat, s=10)\nplt.ylabel(\"Learned coefficients\")\nplt.xlabel(\"True coefficients\")\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}