From c43453a4619f77bacc33fcc278523bdcadb3e78d Mon Sep 17 00:00:00 2001 From: Tijana Zrnic Date: Tue, 4 Jun 2024 08:01:20 -0700 Subject: [PATCH] ppboot updates --- examples/census_healthcare_ppboot.ipynb | 382 ++++++++++++++++++++++++ ppi_py/baselines.py | 99 +++++- ppi_py/ppi.py | 6 + 3 files changed, 486 insertions(+), 1 deletion(-) create mode 100644 examples/census_healthcare_ppboot.ipynb diff --git a/examples/census_healthcare_ppboot.ipynb b/examples/census_healthcare_ppboot.ipynb new file mode 100644 index 0000000..1f979ba --- /dev/null +++ b/examples/census_healthcare_ppboot.ipynb @@ -0,0 +1,382 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "23c114b7-6751-4192-9939-86d40967caba", + "metadata": {}, + "source": [ + "# Correlation between income and private health insurance\n", + "\n", + "The goal is to investigate the correlation between income and the procurement of private health insurance using US census data. The target of inference is the Pearson correlation coefficient when regressing the binary indicator of health insurance on income. The data from California in the year 2019 is downloaded through the Folktables interface (1). Predictions of health insurance are made by training a gradient boosting tree via XGBoost (2) on the previous year’s data.\n", + "\n", + "Since the basic PPI method is not applicable to this estimation problem, we use PPBoot for prediction-powered inference. We also use the bootstrap for classical inference.\n", + "\n", + "1. F. Ding, M. Hardt, J. Miller, L. Schmidt, “Retiring adult: New datasets for fair machine learning” in Advances in Neural Information Processing Systems 34 (2021), pp. 6478–6490.\n", + "2. T. Chen, C. Guestrin, “XGBoost: A scalable tree boosting system” in Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (2016), pp. 785–794." 
+ ] + }, + { + "cell_type": "markdown", + "id": "fa0b89de-40f4-4225-ba6f-f35428d8f648", + "metadata": {}, + "source": [ + "### Import necessary packages" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bec4524b-d6bd-4d3c-ac59-2d6b77ac8a21", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import os, sys\n", + "\n", + "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))\n", + "import numpy as np\n", + "import pandas as pd\n", + "from ppi_py.datasets import load_dataset\n", + "from ppi_py import ppboot, classical_bootstrap_ci\n", + "from sklearn.linear_model import LogisticRegression\n", + "from tqdm import tqdm\n", + "from scipy.optimize import brentq\n", + "from utils import *" + ] + }, + { + "cell_type": "markdown", + "id": "5cf90ae6", + "metadata": {}, + "source": [ + "### Import the census healthcare data set\n", + "\n", + "Load the data. The data set contains reported indicators of health insurance (```Y```), predicted indicators of health insurance (```Yhat```), and reported income (```X```)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a6da3138", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_folder = \"./data/\"\n", + "data = load_dataset(dataset_folder, \"census_healthcare\")\n", + "Y_total = data[\"Y\"]\n", + "Yhat_total = data[\"Yhat\"]\n", + "X_total = data[\"X\"][:,0] # first coordinate is income; second is constant term" + ] + }, + { + "cell_type": "markdown", + "id": "8969f9db", + "metadata": {}, + "source": [ + "### Problem setup\n", + "\n", + "Specify the error level (```alpha```), range of values for the labeled data set size (```ns```), and number of trials (```num_trials```).\n", + "\n", + "Compute the ground-truth value of the estimand." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5b3c8f29", + "metadata": {}, + "outputs": [], + "source": [ + "alpha = 0.1\n", + "n_total = Y_total.shape[0] # Total number of labeled examples\n", + "ns = np.array([500, 750, 1000, 1500, 2000]).astype(\n", + " int\n", + ") # Test for different numbers of labeled examples\n", + "num_trials = 100\n", + "# define Pearson correlation coefficient\n", + "def pearson(X, Y):\n", + " return np.corrcoef(X, Y)[0,1]\n", + " \n", + "# Compute ground truth\n", + "true_theta = pearson(X_total, Y_total)" + ] + }, + { + "cell_type": "markdown", + "id": "83ce18be", + "metadata": {}, + "source": [ + "### Construct intervals\n", + "\n", + "Form confidence intervals for all methods and problem parameters. A dataframe with the following columns is formed:\n", + "1. ```method``` (one of ```PPI```, ```Classical```, and ```Imputation```)\n", + "2. ```n``` (labeled data set size, takes values in ```ns```)\n", + "3. ```lower``` (lower endpoint of the confidence interval)\n", + "4. ```upper``` (upper endpoint of the confidence interval)\n", + "5. 
```trial``` (index of trial, goes from ```0``` to ```num_trials-1```)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "812f8fd5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████| 100/100 [09:29<00:00, 5.70s/it]\n", + "100%|█████████████████████████████████████████| 100/100 [09:27<00:00, 5.68s/it]\n", + "100%|█████████████████████████████████████████| 100/100 [09:26<00:00, 5.66s/it]\n", + "100%|█████████████████████████████████████████| 100/100 [09:29<00:00, 5.69s/it]\n", + "100%|█████████████████████████████████████████| 100/100 [09:28<00:00, 5.69s/it]\n" + ] + } + ], + "source": [ + "# Run prediction-powered inference and classical inference for many values of n\n", + "results = []\n", + "for i in range(ns.shape[0]):\n", + " for j in tqdm(range(num_trials)):\n", + " # Prediction-Powered Inference\n", + " n = ns[i]\n", + " rand_idx = np.random.permutation(n_total)\n", + " _X, _X_unlabeled = X_total[rand_idx[:n]], X_total[rand_idx[n:]]\n", + " _Y, _Y_unlabeled = Y_total[rand_idx[:n]], Y_total[rand_idx[n:]]\n", + " _Yhat, _Yhat_unlabeled = (\n", + " Yhat_total[rand_idx[:n]],\n", + " Yhat_total[rand_idx[n:]],\n", + " )\n", + "\n", + " ppi_ci = ppboot(\n", + " pearson,\n", + " _Y,\n", + " _Yhat,\n", + " _Yhat_unlabeled,\n", + " _X,\n", + " _X_unlabeled,\n", + " alpha=alpha\n", + " )\n", + "\n", + " # Classical interval\n", + " classical_ci = classical_bootstrap_ci(pearson, _X, _Y, alpha=alpha)\n", + "\n", + " # Append results\n", + " results += [\n", + " pd.DataFrame(\n", + " [\n", + " {\n", + " \"method\": \"PPI\",\n", + " \"n\": n,\n", + " \"lower\": ppi_ci[0],\n", + " \"upper\": ppi_ci[1],\n", + " \"trial\": j,\n", + " }\n", + " ]\n", + " )\n", + " ]\n", + " results += [\n", + " pd.DataFrame(\n", + " [\n", + " {\n", + " \"method\": \"Classical\",\n", + " \"n\": n,\n", + " \"lower\": classical_ci[0],\n", + " \"upper\": classical_ci[1],\n", + " 
\"trial\": j,\n", + " }\n", + " ]\n", + " )\n", + " ]\n", + "\n", + "# Imputed CI\n", + "imputed_ci = classical_bootstrap_ci(pearson, X_total, (Yhat_total > 0.5).astype(int), alpha=alpha)\n", + "results += [\n", + " pd.DataFrame(\n", + " [\n", + " {\n", + " \"method\": \"Imputation\",\n", + " \"n\": np.nan,\n", + " \"lower\": imputed_ci[0],\n", + " \"upper\": imputed_ci[1],\n", + " \"trial\": 0,\n", + " }\n", + " ]\n", + " )\n", + "]\n", + "\n", + "df = pd.concat(results, axis=0, ignore_index=True)\n", + "df[\"width\"] = df[\"upper\"] - df[\"lower\"]" + ] + }, + { + "cell_type": "markdown", + "id": "d15ba288", + "metadata": {}, + "source": [ + "### Plot results\n", + "\n", + "Plot:\n", + "1. Five randomly chosen intervals from the dataframe for PPI and the classical method, and the imputed interval;\n", + "2. The average interval width for PPI and the classical method, together with a scatterplot of the widths from the five random draws." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6077b2c4", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "make_plots(\n", + " df,\n", + " \"./plots/census_corr.pdf\",\n", + " intervals_xlabel=\"Correlation coeff\",\n", + " n_idx=-1,\n", + " true_theta=true_theta,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2b114272-e293-4375-a5b8-0bf870857408", + "metadata": {}, + "source": [ + "### Power experiment\n", + "\n", + "For PPI and the classical approach, find the smallest value of ```n``` such that the method has power 80% against the null $H_0: \\theta^* < 0.15$." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c4fd41f6", + "metadata": {}, + "outputs": [], + "source": [ + "# Find n such that we reject H0: Corr coeff < 0.15 with probability 80% using a test at level alpha\n", + "num_experiments = 100\n", + "list_rand_idx = [\n", + " np.random.permutation(n_total) for i in range(num_experiments)\n", + "]\n", + "\n", + "\n", + "def _to_invert_ppi(n):\n", + " # print(f\"PPI: {n}\")\n", + " n = int(n)\n", + " nulls_rejected = 0\n", + " # Data setup\n", + " for i in range(num_experiments):\n", + " # print(f\"PPI: {n}, {i}\")\n", + " rand_idx = list_rand_idx[i]\n", + " _X, _X_unlabeled = X_total[rand_idx[:n]], X_total[rand_idx[n:]]\n", + " _Y, _Y_unlabeled = Y_total[rand_idx[:n]], Y_total[rand_idx[n:]]\n", + " _Yhat, _Yhat_unlabeled = (\n", + " Yhat_total[rand_idx[:n]],\n", + " Yhat_total[rand_idx[n:]],\n", + " )\n", + "\n", + " ppi_ci = ppboot(\n", + " pearson,\n", + " _Y,\n", + " _Yhat,\n", + " _Yhat_unlabeled,\n", + " _X,\n", + " _X_unlabeled,\n", + " alpha=alpha\n", + " )\n", + " if ppi_ci[0] > 0.15:\n", + " nulls_rejected += 1\n", + " return nulls_rejected / num_experiments - 0.8\n", + "\n", + "\n", + "def _to_invert_classical(n):\n", + " # print(f\"Classical: {n}\")\n", + " n = int(n)\n", + " nulls_rejected = 0\n", + " # Data setup\n", + " for i in range(num_experiments):\n", + " rand_idx = list_rand_idx[i]\n", + " _X, _X_unlabeled = 
X_total[rand_idx[:n]], X_total[rand_idx[n:]]\n", + " _Y, _Y_unlabeled = Y_total[rand_idx[:n]], Y_total[rand_idx[n:]]\n", + " _Yhat, _Yhat_unlabeled = (\n", + " Yhat_total[rand_idx[:n]],\n", + " Yhat_total[rand_idx[n:]],\n", + " )\n", + "\n", + " classical_ci = classical_bootstrap_ci(pearson, _X, _Y, alpha=alpha)\n", + " if classical_ci[0] > 0.15:\n", + " nulls_rejected += 1\n", + " return nulls_rejected / num_experiments - 0.8" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8ca727f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The PPBoot test requires n=264 labeled data points to reject the null.\n", + "The classical test requires n=314 labeled data points to reject the null.\n" + ] + } + ], + "source": [ + "n_ppi = int(brentq(_to_invert_ppi, 50, 800, xtol=50))\n", + "n_classical = int(brentq(_to_invert_classical, 50, 1000, xtol=50))\n", + "print(\n", + " f\"The PPBoot test requires n={n_ppi} labeled data points to reject the null.\"\n", + ")\n", + "print(\n", + " f\"The classical test requires n={n_classical} labeled data points to reject the null.\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a3676a6-fcb2-44e5-a0ff-85935df5f0c0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ppi_py/baselines.py b/ppi_py/baselines.py index f6c4e2e..36e93ca 100644 --- a/ppi_py/baselines.py +++ b/ppi_py/baselines.py @@ -5,7 +5,7 @@ from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic from 
sklearn.linear_model import LogisticRegression, LinearRegression from sklearn.isotonic import IsotonicRegression -from .utils import dataframe_decorator +from .utils import dataframe_decorator, bootstrap from .ppi import _ols, _wls import pdb @@ -309,3 +309,100 @@ def classical_logistic_ci(X, Y, alpha=0.1, alternative="two-sided"): return _zconfint_generic( pointest, np.sqrt(np.diag(cov_mat) / n), alpha, alternative ) + +""" + BOOTSTRAP CI + +""" + +def classical_bootstrap_ci( + estimator, + Y, + X=None, + n_resamples=1000, + alpha=0.1, + alternative="two-sided", + method="percentile", +): + """Classical bootstrap confidence interval for the estimator. + + Args: + estimator (callable): Estimator function. Takes in (X,Y) or (Y) and returns a point estimate. + Y (ndarray): Gold-standard labels. + X (ndarray, optional): Covariates corresponding to the gold-standard labels. Defaults to `None`. If `None`, the estimator is assumed to only take in `Y`. + n_resamples (int, optional): Number of bootstrap resamples. Defaults to `1000`. + alpha (float, optional): Error level; the confidence interval will target a coverage of 1 - alpha. Must be in (0, 1). Defaults to `0.1`. + alternative (str, optional): Alternative hypothesis, either 'two-sided', 'larger' or 'smaller'. 'larger' gives a lower confidence bound `(lower, inf)` and 'smaller' an upper confidence bound `(-inf, upper)`. Defaults to `'two-sided'`. + method (str, optional): Method to compute the confidence interval, either 'percentile' or 'basic'. Defaults to `'percentile'`. + + Returns: + tuple: Lower and upper bounds of the bootstrap confidence interval for the estimator; the unconstrained side of a one-sided interval is infinite. 
+ """ + + if X is None: + + pointest = estimator(Y) + + bootstrap_distribution = np.array( + bootstrap( + [Y], + estimator, + n_resamples=n_resamples + ) + ) + + else: + + pointest = estimator(X, Y) + + bootstrap_distribution = np.array( + bootstrap( + [X, Y], + estimator, + n_resamples=n_resamples, + paired=[[0, 1]] + ) + ) + + # Validate the alternative up front and map it to lower/upper tail error levels + if alternative == "two-sided": + alpha_lower = alpha / 2 + alpha_upper = alpha / 2 + elif alternative == "larger": + alpha_lower = alpha + alpha_upper = 0 + elif alternative == "smaller": + alpha_lower = 0 + alpha_upper = alpha + else: + raise ValueError( + "Alternative must be either 'two-sided', 'larger' or 'smaller'." + ) + + # Compute the lower and upper bounds depending on the method + if method == "percentile": + lower_bound = np.quantile( + bootstrap_distribution, alpha_lower, axis=0 + ) + upper_bound = np.quantile( + bootstrap_distribution, 1 - alpha_upper, axis=0 + ) + elif method == "basic": + lower_bound = 2 * pointest - np.quantile( + bootstrap_distribution, 1 - alpha_lower, axis=0 + ) + upper_bound = 2 * pointest - np.quantile( + bootstrap_distribution, alpha_upper, axis=0 + ) + else: + raise ValueError( + "Method must be either 'percentile' or 'basic'. The others are not implemented yet... want to contribute? ;)" + ) + + if alternative == "two-sided": + return lower_bound, upper_bound + elif alternative == "larger": + return lower_bound, np.inf + else: + return -np.inf, upper_bound + \ No newline at end of file diff --git a/ppi_py/ppi.py b/ppi_py/ppi.py index d499f6a..e9aa10c 100644 --- a/ppi_py/ppi.py +++ b/ppi_py/ppi.py @@ -1071,6 +1071,12 @@ def ppi_logistic_ci( ) +""" + PPBOOT + +""" + + def ppboot( estimator, Y,