
Commit

major refactoring + added trees, boosting, bagging and random forests
erachelson committed Apr 12, 2018
1 parent 009b97f commit 8fb3460
Showing 71 changed files with 6,416 additions and 2,379 deletions.
2 changes: 0 additions & 2 deletions .gitignore
@@ -229,6 +229,4 @@ TSWLatexianTemp*
# standalone packages
*.sta

# Fashion-MNIST data
5 - Artificial Neural Networks/data/

417 changes: 417 additions & 0 deletions 10 - Bagging/Bagging.ipynb

Large diffs are not rendered by default.

Binary file added 10 - Bagging/WhyBaggingWorks.pdf
Binary file not shown.
115 changes: 115 additions & 0 deletions 10 - Bagging/WhyBaggingWorks.tex
@@ -0,0 +1,115 @@
%\documentclass[handout]{beamer}
\documentclass{beamer}

\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{alltt}
\usepackage{color}
\newcommand{\highlight}[1]{\colorbox{yellow}{#1}}
\usepackage{verbatim}

%\usecolortheme{seagull}
\useoutertheme{infolines}
\usefonttheme[onlymath]{serif}

\author{Emmanuel Rachelson}
\title{An Introduction to Bagging}
\date{}

\newcommand{\argmax}{\operatornamewithlimits{argmax}}
\newcommand{\argmin}{\operatornamewithlimits{argmin}}

\setbeamertemplate{footline}{}
\setbeamertemplate{navigation symbols}{}

\begin{document}

\begin{frame}{Why does Bagging work?}
\only<1>{\begin{center}Why would $\varphi_B$ be any better than $\varphi$?\end{center}}
\only<2>{\underline{Illustration on the regression case:}\\
\begin{center}
Suppose $(X,Y)$ drawn from distribution $P_{X,Y}$.\\
$\varphi$ predictor trained on $\mathcal{T}$ or any bootstrap sample of $\mathcal{T}$\\
%$\varphi_\mathcal{T}$ predictor trained on $\mathcal{T}$\\
%$\varphi^b$ predictor trained on $\mathcal{T}^b$\\
$\hat{P}_\mathcal{T}$ empirical distribution of $\mathcal{T}$\\
$P_\mathcal{T}$ true distribution of $\mathcal{T}$\\
To simplify notation: $\mathbb{E}_{P_{X,Y}} = \mathbb{E}_{X,Y}$, $\mathbb{E}_{P_{\mathcal{T}}} = \mathbb{E}_{\mathcal{T}}$ and $\mathbb{E}_{\hat{P}_{\mathcal{T}}} = \mathbb{E}_{\hat{\mathcal{T}}}$.\\
$\varphi_B(\cdot) = \mathbb{E}_{\hat{\mathcal{T}}} \left( \varphi(\cdot) \right)$ Bagging predictor\\
$\varphi_A(\cdot) = \mathbb{E}_{\mathcal{T}} \left( \varphi(\cdot) \right)$ aggregated predictor\\
\end{center}
}
\only<3-13>{
\visible<3->{Average prediction error of $\varphi$: $e=\mathbb{E}_{\mathcal{T}}\left(\mathbb{E}_{X,Y}\left(\left[Y-\varphi\left(X\right)\right]^2\right)\right)$.\\}
\visible<4->{Average prediction error of $\varphi_A$: $e_A=\mathbb{E}_{X,Y}\left(\left[Y-\varphi_A\left(X\right)\right]^2\right)$.\\}
\visible<5->{
$e = \mathbb{E}_{X,Y}\left(Y^2\right) -2\mathbb{E}_{X,Y}\left(\only<5-6>{\alert<6>{\mathbb{E}_{\mathcal{T}}\left(Y\varphi\left(X\right)\right)}}\only<7->{Y\varphi_A(X)}\right) + \mathbb{E}_{X,Y}\left(\mathbb{E}_{\mathcal{T}}\left(\left[\varphi(X)\right]^2\right)\right)$\\
}
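% Note on the expansion above: $Y$ does not depend on the training set $\mathcal{T}$,
% so after swapping $\mathbb{E}_{X,Y}$ and $\mathbb{E}_{\mathcal{T}}$ the middle term satisfies
% $\mathbb{E}_{\mathcal{T}}\left(Y\varphi(X)\right) = Y\,\mathbb{E}_{\mathcal{T}}\left(\varphi(X)\right) = Y\varphi_A(X)$,
% which is the rewriting highlighted on the next overlay step.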
%\visible<7->{$e = \mathbb{E}_{X,Y}\left(Y^2\right) -2\mathbb{E}_{X,Y}\left( Y\varphi_A(X) \right) + \mathbb{E}_{X,Y}\left(\mathbb{E}_{\mathcal{T}}\left(\left[\varphi(X)\right]^2\right)\right)$\\}
\visible<8->{
But $\mathbb{E}_{X,Y}\left(\mathbb{E}_{\mathcal{T}}\left(\left[\varphi(X)\right]^2\right)\right) \geq \mathbb{E}_{X,Y}\left(\left[\only<8-9>{\alert<9>{\mathbb{E}_{\mathcal{T}}\left(\varphi(X)\right)}}\only<10->{\varphi_A(X)}\right]^2\right)$ \quad (Jensen's inequality)\\
}
\visible<11->{So \alert<11>{$e \geq e_A$}.\\}
\visible<12->{Moreover:\\
$e-e_A = \mathbb{E}_{X,Y}\left( \mathbb{E}_{\mathcal{T}}\left(\left[\varphi(X)\right]^2\right) - \left[\mathbb{E}_{\mathcal{T}}\left(\varphi(X)\right)\right]^2 \right)$\\
$e-e_A = \mathbb{E}_{X,Y}\left( \mathbb{E}_{\mathcal{T}}\left(\left[\varphi(X)\right]^2\right) - \left[\varphi_A(X)\right]^2 \right)$\\
}
\visible<13->{
~\\
\underline{Interpretation:} $e-e_A = \mathbb{E}_{X,Y}\left( \mathrm{Var}_{\mathcal{T}}\left(\varphi(X)\right) \right)$; if $\varphi_\mathcal{T}$ differs a lot from $\varphi_{\mathcal{T}'}$, then $e-e_A$ is large.\\
$\Rightarrow$ \alert<13>{The higher the variance of $\varphi$ across training sets $\mathcal{T}$, the greater the improvement $\varphi_A$ brings.}
}
}
\only<14-16>{
\begin{center}
\visible<14->{
OK, so $\varphi_A$ always improves on $\varphi$,\\
especially when $\varphi$ is highly variable w.r.t. changes in $\mathcal{T}$.\\}
\visible<15->{
~\\
But $\varphi_A$ is not $\varphi_B$. Recall:\\
$\varphi_A(\cdot) = \mathbb{E}_{\mathcal{T}} \left( \varphi(\cdot) \right)$ aggregated predictor (over all $N$-size training sets)\\
$\varphi_B(\cdot) = \mathbb{E}_{\hat{\mathcal{T}}} \left( \varphi(\cdot) \right)$ Bagging predictor (over bootstrap samples)\\
$\varphi_B$ approximates $\varphi_A$ and thus $e_B\geq e_A$\\
}
\visible<16->{
~\\
\begin{itemize}
\item If $\varphi$ is highly variable w.r.t. $\mathcal{T}$, $\varphi_B$ improves on $\varphi$ through aggregation.
\item But if $\varphi$ is rather stable w.r.t. $\mathcal{T}$, then $e_A\approx e$ and, since $\varphi_B$ only approximates $\varphi_A$, $e_B$ might be greater than $e$.
\end{itemize}
}
\end{center}
}
\only<17-18>{
\visible<17->{
So it does not always work?\\
}
\visible<18->{
Actually, no, it does not always work.\\
Bagging should be used to transform highly variable predictors $\varphi$ into a more accurate averaged committee $\varphi_B$.\\
~\\
Examples of $\varphi$ that Bagging improves:\\
$\rightarrow$ Trees, Neural Networks.\\
Examples of $\varphi$ that Bagging does not improve much (or degrades):\\
$\rightarrow$ Support Vector Machines, Gaussian Processes.
}
}
\only<19->{
\visible<19->{And in the classification case?\\}
\visible<20->{
Majority vote: $\varphi_B(x) = \arg\max\limits_{j} \sum\limits_{b=1}^B I(\varphi^b(x) = j)$\\
The conclusions are more clear-cut:
\begin{itemize}
\item $\varphi$ unstable w.r.t. $\mathcal{T}$ and reasonable performance $\Rightarrow$ $\varphi_B$ near optimal.
\item $\varphi$ stable w.r.t. $\mathcal{T}$ $\Rightarrow$ $\varphi_B$ worse than $\varphi$.
\item $\varphi$ poor performance $\Rightarrow$ $\varphi_B$ worse than $\varphi$.
\end{itemize}
}
}
\end{frame}

\end{document}
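The Bagging.ipynb added by this commit is not rendered above, so here is a minimal, illustrative sketch (not the notebook's code) of the regression-case bagging predictor $\varphi_B$ from the slides: train one high-variance learner (a deep tree) per bootstrap sample of the training set, then average the predictions. The names fit_bagging, bagging_predict and n_estimators are made up for this example; X and y are assumed to be NumPy arrays.

import numpy as np
from sklearn.tree import DecisionTreeRegressor

def fit_bagging(X, y, n_estimators=50, random_state=0):
    """Train one deep regression tree per bootstrap sample of (X, y)."""
    rng = np.random.RandomState(random_state)
    n = X.shape[0]
    models = []
    for _ in range(n_estimators):
        idx = rng.randint(0, n, size=n)  # n indices drawn with replacement: one bootstrap sample
        models.append(DecisionTreeRegressor().fit(X[idx], y[idx]))
    return models

def bagging_predict(models, X):
    """phi_B(X): average the individual predictions, as in the regression argument above."""
    return np.mean([m.predict(X) for m in models], axis=0)

On data where a single deep tree overfits (i.e. $\varphi$ is highly variable w.r.t. $\mathcal{T}$), the averaged prediction is typically noticeably more accurate, which is exactly the $e \geq e_A$ argument of the slides.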
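For the classification case, the slide's majority vote $\varphi_B(x) = \arg\max_j \sum_b I(\varphi^b(x) = j)$ can be sketched the same way; again this is illustrative only, with fit_bagging_clf and majority_vote as hypothetical names and a decision tree as the unstable base learner.

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def fit_bagging_clf(X, y, n_estimators=50, random_state=0):
    """Train one classification tree per bootstrap sample of (X, y)."""
    rng = np.random.RandomState(random_state)
    n = X.shape[0]
    models = []
    for _ in range(n_estimators):
        idx = rng.randint(0, n, size=n)  # bootstrap sample of the training indices
        models.append(DecisionTreeClassifier().fit(X[idx], y[idx]))
    return models

def majority_vote(models, X):
    """phi_B(X): for each sample, return the class predicted by the most models."""
    votes = np.stack([m.predict(X) for m in models])  # shape (n_models, n_samples)
    preds = []
    for column in votes.T:  # one vector of votes per sample
        classes, counts = np.unique(column, return_counts=True)
        preds.append(classes[np.argmax(counts)])
    return np.array(preds)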
972 changes: 972 additions & 0 deletions 11 - Random Forests/Random Forests.ipynb

Large diffs are not rendered by default.

Binary file added 11 - Random Forests/pres.pdf
Binary file not shown.
@@ -60,20 +60,6 @@

\begin{document}

\begin{frame}{Decision trees}
\begin{itemize}
\item Easy to interpret and to explain
\item Poor representative power
\item Greedy growth procedure $\Rightarrow$ suboptimal resulting tree
\item Offline training
\item Very sensitive to noise in the input data
\end{itemize}
\end{frame}

\begin{frame}{Bagging}

\end{frame}

\begin{frame}{Random Forests}
\begin{itemize}
\item RF = decision trees + random feature selection + Bagging
@@ -2196,7 +2196,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"version": "3.5.4"
}
},
"nbformat": 4,