From cf29925273aca7c6c5c21b83b7fe47df5d1dbba5 Mon Sep 17 00:00:00 2001
From: Dan Davison
Date: Thu, 2 Jul 2020 13:32:28 -0400
Subject: [PATCH] Add uncommitted *.tex

---
 .gitignore                                    |   1 +
 computer-science--concurrency.tex             |  33 +++++
 ...--error-detection-and-error-correction.tex |   0
 discrete-math--chess.tex                      |  76 ++++++++++
 elaenia.tex                                   | 134 ++++++++++++++++++
 hmm.tex                                       |   6 +
 linear-algebra-kun-pimbook-exercises.tex      |  34 +++++
 neural_networks.tex                           |  12 ++
 xenops.tex                                    |  51 +++++++
 9 files changed, 347 insertions(+)
 create mode 100644 computer-science--concurrency.tex
 create mode 100644 computer-science--error-detection-and-error-correction.tex
 create mode 100644 discrete-math--chess.tex
 create mode 100644 elaenia.tex
 create mode 100644 hmm.tex
 create mode 100644 linear-algebra-kun-pimbook-exercises.tex
 create mode 100644 neural_networks.tex
 create mode 100644 xenops.tex

diff --git a/.gitignore b/.gitignore
index b8a149e..dfd821c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@
 _region_.tex
 fragments.tex
 junk.tex
+z.tex
 .ignore

diff --git a/computer-science--concurrency.tex b/computer-science--concurrency.tex
new file mode 100644
index 0000000..1e43eef
--- /dev/null
+++ b/computer-science--concurrency.tex
@@ -0,0 +1,33 @@
\section{aio}
\begin{itemize}
\item A promise is a pair (result, callbacks).
\item To {\it subscribe} to a promise means to add your callback to it (aio-listen).
\item Callbacks take one argument: the {\it value function} that will be supplied when the promise is resolved.
\item To {\it resolve} a promise means to call all of its callbacks, passing each a {\it value function} (aio-resolve).
\item The simplest example of the promise lifecycle is a non-async function creating and resolving a promise:

\begin{minted}{emacs-lisp}
;; -*- lexical-binding: t; -*-
(require 'aio)

(defun make-call-function-promise (seconds function)
  "Return a promise that calls FUNCTION after SECONDS seconds."
  (let ((promise (aio-promise))
        (value-function (lambda () (funcall function))))
    ;; Subscribe: when the promise resolves, call the supplied value function.
    (aio-listen promise (lambda (value-function) (funcall value-function)))
    (prog1 promise
      ;; Resolve the promise after SECONDS seconds.
      (run-at-time seconds nil #'aio-resolve promise value-function))))

(make-call-function-promise 1 (lambda () (message "Hello world!")))
\end{minted}
\end{itemize}

\subsection{Chaining promises}

To chain two promises means to create an awaitable object which:
\begin{enumerate}
\item can be kicked off, and
\item when the first promise resolves, kicks off the second.
\end{enumerate}

\begin{minted}{emacs-lisp}
;; `schedule' is a placeholder for whatever kicks off promise-2.
(aio-listen promise-1 (lambda (value-function) (schedule promise-2)))
\end{minted}

diff --git a/computer-science--error-detection-and-error-correction.tex b/computer-science--error-detection-and-error-correction.tex
new file mode 100644
index 0000000..e69de29

diff --git a/discrete-math--chess.tex b/discrete-math--chess.tex
new file mode 100644
index 0000000..25c54af
--- /dev/null
+++ b/discrete-math--chess.tex
@@ -0,0 +1,76 @@
\url{https://erikbern.com/2014/11/29/deep-learning-for-chess.html}

In chess there are 64 squares and 12 piece types.

\begin{definition}
  A {\it position} comprises two pieces of information:
  \begin{enumerate}
  \item an assignment to each of the 64 squares of one of the 12 piece types, or EMPTY;
  \item whether white or black is to play next.
  \end{enumerate}
  A {\it valid position} is a position in which the count of each piece type is less than or equal to its initial count.

  A {\it terminal position} is a valid position which is a win for white, a win for black, or a draw.
\end{definition}
Define the set $F = \{-1, 0, 1\}$ of labels. We label a terminal position $p$ as follows:
\begin{align*}
  f(p) =
  \begin{cases}
    -1 & \text{if $p$ is a win for white} \\
    0 & \text{if $p$ is a draw} \\
    +1 & \text{if $p$ is a win for black}.
  \end{cases}
\end{align*}
We extend this definition recursively to non-terminal positions as follows. Define $M(p)$ to be the set of
valid positions reachable in one move from $p$, and define $M^{-1}(p)$ to be the set of valid positions from
which $p$ is reachable in one move. White prefers smaller labels and black prefers larger ones, so the label
of a non-terminal position $p$ is
\begin{align*}
  f(p) =
  \begin{cases}
    \min_{p' \in M(p)} f(p') & \text{if white is to play in $p$} \\
    \max_{p' \in M(p)} f(p') & \text{if black is to play in $p$}.
  \end{cases}
\end{align*}
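For concreteness, here is a minimal sketch of this recursion in code. The {\tt Position} interface
({\tt is\_terminal}, {\tt terminal\_label}, {\tt white\_to\_play}, {\tt moves}) is hypothetical, not a real
chess library:

\begin{minted}{python}
def label(p):
    """f(p) in {-1, 0, +1}: -1 = win for white, +1 = win for black."""
    if p.is_terminal:
        return p.terminal_label()
    child_labels = [label(q) for q in p.moves()]  # q ranges over M(p)
    # White, preferring small labels, minimises; black maximises.
    return min(child_labels) if p.white_to_play else max(child_labels)
\end{minted}

(Run naively, this recursion is of course intractable for chess; hence the backwards, retrograde formulation
below.)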
In practice we would compute these labels by {\it retrograde analysis}: start from the terminal positions,
whose labels are known directly, and propagate labels backwards through $M^{-1}$. A sketch (the
{\tt Position} attributes are again assumed, not a real chess API):

\begin{minted}{python}
def visit(position):
    """Propagate position's label back to the positions in M^{-1}(position)."""
    if position.label is None:
        return  # this position is not yet labelled; nothing to propagate

    for ancestor in position.get_ancestors():  # the set M^{-1}(position)
        # Record this position's label as one of the options available to
        # the player to move in `ancestor'. Once every q in M(ancestor) is
        # labelled, f(ancestor) is the min or max of ancestor.child_labels.
        ancestor.child_labels.add(position.label)
\end{minted}

Concretely, let $p$ (white to play) be a valid non-terminal position. If $f(q) = -1$ for some $q \in M(p)$,
then white can win in one move, and accordingly $f(p) = -1$, as the $\min$ in the definition requires.
Working backwards instead: let $q$ be a terminal position (black to play) with $f(q) = -1$, and let
$P = M^{-1}(q)$ be the set of (white to play) positions from which $q$ is reachable in one move. Then white
can win in one move from every $p \in P$, so $f(p) = -1$ for all $p \in P$. If instead $f(q) \neq -1$ for
every $q \in M(p)$, the label of $p$ is determined only once all of $M(p)$ has been labelled.

diff --git a/elaenia.tex b/elaenia.tex
new file mode 100644
index 0000000..955f95f
--- /dev/null
+++ b/elaenia.tex
@@ -0,0 +1,134 @@
\documentclass{article}

\begin{document}

\section{Objective}

We record an audio signal on a smart phone and we want to infer:
\begin{enumerate}
\item the set of species that are vocalising during the recording;
\item the points in time at which each vocalisation starts and stops.
\end{enumerate}

\section{VGGish}
\footnote{
  \url{https://github.com/tensorflow/models/tree/master/research/audioset/vggish#input-audio-features}
  \url{https://arxiv.org/pdf/1903.00765.pdf}
  Google describe the initial step as: ``Audio signal is converted to a log-mel spectrogram via STFT
  (window-size 25ms, hop-size 10ms, Hann window)''. The hop size is the time between the starts of
  successive windows, so 25ms windows with a 10ms hop overlap by 15ms.
}
\begin{enumerate}
\item Input audio is divided into non-overlapping 0.96s segments (treated as distinct observations for training).
\item Each observation is represented as a $96 \times 64$ pixel input spectrogram image (a log mel spectrogram with 10ms frames).
\item The penultimate (``bottleneck'') layer has 128 units: these values are the ``embedding'' that summarizes one 0.96s input.

\item {\bf VGGish training}:
  \begin{enumerate}
  \item The VGGish network was trained by Google on a large YouTube audio data set.
  \item Essentially, it learns to associate 0.96s frames with the set of tags of the parent video.
  \item The trained VGGish is henceforth a fixed function that maps a 0.96s frame to an ``embedding''
    representation.
  \item So whatever that embedding is, it should contain good information for classifying the sound class.
  \item Note that VGGish has discarded all information about patterns that extend over more than 0.96s.
  \item On the other hand, its classifications are based on many frames per sound type.
  \item So the embedding is going to be fairly generic: for every sound type, for any 0.96s frame, if
    that frame is informative for classification, then the embedding will capture some information from it.
  \end{enumerate}

\item {\bf Bird audio classifier training}:
  \begin{enumerate}
  \item Each labeled training recording is broken into 0.96s frames.
  \item Use the trained VGGish to compute the embedding vector for each frame.
  \item Use these labeled embedding vectors to train the final classifier (an SVM).
  \item Thus the final classifier learns to classify embedding vectors to species.
  \item Note: the final classifier does not have access to any information about patterns extending over more than 0.96s.
  \item On the other hand, it does use many 0.96s frames per species.
  \end{enumerate}
\item {\bf Inference}:
  \begin{enumerate}
  \item The bird recording is broken into 0.96s frames.
  \item For each frame, use the trained VGGish to compute the embedding.
  \item Use the trained final classifier to classify each embedding to a species.
  \item The final classification is a majority vote among the per-frame classifications.
  \end{enumerate}
\end{enumerate}
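To make the shapes concrete, here is a sketch of the framing and spectrogram step. This is not Google's
actual preprocessing code; it assumes {\tt librosa}, with parameters chosen to match the stated 25ms window,
10ms hop and 64 mel bands:

\begin{minted}{python}
import librosa

def logmel_examples(y, sr):
    """Split an audio signal into 0.96s examples of shape (96, 64)."""
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr,
        n_fft=int(0.025 * sr),       # 25ms window
        hop_length=int(0.010 * sr),  # 10ms hop
        n_mels=64)
    logmel = librosa.power_to_db(mel)  # shape (64, n_frames)
    n = logmel.shape[1] // 96          # 96 frames = 0.96s per example
    # -> (n_examples, 96, 64): non-overlapping 0.96s segments
    return logmel[:, :n * 96].T.reshape(n, 96, 64)
\end{minted}

Each of the $n$ resulting $96 \times 64$ arrays is one observation of the kind VGGish takes as input.
Similarly, a sketch of the classifier stage on top of the embeddings, where VGGish is treated as a
black-box function {\tt embed} from a (96, 64) observation to a 128-vector (again hypothetical, not the
paper's code):

\begin{minted}{python}
import numpy as np
from sklearn.svm import SVC

def train(recordings, species_labels, embed):
    X, y = [], []
    for rec, species in zip(recordings, species_labels):
        for example in rec:           # rec: array of (96, 64) observations
            X.append(embed(example))  # 128-dimensional embedding
            y.append(species)
    return SVC().fit(np.array(X), np.array(y))

def classify(clf, rec, embed):
    # Majority vote over the per-frame classifications.
    votes = clf.predict(np.array([embed(e) for e in rec]))
    values, counts = np.unique(votes, return_counts=True)
    return values[np.argmax(counts)]
\end{minted}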
\section{Inference}
What do we ultimately want the inference phase to look like?
\begin{enumerate}
\item Take short frames from a live recording in real time and map them to classifier output?
  If so, then we will not be using any information about extended patterns (duration, repeat intervals, etc.) in
  the vocalisation.
\item Or should we also be able to process an entire recording?
\end{enumerate}

\section*{Training data}
\begin{enumerate}
\item For every bird species, there exist many recordings labeled with the species name.
\item It is unknown at what times during a recording the bird is vocalising, and it is unknown which vocalisation
  types are involved (song, call, etc.).
\item Suppose we are in a location with only one possible bird species, and that it makes only one vocalisation type.
\item We record audio. The problem is now:
  \begin{quote}
    Does the audio contain the bird noise which is present in the training data?
  \end{quote}
\item What would a likelihood-based approach look like?
\end{enumerate}

\section*{Notation}
\begin{tabular}{l|l}
  $y_n \in \R$ & the signal (amplitude) at time point $n$ \\
  $Y_m \in \R^d$ & the frequency-domain coordinates of the signal during time window $m$
\end{tabular}

\section{Model}

Let $k$ be the species identity (or no-bird). The likelihood of the data is
\begin{align*}
  P(y|k) = P(y_1, \ldots, y_N | k).
\end{align*}
Alternatively, we can compute the likelihood of the short-time Fourier-transformed signal; if the STFT is
taken to be an invertible (unitary) change of basis, this is an equivalent representation of the data and
\begin{align*}
  P(y|k) = P(Y|k) = P(Y_1, \ldots, Y_M|k).
\end{align*}

\begin{enumerate}
\item The input vector (a field recording) has one variable high dimension (the number of time windows); the second
  dimension could be a fixed number of frequency windows.
\item The output layer is $K$-dimensional, where $K$ is the number of possible species.
\item So we could try to find sub-intervals of the input time dimension which give a strong signal in the output layer.
\end{enumerate}
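A sketch of computing the $Y_m$ from the raw signal, assuming {\tt scipy} (the window and hop values are
illustrative):

\begin{minted}{python}
import numpy as np
from scipy.signal import stft

sr = 22050
y = np.random.randn(5 * sr)  # placeholder for a field recording y_1, ..., y_N
f, t, Y = stft(y, fs=sr,
               nperseg=int(0.025 * sr),   # 25ms window
               noverlap=int(0.015 * sr))  # i.e. a 10ms hop
# Y has shape (n_frequencies, M): column m holds the coordinates Y_m.
\end{minted}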
\section{Ideal model}

\begin{enumerate}
\item From the training data for species $k$, we learn a generative model for that species' vocalisations.
\item We classify new data $y$ to the species model under which $y$ has the highest likelihood of being generated.
\end{enumerate}
So what would a generative model for the recorded data look like? What would a generative model for the
STFT look like?

We could
\begin{enumerate}
\item classify each recording to the closest training sample. (What distance metric?)
\end{enumerate}

(For now, we assume that different bird vocalisations do not overlap in time.)

\section*{STFT background}
The discrete-time STFT divides the signal into time windows, and performs a Fourier transform on each window.
The Fourier transform of a signal can be viewed as representing the signal by its coordinates in a new basis.
Thus the STFT converts the 1D time series into a higher-dimensional time series (with coarser time buckets).

What is an example of an STFT-based algorithm that could conceivably work?

\end{document}

diff --git a/hmm.tex b/hmm.tex
new file mode 100644
index 0000000..c8674e9
--- /dev/null
+++ b/hmm.tex
@@ -0,0 +1,6 @@
\documentclass{article}

\begin{document}


\end{document}
\ No newline at end of file

diff --git a/linear-algebra-kun-pimbook-exercises.tex b/linear-algebra-kun-pimbook-exercises.tex
new file mode 100644
index 0000000..898d87b
--- /dev/null
+++ b/linear-algebra-kun-pimbook-exercises.tex
@@ -0,0 +1,34 @@
\begin{mdframed}
\includegraphics[width=400pt]{img/linear-algebra-kun-pimbook-exercises--6242.png}
\end{mdframed}

The properties of the zero vector are
\begin{enumerate}
\item $\0 + u = u$ for every vector $u$ (additive identity)
\item $a\0 = \0$ for every scalar $a$
\end{enumerate}

\begin{proof}
  Let $a \neq 1$ be a scalar from the field.

  We have $av - v = aw - w$, since both are equal to $\0$. Therefore $(a - 1)v = (a - 1)w$. Since
  $a \neq 1$, the scalar $a - 1$ has a multiplicative inverse, and multiplying both sides by
  $(a - 1)^{-1}$ gives $v = w$.
\end{proof}

\begin{proof}
  Let $u \neq \0$ be a vector. We have $u + v = u + w = \0$. Adding the additive inverse of $u$ to both
  sides gives $v = w$.
\end{proof}

\begin{mdframed}
\includegraphics[width=400pt]{img/linear-algebra-kun-pimbook-exercises--f0b5.png}
\end{mdframed}

\begin{proof}
  Using the linearity of $f$ and then the linearity of $g$:
  \begin{align*}
    (g \circ f)(ax + by)
    &= g(f(ax + by)) \\
    &= g(af(x) + bf(y)) \\
    &= ag(f(x)) + bg(f(y)) \\
    &= a(g \circ f)(x) + b(g \circ f)(y).
  \end{align*}
  Hence $g \circ f$ is linear.
\end{proof}

diff --git a/neural_networks.tex b/neural_networks.tex
new file mode 100644
index 0000000..d21d0c5
--- /dev/null
+++ b/neural_networks.tex
@@ -0,0 +1,12 @@

\begin{tabular}{l|l}
  $d$ & dimension of an input vector \\
  $K$ & number of output classification classes \\
\end{tabular}


\begin{enumerate}
\item A vanilla fully-connected classification network is a map $\R^d \to \R^K$.
\item A CNN is the same, except that a neuron in layer $l$ is connected only to a local subset of the
  neurons in layer $l-1$ (and the connection weights are shared across locations).
\end{enumerate}

diff --git a/xenops.tex b/xenops.tex
new file mode 100644
index 0000000..10f148f
--- /dev/null
+++ b/xenops.tex
@@ -0,0 +1,51 @@
\documentclass{article}
\begin{document}

Xenops is a $\text{\LaTeX}$ editing environment for Emacs.

All math, whether inline like $\dot{y}$ or display like
\begin{align*}
  \frac{\partial L}{\partial \dot{y}}
\end{align*}
is displayed as SVG.

Inline $\dot{x}$ and display math
\begin{align*}
  \frac{\partial L}{\partial \dot{x}}
\end{align*}
are converted to SVG as you type.

\end{document}