% No 'submit' option for the problems by themselves.
\documentclass{harvardml}
% Use the 'submit' option when you submit your solutions.
%\documentclass[submit]{harvardml}
% Put in your full name and email address.
\name{Your Name}
\email{email@fas.harvard.edu}
% List any people you worked with.
\collaborators{%
John Doe,
Jane Doe
}
% You don't need to change these.
\course{CS281-F13}
\assignment{Assignment \#2}
\duedate{11:59pm October 4, 2013}
% Useful macros.
\newcommand{\trans}{\mathsf{T}}
\newcommand{\bx}{\boldsymbol{x}}
\newcommand{\bw}{\boldsymbol{w}}
\newcommand{\distNorm}{\mathcal{N}}
\newcommand{\bzero}{\boldsymbol{0}}
\newcommand{\ident}{\mathbb{I}}
\begin{document}
\begin{problem}[10pts, Murphy]
Given that we have an estimate~$\hat{\bw}$ of the weights of a
linear regression model with Gaussian noise, show that the MLE of
the error variance is given by
\begin{align*}
\hat{\sigma}^2 &= \frac{1}{N}\sum_{n=1}^N(y_n - \bx_n^\trans\hat{\bw})^2.
\end{align*}
\end{problem}
\begin{problem}[15pts]
One intuitive way to summarize a probability density is via the mode,
as this is the ``most likely'' value in some sense. A common example
of this is using the maximum \textit{a posteriori} (MAP) estimate of a
model's parameters. In high dimensions, however, the mode becomes
less and less representative of typical samples. Consider variates
from a~$D$-dimensional zero mean spherical Gaussian with unit
variance:
\begin{align*}
\bx &\sim \distNorm(\bzero_D, \ident_D),
\end{align*}
where~$\bzero_D$ indicates a column vector of~$D$ zeros and~$\ident_D$
is a~${D\times D}$ identity matrix.
\begin{enumerate}
\item Compute the distribution that this implies over the distance
of these points from the origin. That is, compute the
distribution over~$\sqrt{\bx^\trans\bx}$, if~$\bx$ is a
realization from~$\distNorm(\bzero_D, \ident_D)$. (Hint: Consider
transformations of a Gamma distribution.)
\item Make a plot that shows this probability density function for
  several different values of~$D$, up to~${D=100}$.
\item Make a plot of the cumulative distribution function (CDF) over
this distance distribution for~${D=100}$. A closed-form solution
may be difficult to compute, so you can do this numerically.
(Hint: In Matlab, look up \texttt{cumtrapz}.)
\item From examining the CDF we can think about where most of the
mass lives as a function of radius. For example, most of the mass
for~${D=100}$ is within a thin spherical shell. From eyeballing
the plot, what are the inner and outer radii for the shell that
contains 90\% of the mass in this case?
\end{enumerate}
\end{problem}
\begin{problem}[15pts]
Consider a mixture model for a one-dimensional random variable~$X$
arising from the following generative procedure:
\begin{itemize}
\item With probability $\frac{1}{2}$,~$X$ is Gaussian with zero
mean and variance four.
\item With probability $\frac{3}{8}$,~$X$ is Laplace-distributed with
location five and scale two.
\item With probability $\frac{1}{8}$,~$X$ is uniform on~$(-2,-1.5)$.
\end{itemize}
\begin{enumerate}
\item Write the PDF for this mixture model.
\item Produce a plot of the probability density.
\item Draw 500 samples from this distribution and produce a normalized histogram.
\item Produce a plot that shows the 95\% central credible region.
This may require numeric integration.
\item Produce a plot that shows the 95\% high posterior density
region. This may require discretization and/or optimization.
\end{enumerate}
\end{problem}
\begin{problem}[30pts]
Here are some simple data to regress:
\begin{verbatim}
x = [-1.87 -1.76 -1.67 -1.22 -0.07 0.11 0.67 1.60 2.22 2.51]'
y = [0.06 1.67 0.54 -1.45 -0.18 -0.67 0.92 2.95 5.13 5.18]'
\end{verbatim}
Construct a Bayesian linear regression model using a basis of your
choosing (e.g., polynomial, sinusoids, radial basis functions).
Choose priors that seem sensible for the regression weights and the
Gaussian noise.
\begin{enumerate}
\item Identify your basis and your priors and explain why you chose
them.
\item Plot the data, as well as several typical posterior samples of
the function given the data.
\item Plot the 95\% central credible interval region of the
predictive density as a function of~$x$. That is, produce a plot
that shows the ``tube'' containing most of the functions that are
consistent with the data under your model.
\item There are probably different numbers of basis functions you
could choose for your model. For example, you could choose the
order of polynomial, or how many radial basis functions to put
down. Fix a choice of the noise, and then produce a bar plot for
several different such hypotheses that shows their marginal
likelihoods. Do the data support one hypothesis over the others?
Which one?
\end{enumerate}
\end{problem}
\begin{problem}[30pts, Hastie et al., Murphy]
In this problem, we'll apply logistic regression to a data set of spam
email. These data consist of 4601 email messages, from which 57
features have been extracted. These are as follows:
\begin{itemize}
\item $48$ features in $[0,100]$, giving the percentage of words in
a given message which match a given word on a list containing,
e.g., ``business'', ``free'', etc.
\item $6$ features in $[0,100]$, giving the percentage of characters
in the email that match characters on a list containing, e.g.,
``\$'', ``\#'', etc.
\item Feature 55: The average length of an uninterrupted sequence of
capital letters.
\item Feature 56: The length of the longest uninterrupted sequence
of capital letters.
\item Feature 57: The sum of the lengths of uninterrupted sequences
of capital letters.
\end{itemize}
There are files \texttt{spam.train.dat} and \texttt{spam.test.dat}
(available on the course website) in which each row is an email.
There are 3000 training and 1601 test examples. The final column in
each file indicates whether the email was spam.
\begin{enumerate}
\item Apply~$\ell_2$-regularized logistic regression. Use
cross-validation to determine an appropriate regularization
penalty. Report your procedure and the value you find. What
training and test performance do you get with this value?
\item There are different ways one might preprocess the data. One
typical thing to do is to ``standardize'' each input feature so
that it has mean zero and variance one. Do this standardization
and evaluate the model again. How do your results change?
\item In some data, what matters most is whether the data are zero
or non-zero, and not what the actual value is. Transform the
features to be binary in this way and retrain the model as above.
\item Alternatively, some features are best represented via their
logs. Transform the features, retrain the model and report
results as above.
\end{enumerate}
\end{problem}
\end{document}