\chapter{Decoding Techniques}%
\label{chapter:decoding_techniques}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Decoding using Optimization Methods}%
\label{sec:dec:Decoding using Optimization Methods}
%
% TODOs
%
\begin{itemize}
\item General methodology
\end{itemize}
%
% Figure showing decoding space
%
\begin{figure}[H]
\centering
\tikzset{codeword/.style={color=KITblue, fill=KITblue,
draw, circle, inner sep=0pt, minimum size=4pt}}
\tdplotsetmaincoords{60}{245}
\begin{tikzpicture}[scale=1, transform shape, tdplot_main_coords]
% Cube
\draw[dashed] (0, 0, 0) -- (2, 0, 0);
\draw[dashed] (2, 0, 0) -- (2, 0, 2);
\draw[] (2, 0, 2) -- (0, 0, 2);
\draw[] (0, 0, 2) -- (0, 0, 0);
\draw[] (0, 2, 0) -- (2, 2, 0);
\draw[] (2, 2, 0) -- (2, 2, 2);
\draw[] (2, 2, 2) -- (0, 2, 2);
\draw[] (0, 2, 2) -- (0, 2, 0);
\draw[] (0, 0, 0) -- (0, 2, 0);
\draw[dashed] (2, 0, 0) -- (2, 2, 0);
\draw[] (2, 0, 2) -- (2, 2, 2);
\draw[] (0, 0, 2) -- (0, 2, 2);
% Polytope Annotations
\node[codeword] (c000) at (0, 0, 0) {};% {$\left( 0, 0, 0 \right) $};
\node[codeword] (c101) at (2, 0, 2) {};% {$\left( 1, 0, 1 \right) $};
\node[codeword] (c110) at (2, 2, 0) {};% {$\left( 1, 1, 0 \right) $};
\node[codeword] (c011) at (0, 2, 2) {};% {$\left( 0, 1, 1 \right) $};
\node[color=KITblue, right=0cm of c000] {$\left( 0, 0, 0 \right) $};
\node[color=KITblue, above=0cm of c101] {$\left( 1, 0, 1 \right) $};
\node[color=KITblue, left=0cm of c110] {$\left( 1, 1, 0 \right) $};
\node[color=KITblue, left=-0.1cm of c011] {$\left( 0, 1, 1 \right) $};
% f
\node[color=KITgreen, fill=KITgreen,
draw, circle, inner sep=0pt, minimum size=4pt] (f) at (0.9, 0.7, 1) {};
\node[color=KITgreen, right=0cm of f] {$\boldsymbol{f}$};
\end{tikzpicture}
\caption{Hypercube ($n=3$) and valid codewords for a single parity-check code}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{LP Decoding using ADMM}%
\label{sec:dec:LP Decoding using ADMM}
\begin{itemize}
\item Equivalent \ac{ML} optimization problem
\item \Ac{LP} relaxation
\item \Ac{ADMM} as a solver
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Proximal Decoding}%
\label{sec:dec:Proximal Decoding}
Proximal decoding was proposed by Wadayama et al.~\cite{proximal_paper}.
With this decoding algorithm, the objective function is minimized using
the proximal gradient method.
In contrast to \ac{LP} decoding, the objective function is based on a
non-convex optimization formulation of the \ac{MAP} decoding problem.
In order to derive the objective function, the authors reformulate the
\ac{MAP} decoding problem:%
%
\begin{align}
\hat{\boldsymbol{x}} = \argmax_{\boldsymbol{x} \in \mathbb{R}^{n}}
f_{\boldsymbol{X} \mid \boldsymbol{Y}}
\left( \boldsymbol{x} \mid \boldsymbol{y} \right)
= \argmax_{\boldsymbol{x} \in \mathbb{R}^{n}} f_{\boldsymbol{Y} \mid \boldsymbol{X}}
\left( \boldsymbol{y} \mid \boldsymbol{x} \right)
f_{\boldsymbol{X}}\left( \boldsymbol{x} \right)%
\label{eq:prox:vanilla_MAP}
\end{align}%
%
The likelihood is usually a known function determined by the channel model.
In order to rewrite the prior \ac{PDF}
$f_{\boldsymbol{X}}\left( \boldsymbol{x} \right)$,
the so-called \textit{code-constraint polynomial} is introduced:%
%
\begin{align}
h\left( \boldsymbol{x} \right) = \sum_{j=1}^{n} \left( x_j^2-1 \right) ^2
+ \sum_{i=1}^{m} \left[
\left( \prod_{j\in \mathcal{A}\left( i \right) } x_j \right) -1 \right] ^2%
\label{eq:prox:ccp}
\end{align}%
%
The intention of this function is to provide a way to penalize vectors far
from a codeword and favor those close to a codeword.
To achieve this, the polynomial is composed of two parts: one term
representing the bipolar constraint, providing for a discrete solution of the
continuous optimization problem, and one term representing the parity
constraint, assuming the role of the parity-check matrix $\boldsymbol{H}$.
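As an illustration of how \ref{eq:prox:ccp} can be evaluated, the following
minimal Python sketch computes $h\left( \boldsymbol{x} \right)$ for a given
parity-check matrix. It assumes NumPy and a binary matrix \texttt{H} whose
$i$-th row encodes the index set $\mathcal{A}\left( i \right)$; the function
name and setup are illustrative and not part of \cite{proximal_paper}.
\begin{figure}[H]
\centering
\begin{genericAlgorithm}[caption={}, label={}]
import numpy as np

def code_constraint_poly(x: np.ndarray, H: np.ndarray) -> float:
    # Bipolar term: penalizes entries far from +-1.
    bipolar = np.sum((x**2 - 1.0) ** 2)
    # Parity term: the product over each index set A(i) should equal 1.
    parity = 0.0
    for row in H:
        idx = np.flatnonzero(row)  # index set A(i) of check i
        parity += (np.prod(x[idx]) - 1.0) ** 2
    return bipolar + parity
\end{genericAlgorithm}
\caption{Illustrative evaluation of the code-constraint polynomial}
\label{fig:prox:ccp_sketch}
\end{figure}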
%
The prior \ac{PDF} is then approximated using the code-constraint polynomial:%
%
\begin{align}
f_{\boldsymbol{X}}\left( \boldsymbol{x} \right) =
\frac{1}{\left| \mathcal{C}\left( \boldsymbol{H} \right) \right| }
\sum_{\boldsymbol{c} \in \mathcal{C}\left( \boldsymbol{H} \right) }
\delta\left( \boldsymbol{x} - \left( -1 \right) ^{\boldsymbol{c}}\right)
\approx \frac{1}{Z}e^{-\gamma h\left( \boldsymbol{x} \right) }%
\label{eq:prox:prior_pdf_approx}
\end{align}%
%
The authors justify this approximation by arguing that for
$\gamma \rightarrow \infty$, the right-hand side approaches the left-hand
side. In \ref{eq:prox:vanilla_MAP}, the prior \ac{PDF}
$f_{\boldsymbol{X}}\left( \boldsymbol{x} \right) $ can then be replaced by
the approximation from \ref{eq:prox:prior_pdf_approx}, and the likelihood
can be expressed in terms of the negative log-likelihood
$L\left( \boldsymbol{y} \mid \boldsymbol{x} \right)$ as
$f_{\boldsymbol{Y} \mid \boldsymbol{X}}\left( \boldsymbol{y} \mid \boldsymbol{x} \right)
= e^{- L\left( \boldsymbol{y} \mid \boldsymbol{x} \right) }$:%
%
\begin{align}
\hat{\boldsymbol{x}} &= \argmax_{\boldsymbol{x} \in \mathbb{R}^{n}}
e^{- L\left( \boldsymbol{y} \mid \boldsymbol{x} \right) }
e^{-\gamma h\left( \boldsymbol{x} \right) } \nonumber \\
&= \argmin_{\boldsymbol{x} \in \mathbb{R}^n} \left(
L\left( \boldsymbol{y} \mid \boldsymbol{x} \right)
+ \gamma h\left( \boldsymbol{x} \right)
\right)%
\label{eq:prox:approx_map_problem}
\end{align}%
%
Thus, with proximal decoding, the objective function
$f\left( \boldsymbol{x} \right)$ to be minimized is%
%
\begin{align}
f\left( \boldsymbol{x} \right) = L\left( \boldsymbol{y} \mid \boldsymbol{x} \right)
+ \gamma h\left( \boldsymbol{x} \right)%
\label{eq:prox:objective_function}
.\end{align}
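For an additive white Gaussian noise (AWGN) channel with bipolar
transmission and noise variance $\sigma^2$, for example, the negative
log-likelihood is, up to an additive constant,
$L\left( \boldsymbol{y} \mid \boldsymbol{x} \right) =
\frac{1}{2\sigma^2} \lVert \boldsymbol{y} - \boldsymbol{x} \rVert^2$,
so that the objective function becomes
$f\left( \boldsymbol{x} \right) =
\frac{1}{2\sigma^2} \lVert \boldsymbol{y} - \boldsymbol{x} \rVert^2
+ \gamma h\left( \boldsymbol{x} \right)$.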
To solve the approximate \ac{MAP} decoding problem, the two parts
of \ref{eq:prox:approx_map_problem} are considered separately from one
another: the minimization of the objective function occurs in an alternating
manner, switching between the minimization of the negative log-likelihood
$L\left( \boldsymbol{y} \mid \boldsymbol{x} \right) $ and of the scaled
code-constraint polynomial $\gamma h\left( \boldsymbol{x} \right) $.
Two helper variables, $\boldsymbol{r}$ and $\boldsymbol{s}$, are introduced,
describing the results of the two steps.
The first step, minimizing the negative log-likelihood using gradient
descent, yields%
%
\begin{align*}
\boldsymbol{r} \leftarrow \boldsymbol{s} - \omega \nabla
L\left( \boldsymbol{y} \mid \boldsymbol{s} \right),
\hspace{5mm}\omega > 0
.\end{align*}%
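In the AWGN example from above, the gradient is
$\nabla L\left( \boldsymbol{y} \mid \boldsymbol{s} \right) =
\left( \boldsymbol{s} - \boldsymbol{y} \right) / \sigma^2$,
so this step simply pulls the current estimate back towards the received
vector $\boldsymbol{y}$.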
%
For the second step, minimizing the scaled code-constraint polynomial using
the proximal gradient method, the proximal operator of
$\gamma h\left( \boldsymbol{x} \right) $ has to be computed and is
immediately approximated by a gradient-descent step:%
%
\begin{align*}
\text{prox}_{\gamma h} \left( \boldsymbol{x} \right) &\equiv
\argmin_{\boldsymbol{t} \in \mathbb{R}^n}
\left( \gamma h\left( \boldsymbol{t} \right) +
\frac{1}{2} \lVert \boldsymbol{t} - \boldsymbol{x} \rVert^2 \right)\\
&\approx \boldsymbol{x} - \gamma \nabla h \left( \boldsymbol{x} \right),
\hspace{5mm} \gamma \text{ small}
.\end{align*}%
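For reference, differentiating \ref{eq:prox:ccp} term by term gives the
gradient used in this step:%
%
\begin{align*}
\left[ \nabla h\left( \boldsymbol{x} \right) \right]_k
= 4 x_k \left( x_k^2 - 1 \right)
+ 2 \sum_{i :\, k \in \mathcal{A}\left( i \right) } \left[
\left( \prod_{j\in \mathcal{A}\left( i \right) } x_j \right) -1 \right]
\prod_{j\in \mathcal{A}\left( i \right) \setminus \left\{ k \right\} } x_j
.\end{align*}%
%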
%
The second step thus becomes%
%
\begin{align*}
\boldsymbol{s} \leftarrow \boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right),
\hspace{5mm}\gamma > 0,\text{ small}
.\end{align*}
%
While the approximation of the prior \ac{PDF} made in \ref{eq:prox:prior_pdf_approx}
theoretically becomes better
with larger $\gamma$, the constraint that $\gamma$ be small is important,
as it keeps the effect of $h\left( \boldsymbol{x} \right) $ on the landscape
of the objective function small.
Otherwise, unwanted stationary points, including local minima, are introduced.
The authors note that in practice, the value of $\gamma$ should be adjusted
according to the decoding performance.
The iterative decoding process resulting from these considerations is shown in
Figure~\ref{fig:prox:alg}.
\begin{figure}[H]
\centering
\begin{genericAlgorithm}[caption={}, label={}]
$\boldsymbol{s} \leftarrow \boldsymbol{0}$
for $K$ iterations do
    $\boldsymbol{r} \leftarrow \boldsymbol{s} - \omega \nabla L \left( \boldsymbol{y} \mid \boldsymbol{s} \right) $
    $\boldsymbol{s} \leftarrow \boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right) $
    $\hat{\boldsymbol{x}} \leftarrow \text{sign}\left( \boldsymbol{s} \right) $
    $\hat{\boldsymbol{c}} \leftarrow \left( \boldsymbol{1} - \hat{\boldsymbol{x}} \right) / 2$
    if $\boldsymbol{H}\hat{\boldsymbol{c}} = \boldsymbol{0}$ then
        return $\hat{\boldsymbol{c}}$
    end if
end for
return $\hat{\boldsymbol{c}}$
\end{genericAlgorithm}
\caption{Proximal decoding algorithm}
\label{fig:prox:alg}
\end{figure}
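A complete, minimal NumPy sketch of this loop, assuming the AWGN gradient
from above and the illustrative conventions of
Figure~\ref{fig:prox:ccp_sketch}, could look as follows; all names and
parameter values are illustrative and not prescribed by \cite{proximal_paper}.
\begin{figure}[H]
\centering
\begin{genericAlgorithm}[caption={}, label={}]
import numpy as np

def grad_h(x: np.ndarray, H: np.ndarray) -> np.ndarray:
    # Gradient of the code-constraint polynomial h.
    g = 4.0 * x * (x**2 - 1.0)  # bipolar term
    for row in H:
        idx = np.flatnonzero(row)  # index set A(i)
        prod = np.prod(x[idx])
        for k in idx:
            # Product over A(i) excluding k.
            g[k] += 2.0 * (prod - 1.0) * np.prod(x[idx[idx != k]])
    return g

def proximal_decode(y, H, sigma2, omega=0.05, gamma=0.05, K=100):
    s = np.zeros_like(y, dtype=float)
    c_hat = np.zeros(H.shape[1], dtype=int)
    for _ in range(K):
        r = s - omega * (s - y) / sigma2   # gradient step on L (AWGN)
        s = r - gamma * grad_h(r, H)       # approximate proximal step
        x_hat = np.sign(s)
        c_hat = ((1 - x_hat) // 2).astype(int)  # bipolar -> binary
        if not np.any((H @ c_hat) % 2):    # parity check: H c = 0 (mod 2)
            break
    return c_hat
\end{genericAlgorithm}
\caption{Illustrative NumPy implementation of the proximal decoding loop}
\label{fig:prox:alg_sketch}
\end{figure}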