\chapter{Theoretical Background}%
\label{chapter:theoretical_background}

In this chapter, the theoretical background necessary to understand this work is given. First, the notation used throughout is clarified. Next, the physical aspects are detailed, namely the modulation scheme and the channel model employed. A short introduction to channel coding with binary linear codes, and \ac{LDPC} codes in particular, is given, and the established methods of decoding \ac{LDPC} codes are briefly explained. Lastly, the optimization methods utilized are described.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Notation}
\label{sec:theo:Notation}
%
% TODOs
%
\begin{itemize}
  \item General remarks on notation (matrices, \ldots)
  \item Probabilistic quantities (random variables, \acp{PDF}, pdfs vs pmfs vs cdfs, \ldots)
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Preliminaries: Channel Model and Modulation}
\label{sec:theo:Preliminaries: Channel Model and Modulation}
%
% TODOs
%
\begin{itemize}
  \item \Ac{AWGN}
  \item \Ac{BPSK}
\end{itemize}
%
% Figure showing notation for entire coding / decoding process
%
\tikzstyle{box} = [rectangle, minimum width=1.5cm, minimum height=0.7cm, rounded corners=0.1cm, text centered, draw=black, fill=KITgreen!80]
\begin{figure}[htpb]
  \centering
  \begin{tikzpicture}[scale=1, transform shape]
    \node (in) {$\boldsymbol{c}$};
    \node[box, right=0.5cm of in] (bpskmap) {Mapper};
    \node[right=1.5cm of bpskmap, draw, circle, inner sep=0pt, minimum size=0.5cm] (add) {$+$};
    \node[below=0.5cm of add] (noise) {$\boldsymbol{z}$};
    \node[box, right=1.5cm of add] (decoder) {Decoder};
    \node[box, right=1.5cm of decoder] (demapper) {Demapper};
    \node[right=0.5cm of demapper] (out) {$\boldsymbol{\hat{c}}$};
    \node at ($(bpskmap.east)!0.5!(add.west) + (0,0.3cm)$) {$\boldsymbol{x}$};
    \node at ($(add.east)!0.5!(decoder.west) + (0,0.3cm)$) {$\boldsymbol{y}$};
    \node at ($(decoder.east)!0.5!(demapper.west) + (0,0.3cm)$) {$\boldsymbol{\hat{x}}$};
    \draw[->] (in) -- (bpskmap);
    \draw[->] (bpskmap) -- (add);
    \draw[->] (add) -- (decoder);
    \draw[->] (noise) -- (add);
    \draw[->] (decoder) -- (demapper);
    \draw[->] (demapper) -- (out);
  \end{tikzpicture}
  \caption{Overview of notation}
  \label{fig:notation}
\end{figure}
\todo{Note about $\tilde{\boldsymbol{c}}$ (and maybe $\tilde{\boldsymbol{x}}$?)}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Channel Coding with LDPC Codes}
\label{sec:theo:Channel Coding with LDPC Codes}
\begin{itemize}
  \item Introduction
  \item Binary linear codes
  \item \Ac{LDPC} codes (especially $i$, $j$, parity check matrix $\boldsymbol{H}$, $N\left( j \right)$ \& $N\left( i \right)$, etc.)
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Decoding LDPC Codes using Belief Propagation}
\label{sec:theo:Decoding LDPC Codes using Belief Propagation}
\begin{itemize}
  \item Introduction to message passing
  \item Overview of \ac{BP} algorithm
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Optimization Methods}
\label{sec:theo:Optimization Methods}
TODO:
\begin{itemize}
  \item Intro
  \item Proximal Decoding
\end{itemize}
\vspace{5mm}

Generally, any linear program can be expressed in \textit{standard form}%
\footnote{The inequality $\boldsymbol{x} \ge \boldsymbol{0}$ is to be interpreted componentwise.}
\cite[Sec.~1.1]{intro_to_lin_opt_book}:%
%
\begin{align}
\begin{alignedat}{2}
\text{minimize }\hspace{2mm} && \boldsymbol{\gamma}^\text{T} \boldsymbol{x} & \\
\text{subject to }\hspace{2mm} && \boldsymbol{A}\boldsymbol{x} & = \boldsymbol{b} \\
&& \boldsymbol{x} & \ge \boldsymbol{0}.
\end{alignedat}
\label{eq:theo:admm_standard}
\end{align}%
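%
For illustration, consider the (arbitrarily chosen) problem of minimizing $x_1 + 2x_2$ subject to $x_1 + x_2 \ge 1$ and $x_1, x_2 \ge 0$. Introducing a slack variable $s \ge 0$ turns the inequality constraint into an equality, $x_1 + x_2 - s = 1$, and the problem takes the standard form (\ref{eq:theo:admm_standard}) with
%
\begin{align*}
\boldsymbol{x} = \begin{pmatrix} x_1 & x_2 & s \end{pmatrix}^\text{T}, \hspace{5mm}
\boldsymbol{\gamma} = \begin{pmatrix} 1 & 2 & 0 \end{pmatrix}^\text{T}, \hspace{5mm}
\boldsymbol{A} = \begin{pmatrix} 1 & 1 & -1 \end{pmatrix}, \hspace{5mm}
\boldsymbol{b} = 1
.\end{align*}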
%
A technique called \textit{Lagrangian relaxation} \cite[Sec.~11.4]{intro_to_lin_opt_book} can then be applied: the equality constraints are moved into the objective function itself, weighted by a newly introduced vector $\boldsymbol{\lambda}$. A new, relaxed problem is formulated:
%
\begin{align}
\begin{aligned}
\text{minimize }\hspace{2mm} & \boldsymbol{\gamma}^\text{T}\boldsymbol{x} + \boldsymbol{\lambda}^\text{T}\left(\boldsymbol{b} - \boldsymbol{A}\boldsymbol{x} \right) \\
\text{subject to }\hspace{2mm} & \boldsymbol{x} \ge \boldsymbol{0},
\end{aligned}
\label{eq:theo:admm_relaxed}
\end{align}%
%
the new objective function being the \textit{Lagrangian}
%
\begin{align*}
\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda} \right) = \boldsymbol{\gamma}^\text{T}\boldsymbol{x} + \boldsymbol{\lambda}^\text{T}\left(\boldsymbol{b} - \boldsymbol{A}\boldsymbol{x} \right)
.\end{align*}%
%
This problem is not directly equivalent to the original one, as its solution now depends on the choice of the \textit{Lagrange multipliers} $\boldsymbol{\lambda}$. Importantly, however, for any choice of $\boldsymbol{\lambda}$, the minimum of the objective function (hereafter called the \textit{optimal objective}) of the relaxed problem (\ref{eq:theo:admm_relaxed}) is a lower bound for the optimal objective of the original problem (\ref{eq:theo:admm_standard}) \cite[Sec.~4.1]{intro_to_lin_opt_book}:%
%
\begin{align*}
\min_{\substack{\boldsymbol{x} \ge \boldsymbol{0} \\ \phantom{a}}} \mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda} \right) \le \min_{\substack{\boldsymbol{x} \ge \boldsymbol{0} \\ \boldsymbol{A}\boldsymbol{x} = \boldsymbol{b}}} \boldsymbol{\gamma}^\text{T}\boldsymbol{x}
.\end{align*}
%
Furthermore, for linear programs that admit an optimal solution, \textit{strong duality} always holds \cite[Theorem 4.4]{intro_to_lin_opt_book}: the tightest such lower bound actually attains the optimal objective itself. In other words, with the optimal choice of $\boldsymbol{\lambda}$, the optimal objectives of the problems (\ref{eq:theo:admm_relaxed}) and (\ref{eq:theo:admm_standard}) have the same value:
%
\begin{align*}
\max_{\boldsymbol{\lambda}} \, \min_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda} \right) = \min_{\substack{\boldsymbol{x} \ge \boldsymbol{0} \\ \boldsymbol{A}\boldsymbol{x} = \boldsymbol{b}}} \boldsymbol{\gamma}^\text{T}\boldsymbol{x}
.\end{align*}
%
Thus, we can define the \textit{dual problem} as the search for the tightest lower bound:%
%
\begin{align}
\text{maximize }\hspace{2mm} & \min_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L} \left( \boldsymbol{x}, \boldsymbol{\lambda} \right)
\label{eq:theo:dual}
.\end{align}
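%
As a quick numerical sanity check (not taken from the cited references; the problem data are chosen arbitrarily), the following Python sketch solves a small instance of problem (\ref{eq:theo:admm_standard}) and of its dual with \texttt{scipy.optimize.linprog} and confirms that the two optimal objectives coincide:
\begin{verbatim}
import numpy as np
from scipy.optimize import linprog

# Arbitrary toy data: minimize gamma^T x  s.t.  A x = b,  x >= 0.
gamma = np.array([2.0, 3.0, 1.0])
A = np.array([[1.0, 1.0, 1.0]])
b = np.array([1.0])

# Primal problem; linprog bounds the variables to x >= 0 by default.
primal = linprog(c=gamma, A_eq=A, b_eq=b)

# Dual problem: min_{x >= 0} L(x, lam) stays finite only if
# gamma - A^T lam >= 0, in which case it equals lam^T b. The dual is
# therefore the LP "maximize b^T lam s.t. A^T lam <= gamma", written
# below as a minimization of -b^T lam; lam itself is unconstrained.
dual = linprog(c=-b, A_ub=A.T, b_ub=gamma, bounds=[(None, None)])

print("primal optimal objective:", primal.fun)  # 1.0
print("dual optimal objective:  ", -dual.fun)   # 1.0, as strong duality predicts
\end{verbatim}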
%
The solution $\boldsymbol{x}_{\text{opt}}$ to problem (\ref{eq:theo:admm_standard}) can be recovered from the solution $\boldsymbol{\lambda}_{\text{opt}}$ to problem (\ref{eq:theo:dual}) by computing \cite[Sec.~2.1]{admm_distr_stats}%
%
\begin{align}
\boldsymbol{x}_{\text{opt}} = \argmin_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda}_{\text{opt}} \right)
\label{eq:theo:admm_obtain_primal}
.\end{align}
%
The dual problem can then be solved iteratively using \textit{dual ascent}: starting with an initial estimate of $\boldsymbol{\lambda}$, calculate an estimate for $\boldsymbol{x}$ using equation (\ref{eq:theo:admm_obtain_primal}); then, update $\boldsymbol{\lambda}$ using gradient ascent \cite[Sec.~2.1]{admm_distr_stats}:%
%
\begin{align*}
\boldsymbol{x} &\leftarrow \argmin_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda} \right) \\
\boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda} + \alpha\left( \boldsymbol{b} - \boldsymbol{A}\boldsymbol{x} \right), \hspace{5mm} \alpha > 0
.\end{align*}
%
The algorithm can be improved by observing that when the objective function is separable in $\boldsymbol{x}$, the Lagrangian is separable as well:
%
\begin{align*}
\text{minimize }\hspace{5mm} & \sum_{i=1}^{N} g_i\left( \boldsymbol{x}_i \right) \\
\text{subject to }\hspace{5mm} & \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i = \boldsymbol{b},
\end{align*}
with the corresponding Lagrangian
\begin{align*}
\mathcal{L}\left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda} \right) = \sum_{i=1}^{N} g_i\left( \boldsymbol{x}_i \right) + \boldsymbol{\lambda}^\text{T} \left( \boldsymbol{b} - \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right)
.\end{align*}%
%
The minimization of each term can then happen in parallel, in a distributed fashion \cite[Sec.~2.2]{admm_distr_stats}. This modified version of dual ascent is called \textit{dual decomposition}:
%
\begin{align*}
\boldsymbol{x}_i &\leftarrow \argmin_{\boldsymbol{x}_i}\mathcal{L}\left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda}\right) \hspace{5mm} \forall i \in [1:N]\\
\boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda} + \alpha\left( \boldsymbol{b} - \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right), \hspace{5mm} \alpha > 0
.\end{align*}
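%
The following minimal NumPy sketch illustrates dual decomposition on a toy instance. It assumes (purely for illustration) quadratic terms $g_i\left( \boldsymbol{x}_i \right) = \frac{1}{2}\lVert \boldsymbol{x}_i - \boldsymbol{v}_i \rVert_2^2$ with arbitrary $\boldsymbol{v}_i$, so that the blockwise minimizations have closed-form solutions:
\begin{verbatim}
import numpy as np

# Arbitrary toy data: N = 2 blocks, m coupling constraints.
rng = np.random.default_rng(0)
m, n, N = 3, 4, 2
A = [rng.standard_normal((m, n)) for _ in range(N)]
v = [rng.standard_normal(n) for _ in range(N)]
b = rng.standard_normal(m)

# Safe step size: the dual function is a concave quadratic with
# curvature sum_i A_i A_i^T, so alpha < 2 / lambda_max guarantees
# convergence of the gradient ascent.
alpha = 1.0 / np.linalg.eigvalsh(sum(Ai @ Ai.T for Ai in A)).max()

lam = np.zeros(m)
for _ in range(5000):
    # Blockwise minimization of the Lagrangian: for the quadratic
    # g_i(x_i) = 0.5 * ||x_i - v_i||^2 assumed here, the minimizer of
    # g_i(x_i) - lam^T A_i x_i is x_i = v_i + A_i^T lam. Each block is
    # independent of the others and could be computed in parallel.
    x = [v[i] + A[i].T @ lam for i in range(N)]
    # Gradient-ascent step on the dual function.
    lam = lam + alpha * (b - sum(A[i] @ x[i] for i in range(N)))

print("constraint residual:",
      np.linalg.norm(sum(A[i] @ x[i] for i in range(N)) - b))
\end{verbatim}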
%
The \ac{ADMM} works in the same way as dual decomposition; it differs only in the use of an \textit{augmented Lagrangian} $\mathcal{L}_\mu\left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda} \right)$, which makes the convergence behavior more robust. The augmented Lagrangian extends the ordinary one by an additional penalty term, scaled by the penalty parameter $\mu$:
%
\begin{align*}
\mathcal{L}_\mu \left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda} \right) = \underbrace{\sum_{i=1}^{N} g_i\left( \boldsymbol{x}_i \right) + \boldsymbol{\lambda}^\text{T}\left( \boldsymbol{b} - \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right)}_{\text{Ordinary Lagrangian}} + \underbrace{\frac{\mu}{2}\left\lVert \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i - \boldsymbol{b} \right\rVert_2^2}_{\text{Penalty term}}, \hspace{5mm} \mu > 0
.\end{align*}
%
The steps to solve the problem are the same as with dual decomposition, with two differences: the step size is set to the penalty parameter $\mu$, and, since the penalty term couples the blocks $\boldsymbol{x}_i$, the blockwise minimizations are carried out sequentially, in an alternating fashion (hence the name), rather than in parallel:%
%
\begin{align*}
\boldsymbol{x}_i &\leftarrow \argmin_{\boldsymbol{x}_i}\mathcal{L}_\mu\left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda}\right) \hspace{5mm} \text{sequentially for } i = 1, \ldots, N\\
\boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda} + \mu\left( \boldsymbol{b} - \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right)
.\end{align*}
%
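Continuing the toy instance from the dual decomposition sketch above (again with quadratic $g_i$ assumed purely for illustration, and $N = 2$ blocks), the \ac{ADMM} updates can be sketched as follows; each $\boldsymbol{x}_i$-update is now a small linear system because of the quadratic penalty term:
\begin{verbatim}
import numpy as np

# Same arbitrary toy data as before: g_i(x_i) = 0.5 * ||x_i - v_i||^2.
rng = np.random.default_rng(0)
m, n = 3, 4
A1, A2 = rng.standard_normal((m, n)), rng.standard_normal((m, n))
v1, v2 = rng.standard_normal(n), rng.standard_normal(n)
b = rng.standard_normal(m)
mu = 1.0  # penalty parameter, doubling as the dual step size

x1, x2 = np.zeros(n), np.zeros(n)
lam = np.zeros(m)
for _ in range(200):
    # x1-update: minimize L_mu over x1 with x2 and lam held fixed;
    # the objective is quadratic, so zeroing its gradient gives
    # (I + mu A1^T A1) x1 = v1 + A1^T lam + mu A1^T (b - A2 x2).
    x1 = np.linalg.solve(np.eye(n) + mu * A1.T @ A1,
                         v1 + A1.T @ lam + mu * A1.T @ (b - A2 @ x2))
    # x2-update: same, but already using the new x1 (alternating order).
    x2 = np.linalg.solve(np.eye(n) + mu * A2.T @ A2,
                         v2 + A2.T @ lam + mu * A2.T @ (b - A1 @ x1))
    # Dual update with step size mu.
    lam = lam + mu * (b - A1 @ x1 - A2 @ x2)

print("constraint residual:", np.linalg.norm(A1 @ x1 + A2 @ x2 - b))
\end{verbatim}
For such small quadratic problems the optimum could of course be computed directly; the sketch merely illustrates the mechanics of the alternating updates.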