From 05be3d21b65df7b7deda0c86310fb874ca7f5202 Mon Sep 17 00:00:00 2001
From: Andreas Tsouchlos <an.tsouchlos@gmail.com>
Date: Tue, 14 Feb 2023 15:34:39 +0100
Subject: [PATCH] First draft of proximal decoding background

---
 latex/thesis/chapters/decoding_techniques.tex | 157 +++++++++++++++++-
 1 file changed, 150 insertions(+), 7 deletions(-)

diff --git a/latex/thesis/chapters/decoding_techniques.tex b/latex/thesis/chapters/decoding_techniques.tex
index 241bc79..1ae4ed3 100644
--- a/latex/thesis/chapters/decoding_techniques.tex
+++ b/latex/thesis/chapters/decoding_techniques.tex
@@ -72,9 +72,9 @@
 \label{sec:dec:LP Decoding using ADMM}
 
 \begin{itemize}
-    \item Equivalent ML optimization problem
-    \item LP relaxation
-    \item ADMM as a solver
+    \item Equivalent \ac{ML} optimization problem
+    \item \Ac{LP} relaxation
+    \item \Ac{ADMM} as a solver
 \end{itemize}
 
 
@@ -82,8 +82,151 @@
 \section{Proximal Decoding}%
 \label{sec:dec:Proximal Decoding}
 
-\begin{itemize}
-    \item Formulation of optimization problem
-    \item Proximal gradient method as a solver
-\end{itemize}
+Proximal decoding was proposed by Wadayama et al.~\cite{proximal_paper}.
+With this decoding algorithm, the objective function is minimized using
+the proximal gradient method.
+In contrast to \ac{LP} decoding, the objective function is based on a
+non-convex optimization formulation of the \ac{MAP} decoding problem.
+
+In order to derive the objective function, the authors reformulate the
+\ac{MAP} decoding problem:%
+%
+\begin{align}
+    \hat{\boldsymbol{x}} = \argmax_{\boldsymbol{x} \in \mathbb{R}^{n}}
+        f_{\boldsymbol{X} \mid \boldsymbol{Y}}
+            \left( \boldsymbol{x} \mid \boldsymbol{y} \right)
+    = \argmax_{\boldsymbol{x} \in \mathbb{R}^{n}} f_{\boldsymbol{Y} \mid \boldsymbol{X}}
+        \left( \boldsymbol{y} \mid \boldsymbol{x} \right)
+        f_{\boldsymbol{X}}\left( \boldsymbol{x} \right)%
+    \label{eq:prox:vanilla_MAP}
+\end{align}%
+%
+The likelihood is usually a known function determined by the channel model.
+In order to rewrite the prior \ac{PDF}
+$f_{\boldsymbol{X}}\left( \boldsymbol{x} \right)$,
+the so-called \textit{code-constraint polynomial} is introduced:%
+%
+\begin{align}
+    h\left( \boldsymbol{x} \right) = \sum_{j=1}^{n} \left( x_j^2-1 \right) ^2
+        + \sum_{i=1}^{m} \left[
+            \left( \prod_{j\in \mathcal{A}\left( i \right) } x_j \right) -1 \right] ^2%
+    \label{eq:prox:ccp}
+\end{align}%
+%
+The intention of this function is to provide a way to penalize vectors far
+from a codeword and favor those close to a codeword.
+In order to achieve this, the polynomial is composed of two parts: one term
+representing the bipolar constraint, providing for a discrete solution of the
+continuous optimization problem, and one term representing the parity
+constraint, accommodating the role of the parity-check matrix $\boldsymbol{H}$.
+%
+The prior \ac{PDF} is then approximated using the code-constraint polynomial\todo{Italic?}:%
+%
+\begin{align}
+    f_{\boldsymbol{X}}\left( \boldsymbol{x} \right) =
+        \frac{1}{\left| \mathcal{C}\left( \boldsymbol{H} \right)  \right| }
+            \sum_{\boldsymbol{c} \in \mathcal{C}\left( \boldsymbol{H} \right) }
+                \delta\left( \boldsymbol{x} - \left( -1 \right) ^{\boldsymbol{c}}\right)
+    \approx \frac{1}{Z}e^{-\gamma h\left( \boldsymbol{x} \right) }%
+    \label{eq:prox:prior_pdf_approx}
+\end{align}%
+%
+The authors justify this approximation by arguing that for
+$\gamma \rightarrow \infty$, the right-hand side approaches the left-hand
+side. In \eqref{eq:prox:vanilla_MAP} the prior \ac{PDF}
+$f_{\boldsymbol{X}}\left( \boldsymbol{x} \right) $ can then be replaced by its
+approximation \eqref{eq:prox:prior_pdf_approx} and the likelihood can be rewritten using
+the negative log-likelihood
+$f_{\boldsymbol{Y} \mid \boldsymbol{X}}\left( \boldsymbol{y} \mid \boldsymbol{x} \right)
+    = e^{- L\left( \boldsymbol{y} \mid \boldsymbol{x} \right) }$:%
+%
+\begin{align}
+    \hat{\boldsymbol{x}} &= \argmax_{\boldsymbol{x} \in \mathbb{R}^{n}}
+        e^{- L\left( \boldsymbol{y} \mid \boldsymbol{x} \right) }
+        e^{-\gamma h\left( \boldsymbol{x} \right) } \nonumber \\
+    &= \argmin_{\boldsymbol{x} \in \mathbb{R}^n} \left(
+        L\left( \boldsymbol{y} \mid \boldsymbol{x} \right)
+        + \gamma h\left( \boldsymbol{x} \right) 
+        \right)%
+    \label{eq:prox:approx_map_problem}
+\end{align}%
+%
+Thus, with proximal decoding, the objective function
+$f\left( \boldsymbol{x} \right)$ to be minimized is%
+%
+\begin{align}
+    f\left( \boldsymbol{x} \right) = L\left( \boldsymbol{y} \mid \boldsymbol{x} \right)
+        + \gamma h\left( \boldsymbol{x} \right)%
+    \label{eq:prox:objective_function}
+.\end{align}\todo{Dot after equations?}
+
+For the solution of the approximate \ac{MAP} decoding problem, the two parts
+of \eqref{eq:prox:approx_map_problem} are considered separately from one
+another: the minimization of the objective function occurs in an alternating
+manner, switching between the minimization of the negative log-likelihood
+$L\left( \boldsymbol{y} \mid \boldsymbol{x} \right) $ and the scaled
+code-constraint polynomial $\gamma h\left( \boldsymbol{x} \right) $.
+Two helper variables, $\boldsymbol{r}$ and $\boldsymbol{s}$, are introduced,
+describing the result of each of the two steps.
+The first step, minimizing the negative log-likelihood using gradient descent, yields%
+%
+\begin{align*}
+    \boldsymbol{r} \leftarrow \boldsymbol{s} - \omega \nabla
+        L\left( \boldsymbol{y} \mid \boldsymbol{s} \right),
+    \hspace{5mm}\omega > 0
+.\end{align*}%
+%
+For the second step, minimizing the scaled code-constraint polynomial using
+the proximal gradient method, the proximal operator of
+$\gamma h\left( \boldsymbol{x} \right) $ has to be computed and is
+immediately approximated by a gradient-descent step:%
+%
+\begin{align*}
+    \text{prox}_{\gamma h} \left( \boldsymbol{x} \right) &\equiv
+        \argmin_{\boldsymbol{t} \in \mathbb{R}^n}
+            \left( \gamma h\left( \boldsymbol{t} \right) +
+                \frac{1}{2} \lVert \boldsymbol{t} - \boldsymbol{x} \rVert^2 \right)\\
+    &\approx \boldsymbol{x} - \gamma \nabla h \left( \boldsymbol{x} \right),
+    \hspace{5mm} \gamma \text{ small}
+.\end{align*}%
+%
+The second step thus becomes%
+%
+\begin{align*}
+    \boldsymbol{s} \leftarrow \boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right),
+    \hspace{5mm}\gamma > 0,\text{ small}
+.\end{align*}
+%
+While the approximation of the prior \ac{PDF} made in \eqref{eq:prox:prior_pdf_approx}
+theoretically becomes better
+with larger $\gamma$, the constraint that $\gamma$ be small is important,
+as it keeps the effect of $h\left( \boldsymbol{x} \right) $ on the landscape
+of the objective function small.
+Otherwise, unwanted stationary points, including local minima, are introduced.
+The authors say that in practice, the value of $\gamma$ should be adjusted
+according to the decoding performance.
+
+The iterative decoding process resulting from this consideration is shown in
+figure~\ref{fig:prox:alg}.
+
+\begin{figure}[H]
+    \centering
+
+    \begin{genericAlgorithm}[caption={}, label={}]
+$\boldsymbol{s} \leftarrow \boldsymbol{0}$
+for $K$ iterations do
+    $\boldsymbol{r} \leftarrow \boldsymbol{s} - \omega \nabla L \left( \boldsymbol{y} \mid \boldsymbol{s} \right) $
+    $\boldsymbol{s} \leftarrow \boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right) $
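+    $\boldsymbol{\hat{x}} \leftarrow \text{sign}\left( \boldsymbol{s} \right) $, $\boldsymbol{\hat{c}} \leftarrow \frac{1}{2}\left( \boldsymbol{1} - \boldsymbol{\hat{x}} \right) $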
+    if $\boldsymbol{H}\boldsymbol{\hat{c}} = \boldsymbol{0}$ do
+        return $\boldsymbol{\hat{c}}$
+    end if
+end for
+return $\boldsymbol{\hat{c}}$
+    \end{genericAlgorithm}
+
+
+    \caption{Proximal decoding algorithm}
+    \label{fig:prox:alg}
+\end{figure}