From 252736ff31aeaf8dcb50a9084474cba8500e3ad5 Mon Sep 17 00:00:00 2001
From: Andreas Tsouchlos
Date: Sun, 19 Feb 2023 14:01:49 +0100
Subject: [PATCH] Reworked proximal decoding

---
 latex/thesis/chapters/decoding_techniques.tex | 243 ++++++++++--------
 1 file changed, 141 insertions(+), 102 deletions(-)

diff --git a/latex/thesis/chapters/decoding_techniques.tex b/latex/thesis/chapters/decoding_techniques.tex
index dfafca7..ec86654 100644
--- a/latex/thesis/chapters/decoding_techniques.tex
+++ b/latex/thesis/chapters/decoding_techniques.tex
@@ -34,8 +34,8 @@ The goal is to arrive at a formulation, where a certain objective function
 $f$ has to be minimized under certain constraints:%
 %
 \begin{align*}
-	\text{minimize } f\left( \boldsymbol{c} \right)\\
-	\text{subject to $\boldsymbol{c} \in D$}
+	\text{minimize}\hspace{2mm} &f\left( \boldsymbol{c} \right)\\
+	\text{subject to}\hspace{2mm} &\boldsymbol{c} \in D
 ,\end{align*}%
 %
 where $D$ is the domain of values attainable for $c$ and represents the
@@ -256,7 +256,7 @@ the transfer matrix would be $\boldsymbol{T}_j =
 0 & 1 & 0 & 0 & 0 & 0 & 0 \\
 0 & 0 & 0 & 1 & 0 & 0 & 0 \\
 0 & 0 & 0 & 0 & 0 & 1 & 0 \\
-\end{bmatrix} $ (example taken from \cite[Sec. II, A]{efficient_lp_dec_admm})}%
+\end{bmatrix} $ (example taken from \cite[Sec. II, A]{efficient_lp_dec_admm})}
 (i.e. the relevant components of $\boldsymbol{c}$ for parity-check $j$) and
 $\mathcal{P}_{d}$ is the \textit{check polytope}, the convex hull of all
 binary vectors of length $d$ with even parity%
@@ -274,6 +274,22 @@ Figures \ref{fig:dec:poly:local1} and \ref{fig:dec:poly:local2} show the local
 codeword polytopes of each check node.
 Their intersection, the relaxed codeword polytope $\overline{Q}$, is shown in
 figure \ref{fig:dec:poly:relaxed}.
+It can be seen that the relaxed codeword polytope $\overline{Q}$ introduces
+vertices with fractional values;
+these represent erroneous non-codeword solutions to the linear program and
+correspond to the so-called \textit{pseudocodewords} introduced in
+\cite{feldman_paper}.
+However, since for \ac{LDPC} codes $\overline{Q}$ scales linearly with $n$ instead of
+exponentially, it is considerably more tractable for practical applications.
+
+The resulting formulation of the relaxed optimization problem is the following:%
+%
+\begin{align*}
+	\text{minimize}\hspace{2mm} &\sum_{i=1}^{n} \gamma_i c_i \\
+	\text{subject to}\hspace{2mm} &\boldsymbol{T}_j \boldsymbol{c} \in \mathcal{P}_{d_j},
+	\hspace{5mm}j\in\mathcal{J}
+.\end{align*}%
+%
 %
 %
 % Codeword polytope visualization figure
@@ -566,22 +582,6 @@ figure \ref{fig:dec:poly:relaxed}.
 	\label{fig:dec:poly}
 \end{figure}%
 %
-It can be seen, that the relaxed codeword polytope $\overline{Q}$ introduces
-vertices with fractional values;
-these represent erroneous non-codeword solutions to the linear program and
-correspond to the so-called \textit{pseudocodewords} introduced in
-\cite{feldman_paper}.
-However, since for \ac{LDPC} codes $\overline{Q}$ scales linearly with $n$ instead of
-exponentially, it is a lot more tractable for practical applications.
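+
+To make the relaxed formulation more tangible, the following sketch sets up
+and solves this linear program for a small example code in Python.
+It is only a minimal sketch: it relies on the standard parity-inequality
+description of the check polytope (every odd-sized subset of the variable
+nodes of a check yields one inequality) and on the
+\texttt{scipy.optimize.linprog} solver; the example parity-check matrix and
+the cost vector are chosen purely for illustration and are not part of the
+surrounding text.
+%
+\begin{genericAlgorithm}[caption={}, label={}]
+import itertools
+
+import numpy as np
+from scipy.optimize import linprog
+
+
+def lp_decode(H, gamma):
+    """Relaxed LP decoding: minimize sum_i gamma_i * c_i over the relaxed polytope."""
+    m, n = H.shape
+    A_ub, b_ub = [], []
+    for j in range(m):
+        N_j = np.flatnonzero(H[j])  # variable nodes participating in check j
+        # Parity inequalities describing the check polytope: for every
+        # odd-sized subset S of N(j),
+        #   sum_{i in S} c_i - sum_{i in N(j) without S} c_i <= |S| - 1
+        for r in range(1, len(N_j) + 1, 2):
+            for S in itertools.combinations(N_j, r):
+                row = np.zeros(n)
+                row[N_j] = -1.0
+                row[list(S)] = 1.0
+                A_ub.append(row)
+                b_ub.append(len(S) - 1.0)
+    res = linprog(c=gamma, A_ub=np.array(A_ub), b_ub=np.array(b_ub),
+                  bounds=[(0.0, 1.0)] * n)
+    return res.x  # may contain fractional (pseudocodeword) components
+
+
+# Example: a (7,4) Hamming parity-check matrix and an arbitrary cost vector
+H = np.array([[1, 1, 0, 1, 1, 0, 0],
+              [1, 0, 1, 1, 0, 1, 0],
+              [0, 1, 1, 1, 0, 0, 1]])
+gamma = np.array([-1.2, 0.8, 0.3, -0.5, 1.1, -0.7, 0.2])
+print(lp_decode(H, gamma))
+\end{genericAlgorithm}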
-
-The resulting formulation of the relaxed optimization problem is the following:%
-%
-\begin{align*}
-	\text{minimize }\hspace{2mm} &\sum_{i=1}^{n} \gamma_i c_i \\
-	\text{subject to }\hspace{2mm} &\boldsymbol{T}_j \boldsymbol{c} \in \mathcal{P}_{d_j}
-	\hspace{5mm}j\in\mathcal{J}
-.\end{align*}%
-%
 
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -599,14 +599,16 @@
 \section{Proximal Decoding}%
 \label{sec:dec:Proximal Decoding}
 
-Proximal decoding was proposed by Wadayama et. al \cite{proximal_paper}.
-With this decoding algorithm, the objective function is minimized using
-the proximal gradient method.
+Proximal decoding was proposed by Wadayama et al. as a novel formulation of
+optimization-based decoding \cite{proximal_paper}.
+With this algorithm, minimization is performed using the proximal gradient
+method.
 In contrast to \ac{LP} decoding, the objective function is based on a
 non-convex optimization formulation of the \ac{MAP} decoding problem.
 
-In order to derive the objective function, the authors reformulate the
-\ac{MAP} decoding problem:%
+In order to derive the objective function, the authors begin with the
+\ac{MAP} decoding rule, expressed as a continuous optimization problem over
+$\boldsymbol{x}$:%
 %
 \begin{align}
 	\hat{\boldsymbol{x}} = \argmax_{\boldsymbol{x} \in \mathbb{R}^{n}}
@@ -616,19 +618,37 @@ In order to derive the objective function, the authors reformulate the
 	\left( \boldsymbol{y} \mid \boldsymbol{x} \right)
 	f_{\boldsymbol{X}}\left( \boldsymbol{x} \right)%
 	\label{eq:prox:vanilla_MAP}
+.\end{align}%
+%
+The likelihood $f_{\boldsymbol{Y} \mid \boldsymbol{X}}
+\left( \boldsymbol{y} \mid \boldsymbol{x} \right) $ is a known function
+determined by the channel model.
+The prior \ac{PDF} $f_{\boldsymbol{X}}\left( \boldsymbol{x} \right)$ is also
+known, as the equal probability assumption is made on
+$\mathcal{C}\left( \boldsymbol{H} \right)$.
+However, because in this case the considered domain is continuous,
+the prior \ac{PDF} cannot be ignored as a constant during the optimization,
+as is often done, and instead has the rather unwieldy representation:%
+%
+\begin{align}
+	f_{\boldsymbol{X}}\left( \boldsymbol{x} \right) =
+	\frac{1}{\left| \mathcal{C}\left( \boldsymbol{H} \right) \right| }
+	\sum_{\boldsymbol{c} \in \mathcal{C}\left( \boldsymbol{H} \right) }
+	\delta\left( \boldsymbol{x} - \left( -1 \right) ^{\boldsymbol{c}}\right)
+	\label{eq:prox:prior_pdf}
 \end{align}%
 %
-The likelihood is usually a known function determined by the channel model.
 In order to rewrite the prior \ac{PDF}
 $f_{\boldsymbol{X}}\left( \boldsymbol{x} \right)$, the so-called
 \textit{code-constraint polynomial} is introduced:%
 %
-\begin{align}
-	h\left( \boldsymbol{x} \right) = \sum_{j=1}^{n} \left( x_j^2-1 \right) ^2
-	+ \sum_{i=1}^{m} \left[
-	\left( \prod_{j\in \mathcal{A}\left( i \right) } x_j \right) -1 \right] ^2%
-	\label{eq:prox:ccp}
-\end{align}%
+\begin{align*}
+	h\left( \boldsymbol{x} \right) =
+	\underbrace{\sum_{j=1}^{n} \left( x_j^2-1 \right) ^2}_{\text{Bipolar constraint}}
+	+ \underbrace{\sum_{i=1}^{m} \left[
+	\left( \prod_{j\in \mathcal{A}
+	\left( i \right) } x_j \right) -1 \right] ^2}_{\text{Parity constraint}}%
+.\end{align*}%
 %
 The intention of this function is to provide a way to penalize vectors far
 from a codeword and favor those close to a codeword.
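+
+The code-constraint polynomial is straightforward to evaluate numerically.
+The following minimal Python sketch does so for a given parity-check matrix;
+the example matrix is the illustrative Hamming matrix from the sketch above
+and is not taken from \cite{proximal_paper}.
+%
+\begin{genericAlgorithm}[caption={}, label={}]
+import numpy as np
+
+
+def code_constraint(x, H):
+    """Code-constraint polynomial h(x): bipolar term plus parity term."""
+    bipolar = np.sum((x**2 - 1.0)**2)
+    parity = 0.0
+    for row in H:
+        A_i = np.flatnonzero(row)  # A(i): variable nodes of check i
+        parity += (np.prod(x[A_i]) - 1.0)**2
+    return bipolar + parity
+
+
+# h vanishes exactly at the bipolar images (-1)^c of codewords
+H = np.array([[1, 1, 0, 1, 1, 0, 0],
+              [1, 0, 1, 1, 0, 1, 0],
+              [0, 1, 1, 1, 0, 0, 1]])
+x = (-1.0) ** np.zeros(7)           # all-zero codeword mapped to all ones
+print(code_constraint(x, H))        # 0.0
+print(code_constraint(x + 0.3, H))  # > 0 away from a codeword
+\end{genericAlgorithm}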
@@ -636,69 +656,74 @@ In order to achieve this, the polynomial is composed of two parts:
 one term representing the bipolar constraint, providing for a discrete
 solution of the continuous optimization problem, and one term representing
 the parity constraint, accommodating the role of the parity-check
 matrix $\boldsymbol{H}$.
-%
-The equal probability assumption is made on $\mathcal{C}\left( \boldsymbol{H} \right) $.
 
 The prior \ac{PDF} is then approximated using the code-constraint polynomial:%
 %
 \begin{align}
-	f_{\boldsymbol{X}}\left( \boldsymbol{x} \right) =
-	\frac{1}{\left| \mathcal{C}\left( \boldsymbol{H} \right) \right| }
-	\sum_{c \in \mathcal{C}\left( \boldsymbol{H} \right) }
-	\delta\left( \boldsymbol{x} - \left( -1 \right) ^{\boldsymbol{c}}\right)
+	f_{\boldsymbol{X}}\left( \boldsymbol{x} \right) \approx
 	\frac{1}{Z}e^{-\gamma h\left( \boldsymbol{x} \right) }%
 	\label{eq:prox:prior_pdf_approx}
-\end{align}%
+.\end{align}%
 %
 The authors justify this approximation by arguing that for
-$\gamma \rightarrow \infty$, the right-hand side aproaches the left-hand
-side. In equation \ref{eq:prox:vanilla_MAP}, the prior \ac{PDF}
-$f_{\boldsymbol{X}}\left( \boldsymbol{x} \right) $ can then be subsituted
-for equation \ref{eq:prox:prior_pdf_approx} and the likelihood can be rewritten using
-the negative log-likelihood
+$\gamma \rightarrow \infty$, the approximation in equation
+\ref{eq:prox:prior_pdf_approx} approaches the original function in equation
+\ref{eq:prox:prior_pdf}.
+This approximation can then be substituted into equation \ref{eq:prox:vanilla_MAP}
+and the likelihood can be rewritten using the negative log-likelihood
 $L \left( \boldsymbol{y} \mid \boldsymbol{x} \right) = -\ln\left(
-	f_{\boldsymbol{X} \mid \boldsymbol{Y}}\left(
-	\boldsymbol{x} \mid \boldsymbol{y} \right) \right) $:%
+	f_{\boldsymbol{Y} \mid \boldsymbol{X}}\left(
+	\boldsymbol{y} \mid \boldsymbol{x} \right) \right) $:%
 %
-\begin{align}
+\begin{align*}
 	\hat{\boldsymbol{x}} &= \argmax_{\boldsymbol{x} \in \mathbb{R}^{n}}
 	e^{- L\left( \boldsymbol{y} \mid \boldsymbol{x} \right) }
-	e^{-\gamma h\left( \boldsymbol{x} \right) } \nonumber \\
+	e^{-\gamma h\left( \boldsymbol{x} \right) } \\
 	&= \argmin_{\boldsymbol{x} \in \mathbb{R}^n}
 	\left( L\left( \boldsymbol{y} \mid \boldsymbol{x} \right)
 	+ \gamma h\left( \boldsymbol{x} \right) \right)%
-	\label{eq:prox:approx_map_problem}
-.\end{align}%
+.\end{align*}%
 %
 Thus, with proximal decoding, the objective function
-$f\left( \boldsymbol{x} \right)$ to be minimized is%
+$f\left( \boldsymbol{x} \right)$ under consideration is%
 %
 \begin{align}
 	f\left( \boldsymbol{x} \right) =
 	L\left( \boldsymbol{y} \mid \boldsymbol{x} \right)
 	+ \gamma h\left( \boldsymbol{x} \right)%
 	\label{eq:prox:objective_function}
-.\end{align}
+\end{align}%
+%
+and the decoding problem is reformulated as%
+%
+\begin{align*}
+	\text{minimize}\hspace{2mm} &L\left( \boldsymbol{y} \mid \boldsymbol{x} \right)
+	+ \gamma h\left( \boldsymbol{x} \right)\\
+	\text{subject to}\hspace{2mm} &\boldsymbol{x} \in \mathbb{R}^n
+.\end{align*}
+%
 
-For the solution of the approximalte \ac{MAP} decoding problem, the two parts
+For the solution of the approximate \ac{MAP} decoding problem, the two parts
 of equation \ref{eq:prox:objective_function} are considered separately:
 the minimization of the objective function occurs in an alternating
-manner, switching between the minimization of the negative log-likelihood
+fashion, switching between the negative log-likelihood
 $L\left( \boldsymbol{y} \mid \boldsymbol{x} \right) $ and the scaled code-constraint
 polynomial $\gamma h\left( \boldsymbol{x} \right) $.
 Two helper variables, $\boldsymbol{r}$ and $\boldsymbol{s}$, are introduced,
 describing the result of each of the two steps.
-The first step, minimizing the log-likelihood using gradient descent, yields%
+The first step, minimizing the log-likelihood, is performed using gradient
+descent:%
 %
-\begin{align*}
+\begin{align}
 	\boldsymbol{r} \leftarrow \boldsymbol{s}
 	- \omega \nabla L\left( \boldsymbol{y} \mid \boldsymbol{s} \right),
 	\hspace{5mm}\omega > 0
-.\end{align*}%
+	\label{eq:prox:step_log_likelihood}
+.\end{align}%
 %
-For the second step, minimizig the scaled code-constraint polynomial using
-the proximal gradient method, the proximal operator of
-$\gamma h\left( \boldsymbol{x} \right) $ has to be computed and is
-immediately approximalted by a gradient-descent step:%
+For the second step, minimizing the scaled code-constraint polynomial, the
+proximal gradient method is used and the \textit{proximal operator} of
+$\gamma h\left( \boldsymbol{x} \right) $ has to be computed.
+It is then immediately approximated with a gradient-descent step:%
 %
 \begin{align*}
 	\text{prox}_{\gamma h} \left( \boldsymbol{x} \right) &\equiv
@@ -709,8 +734,7 @@ immediately approximalted by a gradient-descent step:%
 	\hspace{5mm} \gamma \text{ small}
 .\end{align*}%
 %
-The second step thus becomes \todo{Write the formulation optimization problem properly
-	(as shown in the introductory section)}%
+The second step thus becomes%
 %
 \begin{align*}
 	\boldsymbol{s} \leftarrow \boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right),
@@ -725,42 +749,19 @@ of the objective function small.
 Otherwise, unwanted stationary points, including local minima, are
 introduced.
 The authors say that in practice, the value of $\gamma$ should be adjusted
 according to the decoding performance.
-The iterative decoding process \todo{projection with $\eta$} resulting from this considreation is shown in
-figure \ref{fig:prox:alg}.
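+
+The gradient $\nabla h$ required in the second step follows from
+differentiating the code-constraint polynomial term by term.
+A possible realization of both update steps in Python is sketched below;
+the leave-one-out products are formed by explicit multiplication, the
+channel-dependent gradient of the negative log-likelihood is passed in as a
+function argument, and all names are illustrative rather than taken from
+\cite{proximal_paper}.
+%
+\begin{genericAlgorithm}[caption={}, label={}]
+import numpy as np
+
+
+def grad_h(x, H):
+    """Gradient of the code-constraint polynomial h.
+
+    d h / d x_k = 4 (x_k^2 - 1) x_k
+                  + sum over checks i containing k of
+                    2 (prod_{j in A(i)} x_j - 1) * prod_{j in A(i), j != k} x_j
+    """
+    g = 4.0 * (x**2 - 1.0) * x  # derivative of the bipolar term
+    for row in H:
+        A_i = np.flatnonzero(row)
+        xs = x[A_i]
+        p = np.prod(xs)
+        for idx, k in enumerate(A_i):
+            p_wo_k = np.prod(np.delete(xs, idx))  # product over A(i) without x_k
+            g[k] += 2.0 * (p - 1.0) * p_wo_k
+    return g
+
+
+def proximal_iteration(s, y, H, grad_L, omega, gamma):
+    """One alternating iteration: gradient step on L(y|x), then on gamma*h."""
+    r = s - omega * grad_L(y, s)
+    return r - gamma * grad_h(r, H)
+\end{genericAlgorithm}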
-\begin{figure}[H]
-	\centering
-
-	\begin{genericAlgorithm}[caption={}, label={}]
-$\boldsymbol{s} \leftarrow \boldsymbol{0}$
-for $K$ iterations do
-	$\boldsymbol{r} \leftarrow \boldsymbol{s} - \omega \nabla L \left( \boldsymbol{y} \mid \boldsymbol{s} \right) $
-	$\boldsymbol{s} \leftarrow \boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right) $
-	$\boldsymbol{\hat{x}} \leftarrow \text{sign}\left( \boldsymbol{s} \right) $
-	if $\boldsymbol{H}\boldsymbol{\hat{c}} = \boldsymbol{0}$ do
-		return $\boldsymbol{\hat{c}}$
-	end if
-end for
-return $\boldsymbol{\hat{c}}$
-	\end{genericAlgorithm}
-
-
-	\caption{Proximal decoding algorithm}
-	\label{fig:prox:alg}
-\end{figure}
-
-The components of the gradient of the code-constraint polynomial can be computed as follows:%
-%
-\begin{align*}
-	\frac{\partial}{\partial x_k} h\left( \boldsymbol{x} \right) =
-	4\left( x_k^2 - 1 \right) x_k + \frac{2}{x_k}
-	\sum_{i\in \mathcal{B}\left( k \right) } \left(
-	\left( \prod_{j\in\mathcal{A}\left( i \right)} x_j\right)^2
-	- \prod_{j\in\mathcal{A}\left( i \right) }x_j \right)
-.\end{align*}%
-\todo{Only multiplication?}%
-\todo{$x_k$: $k$ or some other indexing variable?}%
-%
+%The components of the gradient of the code-constraint polynomial can be computed as follows:%
+%%
+%\begin{align*}
+%	\frac{\partial}{\partial x_k} h\left( \boldsymbol{x} \right) =
+%	4\left( x_k^2 - 1 \right) x_k + \frac{2}{x_k}
+%	\sum_{i\in \mathcal{B}\left( k \right) } \left(
+%	\left( \prod_{j\in\mathcal{A}\left( i \right)} x_j\right)^2
+%	- \prod_{j\in\mathcal{A}\left( i \right) }x_j \right)
+%.\end{align*}%
+%\todo{Only multiplication?}%
+%\todo{$x_k$: $k$ or some other indexing variable?}%
+%%
 In the case of \ac{AWGN}, the likelihood
 $f_{\boldsymbol{Y} \mid \boldsymbol{X}}\left( \boldsymbol{y} \mid \boldsymbol{x} \right)$
 is%
@@ -778,12 +779,50 @@ it suffices to consider only the proportionality instead of the equality.}%
 	\nabla L \left( \boldsymbol{y} \mid \boldsymbol{x} \right) &\propto
 	-\nabla \lVert \boldsymbol{y} - \boldsymbol{x} \rVert^2\\
 	&\propto \boldsymbol{x} - \boldsymbol{y}
-.\end{align*}%
+,\end{align*}%
 %
-The resulting iterative decoding process under the assumption of \ac{AWGN} is
-described by%
+allowing equation \ref{eq:prox:step_log_likelihood} to be rewritten as%
 %
 \begin{align*}
-	\boldsymbol{r} \leftarrow \boldsymbol{s} - \omega\left( \boldsymbol{s}-\boldsymbol{y} \right)\\
-	\boldsymbol{s} \leftarrow \boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right)
+	\boldsymbol{r} \leftarrow \boldsymbol{s}
+	- \omega \left( \boldsymbol{s} - \boldsymbol{y} \right)
 .\end{align*}
+%
+
+One thing to consider during the actual decoding process is that the gradient
+of the code-constraint polynomial can take on extremely large values.
+In order to avoid numerical instability, an additional step is added, in which all
+components of the current estimate are clipped to $\left[-\eta, \eta \right]$,
+where $\eta$ is a positive constant slightly larger than one:%
+%
+\begin{align*}
+	\boldsymbol{s} \leftarrow \Pi_{\eta} \left( \boldsymbol{r}
+	- \gamma \nabla h\left( \boldsymbol{r} \right) \right)
+,\end{align*}
+%
+with $\Pi_{\eta}\left( \cdot \right) $ denoting the projection onto
+$\left[ -\eta, \eta \right]^n$.
+
+The iterative decoding process resulting from these considerations is shown in
+figure \ref{fig:prox:alg}.
+
+\begin{figure}[H]
+	\centering
+
+	\begin{genericAlgorithm}[caption={}, label={}]
+$\boldsymbol{s} \leftarrow \boldsymbol{0}$
+for $K$ iterations do
+	$\boldsymbol{r} \leftarrow \boldsymbol{s} - \omega \left( \boldsymbol{s} - \boldsymbol{y} \right) $
+	$\boldsymbol{s} \leftarrow \Pi_\eta \left(\boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right) \right)$
+	$\boldsymbol{\hat{x}} \leftarrow \text{sign}\left( \boldsymbol{s} \right) $
+	$\boldsymbol{\hat{c}} \leftarrow \frac{1}{2}\left( \boldsymbol{1} - \boldsymbol{\hat{x}} \right) $
+	if $\boldsymbol{H}\boldsymbol{\hat{c}} = \boldsymbol{0}$ do
+		return $\boldsymbol{\hat{c}}$
+	end if
+end for
+return $\boldsymbol{\hat{c}}$
+	\end{genericAlgorithm}
+
+
+	\caption{Proximal decoding algorithm}
+	\label{fig:prox:alg}
+\end{figure}
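+
+A compact Python sketch of this procedure for the \ac{AWGN} case is given
+below.
+It reuses the matrix \texttt{H} and the function \texttt{grad\_h} from the
+earlier sketches; the number of iterations, the step sizes and the clipping
+constant $\eta$ are illustrative choices rather than values prescribed in
+\cite{proximal_paper}.
+%
+\begin{genericAlgorithm}[caption={}, label={}]
+import numpy as np
+
+
+def proximal_decode(y, H, K=50, omega=0.1, gamma=0.05, eta=1.2):
+    """Proximal decoding for the AWGN case (sketch of the algorithm above).
+
+    Relies on grad_h(x, H) from the earlier sketch; the parameter values
+    are illustrative and would be tuned in practice.
+    """
+    n = H.shape[1]
+    s = np.zeros(n)
+    c_hat = np.zeros(n, dtype=int)
+    for _ in range(K):
+        r = s - omega * (s - y)                           # gradient step on L(y|x)
+        s = np.clip(r - gamma * grad_h(r, H), -eta, eta)  # step on gamma*h, then projection
+        x_hat = np.where(s >= 0.0, 1.0, -1.0)             # hard decision (sign)
+        c_hat = ((1.0 - x_hat) / 2.0).astype(int)         # bipolar -> binary, x = (-1)^c
+        if not np.any(H @ c_hat % 2):                     # all parity checks satisfied
+            break
+    return c_hat
+
+
+# Example: transmit the all-zero codeword of the Hamming code over AWGN
+rng = np.random.default_rng(0)
+x = (-1.0) ** np.zeros(7)              # all-zero codeword in bipolar form
+y = x + 0.4 * rng.standard_normal(7)   # noisy channel observation
+print(proximal_decode(y, H))           # should recover the all-zero codeword
+\end{genericAlgorithm}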