\chapter{Proximal Decoding}%
\label{chapter:proximal_decoding}

TODO

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Decoding Algorithm}%
\label{sec:prox:Decoding Algorithm}

Proximal decoding was proposed by Wadayama et al.\ as a novel formulation of
optimization-based decoding \cite{proximal_paper}. With this algorithm, the
minimization is performed using the proximal gradient method. In contrast to
\ac{LP} decoding, the objective function is based on a non-convex optimization
formulation of the \ac{MAP} decoding problem.

In order to derive the objective function, the authors begin with the \ac{MAP}
decoding rule, expressed as a continuous maximization problem%
\footnote{Extending the domain to be continuous does not materially change the
meaning of the rule. The only difference is that what were previously
\acp{PMF} now have to be expressed in terms of \acp{PDF}.}
over $\boldsymbol{x}$:%
%
\begin{align}
    \hat{\boldsymbol{x}}
    = \argmax_{\tilde{\boldsymbol{x}} \in \mathbb{R}^{n}}
      f_{\tilde{\boldsymbol{X}} \mid \boldsymbol{Y}}
      \left( \tilde{\boldsymbol{x}} \mid \boldsymbol{y} \right)
    = \argmax_{\tilde{\boldsymbol{x}} \in \mathbb{R}^{n}}
      f_{\boldsymbol{Y} \mid \tilde{\boldsymbol{X}}}
      \left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right)
      f_{\tilde{\boldsymbol{X}}}\left( \tilde{\boldsymbol{x}} \right)%
    \label{eq:prox:vanilla_MAP}
.\end{align}%
%
The likelihood $f_{\boldsymbol{Y} \mid \tilde{\boldsymbol{X}}} \left(
\boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right)$ is a known function
determined by the channel model. The prior \ac{PDF}
$f_{\tilde{\boldsymbol{X}}}\left( \tilde{\boldsymbol{x}} \right)$ is also
known, as all codewords of $\mathcal{C}$ are assumed to be equally likely.
However, since the considered domain is continuous, the prior \ac{PDF} cannot
be ignored as a constant during the minimization, as is often done, and has a
rather unwieldy representation:%
%
\begin{align}
    f_{\tilde{\boldsymbol{X}}}\left( \tilde{\boldsymbol{x}} \right)
    = \frac{1}{\left| \mathcal{C} \right|} \sum_{\boldsymbol{c} \in \mathcal{C}}
      \delta\big( \tilde{\boldsymbol{x}} - \left( -1 \right)^{\boldsymbol{c}} \big)
    \label{eq:prox:prior_pdf}
.\end{align}%
%
In order to rewrite the prior \ac{PDF} $f_{\tilde{\boldsymbol{X}}}\left(
\tilde{\boldsymbol{x}} \right)$, the so-called \textit{code-constraint
polynomial} is introduced as%
%
\begin{align*}
    h\left( \tilde{\boldsymbol{x}} \right)
    = \underbrace{\sum_{i=1}^{n} \left( \tilde{x}_i^2 - 1 \right)^2}_{\text{Bipolar constraint}}
    + \underbrace{\sum_{j=1}^{m} \left[ \left( \prod_{i \in N_c\left( j \right)}
      \tilde{x}_i \right) - 1 \right]^2}_{\text{Parity constraint}}%
.\end{align*}%
%
This function is intended to penalize vectors far from a codeword and to favor
those close to one. To achieve this, the polynomial is composed of two parts:
one term representing the bipolar constraint, which drives the continuous
optimization problem toward a discrete solution, and one term representing the
parity constraints, accommodating the role of the parity-check matrix
$\boldsymbol{H}$.
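To make the structure of $h\left( \tilde{\boldsymbol{x}} \right)$ concrete,
the following is a minimal NumPy sketch of how the code-constraint polynomial
could be evaluated. The function name \texttt{code\_constraint} and the dense
$\left\{ 0,1 \right\}$ representation of $\boldsymbol{H}$ are assumptions made
purely for illustration and are not part of the original formulation.

\begin{figure}[H]
    \centering
    \begin{genericAlgorithm}[caption={}, label={}]
import numpy as np

def code_constraint(x, H):
    """Code-constraint polynomial h(x) for a vector x and parity-check matrix H."""
    bipolar = np.sum((x**2 - 1.0)**2)          # pushes every component towards +1 or -1
    parity = 0.0
    for row in H:                              # one summand per check node j
        idx = np.flatnonzero(row)              # N_c(j): variable nodes participating in check j
        parity += (np.prod(x[idx]) - 1.0)**2   # the product over N_c(j) must equal +1
    return bipolar + parity
    \end{genericAlgorithm}
    \caption{Illustrative sketch of the code-constraint polynomial $h\left( \tilde{\boldsymbol{x}} \right)$}
    \label{fig:prox:h_sketch}
\end{figure}

Both terms are sums of squares, so $h$ is non-negative and attains its minimum
value of zero exactly on the bipolar images $\left( -1 \right)^{\boldsymbol{c}}$
of the codewords $\boldsymbol{c} \in \mathcal{C}$.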
The prior \ac{PDF} is then approximated using the code-constraint polynomial
as%
%
\begin{align}
    f_{\tilde{\boldsymbol{X}}}\left( \tilde{\boldsymbol{x}} \right)
    \approx \frac{1}{Z} \mathrm{e}^{-\gamma h\left( \tilde{\boldsymbol{x}} \right)}%
    \label{eq:prox:prior_pdf_approx}
.\end{align}%
%
The authors justify this approximation by arguing that for $\gamma \rightarrow
\infty$, the approximation in equation (\ref{eq:prox:prior_pdf_approx})
approaches the original function in equation (\ref{eq:prox:prior_pdf}). This
approximation can then be substituted into equation (\ref{eq:prox:vanilla_MAP}),
and the likelihood can be rewritten using the negative log-likelihood
$L\left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right)
= -\ln\left( f_{\boldsymbol{Y} \mid \tilde{\boldsymbol{X}}}\left( \boldsymbol{y}
\mid \tilde{\boldsymbol{x}} \right) \right)$:%
%
\begin{align*}
    \hat{\boldsymbol{x}}
    &= \argmax_{\tilde{\boldsymbol{x}} \in \mathbb{R}^{n}}
       \mathrm{e}^{-L\left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right)}
       \mathrm{e}^{-\gamma h\left( \tilde{\boldsymbol{x}} \right)} \\
    &= \argmin_{\tilde{\boldsymbol{x}} \in \mathbb{R}^{n}}
       \big( L\left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right)
       + \gamma h\left( \tilde{\boldsymbol{x}} \right) \big)%
.\end{align*}%
%
Thus, with proximal decoding, the objective function
$g\left( \tilde{\boldsymbol{x}} \right)$ considered is%
%
\begin{align}
    g\left( \tilde{\boldsymbol{x}} \right)
    = L\left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right)
    + \gamma h\left( \tilde{\boldsymbol{x}} \right)%
    \label{eq:prox:objective_function}
\end{align}%
%
and the decoding problem is reformulated to%
%
\begin{align*}
    \text{minimize}\hspace{2mm} &L\left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right)
    + \gamma h\left( \tilde{\boldsymbol{x}} \right)\\
    \text{subject to}\hspace{2mm} &\tilde{\boldsymbol{x}} \in \mathbb{R}^{n}
.\end{align*}
%
To solve the approximate \ac{MAP} decoding problem, the two parts of equation
(\ref{eq:prox:objective_function}) are considered separately: the minimization
of the objective function occurs in an alternating fashion, switching between
the negative log-likelihood $L\left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}}
\right)$ and the scaled code-constraint polynomial
$\gamma h\left( \tilde{\boldsymbol{x}} \right)$. Two helper variables,
$\boldsymbol{r}$ and $\boldsymbol{s}$, are introduced, describing the result of
each of the two steps. The first step, minimizing the negative log-likelihood,
is performed using gradient descent:%
%
\begin{align}
    \boldsymbol{r} \leftarrow \boldsymbol{s}
    - \omega \nabla L\left( \boldsymbol{y} \mid \boldsymbol{s} \right),
    \hspace{5mm} \omega > 0
    \label{eq:prox:step_log_likelihood}
.\end{align}%
%
For the second step, minimizing the scaled code-constraint polynomial, the
proximal gradient method is used and the \textit{proximal operator} of
$\gamma h\left( \tilde{\boldsymbol{x}} \right)$ has to be computed.
It is then immediately approximated with a gradient descent step:%
%
\begin{align*}
    \textbf{prox}_{\gamma h}\left( \tilde{\boldsymbol{x}} \right)
    &\equiv \argmin_{\boldsymbol{t} \in \mathbb{R}^{n}}
      \left( \gamma h\left( \boldsymbol{t} \right)
      + \frac{1}{2} \lVert \boldsymbol{t} - \tilde{\boldsymbol{x}} \rVert^2 \right)\\
    &\approx \tilde{\boldsymbol{x}} - \gamma \nabla h\left( \tilde{\boldsymbol{x}} \right),
    \hspace{5mm} \gamma > 0, \text{ small}
.\end{align*}%
%
The second step thus becomes%
%
\begin{align*}
    \boldsymbol{s} \leftarrow \boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right),
    \hspace{5mm} \gamma > 0, \text{ small}
.\end{align*}
%
While the approximation of the prior \ac{PDF} made in equation
(\ref{eq:prox:prior_pdf_approx}) theoretically becomes better with larger
$\gamma$, the constraint that $\gamma$ be small is important, as it keeps the
effect of $h\left( \tilde{\boldsymbol{x}} \right)$ on the landscape of the
objective function small. Otherwise, unwanted stationary points, including
local minima, are introduced. The authors state that ``in practice, the value
of $\gamma$ should be adjusted according to the decoding performance''
\cite[Sec.~3.1]{proximal_paper}.
%The components of the gradient of the code-constraint polynomial can be computed as follows:%
%%
%\begin{align*}
%    \frac{\partial}{\partial x_k} h\left( \boldsymbol{x} \right) =
%    4\left( x_k^2 - 1 \right) x_k + \frac{2}{x_k}
%    \sum_{i\in \mathcal{B}\left( k \right) } \left(
%    \left( \prod_{j\in\mathcal{A}\left( i \right)} x_j\right)^2
%    - \prod_{j\in\mathcal{A}\left( i \right) }x_j \right)
%.\end{align*}%
%\todo{Only multiplication?}%
%\todo{$x_k$: $k$ or some other indexing variable?}%
%%
In the case of \ac{AWGN}, the likelihood $f_{\boldsymbol{Y} \mid
\tilde{\boldsymbol{X}}}\left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}}
\right)$ is%
%
\begin{align*}
    f_{\boldsymbol{Y} \mid \tilde{\boldsymbol{X}}}
    \left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right)
    = \frac{1}{\left( 2\pi\sigma^2 \right)^{n/2}}
      \mathrm{e}^{-\frac{\lVert \boldsymbol{y} - \tilde{\boldsymbol{x}} \rVert^2}{2\sigma^2}}
.\end{align*}
%
Thus, the gradient of the negative log-likelihood becomes%
\footnote{For the minimization, constants can be disregarded. For this reason,
it suffices to consider only proportionality instead of equality.}%
%
\begin{align*}
    \nabla L\left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right)
    &\propto \nabla \lVert \boldsymbol{y} - \tilde{\boldsymbol{x}} \rVert^2\\
    &\propto \tilde{\boldsymbol{x}} - \boldsymbol{y}
,\end{align*}%
%
allowing equation (\ref{eq:prox:step_log_likelihood}) to be rewritten as%
%
\begin{align*}
    \boldsymbol{r} \leftarrow \boldsymbol{s}
    - \omega\left( \boldsymbol{s} - \boldsymbol{y} \right)
.\end{align*}
%
One thing to consider during the actual decoding process is that the gradient
of the code-constraint polynomial can take on extremely large values. To avoid
numerical instability, an additional step is added, in which all components of
the current estimate are clipped to $\left[ -\eta, \eta \right]$, where $\eta$
is a positive constant slightly larger than one:%
%
\begin{align*}
    \boldsymbol{s} \leftarrow \Pi_{\eta}\left( \boldsymbol{r}
    - \gamma \nabla h\left( \boldsymbol{r} \right) \right)
,\end{align*}
%
with $\Pi_{\eta}\left( \cdot \right)$ denoting the projection onto
$\left[ -\eta, \eta \right]^n$. The iterative decoding process resulting from
these considerations is shown in figure \ref{fig:prox:alg}.
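To illustrate how the two update steps and the clipping interact, the
following is a minimal NumPy sketch of the resulting iteration for the
\ac{AWGN} case. The function names, the analytic form of
$\nabla h\left( \tilde{\boldsymbol{x}} \right)$ (obtained here by
differentiating $h$ term by term), and the default parameter values are
assumptions made for illustration; they are not taken from
\cite{proximal_paper}.

\begin{figure}[H]
    \centering
    \begin{genericAlgorithm}[caption={}, label={}]
import numpy as np

def grad_h(x, H):
    """Gradient of the code-constraint polynomial h."""
    g = 4.0 * (x**2 - 1.0) * x                   # derivative of the bipolar-constraint term
    for row in H:
        idx = np.flatnonzero(row)                # N_c(j): variable nodes in check j
        prod = np.prod(x[idx])                   # product over the whole check
        for k in idx:
            rest = np.prod(x[idx[idx != k]])     # product over the check without x_k
            g[k] += 2.0 * (prod - 1.0) * rest    # derivative of the parity-constraint term
    return g

def proximal_decode(y, H, K=200, omega=0.05, gamma=0.05, eta=1.2):
    """Sketch of the proximal decoding iteration for an AWGN channel."""
    s = np.zeros_like(y, dtype=float)
    c_hat = np.zeros(len(y), dtype=int)
    for _ in range(K):
        r = s - omega * (s - y)                            # gradient step on the negative log-likelihood
        s = np.clip(r - gamma * grad_h(r, H), -eta, eta)   # approximate prox step, then clipping
        x_hat = np.where(s >= 0.0, 1, -1)                  # tentative bipolar estimate
        c_hat = ((1 - x_hat) // 2).astype(int)             # map +1 -> 0, -1 -> 1
        if not np.any(np.mod(H @ c_hat, 2)):               # stop once all parity checks are satisfied
            break
    return c_hat
    \end{genericAlgorithm}
    \caption{Illustrative sketch of proximal decoding for an \ac{AWGN} channel}
    \label{fig:prox:decode_sketch}
\end{figure}

In an actual implementation, the loops over the check equations would
typically be vectorized; the nested loops above merely mirror the mathematical
definitions of $h$ and its gradient.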
\begin{figure}[H]
    \centering
    \begin{genericAlgorithm}[caption={}, label={}]
$\boldsymbol{s} \leftarrow \boldsymbol{0}$
for $K$ iterations do
    $\boldsymbol{r} \leftarrow \boldsymbol{s} - \omega \left( \boldsymbol{s} - \boldsymbol{y} \right)$
    $\boldsymbol{s} \leftarrow \Pi_\eta \left( \boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right) \right)$
    $\hat{\boldsymbol{x}} \leftarrow \text{sign}\left( \boldsymbol{s} \right)$
    $\hat{\boldsymbol{c}} \leftarrow \frac{1}{2}\left( \boldsymbol{1} - \hat{\boldsymbol{x}} \right)$
    if $\boldsymbol{H}\hat{\boldsymbol{c}} = \boldsymbol{0}$ then
        return $\hat{\boldsymbol{c}}$
    end if
end for
return $\hat{\boldsymbol{c}}$
    \end{genericAlgorithm}
    \caption{Proximal decoding algorithm for an \ac{AWGN} channel}
    \label{fig:prox:alg}
\end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Implementation Details}%
\label{sec:prox:Implementation Details}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Results}%
\label{sec:prox:Results}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Improved Implementation}%
\label{sec:prox:Improved Implementation}