\documentclass[journal]{IEEEtran}
\usepackage{amsmath,amsfonts}
\usepackage{float}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{siunitx}
\usepackage{dsfont}
\usepackage{mleftright}
\usepackage{bbm}

\usepackage{tikz}
\usetikzlibrary{spy, arrows.meta, arrows}

\usepackage{pgfplots}
\pgfplotsset{compat=newest}

\hyphenation{op-tical net-works semi-conduc-tor IEEE-Xplore}

\newif\ifoverleaf
%\overleaftrue

%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Inputs & Global Options
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%

\ifoverleaf
    \input{common.tex}
\else
    \usetikzlibrary{external}
    \tikzexternalize[prefix=build/]
    \input{lib/latex-common/common.tex}
\fi

\pgfplotsset{colorscheme/cel}

% TODO
\newcommand{\figwidth}{\columnwidth}
\newcommand{\figheight}{0.75\columnwidth}

\pgfplotsset{
    FERPlot/.style={
        line width=1pt,
        densely dashed,
    },
    BERPlot/.style={
        line width=1pt,
    },
    DFRPlot/.style={
        only marks,
    },
}

%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Title, Header, Footer, etc.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
\begin{document}

\title{A Note on Improving Proximal Decoding for Linear Block Codes}

\author{Andreas Tsouchlos, Holger Jäkel, and Laurent Schmalen\\
    Communications Engineering Lab (CEL), Karlsruhe Institute of Technology (KIT)\\
    Hertzstr.~16, 76187 Karlsruhe, Germany, Email: \texttt{\{first.last\}@kit.edu}}

% TODO
\markboth{Journal of \LaTeX\ Class Files,~Vol.~14, No.~8, August~2021}%
{Shell \MakeLowercase{\textit{et al.}}: A Sample Article Using IEEEtran.cls
for IEEE Journals}

\maketitle

%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Abstract & Index Terms
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%

\begin{abstract}
In this letter, the proximal decoding algorithm is considered within the
context of \textit{additive white Gaussian noise} (AWGN) channels.
An analysis of the convergence behavior of the algorithm shows that
proximal decoding inherently enters an oscillating behavior of the estimate
after a certain number of iterations.
Due to this oscillation, frame errors arising during decoding can often
be attributed to only a few remaining wrongly decoded bits.
An improvement of the proximal decoding algorithm is therefore proposed:
an additional step is appended in which an attempt is made to correct
these erroneous components.
We suggest an empirical rule with which the components most likely needing
correction can be determined.
Using this insight and performing a subsequent ``ML-in-the-list'' decoding,
a gain of up to \SI{1}{dB} is achieved compared to conventional
proximal decoding, depending on the decoder parameters and the code.
\end{abstract}

\begin{IEEEkeywords}
Optimization-based decoding, Proximal decoding, ML-in-the-list.
\end{IEEEkeywords}

%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Content
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%

%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}

\IEEEPARstart{C}{hannel} coding using binary linear codes is a way of enhancing
the reliability of data by detecting and correcting errors that may occur
during transmission or storage.
One class of binary linear codes, \textit{low-density parity-check} (LDPC)
codes, has become especially popular due to its ability to reach arbitrarily
small error probabilities at code rates up to the capacity of the channel
\cite{mackay99}, while retaining a structure that allows for very efficient
decoding.
While the established decoders for LDPC codes, such as belief propagation (BP)
and the min-sum algorithm, offer good decoding performance, they are generally
not optimal and exhibit an error floor for high
\textit{signal-to-noise ratios} (SNRs) \cite{channel_codes_book}, making them
unsuitable for applications with extreme reliability requirements.

Optimization-based decoding algorithms are an entirely different way of
approaching the decoding problem.
A number of such algorithms have been introduced.
The field of \textit{linear programming} (LP) decoding \cite{feldman_paper},
for example, represents one class of such algorithms, based on a relaxation
of the \textit{maximum likelihood} (ML) decoding problem as a linear program.
Many different optimization algorithms can be used to solve the resulting
problem \cite{ADMM, adaptive_lp_decoding, interior_point_decoding}.
Recently, proximal decoding for LDPC codes was presented by
Wadayama \textit{et al.} \cite{proximal_paper}.
Proximal decoding relies on a non-convex optimization formulation
of the \textit{maximum a posteriori} (MAP) decoding problem.

The aim of this work is to improve upon the performance of proximal decoding by
first presenting an examination of the algorithm's behavior and then suggesting
an approach to mitigate some of its flaws.
This analysis is performed for
\textit{additive white Gaussian noise} (AWGN) channels.
We first observe that the algorithm initially moves the estimate in
the right direction; in the final steps of the decoding process, however,
convergence to the correct codeword is often not achieved.
Furthermore, we suggest that the reason for this behavior is the nature
of the decoding algorithm itself, comprising two separate gradient-descent
steps working adversarially.

We propose a method to mitigate this effect by appending an
additional step to the decoding process.
In this additional step, the components of the estimate with the highest
probability of being erroneous are identified.
New codewords are then generated, over which an ``ML-in-the-list''
\cite{ml_in_the_list} decoding is performed.
A procedure for conducting this identification is proposed in this letter.
Using the improved algorithm, a gain of up to
\SI{1}{dB} can be achieved compared to conventional proximal decoding,
depending on the decoder parameters and the code.

%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Preliminaries}

%%%%%%%%%%%%%%%%%%%%%
\subsection{Notation}

When considering binary linear codes, data words are mapped onto
codewords, the lengths of which are denoted by $k \in \mathbb{N}$
and $n \in \mathbb{N}$, respectively, with $k \le n$.
The set of codewords $\mathcal{C} \subset \mathbb{F}_2^n$ of a binary linear
code can be represented using the parity-check matrix
$\boldsymbol{H} \in \mathbb{F}_2^{m \times n}$, where $m$ denotes the
number of parity checks:
%
\begin{align*}
    \mathcal{C} := \left\{ \boldsymbol{c} \in \mathbb{F}_2^n :
    \boldsymbol{H}\boldsymbol{c}^\text{T} = \boldsymbol{0} \right\}
.\end{align*}
%

The check nodes $j \in \mathcal{J}:=\left\{1, \ldots, m\right\}$ each
correspond to a parity check, i.e., to a row of $\boldsymbol{H}$.
The variable nodes $i \in \mathcal{I}:=\left\{1, \ldots, n\right\}$ correspond
to the components of a codeword being subjected to a parity check, i.e.,
to the columns of $\boldsymbol{H}$.
The neighborhood of a parity check $j$, i.e., the set of indices of components
relevant for the according parity check, is denoted by
$\mathcal{N}_c(j) := \left\{i \in \mathcal{I}: \boldsymbol{H}\negthinspace_{j,i} = 1 \right\},
\hspace{2mm} j \in \mathcal{J}$.

In order to transmit a codeword $\boldsymbol{c} \in \mathbb{F}_2^n$, it is
mapped onto a vector of \textit{binary phase shift keying} (BPSK) symbols via
$\boldsymbol{x} = 1 - 2\boldsymbol{c}$, with
$\boldsymbol{x} \in \left\{\pm 1\right\}^n$, which is then transmitted over an
AWGN channel.
The received vector $\boldsymbol{y} \in \mathbb{R}^n$ is decoded to obtain an
estimate of the transmitted codeword, denoted as
$\hat{\boldsymbol{c}} \in \mathbb{F}_2^n$.
A distinction is made between $\boldsymbol{x} \in \left\{\pm 1\right\}^n$
and $\tilde{\boldsymbol{x}} \in \mathbb{R}^n$,
the former denoting the BPSK symbols physically transmitted over the channel and
the latter being used as a variable during the optimization process.
The posterior probability of having transmitted $\boldsymbol{x}$ when receiving
$\boldsymbol{y}$ is expressed as a \textit{probability mass function} (PMF)
$P_{\boldsymbol{X}\mid\boldsymbol{Y}}(\boldsymbol{x} \mid \boldsymbol{y})$.
Likewise, the likelihood of receiving $\boldsymbol{y}$ upon transmitting
$\boldsymbol{x}$ is expressed as a \textit{probability density function} (PDF)
$f_{\boldsymbol{Y}\mid\boldsymbol{X}}(\boldsymbol{y} \mid \boldsymbol{x})$.

%%%%%%%%%%%%%%%%%%%%%
\subsection{Proximal Decoding}

Proximal decoding was proposed by Wadayama \textit{et al.} as a novel formulation
of optimization-based decoding \cite{proximal_paper}.
With proximal decoding, the proximal gradient method \cite{proximal_algorithms}
is used to solve a non-convex optimization formulation of the MAP decoding
problem.

Under the assumption of equal prior probabilities for all codewords, MAP and ML
decoding are equivalent and, specifically for AWGN channels, correspond to a
nearest-neighbor decision.
For this reason, decoding can be carried out using a figure of merit that
describes the distance from a given vector to a codeword.
One such expression, formulated under the assumption of BPSK, is the
\textit{code-constraint polynomial} \cite{proximal_paper}
%
\begin{align*}
    h( \tilde{\boldsymbol{x}} ) =
    \underbrace{\sum_{i=1}^{n}
    \left( \tilde{x}_i^2-1 \right) ^2}_{\text{Bipolar constraint}}
    + \underbrace{\sum_{j=1}^{m} \left[
    \left( \prod_{i\in \mathcal{N}_c \left( j \right) } \tilde{x}_i \right)
    -1 \right] ^2}_{\text{Parity constraint}}
.\end{align*}%
%
Its intent is to penalize vectors far from a codeword.
It comprises two terms: one representing the bipolar constraint
and one representing the parity constraint, the latter incorporating all of the
information regarding the code.
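
For illustration, $h$ can be evaluated directly from a binary parity-check
matrix \texttt{H}; the following Python sketch (our own illustration,
array-based for readability rather than efficiency) mirrors the two sums
above:
%
\begin{verbatim}
import numpy as np

def code_constraint(x_tilde, H):
    """Evaluate h(x_tilde) for a binary
    parity-check matrix H (m x n)."""
    bipolar = np.sum((x_tilde**2 - 1.0)**2)
    # One factor per i in N_c(j), i.e., per
    # nonzero entry of row j of H
    parity = sum(
        (np.prod(x_tilde[row == 1]) - 1.0)**2
        for row in H)
    return bipolar + parity
\end{verbatim}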

The channel model can be considered using the negative log-likelihood
%
\begin{align*}
    L \mleft( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \mright) = -\ln\mleft(
    f_{\boldsymbol{Y} \mid \tilde{\boldsymbol{X}}} \mleft(
    \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \mright) \mright)
.\end{align*}
%
For an AWGN channel with noise variance $\sigma^2$ per component, this
negative log-likelihood equals, up to an additive constant,
$\lVert \boldsymbol{y} - \tilde{\boldsymbol{x}} \rVert^2 / (2\sigma^2)$, so
its gradient is proportional to $\tilde{\boldsymbol{x}} - \boldsymbol{y}$.
The information about the channel and the code is consolidated in the objective
function \cite{proximal_paper}
%
\begin{align*}
    g \mleft( \tilde{\boldsymbol{x}} \mright)
    = L \mleft( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \mright)
    + \gamma h\mleft( \tilde{\boldsymbol{x}} \mright),
    \hspace{5mm} \gamma > 0%
.\end{align*}
%
The objective function is minimized using the proximal gradient method, which,
for this objective function and an AWGN channel, amounts to iteratively
performing two gradient-descent steps \cite{proximal_paper}.
To this end, two helper variables, $\boldsymbol{r}$ and $\boldsymbol{s}$, are
introduced, describing the result of each of the two steps:
%
\begin{alignat}{3}
    \boldsymbol{r} &\leftarrow \boldsymbol{s}
    - \omega \mleft( \boldsymbol{s} - \boldsymbol{y} \mright),
    \hspace{5mm} &&\omega > 0 \label{eq:r_update}\\
    \boldsymbol{s} &\leftarrow \boldsymbol{r}
    - \gamma \nabla h\mleft( \boldsymbol{r} \mright),
    \hspace{5mm} &&\gamma > 0 \label{eq:s_update}
.\end{alignat}
%
An equation for determining $\nabla h(\boldsymbol{r})$ is given in
\cite{proximal_paper}.
It should be noted that the variables $\boldsymbol{r}$ and $\boldsymbol{s}$
represent $\tilde{\boldsymbol{x}}$ during different
stages of the decoding process.
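
For completeness, differentiating $h$ componentwise yields
%
\begin{align*}
    \left( \nabla h \mleft( \tilde{\boldsymbol{x}} \mright) \right)_i
    = 4 \tilde{x}_i \left( \tilde{x}_i^2 - 1 \right)
    + 2 \sum_{j : \, i \in \mathcal{N}_c(j)}
    \left[ \left( \prod_{i' \in \mathcal{N}_c(j)} \tilde{x}_{i'} \right)
    - 1 \right]
    \prod_{i' \in \mathcal{N}_c(j) \setminus \{i\}} \tilde{x}_{i'}
.\end{align*}
%
This expression is also used in the implementation sketches below.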

As the gradient of the code-constraint polynomial can attain very large values
in some cases, an additional step is introduced to ensure numerical stability:
in each iteration, the current estimate $\boldsymbol{s}$ is projected onto
$\left[-\eta, \eta\right]^n$ by a projection
$\Pi_\eta : \mathbb{R}^n \rightarrow \left[-\eta, \eta\right]^n$, where $\eta$
is a positive constant slightly larger than one, e.g., $\eta = 1.5$.
The resulting decoding process as described in \cite{proximal_paper} is
presented in Algorithm \ref{alg:proximal_decoding}.

\begin{algorithm}
\caption{Proximal decoding algorithm for an AWGN channel \cite{proximal_paper}.}
\label{alg:proximal_decoding}

\begin{algorithmic}
    \STATE $\boldsymbol{s} \leftarrow \boldsymbol{0}$
    \STATE \textbf{for} $K$ iterations \textbf{do}
    \STATE \hspace{5mm} $\boldsymbol{r} \leftarrow \boldsymbol{s} - \omega \left( \boldsymbol{s} - \boldsymbol{y} \right)$
    \STATE \hspace{5mm} $\boldsymbol{s} \leftarrow \Pi_\eta \left(\boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right) \right)$
    \STATE \hspace{5mm} $\hat{\boldsymbol{c}} \leftarrow \mathbbm{1}_{\left\{ \boldsymbol{s} \preceq 0 \right\}}$
    \STATE \hspace{5mm} \textbf{if} $\boldsymbol{H}\hat{\boldsymbol{c}}^\text{T} = \boldsymbol{0}$ \textbf{then}
    \STATE \hspace{10mm} \textbf{return} $\hat{\boldsymbol{c}}$
    \STATE \hspace{5mm} \textbf{end if}
    \STATE \textbf{end for}
    \STATE \textbf{return} $\hat{\boldsymbol{c}}$
\end{algorithmic}
\end{algorithm}
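
As a concrete reference, the following NumPy sketch (our own illustration,
not the implementation of \cite{proximal_paper}) combines Algorithm
\ref{alg:proximal_decoding} with the componentwise gradient given above:
%
\begin{verbatim}
import numpy as np

def grad_h(x, H):
    """Gradient of the code-constraint
    polynomial h."""
    g = 4 * x * (x**2 - 1)
    for row in H:
        idx = np.flatnonzero(row)
        p = np.prod(x[idx])
        for i in idx:
            g[i] += 2 * (p - 1) * \
                np.prod(x[idx[idx != i]])
    return g

def proximal_decode(y, H, gamma=0.05,
                    omega=0.05, eta=1.5, K=200):
    s = np.zeros_like(y)
    for _ in range(K):
        # r update, then s update + projection
        r = s - omega * (s - y)
        s = np.clip(r - gamma * grad_h(r, H),
                    -eta, eta)
        c_hat = (s <= 0).astype(int)
        if not np.any(H @ c_hat % 2):
            return c_hat, True    # converged
    return c_hat, False           # failure
\end{verbatim}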

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Improved Algorithm}

%%%%%%%%%%%%%%%%%%%%%
\subsection{Analysis of the Convergence Behavior}

In Fig. \ref{fig:fer vs ber}, the \textit{frame error rate} (FER),
\textit{bit error rate} (BER) and \textit{decoding failure rate} (DFR) of
proximal decoding are shown for an LDPC code with $n=204$ and $k=102$
\cite[204.33.484]{mackay}.
A decoding failure is defined as a decoding operation returning an invalid
codeword, i.e., as non-convergence of the algorithm.
The parameters chosen for this simulation are $\gamma=0.05, \omega=0.05,
\eta=1.5$ and $K=200$.
They were determined to offer the best performance in a preliminary examination,
in which the effect of changing multiple parameters was simulated over a wide
range of values.
It is apparent that the DFR completely dominates the FER above a certain SNR.
This means that most frame errors are not due to the algorithm converging
to the wrong codeword, but due to the algorithm not converging at all.

As proximal decoding is an optimization-based decoding method, one possible
explanation for this effect might be that during the decoding process, convergence
to the correct codeword is often not achieved, although the estimate is moving in
the right direction.
This would suggest that most frame errors occur due to only a few incorrectly
decoded bits.%
%
\begin{figure}[ht]
    \centering

    \begin{tikzpicture}
        \begin{axis}[
                grid=both,
                xlabel={$E_\text{b} / N_0$ (dB)}, ylabel={},
                ymode=log,
                xmin=1, xmax=8,
                ymax=1, ymin=1e-6,
                % ytick={1e-0, 1e-2, 1e-4, 1e-6},
                width=\figwidth,
                height=\figheight,
                legend pos = south west,
            ]
            \addplot+[FERPlot, mark=o, mark options={solid}, scol1]
                table [x=SNR, y=FER, col sep=comma,
                       discard if not={gamma}{0.05},
                       discard if gt={SNR}{9}]
                {res/proximal_ber_fer_dfr_20433484.csv};
            \addlegendentry{FER}
            \addplot+[BERPlot, mark=*, scol1]
                table [x=SNR, y=BER, col sep=comma,
                       discard if not={gamma}{0.05},
                       discard if gt={SNR}{7.5}]
                {res/proximal_ber_fer_dfr_20433484.csv};
            \addlegendentry{BER}
            \addplot+[DFRPlot, mark=square*, scol0]
                table [x=SNR, y=DFR, col sep=comma,
                       discard if not={gamma}{0.05},
                       discard if gt={SNR}{9}]
                {res/proximal_ber_fer_dfr_20433484.csv};
            \addlegendentry{DFR}
        \end{axis}
    \end{tikzpicture}

    \caption{FER, DFR, and BER for a $\left( 3, 6 \right)$-regular LDPC code with
        $n=204, k=102$ \cite[\text{204.33.484}]{mackay}.
        Parameters used for simulation: $\gamma =0.05, \omega = 0.05,
        \eta = 1.5, K=200$.
    }
    \label{fig:fer vs ber}
\end{figure}%
%

An approach for lowering the FER might then be to append an ``ML-in-the-list''
\cite{ml_in_the_list} step to the decoding process shown in Algorithm
\ref{alg:proximal_decoding}.
This step consists of determining the $N \in \mathbb{N}$ bits most likely to be
erroneous, generating all variations of the current estimate with those bits
modified, and performing ML decoding on this list.

This approach crucially relies on identifying the bits most likely to be
erroneous.
Therefore, the convergence properties of proximal decoding are investigated.
Considering (\ref{eq:r_update}) and (\ref{eq:s_update}), Fig.
\ref{fig:grad} shows the two gradients along which the minimization is
performed for a repetition code with $n=2$.
It is apparent that a net movement will result as long as the two gradients
have a common component.
As soon as this common component is exhausted, they will work in opposing
directions, resulting in an oscillation of the estimate.
This behavior supports the conjecture that the reason for the high DFR is a
failure to converge to the correct codeword in the final steps of the
optimization process.%
%
\begin{figure}[h]
    \centering

    \begin{tikzpicture}
        \begin{axis}[xmin = -1.25, xmax=1.25,
                ymin = -1.25, ymax=1.25,
                xlabel={$\tilde{x}_1$},
                ylabel={$\tilde{x}_2$},
                y label style={at={(axis description cs:-0.06,0.5)},anchor=south},
                width=\figwidth,
                height=\figheight,
                grid=major, grid style={dotted},
                view={0}{90}]
            \addplot3[point meta=\thisrow{grad_norm},
                point meta min=1,
                point meta max=2.5,
                quiver={u=\thisrow{grad_0},
                    v=\thisrow{grad_1},
                    scale arrows=.05,
                    every arrow/.append style={%
                        line width=.3
                        +\pgfplotspointmetatransformed/1000,
                        -{Latex[length=0pt 5,width=0pt 3]}
                    },
                },
                quiver/colored = {mapped color},
                -stealth,
                ]
                table[col sep=comma] {res/2d_grad_L.csv};
        \end{axis}
        \begin{axis}[hide axis,
                width=\figwidth,
                height=\figheight,
                xmin=10, xmax=50,
                ymin=0, ymax=0.4,
                legend style={draw=white!15!black,
                    legend cell align=left,
                    empty legend,
                    at={(0.9775,0.97)},anchor=north east}]
            \addlegendimage{mark=none}
            \addlegendentry{
                $\nabla L\left(\boldsymbol{y}
                \mid \tilde{\boldsymbol{x}}\right)$
            };
        \end{axis}
    \end{tikzpicture}

    \vspace{3mm}

    \begin{tikzpicture}
        \begin{axis}[xmin = -1.25, xmax=1.25,
                ymin = -1.25, ymax=1.25,
                width=\figwidth,
                height=\figheight,
                xlabel={$\tilde{x}_1$},
                ylabel={$\tilde{x}_2$},
                y label style={at={(axis description cs:-0.06,0.5)},anchor=south},
                grid=major, grid style={dotted},
                view={0}{90}]
            \addplot3[point meta=\thisrow{grad_norm},
                point meta min=1,
                point meta max=7,
                quiver={u=\thisrow{grad_0},
                    v=\thisrow{grad_1},
                    scale arrows=.03,
                    every arrow/.append style={%
                        line width=.5
                        +\pgfplotspointmetatransformed/1000,
                        -{Latex[length=0pt 5,width=0pt 3]}
                    },
                },
                quiver/colored = {mapped color},
                -stealth,
                ]
                table[col sep=comma] {res/2d_grad_h.csv};
        \end{axis}
        \begin{axis}[hide axis,
                width=\figwidth,
                height=\figheight,
                xmin=10, xmax=50,
                ymin=0, ymax=0.4,
                legend style={draw=white!15!black,
                    legend cell align=left,
                    empty legend,
                    at={(0.9775,0.97)},anchor=north east}]
            \addlegendimage{mark=none}
            \addlegendentry{$\nabla h\left(\tilde{\boldsymbol{x}}\right)$};
        \end{axis}
    \end{tikzpicture}

    \caption{Gradients
        $\nabla L\left(\boldsymbol{y} \mid \tilde{\boldsymbol{x}}\right)$
        and $\nabla h \left( \tilde{\boldsymbol{x}} \right)$ for a repetition
        code with $n=2$.
        Shown for $\boldsymbol{y} = \begin{bmatrix} -0.5 & 0.8 \end{bmatrix}$.
    }
    \label{fig:grad}
\end{figure}%
%

In Fig. \ref{fig:prox:convergence_large_n}, we consider only component
$\left(\tilde{\boldsymbol{x}}\right)_1$ of the estimate during a
decoding operation for the LDPC code also used for Fig. \ref{fig:fer vs ber}.
Two properties may be observed.
First, the average absolute values of the two gradients are equal;
however, they have opposing signs,
leading to the aforementioned oscillation.
Second, the gradient of the code-constraint polynomial itself starts to
oscillate after a certain number of iterations.%
%
\begin{figure}[ht]
    \centering

    \begin{tikzpicture}
        \begin{axis}[
                grid=both,
                xlabel={Iterations},
                width=\figwidth,
                height=\figheight,
                xtick={0, 100, ..., 400},
                xticklabels={0, 50, ..., 200},
                xmin=0, xmax=300,
                ymin=-4, ymax=2,
                ytick={-4,-3,...,2},
                legend pos = south east,
            ]
            \addplot+ [mark=none, line width=1]
                table [col sep=comma, x=k, y=comb_r_s_0,
                       discard if gt={k}{300}]
                {res/extreme_components_20433484_combined.csv};
            \addplot+ [mark=none, line width=1]
                table [col sep=comma, x=k, y=grad_L_0,
                       discard if gt={k}{300}]
                {res/extreme_components_20433484_combined.csv};
            \addplot+ [mark=none, line width=1]
                table [col sep=comma, x=k, y=grad_h_0,
                       discard if gt={k}{300}]
                {res/extreme_components_20433484_combined.csv};
            \addlegendentry{$\left(\tilde{\boldsymbol{x}}\right)_1$}
            \addlegendentry{$\left(\nabla L\right)_1$}
            \addlegendentry{$\left(\nabla h\right)_1$}
        \end{axis}
    \end{tikzpicture}

    \caption{Visualization of component $\left(\tilde{\boldsymbol{x}}\right)_1$
        for a decoding operation for a $\left(3,6\right)$-regular LDPC code with
        $n=204, k=102$ \cite[\text{204.33.484}]{mackay}.
        Parameters used for simulation: $\gamma = 0.05, \omega = 0.05,
        \eta = 1.5, E_\text{b}/N_0 = \SI{4}{dB}$.
    }
    \label{fig:prox:convergence_large_n}
\end{figure}%

%%%%%%%%%%%%%%%%%%%%%
\subsection{Improvement Using ``ML-in-the-List'' Step}

Considering the magnitude of the oscillation of the gradient of the
code-constraint polynomial, some interesting behavior may be observed.
Fig. \ref{fig:p_error} shows the probability that a component of the estimate
is wrong, determined through a Monte Carlo simulation, when the components of
$\hat{\boldsymbol{c}}$ are ordered from smallest to largest oscillation of
$\left(\nabla h\right)_i$.

The lower the magnitude of the oscillation, the higher the probability that the
corresponding bit was not decoded correctly.
This means that this magnitude is a suitable figure of merit for determining
the probability that a given component was decoded incorrectly.
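
One possible realization of this selection rule is sketched below (our own
illustration; here, the oscillation amplitude is measured as the peak-to-peak
excursion of $\left(\nabla h\right)_i$ over the last $T$ iterations, one
choice among several conceivable measures):
%
\begin{verbatim}
import numpy as np

def select_suspect_bits(grad_h_trace, N=8):
    """grad_h_trace: (T, n) array holding
    grad h over the last T iterations.
    Returns the N indices with the smallest
    oscillation amplitude."""
    amplitude = (grad_h_trace.max(axis=0)
                 - grad_h_trace.min(axis=0))
    return np.argsort(amplitude)[:N]
\end{verbatim}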
%
\begin{figure}[H]
    \centering

    \begin{tikzpicture}
        \begin{axis}[
                grid=both,
                ylabel=$P(\hat{c}_{i'} \ne c_{i'})$,
                xlabel=$i'$,
                ymode=log,
                ymin=1e-9, ymax=1e-5,
                xmin=0, xmax=200,
                width=\figwidth,
                height=\figheight,
            ]
            \addplot+ [scol0, mark=none, line width=1]
                table [col sep=comma, y=p_error]{res/p_error.csv};
        \end{axis}
    \end{tikzpicture}

    \caption{Probability that a component of the estimated codeword
        $\hat{\boldsymbol{c}}\in \mathbb{F}_2^n$ is erroneous for a
        $\left(3,6\right)$-regular LDPC code with $n=204, k=102$
        \cite[\text{204.33.484}]{mackay}.
        The indices $i'$ are ordered such that the amplitude of oscillation of
        $\left(\nabla h\right)_{i'}$ increases with $i'$.
        Parameters used for the simulation: $\gamma = 0.05, \omega = 0.05,
        \eta = 1.5, E_\text{b}/N_0 = \SI{4}{dB}$.
        Simulated over \num{100000000} Monte Carlo iterations using the
        all-zeros codeword.}
    \label{fig:p_error}
\end{figure}

The complete improved algorithm is given in Algorithm \ref{alg:improved}.
First, the proximal decoding algorithm is applied.
If a valid codeword has been reached, i.e., if the algorithm has converged,
we return this solution.
Otherwise, $N \in \mathbb{N}$ components are selected based on the criterion
presented above.
Starting from the most recent estimate $\hat{\boldsymbol{c}} \in \mathbb{F}_2^n$,
all $2^N$ variations with the selected components modified are then
generated and an ``ML-in-the-list'' step (Algorithm \ref{alg:ml-in-the-list})
is performed.

\begin{algorithm}
\caption{ML-in-the-list algorithm.}
\label{alg:ml-in-the-list}

\begin{algorithmic}
    \STATE Find valid codewords among $\left(\hat{\boldsymbol{c}}_{l}\right)_{l=1}^{2^N}$
    \STATE \textbf{if} no valid codewords exist \textbf{then}
    \STATE \hspace{5mm} Compute $\langle \hat{\boldsymbol{c}}_l, \hat{\boldsymbol{c}} \rangle$ for all variations $\hat{\boldsymbol{c}}_l$
    \STATE \textbf{else}
    \STATE \hspace{5mm} Compute $\langle \hat{\boldsymbol{c}}_l, \hat{\boldsymbol{c}} \rangle$ for the valid codewords only
    \STATE \textbf{end if}
    \STATE \textbf{return} $\hat{\boldsymbol{c}}_l$ with the highest $\langle \hat{\boldsymbol{c}}_l, \hat{\boldsymbol{c}} \rangle$
\end{algorithmic}
\end{algorithm}%
%
\begin{algorithm}
\caption{Improved proximal decoding algorithm.}
\label{alg:improved}

\begin{algorithmic}
    \STATE $\boldsymbol{s} \leftarrow \boldsymbol{0}$
    \STATE \textbf{for} $K$ iterations \textbf{do}
    \STATE \hspace{5mm} $\boldsymbol{r} \leftarrow \boldsymbol{s} - \omega \left( \boldsymbol{s} - \boldsymbol{y} \right)$
    \STATE \hspace{5mm} $\boldsymbol{s} \leftarrow \Pi_\eta \left(\boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right) \right)$
    \STATE \hspace{5mm} $\hat{\boldsymbol{c}} \leftarrow \mathbbm{1}_{\left\{ \boldsymbol{s} \preceq 0 \right\}}$
    \STATE \hspace{5mm} \textbf{if} $\boldsymbol{H}\hat{\boldsymbol{c}}^\text{T} = \boldsymbol{0}$ \textbf{then}
    \STATE \hspace{10mm} \textbf{return} $\hat{\boldsymbol{c}}$
    \STATE \hspace{5mm} \textbf{end if}
    \STATE \textbf{end for}
    \STATE $\textcolor{KITblue}{\text{Estimate the $N$ most likely wrong bit indices $\mathcal{I}_N = \{i_1,\ldots,i_N\}$}}$
    \STATE $\textcolor{KITblue}{\text{Generate candidate list $\left(\hat{\boldsymbol{c}}_{l}\right)_{l=1}^{2^N}$ by varying the bits in $\mathcal{I}_N$}}$\vspace{1mm}
    \STATE $\textcolor{KITblue}{\textbf{return ml\textunderscore in\textunderscore the\textunderscore list}\left(\left(\hat{\boldsymbol{c}}_l\right)_{l=1}^{2^N}\right)}$
\end{algorithmic}
\end{algorithm}
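
A compact sketch of this final list step (our own illustration;
\texttt{suspect\_idx} denotes the indices selected by the oscillation
criterion) is:
%
\begin{verbatim}
from itertools import product
import numpy as np

def ml_in_the_list(c_hat, suspect_idx, H):
    cands = []
    for bits in product((0, 1),
                        repeat=len(suspect_idx)):
        c = c_hat.copy()
        c[list(suspect_idx)] = bits
        cands.append(c)
    valid = [c for c in cands
             if not np.any(H @ c % 2)]
    pool = valid if valid else cands
    # Score by the inner product with the
    # estimate, as in Algorithm 2
    return max(pool,
               key=lambda c: int(c @ c_hat))
\end{verbatim}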

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Simulation Results \& Discussion}

Fig. \ref{fig:results} shows the FER and BER of proximal decoding as
presented in \cite{proximal_paper} and of the improved algorithm introduced
in this letter, applied to a $\left( 3,6 \right)$-regular LDPC
code with $n=204$ and $k=102$ \cite[204.33.484]{mackay}.
The parameters chosen for the simulation are
$\gamma = 0.05, \omega=0.05, \eta=1.5, K=200$.
Again, these parameters were chosen,%
%
\begin{figure}[ht]
    \centering

    \begin{tikzpicture}
        \begin{axis}[
                grid=both,
                xlabel={$E_\text{b} / N_0$ (dB)},
                ymode=log,
                xmin=1, xmax=8,
                ymax=1, ymin=1e-6,
                width=\figwidth,
                height=\figheight,
                legend columns=2,
                legend style={draw=white!15!black,
                    legend cell align=left,
                    at={(0.5,-0.44)},anchor=south}
            ]
            \addplot+[FERPlot, mark=o, mark options={solid}, scol1]
                table [x=SNR, y=FER, col sep=comma,
                       discard if not={gamma}{0.05},
                       discard if gt={SNR}{9}]
                {res/proximal_ber_fer_dfr_20433484.csv};
            \addlegendentry{FER, prox. dec.};
            \addplot+[BERPlot, mark=*, scol1]
                table [x=SNR, y=BER, col sep=comma,
                       discard if not={gamma}{0.05},
                       discard if gt={SNR}{7.5}]
                {res/proximal_ber_fer_dfr_20433484.csv};
            \addlegendentry{BER, prox. dec.};
            \addplot+[FERPlot, mark=triangle, mark options={solid}, scol2]
                table [x=SNR, y=FER, col sep=comma,
                       discard if not={gamma}{0.05},
                       discard if gt={SNR}{7.5}]
                {res/improved_ber_fer_dfr_20433484.csv};
            \addlegendentry{FER, improved};
            \addplot+[BERPlot, mark=triangle*, scol2]
                table [x=SNR, y=BER, col sep=comma,
                       discard if not={gamma}{0.05},
                       discard if gt={SNR}{6.5}]
                {res/improved_ber_fer_dfr_20433484.csv};
            \addlegendentry{BER, improved};
        \end{axis}
    \end{tikzpicture}

    \caption{FER and BER of proximal decoding \cite{proximal_paper} and the
        improved algorithm for a $\left( 3, 6 \right)$-regular LDPC code with
        $n=204, k=102$ \cite[\text{204.33.484}]{mackay}.
        Parameters used for simulation: $\gamma=0.05, \omega=0.05, \eta=1.5,
        K=200, N=8$.
    }
    \label{fig:results}
\end{figure}%
%
\noindent as a preliminary examination
showed that they provide the best results for proximal decoding as well as for
the improved algorithm.
All points were generated by simulating at least 100 frame errors.
The number $N$ of possibly wrong components was set to $8$,
since this provides a reasonable gain without requiring an unreasonable amount
of memory and computational resources (for $N=8$, the candidate list comprises
$2^8 = 256$ words).

A noticeable improvement can be observed in the FER as well as in the BER.
The gain varies significantly
with the SNR (which is to be expected, since with higher SNR values the number
of bit errors decreases, making the correction of those errors in the
``ML-in-the-list'' step more likely).
For an FER of $10^{-6}$, the gain is approximately $\SI{1}{dB}$.
Similar behavior can be observed with various other codes.
No immediate relationship between the code length and the gain was observed
during our examinations.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Conclusion}

In this letter, an improvement on proximal decoding as presented by
Wadayama \textit{et al.} \cite{proximal_paper} is introduced for AWGN channels.
It relies on the fact that most errors observed in proximal decoding stem
from only a few components of the estimate being wrong.
These few erroneous components can mostly be corrected by appending an
additional step to the original algorithm that is only executed if the
algorithm has not converged.
A gain of up to $\SI{1}{dB}$ can be observed, depending on the code,
the parameters considered, and the SNR.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Acknowledgment}

This work has received funding in part from the European Research Council
(ERC) under the European Union's Horizon 2020 research and innovation
programme (grant agreement No. 101001899) and in part from the German Federal
Ministry of Education and Research (BMBF) within the project Open6GHub
(grant agreement 16KISK010).

%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Bibliography
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%

\begin{thebibliography}{10}
\bibliographystyle{IEEEtran}

\bibitem{ADMM}
S. Barman, X. Liu, S. C. Draper, and B. Recht, ``Decomposition methods for large scale LP decoding,'' in IEEE Transactions on Information Theory, vol. 59, no. 12, pp. 7870-7886, Dec. 2013, doi: 10.1109/TIT.2013.2281372.

\bibitem{feldman_paper}
J. Feldman, M. J. Wainwright, and D. R. Karger, ``Using linear programming to decode binary linear codes,'' in IEEE Transactions on Information Theory, vol. 51, no. 3, pp. 954-972, March 2005, doi: 10.1109/TIT.2004.842696.

\bibitem{ml_in_the_list}
M. Geiselhart, A. Elkelesh, M. Ebada, S. Cammerer, and S. ten Brink, ``Automorphism ensemble decoding of Reed--Muller codes,'' in IEEE Transactions on Communications, vol. 69, no. 10, pp. 6424-6438, Oct. 2021, doi: 10.1109/TCOMM.2021.3098798.

\bibitem{mackay99}
D. J. C. MacKay, ``Good error-correcting codes based on very sparse matrices,'' in IEEE Transactions on Information Theory, vol. 45, no. 2, pp. 399-431, March 1999, doi: 10.1109/18.748992.

\bibitem{mackay}
D. J. C. MacKay, ``Encyclopedia of sparse graph codes [online],''
Available: http://www.inference.phy.cam.ac.uk/mackay/codes/data.html

\bibitem{proximal_algorithms}
N. Parikh and S. Boyd, ``Proximal algorithms,'' Found. Trends Optim., vol. 1, no. 3, pp. 127-239, Jan. 2014.

\bibitem{channel_codes_book}
W. Ryan and S. Lin, Channel Codes: Classical and Modern, Cambridge, UK: Cambridge University Press, 2009, pp. 651-670.

\bibitem{adaptive_lp_decoding}
M. H. Taghavi and P. H. Siegel, ``Adaptive linear programming decoding,'' in Proc. 2006 IEEE International Symposium on Information Theory, Seattle, WA, USA, 2006, pp. 1374-1378, doi: 10.1109/ISIT.2006.262071.

\bibitem{interior_point_decoding}
P. O. Vontobel, ``Interior-point algorithms for linear-programming decoding,'' in Proc. 2008 Information Theory and Applications Workshop, San Diego, CA, USA, 2008, pp. 433-437, doi: 10.1109/ITA.2008.4601085.

\bibitem{proximal_paper}
T. Wadayama and S. Takabe, ``Proximal decoding for LDPC codes,'' IEICE Transactions on Fundamentals of Electronics, Communications and Computer Sciences, vol. advpub, 2022TAP0002, 2022.

\end{thebibliography}

\end{document}