From aefb6cbae2af392e5dd2226bf8414c5ccce1eb65 Mon Sep 17 00:00:00 2001 From: Andreas Tsouchlos Date: Sun, 9 Apr 2023 18:07:18 +0200 Subject: [PATCH] Implemented corrections; Changed lp dec figure text scaling --- latex/thesis/chapters/lp_dec_using_admm.tex | 54 ++++++++----------- latex/thesis/chapters/proximal_decoding.tex | 46 +++++++--------- .../chapters/theoretical_background.tex | 42 ++++++++++----- 3 files changed, 72 insertions(+), 70 deletions(-) diff --git a/latex/thesis/chapters/lp_dec_using_admm.tex b/latex/thesis/chapters/lp_dec_using_admm.tex index 6811531..4fc3695 100644 --- a/latex/thesis/chapters/lp_dec_using_admm.tex +++ b/latex/thesis/chapters/lp_dec_using_admm.tex @@ -18,7 +18,7 @@ To solve the resulting linear program, various optimization methods can be used (see for example \cite{alp}, \cite{interior_point}, \cite{efficient_lp_dec_admm}, \cite{pdd}). -They begin by looking at the \ac{ML} decoding problem% +Feldman et al. begin by looking at the \ac{ML} decoding problem% \footnote{They assume that all codewords are equally likely to be transmitted, making the \ac{ML} and \ac{MAP} decoding problems equivalent.}% % @@ -40,7 +40,7 @@ of the \acp{LLR} $\gamma_i$ \cite[Sec. 2.5]{feldman_thesis}:% {f_{Y_i | C_i} \left( y_i \mid c_i = 1 \right) } \right) .\end{align*} % -The authors propose the following cost function% +The authors propose using the following cost function% \footnote{In this context, \textit{cost function} and \textit{objective function} have the same meaning.} for the \ac{LP} decoding problem:% @@ -51,7 +51,7 @@ for the \ac{LP} decoding problem:% .\end{align*} % With this cost function, the exact integer linear program formulation of \ac{ML} -decoding becomes the following:% +decoding becomes% % \begin{align*} \text{minimize }\hspace{2mm} & \boldsymbol{\gamma}^\text{T}\boldsymbol{c} \\ @@ -65,7 +65,7 @@ As solving integer linear programs is generally NP-hard, this decoding problem has to be approximated by a problem with looser constraints. A technique called \textit{relaxation} is applied: relaxing the constraints, thereby broadening the considered domain -(e.g. by lifting the integer requirement). +(e.g., by lifting the integer requirement). First, the authors present an equivalent \ac{LP} formulation of exact \ac{ML} decoding, redefining the constraints in terms of the \text{codeword polytope} % @@ -82,10 +82,10 @@ This corresponds to simply lifting the integer requirement. However, since the number of constraints needed to characterize the codeword polytope is exponential in the code length, this formulation is relaxed further. By observing that each check node defines its own local single parity-check -code, and thus its own \textit{local codeword polytope}, +code, and, thus, its own \textit{local codeword polytope}, the \textit{relaxed codeword polytope} $\overline{Q}$ is defined as the intersection of all local codeword polytopes. -This consideration leads to constraints, that can be described as follows +This consideration leads to constraints that can be described as follows \cite[Sec. 
II, A]{efficient_lp_dec_admm}:% % \begin{align*} @@ -93,10 +93,10 @@ This consideration leads to constraints, that can be described as follows \hspace{5mm}\forall j\in \mathcal{J} ,\end{align*}% % -where $\mathcal{P}_{d_j}$ is the \textit{check polytope}, the convex hull of all +where $\mathcal{P}_{d_j}$ is the \textit{check polytope}, i.e., the convex hull of all binary vectors of length $d_j$ with even parity% \footnote{Essentially $\mathcal{P}_{d_j}$ is the set of vectors that satisfy -parity-check $j$, but extended to the continuous domain.}% +parity-check $j$, but extended to the continuous domain.}, and $\boldsymbol{T}_j$ is the \textit{transfer matrix}, which selects the neighboring variable nodes of check node $j$ (i.e., the relevant components of $\tilde{\boldsymbol{c}}$ @@ -139,7 +139,7 @@ and has only two possible codewords: .\end{align*} % Figure \ref{fig:lp:poly:exact_ilp} shows the domain of exact \ac{ML} decoding. -The first relaxation, onto the codeword polytope $\text{poly}\left( \mathcal{C} \right) $, +The first relaxation onto the codeword polytope $\text{poly}\left( \mathcal{C} \right) $ is shown in figure \ref{fig:lp:poly:exact}; this expresses the constraints for the equivalent linear program to exact \ac{ML} decoding. $\text{poly}\left( \mathcal{C} \right) $ is further relaxed onto the relaxed codeword polytope @@ -169,7 +169,7 @@ local codeword polytopes of each check node. draw, circle, inner sep=0pt, minimum size=4pt] \tdplotsetmaincoords{60}{25} - \begin{tikzpicture}[scale=0.9, transform shape, tdplot_main_coords] + \begin{tikzpicture}[scale=0.9, tdplot_main_coords] % Cube \coordinate (p000) at (0, 0, 0); @@ -226,7 +226,7 @@ local codeword polytopes of each check node. draw, circle, inner sep=0pt, minimum size=4pt] \tdplotsetmaincoords{60}{25} - \begin{tikzpicture}[scale=0.9, transform shape, tdplot_main_coords] + \begin{tikzpicture}[scale=0.9, tdplot_main_coords] % Cube \coordinate (p000) at (0, 0, 0); @@ -290,7 +290,7 @@ local codeword polytopes of each check node. draw, circle, inner sep=0pt, minimum size=4pt] \tdplotsetmaincoords{60}{25} - \begin{tikzpicture}[scale=0.9, transform shape, tdplot_main_coords] + \begin{tikzpicture}[scale=0.9, tdplot_main_coords] % Cube \coordinate (p000) at (0, 0, 0); @@ -342,7 +342,7 @@ local codeword polytopes of each check node. % Polytope Annotations \node[color=KITblue, below=0cm of c000] {$\left( 0, 0, 0 \right) $}; - \node[color=KITblue, right=0.17cm of c101] {$\left( 1, 0, 1 \right) $}; + \node[color=KITblue, right=0.07cm of c101] {$\left( 1, 0, 1 \right) $}; \node[color=KITblue, right=0cm of c110] {$\left( 1, 1, 0 \right) $}; \node[color=KITblue, above=0cm of c011] {$\left( 0, 1, 1 \right) $}; \end{tikzpicture} @@ -354,7 +354,7 @@ local codeword polytopes of each check node. draw, circle, inner sep=0pt, minimum size=4pt] \tdplotsetmaincoords{60}{25} - \begin{tikzpicture}[scale=0.9, transform shape, tdplot_main_coords] + \begin{tikzpicture}[scale=0.9, tdplot_main_coords] % Cube \coordinate (p000) at (0, 0, 0); @@ -438,7 +438,7 @@ local codeword polytopes of each check node. draw, circle, inner sep=0pt, minimum size=4pt] \tdplotsetmaincoords{60}{25} - \begin{tikzpicture}[scale=0.9, transform shape, tdplot_main_coords] + \begin{tikzpicture}[scale=0.9, tdplot_main_coords] % Cube \coordinate (p000) at (0, 0, 0); @@ -483,7 +483,7 @@ local codeword polytopes of each check node. 
\node[color=KITblue, below=0cm of c000] {$\left( 0, 0, 0 \right) $}; \node[color=KITblue, above=0cm of c011] {$\left( 0, 1, 1 \right) $}; - \node[color=KITred, right=0.03cm of cpseudo] + \node[color=KITred, right=0cm of cpseudo] {$\left( 1, \frac{1}{2}, \frac{1}{2} \right) $}; \end{tikzpicture} @@ -607,7 +607,7 @@ The steps to solve the dual problem then become: \hspace{3mm} &&\forall j\in\mathcal{J} .\end{alignat*} % -Luckily, the additional constaints only affect the $\boldsymbol{z}_j$-update steps. +Luckily, the additional constraints only affect the $\boldsymbol{z}_j$-update steps. Furthermore, the $\boldsymbol{z}_j$-update steps can be shown to be equivalent to projections onto the check polytopes $\mathcal{P}_{d_j}$ and the $\tilde{\boldsymbol{c}}$-update can be computed analytically% @@ -658,22 +658,19 @@ $\boldsymbol{\lambda}_j = \mu \cdot \boldsymbol{u}_j \,\forall\,j\in\mathcal{J}$ .\end{alignat*} % - The reason \ac{ADMM} is able to perform so well is due to the relocation of the constraints $\boldsymbol{T}_j\tilde{\boldsymbol{c}}_j\in\mathcal{P}_{d_j}\,\forall\, j\in\mathcal{J}$ into the objective function itself. The minimization of the new objective function can then take place simultaneously with respect to all $\boldsymbol{z}_j, j\in\mathcal{J}$. -Effectively, all of the $\left|\mathcal{J}\right|$ parity constraints are -able to be handled at the same time. +Effectively, all of the $\left|\mathcal{J}\right|$ parity constraints can be +handled at the same time. This can also be understood by interpreting the decoding process as a message-passing algorithm \cite[Sec. III. D.]{original_admm}, \cite[Sec. II. B.]{efficient_lp_dec_admm}, -as is shown in figure \ref{fig:lp:message_passing}.% -% -\begin{figure}[H] - \centering - - \begin{genericAlgorithm}[caption={}, label={}, +depicted in algorithm \ref{alg:admm}. + +\begin{genericAlgorithm}[caption={\ac{LP} decoding using \ac{ADMM} interpreted + as a message passing algorithm\protect\footnotemark{}}, label={alg:admm}, basicstyle=\fontsize{11}{16}\selectfont ] Initialize $\tilde{\boldsymbol{c}}, \boldsymbol{z}_{[1:m]}$ and $\boldsymbol{u}_{[1:m]}$ @@ -694,11 +691,6 @@ while $\sum_{j\in\mathcal{J}} \lVert \boldsymbol{T}_j\tilde{\boldsymbol{c}} - \b end for end while \end{genericAlgorithm} - - \caption{\ac{LP} decoding using \ac{ADMM} interpreted as a message passing algorithm% - \protect\footnotemark{}} - \label{fig:lp:message_passing} -\end{figure}% % \footnotetext{$\epsilon_{\text{pri}} > 0$ and $\epsilon_{\text{dual}} > 0$ are additional parameters diff --git a/latex/thesis/chapters/proximal_decoding.tex b/latex/thesis/chapters/proximal_decoding.tex index 7002f20..8c5d7e6 100644 --- a/latex/thesis/chapters/proximal_decoding.tex +++ b/latex/thesis/chapters/proximal_decoding.tex @@ -13,7 +13,7 @@ Finally, an improvement on proximal decoding is proposed. \section{Decoding Algorithm}% \label{sec:prox:Decoding Algorithm} -Proximal decoding was proposed by Wadayama et. al as a novel formulation of +Proximal decoding was proposed by Wadayama et al. as a novel formulation of optimization-based decoding \cite{proximal_paper}. With this algorithm, minimization is performed using the proximal gradient method. 
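Returning briefly to \ac{LP} decoding, the sketch below spells out the loop of
algorithm \ref{alg:admm} in Python (wrapped in a plain \texttt{lstlisting}
block, assuming the \texttt{listings} package behind the algorithm
environments). It uses the scaled dual variables
$\boldsymbol{u}_j = \boldsymbol{\lambda}_j / \mu$, the analytic, clipped
$\tilde{\boldsymbol{c}}$-update, and a check-polytope projection in the spirit
of the cut-search approach from the literature; all names and default
parameter values (\texttt{H}, \texttt{gamma}, \texttt{mu}, the tolerances) are
illustrative rather than taken from the cited implementations.

\begin{lstlisting}[language=Python]
import numpy as np

def project_check_polytope(v, tol=1e-9):
    # Euclidean projection onto the check polytope P_d (even-parity polytope).
    # Sketch: clip to the unit box, identify the single odd-set inequality that
    # may be violated, and if it is, project onto that facet by bisection.
    v = np.asarray(v, dtype=float)
    x = np.clip(v, 0.0, 1.0)
    theta = (x > 0.5).astype(int)
    if theta.sum() % 2 == 0:                      # make the selected set odd
        i = np.argmin(np.abs(x - 0.5))
        theta[i] ^= 1
    w = 2.0 * theta - 1.0                         # +1 inside the set, -1 outside
    r = theta.sum() - 1.0
    if w @ x <= r + tol:                          # box projection already feasible
        return x
    lo, hi = 0.0, np.max(np.abs(v)) + 1.0         # bisection on the multiplier
    for _ in range(60):
        beta = 0.5 * (lo + hi)
        x = np.clip(v - beta * w, 0.0, 1.0)
        if w @ x > r:
            lo = beta
        else:
            hi = beta
    return x

def admm_lp_decode(H, gamma, mu=3.0, eps_pri=1e-5, eps_dual=1e-5, max_iter=200):
    # LP decoding with ADMM, mirroring algorithm alg:admm (illustrative sketch).
    H = np.asarray(H, dtype=int)
    gamma = np.asarray(gamma, dtype=float)
    m, n = H.shape
    nbrs = [np.flatnonzero(H[j]) for j in range(m)]   # variable nodes of check j
    deg = H.sum(axis=0).astype(float)                 # degree of each variable node
    c = np.full(n, 0.5)
    z = [np.full(len(nbrs[j]), 0.5) for j in range(m)]
    u = [np.zeros(len(nbrs[j])) for j in range(m)]
    for _ in range(max_iter):
        # analytic variable-node update, clipped to [0, 1]
        acc = np.zeros(n)
        for j in range(m):
            acc[nbrs[j]] += z[j] - u[j]
        c = np.clip((acc - gamma / mu) / deg, 0.0, 1.0)
        # check-node updates: projection onto the check polytopes + dual update
        pri, dual = 0.0, 0.0
        for j in range(m):
            cj, z_old = c[nbrs[j]], z[j]
            z[j] = project_check_polytope(cj + u[j])
            u[j] += cj - z[j]
            pri += np.sum((cj - z[j]) ** 2)
            dual += np.sum((z[j] - z_old) ** 2)
        if pri < eps_pri and dual < eps_dual:         # residual-based stopping rule
            break
    return (c > 0.5).astype(int)                      # hard decision
\end{lstlisting}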
@@ -83,7 +83,7 @@ The prior \ac{PDF} is then approximated using the code-constraint polynomial as: \label{eq:prox:prior_pdf_approx} .\end{align}% % -The authors justify this approximation by arguing, that for +The authors justify this approximation by arguing that for $\gamma \rightarrow \infty$, the approximation in equation (\ref{eq:prox:prior_pdf_approx}) approaches the original function in equation (\ref{eq:prox:prior_pdf}). @@ -97,10 +97,9 @@ $L \left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right) = -\ln\left( \hat{\boldsymbol{x}} &= \argmax_{\tilde{\boldsymbol{x}} \in \mathbb{R}^{n}} \mathrm{e}^{- L\left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right) } \mathrm{e}^{-\gamma h\left( \tilde{\boldsymbol{x}} \right) } \\ - &= \argmin_{\tilde{\boldsymbol{x}} \in \mathbb{R}^n} \big( + &= \argmin_{\tilde{\boldsymbol{x}} \in \mathbb{R}^n} L\left( \boldsymbol{y} \mid \tilde{\boldsymbol{x}} \right) - + \gamma h\left( \tilde{\boldsymbol{x}} \right) - \big)% + + \gamma h\left( \tilde{\boldsymbol{x}} \right)% .\end{align*}% % Thus, with proximal decoding, the objective function @@ -148,13 +147,13 @@ It is then immediately approximated with gradient-descent:% \begin{align*} \textbf{prox}_{\gamma h} \left( \tilde{\boldsymbol{x}} \right) &\equiv \argmin_{\boldsymbol{t} \in \mathbb{R}^n} - \left( \gamma h\left( \boldsymbol{t} \right) + - \frac{1}{2} \lVert \boldsymbol{t} - \tilde{\boldsymbol{x}} \rVert \right)\\ + \gamma h\left( \boldsymbol{t} \right) + + \frac{1}{2} \left\Vert \boldsymbol{t} - \tilde{\boldsymbol{x}} \right\Vert \\ &\approx \tilde{\boldsymbol{x}} - \gamma \nabla h \left( \tilde{\boldsymbol{x}} \right), \hspace{5mm} \gamma > 0, \text{ small} .\end{align*}% % -The second step thus becomes% +The second optimization step thus becomes% % \begin{align*} \boldsymbol{s} \leftarrow \boldsymbol{r} - \gamma \nabla h\left( \boldsymbol{r} \right), @@ -228,13 +227,11 @@ where $\eta$ is a positive constant slightly larger than one:% $\Pi_{\eta}\left( \cdot \right) $ expressing the projection onto $\left[ -\eta, \eta \right]^n$. -The iterative decoding process resulting from these considerations is shown in -figure \ref{fig:prox:alg}. +The iterative decoding process resulting from these considerations is +summarized in algorithm \ref{alg:prox}. -\begin{figure}[H] - \centering - - \begin{genericAlgorithm}[caption={}, label={}] +\begin{genericAlgorithm}[caption={Proximal decoding algorithm for an \ac{AWGN} channel}, + label={alg:prox}] $\boldsymbol{s} \leftarrow \boldsymbol{0}$ for $K$ iterations do $\boldsymbol{r} \leftarrow \boldsymbol{s} - \omega \left( \boldsymbol{s} - \boldsymbol{y} \right) $ @@ -245,12 +242,7 @@ for $K$ iterations do end if end for return $\boldsymbol{\hat{c}}$ - \end{genericAlgorithm} - - - \caption{Proximal decoding algorithm for an \ac{AWGN} channel} - \label{fig:prox:alg} -\end{figure} +\end{genericAlgorithm} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -425,8 +417,7 @@ while the newly generated ones are shown with dashed lines. ($\gamma = 0.05$) the decoding performance is better than for low ($\gamma = 0.01$) or high ($\gamma = 0.15$) values. The question arises if there is some optimal value maximazing the decoding -performance, especially since the decoding performance seems to dramatically -depend on $\gamma$. +performance, especially since it seems to dramatically depend on $\gamma$. 
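Before examining the influence of $\gamma$ further, it is useful to see the
iteration of algorithm \ref{alg:prox} written out as code: a gradient step on
the negative log-likelihood of the \ac{AWGN} channel, the approximate proximal
step, and the projection onto $\left[ -\eta, \eta \right]^n$. The sketch below
assumes a caller-supplied gradient \texttt{grad\_h} of the code-constraint
polynomial and the bipolar convention $c_i = 0 \leftrightarrow x_i = +1$ for
the hard decision; the default parameter values are only placeholders.

\begin{lstlisting}[language=Python]
import numpy as np

def proximal_decode(y, H, grad_h, omega=0.05, gamma=0.05, eta=1.5, K=100):
    # Sketch of the proximal decoding iteration of algorithm alg:prox.
    # grad_h(r) is assumed to return the gradient of the code-constraint
    # polynomial h at r; omega, gamma, eta, K as in the text.
    y = np.asarray(y, dtype=float)
    H = np.asarray(H, dtype=int)
    s = np.zeros_like(y)
    c_hat = (s < 0).astype(int)
    for _ in range(K):
        # gradient step on the negative log-likelihood of the AWGN channel
        r = s - omega * (s - y)
        # approximate proximal step on gamma*h, then projection onto [-eta, eta]^n
        s = np.clip(r - gamma * grad_h(r), -eta, eta)
        # hard decision under the bipolar mapping c_i = 0 <-> x_i = +1
        c_hat = (s < 0).astype(int)
        if not np.any((H @ c_hat) % 2):              # all parity checks satisfied
            break
    return c_hat
\end{lstlisting}

A decoding failure, as counted by the decoding failure rate discussed below,
corresponds to this loop finishing all $K$ iterations without the stopping
criterion ever being met.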
To better understand how $\gamma$ and the decoding performance are related, figure \ref{fig:prox:results} was recreated, but with a considerably larger selection of values for $\gamma$. @@ -814,22 +805,23 @@ Summarizing the above considerations, \ldots \end{axis} \end{tikzpicture} - \caption{Cmoparison\protect\footnotemark{} of \ac{FER}, \ac{BER} and - decoding failure rate; $\omega = 0.05, K=100$} + \caption{Comparison of \ac{FER}, \ac{BER} and decoding failure rate\protect\footnotemark{}} \label{fig:prox:ber_fer_dfr} \end{figure}% % -\footnotetext{(3,6) regular LDPC code with n = 204, k = 102 \cite[\text{204.33.484}]{mackay_enc}}% +\footnotetext{(3,6) regular LDPC code with n = 204, k = 102 + \cite[\text{204.33.484}]{mackay_enc}; $\omega = 0.05, K=100, \eta=1.5$ +}% % -Until now, only the \ac{BER} has been considered to assess the decoding +Until now, only the \ac{BER} has been considered to gauge the decoding performance. The \ac{FER}, however, shows considerably worse behaviour, as can be seen in figure \ref{fig:prox:ber_fer_dfr}. Besides the \ac{BER} and \ac{FER} curves, the figure also shows the \textit{decoding failure rate}. This is the rate at which the iterative process produces invalid codewords, -i.e., the stopping criterion (line 6 of algorithm \ref{TODO}) is never +i.e., the stopping criterion (line 6 of algorithm \ref{alg:prox}) is never satisfied and the maximum number of itertations $K$ is reached without converging to a valid codeword. Three lines are plotted in each case, corresponding to different values of diff --git a/latex/thesis/chapters/theoretical_background.tex b/latex/thesis/chapters/theoretical_background.tex index ee1d77a..796c25c 100644 --- a/latex/thesis/chapters/theoretical_background.tex +++ b/latex/thesis/chapters/theoretical_background.tex @@ -316,10 +316,10 @@ $g : \mathbb{R}^n \rightarrow \mathbb{R} $ must be minimized under certain const ,\end{align*}% % where $D \subseteq \mathbb{R}^n$ is the domain of values attainable for $\tilde{\boldsymbol{c}}$ -and represents the constraints. +and represents the constraints under which the minimization is to take place. In contrast to the established message-passing decoding algorithms, -the prespective then changes from observing the decoding process in its +the perspective then changes from observing the decoding process in its Tanner graph representation with \acp{VN} and \acp{CN} (as shown in figure \ref{fig:dec:tanner}) to a spatial representation (figure \ref{fig:dec:spatial}), where the codewords are some of the edges of a hypercube. @@ -495,8 +495,8 @@ interpreted componentwise.} A technique called \textit{lagrangian relaxation} \cite[Sec. 11.4]{intro_to_lin_opt_book} can then be applied. First, some of the constraints are moved into the objective function itself -and the weights $\boldsymbol{\lambda}$ are introduced. A new, relaxed problem -is formulated: +and weights $\boldsymbol{\lambda}$ are introduced. A new, relaxed problem +is then formulated as % \begin{align} \begin{aligned} @@ -555,7 +555,8 @@ and (\ref{eq:theo:admm_standard}) have the same value. 
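The lower-bound property can be made tangible with a small numerical
experiment. The two-variable problem below is purely hypothetical and chosen
so that the primal optimum is known by inspection; the inner minimization is
approximated by a grid search over a bounded box, which still produces valid
lower bounds because the box contains the feasible set.

\begin{lstlisting}[language=Python]
import numpy as np

# Hypothetical toy problem: minimize x1 + 2*x2 subject to x1 + x2 = 1, x >= 0.
# By inspection the optimum is x = (1, 0) with value 1.
c = np.array([1.0, 2.0])
primal_opt = 1.0

def dual_value(lam, box=5.0, steps=501):
    # q(lam) = min_{x >= 0} [ c^T x + lam * (1 - x1 - x2) ],
    # approximated by a grid search over [0, box]^2
    grid = np.linspace(0.0, box, steps)
    X1, X2 = np.meshgrid(grid, grid)
    L = c[0] * X1 + c[1] * X2 + lam * (1.0 - X1 - X2)
    return L.min()

for lam in [0.0, 0.5, 1.0, 1.5]:
    print(f"lambda = {lam:3.1f}:  q = {dual_value(lam):6.2f}  <=  {primal_opt}")
# Every value is a lower bound on the primal optimum; the tightest one
# (lambda = 1) attains it, which motivates searching for the best lambda.
\end{lstlisting}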
Thus, we can define the \textit{dual problem} as the search for the tightest lower bound:% % \begin{align} - \text{maximize }\hspace{2mm} & \min_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L} + \underset{\boldsymbol{\lambda}}{\text{maximize }}\hspace{2mm} + & \min_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L} \left( \boldsymbol{x}, \boldsymbol{\lambda} \right) \label{eq:theo:dual} ,\end{align} @@ -565,7 +566,7 @@ from the solution $\boldsymbol{\lambda}_\text{opt}$ to problem (\ref{eq:theo:dua by computing \cite[Sec. 2.1]{admm_distr_stats}% % \begin{align} - \boldsymbol{x}_{\text{opt}} = \argmin_{\boldsymbol{x}} + \boldsymbol{x}_{\text{opt}} = \argmin_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda}_{\text{opt}} \right) \label{eq:theo:admm_obtain_primal} .\end{align} @@ -584,7 +585,14 @@ using gradient descent \cite[Sec. 2.1]{admm_distr_stats}:% \hspace{5mm} \alpha > 0 .\end{align*} % -The algorithm can be improved by observing that when the objective function is separable in $\boldsymbol{x}$, the lagrangian is as well: +The algorithm can be improved by observing that when the objective function +$g: \mathbb{R}^n \rightarrow \mathbb{R}$ is separable into a number +$N \in \mathbb{N}$ of sub-functions +$g_i: \mathbb{R}^{n_i} \rightarrow \mathbb{R}$, +i.e., $g\left( \boldsymbol{x} \right) = \sum_{i=1}^{N} g_i +\left( \boldsymbol{x}_i \right)$, +where $\boldsymbol{x}_i,\hspace{1mm} i\in [1:N]$ are subvectors of +$\boldsymbol{x}$, the lagrangian is as well: % \begin{align*} \text{minimize }\hspace{5mm} & \sum_{i=1}^{N} g_i\left( \boldsymbol{x}_i \right) \\ @@ -598,8 +606,18 @@ The algorithm can be improved by observing that when the objective function is s - \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x_i} \right) .\end{align*}% % -The minimization of each term can then happen in parallel, in a distributed fasion -\cite[Sec. 2.2]{admm_distr_stats}. +The matrices $\boldsymbol{A}_i, \hspace{1mm} i \in [1:N]$ are partitions of +the matrix $\boldsymbol{A}$, corresponding to +$\boldsymbol{A} = \begin{bmatrix} + \boldsymbol{A}_1 & + \ldots & + \boldsymbol{A}_N +\end{bmatrix}$. +The minimization of each term can then happen in parallel, in a distributed +fashion \cite[Sec. 2.2]{admm_distr_stats}. +In each minimization step, only one subvector $\boldsymbol{x}_i$ of +$\boldsymbol{x}$ is considered, regarding all other subvectors as being +constant. This modified version of dual ascent is called \textit{dual decomposition}: % \begin{align*} @@ -616,7 +634,7 @@ This modified version of dual ascent is called \textit{dual decomposition}: The \ac{ADMM} works the same way as dual decomposition. It only differs in the use of an \textit{augmented lagrangian} $\mathcal{L}_\mu\left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda} \right)$ -in order to robustify the convergence properties. +in order to strengthen the convergence properties. 
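Before turning to the augmented lagrangian, the following sketch illustrates
dual decomposition on a hypothetical separable quadratic toy problem. Each
block $\boldsymbol{x}_i$ is minimized independently (and could be handled in
parallel), after which $\boldsymbol{\lambda}$ is moved along the residual of
the coupling constraint; the problem data and the step-size rule are
illustrative only.

\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical separable toy problem:
#   minimize  sum_i 0.5*||x_i - a_i||^2   subject to   sum_i A_i x_i = b
N, ni, m = 3, 4, 2                                 # blocks, block size, constraints
A = [rng.standard_normal((m, ni)) for _ in range(N)]
a = [rng.standard_normal(ni) for _ in range(N)]
b = rng.standard_normal(m)

M = sum(A[i] @ A[i].T for i in range(N))
alpha = 1.0 / np.linalg.norm(M, 2)                 # conservative dual step size
lam = np.zeros(m)

for _ in range(500):
    # independent block minimizations of the lagrangian:
    #   x_i = argmin 0.5*||x_i - a_i||^2 - lam^T A_i x_i  =  a_i + A_i^T lam
    x = [a[i] + A[i].T @ lam for i in range(N)]
    # gradient step on the dual variable along the coupling-constraint residual
    residual = b - sum(A[i] @ x[i] for i in range(N))
    lam = lam + alpha * residual

print("constraint violation:", np.linalg.norm(residual))
\end{lstlisting}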
The augmented lagrangian extends the ordinary one with an additional penalty term
with the penalty parameter $\mu$:
%
\begin{align*}
    \mathcal{L}_\mu\left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda} \right)
    = \underbrace{\sum_{i=1}^{N} g_i\left( \boldsymbol{x_i} \right)
    + \boldsymbol{\lambda}^\text{T}\left( \boldsymbol{b}
    - \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right)}_{\text{Ordinary lagrangian}}
- + \underbrace{\frac{\mu}{2}\lVert \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i
- - \boldsymbol{b} \rVert_2^2}_{\text{Penalty term}},
+ + \underbrace{\frac{\mu}{2}\left\Vert \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i
+ - \boldsymbol{b} \right\Vert_2^2}_{\text{Penalty term}},
    \hspace{5mm} \mu > 0
.\end{align*}
%
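For comparison with the dual-decomposition sketch above, the following toy
example applies \ac{ADMM} with the augmented lagrangian to a hypothetical
two-block problem of the same form: each block update minimizes
$\mathcal{L}_\mu$ with the other block and $\boldsymbol{\lambda}$ held fixed,
and the dual update follows the constraint residual that also appears in the
penalty term. The problem data and the choice $\mu = 1$ are illustrative.

\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(1)

# Hypothetical two-block toy problem:
#   minimize  0.5*||x1 - a1||^2 + 0.5*||x2 - a2||^2
#   subject to  A1 x1 + A2 x2 = b
n1, n2, m = 4, 3, 2
A1, A2 = rng.standard_normal((m, n1)), rng.standard_normal((m, n2))
a1, a2 = rng.standard_normal(n1), rng.standard_normal(n2)
b = rng.standard_normal(m)

mu = 1.0                          # penalty parameter of the augmented lagrangian
lam = np.zeros(m)
x1, x2 = np.zeros(n1), np.zeros(n2)

for _ in range(200):
    # x1-update: minimize L_mu in x1 with x2 and lam held fixed (closed form)
    x1 = np.linalg.solve(np.eye(n1) + mu * A1.T @ A1,
                         a1 + A1.T @ lam + mu * A1.T @ (b - A2 @ x2))
    # x2-update: same, using the freshly updated x1
    x2 = np.linalg.solve(np.eye(n2) + mu * A2.T @ A2,
                         a2 + A2.T @ lam + mu * A2.T @ (b - A1 @ x1))
    # dual update along the residual that also drives the penalty term
    residual = b - (A1 @ x1 + A2 @ x2)
    lam = lam + mu * residual

print("constraint violation:", np.linalg.norm(residual))
\end{lstlisting}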