From 0b12fcb419d27eab639736061a5b5f68056a08df Mon Sep 17 00:00:00 2001 From: Andreas Tsouchlos Date: Sun, 23 Apr 2023 23:57:15 +0200 Subject: [PATCH] First round of corrections --- latex/thesis/chapters/appendix.tex | 12 +- latex/thesis/chapters/comparison.tex | 12 +- latex/thesis/chapters/introduction.tex | 14 +- latex/thesis/chapters/proximal_decoding.tex | 56 ++++---- .../chapters/theoretical_background.tex | 129 +++++++++++------- latex/thesis/thesis.tex | 1 + 6 files changed, 128 insertions(+), 96 deletions(-) diff --git a/latex/thesis/chapters/appendix.tex b/latex/thesis/chapters/appendix.tex index a80e542..9ce7939 100644 --- a/latex/thesis/chapters/appendix.tex +++ b/latex/thesis/chapters/appendix.tex @@ -508,7 +508,7 @@ $\gamma \in \left\{ 0.01, 0.05, 0.15 \right\}$. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, legend columns=1, legend pos=outer north east, @@ -549,7 +549,7 @@ $\gamma \in \left\{ 0.01, 0.05, 0.15 \right\}$. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, legend columns=1, legend pos=outer north east, @@ -593,7 +593,7 @@ $\gamma \in \left\{ 0.01, 0.05, 0.15 \right\}$. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, legend columns=1, legend pos=outer north east, @@ -647,7 +647,7 @@ $\gamma \in \left\{ 0.01, 0.05, 0.15 \right\}$. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, legend columns=1, legend pos=outer north east, @@ -692,7 +692,7 @@ $\gamma \in \left\{ 0.01, 0.05, 0.15 \right\}$. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, legend columns=1, legend pos=outer north east, @@ -735,7 +735,7 @@ $\gamma \in \left\{ 0.01, 0.05, 0.15 \right\}$. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, legend columns=1, legend pos=outer north east, diff --git a/latex/thesis/chapters/comparison.tex b/latex/thesis/chapters/comparison.tex index d70e964..8c66876 100644 --- a/latex/thesis/chapters/comparison.tex +++ b/latex/thesis/chapters/comparison.tex @@ -340,7 +340,7 @@ algorithms. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, ymax=1.5, ymin=8e-5, width=\textwidth, @@ -376,7 +376,7 @@ algorithms. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, ymax=1.5, ymin=8e-5, width=\textwidth, @@ -414,7 +414,7 @@ algorithms. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, ymax=1.5, ymin=8e-5, width=\textwidth, @@ -455,7 +455,7 @@ algorithms. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, ymax=1.5, ymin=8e-5, width=\textwidth, @@ -490,7 +490,7 @@ algorithms. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, ymax=1.5, ymin=8e-5, width=\textwidth, @@ -523,7 +523,7 @@ algorithms. 
\begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, ymax=1.5, ymin=8e-5, width=\textwidth, diff --git a/latex/thesis/chapters/introduction.tex b/latex/thesis/chapters/introduction.tex index 8d4644c..6ab5b8e 100644 --- a/latex/thesis/chapters/introduction.tex +++ b/latex/thesis/chapters/introduction.tex @@ -2,15 +2,15 @@ \label{chapter:introduction} Channel coding using binary linear codes is a way of enhancing the reliability -of data by detecting and correcting any errors that may have occurred during -transmission or storage. +of data by detecting and correcting any errors that may occur during +its transmission or storage. One class of binary linear codes, \ac{LDPC} codes, has become especially popular due to being able to reach arbitrarily small probabilities of error at code rates up to the capacity of the channel, while retaining a structure that allows for very efficient decoding. While the established decoders for \ac{LDPC} codes, such as \ac{BP} and the -\textit{min-sum algorithm}, offer reasonable performance, they are suboptimal -in most cases and exhibit a so called \textit{error floor} for high \acp{SNR}, +\textit{min-sum algorithm}, offer reasonable decoding performance, they are suboptimal +in most cases and exhibit an \textit{error floor} for high \acp{SNR}, making them unsuitable for applications with extreme reliability requiremnts. Optimization based decoding algorithms are an entirely different way of approaching the decoding problem, in some cases coming with stronger theoretical guarantees @@ -22,10 +22,10 @@ the existing literature by considering a variety of different codes. Specifically, the \textit{proximal decoding} \cite{proximal_paper} algorithm and \ac{LP} decoding using the \ac{ADMM} \cite{original_admm} are explored. The two algorithms are analyzed based on their theoretical structure -and on results of simulations conducted in the scope of this work. +and based on the results of the simulations conducted in the scope of this work. Approaches to determine the optimal value of each parameter are derived and the computational and decoding performance of the algorithms is examined. -An improvement on proximal decoding is suggested, offering up to $\SI{1}{dB}$ -of gain in decoding performance, depending on the parameters chosen and the +An improvement on proximal decoding is suggested, achieving up to $\SI{1}{dB}$ +of gain, depending on the parameters chosen and the code considered. diff --git a/latex/thesis/chapters/proximal_decoding.tex b/latex/thesis/chapters/proximal_decoding.tex index de3e26c..4b1458a 100644 --- a/latex/thesis/chapters/proximal_decoding.tex +++ b/latex/thesis/chapters/proximal_decoding.tex @@ -256,12 +256,15 @@ process and straightforward debugging ability. It was subsequently reimplemented in C++ using the Eigen% \footnote{\url{https://eigen.tuxfamily.org}} linear algebra library to achieve higher performance. -The focus has been set on a fast implementation, sometimes at the expense of +The focus has been on a fast implementation, sometimes at the expense of memory usage, somewhat limiting the size of the codes the implementation can be used with. -The evaluation of the simulation results has been wholly realized in Python. +The evaluation of decoding operations and subsequent calculation of \acp{BER}, +\acp{FER}, etc., has been wholly realized in Python. -The gradient of the code-constraint polynomial \cite[Sec. 
2.3]{proximal_paper} +Concerning the proximal decoding algorithm itself, there are certain aspects +presenting optimization opportunities during the implementation. +The gradient of the code-constraint polynomial \cite[Sec. 2.3]{proximal_paper}, for example, is given by% % \begin{align*} @@ -279,7 +282,7 @@ is given by% % Since the products $\prod_{i\in N_c\left( j \right) } \tilde{x}_i,\hspace{2mm}j\in \mathcal{J}$ -are the same for all components $\tilde{x}_k$ of $\tilde{\boldsymbol{x}}$, they can be +are identical for all components $\tilde{x}_k$ of $\tilde{\boldsymbol{x}}$, they can be precomputed. Defining% % @@ -325,7 +328,7 @@ The impact of the parameters $\gamma$, as well as $\omega$, $K$ and $\eta$ is examined. The decoding performance is assessed based on the \ac{BER} and the \ac{FER} as well as the \textit{decoding failure rate} - the rate at which -the algorithm produces invalid results. +the algorithm produces results that are not valid codewords. The convergence properties are reviewed and related to the decoding performance. Finally, the computational performance is examined on a theoretical basis @@ -335,8 +338,9 @@ thesis. All simulation results presented hereafter are based on Monte Carlo simulations. The \ac{BER} and \ac{FER} curves in particular have been generated by -producing at least 100 frame-errors for each data point, unless otherwise +producing at least 100 frame errors for each data point, unless otherwise stated. +\todo{Same text about monte carlo simulations and frame errors for admm} \subsection{Choice of Parameters} @@ -478,10 +482,10 @@ significantly affects the decoding performance, there is not much benefit attainable in undertaking an extensive search for an exact optimum. Rather, a preliminary examination providing a rough window for $\gamma$ may be sufficient. -When examining a number of different codes (figure -\ref{fig:prox:results_3d_multiple}), it is apparent that while the exact +When examining a number of different codes, see figure +\ref{fig:prox:results_3d_multiple}, it is apparent that while the exact landscape of the graph depends on the code, the general behavior is the same -in each case. +for all codes analyzed in this thesis. The parameter $\gamma$ describes the step-size for the optimization step dealing with the code-constraint polynomial; @@ -490,7 +494,7 @@ negative-log likelihood. The relationship between $\omega$ and $\gamma$ is portrayed in figure \ref{fig:prox:gamma_omega}. The color of each cell indicates the \ac{BER} when the corresponding values -are chosen for the parameters. +are chosen for the decoding. The \ac{SNR} is kept constant at $\SI{4}{dB}$. The \ac{BER} exhibits similar behavior in its dependency on $\omega$ and on $\gamma$: it is minimized when keeping the value within certain @@ -547,7 +551,7 @@ error is observed during each iteration of the decoding process, for several different \acp{SNR}. The plots have been generated by averaging the error over $\SI{500000}{}$ decodings. -As some decodings go one for more iterations than others, the number of values +As some decodings go on for more iterations than others, the number of values which are averaged for each datapoints vary. This explains the dip visible in all curves around the 20th iteration, since after this point more and more correct decodings are completed, @@ -558,7 +562,7 @@ timing requirements of the decoding process. 
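As a side remark on how such per-iteration error curves can be produced, a minimal sketch in Python (the language in which the evaluation is realized) is given below; the function name and the data layout of \texttt{traces} are illustrative assumptions and not part of the actual evaluation scripts. Each decoding contributes error values only up to the iteration at which it terminates, so the number of samples behind each averaged data point varies, which is what causes the dip and the decreasing smoothness discussed above.
\begin{verbatim}
import numpy as np

# Illustrative sketch: average per-iteration error traces of varying length.
# "traces" is assumed to be a list of 1-D arrays, one per decoding, holding
# the observed error at each iteration of that decoding.
def average_error_per_iteration(traces):
    max_len = max(len(t) for t in traces)
    sums = np.zeros(max_len)
    counts = np.zeros(max_len)
    for t in traces:
        sums[:len(t)] += t      # accumulate error values iteration-wise
        counts[:len(t)] += 1    # count decodings still running at each iteration
    # Later iterations are reached by fewer decodings, so their averages are
    # based on fewer samples (hence the noisier tails of the curves).
    return sums / counts, counts
\end{verbatim}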
Another aspect to consider is that the higher the \ac{SNR}, the fewer decodings are present at each iteration to average, since a solution is found earlier. -This explains the decreasing smoothness of the lines as the \ac{SNR} rises. +This explains the decreasing smoothness of the lines as the \ac{SNR} increases. Remarkably, the \ac{SNR} seems to not have any impact on the number of iterations necessary to reach the point at which the average error stabilizes. @@ -609,7 +613,7 @@ optimum values for the parameters $\gamma$ and $\omega$ appears to bring limited benefit; an initial rudimentary examination to find the general bounds in which the two values should lie is sufficient. -The parameter $K$ is independent of the $SNR$ and raising its value above a +The parameter $K$ is independent of the \ac{SNR} and raising its value above a certain threshold does not improve the decoding performance. The choice of $\eta$ is insignificant and the parameter is only relevant as a means to bring about numerical stability. @@ -699,7 +703,7 @@ means to bring about numerical stability. Until now, only the \ac{BER} has been considered to gauge the decoding performance. -The \ac{FER}, however, shows considerably worse behavior, as can be seen in +The \ac{FER}, however, shows considerably different behavior, as can be seen in figure \ref{fig:prox:ber_fer_dfr}. Besides the \ac{BER} and \ac{FER} curves, the figure also shows the \textit{decoding failure rate}. @@ -719,7 +723,8 @@ This leads to the hypothesis that, at least for higher \acp{SNR}, frame errors arise mainly due to the non-convergence of the algorithm instead of convergence to the wrong codeword. This course of thought will be picked up in section -\ref{sec:prox:Improved Implementation} to try to improve the algorithm. +\ref{sec:prox:Improved Implementation} when proposing a method to improve the +algorithm. In summary, the \ac{BER} and \ac{FER} indicate dissimilar decoding performance. @@ -730,10 +735,11 @@ the frame errors may largely be attributed to decoding failures. \subsection{Convergence Properties} \label{subsec:prox:conv_properties} -The previous observation, that the \ac{FER} may arise mainly due to the +The previous observation that the \ac{FER} may arise mainly due to the non-convergence of the algorithm instead of convergence to the wrong codeword, raises the question why the decoding process does not converge so often. -In figure \ref{fig:prox:convergence}, the iterative process is visualized. +To better understand this issue, the iterative process is visualized in +figure \ref{fig:prox:convergence}. In order to be able to simultaneously consider all components of the vectors being dealt with, a BCH code with $n=7$ and $k=4$ is chosen. Each plot shows one component of the current estimate during a given @@ -961,7 +967,7 @@ As such, the constraints are not being satisfied and the estimate is not converging towards a valid codeword. While figure \ref{fig:prox:convergence} shows only one instance of a decoding -task, with no statistical significance, it is indicative of the general +task with no statistical significance, it is indicative of the general behavior of the algorithm. This can be justified by looking at the gradients themselves. In figure \ref{fig:prox:gradients} the gradients of the negative @@ -1087,7 +1093,7 @@ value of the parameter $\gamma$ has to be kept small, as mentioned in section \ref{sec:prox:Decoding Algorithm}.
Local minima are introduced between the codewords, in the areas in which it is not immediately clear which codeword is the most likely one. -Raising the value of $\gamma$ results in +Increasing the value of $\gamma$ results in $h \left( \tilde{\boldsymbol{x}} \right)$ dominating the landscape of the objective function, thereby introducing these local minima into the objective function. @@ -1099,7 +1105,7 @@ visualized for one component of a code with $n=204$, for a single decoding. The two gradients still eventually oppose each other and the estimate still starts to oscillate, the same as illustrated in figure \ref{fig:prox:convergence} based on a code with $n=7$. -However, in this case, the gradient of the code-constraint polynomial iself +However, in this case, the gradient of the code-constraint polynomial itself starts to oscillate, its average value being such that the effect of the gradient of the negative log-likelihood is counteracted. @@ -1171,7 +1177,7 @@ The codes considered are the BCH(31, 11) and BCH(31, 26) codes, a number of (3, regular \ac{LDPC} codes (\cite[\text{96.3.965, 204.33.484, 408.33.844}]{mackay_enc}), a (5,10) regular \ac{LDPC} code (\cite[\text{204.55.187}]{mackay_enc}) and a progressive edge growth construction code (\cite[\text{PEGReg252x504}]{mackay_enc}). -Some deviations from linear behavior are unavoidable because not all codes +Some deviations from linear behavior are unavoidable, since not all codes considered are actually \ac{LDPC} codes, or \ac{LDPC} codes constructed according to the same scheme. Nonetheless, a generally linear relationship between the average time needed to @@ -1228,7 +1234,7 @@ And indeed, the magnitude of the oscillation of $\nabla h\left( \tilde{\boldsymbol{x}} \right)$ (introduced previously in section \ref{subsec:prox:conv_properties} and shown in figure \ref{fig:prox:convergence_large_n}) and the probability of having a bit -error are strongly correlated, a relationship depicted in figure +error are strongly correlated, a relationship being depicted in figure \ref{fig:prox:correlation}. % \begin{figure}[h] @@ -1329,7 +1335,7 @@ In some cases, a gain of up to $\SI{1}{dB}$ or higher can be achieved. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={BER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={BER}, ymode=log, width=0.48\textwidth, height=0.36\textwidth, @@ -1360,7 +1366,7 @@ In some cases, a gain of up to $\SI{1}{dB}$ or higher can be achieved. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={FER}, + xlabel={$E_b / N_0$ (dB)}, ylabel={FER}, ymode=log, width=0.48\textwidth, height=0.36\textwidth, @@ -1392,7 +1398,7 @@ In some cases, a gain of up to $\SI{1}{dB}$ or higher can be achieved. \begin{tikzpicture} \begin{axis}[ grid=both, - xlabel={$E_b / N_0$}, ylabel={Decoding Failure Rate}, + xlabel={$E_b / N_0$ (dB)}, ylabel={Decoding Failure Rate}, ymode=log, width=0.48\textwidth, height=0.36\textwidth, diff --git a/latex/thesis/chapters/theoretical_background.tex b/latex/thesis/chapters/theoretical_background.tex index 9ef72c0..4cb0f79 100644 --- a/latex/thesis/chapters/theoretical_background.tex +++ b/latex/thesis/chapters/theoretical_background.tex @@ -4,7 +4,7 @@ In this chapter, the theoretical background necessary to understand this work is given. First, the notation used is clarified. -The physical aspects are detailed - the used modulation scheme and channel model. +The physical layer is detailed - the used modulation scheme and channel model. 
A short introduction to channel coding with binary linear codes and especially \ac{LDPC} codes is given. The established methods of decoding LPDC codes are briefly explained. @@ -204,7 +204,7 @@ Each row of $\boldsymbol{H}$, which represents one parity-check, is viewed as a Each component of the codeword $\boldsymbol{c}$ is interpreted as a \ac{VN}. The relationship between \acp{CN} and \acp{VN} can then be plotted by noting which components of $\boldsymbol{c}$ are considered for which parity-check. -Figure \ref{fig:theo:tanner_graph} shows the tanner graph for the +Figure \ref{fig:theo:tanner_graph} shows the Tanner graph for the (7,4) Hamming code, which has the following parity-check matrix \cite[Example 5.7.]{ryan_lin_2009}:% % @@ -285,15 +285,16 @@ Message passing algorithms are based on the notion of passing messages between \acp{CN} and \acp{VN}. \Ac{BP} is one such algorithm that is commonly used to decode \ac{LDPC} codes. It aims to compute the posterior probabilities -$p_{C_i \mid \boldsymbol{Y}}\left(c_i = 1 | \boldsymbol{y} \right),\hspace{2mm} i\in\mathcal{I}$ -\cite[Sec. III.]{mackay_rediscovery} and use them to calculate the estimate $\hat{\boldsymbol{c}}$. +$p_{C_i \mid \boldsymbol{Y}}\left(c_i = 1 | \boldsymbol{y} \right),\hspace{2mm} i\in\mathcal{I}$, +see \cite[Sec. III.]{mackay_rediscovery}, and use them to calculate the estimate +$\hat{\boldsymbol{c}}$. For cycle-free graphs this goal is reached after a finite number of steps and \ac{BP} is equivalent to \ac{MAP} decoding. -When the graph contains cycles, however, \ac{BP} only approximates the probabilities +When the graph contains cycles, however, \ac{BP} only approximates the \ac{MAP} probabilities and is sub-optimal. This leads to generally worse performance than \ac{MAP} decoding for practical codes. Additionally, an \textit{error floor} appears for very high \acp{SNR}, making -the use of \ac{BP} impractical for applications where a very low \ac{BER} is +the use of \ac{BP} impractical for applications where a very low error rate is desired \cite[Sec. 15.3]{ryan_lin_2009}. Another popular decoding method for \ac{LDPC} codes is the \textit{min-sum algorithm}. @@ -457,29 +458,38 @@ which minimizes the objective function $g$. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{An introduction to the proximal gradient method and ADMM} +\section{A Short Introduction to the Proximal Gradient Method and ADMM} \label{sec:theo:Optimization Methods} +In this section, the general ideas behind the optimization methods used in +this work are outlined. +The application of these optimization methods to channel decoding +will be discussed in later chapters. +Two methods are introduced: the \textit{proximal gradient method} and +\ac{ADMM}. + \textit{Proximal algorithms} are algorithms for solving convex optimization -problems, that rely on the use of \textit{proximal operators}. +problems that rely on the use of \textit{proximal operators}. The proximal operator $\textbf{prox}_{\lambda f} : \mathbb{R}^n \rightarrow \mathbb{R}^n$ of a function $f:\mathbb{R}^n \rightarrow \mathbb{R}$ is defined by \cite[Sec.
1.1]{proximal_algorithms}% % \begin{align*} - \textbf{prox}_{\lambda f}\left( \boldsymbol{v} \right) = \argmin_{\boldsymbol{x}} \left( - f\left( \boldsymbol{x} \right) + \frac{1}{2\lambda}\lVert \boldsymbol{x} - - \boldsymbol{v} \rVert_2^2 \right) + \textbf{prox}_{\lambda f}\left( \boldsymbol{v} \right) + = \argmin_{\boldsymbol{x} \in \mathbb{R}^n} \left( + f\left( \boldsymbol{x} \right) + \frac{1}{2\lambda}\lVert \boldsymbol{x} + - \boldsymbol{v} \rVert_2^2 \right) .\end{align*} % This operator computes a point that is a compromise between minimizing $f$ and staying in the proximity of $\boldsymbol{v}$. -The parameter $\lambda$ determines how heavily each term is weighed. -The \textit{proximal gradient method} is an iterative optimization method +The parameter $\lambda$ determines how each term is weighed. +The proximal gradient method is an iterative optimization method utilizing proximal operators, used to solve problems of the form% % \begin{align*} - \text{minimize}\hspace{5mm}f\left( \boldsymbol{x} \right) + g\left( \boldsymbol{x} \right) + \underset{\boldsymbol{x} \in \mathbb{R}^n}{\text{minimize}}\hspace{5mm} + f\left( \boldsymbol{x} \right) + g\left( \boldsymbol{x} \right) \end{align*} % that consists of two steps: minimizing $f$ with gradient descent @@ -492,14 +502,14 @@ and minimizing $g$ using the proximal operator ,\end{align*} % Since $g$ is minimized with the proximal operator and is thus not required -to be differentiable, it can be used to encode the constraints of the problem +to be differentiable, it can be used to encode the constraints of the optimization problem (e.g., in the form of an \textit{indicator function}, as mentioned in \cite[Sec. 1.2]{proximal_algorithms}). -The \ac{ADMM} is another optimization method. +\ac{ADMM} is another optimization method. In this thesis it will be used to solve a \textit{linear program}, which -is a special type of convex optimization problem, where the objective function -is linear, and the constraints consist of linear equalities and inequalities. +is a special type of convex optimization problem in which the objective function +is linear and the constraints consist of linear equalities and inequalities. Generally, any linear program can be expressed in \textit{standard form}% \footnote{The inequality $\boldsymbol{x} \ge \boldsymbol{0}$ is to be interpreted componentwise.} @@ -507,38 +517,53 @@ interpreted componentwise.} % \begin{alignat}{3} \begin{alignedat}{3} - \text{minimize }\hspace{2mm} && \boldsymbol{\gamma}^\text{T} \boldsymbol{x} \\ + \underset{\boldsymbol{x}\in\mathbb{R}^n}{\text{minimize }}\hspace{2mm} + && \boldsymbol{\gamma}^\text{T} \boldsymbol{x} \\ \text{subject to }\hspace{2mm} && \boldsymbol{A}\boldsymbol{x} & = \boldsymbol{b} \\ - && \boldsymbol{x} & \ge \boldsymbol{0}. + && \boldsymbol{x} & \ge \boldsymbol{0}, \end{alignedat} \label{eq:theo:admm_standard} \end{alignat}% % +where $\boldsymbol{x}, \boldsymbol{\gamma} \in \mathbb{R}^n$, $\boldsymbol{b} \in \mathbb{R}^m$ +and $\boldsymbol{A}\in\mathbb{R}^{m \times n}$. A technique called \textit{Lagrangian relaxation} \cite[Sec. 11.4]{intro_to_lin_opt_book} can then be applied. First, some of the constraints are moved into the objective function itself and weights $\boldsymbol{\lambda}$ are introduced. 
A new, relaxed problem -is then formulated as +is formulated as % \begin{align} \begin{aligned} - \text{minimize }\hspace{2mm} & \boldsymbol{\gamma}^\text{T}\boldsymbol{x} - + \boldsymbol{\lambda}^\text{T}\left(\boldsymbol{b} - - \boldsymbol{A}\boldsymbol{x} \right) \\ + \underset{\boldsymbol{x}\in\mathbb{R}^n}{\text{minimize }}\hspace{2mm} + & \boldsymbol{\gamma}^\text{T}\boldsymbol{x} + + \boldsymbol{\lambda}^\text{T}\left( + \boldsymbol{A}\boldsymbol{x} - \boldsymbol{b}\right) \\ \text{subject to }\hspace{2mm} & \boldsymbol{x} \ge \boldsymbol{0}, \end{aligned} \label{eq:theo:admm_relaxed} \end{align}% % the new objective function being the \textit{Lagrangian}% +\footnote{ + Depending on what literature is consulted, the definition of the Lagrangian differs + in the order of $\boldsymbol{A}\boldsymbol{x}$ and $\boldsymbol{b}$. + As will subsequently be seen, however, the only property of the Lagrangian having + any bearing on the optimization process is that minimizing it gives a lower bound + on the optimal objective of the original problem. + This property is satisfied no matter the order of the terms and the order + chosen here is the one used in the \ac{LP} decoding literature making use of + \ac{ADMM}. +}% % \begin{align*} \mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda} \right) = \boldsymbol{\gamma}^\text{T}\boldsymbol{x} - + \boldsymbol{\lambda}^\text{T}\left(\boldsymbol{b} - - \boldsymbol{A}\boldsymbol{x} \right) + + \boldsymbol{\lambda}^\text{T}\left( + \boldsymbol{A}\boldsymbol{x} - \boldsymbol{b}\right) .\end{align*}% % + This problem is not directly equivalent to the original one, as the solution now depends on the choice of the \textit{Lagrange multipliers} $\boldsymbol{\lambda}$. @@ -562,12 +587,12 @@ Furthermore, for uniquely solvable linear programs \textit{strong duality} always holds \cite[Theorem 4.4]{intro_to_lin_opt_book}. This means that not only is it a lower bound, the tightest lower bound actually reaches the value itself: -In other words, with the optimal choice of $\boldsymbol{\lambda}$, +in other words, with the optimal choice of $\boldsymbol{\lambda}$, the optimal objectives of the problems (\ref{eq:theo:admm_relaxed}) -and (\ref{eq:theo:admm_standard}) have the same value. +and (\ref{eq:theo:admm_standard}) have the same value, i.e., % \begin{align*} - \max_{\boldsymbol{\lambda}} \, \min_{\boldsymbol{x} \ge \boldsymbol{0}} + \max_{\boldsymbol{\lambda}\in\mathbb{R}^m} \, \min_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda} \right) = \min_{\substack{\boldsymbol{x} \ge \boldsymbol{0} \\ \boldsymbol{A}\boldsymbol{x} = \boldsymbol{b}}} @@ -577,7 +602,7 @@ and (\ref{eq:theo:admm_standard}) have the same value. Thus, we can define the \textit{dual problem} as the search for the tightest lower bound:% % \begin{align} - \underset{\boldsymbol{\lambda}}{\text{maximize }}\hspace{2mm} + \underset{\boldsymbol{\lambda}\in\mathbb{R}^m}{\text{maximize }}\hspace{2mm} & \min_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L} \left( \boldsymbol{x}, \boldsymbol{\lambda} \right) \label{eq:theo:dual} @@ -600,7 +625,7 @@ using equation (\ref{eq:theo:admm_obtain_primal}); then, update $\boldsymbol{\la using gradient descent \cite[Sec. 
2.1]{distr_opt_book}:% % \begin{align*} - \boldsymbol{x} &\leftarrow \argmin_{\boldsymbol{x}} \mathcal{L}\left( + \boldsymbol{x} &\leftarrow \argmin_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda} \right) \\ \boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda} + \alpha\left( \boldsymbol{A}\boldsymbol{x} - \boldsymbol{b} \right), @@ -608,12 +633,12 @@ using gradient descent \cite[Sec. 2.1]{distr_opt_book}:% .\end{align*} % The algorithm can be improved by observing that when the objective function -$g: \mathbb{R}^n \rightarrow \mathbb{R}$ is separable into a number -$N \in \mathbb{N}$ of sub-functions +$g: \mathbb{R}^n \rightarrow \mathbb{R}$ is separable into a sum of +$N \in \mathbb{N}$ sub-functions $g_i: \mathbb{R}^{n_i} \rightarrow \mathbb{R}$, i.e., $g\left( \boldsymbol{x} \right) = \sum_{i=1}^{N} g_i \left( \boldsymbol{x}_i \right)$, -where $\boldsymbol{x}_i,\hspace{1mm} i\in [1:N]$ are subvectors of +where $\boldsymbol{x}_i\in\mathbb{R}^{n_i},\hspace{1mm} i\in [1:N]$ are subvectors of $\boldsymbol{x}$, the Lagrangian is as well: % \begin{align*} @@ -624,12 +649,12 @@ $\boldsymbol{x}$, the Lagrangian is as well: \begin{align*} \mathcal{L}\left( \left( \boldsymbol{x}_i \right)_{i=1}^N, \boldsymbol{\lambda} \right) = \sum_{i=1}^{N} g_i\left( \boldsymbol{x}_i \right) - + \boldsymbol{\lambda}^\text{T} \left( \boldsymbol{b} - - \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x_i} \right) + + \boldsymbol{\lambda}^\text{T} \left( + \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x_i} - \boldsymbol{b}\right) .\end{align*}% % -The matrices $\boldsymbol{A}_i, \hspace{1mm} i \in [1:N]$ are partitions of -the matrix $\boldsymbol{A}$, corresponding to +The matrices $\boldsymbol{A}_i \in \mathbb{R}^{m \times n_i}, \hspace{1mm} i \in [1:N]$ +form a partition of $\boldsymbol{A}$, corresponding to $\boldsymbol{A} = \begin{bmatrix} \boldsymbol{A}_1 & \ldots & @@ -643,7 +668,7 @@ constant. This modified version of dual ascent is called \textit{dual decomposition}: % \begin{align*} - \boldsymbol{x}_i &\leftarrow \argmin_{\boldsymbol{x}_i}\mathcal{L}\left( + \boldsymbol{x}_i &\leftarrow \argmin_{\boldsymbol{x}_i \ge \boldsymbol{0}}\mathcal{L}\left( \left( \boldsymbol{x}_i \right)_{i=1}^N, \boldsymbol{\lambda}\right) \hspace{5mm} \forall i \in [1:N]\\ \boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda} @@ -657,14 +682,15 @@ This modified version of dual ascent is called \textit{dual decomposition}: It only differs in the use of an \textit{augmented Lagrangian} $\mathcal{L}_\mu\left( \left( \boldsymbol{x} \right)_{i=1}^N, \boldsymbol{\lambda} \right)$ in order to strengthen the convergence properties. 
-The augmented Lagrangian extends the ordinary one with an additional penalty term -with the penaly parameter $\mu$: +The augmented Lagrangian extends the classical one with an additional penalty term +with the penalty parameter $\mu$: % \begin{align*} \mathcal{L}_\mu \left( \left( \boldsymbol{x} \right)_{i=1}^N, \boldsymbol{\lambda} \right) = \underbrace{\sum_{i=1}^{N} g_i\left( \boldsymbol{x_i} \right) - + \boldsymbol{\lambda}^\text{T}\left( \boldsymbol{b} - - \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right)}_{\text{Ordinary Lagrangian}} + + \boldsymbol{\lambda}^\text{T}\left(\sum_{i=1}^{N} + \boldsymbol{A}_i\boldsymbol{x}_i - \boldsymbol{b}\right)} + _{\text{Classical Lagrangian}} + \underbrace{\frac{\mu}{2}\left\Vert \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i - \boldsymbol{b} \right\Vert_2^2}_{\text{Penalty term}}, \hspace{5mm} \mu > 0 @@ -674,21 +700,20 @@ The steps to solve the problem are the same as with dual decomposition, with the condition that the step size be $\mu$:% % \begin{align*} - \boldsymbol{x}_i &\leftarrow \argmin_{\boldsymbol{x}_i}\mathcal{L}_\mu\left( + \boldsymbol{x}_i &\leftarrow \argmin_{\boldsymbol{x}_i \ge \boldsymbol{0}}\mathcal{L}_\mu\left( \left( \boldsymbol{x} \right)_{i=1}^N, \boldsymbol{\lambda}\right) \hspace{5mm} \forall i \in [1:N]\\ \boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda} + \mu\left( \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i - \boldsymbol{b} \right), \hspace{5mm} \mu > 0 -% \boldsymbol{x}_1 &\leftarrow \argmin_{\boldsymbol{x}_1}\mathcal{L}_\mu\left( -% \boldsymbol{x}_1, \boldsymbol{x_2}, \boldsymbol{\lambda}\right) \\ -% \boldsymbol{x}_2 &\leftarrow \argmin_{\boldsymbol{x}_2}\mathcal{L}_\mu\left( -% \boldsymbol{x}_1, \boldsymbol{x_2}, \boldsymbol{\lambda}\right) \\ -% \boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda} -% + \mu\left( \boldsymbol{A}_1\boldsymbol{x}_1 + \boldsymbol{A}_2\boldsymbol{x}_2 -% - \boldsymbol{b} \right), -% \hspace{5mm} \mu > 0 .\end{align*} % +In subsequent chapters, the decoding problem will be reformulated as an +optimization problem using two different methodologies. +In chapter \ref{chapter:proximal_decoding}, a non-convex optimization approach +is chosen and addressed using the proximal gradient method. +In chapter \ref{chapter:lp_dec_using_admm}, an \ac{LP} based optimization problem is +formulated and solved using \ac{ADMM}. + diff --git a/latex/thesis/thesis.tex b/latex/thesis/thesis.tex index 20e7b82..791ed2a 100644 --- a/latex/thesis/thesis.tex +++ b/latex/thesis/thesis.tex @@ -35,6 +35,7 @@ \usetikzlibrary{spy} \usetikzlibrary{shapes.geometric} \usetikzlibrary{arrows.meta,arrows} +\tikzset{>=latex} \pgfplotsset{compat=newest} \usepgfplotslibrary{colorbrewer}
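To make the augmented-Lagrangian update rules of the theoretical background chapter more tangible, a small numerical sketch is given below. It applies the updates described above to a generic linear program in standard form, treating $\boldsymbol{x}$ as a single block ($N=1$); the function name, the parameter values and the use of projected gradient steps for the inner minimization over $\boldsymbol{x} \ge \boldsymbol{0}$ are illustrative assumptions made for this sketch, not the decoders discussed in this thesis.
\begin{verbatim}
import numpy as np

# Illustrative sketch of the augmented-Lagrangian updates for
#   minimize gamma^T x  subject to  A x = b,  x >= 0,
# with x treated as a single block (N = 1).  The inner minimization over
# x >= 0 is approximated by projected gradient steps (an assumption made
# purely for illustration).
def augmented_lagrangian_lp(gamma, A, b, mu=1.0, outer_iters=200,
                            inner_iters=50, step=1e-2):
    m, n = A.shape
    x = np.zeros(n)
    lam = np.zeros(m)
    for _ in range(outer_iters):
        # x-update: approximately minimize L_mu(x, lambda) over x >= 0
        for _ in range(inner_iters):
            residual = A @ x - b
            grad = gamma + A.T @ lam + mu * (A.T @ residual)
            x = np.maximum(x - step * grad, 0.0)   # projection onto x >= 0
        # lambda-update: ascent step on the dual variable with step size mu
        lam = lam + mu * (A @ x - b)
    return x

# Example with assumed data: minimize x1 + 2*x2 subject to x1 + x2 = 1, x >= 0.
x_opt = augmented_lagrangian_lp(np.array([1.0, 2.0]),
                                np.array([[1.0, 1.0]]),
                                np.array([1.0]))
\end{verbatim}
In the decomposed form described above, each subvector $\boldsymbol{x}_i$ would be updated in the same manner while the remaining blocks are held constant.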