Reworked entire admm section

Andreas Tsouchlos 2023-03-22 22:12:35 +01:00
parent d411c1945f
commit c1097a59b7


@ -101,12 +101,15 @@ Lastly, the optimization methods utilized are described.
\section{Optimization Methods}
\label{sec:theo:Optimization Methods}
TODO:
\begin{itemize}
\item \ac{ADMM}
\item Intro
\item Proximal Decoding
\end{itemize}
\vspace{5mm}
Generally, any linear program can be expressed in \textit{standard form}%
\footnote{The inequality $\boldsymbol{x} \ge \boldsymbol{0}$ is to be
interpreted componentwise.}
\cite[Sec. 1.1]{intro_to_lin_opt_book}:%
@ -120,11 +123,11 @@ interpreted componentwise.}
\label{eq:theo:admm_standard}
\end{alignat}%
%
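For illustration (this small example is unrelated to the decoding problem),
an inequality constraint such as $x_1 + 2x_2 \le 3$ with $x_1, x_2 \ge 0$ can always be
brought into this form by introducing a nonnegative slack variable $s$:
%
\begin{align*}
x_1 + 2x_2 + s = 3,
\hspace{5mm} s \ge 0
,\end{align*}
%
so that only equality constraints and the componentwise nonnegativity remain.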
A technique called \textit{Lagrangian relaxation} \cite[Sec. 11.4]{intro_to_lin_opt_book}
can then be applied.
First, some of the constraints are moved into the objective function itself
and the weights $\boldsymbol{\lambda}$ are introduced. A new, relaxed problem
is formulated:
%
\begin{align}
\begin{aligned}
@ -139,23 +142,24 @@ $\boldsymbol{\lambda}$ are introduced. A new, relaxed problem is formulated:
the new objective function being the \textit{Lagrangian}%
%
\begin{align*}
\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda} \right)
= \boldsymbol{\gamma}^\text{T}\boldsymbol{x}
+ \boldsymbol{\lambda}^\text{T}\left(\boldsymbol{b}
- \boldsymbol{A}\boldsymbol{x} \right)
.\end{align*}%
%
This problem is not directly equivalent to the original one, as the
solution now depends on the choice of the \textit{Lagrange multipliers}
$\boldsymbol{\lambda}$.
Interestingly, however, for this particular class of problems,
the minimum of the objective function (hereafter called the \textit{optimal objective})
of the relaxed problem (\ref{eq:theo:admm_relaxed}) is a lower bound for
the optimal objective of the original problem (\ref{eq:theo:admm_standard})
\cite[Sec. 4.1]{intro_to_lin_opt_book}:%
%
\begin{align*}
\min_{\substack{\boldsymbol{x} \ge \boldsymbol{0} \\ \phantom{a}}}
\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda}
\right)
\le
\min_{\substack{\boldsymbol{x} \ge \boldsymbol{0} \\ \boldsymbol{A}\boldsymbol{x}
@ -163,55 +167,118 @@ the optimal objective of the original problem (\ref{eq:theo:admm_standard})
\boldsymbol{\gamma}^\text{T}\boldsymbol{x}
.\end{align*}
%
Furthermore, for uniquely solvable linear programs \textit{strong duality}
always holds \cite[Theorem 4.4]{intro_to_lin_opt_book}.
In other words, with the optimal choice of $\boldsymbol{\lambda}$,
the optimal objectives of the problems (\ref{eq:theo:admm_relaxed})
and (\ref{eq:theo:admm_standard}) have the same value:
%
\begin{align*}
\max_{\boldsymbol{\lambda}} \, \min_{\boldsymbol{x} \ge \boldsymbol{0}}
\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda} \right)
= \min_{\substack{\boldsymbol{x} \ge \boldsymbol{0} \\ \boldsymbol{A}\boldsymbol{x}
= \boldsymbol{b}}}
\boldsymbol{\gamma}^\text{T}\boldsymbol{x}
.\end{align*}
%
Thus, we can define the \textit{dual problem} as the search for the tightest lower bound:%
%
\begin{align}
\text{maximize }\hspace{2mm} & \min_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L}
\left( \boldsymbol{x}, \boldsymbol{\lambda} \right)
\label{eq:theo:dual}
,\end{align}
%
and recover the solution $\boldsymbol{x}_{\text{opt}}$ to problem (\ref{eq:theo:admm_standard})
from the solution $\boldsymbol{\lambda}_\text{opt}$ to problem (\ref{eq:theo:dual})
by computing \cite[Sec. 2.1]{admm_distr_stats}%
%
\begin{align}
\boldsymbol{x}_{\text{opt}} = \argmin_{\boldsymbol{x}}
\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda}_{\text{opt}} \right)
\label{eq:theo:admm_obtain_primal}
.\end{align}
%
The dual problem can then be solved iteratively using \textit{dual ascent}: starting with an
initial estimate of $\boldsymbol{\lambda}$, calculate an estimate for $\boldsymbol{x}$
using equation (\ref{eq:theo:admm_obtain_primal}); then, update $\boldsymbol{\lambda}$
using gradient ascent \cite[Sec. 2.1]{admm_distr_stats}:%
%
\begin{align*}
\boldsymbol{x} &\leftarrow \argmin_{\boldsymbol{x}} \mathcal{L}\left(
\boldsymbol{x}, \boldsymbol{\lambda} \right) \\
\boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda}
+ \alpha\left( \boldsymbol{b} - \boldsymbol{A}\boldsymbol{x} \right),
\hspace{5mm} \alpha > 0
.\end{align*}
%
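As a purely illustrative numerical sketch (not part of any decoder), the following
Python snippet applies these two steps to a small made-up problem; a quadratic term
$\frac{1}{2}\lVert \boldsymbol{x} \rVert_2^2$ is assumed in the objective so that the
$\boldsymbol{x}$-update has the closed form
$\boldsymbol{x} = \max\left(\boldsymbol{0}, \boldsymbol{A}^\text{T}\boldsymbol{\lambda}
- \boldsymbol{\gamma}\right)$:
%
\begin{verbatim}
import numpy as np

# Dual ascent for: minimize 0.5*||x||^2 + gamma^T x  s.t.  A x = b, x >= 0
# (illustrative only; the quadratic term makes the x-update explicit).
rng = np.random.default_rng(0)
A = rng.standard_normal((3, 6))
b = A @ rng.random(6)            # right-hand side chosen so that the
                                 # constraint is satisfiable with x >= 0
gamma = rng.standard_normal(6)

lam = np.zeros(3)                # Lagrange multipliers
alpha = 0.05                     # step size

for _ in range(5000):
    x = np.maximum(0.0, A.T @ lam - gamma)   # minimize the Lagrangian over x >= 0
    lam = lam + alpha * (b - A @ x)          # gradient ascent on the dual function

print("constraint violation:", np.linalg.norm(A @ x - b))
\end{verbatim}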
The algorithm can be improved by observing that when the objective function is separable in $\boldsymbol{x}$, the Lagrangian is separable as well:
%
\begin{align*}
\text{minimize }\hspace{5mm} & \sum_{i=1}^{N} g_i\left( \boldsymbol{x}_i \right) \\
\text{subject to}\hspace{5mm} & \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i
= \boldsymbol{b}
\end{align*}
\begin{align*}
\mathcal{L}\left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda} \right)
= \sum_{i=1}^{N} g_i\left( \boldsymbol{x}_i \right)
+ \boldsymbol{\lambda}^\text{T} \left( \boldsymbol{b}
- \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right)
.\end{align*}%
%
The minimization of each term can then be carried out in parallel, in a distributed fashion
\cite[Sec. 2.2]{admm_distr_stats}.
This modified version of dual ascent is called \textit{dual decomposition}:
%
\begin{align*}
\boldsymbol{x}_i &\leftarrow \argmin_{\boldsymbol{x}_i}\mathcal{L}\left(
\boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda}\right)
\hspace{5mm} \forall i \in [1:N]\\
\boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda}
+ \alpha\left( \boldsymbol{b}
- \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right),
\hspace{5mm} \alpha > 0
.\end{align*}
%
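A sketch of this scheme, again under the illustrative assumption of quadratic block
objectives $g_i\left( \boldsymbol{x}_i \right) = \frac{1}{2}\lVert \boldsymbol{x}_i
\rVert_2^2 + \boldsymbol{\gamma}_i^\text{T}\boldsymbol{x}_i$ (so that every block update
is explicit), could look as follows; in an actual distributed setting the block updates
inside the loop would run on separate processors:
%
\begin{verbatim}
import numpy as np

# Dual decomposition with N quadratic blocks (all values are made up).
rng = np.random.default_rng(1)
N, m, n = 3, 3, 2                        # blocks, constraints, block size
A = [rng.standard_normal((m, n)) for _ in range(N)]
gamma = [rng.standard_normal(n) for _ in range(N)]
b = sum(Ai @ rng.random(n) for Ai in A)  # feasible right-hand side

lam = np.zeros(m)
alpha = 0.02

for _ in range(5000):
    # Each block update only needs its own A_i, gamma_i and the shared lambda.
    x = [Ai.T @ lam - gi for Ai, gi in zip(A, gamma)]
    residual = b - sum(Ai @ xi for Ai, xi in zip(A, x))
    lam = lam + alpha * residual         # dual update on the gathered residual

print("constraint violation:", np.linalg.norm(residual))
\end{verbatim}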
The \ac{ADMM} works in much the same way as dual decomposition.
It differs in the use of an \textit{augmented Lagrangian}
$\mathcal{L}_\mu\left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda} \right)$
in order to improve the convergence properties.
The augmented Lagrangian extends the ordinary one by an additional penalty term,
weighted by the penalty parameter $\mu$:
%
\begin{align*}
\mathcal{L}_\mu \left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda} \right)
= \underbrace{\sum_{i=1}^{N} g_i\left( \boldsymbol{x}_i \right)
+ \boldsymbol{\lambda}^\text{T}\left( \boldsymbol{b}
- \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right)}_{\text{Ordinary Lagrangian}}
+ \underbrace{\frac{\mu}{2}\lVert \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i
- \boldsymbol{b} \rVert_2^2}_{\text{Penalty term}},
\hspace{5mm} \mu > 0
.\end{align*}
%
The steps to solve the problem are the same as with dual decomposition, with two
modifications: the step size is set equal to the penalty parameter $\mu$, and, since the
penalty term couples the individual $\boldsymbol{x}_i$, the minimizations are carried out
sequentially (in an alternating fashion) rather than in parallel:%
%
\begin{align*}
\boldsymbol{x}_i &\leftarrow \argmin_{\boldsymbol{x}_i}\mathcal{L}_\mu\left(
\boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda}\right)
\hspace{5mm} \forall i \in [1:N]\\
\boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda}
+ \mu\left( \boldsymbol{b}
- \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right),
\hspace{5mm} \mu > 0
% \boldsymbol{x}_1 &\leftarrow \argmin_{\boldsymbol{x}_1}\mathcal{L}_\mu\left(
% \boldsymbol{x}_1, \boldsymbol{x_2}, \boldsymbol{\lambda}\right) \\
% \boldsymbol{x}_2 &\leftarrow \argmin_{\boldsymbol{x}_2}\mathcal{L}_\mu\left(
% \boldsymbol{x}_1, \boldsymbol{x_2}, \boldsymbol{\lambda}\right) \\
% \boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda}
% + \mu\left( \boldsymbol{A}_1\boldsymbol{x}_1 + \boldsymbol{A}_2\boldsymbol{x}_2
% - \boldsymbol{b} \right),
% \hspace{5mm} \mu > 0
.\end{align*}
%
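A small two-block sketch of this iteration, under the same illustrative quadratic
assumptions as before (so that each minimization of $\mathcal{L}_\mu$ reduces to solving
a linear system), could look like this:
%
\begin{verbatim}
import numpy as np

# Two-block ADMM for: minimize g1(x1) + g2(x2)  s.t.  A1 x1 + A2 x2 = b,
# with the illustrative choice g_i(x_i) = 0.5*||x_i||^2 + gamma_i^T x_i.
rng = np.random.default_rng(2)
m, n1, n2 = 3, 4, 4
A1, A2 = rng.standard_normal((m, n1)), rng.standard_normal((m, n2))
gamma1, gamma2 = rng.standard_normal(n1), rng.standard_normal(n2)
b = A1 @ rng.random(n1) + A2 @ rng.random(n2)    # feasible right-hand side

mu = 1.0                                         # penalty parameter = step size
x1, x2, lam = np.zeros(n1), np.zeros(n2), np.zeros(m)

for _ in range(200):
    # x1-update: minimize L_mu over x1 with x2 and lambda held fixed
    x1 = np.linalg.solve(np.eye(n1) + mu * A1.T @ A1,
                         A1.T @ (lam + mu * (b - A2 @ x2)) - gamma1)
    # x2-update: minimize L_mu over x2, already using the new x1
    x2 = np.linalg.solve(np.eye(n2) + mu * A2.T @ A2,
                         A2.T @ (lam + mu * (b - A1 @ x1)) - gamma2)
    # dual update with step size mu
    lam = lam + mu * (b - A1 @ x1 - A2 @ x2)

print("constraint violation:", np.linalg.norm(A1 @ x1 + A2 @ x2 - b))
\end{verbatim}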