diff --git a/latex/thesis/chapters/theoretical_background.tex b/latex/thesis/chapters/theoretical_background.tex
index e1f0c3f..aa6497d 100644
--- a/latex/thesis/chapters/theoretical_background.tex
+++ b/latex/thesis/chapters/theoretical_background.tex
@@ -101,12 +101,15 @@ Lastly, the optimization methods utilized are described.
 \section{Optimization Methods}
 \label{sec:theo:Optimization Methods}
 
+TODO:
 \begin{itemize}
-	\item \ac{ADMM}
-	\item proximal decoding
+	\item Intro
+	\item Proximal Decoding
 \end{itemize}
 
-Generally, any linear program \todo{Acronym} can be expressed in \textit{standard form}%
+\vspace{5mm}
+
+Generally, any linear program can be expressed in \textit{standard form}%
 \footnote{The inequality $\boldsymbol{x} \ge \boldsymbol{0}$ is to be
 interpreted componentwise.}
 \cite[Sec. 1.1]{intro_to_lin_opt_book}:%
@@ -120,11 +123,11 @@ interpreted componentwise.}
 	\label{eq:theo:admm_standard}
 \end{alignat}%
 %
-A technique called \textit{lagrangian relaxation}%
-\todo{Citation needed}%
-can then be applied - some of the
-constraints are moved into the objective function itself and the weights
-$\boldsymbol{\lambda}$ are introduced. A new, relaxed problem is formulated:
+A technique called \textit{lagrangian relaxation} \cite[Sec. 11.4]{intro_to_lin_opt_book}
+can then be applied.
+First, some of the constraints are moved into the objective function itself
+and the weights $\boldsymbol{\lambda}$ are introduced. A new, relaxed problem
+is formulated:
 %
 \begin{align}
 \begin{aligned}
@@ -139,23 +142,24 @@ $\boldsymbol{\lambda}$ are introduced. A new, relaxed problem is formulated:
 the new objective function being the \textit{lagrangian}%
 %
 \begin{align*}
-\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{b}, \boldsymbol{\lambda} \right)
+\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda} \right)
 = \boldsymbol{\gamma}^\text{T}\boldsymbol{x}
 + \boldsymbol{\lambda}^\text{T}\left(\boldsymbol{b}
 - \boldsymbol{A}\boldsymbol{x} \right)
 .\end{align*}%
-
+%
 This problem is not directly equivalent to the original one, as the solution
 now depends on the choice of the \textit{lagrange multipliers}
 $\boldsymbol{\lambda}$.
-Interestingly, for our particular class of problems,
-the optimal objective of the relaxed problem (\ref{eq:theo:admm_relaxed}) is a lower bound for
+Interestingly, however, for this particular class of problems,
+the minimum of the objective function (hereafter called \textit{optimal objective})
+of the relaxed problem (\ref{eq:theo:admm_relaxed}) is a lower bound for
 the optimal objective of the original problem (\ref{eq:theo:admm_standard})
 \cite[Sec. 4.1]{intro_to_lin_opt_book}:%
 %
 \begin{align*}
 	\min_{\substack{\boldsymbol{x} \ge \boldsymbol{0} \\ \phantom{a}}}
-	\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{b}, \boldsymbol{\lambda}
+	\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda}
 	\right)
 	\le
 	\min_{\substack{\boldsymbol{x} \ge \boldsymbol{0} \\ \boldsymbol{A}\boldsymbol{x}
@@ -163,55 +167,118 @@ the optimal objective of the original problem (\ref{eq:theo:admm_standard})
 	\boldsymbol{\gamma}^\text{T}\boldsymbol{x}
 .\end{align*}
 %
-Furthermore, for linear programs \textit{strong duality}
-always holds.
-\todo{Citation needed}
+Furthermore, for uniquely solvable linear programs \textit{strong duality}
+always holds \cite[Theorem 4.4]{intro_to_lin_opt_book}.
-This means that not only is it a lower bound, the tightest lower bound actually reaches the value itself:
+This means that the relaxation does not merely yield a lower bound;
+the tightest such bound attains the optimal value itself.
+In other words, with the optimal choice of $\boldsymbol{\lambda}$,
+the optimal objectives of the problems (\ref{eq:theo:admm_relaxed})
+and (\ref{eq:theo:admm_standard}) have the same value:
 %
 \begin{align*}
 	\max_{\boldsymbol{\lambda}} \, \min_{\boldsymbol{x} \ge \boldsymbol{0}}
-	\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{b}, \boldsymbol{\lambda} \right)
+	\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda} \right)
 	=
 	\min_{\substack{\boldsymbol{x} \ge \boldsymbol{0} \\ \boldsymbol{A}\boldsymbol{x}
 	= \boldsymbol{b}}}
 	\boldsymbol{\gamma}^\text{T}\boldsymbol{x}
 .\end{align*}
 %
-In other words, with the optimal choice of $\boldsymbol{\lambda}$,
-the optimal objectives of the problems (\ref{eq:theo:admm_relaxed})
-and (\ref{eq:theo:admm_standard}) have the same value.
-
 Thus, we can define the \textit{dual problem} as the search for the tightest
 lower bound:%
 %
 \begin{align}
 	\text{maximize }\hspace{2mm} & \min_{\boldsymbol{x} \ge \boldsymbol{0}} \mathcal{L}
-	\left( \boldsymbol{x}, \boldsymbol{b}, \boldsymbol{\lambda} \right)
+	\left( \boldsymbol{x}, \boldsymbol{\lambda} \right)
 	\label{eq:theo:dual}
 ,\end{align}
 %
-and recover the optimal point $\boldsymbol{x}_{\text{opt}}$
-(the solution to problem (\ref{eq:theo:admm_standard}))
-from the dual optimal point $\boldsymbol{\lambda}_\text{opt}$
-(the solution to problem (\ref{eq:theo:dual}))
+and recover the solution $\boldsymbol{x}_{\text{opt}}$ to problem (\ref{eq:theo:admm_standard})
+from the solution $\boldsymbol{\lambda}_\text{opt}$ to problem (\ref{eq:theo:dual})
 by computing \cite[Sec. 2.1]{admm_distr_stats}%
 %
 \begin{align}
 	\boldsymbol{x}_{\text{opt}} = \argmin_{\boldsymbol{x}}
-	\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{b},
-	\boldsymbol{\lambda}_{\text{opt}} \right)
+	\mathcal{L}\left( \boldsymbol{x}, \boldsymbol{\lambda}_{\text{opt}} \right)
 	\label{eq:theo:admm_obtain_primal}
 .\end{align}
 %
-The dual problem can then be solved using \textit{dual ascent}: starting with an
+
+The dual problem can then be solved iteratively using \textit{dual ascent}: starting with an
 initial estimate of $\boldsymbol{\lambda}$, calculate an estimate for
 $\boldsymbol{x}$ using equation (\ref{eq:theo:admm_obtain_primal}); then, update
-$\boldsymbol{\lambda}$ using gradient descent \cite[Sec. 2.1]{admm_distr_stats}:%
+$\boldsymbol{\lambda}$ using gradient ascent \cite[Sec. 2.1]{admm_distr_stats}:%
 %
 \begin{align*}
 	\boldsymbol{x} &\leftarrow \argmin_{\boldsymbol{x}} \mathcal{L}\left(
-	\boldsymbol{x}, \boldsymbol{b}, \boldsymbol{\lambda} \right) \\
+	\boldsymbol{x}, \boldsymbol{\lambda} \right) \\
 	\boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda}
-	+ \alpha\left( \boldsymbol{A}\boldsymbol{x} - \boldsymbol{b} \right),
+	+ \alpha\left( \boldsymbol{b} - \boldsymbol{A}\boldsymbol{x} \right),
 	\hspace{5mm} \alpha > 0
 .\end{align*}
+%
+The algorithm can be improved by observing that when the objective function
+is separable in $\boldsymbol{x}$,
+%
+\begin{align*}
+	\text{minimize }\hspace{5mm} & \sum_{i=1}^{N} g_i\left( \boldsymbol{x}_i \right) \\
+	\text{subject to}\hspace{5mm} & \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i
+	= \boldsymbol{b},
+\end{align*}
+%
+the lagrangian is separable as well:
+%
+\begin{align*}
+	\mathcal{L}\left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda} \right)
+	= \sum_{i=1}^{N} g_i\left( \boldsymbol{x}_i \right)
+	+ \boldsymbol{\lambda}^\text{T} \left( \boldsymbol{b}
+	- \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right)
+.\end{align*}%
+%
+The minimization of each term can then happen in parallel, in a distributed
+fashion \cite[Sec. 2.2]{admm_distr_stats}.
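+
+To make the basic dual ascent iteration above concrete, the following sketch
+(purely illustrative, with all problem data invented) applies it to a small
+problem with the strictly convex quadratic objective
+$g\left( \boldsymbol{x} \right) = \frac{1}{2}\lVert \boldsymbol{x}
+- \boldsymbol{c} \rVert_2^2$, for which the $\boldsymbol{x}$-update has a
+closed form; for a purely linear objective the inner minimization is unbounded
+for many choices of $\boldsymbol{\lambda}$ and requires additional care:
+%
+\begin{verbatim}
+# Illustrative dual ascent for:  minimize  1/2*||x - c||^2
+#                                subject to A x = b,  x >= 0
+# (A, b, c and the step size alpha are made-up example values)
+import numpy as np
+
+A = np.array([[1.0, 1.0, 0.0],
+              [0.0, 1.0, 1.0]])
+b = np.array([1.0, 1.0])
+c = np.array([0.2, 0.9, -0.3])
+
+lam = np.zeros(2)    # dual variable (lambda)
+alpha = 0.5          # step size
+
+for _ in range(200):
+    # x-update: argmin over x >= 0 of 1/2*||x - c||^2 + lam^T (b - A x);
+    # the unconstrained minimizer c + A^T lam, projected onto x >= 0
+    x = np.maximum(0.0, c + A.T @ lam)
+    # lambda-update: gradient ascent on the dual function, whose
+    # gradient is the constraint residual b - A x
+    lam = lam + alpha * (b - A @ x)
+
+print("x =", x, "residual =", b - A @ x)
+\end{verbatim}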
+Splitting the minimization across the terms in this manner yields a variant
+of dual ascent called \textit{dual decomposition}:
+%
+\begin{align*}
+	\boldsymbol{x}_i &\leftarrow \argmin_{\boldsymbol{x}_i}\mathcal{L}\left(
+	\boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda}\right)
+	\hspace{5mm} \forall i \in [1:N]\\
+	\boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda}
+	+ \alpha\left( \boldsymbol{b}
+	- \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right),
+	\hspace{5mm} \alpha > 0
+.\end{align*}
+%
+The \ac{ADMM} works in much the same way as dual decomposition.
+It differs only in the use of an \textit{augmented lagrangian}
+$\mathcal{L}_\mu\left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda} \right)$
+in order to make the convergence more robust.
+The augmented lagrangian extends the ordinary one with an additional penalty
+term, weighted by the penalty parameter $\mu$:
+%
+\begin{align*}
+	\mathcal{L}_\mu \left( \boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda} \right)
+	= \underbrace{\sum_{i=1}^{N} g_i\left( \boldsymbol{x}_i \right)
+	+ \boldsymbol{\lambda}^\text{T}\left( \boldsymbol{b}
+	- \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right)}_{\text{Ordinary lagrangian}}
+	+ \underbrace{\frac{\mu}{2}\lVert \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i
+	- \boldsymbol{b} \rVert_2^2}_{\text{Penalty term}},
+	\hspace{5mm} \mu > 0
+.\end{align*}
+%
+The steps to solve the problem are the same as with dual decomposition,
+with two differences: the step size is fixed to the penalty parameter $\mu$,
+and, since the penalty term couples the blocks, the $\boldsymbol{x}_i$ are
+updated sequentially (hence \textit{alternating direction}), each update using
+the most recent values of the other blocks \cite[Sec. 3]{admm_distr_stats}:%
+%
+\begin{align*}
+	\boldsymbol{x}_i &\leftarrow \argmin_{\boldsymbol{x}_i}\mathcal{L}_\mu\left(
+	\boldsymbol{x}_{[1:N]}, \boldsymbol{\lambda}\right)
+	\hspace{5mm} \forall i \in [1:N]\\
+	\boldsymbol{\lambda} &\leftarrow \boldsymbol{\lambda}
+	+ \mu\left( \boldsymbol{b}
+	- \sum_{i=1}^{N} \boldsymbol{A}_i\boldsymbol{x}_i \right),
+	\hspace{5mm} \mu > 0
+.\end{align*}
+%
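+
+As with dual ascent above, a small numerical sketch may make the iteration
+concrete. It is purely illustrative: two blocks ($N = 2$) are used, the
+$g_i\left( \boldsymbol{x}_i \right) = \frac{1}{2}\lVert \boldsymbol{x}_i
+- \boldsymbol{c}_i \rVert_2^2$ are assumed quadratic so that each block
+update reduces to solving a small linear system, and all problem data and
+the penalty parameter are invented:
+%
+\begin{verbatim}
+# Illustrative two-block ADMM for:
+#   minimize  1/2*||x1 - c1||^2 + 1/2*||x2 - c2||^2
+#   subject to A1 x1 + A2 x2 = b
+# (A1, A2, b, c1, c2 and mu are made-up example values)
+import numpy as np
+
+A1 = np.array([[1.0, 0.0], [1.0, 1.0]])
+A2 = np.array([[0.0, 1.0], [1.0, 0.0]])
+b  = np.array([1.0, 2.0])
+c1 = np.array([0.5, -0.2])
+c2 = np.array([1.5, 0.3])
+
+mu  = 1.0            # penalty parameter, also used as the step size
+lam = np.zeros(2)    # dual variable (lambda)
+x1  = np.zeros(2)
+x2  = np.zeros(2)
+
+for _ in range(100):
+    # x1-update: minimize L_mu over x1 with x2, lambda fixed, i.e. solve
+    # (I + mu*A1^T A1) x1 = c1 + A1^T lam - mu*A1^T (A2 x2 - b)
+    x1 = np.linalg.solve(np.eye(2) + mu * A1.T @ A1,
+                         c1 + A1.T @ lam - mu * A1.T @ (A2 @ x2 - b))
+    # x2-update: same, using the freshly updated x1 (alternating sweep)
+    x2 = np.linalg.solve(np.eye(2) + mu * A2.T @ A2,
+                         c2 + A2.T @ lam - mu * A2.T @ (A1 @ x1 - b))
+    # dual update with step size mu; the gradient of the dual function
+    # is the constraint residual b - A1 x1 - A2 x2
+    lam = lam + mu * (b - A1 @ x1 - A2 @ x2)
+
+print("x1 =", x1, "x2 =", x2)
+print("residual =", b - A1 @ x1 - A2 @ x2)
+\end{verbatim}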