Add expense-by-category figure

2024-01-06 22:42:46 +01:00 · 2024-01-06 22:42:46 +01:00 · 1e4efa0c5b
commit 1e4efa0c5b
parent 99a7920118
7 changed files with 257 additions and 90 deletions
--- a/banking_breakdown/main.py
+++ b/banking_breakdown/main.py
@ -1,6 +1,5 @@
-from banking_breakdown import document_builder
+from banking_breakdown import ui, regex_categorizer, statement_parser, \
-from banking_breakdown import statement_parser
+    document_builder
 from banking_breakdown import ui
 import argparse
@ -10,6 +9,9 @@ def categorize_func(args):
    df = pd.read_csv(args.i, delimiter=args.d)
    if args.f is not None:
        df = regex_categorizer.assign_categories(df, args.f)
    import signal
    signal.signal(signal.SIGINT, signal.SIG_DFL)
@ -17,7 +19,8 @@ def categorize_func(args):
 def report_func(args):
-    print("Report")
+    report_data = statement_parser.parse_statement(args.i)
    document_builder.build_document(report_data)
 #
--- a/banking_breakdown/document_builder.py
+++ b/banking_breakdown/document_builder.py
@ -14,6 +14,8 @@ def _serialize_report_data(report_data: types.ReportData):
    report_data.net_income.to_csv('build/net_income.csv', index=False)
    report_data.category_overview.to_csv('build/category_overview.csv',
                                         index=False)
    report_data.expenses_by_category.to_csv('build/expenses_by_category.csv',
                                            index=False)
    report_data.total_value.to_csv('build/total_value.csv', index=False)
    report_data.detailed_balance.to_csv('build/detailed_balance.csv',
                                        index=False)
--- a/banking_breakdown/regex_categorizer.py
+++ b/banking_breakdown/regex_categorizer.py
@ -0,0 +1,54 @@
 import pandas as pd
 import json
 def _is_str_column(s: pd.Series):
    """Tell whether a pandas Series holds string data.

    A series with an explicit pandas string dtype always qualifies. A
    series of dtype ``object`` qualifies only if every single value is
    either ``None`` or a ``str``. Every other dtype is rejected.

    Approach taken from https://stackoverflow.com/a/67001213/3433817.
    """
    dtype = s.dtype
    # Explicit string series (Pandas>=1.0.0): no need to look at the values.
    if isinstance(dtype, pd.StringDtype):
        return True
    # Neither a string nor an object series, so it cannot hold text.
    if not dtype == 'object':
        return False
    # Object series: reject as soon as one value is neither None nor str.
    for value in s:
        if value is not None and not isinstance(value, str):
            return False
    return True
 def _read_regex_dict(regex_file: str):
    """Load the category regex definitions from a JSON file.

    regex_file is the path of the JSON file to read. The parsed JSON
    content is returned unchanged; assign_categories iterates over it as a
    mapping from category name to a list of regex patterns.
    """
    # Read the file as UTF-8 explicitly instead of relying on the platform's
    # default encoding, which would garble non-ASCII patterns (e.g. umlauts).
    with open(regex_file, 'r', encoding='utf-8') as f:
        return json.load(f)
 def assign_categories(df: pd.DataFrame, regex_file: str) -> pd.DataFrame:
    """Assign categories to statement rows based on regex matches.

    df is the statement to categorize and regex_file the path of a JSON
    file mapping each category name to a list of regex patterns. Every
    string column of the statement is searched with every pattern; a row
    that matches a pattern receives that pattern's category. When several
    patterns match the same row, the one applied last wins.

    A new DataFrame is returned in which missing values are replaced by
    empty strings. A 'category' column filled with ' ' is added if the
    statement does not have one yet. The DataFrame passed in is not
    modified.
    """
    regex_dict = _read_regex_dict(regex_file)

    # fillna returns a new frame, so all changes below happen on a copy and
    # the caller's DataFrame stays untouched.
    df = df.fillna('')
    if 'category' not in df.columns:
        df['category'] = [' '] * len(df.index)

    # The 'category' column is the output of the matching. Searching it as
    # well would let a pattern match the name of a category that has already
    # been assigned and overwrite it, so only the other text columns count.
    text_columns = [column for column in df.columns
                    if column != 'category' and _is_str_column(df[column])]

    for column in text_columns:
        for category, regexes in regex_dict.items():
            for regex in regexes:
                matched = df[column].str.contains(regex, regex=True)
                df.loc[matched, 'category'] = category

    return df
 def main():
    """Categorize the sample statement and print the resulting categories."""
    statement = pd.read_csv('../res/bank_statement_2023_categorized.csv')
    categorized = assign_categories(statement,
                                    regex_file='../res/regexes.json')
    print(categorized['category'])
 # Run the sample categorization only when this module is executed directly.
 if __name__ == "__main__":
    main()
--- a/banking_breakdown/statement_parser.py
+++ b/banking_breakdown/statement_parser.py
@ -6,94 +6,104 @@ import re
 import numpy as np
-# def _read_regex_dict(regex_file: str = "res/category_regexes.json"):
+def _escape_string(to_escape: str):
-#     with open(regex_file, 'r') as f:
+    return to_escape.translate(str.maketrans({"&": r"\&"}))
 #         return json.load(f)
 #
 #
 # def _tag_with_category(df: pd.DataFrame) -> pd.DataFrame:
 #     regex_dict = _read_regex_dict()
 #
 #     return df
 #
 #
 # def _compute_total_balance(df: pd.DataFrame) -> pd.DataFrame:
 #     stripped_df = pd.DataFrame(
 #         {'t': df["Valutadatum"], 'value': df["Saldo nach Buchung"]})
 #
 #     stripped_df.index = stripped_df['t']
 #     gb = stripped_df.groupby(pd.Grouper(freq='M'))
 #
 #     result = gb.tail(1)['value'].reset_index()
 #
 #     return result
 #
 #
 # def _compute_net_income(df: pd.DataFrame) -> pd.DataFrame:
 #     stripped_df = pd.DataFrame({'t': df["Valutadatum"], 'value': df["Betrag"]})
 #
 #     stripped_df.index = stripped_df['t']
 #     gb = stripped_df.groupby(pd.Grouper(freq='M'))
 #
 #     result = gb["value"].sum().reset_index()
 #     return result
 #
 #
 # def _compute_category_overview(df: pd.DataFrame) -> pd.DataFrame:
 #     categories = ["Social life", "Other", "Food", "Hobbies",
 #                   "Rent \\& Utilities", "Education", "Transportation"]
 #     values = np.array([10, 12, 53, 12, 90, 23, 32])
 #     values = values / values.sum() * 100
 #     values = np.round(values, decimals=1)
 #     values[-1] += 100 - np.sum(values)
 #
 #     category_overview_df = pd.DataFrame(
 #         {"category": categories, "value": values})
 #
 #     return category_overview_df
 #
 #
 # def _compute_detailed_balance(df: pd.DataFrame) -> pd.DataFrame:
 #     return pd.DataFrame({'t': df["Valutadatum"],
 #                          'value': df["Saldo nach Buchung"]})
 #
 #
 # def parse_statement(filename: str) -> types.ReportData:
 #     df = pd.read_csv(filename, delimiter=';', decimal=",")
 #     df["Valutadatum"] = pd.to_datetime(df["Valutadatum"], format='%d.%m.%Y')
 #
 #     category_overview_df = _compute_category_overview(df)
 #     total_balance_df = _compute_total_balance(df)
 #     net_income_df = _compute_net_income(df)
 #     detailed_balance_df = _compute_detailed_balance(df)
 #
 #     return types.ReportData(category_overview_df,
 #                             net_income_df,
 #                             total_balance_df,
 #                             detailed_balance_df)
 #
 #
 # def main():
 #     report_data = parse_statement("../res/bank_statement_2023.csv")
 #
 #
 # if __name__ == "__main__":
 #     main()
-def get_stripped_statement(filename: str) -> pd.DataFrame:
+def _compute_total_balance(df: pd.DataFrame) -> pd.DataFrame:
-    # df = pd.read_csv(filename, delimiter=';', decimal=",")
+    stripped_df = pd.DataFrame(
-    df = pd.read_csv(filename, delimiter=';')
+        {'t': df["t"], 'value': df["balance"]})
    df["Valutadatum"] = (pd.to_datetime(df["Valutadatum"], format='%d.%m.%Y')
                         .dt.strftime('%Y-%m-%d'))
-    result = pd.DataFrame({'t': df["Valutadatum"],
+    stripped_df.index = stripped_df['t']
-                           'other party': df["Name Zahlungsbeteiligter"],
+    gb = stripped_df.groupby(pd.Grouper(freq='M'))
-                           'value': df["Betrag"],
+
-                           'balance': df["Saldo nach Buchung"],
+    result = gb.tail(1)['value'].reset_index()
                           'category': [''] * len(df["Valutadatum"]),
                           'description': df["Buchungstext"],
                           'purpose': df["Verwendungszweck"]
                           })
    return result
 def _compute_net_income(df: pd.DataFrame) -> pd.DataFrame:
    """Sum the transaction values of every calendar month.

    df must provide a datetime column 't' and a numeric column 'value'.

    The result has one row per calendar month between the first and the
    last transaction: 't' is the last day of the month and 'value' the sum
    of all transaction values in it. Months without any transaction are
    included with a sum of 0.
    """
    if df.empty:
        return pd.DataFrame({'t': df['t'], 'value': df['value']})

    # Group by monthly periods instead of pd.Grouper(freq='M'): the 'M'
    # offset alias is deprecated since pandas 2.2, whereas monthly periods
    # behave the same in every pandas version.
    months = df['t'].dt.to_period('M')
    monthly = df['value'].groupby(months).sum()

    # Grouping by period only yields months that actually occur, so fill the
    # gaps explicitly to keep one row per calendar month.
    all_months = pd.period_range(monthly.index.min(), monthly.index.max(),
                                 freq='M')
    monthly = monthly.reindex(all_months, fill_value=0)

    # to_timestamp(how='end') points at the last instant of the month;
    # normalize() moves it to midnight of the month's last day.
    month_ends = monthly.index.to_timestamp(how='end').normalize()
    return pd.DataFrame({'t': month_ends, 'value': monthly.to_numpy()})
 def _compute_category_overview(df: pd.DataFrame) -> pd.DataFrame:
    """Compute each category's share of the total expenses in percent.

    Only rows with a negative 'value' count as expenses. They are summed
    per 'category' and expressed as a percentage of the overall expenses,
    rounded to one decimal.

    The result has the columns 'category' (escaped with _escape_string)
    and 'value' (the percentage), sorted by descending percentage. It is
    empty if the statement contains no expenses.
    """
    # Only 'category' and 'value' are needed; summing the remaining (text)
    # columns as well would be pointless and can fail on mixed content.
    expenses = df.loc[df['value'] < 0, ['category', 'value']]
    totals = expenses.groupby('category')['value'].sum().reset_index()
    if totals.empty:
        return pd.DataFrame({"category": [], "value": []})

    values = (totals['value'] / totals['value'].sum() * 100).to_numpy()

    # Round first, then put the rounding remainder onto the last entry so
    # that the displayed percentages add up to exactly 100. Correcting before
    # rounding would have no effect. The second rounding removes the float
    # noise introduced by the correction itself.
    values = np.round(values, decimals=1)
    values[-1] = np.round(values[-1] + 100 - np.sum(values), decimals=1)

    categories = [_escape_string(category)
                  for category in totals['category']]
    category_overview_df = pd.DataFrame(
        {"category": categories, "value": values})
    return category_overview_df.sort_values('value', ascending=False)
 def _compute_expenses_by_category(complete_df: pd.DataFrame) -> pd.DataFrame:
    """Compute the monthly expenses of every category.

    complete_df must provide a datetime column 't', a numeric column
    'value' and a column 'category'. Only rows with a negative 'value'
    count as expenses; their sign is flipped so expenses are positive.

    The result has one row per calendar month between the first and the
    last expense and one column per category holding that month's expense
    sum (0 if there was none). Category columns are ordered by descending
    mean. The final column 't' holds the date of the month's last expense
    in statement order, or the last day of the month if the month has no
    expense at all. The DataFrame passed in is not modified.
    """
    # Work on an explicit copy so that the sign flip below neither touches
    # the caller's data nor assigns into a slice of it.
    expenses = complete_df.loc[complete_df['value'] < 0,
                               ['t', 'category', 'value']].copy()
    if expenses.empty:
        return pd.DataFrame({'t': expenses['t']})
    expenses['value'] = -expenses['value']

    # Monthly periods avoid the offset alias 'M' of pd.Grouper, which is
    # deprecated since pandas 2.2.
    month_of_row = expenses['t'].dt.to_period('M')
    monthly_groups = {month: month_df
                      for month, month_df in expenses.groupby(month_of_row)}

    categories = expenses['category'].unique()
    data_dict = {category: [] for category in categories}
    timestamps = []

    # Walk over every calendar month, including those without expenses, and
    # collect sums and timestamp together so that both always line up, no
    # matter how the statement is sorted.
    for month in pd.period_range(month_of_row.min(), month_of_row.max(),
                                 freq='M'):
        month_df = monthly_groups.get(month)
        if month_df is None:
            sums = {}
            timestamps.append(month.to_timestamp(how='end').normalize())
        else:
            sums = month_df.groupby('category')['value'].sum()
            timestamps.append(month_df['t'].iloc[-1])
        for category in categories:
            data_dict[category].append(sums.get(category, 0))

    result = pd.DataFrame(data_dict)
    result = result.reindex(result.mean().sort_values(ascending=False).index,
                            axis=1)
    result['t'] = timestamps
    return result
 def _compute_detailed_balance(df: pd.DataFrame) -> pd.DataFrame:
    """Extract the balance after every single transaction.

    Returns a frame with the statement's 't' column and its 'balance'
    column, the latter renamed to 'value'.
    """
    timestamps = df["t"]
    balances = df["balance"]
    return pd.DataFrame({'t': timestamps, 'value': balances})
 def parse_statement(filename: str) -> types.ReportData:
    """Read a categorized statement and derive all tables of the report.

    filename is the path of a CSV file whose 't' column holds dates in the
    format YYYY-MM-DD. The derived tables are returned bundled in a
    ReportData object.
    """
    statement = pd.read_csv(filename)
    statement["t"] = pd.to_datetime(statement["t"], format='%Y-%m-%d')

    overview = _compute_category_overview(statement)
    total_balance = _compute_total_balance(statement)
    net_income = _compute_net_income(statement)
    detailed_balance = _compute_detailed_balance(statement)
    expenses = _compute_expenses_by_category(statement)

    return types.ReportData(
        category_overview=overview,
        expenses_by_category=expenses,
        net_income=net_income,
        total_value=total_balance,
        detailed_balance=detailed_balance,
    )
 def main():
    """Parse the sample statement as a manual smoke test."""
    # The result is not used here, so it is not bound to a name.
    parse_statement("../res/bank_statement_2023_categorized.csv")
 # Run the sample parsing only when this module is executed directly.
 if __name__ == "__main__":
    main()
--- a/banking_breakdown/test.json
+++ b/banking_breakdown/test.json
@ -0,0 +1,5 @@
 {
  "asdf": [
    "Kinemic"
  ]
 }
--- a/banking_breakdown/types.py
+++ b/banking_breakdown/types.py
@ -5,6 +5,7 @@ import pandas as pd
@dataclass
 class ReportData:
    """Bundle of the tables a report is built from.

    Every field holds a pandas DataFrame. The document builder writes each
    of them to a CSV file of the same name in the build directory.
    """
    # Written to build/category_overview.csv.
    category_overview: pd.DataFrame
    # Written to build/expenses_by_category.csv.
    expenses_by_category: pd.DataFrame
    # Written to build/net_income.csv.
    net_income: pd.DataFrame
    # Written to build/total_value.csv.
    total_value: pd.DataFrame
    # Written to build/detailed_balance.csv.
    detailed_balance: pd.DataFrame
--- a/res/report.tex
+++ b/res/report.tex
@ -212,5 +212,97 @@
 \end{figure}
 \begin{figure}[H]
 	\centering
 	% Read table
 	\pgfplotstableread[col sep=comma]{expenses_by_category.csv}\expbycattable
 	\pgfplotstablegetcolsof{\expbycattable}
 	\pgfmathtruncatemacro\NumCols{\pgfplotsretval-1} 
 	\begin{subfigure}[c]{\textwidth}
 		\centering
 		\begin{tikzpicture}
 			\begin{axis}[
 				stack plots=y,
 				area style,
 				date coordinates in=x,
 				width=\textwidth,
 				height=0.375\textwidth,
 				xticklabel=\month.\shortyear{\year},
 				xtick=data,
 				enlargelimits=false,
 				xticklabel style={
 					rotate=60,
 					anchor=near xticklabel,
 				},
 				legend columns=5,
 				legend style={at={(0.5,-0.6)},anchor=south},
 				ylabel={Expenses in €},
 				ymin=0,
 			]
 				% For each category column in the first half of the table, add one stacked area plot
 				\pgfplotsinvokeforeach{0,...,\NumCols/2 -1}{
 					% Define color
 					\pgfmathparse{1000 / (\NumCols/2 -1) * #1}
 					\extractcolormapcolor{tempcol#1}{\pgfmathresult}
 					% Add plot
 					\addplot+[tempcol#1]
 						table[col sep=comma, x=t, y index=#1]
 							{\expbycattable} \closedcycle;
 					% Add legend entry (https://tex.stackexchange.com/a/405018)
 					\pgfplotstablegetcolumnnamebyindex{#1}\of{\expbycattable}\to\pgfplotsretval
 					\expandafter\addlegendentry\expandafter{\pgfplotsretval}
 				}
 			\end{axis}
 		\end{tikzpicture}
 	\end{subfigure}\\[1em]
 	\begin{subfigure}[c]{\textwidth}
 		\centering
 		\begin{tikzpicture}
 			\begin{axis}[
 				stack plots=y,
 				area style,
 				date coordinates in=x,
 				width=\textwidth,
 				height=0.375\textwidth,
 				xticklabel=\month.\shortyear{\year},
 				xtick=data,
 				enlargelimits=false,
 				xticklabel style={
 					rotate=60,
 					anchor=near xticklabel,
 				},
 				legend columns=5,
 				legend style={at={(0.5,-0.6)},anchor=south},
 				ylabel={Expenses in €},
 				ymin=0,
 			]
 				% For each category column in the second half of the table, add one stacked area plot
 				\pgfplotsinvokeforeach{\NumCols/2,...,\NumCols-1}{
 					% Define color
 					\pgfmathparse{1000 * (#1 - \NumCols/2) / (\NumCols-1 - \NumCols/2)}
 					\extractcolormapcolor{tempcol#1}{\pgfmathresult}
 					% Add plot
 					\addplot+[tempcol#1]
 						table[col sep=comma, x=t, y index=#1]
 							{\expbycattable} \closedcycle;
 					% Add legend entry (https://tex.stackexchange.com/a/405018)
 					\pgfplotstablegetcolumnnamebyindex{#1}\of{\expbycattable}\to\pgfplotsretval
 					\expandafter\addlegendentry\expandafter{\pgfplotsretval}
 				}
 			\end{axis}
 		\end{tikzpicture}
 	\end{subfigure}
 	\caption{Expenses by category}
 \end{figure}
 \end{document}