"""Assign categories to bank statement rows by matching regular expressions."""
import json

import pandas as pd

# Name of the column that receives the matched category.
_CATEGORY_COLUMN = 'category'


def _is_str_column(s: pd.Series) -> bool:
    """Check if the type of a pandas DataFrame column is str.

    Taken from https://stackoverflow.com/a/67001213/3433817.

    :param s: The column to inspect.
    :return: True for an explicit string dtype, or for an object column whose
        values are all ``str`` or ``None``; False otherwise.
    """
    if isinstance(s.dtype, pd.StringDtype):
        # The series was explicitly created as a string series (Pandas>=1.0.0)
        return True
    if s.dtype == 'object':
        # Object series, check each value
        return all(v is None or isinstance(v, str) for v in s)
    return False


def _read_regex_dict(regex_file: str) -> dict:
    """Load the category -> list-of-regexes mapping from a JSON file.

    :param regex_file: Path of the JSON file.
    :return: The decoded JSON document.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, otherwise patterns containing umlauts may be decoded wrongly.
    with open(regex_file, 'r', encoding='utf-8') as f:
        return json.load(f)


def assign_categories(df: pd.DataFrame, regex_file: str) -> pd.DataFrame:
    """Tag every row whose text matches a regex with that regex's category.

    Every string column except the category column itself is searched. If
    several patterns match a row, the one processed last wins (columns in
    frame order, then categories and patterns in file order).

    :param df: The statement. It is not modified; a new frame is returned.
    :param regex_file: Path of a JSON file mapping each category name to a
        list of regular expressions.
    :return: A copy of ``df`` in which missing values are replaced by empty
        strings and a ``category`` column is present (created with the
        placeholder ``' '`` if it did not exist).
    """
    regex_dict = _read_regex_dict(regex_file)

    # fillna returns a new frame, so the caller's frame stays untouched even
    # when the category column has to be added below.
    df = df.fillna('')
    if _CATEGORY_COLUMN not in df.columns:
        df[_CATEGORY_COLUMN] = ' '

    # The category column must not be searched: otherwise a category that
    # was just assigned could itself match another category's pattern and
    # be overwritten.
    text_columns = [column for column in df.columns
                    if column != _CATEGORY_COLUMN
                    and _is_str_column(df[column])]

    for column in text_columns:
        for category, patterns in regex_dict.items():
            for pattern in patterns:
                matched = df[column].str.contains(pattern, regex=True,
                                                  na=False)
                df.loc[matched, _CATEGORY_COLUMN] = category

    return df


def main():
    """Categorize the example statement and print the resulting categories."""
    df = pd.read_csv('../res/bank_statement_2023_categorized.csv')

    df = assign_categories(df, regex_file='../res/regexes.json')

    print(df['category'])


if __name__ == "__main__":
    main()
def _escape_string(to_escape: str) -> str:
    """Escape ``&`` as ``\\&`` so the text can be used in the LaTeX report."""
    return to_escape.translate(str.maketrans({"&": r"\&"}))


def _month_end_grouper() -> pd.Grouper:
    """Return a grouper that bins a DatetimeIndex by calendar month.

    pandas 2.2 renamed the month-end offset alias from ``'M'`` to ``'ME'``
    and deprecated the old spelling; older releases only know ``'M'``.
    """
    try:
        return pd.Grouper(freq='ME')
    except ValueError:
        return pd.Grouper(freq='M')


def _compute_total_balance(df: pd.DataFrame) -> pd.DataFrame:
    """Return the balance after the last listed transaction of each month.

    "Last" refers to row order within the month.
    NOTE(review): this presumably expects rows in chronological order -
    confirm against the statement export.

    :param df: Statement with a datetime column ``t`` and a ``balance``
        column.
    :return: Frame with columns ``t`` (date of that transaction) and
        ``value`` (its balance).
    """
    balances = pd.DataFrame({'t': df["t"], 'value': df["balance"]})
    # Grouper needs the dates in the index
    balances.index = balances['t']

    monthly = balances.groupby(_month_end_grouper())
    return monthly.tail(1)['value'].reset_index()


def _compute_net_income(df: pd.DataFrame) -> pd.DataFrame:
    """Return the sum of all transaction values per calendar month.

    :param df: Statement with a datetime column ``t`` and a ``value`` column.
    :return: Frame with columns ``t`` (month label) and ``value`` (sum).
    """
    amounts = pd.DataFrame({'t': df["t"], 'value': df["value"]})
    amounts.index = amounts['t']

    monthly = amounts.groupby(_month_end_grouper())
    return monthly["value"].sum().reset_index()


def _compute_category_overview(df: pd.DataFrame) -> pd.DataFrame:
    """Return each category's share of the total expenses in percent.

    Only rows with a negative ``value`` count as expenses. Shares are rounded
    to one decimal; the rounding residual is added to the largest share so
    that the shares add up to 100.

    :param df: Statement with ``value`` and ``category`` columns.
    :return: Frame with columns ``category`` (LaTeX-escaped name) and
        ``value`` (percentage), sorted by descending share. Empty if the
        statement contains no expenses.
    """
    expenses = df.loc[df['value'] < 0]
    # Sum only the amounts; the other columns are irrelevant here.
    totals = expenses.groupby('category')['value'].sum()
    if totals.empty:
        return pd.DataFrame({"category": [], "value": []})

    # Round first, then correct: correcting before rounding has no effect
    # because the unrounded shares already sum to 100.
    shares = np.round((totals / totals.sum() * 100).to_numpy(), decimals=1)
    shares[np.argmax(shares)] += 100 - shares.sum()
    # Remove floating point noise introduced by the correction
    shares = np.round(shares, decimals=1)

    categories = [_escape_string(str(category)) for category in totals.index]
    overview = pd.DataFrame({"category": categories, "value": shares})

    return overview.sort_values('value', ascending=False)


def _compute_expenses_by_category(complete_df: pd.DataFrame) -> pd.DataFrame:
    """Return the monthly expenses of every category as positive amounts.

    Only rows with a negative ``value`` count as expenses. The result has one
    row per calendar month between the first and the last expense, including
    months without any expense (all zeros).

    :param complete_df: Statement with a datetime column ``t`` and ``value``
        and ``category`` columns.
    :return: Frame with one column per category, ordered by descending mean
        monthly expense, followed by a column ``t``. ``t`` is the date of the
        last listed expense of the month, or the month label for months
        without expenses.
    """
    expenses = complete_df.loc[complete_df['value'] < 0,
                               ['t', 'category', 'value']]
    if expenses.empty:
        return pd.DataFrame({'t': pd.Series(dtype='datetime64[ns]')})

    expenses = expenses.assign(value=-expenses['value'])
    # Grouper needs the dates in the index
    expenses.index = pd.DatetimeIndex(expenses['t'])

    # One entry per month of the covered range; NaT where a month has no
    # expense. Its index defines the rows of the result.
    last_dates = expenses.groupby(_month_end_grouper())['t'].last()
    months = last_dates.index

    monthly = (expenses.groupby([_month_end_grouper(), 'category'])['value']
               .sum()
               .unstack('category', fill_value=0)
               .reindex(months, fill_value=0))

    ranked = monthly.mean().sort_values(ascending=False).index
    result = monthly[ranked].reset_index(drop=True)
    result.columns.name = None

    # Derive t from the same monthly index as the data rows so that dates
    # and values cannot get out of step.
    result['t'] = last_dates.where(last_dates.notna(),
                                   months.to_series()).to_numpy()

    return result


def _compute_detailed_balance(df: pd.DataFrame) -> pd.DataFrame:
    """Return the balance after every transaction.

    :param df: Statement with ``t`` and ``balance`` columns.
    :return: Frame with columns ``t`` and ``value`` (the balance).
    """
    return pd.DataFrame({'t': df["t"],
                         'value': df["balance"]})


def parse_statement(filename: str) -> "types.ReportData":
    """Read a categorized statement and compute all tables of the report.

    :param filename: Path of a CSV file whose ``t`` column holds dates in
        ``YYYY-MM-DD`` format.
    :return: The report data built from the statement.
    """
    df = pd.read_csv(filename)
    df["t"] = pd.to_datetime(df["t"], format='%Y-%m-%d')

    category_overview_df = _compute_category_overview(df)
    total_balance_df = _compute_total_balance(df)
    net_income_df = _compute_net_income(df)
    detailed_balance_df = _compute_detailed_balance(df)
    expenses_by_category_df = _compute_expenses_by_category(df)

    return types.ReportData(category_overview_df,
                            expenses_by_category_df,
                            net_income_df,
                            total_balance_df,
                            detailed_balance_df, )


def main():
    """Parse the example statement."""
    parse_statement("../res/bank_statement_2023_categorized.csv")


if __name__ == "__main__":
    main()
+\begin{figure}[H] + \centering + + % Read table + \pgfplotstableread[col sep=comma]{expenses_by_category.csv}\expbycattable + \pgfplotstablegetcolsof{\expbycattable} + \pgfmathtruncatemacro\NumCols{\pgfplotsretval-1} + + \begin{subfigure}[c]{\textwidth} + \centering + + \begin{tikzpicture} + \begin{axis}[ + stack plots=y, + area style, + date coordinates in=x, + width=\textwidth, + height=0.375\textwidth, + xticklabel=\month.\shortyear{\year}, + xtick=data, + enlargelimits=false, + xticklabel style={ + rotate=60, + anchor=near xticklabel, + }, + legend columns=5, + legend style={at={(0.5,-0.6)},anchor=south}, + ylabel={Expenses in €}, + ymin=0, + ] + % For each + \pgfplotsinvokeforeach{0,...,\NumCols/2 -1}{ + % Define color + \pgfmathparse{1000 / (\NumCols/2 -1) * #1} + \extractcolormapcolor{tempcol#1}{\pgfmathresult} + + % Add plot + \addplot+[tempcol#1] + table[col sep=comma, x=t, y index=#1] + {\expbycattable} \closedcycle; + + % Add legend entry (https://tex.stackexchange.com/a/405018) + \pgfplotstablegetcolumnnamebyindex{#1}\of{\expbycattable}\to\pgfplotsretval + \expandafter\addlegendentry\expandafter{\pgfplotsretval} + } + \end{axis} + \end{tikzpicture} + \end{subfigure}\\[1em] + \begin{subfigure}[c]{\textwidth} + \centering + + \begin{tikzpicture} + \begin{axis}[ + stack plots=y, + area style, + date coordinates in=x, + width=\textwidth, + height=0.375\textwidth, + xticklabel=\month.\shortyear{\year}, + xtick=data, + enlargelimits=false, + xticklabel style={ + rotate=60, + anchor=near xticklabel, + }, + legend columns=5, + legend style={at={(0.5,-0.6)},anchor=south}, + ylabel={Expenses in €}, + ymin=0, + ] + % For each + \pgfplotsinvokeforeach{\NumCols/2,...,\NumCols-1}{ + % Define color + \pgfmathparse{1000 * (#1 - \NumCols/2) / (\NumCols-1 - \NumCols/2)} + \extractcolormapcolor{tempcol#1}{\pgfmathresult} + + % Add plot + \addplot+[tempcol#1] + table[col sep=comma, x=t, y index=#1] + {\expbycattable} \closedcycle; + + % Add legend entry 
(https://tex.stackexchange.com/a/405018) + \pgfplotstablegetcolumnnamebyindex{#1}\of{\expbycattable}\to\pgfplotsretval + \expandafter\addlegendentry\expandafter{\pgfplotsretval} + } + \end{axis} + \end{tikzpicture} + \end{subfigure} + + \caption{Expenses by category} +\end{figure} + \end{document}