Add expense-by-category figure
This commit is contained in:
parent
99a7920118
commit
1e4efa0c5b
@ -1,6 +1,5 @@
|
||||
from banking_breakdown import document_builder
|
||||
from banking_breakdown import statement_parser
|
||||
from banking_breakdown import ui
|
||||
from banking_breakdown import ui, regex_categorizer, statement_parser, \
|
||||
document_builder
|
||||
|
||||
import argparse
|
||||
|
||||
@ -10,6 +9,9 @@ def categorize_func(args):
|
||||
|
||||
df = pd.read_csv(args.i, delimiter=args.d)
|
||||
|
||||
if args.f is not None:
|
||||
df = regex_categorizer.assign_categories(df, args.f)
|
||||
|
||||
import signal
|
||||
signal.signal(signal.SIGINT, signal.SIG_DFL)
|
||||
|
||||
@ -17,7 +19,8 @@ def categorize_func(args):
|
||||
|
||||
|
||||
def report_func(args):
|
||||
print("Report")
|
||||
report_data = statement_parser.parse_statement(args.i)
|
||||
document_builder.build_document(report_data)
|
||||
|
||||
|
||||
#
|
||||
|
||||
@ -14,6 +14,8 @@ def _serialize_report_data(report_data: types.ReportData):
|
||||
report_data.net_income.to_csv('build/net_income.csv', index=False)
|
||||
report_data.category_overview.to_csv('build/category_overview.csv',
|
||||
index=False)
|
||||
report_data.expenses_by_category.to_csv('build/expenses_by_category.csv',
|
||||
index=False)
|
||||
report_data.total_value.to_csv('build/total_value.csv', index=False)
|
||||
report_data.detailed_balance.to_csv('build/detailed_balance.csv',
|
||||
index=False)
|
||||
|
||||
54
banking_breakdown/regex_categorizer.py
Normal file
54
banking_breakdown/regex_categorizer.py
Normal file
@ -0,0 +1,54 @@
|
||||
import pandas as pd
|
||||
import json
|
||||
|
||||
|
||||
def _is_str_column(s: pd.Series):
    """Tell whether a pandas Series holds only string data.

    A series with the dedicated pandas string dtype qualifies directly. An
    ``object`` series qualifies when every element is either ``None`` or a
    ``str``. Any other dtype does not qualify.

    Approach taken from https://stackoverflow.com/a/67001213/3433817.
    """
    dtype = s.dtype

    # Explicit string series (pandas >= 1.0.0).
    if isinstance(dtype, pd.StringDtype):
        return True

    if dtype != 'object':
        return False

    # Object series: inspect the values one by one.
    return not any(v is not None and not isinstance(v, str) for v in s)
|
||||
|
||||
|
||||
def _read_regex_dict(regex_file: str):
    """Load the category -> regex-list mapping from a JSON file.

    Args:
        regex_file: Path of the JSON file to read.

    Returns:
        The decoded JSON document. ``assign_categories`` expects a dict that
        maps each category name to a list of regular expressions.
    """
    # JSON is UTF-8 by specification; being explicit keeps non-ASCII
    # patterns from depending on the platform's default encoding.
    with open(regex_file, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||
|
||||
|
||||
def assign_categories(df: pd.DataFrame, regex_file: str) -> pd.DataFrame:
    """Assign a category to every row that matches one of the regexes.

    Every string column except ``category`` is searched with every pattern
    of every category. When several patterns match the same row, the one
    processed last wins.

    Args:
        df: Statement data. It is not modified; a new frame is returned.
        regex_file: Path of a JSON file mapping category names to lists of
            regular expressions.

    Returns:
        A copy of ``df`` with missing values replaced by ``''`` and a
        ``category`` column (created with ``' '`` if it did not exist)
        updated for all matching rows.
    """
    regex_dict = _read_regex_dict(regex_file)

    # fillna returns a new frame, so all later writes leave the caller's
    # data untouched. Filling also lets _is_str_column accept text columns
    # that contained missing values.
    df = df.fillna('')

    if 'category' not in df.columns:
        df['category'] = ' '

    for column in df.columns:
        # Never match against the category column itself: a label that
        # was just assigned must not trigger another category's pattern.
        if column == 'category' or not _is_str_column(df[column]):
            continue

        for category, patterns in regex_dict.items():
            for pattern in patterns:
                matched = df[column].str.contains(pattern, regex=True)
                df.loc[matched, 'category'] = category

    return df
|
||||
|
||||
|
||||
def main():
    """Categorize the sample statement and print the resulting categories."""
    statement = pd.read_csv('../res/bank_statement_2023_categorized.csv')
    categorized = assign_categories(statement,
                                    regex_file='../res/regexes.json')
    print(categorized['category'])


if __name__ == "__main__":
    main()
|
||||
@ -6,94 +6,104 @@ import re
|
||||
import numpy as np
|
||||
|
||||
|
||||
# def _read_regex_dict(regex_file: str = "res/category_regexes.json"):
|
||||
# with open(regex_file, 'r') as f:
|
||||
# return json.load(f)
|
||||
#
|
||||
#
|
||||
# def _tag_with_category(df: pd.DataFrame) -> pd.DataFrame:
|
||||
# regex_dict = _read_regex_dict()
|
||||
#
|
||||
# return df
|
||||
#
|
||||
#
|
||||
# def _compute_total_balance(df: pd.DataFrame) -> pd.DataFrame:
|
||||
# stripped_df = pd.DataFrame(
|
||||
# {'t': df["Valutadatum"], 'value': df["Saldo nach Buchung"]})
|
||||
#
|
||||
# stripped_df.index = stripped_df['t']
|
||||
# gb = stripped_df.groupby(pd.Grouper(freq='M'))
|
||||
#
|
||||
# result = gb.tail(1)['value'].reset_index()
|
||||
#
|
||||
# return result
|
||||
#
|
||||
#
|
||||
# def _compute_net_income(df: pd.DataFrame) -> pd.DataFrame:
|
||||
# stripped_df = pd.DataFrame({'t': df["Valutadatum"], 'value': df["Betrag"]})
|
||||
#
|
||||
# stripped_df.index = stripped_df['t']
|
||||
# gb = stripped_df.groupby(pd.Grouper(freq='M'))
|
||||
#
|
||||
# result = gb["value"].sum().reset_index()
|
||||
# return result
|
||||
#
|
||||
#
|
||||
# def _compute_category_overview(df: pd.DataFrame) -> pd.DataFrame:
|
||||
# categories = ["Social life", "Other", "Food", "Hobbies",
|
||||
# "Rent \\& Utilities", "Education", "Transportation"]
|
||||
# values = np.array([10, 12, 53, 12, 90, 23, 32])
|
||||
# values = values / values.sum() * 100
|
||||
# values = np.round(values, decimals=1)
|
||||
# values[-1] += 100 - np.sum(values)
|
||||
#
|
||||
# category_overview_df = pd.DataFrame(
|
||||
# {"category": categories, "value": values})
|
||||
#
|
||||
# return category_overview_df
|
||||
#
|
||||
#
|
||||
# def _compute_detailed_balance(df: pd.DataFrame) -> pd.DataFrame:
|
||||
# return pd.DataFrame({'t': df["Valutadatum"],
|
||||
# 'value': df["Saldo nach Buchung"]})
|
||||
#
|
||||
#
|
||||
# def parse_statement(filename: str) -> types.ReportData:
|
||||
# df = pd.read_csv(filename, delimiter=';', decimal=",")
|
||||
# df["Valutadatum"] = pd.to_datetime(df["Valutadatum"], format='%d.%m.%Y')
|
||||
#
|
||||
# category_overview_df = _compute_category_overview(df)
|
||||
# total_balance_df = _compute_total_balance(df)
|
||||
# net_income_df = _compute_net_income(df)
|
||||
# detailed_balance_df = _compute_detailed_balance(df)
|
||||
#
|
||||
# return types.ReportData(category_overview_df,
|
||||
# net_income_df,
|
||||
# total_balance_df,
|
||||
# detailed_balance_df)
|
||||
#
|
||||
#
|
||||
# def main():
|
||||
# report_data = parse_statement("../res/bank_statement_2023.csv")
|
||||
#
|
||||
#
|
||||
# if __name__ == "__main__":
|
||||
# main()
|
||||
def _escape_string(to_escape: str):
    """Escape characters that have a special meaning in LaTeX.

    Each of ``& % $ # _ { }`` is prefixed with a backslash so the text can
    be typeset verbatim. Backslash, ``~`` and ``^`` are left untouched
    because they need a replacement macro rather than a simple prefix.

    Args:
        to_escape: Raw text, e.g. a category name.

    Returns:
        The text with the listed characters backslash-escaped.
    """
    latex_specials = "&%$#_{}"
    table = str.maketrans({char: "\\" + char for char in latex_specials})
    return to_escape.translate(table)
|
||||
|
||||
|
||||
def get_stripped_statement(filename: str) -> pd.DataFrame:
    """Read a raw bank export and keep only the columns used downstream.

    NOTE(review): in the diff view this function is interleaved with
    ``_compute_total_balance``; the two were separated here on the
    assumption that the single visible ``return result`` is shared by
    both -- confirm against the repository.

    Args:
        filename: Path of a ``;``-separated CSV with the German column
            names read below.

    Returns:
        A frame with the columns ``t`` (date as ``YYYY-MM-DD`` string),
        ``other party``, ``value``, ``balance``, ``category`` (empty
        strings), ``description`` and ``purpose``.
    """
    # df = pd.read_csv(filename, delimiter=';', decimal=",")
    df = pd.read_csv(filename, delimiter=';')
    df["Valutadatum"] = (pd.to_datetime(df["Valutadatum"], format='%d.%m.%Y')
                         .dt.strftime('%Y-%m-%d'))

    result = pd.DataFrame({'t': df["Valutadatum"],
                           'other party': df["Name Zahlungsbeteiligter"],
                           'value': df["Betrag"],
                           'balance': df["Saldo nach Buchung"],
                           'category': [''] * len(df["Valutadatum"]),
                           'description': df["Buchungstext"],
                           'purpose': df["Verwendungszweck"]
                           })

    return result


def _compute_total_balance(df: pd.DataFrame) -> pd.DataFrame:
    """Return the balance after the last transaction of every month.

    Args:
        df: Statement with a datetime column ``t`` and a ``balance``
            column.

    Returns:
        A frame with the columns ``t`` (date of the last row of each
        calendar month, in input order) and ``value`` (its balance).
    """
    balance = df.set_index('t')['balance'].rename('value')

    # An offset object is used instead of the 'M' alias: recent pandas
    # releases deprecate 'M' in favour of 'ME', which older releases do
    # not accept. The object form works on both.
    gb = balance.groupby(pd.Grouper(freq=pd.offsets.MonthEnd()))

    return gb.tail(1).reset_index()
|
||||
|
||||
|
||||
def _compute_net_income(df: pd.DataFrame) -> pd.DataFrame:
    """Sum all transaction values per calendar month.

    Args:
        df: Statement with a datetime column ``t`` and a numeric ``value``
            column.

    Returns:
        A frame with the columns ``t`` (month-end timestamp of each bin)
        and ``value`` (sum of that month's values).
    """
    # An offset object is used instead of the 'M' alias: recent pandas
    # releases deprecate 'M' in favour of 'ME', which older releases do
    # not accept. The object form works on both.
    monthly = (df.set_index('t')['value']
               .groupby(pd.Grouper(freq=pd.offsets.MonthEnd()))
               .sum())

    return monthly.reset_index()
|
||||
|
||||
|
||||
def _compute_category_overview(df: pd.DataFrame) -> pd.DataFrame:
    """Compute each category's share of the total expenses in percent.

    Only rows with a negative ``value`` count as expenses.

    Args:
        df: Statement with a ``category`` column and a numeric ``value``
            column.

    Returns:
        A frame with the columns ``category`` (LaTeX-escaped name) and
        ``value`` (percentage rounded to one decimal), sorted by
        descending share. The frame is empty when there are no expenses.
    """
    expenses = df.loc[df['value'] < 0]

    # Sum only the amounts; summing whole rows would also try to add up
    # the free-text columns.
    totals = expenses.groupby('category')['value'].sum()

    if totals.empty:
        return pd.DataFrame({"category": [], "value": []})

    # np.round returns a fresh array, so the in-place correction below
    # never writes into pandas-owned memory.
    values = np.round((totals / totals.sum() * 100).to_numpy(), decimals=1)

    # Push the rounding error into the last entry so the rounded shares
    # add up to 100. This has to happen after rounding to have any effect.
    values[-1] = np.round(values[-1] + (100 - np.sum(values)), decimals=1)

    categories = [_escape_string(category) for category in totals.index]

    category_overview_df = pd.DataFrame(
        {"category": categories, "value": values})

    return category_overview_df.sort_values('value', ascending=False)
|
||||
|
||||
|
||||
def _compute_expenses_by_category(complete_df: pd.DataFrame) -> pd.DataFrame:
    """Build a month-by-category table of expenses.

    Only rows with a negative ``value`` count as expenses; their sign is
    flipped so the table holds positive amounts.

    Args:
        complete_df: Statement with a datetime column ``t``, a ``category``
            column and a numeric ``value`` column. It is not modified.

    Returns:
        A frame with one row per calendar month between the first and last
        expense and one column per category, ordered by descending mean
        amount. The last column ``t`` holds the date of the month's last
        expense, or the month end for a month without any expense.
    """
    # Work on an explicit copy of the needed columns: the caller's frame
    # stays untouched and no chained assignment happens on a slice.
    expenses = complete_df.loc[complete_df['value'] < 0,
                               ['t', 'category', 'value']].copy()
    expenses['value'] = -expenses['value']
    expenses = expenses.set_index('t', drop=False)

    # An offset object is used instead of the 'M' alias: recent pandas
    # releases deprecate 'M' in favour of 'ME', which older releases do
    # not accept. The object form works on both.
    monthly = expenses.groupby(pd.Grouper(freq=pd.offsets.MonthEnd()))

    categories = list(expenses['category'].unique())

    rows = []
    dates = []
    for month_end, month_df in monthly:
        totals = month_df.groupby('category')['value'].sum()
        rows.append([totals.get(category, 0) for category in categories])

        # Take the date from the very group that produced the row, so
        # dates and rows stay aligned even for a month without expenses.
        dates.append(month_df['t'].iloc[-1] if len(month_df) else month_end)

    result = pd.DataFrame(rows, columns=categories)
    result = result.reindex(result.mean().sort_values(ascending=False).index,
                            axis=1)

    # 't' is appended after the category columns, i.e. it is the last one.
    result['t'] = dates

    return result
|
||||
|
||||
|
||||
def _compute_detailed_balance(df: pd.DataFrame) -> pd.DataFrame:
    """Return the balance after every single transaction.

    The result has the columns ``t`` (taken from ``df['t']``) and ``value``
    (taken from ``df['balance']``).
    """
    balance_per_row = df[['t', 'balance']]
    return balance_per_row.rename(columns={'balance': 'value'})
|
||||
|
||||
|
||||
def parse_statement(filename: str) -> types.ReportData:
    """Read a categorized statement CSV and derive all report tables.

    The CSV is read with pandas defaults and its ``t`` column is parsed as
    ``YYYY-MM-DD`` dates before the individual tables are computed.
    """
    statement = pd.read_csv(filename)
    statement["t"] = pd.to_datetime(statement["t"], format='%Y-%m-%d')

    overview = _compute_category_overview(statement)
    total_balance = _compute_total_balance(statement)
    net_income = _compute_net_income(statement)
    detailed_balance = _compute_detailed_balance(statement)
    expenses = _compute_expenses_by_category(statement)

    return types.ReportData(
        category_overview=overview,
        expenses_by_category=expenses,
        net_income=net_income,
        total_value=total_balance,
        detailed_balance=detailed_balance,
    )
|
||||
|
||||
|
||||
def main():
    """Parse the sample statement; the result is not used further."""
    parse_statement("../res/bank_statement_2023_categorized.csv")


if __name__ == "__main__":
    main()
|
||||
|
||||
5
banking_breakdown/test.json
Normal file
5
banking_breakdown/test.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"asdf": [
|
||||
"Kinemic"
|
||||
]
|
||||
}
|
||||
@ -5,6 +5,7 @@ import pandas as pd
|
||||
@dataclass
class ReportData:
    """Bundle of the tables that make up one report.

    The field order is part of the interface for callers that construct
    the class positionally.
    """

    # Share of each expense category.
    category_overview: pd.DataFrame
    # Expenses per month, one column per category.
    expenses_by_category: pd.DataFrame
    # Sum of all transaction values per month.
    net_income: pd.DataFrame
    # Account balance per month (filled from the total-balance table).
    total_value: pd.DataFrame
    # Account balance after every transaction.
    detailed_balance: pd.DataFrame
|
||||
|
||||
@ -212,5 +212,97 @@
|
||||
\end{figure}
|
||||
|
||||
|
||||
\begin{figure}[H]
    \centering

    % Read the table once; both subfigures plot from it.
    \pgfplotstableread[col sep=comma]{expenses_by_category.csv}\expbycattable
    \pgfplotstablegetcolsof{\expbycattable}
    % NumCols = number of table columns minus one.
    % NOTE(review): presumably the excluded column is the date column t
    % used as x below; confirm that it is the last column of the CSV.
    \pgfmathtruncatemacro\NumCols{\pgfplotsretval-1}

    % The columns are split over two stacked area plots: the first
    % subfigure covers the indices up to NumCols/2 - 1, the second one the
    % indices from NumCols/2 to NumCols - 1.
    % NOTE(review): NumCols/2 is not truncated; check the loop bounds for
    % an odd NumCols. For NumCols = 2 the colour expression of the first
    % loop divides by NumCols/2 - 1 = 0.
    \begin{subfigure}[c]{\textwidth}
        \centering

        \begin{tikzpicture}
            \begin{axis}[
                    stack plots=y,
                    area style,
                    date coordinates in=x,
                    width=\textwidth,
                    height=0.375\textwidth,
                    xticklabel=\month.\shortyear{\year},
                    xtick=data,
                    enlargelimits=false,
                    xticklabel style={
                            rotate=60,
                            anchor=near xticklabel,
                        },
                    legend columns=5,
                    legend style={at={(0.5,-0.6)},anchor=south},
                    ylabel={Expenses in €},
                    ymin=0,
                ]
                % For each column index in the first half
                \pgfplotsinvokeforeach{0,...,\NumCols/2 -1}{
                    % Define color: spread the loop indices evenly over
                    % the range 0 to 1000 and extract that colour
                    \pgfmathparse{1000 / (\NumCols/2 -1) * #1}
                    \extractcolormapcolor{tempcol#1}{\pgfmathresult}

                    % Add plot: column selected by index, stacked on the
                    % previous ones
                    \addplot+[tempcol#1]
                    table[col sep=comma, x=t, y index=#1]
                        {\expbycattable} \closedcycle;

                    % Add legend entry (https://tex.stackexchange.com/a/405018)
                    \pgfplotstablegetcolumnnamebyindex{#1}\of{\expbycattable}\to\pgfplotsretval
                    \expandafter\addlegendentry\expandafter{\pgfplotsretval}
                }
            \end{axis}
        \end{tikzpicture}
    \end{subfigure}\\[1em]
    \begin{subfigure}[c]{\textwidth}
        \centering

        \begin{tikzpicture}
            \begin{axis}[
                    stack plots=y,
                    area style,
                    date coordinates in=x,
                    width=\textwidth,
                    height=0.375\textwidth,
                    xticklabel=\month.\shortyear{\year},
                    xtick=data,
                    enlargelimits=false,
                    xticklabel style={
                            rotate=60,
                            anchor=near xticklabel,
                        },
                    legend columns=5,
                    legend style={at={(0.5,-0.6)},anchor=south},
                    ylabel={Expenses in €},
                    ymin=0,
                ]
                % For each column index in the second half
                \pgfplotsinvokeforeach{\NumCols/2,...,\NumCols-1}{
                    % Define color: same spreading as above, with the
                    % index shifted so the second half also starts at 0
                    \pgfmathparse{1000 * (#1 - \NumCols/2) / (\NumCols-1 - \NumCols/2)}
                    \extractcolormapcolor{tempcol#1}{\pgfmathresult}

                    % Add plot: column selected by index, stacked on the
                    % previous ones
                    \addplot+[tempcol#1]
                    table[col sep=comma, x=t, y index=#1]
                        {\expbycattable} \closedcycle;

                    % Add legend entry (https://tex.stackexchange.com/a/405018)
                    \pgfplotstablegetcolumnnamebyindex{#1}\of{\expbycattable}\to\pgfplotsretval
                    \expandafter\addlegendentry\expandafter{\pgfplotsretval}
                }
            \end{axis}
        \end{tikzpicture}
    \end{subfigure}

    \caption{Expenses by category}
\end{figure}
|
||||
|
||||
\end{document}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user