banking-breakdown/banking_breakdown/statement_parser.py

123 lines
3.9 KiB
Python

import typing
import pandas as pd
from banking_breakdown import types
import json
import re
import numpy as np
def _escape_string(to_escape: str):
return to_escape.translate(str.maketrans({"&": r"\&"}))
def _compute_total_balance(df: pd.DataFrame) -> pd.DataFrame:
stripped_df = pd.DataFrame(
{'t': df["t"], 'value': df["balance"]})
stripped_df.index = stripped_df['t']
gb = stripped_df.groupby(pd.Grouper(freq='M'))
result = gb.tail(1)['value'].reset_index()
return result
def _compute_net_income(df: pd.DataFrame) -> pd.DataFrame:
df.index = df['t']
income_df = df.loc[df['value'] > 0]
expenses_df = df.loc[df['value'] < 0]
income_df = income_df.groupby(pd.Grouper(freq='M'))[
'value'].sum().reset_index().round(decimals=2)
expenses_df = expenses_df.groupby(pd.Grouper(freq='M'))[
'value'].sum().reset_index().round(decimals=2)
t = income_df['t']
income = income_df['value'].round(decimals=2)
expenses = expenses_df['value'].round(decimals=2)
net = (income + expenses).round(decimals=2)
result_df = pd.DataFrame(
{'t': t, 'income': income, 'expenses': expenses, 'net': net})
return result_df
def _compute_category_overview(df: pd.DataFrame) -> pd.DataFrame:
df = df.loc[df['value'] < 0]
df = df.drop('t', axis=1)
df = df.groupby(['category']).sum().reset_index()
values = (df['value'] / df['value'].sum() * 100).to_numpy()
values[-1] += 100 - np.sum(values)
values = np.round(values, decimals=1)
categories = [_escape_string(category) for category in df['category']]
category_overview_df = pd.DataFrame(
{"category": categories, "value": values})
category_overview_df = category_overview_df.sort_values('value',
ascending=False)
return category_overview_df
def _compute_expenses_by_category(complete_df: pd.DataFrame) -> pd.DataFrame:
complete_df = complete_df.loc[complete_df['value'] < 0].copy()
complete_df['value'] = -complete_df['value']
complete_df.index = complete_df['t']
complete_gb = complete_df.groupby(pd.Grouper(freq='M'))
categories = complete_df['category'].unique()
data_dict = {category: [] for category in categories}
for (month_date, month_df) in complete_gb:
month_df = month_df.drop('t', axis=1).reset_index().drop('t', axis=1)
category_df = month_df.groupby(['category']).sum().reset_index()
for _, row in category_df.iterrows():
data_dict[row['category']].append(row['value'])
non_listed = list(set(categories) - set(category_df['category']))
for category in non_listed:
data_dict[category].append(0)
result = pd.DataFrame(data_dict)
result = result.reindex(result.mean().sort_values(ascending=False).index,
axis=1)
result = result.round(decimals=2)
result['t'] = complete_gb.tail(1).drop('t', axis=1).reset_index()['t']
return result
def _compute_detailed_balance(df: pd.DataFrame) -> pd.DataFrame:
return pd.DataFrame({'t': df["t"],
'value': df["balance"]})
def parse_statement(filename: str) -> types.ReportData:
df = pd.read_csv(filename)
df["t"] = pd.to_datetime(df["t"], format='%Y-%m-%d')
category_overview_df = _compute_category_overview(df)
total_balance_df = _compute_total_balance(df)
net_income_df = _compute_net_income(df)
detailed_balance_df = _compute_detailed_balance(df)
expenses_by_category_df = _compute_expenses_by_category(df)
return types.ReportData(category_overview_df,
expenses_by_category_df,
net_income_df,
total_balance_df,
detailed_balance_df, )
def main():
report_data = parse_statement("../res/bank_statement_2023_categorized.csv")
if __name__ == "__main__":
main()