banking-breakdown/banking_breakdown/regex_categorizer.py

55 lines
1.3 KiB
Python

import pandas as pd
import json
def _is_str_column(s: pd.Series):
"""Check if the type of a pandas DataFrame column is str.
Taken from https://stackoverflow.com/a/67001213/3433817.
"""
if isinstance(s.dtype, pd.StringDtype):
# The series was explicitly created as a string series (Pandas>=1.0.0)
return True
elif s.dtype == 'object':
# Object series, check each value
return all((v is None) or isinstance(v, str) for v in s)
else:
return False
def _read_regex_dict(regex_file: str):
with open(regex_file, 'r') as f:
return json.load(f)
def assign_categories(df: pd.DataFrame, regex_file: str) -> pd.DataFrame:
if 'category' not in df.columns:
df['category'] = [' '] * len(df.index)
regex_dict = _read_regex_dict(regex_file)
df = df.fillna('')
for column in df.columns:
if not _is_str_column(df[column]):
continue
for category in regex_dict:
for regex in regex_dict[category]:
matched = df[column].str.contains(regex, regex=True)
df.loc[matched, 'category'] = category
return df
def main():
df = pd.read_csv('../res/bank_statement_2023_categorized.csv')
df = assign_categories(df, regex_file='../res/regexes.json')
print(df['category'])
if __name__ == "__main__":
main()