# banking-breakdown/banking_breakdown/regex_categorizer.py

import pandas as pd
import json


def _is_str_column(s: pd.Series):
    """Tell whether a pandas Series holds string data.

    A series counts as a string column when it uses pandas' dedicated
    string dtype, or when it has ``object`` dtype and every element is
    either ``None`` or a ``str``. Any other dtype is not a string column.

    Adapted from https://stackoverflow.com/a/67001213/3433817.
    """
    # Explicit string series (Pandas>=1.0.0) need no per-value inspection.
    if isinstance(s.dtype, pd.StringDtype):
        return True

    if s.dtype == 'object':
        # Object series can hold anything, so look for a single offending
        # value: something that is neither None nor a str.
        return not any(
            value is not None and not isinstance(value, str)
            for value in s
        )

    return False


def _read_regex_dict(regex_file: str):
    """Load and return the parsed JSON content of ``regex_file``.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding, so non-ASCII characters in the JSON are read the same
    way on every system.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(regex_file, 'r', encoding='utf-8') as f:
        return json.load(f)


def assign_categories(df: pd.DataFrame, regex_file: str) -> pd.DataFrame:
    """Assign a category to each row whose text matches a configured regex.

    The content of ``regex_file`` is loaded with ``_read_regex_dict`` and
    used as a mapping from category name to a list of regular expressions.
    Every string column except ``'category'`` itself is searched with each
    pattern; rows where the pattern is found get that category. When several
    patterns match the same row, the one applied last wins (columns are
    visited in frame order, then categories and patterns in mapping order).

    Args:
        df: The data to categorize. It is not modified.
        regex_file: Path to the JSON file holding the category/regex mapping.

    Returns:
        A new DataFrame in which missing values have been replaced by ``''``
        and a ``'category'`` column is present (created with ``' '`` for
        every row when the input did not have one).
    """
    regex_dict = _read_regex_dict(regex_file)

    # fillna returns a new frame, so all modifications below happen on a
    # copy and the caller's DataFrame is left untouched.
    df = df.fillna('')

    if 'category' not in df.columns:
        df['category'] = [' '] * len(df.index)

    for column in df.columns:
        # The category column is the output, not input text: matching against
        # it would let a pattern that happens to occur in a category name
        # re-label rows that are already categorized.
        if column == 'category' or not _is_str_column(df[column]):
            continue

        for category, regexes in regex_dict.items():
            for regex in regexes:
                matched = df[column].str.contains(regex, regex=True)
                df.loc[matched, 'category'] = category

    return df


def main():
    """Read the bank statement CSV, assign categories and print them."""
    statement_path = '../res/bank_statement_2023_categorized.csv'
    regex_path = '../res/regexes.json'

    statement = pd.read_csv(statement_path)
    categorized = assign_categories(statement, regex_file=regex_path)

    # Only the resulting category column is shown.
    print(categorized['category'])


# Call main() only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()