import json

import pandas as pd

# Name of the column that holds the assigned category labels.
_CATEGORY_COLUMN = 'category'


def _is_str_column(s: pd.Series) -> bool:
    """Return True if the pandas column *s* holds only strings.

    Taken from https://stackoverflow.com/a/67001213/3433817.
    """
    if isinstance(s.dtype, pd.StringDtype):
        # The series was explicitly created as a string series (Pandas>=1.0.0)
        return True
    if s.dtype == 'object':
        # Object series: inspect every value; None is tolerated as "missing".
        return all((v is None) or isinstance(v, str) for v in s)
    return False


def _read_regex_dict(regex_file: str) -> dict:
    """Load the JSON file at *regex_file* and return its parsed content.

    ``assign_categories`` iterates the result as a mapping from category
    name to an iterable of regular-expression strings.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which would garble non-ASCII patterns on some systems.
    with open(regex_file, 'r', encoding='utf-8') as f:
        return json.load(f)


def assign_categories(df: pd.DataFrame, regex_file: str) -> pd.DataFrame:
    """Label the rows of *df* with categories chosen by regex matching.

    Every all-string column except ``'category'`` itself is searched with
    each regular expression from *regex_file*; rows where a pattern is found
    get that pattern's category written to the ``'category'`` column. When
    several patterns hit the same row, the one applied last wins (columns
    are processed left to right, categories and patterns in file order).

    Args:
        df: Input table. It is not modified.
        regex_file: Path to a JSON file mapping category names to lists of
            regular expressions.

    Returns:
        A new DataFrame in which missing values are replaced by ``''`` and a
        ``'category'`` column exists (initialised to ``' '`` when *df* had
        none; otherwise the existing labels are kept unless a pattern
        matches the row).
    """
    regex_dict = _read_regex_dict(regex_file)

    # fillna() returns a new frame, so adding the category column to the
    # result (rather than to df) keeps the caller's frame untouched.
    result = df.fillna('')
    if _CATEGORY_COLUMN not in result.columns:
        result[_CATEGORY_COLUMN] = ' '

    for column in result.columns:
        # The category column is itself made of strings; matching against it
        # would let labels that were just assigned trigger other categories.
        if column == _CATEGORY_COLUMN:
            continue
        values = result[column]
        if not _is_str_column(values):
            continue
        for category, regexes in regex_dict.items():
            for regex in regexes:
                matched = values.str.contains(regex, regex=True)
                result.loc[matched, _CATEGORY_COLUMN] = category
    return result


def main():
    """Categorize the 2023 bank statement CSV and print the category column."""
    df = pd.read_csv('../res/bank_statement_2023_categorized.csv')
    df = assign_categories(df, regex_file='../res/regexes.json')
    print(df['category'])


if __name__ == "__main__":
    main()