55 lines
1.3 KiB
Python
55 lines
1.3 KiB
Python
import pandas as pd
|
|
import json
|
|
|
|
|
|
def _is_str_column(s: pd.Series) -> bool:
    """Check whether a pandas Series (DataFrame column) holds string data.

    Adapted from https://stackoverflow.com/a/67001213/3433817.

    Args:
        s: The column to inspect.

    Returns:
        True if ``s`` has a dedicated pandas string dtype, or if it has
        ``object`` dtype and every non-missing value is a ``str``. An
        ``object`` column without any non-missing value also counts as a
        string column. False for every other dtype.
    """
    if isinstance(s.dtype, pd.StringDtype):
        # The series was explicitly created as a string series (pandas >= 1.0.0).
        return True
    if s.dtype == 'object':
        # Object series: the dtype alone says nothing, so inspect the values.
        # dropna() skips every missing marker (None, NaN, pd.NA), so a text
        # column with empty cells is treated the same whether the gaps are
        # stored as None or as NaN.
        return all(isinstance(v, str) for v in s.dropna())
    return False
|
|
|
|
|
|
def _read_regex_dict(regex_file: str):
    """Load the regex definitions from a JSON file.

    Args:
        regex_file: Path to the JSON file.

    Returns:
        The decoded JSON document. ``assign_categories`` consumes it as a
        mapping from category name to an iterable of regex patterns.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding, so
    # patterns containing non-ASCII characters load identically everywhere.
    with open(regex_file, 'r', encoding='utf-8') as f:
        return json.load(f)
|
|
|
|
|
|
def assign_categories(df: pd.DataFrame, regex_file: str) -> pd.DataFrame:
    """Assign a category to each row by matching regexes against string columns.

    Every string column of the frame is searched with every regex from
    ``regex_file``; rows where a regex matches get that regex's category
    written into the ``'category'`` column. Assignments overwrite each other,
    so the last matching regex (in column order, then file order) wins.

    Args:
        df: Input data. It is not modified; a new frame is returned.
        regex_file: Path to a JSON file mapping category names to lists of
            regex patterns.

    Returns:
        A new DataFrame with missing values replaced by ``''`` and a
        ``'category'`` column. The column is created with ``' '`` in every
        row when the input does not already have one.
    """
    regex_dict = _read_regex_dict(regex_file)

    # fillna() returns a new frame. Doing it before the 'category' column is
    # added means the column is created on the copy, never on the caller's
    # DataFrame.
    df = df.fillna('')
    if 'category' not in df.columns:
        df['category'] = [' '] * len(df.index)

    # NOTE(review): 'category' is itself a string column, so it is scanned as
    # well and already assigned categories can be re-matched. Confirm whether
    # that is intended before excluding it.
    for column in df.columns:
        if not _is_str_column(df[column]):
            continue

        for category, regexes in regex_dict.items():
            for regex in regexes:
                # Re-read the column on every pass: when the column being
                # scanned is 'category', earlier assignments must be visible.
                matched = df[column].str.contains(regex, regex=True)
                df.loc[matched, 'category'] = category

    return df
|
|
|
|
|
|
def main():
    """Read the 2023 bank statement CSV, categorise it and print the result.

    Both paths are relative, so they are resolved against the current
    working directory. Only the ``'category'`` column is printed.
    """
    statement = pd.read_csv('../res/bank_statement_2023_categorized.csv')
    categorized = assign_categories(statement, regex_file='../res/regexes.json')
    print(categorized['category'])
|
|
|
|
|
|
# Entry point: run main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|