#!/usr/bin/env python3
"""Parse the 2019-2024 backfill staging tree into a SEPARATE ledger file.

Reads documents/decrypted-backfill/<subdir>/, writes web/ledger-backfill.json.
Does NOT touch the live web/ledger.json.
"""
import glob
import importlib.util
import json
import os
import sys
from collections import defaultdict

ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Make the sibling modules in <ROOT>/parsers importable by bare name.
sys.path.insert(0, os.path.join(ROOT, 'parsers'))
import common  # noqa: E402
import classify  # noqa: E402

STAGE = os.path.join(ROOT, 'documents/decrypted-backfill')

# Optional JSON file mapping statement hashes ('sha8') to Gmail message ids.
UNLOCK_RESULTS = '/tmp/unlock-results-backfill.json'

# (staging sub-directory, parser module name under <ROOT>/parsers)
PARSERS = [
    ('Santander', 'santander'),
    ('Banco_de_Chile', 'banco_de_chile'),
    ('BancoEstado', 'bancoestado'),
    ('CopecPay', 'copecpay'),
    ('Tenpo', 'tenpo'),
]

# Per-sub-directory override of the parser entry-point name; any
# sub-directory not listed here uses 'parse_file'.
PARSE_FN = {}


def load(m):
    """Load and return the parser module ``<ROOT>/parsers/<m>.py``.

    The module is executed from its file path and returned; it is not
    registered in ``sys.modules``.
    """
    spec = importlib.util.spec_from_file_location(
        m, os.path.join(ROOT, 'parsers', m + '.py'))
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod


def gid_map(path=UNLOCK_RESULTS):
    """Return a ``{sha8: gmail_id}`` dict read from the unlock-results file.

    Args:
        path: JSON file whose top-level ``'ok'`` list holds entries with a
            ``'sha8'`` key and an optional ``'gmail_id'`` key.

    Returns:
        A dict keyed by every truthy ``'sha8'``; values are ``str`` (empty
        string when the entry has no ``'gmail_id'``). Empty dict when the
        file does not exist.
    """
    try:
        with open(path, encoding='utf-8') as fh:
            results = json.load(fh)
    except (FileNotFoundError, NotADirectoryError):
        # The mapping is optional: without it statements keep whatever
        # gmail_id their parser supplied.
        return {}
    return {
        entry['sha8']: str(entry.get('gmail_id', ''))
        for entry in results.get('ok', [])
        if entry.get('sha8')
    }


def _write_json(path, payload, **dump_kwargs):
    """Write *payload* to *path* as UTF-8 JSON and close the file."""
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly rather than with the locale default.
    with open(path, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, ensure_ascii=False, **dump_kwargs)


def _parse_bank(subdir, modname, gids):
    """Parse every staged PDF of one bank and write its per-statement JSON.

    Args:
        subdir: Sub-directory name under ``STAGE`` (also used for output).
        modname: Parser module name passed to ``load``.
        gids: ``{statement_id: gmail_id}`` mapping from ``gid_map``.

    Returns:
        ``(statements, n_txn)``: the web-facing statement dicts (each a
        shallow copy of the parsed statement plus ``'pdf_url'``) and the
        total number of transactions across them.
    """
    mod = load(modname)
    d_in = os.path.join(STAGE, subdir)
    # De-duplicate through a set: where glob matching is case-insensitive
    # both patterns return the same files, which would double-count them.
    files = sorted(set(
        glob.glob(os.path.join(d_in, '*.pdf'))
        + glob.glob(os.path.join(d_in, '*.PDF'))))
    outdir = os.path.join(ROOT, 'data/ledger-backfill', subdir)
    os.makedirs(outdir, exist_ok=True)
    parse = getattr(mod, PARSE_FN.get(subdir, 'parse_file'))

    statements = []
    n_txn = 0
    for pdf in files:
        try:
            doc = parse(pdf)
        except Exception as e:
            # Best effort by design: report the bad PDF and keep going so a
            # single unparseable file does not abort the whole backfill.
            print(f' FAIL {os.path.basename(pdf)}: {e}')
            continue
        for txn in doc['transactions']:
            common.enrich(txn)
            txn['description'] = common.clean_desc(txn['description'])
            classify.classify(txn)
        # Prefer the id from the unlock-results map; otherwise keep the
        # parser-supplied one (None when the parser set none).
        doc['gmail_id'] = gids.get(doc['statement_id'], doc.get('gmail_id'))
        _write_json(
            os.path.join(outdir, doc['statement_id'] + '.json'), doc, indent=1)
        web_doc = dict(doc)
        web_doc['pdf_url'] = '/' + doc['source_pdf']
        statements.append(web_doc)
        n_txn += len(doc['transactions'])
    return statements, n_txn


def _summarize(statements):
    """Aggregate transactions into ``(flow_summary, real_totals)``.

    ``flow_summary`` maps each ``'flow_type'`` (``'other'`` when absent) to
    its transaction count and summed amount. ``real_totals`` splits amounts
    into internal transfers, credits (income) and everything else (outflow).
    """
    flow = defaultdict(lambda: [0, 0])  # flow_type -> [count, amount]
    income = outflow = internal = 0
    for statement in statements:
        for txn in statement['transactions']:
            amount = txn['amount']
            bucket = flow[txn.get('flow_type', 'other')]
            bucket[0] += 1
            bucket[1] += amount
            if txn.get('internal'):
                internal += amount
            elif txn['direction'] == 'credit':
                income += amount
            else:
                outflow += amount
    flow_summary = {
        kind: {'count': count, 'amount': amount}
        for kind, (count, amount) in flow.items()
    }
    real_totals = {
        'income': income,
        'expense_outflow': outflow,
        'internal': internal,
    }
    return flow_summary, real_totals


def main():
    """Parse all staged banks and write ``web/ledger-backfill.json``.

    Prints one summary line per bank, then the grand transaction total and
    the income / outflow / internal totals.
    """
    gids = gid_map()
    web = {'banks': {}, 'statements': []}
    grand = 0
    for subdir, modname in PARSERS:
        statements, n_txn = _parse_bank(subdir, modname, gids)
        n_doc = len(statements)
        web['statements'].extend(statements)
        web['banks'][subdir] = {'docs': n_doc, 'txns': n_txn}
        grand += n_txn
        print(f'{subdir}: {n_doc} docs, {n_txn} txns')

    web['flow_summary'], web['real_totals'] = _summarize(web['statements'])

    out_path = os.path.join(ROOT, 'web/ledger-backfill.json')
    # Create web/ if needed so a missing directory cannot discard the run.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    _write_json(out_path, web)

    ri = web['real_totals']['income']
    ro = web['real_totals']['expense_outflow']
    ia = web['real_totals']['internal']
    print(f'TOTAL: {grand} txns -> web/ledger-backfill.json')
    print(f'REAL in ${ri:,} REAL out ${ro:,} INTERNAL ${ia:,}')


if __name__ == '__main__':
    main()