#!/usr/bin/env python3
"""
BancoEstado parser (Cuenta Corriente / Cuenta Vista / CuentaRUT) -> normalized ledger JSON.

Layout (ESTADO DE MOVIMIENTOS):
  N°Doc | DESCRIPCION | SUC | CARGOS O GIROS | ABONOS O DEPOSITOS | FECHA(DD/MM/YYYY) | SALDO

Two real-world header/row variants are handled:
  - single-line: header has CARGOS+ABONOS on one line; each txn is one line.
  - wrapped: 'ABONOS O' / 'DEPOSITOS' wrap above/below the CARGOS line, and a txn's
    docno, description and amount+date can span several lines.
Rows are assembled as docno-delimited blocks (every txn starts with a 5+ digit docno at
column 0), so both variants parse. Amounts dot-thousands, no '$'.
"""
import json
import os
import re
import sys

# Make the sibling `common` helper module importable when run as a script.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import common

DATE = re.compile(r'(\d{2})/(\d{2})/(\d{4})')        # DD/MM/YYYY
DOCNO = re.compile(r'^(\d{5,})\b')                   # 5+ digit document number at column 0
NUMTOK = re.compile(r'\d{1,3}(?:\.\d{3})+|\d+')      # dot-thousands amount or plain digit run


def parse_period(txt):
    """Return ``(period_start, period_end)`` as ISO ``YYYY-MM-DD`` strings.

    Only the first 1200 characters of *txt* are scanned. The second and third
    DD/MM/YYYY dates found there are taken as the period bounds (the first date
    is skipped -- presumably an issue date; verify against real statements).
    Returns ``(None, None)`` when fewer than three dates are present.
    """
    dts = DATE.findall(txt[:1200])
    if len(dts) < 3:
        return (None, None)
    d1, d2 = dts[1], dts[2]
    return (f"{d1[2]}-{d1[1]}-{d1[0]}", f"{d2[2]}-{d2[1]}-{d2[0]}")


def find_account(txt):
    """Return the account number following an 'Nº CUENTA' label, or ``None``.

    The returned string keeps whatever digits, dots and dashes follow the label.
    """
    m = re.search(r'N[º°]\s*CUENTA[^\d]*(\d[\d\.\-]{5,})', txt, re.I)
    return m.group(1) if m else None


def _find_abonos_col(upper_line):
    """Return the column of 'ABONOS' (or, failing that, 'DEPOSITOS') in *upper_line*, else -1."""
    pos = upper_line.find('ABONOS')
    if pos < 0:
        pos = upper_line.find('DEPOSITOS')
    return pos


def find_columns(lines):
    """Locate cargos/abonos/fecha column x-positions, tolerating a wrapped ABONOS header.

    Scans for the first line containing 'CARGOS O GIROS' and returns
    ``(cx, ax, fx, sx, header_index)`` where each x is the character offset of
    the CARGOS, ABONOS, FECHA and SALDO header words. ``fx``/``sx`` are ``None``
    when their word is not on the header line. ``ax`` is ``None`` when no
    ABONOS/DEPOSITOS word is found on the header line or within two lines of it.
    Returns five ``None`` values when there is no header line at all.
    """
    for i, ln in enumerate(lines):
        u = ln.upper()
        cx = u.find('CARGOS O GIROS')
        if cx < 0:
            continue

        fx = u.find('FECHA')
        fx = fx if fx >= 0 else None
        sx = u.find('SALDO')
        sx = sx if sx >= 0 else None

        ax = _find_abonos_col(u)
        if ax < 0:
            # wrapped: look on neighbouring lines, nearest first
            for j in (i - 1, i + 1, i - 2, i + 2):
                if 0 <= j < len(lines):
                    p = _find_abonos_col(lines[j].upper())
                    if p >= 0:
                        ax = p
                        break
        if ax < 0:
            # Never leak str.find's -1: callers test `ax is not None`, and a
            # negative column would put every amount into the abono band.
            ax = None
        return cx, ax, fx, sx, i
    return None, None, None, None, None


def _movement_end(lines, start):
    """Return the index of the first 'RESUMEN DEL PER...' line at/after *start*, else len(lines)."""
    for k in range(start, len(lines)):
        if re.search(r'RESUMEN\s+DEL\s+PER', lines[k], re.I):
            return k
    return len(lines)


def _split_blocks(lines):
    """Group *lines* into blocks, each starting at a line that begins with a docno.

    Lines before the first docno line are dropped; every other line is attached
    to the block opened by the closest preceding docno line.
    """
    blocks, cur = [], None
    for ln in lines:
        if DOCNO.match(ln):
            if cur is not None:
                blocks.append(cur)
            cur = [ln]
        elif cur is not None:
            cur.append(ln)
    if cur is not None:
        blocks.append(cur)
    return blocks


def _classify_amounts(amount_line, dm, cx, ax, fxb):
    """Return ``(cargo, abono, saldo)`` parsed from the line that carries the date.

    A number is a cargo when its right edge falls in ``[cx - 3, ax)`` and an
    abono when it falls in ``[ax - 3, fxb)``. Only numbers left of the date are
    classified: anything from the date onwards is the date itself or the saldo,
    and must not be mistaken for an amount when the FECHA fallback band is wide.
    The saldo is the first number after the date.
    """
    ds, de = dm.span()
    cargo = abono = saldo = None
    for mo in NUMTOK.finditer(amount_line):
        if mo.start() >= ds:
            break  # reached the date; the remaining tokens are date parts / saldo
        e = mo.end()
        v = common.to_int(mo.group())
        if v is None:
            continue
        if cx - 3 <= e < ax:
            cargo = v
        elif ax - 3 <= e < fxb:
            abono = v
    sm = NUMTOK.search(amount_line[de:])  # saldo = first number after the date
    if sm:
        saldo = common.to_int(sm.group())
    return cargo, abono, saldo


def _build_description(blk, amount_idx, date_start):
    """Join the text fragments of a block into one description string.

    The leading docno, the amount/date tail of the amount line, and every
    number are removed; leftover slashes and repeated whitespace are collapsed.
    """
    pieces = []
    for idx, line in enumerate(blk):
        seg = line
        if idx == amount_idx:
            # Cut at the date BEFORE dropping the docno: date_start is an
            # offset into the full line, not into the docno-stripped remainder.
            seg = seg[:date_start]  # keep only text left of amounts/date
        if idx == 0:
            seg = seg[DOCNO.match(line).end():]  # drop leading docno
        seg = re.sub(r'\b001\b', ' ', seg)  # SUC column
        seg = NUMTOK.sub(' ', seg)          # strip stray numbers
        seg = seg.strip()
        if seg:
            pieces.append(seg)
    desc = ' '.join(pieces)
    desc = re.sub(r'(?:\s*/\s*)+', ' ', desc)  # collapse stray slashes left by stripped RUTs
    return re.sub(r'\s{2,}', ' ', desc).strip(' /')


def _parse_block(blk, cx, ax, fxb):
    """Turn one docno-delimited block into a transaction dict, or ``None`` to skip it.

    A block is skipped when none of its lines carries a date, or when neither a
    non-zero cargo nor a non-zero abono is found on the dated line.
    """
    docno = DOCNO.match(blk[0]).group(1)
    # the amount line is the first block line carrying the date
    amount_idx = next((i for i, l in enumerate(blk) if DATE.search(l)), None)
    if amount_idx is None:
        return None
    amount_line = blk[amount_idx]
    dm = DATE.search(amount_line)
    dd, mm, yy = dm.groups()

    cargo, abono, saldo = _classify_amounts(amount_line, dm, cx, ax, fxb)
    if cargo:
        amt, direction = cargo, 'debit'
    elif abono:
        amt, direction = abono, 'credit'
    else:
        return None

    desc = _build_description(blk, amount_idx, dm.start())
    t = common.blank_txn(f'{yy}-{mm}-{dd}', desc, amt, direction, saldo,
                         amount_line.rstrip())
    t['doc_number'] = docno
    return t


def parse_file(path):
    """Parse the BancoEstado statement at *path* into a normalized ledger dict.

    The text is obtained through ``common.text_layout(path)`` (presumably a
    layout-preserving PDF text extraction -- see ``common``). The document type
    is 'cuenta_corriente' when the file name contains 'CORRIENTE', otherwise
    'cuenta_vista'. Transactions are read from the lines between the column
    header and the first 'RESUMEN DEL PER...' line; when the header (or its
    ABONOS column) cannot be located the transaction list is empty.
    """
    txt = common.text_layout(path)
    lines = txt.split('\n')
    ps, pe = parse_period(txt)
    acct = find_account(txt)
    base = os.path.basename(path).upper()
    dt = 'cuenta_corriente' if 'CORRIENTE' in base else 'cuenta_vista'

    cx, ax, fx, sx, hdr = find_columns(lines)
    txns = []
    if cx is not None and ax is not None:
        fxb = fx if fx else ax + 40  # fallback band end for FECHA
        # movement region: after header, before period summary
        start = hdr + 1
        end = _movement_end(lines, start)
        for blk in _split_blocks(lines[start:end]):
            t = _parse_block(blk, cx, ax, fxb)
            if t is not None:
                txns.append(t)

    return {
        'statement_id': common.sha8(path),
        'gmail_id': None,
        'bank': 'BancoEstado',
        'doc_type': dt,
        'owner': 'Vicente',
        'account_ref': acct,
        'account_last4': (re.sub(r'\D', '', acct)[-4:] if acct else None),
        'period_start': ps,
        'period_end': pe,
        'saldo_inicial': None,
        'saldo_final': None,
        'currency': 'CLP',
        'source_pdf': path,
        'parser_version': 'bancoestado/2',
        'transactions': txns,
    }


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(f'usage: {os.path.basename(sys.argv[0])} STATEMENT_PATH')
    print(json.dumps(parse_file(sys.argv[1]), ensure_ascii=False, indent=2))