# kua-money-trace/parsers/bancoestado.py

#!/usr/bin/env python3
"""
BancoEstado parser (Cuenta Corriente / Cuenta Vista / CuentaRUT) -> normalized ledger JSON.
Layout (ESTADO DE MOVIMIENTOS):
  N°Doc | DESCRIPCION | SUC | CARGOS O GIROS | ABONOS O DEPOSITOS | FECHA(DD/MM/YYYY) | SALDO

Two real-world header/row variants are handled:
  - single-line: header has CARGOS+ABONOS on one line; each txn is one line.
  - wrapped:     'ABONOS O' / 'DEPOSITOS' wrap above/below the CARGOS line, and a txn's
                 docno, description and amount+date can span several lines.
Rows are assembled as docno-delimited blocks (every txn starts with a 5+ digit docno at
column 0), so both variants parse. Amounts dot-thousands, no '$'.
"""
import sys, os, re, json
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import common

# DD/MM/YYYY date; capture groups are (day, month, year).
DATE = re.compile(r'(\d{2})/(\d{2})/(\d{4})')
# Document number: a run of 5+ digits anchored at the very start of a line.
DOCNO = re.compile(r'^(\d{5,})\b')
# Numeric token: dot-grouped thousands (e.g. 1.234.567) or, failing that, a plain digit run.
NUMTOK = re.compile(r'\d{1,3}(?:\.\d{3})+|\d+')

def parse_period(txt):
    """Return the statement period as a pair of ISO 'YYYY-MM-DD' strings.

    Only the first 1200 characters of ``txt`` are scanned. The second and
    third DD/MM/YYYY dates found there are used as period start and end
    (the first date found is skipped). Returns ``(None, None)`` when fewer
    than three dates are present in that window.
    """
    found = DATE.findall(txt[:1200])
    if len(found) < 3:
        return (None, None)
    # Each entry is a (day, month, year) tuple of strings.
    (s_day, s_month, s_year), (e_day, e_month, e_year) = found[1:3]
    period_start = f"{s_year}-{s_month}-{s_day}"
    period_end = f"{e_year}-{e_month}-{e_day}"
    return (period_start, period_end)

def find_account(txt):
    """Return the account reference following an 'Nº CUENTA' label, or None.

    The match is case-insensitive. The captured reference starts with a digit
    and continues over at least five more digits, dots or hyphens; any
    non-digit text between the label and the number is skipped.
    """
    pattern = r'N[º°]\s*CUENTA[^\d]*(\d[\d\.\-]{5,})'
    hit = re.search(pattern, txt, re.I)
    if hit is None:
        return None
    return hit.group(1)

def find_columns(lines):
    """Locate cargos/abonos/fecha/saldo column x-positions in the movements header.

    The header line is the first line containing 'CARGOS O GIROS'
    (case-insensitive). 'FECHA' and 'SALDO' are looked up on that same line.
    'ABONOS' (or, failing that, 'DEPOSITOS') is looked up on the header line
    first and then, to tolerate a wrapped header, on the neighbouring lines in
    the order i-1, i+1, i-2, i+2.

    Returns:
        A tuple ``(cx, ax, fx, sx, i)`` where ``cx``/``ax``/``fx``/``sx`` are
        the character offsets of the cargos, abonos, fecha and saldo headers
        and ``i`` is the index of the header line. Any offset that could not
        be found is ``None`` (never ``-1``). When no header line exists at
        all, every element is ``None``.
    """
    def abonos_pos(text):
        # Offset of 'ABONOS', else of 'DEPOSITOS', else -1.
        upper = text.upper()
        pos = upper.find('ABONOS')
        return pos if pos >= 0 else upper.find('DEPOSITOS')

    def or_none(pos):
        # Normalise str.find's -1 sentinel to None.
        return pos if pos >= 0 else None

    for i, ln in enumerate(lines):
        u = ln.upper()
        cx = u.find('CARGOS O GIROS')
        if cx < 0:
            continue
        ax = abonos_pos(ln)
        if ax < 0:
            # Wrapped header: look on neighbouring lines, nearest first.
            for j in (i - 1, i + 1, i - 2, i + 2):
                if 0 <= j < len(lines):
                    ax = abonos_pos(lines[j])
                    if ax >= 0:
                        break
        # ax is normalised like fx/sx so callers can rely on an `is not None`
        # check instead of receiving a bogus -1 column.
        return cx, or_none(ax), or_none(u.find('FECHA')), or_none(u.find('SALDO')), i
    return None, None, None, None, None

def _split_blocks(rows):
    """Group lines into blocks, each opened by a line that starts with a docno.

    Lines that precede the first docno line are dropped.
    """
    blocks, cur = [], None
    for ln in rows:
        if DOCNO.match(ln):
            if cur is not None:
                blocks.append(cur)
            cur = [ln]
        elif cur is not None:
            cur.append(ln)
    if cur is not None:
        blocks.append(cur)
    return blocks


def _classify_amounts(line, date_span, cx, ax, fxb):
    """Return (cargo, abono) read from ``line`` by column band; either may be None.

    A numeric token is assigned by where its END offset falls: cargos band is
    [cx - 3, ax), abonos band is [ax - 3, fxb). The cargos band wins where the
    two overlap, and the last token seen in a band wins.
    """
    ds, de = date_span
    cargo = abono = None
    for mo in NUMTOK.finditer(line):
        s, e = mo.span()
        if s >= ds and e <= de + 1:      # part of the date itself
            continue
        v = common.to_int(mo.group())
        if v is None:
            continue
        if cx - 3 <= e < ax:
            cargo = v
        elif ax - 3 <= e < fxb:
            abono = v
    return cargo, abono


def _block_description(blk, amount_idx, date_start):
    """Build the description from the text fragments of a whole block.

    The leading docno, everything from the date onwards on the amount line,
    and all numeric tokens are removed; leftover slashes and runs of
    whitespace are collapsed.
    """
    pieces = []
    for idx, line in enumerate(blk):
        seg = line
        if idx == amount_idx:
            # Cut at the date FIRST: date_start is an offset into the full
            # line, so it must be applied before the docno prefix is removed.
            seg = seg[:date_start]
        if idx == 0:
            seg = seg[DOCNO.match(line).end():]     # drop leading docno
        seg = re.sub(r'\b001\b', ' ', seg)          # SUC column
        seg = NUMTOK.sub(' ', seg)                  # strip stray numbers
        seg = seg.strip()
        if seg:
            pieces.append(seg)
    desc = ' '.join(pieces)
    desc = re.sub(r'(?:\s*/\s*)+', ' ', desc)       # collapse stray slashes
    return re.sub(r'\s{2,}', ' ', desc).strip(' /')


def parse_file(path):
    """Parse one BancoEstado statement into a normalized ledger dict.

    The text comes from ``common.text_layout(path)`` (presumably a
    layout-preserving text extraction of the PDF -- confirm in ``common``).
    Movements are read from the region after the column header line and
    before the first line matching 'RESUMEN DEL PER...'. That region is split
    into docno-delimited blocks; in each block the first line containing a
    DD/MM/YYYY date is the amount line. Blocks without a date, or without a
    non-zero cargo/abono in the expected column bands, are skipped.

    Args:
        path: Path of the statement file; its basename also decides
            ``doc_type`` ('cuenta_corriente' if it contains 'CORRIENTE',
            otherwise 'cuenta_vista').

    Returns:
        A dict with statement metadata and a ``'transactions'`` list of
        records built by ``common.blank_txn`` with ``'doc_number'`` added.
        The list is empty when the column header cannot be located.
    """
    txt = common.text_layout(path)
    lines = txt.split('\n')
    ps, pe = parse_period(txt)
    acct = find_account(txt)
    base = os.path.basename(path).upper()
    dt = 'cuenta_corriente' if 'CORRIENTE' in base else 'cuenta_vista'
    cx, ax, fx, _sx, hdr = find_columns(lines)

    txns = []
    # `ax >= 0` guards against a raw str.find() sentinel reaching the band maths.
    if cx is not None and ax is not None and ax >= 0:
        # End of the abonos band: the FECHA column when it lies right of
        # ABONOS, otherwise a fixed-width fallback.
        fxb = fx if fx is not None and fx > ax else ax + 40
        # movement region: after header, before period summary
        start = hdr + 1
        end = next(
            (k for k in range(start, len(lines))
             if re.search(r'RESUMEN\s+DEL\s+PER', lines[k], re.I)),
            len(lines),
        )

        for blk in _split_blocks(lines[start:end]):
            docno = DOCNO.match(blk[0]).group(1)
            # the amount line is the first block line carrying a date
            amount_idx = next((i for i, l in enumerate(blk) if DATE.search(l)), None)
            if amount_idx is None:
                continue
            amount_line = blk[amount_idx]
            dm = DATE.search(amount_line)
            dd, mm, yy = dm.groups()

            cargo, abono = _classify_amounts(amount_line, dm.span(), cx, ax, fxb)
            if cargo:
                amt, direction = cargo, 'debit'
            elif abono:
                amt, direction = abono, 'credit'
            else:
                continue

            # saldo = first number after the date
            sm = NUMTOK.search(amount_line[dm.end():])
            saldo = common.to_int(sm.group()) if sm else None

            desc = _block_description(blk, amount_idx, dm.start())

            t = common.blank_txn(f'{yy}-{mm}-{dd}', desc, amt, direction, saldo, amount_line.rstrip())
            t['doc_number'] = docno
            txns.append(t)

    return {
        'statement_id': common.sha8(path), 'gmail_id': None, 'bank': 'BancoEstado',
        'doc_type': dt, 'owner': 'Vicente', 'account_ref': acct,
        'account_last4': (re.sub(r'\D', '', acct)[-4:] if acct else None),
        'period_start': ps, 'period_end': pe,
        'saldo_inicial': None, 'saldo_final': None, 'currency': 'CLP',
        'source_pdf': path, 'parser_version': 'bancoestado/2', 'transactions': txns,
    }

if __name__ == '__main__':
    # CLI entry point: parse the statement given as the first argument and
    # print the ledger as pretty-printed JSON (non-ASCII characters kept as-is).
    if len(sys.argv) < 2:
        # Exit with a readable usage line instead of an IndexError traceback.
        sys.exit(f'usage: {os.path.basename(sys.argv[0])} STATEMENT_PDF')
    print(json.dumps(parse_file(sys.argv[1]), ensure_ascii=False, indent=2))