Fix BancoEstado parser for wrapped-header CuentaRUT layout
The 2024-era CuentaRUT cartola wraps the ABONOS O DEPOSITOS header across three lines and splits each transaction (docno / description / amount+date) across multiple lines, so the old same-line header+row detection found 0 transactions. Rewrote row assembly as docno-delimited blocks with tolerant column detection (ABONOS position recovered from neighbouring lines). Recovers 12 transactions from the backfill statement; all 2025 statements parse identically (36/5/36/12/36/29/2/5 unchanged). Backfill now 2375 txns.
This commit is contained in:
parent
bdda30afa1
commit
e1eed9e5b4
|
|
@ -0,0 +1,138 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
BancoEstado parser (Cuenta Corriente / Cuenta Vista / CuentaRUT) -> normalized ledger JSON.
|
||||
Layout (ESTADO DE MOVIMIENTOS):
|
||||
N°Doc | DESCRIPCION | SUC | CARGOS O GIROS | ABONOS O DEPOSITOS | FECHA(DD/MM/YYYY) | SALDO
|
||||
|
||||
Two real-world header/row variants are handled:
|
||||
- single-line: header has CARGOS+ABONOS on one line; each txn is one line.
|
||||
- wrapped: 'ABONOS O' / 'DEPOSITOS' wrap above/below the CARGOS line, and a txn's
|
||||
docno, description and amount+date can span several lines.
|
||||
Rows are assembled as docno-delimited blocks (every txn starts with a 5+ digit docno at
|
||||
column 0), so both variants parse. Amounts use '.' as the thousands separator and carry no '$'.
|
||||
"""
|
||||
import sys, os, re, json
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
import common
|
||||
|
||||
# DD/MM/YYYY date; the three groups capture day, month and year in that order.
DATE = re.compile(r'(\d{2})/(\d{2})/(\d{4})')
|
||||
# Document number: a run of 5 or more digits anchored at the very start of a line.
DOCNO = re.compile(r'^(\d{5,})\b')
|
||||
# Numeric token: dot-grouped thousands (e.g. 1.234.567) or, failing that, a plain digit run.
NUMTOK = re.compile(r'\d{1,3}(?:\.\d{3})+|\d+')
|
||||
|
||||
def parse_period(txt):
    """Return the statement period as an ``(start, end)`` pair of ISO dates.

    Only the first 1200 characters of *txt* are scanned for DD/MM/YYYY dates.
    The second and third dates found become the period start and end; the
    first one is skipped (presumably an issue date -- confirm against a real
    statement). When fewer than three dates are present, ``(None, None)`` is
    returned.
    """
    found = DATE.findall(txt[:1200])
    if len(found) < 3:
        return (None, None)
    (start_d, start_m, start_y), (end_d, end_m, end_y) = found[1], found[2]
    return (f"{start_y}-{start_m}-{start_d}", f"{end_y}-{end_m}-{end_d}")
|
||||
|
||||
def find_account(txt):
    """Return the account number printed after an 'Nº CUENTA' label, or None.

    The label is matched case-insensitively with either 'º' or '°'. The
    captured value starts with a digit and continues with at least five more
    digits, dots or dashes.
    """
    hit = re.search(r'N[º°]\s*CUENTA[^\d]*(\d[\d\.\-]{5,})', txt, re.I)
    if hit is None:
        return None
    return hit.group(1)
|
||||
|
||||
def find_columns(lines):
    """Locate cargos/abonos/fecha/saldo column x-positions from the header.

    The header is the first line containing 'CARGOS O GIROS' (case-insensitive).
    'FECHA' and 'SALDO' are looked up on that same line. 'ABONOS' (or, failing
    that, 'DEPOSITOS') is looked up on the header line first and then, to
    tolerate a wrapped header, on the neighbouring lines in the order
    i-1, i+1, i-2, i+2; in that case the position is the offset within the
    neighbouring line.

    Returns:
        A tuple ``(cx, ax, fx, sx, i)`` where ``i`` is the header line index.
        Any of ``ax``, ``fx``, ``sx`` is ``None`` when its label was not found
        (previously a missing ABONOS column leaked through as ``-1``).
        ``(None, None, None, None, None)`` when no header line exists.
    """
    def abonos_pos(upper_line):
        # Prefer 'ABONOS'; fall back to 'DEPOSITOS'. Returns -1 when neither is present.
        pos = upper_line.find('ABONOS')
        return pos if pos >= 0 else upper_line.find('DEPOSITOS')

    def or_none(pos):
        # str.find signals "absent" with -1; callers of find_columns expect None.
        return pos if pos >= 0 else None

    for i, ln in enumerate(lines):
        u = ln.upper()
        cx = u.find('CARGOS O GIROS')
        if cx < 0:
            continue
        fx = u.find('FECHA')
        sx = u.find('SALDO')
        ax = abonos_pos(u)
        if ax < 0:
            # Wrapped header: the ABONOS label sits on a line just above or below.
            for j in (i - 1, i + 1, i - 2, i + 2):
                if 0 <= j < len(lines):
                    p = abonos_pos(lines[j].upper())
                    if p >= 0:
                        ax = p
                        break
        return cx, or_none(ax), or_none(fx), or_none(sx), i
    return None, None, None, None, None
|
||||
|
||||
def _split_docno_blocks(region):
    """Group *region* lines into blocks, each opened by a line with a leading docno."""
    blocks, cur = [], None
    for ln in region:
        if DOCNO.match(ln):
            if cur is not None:
                blocks.append(cur)
            cur = [ln]
        elif cur is not None:
            # Lines before the first docno are dropped; later ones extend the open block.
            cur.append(ln)
    if cur is not None:
        blocks.append(cur)
    return blocks


def _block_description(blk, amount_idx, date_start):
    """Join a block's free-text fragments into one description string."""
    doc_end = DOCNO.match(blk[0]).end()
    pieces = []
    for idx, line in enumerate(blk):
        # Slice with offsets into the ORIGINAL line. Dropping the docno first and
        # then cutting at the date offset (as before) shifted the cut to the right
        # whenever the first line of a block was also its amount line.
        lo = doc_end if idx == 0 else 0
        hi = date_start if idx == amount_idx else len(line)
        seg = line[lo:hi]
        seg = re.sub(r'\b001\b', ' ', seg)   # SUC column
        seg = NUMTOK.sub(' ', seg)           # strip stray numbers (amounts, RUT digits)
        seg = seg.strip()
        if seg:
            pieces.append(seg)
    desc = ' '.join(pieces)
    desc = re.sub(r'(?:\s*/\s*)+', ' ', desc)   # collapse stray slashes left by stripped RUTs
    return re.sub(r'\s{2,}', ' ', desc).strip(' /')


def parse_file(path):
    """Parse one BancoEstado statement into a normalized ledger dict.

    The text is obtained from ``common.text_layout(path)``. Column positions
    come from ``find_columns``; the movement region runs from the line after
    the header up to (not including) the first 'RESUMEN DEL PER...' line, or
    to the end of the text. Within it, every line starting with a docno opens
    a transaction block, and the first line of a block that carries a
    DD/MM/YYYY date is treated as its amount line.

    On the amount line, a number whose END offset falls in
    ``[cx - 3, ax)`` is a cargo (debit) and one in ``[ax - 3, fxb)`` is an
    abono (credit); the first number after the date is the saldo. Blocks
    without a date or without a non-zero cargo/abono are skipped.

    Args:
        path: statement file path. Its basename decides ``doc_type``
            ('cuenta_corriente' if it contains 'CORRIENTE', else 'cuenta_vista').

    Returns:
        dict with statement metadata and a ``'transactions'`` list built with
        ``common.blank_txn`` plus a ``'doc_number'`` key per transaction. The
        list is empty when no usable header is found.
    """
    txt = common.text_layout(path)
    lines = txt.split('\n')
    ps, pe = parse_period(txt)
    acct = find_account(txt)
    base = os.path.basename(path).upper()
    dt = 'cuenta_corriente' if 'CORRIENTE' in base else 'cuenta_vista'
    cx, ax, fx, _sx, hdr = find_columns(lines)

    txns = []
    # ax >= 0 also rejects a "not found" sentinel of -1 for the ABONOS column.
    if cx is not None and ax is not None and ax >= 0:
        fxb = fx if fx else ax + 40   # fallback band end for FECHA
        # movement region: after header, before period summary
        start = hdr + 1
        end = len(lines)
        for k in range(start, len(lines)):
            if re.search(r'RESUMEN\s+DEL\s+PER', lines[k], re.I):
                end = k
                break

        for blk in _split_docno_blocks(lines[start:end]):
            docno = DOCNO.match(blk[0]).group(1)
            # the amount line is the first block line carrying the date
            amount_idx = next((i for i, l in enumerate(blk) if DATE.search(l)), None)
            if amount_idx is None:
                continue
            amount_line = blk[amount_idx]
            dm = DATE.search(amount_line)
            dd, mm, yy = dm.groups()
            ds, de = dm.span()

            cargo = abono = saldo = None
            for mo in NUMTOK.finditer(amount_line):
                s, e = mo.span()
                if s >= ds and e <= de + 1:   # part of the date itself
                    continue
                v = common.to_int(mo.group())
                if v is None:
                    continue
                if cx - 3 <= e < ax:
                    cargo = v
                elif ax - 3 <= e < fxb:
                    abono = v
            sm = NUMTOK.search(amount_line[de:])   # saldo = first number after the date
            if sm:
                saldo = common.to_int(sm.group())

            if cargo:
                amt, direction = cargo, 'debit'
            elif abono:
                amt, direction = abono, 'credit'
            else:
                continue

            # description: text fragments across the whole block, minus structural tokens
            desc = _block_description(blk, amount_idx, dm.start())

            t = common.blank_txn(f'{yy}-{mm}-{dd}', desc, amt, direction, saldo, amount_line.rstrip())
            t['doc_number'] = docno
            txns.append(t)

    return {
        'statement_id': common.sha8(path), 'gmail_id': None, 'bank': 'BancoEstado',
        'doc_type': dt, 'owner': 'Vicente', 'account_ref': acct,
        'account_last4': (re.sub(r'\D', '', acct)[-4:] if acct else None),
        'period_start': ps, 'period_end': pe,
        'saldo_inicial': None, 'saldo_final': None, 'currency': 'CLP',
        'source_pdf': path, 'parser_version': 'bancoestado/2', 'transactions': txns,
    }
|
||||
|
||||
if __name__ == '__main__':
    # CLI entry point: parse the statement named by the first argument and
    # write the resulting ledger to stdout as pretty-printed JSON.
    ledger = parse_file(sys.argv[1])
    print(json.dumps(ledger, ensure_ascii=False, indent=2))
|
||||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue