kua-money-trace/src/localMailImport.js

403 lines
13 KiB
JavaScript

import { execFile } from 'node:child_process';
import fs from 'node:fs/promises';
import os from 'node:os';
import path from 'node:path';
import { promisify } from 'node:util';

import { archiveRawEmail } from './mailArchive.js';
// Default on-disk locations of Apple Mail data. They are derived from the current
// user's home directory instead of a hard-coded user name, so the defaults resolve
// for whichever account runs this module.
const DEFAULT_MAIL_ROOT = path.join(os.homedir(), 'Library', 'Mail', 'V10');
// SQLite database queried in this file for the `mailboxes` and `messages` tables.
const DEFAULT_ENVELOPE_INDEX = path.join(DEFAULT_MAIL_ROOT, 'MailData', 'Envelope Index');
// SQLite database queried in this file for the ZACCOUNT / ZACCOUNTTYPE tables.
const DEFAULT_ACCOUNTS_DB = path.join(os.homedir(), 'Library', 'Accounts', 'Accounts4.sqlite');
// Promise-returning form of child_process.execFile (used to run the sqlite3 CLI).
const execFileAsync = promisify(execFile);
/**
 * List the account folders under an Apple Mail root together with the `.mbox`
 * directories found anywhere beneath each of them.
 *
 * @param {string} [mailRoot] - Directory whose immediate sub-directories are treated as accounts.
 * @returns {Promise<Array<{accountDir: string, accountPath: string, mailboxes: Array<{path: string, label: string}>}>>}
 *   One entry per account folder that contains at least one mailbox directory.
 *   An unreadable or missing root yields an empty array.
 */
export async function listAppleMailSources(mailRoot = DEFAULT_MAIL_ROOT) {
  const topLevel = await safeReaddir(mailRoot, { withFileTypes: true });
  const result = [];
  for (const dirent of topLevel) {
    // `MailData` is skipped: it is not scanned as an account folder.
    if (!dirent.isDirectory() || dirent.name === 'MailData') continue;

    const accountPath = path.join(mailRoot, dirent.name);
    const mboxDirs = await findMailboxDirs(accountPath);
    if (mboxDirs.length === 0) continue;

    const mailboxes = mboxDirs.map((dir) => ({
      path: dir,
      label: mailboxLabel(dir, accountPath),
    }));
    result.push({ accountDir: dirent.name, accountPath, mailboxes });
  }
  return result;
}
/**
 * Archive `.emlx` messages found anywhere under a directory.
 *
 * Files are visited in sorted path order; `*.partial.emlx` files are ignored.
 *
 * @param {object} options
 * @param {*} options.account - Passed through unchanged to `archiveRawEmail`.
 * @param {string} options.sourcePath - Directory to scan recursively.
 * @param {string} options.archiveRoot - Passed through unchanged to `archiveRawEmail`.
 * @param {number} [options.limit=25] - Maximum number of files to import. Zero or a
 *   negative value imports nothing; a non-numeric value disables the cap.
 * @param {string|number|Date} [options.since] - When given, files whose mtime is
 *   earlier than this date are skipped. An unparseable value filters nothing.
 * @returns {Promise<{sourcePath: string, archiveRoot: string, imported: number, scanned: number, records: Array}>}
 *   `scanned` counts every matching file found, `imported` the ones archived.
 */
export async function importAppleMailEmlx({ account, sourcePath, archiveRoot, limit = 25, since }) {
  const files = await findFiles(
    sourcePath,
    (file) => file.endsWith('.emlx') && !file.endsWith('.partial.emlx'),
  );
  const maxFiles = Number(limit);
  const sinceDate = since ? new Date(since) : null;

  const selectedFiles = [];
  for (const file of files.sort()) {
    // Test the quota BEFORE selecting: checking after the push let `limit: 0`
    // (or a negative limit) import one file, unlike importAppleMailFromIndex.
    if (selectedFiles.length >= maxFiles) break;
    if (sinceDate) {
      const { mtime } = await fs.stat(file);
      if (mtime < sinceDate) continue;
    }
    selectedFiles.push(file);
  }

  const records = [];
  for (const file of selectedFiles) {
    const source = await readEmlxRawMessage(file);
    const record = await archiveRawEmail({
      account,
      mailbox: mailboxLabel(file, sourcePath),
      uid: path.basename(file, '.emlx'),
      source,
      archiveRoot,
    });
    // Remember which local file the archived record came from.
    record.localSource = file;
    records.push(record);
  }

  return {
    sourcePath,
    archiveRoot,
    imported: records.length,
    scanned: files.length,
    records,
  };
}
/**
 * Archive the most recently received messages of one Apple Mail mailbox, using the
 * Envelope Index database to decide which messages to import.
 *
 * @param {object} options
 * @param {object} options.account - Passed to `archiveRawEmail`; also supplies the default `email`.
 * @param {string} [options.email] - Address used to locate the Apple Mail account.
 * @param {string} [options.mailRoot] - Apple Mail root directory.
 * @param {string} [options.indexPath] - Path of the Envelope Index SQLite database.
 * @param {string} [options.accountsDbPath] - Path of the Accounts SQLite database.
 * @param {string} [options.mailbox='all'] - Mailbox selector understood by `resolveMailboxRecord`.
 * @param {string} options.archiveRoot - Passed through unchanged to `archiveRawEmail`.
 * @param {number} [options.limit=25] - Maximum number of messages to archive.
 * @param {string|number|Date} [options.since] - Lower bound applied to `date_received`.
 * @returns {Promise<object>} Summary with `imported`, `scanned` (rows returned by the
 *   index query), `missing` (row ids with no local `.emlx` file) and `records`.
 * @throws {Error} When no email can be determined or no mailbox can be resolved.
 */
export async function importAppleMailFromIndex({
  account,
  email = account.email || account.auth?.user,
  mailRoot = DEFAULT_MAIL_ROOT,
  indexPath = DEFAULT_ENVELOPE_INDEX,
  accountsDbPath = DEFAULT_ACCOUNTS_DB,
  mailbox = 'all',
  archiveRoot,
  limit = 25,
  since,
}) {
  if (!email) throw new Error('email is required to resolve Apple Mail account');

  const resolved = await resolveAppleMailAccountMailbox({ email, accountsDbPath, indexPath, mailbox });
  const { imapAccount, mailboxRecord } = resolved;
  const sourcePath = mailboxPathForUrl(mailRoot, mailboxRecord.url);

  // `since` becomes whole seconds since the Unix epoch.
  // NOTE(review): assumes `date_received` is stored in the same unit — confirm.
  let sinceTimestamp = 0;
  if (since) sinceTimestamp = Math.floor(new Date(since).getTime() / 1000);

  const importLimit = Number(limit);
  // Fetch more rows than will be imported; presumably so that rows without a local
  // file do not use up the quota.
  const candidateLimit = Math.max(importLimit * 20, importLimit + 100);

  // Only numbers are interpolated into the SQL text; non-finite values fall back to safe defaults.
  const mailboxRowid = Number(mailboxRecord.rowid);
  const sinceSql = Number.isFinite(sinceTimestamp) ? sinceTimestamp : 0;
  const limitSql = Number.isFinite(candidateLimit) ? candidateLimit : 25;
  const messages = await queryJson(indexPath, `
select m.ROWID as rowid, m.remote_id as remoteId, m.date_received as dateReceived
from messages m
where m.mailbox = ${mailboxRowid}
and m.deleted = 0
and m.date_received >= ${sinceSql}
order by m.date_received desc
limit ${limitSql}
`);

  const records = [];
  const missing = [];
  for (const { rowid, remoteId, dateReceived } of messages) {
    const localSource = await findEmlxByRowId(sourcePath, rowid);
    if (localSource) {
      // The quota is only tested once a local file has been found for the row.
      if (records.length >= importLimit) break;

      const source = await readEmlxRawMessage(localSource);
      const record = await archiveRawEmail({
        account,
        mailbox: decodeMailboxUrl(mailboxRecord.url),
        uid: String(rowid),
        source,
        archiveRoot,
      });
      record.localSource = localSource;
      record.appleMail = {
        accountIdentifier: imapAccount.identifier,
        mailboxUrl: mailboxRecord.url,
        rowid,
        remoteId,
        dateReceived,
      };
      records.push(record);
    } else {
      missing.push(rowid);
    }
  }

  return {
    email,
    imapAccount,
    mailbox: mailboxRecord,
    sourcePath,
    archiveRoot,
    imported: records.length,
    scanned: messages.length,
    missing,
    records,
  };
}
/**
 * Find the Apple Mail IMAP account and mailbox record to use for an email address.
 *
 * Every IMAP account candidate for the address is tried. Among those for which the
 * mailbox resolves, the one whose mailbox has the highest `totalCount` wins; ties
 * keep the earlier candidate.
 *
 * @param {object} options
 * @param {string} options.email - Address used to look up account candidates.
 * @param {string} [options.accountsDbPath] - Path of the Accounts SQLite database.
 * @param {string} [options.indexPath] - Path of the Envelope Index SQLite database.
 * @param {string} [options.mailbox='all'] - Mailbox selector understood by `resolveMailboxRecord`.
 * @returns {Promise<{imapAccount: object, mailboxRecord: object}>}
 * @throws {Error} When no candidate yields a mailbox. The error's `cause` carries the
 *   last per-candidate failure so the underlying reason is not lost.
 */
export async function resolveAppleMailAccountMailbox({
  email,
  accountsDbPath = DEFAULT_ACCOUNTS_DB,
  indexPath = DEFAULT_ENVELOPE_INDEX,
  mailbox = 'all',
}) {
  const imapAccounts = await resolveAppleMailImapAccounts({ email, accountsDbPath });
  const attempts = [];
  let lastError;
  for (const imapAccount of imapAccounts) {
    try {
      const mailboxRecord = await resolveMailboxRecord({
        indexPath,
        imapAccountId: imapAccount.identifier,
        mailbox,
      });
      attempts.push({ imapAccount, mailboxRecord });
    } catch (error) {
      // Some macOS account records are stale or have no Mail mailbox. Try the next
      // candidate, but keep the failure so it can be reported if all of them fail.
      lastError = error;
    }
  }
  if (!attempts.length) {
    throw new Error(`Apple Mail mailbox not found for ${email}/${mailbox}`, { cause: lastError });
  }
  // Descending by message count; the sort is stable, so ties keep discovery order.
  return attempts.sort(
    (a, b) => Number(b.mailboxRecord.totalCount || 0) - Number(a.mailboxRecord.totalCount || 0),
  )[0];
}
/**
 * Return the first IMAP account candidate for an email address.
 *
 * @param {object} options
 * @param {string} options.email - Address used to look up account candidates.
 * @param {string} [options.accountsDbPath] - Path of the Accounts SQLite database.
 * @returns {Promise<object>} The first row produced by `resolveAppleMailImapAccounts`.
 * @throws {Error} When no candidate exists.
 */
export async function resolveAppleMailImapAccount({ email, accountsDbPath = DEFAULT_ACCOUNTS_DB }) {
  const candidates = await resolveAppleMailImapAccounts({ email, accountsDbPath });
  if (candidates.length > 0) return candidates[0];
  throw new Error(`Apple Mail IMAP account not found for ${email}`);
}
/**
 * Look up every IMAP account in the Accounts database that belongs to an email address.
 *
 * Two queries are run: IMAP accounts that are children of an account whose username
 * matches, and IMAP accounts whose own username matches. The comparison is
 * case-insensitive. Direct matches are listed before child matches, and rows are
 * de-duplicated by `identifier`, keeping the first occurrence.
 *
 * @param {object} options
 * @param {string} options.email - Address to match against ZUSERNAME.
 * @param {string} [options.accountsDbPath] - Path of the Accounts SQLite database.
 * @returns {Promise<Array<{identifier: string, email: string, description: string, parentIdentifier: ?string, source: string}>>}
 * @throws {Error} When neither query returns a row.
 */
export async function resolveAppleMailImapAccounts({ email, accountsDbPath = DEFAULT_ACCOUNTS_DB }) {
  const childRows = await queryJson(accountsDbPath, `
select child.ZIDENTIFIER as identifier,
parent.ZUSERNAME as email,
parent.ZACCOUNTDESCRIPTION as description,
parent.ZIDENTIFIER as parentIdentifier,
'child' as source
from ZACCOUNT parent
join ZACCOUNT child on child.ZPARENTACCOUNT = parent.Z_PK
join ZACCOUNTTYPE childType on child.ZACCOUNTTYPE = childType.Z_PK
where lower(parent.ZUSERNAME) = lower('${sqlString(email)}')
and childType.ZIDENTIFIER = 'com.apple.account.IMAP'
order by child.Z_PK
`);
  const directRows = await queryJson(accountsDbPath, `
select account.ZIDENTIFIER as identifier,
account.ZUSERNAME as email,
account.ZACCOUNTDESCRIPTION as description,
null as parentIdentifier,
'direct' as source
from ZACCOUNT account
join ZACCOUNTTYPE accountType on account.ZACCOUNTTYPE = accountType.Z_PK
where lower(account.ZUSERNAME) = lower('${sqlString(email)}')
and accountType.ZIDENTIFIER = 'com.apple.account.IMAP'
order by account.Z_PK
`);

  const combined = [...directRows, ...childRows];
  if (combined.length === 0) {
    throw new Error(`Apple Mail IMAP account not found for ${email}`);
  }

  // Keep only the first row seen for each identifier, preserving order.
  const seenIdentifiers = new Set();
  return combined.filter(({ identifier }) => {
    if (seenIdentifiers.has(identifier)) return false;
    seenIdentifiers.add(identifier);
    return true;
  });
}
/**
 * Find the Envelope Index `mailboxes` row for a mailbox of one IMAP account.
 *
 * The candidate names from `mailboxCandidates` are tried in order and the first row
 * whose `url` matches exactly is returned.
 *
 * @param {object} options
 * @param {string} [options.indexPath] - Path of the Envelope Index SQLite database.
 * @param {string} options.imapAccountId - Account identifier used as the URL host part.
 * @param {string} [options.mailbox='all'] - Mailbox selector.
 * @returns {Promise<{rowid: number, url: string, totalCount: number, unreadCount: number, unseenCount: number}>}
 * @throws {Error} When no candidate matches; the message lists up to 20 mailbox URLs
 *   that do exist for the account.
 */
export async function resolveMailboxRecord({ indexPath = DEFAULT_ENVELOPE_INDEX, imapAccountId, mailbox = 'all' }) {
  const accountPrefix = `imap://${imapAccountId}/`;

  // Build every candidate URL up front, before any query runs.
  const candidateUrls = [];
  for (const name of mailboxCandidates(mailbox)) {
    candidateUrls.push(`${accountPrefix}${encodeMailboxPath(name)}`);
  }

  for (const url of candidateUrls) {
    const matches = await queryJson(indexPath, `
select ROWID as rowid, url, total_count as totalCount, unread_count as unreadCount, unseen_count as unseenCount
from mailboxes
where url = '${sqlString(url)}'
limit 1
`);
    if (matches.length > 0) return matches[0];
  }

  // Nothing matched: gather the account's largest mailboxes for the error message.
  const available = await queryJson(indexPath, `
select ROWID as rowid, url, total_count as totalCount
from mailboxes
where url like '${sqlString(accountPrefix)}%'
order by total_count desc
limit 20
`);
  const availableUrls = available.map((row) => row.url).join(', ');
  throw new Error(`Apple Mail mailbox not found for ${imapAccountId}/${mailbox}. Available: ${availableUrls}`);
}
/**
 * Read an `.emlx` file and return the bytes of the message it wraps.
 *
 * The first line is read as a byte count. When it is a positive finite number, that
 * many bytes after the first line are returned (capped at the end of the file).
 * Otherwise everything after the first line is returned.
 *
 * @param {string} filePath - Path of the `.emlx` file.
 * @returns {Promise<Buffer>} A view onto the message bytes.
 * @throws {Error} When the file contains no newline at all.
 */
export async function readEmlxRawMessage(filePath) {
  const bytes = await fs.readFile(filePath);
  const firstLineEnd = bytes.indexOf(0x0a);
  if (firstLineEnd === -1) {
    throw new Error(`invalid emlx file without first line: ${filePath}`);
  }

  const bodyStart = firstLineEnd + 1;
  const declaredSize = Number(bytes.subarray(0, firstLineEnd).toString('utf8').trim());
  const hasUsableSize = Number.isFinite(declaredSize) && declaredSize > 0;
  // Without a usable size the whole remainder of the file is the message.
  const bodyEnd = hasUsableSize ? Math.min(bodyStart + declaredSize, bytes.length) : bytes.length;
  return bytes.subarray(bodyStart, bodyEnd);
}
/**
 * Locate the `.emlx` file for a message row id beneath a mailbox directory.
 *
 * The bucketed location `<base>/Data/<bucket parts>/Messages/<rowid>.emlx` is probed
 * for every base directory first. If none exists, the whole mailbox directory is
 * searched recursively for the file name.
 *
 * @param {string} sourcePath - Mailbox directory.
 * @param {number|string} rowid - Message row id.
 * @returns {Promise<?string>} Path of the file, or `null` when it cannot be found.
 */
export async function findEmlxByRowId(sourcePath, rowid) {
  const targetName = `${rowid}.emlx`;
  const bucket = bucketPathParts(rowid);

  for (const baseDir of await mailDataBaseDirs(sourcePath)) {
    const expected = path.join(baseDir, 'Data', ...bucket, 'Messages', targetName);
    const present = await fileExists(expected);
    if (present) return expected;
  }

  // Fall back to a full recursive search of the mailbox directory.
  return findFirstFile(sourcePath, targetName);
}
/**
 * Compute the nested directory names used to bucket a message row id.
 *
 * The row id is divided by 1000 (rounded down) and the digits of the result are
 * returned least-significant first, e.g. 123456 gives ['3', '2', '1'].
 *
 * @param {number|string} rowid - Message row id.
 * @returns {string[]} Directory names, or an empty array when the quotient is 0 or not a number.
 */
export function bucketPathParts(rowid) {
  const thousands = Math.floor(Number(rowid) / 1000);
  return thousands ? [...String(thousands)].reverse() : [];
}
/**
 * Translate a mailbox URL into the directory expected to hold it under the mail root.
 *
 * After an optional `imap://` prefix, the first segment is the account folder. Every
 * following non-empty segment is percent-decoded and becomes a `<name>.mbox` directory.
 *
 * @param {string} mailRoot - Apple Mail root directory.
 * @param {string} url - Mailbox URL, e.g. `imap://<account>/<mailbox path>`.
 * @returns {string} Path `<mailRoot>/<account>/<part>.mbox/...`.
 */
export function mailboxPathForUrl(mailRoot, url) {
  const [accountDir, ...rawSegments] = url.replace(/^imap:\/\//, '').split('/');
  const mboxDirs = rawSegments
    .filter((segment) => segment !== '')
    .map((segment) => `${decodeURIComponent(segment)}.mbox`);
  return path.join(mailRoot, accountDir, ...mboxDirs);
}
/**
 * Extract the human-readable mailbox path from a mailbox URL.
 *
 * The optional `imap://` prefix and the first segment (the account) are dropped; the
 * remaining non-empty segments are percent-decoded and joined with `/`.
 *
 * @param {string} url - Mailbox URL, e.g. `imap://<account>/<mailbox path>`.
 * @returns {string} Decoded mailbox path, or an empty string when the URL has none.
 */
export function decodeMailboxUrl(url) {
  const segments = url.replace(/^imap:\/\//, '').split('/');
  const decoded = [];
  // Index 0 is the account part and is not part of the mailbox path.
  for (const segment of segments.slice(1)) {
    if (segment) decoded.push(decodeURIComponent(segment));
  }
  return decoded.join('/');
}
/**
 * Recursively collect every directory whose name ends in `.mbox` below a root.
 *
 * A matching directory is listed before the matches found inside it, and the search
 * continues into every sub-directory whether or not it matched.
 *
 * @param {string} root - Directory to search.
 * @returns {Promise<string[]>} Paths of the matching directories.
 */
async function findMailboxDirs(root) {
  const entries = await safeReaddir(root, { withFileTypes: true });
  const collected = [];
  for (const entry of entries) {
    if (entry.isDirectory()) {
      const dirPath = path.join(root, entry.name);
      if (entry.name.endsWith('.mbox')) collected.push(dirPath);
      for (const nestedDir of await findMailboxDirs(dirPath)) {
        collected.push(nestedDir);
      }
    }
  }
  return collected;
}
/**
 * Recursively collect every regular file below a root whose path satisfies a predicate.
 *
 * Results are appended to one shared array. The earlier form spread each subtree's
 * results into `push(...)`, which passes them as call arguments and can throw a
 * `RangeError` once a subtree holds more files than the engine allows as arguments.
 *
 * @param {string} root - Directory to search; an unreadable directory contributes nothing.
 * @param {(fullPath: string) => boolean} predicate - Receives each file's full path.
 * @returns {Promise<string[]>} Matching paths in depth-first directory-listing order.
 */
async function findFiles(root, predicate) {
  const found = [];

  // Depth-first walk that appends matches to the shared `found` array.
  const walk = async (dir) => {
    const entries = await safeReaddir(dir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        await walk(fullPath);
      } else if (entry.isFile() && predicate(fullPath)) {
        found.push(fullPath);
      }
    }
  };

  await walk(root);
  return found;
}
/**
 * Depth-first search for the first regular file with a given name below a root.
 *
 * Entries are examined in directory-listing order; a sub-directory is searched
 * completely before the next entry is considered.
 *
 * @param {string} root - Directory to search.
 * @param {string} fileName - Exact file name to look for.
 * @returns {Promise<?string>} Full path of the first match, or `null`.
 */
async function findFirstFile(root, fileName) {
  for (const entry of await safeReaddir(root, { withFileTypes: true })) {
    const entryPath = path.join(root, entry.name);
    if (entry.isFile()) {
      if (entry.name === fileName) return entryPath;
    } else if (entry.isDirectory()) {
      const hit = await findFirstFile(entryPath, fileName);
      if (hit) return hit;
    }
  }
  return null;
}
/**
 * List the directories under which a `Data/...` message tree may live for a mailbox.
 *
 * The mailbox directory itself always comes first, followed by each immediate
 * sub-directory that contains an entry named `Data`.
 *
 * @param {string} sourcePath - Mailbox directory.
 * @returns {Promise<string[]>} Candidate base directories.
 */
async function mailDataBaseDirs(sourcePath) {
  const entries = await safeReaddir(sourcePath, { withFileTypes: true });
  const candidates = [sourcePath];
  for (const entry of entries) {
    if (entry.isDirectory()) {
      const childPath = path.join(sourcePath, entry.name);
      const hasData = await fileExists(path.join(childPath, 'Data'));
      if (hasData) candidates.push(childPath);
    }
  }
  return candidates;
}
/**
 * Run a SQL statement through the `sqlite3` command-line tool and parse its JSON output.
 *
 * @param {string} dbPath - Path of the SQLite database file.
 * @param {string} sql - SQL text passed to the CLI as a single argument.
 * @returns {Promise<Array<object>>} Parsed output, or an empty array when the tool
 *   prints nothing but whitespace.
 */
async function queryJson(dbPath, sql) {
  const cliArgs = ['-json', dbPath, sql];
  // Allow up to 20 MiB of output.
  const execOptions = { maxBuffer: 1024 * 1024 * 20 };
  const { stdout } = await execFileAsync('sqlite3', cliArgs, execOptions);
  if (!stdout.trim()) return [];
  return JSON.parse(stdout);
}
/**
 * Expand a mailbox selector into the mailbox names to try, in order.
 *
 * @param {string} [mailbox] - `'all'` (or any falsy value), `'inbox'`, or a literal mailbox name.
 * @returns {string[]} Candidate names; a literal name is returned as the only candidate.
 */
function mailboxCandidates(mailbox) {
  const wantsAll = !mailbox || mailbox === 'all';
  if (wantsAll) {
    return ['[Gmail]/Todos', '[Gmail]/All Mail', 'INBOX'];
  }
  return mailbox === 'inbox' ? ['INBOX'] : [mailbox];
}
/**
 * Percent-encode each `/`-separated segment of a mailbox path, keeping the separators.
 *
 * @param {string} mailboxPath - Mailbox path such as `[Gmail]/All Mail`.
 * @returns {string} The path with every segment passed through `encodeURIComponent`.
 */
function encodeMailboxPath(mailboxPath) {
  const encodedSegments = [];
  for (const segment of mailboxPath.split('/')) {
    encodedSegments.push(encodeURIComponent(segment));
  }
  return encodedSegments.join('/');
}
/**
 * Escape a value for use inside a single-quoted SQL string literal.
 *
 * @param {*} value - Converted with `String()` before escaping.
 * @returns {string} The text with every single quote doubled.
 */
function sqlString(value) {
  const text = String(value);
  return text.split("'").join("''");
}
/**
 * Report whether a path can be accessed.
 *
 * @param {string} filePath - File or directory path to probe.
 * @returns {Promise<boolean>} `true` when `fs.access` succeeds, `false` on any failure.
 */
async function fileExists(filePath) {
  let accessible = true;
  try {
    await fs.access(filePath);
  } catch {
    accessible = false;
  }
  return accessible;
}
/**
 * List a directory, treating every failure as an empty directory.
 *
 * @param {string} dir - Directory to list.
 * @param {object} [options] - Passed straight to `fs.readdir`.
 * @returns {Promise<Array>} The directory entries, or `[]` when the listing fails for any reason.
 */
async function safeReaddir(dir, options) {
  let entries = [];
  try {
    entries = await fs.readdir(dir, options);
  } catch {
    // Best effort: a directory that cannot be read is reported as empty.
  }
  return entries;
}
/**
 * Build a `/`-separated label for a path relative to a root directory.
 *
 * Path segments named `Data` or `Messages`, and empty segments, are left out.
 *
 * @param {string} itemPath - Path to label.
 * @param {string} rootPath - Directory the label is made relative to.
 * @returns {string} The label; empty when both paths are the same.
 */
function mailboxLabel(itemPath, rootPath) {
  const omitted = new Set(['', 'Data', 'Messages']);
  const segments = path.relative(rootPath, itemPath).split(path.sep);
  return segments.filter((segment) => !omitted.has(segment)).join('/');
}