PDF invoices come in many different layouts. Regex-based extraction handles roughly 60–70% of them; the rest need an LLM:
import Anthropic from '@anthropic-ai/sdk';
// Module-level Anthropic SDK client, constructed with no explicit options.
// NOTE(review): presumably the SDK reads the API key from the environment — confirm for your deployment.
const claude = new Anthropic();
/** Currency codes accepted by the invoice schema; any other value fails validation. */
const SUPPORTED_CURRENCIES = ['EUR', 'USD', 'GBP'] as const;

/**
 * Zod schema describing the fields expected from an extracted invoice.
 *
 * Every field is required except `dueDate`. Dates are validated only as
 * strings; no date format is enforced by this schema.
 */
const InvoiceData = z.object({
  // Identification
  invoiceNumber: z.string(),
  issueDate: z.string(),

  // Seller details
  sellerVatId: z.string(),
  sellerName: z.string(),

  // Monetary values (must be JSON numbers, not numeric strings)
  totalNet: z.number(),
  totalGross: z.number(),
  // NOTE(review): unclear from here whether this is the VAT amount or the VAT rate — confirm.
  vat: z.number(),
  currency: z.enum(SUPPORTED_CURRENCIES),

  // May be omitted entirely; `.optional()` accepts undefined but not null.
  dueDate: z.string().optional(),
});
/**
 * Pulls the JSON object out of a model reply.
 *
 * Models sometimes wrap JSON in a Markdown code fence or add a sentence
 * before/after it even when asked for JSON only. This returns the span from
 * the first `{` to the last `}`; if no such span exists the reply is returned
 * unchanged so that `JSON.parse` reports the failure.
 */
function extractJsonPayload(reply: string): string {
  const start = reply.indexOf('{');
  const end = reply.lastIndexOf('}');
  return start !== -1 && end > start ? reply.slice(start, end + 1) : reply;
}

/**
 * Extracts structured invoice fields from raw invoice text using Claude.
 *
 * Sends the text to the model with a JSON-only extraction prompt, parses the
 * reply, and validates it against the `InvoiceData` schema.
 *
 * @param pdfText - Text content of the invoice (already extracted from the PDF).
 * @returns The validated invoice data, as produced by `InvoiceData.parse`.
 * @throws SyntaxError if the model reply contains no parseable JSON.
 * @throws ZodError if the parsed JSON does not match `InvoiceData`.
 */
async function extractInvoiceData(pdfText: string) {
  const res = await claude.messages.create({
    model: 'claude-3-5-sonnet-latest',
    max_tokens: 1024,
    messages: [{
      role: 'user',
      content: `Extract structured data from this invoice. Return ONLY JSON matching schema:
{ invoiceNumber, issueDate (YYYY-MM-DD), sellerVatId, sellerName, totalNet, totalGross, vat, currency, dueDate }
Invoice text:
${pdfText}`,
    }],
  });

  // The response content is a list of typed blocks; only text blocks carry a
  // `text` field, and the list may be empty. Narrow on `type` instead of
  // assuming the first block is text.
  const reply = res.content
    .flatMap((block) => (block.type === 'text' ? [block.text] : []))
    .join('')
    .trim();

  // Keep the parsed value as `unknown` until the schema has validated it.
  // An empty or non-JSON reply surfaces here as a SyntaxError.
  const parsed: unknown = JSON.parse(extractJsonPayload(reply));
  return InvoiceData.parse(parsed);
}
Cost: roughly $0.005 per invoice with Claude Sonnet, so 200 invoices per month comes to about $1 — negligible.