What is Python invoice OCR?
Python invoice OCR is the process of using Python libraries to automatically extract structured data -- vendor name, invoice number, date, line items, totals -- from PDF or scanned invoice images, eliminating manual data entry.
How to extract data from PDF invoices automatically with Python
The right approach depends on your invoice type:
- Digital PDFs (created electronically): use
pdfplumberto extract the embedded text layer. No OCR needed -- much faster and more accurate. - Scanned PDFs / images: use
pdf2imageto convert to images, thenpytesseractfor OCR.
Python invoice data extraction from digital PDFs -- pdfplumber
import pdfplumber, re, pandas as pd
def extract_invoice_data(pdf_path):
with pdfplumber.open(pdf_path) as pdf:
text = "\n".join(page.extract_text() or "" for page in pdf.pages)
data = {}
m = re.search(r"Invoice\s*(?:No|Number|#)[:\s]+([A-Z0-9\-]+)", text, re.IGNORECASE)
if m: data["invoice_number"] = m.group(1)
m = re.search(r"(?:Total|Amount Due)[:\s]+\$?([\d,]+\.\d{2})", text, re.IGNORECASE)
if m: data["total"] = float(m.group(1).replace(",", ""))
with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
tables = page.extract_tables()
if tables:
data["line_items"] = pd.DataFrame(tables[0][1:], columns=tables[0][0])
break
return data
Python invoice OCR for scanned PDFs -- pytesseract
from pdf2image import convert_from_path
import pytesseract
def ocr_scanned_invoice(pdf_path, dpi=300):
images = convert_from_path(pdf_path, dpi=dpi)
full_text = ""
for img in images:
gray = img.convert("L") # grayscale improves OCR
full_text += pytesseract.image_to_string(gray, config="--psm 6") + "\n"
return full_text
AI invoice data extraction -- Claude API approach
For production extraction across varied formats, regex alone breaks. The modern approach: extract text with pdfplumber, then pass it to an LLM to parse the structure.
import anthropic, json
client = anthropic.Anthropic()
def extract_with_ai(invoice_text):
prompt = (
"Extract invoice data. Return ONLY valid JSON with fields: "
"invoice_number, vendor_name, invoice_date (YYYY-MM-DD), due_date, "
"subtotal, tax_amount, total_amount, "
"line_items (array of description/quantity/unit_price/amount).\n\n"
"Invoice text:\n" + invoice_text
)
response = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=1500,
messages=[{"role": "user", "content": prompt}]
)
raw = response.content[0].text.strip()
if raw.startswith("```"):
raw = raw.split("\n", 1)[1].rsplit("```", 1)[0]
return json.loads(raw)
Batch processing -- automate data extraction from multiple invoices
from pathlib import Path
import pandas as pd
def process_invoice_folder(folder, output="extracted_invoices.xlsx"):
results = []
for f in Path(folder).glob("*.pdf"):
try:
with pdfplumber.open(f) as pdf:
text = "\n".join(p.extract_text() or "" for p in pdf.pages)
data = extract_with_ai(text)
data["file"] = f.name
results.append(data)
print(f"OK {f.name}: {data.get('total_amount')}")
except Exception as e:
print(f"ERR {f.name}: {e}")
pd.DataFrame(results).to_excel(output, index=False)
print(f"Saved {len(results)} invoices to {output}")
This pipeline achieves 97-99% field accuracy on digital PDFs and processes 100+ invoices per hour. Get a free workflow audit.
Want this built for you?
We implement end-to-end finance automation for teams globally. Free 30-minute audit — no commitment.
Get a free automation audit →