extractor.py
import pdfplumber
import pandas as pd
import re
from fuzzywuzzy import fuzz
# Keywords to identify sections
# Lower-case labels of account-level fields that may appear in a statement
# header. The final entry was previously split across two lines, which left
# the string literal unterminated.
ACCOUNT_FIELDS = [
    "account number", "customer id", "customer no", "account name",
    "account type", "ifsc", "micr", "branch", "currency", "balance",
    "scheme", "cif no", "iban", "bic", "sort code", "nomination",
    "interest rate", "drawing power", "mod balance",
]
# Lower-case column labels used to recognise the header row of a
# transaction table. Order matters to callers that take the first match.
TRANSACTION_KEYWORDS = [
    "date",
    "txn date",
    "tran date",
    "value date",
    "description",
    "remarks",
    "ref no",
    "cheque no",
    "withdrawal",
    "deposit",
    "debit",
    "credit",
    "balance",
    "paid in",
    "paid out",
]
# Lower-case fragments whose presence marks a line as disclaimer/footer text.
FOOTER_WORDS = [
    "do not share",
    "otp",
    "system generated",
    "reiterate",
    "liable",
    "password",
]
# === Utilities ===
def fuzzy_match(text, keywords, threshold=85):
    """Return the title-cased keyword that ``text`` matches, or ``None``.

    ``text`` is normalised by removing colons, trimming whitespace and
    lower-casing. An exact match against a keyword is preferred, so a header
    such as "Value Date" maps to "Value Date" rather than to an earlier,
    shorter keyword like "date". If there is no exact match, the first
    keyword whose ``fuzz.partial_ratio`` score reaches ``threshold`` wins.

    Args:
        text: Candidate label; ``None`` and blank values never match.
        keywords: Iterable of keyword strings to compare against.
        threshold: Minimum ``partial_ratio`` score (0-100) for a fuzzy match.

    Returns:
        ``keyword.title()`` for the matching keyword, otherwise ``None``.
    """
    if text is None:
        # Table cells extracted from a PDF may be None; treat them as no match.
        return None

    # Remove colons before trimming so "Date :" normalises to "date".
    normalized = str(text).replace(":", "").strip().lower()
    if not normalized:
        return None

    lowered = [(keyword, keyword.lower()) for keyword in keywords]

    # Pass 1: exact match, independent of keyword order.
    for keyword, candidate in lowered:
        if normalized == candidate:
            return keyword.title()

    # Pass 2: first keyword that is similar enough.
    for keyword, candidate in lowered:
        if fuzz.partial_ratio(normalized, candidate) >= threshold:
            return keyword.title()
    return None
def is_disclaimer(line):
    """Return True if ``line`` contains any FOOTER_WORDS entry (case-insensitive)."""
    lowered = line.lower()
    for marker in FOOTER_WORDS:
        if marker in lowered:
            return True
    return False
# Compiled once: a d/m/y-style date and an amount with two decimals.
_TXN_DATE_RE = re.compile(r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}")
_TXN_AMOUNT_RE = re.compile(r"\d+\.\d{2}")


def _looks_like_transaction_row(row):
    """Return True when any cell of ``row`` contains a date or an amount."""
    return any(
        _TXN_DATE_RE.search(str(cell)) or _TXN_AMOUNT_RE.search(str(cell))
        for cell in row
    )


def _unique_headers(headers):
    """Suffix repeated header names so ``dict(zip(...))`` keeps every column."""
    seen = {}
    unique = []
    for header in headers:
        count = seen.get(header, 0) + 1
        seen[header] = count
        unique.append(header if count == 1 else f"{header}_{count}")
    return unique


def extract_transaction_tables_all_pages(pdf):
    """Collect transaction rows from every table on every page of ``pdf``.

    A table whose first row contains at least one header recognised by
    ``fuzzy_match`` against ``TRANSACTION_KEYWORDS`` sets the active headers.
    Any later table without a recognisable header row is treated as a
    continuation and is read with the most recent headers. Only rows that
    contain a date or a two-decimal amount are kept.

    Args:
        pdf: An opened pdfplumber PDF; ``pdf.pages`` is iterated and
            ``extract_tables()`` is called on each page.

    Returns:
        A list of dicts, one per kept row, mapping header name to cell value.
    """
    all_data = []
    active_headers = []

    for page in pdf.pages:
        for table in page.extract_tables() or []:
            if not table:
                continue

            # Cells may be None; normalise before matching so nothing crashes.
            raw_headers = [h.strip().lower() if h else "" for h in table[0]]
            matched = [fuzzy_match(h, TRANSACTION_KEYWORDS) for h in raw_headers]

            if any(matched):
                # New header row: recognised names are canonicalised, the
                # rest are title-cased; duplicates are suffixed so no column
                # is lost.
                active_headers = _unique_headers(
                    [name or raw.title() for name, raw in zip(matched, raw_headers)]
                )
                rows = table[1:]
            elif active_headers:
                # No header row here: assume the table continues the previous
                # one. This also covers a continuation that has a single row.
                rows = table
            else:
                continue  # nothing to label the columns with yet

            for row in rows:
                if not _looks_like_transaction_row(row):
                    continue
                # Pad short rows so every active header receives a value.
                padded = list(row) + [""] * (len(active_headers) - len(row))
                all_data.append(dict(zip(active_headers, padded)))

    return all_data
# "<first token><whitespace><rest>" for summary lines that have no colon.
_SUMMARY_PAIR_RE = re.compile(r"^(\S+)\s+(.+)$")


def _first_table_top(tables, words):
    """Return the ``top`` of the first word equal to a header cell of the first table.

    Returns ``None`` when there is no table, the header row is empty, or no
    word matches any header cell exactly.
    """
    if not tables or not tables[0] or not tables[0][0]:
        return None
    try:
        for label in tables[0][0]:
            for word in words:
                if word["text"] == label:
                    return float(word["top"])
    except (KeyError, TypeError, ValueError):
        # Best effort only: without a usable position the caller simply
        # scans every line of the page.
        return None
    return None


def extract_account_summary_top_of_page(page):
    """Build a key/value summary from the text lines above the first table.

    Lines flagged by ``is_disclaimer`` are skipped. When the vertical
    position of the first table can be estimated, scanning stops at the first
    line that contains a word positioned at or below that table. Each
    remaining line becomes one entry:

    * ``"key: value"`` is split on the first colon;
    * ``"key rest of line"`` is split on the first run of whitespace;
    * anything else is stored as a key with an empty value.

    A key that occurs more than once keeps its last value.

    Args:
        page: A pdfplumber page; ``extract_text``, ``extract_tables`` and
            ``extract_words`` are called on it.

    Returns:
        A dict mapping field label to value (both strings).
    """
    summary = {}

    text = page.extract_text()
    lines = text.splitlines() if text else []
    tables = page.extract_tables()

    # Words are only needed to locate the first table; extract them once
    # rather than once per text line.
    words = page.extract_words() if tables else []
    first_table_top_y = _first_table_top(tables, words)

    for line in lines:
        if is_disclaimer(line):
            continue

        # Compare with None so a table at y == 0.0 is still honoured.
        if first_table_top_y is not None:
            word = next((w for w in words if w["text"] in line), None)
            if word and float(word["top"]) >= first_table_top_y:
                break  # reached the first table; the summary is above it

        if ":" in line:
            key, val = (part.strip() for part in line.split(":", 1))
        else:
            pair = _SUMMARY_PAIR_RE.match(line)
            if pair:
                # Taking the regex groups copes with tabs and repeated spaces
                # between key and value.
                key, val = pair.group(1), pair.group(2).strip()
            else:
                key, val = line.strip(), ""

        if key:
            summary[key] = val

    return summary
# === Main Function ===
def process_bank_statement(pdf_path, output_path):
    """Convert a bank-statement PDF into a multi-sheet Excel workbook.

    Up to four sheets are written, each only when it has content:
    ``Account_Summary``, ``Transactions``, ``Meta_Info`` and ``Disclaimers``.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the ``.xlsx`` file to create.

    Raises:
        ValueError: If nothing could be extracted from the PDF, so there is
            no sheet to write. No output file is created in that case.
    """
    account_summary = {}
    meta_info = []
    disclaimers = []

    with pdfplumber.open(pdf_path) as pdf:
        # STEP 1: account summary from the top of page 1, before transaction
        # parsing.
        if pdf.pages:
            account_summary.update(
                extract_account_summary_top_of_page(pdf.pages[0])
            )

        # STEP 2: scan every page for meta information and disclaimers.
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            for line in text.splitlines() if text else []:
                lowered = line.lower()
                if any(k in lowered for k in ("statement", "period", "summary")):
                    meta_info.append({"Page": page_num, "Info": line.strip()})
                elif is_disclaimer(line):
                    disclaimers.append({"Page": page_num, "Text": line.strip()})

        # STEP 3: transaction tables from all pages (multi-page continuation
        # supported). Must run while the PDF is still open.
        transactions = extract_transaction_tables_all_pages(pdf)

    # STEP 4: build the non-empty sheets, then write them.
    sheets = {}
    if account_summary:
        sheets["Account_Summary"] = pd.DataFrame(
            list(account_summary.items()), columns=["Field", "Value"]
        )
    if transactions:
        sheets["Transactions"] = pd.DataFrame(transactions)
    if meta_info:
        sheets["Meta_Info"] = pd.DataFrame(meta_info)
    if disclaimers:
        sheets["Disclaimers"] = pd.DataFrame(disclaimers)

    if not sheets:
        # A workbook with no sheets cannot be saved; fail with a clear
        # message instead of an error raised while closing the writer.
        raise ValueError("No text or tables could be extracted from the PDF.")

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, sheet_name=sheet_name, index=False)
app.py
import os

from flask import (
    Flask,
    flash,
    redirect,
    render_template,
    request,
    send_file,
    url_for,
)

from extractor import process_bank_statement

app = Flask(__name__)
# The secret key signs the session cookie used by flash(). Set
# FLASK_SECRET_KEY in any real deployment; the literal fallback is kept only
# so existing local setups keep working.
app.secret_key = os.environ.get('FLASK_SECRET_KEY', 'supersecretkey')

# Working directories for uploaded PDFs and generated workbooks, created
# relative to the current working directory at import time.
UPLOAD_FOLDER = 'uploads'
OUTPUT_FOLDER = 'outputs'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
# Lower-case file extensions (without the dot) accepted for upload.
ALLOWED_EXTENSIONS = {'pdf'}


def allowed_file(filename):
    """Return True if ``filename`` has an extension listed in ALLOWED_EXTENSIONS.

    The text after the last dot is compared case-insensitively. A missing or
    empty filename, or one without a dot, is rejected.
    """
    if not filename or '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
@app.route('/')
def index():
    """Serve the upload form."""
    page = render_template('index.html')
    return page
def _safe_stem(filename):
    """Reduce a client-supplied filename to a safe base name without extension.

    Only the last path component is kept (both ``/`` and ``\\`` count as
    separators), the extension is dropped, and every character other than
    letters, digits, space, ``-``, ``_`` and ``.`` is replaced with ``_``.
    Falls back to ``"statement"`` when nothing usable remains.
    """
    leaf = os.path.basename(filename.replace("\\", "/"))
    stem = os.path.splitext(leaf)[0]
    cleaned = "".join(
        ch if ch.isalnum() or ch in "-_ ." else "_" for ch in stem
    ).strip(" .")
    return cleaned or "statement"


@app.route('/process', methods=['POST'])
def process():
    """Accept an uploaded PDF, convert it and return the workbook as a download.

    Validation failures and processing errors are reported with ``flash`` and
    the user is redirected back to the upload form.
    """
    if 'pdf_file' not in request.files:
        flash("No file uploaded.")
        return redirect(url_for('index'))

    file = request.files['pdf_file']
    if not file.filename:
        flash("No file selected.")
        return redirect(url_for('index'))
    if not allowed_file(file.filename):
        flash("Only PDF files are allowed.")
        return redirect(url_for('index'))

    # The filename is client-controlled: never join it into a path as-is, or
    # a name such as "../../x.pdf" would write outside the upload folder.
    stem = _safe_stem(file.filename)

    # Save file
    pdf_path = os.path.join(UPLOAD_FOLDER, f"{stem}.pdf")
    file.save(pdf_path)

    # Define output path
    output_path = os.path.join(OUTPUT_FOLDER, f"{stem}.xlsx")

    try:
        process_bank_statement(pdf_path, output_path)
        # Pass an absolute path so the lookup does not depend on how Flask
        # resolves relative paths (application root vs. working directory).
        return send_file(os.path.abspath(output_path), as_attachment=True)
    except Exception as e:
        # Request boundary: report any failure to the user instead of a 500.
        flash(f"Error processing file: {str(e)}")
        return redirect(url_for('index'))
if __name__ == '__main__':
    # Debug mode enables the interactive debugger, which must never be
    # reachable in production. It is now opt-in: set FLASK_DEBUG=1 (or
    # "true") for local development.
    app.run(debug=os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true'))
index.html
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Bank Statement to Excel</title>
</head>
<body>
  <h2>Upload Bank Statement PDF</h2>

  {# Messages flashed by the server (validation and processing errors). #}
  {% with messages = get_flashed_messages() %}
    {% if messages %}
      <ul style="color:red;">
        {% for msg in messages %}
          <li>{{ msg }}</li>
        {% endfor %}
      </ul>
    {% endif %}
  {% endwith %}

  {# url_for keeps the form target correct if the route or URL prefix changes. #}
  <form action="{{ url_for('process') }}" method="post" enctype="multipart/form-data">
    <input type="file" name="pdf_file" accept=".pdf" required><br><br>
    <button type="submit">Convert to Excel</button>
  </form>
</body>
</html>