Thanks for visiting codestin.com
Credit goes to www.scribd.com

0% found this document useful (0 votes)
14 views5 pages

Flask Project

The document contains a Python script for extracting data from bank statements in PDF format and converting it into Excel files. It uses libraries like pdfplumber for PDF handling and pandas for data manipulation, focusing on extracting account summaries and transaction details. Additionally, a Flask web application is provided to facilitate file uploads and processing, along with a simple HTML interface for users.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
14 views5 pages

Flask Project

The document contains a Python script for extracting data from bank statements in PDF format and converting it into Excel files. It uses libraries like pdfplumber for PDF handling and pandas for data manipulation, focusing on extracting account summaries and transaction details. Additionally, a Flask web application is provided to facilitate file uploads and processing, along with a simple HTML interface for users.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 5

extractor.py

import pdfplumber
import pandas as pd
import re
from fuzzywuzzy import fuzz

# Keywords to identify sections


# Account-level field labels. NOTE(review): nothing in the code visible here
# reads this list yet - confirm whether it is still needed.
ACCOUNT_FIELDS = [
    "account number", "customer id", "customer no", "account name", "account type",
    "ifsc", "micr", "branch", "currency", "balance", "scheme", "cif no", "iban",
    "bic", "sort code", "nomination", "interest rate", "drawing power",
    "mod balance",
]

# Header keywords; a table whose first row resembles any of these is treated
# as a transaction table.
TRANSACTION_KEYWORDS = [
    "date", "txn date", "tran date", "value date", "description", "remarks",
    "ref no", "cheque no", "withdrawal", "deposit", "debit", "credit", "balance",
    "paid in", "paid out",
]

# Substrings that mark a text line as footer/disclaimer boilerplate.
FOOTER_WORDS = [
    "do not share", "otp", "system generated", "reiterate", "liable",
    "password",
]

# === Utilities ===


def fuzzy_match(text, keywords, threshold=85):
    """Return the title-cased keyword that *text* resembles, or ``None``.

    *text* is stripped, lower-cased and has colons removed before comparison.
    An exact match against any keyword wins outright; otherwise the first
    keyword whose ``fuzz.partial_ratio`` score reaches *threshold* is returned.

    Args:
        text: Candidate label (for example a table header cell). ``None`` and
            blank strings never match.
        keywords: Iterable of keyword strings to compare against.
        threshold: Minimum ``partial_ratio`` score (0-100) for a fuzzy match.

    Returns:
        ``keyword.title()`` for the matching keyword, or ``None``.
    """
    if not text:
        return None
    normalized = text.lower().replace(":", "").strip()
    if not normalized:
        return None

    candidates = [(keyword, keyword.lower()) for keyword in keywords]

    # Substring scoring rates "txn date" against "date" as a perfect hit, so
    # without this pass an earlier, shorter keyword would shadow the exact one.
    for keyword, lowered in candidates:
        if lowered == normalized:
            return keyword.title()

    for keyword, lowered in candidates:
        if fuzz.partial_ratio(normalized, lowered) >= threshold:
            return keyword.title()
    return None

def is_disclaimer(line):
    """Report whether *line* contains any of the ``FOOTER_WORDS`` markers.

    The comparison is a case-insensitive substring test.
    """
    lowered = line.lower()
    for marker in FOOTER_WORDS:
        if marker in lowered:
            return True
    return False

# A cell containing either of these marks its row as a transaction row.
_DATE_PATTERN = re.compile(r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}")
_AMOUNT_PATTERN = re.compile(r"\d+\.\d{2}")


def _unique_headers(names):
    """Name blank headers by position and number repeated header names."""
    counts = {}
    unique = []
    for position, name in enumerate(names, start=1):
        name = name or f"Column {position}"
        counts[name] = counts.get(name, 0) + 1
        unique.append(name if counts[name] == 1 else f"{name} ({counts[name]})")
    return unique


def _looks_like_transaction(row):
    """Report whether any cell of *row* contains a date or a decimal amount."""
    return any(
        _DATE_PATTERN.search(cell_text) or _AMOUNT_PATTERN.search(cell_text)
        for cell_text in map(str, row)
    )


def extract_transaction_tables_all_pages(pdf):
    """Collect transaction rows from every table on every page of *pdf*.

    A table whose first row resembles ``TRANSACTION_KEYWORDS`` sets the active
    column headers. A later table with no recognisable header row is treated
    as a continuation and reuses the most recent headers. Tables seen before
    any header row has been recognised are skipped.

    Args:
        pdf: Object exposing ``pages``; each page must provide
            ``extract_tables()`` returning a list of tables (lists of rows).

    Returns:
        A list of dicts, one per transaction row, keyed by header name. Rows
        shorter than the header list are padded with ``''``; cells beyond the
        header count are dropped.
    """
    all_data = []
    active_headers = []

    for page in pdf.pages:
        for table in page.extract_tables():
            if len(table) < 2 or not any(table[0]):
                continue

            # Header cells can be None for empty columns; normalise once and
            # use the normalised form everywhere below.
            raw_headers = [(cell or "").strip().lower() for cell in table[0]]
            matched = [fuzzy_match(header, TRANSACTION_KEYWORDS) for header in raw_headers]

            if any(matched):
                # Unique names matter: the record is a dict, so two columns
                # sharing a name would silently overwrite one another.
                active_headers = _unique_headers(
                    [hit or header.title() for hit, header in zip(matched, raw_headers)]
                )
                rows = table[1:]
            elif active_headers:
                # No header row here: assume the previous table continues.
                rows = table
            else:
                continue

            for row in rows:
                if not _looks_like_transaction(row):
                    continue
                padded = list(row) + [""] * (len(active_headers) - len(row))
                all_data.append(dict(zip(active_headers, padded)))

    return all_data

def extract_account_summary_top_of_page(page):
    """Build a key/value summary from the text lines above the first table.

    Each non-disclaimer line becomes one entry: text before the first ``:`` is
    the key and the rest is the value; a line without a colon is split at its
    first run of whitespace; a single-token line becomes a key with an empty
    value. A later line with the same key overwrites the earlier one.

    Args:
        page: Object providing ``extract_text()``, ``extract_tables()`` and
            ``extract_words()`` (word dicts with ``"text"`` and ``"top"``).

    Returns:
        Dict mapping key text to value text; empty when the page has no text.
    """
    summary = {}
    text = page.extract_text()
    lines = text.splitlines() if text else []
    tables = page.extract_tables()

    # Estimate the Y position of the first table from the first header cell
    # that also appears verbatim as an extracted word.
    words = []
    first_table_top_y = None
    if tables:
        try:
            words = page.extract_words()
            for header_cell in tables[0][0]:
                anchor = next((w for w in words if w["text"] == header_cell), None)
                if anchor:
                    first_table_top_y = float(anchor["top"])
                    break
        except Exception:
            # Best effort only: without an estimate every line is considered.
            first_table_top_y = None

    for line in lines:
        if is_disclaimer(line):
            continue

        # Compare against None so a table sitting at y == 0.0 still counts.
        if first_table_top_y is not None:
            # NOTE(review): this takes the first word on the page whose text
            # occurs in the line, which may not be a word *from* this line.
            located = next((w for w in words if w["text"] in line), None)
            if located and float(located["top"]) >= first_table_top_y:
                break  # reached the first table; stop collecting

        if ":" in line:
            key, _, val = line.partition(":")
            key, val = key.strip(), val.strip()
        elif re.match(r"^\S+\s+.+$", line):
            # Split on any whitespace run: splitting on a single space fails
            # to unpack for tab-separated or trailing-space lines.
            pieces = line.split(None, 1)
            key = pieces[0]
            val = pieces[1].strip() if len(pieces) == 2 else ""
        else:
            key, val = line.strip(), ""

        if key:
            summary[key] = val
    return summary

# === Main Function ===


def process_bank_statement(pdf_path, output_path):
    """Extract data from the PDF at *pdf_path* into an Excel workbook.

    Up to four sheets are written, each only when it has content:
    ``Account_Summary``, ``Transactions``, ``Meta_Info`` and ``Disclaimers``.

    Args:
        pdf_path: Path of the PDF statement to read.
        output_path: Path of the ``.xlsx`` file to create.

    Raises:
        ValueError: If nothing could be extracted, since a workbook with no
            sheets cannot be saved.
    """
    account_summary = {}
    meta_info = []
    disclaimers = []

    with pdfplumber.open(pdf_path) as pdf:
        # Step 1: account summary from the top of page 1.
        if pdf.pages:
            account_summary.update(extract_account_summary_top_of_page(pdf.pages[0]))

        # Step 2: scan every page for meta information and disclaimers.
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            for line in (text.splitlines() if text else []):
                lowered = line.lower()
                if any(marker in lowered for marker in ("statement", "period", "summary")):
                    meta_info.append({"Page": page_num, "Info": line.strip()})
                elif is_disclaimer(line):
                    disclaimers.append({"Page": page_num, "Text": line.strip()})

        # Step 3: transaction tables from all pages, including continuations.
        transactions = extract_transaction_tables_all_pages(pdf)

    # Step 4: assemble the non-empty sheets, then write them in one pass.
    sheets = []
    if account_summary:
        summary_frame = pd.DataFrame(
            list(account_summary.items()), columns=["Field", "Value"]
        )
        sheets.append(("Account_Summary", summary_frame))
    if transactions:
        sheets.append(("Transactions", pd.DataFrame(transactions)))
    if meta_info:
        sheets.append(("Meta_Info", pd.DataFrame(meta_info)))
    if disclaimers:
        sheets.append(("Disclaimers", pd.DataFrame(disclaimers)))

    # Fail with a clear message before creating the file; saving a workbook
    # that has no sheets raises an obscure error from the Excel engine.
    if not sheets:
        raise ValueError("No extractable data found in the PDF.")

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for sheet_name, frame in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

app.py

from flask import Flask, render_template, request, send_file, redirect, url_for,


flash
import os
from extractor import process_bank_statement

app = Flask(__name__)
# flash() stores its messages in the signed session cookie, so the signing key
# must not be a literal committed to source. Set SECRET_KEY in the environment
# for a stable key; otherwise a random key is generated per process.
app.secret_key = os.environ.get("SECRET_KEY") or os.urandom(32)

# Working directories for uploaded PDFs and generated workbooks; created at
# import time if missing.
UPLOAD_FOLDER = 'uploads'
OUTPUT_FOLDER = 'outputs'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Lower-case file extensions accepted for upload.
ALLOWED_EXTENSIONS = {'pdf'}

def allowed_file(filename):
    """Report whether *filename* ends in an extension from ``ALLOWED_EXTENSIONS``.

    The extension is the text after the last ``.``, compared case-insensitively.
    A missing, empty or dot-less filename is rejected.
    """
    if not filename or '.' not in filename:
        return False
    return filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

@app.route('/')
def index():
    """Render the ``index.html`` template for the site root."""
    page = render_template('index.html')
    return page

@app.route('/process', methods=['POST'])
def process():
    """Convert the uploaded PDF and return the workbook as a download.

    Reads the ``pdf_file`` form field, stores it under ``UPLOAD_FOLDER``, runs
    ``process_bank_statement`` and sends the resulting ``.xlsx`` file. Any
    validation or processing failure is flashed and redirects to the index.
    """
    upload = request.files.get('pdf_file')
    if upload is None:
        flash("No file uploaded.")
        return redirect(url_for('index'))

    if not upload.filename:
        flash("No file selected.")
        return redirect(url_for('index'))

    # The filename is client-controlled. Keep only its final path component
    # (treating backslashes as separators too) so a name such as
    # "../../app.py" cannot write outside the upload/output folders.
    filename = os.path.basename(upload.filename.replace("\\", "/"))

    if not filename or not allowed_file(filename):
        flash("Only PDF files are allowed.")
        return redirect(url_for('index'))

    # Save the upload.
    pdf_path = os.path.join(UPLOAD_FOLDER, filename)
    upload.save(pdf_path)

    # The workbook reuses the upload's base name.
    base_name = os.path.splitext(filename)[0]
    output_path = os.path.join(OUTPUT_FOLDER, f"{base_name}.xlsx")

    try:
        process_bank_statement(pdf_path, output_path)
        return send_file(output_path, as_attachment=True)
    except Exception as e:
        # Request boundary: report the failure to the user instead of a 500.
        flash(f"Error processing file: {str(e)}")
        return redirect(url_for('index'))

if __name__ == '__main__':
    # Debug mode turns on the interactive debugger, which can run arbitrary
    # code from the browser, so it is opt-in: set FLASK_DEBUG=1 to enable it.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1")

index.html

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Bank Statement to Excel</title>
</head>
<body>
    <h2>Upload Bank Statement PDF</h2>

    {# Messages queued with flash() by the views are listed here. #}
    {% with messages = get_flashed_messages() %}
        {% if messages %}
            <ul style="color:red;">
                {% for msg in messages %}
                    <li>{{ msg }}</li>
                {% endfor %}
            </ul>
        {% endif %}
    {% endwith %}

    {# url_for keeps the action correct if the app is mounted under a prefix. #}
    <form action="{{ url_for('process') }}" method="post" enctype="multipart/form-data">
        <input type="file" name="pdf_file" accept=".pdf" aria-label="Bank statement PDF" required><br><br>
        <button type="submit">Convert to Excel</button>
    </form>
</body>
</html>

You might also like