0% found this document useful (0 votes)

14 views5 pages

Flask Project

The document contains a Python script for extracting data from bank statements in PDF format and converting it into Excel files. It uses libraries like pdfplumber for PDF handling and pandas for data manipulation, focusing on extracting account summaries and transaction details. Additionally, a Flask web application is provided to facilitate file uploads and processing, along with a simple HTML interface for users.

Uploaded by

tasneemnadkar1804

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF or read online on Scribd

0% found this document useful (0 votes)

14 views5 pages

Flask Project

Uploaded by

tasneemnadkar1804

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

You are on page 1/ 5

extractor.py

import pdfplumber
import pandas as pd
import re
from fuzzywuzzy import fuzz

# Keywords to identify sections

# Account-summary field labels. NOTE(review): not referenced by any function
# visible in this module - confirm whether another module consumes it.
ACCOUNT_FIELDS = [
    "account number", "customer id", "customer no", "account name", "account type",
    "ifsc", "micr", "branch", "currency", "balance", "scheme", "cif no", "iban",
    "bic", "sort code", "nomination", "interest rate", "drawing power",
    "mod balance",
]

# Column-header keywords; a table whose header row matches any of these is
# treated as a transaction table.
TRANSACTION_KEYWORDS = [
    "date", "txn date", "tran date", "value date", "description", "remarks",
    "ref no", "cheque no", "withdrawal", "deposit", "debit", "credit", "balance",
    "paid in", "paid out",
]

# Substrings (compared case-insensitively) that mark a line as disclaimer text.
FOOTER_WORDS = [
    "do not share", "otp", "system generated", "reiterate", "liable", "password",
]

# === Utilities ===

def fuzzy_match(text, keywords, threshold=85):
    """Return the title-cased keyword that identifies *text*, or None.

    *text* is normalised (surrounding whitespace stripped, lower-cased, colons
    removed) before comparison. An exact match against a keyword wins; only
    when there is none is ``fuzz.partial_ratio`` consulted, and the first
    keyword scoring at least *threshold* is returned.

    Args:
        text: Candidate label, e.g. a table header cell. Non-string values
            (such as the ``None`` used for empty table cells) never match.
        keywords: Iterable of keyword strings to compare against.
        threshold: Minimum ``partial_ratio`` score (0-100) for a fuzzy match.

    Returns:
        ``keyword.title()`` for the matching keyword, or ``None``.
    """
    if not isinstance(text, str):
        return None

    # The trailing strip removes whitespace left behind by colon removal
    # ("Date :" -> "date").
    text = text.strip().lower().replace(":", "").strip()
    if not text:
        return None

    candidates = [(keyword, keyword.lower()) for keyword in keywords]

    # Exact matches take priority: partial_ratio scores "value date" against
    # "date" as a full match, so a purely order-based fuzzy scan would label a
    # "Value Date" column as "Date".
    for keyword, lowered in candidates:
        if text == lowered:
            return keyword.title()

    for keyword, lowered in candidates:
        if fuzz.partial_ratio(text, lowered) >= threshold:
            return keyword.title()
    return None

def is_disclaimer(line):
    """Report whether *line* contains any FOOTER_WORDS entry, ignoring case."""
    for phrase in FOOTER_WORDS:
        if phrase in line.lower():
            return True
    return False

def extract_transaction_tables_all_pages(pdf):
    """Collect transaction rows from every table on every page of *pdf*.

    A table whose first row contains at least one cell matching
    TRANSACTION_KEYWORDS starts a new set of active column names. Later tables
    without such a header row are treated as continuations and reuse the
    active column names, with their first row counted as data. Only rows with
    a cell that looks like a date (``dd-mm-yy`` / ``dd/mm/yyyy`` style) or an
    amount with two decimals are kept.

    Args:
        pdf: Object exposing ``pages``; each page must provide
            ``extract_tables()`` returning tables as lists of rows.

    Returns:
        A list of dicts mapping column name to cell value, in reading order.
        Rows shorter than the header are padded with ``''``; cells beyond the
        header width are dropped by ``zip``.
    """
    all_data = []
    active_headers = []

    for page in pdf.pages:
        for table in page.extract_tables():
            # Need a non-empty first row plus at least one more row.
            if len(table) < 2 or not any(table[0]):
                continue

            # Empty header cells arrive as None; normalise before matching so
            # they cannot raise AttributeError.
            header_cells = [(cell or "").strip() for cell in table[0]]
            matched = [fuzzy_match(cell, TRANSACTION_KEYWORDS) for cell in header_cells]

            if any(matched):
                active_headers = _unique_column_names(matched, header_cells)
                rows = table[1:]
            elif active_headers:
                # No recognisable header: assume this table continues the
                # previous transaction table.
                rows = table
            else:
                continue

            for row in rows:
                if not _looks_like_transaction(row):
                    continue
                padded = list(row) + [""] * (len(active_headers) - len(row))
                all_data.append(dict(zip(active_headers, padded)))

    return all_data


# Matches dates such as 1/2/24, 01-02-2024.
_DATE_CELL_RE = re.compile(r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}")
# Matches amounts written with exactly two decimals, e.g. 1250.00.
_AMOUNT_CELL_RE = re.compile(r"\d+\.\d{2}")


def _looks_like_transaction(row):
    """Return True when any cell of *row* contains a date or a decimal amount."""
    return any(
        _DATE_CELL_RE.search(str(cell)) or _AMOUNT_CELL_RE.search(str(cell))
        for cell in row
    )


def _unique_column_names(matched, header_cells):
    """Build one distinct column name per header cell.

    Uses the matched keyword when there is one, otherwise the title-cased cell
    text, otherwise ``Column_<position>`` for blank cells. Repeated names get a
    numeric suffix so that ``dict(zip(...))`` cannot silently merge columns.
    """
    names = []
    seen = set()
    for position, (keyword, cell) in enumerate(zip(matched, header_cells), start=1):
        base = keyword or cell.title() or f"Column_{position}"
        name = base
        suffix = 2
        while name in seen:
            name = f"{base}_{suffix}"
            suffix += 1
        seen.add(name)
        names.append(name)
    return names

def extract_account_summary_top_of_page(page):
    """Parse the text lines above the first table of *page* into key/value pairs.

    Each non-disclaimer line is split into a key and a value: at the first
    colon when one is present, otherwise at the first run of whitespace. A
    line with no separator becomes a key with an empty value. Scanning stops
    at the first line judged to sit at or below the top of the first table.

    Args:
        page: Object providing ``extract_text()``, ``extract_tables()`` and
            ``extract_words()`` (word dicts with ``"text"`` and ``"top"``).

    Returns:
        Dict of key -> value strings. A key that occurs more than once keeps
        the value from its last occurrence.
    """
    summary = {}
    text = page.extract_text()
    lines = text.splitlines() if text else []
    tables = page.extract_tables()

    # Extract words once; they are needed both to locate the table and to
    # position every text line.
    words = page.extract_words() if tables else []
    first_table_top_y = _first_table_top(tables, words)

    for line in lines:
        if is_disclaimer(line):
            continue

        # Compare against None explicitly so a table at y == 0.0 still counts.
        if first_table_top_y is not None:
            # Heuristic: the first extracted word whose text occurs in the
            # line stands in for the line's vertical position.
            # NOTE(review): a short word that also appears higher on the page
            # is found first, so the cut-off can be missed - confirm on real
            # statements.
            word = next((w for w in words if w["text"] in line), None)
            if word and float(word["top"]) >= first_table_top_y:
                break  # everything from here on is at or below the first table

        if ":" in line:
            key, val = (part.strip() for part in line.split(":", 1))
        elif _LABEL_VALUE_RE.match(line):
            # Split on any whitespace run: the separator may be a tab or
            # several spaces, which a literal single-space split mishandles.
            parts = line.split(None, 1)
            key = parts[0]
            val = parts[1].strip() if len(parts) > 1 else ""
        else:
            key, val = line.strip(), ""

        if key:
            summary[key] = val
    return summary


# A line that starts with a token, then whitespace, then more text.
_LABEL_VALUE_RE = re.compile(r"^\S+\s+.+$")


def _first_table_top(tables, words):
    """Return the ``top`` of the first word equal to a first-table header cell.

    Returns None when there is no table, no header cell matches a word, or the
    word data is not in the expected shape (best-effort estimate).
    """
    try:
        if not tables:
            return None
        for label in tables[0][0]:
            match = next((w for w in words if w["text"] == label), None)
            if match:
                return float(match["top"])
    except (IndexError, KeyError, TypeError, ValueError):
        return None
    return None

# === Main Function ===

def process_bank_statement(pdf_path, output_path):
    """Extract data from a bank-statement PDF and write it to an Excel workbook.

    Up to four sheets are written, each only when it has content:
    ``Account_Summary`` (key/value pairs from the top of page 1),
    ``Transactions`` (rows from all transaction tables), ``Meta_Info`` (lines
    mentioning "statement", "period" or "summary") and ``Disclaimers``.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the ``.xlsx`` file to create.

    Raises:
        ValueError: If nothing could be extracted. Raised before the workbook
            is opened, because a workbook with no sheets cannot be saved.
    """
    account_summary = {}
    meta_info = []
    disclaimers = []

    with pdfplumber.open(pdf_path) as pdf:
        # STEP 1: account summary from the top of page 1, before transaction parsing.
        if pdf.pages:
            account_summary.update(extract_account_summary_top_of_page(pdf.pages[0]))

        # STEP 2: scan every page for meta info and disclaimers.
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()  # extract once per page; it is expensive
            for line in (text.splitlines() if text else []):
                lowered = line.lower()
                if any(k in lowered for k in ("statement", "period", "summary")):
                    meta_info.append({"Page": page_num, "Info": line.strip()})
                elif is_disclaimer(line):
                    disclaimers.append({"Page": page_num, "Text": line.strip()})

        # STEP 3: transaction tables from all pages (multi-page continuation supported).
        transactions = extract_transaction_tables_all_pages(pdf)

    # STEP 4: build the sheets first so an empty result is reported clearly.
    sheets = []
    if account_summary:
        summary_frame = pd.DataFrame(
            list(account_summary.items()), columns=["Field", "Value"]
        )
        sheets.append(("Account_Summary", summary_frame))
    if transactions:
        sheets.append(("Transactions", pd.DataFrame(transactions)))
    if meta_info:
        sheets.append(("Meta_Info", pd.DataFrame(meta_info)))
    if disclaimers:
        sheets.append(("Disclaimers", pd.DataFrame(disclaimers)))

    if not sheets:
        raise ValueError("No extractable text or tables were found in the PDF.")

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for sheet_name, frame in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

app.py

import os

from flask import (
    Flask,
    flash,
    redirect,
    render_template,
    request,
    send_file,
    url_for,
)

from extractor import process_bank_statement

app = Flask(__name__)
# The secret key signs the session cookie that carries flashed messages, so it
# must not live in source control. Set FLASK_SECRET_KEY in the environment;
# the random fallback changes on every start and differs between worker
# processes, so it is only suitable for local single-process use.
app.secret_key = os.environ.get("FLASK_SECRET_KEY") or os.urandom(32)

# Working directories for uploaded PDFs and generated workbooks, created
# relative to the current working directory at import time.
UPLOAD_FOLDER = 'uploads'
OUTPUT_FOLDER = 'outputs'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Lower-case file extensions (without the dot) accepted for upload.
ALLOWED_EXTENSIONS = {'pdf'}


def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively.
    Missing (``None``), empty, and dot-less names are rejected.
    """
    if not filename or '.' not in filename:
        return False
    return filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
@app.route('/')
def index():
    """Serve the upload form rendered from the ``index.html`` template."""
    upload_page = render_template('index.html')
    return upload_page

@app.route('/process', methods=['POST'])
def process():
    """Handle a PDF upload, convert it, and return the Excel file as a download.

    Expects a multipart form field named ``pdf_file``. On any validation or
    processing failure a message is flashed and the client is redirected to
    the upload form.
    """
    if 'pdf_file' not in request.files:
        flash("No file uploaded.")
        return redirect(url_for('index'))

    file = request.files['pdf_file']
    # Covers both an empty string and a missing (None) filename.
    if not file.filename:
        flash("No file selected.")
        return redirect(url_for('index'))

    if not allowed_file(file.filename):
        flash("Only PDF files are allowed.")
        return redirect(url_for('index'))

    # The filename is client-controlled: keep only its final path component
    # (treating backslashes as separators too) so a name such as
    # "../../app.pdf" cannot write outside the upload and output folders.
    filename = os.path.basename(file.filename.replace('\\', '/'))
    pdf_path = os.path.join(UPLOAD_FOLDER, filename)

    # Define output path
    base_name = os.path.splitext(filename)[0]
    output_path = os.path.join(OUTPUT_FOLDER, f"{base_name}.xlsx")

    try:
        # Saving is inside the try so that a disk or path error is reported
        # through the same flash-and-redirect path as a processing error.
        file.save(pdf_path)
        process_bank_statement(pdf_path, output_path)
        # Send an absolute path so the lookup does not depend on how Flask
        # resolves relative paths (it may differ from the working directory
        # the workbook was written under).
        return send_file(os.path.abspath(output_path), as_attachment=True)
    except Exception as e:
        flash(f"Error processing file: {str(e)}")
        return redirect(url_for('index'))

if __name__ == '__main__':
    # Debug mode exposes the interactive Werkzeug debugger, which allows
    # arbitrary code execution, so it is opt-in via FLASK_DEBUG rather than
    # always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1", "true", "yes", "on",
    }
    app.run(debug=debug_enabled)

index.html

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Bank Statement to Excel</title>
</head>
<body>
    <h2>Upload Bank Statement PDF</h2>

    {# Messages flashed by the server (validation and processing errors). #}
    {% with messages = get_flashed_messages() %}
    {% if messages %}
    <ul style="color:red;">
        {% for msg in messages %}
        <li>{{ msg }}</li>
        {% endfor %}
    </ul>
    {% endif %}
    {% endwith %}

    {# url_for keeps the form target correct if the app is mounted under a prefix. #}
    <form action="{{ url_for('process') }}" method="post" enctype="multipart/form-data">
        <label for="pdf_file">PDF statement</label><br>
        <input type="file" id="pdf_file" name="pdf_file" accept=".pdf,application/pdf" required><br><br>
        <button type="submit">Convert to Excel</button>
    </form>
</body>
</html>

A Project Report On Bank Management System
76% (233)
A Project Report On Bank Management System
27 pages
Pco2
No ratings yet
Pco2
55 pages
Chapter Shutdown
No ratings yet
Chapter Shutdown
31 pages
SingerValve 106 PR UL Fire Valve Sheet Product Pages
No ratings yet
SingerValve 106 PR UL Fire Valve Sheet Product Pages
2 pages
Failover-Clustering Windows Server
No ratings yet
Failover-Clustering Windows Server
89 pages
Nintendo Power Issue 271 (September 2011)
No ratings yet
Nintendo Power Issue 271 (September 2011)
101 pages
Main 115
No ratings yet
Main 115
22 pages
Online Bank Portal PROJECT
No ratings yet
Online Bank Portal PROJECT
29 pages
AI in Fashion Market - Segmentation Detailed Study With Forecast - Facts and Trends
No ratings yet
AI in Fashion Market - Segmentation Detailed Study With Forecast - Facts and Trends
2 pages
Banking System Project Report
No ratings yet
Banking System Project Report
25 pages
Online Payment Fraud Detection - Ipynb
No ratings yet
Online Payment Fraud Detection - Ipynb
120 pages
Act 115 1
No ratings yet
Act 115 1
22 pages
Developing A Data Warehouse For The Healthcare Enterprise Lessons From The Trenches Coll. Download PDF
100% (3)
Developing A Data Warehouse For The Healthcare Enterprise Lessons From The Trenches Coll. Download PDF
65 pages
Athi Physics
No ratings yet
Athi Physics
35 pages
Aws Kms Best Practices PDF
No ratings yet
Aws Kms Best Practices PDF
24 pages
FULL PreSonus Studio One 4 Professional 411 MULTILANG x64 PDF
No ratings yet
FULL PreSonus Studio One 4 Professional 411 MULTILANG x64 PDF
4 pages
How To Parse Data Tables From A PDF Bank Statement With Python - by Phillip Heita - Nov, 2021 - Medium
No ratings yet
How To Parse Data Tables From A PDF Bank Statement With Python - by Phillip Heita - Nov, 2021 - Medium
8 pages
Main Code
No ratings yet
Main Code
10 pages
HT Test Reopts July CTPT 2020
No ratings yet
HT Test Reopts July CTPT 2020
6 pages
111 Final
No ratings yet
111 Final
12 pages
Bank Transaction Js Code
No ratings yet
Bank Transaction Js Code
1 page
Lecture05 IntervalTree
No ratings yet
Lecture05 IntervalTree
4 pages
A Project Report On Bank Management System
No ratings yet
A Project Report On Bank Management System
20 pages
Specialized Business Information Systems
0% (1)
Specialized Business Information Systems
34 pages
Web Scraping Code
No ratings yet
Web Scraping Code
4 pages
Freedom-Ticket 01-2 Notes
No ratings yet
Freedom-Ticket 01-2 Notes
10 pages
Types of Brakes
No ratings yet
Types of Brakes
12 pages
84 3
No ratings yet
84 3
10 pages
TDS DLSF Series
No ratings yet
TDS DLSF Series
3 pages
File Cleaning
No ratings yet
File Cleaning
2 pages
87 1
No ratings yet
87 1
10 pages
A Project Report On Bank Management System
No ratings yet
A Project Report On Bank Management System
24 pages
A Project Report On Bank Management System2
No ratings yet
A Project Report On Bank Management System2
27 pages
84 Store
No ratings yet
84 Store
7 pages
DB Connsa
No ratings yet
DB Connsa
3 pages
Untitled Document
No ratings yet
Untitled Document
6 pages
Distributed Computing
No ratings yet
Distributed Computing
3 pages
A Project Report On Bank Management System3
No ratings yet
A Project Report On Bank Management System3
27 pages
Statement Code
No ratings yet
Statement Code
13 pages
Mihir Cs Project
No ratings yet
Mihir Cs Project
24 pages
Bank Management
No ratings yet
Bank Management
3 pages
Bank Statement Simulator Script
No ratings yet
Bank Statement Simulator Script
2 pages
Code Python
No ratings yet
Code Python
3 pages
A Project Report On Bank Management System.
No ratings yet
A Project Report On Bank Management System.
27 pages
Classical Mechanics A Modern Perspective
No ratings yet
Classical Mechanics A Modern Perspective
2 pages
Scraperskank
No ratings yet
Scraperskank
3 pages
Final 057
No ratings yet
Final 057
8 pages
Information - Project (Kirti) - Organized
No ratings yet
Information - Project (Kirti) - Organized
26 pages
A Project Report On Bank Management System
No ratings yet
A Project Report On Bank Management System
22 pages
A Project Report On Bank Management System
No ratings yet
A Project Report On Bank Management System
25 pages
This Is A PDF Extractor
No ratings yet
This Is A PDF Extractor
2 pages
Training 1
No ratings yet
Training 1
2 pages
Bank Management, Grocery Management, SST SQP
No ratings yet
Bank Management, Grocery Management, SST SQP
27 pages
Assignment 2
No ratings yet
Assignment 2
2 pages
File Code
No ratings yet
File Code
13 pages
Bank Management
No ratings yet
Bank Management
28 pages
File 10
No ratings yet
File 10
2 pages
File 21
No ratings yet
File 21
2 pages
Long Docs
No ratings yet
Long Docs
8 pages
PSPDF Title Extraction Explanation
No ratings yet
PSPDF Title Extraction Explanation
3 pages
PROJECT1
No ratings yet
PROJECT1
17 pages
Python Bank Management System Report
No ratings yet
Python Bank Management System Report
25 pages
Bfsi - Ocr
No ratings yet
Bfsi - Ocr
12 pages
Computer Project Final - pdf2
No ratings yet
Computer Project Final - pdf2
22 pages
Akk
No ratings yet
Akk
2 pages
Bank MNG
No ratings yet
Bank MNG
15 pages
File 15
No ratings yet
File 15
3 pages
Python Code
No ratings yet
Python Code
5 pages
Task
No ratings yet
Task
15 pages
TXN Date Description Debit Credit Balance Date Value Datechq - No. Value Date Ref No./Cheque No
No ratings yet
TXN Date Description Debit Credit Balance Date Value Datechq - No. Value Date Ref No./Cheque No
4 pages
Field Value
No ratings yet
Field Value
4 pages
Field Value
No ratings yet
Field Value
5 pages
Harshita
No ratings yet
Harshita
23 pages
File 14
No ratings yet
File 14
5 pages
Banking Management System
No ratings yet
Banking Management System
21 pages
Project File Reeda Naaz
No ratings yet
Project File Reeda Naaz
27 pages
PolyJet Print-Head Claim Procedure
No ratings yet
PolyJet Print-Head Claim Procedure
3 pages
Cs 3308 Unit 7 Programming Assignment
No ratings yet
Cs 3308 Unit 7 Programming Assignment
8 pages
Allied Meditec 1100 October 2023 Ver23-10
No ratings yet
Allied Meditec 1100 October 2023 Ver23-10
2 pages
Engineers in Society Exam Guide
No ratings yet
Engineers in Society Exam Guide
349 pages
Fish-Ridge Wind Turbine
No ratings yet
Fish-Ridge Wind Turbine
19 pages
Bank Management System Project in Python
0% (2)
Bank Management System Project in Python
12 pages
2021 Fia f3 Regional Homologation 11.01.21
No ratings yet
2021 Fia f3 Regional Homologation 11.01.21
21 pages
CSE-224 (Fundamentals of Android)
No ratings yet
CSE-224 (Fundamentals of Android)
2 pages
IDELA Training Manual - Baseline II
No ratings yet
IDELA Training Manual - Baseline II
30 pages
Example of List in Python
No ratings yet
Example of List in Python
2 pages
A Rapid Abnormal Event Detection Method For Surveillance Video Based On A Novel Feature in Compressed Domain of HEVC
No ratings yet
A Rapid Abnormal Event Detection Method For Surveillance Video Based On A Novel Feature in Compressed Domain of HEVC
6 pages
Dutch Fintech Map 2022: Ecosystem Insights
No ratings yet
Dutch Fintech Map 2022: Ecosystem Insights
16 pages
Empirical Study On Terminal Water Velocity of Drainage Stack - C.L. Cheng, K.C. He e C.L
No ratings yet
Empirical Study On Terminal Water Velocity of Drainage Stack - C.L. Cheng, K.C. He e C.L
15 pages
ORGANIZING
No ratings yet
ORGANIZING
36 pages
Find List of Oyo in Hyderabad Near Me - Justdial
No ratings yet
Find List of Oyo in Hyderabad Near Me - Justdial
46 pages
Road Restraint Systems Guide
No ratings yet
Road Restraint Systems Guide
82 pages

Flask Project

Uploaded by

Flask Project

Uploaded by

extractor.

# Keywords to identify sections

FOOTER_WORDS = ["do not share", "otp", "system generated", "reiterate", "liable",

# === Utilities ===

for page in pdf.pages:

raw_headers = [h.strip().lower() if h else "" for h in table[0]]

# Check if table looks like a transaction table

# Estimate Y position of first table

for line in lines:

# === Main Function ===

# === STEP 4: Export all sheets to Excel

from flask import Flask, render_template, request, send_file, redirect, url_for,

# Define output path

{% with messages = get_flashed_messages() %}

<form action="/process" method="post" enctype="multipart/form-data">

You might also like