from fastapi import FastAPI, File, UploadFile, HTTPException
import pytesseract
import cv2
import os
from PIL import Image
import json
import unicodedata
from pdf2image import convert_from_bytes
from pypdf import PdfReader
import numpy as np
from typing import List
import io
import logging
import time
import psutil
import shutil
import cachetools
import hashlib
import google.generativeai as genai
from dotenv import load_dotenv
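
# Invoice OCR + structuring service:
# - accepts PDF/JPG/JPEG/PNG uploads on POST /ocr
# - extracts raw text (embedded PDF text when available, Tesseract OCR otherwise)
# - asks Gemini to map the text onto a fixed invoice JSON schema
# - memoizes raw text and structured results in two 1-hour TTL caches
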
app = FastAPI()
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Load environment variables
load_dotenv()
# Configure Gemini API
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    logger.error("GOOGLE_API_KEY not set")
    # Fail fast at startup; HTTPException is only meaningful inside a request handler
    raise RuntimeError("GOOGLE_API_KEY not set")
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-2.0-flash")
# Set Tesseract path: prefer the binary found on PATH, falling back to the common Linux location
pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or "/usr/bin/tesseract"
# In-memory caches (1-hour TTL)
raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)

def log_memory_usage():
    """Return a string describing current process memory usage."""
    process = psutil.Process()
    mem_info = process.memory_info()
    return f"Memory usage: {mem_info.rss / 1024 / 1024:.2f} MB"

# MD5 is used below purely as a fast cache key, not for security.
def get_file_hash(file_bytes):
    """Generate MD5 hash of file content."""
    return hashlib.md5(file_bytes).hexdigest()

def get_text_hash(raw_text):
    """Generate MD5 hash of raw text."""
    return hashlib.md5(raw_text.encode('utf-8')).hexdigest()

def get_poppler_path():
    """Determine the correct poppler path based on the system."""
    # Check if poppler utilities are in PATH
    if shutil.which('pdftoppm'):
        return None  # Use system PATH
    # Common poppler paths for different systems
    common_paths = [
        "/usr/bin",                         # Linux
        "/usr/local/bin",                   # macOS with Homebrew
        "/opt/homebrew/bin",                # macOS with Apple Silicon Homebrew
        "/usr/share/poppler/bin",           # Some Linux distributions
        "C:\\poppler\\Library\\bin",        # Windows
        "C:\\Program Files\\poppler\\bin",  # Windows alternative
    ]
    for path in common_paths:
        if (os.path.exists(os.path.join(path, "pdftoppm"))
                or os.path.exists(os.path.join(path, "pdftoppm.exe"))):
            return path
    return None

async def process_image(img_bytes, filename, idx):
    """Process a single image (JPG/JPEG/PNG) with OCR."""
    start_time = time.time()
    logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
    try:
        # Convert to RGB first so PNGs with alpha or palette modes don't break cvtColor
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
        custom_config = r'--oem 1 --psm 6 -l eng+ara'  # Reduced config for performance
        page_text = pytesseract.image_to_string(img_pil, config=custom_config)
        logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return page_text + "\n"
    except Exception as e:
        logger.error(f"OCR failed for {filename} image {idx}: {str(e)}, {log_memory_usage()}")
        return ""

async def process_pdf_page(img, page_idx):
    """Process a single PDF page with OCR."""
    start_time = time.time()
    logger.info(f"Starting OCR for PDF page {page_idx}, {log_memory_usage()}")
    try:
        img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
        custom_config = r'--oem 1 --psm 6 -l eng+ara'  # Reduced config for performance
        page_text = pytesseract.image_to_string(img_pil, config=custom_config)
        logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return page_text + "\n"
    except Exception as e:
        logger.error(f"OCR failed for PDF page {page_idx}: {str(e)}, {log_memory_usage()}")
        return ""

async def process_with_gemini(filename: str, raw_text: str):
    """Process raw text with Gemini to extract structured data."""
    start_time = time.time()
    logger.info(f"Starting Gemini processing for {filename}, {log_memory_usage()}")
    # Check structured data cache
    text_hash = get_text_hash(raw_text)
    if text_hash in structured_data_cache:
        logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
        return structured_data_cache[text_hash]
    # Truncate text to keep the Gemini prompt small
    if len(raw_text) > 10000:
        raw_text = raw_text[:10000]
        logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")
    try:
        prompt = f"""
You are an intelligent invoice data extractor. Given raw text from an invoice
in any language, extract the key business fields in the JSON format specified
below. Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS'). The
'items' array is dynamic and may contain multiple line items, each with its
quantity, description, unit price, and amounts. Detect the currency (e.g.,
USD, INR, EUR) from symbols ($, ₹, €) or text; default to USD if unclear. If a
field is missing, include it with an empty string ("") or an appropriate
default (e.g., 0 for numbers).
Raw text:
{raw_text}
Output JSON:
{{
  "invoice": {{
    "invoice_number": "",
    "invoice_date": "YYYY-MM-DD",
    "due_date": "YYYY-MM-DD",
    "purchase_order_number": "",
    "vendor": {{
      "vendor_id": "",
      "name": "",
      "address": {{
        "line1": "",
        "line2": "",
        "city": "",
        "state": "",
        "postal_code": "",
        "country": ""
      }},
      "contact": {{
        "email": "",
        "phone": ""
      }},
      "tax_id": ""
    }},
    "buyer": {{
      "buyer_id": "",
      "name": "",
      "address": {{
        "line1": "",
        "line2": "",
        "city": "",
        "state": "",
        "postal_code": "",
        "country": ""
      }},
      "contact": {{
        "email": "",
        "phone": ""
      }},
      "tax_id": ""
    }},
    "items": [
      {{
        "item_id": "",
        "description": "",
        "quantity": 0,
        "unit_of_measure": "",
        "unit_price": 0,
        "total_price": 0,
        "tax_rate": 0,
        "tax_amount": 0,
        "discount": 0,
        "net_amount": 0
      }}
    ],
    "sub_total": 0,
    "tax_total": 0,
    "discount_total": 0,
    "total_amount": 0,
    "currency": ""
  }}
}}
"""
        response = model.generate_content(prompt)
        llm_output = response.text
        # Extract the JSON object from the model output (it may be wrapped in prose or code fences)
        json_start = llm_output.find("{")
        json_end = llm_output.rfind("}") + 1
        if json_start == -1 or json_end == 0:
            raise ValueError("No JSON object found in Gemini response")
        json_str = llm_output[json_start:json_end]
        structured_data = json.loads(json_str)
        structured_data_cache[text_hash] = structured_data
        logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return structured_data
    except Exception as e:
        logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
        return {"error": f"Gemini processing failed: {str(e)}"}
@app.post("/ocr")
async def extract_and_structure(files: List[UploadFile] = File(...)):
output_json = {
"success": True,
"message": "",
"data": []
}
success_count = 0
fail_count = 0
logger.info(f"Starting processing for {len(files)} files,
{log_memory_usage()}")
    for file in files:
        total_start_time = time.time()
        logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")
        # Validate file format
        valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}
        file_ext = os.path.splitext(file.filename.lower())[1]
        if file_ext not in valid_extensions:
            fail_count += 1
            output_json["data"].append({
                "filename": file.filename,
                "structured_data": {"error": f"Unsupported file format: {file_ext}"},
                "error": f"Unsupported file format: {file_ext}"
            })
            logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
            continue
        # Read file into memory
        try:
            file_start_time = time.time()
            file_bytes = await file.read()
            file_stream = io.BytesIO(file_bytes)
            file_hash = get_file_hash(file_bytes)
            logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
        except Exception as e:
            fail_count += 1
            output_json["data"].append({
                "filename": file.filename,
                "structured_data": {"error": f"Failed to read file: {str(e)}"},
                "error": f"Failed to read file: {str(e)}"
            })
            logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
            continue
        # Check raw text cache
        raw_text = ""
        if file_hash in raw_text_cache:
            raw_text = raw_text_cache[file_hash]
            logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
        else:
            if file_ext == '.pdf':
                # Try extracting embedded text
                try:
                    extract_start_time = time.time()
                    reader = PdfReader(file_stream)
                    for page in reader.pages:
                        text = page.extract_text()
                        if text:
                            raw_text += text + "\n"
                    logger.info(f"Embedded text extraction for {file.filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                # If no embedded text, perform OCR
                if not raw_text.strip():
                    try:
                        convert_start_time = time.time()
                        # Get the correct poppler path
                        poppler_path = get_poppler_path()
                        # Convert PDF to images with proper poppler path handling
                        if poppler_path:
                            images = convert_from_bytes(file_bytes, poppler_path=poppler_path, dpi=100)
                            logger.info(f"Using poppler path: {poppler_path}")
                        else:
                            # Try without specifying poppler_path (use system PATH)
                            images = convert_from_bytes(file_bytes, dpi=100)
                            logger.info("Using poppler from system PATH")
                        logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
                        ocr_start_time = time.time()
                        page_texts = []
                        for i, img in enumerate(images):
                            page_text = await process_pdf_page(img, i)
                            page_texts.append(page_text)
                        raw_text = "".join(page_texts)
                        logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                    except Exception as e:
                        fail_count += 1
                        error_msg = f"OCR failed: {str(e)}"
                        if "poppler" in str(e).lower():
                            error_msg += ". Please ensure Poppler is installed and accessible in PATH."
                        output_json["data"].append({
                            "filename": file.filename,
                            "structured_data": {"error": error_msg},
                            "error": error_msg
                        })
                        logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                        continue
            else:  # JPG/JPEG/PNG
                try:
                    ocr_start_time = time.time()
                    raw_text = await process_image(file_bytes, file.filename, 0)
                    logger.info(f"Image OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    fail_count += 1
                    output_json["data"].append({
                        "filename": file.filename,
                        "structured_data": {"error": f"Image OCR failed: {str(e)}"},
                        "error": f"Image OCR failed: {str(e)}"
                    })
                    logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                    continue
            # Normalize text (NFKC folds compatibility characters left by OCR)
            try:
                normalize_start_time = time.time()
                raw_text = unicodedata.normalize('NFKC', raw_text)
                raw_text_cache[file_hash] = raw_text
                logger.info(f"Text normalization for {file.filename}, took {time.time() - normalize_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
            except Exception as e:
                logger.warning(f"Text normalization failed for {file.filename}: {str(e)}, {log_memory_usage()}")
        # Process with Gemini
        structured_data = await process_with_gemini(file.filename, raw_text)
        if isinstance(structured_data, dict) and "error" in structured_data:
            # Gemini returned an error payload; count this file as failed
            fail_count += 1
            output_json["data"].append({
                "filename": file.filename,
                "structured_data": structured_data,
                "error": structured_data["error"]
            })
        else:
            success_count += 1
            output_json["data"].append({
                "filename": file.filename,
                "structured_data": structured_data,
                "error": ""
            })
        logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")
output_json["message"] = f"Processed {len(files)} files. {success_count}
succeeded, {fail_count} failed."
if fail_count > 0 and success_count == 0:
output_json["success"] = False
logger.info(f"Completed processing for {len(files)} files, {success_count}
succeeded, {fail_count} failed, {log_memory_usage()}")
return output_json
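
# Local development entry point (a minimal sketch, assuming uvicorn is installed;
# in production the app would typically be served by an ASGI server directly).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)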