Akk

Uploaded by

parvathisuvarnaajayan

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT or PDF, or read online on Scribd

0% found this document useful (0 votes)

6 views2 pages

Akk

Uploaded by

parvathisuvarnaajayan

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT or PDF, or read online on Scribd

You are on page 1/ 2

import zipfile

import os
from PyPDF2 import PdfReader

def extract_zip(zip_path, extract_to):
    """Extract every member of a ZIP archive into a directory.

    Args:
        zip_path: Path of the ZIP archive to read.
        extract_to: Directory that receives the extracted files.

    Returns:
        None. A confirmation line is printed once extraction has finished.
    """
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"All files have been extracted to: {extract_to}")

def extract_data_from_pdf(pdf_path, keywords):
    """Collect the text of every PDF page that mentions a keyword.

    Each page's text is extracted once and compared case-insensitively
    against every keyword. For each keyword that occurs on a page, that
    page's full text is appended to the keyword's entry, preceded by a
    newline.

    Args:
        pdf_path: Path of the PDF file to read.
        keywords: Collection of phrases to search for; it is iterated once
            per page, so it must be re-iterable (e.g. a list).

    Returns:
        A dict mapping each matched keyword (spelled as given) to the
        concatenated text of the pages containing it. The dict is empty
        when nothing matched or the file could not be read.
    """
    pages_by_keyword = {}
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            # Guard against a page yielding no text (None) so the
            # case-folding below cannot fail.
            text = page.extract_text() or ""
            lowered = text.lower()  # lower-case once per page, not per keyword
            for keyword in keywords:
                if keyword.lower() in lowered:
                    pages_by_keyword.setdefault(keyword, []).append(text)
    except Exception as e:
        # Best effort: report the failure and keep whatever was gathered
        # from the pages read before the error.
        print(f"Could not process {pdf_path}: {e}")
    # Every matching page is prefixed with a real newline separator.
    return {
        keyword: "".join("\n" + page_text for page_text in page_texts)
        for keyword, page_texts in pages_by_keyword.items()
    }

def process_files(folder_path, keywords):
    """Scan a folder tree for PDFs and print keyword-matched excerpts.

    Walks ``folder_path`` recursively. For every file whose name ends in
    ``.pdf`` (compared case-insensitively) the keyword sections are
    extracted and the first 500 characters of each are printed; a notice is
    printed instead when a file yields no matches.

    Args:
        folder_path: Root directory to search.
        keywords: Phrases to look for; passed through to
            ``extract_data_from_pdf``.

    Returns:
        None. All results are written to standard output.
    """
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            # Compare case-insensitively so names like "REPORT.PDF" are
            # not silently skipped.
            if not file.lower().endswith(".pdf"):
                continue
            pdf_path = os.path.join(root, file)
            print(f"\nProcessing: {file}")
            extracted_data = extract_data_from_pdf(pdf_path, keywords)
            if not extracted_data:
                print(f"No relevant data found in {file}")
                continue
            for keyword, content in extracted_data.items():
                print(f"\n--- {keyword.upper()} ---")
                # Display only the first 500 characters of each section.
                print(content[:500])

if __name__ == "__main__":
    # Step 1: Specify the paths
    zip_file_path = "/mnt/data/P 1.zip"  # Path to your ZIP file
    extract_to_folder = "/mnt/data/extracted_P1"  # Folder for extracted files

    # Step 2: Extract ZIP file
    extract_zip(zip_file_path, extract_to_folder)

    # Step 3: Specify keywords to look for
    keywords_to_find = [
        "research questions",
        "summary",
        "variables",
        "methodology",
        "results and analysis",
        "mechanism behind the results",
        "takeaways",
    ]

    # Step 4: Analyze extracted files
    process_files(extract_to_folder, keywords_to_find)

Module 4 Python
No ratings yet
Module 4 Python
18 pages
Final Assignment
100% (2)
Final Assignment
4 pages
Feature Extraction
No ratings yet
Feature Extraction
9 pages
Extracting Text and Images From PDF Files
No ratings yet
Extracting Text and Images From PDF Files
10 pages
Testing PDFs With Python
No ratings yet
Testing PDFs With Python
5 pages
Python PDF Data Scraping Guide
No ratings yet
Python PDF Data Scraping Guide
8 pages
Pypdf2.Pdffilewriter Python Example
No ratings yet
Pypdf2.Pdffilewriter Python Example
24 pages
Final Code For Markup
No ratings yet
Final Code For Markup
1 page
Dumppdf Py
No ratings yet
Dumppdf Py
9 pages
Python Script For PDF - Reading
No ratings yet
Python Script For PDF - Reading
2 pages
ZIP To PDF Converter Document
No ratings yet
ZIP To PDF Converter Document
5 pages
Pseudocodes and Flowcharts (Riyansha Shahare)
No ratings yet
Pseudocodes and Flowcharts (Riyansha Shahare)
14 pages
Updated Code That Flags Faulty Jpgs
No ratings yet
Updated Code That Flags Faulty Jpgs
3 pages
Python Handwriting Recognition Guide
No ratings yet
Python Handwriting Recognition Guide
31 pages
Automated PDF Summarization & Extraction
No ratings yet
Automated PDF Summarization & Extraction
6 pages
1 Notmnist - Ipynb
No ratings yet
1 Notmnist - Ipynb
15 pages
PDF Processor
No ratings yet
PDF Processor
4 pages
Allcodes
No ratings yet
Allcodes
36 pages
SFP Filemeta
No ratings yet
SFP Filemeta
4 pages
Advanced Python: Files & Regex
No ratings yet
Advanced Python: Files & Regex
12 pages
Context Managers
No ratings yet
Context Managers
3 pages
TASK1.Ipynb - Colab
No ratings yet
TASK1.Ipynb - Colab
4 pages
Komenda
No ratings yet
Komenda
3 pages
Penetration Testing Guide
100% (1)
Penetration Testing Guide
6 pages
PDF Explination
No ratings yet
PDF Explination
3 pages
Lecture Week 5-Data Analytics-Data Scraping and Data Wrangling
No ratings yet
Lecture Week 5-Data Analytics-Data Scraping and Data Wrangling
15 pages
PSPDF Title Extraction Explanation
No ratings yet
PSPDF Title Extraction Explanation
3 pages
Long Docs
No ratings yet
Long Docs
8 pages
Python Programs for File Operations and Data Handling
No ratings yet
Python Programs for File Operations and Data Handling
10 pages
Python Module 4 IMP Questioons
No ratings yet
Python Module 4 IMP Questioons
9 pages
Create Edit PDF App in Python
No ratings yet
Create Edit PDF App in Python
3 pages
Module 4 Notes
No ratings yet
Module 4 Notes
9 pages
Flask Project
No ratings yet
Flask Project
5 pages
SDLC Document
No ratings yet
SDLC Document
15 pages
AI Over PDF Library
No ratings yet
AI Over PDF Library
2 pages
Create - Folder - If - Not - Exists: STR None
No ratings yet
Create - Folder - If - Not - Exists: STR None
5 pages
Ballerono Cappuchino
No ratings yet
Ballerono Cappuchino
10 pages
SDLC File New
No ratings yet
SDLC File New
15 pages
Extractor de Imagenes en PDF
No ratings yet
Extractor de Imagenes en PDF
3 pages
7pgm Python
No ratings yet
7pgm Python
4 pages
Lab Manual Python
No ratings yet
Lab Manual Python
14 pages
Assignment 4
No ratings yet
Assignment 4
3 pages
25 Awesome Python Scripts
No ratings yet
25 Awesome Python Scripts
26 pages
Python Practicals
No ratings yet
Python Practicals
4 pages
Python Lab Program 7
No ratings yet
Python Lab Program 7
3 pages
Lecture 31-Document GPT Hands On
No ratings yet
Lecture 31-Document GPT Hands On
18 pages
Json Processor
No ratings yet
Json Processor
4 pages
Make Link
No ratings yet
Make Link
2 pages
True False: Complete Path Root Directory
No ratings yet
True False: Complete Path Root Directory
2 pages
File Handling and String Formatting
No ratings yet
File Handling and String Formatting
8 pages
Python Assignment 03
No ratings yet
Python Assignment 03
11 pages
PDF Extraction Flow Document Part1
No ratings yet
PDF Extraction Flow Document Part1
2 pages
Project X
No ratings yet
Project X
10 pages
Extracting Text From PDF Files and Printing New Lines in Python
No ratings yet
Extracting Text From PDF Files and Printing New Lines in Python
10 pages
CT5194 - Malware Lab 2
No ratings yet
CT5194 - Malware Lab 2
4 pages
IRT Lab Programs
No ratings yet
IRT Lab Programs
9 pages
Python Tutorial: Tarfile Module
100% (6)
Python Tutorial: Tarfile Module
4 pages

Akk

Uploaded by

Akk

Uploaded by

import zipfile

def extract_zip(zip_path, extract_to):

def extract_data_from_pdf(pdf_path, keywords):

def process_files(folder_path, keywords):

# Step 2: Extract ZIP file

# Step 3: Specify keywords to look for

# Step 4: Analyze extracted files

You might also like