Thanks for visiting codestin.com
Credit goes to www.scribd.com

0% found this document useful (0 votes)
6 views2 pages

Akk

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
6 views2 pages

Akk

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 2

import zipfile

import os
from PyPDF2 import PdfReader

def extract_zip(zip_path, extract_to):
    """Extract every member of a ZIP archive into a folder.

    Args:
        zip_path: Path to the ZIP archive to read.
        extract_to: Destination folder; ``ZipFile.extractall`` creates it
            (and any member sub-folders) if it does not exist yet.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        zipfile.BadZipFile: If ``zip_path`` is not a valid ZIP archive.
    """
    # The context manager guarantees the archive handle is closed even if
    # extraction fails part-way through.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"All files have been extracted to: {extract_to}")

def extract_data_from_pdf(pdf_path, keywords):
    """Collect the text of every PDF page that mentions a keyword.

    Matching is case-insensitive substring search. For each keyword that
    occurs on a page, that page's full text is appended (preceded by a
    newline) to the keyword's entry in the result.

    Args:
        pdf_path: Path to the PDF file to read.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A dict mapping each matched keyword (as given) to the concatenated
        text of the pages it appears on. Keywords that never match are
        absent. If the PDF cannot be processed, the error is printed and
        whatever was collected so far (possibly ``{}``) is returned.
    """
    extracted_data = {}
    try:
        # Lower-case each keyword once instead of once per page.
        lowered_keywords = [(keyword, keyword.lower()) for keyword in keywords]
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            # Guard against a page with no extractable text so one such
            # page does not abort processing of the remaining pages.
            text = page.extract_text() or ""
            if not text:
                continue
            lowered_text = text.lower()
            for keyword, needle in lowered_keywords:
                if needle in lowered_text:
                    # Separate successive pages with a real newline.
                    extracted_data[keyword] = (
                        extracted_data.get(keyword, "") + "\n" + text
                    )
    except Exception as e:
        # Deliberate best-effort: one unreadable PDF must not stop a batch
        # run, so report the problem and return what was gathered.
        print(f"Could not process {pdf_path}: {e}")
    return extracted_data

def process_files(folder_path, keywords):
    """Scan a folder tree for PDF files and print keyword-matched excerpts.

    Walks ``folder_path`` recursively. Each PDF found is passed to
    ``extract_data_from_pdf``; for every keyword that matched, a header
    and the first 500 characters of the collected text are printed. A
    message is printed for PDFs in which nothing matched.

    Args:
        folder_path: Root folder to search.
        keywords: Keyword strings forwarded to ``extract_data_from_pdf``.
    """
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            # Compare the extension case-insensitively so files named
            # like "REPORT.PDF" are not silently skipped.
            if not file.lower().endswith(".pdf"):
                continue

            pdf_path = os.path.join(root, file)
            print(f"\nProcessing: {file}")
            extracted_data = extract_data_from_pdf(pdf_path, keywords)

            if not extracted_data:
                print(f"No relevant data found in {file}")
                continue

            for keyword, content in extracted_data.items():
                print(f"\n--- {keyword.upper()} ---")
                # Display only the first 500 characters of each section.
                print(content[:500])

if __name__ == "__main__":
    # Step 1: Specify the paths
    zip_file_path = "/mnt/data/P 1.zip"  # Path to your ZIP file
    extract_to_folder = "/mnt/data/extracted_P1"  # Folder for extracted files

    # Fail with a clear message rather than a traceback when the archive
    # is not at the hard-coded location.
    if not os.path.isfile(zip_file_path):
        raise SystemExit(f"ZIP archive not found: {zip_file_path}")

    # Step 2: Extract ZIP file
    extract_zip(zip_file_path, extract_to_folder)

    # Step 3: Specify keywords to look for
    keywords_to_find = [
        "research questions",
        "summary",
        "variables",
        "methodology",
        "results and analysis",
        "mechanism behind the results",
        "takeaways",
    ]

    # Step 4: Analyze extracted files
    process_files(extract_to_folder, keywords_to_find)

You might also like