import zipfile
import os
from PyPDF2 import PdfReader
def extract_zip(zip_path, extract_to):
    """Extract every member of a zip archive into a directory.

    Args:
        zip_path: Path of the zip archive to read.
        extract_to: Directory that receives the extracted files.

    Returns:
        None. A confirmation line is printed once extraction has finished.

    Raises:
        Any error raised by ``zipfile.ZipFile`` or ``extractall`` (for
        example when the archive is missing or is not a valid zip file)
        propagates to the caller unchanged.
    """
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"All files have been extracted to: {extract_to}")
def extract_data_from_pdf(pdf_path, keywords):
    """Collect the text of every PDF page that mentions a keyword.

    Matching is a case-insensitive substring test of each keyword against
    the extracted text of each page.

    Args:
        pdf_path: Path of the PDF file to read.
        keywords: Iterable of keyword strings to search for.

    Returns:
        A dict mapping each keyword that matched at least one page to the
        text of all matching pages, in page order. Every page's text is
        preceded by a newline. Keywords that never matched are absent.
        If reading the PDF fails, the error is printed and whatever was
        collected before the failure is returned (possibly an empty dict).
    """
    # Lower-case each keyword once rather than once per page. Materialising
    # the pairs also lets `keywords` be a one-shot iterable.
    keyword_pairs = [(keyword, keyword.lower()) for keyword in keywords]
    pages_by_keyword = {}
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            # Defensive: treat a page with no extractable text (None or "")
            # as empty instead of failing the whole file on `.lower()`.
            text = page.extract_text() or ""
            lowered_text = text.lower()
            for keyword, needle in keyword_pairs:
                if needle in lowered_text:
                    pages_by_keyword.setdefault(keyword, []).append(text)
    except Exception as e:
        # Deliberate best-effort: one unreadable PDF must not stop the
        # caller from processing the remaining files.
        print(f"Could not process {pdf_path}: {e}")
    # Join once at the end; each matching page is prefixed with a newline.
    return {
        keyword: "".join("\n" + page_text for page_text in page_texts)
        for keyword, page_texts in pages_by_keyword.items()
    }
def process_files(folder_path, keywords):
    """Scan every PDF under a folder and print keyword-matched excerpts.

    The folder is walked recursively. For each file whose name ends in
    ``.pdf`` (case-insensitive), ``extract_data_from_pdf`` is called and,
    for every keyword that matched, a header and the first 500 characters
    of the collected text are printed. A notice is printed for PDFs with
    no matches.

    Args:
        folder_path: Root directory to search.
        keywords: Keywords passed through to ``extract_data_from_pdf``.

    Returns:
        None. All results are written to standard output.
    """
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            # Compare case-insensitively so "REPORT.PDF" is not skipped.
            if not file.lower().endswith(".pdf"):
                continue
            pdf_path = os.path.join(root, file)
            print(f"\nProcessing: {file}")
            extracted_data = extract_data_from_pdf(pdf_path, keywords)
            if not extracted_data:
                print(f"No relevant data found in {file}")
                continue
            for keyword, content in extracted_data.items():
                print(f"\n--- {keyword.upper()} ---")
                # Display only the first 500 characters of each section.
                print(content[:500])
if __name__ == "__main__":
    # Script entry point: unpack the archive, then scan the extracted PDFs.

    # Step 1: Specify the paths
    zip_file_path = "/mnt/data/P 1.zip"  # Path to your ZIP file
    extract_to_folder = "/mnt/data/extracted_P1"  # Folder for extracted files

    # Step 2: Extract ZIP file
    extract_zip(zip_file_path, extract_to_folder)

    # Step 3: Specify keywords to look for (matched case-insensitively)
    keywords_to_find = [
        "research questions",
        "summary",
        "variables",
        "methodology",
        "results and analysis",
        "mechanism behind the results",
        "takeaways",
    ]

    # Step 4: Analyze extracted files
    process_files(extract_to_folder, keywords_to_find)