Look at this code: it handles pagination correctly and scrapes all the products right through to the last page.
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import re
class VetnpetdirectSpiderSpider(scrapy.Spider):
name = "VetnPetDirect_spider"
allowed_domains = ["www.vetnpetdirect.com.au"]
start_urls = ["https://www.vetnpetdirect.com.au/collections/dog-food-complete-
diets"]
# Track visited pages to avoid duplicates
visited_pages = set()
def __init__(self):
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Set a realistic user agent
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0;
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
# Initialize the webdriver
self.driver = webdriver.Chrome(
service=Service(r"C:\\Users\\Hp\\.wdm\\drivers\\chromedriver\\win64\\
chromedriver-win64\\chromedriver.exe"),
options=chrome_options
)
self.driver.set_window_size(1920, 1080) # Set a reasonable window size
super().__init__()
# Add counter for products
self.product_count = 0
self.current_page = 1
self.max_pages = 5 # Limit to 5 pages as per the pagination you showed
def parse(self, response):
# Check if we've already visited this URL to avoid duplicates
if response.url in self.visited_pages:
self.logger.info(f"Already visited {response.url}, skipping...")
return
self.visited_pages.add(response.url)
# Open the URL with Selenium
self.driver.get(response.url)
self.logger.info(f"Loading page: {response.url}")
# Wait for the page to load fully
time.sleep(10) # Increased wait time to ensure JS loads completely
# Check if we're on a product listing page by looking for the product grid
try:
# First check what product elements are present on the page
# Let's try different selectors
selectors_to_try = [
'li.snize-product',
'div.product-item',
'div.grid__item',
'div.product-card',
'div.product'
]
products = []
used_selector = None
for selector in selectors_to_try:
products = self.driver.find_elements(By.CSS_SELECTOR, selector)
if products:
used_selector = selector
self.logger.info(f"Found products using selector: {selector}")
break
if not products:
# If no products found, dump the page source for debugging
self.logger.warning("No products found with any selector. Page
structure may have changed.")
self.logger.info(f"Current URL: {self.driver.current_url}")
with open(f"page_source_debug_{self.current_page}.html", "w",
encoding="utf-8") as f:
f.write(self.driver.page_source)
return
page_product_count = len(products)
self.product_count += page_product_count
self.logger.info(f"Page {self.current_page}: Found {page_product_count}
products. Total so far: {self.product_count}")
# Extract data from each product
for product in products:
item = {}
try:
# Try different selectors for title
title_selectors = ['span.snize-title', 'h2', '.product-card__title', '.product-title']
title = None
for selector in title_selectors:
try:
title_element = product.find_element(By.CSS_SELECTOR,
selector)
title = title_element.text
if title:
item['Product_Title'] = title
break
except NoSuchElementException:
continue
# Try different selectors for link
link_selectors = ['a.snize-view-link', 'a', '.product-card a']
link_url = None
for selector in link_selectors:
try:
link_element = product.find_element(By.CSS_SELECTOR,
selector)
link_url = link_element.get_attribute('href')
if link_url:
item['Title_Link'] = link_url
break
except NoSuchElementException:
continue
# Try different selectors for image
img_selectors = ['span.snize-thumbnail img', 'img', '.product-image img']
image_url = None
for selector in img_selectors:
try:
img_element = product.find_element(By.CSS_SELECTOR,
selector)
image_url = img_element.get_attribute('src')
if image_url:
item['Image_URL'] = image_url
break
except NoSuchElementException:
continue
# Only yield if we found at least some data
if item:
yield item
except Exception as e:
self.logger.error(f"Error extracting product data: {e}")
# Handle pagination
if self.current_page < self.max_pages:
try:
# Try to construct the next page URL directly based on pattern
next_page_num = self.current_page + 1
base_url = re.sub(r'\?tab=products&page=\d+', '',
self.driver.current_url)
base_url = base_url.rstrip('/')
next_page_url = f"{base_url}?tab=products&page={next_page_num}"
self.logger.info(f"Constructed next page URL: {next_page_url}")
if next_page_url not in self.visited_pages:
self.current_page = next_page_num
self.logger.info(f"Moving to page {self.current_page}")
# Create a new request for the next page
yield scrapy.Request(
url=next_page_url,
callback=self.parse,
dont_filter=True
)
else:
self.logger.info(f"Already visited {next_page_url}, ending
pagination.")
except Exception as e:
self.logger.error(f"Error during pagination: {e}")
except Exception as e:
self.logger.error(f"Error in parsing page: {e}")
def closed(self, reason):
# Close the browser when spider is closed
self.driver.quit()
self.logger.info(f"Spider closed. Total products scraped:
{self.product_count}")
But when I run this code on EC2, only 83 products get scraped, which means the pagination is not being handled and the products are not being scraped properly either. Please give me the full, complete code; the code above uses the sleep-based logic for pagination. Here is the pagination HTML from the page:
<div class="snize-pagination" role="navigation" aria-label="Pagination"
style="width: 770.3px;"><ul><li><span class="snize-pagination-prev disabled" aria-
hidden="true"></span></li><li><a
href="https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets?
&page=1" class="active" aria-current="page" rev="1"
data-no-instant="true">1</a></li><li><a
href="https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets?
&page=2" rev="2" data-no-instant="true">2</a></li><li><a
href="https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets?
&page=3" rev="3" data-no-instant="true">3</a></li><li><span aria-
hidden="true">...</span></li><li><a
href="https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets?
&page=5" rev="5" data-no-instant="true">5</a></li><li><a
href="https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets?
&page=5" class="snize-pagination-next" rev="2" data-no-instant="true" aria-
label="Next page"></a></li></ul></div>
The code must sleep and scrape all of the next pages correctly, including the product variations.
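For reference, a minimal standalone sketch of that idea, assuming Selenium 4+ can locate chromedriver on its own, the .snize-pagination markup shown above, and fixed 10-second sleeps; it is only a sketch of the pagination loop, not the full spider:

# Minimal pagination sketch (standalone, not the full spider).
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

BASE = "https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets"

driver = webdriver.Chrome()
driver.get(BASE)
time.sleep(10)  # let the Searchanise (snize) widget render the first page

# The pagination bar shows numbered links (1, 2, 3, ..., 5); take the highest number.
page_numbers = [int(a.text) for a in driver.find_elements(By.CSS_SELECTOR, ".snize-pagination a")
                if a.text.strip().isdigit()]
last_page = max(page_numbers) if page_numbers else 1

for page in range(1, last_page + 1):
    driver.get(f"{BASE}?&page={page}")  # same URL pattern as the hrefs in the markup above
    time.sleep(10)  # sleep so each page loads before scraping
    products = driver.find_elements(By.CSS_SELECTOR, "li.snize-product")
    print(f"page {page}: {len(products)} products")

driver.quit()

The full spider follows below.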
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from scrapy import signals
import time
import re
import traceback
class VetnpetdirectSpiderSpider(scrapy.Spider):
name = "VetnPetDirect_spider"
allowed_domains = ["www.vetnpetdirect.com.au","proxy.scrapeops.io"]
# Category URLs with display names and hardcoded pet types
category_urls = [
('Food', 'collections/dog-food-complete-diets', 'Dog'),
# ('Treats', 'collections/dog-treats', 'Dog'),
# ('Toys', 'collections/dog-toys', 'Dog'),
# ('Accessories', 'collections/dog-accessories', 'Dog'),
# ('Food', 'collections/cat-food', 'Cat'),
# ('Treats', 'collections/cat-treats', 'Cat'),
# ('Toys', 'collections/cat-toys', 'Cat'),
# ('Accessories', 'collections/cat-accessories', 'Cat')
]
# Known brand names list
brand_names = [
"LifeWise", "lovebites","Absolute Holistic","4CYTE","3M",
"Abbey Animal Health", "Absolute Pet", "Absorbine", "Adaptil", "Advantage",
"Advantix", "Advocate", "Aesculap", "Agmin Chelates", "Alfalfa King", "All Barks",
"All For Paws", "All Fur You", "ALLPET", "Alogard", "Alto", "Alto Lab", "Anarchy",
"Animal Works Nutrition", "Animalintex", "Animals Like Us", "Animology", "AniPal",
"Anitone", "Antinol Australia", "Ants Off", "Apex Laboratories", "Apex Tools",
"API", "Aqua Buddy", "Aqua Zonic", "Aquasonic", "Aristopet", "Arm & Hammer",
"Aromadog", "Ausrichter", "Aussie Pet Products", "Avione", "Avitrol", "Bainbridge",
"Balanced Life", "Banana Feeds Australia", "Barastoc", "Baxter", "Baycox", "Bayer",
"BD", "Bear Bear", "Beau Pets", "Beco", "Becton Dickinson", "Betadine", "Big Dog",
"Bio-Groom", "Bionic", "BIOpet", "Bioscape", "BirdLife", "Black Dog Wear", "Black
Hawk", "Blackdog", "Blue Planet", "Boehringer Ingelheim", "Borotto", "Braun",
"Bravecto", "Breeder's Choice", "BSN Medical", "Burgess", "Burgon & Ball",
"Buster", "Busy Buddy", "Caitec", "Camon", "Capstar", "CaribSea", "Catit",
"CatMate", "Cazitel", "CEN Nutrition", "Ceva", "CheckUp", "Chipsi", "Chuckit!",
"CLEAR Dog Treats", "Coachi", "Company of Animals", "Compost-A-Pak", "Continuum
Aquatics", "Coopers", "CopRice", "Country Heritage", "Covetrus", "Cowboy Magic",
"Creative Foraging", "Credelio Plus", "Crooked Lane", "Crown", "Cruiser Bowl",
"Crystalfix", "CSI", "Cydectin", "Davis", "Decron", "Dectomax", "Dentipet", "Derma
Gel", "Dermcare", "Dermoscent Laboratorie", "Designer Collection", "DGS Products",
"Di-Vetelact", "Diamond Cut", "Dimmitrol", "Dine-a-Chook", "Diversey Cleaning",
"Doc & Phoebe", "Dog Rocks", "Dog Treat Naturals", "Doggylicious", "DOOG",
"Drinkwell", "Drontal", "Drool", "Durex", "Dynavyte", "EAC Animal Care", "Earthz
Pet", "Eco Tech", "EcoPellets Tasmania", "Elanco", "EnviroSafe", "Equi-Prene",
"Equinade", "Equine Health Science", "Equine Pure", "Equine Vit&Min", "Equitex",
"Eurofarm", "EVO Lifestyle Products", "Excel", "Exo Terra", "Ezi-LockOdour",
"EzyDog", "EzyGrip", "F10", "Fantasmic", "Farmalogic", "Farmhand", "Farnam",
"Feathered Friends", "Featherland Paradise", "Feed-O-Matic", "Feel Good Doggo",
"Feline Natural", "Feliway", "Fenpral", "Fido", "Fido's", "Filta-Bac", "FitNFlash",
"Fjord Manufacturing", "Fleet", "Flexi", "Flipper", "Fluval", "Flyveils By Design",
"Freezy Paws", "Frontier Pets", "Frontline", "FURminator", "Furriends", "FuzzYard",
"Genia", "Genial", "Gentle Leader", "Giddy Citizen", "GiGwi", "Glandex", "Global
Vet", "Glow Groom", "Glyde", "GMV", "GO CAT", "Golp", "Greenies", "Guru Pet
Company", "GVP", "Hagen", "HandsOn", "Hayes", "HayPigs", "HeartGard", "Hemp
Collective", "HempPet", "Henry Schein", "Heritage Downs", "Hi Form", "Hill's",
"Hill's Pet Nutrition", "Hill's Prescription Diet", "HomeoPet", "Horse Health
Products", "Horse Hydrator", "Horsemaster", "Huds and Toke", "HugSmart", "Hunter
River Company", "Huskimo", "Hygain", "Hypro Pet Foods", "IAH", "Ibiyaya", "Ice N
Easy", "Inca", "IncrediBUBBLES", "independents Own", "Industrial Dog",
"Interceptor", "Intervet", "IO", "Ipromea", "Isle & Skye", "Jenquine", "JerHigh",
"Joint Guard", "Joseph Lyddy", "Jurox", "Juwel", "JW Pet", "K & H Pet Products",
"K9 Natural", "Kelato", "KER", "Ketchum", "Kiltix", "Kit Cat", "Kitter", "Kitty
Play", "Kiwi Kitchens", "KLEO", "Kohnke's", "KONG", "KoolMaster", "Krutex",
"Kruuse", "Kumfi", "Kurgo", "La Chanvriere", "Laucke", "Leather Therapy", "Lee's",
"Lenny's Kitchen", "Leovet", "Lickables", "Lickimat", "Life Data Labs", "LifeWise
Pet Nutrition", "Likit", "Little Giant", "Livamol", "Livingstone", "Love'em",
"Lulu's Kitchen", "LupinePet", "Lyppard", "Magictails", "Mane 'n Tail", "Marina",
"Masterpet", "Mavlab", "Max & Molly", "Meals For Mutts", "Medibit", "Melanie
Newman", "Melcare", "Merial", "Midwest", "Milbemax", "Millers Forge", "Mimi &
Munch", "Minrosa", "Miscellaneous", "Mog & Bone", "Mr Sticky", "Mr. Fothergill's",
"MSD", "Multipet", "Mustad", "My Family", "MyBestMate", "MyEcoBag", "myPet
Magazine", "Natural Animal Solutions", "Natural Health NZ", "Nature's Botanical",
"Nature's Miracle", "Neove Pharma Australia", "Nerf", "NexGard", "NexGard Spectra",
"Nina Ottosson", "NJ Phillips", "NORTHMATE", "NRG", "Nuheart", "Nutrafin",
"Nutramax", "Nutrimol", "Nylabone", "NYOS Aquatics", "O'TOM", "Oakwood", "Ocean
Nutrition", "ODR", "Odzon", "Oh Crap", "Olsson's", "Oralx", "OraVet", "Organica",
"Outward Hound", "Oxbow", "Oz Pet", "OzHemp", "ParaGard", "Passwell", "PatPet",
"PAW", "Paw Ready", "Pawsome Organics", "Peckish", "Penn-Plax Reptology", "Pet
DreamHouse", "Pet Drs", "Pet Relax", "Pet Teezer", "pet+me", "Pet-Rite", "PET-TEK",
"PetArk", "Petkin", "PetQwerks", "Petrageous", "PetSafe", "Petstages",
"Pharmachem", "Phud's", "Pioneer", "Plutos", "Polly's", "Polyp Lab", "Pomms",
"POOWEE!", "Poseidon Animal Health", "Pottie's", "Prestige Pet", "Prestige Snuggle
Pals", "Prime Pantry", "Prime100", "Pritchard", "Pro-Dosa", "PRO-TRAINER",
"Profender", "ProN8ure", "Proteq", "Protexin", "Proudi", "PROVET", "Provex",
"ProviCo", "Proviro Group", "Prozym", "PuppyPlay", "Pure Life", "Quirky Kitty",
"Racing Blue", "Ranvet", "Raw Pawz", "Red Healer", "Reptile Publications",
"Revolution", "Ridley", "River Systems", "Roche", "Rocky Point", "Rogz", "Rose-Hip
Vital", "Rover Pet Products", "Royal Canin", "Royal Show", "Ruddock's", "Ruffey",
"Ruffwear", "Rufus & Coco", "RWR", "Saddlery Trading Company", "Sandlube", "Sashas
Blend", "SavourLife", "ScoopFree", "Scream", "SeaFlex", "Sentinel Spectrum",
"Serenity", "Seresto", "Shark Net", "Shear Magic", "Shoof", "ShowMaster", "Silvan",
"Simcro", "Simparica", "SmartCat", "Smith & Nephew", "Snax", "Snooza",
"SnuggleSafe", "SodaPup", "Solo", "Sporn", "Spotnik", "Springer-Magrath", "Spunky
Pup", "Stance Equitec", "STARMARK", "StaySound", "Staywell", "STC", "Sticky Paws",
"Stockbrands", "Stockman & Paddock", "Super Bird Creations", "SuperBird", "Superior
Pet Goods", "Superior Shavings", "Sure Petcare", "SUREFEED", "SUREFLAP", "Swann-
Morton", "Sykes", "Tankmaid", "Tasty Bone", "Tensoplast", "Terumo", "Tetra", "The
Art of Whole Food", "The Canny Company", "The NZ Natural Pet Food Company", "The
Pet Loo", "Thundershirt", "Thunderworks", "Tiger Brands", "Triplepet",
"TropiClean", "Trouble & Trix", "Trough Rocks", "Troy", "TuffRock", "Tuffy", "Two
Little Fishies", "Ultra", "Urine Off", "USA-K9", "Value Plus", "Valuheart",
"Varco", "Veggiedent", "Veredus", "Vet's All Natural", "Vet's Best", "vet-n-pet
DIRECT", "Vetafarm", "Vetforce", "Vetgold", "Vetnex Pet Care", "Vetopop",
"Vetoquinol", "VetPen", "VetRx", "Vetsense", "Virbac", "Virkon", "Vision", "Vital
SupaSnax", "Wagg & Purr", "Wags & Wiggles", "Wahl", "Water & Woods", "Weaver",
"Wellness Pet Company", "West Paw", "Whimzees", "Whiskers & Wiggles", "Wild
Hibiscus Flower Company", "Wombaroo", "Worlds Best Hoof Oil", "WSD", "Y-Tex",
"Yeowww!", "Yours Droolly", "ZEEZ", "zenpet", "Zippy Claws", "Zippy Paws", "ZIWI",
"Zoetis", "Zoo Med", "ZooPets", "Zychem Technologies"
]
# Track visited pages to avoid duplicates
visited_pages = set()
def __init__(self, *args, **kwargs):
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Enable headless mode for AWS
EC2
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0;
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
# Initialize the webdriver
self.service = Service(r"/usr/local/bin/chromedriver")
self.driver = None
self.options = chrome_options
# Add counter for products
self.product_count = 0
self.current_page = 1
# Add variables for category tracking
self.current_category_index = 0
self.processed_products = set()
self.pending_requests = {}
# EC2 optimization parameters
self.page_load_wait = 30 # Increased wait time for EC2
self.request_delay = 3 # Delay between requests
super(VetnpetdirectSpiderSpider, self).__init__(*args, **kwargs)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(VetnpetdirectSpiderSpider, cls).from_crawler(crawler, *args,
**kwargs)
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
return spider
def spider_opened(self, spider):
self.driver = webdriver.Chrome(service=self.service, options=self.options)
def spider_closed(self, spider):
if self.driver:
self.driver.quit()
def start_requests(self):
# Start with only the first category
if self.category_urls:
category_name, category_path, pet_type = self.category_urls[self.current_category_index]
url = f"https://www.vetnpetdirect.com.au/{category_path}"
self.logger.info(f"Starting with category: {category_name} for
{pet_type}")
# Initialize pending requests counter for this category
self.pending_requests[category_name] = 1  # Start with 1 for the initial request
yield scrapy.Request(
url=url,
callback=self.parse,
meta={
'category_name': category_name,
'pet_type': pet_type,
'dont_filter': True
}
)
def parse(self, response):
# Get the category name and pet type from meta
category_name = response.meta.get('category_name')
pet_type = response.meta.get('pet_type')
# Check if we've already visited this URL to avoid duplicates
if response.url in self.visited_pages:
self.logger.info(f"Already visited {response.url}, skipping...")
self.pending_requests[category_name] -= 1
next_request = self.check_category_completion(category_name)
if next_request:
yield next_request
return
self.visited_pages.add(response.url)
self.logger.info(f"Parsing category: {category_name} for pet type:
{pet_type} - Page {self.current_page}")
# Initialize the driver if it's not already initialized
if not hasattr(self, 'driver') or self.driver is None:
self.driver = webdriver.Chrome(service=self.service,
options=self.options)
# Open the URL with Selenium
self.driver.get(response.url)
self.logger.info(f"Loading page: {response.url}")
# Wait for the page to load fully
time.sleep(self.page_load_wait)
# Check if we're on a product listing page by looking for the product grid
try:
# Wait for the body to be fully loaded
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
# First check what product elements are present on the page
# Let's try different selectors
selectors_to_try = [
'li.snize-product',
'div.product-item',
'div.grid__item',
'div.product-card',
'div.product'
]
products = []
used_selector = None
# Try each selector with explicit wait
for selector in selectors_to_try:
try:
# Use explicit wait for each selector
products = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
if products:
used_selector = selector
self.logger.info(f"Found products using selector:
{selector}")
break
except TimeoutException:
continue
if not products:
# Check if we're on the last page (no products found)
self.logger.warning("No products found on this page. This might be
the last page.")
self.logger.info(f"Current URL: {self.driver.current_url}")
with open(f"page_source_debug_{category_name}_{self.current_page}.html", "w", encoding="utf-8") as f:
f.write(self.driver.page_source)
# Decrement pending requests for this page since we're done with it
self.pending_requests[category_name] -= 1
next_request = self.check_category_completion(category_name)
if next_request:
yield next_request
return
page_product_count = len(products)
self.product_count += page_product_count
self.logger.info(f"Page {self.current_page}: Found {page_product_count}
products. Total so far: {self.product_count}")
# Increment pending requests counter for product detail pages
self.pending_requests[category_name] += page_product_count
# Store product data first to avoid stale element issues
product_data_list = []
# First collect basic data from the list page
for index, product in enumerate(products):
try:
item = {}
# Extract link first as we'll need it to visit the product page
link_url = None
link_selectors = ['a.snize-view-link', 'a', '.product-card a']
for selector in link_selectors:
try:
link_element = product.find_element(By.CSS_SELECTOR,
selector)
link_url = link_element.get_attribute('href')
if link_url:
item['Title_Link'] = link_url
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Skip if we've already processed this product URL
if link_url in self.processed_products:
self.logger.info(f"Skipping already processed product URL:
{link_url}")
self.pending_requests[category_name] -= 1
continue
# Add to processed set
if link_url:
self.processed_products.add(link_url)
# Only collect other data if we found a link
if link_url:
# Try different selectors for title
title_selectors = ['span.snize-title', 'h2', '.product-card__title', '.product-title']
for selector in title_selectors:
try:
title_element = product.find_element(By.CSS_SELECTOR, selector)
title = title_element.text
if title:
item['Product_Title'] = title
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Try different selectors for image
img_selectors = ['span.snize-thumbnail img', 'img',
'.product-image img']
for selector in img_selectors:
try:
img_element = product.find_element(By.CSS_SELECTOR,
selector)
image_url = img_element.get_attribute('src')
if image_url:
item['Image_URL'] = image_url
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Check if product is out of stock
try:
out_of_stock = product.find_element(By.CSS_SELECTOR,
'.snize-out-of-stock')
if out_of_stock:
item['Stock_Status'] = "Out Of Stock"
else:
item['Stock_Status'] = "In Stock"
except (NoSuchElementException,
StaleElementReferenceException):
item['Stock_Status'] = "In Stock" # Default to in
stock if no indicator found
# Determine brand name from product title
if 'Product_Title' in item:
brand_name = "N/A"
for brand in self.brand_names:
if brand.lower() in item['Product_Title'].lower():
brand_name = brand
break
item['Brand_Name'] = brand_name
# Add category information
item['Product_Category'] = category_name
item['Pet_Type'] = pet_type
# Add default fields
item['Description'] = "Not Available"
item['Lifestage'] = "Not Available"
# Add product to our list to process
product_data_list.append(item)
except Exception as e:
self.logger.error(f"Error extracting basic product data: {e}")
self.pending_requests[category_name] -= 1  # Decrement for failed product
# Now visit each product page to get prices and check for variations
for item in product_data_list:
if 'Title_Link' in item:
# Follow the product link to get price and check for variations
yield scrapy.Request(
item['Title_Link'],
callback=self.parse_product,
meta={
'item': item,
'category_name': category_name,
'pet_type': pet_type
}
)
# Add small delay between product requests to avoid overwhelming EC2
time.sleep(self.request_delay)
# Handle pagination - try to find the next page
has_next_page = False
try:
# Check for next page link first
next_page_selectors = [
'.pagination .next a',
'.pagination-custom .pagination-next a',
'a.pagination__next',
'a[rel="next"]'
]
next_page_link = None
for selector in next_page_selectors:
try:
next_links = self.driver.find_elements(By.CSS_SELECTOR,
selector)
if next_links:
next_page_link = next_links[0].get_attribute('href')
if next_page_link:
has_next_page = True
break
except Exception:
continue
# If direct link not found, try to construct the next page URL
if not has_next_page:
# Try to construct the next page URL directly based on pattern
next_page_num = self.current_page + 1
base_url = re.sub(r'\?tab=products&page=\d+', '',
self.driver.current_url)
base_url = base_url.rstrip('/')
next_page_url = f"{base_url}?tab=products&page={next_page_num}"
# Check if this page exists by looking for a pagination element with this number
try:
page_numbers = self.driver.find_elements(By.CSS_SELECTOR,
'.pagination li a, .pagination__number')
for page_elem in page_numbers:
if page_elem.text.strip() == str(next_page_num):
next_page_link = next_page_url
has_next_page = True
break
except Exception:
# If we can't confirm via pagination, let's try the constructed URL anyway
next_page_link = next_page_url
has_next_page = True
if has_next_page and next_page_link and next_page_link not in self.visited_pages:
self.logger.info(f"Found next page URL for {category_name}: {next_page_link}")
# Increment counter for next page
self.pending_requests[category_name] += 1
self.current_page = self.current_page + 1
# Create a new request for the next page
yield scrapy.Request(
url=next_page_link,
callback=self.parse,
meta={
'category_name': category_name,
'pet_type': pet_type,
'dont_filter': True
}
)
else:
self.logger.info(f"No next page found for {category_name} or
already visited. Ending pagination.")
# Reset page counter for next category
self.current_page = 1
except Exception as e:
self.logger.error(f"Error during pagination: {e}")
self.logger.error(traceback.format_exc())
# Reset page counter for the next category
self.current_page = 1
# Decrement pending requests counter for this page
self.pending_requests[category_name] -= 1
self.logger.info(f"Remaining requests for {category_name}:
{self.pending_requests[category_name]}")
# Check if we've completed this category
next_request = self.check_category_completion(category_name)
if next_request:
yield next_request
except Exception as e:
self.logger.error(f"Error in parsing page: {e}")
self.logger.error(traceback.format_exc())
# Decrement pending requests for this failed page
self.pending_requests[category_name] -= 1
next_request = self.check_category_completion(category_name)
if next_request:
yield next_request
def parse_product(self, response):
# Get the item that was collected in the first parse
base_item = response.meta['item']
category_name = response.meta['category_name']
self.logger.info(f"Parsing product: {base_item['Product_Title']}")
try:
# Process this product (will handle variations if present)
product_url = response.url
items = list(self.process_product_page(product_url, base_item))
# Yield all items
for item in items:
yield item
except Exception as e:
self.logger.error(f"Error processing product detail page: {e}")
self.logger.error(traceback.format_exc())
# Still yield the base item as fallback
yield base_item
finally:
# Decrement pending requests counter for this product
self.pending_requests[category_name] -= 1
self.logger.info(f"Remaining requests for {category_name}:
{self.pending_requests[category_name]}")
# Check if we've completed this category
next_request = self.check_category_completion(category_name)
if next_request:
yield next_request
def check_category_completion(self, category_name):
"""Check if a category is complete and move to the next one if needed"""
# If there are no more pending requests for this category, move to the next
if self.pending_requests.get(category_name, 0) <= 0:
self.logger.info(f"✅ COMPLETED CATEGORY: {category_name}")
# Move to the next category
self.current_category_index += 1
# Reset page counter for next category
self.current_page = 1
if self.current_category_index < len(self.category_urls):
# Start the next category
next_category_name, next_category_path, next_pet_type = self.category_urls[self.current_category_index]
next_url = f"https://www.vetnpetdirect.com.au/{next_category_path}"
self.logger.info(f"🚀 MOVING TO NEXT CATEGORY: {next_category_name}
for {next_pet_type}")
# Initialize pending requests counter for the new category
self.pending_requests[next_category_name] = 1
# Schedule the request for the next category
return scrapy.Request(
url=next_url,
callback=self.parse,
meta={
'category_name': next_category_name,
'pet_type': next_pet_type,
'dont_filter': True
}
)
else:
self.logger.info("🎉 ALL CATEGORIES HAVE BEEN PROCESSED")
return None
return None
def process_product_page(self, url, base_item):
"""Process a product page, handling variations if present"""
max_retries = 3
for attempt in range(max_retries):
try:
# Load the product page
self.driver.get(url)
self.logger.info(f"Loading product page: {url}")
# Wait for page to load
time.sleep(self.page_load_wait)
# Wait for product detail elements to be present
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,
'body')))
# Try to get description if available
try:
description_selectors = [
'.product-description',
'.product__description',
'.rte',
'[itemprop="description"]'
]
for selector in description_selectors:
try:
desc_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
if desc_elements:
description = desc_elements[0].text.strip()
if description:
base_item['Description'] = description
break
except Exception:
continue
except Exception as e:
self.logger.warning(f"Error extracting description: {e}")
# First check if there are variations
variation_selects = []
try:
# Look for variation selectors - try different types
selectors_to_try = [
'select.single-option-selector',
'.selector-wrapper select',
'select[data-variant-option]'
]
for selector in selectors_to_try:
selects = self.driver.find_elements(By.CSS_SELECTOR,
selector)
if selects:
variation_selects = selects
self.logger.info(f"Found {len(selects)} variation
selectors using {selector}")
break
if not variation_selects:
# No variations, just get the regular price
price_selectors = [
'span.price span.money',
'span.price--sale span.money',
'.product-single__price',
'.price',
'[itemprop="price"]'
]
price = None
for selector in price_selectors:
try:
price_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
if price_elements:
price = price_elements[0].text.strip()
if price:
# Clean price text (remove dollar sign)
price = price.replace('$', '').strip()
base_item['Regular_Price'] = price
break
except Exception as e:
self.logger.warning(f"Error extracting price with
selector {selector}: {e}")
if not price:
base_item['Regular_Price'] = "Price not available"
self.logger.info(f"No variations found, yielding single
product with price {price}")
yield base_item
else:
# Process all variations
yield from self.process_variations(variation_selects,
base_item)
except Exception as e:
self.logger.error(f"Error in product page processing: {e}")
# On error, yield base item as fallback
yield base_item
# Successfully processed this item, break retry loop
break
except Exception as e:
if "stale element reference" in str(e) and attempt < max_retries -
1:
self.logger.warning(f"Stale element on attempt {attempt+1},
retrying: {base_item.get('Product_Title')}")
time.sleep(3) # Wait before retry
continue
else:
self.logger.error(f"Failed to process product after
{max_retries} attempts: {e}")
# Return the base item if processing fails
yield base_item
def process_variations(self, variation_selects, base_item):
"""Process all variations for a product"""
# Initialize variation processing
all_variations = []
# Process the first variation dropdown
if variation_selects:
try:
select_element = variation_selects[0]
select = Select(select_element)
wait = WebDriverWait(self.driver, 10)
# Get all options in this dropdown
options = select.options
for option in options:
try:
# Get the option text/value
option_value = option.text.strip()
if not option_value or option_value.lower() == "choose an
option":
continue
# Select this option
select.select_by_visible_text(option_value)
# Wait for price to update
time.sleep(3)
self.wait_for_page_to_stabilize()
# Create a new item for this variation
variation_item = base_item.copy()
# Update title to include variation
variation_item['Product_Title'] = f"{base_item['Product_Title']} - {option_value}"
# Check stock status after selecting this variation
try:
out_of_stock_elements = self.driver.find_elements(By.CSS_SELECTOR, '.snize-out-of-stock, .sold-out')
if out_of_stock_elements:
variation_item['Stock_Status'] = "Out Of Stock"
else:
variation_item['Stock_Status'] = "In Stock"
except Exception:
pass # Keep existing stock status if check fails
# Get the updated price with extended selectors
price_selectors = [
'span.price span.money',
'span.price--sale span.money',
'.product-single__price',
'.price',
'[itemprop="price"]'
]
price = None
for selector in price_selectors:
try:
# Wait explicitly for price elements
try:
price_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
price = price_element.text.strip()
except TimeoutException:
# Try direct find if wait fails
price_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
if price_elements:
price = price_elements[0].text.strip()
if price:
# Clean price text (remove dollar sign)
price = price.replace('$', '').strip()
variation_item['Regular_Price'] = price
break
except Exception as e:
self.logger.warning(f"Error extracting price for
variation {option_value} with selector {selector}: {e}")
if not price:
variation_item['Regular_Price'] = "Price not available"
self.logger.info(f"Processed variation: {option_value} with
price {price}")
# Add to our list and yield
all_variations.append(variation_item)
yield variation_item
except Exception as e:
self.logger.error(f"Error processing variation option
{option_value if 'option_value' in locals() else 'unknown'}: {e}")
except Exception as e:
self.logger.error(f"Error processing variations: {e}")
# If we encounter an error, yield the base item as fallback
yield base_item
# If no variations were successfully processed, yield the base item
if not all_variations:
yield base_item
def wait_for_page_to_stabilize(self, timeout=5):
"""Wait for page to finish any animations or AJAX calls"""
try:
old_page = self.driver.find_element(By.TAG_NAME,
'html').get_attribute('outerHTML')
end_time = time.time() + timeout
while time.time() < end_time:
time.sleep(0.5)
new_page = self.driver.find_element(By.TAG_NAME,
'html').get_attribute('outerHTML')
if new_page == old_page:
return True
old_page = new_page
return False
except Exception as e:
self.logger.error(f"Error in wait_for_page_to_stabilize: {e}")
return False
def closed(self, reason):
# Close the browser when spider is closed
if self.driver:
self.driver.quit()
self.logger.info(f"Spider closed. Total products scraped:
{self.product_count}")
/// corrected EC2 version
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from scrapy import signals
import time
import traceback
class VetnpetdirectSpiderSpider(scrapy.Spider):
name = "VetnPetDirect_spider"
allowed_domains = ["www.vetnpetdirect.com.au"]
# Category URLs with display names and hardcoded pet types
# Add all your individual URLs here with their respective categories
category_urls = [
# ('Food', 'collections/dog-food-complete-diets', 'Dog'),
# ('Food', 'collections/dog-food-complete-diets?tab=products&page=2', 'Dog'),
# ('Food', 'collections/dog-food-complete-diets?tab=products&page=3', 'Dog'),
# ('Food', 'collections/dog-food-complete-diets?tab=products&page=4', 'Dog'),
# ('Food', 'collections/dog-food-complete-diets?tab=products&page=5', 'Dog'),
# ('Treats', 'collections/treats-and-rewards-for-dogs', 'Dog'),
# ('Toys', 'collections/dog-puppy-toys-boredom-relief', 'Dog'),
# ('Food', 'collections/cat-food', 'Cat'),
]
# Known brand names list (same list as in the previous version)
brand_names = [
"LifeWise", "lovebites","Absolute Holistic","4CYTE","3M",
"Abbey Animal Health", "Absolute Pet", "Absorbine", "Adaptil", "Advantage",
"Advantix", "Advocate", "Aesculap", "Agmin Chelates", "Alfalfa King", "All Barks",
"All For Paws", "All Fur You", "ALLPET", "Alogard", "Alto", "Alto Lab", "Anarchy",
"Animal Works Nutrition", "Animalintex", "Animals Like Us", "Animology", "AniPal",
"Anitone", "Antinol Australia", "Ants Off", "Apex Laboratories", "Apex Tools",
"API", "Aqua Buddy", "Aqua Zonic", "Aquasonic", "Aristopet", "Arm & Hammer",
"Aromadog", "Ausrichter", "Aussie Pet Products", "Avione", "Avitrol", "Bainbridge",
"Balanced Life", "Banana Feeds Australia", "Barastoc", "Baxter", "Baycox", "Bayer",
"BD", "Bear Bear", "Beau Pets", "Beco", "Becton Dickinson", "Betadine", "Big Dog",
"Bio-Groom", "Bionic", "BIOpet", "Bioscape", "BirdLife", "Black Dog Wear", "Black
Hawk", "Blackdog", "Blue Planet", "Boehringer Ingelheim", "Borotto", "Braun",
"Bravecto", "Breeder's Choice", "BSN Medical", "Burgess", "Burgon & Ball",
"Buster", "Busy Buddy", "Caitec", "Camon", "Capstar", "CaribSea", "Catit",
"CatMate", "Cazitel", "CEN Nutrition", "Ceva", "CheckUp", "Chipsi", "Chuckit!",
"CLEAR Dog Treats", "Coachi", "Company of Animals", "Compost-A-Pak", "Continuum
Aquatics", "Coopers", "CopRice", "Country Heritage", "Covetrus", "Cowboy Magic",
"Creative Foraging", "Credelio Plus", "Crooked Lane", "Crown", "Cruiser Bowl",
"Crystalfix", "CSI", "Cydectin", "Davis", "Decron", "Dectomax", "Dentipet", "Derma
Gel", "Dermcare", "Dermoscent Laboratorie", "Designer Collection", "DGS Products",
"Di-Vetelact", "Diamond Cut", "Dimmitrol", "Dine-a-Chook", "Diversey Cleaning",
"Doc & Phoebe", "Dog Rocks", "Dog Treat Naturals", "Doggylicious", "DOOG",
"Drinkwell", "Drontal", "Drool", "Durex", "Dynavyte", "EAC Animal Care", "Earthz
Pet", "Eco Tech", "EcoPellets Tasmania", "Elanco", "EnviroSafe", "Equi-Prene",
"Equinade", "Equine Health Science", "Equine Pure", "Equine Vit&Min", "Equitex",
"Eurofarm", "EVO Lifestyle Products", "Excel", "Exo Terra", "Ezi-LockOdour",
"EzyDog", "EzyGrip", "F10", "Fantasmic", "Farmalogic", "Farmhand", "Farnam",
"Feathered Friends", "Featherland Paradise", "Feed-O-Matic", "Feel Good Doggo",
"Feline Natural", "Feliway", "Fenpral", "Fido", "Fido's", "Filta-Bac", "FitNFlash",
"Fjord Manufacturing", "Fleet", "Flexi", "Flipper", "Fluval", "Flyveils By Design",
"Freezy Paws", "Frontier Pets", "Frontline", "FURminator", "Furriends", "FuzzYard",
"Genia", "Genial", "Gentle Leader", "Giddy Citizen", "GiGwi", "Glandex", "Global
Vet", "Glow Groom", "Glyde", "GMV", "GO CAT", "Golp", "Greenies", "Guru Pet
Company", "GVP", "Hagen", "HandsOn", "Hayes", "HayPigs", "HeartGard", "Hemp
Collective", "HempPet", "Henry Schein", "Heritage Downs", "Hi Form", "Hill's",
"Hill's Pet Nutrition", "Hill's Prescription Diet", "HomeoPet", "Horse Health
Products", "Horse Hydrator", "Horsemaster", "Huds and Toke", "HugSmart", "Hunter
River Company", "Huskimo", "Hygain", "Hypro Pet Foods", "IAH", "Ibiyaya", "Ice N
Easy", "Inca", "IncrediBUBBLES", "independents Own", "Industrial Dog",
"Interceptor", "Intervet", "IO", "Ipromea", "Isle & Skye", "Jenquine", "JerHigh",
"Joint Guard", "Joseph Lyddy", "Jurox", "Juwel", "JW Pet", "K & H Pet Products",
"K9 Natural", "Kelato", "KER", "Ketchum", "Kiltix", "Kit Cat", "Kitter", "Kitty
Play", "Kiwi Kitchens", "KLEO", "Kohnke's", "KONG", "KoolMaster", "Krutex",
"Kruuse", "Kumfi", "Kurgo", "La Chanvriere", "Laucke", "Leather Therapy", "Lee's",
"Lenny's Kitchen", "Leovet", "Lickables", "Lickimat", "Life Data Labs", "LifeWise
Pet Nutrition", "Likit", "Little Giant", "Livamol", "Livingstone", "Love'em",
"Lulu's Kitchen", "LupinePet", "Lyppard", "Magictails", "Mane 'n Tail", "Marina",
"Masterpet", "Mavlab", "Max & Molly", "Meals For Mutts", "Medibit", "Melanie
Newman", "Melcare", "Merial", "Midwest", "Milbemax", "Millers Forge", "Mimi &
Munch", "Minrosa", "Miscellaneous", "Mog & Bone", "Mr Sticky", "Mr. Fothergill's",
"MSD", "Multipet", "Mustad", "My Family", "MyBestMate", "MyEcoBag", "myPet
Magazine", "Natural Animal Solutions", "Natural Health NZ", "Nature's Botanical",
"Nature's Miracle", "Neove Pharma Australia", "Nerf", "NexGard", "NexGard Spectra",
"Nina Ottosson", "NJ Phillips", "NORTHMATE", "NRG", "Nuheart", "Nutrafin",
"Nutramax", "Nutrimol", "Nylabone", "NYOS Aquatics", "O'TOM", "Oakwood", "Ocean
Nutrition", "ODR", "Odzon", "Oh Crap", "Olsson's", "Oralx", "OraVet", "Organica",
"Outward Hound", "Oxbow", "Oz Pet", "OzHemp", "ParaGard", "Passwell", "PatPet",
"PAW", "Paw Ready", "Pawsome Organics", "Peckish", "Penn-Plax Reptology", "Pet
DreamHouse", "Pet Drs", "Pet Relax", "Pet Teezer", "pet+me", "Pet-Rite", "PET-TEK",
"PetArk", "Petkin", "PetQwerks", "Petrageous", "PetSafe", "Petstages",
"Pharmachem", "Phud's", "Pioneer", "Plutos", "Polly's", "Polyp Lab", "Pomms",
"POOWEE!", "Poseidon Animal Health", "Pottie's", "Prestige Pet", "Prestige Snuggle
Pals", "Prime Pantry", "Prime100", "Pritchard", "Pro-Dosa", "PRO-TRAINER",
"Profender", "ProN8ure", "Proteq", "Protexin", "Proudi", "PROVET", "Provex",
"ProviCo", "Proviro Group", "Prozym", "PuppyPlay", "Pure Life", "Quirky Kitty",
"Racing Blue", "Ranvet", "Raw Pawz", "Red Healer", "Reptile Publications",
"Revolution", "Ridley", "River Systems", "Roche", "Rocky Point", "Rogz", "Rose-Hip
Vital", "Rover Pet Products", "Royal Canin", "Royal Show", "Ruddock's", "Ruffey",
"Ruffwear", "Rufus & Coco", "RWR", "Saddlery Trading Company", "Sandlube", "Sashas
Blend", "SavourLife", "ScoopFree", "Scream", "SeaFlex", "Sentinel Spectrum",
"Serenity", "Seresto", "Shark Net", "Shear Magic", "Shoof", "ShowMaster", "Silvan",
"Simcro", "Simparica", "SmartCat", "Smith & Nephew", "Snax", "Snooza",
"SnuggleSafe", "SodaPup", "Solo", "Sporn", "Spotnik", "Springer-Magrath", "Spunky
Pup", "Stance Equitec", "STARMARK", "StaySound", "Staywell", "STC", "Sticky Paws",
"Stockbrands", "Stockman & Paddock", "Super Bird Creations", "SuperBird", "Superior
Pet Goods", "Superior Shavings", "Sure Petcare", "SUREFEED", "SUREFLAP", "Swann-
Morton", "Sykes", "Tankmaid", "Tasty Bone", "Tensoplast", "Terumo", "Tetra", "The
Art of Whole Food", "The Canny Company", "The NZ Natural Pet Food Company", "The
Pet Loo", "Thundershirt", "Thunderworks", "Tiger Brands", "Triplepet",
"TropiClean", "Trouble & Trix", "Trough Rocks", "Troy", "TuffRock", "Tuffy", "Two
Little Fishies", "Ultra", "Urine Off", "USA-K9", "Value Plus", "Valuheart",
"Varco", "Veggiedent", "Veredus", "Vet's All Natural", "Vet's Best", "vet-n-pet
DIRECT", "Vetafarm", "Vetforce", "Vetgold", "Vetnex Pet Care", "Vetopop",
"Vetoquinol", "VetPen", "VetRx", "Vetsense", "Virbac", "Virkon", "Vision", "Vital
SupaSnax", "Wagg & Purr", "Wags & Wiggles", "Wahl", "Water & Woods", "Weaver",
"Wellness Pet Company", "West Paw", "Whimzees", "Whiskers & Wiggles", "Wild
Hibiscus Flower Company", "Wombaroo", "Worlds Best Hoof Oil", "WSD", "Y-Tex",
"Yeowww!", "Yours Droolly", "ZEEZ", "zenpet", "Zippy Claws", "Zippy Paws", "ZIWI",
"Zoetis", "Zoo Med", "ZooPets", "Zychem Technologies"
]
# Track processed products to avoid duplicates
processed_products = set()
def __init__(self, *args, **kwargs):
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Enable headless mode for AWS
EC2
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0;
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
# Initialize the webdriver
self.service = Service(r"")
self.driver = None
self.options = chrome_options
# Add counter for products
self.product_count = 0
# EC2 optimization parameters - increased for EC2 reliability
self.page_load_wait = 15 # Increased wait time for EC2
self.request_delay = 1.0 # Increased delay between requests for EC2
# Maximum wait times for explicit waits
self.wait_time = 60 # Increased for EC2
self.short_wait_time = 15
super(VetnpetdirectSpiderSpider, self).__init__(*args, **kwargs)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(VetnpetdirectSpiderSpider, cls).from_crawler(crawler, *args,
**kwargs)
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
return spider
def spider_opened(self, spider):
self.driver = webdriver.Chrome(service=self.service, options=self.options)
def spider_closed(self, spider):
if self.driver:
self.driver.quit()
def wait_for_element(self, by, selector, timeout=None,
condition=EC.presence_of_element_located):
"""Dynamic wait helper with automatic retry for stale elements"""
if timeout is None:
timeout = self.wait_time
wait = WebDriverWait(self.driver, timeout, poll_frequency=0.5)
max_retries = 3
for attempt in range(max_retries):
try:
element = wait.until(condition((by, selector)))
return element
except StaleElementReferenceException:
if attempt < max_retries - 1:
self.logger.warning(f"Stale element for {selector}, retrying
({attempt+1}/{max_retries})")
time.sleep(1)
continue
else:
raise
def wait_for_elements(self, by, selector, timeout=None,
condition=EC.presence_of_all_elements_located):
"""Dynamic wait helper for multiple elements"""
if timeout is None:
timeout = self.wait_time
wait = WebDriverWait(self.driver, timeout, poll_frequency=0.5)
max_retries = 3
for attempt in range(max_retries):
try:
elements = wait.until(condition((by, selector)))
return elements
except StaleElementReferenceException:
if attempt < max_retries - 1:
self.logger.warning(f"Stale elements for {selector}, retrying
({attempt+1}/{max_retries})")
time.sleep(1)
continue
else:
raise
def wait_for_page_load(self):
"""Enhanced page load wait with dynamic detection"""
# First wait for the document to be ready
try:
WebDriverWait(self.driver, self.wait_time).until(
lambda d: d.execute_script("return document.readyState") ==
"complete"
)
# Then wait for any jQuery or AJAX to complete
WebDriverWait(self.driver, self.short_wait_time).until(
lambda d: d.execute_script("return jQuery.active == 0") or True
)
# Sometimes need a small additional wait for JavaScript rendering
time.sleep(1.5)
return True
except Exception as e:
self.logger.warning(f"Wait for page load exception: {e}")
# Fallback to simple sleep if script execution fails
time.sleep(self.page_load_wait)
return False
def start_requests(self):
"""Initialize requests for all category URLs"""
for index, (category_name, category_path, pet_type) in enumerate(self.category_urls):
url = f"https://www.vetnpetdirect.com.au/{category_path}"
self.logger.info(f"Scheduling category URL
{index+1}/{len(self.category_urls)}: {category_name} for {pet_type}")
yield scrapy.Request(
url=url,
callback=self.parse,
meta={
'category_name': category_name,
'pet_type': pet_type,
'dont_filter': True
}
)
def parse(self, response):
# Get the category name and pet type from meta
category_name = response.meta.get('category_name')
pet_type = response.meta.get('pet_type')
self.logger.info(f"Parsing category: {category_name} for pet type:
{pet_type}")
# Initialize the driver if it's not already initialized
if not hasattr(self, 'driver') or self.driver is None:
self.driver = webdriver.Chrome(service=self.service,
options=self.options)
# Open the URL with Selenium
try:
self.driver.get(response.url)
self.logger.info(f"Loading page: {response.url}")
# Enhanced page load wait
self.wait_for_page_load()
# Wait for the body to be fully loaded with explicit wait
self.wait_for_element(By.TAG_NAME, 'body')
# First check what product elements are present on the page
# Let's try different selectors
selectors_to_try = [
'li.snize-product',
'div.product-item',
'div.grid__item',
'div.product-card',
'div.product'
]
products = []
used_selector = None
# Try each selector with explicit wait
for selector in selectors_to_try:
try:
# Use our enhanced wait for elements helper
products = self.wait_for_elements(By.CSS_SELECTOR, selector,
timeout=15)
if products:
used_selector = selector
self.logger.info(f"Found products using selector:
{selector}")
break
except TimeoutException:
continue
if not products:
# No products found on this page
self.logger.warning("No products found on this page.")
self.logger.info(f"Current URL: {self.driver.current_url}")
# Save page source for debugging
with open(f"page_source_debug_{category_name}.html", "w",
encoding="utf-8") as f:
f.write(self.driver.page_source)
return
page_product_count = len(products)
self.product_count += page_product_count
self.logger.info(f"Found {page_product_count} products. Total so far:
{self.product_count}")
# Store product data first to avoid stale element issues
product_data_list = []
# First collect basic data from the list page
for index, product in enumerate(products):
try:
item = {}
# Extract link first as we'll need it to visit the product page
link_url = None
link_selectors = ['a.snize-view-link', 'a', '.product-card a']
for selector in link_selectors:
try:
link_element = product.find_element(By.CSS_SELECTOR,
selector)
link_url = link_element.get_attribute('href')
if link_url:
item['Title_Link'] = link_url
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Skip if we've already processed this product URL
if link_url in self.processed_products:
self.logger.info(f"Skipping already processed product URL:
{link_url}")
continue
# Add to processed set
if link_url:
self.processed_products.add(link_url)
# Only collect other data if we found a link
if link_url:
# Try different selectors for title
title_selectors = ['span.snize-title', 'h2', '.product-card__title', '.product-title']
for selector in title_selectors:
try:
title_element = product.find_element(By.CSS_SELECTOR, selector)
title = title_element.text
if title:
item['Product_Title'] = title
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Try different selectors for image
img_selectors = ['span.snize-thumbnail img', 'img',
'.product-image img']
for selector in img_selectors:
try:
img_element = product.find_element(By.CSS_SELECTOR,
selector)
image_url = img_element.get_attribute('src')
if image_url:
item['Image_URL'] = image_url
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Check if product is out of stock
try:
out_of_stock = product.find_element(By.CSS_SELECTOR,
'.snize-out-of-stock')
if out_of_stock:
item['Stock_Status'] = "Out Of Stock"
else:
item['Stock_Status'] = "In Stock"
except (NoSuchElementException,
StaleElementReferenceException):
item['Stock_Status'] = "In Stock" # Default to in
stock if no indicator found
# Determine brand name from product title
if 'Product_Title' in item:
brand_name = "N/A"
for brand in self.brand_names:
if brand.lower() in item['Product_Title'].lower():
brand_name = brand
break
item['Brand_Name'] = brand_name
# Add category information
item['Product_Category'] = category_name
item['Pet_Type'] = pet_type
# Add default fields
item['Description'] = "Not Available"
item['Lifestage'] = "Not Available"
# Add product to our list to process
product_data_list.append(item)
except Exception as e:
self.logger.error(f"Error extracting basic product data: {e}")
# Now visit each product page to get prices and check for variations
for item in product_data_list:
if 'Title_Link' in item:
# Follow the product link to get price and check for variations
yield scrapy.Request(
item['Title_Link'],
callback=self.parse_product,
meta={
'item': item,
'category_name': category_name,
'pet_type': pet_type
}
)
# Add small delay between product requests to avoid overwhelming EC2
time.sleep(self.request_delay)
except Exception as e:
self.logger.error(f"Error in parsing page: {e}")
self.logger.error(traceback.format_exc())
def parse_product(self, response):
# Get the item that was collected in the first parse
base_item = response.meta['item']
category_name = response.meta['category_name']
self.logger.info(f"Parsing product: {base_item['Product_Title']}")
try:
# Process this product (will handle variations if present)
product_url = response.url
items = list(self.process_product_page(product_url, base_item))
# Yield all items
for item in items:
yield item
except Exception as e:
self.logger.error(f"Error processing product detail page: {e}")
self.logger.error(traceback.format_exc())
# Still yield the base item as fallback
yield base_item
def process_product_page(self, url, base_item):
"""Process a product page, handling variations if present"""
max_retries = 3
for attempt in range(max_retries):
try:
# Load the product page
self.driver.get(url)
self.logger.info(f"Loading product page: {url}")
# Enhanced page load wait
self.wait_for_page_load()
# Wait for product detail elements to be present
self.wait_for_element(By.CSS_SELECTOR, 'body', self.wait_time)
# Try to get description if available
try:
description_selectors = [
'.product-description',
'.product__description',
'.rte',
'[itemprop="description"]'
]
for selector in description_selectors:
try:
# Try to find with our enhanced wait first
try:
desc_element = self.wait_for_element(
By.CSS_SELECTOR,
selector,
timeout=10
)
description = desc_element.text.strip() if desc_element else ""
except TimeoutException:
# Fallback to direct find_elements
desc_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
description = desc_elements[0].text.strip() if desc_elements else ""
if description:
base_item['Description'] = description
break
except Exception:
continue
except Exception as e:
self.logger.warning(f"Error extracting description: {e}")
# First check if there are variations
variation_selects = []
try:
# Look for variation selectors - try different types
selectors_to_try = [
'select.single-option-selector',
'.selector-wrapper select',
'select[data-variant-option]'
]
for selector in selectors_to_try:
try:
selects = self.driver.find_elements(By.CSS_SELECTOR,
selector)
if selects:
variation_selects = selects
self.logger.info(f"Found {len(selects)} variation
selectors using {selector}")
break
except Exception:
continue
if not variation_selects:
# No variations, just get the regular price
price_selectors = [
'span.price span.money',
'span.price--sale span.money',
'.product-single__price',
'.price',
'[itemprop="price"]'
]
price = None
for selector in price_selectors:
try:
# Try with explicit wait first
try:
price_element = self.wait_for_element(
By.CSS_SELECTOR,
selector,
timeout=10,
condition=EC.visibility_of_element_located
)
price = price_element.text.strip() if price_element else ""
except TimeoutException:
# Fallback to direct find_elements
price_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
price = price_elements[0].text.strip() if price_elements else ""
if price:
# Clean price text (remove dollar sign)
price = price.replace('$', '').strip()
base_item['Regular_Price'] = price
break
except Exception as e:
self.logger.warning(f"Error extracting price with
selector {selector}: {e}")
if not price:
base_item['Regular_Price'] = "Price not available"
self.logger.info(f"No variations found, yielding single
product with price {price}")
yield base_item
else:
# Process all variations
yield from self.process_variations(variation_selects,
base_item)
except Exception as e:
self.logger.error(f"Error in product page processing: {e}")
# On error, yield base item as fallback
yield base_item
# Successfully processed this item, break retry loop
break
except Exception as e:
if attempt < max_retries - 1:
self.logger.warning(f"Error on attempt {attempt+1}, retrying:
{base_item.get('Product_Title')} - {str(e)}")
time.sleep(3) # Wait before retry
continue
else:
self.logger.error(f"Failed to process product after
{max_retries} attempts: {e}")
# Return the base item if processing fails
yield base_item
def process_variations(self, variation_selects, base_item):
"""Process all variations for a product"""
# Initialize variation processing
all_variations = []
# Process the first variation dropdown
if variation_selects:
try:
select_element = variation_selects[0]
select = Select(select_element)
# Get all options in this dropdown
options = select.options
for option in options:
try:
# Get the option text/value
option_value = option.text.strip()
if not option_value or option_value.lower() in ["choose an
option", "select"]:
continue
# Select this option with retry mechanism
max_retries = 3
for retry in range(max_retries):
try:
select.select_by_visible_text(option_value)
# Wait for price to update - use dynamic waiting
self.wait_for_page_to_stabilize(timeout=8)
break
except StaleElementReferenceException:
if retry < max_retries - 1:
# Get fresh reference to select element and retry
time.sleep(1)
select_element = self.wait_for_element(By.CSS_SELECTOR, 'select.single-option-selector, .selector-wrapper select, select[data-variant-option]')
select = Select(select_element)
else:
raise
# Create a new item for this variation
variation_item = base_item.copy()
# Update title to include variation
variation_item['Product_Title'] = f"{base_item['Product_Title']} - {option_value}"
# Check stock status after selecting this variation
try:
out_of_stock_elements = self.driver.find_elements(By.CSS_SELECTOR, '.snize-out-of-stock, .sold-out')
if out_of_stock_elements:
variation_item['Stock_Status'] = "Out Of Stock"
else:
variation_item['Stock_Status'] = "In Stock"
except Exception:
pass # Keep existing stock status if check fails
# Get the updated price with extended selectors
price_selectors = [
'span.price span.money',
'span.price--sale span.money',
'.product-single__price',
'.price',
'[itemprop="price"]'
]
price = None
for selector in price_selectors:
try:
# Try with explicit wait first for better reliability
try:
price_element = self.wait_for_element(
By.CSS_SELECTOR,
selector,
timeout=8,
condition=EC.visibility_of_element_located
)
if price_element:
price = price_element.text.strip()
except TimeoutException:
# Fallback to direct find_elements
price_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
if price_elements:
price = price_elements[0].text.strip()
if price:
# Clean price text (remove dollar sign)
price = price.replace('$', '').strip()
variation_item['Regular_Price'] = price
break
except Exception as e:
self.logger.warning(f"Error extracting price for
variation {option_value} with selector {selector}: {e}")
if not price:
variation_item['Regular_Price'] = "Price not available"
self.logger.info(f"Processed variation: {option_value} with
price {price}")
# Add to our list and yield
all_variations.append(variation_item)
yield variation_item
except Exception as e:
self.logger.error(f"Error processing variation option
{option_value if 'option_value' in locals() else 'unknown'}: {e}")
except Exception as e:
self.logger.error(f"Error processing variations: {e}")
# If we encounter an error, yield the base item as fallback
yield base_item
# If no variations were successfully processed, yield the base item
if not all_variations:
yield base_item
def wait_for_page_to_stabilize(self, timeout=8):
"""Wait for page to finish any animations or AJAX calls"""
try:
# First check document ready state
WebDriverWait(self.driver, timeout).until(
lambda d: d.execute_script("return document.readyState") ==
"complete"
)
# Then wait for any pending AJAX requests (jQuery)
try:
WebDriverWait(self.driver, timeout/2).until(
lambda d: d.execute_script("return jQuery.active == 0") or True
)
except:
pass # jQuery might not be present
# Check for HTML stability (DOM not changing)
old_page = self.driver.find_element(By.TAG_NAME,
'html').get_attribute('outerHTML')
stability_check_time = time.time() + (timeout/2)
while time.time() < stability_check_time:
time.sleep(0.5)
new_page = self.driver.find_element(By.TAG_NAME,
'html').get_attribute('outerHTML')
if new_page == old_page:
return True
old_page = new_page
return True
except Exception as e:
self.logger.error(f"Error in wait_for_page_to_stabilize: {e}")
# Fallback to simple sleep
time.sleep(3)
return False
def closed(self, reason):
# Close the browser when spider is closed
if self.driver:
self.driver.quit()
self.logger.info(f"Spider closed. Total products scraped:
{self.product_count}")