Price selector when no variation is present:
<p class="price"><span class="woocommerce-Price-amount amount"><bdi><span class="woocommerce-Price-currencySymbol">$</span>7.63</bdi></span></p>
Price selector when variations are present:
<div class="woocommerce-variation single_variation">
<div class="woocommerce-variation-description"></div>
<div class="woocommerce-variation-price"><span class="price"><span class="woocommerce-Price-amount amount"><bdi><span class="woocommerce-Price-currencySymbol">$</span>39.66</bdi></span></span>
<div class="wcsatt-options-wrapper wcsatt-options-wrapper-grouped wcsatt-options-wrapper-radio closed " data-sign_up_text="Sign up now">
<div class="wcsatt-options-product-prompt wcsatt-options-product-prompt-grouped wcsatt-options-product-prompt-radio wcsatt-options-product-prompt--visible" data-prompt_type="radio">
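This block is re-rendered by JavaScript whenever a variation is chosen, so it has to be read through Selenium after the dropdown change rather than from the static response. A minimal sketch, assuming driver is an active WebDriver already on the product page:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait for the AJAX-rendered variation price to appear, then read e.g. "$39.66"
price_el = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located(
        (By.CSS_SELECTOR, 'div.woocommerce-variation-price span.price bdi')
    )
)
variation_price = price_el.text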
Description selector:
<div class="woocommerce-Tabs-panel woocommerce-Tabs-panel--description panel entry-content wc-tab" id="tab-description" role="tabpanel" aria-labelledby="tab-title-description" style="">
<h2>Description</h2>
<p>Prime100 is an Australian owned company. Working with a network of expert dermatologists, veterinarians, and nutritionists Prime100 has created a scientifically based range of functional protein diets for pets where every ingredient has a purpose. Nutritionally balanced, extremely palatable and specifically designed to aid in the elimination of food-based sensitivities and other health issues.</p>
<p>Prime100 SPT Lamb Fillets Treats are an ideal training aid or treat, made from premium Australian Lamb. These single protein treats are soft to chew and contain prime cuts of human grade, pasture fed Australian lamb.</p>
<p>No preservatives, colouring or flavours. 100gm pack.</p>
<p>Should be used as a complementary treat only and fed in conjunction with a complete and balanced diet.</p>
</div>
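A minimal extraction sketch for this panel (plain Scrapy, assuming response is the product page):

# Join the non-empty text of every <p> under the description tab;
# switch to '#tab-description ::text' to also pick up list items and table cells
description = ' '.join(
    t.strip()
    for t in response.css('#tab-description p::text').getall()
    if t.strip()
)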
This code is working perfectly. Now you have to follow each product's title link and extract two things from inside each product page:
1. Regular_Price
2. Description
You also have to scrape the product variations where present, treat each variation as a separate product, and append the weight or size (whichever it is) to the title, for example:
1. Product milk - 1L
2. Product milk - 2L
You have to wait for the price to update in the same selector, and then scrape that price for that particular variation correctly (a sketch of this wait-and-read loop follows the table below):
<table class="variations" cellspacing="0" role="presentation">
<tbody>
<tr>
<th class="label"><label
for="size">Size</label></th>
<td class="value">
<select id="size" class=""
name="attribute_size" data-attribute_name="attribute_size" data-
show_option_none="yes"><option value="">Choose an option</option><option
value="2.5kg" class="attached enabled">2.5kg</option><option value="9kg"
class="attached enabled">9kg</option><option value="18kg" selected="selected"
class="attached enabled">18kg</option></select><a class="reset_variations" href="#"
aria-label="Clear options" style="visibility: visible;">Clear</a>
</td>
</tr>
</tbody>
</table>
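A minimal Selenium sketch of that wait-and-read loop over the <select id="size"> above (a sketch, assuming driver is an active WebDriver already on the product page; the price selector is the variation one shown earlier):

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait

PRICE_CSS = 'div.woocommerce-variation-price span.price bdi'

def read_price(driver):
    els = driver.find_elements(By.CSS_SELECTOR, PRICE_CSS)
    return els[0].text if els else ''

# Skip the empty "Choose an option" placeholder
sizes = [o.text for o in Select(driver.find_element(By.ID, 'size')).options
         if o.get_attribute('value')]
for size in sizes:
    old_price = read_price(driver)
    # Re-locate the select on every pass; WooCommerce re-renders the form via AJAX
    Select(driver.find_element(By.ID, 'size')).select_by_visible_text(size)
    try:
        # Wait until the price block shows a non-empty value different from the old one
        WebDriverWait(driver, 10).until(
            lambda d: read_price(d) and read_price(d) != old_price
        )
    except TimeoutException:
        pass  # two sizes can legitimately share the same price
    print(f"Product - {size}: {read_price(driver)}")  # e.g. "Product - 2.5kg: $39.66"

Each pass becomes its own item, with the size appended to the title the way the milk example above shows.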
I'm providing a reference implementation below; analyze it and give me the same logic in myPetZone_spider for handling the dropdowns. Handle stale element references properly:
import scrapy
import re
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from scrapy import signals
from scrapy.selector import Selector
import random
import traceback
class DirecttopetSpiderSpider(scrapy.Spider):
    name = "Directtopet_spider"
    allowed_domains = ["directtopet.com.au"]

    # Category URLs with display names and hardcoded pet types
    category_urls = [
        # ('Food', 'collections/cat-food', 'Cat'),
        # ('Bed', 'collections/cat-beds-furniture', 'Cat'),
        ('Toys', 'collections/bird-toys', 'Bird'),
        ('Accessories', 'collections/bird-cages', 'Bird'),
        ('Health Care', 'collections/horse-health', 'Horse'),
        ('Health Care', 'collections/fish-tank-aquarium-filters-pumps', 'Fish'),
        # ('Litter', 'collections/cat-litter', 'Cat'),
        # ('Litter', 'collections/cat-litter-boxes-trays-mats', 'Cat'),
        # ('Litter', 'collections/cat-cleanup', 'Cat'),
        ('Accessories', 'collections/rabbit-hutch-cage-housing-supplies', 'Rabbit'),
        ('Accessories', 'collections/electric-fencers-electric-fences', 'Other'),
        # ('Accessories', 'collections/bowls-feeders', 'Cat'),
        # ('Accessories', 'collections/cat-scratching-poles-trees', 'Cat'),
        # ('Accessories', 'collections/doors-cat-ladders', 'Cat'),
        # ('Accessories', 'collections/cat-carriers-strollers', 'Cat'),
    ]
    # Known brand names list
    brand_names = [
        "Activyl", "Adaptil", "Advantage", "Advantix", "Advocate", "Aloveen",
        "Balanced Life", "Blackmores", "Ceva", "DentiPet", "Dermcare Vet", "Dermoscent",
        "Equinade", "Evolution", "Fenpral", "Filta Bac", "Freeze Dry Australia",
        "Frontline", "Hamish McBeth", "Heart Gard", "Himalayan Pet Supply", "IAH",
        "Joint Guard", "Kelato", "Kiltix", "Kong", "Livamol", "Milbemax",
        "Natural Animal Solutions", "ParaGard", "RanVet", "Rose Hip Vital",
        "Sasha's Blend", "Savour Life", "Sentinel", "The Art Of Whole Food",
        "ValuePlus", "Vet's All Natural", "Vetoquinol", "VetSense", "VirBac", "Zippy Paws"
    ]
    def __init__(self, *args, **kwargs):
        super(DirecttopetSpiderSpider, self).__init__(*args, **kwargs)
        # Path to the local chromedriver binary (plain string, so the doubled
        # backslashes are escapes rather than literal characters)
        self.service = Service(
            "C:\\Users\\Hp\\.wdm\\drivers\\chromedriver\\win64\\chromedriver-win64\\chromedriver.exe"
        )
        self.options = webdriver.ChromeOptions()
        # Add options to improve stability
        self.options.add_argument('--headless')
        self.options.add_argument('--disable-dev-shm-usage')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-gpu')
        self.options.add_argument('--window-size=1920,1080')
        self.options.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.options.add_experimental_option('useAutomationExtension', False)
        self.driver = None
        # Index of the category currently being crawled
        self.current_category_index = 0
        # Set of processed product URLs, used to avoid duplicates
        self.processed_products = set()
        # Pending requests per category, used to detect when a category is complete
        self.pending_requests = {}
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(DirecttopetSpiderSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Connect both lifecycle signals (spider_opened was previously never wired up)
        crawler.signals.connect(spider.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        self.driver = webdriver.Chrome(service=self.service, options=self.options)

    def spider_closed(self, spider):
        if self.driver:
            self.driver.quit()
    def start_requests(self):
        # Start with only the first category; the rest are scheduled as each completes
        if self.category_urls:
            category_name, category_path, pet_type = self.category_urls[self.current_category_index]
            url = f"https://directtopet.com.au/{category_path}"
            self.logger.info(f"Starting with category: {category_name}")
            # Initialize pending requests counter for this category
            # (starts at 1 for the initial listing request)
            self.pending_requests[category_name] = 1
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                # dont_filter is a Request argument, not a meta key
                dont_filter=True,
                meta={
                    'category_name': category_name,
                    'pet_type': pet_type,
                }
            )
    def parse(self, response):
        # Get the category name and pet type from meta
        category_name = response.meta.get('category_name')
        pet_type = response.meta.get('pet_type')
        self.logger.info(f"Parsing category: {category_name} for pet type: {pet_type}")
        # Initialize the driver if it's not already initialized
        if not hasattr(self, 'driver') or self.driver is None:
            self.driver = webdriver.Chrome(service=self.service, options=self.options)
        products = response.css(
            'div.grid__item.grid__item--collection-template.medium-up--one-quarter.small--one-half'
        )
        self.logger.info(f"Found {len(products)} products on this page")
        # Increment pending requests counter for product detail pages
        self.pending_requests[category_name] += len(products)
        for product in products:
            Product_Title = product.css(
                'div.subheading.grid-view-item__title.medium-up--hide::text'
            ).get(default='').strip()
            Title_Link_relative = product.css(
                'a.grid-view-item__link.grid-view-item__image-container::attr(href)'
            ).get()
            Title_Link = response.urljoin(Title_Link_relative)
            # Skip if we've already processed this product URL
            if Title_Link in self.processed_products:
                self.logger.info(f"Skipping already processed product: {Product_Title}")
                self.pending_requests[category_name] -= 1
                continue
            # Add to processed set
            self.processed_products.add(Title_Link)
            # Image URL extraction (protocol-relative URLs get an https: prefix)
            Image_url_relative = product.css('img.grid-view-item__image::attr(src)').get()
            Image_url = response.urljoin(Image_url_relative)
            if Image_url and Image_url.startswith('//'):
                Image_url = 'https:' + Image_url
            # Stock status extraction
            sold_out_element = product.css('p.sale_wheel.sld-out strong')
            if sold_out_element:
                Stock_Status = "Sold Out"
            else:
                Stock_Status = "In Stock"
            # Determine brand name from product title
            brand_name = "N/A"
            for brand in self.brand_names:
                if brand.lower() in Product_Title.lower():
                    brand_name = brand
                    break
            # Create item dict with the data we have so far
            item = {
                'Product_Title': Product_Title,
                'Title_Link': Title_Link,
                'Image_url': Image_url,
                'Stock_Status': Stock_Status,
                'Brand_Name': brand_name,
                'Product_Category': category_name,
                'Pet_Type': pet_type,  # Hardcoded pet type from category_urls
                'Lifestage': 'Not Available'  # As requested, hardcoded
            }
            # Follow the product link to get price and description
            yield scrapy.Request(
                Title_Link,
                callback=self.parse_product,
                meta={
                    'item': item,
                    'category_name': category_name,
                    'pet_type': pet_type
                }
            )
        # Check for pagination and follow next page within the same category
        next_page = response.css(
            'a.btn.btn--secondary.btn--narrow:has(svg.icon-arrow-right)::attr(href)'
        ).get()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            self.logger.info(f"Following next page for {category_name}: {next_page_url}")
            # Increment pending requests for the next page
            self.pending_requests[category_name] += 1
            yield scrapy.Request(
                url=next_page_url,
                callback=self.parse,
                dont_filter=True,  # dont_filter belongs on the Request, not in meta
                meta={
                    'category_name': category_name,
                    'pet_type': pet_type,
                }
            )
        # Decrement pending requests counter for this page
        self.pending_requests[category_name] -= 1
        self.logger.info(f"Remaining requests for {category_name}: {self.pending_requests[category_name]}")
        # Check if we've completed this category; if so, yield the next category's
        # request (previously the returned request was silently discarded here)
        next_request = self.check_category_completion(category_name)
        if next_request:
            yield next_request
    def check_category_completion(self, category_name):
        """Check if a category is complete and return a request for the next one if so."""
        # If there are no more pending requests for this category, move to the next
        if self.pending_requests.get(category_name, 0) <= 0:
            self.logger.info(f"✅ COMPLETED CATEGORY: {category_name}")
            # Move to the next category
            self.current_category_index += 1
            if self.current_category_index < len(self.category_urls):
                # Start the next category
                next_category_name, next_category_path, next_pet_type = \
                    self.category_urls[self.current_category_index]
                next_url = f"https://directtopet.com.au/{next_category_path}"
                self.logger.info(f"🚀 MOVING TO NEXT CATEGORY: {next_category_name}")
                # Initialize pending requests counter for the new category
                self.pending_requests[next_category_name] = 1
                # Return the request so the caller can yield it
                return scrapy.Request(
                    url=next_url,
                    callback=self.parse,
                    dont_filter=True,
                    meta={
                        'category_name': next_category_name,
                        'pet_type': next_pet_type,
                    }
                )
            else:
                self.logger.info("🎉 ALL CATEGORIES HAVE BEEN PROCESSED")
    def wait_for_page_to_stabilize(self, timeout=5):
        """Wait until the page HTML stops changing (animations and AJAX have settled)."""
        old_page = self.driver.find_element(By.TAG_NAME, 'html').get_attribute('outerHTML')
        end_time = time.time() + timeout
        while time.time() < end_time:
            time.sleep(0.5)
            new_page = self.driver.find_element(By.TAG_NAME, 'html').get_attribute('outerHTML')
            if new_page == old_page:
                return True
            old_page = new_page
        return False
    def parse_product(self, response):
        # Get the item that was collected in the first parse
        base_item = response.meta['item']
        category_name = response.meta['category_name']
        self.logger.info(f"Parsing product: {base_item['Product_Title']}")
        # Extract description and clean HTML tags
        Description_html = response.css('div.product-single__description .product-description').get()
        if Description_html:
            # Use BeautifulSoup to strip the HTML while preserving structure
            soup = BeautifulSoup(Description_html, 'html.parser')
            description_text = ""
            for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']):
                text = element.get_text().strip()
                if not text:
                    continue
                # Prefix list items with a bullet; headings and paragraphs are appended as-is
                if element.name == 'li':
                    description_text += "• " + text + " "
                else:
                    description_text += text + " "
            # Handle table data
            for table in soup.find_all('table'):
                for row in table.find_all('tr'):
                    cells = row.find_all(['td', 'th'])
                    if cells:
                        row_text = " | ".join(cell.get_text().strip() for cell in cells)
                        description_text += row_text + " "
            Description = description_text.strip()
        else:
            Description = None
        # Get the regular price regardless of variations
        Regular_Price = response.css('span#ProductPrice-product-template::text').get()
        if Regular_Price:
            base_item['Regular_Price'] = Regular_Price.strip()
        else:
            base_item['Regular_Price'] = "Price not available"
        # Add description to base item
        base_item['Description'] = Description
        # Check if there are variations (like weight options)
        variation_selectors = response.css('div.selector-wrapper.js.product-form__item')
        if variation_selectors:
            self.logger.info(f"Product has variations: {len(variation_selectors)} selectors found")
            # Use Selenium to load the page and interact with the dropdowns
            for item in self.process_variations_with_retry(response.url, base_item):
                yield item
        else:
            # No variations, just yield the single item with its price
            self.logger.info(f"No variations found, yielding single item: {base_item['Product_Title']}")
            yield base_item
        # Decrement pending requests counter for this product
        self.pending_requests[category_name] -= 1
        self.logger.info(f"Remaining requests for {category_name}: {self.pending_requests[category_name]}")
        # Check if we've completed this category
        next_request = self.check_category_completion(category_name)
        if next_request:
            yield next_request
    def process_variations_with_retry(self, url, base_item, max_retries=3):
        """Process product variations, retrying on stale element reference errors."""
        items_yielded = []  # Items to be yielded by the caller
        for attempt in range(max_retries):
            try:
                # Load the page fresh for each attempt
                self.driver.get(url)
                # Wait for the page to load completely
                try:
                    WebDriverWait(self.driver, 15).until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, "span#ProductPrice-product-template")
                        )
                    )
                    # Extra wait to ensure the page is fully loaded, then let it stabilize
                    time.sleep(2)
                    self.wait_for_page_to_stabilize()
                except Exception as e:
                    self.logger.warning(f"Timeout waiting for price element on {url}: {e}")
                    items_yielded.append(base_item)
                    return items_yielded
                # Find all variation selectors
                selenium_selectors = self.driver.find_elements(
                    By.CSS_SELECTOR, 'div.selector-wrapper.js.product-form__item'
                )
                if not selenium_selectors:
                    self.logger.info(
                        f"No variation selectors found with Selenium for {base_item['Product_Title']}"
                    )
                    items_yielded.append(base_item)
                    return items_yielded
                # Process each variation selector by index; the elements themselves
                # are re-fetched on every iteration to avoid stale references
                for selector_idx in range(len(selenium_selectors)):
                    fresh_selectors = self.driver.find_elements(
                        By.CSS_SELECTOR, 'div.selector-wrapper.js.product-form__item'
                    )
                    if selector_idx >= len(fresh_selectors):
                        self.logger.warning(f"Selector index {selector_idx} out of range. Refreshing page.")
                        self.driver.refresh()
                        time.sleep(2)
                        fresh_selectors = self.driver.find_elements(
                            By.CSS_SELECTOR, 'div.selector-wrapper.js.product-form__item'
                        )
                        if selector_idx >= len(fresh_selectors):
                            self.logger.error(f"Still can't find selector at index {selector_idx}. Skipping.")
                            continue
                    selector = fresh_selectors[selector_idx]
                    # Get the variation type (e.g., "Weight")
                    try:
                        variation_type = selector.find_element(By.CSS_SELECTOR, 'label').text.strip()
                    except Exception as e:
                        self.logger.warning(f"Error getting variation type: {e}")
                        variation_type = f"Variation {selector_idx + 1}"
                    # Get the select element
                    try:
                        select_element = selector.find_element(By.TAG_NAME, 'select')
                        select = Select(select_element)
                        options = select.options
                        # For each option, select it and read the updated price
                        for option_idx, option in enumerate(options):
                            try:
                                option_value = option.text.strip()
                                if not option_value:  # Skip empty options
                                    continue
                                # Re-fetch the select element for each option to avoid stale references
                                fresh_selectors = self.driver.find_elements(
                                    By.CSS_SELECTOR, 'div.selector-wrapper.js.product-form__item'
                                )
                                fresh_select = Select(
                                    fresh_selectors[selector_idx].find_element(By.TAG_NAME, 'select')
                                )
                                # Select this option
                                fresh_select.select_by_visible_text(option_value)
                                # Give the page's JavaScript time to update the price
                                time.sleep(1.5)
                                # Get the updated price
                                try:
                                    price_element = WebDriverWait(self.driver, 5).until(
                                        EC.presence_of_element_located(
                                            (By.CSS_SELECTOR, "span#ProductPrice-product-template")
                                        )
                                    )
                                    price = price_element.text.strip()
                                except Exception as e:
                                    self.logger.warning(f"Error getting price: {e}")
                                    price = "Price not available"
                                # Check stock status for this variation
                                try:
                                    sold_out_element = self.driver.find_elements(
                                        By.CSS_SELECTOR, ".product-form__cart-submit[disabled]"
                                    )
                                    if sold_out_element:
                                        variation_stock_status = "Sold Out"
                                    else:
                                        variation_stock_status = "In Stock"
                                except Exception as e:
                                    self.logger.warning(f"Error getting stock status: {e}")
                                    variation_stock_status = base_item['Stock_Status']  # Default from base item
                                # Create a copy of the base item for this variation,
                                # with the variation value appended to the title
                                item = base_item.copy()
                                item['Product_Title'] = f"{base_item['Product_Title']} - {option_value}"
                                item['Regular_Price'] = price
                                item['Variation_Type'] = variation_type
                                item['Variation_Value'] = option_value
                                item['Stock_Status'] = variation_stock_status
                                items_yielded.append(item)
                                self.logger.info(
                                    f"Added variation to yield: {item['Product_Title']} with price {price}"
                                )
                            except Exception as e:
                                if "stale element reference" in str(e) and option_idx < len(options) - 1:
                                    self.logger.warning(
                                        f"Stale element when processing option {option_idx}. Refreshing elements."
                                    )
                                    # Don't break the loop; refresh and continue with the next option
                                    self.driver.refresh()
                                    time.sleep(2)
                                    self.wait_for_page_to_stabilize()
                                else:
                                    self.logger.error(f"Error processing option {option_idx}: {e}")
                    except Exception as e:
                        self.logger.error(f"Error processing selector {selector_idx}: {e}")
                # If we got here without an exception breaking out of the loop,
                # all variations were processed successfully
                return items_yielded
            except Exception as e:
                if "stale element reference" in str(e) and attempt < max_retries - 1:
                    self.logger.warning(
                        f"Stale element on attempt {attempt + 1}, retrying: {base_item['Product_Title']}"
                    )
                    # Refresh the page to get a clean state
                    self.driver.refresh()
                    time.sleep(2)  # Wait for page to reload
                    self.wait_for_page_to_stabilize()
                else:
                    self.logger.error(f"Error with Selenium on {url}: {e}")
                    self.logger.error(traceback.format_exc())
                    # Fall back to the base item if Selenium fails after all retries
                    items_yielded.append(base_item)
                    self.logger.info(
                        f"Added base item to yield due to Selenium error: {base_item['Product_Title']}"
                    )
                    return items_yielded
        # If we've exhausted all retries, yield the base item as a fallback
        items_yielded.append(base_item)
        self.logger.info(
            f"Added base item to yield after {max_retries} failed attempts: {base_item['Product_Title']}"
        )
        return items_yielded