Look at this code: it handles pagination correctly and scrapes all the products right through to the last page.
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import re
class VetnpetdirectSpiderSpider(scrapy.Spider):
name = "VetnPetDirect_spider"
allowed_domains = ["www.vetnpetdirect.com.au"]
start_urls = ["https://www.vetnpetdirect.com.au/collections/dog-food-complete-
diets"]
# Track visited pages to avoid duplicates
visited_pages = set()
def __init__(self):
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Set a realistic user agent
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0;
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
# Initialize the webdriver
self.driver = webdriver.Chrome(
service=Service(r"C:\\Users\\Hp\\.wdm\\drivers\\chromedriver\\win64\\
chromedriver-win64\\chromedriver.exe"),
options=chrome_options
)
self.driver.set_window_size(1920, 1080) # Set a reasonable window size
super().__init__()
# Add counter for products
self.product_count = 0
self.current_page = 1
self.max_pages = 5 # Limit to 5 pages as per the pagination you showed
def parse(self, response):
# Check if we've already visited this URL to avoid duplicates
if response.url in self.visited_pages:
self.logger.info(f"Already visited {response.url}, skipping...")
return
self.visited_pages.add(response.url)
# Open the URL with Selenium
self.driver.get(response.url)
self.logger.info(f"Loading page: {response.url}")
# Wait for the page to load fully
time.sleep(10) # Increased wait time to ensure JS loads completely
# Check if we're on a product listing page by looking for the product grid
try:
# First check what product elements are present on the page
# Let's try different selectors
selectors_to_try = [
'li.snize-product',
'div.product-item',
'div.grid__item',
'div.product-card',
'div.product'
]
products = []
used_selector = None
for selector in selectors_to_try:
products = self.driver.find_elements(By.CSS_SELECTOR, selector)
if products:
used_selector = selector
self.logger.info(f"Found products using selector: {selector}")
break
if not products:
# If no products found, dump the page source for debugging
self.logger.warning("No products found with any selector. Page
structure may have changed.")
self.logger.info(f"Current URL: {self.driver.current_url}")
with open(f"page_source_debug_{self.current_page}.html", "w",
encoding="utf-8") as f:
f.write(self.driver.page_source)
return
page_product_count = len(products)
self.product_count += page_product_count
self.logger.info(f"Page {self.current_page}: Found {page_product_count}
products. Total so far: {self.product_count}")
# Extract data from each product
for product in products:
item = {}
try:
# Try different selectors for title
title_selectors = ['span.snize-title', 'h2', '.product-card__title', '.product-title']
title = None
for selector in title_selectors:
try:
title_element = product.find_element(By.CSS_SELECTOR,
selector)
title = title_element.text
if title:
item['Product_Title'] = title
break
except NoSuchElementException:
continue
# Try different selectors for link
link_selectors = ['a.snize-view-link', 'a', '.product-card a']
link_url = None
for selector in link_selectors:
try:
link_element = product.find_element(By.CSS_SELECTOR,
selector)
link_url = link_element.get_attribute('href')
if link_url:
item['Title_Link'] = link_url
break
except NoSuchElementException:
continue
# Try different selectors for image
img_selectors = ['span.snize-thumbnail img', 'img', '.product-image img']
image_url = None
for selector in img_selectors:
try:
img_element = product.find_element(By.CSS_SELECTOR,
selector)
image_url = img_element.get_attribute('src')
if image_url:
item['Image_URL'] = image_url
break
except NoSuchElementException:
continue
# Only yield if we found at least some data
if item:
yield item
except Exception as e:
self.logger.error(f"Error extracting product data: {e}")
# Handle pagination
if self.current_page < self.max_pages:
try:
# Try to construct the next page URL directly based on pattern
next_page_num = self.current_page + 1
base_url = re.sub(r'\?tab=products&page=\d+', '',
self.driver.current_url)
base_url = base_url.rstrip('/')
next_page_url = f"{base_url}?tab=products&page={next_page_num}"
self.logger.info(f"Constructed next page URL: {next_page_url}")
if next_page_url not in self.visited_pages:
self.current_page = next_page_num
self.logger.info(f"Moving to page {self.current_page}")
# Create a new request for the next page
yield scrapy.Request(
url=next_page_url,
callback=self.parse,
dont_filter=True
)
else:
self.logger.info(f"Already visited {next_page_url}, ending
pagination.")
except Exception as e:
self.logger.error(f"Error during pagination: {e}")
except Exception as e:
self.logger.error(f"Error in parsing page: {e}")
def closed(self, reason):
# Close the browser when spider is closed
self.driver.quit()
self.logger.info(f"Spider closed. Total products scraped:
{self.product_count}")
But when I run this code on EC2, only 83 products get scraped, which means the pagination is not being handled and the products are not being scraped properly either. Please give me the full, complete code; the code above uses the sleep-based logic for pagination. Here is the pagination HTML from the page:
<div class="snize-pagination" role="navigation" aria-label="Pagination"
style="width: 770.3px;"><ul><li><span class="snize-pagination-prev disabled" aria-
hidden="true"></span></li><li><a
href="https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets?
&page=1" class="active" aria-current="page" rev="1"
data-no-instant="true">1</a></li><li><a
href="https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets?
&page=2" rev="2" data-no-instant="true">2</a></li><li><a
href="https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets?
&page=3" rev="3" data-no-instant="true">3</a></li><li><span aria-
hidden="true">...</span></li><li><a
href="https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets?
&page=5" rev="5" data-no-instant="true">5</a></li><li><a
href="https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets?
&page=5" class="snize-pagination-next" rev="2" data-no-instant="true" aria-
label="Next page"></a></li></ul></div>
The code must sleep and scrape all of the next pages correctly, including the product variations.
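For reference, a minimal standalone sketch of that idea, assuming Selenium 4+ can locate chromedriver on its own, the .snize-pagination markup shown above, and fixed 10-second sleeps; it is only a sketch of the pagination loop, not the full spider:

# Minimal pagination sketch (standalone, not the full spider).
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

BASE = "https://www.vetnpetdirect.com.au/collections/dog-food-complete-diets"

driver = webdriver.Chrome()
driver.get(BASE)
time.sleep(10)  # let the Searchanise (snize) widget render the first page

# The pagination bar shows numbered links (1, 2, 3, ..., 5); take the highest number.
page_numbers = [int(a.text) for a in driver.find_elements(By.CSS_SELECTOR, ".snize-pagination a")
                if a.text.strip().isdigit()]
last_page = max(page_numbers) if page_numbers else 1

for page in range(1, last_page + 1):
    driver.get(f"{BASE}?&page={page}")  # same URL pattern as the hrefs in the markup above
    time.sleep(10)  # sleep so each page loads before scraping
    products = driver.find_elements(By.CSS_SELECTOR, "li.snize-product")
    print(f"page {page}: {len(products)} products")

driver.quit()

The full spider follows below.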
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from scrapy import signals
import time
import re
import traceback
class VetnpetdirectSpiderSpider(scrapy.Spider):
name = "VetnPetDirect_spider"
allowed_domains = ["www.vetnpetdirect.com.au","proxy.scrapeops.io"]
# Category URLs with display names and hardcoded pet types
category_urls = [
('Food', 'collections/dog-food-complete-diets', 'Dog'),
# ('Treats', 'collections/dog-treats', 'Dog'),
# ('Toys', 'collections/dog-toys', 'Dog'),
# ('Accessories', 'collections/dog-accessories', 'Dog'),
# ('Food', 'collections/cat-food', 'Cat'),
# ('Treats', 'collections/cat-treats', 'Cat'),
# ('Toys', 'collections/cat-toys', 'Cat'),
# ('Accessories', 'collections/cat-accessories', 'Cat')
]
# Known brand names list
brand_names = [
"LifeWise", "lovebites","Absolute Holistic","4CYTE","3M",
"Abbey Animal Health", "Absolute Pet", "Absorbine", "Adaptil", "Advantage",
"Advantix", "Advocate", "Aesculap", "Agmin Chelates", "Alfalfa King", "All Barks",
"All For Paws", "All Fur You", "ALLPET", "Alogard", "Alto", "Alto Lab", "Anarchy",
"Animal Works Nutrition", "Animalintex", "Animals Like Us", "Animology", "AniPal",
"Anitone", "Antinol Australia", "Ants Off", "Apex Laboratories", "Apex Tools",
"API", "Aqua Buddy", "Aqua Zonic", "Aquasonic", "Aristopet", "Arm & Hammer",
"Aromadog", "Ausrichter", "Aussie Pet Products", "Avione", "Avitrol", "Bainbridge",
"Balanced Life", "Banana Feeds Australia", "Barastoc", "Baxter", "Baycox", "Bayer",
"BD", "Bear Bear", "Beau Pets", "Beco", "Becton Dickinson", "Betadine", "Big Dog",
"Bio-Groom", "Bionic", "BIOpet", "Bioscape", "BirdLife", "Black Dog Wear", "Black
Hawk", "Blackdog", "Blue Planet", "Boehringer Ingelheim", "Borotto", "Braun",
"Bravecto", "Breeder's Choice", "BSN Medical", "Burgess", "Burgon & Ball",
"Buster", "Busy Buddy", "Caitec", "Camon", "Capstar", "CaribSea", "Catit",
"CatMate", "Cazitel", "CEN Nutrition", "Ceva", "CheckUp", "Chipsi", "Chuckit!",
"CLEAR Dog Treats", "Coachi", "Company of Animals", "Compost-A-Pak", "Continuum
Aquatics", "Coopers", "CopRice", "Country Heritage", "Covetrus", "Cowboy Magic",
"Creative Foraging", "Credelio Plus", "Crooked Lane", "Crown", "Cruiser Bowl",
"Crystalfix", "CSI", "Cydectin", "Davis", "Decron", "Dectomax", "Dentipet", "Derma
Gel", "Dermcare", "Dermoscent Laboratorie", "Designer Collection", "DGS Products",
"Di-Vetelact", "Diamond Cut", "Dimmitrol", "Dine-a-Chook", "Diversey Cleaning",
"Doc & Phoebe", "Dog Rocks", "Dog Treat Naturals", "Doggylicious", "DOOG",
"Drinkwell", "Drontal", "Drool", "Durex", "Dynavyte", "EAC Animal Care", "Earthz
Pet", "Eco Tech", "EcoPellets Tasmania", "Elanco", "EnviroSafe", "Equi-Prene",
"Equinade", "Equine Health Science", "Equine Pure", "Equine Vit&Min", "Equitex",
"Eurofarm", "EVO Lifestyle Products", "Excel", "Exo Terra", "Ezi-LockOdour",
"EzyDog", "EzyGrip", "F10", "Fantasmic", "Farmalogic", "Farmhand", "Farnam",
"Feathered Friends", "Featherland Paradise", "Feed-O-Matic", "Feel Good Doggo",
"Feline Natural", "Feliway", "Fenpral", "Fido", "Fido's", "Filta-Bac", "FitNFlash",
"Fjord Manufacturing", "Fleet", "Flexi", "Flipper", "Fluval", "Flyveils By Design",
"Freezy Paws", "Frontier Pets", "Frontline", "FURminator", "Furriends", "FuzzYard",
"Genia", "Genial", "Gentle Leader", "Giddy Citizen", "GiGwi", "Glandex", "Global
Vet", "Glow Groom", "Glyde", "GMV", "GO CAT", "Golp", "Greenies", "Guru Pet
Company", "GVP", "Hagen", "HandsOn", "Hayes", "HayPigs", "HeartGard", "Hemp
Collective", "HempPet", "Henry Schein", "Heritage Downs", "Hi Form", "Hill's",
"Hill's Pet Nutrition", "Hill's Prescription Diet", "HomeoPet", "Horse Health
Products", "Horse Hydrator", "Horsemaster", "Huds and Toke", "HugSmart", "Hunter
River Company", "Huskimo", "Hygain", "Hypro Pet Foods", "IAH", "Ibiyaya", "Ice N
Easy", "Inca", "IncrediBUBBLES", "independents Own", "Industrial Dog",
"Interceptor", "Intervet", "IO", "Ipromea", "Isle & Skye", "Jenquine", "JerHigh",
"Joint Guard", "Joseph Lyddy", "Jurox", "Juwel", "JW Pet", "K & H Pet Products",
"K9 Natural", "Kelato", "KER", "Ketchum", "Kiltix", "Kit Cat", "Kitter", "Kitty
Play", "Kiwi Kitchens", "KLEO", "Kohnke's", "KONG", "KoolMaster", "Krutex",
"Kruuse", "Kumfi", "Kurgo", "La Chanvriere", "Laucke", "Leather Therapy", "Lee's",
"Lenny's Kitchen", "Leovet", "Lickables", "Lickimat", "Life Data Labs", "LifeWise
Pet Nutrition", "Likit", "Little Giant", "Livamol", "Livingstone", "Love'em",
"Lulu's Kitchen", "LupinePet", "Lyppard", "Magictails", "Mane 'n Tail", "Marina",
"Masterpet", "Mavlab", "Max & Molly", "Meals For Mutts", "Medibit", "Melanie
Newman", "Melcare", "Merial", "Midwest", "Milbemax", "Millers Forge", "Mimi &
Munch", "Minrosa", "Miscellaneous", "Mog & Bone", "Mr Sticky", "Mr. Fothergill's",
"MSD", "Multipet", "Mustad", "My Family", "MyBestMate", "MyEcoBag", "myPet
Magazine", "Natural Animal Solutions", "Natural Health NZ", "Nature's Botanical",
"Nature's Miracle", "Neove Pharma Australia", "Nerf", "NexGard", "NexGard Spectra",
"Nina Ottosson", "NJ Phillips", "NORTHMATE", "NRG", "Nuheart", "Nutrafin",
"Nutramax", "Nutrimol", "Nylabone", "NYOS Aquatics", "O'TOM", "Oakwood", "Ocean
Nutrition", "ODR", "Odzon", "Oh Crap", "Olsson's", "Oralx", "OraVet", "Organica",
"Outward Hound", "Oxbow", "Oz Pet", "OzHemp", "ParaGard", "Passwell", "PatPet",
"PAW", "Paw Ready", "Pawsome Organics", "Peckish", "Penn-Plax Reptology", "Pet
DreamHouse", "Pet Drs", "Pet Relax", "Pet Teezer", "pet+me", "Pet-Rite", "PET-TEK",
"PetArk", "Petkin", "PetQwerks", "Petrageous", "PetSafe", "Petstages",
"Pharmachem", "Phud's", "Pioneer", "Plutos", "Polly's", "Polyp Lab", "Pomms",
"POOWEE!", "Poseidon Animal Health", "Pottie's", "Prestige Pet", "Prestige Snuggle
Pals", "Prime Pantry", "Prime100", "Pritchard", "Pro-Dosa", "PRO-TRAINER",
"Profender", "ProN8ure", "Proteq", "Protexin", "Proudi", "PROVET", "Provex",
"ProviCo", "Proviro Group", "Prozym", "PuppyPlay", "Pure Life", "Quirky Kitty",
"Racing Blue", "Ranvet", "Raw Pawz", "Red Healer", "Reptile Publications",
"Revolution", "Ridley", "River Systems", "Roche", "Rocky Point", "Rogz", "Rose-Hip
Vital", "Rover Pet Products", "Royal Canin", "Royal Show", "Ruddock's", "Ruffey",
"Ruffwear", "Rufus & Coco", "RWR", "Saddlery Trading Company", "Sandlube", "Sashas
Blend", "SavourLife", "ScoopFree", "Scream", "SeaFlex", "Sentinel Spectrum",
"Serenity", "Seresto", "Shark Net", "Shear Magic", "Shoof", "ShowMaster", "Silvan",
"Simcro", "Simparica", "SmartCat", "Smith & Nephew", "Snax", "Snooza",
"SnuggleSafe", "SodaPup", "Solo", "Sporn", "Spotnik", "Springer-Magrath", "Spunky
Pup", "Stance Equitec", "STARMARK", "StaySound", "Staywell", "STC", "Sticky Paws",
"Stockbrands", "Stockman & Paddock", "Super Bird Creations", "SuperBird", "Superior
Pet Goods", "Superior Shavings", "Sure Petcare", "SUREFEED", "SUREFLAP", "Swann-
Morton", "Sykes", "Tankmaid", "Tasty Bone", "Tensoplast", "Terumo", "Tetra", "The
Art of Whole Food", "The Canny Company", "The NZ Natural Pet Food Company", "The
Pet Loo", "Thundershirt", "Thunderworks", "Tiger Brands", "Triplepet",
"TropiClean", "Trouble & Trix", "Trough Rocks", "Troy", "TuffRock", "Tuffy", "Two
Little Fishies", "Ultra", "Urine Off", "USA-K9", "Value Plus", "Valuheart",
"Varco", "Veggiedent", "Veredus", "Vet's All Natural", "Vet's Best", "vet-n-pet
DIRECT", "Vetafarm", "Vetforce", "Vetgold", "Vetnex Pet Care", "Vetopop",
"Vetoquinol", "VetPen", "VetRx", "Vetsense", "Virbac", "Virkon", "Vision", "Vital
SupaSnax", "Wagg & Purr", "Wags & Wiggles", "Wahl", "Water & Woods", "Weaver",
"Wellness Pet Company", "West Paw", "Whimzees", "Whiskers & Wiggles", "Wild
Hibiscus Flower Company", "Wombaroo", "Worlds Best Hoof Oil", "WSD", "Y-Tex",
"Yeowww!", "Yours Droolly", "ZEEZ", "zenpet", "Zippy Claws", "Zippy Paws", "ZIWI",
"Zoetis", "Zoo Med", "ZooPets", "Zychem Technologies"
]
# Track visited pages to avoid duplicates
visited_pages = set()
def __init__(self, *args, **kwargs):
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Enable headless mode for AWS
EC2
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0;
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
# Initialize the webdriver
self.service = Service(r"/usr/local/bin/chromedriver")
self.driver = None
self.options = chrome_options
# Add counter for products
self.product_count = 0
self.current_page = 1
# Add variables for category tracking
self.current_category_index = 0
self.processed_products = set()
self.pending_requests = {}
# EC2 optimization parameters
self.page_load_wait = 30 # Increased wait time for EC2
self.request_delay = 3 # Delay between requests
super(VetnpetdirectSpiderSpider, self).__init__(*args, **kwargs)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(VetnpetdirectSpiderSpider, cls).from_crawler(crawler, *args,
**kwargs)
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
return spider
def spider_opened(self, spider):
self.driver = webdriver.Chrome(service=self.service, options=self.options)
def spider_closed(self, spider):
if self.driver:
self.driver.quit()
def start_requests(self):
# Start with only the first category
if self.category_urls:
category_name, category_path, pet_type = self.category_urls[self.current_category_index]
url = f"https://www.vetnpetdirect.com.au/{category_path}"
self.logger.info(f"Starting with category: {category_name} for
{pet_type}")
# Initialize pending requests counter for this category
self.pending_requests[category_name] = 1  # Start with 1 for the initial request
yield scrapy.Request(
url=url,
callback=self.parse,
meta={
'category_name': category_name,
'pet_type': pet_type,
'dont_filter': True
}
)
def parse(self, response):
# Get the category name and pet type from meta
category_name = response.meta.get('category_name')
pet_type = response.meta.get('pet_type')
# Check if we've already visited this URL to avoid duplicates
if response.url in self.visited_pages:
self.logger.info(f"Already visited {response.url}, skipping...")
self.pending_requests[category_name] -= 1
next_request = self.check_category_completion(category_name)
if next_request:
yield next_request
return
self.visited_pages.add(response.url)
self.logger.info(f"Parsing category: {category_name} for pet type:
{pet_type} - Page {self.current_page}")
# Initialize the driver if it's not already initialized
if not hasattr(self, 'driver') or self.driver is None:
self.driver = webdriver.Chrome(service=self.service,
options=self.options)
# Open the URL with Selenium
self.driver.get(response.url)
self.logger.info(f"Loading page: {response.url}")
# Wait for the page to load fully
time.sleep(self.page_load_wait)
# Check if we're on a product listing page by looking for the product grid
try:
# Wait for the body to be fully loaded
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
# First check what product elements are present on the page
# Let's try different selectors
selectors_to_try = [
'li.snize-product',
'div.product-item',
'div.grid__item',
'div.product-card',
'div.product'
]
products = []
used_selector = None
# Try each selector with explicit wait
for selector in selectors_to_try:
try:
# Use explicit wait for each selector
products = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
if products:
used_selector = selector
self.logger.info(f"Found products using selector:
{selector}")
break
except TimeoutException:
continue
if not products:
# Check if we're on the last page (no products found)
self.logger.warning("No products found on this page. This might be
the last page.")
self.logger.info(f"Current URL: {self.driver.current_url}")
with open(f"page_source_debug_{category_name}_{self.current_page}.html", "w", encoding="utf-8") as f:
f.write(self.driver.page_source)
# Decrement pending requests for this page since we're done with it
self.pending_requests[category_name] -= 1
next_request = self.check_category_completion(category_name)
if next_request:
yield next_request
return
page_product_count = len(products)
self.product_count += page_product_count
self.logger.info(f"Page {self.current_page}: Found {page_product_count}
products. Total so far: {self.product_count}")
# Increment pending requests counter for product detail pages
self.pending_requests[category_name] += page_product_count
# Store product data first to avoid stale element issues
product_data_list = []
# First collect basic data from the list page
for index, product in enumerate(products):
try:
item = {}
# Extract link first as we'll need it to visit the product page
link_url = None
link_selectors = ['a.snize-view-link', 'a', '.product-card a']
for selector in link_selectors:
try:
link_element = product.find_element(By.CSS_SELECTOR,
selector)
link_url = link_element.get_attribute('href')
if link_url:
item['Title_Link'] = link_url
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Skip if we've already processed this product URL
if link_url in self.processed_products:
self.logger.info(f"Skipping already processed product URL:
{link_url}")
self.pending_requests[category_name] -= 1
continue
# Add to processed set
if link_url:
self.processed_products.add(link_url)
# Only collect other data if we found a link
if link_url:
# Try different selectors for title
title_selectors = ['span.snize-title', 'h2', '.product-card__title', '.product-title']
for selector in title_selectors:
try:
title_element = product.find_element(By.CSS_SELECTOR, selector)
title = title_element.text
if title:
item['Product_Title'] = title
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Try different selectors for image
img_selectors = ['span.snize-thumbnail img', 'img',
'.product-image img']
for selector in img_selectors:
try:
img_element = product.find_element(By.CSS_SELECTOR,
selector)
image_url = img_element.get_attribute('src')
if image_url:
item['Image_URL'] = image_url
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Check if product is out of stock
try:
out_of_stock = product.find_element(By.CSS_SELECTOR,
'.snize-out-of-stock')
if out_of_stock:
item['Stock_Status'] = "Out Of Stock"
else:
item['Stock_Status'] = "In Stock"
except (NoSuchElementException,
StaleElementReferenceException):
item['Stock_Status'] = "In Stock" # Default to in
stock if no indicator found
# Determine brand name from product title
if 'Product_Title' in item:
brand_name = "N/A"
for brand in self.brand_names:
if brand.lower() in item['Product_Title'].lower():
brand_name = brand
break
item['Brand_Name'] = brand_name
# Add category information
item['Product_Category'] = category_name
item['Pet_Type'] = pet_type
# Add default fields
item['Description'] = "Not Available"
item['Lifestage'] = "Not Available"
# Add product to our list to process
product_data_list.append(item)
except Exception as e:
self.logger.error(f"Error extracting basic product data: {e}")
self.pending_requests[category_name] -= 1  # Decrement for failed product
# Now visit each product page to get prices and check for variations
for item in product_data_list:
if 'Title_Link' in item:
# Follow the product link to get price and check for variations
yield scrapy.Request(
item['Title_Link'],
callback=self.parse_product,
meta={
'item': item,
'category_name': category_name,
'pet_type': pet_type
}
)
# Add small delay between product requests to avoid overwhelming EC2
time.sleep(self.request_delay)
# Handle pagination - try to find the next page
has_next_page = False
try:
# Check for next page link first
next_page_selectors = [
'.pagination .next a',
'.pagination-custom .pagination-next a',
'a.pagination__next',
'a[rel="next"]'
]
next_page_link = None
for selector in next_page_selectors:
try:
next_links = self.driver.find_elements(By.CSS_SELECTOR,
selector)
if next_links:
next_page_link = next_links[0].get_attribute('href')
if next_page_link:
has_next_page = True
break
except Exception:
continue
# If direct link not found, try to construct the next page URL
if not has_next_page:
# Try to construct the next page URL directly based on pattern
next_page_num = self.current_page + 1
base_url = re.sub(r'\?tab=products&page=\d+', '',
self.driver.current_url)
base_url = base_url.rstrip('/')
next_page_url = f"{base_url}?tab=products&page={next_page_num}"
# Check if this page exists by looking for a pagination element with this number
try:
page_numbers = self.driver.find_elements(By.CSS_SELECTOR,
'.pagination li a, .pagination__number')
for page_elem in page_numbers:
if page_elem.text.strip() == str(next_page_num):
next_page_link = next_page_url
has_next_page = True
break
except Exception:
# If we can't confirm via pagination, let's try the constructed URL anyway
next_page_link = next_page_url
has_next_page = True
if has_next_page and next_page_link and next_page_link not in self.visited_pages:
self.logger.info(f"Found next page URL for {category_name}: {next_page_link}")
# Increment counter for next page
self.pending_requests[category_name] += 1
self.current_page = self.current_page + 1
# Create a new request for the next page
yield scrapy.Request(
url=next_page_link,
callback=self.parse,
meta={
'category_name': category_name,
'pet_type': pet_type,
'dont_filter': True
}
)
else:
self.logger.info(f"No next page found for {category_name} or
already visited. Ending pagination.")
# Reset page counter for next category
self.current_page = 1
except Exception as e:
self.logger.error(f"Error during pagination: {e}")
self.logger.error(traceback.format_exc())
# Reset page counter for the next category
self.current_page = 1
# Decrement pending requests counter for this page
self.pending_requests[category_name] -= 1
self.logger.info(f"Remaining requests for {category_name}:
{self.pending_requests[category_name]}")
# Check if we've completed this category
next_request = self.check_category_completion(category_name)
if next_request:
yield next_request
except Exception as e:
self.logger.error(f"Error in parsing page: {e}")
self.logger.error(traceback.format_exc())
# Decrement pending requests for this failed page
self.pending_requests[category_name] -= 1
next_request = self.check_category_completion(category_name)
if next_request:
yield next_request
def parse_product(self, response):
# Get the item that was collected in the first parse
base_item = response.meta['item']
category_name = response.meta['category_name']
self.logger.info(f"Parsing product: {base_item['Product_Title']}")
try:
# Process this product (will handle variations if present)
product_url = response.url
items = list(self.process_product_page(product_url, base_item))
# Yield all items
for item in items:
yield item
except Exception as e:
self.logger.error(f"Error processing product detail page: {e}")
self.logger.error(traceback.format_exc())
# Still yield the base item as fallback
yield base_item
finally:
# Decrement pending requests counter for this product
self.pending_requests[category_name] -= 1
self.logger.info(f"Remaining requests for {category_name}:
{self.pending_requests[category_name]}")
# Check if we've completed this category
next_request = self.check_category_completion(category_name)
if next_request:
yield next_request
def check_category_completion(self, category_name):
"""Check if a category is complete and move to the next one if needed"""
# If there are no more pending requests for this category, move to the next
if self.pending_requests.get(category_name, 0) <= 0:
self.logger.info(f"✅ COMPLETED CATEGORY: {category_name}")
# Move to the next category
self.current_category_index += 1
# Reset page counter for next category
self.current_page = 1
if self.current_category_index < len(self.category_urls):
# Start the next category
next_category_name, next_category_path, next_pet_type = self.category_urls[self.current_category_index]
next_url = f"https://www.vetnpetdirect.com.au/{next_category_path}"
self.logger.info(f"🚀 MOVING TO NEXT CATEGORY: {next_category_name}
for {next_pet_type}")
# Initialize pending requests counter for the new category
self.pending_requests[next_category_name] = 1
# Schedule the request for the next category
return scrapy.Request(
url=next_url,
callback=self.parse,
meta={
'category_name': next_category_name,
'pet_type': next_pet_type,
'dont_filter': True
}
)
else:
self.logger.info("🎉 ALL CATEGORIES HAVE BEEN PROCESSED")
return None
return None
def process_product_page(self, url, base_item):
"""Process a product page, handling variations if present"""
max_retries = 3
for attempt in range(max_retries):
try:
# Load the product page
self.driver.get(url)
self.logger.info(f"Loading product page: {url}")
# Wait for page to load
time.sleep(self.page_load_wait)
# Wait for product detail elements to be present
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,
'body')))
# Try to get description if available
try:
description_selectors = [
'.product-description',
'.product__description',
'.rte',
'[itemprop="description"]'
]
for selector in description_selectors:
try:
desc_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
if desc_elements:
description = desc_elements[0].text.strip()
if description:
base_item['Description'] = description
break
except Exception:
continue
except Exception as e:
self.logger.warning(f"Error extracting description: {e}")
# First check if there are variations
variation_selects = []
try:
# Look for variation selectors - try different types
selectors_to_try = [
'select.single-option-selector',
'.selector-wrapper select',
'select[data-variant-option]'
]
for selector in selectors_to_try:
selects = self.driver.find_elements(By.CSS_SELECTOR,
selector)
if selects:
variation_selects = selects
self.logger.info(f"Found {len(selects)} variation
selectors using {selector}")
break
if not variation_selects:
# No variations, just get the regular price
price_selectors = [
'span.price span.money',
'span.price--sale span.money',
'.product-single__price',
'.price',
'[itemprop="price"]'
]
price = None
for selector in price_selectors:
try:
price_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
if price_elements:
price = price_elements[0].text.strip()
if price:
# Clean price text (remove dollar sign)
price = price.replace('$', '').strip()
base_item['Regular_Price'] = price
break
except Exception as e:
self.logger.warning(f"Error extracting price with
selector {selector}: {e}")
if not price:
base_item['Regular_Price'] = "Price not available"
self.logger.info(f"No variations found, yielding single
product with price {price}")
yield base_item
else:
# Process all variations
yield from self.process_variations(variation_selects,
base_item)
except Exception as e:
self.logger.error(f"Error in product page processing: {e}")
# On error, yield base item as fallback
yield base_item
# Successfully processed this item, break retry loop
break
except Exception as e:
if "stale element reference" in str(e) and attempt < max_retries -
1:
self.logger.warning(f"Stale element on attempt {attempt+1},
retrying: {base_item.get('Product_Title')}")
time.sleep(3) # Wait before retry
continue
else:
self.logger.error(f"Failed to process product after
{max_retries} attempts: {e}")
# Return the base item if processing fails
yield base_item
def process_variations(self, variation_selects, base_item):
"""Process all variations for a product"""
# Initialize variation processing
all_variations = []
# Process the first variation dropdown
if variation_selects:
try:
select_element = variation_selects[0]
select = Select(select_element)
wait = WebDriverWait(self.driver, 10)
# Get all options in this dropdown
options = select.options
for option in options:
try:
# Get the option text/value
option_value = option.text.strip()
if not option_value or option_value.lower() == "choose an
option":
continue
# Select this option
select.select_by_visible_text(option_value)
# Wait for price to update
time.sleep(3)
self.wait_for_page_to_stabilize()
# Create a new item for this variation
variation_item = base_item.copy()
# Update title to include variation
variation_item['Product_Title'] = f"{base_item['Product_Title']} - {option_value}"
# Check stock status after selecting this variation
try:
out_of_stock_elements = self.driver.find_elements(By.CSS_SELECTOR, '.snize-out-of-stock, .sold-out')
if out_of_stock_elements:
variation_item['Stock_Status'] = "Out Of Stock"
else:
variation_item['Stock_Status'] = "In Stock"
except Exception:
pass # Keep existing stock status if check fails
# Get the updated price with extended selectors
price_selectors = [
'span.price span.money',
'span.price--sale span.money',
'.product-single__price',
'.price',
'[itemprop="price"]'
]
price = None
for selector in price_selectors:
try:
# Wait explicitly for price elements
try:
price_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
price = price_element.text.strip()
except TimeoutException:
# Try direct find if wait fails
price_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
if price_elements:
price = price_elements[0].text.strip()
if price:
# Clean price text (remove dollar sign)
price = price.replace('$', '').strip()
variation_item['Regular_Price'] = price
break
except Exception as e:
self.logger.warning(f"Error extracting price for
variation {option_value} with selector {selector}: {e}")
if not price:
variation_item['Regular_Price'] = "Price not available"
self.logger.info(f"Processed variation: {option_value} with
price {price}")
# Add to our list and yield
all_variations.append(variation_item)
yield variation_item
except Exception as e:
self.logger.error(f"Error processing variation option
{option_value if 'option_value' in locals() else 'unknown'}: {e}")
except Exception as e:
self.logger.error(f"Error processing variations: {e}")
# If we encounter an error, yield the base item as fallback
yield base_item
# If no variations were successfully processed, yield the base item
if not all_variations:
yield base_item
def wait_for_page_to_stabilize(self, timeout=5):
"""Wait for page to finish any animations or AJAX calls"""
try:
old_page = self.driver.find_element(By.TAG_NAME,
'html').get_attribute('outerHTML')
end_time = time.time() + timeout
while time.time() < end_time:
time.sleep(0.5)
new_page = self.driver.find_element(By.TAG_NAME,
'html').get_attribute('outerHTML')
if new_page == old_page:
return True
old_page = new_page
return False
except Exception as e:
self.logger.error(f"Error in wait_for_page_to_stabilize: {e}")
return False
def closed(self, reason):
# Close the browser when spider is closed
if self.driver:
self.driver.quit()
self.logger.info(f"Spider closed. Total products scraped:
{self.product_count}")
/// corrected EC2 version
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from scrapy import signals
import time
import traceback
class VetnpetdirectSpiderSpider(scrapy.Spider):
name = "VetnPetDirect_spider"
allowed_domains = ["www.vetnpetdirect.com.au"]
# Category URLs with display names and hardcoded pet types
# Add all your individual URLs here with their respective categories
category_urls = [
# ('Food', 'collections/dog-food-complete-diets', 'Dog'),
# ('Food', 'collections/dog-food-complete-diets?tab=products&page=2', 'Dog'),
# ('Food', 'collections/dog-food-complete-diets?tab=products&page=3', 'Dog'),
# ('Food', 'collections/dog-food-complete-diets?tab=products&page=4', 'Dog'),
# ('Food', 'collections/dog-food-complete-diets?tab=products&page=5', 'Dog'),
# ('Treats', 'collections/treats-and-rewards-for-dogs', 'Dog'),
# ('Toys', 'collections/dog-puppy-toys-boredom-relief', 'Dog'),
# ('Food', 'collections/cat-food', 'Cat'),
]
# Known brand names list (same list as in the previous version)
brand_names = [
"LifeWise", "lovebites","Absolute Holistic","4CYTE","3M",
"Abbey Animal Health", "Absolute Pet", "Absorbine", "Adaptil", "Advantage",
"Advantix", "Advocate", "Aesculap", "Agmin Chelates", "Alfalfa King", "All Barks",
"All For Paws", "All Fur You", "ALLPET", "Alogard", "Alto", "Alto Lab", "Anarchy",
"Animal Works Nutrition", "Animalintex", "Animals Like Us", "Animology", "AniPal",
"Anitone", "Antinol Australia", "Ants Off", "Apex Laboratories", "Apex Tools",
"API", "Aqua Buddy", "Aqua Zonic", "Aquasonic", "Aristopet", "Arm & Hammer",
"Aromadog", "Ausrichter", "Aussie Pet Products", "Avione", "Avitrol", "Bainbridge",
"Balanced Life", "Banana Feeds Australia", "Barastoc", "Baxter", "Baycox", "Bayer",
"BD", "Bear Bear", "Beau Pets", "Beco", "Becton Dickinson", "Betadine", "Big Dog",
"Bio-Groom", "Bionic", "BIOpet", "Bioscape", "BirdLife", "Black Dog Wear", "Black
Hawk", "Blackdog", "Blue Planet", "Boehringer Ingelheim", "Borotto", "Braun",
"Bravecto", "Breeder's Choice", "BSN Medical", "Burgess", "Burgon & Ball",
"Buster", "Busy Buddy", "Caitec", "Camon", "Capstar", "CaribSea", "Catit",
"CatMate", "Cazitel", "CEN Nutrition", "Ceva", "CheckUp", "Chipsi", "Chuckit!",
"CLEAR Dog Treats", "Coachi", "Company of Animals", "Compost-A-Pak", "Continuum
Aquatics", "Coopers", "CopRice", "Country Heritage", "Covetrus", "Cowboy Magic",
"Creative Foraging", "Credelio Plus", "Crooked Lane", "Crown", "Cruiser Bowl",
"Crystalfix", "CSI", "Cydectin", "Davis", "Decron", "Dectomax", "Dentipet", "Derma
Gel", "Dermcare", "Dermoscent Laboratorie", "Designer Collection", "DGS Products",
"Di-Vetelact", "Diamond Cut", "Dimmitrol", "Dine-a-Chook", "Diversey Cleaning",
"Doc & Phoebe", "Dog Rocks", "Dog Treat Naturals", "Doggylicious", "DOOG",
"Drinkwell", "Drontal", "Drool", "Durex", "Dynavyte", "EAC Animal Care", "Earthz
Pet", "Eco Tech", "EcoPellets Tasmania", "Elanco", "EnviroSafe", "Equi-Prene",
"Equinade", "Equine Health Science", "Equine Pure", "Equine Vit&Min", "Equitex",
"Eurofarm", "EVO Lifestyle Products", "Excel", "Exo Terra", "Ezi-LockOdour",
"EzyDog", "EzyGrip", "F10", "Fantasmic", "Farmalogic", "Farmhand", "Farnam",
"Feathered Friends", "Featherland Paradise", "Feed-O-Matic", "Feel Good Doggo",
"Feline Natural", "Feliway", "Fenpral", "Fido", "Fido's", "Filta-Bac", "FitNFlash",
"Fjord Manufacturing", "Fleet", "Flexi", "Flipper", "Fluval", "Flyveils By Design",
"Freezy Paws", "Frontier Pets", "Frontline", "FURminator", "Furriends", "FuzzYard",
"Genia", "Genial", "Gentle Leader", "Giddy Citizen", "GiGwi", "Glandex", "Global
Vet", "Glow Groom", "Glyde", "GMV", "GO CAT", "Golp", "Greenies", "Guru Pet
Company", "GVP", "Hagen", "HandsOn", "Hayes", "HayPigs", "HeartGard", "Hemp
Collective", "HempPet", "Henry Schein", "Heritage Downs", "Hi Form", "Hill's",
"Hill's Pet Nutrition", "Hill's Prescription Diet", "HomeoPet", "Horse Health
Products", "Horse Hydrator", "Horsemaster", "Huds and Toke", "HugSmart", "Hunter
River Company", "Huskimo", "Hygain", "Hypro Pet Foods", "IAH", "Ibiyaya", "Ice N
Easy", "Inca", "IncrediBUBBLES", "independents Own", "Industrial Dog",
"Interceptor", "Intervet", "IO", "Ipromea", "Isle & Skye", "Jenquine", "JerHigh",
"Joint Guard", "Joseph Lyddy", "Jurox", "Juwel", "JW Pet", "K & H Pet Products",
"K9 Natural", "Kelato", "KER", "Ketchum", "Kiltix", "Kit Cat", "Kitter", "Kitty
Play", "Kiwi Kitchens", "KLEO", "Kohnke's", "KONG", "KoolMaster", "Krutex",
"Kruuse", "Kumfi", "Kurgo", "La Chanvriere", "Laucke", "Leather Therapy", "Lee's",
"Lenny's Kitchen", "Leovet", "Lickables", "Lickimat", "Life Data Labs", "LifeWise
Pet Nutrition", "Likit", "Little Giant", "Livamol", "Livingstone", "Love'em",
"Lulu's Kitchen", "LupinePet", "Lyppard", "Magictails", "Mane 'n Tail", "Marina",
"Masterpet", "Mavlab", "Max & Molly", "Meals For Mutts", "Medibit", "Melanie
Newman", "Melcare", "Merial", "Midwest", "Milbemax", "Millers Forge", "Mimi &
Munch", "Minrosa", "Miscellaneous", "Mog & Bone", "Mr Sticky", "Mr. Fothergill's",
"MSD", "Multipet", "Mustad", "My Family", "MyBestMate", "MyEcoBag", "myPet
Magazine", "Natural Animal Solutions", "Natural Health NZ", "Nature's Botanical",
"Nature's Miracle", "Neove Pharma Australia", "Nerf", "NexGard", "NexGard Spectra",
"Nina Ottosson", "NJ Phillips", "NORTHMATE", "NRG", "Nuheart", "Nutrafin",
"Nutramax", "Nutrimol", "Nylabone", "NYOS Aquatics", "O'TOM", "Oakwood", "Ocean
Nutrition", "ODR", "Odzon", "Oh Crap", "Olsson's", "Oralx", "OraVet", "Organica",
"Outward Hound", "Oxbow", "Oz Pet", "OzHemp", "ParaGard", "Passwell", "PatPet",
"PAW", "Paw Ready", "Pawsome Organics", "Peckish", "Penn-Plax Reptology", "Pet
DreamHouse", "Pet Drs", "Pet Relax", "Pet Teezer", "pet+me", "Pet-Rite", "PET-TEK",
"PetArk", "Petkin", "PetQwerks", "Petrageous", "PetSafe", "Petstages",
"Pharmachem", "Phud's", "Pioneer", "Plutos", "Polly's", "Polyp Lab", "Pomms",
"POOWEE!", "Poseidon Animal Health", "Pottie's", "Prestige Pet", "Prestige Snuggle
Pals", "Prime Pantry", "Prime100", "Pritchard", "Pro-Dosa", "PRO-TRAINER",
"Profender", "ProN8ure", "Proteq", "Protexin", "Proudi", "PROVET", "Provex",
"ProviCo", "Proviro Group", "Prozym", "PuppyPlay", "Pure Life", "Quirky Kitty",
"Racing Blue", "Ranvet", "Raw Pawz", "Red Healer", "Reptile Publications",
"Revolution", "Ridley", "River Systems", "Roche", "Rocky Point", "Rogz", "Rose-Hip
Vital", "Rover Pet Products", "Royal Canin", "Royal Show", "Ruddock's", "Ruffey",
"Ruffwear", "Rufus & Coco", "RWR", "Saddlery Trading Company", "Sandlube", "Sashas
Blend", "SavourLife", "ScoopFree", "Scream", "SeaFlex", "Sentinel Spectrum",
"Serenity", "Seresto", "Shark Net", "Shear Magic", "Shoof", "ShowMaster", "Silvan",
"Simcro", "Simparica", "SmartCat", "Smith & Nephew", "Snax", "Snooza",
"SnuggleSafe", "SodaPup", "Solo", "Sporn", "Spotnik", "Springer-Magrath", "Spunky
Pup", "Stance Equitec", "STARMARK", "StaySound", "Staywell", "STC", "Sticky Paws",
"Stockbrands", "Stockman & Paddock", "Super Bird Creations", "SuperBird", "Superior
Pet Goods", "Superior Shavings", "Sure Petcare", "SUREFEED", "SUREFLAP", "Swann-
Morton", "Sykes", "Tankmaid", "Tasty Bone", "Tensoplast", "Terumo", "Tetra", "The
Art of Whole Food", "The Canny Company", "The NZ Natural Pet Food Company", "The
Pet Loo", "Thundershirt", "Thunderworks", "Tiger Brands", "Triplepet",
"TropiClean", "Trouble & Trix", "Trough Rocks", "Troy", "TuffRock", "Tuffy", "Two
Little Fishies", "Ultra", "Urine Off", "USA-K9", "Value Plus", "Valuheart",
"Varco", "Veggiedent", "Veredus", "Vet's All Natural", "Vet's Best", "vet-n-pet
DIRECT", "Vetafarm", "Vetforce", "Vetgold", "Vetnex Pet Care", "Vetopop",
"Vetoquinol", "VetPen", "VetRx", "Vetsense", "Virbac", "Virkon", "Vision", "Vital
SupaSnax", "Wagg & Purr", "Wags & Wiggles", "Wahl", "Water & Woods", "Weaver",
"Wellness Pet Company", "West Paw", "Whimzees", "Whiskers & Wiggles", "Wild
Hibiscus Flower Company", "Wombaroo", "Worlds Best Hoof Oil", "WSD", "Y-Tex",
"Yeowww!", "Yours Droolly", "ZEEZ", "zenpet", "Zippy Claws", "Zippy Paws", "ZIWI",
"Zoetis", "Zoo Med", "ZooPets", "Zychem Technologies"
]
# Track processed products to avoid duplicates
processed_products = set()
def __init__(self, *args, **kwargs):
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Enable headless mode for AWS
EC2
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0;
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
# Initialize the webdriver
self.service = Service(r"")
self.driver = None
self.options = chrome_options
# Add counter for products
self.product_count = 0
# EC2 optimization parameters - increased for EC2 reliability
self.page_load_wait = 15 # Increased wait time for EC2
self.request_delay = 1.0 # Increased delay between requests for EC2
# Maximum wait times for explicit waits
self.wait_time = 60 # Increased for EC2
self.short_wait_time = 15
super(VetnpetdirectSpiderSpider, self).__init__(*args, **kwargs)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(VetnpetdirectSpiderSpider, cls).from_crawler(crawler, *args,
**kwargs)
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
return spider
def spider_opened(self, spider):
self.driver = webdriver.Chrome(service=self.service, options=self.options)
def spider_closed(self, spider):
if self.driver:
self.driver.quit()
def wait_for_element(self, by, selector, timeout=None,
condition=EC.presence_of_element_located):
"""Dynamic wait helper with automatic retry for stale elements"""
if timeout is None:
timeout = self.wait_time
wait = WebDriverWait(self.driver, timeout, poll_frequency=0.5)
max_retries = 3
for attempt in range(max_retries):
try:
element = wait.until(condition((by, selector)))
return element
except StaleElementReferenceException:
if attempt < max_retries - 1:
self.logger.warning(f"Stale element for {selector}, retrying
({attempt+1}/{max_retries})")
time.sleep(1)
continue
else:
raise
def wait_for_elements(self, by, selector, timeout=None,
condition=EC.presence_of_all_elements_located):
"""Dynamic wait helper for multiple elements"""
if timeout is None:
timeout = self.wait_time
wait = WebDriverWait(self.driver, timeout, poll_frequency=0.5)
max_retries = 3
for attempt in range(max_retries):
try:
elements = wait.until(condition((by, selector)))
return elements
except StaleElementReferenceException:
if attempt < max_retries - 1:
self.logger.warning(f"Stale elements for {selector}, retrying
({attempt+1}/{max_retries})")
time.sleep(1)
continue
else:
raise
def wait_for_page_load(self):
"""Enhanced page load wait with dynamic detection"""
# First wait for the document to be ready
try:
WebDriverWait(self.driver, self.wait_time).until(
lambda d: d.execute_script("return document.readyState") ==
"complete"
)
# Then wait for any jQuery or AJAX to complete
WebDriverWait(self.driver, self.short_wait_time).until(
lambda d: d.execute_script("return jQuery.active == 0") or True
)
# Sometimes need a small additional wait for JavaScript rendering
time.sleep(1.5)
return True
except Exception as e:
self.logger.warning(f"Wait for page load exception: {e}")
# Fallback to simple sleep if script execution fails
time.sleep(self.page_load_wait)
return False
def start_requests(self):
"""Initialize requests for all category URLs"""
for index, (category_name, category_path, pet_type) in enumerate(self.category_urls):
url = f"https://www.vetnpetdirect.com.au/{category_path}"
self.logger.info(f"Scheduling category URL
{index+1}/{len(self.category_urls)}: {category_name} for {pet_type}")
yield scrapy.Request(
url=url,
callback=self.parse,
meta={
'category_name': category_name,
'pet_type': pet_type,
'dont_filter': True
}
)
def parse(self, response):
# Get the category name and pet type from meta
category_name = response.meta.get('category_name')
pet_type = response.meta.get('pet_type')
self.logger.info(f"Parsing category: {category_name} for pet type:
{pet_type}")
# Initialize the driver if it's not already initialized
if not hasattr(self, 'driver') or self.driver is None:
self.driver = webdriver.Chrome(service=self.service,
options=self.options)
# Open the URL with Selenium
try:
self.driver.get(response.url)
self.logger.info(f"Loading page: {response.url}")
# Enhanced page load wait
self.wait_for_page_load()
# Wait for the body to be fully loaded with explicit wait
self.wait_for_element(By.TAG_NAME, 'body')
# First check what product elements are present on the page
# Let's try different selectors
selectors_to_try = [
'li.snize-product',
'div.product-item',
'div.grid__item',
'div.product-card',
'div.product'
]
products = []
used_selector = None
# Try each selector with explicit wait
for selector in selectors_to_try:
try:
# Use our enhanced wait for elements helper
products = self.wait_for_elements(By.CSS_SELECTOR, selector,
timeout=15)
if products:
used_selector = selector
self.logger.info(f"Found products using selector:
{selector}")
break
except TimeoutException:
continue
if not products:
# No products found on this page
self.logger.warning("No products found on this page.")
self.logger.info(f"Current URL: {self.driver.current_url}")
# Save page source for debugging
with open(f"page_source_debug_{category_name}.html", "w",
encoding="utf-8") as f:
f.write(self.driver.page_source)
return
page_product_count = len(products)
self.product_count += page_product_count
self.logger.info(f"Found {page_product_count} products. Total so far:
{self.product_count}")
# Store product data first to avoid stale element issues
product_data_list = []
# First collect basic data from the list page
for index, product in enumerate(products):
try:
item = {}
# Extract link first as we'll need it to visit the product page
link_url = None
link_selectors = ['a.snize-view-link', 'a', '.product-card a']
for selector in link_selectors:
try:
link_element = product.find_element(By.CSS_SELECTOR,
selector)
link_url = link_element.get_attribute('href')
if link_url:
item['Title_Link'] = link_url
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Skip if we've already processed this product URL
if link_url in self.processed_products:
self.logger.info(f"Skipping already processed product URL:
{link_url}")
continue
# Add to processed set
if link_url:
self.processed_products.add(link_url)
# Only collect other data if we found a link
if link_url:
# Try different selectors for title
title_selectors = ['span.snize-title', 'h2', '.product-card__title', '.product-title']
for selector in title_selectors:
try:
title_element = product.find_element(By.CSS_SELECTOR, selector)
title = title_element.text
if title:
item['Product_Title'] = title
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Try different selectors for image
img_selectors = ['span.snize-thumbnail img', 'img',
'.product-image img']
for selector in img_selectors:
try:
img_element = product.find_element(By.CSS_SELECTOR,
selector)
image_url = img_element.get_attribute('src')
if image_url:
item['Image_URL'] = image_url
break
except (NoSuchElementException,
StaleElementReferenceException):
continue
# Check if product is out of stock
try:
out_of_stock = product.find_element(By.CSS_SELECTOR,
'.snize-out-of-stock')
if out_of_stock:
item['Stock_Status'] = "Out Of Stock"
else:
item['Stock_Status'] = "In Stock"
except (NoSuchElementException,
StaleElementReferenceException):
item['Stock_Status'] = "In Stock" # Default to in
stock if no indicator found
# Determine brand name from product title
if 'Product_Title' in item:
brand_name = "N/A"
for brand in self.brand_names:
if brand.lower() in item['Product_Title'].lower():
brand_name = brand
break
item['Brand_Name'] = brand_name
# Add category information
item['Product_Category'] = category_name
item['Pet_Type'] = pet_type
# Add default fields
item['Description'] = "Not Available"
item['Lifestage'] = "Not Available"
# Add product to our list to process
product_data_list.append(item)
except Exception as e:
self.logger.error(f"Error extracting basic product data: {e}")
# Now visit each product page to get prices and check for variations
for item in product_data_list:
if 'Title_Link' in item:
# Follow the product link to get price and check for variations
yield scrapy.Request(
item['Title_Link'],
callback=self.parse_product,
meta={
'item': item,
'category_name': category_name,
'pet_type': pet_type
}
)
# Add small delay between product requests to avoid overwhelming EC2
time.sleep(self.request_delay)
except Exception as e:
self.logger.error(f"Error in parsing page: {e}")
self.logger.error(traceback.format_exc())
def parse_product(self, response):
# Get the item that was collected in the first parse
base_item = response.meta['item']
category_name = response.meta['category_name']
self.logger.info(f"Parsing product: {base_item['Product_Title']}")
try:
# Process this product (will handle variations if present)
product_url = response.url
items = list(self.process_product_page(product_url, base_item))
# Yield all items
for item in items:
yield item
except Exception as e:
self.logger.error(f"Error processing product detail page: {e}")
self.logger.error(traceback.format_exc())
# Still yield the base item as fallback
yield base_item
def process_product_page(self, url, base_item):
"""Process a product page, handling variations if present"""
max_retries = 3
for attempt in range(max_retries):
try:
# Load the product page
self.driver.get(url)
self.logger.info(f"Loading product page: {url}")
# Enhanced page load wait
self.wait_for_page_load()
# Wait for product detail elements to be present
self.wait_for_element(By.CSS_SELECTOR, 'body', self.wait_time)
# Try to get description if available
try:
description_selectors = [
'.product-description',
'.product__description',
'.rte',
'[itemprop="description"]'
]
for selector in description_selectors:
try:
# Try to find with our enhanced wait first
try:
desc_element = self.wait_for_element(
By.CSS_SELECTOR,
selector,
timeout=10
)
description = desc_element.text.strip() if desc_element else ""
except TimeoutException:
# Fallback to direct find_elements
desc_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
description = desc_elements[0].text.strip() if desc_elements else ""
if description:
base_item['Description'] = description
break
except Exception:
continue
except Exception as e:
self.logger.warning(f"Error extracting description: {e}")
# First check if there are variations
variation_selects = []
try:
# Look for variation selectors - try different types
selectors_to_try = [
'select.single-option-selector',
'.selector-wrapper select',
'select[data-variant-option]'
]
for selector in selectors_to_try:
try:
selects = self.driver.find_elements(By.CSS_SELECTOR,
selector)
if selects:
variation_selects = selects
self.logger.info(f"Found {len(selects)} variation
selectors using {selector}")
break
except Exception:
continue
if not variation_selects:
# No variations, just get the regular price
price_selectors = [
'span.price span.money',
'span.price--sale span.money',
'.product-single__price',
'.price',
'[itemprop="price"]'
]
price = None
for selector in price_selectors:
try:
# Try with explicit wait first
try:
price_element = self.wait_for_element(
By.CSS_SELECTOR,
selector,
timeout=10,
condition=EC.visibility_of_element_located
)
price = price_element.text.strip() if price_element else ""
except TimeoutException:
# Fallback to direct find_elements
price_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
price = price_elements[0].text.strip() if price_elements else ""
if price:
# Clean price text (remove dollar sign)
price = price.replace('$', '').strip()
base_item['Regular_Price'] = price
break
except Exception as e:
self.logger.warning(f"Error extracting price with
selector {selector}: {e}")
if not price:
base_item['Regular_Price'] = "Price not available"
self.logger.info(f"No variations found, yielding single
product with price {price}")
yield base_item
else:
# Process all variations
yield from self.process_variations(variation_selects,
base_item)
except Exception as e:
self.logger.error(f"Error in product page processing: {e}")
# On error, yield base item as fallback
yield base_item
# Successfully processed this item, break retry loop
break
except Exception as e:
if attempt < max_retries - 1:
self.logger.warning(f"Error on attempt {attempt+1}, retrying:
{base_item.get('Product_Title')} - {str(e)}")
time.sleep(3) # Wait before retry
continue
else:
self.logger.error(f"Failed to process product after
{max_retries} attempts: {e}")
# Return the base item if processing fails
yield base_item
def process_variations(self, variation_selects, base_item):
"""Process all variations for a product"""
# Initialize variation processing
all_variations = []
# Process the first variation dropdown
if variation_selects:
try:
select_element = variation_selects[0]
select = Select(select_element)
# Get all options in this dropdown
options = select.options
for option in options:
try:
# Get the option text/value
option_value = option.text.strip()
if not option_value or option_value.lower() in ["choose an
option", "select"]:
continue
# Select this option with retry mechanism
max_retries = 3
for retry in range(max_retries):
try:
select.select_by_visible_text(option_value)
# Wait for price to update - use dynamic waiting
self.wait_for_page_to_stabilize(timeout=8)
break
except StaleElementReferenceException:
if retry < max_retries - 1:
# Get fresh reference to select element and retry
time.sleep(1)
select_element = self.wait_for_element(By.CSS_SELECTOR, 'select.single-option-selector, .selector-wrapper select, select[data-variant-option]')
select = Select(select_element)
else:
raise
# Create a new item for this variation
variation_item = base_item.copy()
# Update title to include variation
variation_item['Product_Title'] = f"{base_item['Product_Title']} - {option_value}"
# Check stock status after selecting this variation
try:
out_of_stock_elements = self.driver.find_elements(By.CSS_SELECTOR, '.snize-out-of-stock, .sold-out')
if out_of_stock_elements:
variation_item['Stock_Status'] = "Out Of Stock"
else:
variation_item['Stock_Status'] = "In Stock"
except Exception:
pass # Keep existing stock status if check fails
# Get the updated price with extended selectors
price_selectors = [
'span.price span.money',
'span.price--sale span.money',
'.product-single__price',
'.price',
'[itemprop="price"]'
]
price = None
for selector in price_selectors:
try:
# Try with explicit wait first for better reliability
try:
price_element = self.wait_for_element(
By.CSS_SELECTOR,
selector,
timeout=8,
condition=EC.visibility_of_element_located
)
if price_element:
price = price_element.text.strip()
except TimeoutException:
# Fallback to direct find_elements
price_elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
if price_elements:
price = price_elements[0].text.strip()
if price:
# Clean price text (remove dollar sign)
price = price.replace('$', '').strip()
variation_item['Regular_Price'] = price
break
except Exception as e:
self.logger.warning(f"Error extracting price for
variation {option_value} with selector {selector}: {e}")
if not price:
variation_item['Regular_Price'] = "Price not available"
self.logger.info(f"Processed variation: {option_value} with
price {price}")
# Add to our list and yield
all_variations.append(variation_item)
yield variation_item
except Exception as e:
self.logger.error(f"Error processing variation option
{option_value if 'option_value' in locals() else 'unknown'}: {e}")
except Exception as e:
self.logger.error(f"Error processing variations: {e}")
# If we encounter an error, yield the base item as fallback
yield base_item
# If no variations were successfully processed, yield the base item
if not all_variations:
yield base_item
def wait_for_page_to_stabilize(self, timeout=8):
"""Wait for page to finish any animations or AJAX calls"""
try:
# First check document ready state
WebDriverWait(self.driver, timeout).until(
lambda d: d.execute_script("return document.readyState") ==
"complete"
)
# Then wait for any pending AJAX requests (jQuery)
try:
WebDriverWait(self.driver, timeout/2).until(
lambda d: d.execute_script("return jQuery.active == 0") or True
)
except:
pass # jQuery might not be present
# Check for HTML stability (DOM not changing)
old_page = self.driver.find_element(By.TAG_NAME,
'html').get_attribute('outerHTML')
stability_check_time = time.time() + (timeout/2)
while time.time() < stability_check_time:
time.sleep(0.5)
new_page = self.driver.find_element(By.TAG_NAME,
'html').get_attribute('outerHTML')
if new_page == old_page:
return True
old_page = new_page
return True
except Exception as e:
self.logger.error(f"Error in wait_for_page_to_stabilize: {e}")
# Fallback to simple sleep
time.sleep(3)
return False
def closed(self, reason):
# Close the browser when spider is closed
if self.driver:
self.driver.quit()
self.logger.info(f"Spider closed. Total products scraped:
{self.product_count}")