# 06/05/2024, 18:31 Amazon_Web_Scrapper.ipynb - Colab
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
# Function to extract Product Title
def get_title(soup):
    """Return the product title found on a parsed product page.

    Looks up the ``<span id="productTitle">`` element through ``soup.find``
    and returns its text with leading/trailing whitespace removed.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped title string, or ``""`` when the element is missing.
    """
    try:
        # find() gives None when the span is absent; the attribute access
        # below then raises AttributeError and we fall back to "".
        title_tag = soup.find("span", attrs={"id": "productTitle"})
        return title_tag.text.strip()
    except AttributeError:
        return ""
# Function to extract Product Price
def get_price(soup):
    """Return the product price string from a parsed product page.

    Tries the regular-price span first and the deal-price span second; the
    first one that yields a string wins.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped price text, or ``""`` when neither span yields a string.
    """
    # NOTE(review): the run output captured in this notebook shows an empty
    # price column, so these ids may not match the live page markup any
    # more -- confirm the selectors against a current product page.
    for price_id in ("priceblock_ourprice", "priceblock_dealprice"):
        try:
            return soup.find("span", attrs={"id": price_id}).string.strip()
        except AttributeError:
            # Span missing (find() returned None) or it has no single string
            # child (.string is None): try the next candidate id.
            continue
    return ""
# Function to extract Product Rating
def get_rating(soup):
    """Return the product rating text from a parsed product page.

    Tries the star-icon element first and falls back to the generic
    ``a-icon-alt`` span.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped rating text, or ``""`` when no candidate yields a string.
    """
    # NOTE(review): the first selector is hard-coded to the 4.5-star class,
    # so it only matches that rating; every other rating depends on the
    # second selector -- confirm this is intended.
    candidates = (
        ("i", "a-icon a-icon-star a-star-4-5"),
        ("span", "a-icon-alt"),
    )
    for tag_name, css_class in candidates:
        try:
            return soup.find(tag_name, attrs={"class": css_class}).string.strip()
        except AttributeError:
            # Element missing or without a single string child: try the next.
            continue
    return ""
# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the customer review count text from a parsed product page.

    Reads the ``<span id="acrCustomerReviewText">`` element.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped text of the span, or ``""`` when the span is missing or
        has no single string child.
    """
    try:
        review_tag = soup.find("span", attrs={"id": "acrCustomerReviewText"})
        # .string is None for a tag without exactly one string child; both
        # that and a missing tag surface here as AttributeError.
        return review_tag.string.strip()
    except AttributeError:
        return ""
# Function to extract Availability Status
def get_availability(soup):
    """Return the availability text from a parsed product page.

    Reads the first ``<span>`` inside ``<div id="availability">``.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped availability text. ``"Not Available"`` is returned when
        the div or its span is missing, or the span has no single string
        child -- so that value also covers "could not be parsed".
    """
    try:
        availability_box = soup.find("div", attrs={"id": "availability"})
        return availability_box.find("span").string.strip()
    except AttributeError:
        return "Not Available"
if __name__ == '__main__':
    # Scrape the search results page, visit every product it links to,
    # collect the product fields and write them to amazon_data.csv.

    # add your user agent
    HEADERS = ({'User-Agent':'', 'Accept-Language': 'en-US, en;q=0.5'})

    # Seconds to wait for a response; without a timeout requests.get can
    # block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    # The webpage URL
    URL = "https://www.amazon.com/s?k=playstation+4&ref=nb_sb_noss_2"

    # HTTP Request
    webpage = requests.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)

    # Soup Object containing all data
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Fetch links as List of Tag Objects
    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

    # Store the links; anchors without an href are skipped because
    # concatenating None to the base URL below would raise TypeError.
    links_list = [href for href in (link.get('href') for link in links) if href]

    d = {"title":[], "price":[], "rating":[], "reviews":[],"availability":[]}

    # Loop for extracting product details from each link
    for link in links_list:
        try:
            new_webpage = requests.get(
                "https://www.amazon.com" + link,
                headers=HEADERS,
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            # Best effort: one unreachable product page should not discard
            # the rows already collected.
            print(f"Skipping {link}: {exc}")
            continue

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Function calls to collect all necessary product information
        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['reviews'].append(get_review_count(new_soup))
        d['availability'].append(get_availability(new_soup))

    amazon_df = pd.DataFrame.from_dict(d)

    # Rows without a title are treated as failed scrapes and dropped.
    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: that chained in-place form does not reliably update
    # the frame under pandas copy-on-write.
    amazon_df['title'] = amazon_df['title'].replace('', np.nan)
    amazon_df = amazon_df.dropna(subset=['title'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)
amazon_df
# Output (columns: title, price, rating, reviews, availability; the price
# column came back empty for every row in this run):
# 0 PlayStation 4 Slim 1TB Console - Black (Renewed) 4.5 out of 5 stars 1,561 ratings In Stock
# 1 PlayStation 4 500GB Console (Renewed) 4.5 out of 5 stars 1,103 ratings In Stock
# 2 Sony Playstation PS4 1TB Black Console 4.5 out of 5 stars 1,452 ratings In Stock
# 3 Sony PlayStation 4 Pro w/ Accessories, 1TB HDD... 4.5 out of 5 stars 896 ratings In Stock
# 4 Sony PlayStation 4 Slim Limited Edition 1TB Ga... 4.5 out of 5 stars 1,410 ratings In Stock
# 5 Sony PlayStation 4 Console, Renewed, Black 4.5 out of 5 stars 214 ratings Only 3 left in stock - order soon.
# 6 PlayStation 4 Console - 1TB Slim Edition (Rene... 4.5 out of 5 stars 69 ratings Only 1 left in stock - order soon.
# 7 Playstation Sony 4, 500GB Slim System [CUH-221... 4.5 out of 5 stars 343 ratings Only 4 left in stock - order soon.
# 8 Flagship Newest Play Station 4 1TB HDD Only on... 4.5 out of 5 stars 204 ratings Only 12 left in stock - order soon.
# 9 Playstation SONY 4, 500GB Slim System [CUH-221... 4.5 out of 5 stars 419 ratings Only 12 left in stock - order soon.
# 10 PlayStation®5 Digital Edition (slim) 4.5 out of 5 stars 4,881 ratings In Stock
# 11 Sony PlayStation 4 Console 1TB - Black (Renewed) 4.5 out of 5 stars In Stock
# 12 Sony PlayStation 4 500GB Console Only (Certifi... 4.5 out of 5 stars 7 ratings In Stock
# 13 Sony - PlayStation 4 Pro Console (3002470) Jet... 4.4 out of 5 stars 253 ratings Not Available
# 14 Sony PlayStation 4 500GB Premium Bundle (Renewed) 1.5 out of 5 stars 2 ratings Not Available
# 16 Sony PlayStation 4 Slim Limited Edition 1TB Ga... 4.2 out of 5 stars 1,406 ratings Not Available
# 18 PlayStation 2 Slim Console PS2 (Renewed) 4.5 out of 5 stars 1,225 ratings In Stock
# 20 Sony Playstation 3 160GB System (Renewed) 4.1 out of 5 stars 506 ratings Not Available
# 21 Playstation SONY 4, 500GB Slim System [CUH-221... 4.5 out of 5 stars 418 ratings Not Available
# 23 Sony - PlayStation 4 Pro Console (3002470) Jet... 4.4 out of 5 stars 253 ratings Not Available
# Next steps: Generate code with amazon_df
# toggle_off View recommended plots
# https://colab.research.google.com/drive/1RJi-EWT_Ovqs3oMD50FO3lZA5ly7n2xp?authuser=1#scrollTo=cnrCkzYQ_VMj&printMode=true 2/3
# 06/05/2024, 18:31 Amazon_Web_Scrapper.ipynb - Colab
# Write the scraped frame to a second CSV (default to_csv options, so the
# index column is included), then hand that file to the Colab download helper.
export_path = "amazon_web_scrapped_data.csv"
amazon_df.to_csv(export_path)

# Imported only after the CSV is written, matching the original statement
# order: a failed import (outside Colab) still leaves the file on disk.
from google.colab import files
files.download(export_path)
# Start coding or generate with AI.
# https://colab.research.google.com/drive/1RJi-EWT_Ovqs3oMD50FO3lZA5ly7n2xp?authuser=1#scrollTo=cnrCkzYQ_VMj&printMode=true 3/3