Commit f8f1c94

Beautiful Soup Web Scraping
1 parent 03ecf27 commit f8f1c94

File tree: 2 files changed, +98 −1 lines

README.md

Lines changed: 2 additions & 1 deletion
@@ -25,4 +25,5 @@ repository breaks up the code into individual .py files, and the full tutorial f
 18. [Urllib Examples](https://vegibit.com/python-urllib/)
 19. [Python Requests](https://vegibit.com/python-requests-library/)
 20. [Json In Python](https://vegibit.com/python-json-tutorial/)
-21. [XML Parsing](https://vegibit.com/python-xml-parsing/)
+21. [XML Parsing](https://vegibit.com/python-xml-parsing/)
+22. [Beautiful Soup Web Scraping](https://vegibit.com/python-web-scraping-with-beautiful-soup/)

scraper.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
# Building The Scraping Script
import requests
from bs4 import BeautifulSoup

# Fetch the page and parse the returned markup with the lxml parser
url = 'http://quotes.toscrape.com/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

print(soup)
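
# Aside (added sketch, not part of the original commit): requests does not
# raise on HTTP errors by default, so a common hardening step is to call
# raise_for_status() before handing the body to Beautiful Soup.
response = requests.get(url)
response.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx reply
soup = BeautifulSoup(response.text, 'lxml')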

# Parsing Html Markup
url = 'http://quotes.toscrape.com/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
quotes = soup.find_all('span', class_='text')

print(quotes)
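
# Aside (added sketch, not part of the original commit): soup.select() takes
# a CSS selector and returns the same tags as the find_all() call above;
# which to use is a style choice.
quotes = soup.select('span.text')
print(quotes)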

# Beautiful Soup text property
import requests
from bs4 import BeautifulSoup

url = 'http://quotes.toscrape.com/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
quotes = soup.find_all('span', class_='text')

for quote in quotes:
    print(quote.text)
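
# Aside (added sketch, not part of the original commit): .text is shorthand
# for get_text(), which also accepts a strip argument for trimming the
# surrounding whitespace.
for quote in quotes:
    print(quote.get_text(strip=True))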

# More Granular
import requests
from bs4 import BeautifulSoup

url = 'http://quotes.toscrape.com/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
quotes = soup.find_all('span', class_='text')
authors = soup.find_all('small', class_='author')

for i in range(0, len(quotes)):
    print(quotes[i].text)
    print('--' + authors[i].text)
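
# Aside (added sketch, not part of the original commit): zip() pairs the two
# result lists directly, avoiding the manual index bookkeeping and stopping
# safely at the shorter list.
for quote, author in zip(quotes, authors):
    print(quote.text)
    print('--' + author.text)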

# Using an inner loop
import requests
from bs4 import BeautifulSoup

url = 'http://quotes.toscrape.com/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
quotes = soup.find_all('span', class_='text')
authors = soup.find_all('small', class_='author')
tags = soup.find_all('div', class_='tags')

for i in range(0, len(quotes)):
    print(quotes[i].text)
    print('--' + authors[i].text)
    tagsforquote = tags[i].find_all('a', class_='tag')
    for tagforquote in tagsforquote:
        print(tagforquote.text)
    print('\n')
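
# Aside (added sketch, assuming each quote on quotes.toscrape.com sits in a
# div with class "quote"): selecting the wrapper keeps each quote, its
# author, and its tags together instead of walking three parallel lists.
for block in soup.select('div.quote'):
    print(block.find('span', class_='text').text)
    print('--' + block.find('small', class_='author').text)
    for tag in block.find_all('a', class_='tag'):
        print(tag.text)
    print('\n')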

# Web Scraping More Than One Page
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://scrapingclub.com/exercise/list_basic/?page=1'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
items = soup.find_all('div', class_='col-lg-4 col-md-6 mb-4')
count = 1
for i in items:
    itemName = i.find('h4', class_='card-title').text.strip()
    itemPrice = i.find('h5').text
    print(f'{count}: {itemPrice} for the {itemName}')
    count += 1

# Collect the numbered pagination links, skipping non-numeric ones like Next
pages = soup.find('ul', class_='pagination')
urls = []
links = pages.find_all('a', class_='page-link')
for link in links:
    pageNum = int(link.text) if link.text.isdigit() else None
    if pageNum is not None:
        hrefval = link.get('href')
        urls.append(hrefval)

count = 1
for nextPage in urls:
    # urljoin resolves relative hrefs like '?page=2' against the base url;
    # plain string concatenation would produce a broken '?page=1?page=2'
    newUrl = urljoin(url, nextPage)
    response = requests.get(newUrl)
    soup = BeautifulSoup(response.text, 'lxml')
    items = soup.find_all('div', class_='col-lg-4 col-md-6 mb-4')
    for i in items:
        itemName = i.find('h4', class_='card-title').text.strip()
        itemPrice = i.find('h5').text
        print(f'{count}: {itemPrice} for the {itemName}')
        count += 1
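
# Aside (added sketch, not part of the original commit): when looping over
# several pages on one host, a requests.Session() reuses the underlying
# connection between requests instead of reconnecting each time.
session = requests.Session()
for nextPage in urls:
    pageSoup = BeautifulSoup(session.get(urljoin(url, nextPage)).text, 'lxml')
    pageItems = pageSoup.find_all('div', class_='col-lg-4 col-md-6 mb-4')
    print(nextPage, '->', len(pageItems), 'items')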
