Pythonbot / app.py
Bloodlyghoul's picture
Rename Web Scraping with BeautifulSoup.py to app.py
de22914 verified
#!/usr/bin/env python
# coding: utf-8
# Requirements:
#   pip3 install requests
#   pip3 install bs4
#   pip3 install selenium
# With the help of chromedriver, the same scraping can also be run in a real browser.
# ## Basic fundamentals of web scraping
# Import these modules: bs4 (BeautifulSoup) makes it easy to select HTML tags.
from bs4 import BeautifulSoup
# The requests module fetches pages over HTTP. Some people use urllib, but requests is easier to use.
import requests
from selenium import webdriver
# Blog to scrape; change BASE_URL to point the script at a different site.
BASE_URL = "https://getpython.wordpress.com/"
url = BASE_URL
# Fetch the page. The timeout keeps the script from hanging forever on an
# unresponsive server, and raise_for_status() stops early on an HTTP error
# instead of going on to parse an error page.
source = requests.get(url, timeout=10)
source.raise_for_status()
def get_chrome_web_driver(options):
    """Start a Chrome browser session using the local chromedriver binary.

    Args:
        options: A ``webdriver.ChromeOptions`` instance holding the browser
            flags to apply.

    Returns:
        The ``webdriver.Chrome`` instance created with those options.
    """
    # Imported locally so this function brings its own dependency into scope.
    from selenium.webdriver.chrome.service import Service

    # Recent Selenium 4 releases no longer accept a positional driver path or
    # the ``chrome_options`` keyword: the driver path is passed through a
    # ``Service`` object and the flags through ``options``.
    return webdriver.Chrome(service=Service("./chromedriver"), options=options)
def get_web_driver_options():
    """Return a new ``webdriver.ChromeOptions`` object for collecting flags."""
    return webdriver.ChromeOptions()
def set_ignore_certificate_error(options):
    """Add the ``--ignore-certificate-errors`` flag to *options* in place.

    Args:
        options: Object exposing ``add_argument(str)``, e.g. the value
            returned by ``webdriver.ChromeOptions()``.
    """
    options.add_argument('--ignore-certificate-errors')
def set_browser_as_incognito(options):
    """Add the ``--incognito`` flag to *options* in place.

    Args:
        options: Object exposing ``add_argument(str)``, e.g. the value
            returned by ``webdriver.ChromeOptions()``.
    """
    options.add_argument('--incognito')
# Create the soup: parse the HTML text of the response into a searchable tree.
# The parser is named explicitly ("html.parser") so the result does not depend
# on which optional parsers happen to be installed.
soup = BeautifulSoup(source.text, 'html.parser')

# find() returns only the first matching element, or None when nothing matches.
title = soup.find('title')  # put the HTML tag you want to find in the parentheses
print("this is with html tags :", title)

qwery = soup.find('h1')  # first <h1> tag on the page
# Use .text to extract only the text, without any HTML tags.
if qwery is not None:
    print("this is without html tags:", qwery.text)

links = soup.find('a')  # first anchor (<a>) tag on the page
print(links)

# ## Extract data from inner HTML
# Read the href attribute of the first anchor; .get() yields None instead of
# raising KeyError when the attribute is absent.
if links is not None:
    print(links.get('href'))

# Another way: walk every anchor tag that carries an href attribute.
for a in soup.find_all('a', href=True):
    print(a['href'])

# Print the visible text of every anchor. Iterating over `links` itself would
# walk the child nodes of the first anchor only, not the links of the page.
for i in soup.find_all('a'):
    print(i.text)

# Similarly, read the class attribute of the first anchor (a list, or None).
if links is not None:
    print(links.get('class'))

# ## find_all operation in bs4
# find_all() fetches every matching tag at once.
many_link = soup.find_all('a')  # all anchor tags of the page
total_links = len(many_link)
print("total links in my website :", total_links)
print()
for i in many_link[:6]:  # slicing keeps only the first 6 links
    print(i)

# The second link lives at index 1, so it exists only with at least two links.
if total_links > 1:
    second_link = many_link[1]
    print(second_link)
    print()
    print("href is :", second_link.get('href'))  # only the href of the anchor

    # Select the div nested inside the second link; it may have inner elements,
    # and it is None when the anchor contains no div.
    nested_div = second_link.find('div')
    print(nested_div)
    print()

    if nested_div is not None and nested_div.get('class'):
        # The class attribute comes back as a list of class names.
        z = nested_div['class']
        print(z)
        print(type(z))
        print()
        # " ".join() turns the list of class names into a single string.
        print("class name of div is :", " ".join(z))
# ## Scrape data from Wikipedia
# The timeout avoids hanging on a stalled connection; raise_for_status() stops
# the script on an HTTP error instead of parsing an error page.
wiki = requests.get("https://en.wikipedia.org/wiki/World_War_II", timeout=10)
wiki.raise_for_status()
# Name the parser explicitly so the parse does not vary with installed parsers.
soup = BeautifulSoup(wiki.text, 'html.parser')
print(soup.find('title'))

# ### Find HTML tags by class
ww2_contents = soup.find_all("div", class_='toc')
for i in ww2_contents:
    print(i.text)

overview = soup.find_all('table', class_='infobox vevent')
for z in overview:
    print(z.text)

# Collect every <img> tag. A bare `images` expression only displays a value in
# a notebook, so the list is printed explicitly here.
images = soup.find_all('img')
print(images)