# import required modules
import json
import requests
from datetime import datetime
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from beautifultable import BeautifulTable
def load_json(database_json_file="scraped_data.json"):
    """
    Load JSON data from the scraped_data.json file if it exists,
    otherwise return an empty dict.
    """
    try:
        with open(database_json_file, "r") as read_it:
            all_data_base = json.loads(read_it.read())
        return all_data_base
    except (FileNotFoundError, json.JSONDecodeError):
        return dict()
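# A minimal usage sketch (assumes scraped_data.json may not exist yet):
#     db = load_json()    # returns the saved dict, or {} on the first run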
def save_scraped_data_in_json(data, database_json_file="scraped_data.json"):
    """
    Save the scraped data in JSON format to scraped_data.json, creating the
    file if it does not exist. Previously scraped data is preserved because
    the caller passes in the full, updated database dict.
    """
    with open(database_json_file, "w") as file_obj:
        file_obj.write(json.dumps(data))
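# Round-trip sketch (keys as used elsewhere in this script):
#     db = load_json()
#     db["scraped_data"] = {}
#     save_scraped_data_in_json(db)    # load_json() now returns this dict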
def existing_scraped_data_init(json_db):
    """
    Initialise the 'scraped_data' key in the loaded JSON dict
    if it does not exist yet.
    """
    scraped_data = json_db.get("scraped_data")
    if scraped_data is None:
        json_db['scraped_data'] = dict()
    return None
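# Note: this mutates the passed-in dict in place, e.g.:
#     db = {}
#     existing_scraped_data_init(db)    # db is now {'scraped_data': {}}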
def scraped_time_is():
    """
    Create a timestamp so that each scraping record stays traceable.
    """
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    return dt_string
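# Example output (day/month/year, 24-hour clock; values illustrative):
#     "25/12/2023 14:30:05"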
def process_url_request(website_url):
    """
    Request the provided URL using the requests module and construct
    soup data with BeautifulSoup for scraping. Returns None if the
    response status is not 200.
    """
    requests_data = requests.get(website_url)
    if requests_data.status_code == 200:
        soup = BeautifulSoup(requests_data.text, 'html.parser')
        return soup
    return None
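# Usage sketch ("https://example.com" is just a placeholder URL):
#     soup = process_url_request("https://example.com")
#     if soup:                          # None means the request failed
#         print(soup.title.text)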
def process_beautiful_soup_data(soup):
    return {
        # soup.title can be None on pages without a <title> tag
        'title': soup.title.text if soup.title else '',
        'all_anchor_href': [i['href'] for i in soup.find_all('a', href=True)],
        'all_anchors': [str(i) for i in soup.find_all('a')],
        'all_images_data': [str(i) for i in soup.find_all('img')],
        # filter on src=True so images without a src attribute don't raise KeyError
        'all_images_source_data': [i['src'] for i in soup.find_all('img', src=True)],
        'all_h1_data': [i.text for i in soup.find_all('h1')],
        'all_h2_data': [i.text for i in soup.find_all('h2')],
        'all_h3_data': [i.text for i in soup.find_all('h3')],
        'all_p_data': [i.text for i in soup.find_all('p')]
    }
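# The returned packet is a plain dict; shape sketch (values illustrative):
#     {'title': 'Some page title',
#      'all_anchor_href': ['https://example.com/about'],
#      'all_h1_data': ['Some heading'], ...}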
# An infinite loop keeps the menu running so the program
# does not have to be restarted for every action.
while True:
    print(""" ================ Welcome to this scraping program =============
    ==>> press 1 to check existing scraped websites
    ==>> press 2 to scrape a single website
    ==>> press 3 to exit
    """)
    try:
        choice = int(input("==>> Please enter your choice: "))
    except ValueError:
        print("Please enter a number between 1 and 3.")
        continue
    # Load json function called for fetching/creating data from the json file.
    local_json_db = load_json()
    existing_scraped_data_init(local_json_db)
    if choice == 1:
        # BeautifulTable presents the scraped data in a readable way.
        # You can read more about it here:
        # https://beautifultable.readthedocs.io/en/latest/index.html
        scraped_websites_table = BeautifulTable()
        scraped_websites_table.columns.header = ["Sr no.", "Alias name", "Website domain", "Title", "Scraped at", "Status"]
        scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED)
        for count, data in enumerate(local_json_db['scraped_data']):
            scraped_websites_table.rows.append([count + 1,
                                                local_json_db['scraped_data'][data]['alias'],
                                                local_json_db['scraped_data'][data]['domain'],
                                                local_json_db['scraped_data'][data]['title'],
                                                local_json_db['scraped_data'][data]['scraped_at'],
                                                local_json_db['scraped_data'][data]['status']])
        if not local_json_db['scraped_data']:
            print('===> No existing data found !!!')
        print(scraped_websites_table)
    elif choice == 2:
        print()
        url_for_scrap = input("===> Please enter the url you want to scrape: ")
        soup = process_url_request(url_for_scrap)
        if soup:
            scraped_data_packet = process_beautiful_soup_data(soup)
            print()
            print(' =====> Data scraped successfully !!!')
            key_for_storing_data = input("Enter an alias name for saving the scraped data: ")
            scraped_data_packet['url'] = url_for_scrap
            scraped_data_packet['name'] = key_for_storing_data
            scraped_data_packet['scraped_at'] = scraped_time_is()
            if key_for_storing_data in local_json_db['scraped_data']:
                key_for_storing_data = key_for_storing_data + str(scraped_time_is())
                print("Provided key already exists, so data is stored as: {}".format(key_for_storing_data))
            scraped_data_packet['alias'] = key_for_storing_data
            scraped_data_packet['status'] = True
            scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc
            local_json_db['scraped_data'][key_for_storing_data] = scraped_data_packet
            print(
                'scraped data is:', local_json_db['scraped_data'][key_for_storing_data]
            )
            save_scraped_data_in_json(local_json_db)
            # reload the saved data
            local_json_db = load_json()
            print(' =====> Data saved successfully !!!')
            print()
        else:
            print('===> Could not reach the provided url !!!')
    elif choice == 3:
        print('Thank you for using !!!')
        break
    else:
        print("Please enter a valid choice.")