# Import required modules
import json
import requests
from datetime import datetime
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from beautifultable import BeautifulTable


def load_json(database_json_file="scraped_data.json"):
    """
    Load JSON data from scraped_data.json if the file exists;
    otherwise return an empty dict.
    """
    try:
        with open(database_json_file, "r") as read_it:
            all_data_base = json.loads(read_it.read())
        return all_data_base
    except (FileNotFoundError, json.JSONDecodeError):
        return dict()


def save_scraped_data_in_json(data, database_json_file="scraped_data.json"):
    """
    Save the scraped data in JSON format. Creates scraped_data.json
    if it does not exist; previously scraped data stays viewable
    because the whole database dict is written back each time.
    """
    with open(database_json_file, "w") as file_obj:
        file_obj.write(json.dumps(data))


def existing_scraped_data_init(json_db):
    """
    Initialise the 'scraped_data' key if the JSON file already holds
    data; otherwise create an empty dict for it.
    """
    scraped_data = json_db.get("scraped_data")
    if scraped_data is None:
        json_db['scraped_data'] = dict()
    return None


def scraped_time_is():
    """
    Create a timestamp so that each scraped record stays trackable.
    """
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    return dt_string


def process_url_request(website_url):
    """
    Fetch the provided URL with the requests module and construct a
    BeautifulSoup object from the response for scraping.
    """
    request_data = requests.get(website_url)
    if request_data.status_code == 200:
        soup = BeautifulSoup(request_data.text, 'html.parser')
        return soup
    return None


def process_beautiful_soup_data(soup):
    # Collect the commonly useful pieces of a page into a single dict.
    # href/src lookups are restricted to tags that actually carry the
    # attribute, so missing attributes cannot raise a KeyError.
    return {
        'title': soup.find('title').text,
        'all_anchor_href': [i['href'] for i in soup.find_all('a', href=True)],
        'all_anchors': [str(i) for i in soup.find_all('a')],
        'all_images_data': [str(i) for i in soup.find_all('img')],
        'all_images_source_data': [i['src'] for i in soup.find_all('img', src=True)],
        'all_h1_data': [i.text for i in soup.find_all('h1')],
        'all_h2_data': [i.text for i in soup.find_all('h2')],
        'all_h3_data': [i.text for i in soup.find_all('h3')],
        'all_p_data': [i.text for i in soup.find_all('p')]
    }
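# A minimal standalone sketch of how the two helpers above compose,
# assuming network access; the URL here is illustrative, not part of
# the menu-driven program below:
#
#     soup = process_url_request("https://example.com")
#     if soup is not None:
#         packet = process_beautiful_soup_data(soup)
#         print(packet['title'], len(packet['all_anchor_href']))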
# An infinite loop keeps the menu running, so the script does not have
# to be restarted for every action.
while True:
    print("""
        ================ Welcome to this scraping program =============
        ==>> press 1 to list the already scraped websites
        ==>> press 2 to scrape a single website
        ==>> press 3 to exit
    """)
    try:
        choice = int(input("==>> Please enter your choice: "))
    except ValueError:
        print("enter a valid choice")
        continue

    # Fetch existing data from the JSON file, or create it.
    local_json_db = load_json()
    existing_scraped_data_init(local_json_db)

    if choice == 1:
        # BeautifulTable presents the scraped data in a readable grid.
        # You can read more about it here:
        # https://beautifultable.readthedocs.io/en/latest/index.html
        scraped_websites_table = BeautifulTable()
        scraped_websites_table.columns.header = ["Sr no.", "Alias name",
                                                 "Website domain", "Title",
                                                 "Scraped at", "Status"]
        scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED)
        local_json_db = load_json()
        for count, data in enumerate(local_json_db['scraped_data']):
            scraped_websites_table.rows.append([
                count + 1,
                local_json_db['scraped_data'][data]['alias'],
                local_json_db['scraped_data'][data]['domain'],
                local_json_db['scraped_data'][data]['title'],
                local_json_db['scraped_data'][data]['scraped_at'],
                local_json_db['scraped_data'][data]['status']
            ])
        if not local_json_db['scraped_data']:
            print('===> No existing data found !!!')
        print(scraped_websites_table)

    elif choice == 2:
        print()
        url_for_scrap = input("===> Please enter the URL you want to scrape: ")
        is_accessible = process_url_request(url_for_scrap)
        if is_accessible:
            scraped_data_packet = process_beautiful_soup_data(is_accessible)
            print()
            print(' =====> Data scraped successfully !!!')
            key_for_storing_data = input("enter an alias name for saving the scraped data: ")
            scraped_data_packet['url'] = url_for_scrap
            scraped_data_packet['name'] = key_for_storing_data
            scraped_data_packet['scraped_at'] = scraped_time_is()
            if key_for_storing_data in local_json_db['scraped_data']:
                # Append a timestamp so an existing alias is never overwritten.
                key_for_storing_data = key_for_storing_data + scraped_time_is()
                print("The provided key already exists, so the data is stored as: {}".format(key_for_storing_data))
            scraped_data_packet['alias'] = key_for_storing_data
            scraped_data_packet['status'] = True
            scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc
            local_json_db['scraped_data'][key_for_storing_data] = scraped_data_packet
            print(
                'scraped data is:',
                local_json_db['scraped_data'][key_for_storing_data]
            )
            save_scraped_data_in_json(local_json_db)
            # Reload so the in-memory copy matches what is on disk.
            local_json_db = load_json()
            print(' =====> Data saved successfully !!!')
            print()
        else:
            print('===> The provided URL could not be reached !!!')

    elif choice == 3:
        print('Thank you for using !!!')
        break

    else:
        print("enter a valid choice")
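# For reference, the JSON written to scraped_data.json has the shape
# sketched below; the alias, URL, and field values are illustrative
# assumptions, not real output:
#
# {
#     "scraped_data": {
#         "my_alias": {
#             "title": "Example Domain",
#             "all_anchor_href": [...],
#             "all_anchors": [...],
#             "all_images_data": [...],
#             "all_images_source_data": [...],
#             "all_h1_data": [...],
#             "all_h2_data": [...],
#             "all_p_data": [...],
#             "url": "https://example.com",
#             "name": "my_alias",
#             "scraped_at": "01/01/2024 12:00:00",
#             "alias": "my_alias",
#             "status": true,
#             "domain": "example.com"
#         }
#     }
# }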