"""Scrape the BNS chapter pages on devgan.in into JSON and CSV datasets.

Each chapter page is opened in Chrome via Selenium; every section heading
(``h2.subClose``) is paired with its text block (``div.sectxt``) and stored
as one row with the keys ``chapter``, ``section_title`` and
``section_content``.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json
import pandas as pd

# Base URL for chapters; the zero-padded chapter number and ".php" are appended.
base_url = "https://devgan.in/bns/chapter_"

# Number of chapters
num_chapters = 20

# Output locations for the scraped dataset.
output_file_json = "bns_dataset.json"
output_file_csv = "bns_dataset.csv"

# Fixed wait after navigation so the page can finish loading.
PAGE_LOAD_WAIT_SECONDS = 3
# Pause between chapters to avoid overwhelming the server.
CHAPTER_PAUSE_SECONDS = 2
# Column order of the dataset; also guarantees a CSV header when no rows exist.
DATASET_COLUMNS = ["chapter", "section_title", "section_content"]


def _scrape_chapter(driver, chapter_url):
    """Return the section rows scraped from one chapter page.

    Any failure that makes the chapter unusable (page does not load, no
    ``h1`` title, header/content count mismatch) is reported on stdout and
    yields an empty list, so the caller can simply move on to the next
    chapter. A failure inside a single section only skips that section.
    """
    try:
        driver.get(chapter_url)
    except Exception as e:
        # Navigation errors used to propagate and kill the whole run.
        print(f"Error loading {chapter_url}: {e}")
        return []
    time.sleep(PAGE_LOAD_WAIT_SECONDS)

    try:
        chapter_name = driver.find_element(By.TAG_NAME, "h1").text.strip()
        print(f"Chapter: {chapter_name}")
    except Exception as e:
        print(f"Error fetching chapter name: {e}")
        return []

    try:
        section_headers = driver.find_elements(By.CSS_SELECTOR, "h2.subClose")
        section_contents = driver.find_elements(By.CSS_SELECTOR, "div.sectxt")
    except Exception as e:
        print(f"Error finding sections: {e}")
        return []

    # Headers and contents are paired positionally, so unequal counts would
    # silently misalign titles and text; skip the chapter instead.
    if len(section_headers) != len(section_contents):
        print(
            f"Mismatch in sections and content: {len(section_headers)} headers, "
            f"{len(section_contents)} contents."
        )
        return []

    rows = []
    for header, content in zip(section_headers, section_contents):
        try:
            section_title = header.text.strip()

            # Force collapsed sections to render before reading their text
            # (presumably WebElement.text is empty for hidden elements).
            if content.value_of_css_property("display") == "none":
                driver.execute_script(
                    "arguments[0].style.display = 'block';", content
                )

            rows.append({
                "chapter": chapter_name,
                "section_title": section_title,
                "section_content": content.text.strip(),
            })
            print(f"Processed section: {section_title}")
        except Exception as e:
            print(f"Error processing section: {e}")
    return rows


def _scrape_all_chapters(driver):
    """Scrape chapters 1..num_chapters and return all section rows."""
    dataset = []
    for chapter_num in range(1, num_chapters + 1):
        # Pause before every chapter except the first, so the delay is
        # honoured even when the previous chapter was skipped on an error.
        if chapter_num > 1:
            time.sleep(CHAPTER_PAUSE_SECONDS)

        chapter_url = f"{base_url}{chapter_num:02d}.php"
        print(f"Scraping: {chapter_url}")
        dataset.extend(_scrape_chapter(driver, chapter_url))
    return dataset


def _save_dataset(dataset):
    """Write the rows to the JSON and CSV output files."""
    with open(output_file_json, "w", encoding="utf-8") as json_file:
        json.dump(dataset, json_file, ensure_ascii=False, indent=4)
    print(f"Dataset saved to {output_file_json}")

    # Explicit columns keep the header row even if nothing was scraped.
    df = pd.DataFrame(dataset, columns=DATASET_COLUMNS)
    df.to_csv(output_file_csv, index=False)
    print(f"Dataset saved to {output_file_csv}")


def main():
    """Run the scrape and persist the results."""
    driver = webdriver.Chrome()
    try:
        dataset = _scrape_all_chapters(driver)
    finally:
        # Always close the browser, even if scraping is interrupted.
        driver.quit()
    _save_dataset(dataset)


if __name__ == "__main__":
    main()