The Personal Video Database alternate forum is NOW online again.
1 Member and 4 Guests are viewing this topic.
A. You installed Python.
B. You installed selenium and requests by running: pip install selenium requests
C. You have your Chrome bin on the PATH.
D. You have the Python folder on your PATH.
E. pythonw.exe is not missing, or its containing folder is on the PATH.
pip install selenium requests
1. Uses the Chrome browser instead of Firefox.
2. Uses chromedriver.exe instead of geckodriver.
3. Starts chromedriver.exe silently.
4. Silently invokes the browser in headless mode (no pop-up browser windows).
5. Scrapes the .htm page of a given URL.
6. No path needs to be set manually inside the script - it is set relative to the path of the selenium script!
6A. You put this script into the "Scripts" folder of your PVD instance.
6B. You put the appropriate chromedriver.exe into the "Scripts" folder, too.
FileExecute('pythonw.exe', '"' + ScriptPath + 'selenium_script-Chrome_People.py" "' + URL + '" "' + ScriptPath + BASE_DOWNLOAD_FILE_NO_BOM + '"');
C:\Users\user\selenium_script-Chrome_People.py "https://www.imdb.com/name/nm0000017"
http://www.videodb.info/forum_en/index.php/topic,4367.msg22727.html#msg22727
# Define URLs and save paths.
# Maps each IMDb page URL to the local file its rendered HTML is written to.
# NOTE(review): base_url and tmp_dir are defined outside this excerpt.
URLS_AND_PATHS = {
    f"{base_url}/awards/": os.path.join(tmp_dir, "downpage-UTF8_NO_BOM-Awards.htm"),
    f"{base_url}/bio/": os.path.join(tmp_dir, "downpage-UTF8_NO_BOM-Bio.htm"),
    f"https://www.imdb.com/search/title/?explore=genres&role={base_url.split('/')[-1]}": os.path.join(tmp_dir, "downpage-UTF8_NO_BOM-Genres.htm"),
    f"{base_url}/?showAllCredits=true": os.path.join(tmp_dir, "downpage-UTF8_NO_BOM-Credit.htm"),
}


# Improved function to click all "More" buttons with scrolling
def click_all_more_buttons(driver, max_passes=100):
    """Scroll down the page and click every "More" button that is found.

    Each pass looks up the parents of all ``ipc-see-more__text`` spans,
    scrolls each one into view and clicks it, then sends PAGE_DOWN to the
    page body so that further buttons can load.

    Args:
        driver: Selenium WebDriver with the target page already loaded.
        max_passes: Upper bound on find/click/scroll passes. Without a bound
            the loop never ends when the buttons stay in the DOM after being
            clicked (or when every click fails).

    Returns:
        None.
    """
    body = driver.find_element(By.TAG_NAME, 'body')

    for _ in range(max_passes):
        try:
            # Find "More" buttons (the clickable parent of the text span)
            more_buttons = driver.find_elements(
                By.XPATH, '//span[contains(@class, "ipc-see-more__text")]/..'
            )

            # If no buttons are found, we are done
            if not more_buttons:
                logging.info("No more 'More' buttons found.")
                break

            for button in more_buttons:
                try:
                    # Scroll into view before clicking
                    driver.execute_script(
                        "arguments[0].scrollIntoView({block: 'center'});", button
                    )
                    time.sleep(0.5)  # Allow page to stabilize
                    button.click()
                    logging.info("Clicked a 'More' button.")
                    time.sleep(1)  # Allow time for new content to load
                except Exception as e:
                    # Best effort: one failing button must not stop the rest
                    logging.warning(f"Error clicking a 'More' button: {e}")
                    continue

            # Scroll the page down to load more buttons
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)  # Wait for page to load more buttons
        except Exception:
            logging.info("No additional 'More' buttons to click.")
            # Keep the real cause available instead of discarding it
            logging.debug("Stopped clicking 'More' buttons because of:", exc_info=True)
            break
    else:
        logging.warning(f"Stopped clicking 'More' buttons after {max_passes} passes.")


# Function to download a page
def download_page(imdb_url, output_path, retries=3):
    """Load ``imdb_url`` in Firefox, expand it, and save the HTML.

    A fresh driver is started for every attempt. After the page loads, the
    preferences popup is accepted if it appears, all "More" buttons are
    clicked, and ``driver.page_source`` is written to ``output_path`` as
    UTF-8. The first successful attempt ends the retry loop; failures are
    logged and retried.

    Args:
        imdb_url: URL of the page to load.
        output_path: File path the page source is written to.
        retries: Maximum number of attempts.

    Returns:
        None. Errors are logged, not raised.
    """
    # NOTE(review): gecko_path and firefox_options are defined outside this excerpt.
    for attempt in range(retries):
        # Reset per attempt so `finally` never touches an unbound name or a
        # driver left over from a previous attempt when start-up fails.
        driver = None
        try:
            # Initialize FirefoxDriver
            service = Service(gecko_path)
            driver = webdriver.Firefox(service=service, options=firefox_options)
            logging.info(f"Started FirefoxDriver for: {imdb_url}")

            driver.get(imdb_url)
            logging.info(f"Loaded URL: {imdb_url}")

            # Handle "Select Your Preferences" popup
            try:
                # Wait for the popup container first; a timeout means no popup
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'sc-kDvujY')]"))
                )
                accept_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='accept-button']"))
                )
                accept_button.click()
                logging.info("Accepted preferences popup.")
            except TimeoutException:
                logging.info("No preferences popup appeared.")

            # Click all "More" buttons on the page
            click_all_more_buttons(driver)

            # Save the HTML after clicking all "More" buttons
            html_source = driver.page_source
            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(html_source)
            logging.info(f"Saved HTML to: {output_path}")
            break
        except Exception as e:
            logging.error(f"Error in attempt {attempt + 1}: {e}")
        finally:
            # Only quit a driver that was actually started in this attempt
            if driver is not None:
                driver.quit()


# Download pages in parallel: one thread (and one browser) per URL
threads = []
for url, path in URLS_AND_PATHS.items():
    thread = threading.Thread(target=download_page, args=(url, path))
    threads.append(thread)
    thread.start()