Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add selenium automation in Automation Tools #540

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
314 changes: 314 additions & 0 deletions Automation_Tools/Selenium-Automation/IMDB_selenium_scraping.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [],
"source": [
"# all imports\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.service import Service\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.common.exceptions import TimeoutException, StaleElementReferenceException\n",
"from selenium.webdriver.support.wait import WebDriverWait\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"# Set up service and options\n",
"# Use the chromedriver binary next to the notebook when it exists; otherwise\n",
"# let Selenium locate a driver on its own, so the notebook also runs on\n",
"# machines that do not have a bundled Windows executable.\n",
"driver_path = os.path.join('chromedriver.exe')\n",
"chromdrive = Service(driver_path) if os.path.exists(driver_path) else Service()\n",
"opt = webdriver.ChromeOptions()\n",
"driver = webdriver.Chrome(service=chromdrive, options=opt)\n",
"driver.maximize_window()"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [],
"source": [
"# BASE URL for IMDB disney movies list\n",
"url = \"https://www.imdb.com/list/ls026785255/\"\n",
"driver.get(url)"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {},
"outputs": [],
"source": [
"# Scroll to bottom of page and load all elements\n",
"def load_full_page(timeout=10):\n",
"    \"\"\"Scroll to the bottom repeatedly until the page stops growing.\n",
"\n",
"    Uses the module-level ``driver``.\n",
"\n",
"    Args:\n",
"        timeout: Seconds to wait after each scroll for the document height\n",
"            to increase before the page is treated as fully loaded.\n",
"    \"\"\"\n",
"    height_script = \"return document.body.scrollHeight\"\n",
"    last_height = driver.execute_script(height_script)\n",
"\n",
"    while True:\n",
"        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
"        try:\n",
"            # Lazily loaded content makes the document taller; no growth\n",
"            # within the timeout means there is nothing left to load.\n",
"            WebDriverWait(driver, timeout).until(\n",
"                lambda d: d.execute_script(height_script) > last_height\n",
"            )\n",
"        except TimeoutException:\n",
"            break\n",
"        last_height = driver.execute_script(height_script)"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"136"
]
},
"execution_count": 152,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the whole list page, then collect the link behind every movie title\n",
"load_full_page()\n",
"title_anchors = driver.find_elements(By.XPATH, \"//a[@class='ipc-title-link-wrapper']\")\n",
"movie_links = [anchor.get_attribute('href') for anchor in title_anchors]\n",
"len(movie_links)"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [],
"source": [
"disney_movies = []"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [],
"source": [
"# To dismiss any dialog box like ratings that open up while browsing\n",
"def dismiss_dialog_if_present(driver, timeout=5):\n",
"    \"\"\"Close a pop-up dialog (e.g. the ratings prompt) if one is showing.\n",
"\n",
"    Best effort: when no close button becomes clickable within ``timeout``\n",
"    seconds, or the click itself fails, the function returns without raising.\n",
"\n",
"    Args:\n",
"        driver: The Selenium WebDriver controlling the page.\n",
"        timeout: Seconds to wait for the dialog's close button.\n",
"    \"\"\"\n",
"    # Imported locally so this cell does not depend on the import cell changing.\n",
"    from selenium.common.exceptions import WebDriverException\n",
"\n",
"    try:\n",
"        # The close icon carries two classes. By.CLASS_NAME accepts a single\n",
"        # class name only, so a compound CSS selector is needed to match it.\n",
"        close_button = WebDriverWait(driver, timeout).until(\n",
"            EC.element_to_be_clickable((By.CSS_SELECTOR, '.ipc-icon.ipc-icon--clear'))\n",
"        )\n",
"        close_button.click()  # Click to close the dialog\n",
"        print(\"Dialog box dismissed.\")\n",
"    except WebDriverException:\n",
"        # No dialog appeared (timeout) or it went away before the click.\n",
"        pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Iterating and collecting all movies in disney_movies as list of dict\n",
"for movie_link in movie_links:\n",
"    # Start every movie with empty fields so a failed lookup can never reuse\n",
"    # values scraped from the previous movie (or hit a NameError on the first).\n",
"    movie = {\n",
"        'name': None,\n",
"        'video': None,\n",
"        'cover_img': None,\n",
"        'genres': [],\n",
"        'synopsis': None,\n",
"        'story': None\n",
"    }\n",
"\n",
"    try:\n",
"        # Navigate to the movie link\n",
"        driver.get(movie_link)\n",
"        load_full_page()  # Ensure the page is fully loaded\n",
"\n",
"        # Check for the dialog box before scraping\n",
"        dismiss_dialog_if_present(driver)\n",
"\n",
"        # Scrape the data\n",
"        movie['name'] = WebDriverWait(driver, 10).until(\n",
"            EC.visibility_of_element_located((By.CLASS_NAME, 'hero__primary-text'))\n",
"        ).text\n",
"\n",
"        # Check for the dialog box again before finding the video\n",
"        dismiss_dialog_if_present(driver)\n",
"\n",
"        movie['video'] = WebDriverWait(driver, 10).until(\n",
"            EC.visibility_of_element_located((By.XPATH, '//video[@class=\"jw-video jw-reset\"]'))\n",
"        ).get_attribute('src')\n",
"\n",
"        # Check for the dialog box again before finding the cover image\n",
"        dismiss_dialog_if_present(driver)\n",
"\n",
"        movie['cover_img'] = WebDriverWait(driver, 10).until(\n",
"            EC.visibility_of_element_located((By.XPATH, '//img[@class=\"ipc-image\"]'))\n",
"        ).get_attribute('src')\n",
"\n",
"        # Check for the dialog box again before finding genres\n",
"        dismiss_dialog_if_present(driver)\n",
"\n",
"        movie['genres'] = [\n",
"            {'name': genre.text, 'url': genre.get_attribute('href')}\n",
"            for genre in driver.find_elements(By.XPATH, '//a[@class=\"ipc-chip ipc-chip--on-baseAlt\"]')\n",
"        ]\n",
"\n",
"        # Check for the dialog box again before finding the synopsis\n",
"        dismiss_dialog_if_present(driver)\n",
"\n",
"        movie['synopsis'] = WebDriverWait(driver, 10).until(\n",
"            EC.visibility_of_element_located((By.XPATH, '//span[@class=\"sc-55855a9b-0 dAbouZ\"]'))\n",
"        ).text\n",
"\n",
"        # Check for the dialog box again before finding the story\n",
"        dismiss_dialog_if_present(driver)\n",
"\n",
"        movie['story'] = WebDriverWait(driver, 10).until(\n",
"            EC.visibility_of_element_located((By.XPATH, '//div[@class=\"ipc-html-content-inner-div\"]'))\n",
"        ).text\n",
"\n",
"    except Exception as e:\n",
"        # Fields scraped before the failure are kept; the rest stay empty.\n",
"        print(f\"An unexpected error occurred: {e}\")\n",
"\n",
"    disney_movies.append(movie)"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'name': 'Snow White and the Seven Dwarfs',\n",
" 'video': 'https://imdb-video.media-imdb.com/vi854264089/1434659607842-pgv4ql-1616198319041.mp4?Expires=1728849484&Signature=JYIRqJM2dF36IE4JZR27UArXkQ0Rw1G3DVmnvozvlcLHXK-U8qB~BTAKaTKxtLQD1Wf7atV99JVmGGDCBMtvZOrTICGAJyNJq8slxsXPftqKGRX094RqYSHiUjzNAHbZwfFrbxoRtTJbNp7PgqJL4dTC-CoLespVrgziZueahAPJSSm3CF0I978pcvtjgJ3WcN-5enUqBjoxA~-0Apnj7Df91Kj7rqHLGQU~ij4oBE9TrOc8RF70ZiUsxxxeZspGG3yCiUKl~RA0S3tHX1lYusygnBMbRDKWr6MYAuU4mL8cA9SCGk4DmM2ex3fesPA9MYcodMeBY1ndCb5iJFGAPw__&Key-Pair-Id=APKAIFLZBVQZ24NQH3KA',\n",
" 'cover_img': 'https://m.media-amazon.com/images/M/MV5BMjAyNDM2MDA2NF5BMl5BanBnXkFtZTcwOTMxMDAxNA@@._V1_QL75_UX190_CR0,0,190,281_.jpg',\n",
" 'genres': [{'name': 'Computer Animation',\n",
" 'url': 'https://www.imdb.com/interest/in0000028/?ref_=tt_ov_in_1'},\n",
" {'name': 'Jukebox Musical',\n",
" 'url': 'https://www.imdb.com/interest/in0000132/?ref_=tt_ov_in_2'},\n",
" {'name': 'Adventure',\n",
" 'url': 'https://www.imdb.com/interest/in0000012/?ref_=tt_ov_in_3'},\n",
" {'name': 'Animation',\n",
" 'url': 'https://www.imdb.com/interest/in0000026/?ref_=tt_ov_in_4'},\n",
" {'name': 'Comedy',\n",
" 'url': 'https://www.imdb.com/interest/in0000034/?ref_=tt_ov_in_5'},\n",
" {'name': 'Family',\n",
" 'url': 'https://www.imdb.com/interest/in0000093/?ref_=tt_ov_in_6'},\n",
" {'name': 'Fantasy',\n",
" 'url': 'https://www.imdb.com/interest/in0000098/?ref_=tt_ov_in_7'},\n",
" {'name': 'Musical',\n",
" 'url': 'https://www.imdb.com/interest/in0000133/?ref_=tt_ov_in_8'},\n",
" {'name': 'Romance',\n",
" 'url': 'https://www.imdb.com/interest/in0000152/?ref_=tt_ov_in_9'}],\n",
" 'synopsis': 'Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.',\n",
" 'story': 'At a recording session, Lucille La Verne, the voice of the Wicked Queen, was told by Walt Disney\\'s animators that they needed an older, raspier version of the Queen\\'s voice for the Old Witch. La Verne stepped out of the recording booth, returned a few minutes later, and gave a perfect \"Old Hag\\'s voice\" that stunned the animators. When asked how she did it, she replied, \"Oh, I just took my teeth out.\"'},\n",
" {'name': 'Snow White and the Seven Dwarfs',\n",
" 'video': 'https://imdb-video.media-imdb.com/vi854264089/1434659607842-pgv4ql-1616198319041.mp4?Expires=1728849484&Signature=JYIRqJM2dF36IE4JZR27UArXkQ0Rw1G3DVmnvozvlcLHXK-U8qB~BTAKaTKxtLQD1Wf7atV99JVmGGDCBMtvZOrTICGAJyNJq8slxsXPftqKGRX094RqYSHiUjzNAHbZwfFrbxoRtTJbNp7PgqJL4dTC-CoLespVrgziZueahAPJSSm3CF0I978pcvtjgJ3WcN-5enUqBjoxA~-0Apnj7Df91Kj7rqHLGQU~ij4oBE9TrOc8RF70ZiUsxxxeZspGG3yCiUKl~RA0S3tHX1lYusygnBMbRDKWr6MYAuU4mL8cA9SCGk4DmM2ex3fesPA9MYcodMeBY1ndCb5iJFGAPw__&Key-Pair-Id=APKAIFLZBVQZ24NQH3KA',\n",
" 'cover_img': 'https://m.media-amazon.com/images/M/MV5BMjAyNDM2MDA2NF5BMl5BanBnXkFtZTcwOTMxMDAxNA@@._V1_QL75_UX190_CR0,0,190,281_.jpg',\n",
" 'genres': [{'name': 'Computer Animation',\n",
" 'url': 'https://www.imdb.com/interest/in0000028/?ref_=tt_ov_in_1'},\n",
" {'name': 'Jukebox Musical',\n",
" 'url': 'https://www.imdb.com/interest/in0000132/?ref_=tt_ov_in_2'},\n",
" {'name': 'Adventure',\n",
" 'url': 'https://www.imdb.com/interest/in0000012/?ref_=tt_ov_in_3'},\n",
" {'name': 'Animation',\n",
" 'url': 'https://www.imdb.com/interest/in0000026/?ref_=tt_ov_in_4'},\n",
" {'name': 'Comedy',\n",
" 'url': 'https://www.imdb.com/interest/in0000034/?ref_=tt_ov_in_5'},\n",
" {'name': 'Family',\n",
" 'url': 'https://www.imdb.com/interest/in0000093/?ref_=tt_ov_in_6'},\n",
" {'name': 'Fantasy',\n",
" 'url': 'https://www.imdb.com/interest/in0000098/?ref_=tt_ov_in_7'},\n",
" {'name': 'Musical',\n",
" 'url': 'https://www.imdb.com/interest/in0000133/?ref_=tt_ov_in_8'},\n",
" {'name': 'Romance',\n",
" 'url': 'https://www.imdb.com/interest/in0000152/?ref_=tt_ov_in_9'}],\n",
" 'synopsis': 'Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.',\n",
" 'story': 'At a recording session, Lucille La Verne, the voice of the Wicked Queen, was told by Walt Disney\\'s animators that they needed an older, raspier version of the Queen\\'s voice for the Old Witch. La Verne stepped out of the recording booth, returned a few minutes later, and gave a perfect \"Old Hag\\'s voice\" that stunned the animators. When asked how she did it, she replied, \"Oh, I just took my teeth out.\"'},\n",
" {'name': 'Pinocchio',\n",
" 'video': 'https://imdb-video.media-imdb.com/vi854264089/1434659607842-pgv4ql-1616198319041.mp4?Expires=1728849484&Signature=JYIRqJM2dF36IE4JZR27UArXkQ0Rw1G3DVmnvozvlcLHXK-U8qB~BTAKaTKxtLQD1Wf7atV99JVmGGDCBMtvZOrTICGAJyNJq8slxsXPftqKGRX094RqYSHiUjzNAHbZwfFrbxoRtTJbNp7PgqJL4dTC-CoLespVrgziZueahAPJSSm3CF0I978pcvtjgJ3WcN-5enUqBjoxA~-0Apnj7Df91Kj7rqHLGQU~ij4oBE9TrOc8RF70ZiUsxxxeZspGG3yCiUKl~RA0S3tHX1lYusygnBMbRDKWr6MYAuU4mL8cA9SCGk4DmM2ex3fesPA9MYcodMeBY1ndCb5iJFGAPw__&Key-Pair-Id=APKAIFLZBVQZ24NQH3KA',\n",
" 'cover_img': 'https://m.media-amazon.com/images/M/MV5BMjAyNDM2MDA2NF5BMl5BanBnXkFtZTcwOTMxMDAxNA@@._V1_QL75_UX190_CR0,0,190,281_.jpg',\n",
" 'genres': [{'name': 'Computer Animation',\n",
" 'url': 'https://www.imdb.com/interest/in0000028/?ref_=tt_ov_in_1'},\n",
" {'name': 'Jukebox Musical',\n",
" 'url': 'https://www.imdb.com/interest/in0000132/?ref_=tt_ov_in_2'},\n",
" {'name': 'Adventure',\n",
" 'url': 'https://www.imdb.com/interest/in0000012/?ref_=tt_ov_in_3'},\n",
" {'name': 'Animation',\n",
" 'url': 'https://www.imdb.com/interest/in0000026/?ref_=tt_ov_in_4'},\n",
" {'name': 'Comedy',\n",
" 'url': 'https://www.imdb.com/interest/in0000034/?ref_=tt_ov_in_5'},\n",
" {'name': 'Family',\n",
" 'url': 'https://www.imdb.com/interest/in0000093/?ref_=tt_ov_in_6'},\n",
" {'name': 'Fantasy',\n",
" 'url': 'https://www.imdb.com/interest/in0000098/?ref_=tt_ov_in_7'},\n",
" {'name': 'Musical',\n",
" 'url': 'https://www.imdb.com/interest/in0000133/?ref_=tt_ov_in_8'},\n",
" {'name': 'Romance',\n",
" 'url': 'https://www.imdb.com/interest/in0000152/?ref_=tt_ov_in_9'}],\n",
" 'synopsis': 'Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.',\n",
" 'story': 'At a recording session, Lucille La Verne, the voice of the Wicked Queen, was told by Walt Disney\\'s animators that they needed an older, raspier version of the Queen\\'s voice for the Old Witch. La Verne stepped out of the recording booth, returned a few minutes later, and gave a perfect \"Old Hag\\'s voice\" that stunned the animators. When asked how she did it, she replied, \"Oh, I just took my teeth out.\"'},\n",
" {'name': 'Fantasia',\n",
" 'video': 'https://imdb-video.media-imdb.com/vi854264089/1434659607842-pgv4ql-1616198319041.mp4?Expires=1728849484&Signature=JYIRqJM2dF36IE4JZR27UArXkQ0Rw1G3DVmnvozvlcLHXK-U8qB~BTAKaTKxtLQD1Wf7atV99JVmGGDCBMtvZOrTICGAJyNJq8slxsXPftqKGRX094RqYSHiUjzNAHbZwfFrbxoRtTJbNp7PgqJL4dTC-CoLespVrgziZueahAPJSSm3CF0I978pcvtjgJ3WcN-5enUqBjoxA~-0Apnj7Df91Kj7rqHLGQU~ij4oBE9TrOc8RF70ZiUsxxxeZspGG3yCiUKl~RA0S3tHX1lYusygnBMbRDKWr6MYAuU4mL8cA9SCGk4DmM2ex3fesPA9MYcodMeBY1ndCb5iJFGAPw__&Key-Pair-Id=APKAIFLZBVQZ24NQH3KA',\n",
" 'cover_img': 'https://m.media-amazon.com/images/M/MV5BMjAyNDM2MDA2NF5BMl5BanBnXkFtZTcwOTMxMDAxNA@@._V1_QL75_UX190_CR0,0,190,281_.jpg',\n",
" 'genres': [{'name': 'Computer Animation',\n",
" 'url': 'https://www.imdb.com/interest/in0000028/?ref_=tt_ov_in_1'},\n",
" {'name': 'Jukebox Musical',\n",
" 'url': 'https://www.imdb.com/interest/in0000132/?ref_=tt_ov_in_2'},\n",
" {'name': 'Adventure',\n",
" 'url': 'https://www.imdb.com/interest/in0000012/?ref_=tt_ov_in_3'},\n",
" {'name': 'Animation',\n",
" 'url': 'https://www.imdb.com/interest/in0000026/?ref_=tt_ov_in_4'},\n",
" {'name': 'Comedy',\n",
" 'url': 'https://www.imdb.com/interest/in0000034/?ref_=tt_ov_in_5'},\n",
" {'name': 'Family',\n",
" 'url': 'https://www.imdb.com/interest/in0000093/?ref_=tt_ov_in_6'},\n",
" {'name': 'Fantasy',\n",
" 'url': 'https://www.imdb.com/interest/in0000098/?ref_=tt_ov_in_7'},\n",
" {'name': 'Musical',\n",
" 'url': 'https://www.imdb.com/interest/in0000133/?ref_=tt_ov_in_8'},\n",
" {'name': 'Romance',\n",
" 'url': 'https://www.imdb.com/interest/in0000152/?ref_=tt_ov_in_9'}],\n",
" 'synopsis': 'Exiled into the dangerous forest by her wicked stepmother, a princess is rescued by seven dwarf miners who make her part of their household.',\n",
" 'story': 'At a recording session, Lucille La Verne, the voice of the Wicked Queen, was told by Walt Disney\\'s animators that they needed an older, raspier version of the Queen\\'s voice for the Old Witch. La Verne stepped out of the recording booth, returned a few minutes later, and gave a perfect \"Old Hag\\'s voice\" that stunned the animators. When asked how she did it, she replied, \"Oh, I just took my teeth out.\"'}]"
]
},
"execution_count": 161,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"disney_movies"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
67 changes: 67 additions & 0 deletions Automation_Tools/Selenium-Automation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
## Automation Tool for Web Scraping using Selenium

### 🎯 Goal

We will build a list of Disney movies from IMDB, scraped using Selenium. [IMDB](https://www.imdb.com/list/ls026785255/) is one of many modern websites that load their content dynamically with JavaScript, which makes them hard to scrape with BeautifulSoup, but Selenium has us covered.

---
### 🧾 Description

*Selenium* is a powerful Python library for automating web browsers.
You can install it using
`pip install selenium`
or if you are using anaconda
`conda install -c conda-forge selenium`.

Then you need to download a web-driver, in chrome it's ChromeDriver, in firefox it's geckodriver.
Here you will get the links to drivers https://pypi.org/project/selenium/ .

In my case I am using chromedriver (remember that the version of your ChromeDriver must match the version of Chrome you are using),
but one can use any web browser of their choice, such as Firefox, Safari, or Edge.

As for chromedriver, you can go to `https://chromedriver.chromium.org/downloads`
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @nidhi2026 ,
Specifying this link is enough, you dont have to attach the .exe file into the folder. Please remove it and will review it then.
Thanks!



---
### 🧮 Features Implemented

Scrapes the following for each Disney movie:
- Movie Title
- Trailer Link / Video
- Cover Image
- List of all Genres of Movie
- Synopsis - Short Description
- StoryLine of Movie

Extras
- Added loader to scroll to the bottom of the page to load all contents (that load through JS)
- Added dialog box dismiss logic to close any unexpectedly opened dialog (for ratings etc.) that interferes with element interaction

---
### 📚 Libraries Needed

- **Selenium**: To drive the browser and interact with the elements of the web page

---
### 📊 Example Output:

```structure
[
{
'name': NAME OF DISNEY MOVIE,
'video': URL OF TRAILER,
'cover_img': COVER IMAGE,
'genres': [
{
'name': NAME OF GENRE,
'url': IMDB LINK OF GENRE
},
],
'synopsis': SHORT SUMMARY,
'story': STORYLINE
}
]
...,...,...
```

**Nidhi**
[![GitHub](https://img.shields.io/badge/github-%2312100E.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/nidhi2026) | [![LinkedIn](https://img.shields.io/badge/linkedin-%230077B5.svg?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/nidhi-845150271/)