-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
339 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,339 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"provenance": [] | ||
}, | ||
"kernelspec": { | ||
"name": "python3", | ||
"display_name": "Python 3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# created on April 14, 2021\n", | ||
"# modified on Jan 2, 2022\n", | ||
"# modified on April 2o, 2023\n", | ||
"# @author: Bo Zhao\n", | ||
"# @email: zhaobo@uw.edu\n", | ||
"# @website: https://hgis.uw.edu\n", | ||
"# @organization: Department of Geography, University of Washington, Seattle\n", | ||
"# @description: A demo of collecting data from YouTube." | ||
], | ||
"metadata": { | ||
"id": "bVJiWG0ZQ35Z" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "1VMonPRy4f6J" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"%%shell\n", | ||
"# Ubuntu no longer distributes chromium-browser outside of snap\n", | ||
"#\n", | ||
"# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap\n", | ||
"\n", | ||
"# Add debian buster\n", | ||
"cat > /etc/apt/sources.list.d/debian.list <<'EOF'\n", | ||
"deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main\n", | ||
"deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main\n", | ||
"deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main\n", | ||
"EOF\n", | ||
"\n", | ||
"# Add keys\n", | ||
"apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517\n", | ||
"apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138\n", | ||
"apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A\n", | ||
"\n", | ||
"apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg\n", | ||
"apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg\n", | ||
"apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg\n", | ||
"\n", | ||
"# Prefer debian repo for chromium* packages only\n", | ||
"# Note the double-blank lines between entries\n", | ||
"cat > /etc/apt/preferences.d/chromium.pref << 'EOF'\n", | ||
"Package: *\n", | ||
"Pin: release a=eoan\n", | ||
"Pin-Priority: 500\n", | ||
"\n", | ||
"\n", | ||
"Package: *\n", | ||
"Pin: origin \"deb.debian.org\"\n", | ||
"Pin-Priority: 300\n", | ||
"\n", | ||
"\n", | ||
"Package: chromium*\n", | ||
"Pin: origin \"deb.debian.org\"\n", | ||
"Pin-Priority: 700\n", | ||
"EOF\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"!apt-get update\n", | ||
"!apt-get install chromium chromium-driver\n", | ||
"!pip3 install selenium\n", | ||
"!pip install kora -q" | ||
], | ||
"metadata": { | ||
"id": "zMY83ivH4_Tc" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"from bs4 import BeautifulSoup\n", | ||
"import time, datetime\n", | ||
"import pandas as pd\n", | ||
"from selenium import webdriver\n", | ||
"from selenium.webdriver.chrome.options import Options\n", | ||
"#from kora.selenium import wd as bot\n", | ||
"\n", | ||
"options = Options()\n", | ||
"options.add_argument(\"--headless\")\n", | ||
"options.add_argument(\"--no-sandbox\")\n", | ||
"bot = webdriver.Chrome(\"chromedriver\", options=options)" | ||
], | ||
"metadata": { | ||
"id": "KOJwgbYBRed3" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# The url where the data will be collected from.\n", | ||
"url = \"https://www.youtube.com/results?search_query=standing+rock\"\n", | ||
"\n", | ||
"# Input the targeting url to the bot, and the bot will load data from the url.\n", | ||
"bot.get(url)\n", | ||
"\n", | ||
"# An array to store all the video urls. If a video has been crawled, it would not be stored to the data frame.\n", | ||
"video_urls = []\n", | ||
"# An array to store the retrieved video details.\n", | ||
"results = []\n", | ||
"\n", | ||
"\n", | ||
"# variable i indicates the number of times that scrolls down a web page. In practice, you might want to develop different\n", | ||
"# interaction approach to load and view the web pages.\n", | ||
"\n", | ||
"for i in range(5):\n", | ||
"\n", | ||
" # Create a document object model (DOM) from the raw source of the crawled web page.\n", | ||
" # Since you are processing a html page, 'html.parser' is chosen.\n", | ||
" soup = BeautifulSoup(bot.page_source, 'html.parser')\n", | ||
"\n", | ||
" # Capture all the video items using find_all or findAll method.\n", | ||
" # To view the information of the html elements you want to collect, you need to inspect the raw source using Chrome Inspector.\n", | ||
" # To test whether you find the right html elements, you can use the pycharm debugger to examine the returned data.\n", | ||
" videos = soup.find_all('ytd-video-renderer', class_=\"style-scope ytd-item-section-renderer\")[-20:] # 20 indicates only process the newly-acquired 20 entries.\n", | ||
"\n", | ||
" # iterate and process each video entry.\n", | ||
" for video in videos:\n", | ||
"\n", | ||
" # I prefer use the \"try-except\" statement to enable the program run without pausing due to unexpected errors.\n", | ||
" try:\n", | ||
" # video_url = video.find(\"a\", class_=\"yt-simple-endpoint inline-block style-scope ytd-thumbnail\").attrs[\"href\"]\n", | ||
" # user_url = video.find(\"a\", class_=\"yt-simple-endpoint style-scope yt-formatted-string\").attrs[\"href\"]\n", | ||
" # username = video.find(\"a\", class_=\"yt-simple-endpoint style-scope yt-formatted-string\").text\n", | ||
" # title = video.find(\"yt-formatted-string\", class_=\"style-scope ytd-video-renderer\").text\n", | ||
" # view_num = video.find_all(\"span\", class_=\"style-scope ytd-video-meta-block\")[0].text.replace(\" views\", \"\")\n", | ||
" # created_at = video.find_all(\"span\", class_=\"style-scope ytd-video-meta-block\")[0].text.replace(\" ago\", \"\")\n", | ||
" # shortdesc = video.find(\"yt-formatted-string\", id=\"description-text\").text\n", | ||
" # collected_at = datetime.datetime.now()\n", | ||
"\n", | ||
" video_url = video.find(\"a\", class_=\"yt-simple-endpoint style-scope ytd-video-renderer\").attrs[\"href\"]\n", | ||
" user_url = video.find(\"a\", class_=\"yt-simple-endpoint style-scope yt-formatted-string\").attrs[\"href\"]\n", | ||
" username = video.find(\"a\", class_=\"yt-simple-endpoint style-scope yt-formatted-string\").text\n", | ||
" title = video.find(\"yt-formatted-string\", class_=\"style-scope ytd-video-renderer\").text\n", | ||
" metadata_items = video.find_all(\"span\", class_=\"inline-metadata-item style-scope ytd-video-meta-block\")\n", | ||
" view_num = metadata_items[0].text.replace(\" views\", \"\")\n", | ||
" created_at = metadata_items[1].text.replace(\" ago\", \"\")\n", | ||
" shortdesc = video.find(\"yt-formatted-string\", class_=\"metadata-snippet-text style-scope ytd-video-renderer\").text\n", | ||
" collected_at = datetime.datetime.now()\n", | ||
"\n", | ||
" # create a row in the dict format.\n", | ||
" row = {'video_url': video_url,\n", | ||
" 'user_url': user_url,\n", | ||
" 'username': username,\n", | ||
" 'title': title,\n", | ||
" 'view_num': view_num,\n", | ||
" 'created_at': created_at,\n", | ||
" 'shortdesc': shortdesc,\n", | ||
" 'collected_at': collected_at}\n", | ||
"\n", | ||
" # if a video has been added, this video would not be inserted to the results array,\n", | ||
" # otherwise, this video will be inserted.\n", | ||
" if video_url in video_urls:\n", | ||
" print(\"this video has already been added.\")\n", | ||
" else:\n", | ||
" print(row)\n", | ||
" video_urls.append(video_url)\n", | ||
" results.append(row)\n", | ||
" except:\n", | ||
" pass\n", | ||
"\n", | ||
" # it is very important to enable the bot take some rest, and then resume to work.\n", | ||
" time.sleep(5)\n", | ||
"\n", | ||
" # Let the bot scrolls down to the bottom of the content element, most of the time the bot needs to scroll down to the bottom of the page.\n", | ||
" # like this statement: bot.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n", | ||
" bot.execute_script('window.scrollTo(0, document.getElementById(\"content\").scrollHeight);')\n", | ||
"\n", | ||
"# terminate the bot object.\n", | ||
"bot.close()\n", | ||
"\n", | ||
"# Store the results as a pandas dataframe\n", | ||
"df = pd.DataFrame(results)\n", | ||
"\n", | ||
"# notify the completion of the crawling in the console.\n", | ||
"print(\"the crawling task is finished.\")" | ||
], | ||
"metadata": { | ||
"id": "uj2ClnTbRGp7" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Create data on to Google Drive\n", | ||
"from google.colab import drive\n", | ||
"# Mount your Drive to the Colab VM.\n", | ||
"#drive._mount('/gdrive')\n", | ||
"drive.mount('/gdrive')\n", | ||
"\n", | ||
"# the file path where to store the output csv on google drive\n", | ||
"output_file = '/gdrive/My Drive/videos.csv'\n", | ||
"\n", | ||
"# Save the dataframe as a csv file\n", | ||
"df.to_csv(output_file, index=False)" | ||
], | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "L3I9cWYBRI8r", | ||
"outputId": "e3dc7d97-03c2-4060-c8c3-e8726c39cb39" | ||
}, | ||
"execution_count": null, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"Mounted at /gdrive\n" | ||
] | ||
} | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# download the csv to your local computer\n", | ||
"from google.colab import files\n", | ||
"files.download(output_file)\n", | ||
"print(\"the csv has been downloaded to your local computer. The program has been completed successfully.\")" | ||
], | ||
"metadata": { | ||
"id": "ymNL_9hpTJwr", | ||
"outputId": "74eae079-a644-4442-f4e9-fd590d87a823", | ||
"colab": { | ||
"base_uri": "https://localhost:8080/", | ||
"height": 35 | ||
} | ||
}, | ||
"execution_count": null, | ||
"outputs": [ | ||
{ | ||
"output_type": "display_data", | ||
"data": { | ||
"text/plain": [ | ||
"<IPython.core.display.Javascript object>" | ||
], | ||
"application/javascript": [ | ||
"\n", | ||
" async function download(id, filename, size) {\n", | ||
" if (!google.colab.kernel.accessAllowed) {\n", | ||
" return;\n", | ||
" }\n", | ||
" const div = document.createElement('div');\n", | ||
" const label = document.createElement('label');\n", | ||
" label.textContent = `Downloading \"${filename}\": `;\n", | ||
" div.appendChild(label);\n", | ||
" const progress = document.createElement('progress');\n", | ||
" progress.max = size;\n", | ||
" div.appendChild(progress);\n", | ||
" document.body.appendChild(div);\n", | ||
"\n", | ||
" const buffers = [];\n", | ||
" let downloaded = 0;\n", | ||
"\n", | ||
" const channel = await google.colab.kernel.comms.open(id);\n", | ||
" // Send a message to notify the kernel that we're ready.\n", | ||
" channel.send({})\n", | ||
"\n", | ||
" for await (const message of channel.messages) {\n", | ||
" // Send a message to notify the kernel that we're ready.\n", | ||
" channel.send({})\n", | ||
" if (message.buffers) {\n", | ||
" for (const buffer of message.buffers) {\n", | ||
" buffers.push(buffer);\n", | ||
" downloaded += buffer.byteLength;\n", | ||
" progress.value = downloaded;\n", | ||
" }\n", | ||
" }\n", | ||
" }\n", | ||
" const blob = new Blob(buffers, {type: 'application/binary'});\n", | ||
" const a = document.createElement('a');\n", | ||
" a.href = window.URL.createObjectURL(blob);\n", | ||
" a.download = filename;\n", | ||
" div.appendChild(a);\n", | ||
" a.click();\n", | ||
" div.remove();\n", | ||
" }\n", | ||
" " | ||
] | ||
}, | ||
"metadata": {} | ||
}, | ||
{ | ||
"output_type": "display_data", | ||
"data": { | ||
"text/plain": [ | ||
"<IPython.core.display.Javascript object>" | ||
], | ||
"application/javascript": [ | ||
"download(\"download_c44deb31-fb3f-4522-aaf8-3667c1850759\", \"videos.csv\", 18888)" | ||
] | ||
}, | ||
"metadata": {} | ||
}, | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"the csv has been downloaded to your local computer. The program has been completed successfully.\n" | ||
] | ||
} | ||
] | ||
} | ||
] | ||
} |