Commit 7e303df by jakobzhao, Jan 6, 2024 (parent 9c66a7f)
Created using Colaboratory

Showing 1 changed file with 339 additions and 0 deletions: labs/lab02/youtube.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"# created on April 14, 2021\n",
"# modified on Jan 2, 2022\n",
"# modified on April 2o, 2023\n",
"# @author: Bo Zhao\n",
"# @email: zhaobo@uw.edu\n",
"# @website: https://hgis.uw.edu\n",
"# @organization: Department of Geography, University of Washington, Seattle\n",
"# @description: A demo of collecting data from YouTube."
],
"metadata": {
"id": "bVJiWG0ZQ35Z"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1VMonPRy4f6J"
},
"outputs": [],
"source": [
"%%shell\n",
"# Ubuntu no longer distributes chromium-browser outside of snap\n",
"#\n",
"# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap\n",
"\n",
"# Add debian buster\n",
"cat > /etc/apt/sources.list.d/debian.list <<'EOF'\n",
"deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main\n",
"deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main\n",
"deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main\n",
"EOF\n",
"\n",
"# Add keys\n",
"apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517\n",
"apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138\n",
"apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A\n",
"\n",
"apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg\n",
"apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg\n",
"apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg\n",
"\n",
"# Prefer debian repo for chromium* packages only\n",
"# Note the double-blank lines between entries\n",
"cat > /etc/apt/preferences.d/chromium.pref << 'EOF'\n",
"Package: *\n",
"Pin: release a=eoan\n",
"Pin-Priority: 500\n",
"\n",
"\n",
"Package: *\n",
"Pin: origin \"deb.debian.org\"\n",
"Pin-Priority: 300\n",
"\n",
"\n",
"Package: chromium*\n",
"Pin: origin \"deb.debian.org\"\n",
"Pin-Priority: 700\n",
"EOF\n"
]
},
{
"cell_type": "code",
"source": [
"!apt-get update\n",
"!apt-get install chromium chromium-driver\n",
"!pip3 install selenium\n",
"!pip install kora -q"
],
"metadata": {
"id": "zMY83ivH4_Tc"
},
"execution_count": null,
"outputs": []
},
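{
"cell_type": "code",
"source": [
"# Optional sanity checks: apt-cache policy shows which repository the pin\n",
"# prefers for chromium (the deb.debian.org entry should carry priority 700),\n",
"# and the --version flags confirm the browser and driver are installed.\n",
"!apt-cache policy chromium\n",
"!chromium --version\n",
"!chromedriver --version"
],
"metadata": {},
"execution_count": null,
"outputs": []
},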
{
"cell_type": "code",
"source": [
"from bs4 import BeautifulSoup\n",
"import time, datetime\n",
"import pandas as pd\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"#from kora.selenium import wd as bot\n",
"\n",
"options = Options()\n",
"options.add_argument(\"--headless\")\n",
"options.add_argument(\"--no-sandbox\")\n",
"bot = webdriver.Chrome(\"chromedriver\", options=options)"
],
"metadata": {
"id": "KOJwgbYBRed3"
},
"execution_count": null,
"outputs": []
},
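{
"cell_type": "code",
"source": [
"# Optional smoke test: load a simple page and print its title to confirm the\n",
"# headless browser session works before starting the actual crawl.\n",
"bot.get(\"https://www.youtube.com\")\n",
"print(bot.title)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},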
{
"cell_type": "code",
"source": [
"# The url where the data will be collected from.\n",
"url = \"https://www.youtube.com/results?search_query=standing+rock\"\n",
"\n",
"# Input the targeting url to the bot, and the bot will load data from the url.\n",
"bot.get(url)\n",
"\n",
"# An array to store all the video urls. If a video has been crawled, it would not be stored to the data frame.\n",
"video_urls = []\n",
"# An array to store the retrieved video details.\n",
"results = []\n",
"\n",
"\n",
"# variable i indicates the number of times that scrolls down a web page. In practice, you might want to develop different\n",
"# interaction approach to load and view the web pages.\n",
"\n",
"for i in range(5):\n",
"\n",
" # Create a document object model (DOM) from the raw source of the crawled web page.\n",
" # Since you are processing a html page, 'html.parser' is chosen.\n",
" soup = BeautifulSoup(bot.page_source, 'html.parser')\n",
"\n",
" # Capture all the video items using find_all or findAll method.\n",
" # To view the information of the html elements you want to collect, you need to inspect the raw source using Chrome Inspector.\n",
" # To test whether you find the right html elements, you can use the pycharm debugger to examine the returned data.\n",
" videos = soup.find_all('ytd-video-renderer', class_=\"style-scope ytd-item-section-renderer\")[-20:] # 20 indicates only process the newly-acquired 20 entries.\n",
"\n",
" # iterate and process each video entry.\n",
" for video in videos:\n",
"\n",
" # I prefer use the \"try-except\" statement to enable the program run without pausing due to unexpected errors.\n",
" try:\n",
" # video_url = video.find(\"a\", class_=\"yt-simple-endpoint inline-block style-scope ytd-thumbnail\").attrs[\"href\"]\n",
" # user_url = video.find(\"a\", class_=\"yt-simple-endpoint style-scope yt-formatted-string\").attrs[\"href\"]\n",
" # username = video.find(\"a\", class_=\"yt-simple-endpoint style-scope yt-formatted-string\").text\n",
" # title = video.find(\"yt-formatted-string\", class_=\"style-scope ytd-video-renderer\").text\n",
" # view_num = video.find_all(\"span\", class_=\"style-scope ytd-video-meta-block\")[0].text.replace(\" views\", \"\")\n",
" # created_at = video.find_all(\"span\", class_=\"style-scope ytd-video-meta-block\")[0].text.replace(\" ago\", \"\")\n",
" # shortdesc = video.find(\"yt-formatted-string\", id=\"description-text\").text\n",
" # collected_at = datetime.datetime.now()\n",
"\n",
" video_url = video.find(\"a\", class_=\"yt-simple-endpoint style-scope ytd-video-renderer\").attrs[\"href\"]\n",
" user_url = video.find(\"a\", class_=\"yt-simple-endpoint style-scope yt-formatted-string\").attrs[\"href\"]\n",
" username = video.find(\"a\", class_=\"yt-simple-endpoint style-scope yt-formatted-string\").text\n",
" title = video.find(\"yt-formatted-string\", class_=\"style-scope ytd-video-renderer\").text\n",
" metadata_items = video.find_all(\"span\", class_=\"inline-metadata-item style-scope ytd-video-meta-block\")\n",
" view_num = metadata_items[0].text.replace(\" views\", \"\")\n",
" created_at = metadata_items[1].text.replace(\" ago\", \"\")\n",
" shortdesc = video.find(\"yt-formatted-string\", class_=\"metadata-snippet-text style-scope ytd-video-renderer\").text\n",
" collected_at = datetime.datetime.now()\n",
"\n",
" # create a row in the dict format.\n",
" row = {'video_url': video_url,\n",
" 'user_url': user_url,\n",
" 'username': username,\n",
" 'title': title,\n",
" 'view_num': view_num,\n",
" 'created_at': created_at,\n",
" 'shortdesc': shortdesc,\n",
" 'collected_at': collected_at}\n",
"\n",
" # if a video has been added, this video would not be inserted to the results array,\n",
" # otherwise, this video will be inserted.\n",
" if video_url in video_urls:\n",
" print(\"this video has already been added.\")\n",
" else:\n",
" print(row)\n",
" video_urls.append(video_url)\n",
" results.append(row)\n",
" except:\n",
" pass\n",
"\n",
" # it is very important to enable the bot take some rest, and then resume to work.\n",
" time.sleep(5)\n",
"\n",
" # Let the bot scrolls down to the bottom of the content element, most of the time the bot needs to scroll down to the bottom of the page.\n",
" # like this statement: bot.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
" bot.execute_script('window.scrollTo(0, document.getElementById(\"content\").scrollHeight);')\n",
"\n",
"# terminate the bot object.\n",
"bot.close()\n",
"\n",
"# Store the results as a pandas dataframe\n",
"df = pd.DataFrame(results)\n",
"\n",
"# notify the completion of the crawling in the console.\n",
"print(\"the crawling task is finished.\")"
],
"metadata": {
"id": "uj2ClnTbRGp7"
},
"execution_count": null,
"outputs": []
},
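{
"cell_type": "code",
"source": [
"# Optional check: preview the first few collected rows to verify the crawl\n",
"# before saving anything to Drive.\n",
"df.head()"
],
"metadata": {},
"execution_count": null,
"outputs": []
},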
{
"cell_type": "code",
"source": [
"# Create data on to Google Drive\n",
"from google.colab import drive\n",
"# Mount your Drive to the Colab VM.\n",
"#drive._mount('/gdrive')\n",
"drive.mount('/gdrive')\n",
"\n",
"# the file path where to store the output csv on google drive\n",
"output_file = '/gdrive/My Drive/videos.csv'\n",
"\n",
"# Save the dataframe as a csv file\n",
"df.to_csv(output_file, index=False)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "L3I9cWYBRI8r",
"outputId": "e3dc7d97-03c2-4060-c8c3-e8726c39cb39"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /gdrive\n"
]
}
]
},
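{
"cell_type": "code",
"source": [
"# Optional check: read the csv back from Drive to verify the export succeeded.\n",
"pd.read_csv(output_file).head()"
],
"metadata": {},
"execution_count": null,
"outputs": []
},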
{
"cell_type": "code",
"source": [
"# download the csv to your local computer\n",
"from google.colab import files\n",
"files.download(output_file)\n",
"print(\"the csv has been downloaded to your local computer. The program has been completed successfully.\")"
],
"metadata": {
"id": "ymNL_9hpTJwr",
"outputId": "74eae079-a644-4442-f4e9-fd590d87a823",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"the csv has been downloaded to your local computer. The program has been completed successfully.\n"
]
}
]
}
]
}
