From 3366ef88ccdaf0e8dbebcb5c5a1174281cc8c450 Mon Sep 17 00:00:00 2001 From: Junwon Choi <56373973+cjunwon@users.noreply.github.com> Date: Tue, 5 Mar 2024 14:13:52 -0800 Subject: [PATCH] Sports gymnastics 2024 (#198) * Add url checking file for gymnastics * Fix covid model --- _config.yml | 4 + _posts/2020-05-12-covid-model.md | 10 +- .../gymnastics_score_crawl.ipynb | 159 ++++++++++++++++++ .../sports-gymnastics-2024/valid_urls.txt | 8 + 4 files changed, 176 insertions(+), 5 deletions(-) create mode 100644 collaborations/sports-gymnastics-2024/gymnastics_score_crawl.ipynb create mode 100644 collaborations/sports-gymnastics-2024/valid_urls.txt diff --git a/_config.yml b/_config.yml index 7c2cd888d..866c6f79e 100644 --- a/_config.yml +++ b/_config.yml @@ -33,5 +33,9 @@ paginate_path: "/page/:num/" # Build settings markdown: kramdown +kramdown: + input: GFM + math_engine: mathjax + # plugins plugins: [jekyll-paginate] diff --git a/_posts/2020-05-12-covid-model.md b/_posts/2020-05-12-covid-model.md index 12a66ea85..71ec6f515 100644 --- a/_posts/2020-05-12-covid-model.md +++ b/_posts/2020-05-12-covid-model.md @@ -124,10 +124,10 @@ A stochastic block model considers a set of student communities, grouped by depa Then a matrix A defines the probabilities used to randomly assign students from each department to courses in other departments. Celli, j of A represents the probability that a student housed in department i will take a course in department j. For this example, there's a probability of 0.7 that a sciences student will take a sciences class, a 0.1 probability they will take a business class, and a 0.2 probability they will take a humanities class. 
$$ -Sciences&Business&Humanities\\ -0.7&0.1&0.2\\ -0.1&0.8&0.1\\ -0.1&0.2&0.7 + Sciences&Business&Humanities\\ + 0.7&0.1&0.2\\ + 0.1&0.8&0.1\\ + 0.1&0.2&0.7 \end{pmatrix}$$ This example has simulated probabilities, but the real probabilities in our model are based on the number of GE, diversity and language courses in each major. So if a College of Letters and Science student in the mathematics department takes 140 units of major courses and 40 units of GEs, then the probability of the student being enrolled in the mathematics department is $$\frac{140}{180}$$, and, in the other GE-offering departments, is $$\frac{40}{180}$$, which in turn are distributed by department. So if there are three GE courses offered in total, with two of them being offered in department A and one being offered in department B, department A will have probability $$\frac{2}{3} * \frac{40}{180}$$, and department B will have probability $$\frac{1}{3} * \frac{40}{180}$$. @@ -137,7 +137,7 @@ Sciences&Business&Humanities\\ 0.7&0.1&0.2\\ 0.1&0.8&0.1\\ 0.1&0.2&0.7 -\end{pmatrix}\\\$\$ +\end{pmatrix}\$\$ This example has simulated probabilities, but the real probabilities in our model are based on the number of GE, diversity and language courses in each major. So if a College of Letters and Science student in the mathematics department takes 140 units of major courses and 40 units of GEs, then the probability of the student being enrolled in the mathematics department is $$\frac{140}{180}$$, and, in the other GE-offering departments, is $$\frac{40}{180}$$, which in turn are distributed by department. So if there are three GE courses offered in total, with two of them being offered in department A and one being offered in department B, department A will have probability $$\frac{2}{3} * \frac{40}{180}$$, and department B will have probability $$\frac{1}{3} * \frac{40}{180}$$. 
diff --git a/collaborations/sports-gymnastics-2024/gymnastics_score_crawl.ipynb b/collaborations/sports-gymnastics-2024/gymnastics_score_crawl.ipynb new file mode 100644 index 000000000..1196d611f --- /dev/null +++ b/collaborations/sports-gymnastics-2024/gymnastics_score_crawl.ipynb @@ -0,0 +1,159 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "example_url = \"https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2023/1/15/Stats.htm\"\n", + "\n", + "base_url = \"https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/\"\n", + "end_url = \"/Stats.htm\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import datetime\n", + "import time\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a list of all January and February dates for each year (list in list) for the years 2000-2024. 
Each year is a list and date as a string in yyyy/m/d format\n", + "\n", + "years = list(range(2000,2025))\n", + "dates = []\n", + "\n", + "for year in years:\n", + " year_dates = []\n", + " for month in range(1,3):\n", + " for day in range(1,32):\n", + " try:\n", + " date = datetime.date(year, month, day)\n", + " year_dates.append(date.strftime(\"%Y/%-m/%-d\"))\n", + " except ValueError:\n", + " pass\n", + " dates.append(year_dates)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# For each year, create a list of urls for each date in the dates list\n", + "\n", + "urls = []\n", + "\n", + "for year in dates:\n", + " year_urls = []\n", + " for date in year:\n", + " year_urls.append(base_url + date + end_url)\n", + " urls.append(year_urls)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# iterate through each url and check if url exists. If it does, add to list of valid urls. If not, pass. 
If there is a valid url for a year, skip to the next year.\n", + "\n", + "valid_urls = []\n", + "\n", + "for year in urls:\n", + " for url in year:\n", + " try:\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " valid_urls.append(url)\n", + " break\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2017/1/7/Stats.htm',\n", + " 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2018/1/7/Stats.htm',\n", + " 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2019/1/5/Stats.htm',\n", + " 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2020/1/5/Stats.htm',\n", + " 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2021/1/24/Stats.htm',\n", + " 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2022/1/18/Stats.htm',\n", + " 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2023/1/8/Stats.htm',\n", + " 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2024/1/10/Stats.htm']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "valid_urls" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# export valid urls to a txt file\n", + "\n", + "with open(\"valid_urls.txt\", \"w\") as file:\n", + " for url in valid_urls:\n", + " file.write(url + \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/collaborations/sports-gymnastics-2024/valid_urls.txt b/collaborations/sports-gymnastics-2024/valid_urls.txt new file mode 100644 index 000000000..1bfe00418 --- /dev/null +++ b/collaborations/sports-gymnastics-2024/valid_urls.txt @@ -0,0 +1,8 @@ +https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2017/1/7/Stats.htm +https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2018/1/7/Stats.htm +https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2019/1/5/Stats.htm +https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2020/1/5/Stats.htm +https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2021/1/24/Stats.htm +https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2022/1/18/Stats.htm +https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2023/1/8/Stats.htm +https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2024/1/10/Stats.htm