def load_excel_sheets_1(gcs_path, file_names, columns_to_keep=None):
    """
    Load the first sheet of each Excel file in GCS, add a 'year' column
    derived from the filename, and drop rows with a NaN 'permitnumber'.

    Parameters:
        gcs_path (str): The Google Cloud Storage path where the files live
            (must end with '/' — filenames are appended directly).
        file_names (list): Excel file names under gcs_path, shaped like
            "all_permits_<year>_sampleset.xlsx".
        columns_to_keep (list, optional): Columns retained in the result.
            Defaults to the original hard-coded permit-column subset, so
            existing callers are unaffected.

    Returns:
        pd.DataFrame: One concatenated DataFrame covering all files.
            Empty (but with the expected columns) when file_names is empty.
    """
    # Backward-compatible default for the previously hard-coded column list.
    if columns_to_keep is None:
        columns_to_keep = ['permitnumber', 'year', 'permitvalidfrom', 'permitvalidto',
                           'loaddescription', 'origin', 'destination', 'authorizedhighways']

    # Google Cloud Storage file system object (credentials come from the
    # ambient environment / gcloud SDK).
    fs = gcsfs.GCSFileSystem()

    df_list = []

    # Scope the warning suppression to this function only. The original
    # called warnings.filterwarnings("ignore"), which silenced warnings
    # for the entire session — a leaky global side effect.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for file in file_names:
            # Filenames look like "all_permits_<year>_sampleset.xlsx", so
            # the year is the third '_'-separated token.
            # NOTE(review): breaks on differently shaped names — confirm
            # the naming convention holds for future drops.
            year = file.split('_')[2]

            # Read only the first sheet of the workbook.
            with fs.open(f"{gcs_path}{file}", 'rb') as f:
                df = pd.read_excel(f, sheet_name=0)

            # Normalize headers: remove spaces, lowercase everything.
            df.columns = [col.replace(" ", "").lower() for col in df.columns]

            # Tag every row with the source-file year (kept as a string,
            # matching the original behavior).
            df['year'] = year

            # Keep the requested columns; a permit without a permit number
            # is unusable, so drop those rows.
            df = df[columns_to_keep].dropna(subset=['permitnumber'])

            df_list.append(df)

    # pd.concat([]) raises ValueError — return an empty, well-formed
    # DataFrame instead when no files were given.
    if not df_list:
        return pd.DataFrame(columns=columns_to_keep)

    return pd.concat(df_list, ignore_index=True)
# Parsing function to create individual route locations
def parse_routes(route_info):
    """
    Split an 'authorizedhighways' string into a flat list of route points.

    The raw value looks like
    "* from CLAWITER RD S/B ON RAMP - 092E - 880N - ... - to ANTELOPE RD ...":
    we first split on the " - from " / " - to " markers, then break each
    remaining segment on the " - " delimiter.

    Parameters:
        route_info: The raw cell value. Non-string values (e.g. the float
            NaN pandas uses for missing data) yield an empty list.

    Returns:
        list[str]: Stripped route points, in original order.
    """
    # Guard: the source column can contain NaN (a float); the original
    # implementation crashed inside re.split with a TypeError there.
    if not isinstance(route_info, str):
        return []

    # Isolate each route section by the "from"/"to" markers.
    raw_segments = re.split(r'\s*-\s*from\s+|\s*-\s*to\s+', route_info)

    segments = []
    for segment in raw_segments:
        # Splitting once on ' - ' and then re-splitting the remainder on
        # the same delimiter (the original's two-step approach) is
        # equivalent to a single full split, so do it in one pass.
        segments.extend(part.strip() for part in segment.split(' - '))

    return segments


# Parse every row into a temporary list column of route points.
df['route_segments'] = df['authorizedhighways'].apply(parse_routes)

# Widest route determines how many route_location_i columns we need.
max_locations = df['route_segments'].apply(len).max()

# One column per position; rows with shorter routes get None padding.
for i in range(max_locations):
    df[f'route_location_{i}'] = df['route_segments'].apply(
        lambda x: x[i] if i < len(x) else None
    )

# The list column was only scaffolding for the wide columns.
df.drop(columns=['route_segments'], inplace=True)
+ "source": [ + "# Apply the parsing function to each row in the authorizedhighways column\n", + "#df['parsed_routes'] = df['authorizedhighways'].apply(parse_routes)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "32d5abec-9575-4855-85ba-b257a334e34b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
permitnumberyearpermitvalidfrompermitvalidtoloaddescriptionorigindestinationauthorizedhighwaysroute_location_0route_location_1...route_location_15route_location_16route_location_17route_location_18route_location_19route_location_20route_location_21route_location_22route_location_23route_location_24
0e23-013125202302/15/202302/21/202375' KELLY BARHAYWARDANTELOPE* from CLAWITER RD S/B ON RAMP - 092E - 880N -...* from CLAWITER RD S/B ON RAMP092E...080EANTELOPE RD exit (ANTELOPE RD N/B OFF RAMP) *NoneNoneNoneNoneNoneNoneNoneNone
1e23-021610202303/20/202303/26/2023UNLADEN 9 AXLE WITH 2 DECK INSERTSFONTANAONTARIO* from SIERRA AVE W/B ON RAMP - 015S - 060W - ...* from SIERRA AVE W/B ON RAMP015S...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
2e23-022752202303/22/202303/28/2023M95 TRACKED CONVEYORDIXONFRESNO* from INDUSTRIAL WAY - 113N - 080W - 680S - 5...* from INDUSTRIAL WAY113N...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
3e23-036568202305/05/202305/11/20235 TROWEL MACHINES (END TO END) & MISC LEGAL FR...ELK GROVECA/NV BORDER* from GRANT LINE RD W/B ON RAMP - 099N - 051N...* from GRANT LINE RD W/B ON RAMP099N...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
4e23-017248202303/02/202303/08/2023150H GRADERFAIRFIELDSARATOGA* from AIR BASE PKWY N/B ON RAMP - 080W - 680S...* from AIR BASE PKWY N/B ON RAMP080W...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
\n", + "

5 rows × 33 columns

\n", + "
" + ], + "text/plain": [ + " permitnumber year permitvalidfrom permitvalidto \\\n", + "0 e23-013125 2023 02/15/2023 02/21/2023 \n", + "1 e23-021610 2023 03/20/2023 03/26/2023 \n", + "2 e23-022752 2023 03/22/2023 03/28/2023 \n", + "3 e23-036568 2023 05/05/2023 05/11/2023 \n", + "4 e23-017248 2023 03/02/2023 03/08/2023 \n", + "\n", + " loaddescription origin destination \\\n", + "0 75' KELLY BAR HAYWARD ANTELOPE \n", + "1 UNLADEN 9 AXLE WITH 2 DECK INSERTS FONTANA ONTARIO \n", + "2 M95 TRACKED CONVEYOR DIXON FRESNO \n", + "3 5 TROWEL MACHINES (END TO END) & MISC LEGAL FR... ELK GROVE CA/NV BORDER \n", + "4 150H GRADER FAIRFIELD SARATOGA \n", + "\n", + " authorizedhighways \\\n", + "0 * from CLAWITER RD S/B ON RAMP - 092E - 880N -... \n", + "1 * from SIERRA AVE W/B ON RAMP - 015S - 060W - ... \n", + "2 * from INDUSTRIAL WAY - 113N - 080W - 680S - 5... \n", + "3 * from GRANT LINE RD W/B ON RAMP - 099N - 051N... \n", + "4 * from AIR BASE PKWY N/B ON RAMP - 080W - 680S... \n", + "\n", + " route_location_0 route_location_1 ... route_location_15 \\\n", + "0 * from CLAWITER RD S/B ON RAMP 092E ... 080E \n", + "1 * from SIERRA AVE W/B ON RAMP 015S ... None \n", + "2 * from INDUSTRIAL WAY 113N ... None \n", + "3 * from GRANT LINE RD W/B ON RAMP 099N ... None \n", + "4 * from AIR BASE PKWY N/B ON RAMP 080W ... 
None \n", + "\n", + " route_location_16 route_location_17 \\\n", + "0 ANTELOPE RD exit (ANTELOPE RD N/B OFF RAMP) * None \n", + "1 None None \n", + "2 None None \n", + "3 None None \n", + "4 None None \n", + "\n", + " route_location_18 route_location_19 route_location_20 route_location_21 \\\n", + "0 None None None None \n", + "1 None None None None \n", + "2 None None None None \n", + "3 None None None None \n", + "4 None None None None \n", + "\n", + " route_location_22 route_location_23 route_location_24 \n", + "0 None None None \n", + "1 None None None \n", + "2 None None None \n", + "3 None None None \n", + "4 None None None \n", + "\n", + "[5 rows x 33 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d2b8b899-d4d4-4c11-94a6-c99f91802988", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(\"justlooking.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f986819c-2db3-42cb-a83f-ae3666d2293e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}