diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bac463d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,63 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual Environment
+.env
+.venv
+env/
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+.project
+.pydevproject
+.settings/
+
+# Project specific
+temp_data/
+*.bak
+*.bacpac
+*.dump
+*.sql
+vpic.db
+logs/
+
+# Docker
+.docker/
+docker-compose.override.yml
+
+# OS specific
+.DS_Store
+Thumbs.db
+*.log
+
+# Test coverage
+.coverage
+htmlcov/
+.pytest_cache/
+.tox/
+coverage.xml
+*.cover
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..393c372
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,92 @@
+# Makefile
+.PHONY: all clean setup install start-containers download restore migrate-pg migrate-sqlite verify-pg backup export-pg test
+
+# Variables
+BACKUP_NAME := vpic_postgres_$(shell date +%Y%m%d_%H%M%S)
+TEMP_DIR := temp_data
+SQL_CONTAINER := sqltemp
+PG_CONTAINER := pg_target
+VENV := .venv
+PYTHON := $(VENV)/bin/python
+PIP := $(VENV)/bin/pip
+
+# Default target (verify-pg is the verification target defined below; there is no plain `verify` rule)
+all: clean setup start-containers download restore migrate-pg verify-pg backup
+
+# Clean environment
+clean:
+	@echo "Cleaning environment..."
+	docker-compose -f docker/docker-compose.yml down -v || true
+	rm -rf $(VENV) *.egg-info dist build __pycache__ vpic_migration/__pycache__
+
+# Setup Python virtual environment
+$(VENV)/bin/activate: requirements.txt
+	python -m venv $(VENV)
+	$(PIP) install -U pip setuptools wheel
+	$(PIP) install -r requirements.txt
+	$(PYTHON) setup.py develop
+
+# Setup environment
+setup: $(VENV)/bin/activate
+
+# Install package in development mode
+install: setup
+	$(PYTHON) setup.py develop
+
+# Start Docker containers
+start-containers:
+	@echo "Starting containers..."
+ docker-compose -f docker/docker-compose.yml up -d + @echo "Waiting for containers to be ready..." + sleep 20 + +# Download vPIC data +download: + @echo "Downloading vPIC data..." + ./scripts/download_vpic.sh + +# Restore SQL Server backup +restore: + @echo "Restoring SQL Server backup..." + ./scripts/restore_backup.sh + ./scripts/verify_db.sh + +# Migrate to PostgreSQL +migrate-pg: install + @echo "Migrating to PostgreSQL..." + TARGET_DB=postgres $(PYTHON) -m vpic_migration.migrate + +# Migrate to SQLite +migrate-sqlite: install + @echo "Migrating to SQLite..." + TARGET_DB=sqlite $(PYTHON) -m vpic_migration.migrate + +# Verify migration +verify-pg: + @echo "Verifying migration..." + @echo "SQL Server tables:" + ./scripts/verify_db.sh + @echo "PostgreSQL tables:" + docker exec $(PG_CONTAINER) psql -U postgres -d vpic -c "\dt+" + +# Run tests +test: install + @echo "Running tests..." + $(PYTHON) -m pytest tests/ + +# Create backup +backup: export-pg + +# Export PostgreSQL database +export-pg: + @echo "Exporting PostgreSQL database..." + mkdir -p $(TEMP_DIR) + @echo "Creating schema-only backup..." + docker exec $(PG_CONTAINER) pg_dump -U postgres -d vpic --schema-only > $(TEMP_DIR)/$(BACKUP_NAME)_schema.sql + @echo "Creating data-only backup..." + docker exec $(PG_CONTAINER) pg_dump -U postgres -d vpic --data-only > $(TEMP_DIR)/$(BACKUP_NAME)_data.sql + @echo "Creating complete backup..." + docker exec $(PG_CONTAINER) pg_dump -U postgres -d vpic -Fc > $(TEMP_DIR)/$(BACKUP_NAME).dump + @echo "Backup files created in $(TEMP_DIR):" + @ls -lh $(TEMP_DIR)/$(BACKUP_NAME)* + diff --git a/README.md b/README.md new file mode 100644 index 0000000..3a3f78a --- /dev/null +++ b/README.md @@ -0,0 +1,122 @@ +# vPIC Database Migration Tool + +A robust tool for downloading, migrating, and managing the NHTSA's Vehicle Product Information Catalog (vPIC) database across different database platforms (SQL Server, PostgreSQL, and SQLite). 
+ +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) + +## Overview + +This tool facilitates the migration of the NHTSA's vPIC database, which contains comprehensive vehicle specification data, including: +- Vehicle Makes, Models, and Types +- Manufacturer Information +- Vehicle Specifications and Features +- WMI (World Manufacturer Identifier) Data +- VIN Decoder implementation + +## Features + +- 🚀 Automated download of the latest vPIC database backup +- 🔄 Migration support for multiple database platforms: + - Microsoft SQL Server + - PostgreSQL + - SQLite +- ✅ Data integrity verification +- 📊 Progress tracking with detailed logging +- 🔧 Configurable settings and type mappings +- 🐳 Docker support for easy deployment + +## Prerequisites + +- Python 3.8 or higher +- Docker and Docker Compose +- Make (optional, but recommended) + +### System Requirements + +- Storage: At least 10GB free space (varies based on data size) +- Memory: Minimum 4GB RAM recommended +- OS: Compatible with Linux, macOS, and Windows + +## Quick Start + +1. Clone the repository: + git clone https://github.com/yourusername/vpic-migration.git + cd vpic-migration + +2. Install dependencies: + # On macOS + ./install_deps.sh + + # On Windows / Linux + python -m venv .venv + .venv\Scripts\activate + pip install -r requirements.txt + +3. Start the containers: + make start-containers + +4. 
Run the migration: + make download + make restore + make migrate-pg + make migrate-sqlite + make verify-pg + make backup + + or + + make all + +## Usage + +### Basic Usage + +The simplest way to use the tool is through the provided Makefile commands: + +# Run all steps +make all + +# Download latest vPIC data +make download + +# Restore SQL Server backup +make restore + +# Migrate to PostgreSQL +make migrate-pg + +# Migrate to SQLite +make migrate-sqlite + +# Verify migration +make verify + +# Create backup +make backup + +## Configuration + +Configuration can be modified through environment variables or by editing vpic_migration/settings.py: + +SQL_SERVER = { + "driver": "ODBC Driver 18 for SQL Server", + "server": "localhost", + "database": "vpic", + "user": "SA", + "password": "YourPassword", + "trust_cert": "yes" +} + +See [CONFIGURATION.md](docs/CONFIGURATION.md) for all available options. + +## Data Structure + +The vPIC database contains numerous tables with vehicle-related information. Key tables include: + +- Make: Vehicle manufacturers +- Model: Vehicle models +- VehicleType: Types of vehicles +- WMI: World Manufacturer Identifier information +- And many more... + +For complete schema information, see [DATA_STRUCTURE.md](docs/DATA_STRUCTURE.md). diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 0000000..6effae1 --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,89 @@ +version: '3.8' +services: + sqlserver: + build: + context: . 
+ dockerfile: sqledge.Dockerfile + container_name: sqltemp + environment: + - ACCEPT_EULA=1 + - MSSQL_SA_PASSWORD=DevPassword123# + - MSSQL_PID=Developer + # Disable encryption requirement for local dev + - MSSQL_ENCRYPT=DISABLED + ports: + - "1433:1433" + volumes: + - ../temp_data:/var/opt/mssql/backup + healthcheck: + test: /opt/mssql-tools18/bin/sqlcmd -S localhost -U SA -P "DevPassword123#" -C -Q "SELECT 1" || exit 1 + interval: 10s + timeout: 3s + retries: 10 + start_period: 10s + + + postgres: + image: postgres:15 + container_name: pg_target + environment: + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=postgres + - POSTGRES_DB=vpic + ports: + - "5432:5432" + volumes: + - postgres_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 10s + timeout: 5s + retries: 5 + + # postgres: + # image: postgres:15 + # container_name: pg_target + # environment: + # - POSTGRES_USER=postgres + # - POSTGRES_PASSWORD=postgres + # - POSTGRES_DB=vpic + # ports: + # - "5432:5432" + # volumes: + # - postgres_data:/var/lib/postgresql/data + # command: + # - postgres + # - "-c" + # - "listen_addresses=*" + # - "-c" + # - "password_encryption=md5" + + # healthcheck: + # test: ["CMD-SHELL", "pg_isready -U postgres"] + # interval: 10s + # timeout: 5s + # retries: 5 + + # postgres: + # image: postgres:15 + # container_name: pg_target + # environment: + # - POSTGRES_USER=postgres + # - POSTGRES_PASSWORD=postgres + # - POSTGRES_DB=vpic + # ports: + # - "5432:5432" + # volumes: + # - postgres_data:/var/lib/postgresql/data + # command: + # - "postgres" + # - "-c" + # - "password_encryption=md5" + # healthcheck: + # test: ["CMD-SHELL", "pg_isready -U postgres"] + # interval: 10s + # timeout: 5s + # retries: 5 + +volumes: + postgres_data: \ No newline at end of file diff --git a/docker/sqledge.Dockerfile b/docker/sqledge.Dockerfile new file mode 100644 index 0000000..edaaccf --- /dev/null +++ b/docker/sqledge.Dockerfile @@ -0,0 +1,26 @@ +# 
./docker/sqledge.Dockerfile +FROM mcr.microsoft.com/azure-sql-edge:latest + +USER root + +# Create required directories and set permissions +RUN mkdir -p /var/lib/apt/lists/partial && \ + chmod 755 /var/lib/apt/lists/partial + +# Install prerequisites and SQL tools +RUN apt-get update && \ + apt-get install -y curl gnupg2 unixodbc-dev && \ + curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \ + curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list && \ + apt-get update && \ + ACCEPT_EULA=Y apt-get install -y mssql-tools18 msodbcsql18 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Add SQL tools to PATH +ENV PATH="/opt/mssql-tools18/bin:${PATH}" + +# Switch back to mssql user +USER mssql + +CMD [ "/opt/mssql/bin/sqlservr" ] \ No newline at end of file diff --git a/docker/test.sh b/docker/test.sh new file mode 100755 index 0000000..baa3ec2 --- /dev/null +++ b/docker/test.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -euo pipefail + +echo "Building and starting containers..." +docker-compose down -v +docker-compose build +docker-compose up -d + +echo "Waiting for SQL Server to be ready..." +sleep 20 + +echo "Testing SQL Server connection..." +docker exec sqltemp /opt/mssql-tools18/bin/sqlcmd \ + -S localhost \ + -U SA \ + -P "DevPassword123#" \ + -C \ + -Q "SELECT @@VERSION" + +if [ $? -eq 0 ]; then + echo "SQL Server is running and accessible!" + + # Test creating a database and table + echo "Testing database operations..." + docker exec sqltemp /opt/mssql-tools18/bin/sqlcmd \ + -S localhost \ + -U SA \ + -P "DevPassword123#" \ + -C \ + -Q "CREATE DATABASE TestDB; USE TestDB; CREATE TABLE TestTable (ID INT); INSERT INTO TestTable VALUES (1); SELECT * FROM TestTable;" +else + echo "Failed to connect to SQL Server" + docker logs sqltemp + exit 1 +fi + +echo "Testing Postgres connection..." +docker exec postgrestemp psql -U postgres -d postgres -c "SELECT version();" + +if [ $? 
-eq 0 ]; then + echo "Postgres is running and accessible!" + + # Test creating a database and table + echo "Testing database operations..." + docker exec postgrestemp psql -U postgres -d postgres -c "CREATE DATABASE TestDB; \c TestDB; CREATE TABLE TestTable (ID INT); INSERT INTO TestTable VALUES (1); SELECT * FROM TestTable;" +else + echo "Failed to connect to Postgres" + docker logs postgrestemp + exit 1 +fi + diff --git a/docs/DATA_STRUCTURE.md b/docs/DATA_STRUCTURE.md new file mode 100644 index 0000000..16f4fcf --- /dev/null +++ b/docs/DATA_STRUCTURE.md @@ -0,0 +1,130 @@ +# VPIC Database Structure + +## Core Components + +The VPIC database is organized around several key components that work together to decode Vehicle Identification Numbers (VINs) and provide vehicle specifications. + +### Primary Tables + +#### 1. Manufacturer & WMI +```mermaid +erDiagram + Manufacturer ||--o{ WMI : "produces" + Country ||--o{ WMI : "location_of" + Make ||--o{ Wmi_Make : "identifies" + + WMI { + varchar Wmi PK + int ManufacturerId FK + int VehicleTypeId FK + int CountryId FK + } +``` + +The World Manufacturer Identifier (WMI) system is the foundation of VIN decoding: +- `Manufacturer`: Company that produces vehicles +- `WMI`: 3-6 character codes assigned to manufacturers +- `Country`: Manufacturing location +- `Make`: Brand names under manufacturers + +#### 2. Vehicle Models +```mermaid +erDiagram + Make ||--o{ Make_Model : "has" + Model ||--o{ Make_Model : "belongs_to" + VehicleType ||--o{ WMI : "categorizes" + + Make_Model { + int Id PK + int MakeId FK + int ModelId FK + } +``` + +Vehicle model relationships: +- `Make_Model`: Links makes to their models +- `VehicleType`: Categories like Passenger Car, Motorcycle, etc. +- `Model`: Specific vehicle models + +#### 3. 
VIN Pattern Matching +```mermaid +erDiagram + WMI ||--o{ Wmi_VinSchema : "uses" + VinSchema ||--o{ Pattern : "contains" + Element ||--o{ Pattern : "defines" + + Pattern { + int Id PK + int VinSchemaId FK + varchar Keys + int ElementId FK + varchar AttributeId + } +``` + +Pattern matching system: +- `VinSchema`: Decoding rules for manufacturers +- `Pattern`: Position-specific matching rules +- `Element`: Decodable vehicle attributes +- `Wmi_VinSchema`: Links WMIs to applicable schemas + +### Key Attribute Tables + +Essential vehicle specification tables: +- `BodyStyle`: Vehicle body configurations +- `DriveType`: 2WD, 4WD, AWD, etc. +- `EngineModel`: Engine specifications +- `Transmission`: Transmission types +- `FuelType`: Fuel system information +- `VehicleType`: Basic vehicle categories + +### Core Fields + +#### VinSchema +```sql +CREATE TABLE VinSchema ( + Id int PRIMARY KEY, + Name varchar(100), + SourceWMI varchar(6), + TobeQCed bit +) +``` + +#### Pattern +```sql +CREATE TABLE Pattern ( + Id int PRIMARY KEY, + VinSchemaId int, + Keys varchar(50), + ElementId int, + AttributeId varchar(50) +) +``` + +#### Element +```sql +CREATE TABLE Element ( + Id int PRIMARY KEY, + Name varchar(100), + Code varchar(50), + LookupTable varchar(50), + GroupName varchar(50), + DataType varchar(20) +) +``` + +## VIN Position Reference + +Standard 17-character VIN structure: +1. Position 1-3: World Manufacturer Identifier (WMI) +2. Position 4-8: Vehicle Description Section (VDS) +3. Position 9: Check Digit +4. Position 10: Model Year +5. Position 11: Plant Code +6. Position 12-17: Sequential Number + +## Data Attribution + +This database structure is based on the NHTSA's vPIC (Vehicle Product Information Catalog) system. 
When using this data, please attribute: + +"National Highway Traffic Safety Administration (NHTSA) - Vehicle Product Information Catalog (vPIC)" \ No newline at end of file diff --git a/install_deps.sh b/install_deps.sh new file mode 100755 index 0000000..9a7ba07 --- /dev/null +++ b/install_deps.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -euo pipefail + +echo "Installing dependencies for vPIC migration..." + +# Check if running on macOS +if [[ "$OSTYPE" != "darwin"* ]]; then + echo "This script is designed for macOS. Please adapt for your OS." + exit 1 +fi + +# Check if Homebrew is installed +if ! command -v brew &> /dev/null; then + echo "Installing Homebrew..." + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" +fi + +# Install system dependencies +echo "Installing system dependencies..." +brew install unixodbc +brew install postgresql@14 + +# Install Microsoft ODBC driver +echo "Installing Microsoft ODBC driver..." +brew tap microsoft/mssql-release https://github.com/Microsoft/homebrew-mssql-release +brew update +brew install msodbcsql18 +brew install mssql-tools18 + +# Set up Python virtual environment +echo "Setting up Python virtual environment..." +python -m venv .venv +source .venv/bin/activate + +# Upgrade pip +echo "Upgrading pip..." +pip install --upgrade pip + +# Install Python dependencies one by one with verbose output +echo "Installing Python dependencies..." +pip install pyodbc -v +pip install psycopg2-binary -v +pip install tqdm -v +pip install python-dotenv -v + +echo "Installation completed!" 
+echo "You can now activate the virtual environment with: source .venv/bin/activate" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2b39dcc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +pyodbc>=5.0.1 +psycopg2-binary==2.9.9 +tqdm==4.66.1 +pytest==7.4.3 # For testing +python-dotenv==1.0.0 # For configuration \ No newline at end of file diff --git a/scripts/db_utils.sh b/scripts/db_utils.sh new file mode 100644 index 0000000..c687bb3 --- /dev/null +++ b/scripts/db_utils.sh @@ -0,0 +1,104 @@ +#!/bin/bash +set -euo pipefail + +# Common database connection settings +source .env 2>/dev/null || true + +# Default values if not set in .env +SQL_USER="${SQL_USER:-SA}" +SQL_PASSWORD="${SQL_PASSWORD:-DevPassword123#}" +PG_USER="${PG_USER:-postgres}" +PG_PASSWORD="${PG_PASSWORD:-postgres}" + +# Container names +SQL_CONTAINER="${SQL_CONTAINER:-sqltemp}" +PG_CONTAINER="${PG_CONTAINER:-pg_target}" + +wait_for_sqlserver() { + echo "Waiting for SQL Server to be ready..." + ATTEMPT=0 + MAX_ATTEMPTS=30 + + until docker exec "$SQL_CONTAINER" /opt/mssql-tools18/bin/sqlcmd \ + -S localhost \ + -U "$SQL_USER" \ + -P "$SQL_PASSWORD" \ + -C \ + -Q "SELECT @@VERSION" &>/dev/null + do + ATTEMPT=$((ATTEMPT+1)) + echo "Waiting... ($ATTEMPT/$MAX_ATTEMPTS)" + [ $ATTEMPT -eq $MAX_ATTEMPTS ] && return 1 + sleep 2 + done + + echo "SQL Server is ready!" + return 0 +} + +wait_for_postgres() { + echo "Waiting for PostgreSQL to be ready..." + ATTEMPT=0 + MAX_ATTEMPTS=30 + + until docker exec "$PG_CONTAINER" pg_isready -U "$PG_USER" &>/dev/null + do + ATTEMPT=$((ATTEMPT+1)) + echo "Waiting... ($ATTEMPT/$MAX_ATTEMPTS)" + [ $ATTEMPT -eq $MAX_ATTEMPTS ] && return 1 + sleep 2 + done + + echo "PostgreSQL is ready!" + return 0 +} + +verify_sqlserver_connection() { + echo "Verifying SQL Server connection..." 
+ docker exec "$SQL_CONTAINER" /opt/mssql-tools18/bin/sqlcmd \ + -S localhost \ + -U "$SQL_USER" \ + -P "$SQL_PASSWORD" \ + -C \ + -Q "SELECT DB_NAME(database_id) as DatabaseName, + state_desc as Status + FROM sys.databases + WHERE name = 'vpic'" +} + +verify_postgres_connection() { + echo "Verifying PostgreSQL connection..." + docker exec "$PG_CONTAINER" psql \ + -U "$PG_USER" \ + -d vpic \ + -c "\dt+" +} + +# Database backup functions +backup_postgres() { + local backup_dir="$1" + local timestamp=$(date +%Y%m%d_%H%M%S) + local backup_base="${backup_dir}/vpic_postgres_${timestamp}" + + echo "Creating PostgreSQL backups..." + + # Schema only backup + docker exec "$PG_CONTAINER" pg_dump \ + -U "$PG_USER" \ + -d vpic \ + --schema-only > "${backup_base}_schema.sql" + + # Data only backup + docker exec "$PG_CONTAINER" pg_dump \ + -U "$PG_USER" \ + -d vpic \ + --data-only > "${backup_base}_data.sql" + + # Complete backup in custom format + docker exec "$PG_CONTAINER" pg_dump \ + -U "$PG_USER" \ + -d vpic \ + -Fc > "${backup_base}.dump" + + echo "Backups created at: ${backup_base}*" +} \ No newline at end of file diff --git a/scripts/download_vpic.sh b/scripts/download_vpic.sh new file mode 100755 index 0000000..cb24252 --- /dev/null +++ b/scripts/download_vpic.sh @@ -0,0 +1,40 @@ +# ./scripts/download_vpic.sh +#!/usr/bin/env bash +set -euo pipefail + +VPIC_PAGE_URL="https://vpic.nhtsa.dot.gov/api/" +DOWNLOAD_DIR="${DOWNLOAD_DIR:-temp_data}" + +mkdir -p "$DOWNLOAD_DIR" + +echo "Fetching vPIC main API page..." +content=$(curl -s "$VPIC_PAGE_URL") + +# 1) Extract the relative URL. 
For example, /api/vPICList_lite_2024_12.bak.zip +# We’ll look for: /api/vPICList_lite_YYYY_MM.bak.zip + +latest_relative_url=$(echo "$content" \ + | grep -oE '/api/vPICList_lite_[0-9]{4}_[0-9]{2}\.bak\.zip' \ + | head -n 1) + +if [ -z "$latest_relative_url" ]; then + echo "ERROR: Could not find any .bak.zip link on $VPIC_PAGE_URL" + exit 1 +fi + +# 2) Prepend https://vpic.nhtsa.dot.gov to get a full download URL +latest_url="https://vpic.nhtsa.dot.gov${latest_relative_url}" + +echo "Latest link found: $latest_url" + +# 3) Download the file +zipfile="${DOWNLOAD_DIR}/vpic.bak.zip" +echo "Downloading to $zipfile ..." +curl -L "$latest_url" -o "$zipfile" + +# 4) Unzip +echo "Unzipping $zipfile ..." +unzip -o "$zipfile" -d "$DOWNLOAD_DIR" + +echo "Contents in $DOWNLOAD_DIR:" +ls -l "$DOWNLOAD_DIR" diff --git a/scripts/fix_structure.sh b/scripts/fix_structure.sh new file mode 100644 index 0000000..af74139 --- /dev/null +++ b/scripts/fix_structure.sh @@ -0,0 +1,32 @@ +#!/bin/bash +set -euo pipefail + +echo "Creating package structure..." + +# Create base package directory if it doesn't exist +mkdir -p vpic_migration/{config,utils,scripts} + +# Create __init__.py files +touch vpic_migration/__init__.py +touch vpic_migration/config/__init__.py +touch vpic_migration/utils/__init__.py +touch vpic_migration/scripts/__init__.py + +# Move files to their new locations +if [ -f "scripts/migrate.py" ]; then + mv scripts/migrate.py vpic_migration/migrate.py +fi + +if [ -d "scripts/utils" ]; then + mv scripts/utils/* vpic_migration/utils/ +fi + +if [ -d "config" ]; then + mv config/* vpic_migration/config/ +fi + +# Clean up empty directories +rm -rf scripts/utils config 2>/dev/null || true + +echo "Package structure created!" 
+ls -R vpic_migration/ \ No newline at end of file diff --git a/scripts/restore_backup.sh b/scripts/restore_backup.sh new file mode 100755 index 0000000..61940c7 --- /dev/null +++ b/scripts/restore_backup.sh @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail + +echo "Starting backup restoration process..." + +# Variables + +BACKUP_FILE="/var/opt/mssql/backup/VPICList_lite_2025_01.bak" +SQL_USER="SA" +SQL_PASSWORD="DevPassword123#" + +# First, let's check if the backup file exists in the container +docker exec sqltemp ls -l $BACKUP_FILE || { + echo "Error: Backup file not found in container" + exit 1 +} + +# Get logical file names from backup +echo "Getting logical file names from backup..." +docker exec sqltemp /opt/mssql-tools18/bin/sqlcmd -S localhost \ + -U $SQL_USER -P $SQL_PASSWORD -C \ + -Q "RESTORE FILELISTONLY FROM DISK = '$BACKUP_FILE'" + +# Create restore command with correct logical file names +echo "Restoring database..." +docker exec sqltemp /opt/mssql-tools18/bin/sqlcmd -S localhost \ + -U $SQL_USER -P $SQL_PASSWORD -C \ + -Q "RESTORE DATABASE vpic + FROM DISK = '$BACKUP_FILE' + WITH MOVE 'vPICList_Lite1' TO '/var/opt/mssql/data/vpic.mdf', + MOVE 'vPICList_Lite1_log' TO '/var/opt/mssql/data/vpic_log.ldf', + REPLACE" + +# Verify restoration +echo "Verifying database restoration..." +docker exec sqltemp /opt/mssql-tools18/bin/sqlcmd -S localhost \ + -U $SQL_USER -P $SQL_PASSWORD -C \ + -Q "SELECT DB_NAME(database_id) as DatabaseName, + state_desc as Status + FROM sys.databases + WHERE name = 'vpic'" + +# Get table counts and names +echo "Getting database information..." 
+docker exec sqltemp /opt/mssql-tools18/bin/sqlcmd -S localhost \ + -U $SQL_USER -P $SQL_PASSWORD -C \ + -d vpic \ + -Q "SELECT + t.TABLE_NAME, + (SELECT COUNT(*) FROM vpic.INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = t.TABLE_NAME) as ColumnCount, + (SELECT COUNT_BIG(*) FROM vpic.sys.tables st + INNER JOIN vpic.sys.partitions p ON st.object_id = p.object_id + WHERE st.name = t.TABLE_NAME) as RowCount + FROM vpic.INFORMATION_SCHEMA.TABLES t + WHERE TABLE_TYPE = 'BASE TABLE' + ORDER BY TABLE_NAME;" \ No newline at end of file diff --git a/scripts/run_pipeline.sh b/scripts/run_pipeline.sh new file mode 100755 index 0000000..9a80d31 --- /dev/null +++ b/scripts/run_pipeline.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Function to clean up Docker resources +cleanup_docker() { + echo "Cleaning up Docker resources..." + + # Stop and remove specific containers if they exist + for container in sqltemp pg_target; do + if docker ps -a --format '{{.Names}}' | grep -q "^${container}$"; then + echo "Stopping and removing container: ${container}" + docker stop "${container}" 2>/dev/null || true + docker rm "${container}" 2>/dev/null || true + fi + done + + # Remove the network if it exists + if docker network ls | grep -q "docker_default"; then + echo "Removing network: docker_default" + docker network rm docker_default 2>/dev/null || true + fi +} + +# Function to wait for SQL Server +wait_for_sqlserver() { + echo "Waiting for SQL Server to be ready..." + ATTEMPT=0 + MAX_ATTEMPTS=30 + + until docker exec sqltemp sqlcmd \ + -S localhost \ + -U SA \ + -P "YourStrong!Passw0rd" \ + -Q "SELECT @@VERSION" \ + -N || [ $ATTEMPT -eq $MAX_ATTEMPTS ] + do + ATTEMPT=$((ATTEMPT+1)) + echo "Waiting for SQL Server to start (Attempt $ATTEMPT/$MAX_ATTEMPTS)..." + sleep 5 + done + + if [ $ATTEMPT -eq $MAX_ATTEMPTS ]; then + echo "Error: Could not connect to SQL Server" + docker logs sqltemp + exit 1 + fi + + echo "SQL Server is ready!" 
+} + +# Function to wait for PostgreSQL +wait_for_postgres() { + echo "Waiting for PostgreSQL to be ready..." + ATTEMPT=0 + MAX_ATTEMPTS=30 + + until docker exec pg_target pg_isready -U postgres &>/dev/null || [ $ATTEMPT -eq $MAX_ATTEMPTS ] + do + ATTEMPT=$((ATTEMPT+1)) + echo "Waiting for PostgreSQL to initialize (Attempt $ATTEMPT/$MAX_ATTEMPTS)..." + sleep 5 + done + + if [ $ATTEMPT -eq $MAX_ATTEMPTS ]; then + echo "Error: PostgreSQL failed to start within timeout" + docker logs pg_target + exit 1 + fi + + echo "PostgreSQL is ready!" +} + +# Trap for cleanup on script exit +trap cleanup_docker EXIT + +# Initial cleanup +cleanup_docker + +# Create temp_data directory if it doesn't exist +mkdir -p temp_data + +# 1. Download vPIC data +echo "Downloading vPIC data..." +if [ ! -f temp_data/vpic.bak.zip ]; then + bash scripts/download_vpic.sh +else + echo "vPIC data already exists" +fi + +# 2. Start containers +echo "Starting Docker containers..." +docker-compose -f docker/docker-compose.yml up -d + +# Wait for services to be ready +wait_for_sqlserver +wait_for_postgres + +# 3. Restore SQL Server backup +echo "Restoring SQL Server backup..." +bash scripts/restore_and_export.sh + +# 4. Run migration script +echo "Running migration script..." +python scripts/migrate.py + +# 5. Verify migration +echo "Verifying migration..." +if ! docker exec pg_target psql -U postgres -d vpic -c "\dt"; then + echo "Error: Failed to verify PostgreSQL migration" + exit 1 +fi + +# Create backup of the migrated PostgreSQL database +echo "Creating PostgreSQL backup..." +backup_file="temp_data/vpic_postgres_$(date +%Y%m%d).sql" +if ! docker exec pg_target pg_dump -U postgres vpic > "$backup_file"; then + echo "Error: Failed to create PostgreSQL backup" + exit 1 +fi +echo "Backup created at: $backup_file" + +echo "Pipeline completed successfully!" 
\ No newline at end of file
diff --git a/scripts/verify_db.sh b/scripts/verify_db.sh
new file mode 100755
index 0000000..e27ff9d
--- /dev/null
+++ b/scripts/verify_db.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+set -euo pipefail
+
+SQL_USER="SA"
+SQL_PASSWORD="DevPassword123#"
+
+echo "Checking table details with proper row counts..."
+docker exec sqltemp /opt/mssql-tools18/bin/sqlcmd -S localhost \
+    -U $SQL_USER -P $SQL_PASSWORD -C \
+    -d vpic \
+    -Q "
+    WITH TableCounts AS (
+        SELECT
+            t.TABLE_NAME,
+            s.row_count,
+            (SELECT COUNT(*) FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = t.TABLE_NAME) as ColumnCount
+        FROM INFORMATION_SCHEMA.TABLES t
+        CROSS APPLY (
+            SELECT SUM(p.rows) as row_count
+            FROM sys.partitions p
+            JOIN sys.tables st ON st.object_id = p.object_id
+            WHERE st.name = t.TABLE_NAME
+            AND p.index_id IN (0,1)
+        ) s
+        WHERE t.TABLE_TYPE = 'BASE TABLE'
+    )
+    SELECT
+        TABLE_NAME,
+        row_count as RecordCount,
+        ColumnCount
+    FROM TableCounts
+    ORDER BY row_count DESC;"
+
+echo -e "\nChecking specific important tables..."
+docker exec sqltemp /opt/mssql-tools18/bin/sqlcmd -S localhost \ + -U $SQL_USER -P $SQL_PASSWORD -C \ + -d vpic \ + -Q " + -- Check Element table + SELECT 'Element Table Counts:' as Info; + SELECT COUNT(*) as ElementCount FROM Element; + SELECT TOP 5 * FROM Element; + + -- Check Pattern table + SELECT 'Pattern Table Counts:' as Info; + SELECT COUNT(*) as PatternCount FROM Pattern; + SELECT TOP 5 * FROM Pattern; + + -- Check Make table + SELECT 'Make Table Counts:' as Info; + SELECT COUNT(*) as MakeCount FROM Make; + SELECT TOP 5 * FROM Make; + + -- Check Model table + SELECT 'Model Table Counts:' as Info; + SELECT COUNT(*) as ModelCount FROM Model; + SELECT TOP 5 * FROM Model; + + -- Database space info + SELECT 'Database Space Info:' as Info; + SELECT + name, + size/128.0 as SizeMB, + CAST(FILEPROPERTY(name, 'SpaceUsed') AS INT)/128.0 as SpaceUsedMB + FROM sys.database_files;" \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..a3e2b6b --- /dev/null +++ b/setup.py @@ -0,0 +1,24 @@ +# setup.py +from setuptools import setup, find_packages + +setup( + name="vpic_migration", + version="0.1.0", + description="vPIC Database Migration Tool", + author="Sam", + packages=find_packages(), + python_requires=">=3.8", + install_requires=[ + "pyodbc>=5.0.1", + "psycopg2-binary==2.9.9", + "tqdm==4.66.1", + "pytest==7.4.3", + "python-dotenv==1.0.0", + "setuptools>=42.0.0", + ], + entry_points={ + "console_scripts": [ + "vpic-migrate=vpic_migration.migrate:main", + ], + }, +) \ No newline at end of file diff --git a/setup.sh b/setup.sh new file mode 100644 index 0000000..774f994 --- /dev/null +++ b/setup.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -euo pipefail + +echo "Setting up vPIC migration environment..." + +# Check if python3 is installed +if ! command -v python &> /dev/null; then + echo "Python is required but not installed. Please install Python first." + exit 1 +fi + +# Check if pip3 is installed +if ! 
command -v pip &> /dev/null; then
+    echo "pip is required but not installed. Please install pip first."
+    exit 1
+fi
+
+# Create virtual environment if it doesn't exist (.venv matches Makefile/install_deps.sh)
+if [ ! -d ".venv" ]; then
+    echo "Creating virtual environment..."
+    python -m venv .venv
+fi
+
+# Activate virtual environment
+source .venv/bin/activate
+
+# Install requirements
+echo "Installing Python dependencies..."
+pip install -r requirements.txt
+
+# Install ODBC driver for Mac
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    :  # no-op: ODBC driver install is handled by install_deps.sh (empty if-body is a bash syntax error)
+fi
+
+echo "Setup completed successfully!"
\ No newline at end of file
diff --git a/test_postgres.py b/test_postgres.py
new file mode 100644
index 0000000..90f958a
--- /dev/null
+++ b/test_postgres.py
@@ -0,0 +1,29 @@
+
+
+
+#!/usr/bin/env python3
+import psycopg2
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+try:
+    conn = psycopg2.connect(
+        dbname="vpic",
+        user="postgres",
+        password="postgres",
+        host="localhost", # Use container name
+        port="5432"
+    )
+
+    cur = conn.cursor()
+    cur.execute('SELECT version()')
+    ver = cur.fetchone()
+    print(f"Connected to: {ver[0]}")
+
+    cur.close()
+    conn.close()
+
+except Exception as e:
+    print(f"Error: {str(e)}")
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_postgres.py b/tests/test_postgres.py
new file mode 100644
index 0000000..d06a856
--- /dev/null
+++ b/tests/test_postgres.py
@@ -0,0 +1,83 @@
+# #!/usr/bin/env python3
+# import psycopg2
+# import logging
+
+# logging.basicConfig(level=logging.INFO)
+# logger = logging.getLogger(__name__)
+
+# def test_connection():
+#     try:
+#         # Try connecting to postgres database first
+#         conn = psycopg2.connect(
+#             dbname="postgres",
+#             user="postgres",
+#             password="postgres",
+#             host="localhost",
+#             port="5432"
+#         )
+
+#         logger.info("Successfully connected to postgres database")
+
+#         # Test query
+#         cur = conn.cursor()
+#         cur.execute("SELECT 
current_database(), current_user") +# result = cur.fetchone() +# logger.info(f"Current database: {result[0]}, Current user: {result[1]}") + +# cur.close() +# conn.close() + +# # Now try connecting to vpic database +# conn = psycopg2.connect( +# dbname="vpic", +# user="postgres", +# password="postgres", +# host="localhost", +# port="5432" +# ) + +# logger.info("Successfully connected to vpic database") + +# cur = conn.cursor() +# cur.execute("SELECT current_database(), current_user") +# result = cur.fetchone() +# logger.info(f"Current database: {result[0]}, Current user: {result[1]}") + +# cur.close() +# conn.close() + +# except psycopg2.Error as e: +# logger.error(f"Connection failed: {str(e)}") +# logger.error(f"Error code: {e.pgcode}") +# logger.error(f"Error message: {e.pgerror}") + +# if __name__ == "__main__": +# test_connection() + + +#!/usr/bin/env python3 +import psycopg2 +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +try: + conn = psycopg2.connect( + dbname="vpic", + user="postgres", + password="postgres", + host="localhost", # Use container name + port="5432" + ) + + cur = conn.cursor() + cur.execute('SELECT version()') + ver = cur.fetchone() + print(f"Connected to: {ver[0]}") + + cur.close() + conn.close() + +except Exception as e: + print(f"Error: {str(e)}") \ No newline at end of file diff --git a/tests/test_postgres_conn.py b/tests/test_postgres_conn.py new file mode 100644 index 0000000..007b61d --- /dev/null +++ b/tests/test_postgres_conn.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +import psycopg2 +import logging +from config.settings import POSTGRES + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def test_connection(): + try: + logger.info("Testing PostgreSQL connection with settings:") + logger.info(f"Host: {POSTGRES['host']}") + logger.info(f"Port: {POSTGRES['port']}") + logger.info(f"Database: {POSTGRES['dbname']}") + logger.info(f"User: 
{POSTGRES['user']}") + + conn = psycopg2.connect(**POSTGRES) + cur = conn.cursor() + + # Test basic query + cur.execute('SELECT version()') + version = cur.fetchone()[0] + logger.info(f"Connected successfully to PostgreSQL: {version}") + + # Test user permissions + cur.execute('SELECT current_user, session_user') + users = cur.fetchone() + logger.info(f"Current user: {users[0]}, Session user: {users[1]}") + + cur.close() + conn.close() + return True + + except Exception as e: + logger.error(f"Connection failed: {str(e)}") + return False + +if __name__ == "__main__": + test_connection() diff --git a/vpic_migration/database.py b/vpic_migration/database.py new file mode 100644 index 0000000..37b495b --- /dev/null +++ b/vpic_migration/database.py @@ -0,0 +1,176 @@ +# scripts/utils/database.py +import pyodbc +import psycopg2 +from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT +import sqlite3 +from abc import ABC, abstractmethod +from typing import Any, Optional, Dict +import logging + +logger = logging.getLogger(__name__) + +class DatabaseConnection(ABC): + """Abstract base class for database connections""" + + def __init__(self): + self.conn = None + self.cur = None + + @abstractmethod + def connect(self, database: str, **kwargs) -> 'DatabaseConnection': + pass + + @abstractmethod + def execute(self, query: str, params: Optional[tuple] = None) -> Any: + pass + + def close(self): + if self.cur: + self.cur.close() + if self.conn: + self.conn.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + +class SQLServer(DatabaseConnection): + """SQL Server connection manager""" + def connect(self, database: str = "master", **kwargs) -> 'SQLServer': + try: + # Import settings here to avoid circular imports + from vpic_migration.settings import SQL_SERVER + + # Use settings with override from kwargs + conn_str = ( + f"DRIVER={{{kwargs.get('driver', SQL_SERVER['driver'])}}};" + f"SERVER={kwargs.get('server', 
SQL_SERVER['server'])};" + f"DATABASE={database};" + f"UID={kwargs.get('user', SQL_SERVER['user'])};" + f"PWD={kwargs.get('password', SQL_SERVER['password'])};" + f"TrustServerCertificate={kwargs.get('trust_cert', SQL_SERVER['trust_cert'])};" + ) + + self.conn = pyodbc.connect(conn_str) + self.cur = self.conn.cursor() + logger.info(f"Connected to SQL Server - {database}") + return self + except pyodbc.Error as e: + logger.error(f"SQL Server connection failed: {str(e)}") + raise + + def execute(self, query: str, params: Optional[tuple] = None) -> Any: + try: + if params: + self.cur.execute(query, params) + else: + self.cur.execute(query) + return self.cur + except pyodbc.Error as e: + logger.error(f"SQL Server query failed: {str(e)}") + raise + +class PostgreSQL(DatabaseConnection): + """PostgreSQL connection manager""" + def connect(self, database: str, **kwargs) -> 'PostgreSQL': + try: + params = { + 'dbname': database, + 'user': kwargs.get('user', 'postgres'), + 'password': kwargs.get('password', 'postgres'), + 'host': kwargs.get('host', 'localhost'), + 'port': kwargs.get('port', 5432) + } + + logger.info(f"Connecting to PostgreSQL - {database}") + self.conn = psycopg2.connect(**params) + self.cur = self.conn.cursor() + return self + except psycopg2.Error as e: + logger.error(f"PostgreSQL connection failed: {str(e)}") + raise + + def execute(self, query: str, params: Optional[tuple] = None) -> Any: + try: + if params: + self.cur.execute(query, params) + else: + self.cur.execute(query) + return self.cur + except psycopg2.Error as e: + logger.error(f"PostgreSQL query failed: {str(e)}") + raise + +class SQLite(DatabaseConnection): + """SQLite connection manager""" + def connect(self, database: str, **kwargs) -> 'SQLite': + try: + self.conn = sqlite3.connect(database) + self.conn.execute("PRAGMA foreign_keys = OFF") # Temporarily disable foreign key constraints + self.conn.execute("PRAGMA journal_mode = WAL") # Use WAL mode for better performance + 
self.conn.execute("PRAGMA synchronous = NORMAL") # Reduce synchronous mode for better performance + self.cur = self.conn.cursor() + logger.info(f"Connected to SQLite - {database}") + return self + except sqlite3.Error as e: + logger.error(f"SQLite connection failed: {str(e)}") + raise + + def execute(self, query: str, params: Optional[tuple] = None) -> Any: + try: + if params: + self.cur.execute(query, params) + else: + self.cur.execute(query) + return self.cur + except sqlite3.Error as e: + logger.error(f"SQLite query failed: {str(e)}") + logger.error(f"Query: {query}") + if params: + logger.error(f"Parameters: {params}") + raise + +def get_connection(db_type: str) -> DatabaseConnection: + """Factory function to get appropriate database connection""" + connections = { + 'sqlserver': SQLServer, + 'postgres': PostgreSQL, + 'sqlite': SQLite + } + + if db_type not in connections: + raise ValueError(f"Unsupported database type: {db_type}") + + return connections[db_type]() + +def ensure_database(name, server_type="postgres"): + """Ensure database exists and is accessible""" + if server_type == "postgres": + db = PostgreSQL() + with db.connect(name) as conn: + conn.conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) + try: + conn.execute(f"SELECT 1 FROM pg_database WHERE datname = '{name}'") + if not conn.cur.fetchone(): + conn.execute(f"CREATE DATABASE {name}") + logger.info(f"Created PostgreSQL database: {name}") + else: + logger.info(f"PostgreSQL database already exists: {name}") + except psycopg2.Error as e: + logger.error(f"PostgreSQL database creation failed: {str(e)}") + raise + else: + db = SQLServer() + with db.connect() as conn: + try: + conn.execute(f"SELECT database_id FROM sys.databases WHERE name = '{name}'") + if not conn.cur.fetchone(): + conn.execute(f"CREATE DATABASE {name}") + logger.info(f"Created SQL Server database: {name}") + else: + logger.info(f"SQL Server database already exists: {name}") + except pyodbc.Error as e: + logger.error(f"SQL Server 
database creation failed: {str(e)}") + raise \ No newline at end of file diff --git a/vpic_migration/migrate.py b/vpic_migration/migrate.py new file mode 100644 index 0000000..e82e4d5 --- /dev/null +++ b/vpic_migration/migrate.py @@ -0,0 +1,250 @@ +# vpic_migration/migrate.py +import logging +import os +from pathlib import Path +from typing import Dict, List, Tuple, Optional, Type +from tqdm import tqdm + +from vpic_migration.database import ( + DatabaseConnection, + SQLServer, + PostgreSQL, + SQLite, + ensure_database +) +from vpic_migration.settings import ( + SQL_TO_PG_TYPES, + SQL_TO_SQLITE_TYPES, + SQLITE +) + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +def get_table_schema(sql_conn: SQLServer) -> Dict[str, List[Tuple]]: + """Extract schema information from SQL Server""" + tables = {} + query = """ + SELECT + t.name as table_name, + c.name as column_name, + typ.name as data_type, + c.max_length, + c.precision, + c.scale, + c.is_nullable + FROM sys.tables t + INNER JOIN sys.columns c ON t.object_id = c.object_id + INNER JOIN sys.types typ ON c.user_type_id = typ.user_type_id + ORDER BY t.name, c.column_id + """ + + try: + cursor = sql_conn.execute(query) + for row in cursor.fetchall(): + table_name = row[0] + if table_name not in tables: + tables[table_name] = [] + tables[table_name].append(row[1:]) + + logger.info(f"Found {len(tables)} tables in SQL Server") + return tables + except Exception as e: + logger.error(f"Failed to get schema information: {str(e)}") + raise + +def get_type_mapping(target_db: str) -> Dict[str, str]: + """Get the appropriate type mapping for the target database""" + mappings = { + 'postgres': SQL_TO_PG_TYPES, + 'sqlite': SQL_TO_SQLITE_TYPES + } + return mappings.get(target_db, SQL_TO_PG_TYPES) + +def create_target_tables(target_conn: DatabaseConnection, schema_info: Dict[str, List[Tuple]], target_type: str): + """Create 
tables in target database with appropriate schema""" + type_mapping = get_type_mapping(target_type) + + try: + # Start a single transaction for all table creation + if isinstance(target_conn, SQLite): + target_conn.execute("BEGIN TRANSACTION") + + for table_name, columns in schema_info.items(): + try: + create_stmt = f"CREATE TABLE IF NOT EXISTS {table_name} (\n" + cols = [] + + for col_info in columns: + col_name, data_type, max_length, precision, scale, is_nullable = col_info + target_type = type_mapping.get(data_type.lower(), "TEXT") + + # For SQLite, simplify the types + if isinstance(target_conn, SQLite): + if target_type.startswith(('varchar', 'char', 'nvarchar', 'nchar')): + target_type = 'TEXT' + elif target_type.startswith(('decimal', 'numeric')): + target_type = 'REAL' + elif target_type in ('bit', 'tinyint', 'smallint', 'int', 'bigint'): + target_type = 'INTEGER' + + nullable = "" if is_nullable else " NOT NULL" + cols.append(f"{col_name} {target_type}{nullable}") + + create_stmt += ",\n".join(cols) + create_stmt += "\n)" + + logger.info(f"Creating table {table_name}") + target_conn.execute(create_stmt) + + except Exception as e: + logger.error(f"Failed to create table {table_name}: {str(e)}") + raise + + # Commit the transaction for SQLite + if isinstance(target_conn, SQLite): + target_conn.execute("COMMIT") + # Set pragmas after table creation + for pragma, value in SQLITE['pragmas'].items(): + target_conn.execute(f"PRAGMA {pragma}={value}") + + except Exception as e: + if isinstance(target_conn, SQLite): + target_conn.execute("ROLLBACK") + logger.error(f"Failed to create tables: {str(e)}") + raise + +def migrate_table_data( + source_conn: DatabaseConnection, + target_conn: DatabaseConnection, + table_name: str, + columns: List[Tuple], + batch_size: int = 1000 +): + """Migrate data for a single table with progress bar""" + try: + # Get row count + count_result = source_conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone() + total_rows = 
count_result[0] if count_result else 0 + + if total_rows == 0: + logger.info(f"No data in table {table_name}") + return + + # Prepare statements + col_names = [col[0] for col in columns] + select_stmt = f"SELECT {','.join(col_names)} FROM {table_name}" + + # Adjust placeholders based on target database type + if isinstance(target_conn, SQLite): + placeholders = ",".join(["?" for _ in col_names]) + else: + placeholders = ",".join(["%s" for _ in col_names]) + + insert_stmt = f"INSERT INTO {table_name} ({','.join(col_names)}) VALUES ({placeholders})" + + logger.info(f"Migrating {total_rows} rows from {table_name}") + + with tqdm(total=total_rows, desc=table_name) as pbar: + # Start transaction + if isinstance(target_conn, SQLite): + target_conn.execute("BEGIN TRANSACTION") + + try: + rows = source_conn.execute(select_stmt).fetchall() + + for i in range(0, len(rows), batch_size): + batch = rows[i:i + batch_size] + for row in batch: + # Convert any None values to NULL for SQLite + cleaned_row = tuple(None if v == '' else v for v in row) + target_conn.execute(insert_stmt, cleaned_row) + pbar.update(len(batch)) + + if isinstance(target_conn, SQLite): + target_conn.execute("COMMIT") + else: + target_conn.conn.commit() + + except Exception as e: + if isinstance(target_conn, SQLite): + target_conn.execute("ROLLBACK") + raise e + + except Exception as e: + logger.error(f"Failed to migrate data for table {table_name}: {str(e)}") + raise + +def verify_migration(sql_conn: SQLServer, pg_conn: PostgreSQL, table_name: str) -> bool: + """Verify the migration of a table""" + try: + sql_count = sql_conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0] + pg_count = pg_conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0] + + if sql_count != pg_count: + logger.error(f"Count mismatch for table {table_name}: SQL={sql_count}, PG={pg_count}") + return False + + logger.info(f"Verified table {table_name}: {sql_count} rows") + return True + except Exception as e: + 
logger.error(f"Failed to verify table {table_name}: {str(e)}") + return False + +def main(): + """Main migration process""" + source_conn = None + target_conn = None + + # Get target database type from environment or default to postgres + target_db = os.getenv("TARGET_DB", "postgres").lower() + + try: + # Connect to source database (SQL Server) + source_conn = SQLServer().connect("vpic") + + # Connect to target database based on type + if target_db == "sqlite": + target_conn = SQLite().connect(str(SQLITE["database"])) + else: + ensure_database("vpic", "postgres") + target_conn = PostgreSQL().connect("vpic") + + # Get schema information + schema_info = get_table_schema(source_conn) + + # Create tables in target database + create_target_tables(target_conn, schema_info, target_db) + + # Migrate and verify each table + failed_tables = [] + for table_name, columns in schema_info.items(): + try: + migrate_table_data(source_conn, target_conn, table_name, columns) + if not verify_migration(source_conn, target_conn, table_name): + failed_tables.append(table_name) + except Exception as e: + logger.error(f"Failed to migrate table {table_name}: {str(e)}") + failed_tables.append(table_name) + + if failed_tables: + logger.error(f"Migration completed with errors. 
Failed tables: {failed_tables}") + else: + logger.info("Migration completed successfully") + + except Exception as e: + logger.error(f"Migration failed: {str(e)}") + raise + + finally: + if source_conn: + source_conn.close() + if target_conn: + target_conn.close() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/vpic_migration/settings.py b/vpic_migration/settings.py new file mode 100644 index 0000000..c3b1a63 --- /dev/null +++ b/vpic_migration/settings.py @@ -0,0 +1,115 @@ +# config/settings.py +import os +from pathlib import Path + +# Base paths +PROJECT_ROOT = Path(__file__).parent.parent +TEMP_DATA_DIR = PROJECT_ROOT / "temp_data" +MIGRATIONS_DIR = PROJECT_ROOT / "migrations" + +# Database configurations +SQL_SERVER = { + "driver": "ODBC Driver 18 for SQL Server", + "server": "localhost", + "database": "vpic", + "user": "SA", + "password": "DevPassword123#", + "trust_cert": "yes" +} + +# Updated PostgreSQL settings to match working container configuration +POSTGRES = { + "host": "localhost", + "port": "5432", + "dbname": "vpic", # Start with default database + "user": "postgres", + "password": "postgres", + "connect_timeout": 10 +} + +# Docker configurations +DOCKER = { + "compose_file": PROJECT_ROOT / "docker" / "docker-compose.yml", + "sql_container": "sqltemp", + "pg_container": "pg_target", + "network": "docker_default" +} + +# vPIC configurations +VPIC = { + "api_url": "https://vpic.nhtsa.dot.gov/api/", + "backup_file": "VPICList_lite_2024_12.bak" +} + +# Type mappings +SQL_TO_PG_TYPES = { + "bigint": "bigint", + "bit": "boolean", + "decimal": "decimal", + "int": "integer", + "money": "decimal(19,4)", + "numeric": "numeric", + "smallint": "smallint", + "smallmoney": "decimal(10,4)", + "tinyint": "smallint", + "float": "double precision", + "real": "real", + "date": "date", + "datetime2": "timestamp", + "datetime": "timestamp", + "datetimeoffset": "timestamp with time zone", + "smalldatetime": "timestamp", + "time": "time", + 
"char": "char", + "varchar": "varchar", + "text": "text", + "nchar": "char", + "nvarchar": "varchar", + "ntext": "text", + "binary": "bytea", + "varbinary": "bytea", + "image": "bytea", + "uniqueidentifier": "uuid", +} + +# Add SQLite configuration and type mappings +SQLITE = { + "database": TEMP_DATA_DIR / "vpic.db", + "pragmas": { + "journal_mode": "WAL", + "synchronous": "NORMAL", + "foreign_keys": "ON", + "cache_size": -64000 # 64MB cache + } +} + +# Add SQLite type mappings +SQL_TO_SQLITE_TYPES = { + "bigint": "INTEGER", + "bit": "INTEGER", + "decimal": "REAL", + "int": "INTEGER", + "money": "REAL", + "numeric": "REAL", + "smallint": "INTEGER", + "smallmoney": "REAL", + "tinyint": "INTEGER", + "float": "REAL", + "real": "REAL", + "date": "TEXT", + "datetime2": "TEXT", + "datetime": "TEXT", + "datetimeoffset": "TEXT", + "smalldatetime": "TEXT", + "time": "TEXT", + "char": "TEXT", + "varchar": "TEXT", + "text": "TEXT", + "nchar": "TEXT", + "nvarchar": "TEXT", + "ntext": "TEXT", + "binary": "BLOB", + "varbinary": "BLOB", + "image": "BLOB", + "uniqueidentifier": "TEXT", +} \ No newline at end of file