There are 5829030 no of rows and 13 no of columns observed in 'combined_Table' table
SELECT COUNT(*) AS total_rows
FROM combined_table;
DESCRIBE combined_Table;
SELECT 'null_count_ride_id' AS column_name, COUNT(*) AS count_value FROM combined_Table WHERE ride_id IS NULL
UNION ALL
SELECT 'null_count_rideable_type', COUNT(*) FROM combined_Table WHERE rideable_type IS NULL
UNION ALL
SELECT 'null_count_started_at', COUNT(*) FROM combined_Table WHERE started_at IS NULL
UNION ALL
SELECT 'null_count_ended_at', COUNT(*) FROM combined_Table WHERE ended_at IS NULL
UNION ALL
SELECT 'null_count_start_station_name', COUNT(*) FROM combined_Table WHERE start_station_name IS NULL
UNION ALL
SELECT 'null_count_start_station_id', COUNT(*) FROM combined_Table WHERE start_station_id IS NULL
UNION ALL
SELECT 'null_count_end_station_name', COUNT(*) FROM combined_Table WHERE end_station_name IS NULL
UNION ALL
SELECT 'null_count_end_station_id', COUNT(*) FROM combined_Table WHERE end_station_id IS NULL
UNION ALL
SELECT 'null_count_start_lat', COUNT(*) FROM combined_Table WHERE start_lat IS NULL
UNION ALL
SELECT 'null_count_start_lng', COUNT(*) FROM combined_Table WHERE start_lng IS NULL
UNION ALL
SELECT 'null_count_end_lat', COUNT(*) FROM combined_Table WHERE end_lat IS NULL
UNION ALL
SELECT 'null_count_end_lng', COUNT(*) FROM combined_Table WHERE end_lng IS NULL
UNION ALL
SELECT 'null_count_member_casual', COUNT(*) FROM combined_Table WHERE member_casual IS NULL;
SELECT 'blank_count_ride_id' AS column_name, COUNT(*) AS count_value FROM combined_Table WHERE ride_id = ''
UNION ALL
SELECT 'blank_count_rideable_type', COUNT(*) FROM combined_Table WHERE rideable_type = ''
UNION ALL
SELECT 'null_count_started_at', COUNT(*) FROM combined_Table WHERE started_at IS NULL
UNION ALL
SELECT 'null_count_ended_at', COUNT(*) FROM combined_Table WHERE ended_at IS NULL
UNION ALL
SELECT 'blank_count_start_station_name', COUNT(*) FROM combined_Table WHERE start_station_name = ''
UNION ALL
SELECT 'blank_count_start_station_id', COUNT(*) FROM combined_Table WHERE start_station_id = ''
UNION ALL
SELECT 'blank_count_end_station_name', COUNT(*) FROM combined_Table WHERE end_station_name = ''
UNION ALL
SELECT 'blank_count_end_station_id', COUNT(*) FROM combined_Table WHERE end_station_id = ''
UNION ALL
SELECT 'blank_count_start_lat', COUNT(*) FROM combined_Table WHERE start_lat = ''
UNION ALL
SELECT 'blank_count_start_lng', COUNT(*) FROM combined_Table WHERE start_lng = ''
UNION ALL
SELECT 'blank_count_end_lat', COUNT(*) FROM combined_Table WHERE end_lat = ''
UNION ALL
SELECT 'blank_count_end_lng', COUNT(*) FROM combined_Table WHERE end_lng = ''
UNION ALL
SELECT 'blank_count_member_casual', COUNT(*) FROM combined_Table WHERE member_casual = '';
SELECT
COUNT(*) AS total_blank_rows_count
FROM (
SELECT
ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
start_station_id,
end_station_name,
end_station_id,
start_lat,
start_lng,
end_lat,
end_lng,
member_casual
FROM combined_Table
WHERE
ride_id = '' OR
rideable_type = '' OR
started_at IS NULL OR
ended_at IS NULL OR
start_station_name = '' OR
start_station_id = '' OR
end_station_name = '' OR
end_station_id = '' OR
start_lat = '' OR
start_lng = '' OR
end_lat = '' OR
end_lng = '' OR
member_casual = ''
) AS distinct_blank_rows;
SELECT
column_name,
(count_value / (SELECT COUNT(*) FROM combined_Table)) * 100 AS percentage_of_blank
FROM (
SELECT 'blank_count_ride_id' AS column_name, COUNT(*) AS count_value FROM combined_Table WHERE ride_id = ''
UNION ALL
SELECT 'blank_count_rideable_type', COUNT(*) FROM combined_Table WHERE rideable_type = ''
UNION ALL
SELECT 'blank_count_started_at', COUNT(*) FROM combined_Table WHERE started_at IS NULL
UNION ALL
SELECT 'blank_count_ended_at', COUNT(*) FROM combined_Table WHERE ended_at IS NULL
UNION ALL
SELECT 'blank_count_start_station_name', COUNT(*) FROM combined_Table WHERE start_station_name = ''
UNION ALL
SELECT 'blank_count_start_station_id', COUNT(*) FROM combined_Table WHERE start_station_id = ''
UNION ALL
SELECT 'blank_count_end_station_name', COUNT(*) FROM combined_Table WHERE end_station_name = ''
UNION ALL
SELECT 'blank_count_end_station_id', COUNT(*) FROM combined_Table WHERE end_station_id = ''
UNION ALL
SELECT 'blank_count_start_lat', COUNT(*) FROM combined_Table WHERE start_lat = ''
UNION ALL
SELECT 'blank_count_start_lng', COUNT(*) FROM combined_Table WHERE start_lng = ''
UNION ALL
SELECT 'blank_count_end_lat', COUNT(*) FROM combined_Table WHERE end_lat = ''
UNION ALL
SELECT 'blank_count_end_lng', COUNT(*) FROM combined_Table WHERE end_lng = ''
UNION ALL
SELECT 'blank_count_member_casual', COUNT(*) FROM combined_Table WHERE member_casual = ''
) AS count_data;
SELECT
((COUNT(*)/(SELECT COUNT(*) from combined_Table)) * 100) AS count_distinct_blank_rows
FROM (
SELECT
ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
start_station_id,
end_station_name,
end_station_id,
start_lat,
start_lng,
end_lat,
end_lng,
member_casual
FROM combined_Table
WHERE
ride_id = '' OR
rideable_type = '' OR
started_at IS NULL OR
ended_at IS NULL OR
start_station_name = '' OR
start_station_id = '' OR
end_station_name = '' OR
end_station_id = '' OR
start_lat = '' OR
start_lng = '' OR
end_lat = '' OR
end_lng = '' OR
member_casual = ''
) AS distinct_blank_rows;
SELECT COUNT(*) AS total_duplicate_rows
FROM combined_Table
GROUP BY
ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
start_station_id,
end_station_name,
end_station_id,
start_lat,
start_lng,
end_lat,
end_lng,
member_casual
HAVING COUNT(*) > 1;
SELECT COUNT(*) AS longer_than_a_day
FROM combined_Table
WHERE TIMESTAMPDIFF(HOUR, started_at, ended_at) >= 24;
SELECT COUNT(*) AS less_than_a_minute
FROM combined_Table
WHERE TIMESTAMPDIFF(SECOND, started_at, ended_at) <= 60;
Total count of rows including 'Total no of blank or null values', 'Outlier or false value of ride length longer than a day' and 'Outlier or false value of ride length less than a minute'.
SELECT
COUNT(*) AS total_inconsistent_count
FROM (
SELECT
ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
start_station_id,
end_station_name,
end_station_id,
start_lat,
start_lng,
end_lat,
end_lng,
member_casual
FROM combined_Table
WHERE
ride_id = '' OR
rideable_type = '' OR
started_at IS NULL OR
ended_at IS NULL OR
start_station_name = '' OR
start_station_id = '' OR
end_station_name = '' OR
end_station_id = '' OR
start_lat = '' OR
start_lng = '' OR
end_lat = '' OR
end_lng = '' OR
member_casual = '' OR
TIMESTAMPDIFF(HOUR, started_at, ended_at) >= 24 OR
TIMESTAMPDIFF(SECOND, started_at, ended_at) <= 60
) AS distinct_blank_rows;
Percent of rows including 'Total no of blank or null values', 'Outlier or false value of ride length longer than a day' and 'Outlier or false value of ride length less than a minute'.
SELECT
(COUNT(*)/(SELECT COUNT(*) FROM combined_Table) * 100) AS percent_inconsistent_count
FROM (
SELECT
ride_id,
rideable_type,
started_at,
ended_at,
start_station_name,
start_station_id,
end_station_name,
end_station_id,
start_lat,
start_lng,
end_lat,
end_lng,
member_casual
FROM combined_Table
WHERE
ride_id = '' OR
rideable_type = '' OR
started_at IS NULL OR
ended_at IS NULL OR
start_station_name = '' OR
start_station_id = '' OR
end_station_name = '' OR
end_station_id = '' OR
start_lat = '' OR
start_lng = '' OR
end_lat = '' OR
end_lng = '' OR
member_casual = '' OR
TIMESTAMPDIFF(HOUR, started_at, ended_at) >= 24 OR
TIMESTAMPDIFF(SECOND, started_at, ended_at) <= 60
) AS distinct_blank_rows;
- Data type: VARCHAR(255)
- Constraints applied:
- Primary Key
- Unique
- Not Null
-- Primary Key constraints added to ride_id column
ALTER TABLE combined_Table
MODIFY COLUMN ride_id VARCHAR(255) PRIMARY KEY UNIQUE NOT NULL;
- Length of field:
SELECT LENGTH(ride_id) AS length_ride_id, COUNT(ride_id) AS no_of_rows
FROM combined_Table
GROUP BY length_ride_id;
- Data type: MEDIUMTEXT
- Data format: Categorical
- All Categories:
SELECT rideable_type, COUNT(rideable_type) AS no_of_rides
FROM combined_Table
GROUP BY rideable_type;
- Data type: DATETIME -- (YYYY-MM-DD hh:mm:ss)
Characteristics and data format of Field 'start_station_name', 'end_station_name', 'start_station_id' & 'end_station_id
- Data type: MEDIUMTEXT
- Blank values were found in all four columns.
- Data type: Double
- Blank values were found in 'end_lat' & 'end_lng' columns
- Data type: MEDIUMTEXT
- Data format: Categorical
- All Categories:
SELECT member_casual, COUNT(member_casual) AS no_of_rides
FROM combined_Table
GROUP BY member_casual;