-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathData cleaning using SQL.sql
195 lines (147 loc) · 5.33 KB
/
Data cleaning using SQL.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
--Cleaning Data:
SELECT * FROM
[Portfolio Project]..[Nashville Housing]
--Standardize Date Format:
SELECT SaleDate, CONVERT(date, SaleDate) as Rectified_date FROM
[Portfolio Project]..[Nashville Housing]
--UPDATE [Portfolio Project]..[Nashville Housing] SET SaleDate = CONVERT(Date, SaleDate);
ALTER TABLE [Portfolio Project]..[Nashville Housing]
Add SaleDateConverted Date
UPDATE [Portfolio Project]..[Nashville Housing]
SET SaleDateConverted = CONVERT(Date, SaleDate)
SELECT SaleDateConverted, SaleDate FROM
[Portfolio Project]..[Nashville Housing]
--Populate Property Address Data:
SELECT PropertyAddress FROM
[Portfolio Project]..[Nashville Housing]
WHERE PropertyAddress IS NULL
SELECT * FROM
[Portfolio Project]..[Nashville Housing]
WHERE PropertyAddress IS NULL
SELECT * FROM
[Portfolio Project]..[Nashville Housing]
--WHERE PropertyAddress IS NULL
ORDER BY ParcelID
SELECT a.ParcelID, a.PropertyAddress, b.ParcelID, b.PropertyAddress, ISNULL(a.PropertyAddress, b.PropertyAddress) FROM
[Portfolio Project]..[Nashville Housing] as a
JOIN [Portfolio Project]..[Nashville Housing] as b
ON a.ParcelID = b.ParcelID
AND a.[UniqueID ] <> b.[UniqueID ]
WHERE a.PropertyAddress IS NULL
UPDATE a
SET PropertyAddress = ISNULL(a.PropertyAddress, b.PropertyAddress) FROM
[Portfolio Project]..[Nashville Housing] as a
JOIN [Portfolio Project]..[Nashville Housing] as b
ON a.ParcelID = b.ParcelID
AND a.[UniqueID ] <> b.[UniqueID ]
WHERE a.PropertyAddress IS NULL
SELECT PropertyAddress FROM [Portfolio Project]..[Nashville Housing]
WHERE PropertyAddress IS NULL
--Dissecting the address into city, state, etc.:
SELECT PropertyAddress FROM
[Portfolio Project]..[Nashville Housing]
--WHERE PropertyAddress IS NULL
--ORDER BY ParcelID
SELECT
SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress)-1) as Address,
SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress)+1, LEN(PropertyAddress)) as State
FROM [Portfolio Project]..[Nashville Housing]
ALTER TABLE [Portfolio Project]..[Nashville Housing]
Add SplitedAddress nvarchar(255)
UPDATE [Portfolio Project]..[Nashville Housing]
SET SplitedAddress = SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress)-1)
ALTER TABLE [Portfolio Project]..[Nashville Housing]
Add SplitedState nvarchar(255)
UPDATE [Portfolio Project]..[Nashville Housing]
SET SplitedState = SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress)+1, LEN(PropertyAddress))
SELECT SplitedAddress, SplitedState FROM
[Portfolio Project]..[Nashville Housing]
SELECT OwnerAddress FROM
[Portfolio Project]..[Nashville Housing]
SELECT PARSENAME(REPLACE(OwnerAddress,',','.'), 3) as Address,
PARSENAME(REPLACE(OwnerAddress,',','.'), 2) as City,
PARSENAME(REPLACE(OwnerAddress,',','.'), 1) as State
FROM
[Portfolio Project]..[Nashville Housing]
ALTER TABLE [Portfolio Project]..[Nashville Housing]
Add SplitedOwnerAddress nvarchar(255)
UPDATE [Portfolio Project]..[Nashville Housing]
SET SplitedOwnerAddress = PARSENAME(REPLACE(OwnerAddress,',','.'), 3)
ALTER TABLE [Portfolio Project]..[Nashville Housing]
Add SplitedOwnerCity nvarchar(255)
UPDATE [Portfolio Project]..[Nashville Housing]
SET SplitedOwnerCity = PARSENAME(REPLACE(OwnerAddress,',','.'), 2)
ALTER TABLE [Portfolio Project]..[Nashville Housing]
Add SplitedOwnerState nvarchar(255)
UPDATE [Portfolio Project]..[Nashville Housing]
SET SplitedOwnerState = PARSENAME(REPLACE(OwnerAddress,',','.'), 1)
SELECT SplitedOwnerAddress, SplitedOwnerCity, SplitedOwnerState
FROM [Portfolio Project]..[Nashville Housing]
SELECT * FROM [Portfolio Project]..[Nashville Housing]
--Change Y and N to YES and NO respectively:
SELECT SoldAsVacant, COUNT(SoldAsVacant) as Counts
FROM [Portfolio Project]..[Nashville Housing]
GROUP BY SoldAsVacant
ORDER BY Counts DESC
SELECT SoldAsVacant,
CASE
WHEN SoldAsVacant ='Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
END
as ModifiedSAV
FROM [Portfolio Project]..[Nashville Housing]
WHERE SoldAsVacant IN ('Y', 'N')
UPDATE [Portfolio Project]..[Nashville Housing]
SET SoldAsVacant = CASE
WHEN SoldAsVacant ='Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
END
SELECT SoldAsVacant, COUNT(SoldAsVacant) as Counts
FROM [Portfolio Project]..[Nashville Housing]
GROUP BY SoldAsVacant
ORDER BY Counts DESC
--Deleting Duplicates:
WITH ROWNUMCTE AS(
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
ORDER BY
UniqueID
) row_num
FROM [Portfolio Project]..[Nashville Housing]
--ORDER BY ParcelID
)
DELETE FROM ROWNUMCTE
WHERE row_num>1
--ORDER BY PropertyAddress
--Verifying the deletion
WITH ROWNUMCTE AS(
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
ORDER BY
UniqueID
) row_num
FROM [Portfolio Project]..[Nashville Housing]
--ORDER BY ParcelID
)
SELECT * FROM ROWNUMCTE
WHERE row_num>1
ORDER BY PropertyAddress
--Deleting Unused Columns:
SELECT * FROM
[Portfolio Project]..[Nashville Housing]
ALTER TABLE [Portfolio Project]..[Nashville Housing]
DROP COLUMN OwnerAddress, PropertyAddress, TaxDistrict
ALTER TABLE [Portfolio Project]..[Nashville Housing]
DROP COLUMN SaleDate