1
+ {
2
+ "cells" : [
3
+ {
4
+ "cell_type" : " code" ,
5
+ "execution_count" : null ,
6
+ "metadata" : {
7
+ "scrolled" : true
8
+ },
9
+ "outputs" : [],
10
+ "source" : [
11
+ " # import data\n " ,
12
+ " \n " ,
13
+ " import pandas as pd\n " ,
14
+ " \n " ,
15
+ " data = pd.read_excel('input_data.xlsx')\n " ,
16
+ " \n " ,
17
+ " data"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type" : " code" ,
22
+ "execution_count" : null ,
23
+ "metadata" : {},
24
+ "outputs" : [],
25
+ "source" : [
26
+ " # read headers\n " ,
27
+ " \n " ,
28
+ " print(data.columns)"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type" : " code" ,
33
+ "execution_count" : null ,
34
+ "metadata" : {},
35
+ "outputs" : [],
36
+ "source" : [
37
+ " data.area.describe()"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type" : " code" ,
42
+ "execution_count" : null ,
43
+ "metadata" : {
44
+ "scrolled" : true
45
+ },
46
+ "outputs" : [],
47
+ "source" : [
48
+ " data.info()"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type" : " code" ,
53
+ "execution_count" : null ,
54
+ "metadata" : {
55
+ "scrolled" : true
56
+ },
57
+ "outputs" : [],
58
+ "source" : [
59
+ " # find all unique cities\n " ,
60
+ " \n " ,
61
+ " all_city = data['city'].unique()\n " ,
62
+ " print(\" City array: {0}\" .format(all_city))\n " ,
63
+ " \n " ,
64
+ " # find all unique countries\n " ,
65
+ " \n " ,
66
+ " all_country = data['country'].unique()\n " ,
67
+ " print(\" Country array: {0}\" .format(all_country))"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type" : " code" ,
72
+ "execution_count" : null ,
73
+ "metadata" : {},
74
+ "outputs" : [],
75
+ "source" : [
76
+ " # map country name variants to 3-letter codes\n " ,
77
+ " dicionary_corect = {'US':'USA', 'USA':'USA', ' United States of America':'USA', 'America':'USA', 'Poland':'POL', 'PL':'POL', 'Polska':'POL' }\n " ,
78
+ " mapping_country = data['country'].map(dicionary_corect).fillna(data['country']) # keep values missing from the dictionary (e.g. an already-mapped 'POL') instead of turning them into NaN, so re-running this cell is safe\n " ,
79
+ " \n " ,
80
+ " data['country'] = mapping_country\n " ,
81
+ " \n " ,
82
+ " data"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type" : " code" ,
87
+ "execution_count" : null ,
88
+ "metadata" : {},
89
+ "outputs" : [],
90
+ "source" : [
91
+ " # check area and population value\n " ,
92
+ " \n " ,
93
+ " for city in all_city:\n " ,
94
+ " # get the unique non-null area and population values for the city\n " ,
95
+ " area = data[(data['city']==city) & (~data['area'].isna())]['area'].unique()\n " ,
96
+ " population = data[(data['city']==city) & (~data['population'].isna())]['population'].unique()\n " ,
97
+ " if len(area) == 1:\n " ,
98
+ " data.loc[(data['city']==city) & (data['area'].isna()), 'area'] = area[0] # assign the scalar, not the 1-element array, so it broadcasts to every matching row\n " ,
99
+ " else:\n " ,
100
+ " print('Area data mismatch on the context of {0}'.format(city))\n " ,
101
+ " \n " ,
102
+ " if len(population) == 1:\n " ,
103
+ " data.loc[(data['city']==city) & (data['population'].isna()), 'population'] = population[0] # scalar for the same reason as area above\n " ,
104
+ " else:\n " ,
105
+ " print('Population data mismatch on the context of {0}'.format(city))\n " ,
106
+ " \n " ,
107
+ " data"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type" : " code" ,
112
+ "execution_count" : null ,
113
+ "metadata" : {},
114
+ "outputs" : [],
115
+ "source" : [
116
+ " # get country\n " ,
117
+ " \n " ,
118
+ " country_list = pd.DataFrame(data['country'].unique(), columns=['country'])\n " ,
119
+ " \n " ,
120
+ " country_list.index.name = 'id'\n " ,
121
+ " \n " ,
122
+ " country_list"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type" : " code" ,
127
+ "execution_count" : null ,
128
+ "metadata" : {},
129
+ "outputs" : [],
130
+ "source" : [
131
+ " # get cities and connect them with their country\n " ,
132
+ " \n " ,
133
+ " city_list = data[['city','country']].drop_duplicates().reset_index().drop(columns = ['index']);\n " ,
134
+ " city_list.index.name = 'id'\n " ,
135
+ " \n " ,
136
+ " city_list = city_list.rename(columns = {'country':'country_id'})\n " ,
137
+ " \n " ,
138
+ " city_list"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type" : " code" ,
143
+ "execution_count" : null ,
144
+ "metadata" : {},
145
+ "outputs" : [],
146
+ "source" : [
147
+ " city_list['country_id'] = city_list['country_id'].map(lambda x: country_list[country_list['country'] == x].index.values.astype(int)[0])\n " ,
148
+ " \n " ,
149
+ " city_list"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type" : " code" ,
154
+ "execution_count" : null ,
155
+ "metadata" : {},
156
+ "outputs" : [],
157
+ "source" : [
158
+ " # get area and population\n " ,
159
+ " \n " ,
160
+ " city_pop_area = data[['city','area', 'population', 'president']].drop_duplicates().reset_index().drop(columns = ['index']);\n " ,
161
+ " city_pop_area.index.name = 'id'\n " ,
162
+ " \n " ,
163
+ " city_pop_area = city_pop_area.rename(columns = {'city':'city_id'})\n " ,
164
+ " \n " ,
165
+ " city_pop_area['city_id'] = city_pop_area['city_id'].map(lambda x: city_list[city_list['city'] == x].index.values.astype(int)[0])\n " ,
166
+ " \n " ,
167
+ " city_pop_area"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type" : " code" ,
172
+ "execution_count" : null ,
173
+ "metadata" : {},
174
+ "outputs" : [],
175
+ "source" : [
176
+ " # get city and monument\n " ,
177
+ " \n " ,
178
+ " city_monuments = data[['city', 'monument']].drop_duplicates().dropna().reset_index().drop(columns = ['index']);\n " ,
179
+ " city_monuments.index.name = 'id'\n " ,
180
+ " \n " ,
181
+ " city_monuments = city_monuments.rename(columns = {'city':'city_id'})\n " ,
182
+ " \n " ,
183
+ " city_monuments['city_id'] = city_monuments['city_id'].map(lambda x: city_list[city_list['city'] == x].index.values.astype(int)[0])\n " ,
184
+ " \n " ,
185
+ " city_monuments"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type" : " code" ,
190
+ "execution_count" : 1 ,
191
+ "metadata" : {},
192
+ "outputs" : [],
193
+ "source" : [
194
+ " # Table definitions (the data insert happens in the next cell)\n " ,
195
+ " \n " ,
196
+ " from sqlalchemy import create_engine\n " ,
197
+ " from sqlalchemy.ext.declarative import declarative_base\n " ,
198
+ " \n " ,
199
+ " db_string = \" postgresql://postgres:postgres@127.0.0.1:5432/testAGH\" # 'postgresql://' is the dialect name SQLAlchemy expects; the legacy 'postgres://' alias is rejected by SQLAlchemy 1.4+\n " ,
200
+ " \n " ,
201
+ " engine = create_engine(db_string)\n " ,
202
+ " \n " ,
203
+ " Base = declarative_base()\n " ,
204
+ " \n " ,
205
+ " # Import column structure and constraints\n " ,
206
+ " \n " ,
207
+ " from sqlalchemy import Column, Integer, String, Float, ForeignKey, Sequence, CheckConstraint, UniqueConstraint\n " ,
208
+ " \n " ,
209
+ " class Country(Base):\n " ,
210
+ " __tablename__ = 'countryies'\n " ,
211
+ " __table_args__ = (\n " ,
212
+ " CheckConstraint('length(country) = 3'),\n " ,
213
+ " UniqueConstraint('country'),\n " ,
214
+ " )\n " ,
215
+ " id = Column(Integer, Sequence('seq_country_id'), primary_key = True)\n " ,
216
+ " country = Column(String(50), nullable = False)\n " ,
217
+ " \n " ,
218
+ " class City(Base):\n " ,
219
+ " __tablename__ = 'cities'\n " ,
220
+ " __table_args__ = (\n " ,
221
+ " CheckConstraint('length(city) > 0'),\n " ,
222
+ " )\n " ,
223
+ " id = Column(Integer, Sequence('seq_city_id'), primary_key=True)\n " ,
224
+ " country_id = Column(Integer, ForeignKey('countryies.id'))\n " ,
225
+ " city = Column(String, nullable = False)\n " ,
226
+ " \n " ,
227
+ " class City_data(Base):\n " ,
228
+ " __tablename__ = 'city_data'\n " ,
229
+ " __table_args__ = (\n " ,
230
+ " CheckConstraint('area > 0'),\n " ,
231
+ " CheckConstraint('population >= 0')\n " ,
232
+ " )\n " ,
233
+ " id = Column(Integer, Sequence('seq_city_data_id'), primary_key=True )\n " ,
234
+ " city_id = Column(Integer, ForeignKey('cities.id'))\n " ,
235
+ " area = Column(Float, nullable = False, default=0)\n " ,
236
+ " population = Column(Integer, nullable = False, default=0)\n " ,
237
+ " president = Column(String(60), nullable = True, default='')\n " ,
238
+ " \n " ,
239
+ " class Monument(Base):\n " ,
240
+ " __tablename__ = 'monuments'\n " ,
241
+ " __table_args__ = (\n " ,
242
+ " CheckConstraint('length(monument) > 0'),\n " ,
243
+ " )\n " ,
244
+ " id = Column(Integer, Sequence('seq_monument_id'), primary_key=True )\n " ,
245
+ " city_id = Column(Integer, ForeignKey('cities.id'))\n " ,
246
+ " monument = Column(String(100), nullable = True)\n " ,
247
+ " \n " ,
248
+ " Base.metadata.create_all(engine)"
249
+ ]
250
+ },
251
+ {
252
+ "cell_type" : " code" ,
253
+ "execution_count" : null ,
254
+ "metadata" : {},
255
+ "outputs" : [],
256
+ "source" : [
257
+ " country_list.to_sql('countryies',engine, if_exists='append')\n " ,
258
+ " city_list.to_sql('cities',engine, if_exists='append') # table name must match City.__tablename__, otherwise the FKs to cities.id in city_data/monuments point at an empty table\n " ,
259
+ " city_pop_area.to_sql('city_data',engine, if_exists='append')\n " ,
260
+ " city_monuments.to_sql('monuments',engine, if_exists='append')"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type" : " code" ,
265
+ "execution_count" : null ,
266
+ "metadata" : {},
267
+ "outputs" : [],
268
+ "source" : []
269
+ }
270
+ ],
271
+ "metadata" : {
272
+ "kernelspec" : {
273
+ "name" : " python37664bit8db317473b104a8fa7579acc07e75f1f" ,
274
+ "display_name" : " Python 3.7.6 64-bit"
275
+ },
276
+ "language_info" : {
277
+ "codemirror_mode" : {
278
+ "name" : " ipython" ,
279
+ "version" : 3
280
+ },
281
+ "file_extension" : " .py" ,
282
+ "mimetype" : " text/x-python" ,
283
+ "name" : " python" ,
284
+ "nbconvert_exporter" : " python" ,
285
+ "pygments_lexer" : " ipython3" ,
286
+ "version" : " 3.7.6"
287
+ },
288
+ "metadata" : {
289
+ "interpreter" : {
290
+ "hash" : " 9164a3399a70d355c381b62813f30880ed90ca5a6f321bf0d85375640bda7ee5"
291
+ }
292
+ }
293
+ },
294
+ "nbformat" : 4 ,
295
+ "nbformat_minor" : 2
296
+ }
0 commit comments