#!/usr/bin/env python3
"""Convert the USDA's FoodData Central CSV files into a single SQLite3 file.

Download the CSVs from https://fdc.nal.usda.gov/download-datasets.html and
select the "All Foods" zip.
"""
# Copyright (c) 2019 Doug Hogan <github@acyclic.org>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
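
# Example usage (the directory name below is illustrative -- point -d at
# wherever the "All Foods" zip was extracted; add -f to overwrite an existing
# output file):
#   python3 import_food_data.py -d FoodData_Central_csv -o usda_food_data.db
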
import argparse
import csv
import os
import os.path
import sqlite3
import sys
from glob import glob
from typing import List, Optional


def import_csv(cursor: sqlite3.Cursor, csvfile: str, batch: int = 1000,
               verbose: bool = False) -> None:
    """Import a given USDA FoodData Central CSV file.

    Assumes that the schema has already been created."""
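    # csv.reader yields every field as a string; empty fields are mapped to
    # None below so sqlite3 binds NULL rather than ''. Rows are flushed in
    # batches of `batch` rows so memory stays bounded on the larger files
    # (e.g. food_nutrient.csv).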
    table = os.path.basename(csvfile)[:-4]
    rows = []

    with open(csvfile, newline='') as desc:
        reader = csv.reader(desc, strict=True)
        next(reader)            # Skip the header

        for row in reader:
            # Replace "" with None so NULL values are handled properly.
            rows.append(list(map(lambda x: None if x == "" else x, row)))

            if len(rows) >= batch:
                rows = insert_many(cursor, table, rows, verbose)

    if rows:
        rows = insert_many(cursor, table, rows, verbose)


def insert_many(cursor: sqlite3.Cursor, table: str, rows: List[List[Optional[str]]],
                verbose: bool = False) -> List[List[Optional[str]]]:
    """Insert many rows into a table."""
    placeholders = ','.join(['?'] * len(rows[0]))
    sql = f"INSERT INTO {table} VALUES ({placeholders})"
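    # For the two-column acquisition_sample table, for example, this builds
    # "INSERT INTO acquisition_sample VALUES (?,?)"; executemany() then binds
    # one row of parameters per entry in `rows`.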

    if verbose:
        print(f"Running {sql} with {rows}")

    cursor.executemany(sql, rows)

    return []


def query_counts(cursor: sqlite3.Cursor, csv_count_file: str, verbose: bool = False) -> None:
    """Double-check that the number of imported rows matches the count file."""
    with open(csv_count_file, newline='') as desc:
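        # Each data row of the count file is expected to hold a table name and
        # its row count; the values may be quoted, hence the replace() below.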
        reader = csv.reader(desc, strict=True)
        next(reader)            # Skip the header

        for row in reader:
            row = [x.replace('"', '') for x in row]
            table, count = row[0], int(row[1])

            num_rows = cursor.execute(f'''
                SELECT COUNT(*) AS {table}_count
                FROM {table};
            ''').fetchone()[0]

            if verbose:
                print(f"Inserted {num_rows} into {table}")

            if num_rows != count:
                print(f"==> Expected {count} but inserted {num_rows} in {table}")


def sqlite3_schema(cursor: sqlite3.Cursor) -> None:
    """SQLite3 Schema for USDA's FoodData Central database"""
    cursor.executescript('''
CREATE TABLE acquisition_sample (
"fdc_id_of_sample_food" INT REFERENCES food(fdc_id),
"fdc_id_of_acquisition_food" INT REFERENCES food(fdc_id),
PRIMARY KEY(fdc_id_of_sample_food, fdc_id_of_acquisition_food)
);
CREATE TABLE agricultural_acquisition (
"fdc_id" INT NOT NULL PRIMARY KEY,
"acquisition_date" TEXT
CHECK(acquisition_date IS NULL OR
acquisition_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
"market_class" TEXT,
"treatment" TEXT,
"state" TEXT
);
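-- Dates are stored as TEXT throughout the schema. Each date column gets a
-- CHECK like the one above: the GLOB only admits strings shaped like
-- YYYY-MM-DD, and the trailing "IS 1" makes the constraint pass only when
-- the pattern actually matched.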
CREATE TABLE branded_food (
"fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
"brand_owner" TEXT, -- XXX Inconsistent names
"gtin_upc" TEXT,
"ingredients" TEXT,
"serving_size" REAL,
"serving_size_unit" TEXT
CHECK(serving_size_unit IN ('g', 'ml')),
"household_serving_fulltext" TEXT,
"branded_food_category" TEXT,
"data_source" TEXT
CHECK(data_source IN ('GDSN', 'LI')),
"modified_date" TEXT
CHECK(modified_date IS NULL OR
modified_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
"available_date" TEXT
CHECK(available_date IS NULL OR
available_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1)
);
CREATE INDEX idx_branded_food_gtin_upc ON branded_food (gtin_upc);
CREATE INDEX idx_branded_food_branded_food_category ON branded_food (branded_food_category);
CREATE TABLE food (
fdc_id INT NOT NULL PRIMARY KEY,
data_type TEXT,
description TEXT,
food_category_id INT REFERENCES food_category(id),
publication_date TEXT
CHECK(publication_date IS NULL OR
publication_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1)
);
CREATE INDEX idx_food_data_type ON food (data_type);
CREATE INDEX idx_food_food_category_id ON food (food_category_id);
CREATE TABLE food_attribute (
"id" INT NOT NULL PRIMARY KEY,
"fdc_id" INT REFERENCES food(fdc_id),
"seq_num" INT,
"food_attribute_type_id" INT REFERENCES food_attribute_type(id),
"name" TEXT,
"value" TEXT
);
CREATE INDEX idx_food_attribute_fdc_id ON food_attribute (fdc_id);
CREATE INDEX idx_food_attribute_food_attribute_type_id ON food_attribute (food_attribute_type_id);
CREATE TABLE food_attribute_type (
"id" INT NOT NULL PRIMARY KEY,
"name" TEXT,
"description" TEXT
);
CREATE TABLE food_calorie_conversion_factor (
"food_nutrient_conversion_factor_id" INT NOT NULL PRIMARY KEY REFERENCES food_nutrient_conversion_factor(id),
"protein_value" REAL,
"fat_value" REAL,
"carbohydrate_value" REAL
);
CREATE TABLE food_category (
"id" INT NOT NULL PRIMARY KEY,
"code" TEXT,
"description" TEXT
);
CREATE TABLE food_component (
"id" INT NOT NULL PRIMARY KEY,
"fdc_id" INT REFERENCES food(fdc_id),
"name" TEXT,
"pct_weight" REAL,
"is_refuse" TEXT
CHECK(is_refuse IN ('Y', 'N')),
"gram_weight" REAL,
"data_points" INT,
"min_year_acquired" TEXT
CHECK(min_year_acquired IS NULL OR
min_year_acquired GLOB '[0-9][0-9][0-9][0-9]' IS 1)
);
CREATE INDEX idx_food_component_fdc_id ON food_component (fdc_id);
-- XXX Field Descriptions describes "food_fat_conversion_factor" but there is no table for it.
-- XXX File is missing?
CREATE TABLE food_nutrient (
"id" INT NOT NULL PRIMARY KEY,
"fdc_id" INT REFERENCES food(fdc_id),
"nutrient_id" INT REFERENCES nutrient(id),
"amount" REAL,
"data_points" INT,
"derivation_id" INT REFERENCES food_nutrient_derivation(id),
-- XXX Missing standard_error from Field Descriptions
"min" REAL,
"max" REAL,
"median" REAL,
"footnote" TEXT,
"min_year_acquired" TEXT
CHECK(min_year_acquired IS NULL OR
min_year_acquired GLOB '[0-9][0-9][0-9][0-9]' IS 1)
);
CREATE INDEX idx_food_nutrient_fdc_id ON food_nutrient (fdc_id);
CREATE INDEX idx_food_nutrient_nutrient_id ON food_nutrient (nutrient_id);
CREATE INDEX idx_food_nutrient_derivation_id ON food_nutrient (derivation_id);
CREATE TABLE food_nutrient_conversion_factor (
"id" INT NOT NULL PRIMARY KEY,
"fdc_id" INT REFERENCES food(fdc_id)
);
CREATE INDEX idx_food_nutrient_conversion_factor_fdc_id ON food_nutrient_conversion_factor (fdc_id);
CREATE TABLE food_nutrient_derivation (
"id" INT NOT NULL PRIMARY KEY,
"code" TEXT,
"description" TEXT,
"source_id" INT REFERENCES food_nutrient_source(id)
);
CREATE INDEX idx_food_nutrient_derivation_source_id ON food_nutrient_derivation (source_id);
CREATE TABLE food_nutrient_source (
"id" INT NOT NULL PRIMARY KEY,
"code" INT UNIQUE, -- Code for source (4=calculated). XXX FK to ?
"description" TEXT
);
CREATE TABLE food_portion (
"id" INT NOT NULL PRIMARY KEY,
"fdc_id" INT REFERENCES food(fdc_id),
"seq_num" INT,
"amount" REAL,
"measure_unit_id" INT REFERENCES measure_unit(id),
"portion_description" TEXT,
"modifier" TEXT,
"gram_weight" REAL,
"data_points" INT,
"footnote" TEXT,
"min_year_acquired" TEXT
CHECK(min_year_acquired IS NULL OR
min_year_acquired GLOB '[0-9][0-9][0-9][0-9]' IS 1)
);
CREATE INDEX idx_food_portion_fdc_id ON food_portion (fdc_id);
CREATE INDEX idx_food_portion_measure_unit_id ON food_portion (measure_unit_id);
CREATE TABLE food_protein_conversion_factor (
"food_nutrient_conversion_factor_id" INT NOT NULL PRIMARY KEY REFERENCES food_nutrient_conversion_factor(id),
"value" REAL
);
CREATE TABLE foundation_food (
"fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
"NDB_number" INT UNIQUE,
"footnote" TEXT
);
CREATE TABLE input_food (
"id" INT NOT NULL PRIMARY KEY,
"fdc_id" INT REFERENCES food(fdc_id),
"fdc_id_of_input_food" INT REFERENCES food(fdc_id),
"seq_num" INT,
"amount" REAL,
"sr_code" INT, -- NDB code of SR food XXX but not a FK
"sr_description" TEXT,
"unit" TEXT, -- Unit of measure (but inconsistent)
"portion_code" INT, -- Code for portion description XXX FK?
"portion_description" TEXT,
"gram_weight" REAL,
"retention_code" INT,
"survey_flag" INT
);
CREATE INDEX idx_input_food_fdc_id ON input_food (fdc_id);
CREATE INDEX idx_input_food_fdc_id_of_input_food ON input_food (fdc_id_of_input_food);
CREATE TABLE lab_method (
"id" INT NOT NULL PRIMARY KEY,
"description" TEXT,
"technique" TEXT
);
CREATE TABLE lab_method_code (
"id" INT NOT NULL PRIMARY KEY,
"lab_method_id" INT REFERENCES lab_method(id),
"code" TEXT
);
CREATE INDEX idx_lab_method_code_lab_method_id ON lab_method_code (lab_method_id);
CREATE TABLE lab_method_nutrient (
"id" INT NOT NULL PRIMARY KEY,
"lab_method_id" INT REFERENCES lab_method(id),
"nutrient_id" INT -- XXX this constraint fails: REFERENCES nutrient(id)
);
CREATE INDEX idx_lab_method_nutrient_lab_method_id ON lab_method_nutrient (lab_method_id);
CREATE TABLE market_acquisition (
"fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
"brand_description" TEXT,
"expiration_date" TEXT
CHECK(expiration_date IS NULL OR
expiration_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
"label_weight" REAL,
"location" TEXT,
"acquisition_date" TEXT
CHECK(acquisition_date IS NULL OR
acquisition_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
"sales_type" TEXT,
"sample_lot_nbr" INT,
"sell_by_date" TEXT
CHECK(sell_by_date IS NULL OR
sell_by_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
"store_city" TEXT,
"store_name" TEXT,
"store_state" TEXT,
"upc_code" TEXT
);
CREATE TABLE measure_unit (
"id" INT NOT NULL PRIMARY KEY,
"name" TEXT UNIQUE
);
CREATE TABLE nutrient (
"id" INT NOT NULL PRIMARY KEY,
"name" TEXT,
"unit_name" TEXT,
"nutrient_nbr" INT UNIQUE,
"rank" INT -- XXX Not documented
);
-- XXX Missing table nutrient_analysis_details per Field Descriptions
CREATE TABLE nutrient_incoming_name (
"id" INT NOT NULL PRIMARY KEY,
"name" TEXT,
"nutrient_id" INT REFERENCES nutrient(id)
);
CREATE INDEX idx_nutrient_incoming_name_nutrient_id ON nutrient_incoming_name (nutrient_id);
CREATE TABLE retention_factor (
"id" INT NOT NULL PRIMARY KEY,
"code" TEXT,
"food_group_id" INT REFERENCES food_category(id),
"description" TEXT
);
CREATE INDEX idx_retention_factor_food_group_id ON retention_factor (food_group_id);
CREATE TABLE sample_food (
"fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id)
);
CREATE TABLE sr_legacy_food (
"fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
"NDB_number" INT UNIQUE -- XXX doc says starts at 100k but not in practice
);
CREATE TABLE sub_sample_food (
"fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
"fdc_id_of_sample_food" INT REFERENCES food(fdc_id)
);
CREATE INDEX idx_sub_sample_food_fdc_id_of_sample_food ON sub_sample_food (fdc_id_of_sample_food);
CREATE TABLE sub_sample_result (
"food_nutrient_id" INT NOT NULL PRIMARY KEY REFERENCES food_nutrient(id),
"adjusted_amount" REAL,
"lab_method_id" INT REFERENCES lab_method(id), -- XXX cannot use this because of broken refs: REFERENCES lab_method(id),
"nutrient_name" TEXT
);
CREATE INDEX idx_sub_sample_result_lab_method_id ON sub_sample_result (lab_method_id);
CREATE TABLE survey_fndds_food (
"fdc_id" INT NOT NULL PRIMARY KEY REFERENCES food(fdc_id),
"food_code" INT UNIQUE,
"wweia_category_code" INT REFERENCES wweia_food_category(wweia_food_category_code),
"start_date" TEXT
CHECK(start_date IS NULL OR
start_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1),
"end_date" TEXT
CHECK(end_date IS NULL OR
end_date GLOB '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]' IS 1)
);
CREATE INDEX idx_survey_fndds_food_wweia_category_code ON survey_fndds_food (wweia_category_code);
CREATE TABLE wweia_food_category (
"wweia_food_category_code" INT NOT NULL PRIMARY KEY,
"wweia_food_category_description" TEXT
);
    ''')
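
# Example query against the resulting database (illustrative only -- the LIKE
# filter is an arbitrary choice and is not part of the import):
#   SELECT f.description, n.name, fn.amount, n.unit_name
#   FROM food_nutrient AS fn
#   JOIN food AS f ON f.fdc_id = fn.fdc_id
#   JOIN nutrient AS n ON n.id = fn.nutrient_id
#   WHERE f.description LIKE '%Cheddar%';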


def process(directory: str, database: str, force: bool = False, batch: int = 1000,
            verbose: bool = False) -> None:
    """Process a directory of USDA FoodData CSVs into a SQLite file"""
    # Import in this order so it will not cause problems with foreign key
    # constraints.
    ordered = [
        'food_category.csv',
        'food.csv',
        'food_nutrient_conversion_factor.csv',
        'nutrient.csv',
        'food_nutrient_source.csv',
        'measure_unit.csv',
        'lab_method.csv',
        'wweia_food_category.csv',
        'acquisition_sample.csv',
        'agricultural_acquisition.csv',
        'branded_food.csv',
        'food_attribute_type.csv',
        'food_attribute.csv',
        'food_calorie_conversion_factor.csv',
        'food_component.csv',
        'food_nutrient_derivation.csv',
        'food_portion.csv',
        'food_protein_conversion_factor.csv',
        'foundation_food.csv',
        'sr_legacy_food.csv',
        'survey_fndds_food.csv',
        'input_food.csv',
        'lab_method_code.csv',
        'lab_method_nutrient.csv',
        'market_acquisition.csv',
        'nutrient_incoming_name.csv',
        'retention_factor.csv',
        'sample_food.csv',
        'sub_sample_food.csv',
        'food_nutrient.csv',
        'sub_sample_result.csv',
        'all_downloaded_table_record_counts.csv',
    ]
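    # Parent tables (food_category, food, nutrient, lab_method, ...) come
    # first so the child tables' foreign keys can resolve; the record-count
    # file at the end is skipped during import and only read back by check().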

    # Make sure we accounted for everything
    csvs = [os.path.basename(x) for x in glob(os.path.join(directory, "*.csv"))]
    diff = set(ordered) ^ set(csvs)
    if diff != set():
        raise Exception(f"Unhandled input: {diff}")

    if os.path.isfile(database):
        if force:
            os.remove(database)
        else:
            print("Output already exists. Use --force or delete it first")
            sys.exit(1)

    with sqlite3.connect(database) as conn:
        cursor = conn.cursor()

        # Foreign key enforcement is off by default and must be enabled at
        # runtime for every connection.
        cursor.executescript('''
            PRAGMA foreign_keys = ON;
        ''')

        sqlite3_schema(cursor)

        for fname in ordered:
            if fname == "all_downloaded_table_record_counts.csv":
                print(f"Skipping {fname}")
            else:
                print(f"Importing {fname}")
                import_csv(cursor, os.path.join(directory, fname), batch, verbose)

        # Due to lots of inserts, force a vacuum to reduce fragmentation
        print("Running integrity check, vacuum and analyze on database")
        cursor.executescript('''
            PRAGMA foreign_key_check;
            PRAGMA integrity_check;
            VACUUM;
            ANALYZE;
        ''')
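
    # The sqlite3 connection context manager only commits (or rolls back) the
    # transaction on exit; it does not close the connection, so close it
    # explicitly.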
    conn.close()


def check(directory: str, database: str, verbose: bool = False) -> None:
    """Re-open the database from disk to verify it matches the record count"""
    with sqlite3.connect(database) as conn:
        cursor = conn.cursor()

        check_file = os.path.join(directory, 'all_downloaded_table_record_counts.csv')
        print(f"Checking counts against {check_file}")
        query_counts(cursor, check_file, verbose)

    conn.close()


def main() -> None:
    """Main func"""
    parser = argparse.ArgumentParser(description='Convert USDA FoodData CSV files to SQLite3')
    parser.add_argument('-d', '--directory', default='.',
                        help='Base directory with the *.csv files')
    parser.add_argument('-b', '--batch', default=1000, type=int,
                        help='Batch inserts into sets of this length')
    parser.add_argument('-o', '--output', default='usda_food_data.db',
                        help='Output SQLite3 file')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='Whether to clobber output file')
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Verbose output')
    args = parser.parse_args()

    process(args.directory, args.output, args.force, args.batch, args.verbose)
    check(args.directory, args.output, args.verbose)


if __name__ == '__main__':
    main()