Skip to content

Commit 35fdbd9

Browse files
authored
Refactoring of File Reader classes to accommodate for AWS SDK S3 integration (NVIDIA#5434)
Signed-off-by: Joaquin Anton <janton@nvidia.com>
1 parent 22864d9 commit 35fdbd9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+997
-481
lines changed

dali/kernels/slice/slice_flip_normalize_gpu_test.cu

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -79,7 +79,7 @@ class SliceFlipNormalizeGPUTest : public ::testing::Test {
7979
}
8080

8181
void LoadTensor(Tensor<CPUBackend> &tensor, const std::string& path_npy) {
82-
auto stream = FileStream::Open(path_npy, false, false);
82+
auto stream = FileStream::Open(path_npy);
8383
tensor = ::dali::numpy::ReadTensor(stream.get(), true);
8484
}
8585

dali/operators/imgcodec/decoder_test_helper.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ inline Tensor<CPUBackend> ReadReference(InputStream *src, TensorLayout layout =
248248
*/
249249
inline Tensor<CPUBackend> ReadReferenceFrom(const std::string &reference_path,
250250
TensorLayout layout = "HWC") {
251-
auto src = FileStream::Open(reference_path, false, false);
251+
auto src = FileStream::Open(reference_path);
252252
return ReadReference(src.get(), layout);
253253
}
254254

dali/operators/reader/file_reader_op.cc

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -114,6 +114,10 @@ list of files in the sub-directories of the ``file_root``.
114114
115115
This argument is ignored when file paths are taken from ``file_list`` or ``files``.)",
116116
kKnownExtensionsGlob)
117+
.AddOptionalArg<std::vector<string>>("dir_filters", R"(A list of glob strings to filter the
118+
list of sub-directories under ``file_root``.
119+
120+
This argument is ignored when file paths are taken from ``file_list`` or ``files``.)", nullptr)
117121
.AddOptionalArg<bool>("case_sensitive_filter", R"(If set to True, the filter will be matched
118122
case-sensitively, otherwise case-insensitively.)", false)
119123
.AddParent("LoaderBase");

dali/operators/reader/loader/CMakeLists.txt

+4-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@ collect_headers(DALI_INST_HDRS PARENT_SCOPE)
2525

2626
set(DALI_OPERATOR_SRCS ${DALI_OPERATOR_SRCS}
2727
"${CMAKE_CURRENT_SOURCE_DIR}/filesystem.cc"
28+
"${CMAKE_CURRENT_SOURCE_DIR}/discover_files.cc"
2829
"${CMAKE_CURRENT_SOURCE_DIR}/file_label_loader.cc"
2930
"${CMAKE_CURRENT_SOURCE_DIR}/coco_loader.cc"
3031
"${CMAKE_CURRENT_SOURCE_DIR}/loader.cc"
@@ -57,7 +58,8 @@ endif()
5758
set(DALI_OPERATOR_TEST_SRCS ${DALI_OPERATOR_TEST_SRCS}
5859
"${CMAKE_CURRENT_SOURCE_DIR}/loader_test.cc"
5960
"${CMAKE_CURRENT_SOURCE_DIR}/sequence_loader_test.cc"
60-
"${CMAKE_CURRENT_SOURCE_DIR}/filesystem_test.cc")
61+
"${CMAKE_CURRENT_SOURCE_DIR}/filesystem_test.cc"
62+
"${CMAKE_CURRENT_SOURCE_DIR}/discover_files_test.cc")
6163

6264
if (BUILD_LIBSND)
6365
set(DALI_OPERATOR_TEST_SRCS ${DALI_OPERATOR_TEST_SRCS}

dali/operators/reader/loader/coco_loader.cc

+14-14
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -141,13 +141,13 @@ void SaveToFile(const std::vector<std::vector<T> > &input, const std::string pat
141141
}
142142

143143
template <>
144-
void SaveToFile(const ImageIdPairs &image_id_pairs, const std::string path) {
145-
if (image_id_pairs.empty())
144+
void SaveToFile(const std::vector<FileLabelEntry> &entries, const std::string path) {
145+
if (entries.empty())
146146
return;
147147
std::ofstream file(path);
148148
DALI_ENFORCE(file, "CocoReader meta file error while saving: " + path);
149-
for (const auto &p : image_id_pairs) {
150-
file << p.first << std::endl;
149+
for (const auto &p : entries) {
150+
file << p.filename << std::endl;
151151
}
152152
DALI_ENFORCE(file.good(), make_string("Error writing to path: ", path));
153153
}
@@ -203,16 +203,16 @@ void LoadFromFile(std::vector<std::vector<T> > &output, const std::string path)
203203
}
204204

205205
template <>
206-
void LoadFromFile(ImageIdPairs &image_id_pairs, const std::string path) {
206+
void LoadFromFile(std::vector<FileLabelEntry> &entries, const std::string path) {
207207
std::ifstream file(path);
208-
image_id_pairs.clear();
208+
entries.clear();
209209
if (!file.good())
210210
return;
211211

212212
int id = 0;
213213
std::string filename;
214214
while (file >> filename) {
215-
image_id_pairs.emplace_back(std::move(filename), int{id});
215+
entries.push_back({std::move(filename), id});
216216
++id;
217217
}
218218
}
@@ -417,14 +417,14 @@ void ParseJsonFile(const OpSpec &spec, std::vector<detail::ImageInfo> &image_inf
417417

418418
} // namespace detail
419419

420-
void CocoLoader::SavePreprocessedAnnotations(const std::string &path,
421-
const ImageIdPairs &image_id_pairs) {
420+
void CocoLoader::SavePreprocessedAnnotations(
421+
const std::string &path, const std::vector<FileLabelEntry> &entries) {
422422
using detail::SaveToFile;
423423
SaveToFile(offsets_, path + "/offsets.dat");
424424
SaveToFile(boxes_, path + "/boxes.dat");
425425
SaveToFile(labels_, path + "/labels.dat");
426426
SaveToFile(counts_, path + "/counts.dat");
427-
SaveToFile(image_id_pairs, path + "/filenames.dat");
427+
SaveToFile(entries, path + "/filenames.dat");
428428

429429
if (output_polygon_masks_ || output_pixelwise_masks_) {
430430
SaveToFile(polygon_data_, path + "/polygon_data.dat");
@@ -459,7 +459,7 @@ void CocoLoader::ParsePreprocessedAnnotations() {
459459
LoadFromFile(boxes_, path + "/boxes.dat");
460460
LoadFromFile(labels_, path + "/labels.dat");
461461
LoadFromFile(counts_, path + "/counts.dat");
462-
LoadFromFile(image_label_pairs_, path + "/filenames.dat");
462+
LoadFromFile(file_label_entries_, path + "/filenames.dat");
463463

464464
if (output_polygon_masks_ || output_pixelwise_masks_) {
465465
LoadFromFile(polygon_data_, path + "/polygon_data.dat");
@@ -628,7 +628,7 @@ void CocoLoader::ParseJsonAnnotations() {
628628
}
629629
}
630630

631-
image_label_pairs_.emplace_back(std::move(image_info.filename_), new_image_id);
631+
file_label_entries_.push_back({std::move(image_info.filename_), new_image_id});
632632
new_image_id++;
633633
}
634634
}
@@ -639,7 +639,7 @@ void CocoLoader::ParseJsonAnnotations() {
639639
if (spec_.GetArgument<bool>("save_preprocessed_annotations")) {
640640
SavePreprocessedAnnotations(
641641
spec_.GetArgument<std::string>("save_preprocessed_annotations_dir"),
642-
image_label_pairs_);
642+
file_label_entries_);
643643
}
644644
}
645645

dali/operators/reader/loader/coco_loader.h

+5-6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -34,8 +34,6 @@ extern "C" {
3434

3535
namespace dali {
3636

37-
using ImageIdPairs = std::vector<std::pair<std::string, int>>;
38-
3937
inline bool OutPolygonMasksEnabled(const OpSpec &spec) {
4038
return spec.GetArgument<bool>("polygon_masks") ||
4139
(spec.HasArgument("masks") && spec.GetArgument<bool>("masks"));
@@ -189,12 +187,12 @@ class DLL_PUBLIC CocoLoader : public FileLabelLoaderBase<true> {
189187
// seeded with hardcoded value to get
190188
// the same sequence on every shard
191189
std::mt19937 g(kDaliDataloaderSeed);
192-
std::shuffle(image_label_pairs_.begin(), image_label_pairs_.end(), g);
190+
std::shuffle(file_label_entries_.begin(), file_label_entries_.end(), g);
193191
}
194192

195193
if (IsCheckpointingEnabled() && shuffle_after_epoch_) {
196194
// save initial order
197-
backup_image_label_pairs_ = image_label_pairs_;
195+
backup_file_label_entries_ = file_label_entries_;
198196
}
199197
Reset(true);
200198
}
@@ -203,7 +201,8 @@ class DLL_PUBLIC CocoLoader : public FileLabelLoaderBase<true> {
203201

204202
void ParseJsonAnnotations();
205203

206-
void SavePreprocessedAnnotations(const std::string &path, const ImageIdPairs &image_id_pairs);
204+
void SavePreprocessedAnnotations(
205+
const std::string &path, const std::vector<FileLabelEntry> &image_id_pairs);
207206

208207
private:
209208
const OpSpec spec_;

dali/operators/reader/loader/cufile_loader.h

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -38,8 +38,7 @@ namespace dali {
3838
template <typename Target>
3939
class CUFileLoader : public FileLoader<GPUBackend, Target, CUFileStream> {
4040
public:
41-
explicit CUFileLoader(const OpSpec& spec, vector<std::string> images = {},
42-
bool shuffle_after_epoch = false)
41+
CUFileLoader(const OpSpec& spec, bool shuffle_after_epoch)
4342
: FileLoader<GPUBackend, Target, CUFileStream>(spec, shuffle_after_epoch) {
4443
}
4544

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "dali/operators/reader/loader/discover_files.h"
16+
#include <dirent.h>
17+
#include <errno.h>
18+
#include <fnmatch.h>
19+
#include <glob.h>
20+
#include <sys/stat.h>
21+
#include <algorithm>
22+
#include <cstring>
23+
#include <filesystem>
24+
#include <optional>
25+
#include <string>
26+
#include <unordered_map>
27+
#include <utility>
28+
#include <vector>
29+
#include "dali/core/call_at_exit.h"
30+
#include "dali/core/error_handling.h"
31+
#include "dali/operators/reader/loader/filesystem.h"
32+
#include "dali/operators/reader/loader/utils.h"
33+
34+
namespace dali {
35+
36+
std::vector<std::string> list_subdirectories(const std::string &parent_dir,
37+
const std::vector<std::string> dir_filters = {},
38+
bool case_sensitive_filter = true) {
39+
// open the root
40+
DIR *dir = opendir(parent_dir.c_str());
41+
DALI_ENFORCE(dir != nullptr, make_string("Failed to open ", parent_dir));
42+
auto cleanup = AtScopeExit([&dir] {
43+
closedir(dir);
44+
});
45+
46+
struct dirent *entry;
47+
std::vector<std::string> subdirs;
48+
49+
while ((entry = readdir(dir))) {
50+
struct stat s;
51+
std::string entry_name(entry->d_name);
52+
std::string full_path = filesystem::join_path(parent_dir, entry_name);
53+
int ret = stat(full_path.c_str(), &s);
54+
DALI_ENFORCE(ret == 0, "Could not access " + full_path + " during directory traversal.");
55+
if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
56+
continue;
57+
if (S_ISDIR(s.st_mode)) {
58+
if (dir_filters.empty()) {
59+
subdirs.push_back(entry_name);
60+
} else {
61+
for (auto &filter : dir_filters) {
62+
if (fnmatch(filter.c_str(), entry_name.c_str(),
63+
case_sensitive_filter ? 0 : FNM_CASEFOLD) == 0) {
64+
subdirs.push_back(entry_name);
65+
}
66+
}
67+
}
68+
}
69+
}
70+
// sort directories to preserve class alphabetic order, as readdir could
71+
// return unordered dir list. Otherwise file reader for training and validation
72+
// could return directories with the same names in completely different order
73+
std::sort(subdirs.begin(), subdirs.end());
74+
return subdirs;
75+
}
76+
77+
std::vector<std::string> list_files(const std::string &parent_dir,
78+
const std::vector<std::string> filters = {},
79+
bool case_sensitive_filter = true) {
80+
DIR *dir = opendir(parent_dir.c_str());
81+
DALI_ENFORCE(dir != nullptr, make_string("Failed to open ", parent_dir));
82+
auto cleanup = AtScopeExit([&dir] {
83+
closedir(dir);
84+
});
85+
86+
dirent *entry;
87+
std::vector<std::string> files;
88+
while ((entry = readdir(dir))) {
89+
#ifdef _DIRENT_HAVE_D_TYPE
90+
/*
91+
* we support only regular files and symlinks, if FS returns DT_UNKNOWN
92+
* it doesn't mean anything and let us validate filename itself
93+
*/
94+
if (entry->d_type != DT_REG && entry->d_type != DT_LNK && entry->d_type != DT_UNKNOWN) {
95+
continue;
96+
}
97+
#endif
98+
std::string fname(entry->d_name);
99+
for (auto &filter : filters) {
100+
if (fnmatch(filter.c_str(), fname.c_str(), case_sensitive_filter ? 0 : FNM_CASEFOLD) == 0) {
101+
files.push_back(fname);
102+
break;
103+
}
104+
}
105+
}
106+
std::sort(files.begin(), files.end());
107+
return files;
108+
}
109+
110+
std::vector<FileLabelEntry> discover_files(const std::string &file_root,
111+
const FileDiscoveryOptions &opts) {
112+
bool is_s3 = starts_with(file_root, "s3://");
113+
if (is_s3) {
114+
DALI_FAIL("This version of DALI was not built with AWS S3 storage support.");
115+
}
116+
117+
std::vector<std::string> subdirs;
118+
subdirs = list_subdirectories(file_root, opts.dir_filters, opts.case_sensitive_filter);
119+
std::vector<FileLabelEntry> entries;
120+
auto process_dir = [&](const std::string &rel_dirpath, std::optional<int> label = {}) {
121+
auto full_dirpath = filesystem::join_path(file_root, rel_dirpath);
122+
auto tmp_files = list_files(full_dirpath, opts.file_filters, opts.case_sensitive_filter);
123+
for (const auto &f : tmp_files) {
124+
entries.push_back({filesystem::join_path(rel_dirpath, f), label});
125+
}
126+
};
127+
128+
// if we are in "label_from_subdir" mode, we need a subdir to infer the label, therefore we don't
129+
// visit the current directory
130+
if (!opts.label_from_subdir) {
131+
process_dir(".");
132+
}
133+
for (unsigned dir_idx = 0; dir_idx < subdirs.size(); ++dir_idx) {
134+
process_dir(subdirs[dir_idx],
135+
opts.label_from_subdir ? std::optional<int>{dir_idx} : std::nullopt);
136+
}
137+
size_t total_dir_count = opts.label_from_subdir ? subdirs.size() : subdirs.size() + 1;
138+
LOG_LINE << "read " << entries.size() << " files from " << total_dir_count << "directories\n";
139+
return entries;
140+
}
141+
142+
} // namespace dali

0 commit comments

Comments
 (0)