Skip to content

Commit adef2ef

Browse files
authored
GH-49043: [C++][FS][Azure] Avoid bugs caused by empty first page(s) followed by non-empty subsequent page(s) (#49049)
### Rationale for this change Prevent bugs similar to #49043 ### What changes are included in this PR? - Implement `SkipStartingEmptyPages` for various types of PagedResponses used in the `AzureFileSystem`. - Apply `SkipStartingEmptyPages` on the response from every list operation that returns a paged response. ### Are these changes tested? Ran the tests in the codebase including the ones that need to connect to real blob storage. This makes me fairly confident that I haven't introduced a regression. The only reproduce I've found involves reading a production Azure blob storage account. With this I've tested that this PR solves #49043, but I haven't been able to reproduce it in any checked in tests. I tried copying a chunk of data around our prod reproduce into azurite, but still can't reproduce. ### Are there any user-facing changes? Some low probability bugs will be gone. No interface changes. * GitHub Issue: #49043 Authored-by: Thomas Newton <thomas.w.newton@gmail.com> Signed-off-by: Sutou Kouhei <kou@clear-code.com>
1 parent e40efd8 commit adef2ef

File tree

1 file changed

+36
-0
lines changed

1 file changed

+36
-0
lines changed

cpp/src/arrow/filesystem/azurefs.cc

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -967,6 +967,32 @@ Status StageBlock(Blobs::BlockBlobClient* block_blob_client, const std::string&
967967
return Status::OK();
968968
}
969969

970+
// Usually if the first page is empty it means there are no results. This was assumed in
971+
// several places in AzureFilesystem. The Azure docs do not guarantee this and we have
972+
// evidence (GH-49043) that there can be subsequent non-empty pages.
973+
// Applying `SkipStartingEmptyPages` on a paged response corrects this assumption.
974+
void SkipStartingEmptyPages(Blobs::ListBlobContainersPagedResponse& paged_response) {
975+
while (paged_response.HasPage() && paged_response.BlobContainers.empty()) {
976+
paged_response.MoveToNextPage();
977+
}
978+
}
979+
void SkipStartingEmptyPages(Blobs::ListBlobsPagedResponse& paged_response) {
980+
while (paged_response.HasPage() && paged_response.Blobs.size() == 0) {
981+
paged_response.MoveToNextPage();
982+
}
983+
}
984+
void SkipStartingEmptyPages(Blobs::ListBlobsByHierarchyPagedResponse& paged_response) {
985+
while (paged_response.HasPage() && paged_response.Blobs.empty() &&
986+
paged_response.BlobPrefixes.empty()) {
987+
paged_response.MoveToNextPage();
988+
}
989+
}
990+
void SkipStartingEmptyPages(DataLake::ListPathsPagedResponse& paged_response) {
991+
while (paged_response.HasPage() && paged_response.Paths.empty()) {
992+
paged_response.MoveToNextPage();
993+
}
994+
}
995+
970996
/// Writes will be buffered up to this size (in bytes) before actually uploading them.
971997
static constexpr int64_t kBlockUploadSizeBytes = 10 * 1024 * 1024;
972998
/// The maximum size of a block in Azure Blob (as per docs).
@@ -1805,12 +1831,14 @@ class AzureFileSystem::Impl {
18051831
try {
18061832
FileInfo info{location.all};
18071833
auto list_response = container_client.ListBlobsByHierarchy(kDelimiter, options);
1834+
SkipStartingEmptyPages(list_response);
18081835
// Since PageSizeHint=1, we expect at most one entry in either Blobs or
18091836
// BlobPrefixes. A BlobPrefix always ends with kDelimiter ("/"), so we can
18101837
// distinguish between a directory and a file by checking if we received a
18111838
// prefix or a blob.
18121839
// This strategy allows us to implement GetFileInfo with just 1 blob storage
18131840
// operation in almost every case.
1841+
18141842
if (!list_response.BlobPrefixes.empty()) {
18151843
// Ensure the returned BlobPrefixes[0] string doesn't contain more characters than
18161844
// the requested Prefix. For instance, if we request with Prefix="dir/abra" and
@@ -1847,6 +1875,7 @@ class AzureFileSystem::Impl {
18471875
// whether the path is a directory.
18481876
options.Prefix = internal::EnsureTrailingSlash(location.path);
18491877
auto list_with_trailing_slash_response = container_client.ListBlobs(options);
1878+
SkipStartingEmptyPages(list_with_trailing_slash_response);
18501879
if (!list_with_trailing_slash_response.Blobs.empty()) {
18511880
info.set_type(FileType::Directory);
18521881
return info;
@@ -1909,6 +1938,7 @@ class AzureFileSystem::Impl {
19091938
try {
19101939
auto container_list_response =
19111940
blob_service_client_->ListBlobContainers(options, context);
1941+
SkipStartingEmptyPages(container_list_response);
19121942
for (; container_list_response.HasPage();
19131943
container_list_response.MoveToNextPage(context)) {
19141944
for (const auto& container : container_list_response.BlobContainers) {
@@ -1950,6 +1980,7 @@ class AzureFileSystem::Impl {
19501980
auto base_path_depth = internal::GetAbstractPathDepth(base_location.path);
19511981
try {
19521982
auto list_response = directory_client.ListPaths(select.recursive, options, context);
1983+
SkipStartingEmptyPages(list_response);
19531984
for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
19541985
if (list_response.Paths.empty()) {
19551986
continue;
@@ -2040,6 +2071,7 @@ class AzureFileSystem::Impl {
20402071
try {
20412072
auto list_response =
20422073
container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options, context);
2074+
SkipStartingEmptyPages(list_response);
20432075
for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
20442076
if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty()) {
20452077
continue;
@@ -2442,6 +2474,7 @@ class AzureFileSystem::Impl {
24422474
bool found_dir_marker_blob = false;
24432475
try {
24442476
auto list_response = container_client.ListBlobs(options);
2477+
SkipStartingEmptyPages(list_response);
24452478
if (list_response.Blobs.empty()) {
24462479
if (require_dir_to_exist) {
24472480
return PathNotFound(location);
@@ -2575,6 +2608,7 @@ class AzureFileSystem::Impl {
25752608
auto directory_client = adlfs_client.GetDirectoryClient(location.path);
25762609
try {
25772610
auto list_response = directory_client.ListPaths(false);
2611+
SkipStartingEmptyPages(list_response);
25782612
for (; list_response.HasPage(); list_response.MoveToNextPage()) {
25792613
for (const auto& path : list_response.Paths) {
25802614
if (path.IsDirectory) {
@@ -2899,6 +2933,7 @@ class AzureFileSystem::Impl {
28992933
list_blobs_options.PageSizeHint = 1;
29002934
try {
29012935
auto dest_list_response = dest_container_client.ListBlobs(list_blobs_options);
2936+
SkipStartingEmptyPages(dest_list_response);
29022937
dest_is_empty = dest_list_response.Blobs.empty();
29032938
if (!dest_is_empty) {
29042939
return NotEmpty(dest);
@@ -2952,6 +2987,7 @@ class AzureFileSystem::Impl {
29522987
list_blobs_options.PageSizeHint = 1;
29532988
try {
29542989
auto src_list_response = src_container_client.ListBlobs(list_blobs_options);
2990+
SkipStartingEmptyPages(src_list_response);
29552991
if (!src_list_response.Blobs.empty()) {
29562992
// Reminder: dest is used here because we're semantically replacing dest
29572993
// with src. By deleting src if it's empty just like dest.

0 commit comments

Comments
 (0)