Skip to content

Commit

Permalink
Fix false positives in loop detection. Closes #17.
Browse files Browse the repository at this point in the history
We had false positives when there was a (forward) symlink to a subdir.
Fix filter_visited so it doesn't count multiple subdirs pointing to the
same thing as a loop.
  • Loading branch information
israel-lugo committed Sep 15, 2017
1 parent 63c6d9c commit 817cb87
Showing 1 changed file with 8 additions and 7 deletions.
15 changes: 8 additions & 7 deletions capidup/finddups.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,16 +122,17 @@ def filter_visited(curr_dir, subdirs, already_visited, follow_dirlinks, on_error
returned by os.walk().
already_visited is a set of tuples (st_dev, st_ino) of already
- visited directories; it will be modified *in-place* to include the
- directories in subdirs.
+ visited directories. This set will not be modified.
on_error is a function f(OSError) -> None, to be called in case of
error.
- Returns a new (possibly filtered) subdirs list.
+ Returns a tuple: the new (possibly filtered) subdirs list, and a new
+ set of already visited directories, now including the subdirs.
"""
filtered = []
+ to_visit = set()

for subdir in subdirs:
full_path = os.path.join(curr_dir, subdir)
Expand All @@ -148,11 +149,11 @@ def filter_visited(curr_dir, subdirs, already_visited, follow_dirlinks, on_error
dev_inode = (file_info.st_dev, file_info.st_ino)
if dev_inode not in already_visited:
filtered.append(subdir)
- already_visited.add(dev_inode)
+ to_visit.add(dev_inode)
else:
on_error(OSError(errno.ELOOP, "directory loop detected", full_path))

- return filtered
+ return filtered, already_visited.union(to_visit)


def index_files_by_size(root, files_by_size, exclude_dirs, exclude_files,
Expand Down Expand Up @@ -207,8 +208,8 @@ def _print_error(error):
# remove subdirs that have already been visited; loops can happen
# if there's a symlink loop and follow_dirlinks==True, or if
# there's a hardlink loop (which is usually a corrupted filesystem)
- subdirs[:] = filter_visited(curr_dir, subdirs, already_visited,
- follow_dirlinks, _print_error)
+ subdirs[:], already_visited = filter_visited(curr_dir, subdirs,
+ already_visited, follow_dirlinks, _print_error)

for base_filename in filenames:
full_path = os.path.join(curr_dir, base_filename)
Expand Down

0 comments on commit 817cb87

Please sign in to comment.