Skip to content

Commit

Permalink
[feature] Add --index-minimum-file-count option with sane default to …
Browse files Browse the repository at this point in the history
…avoid creating index files for rather small zip archives
  • Loading branch information
mxmlnkn committed Sep 3, 2023
1 parent 96bc291 commit 5346c40
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 55 deletions.
31 changes: 21 additions & 10 deletions core/ratarmountcore/SQLiteIndex.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,17 +145,25 @@ def __init__(
encoding: str = tarfile.ENCODING,
checkMetadata: Optional[Callable[[Dict[str, Any]], None]] = None,
printDebug: int = 0,
preferMemory: bool = False,
):
"""
indexFilePath : Path to the index file. This takes precedence over defaultIndexFilePath.
If it is ':memory:', then the SQLite database will be kept in memory
and not stored to the file system at any point.
indexFolders : Specify one or multiple paths for storing .index.sqlite files. Paths will be tested for
suitability in the given order. An empty path will be interpreted as the location in which
the archive resides in.
checkMetadata : A verifying callback that is called when opening an existing index. It is given the
the dictionary of metadata in the index and should thrown an exception when the index
should not be used, e.g., because the version is incompatible.
indexFilePath
Path to the index file. This takes precedence over defaultIndexFilePath.
If it is ':memory:', then the SQLite database will be kept in memory
and not stored to the file system at any point.
indexFolders
Specify one or multiple paths for storing .index.sqlite files. Paths will be tested for
suitability in the given order. An empty path will be interpreted as the location in which
the archive resides.
checkMetadata
A verifying callback that is called when opening an existing index. It is given
the dictionary of metadata in the index and should throw an exception when the index
should not be used, e.g., because the version is incompatible.
preferMemory
If True, then load existing indexes and write to explicitly given index file paths, but
if neither is available, then create the new index in memory as if indexFilePath
= ':memory:' was specified.
"""

self.printDebug = printDebug
Expand All @@ -169,6 +177,7 @@ def __init__(
# stores which parent folders were last tried to add to database and therefore do exist
self.parentFolderCache: List[Tuple[str, str]] = []
self.checkMetadata = checkMetadata
self.preferMemory = preferMemory

@staticmethod
def getPossibleIndexFilePaths(
Expand Down Expand Up @@ -210,14 +219,16 @@ def openInMemory(self):
self._openPath(':memory:')

def openWritable(self):
if self.possibleIndexFilePaths:
if self.possibleIndexFilePaths and not self.preferMemory:
for indexPath in self.possibleIndexFilePaths:
if SQLiteIndex._pathIsWritable(
indexPath, printDebug=self.printDebug
) and SQLiteIndex._pathCanBeUsedForSqlite(indexPath, printDebug=self.printDebug):
self._openPath(indexPath)
break
else:
if self.printDebug >= 3 and self.preferMemory:
print("[Info] Create new index in memory because memory is to be preferred, e.g., for small archives.")
self._openPath(':memory:')

if not self.indexIsLoaded():
Expand Down
87 changes: 51 additions & 36 deletions core/ratarmountcore/SQLiteIndexedTar.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,42 +608,57 @@ def __init__(
# fmt: on
) -> None:
"""
tarFileName : Path to the TAR file to be opened. If not specified, a fileObject must be specified.
If only a fileObject is given, the created index can't be cached (efficiently).
fileObject : A io.IOBase derived object. If not specified, tarFileName will be opened.
If it is an instance of IndexedBzip2File, IndexedGzipFile, or IndexedZstdFile, then the offset
loading and storing from and to the SQLite database is managed automatically by this class.
writeIndex : If true, then the sidecar index file will be written to a suitable location.
Will be ignored if indexFilePath is ':memory:' or if only fileObject is specified
but not tarFileName.
clearIndexCache : If true, then check all possible index file locations for the given tarFileName/fileObject
combination and delete them. This also implicitly forces a recreation of the index.
indexFilePath : Path to the index file for this TAR archive. This takes precedence over the automatically
chosen locations. If it is ':memory:', then the SQLite database will be kept in memory
and not stored to the file system at any point.
indexFolders : Specify one or multiple paths for storing .index.sqlite files. Paths will be tested for
suitability in the given order. An empty path will be interpreted as the location in which
the TAR resides.
recursive : If true, then TAR files inside this archive will be recursively analyzed and added to the SQLite
index. Currently, this recursion can only break the outermost compression layer. I.e., a .tar.bz2
file inside a tar.bz2 file can not be mounted recursively.
gzipSeekPointSpacing : This controls the frequency of gzip decoder seek points, see indexed_gzip documentation.
Larger spacings lead to less memory usage but increase the constant seek overhead.
encoding : Will be forwarded to tarfile. Specifies how filenames inside the TAR are encoded.
ignoreZeros : Will be forwarded to tarfile. Specifies to not only skip zero blocks but also blocks with
invalid data. Setting this to true can lead to some problems but is required to correctly
read concatenated tars.
stripRecursiveTarExtension : If true and if recursive is also true, then a <file>.tar inside the current
tar will be mounted at <file>/ instead of <file>.tar/.
transformRecursiveMountPoint : If specified, then a <path>.tar inside the current tar will be matched with the
first argument of the tuple and replaced by the second argument. This new
modified path is used as recursive mount point. See also Python's re.sub.
verifyModificationTime : If true, then the index will be recreated automatically if the TAR archive has a more
recent modification time than the index file.
isGnuIncremental : If None, then it will be determined automatically. Behavior can be overwritten by setting
it to a bool value. If true, then prefixes will be stripped from certain paths encountered
with GNU incremental backups.
kwargs : Unused. Only for compatibility with generic MountSource interface.
tarFileName
Path to the TAR file to be opened. If not specified, a fileObject must be specified.
If only a fileObject is given, the created index can't be cached (efficiently).
fileObject
An io.IOBase-derived object. If not specified, tarFileName will be opened.
If it is an instance of IndexedBzip2File, IndexedGzipFile, or IndexedZstdFile, then the offset
loading and storing from and to the SQLite database is managed automatically by this class.
writeIndex
If true, then the sidecar index file will be written to a suitable location.
Will be ignored if indexFilePath is ':memory:' or if only fileObject is specified
but not tarFileName.
clearIndexCache
If true, then check all possible index file locations for the given tarFileName/fileObject
combination and delete them. This also implicitly forces a recreation of the index.
indexFilePath
Path to the index file for this TAR archive. This takes precedence over the automatically
chosen locations. If it is ':memory:', then the SQLite database will be kept in memory
and not stored to the file system at any point.
indexFolders
Specify one or multiple paths for storing .index.sqlite files. Paths will be tested for
suitability in the given order. An empty path will be interpreted as the location in which
the TAR resides.
recursive
If true, then TAR files inside this archive will be recursively analyzed and added to the SQLite
index. Currently, this recursion can only break the outermost compression layer. I.e., a .tar.bz2
file inside a .tar.bz2 file cannot be mounted recursively.
gzipSeekPointSpacing
This controls the frequency of gzip decoder seek points, see indexed_gzip documentation.
Larger spacings lead to less memory usage but increase the constant seek overhead.
encoding
Will be forwarded to tarfile. Specifies how filenames inside the TAR are encoded.
ignoreZeros
Will be forwarded to tarfile. Specifies to not only skip zero blocks but also blocks with
invalid data. Setting this to true can lead to some problems but is required to correctly
read concatenated tars.
stripRecursiveTarExtension
If true and if recursive is also true, then a <file>.tar inside the current
tar will be mounted at <file>/ instead of <file>.tar/.
transformRecursiveMountPoint
If specified, then a <path>.tar inside the current tar will be matched with the
first argument of the tuple and replaced by the second argument. This new
modified path is used as recursive mount point. See also Python's re.sub.
verifyModificationTime
If true, then the index will be recreated automatically if the TAR archive has a more
recent modification time than the index file.
isGnuIncremental
If None, then it will be determined automatically. Behavior can be overwritten by setting
it to a bool value. If true, then prefixes will be stripped from certain paths encountered
with GNU incremental backups.
kwargs
Unused. Only for compatibility with generic MountSource interface.
"""

# fmt: off
Expand Down
10 changes: 1 addition & 9 deletions core/ratarmountcore/ZipMountSource.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,22 +220,14 @@ def __init__(
ZipMountSource._findPassword(self.fileObject, options.get("passwords", []))
self.files = {info.header_offset: info for info in self.fileObject.infolist()}

# If no explicit index file path given and if it is a very small file, then avoid creating an obnoxious
# index file for it. This becomes especially important when mounting folders of ZIPs!
possibleIndexFilePaths = SQLiteIndex.getPossibleIndexFilePaths(
indexFilePath = indexFilePath, indexFolders = indexFolders, archiveFilePath = self.archiveFilePath
)
indexExists = any(path and os.path.isfile(path) for path in possibleIndexFilePaths)
if indexFilePath is None and len(self.files) < options.get("indexMinimumFileCount", 1000) and not indexExists:
indexFilePath = ':memory:'

self.index = SQLiteIndex(
indexFilePath,
indexFolders=indexFolders,
archiveFilePath=self.archiveFilePath,
encoding=self.encoding,
checkMetadata=self._checkMetadata,
printDebug=self.printDebug,
preferMemory=len(self.files) < options.get("indexMinimumFileCount", 1000),
)

if clearIndexCache:
Expand Down
7 changes: 7 additions & 0 deletions ratarmount.py
Original file line number Diff line number Diff line change
Expand Up @@ -1303,6 +1303,12 @@ def _parseArgs(rawArgs: Optional[List[str]] = None):
'--union-mount-cache-timeout', type=float, default=60,
help='Timeout in seconds before stopping to build the union mount cache.')

advancedGroup.add_argument(
'--index-minimum-file-count', type=int, default=1000,
help='Create indexes for archives with fewer than this limit of files in memory instead of '
'creating a .index.sqlite file. This is currently not applied for TAR files because the file count '
'only becomes known after parsing the archive, for which an index is already created.')

# Positional Arguments

positionalGroup.add_argument(
Expand Down Expand Up @@ -1637,6 +1643,7 @@ def cli(rawArgs: Optional[List[str]] = None) -> None:
maxCacheDepth = args.union_mount_cache_max_depth,
maxCacheEntries = args.union_mount_cache_max_entries,
maxSecondsToCache = args.union_mount_cache_timeout,
indexMinimumFileCount = args.index_minimum_file_count,
# fmt: on
)

Expand Down

0 comments on commit 5346c40

Please sign in to comment.