[feature] Add --index-minimum-file-count option with sane default to …

…avoid creating index files for rather small zip archives
mxmlnkn · Sep 3, 2023 · 5346c40 · 5346c40
1 parent 96bc291
commit 5346c40
Show file tree

Hide file tree

Showing 4 changed files with 80 additions and 55 deletions.
diff --git a/core/ratarmountcore/SQLiteIndex.py b/core/ratarmountcore/SQLiteIndex.py
@@ -145,17 +145,25 @@ def __init__(
         encoding: str = tarfile.ENCODING,
         checkMetadata: Optional[Callable[[Dict[str, Any]], None]] = None,
         printDebug: int = 0,
+        preferMemory: bool = False,
     ):
         """
-        indexFilePath : Path to the index file. This takes precedence over defaultIndexFilePath.
-                        If it is ':memory:', then the SQLite database will be kept in memory
-                        and not stored to the file system at any point.
-        indexFolders : Specify one or multiple paths for storing .index.sqlite files. Paths will be tested for
-                       suitability in the given order. An empty path will be interpreted as the location in which
-                       the archive resides in.
-        checkMetadata : A verifying callback that is called when opening an existing index. It is given the
-                        the dictionary of metadata in the index and should thrown an exception when the index
-                        should not be used, e.g., because the version is incompatible.
+        indexFilePath
+            Path to the index file. This takes precedence over defaultIndexFilePath.
+            If it is ':memory:', then the SQLite database will be kept in memory
+            and not stored to the file system at any point.
+        indexFolders
+            Specify one or multiple paths for storing .index.sqlite files. Paths will be tested for
+            suitability in the given order. An empty path will be interpreted as the location in which
+            the archive resides.
+        checkMetadata
+            A verifying callback that is called when opening an existing index. It is given
+            the dictionary of metadata in the index and should throw an exception when the index
+            should not be used, e.g., because the version is incompatible.
+        preferMemory
+            If True, then existing indexes are still loaded and explicitly given index file paths
+            are still written to, but if neither is available, then the new index is created in
+            memory as if indexFilePath = ':memory:' had been specified.
         """
 
         self.printDebug = printDebug
@@ -169,6 +177,7 @@ def __init__(
         # stores which parent folders were last tried to add to database and therefore do exist
         self.parentFolderCache: List[Tuple[str, str]] = []
         self.checkMetadata = checkMetadata
+        self.preferMemory = preferMemory
 
     @staticmethod
     def getPossibleIndexFilePaths(
@@ -210,14 +219,16 @@ def openInMemory(self):
         self._openPath(':memory:')
 
     def openWritable(self):
-        if self.possibleIndexFilePaths:
+        if self.possibleIndexFilePaths and not self.preferMemory:
             for indexPath in self.possibleIndexFilePaths:
                 if SQLiteIndex._pathIsWritable(
                     indexPath, printDebug=self.printDebug
                 ) and SQLiteIndex._pathCanBeUsedForSqlite(indexPath, printDebug=self.printDebug):
                     self._openPath(indexPath)
                     break
         else:
+            if self.printDebug >= 3 and self.preferMemory:
+                print("[Info] Create new index in memory because memory is to be preferred, e.g., for small archives.")
             self._openPath(':memory:')
 
         if not self.indexIsLoaded():

diff --git a/core/ratarmountcore/SQLiteIndexedTar.py b/core/ratarmountcore/SQLiteIndexedTar.py
@@ -608,42 +608,57 @@ def __init__(
         # fmt: on
     ) -> None:
         """
-        tarFileName : Path to the TAR file to be opened. If not specified, a fileObject must be specified.
-                      If only a fileObject is given, the created index can't be cached (efficiently).
-        fileObject : A io.IOBase derived object. If not specified, tarFileName will be opened.
-                     If it is an instance of IndexedBzip2File, IndexedGzipFile, or IndexedZstdFile, then the offset
-                     loading and storing from and to the SQLite database is managed automatically by this class.
-        writeIndex : If true, then the sidecar index file will be written to a suitable location.
-                     Will be ignored if indexFilePath is ':memory:' or if only fileObject is specified
-                     but not tarFileName.
-        clearIndexCache : If true, then check all possible index file locations for the given tarFileName/fileObject
-                          combination and delete them. This also implicitly forces a recreation of the index.
-        indexFilePath : Path to the index file for this TAR archive. This takes precedence over the automatically
-                        chosen locations. If it is ':memory:', then the SQLite database will be kept in memory
-                        and not stored to the file system at any point.
-        indexFolders : Specify one or multiple paths for storing .index.sqlite files. Paths will be tested for
-                       suitability in the given order. An empty path will be interpreted as the location in which
-                       the TAR resides.
-        recursive : If true, then TAR files inside this archive will be recursively analyzed and added to the SQLite
-                    index. Currently, this recursion can only break the outermost compression layer. I.e., a .tar.bz2
-                    file inside a tar.bz2 file can not be mounted recursively.
-        gzipSeekPointSpacing : This controls the frequency of gzip decoder seek points, see indexed_gzip documentation.
-                               Larger spacings lead to less memory usage but increase the constant seek overhead.
-        encoding : Will be forwarded to tarfile. Specifies how filenames inside the TAR are encoded.
-        ignoreZeros : Will be forwarded to tarfile. Specifies to not only skip zero blocks but also blocks with
-                      invalid data. Setting this to true can lead to some problems but is required to correctly
-                      read concatenated tars.
-        stripRecursiveTarExtension : If true and if recursive is also true, then a <file>.tar inside the current
-                                     tar will be mounted at <file>/ instead of <file>.tar/.
-        transformRecursiveMountPoint : If specified, then a <path>.tar inside the current tar will be matched with the
-                                       first argument of the tuple and replaced by the second argument. This new
-                                       modified path is used as recursive mount point. See also Python's re.sub.
-        verifyModificationTime : If true, then the index will be recreated automatically if the TAR archive has a more
-                                 recent modification time than the index file.
-        isGnuIncremental : If None, then it will be determined automatically. Behavior can be overwritten by setting
-                           it to a bool value. If true, then prefixes will be stripped from certain paths encountered
-                           with GNU incremental backups.
-        kwargs : Unused. Only for compatibility with generic MountSource interface.
+        tarFileName
+            Path to the TAR file to be opened. If not specified, a fileObject must be specified.
+            If only a fileObject is given, the created index can't be cached (efficiently).
+        fileObject
+            An io.IOBase-derived object. If not specified, tarFileName will be opened.
+            If it is an instance of IndexedBzip2File, IndexedGzipFile, or IndexedZstdFile, then the offset
+            loading and storing from and to the SQLite database is managed automatically by this class.
+        writeIndex
+            If true, then the sidecar index file will be written to a suitable location.
+            Will be ignored if indexFilePath is ':memory:' or if only fileObject is specified
+            but not tarFileName.
+        clearIndexCache
+            If true, then check all possible index file locations for the given tarFileName/fileObject
+            combination and delete them. This also implicitly forces a recreation of the index.
+        indexFilePath
+            Path to the index file for this TAR archive. This takes precedence over the automatically
+            chosen locations. If it is ':memory:', then the SQLite database will be kept in memory
+            and not stored to the file system at any point.
+        indexFolders
+            Specify one or multiple paths for storing .index.sqlite files. Paths will be tested for
+            suitability in the given order. An empty path will be interpreted as the location in which
+            the TAR resides.
+        recursive
+            If true, then TAR files inside this archive will be recursively analyzed and added to the SQLite
+            index. Currently, this recursion can only break the outermost compression layer. I.e., a .tar.bz2
+            file inside a tar.bz2 file can not be mounted recursively.
+        gzipSeekPointSpacing
+            This controls the frequency of gzip decoder seek points, see indexed_gzip documentation.
+            Larger spacings lead to less memory usage but increase the constant seek overhead.
+        encoding
+            Will be forwarded to tarfile. Specifies how filenames inside the TAR are encoded.
+        ignoreZeros
+            Will be forwarded to tarfile. Specifies to not only skip zero blocks but also blocks with
+            invalid data. Setting this to true can lead to some problems but is required to correctly
+            read concatenated tars.
+        stripRecursiveTarExtension
+            If true and if recursive is also true, then a <file>.tar inside the current
+            tar will be mounted at <file>/ instead of <file>.tar/.
+        transformRecursiveMountPoint
+            If specified, then a <path>.tar inside the current tar will be matched with the
+            first argument of the tuple and replaced by the second argument. This new
+            modified path is used as recursive mount point. See also Python's re.sub.
+        verifyModificationTime
+            If true, then the index will be recreated automatically if the TAR archive has a more
+            recent modification time than the index file.
+        isGnuIncremental
+            If None, then it will be determined automatically. Behavior can be overwritten by setting
+            it to a bool value. If true, then prefixes will be stripped from certain paths encountered
+            with GNU incremental backups.
+        kwargs
+            Unused. Only for compatibility with generic MountSource interface.
         """
 
         # fmt: off

diff --git a/core/ratarmountcore/ZipMountSource.py b/core/ratarmountcore/ZipMountSource.py
@@ -220,22 +220,14 @@ def __init__(
         ZipMountSource._findPassword(self.fileObject, options.get("passwords", []))
         self.files = {info.header_offset: info for info in self.fileObject.infolist()}
 
-        # If no explicit index file path given and if it is a very small file, then avoid creating an obnoxious
-        # index file for it. This becomes especially important when mounting folders of ZIPs!
-        possibleIndexFilePaths = SQLiteIndex.getPossibleIndexFilePaths(
-            indexFilePath = indexFilePath, indexFolders = indexFolders, archiveFilePath = self.archiveFilePath
-        )
-        indexExists = any(path and os.path.isfile(path) for path in possibleIndexFilePaths)
-        if indexFilePath is None and len(self.files) < options.get("indexMinimumFileCount", 1000) and not indexExists:
-            indexFilePath = ':memory:'
-
         self.index = SQLiteIndex(
             indexFilePath,
             indexFolders=indexFolders,
             archiveFilePath=self.archiveFilePath,
             encoding=self.encoding,
             checkMetadata=self._checkMetadata,
             printDebug=self.printDebug,
+            preferMemory=len(self.files) < options.get("indexMinimumFileCount", 1000),
         )
 
         if clearIndexCache:

diff --git a/ratarmount.py b/ratarmount.py
@@ -1303,6 +1303,12 @@ def _parseArgs(rawArgs: Optional[List[str]] = None):
         '--union-mount-cache-timeout', type=float, default=60,
         help='Timeout in seconds before stopping to build the union mount cache.')
 
+    advancedGroup.add_argument(
+        '--index-minimum-file-count', type=int, default=1000,
+        help='Create indexes for archives with fewer than this limit of files in memory instead of '
+             'creating a .index.sqlite file. This is currently not applied for TAR files because the file count '
+             'only becomes known after parsing the archive, for which an index is already created.')
+
     # Positional Arguments
 
     positionalGroup.add_argument(
@@ -1637,6 +1643,7 @@ def cli(rawArgs: Optional[List[str]] = None) -> None:
         maxCacheDepth                = args.union_mount_cache_max_depth,
         maxCacheEntries              = args.union_mount_cache_max_entries,
         maxSecondsToCache            = args.union_mount_cache_timeout,
+        indexMinimumFileCount        = args.index_minimum_file_count,
         # fmt: on
     )