Skip to content

Commit

Permalink
Avoid redundant downloading and decompressing across processes
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Oct 8, 2024
1 parent 036f593 commit 32428a2
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 6 deletions.
23 changes: 18 additions & 5 deletions hanlp/utils/io_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,13 @@ def tempdir_human():
return tempdir(now_filename())


def temp_lock(path):
    """Build a file lock in the temp directory keyed by ``path``.

    The lock file is a hidden file under ``tempdir()`` whose name is the MD5
    hex digest of the UTF-8 encoded ``path``, so callers passing the same
    ``path`` string get a lock on the same file.

    Args:
        path: The string identifying the resource to guard.

    Returns:
        A ``filelock.FileLock`` for the derived lock file. Callers are
        expected to acquire it themselves, e.g. ``with temp_lock(path):``.
    """
    from filelock import FileLock
    import hashlib

    # Resolve the directory before hashing, mirroring the order in which the
    # original f-string evaluated its parts.
    lock_dir = tempdir()
    digest = hashlib.md5(path.encode('utf8')).hexdigest()
    return FileLock(f"{lock_dir}/.{digest}.lock")


def hanlp_home_default():
"""Default data directory depending on the platform and environment variables"""
if windows():
Expand Down Expand Up @@ -292,6 +299,7 @@ def get_resource(path: str, save_dir=hanlp_home(), extract=True, prefix=HANLP_UR
The real path to the resource.
"""
_path = path
path = hanlp.pretrained.ALL.get(path, path)
anchor: str = None
compressed = None
Expand Down Expand Up @@ -333,12 +341,17 @@ def get_resource(path: str, save_dir=hanlp_home(), extract=True, prefix=HANLP_UR
# realpath is the path of the resource after extraction
if compressed:
realpath += compressed
if not os.path.isfile(realpath):
path = download(url=path, save_path=realpath, verbose=verbose)
else:
path = realpath
with temp_lock(path):
if not os.path.isfile(realpath):
path = download(url=path, save_path=realpath, verbose=verbose)
else:
path = realpath
if extract and compressed:
path = uncompress(path, verbose=verbose)
with temp_lock(path):
if os.path.isfile(path):
path = uncompress(path, verbose=verbose)
else: # other process must have already decompressed it and deleted it
return get_resource(_path, save_dir, extract, prefix, append_location, verbose)
if anchor:
path = path_join(path, anchor)

Expand Down
2 changes: 1 addition & 1 deletion hanlp/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Author: hankcs
# Date: 2019-12-28 19:26

__version__ = '2.1.0-beta.61'
__version__ = '2.1.0-beta.62'
"""HanLP version"""


Expand Down

0 comments on commit 32428a2

Please sign in to comment.