Fix allocation of non-transient strings in StringStore #13713

Merged · 6 commits · Dec 11, 2024
.github/workflows/tests.yml: 2 changes (1 addition, 1 deletion)

@@ -58,7 +58,7 @@ jobs:
       fail-fast: true
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python_version: ["3.9", "3.11", "3.12"]
+        python_version: ["3.9", "3.12"]
 
     runs-on: ${{ matrix.os }}
setup.cfg: 7 changes (4 additions, 3 deletions)

@@ -21,6 +21,7 @@ classifiers =
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
     Programming Language :: Python :: 3.12
+    Programming Language :: Python :: 3.13
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases
@@ -29,13 +30,13 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.9
+python_requires = >=3.9,<3.13
 # NOTE: This section is superseded by pyproject.toml and will be removed in
 # spaCy v4
 setup_requires =
     cython>=0.25,<3.0
-    numpy>=2.0.0,<2.1.0; python_version < "3.9"
-    numpy>=2.0.0,<2.1.0; python_version >= "3.9"
+    numpy>=2.0.0,<3.0.0; python_version < "3.9"
+    numpy>=2.0.0,<3.0.0; python_version >= "3.9"
     # We also need our Cython packages here to compile against
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
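As a quick illustration of what the tightened python_requires pin means, here is a sketch (not part of this diff) of roughly the check pip performs at install time, written with the packaging library:

import sys

from packaging.specifiers import SpecifierSet

# The pin from setup.cfg: spaCy 3.8.x installs on Python 3.9-3.12 only.
python_requires = SpecifierSet(">=3.9,<3.13")

current = "{}.{}".format(*sys.version_info[:2])
print(current in python_requires)  # True on 3.9-3.12, False on 3.13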
spacy/about.py: 2 changes (1 addition, 1 deletion)

@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.8.2"
+__version__ = "3.8.3"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/morphology.pyx: 10 changes (7 additions, 3 deletions)

@@ -57,16 +57,20 @@ cdef class Morphology:
         field_feature_pairs = []
         for field in sorted(string_features):
             values = string_features[field]
+            self.strings.add(field, allow_transient=False),
+            field_id = self.strings[field]
             for value in values.split(self.VALUE_SEP):
+                field_sep_value = field + self.FIELD_SEP + value
+                self.strings.add(field_sep_value, allow_transient=False),
                 field_feature_pairs.append((
-                    self.strings.add(field),
-                    self.strings.add(field + self.FIELD_SEP + value),
+                    field_id,
+                    self.strings[field_sep_value]
                 ))
         cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
         # the hash key for the tag is either the hash of the normalized UFEATS
         # string or the hash of an empty placeholder
         norm_feats_string = self.normalize_features(features)
-        tag.key = self.strings.add(norm_feats_string)
+        tag.key = self.strings.add(norm_feats_string, allow_transient=False)
         self.insert(tag)
         return tag.key
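A minimal sketch of why this matters, assuming spaCy 3.8's Language.memory_zone API (the example itself is not part of the diff): morphology tags are cached and shared across Docs, so the feature strings they reference must outlive any transient memory zone.

import spacy

nlp = spacy.blank("en")
with nlp.memory_zone():
    doc = nlp("cats")
    # Interns "Number", "Number=Plur", and the normalized UFEATS string;
    # with this fix they are added with allow_transient=False.
    doc[0].set_morph("Number=Plur")

# The cached morph tag still references these strings after the zone closes,
# so they must have been allocated from the permanent pool.
assert "Number=Plur" in nlp.vocab.strings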
spacy/strings.pyx: 7 changes (6 additions, 1 deletion)

@@ -222,6 +222,8 @@ cdef class StringStore:
             internally should not.
         RETURNS (uint64): The string's hash value.
         """
+        if not string:
+            return 0
         if allow_transient is None:
             allow_transient = self.mem is not self._non_temp_mem
         cdef hash_t str_hash
@@ -383,7 +385,10 @@ cdef class StringStore:
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
             return value
-        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
+        if allow_transient:
+            value = _allocate(self.mem, <unsigned char*>utf8_string, length)
+        else:
+            value = _allocate(self._non_temp_mem, <unsigned char*>utf8_string, length)
         self._map.set(key, value)
         if allow_transient and self.mem is not self._non_temp_mem:
             self._transient_keys.push_back(key)
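Taken together, the two hunks above change StringStore.add so that empty strings are a no-op and non-transient strings are allocated from the permanent pool. A minimal sketch of the resulting behavior, assuming spaCy 3.8's memory-zone API (the example itself is not part of the diff):

import spacy

nlp = spacy.blank("en")
store = nlp.vocab.strings

# First hunk: adding an empty string returns the 0 hash without allocating.
assert store.add("") == 0

# Second hunk: inside a memory zone, self.mem points at a temporary pool, so
# transient strings are freed when the zone closes, while strings added with
# allow_transient=False go to the permanent pool and survive.
with nlp.memory_zone():
    store.add("scratch-string")                         # transient by default
    store.add("durable-string", allow_transient=False)  # forced permanent

assert "scratch-string" not in store
assert "durable-string" in store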
spacy/tests/training/test_pretraining.py.disabled: 89 changes (45 additions, 44 deletions)

@@ -264,50 +264,51 @@ def test_pretraining_tagger():
         pretrain(filled, tmp_dir)
 
 
-def test_pretraining_training():
-    """Test that training can use a pretrained Tok2Vec model"""
-    config = Config().from_str(pretrain_string_internal)
-    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
-    filled = nlp.config
-    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
-    filled = pretrain_config.merge(filled)
-    train_config = util.load_config(DEFAULT_CONFIG_PATH)
-    filled = train_config.merge(filled)
-    with make_tempdir() as tmp_dir:
-        pretrain_dir = tmp_dir / "pretrain"
-        pretrain_dir.mkdir()
-        file_path = write_sample_jsonl(pretrain_dir)
-        filled["paths"]["raw_text"] = file_path
-        filled["pretraining"]["component"] = "tagger"
-        filled["pretraining"]["layer"] = "tok2vec"
-        train_dir = tmp_dir / "train"
-        train_dir.mkdir()
-        train_path, dev_path = write_sample_training(train_dir)
-        filled["paths"]["train"] = train_path
-        filled["paths"]["dev"] = dev_path
-        filled = filled.interpolate()
-        P = filled["pretraining"]
-        nlp_base = init_nlp(filled)
-        model_base = (
-            nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
-        )
-        embed_base = None
-        for node in model_base.walk():
-            if node.name == "hashembed":
-                embed_base = node
-        pretrain(filled, pretrain_dir)
-        pretrained_model = Path(pretrain_dir / "model3.bin")
-        assert pretrained_model.exists()
-        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
-        nlp = init_nlp(filled)
-        model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
-        embed = None
-        for node in model.walk():
-            if node.name == "hashembed":
-                embed = node
-        # ensure that the tok2vec weights are actually changed by the pretraining
-        assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
-        train(nlp, train_dir)
+# Try to debug segfault on windows
+#def test_pretraining_training():
+#    """Test that training can use a pretrained Tok2Vec model"""
+#    config = Config().from_str(pretrain_string_internal)
+#    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+#    filled = nlp.config
+#    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+#    filled = pretrain_config.merge(filled)
+#    train_config = util.load_config(DEFAULT_CONFIG_PATH)
+#    filled = train_config.merge(filled)
+#    with make_tempdir() as tmp_dir:
+#        pretrain_dir = tmp_dir / "pretrain"
+#        pretrain_dir.mkdir()
+#        file_path = write_sample_jsonl(pretrain_dir)
+#        filled["paths"]["raw_text"] = file_path
+#        filled["pretraining"]["component"] = "tagger"
+#        filled["pretraining"]["layer"] = "tok2vec"
+#        train_dir = tmp_dir / "train"
+#        train_dir.mkdir()
+#        train_path, dev_path = write_sample_training(train_dir)
+#        filled["paths"]["train"] = train_path
+#        filled["paths"]["dev"] = dev_path
+#        filled = filled.interpolate()
+#        P = filled["pretraining"]
+#        nlp_base = init_nlp(filled)
+#        model_base = (
+#            nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+#        )
+#        embed_base = None
+#        for node in model_base.walk():
+#            if node.name == "hashembed":
+#                embed_base = node
+#        pretrain(filled, pretrain_dir)
+#        pretrained_model = Path(pretrain_dir / "model3.bin")
+#        assert pretrained_model.exists()
+#        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
+#        nlp = init_nlp(filled)
+#        model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+#        embed = None
+#        for node in model.walk():
+#            if node.name == "hashembed":
+#                embed = node
+#        # ensure that the tok2vec weights are actually changed by the pretraining
+#        assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
+#        train(nlp, train_dir)
 
 
 def write_sample_jsonl(tmp_dir):