Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace pickle with safer alternatives #13067

Open
wants to merge 10 commits into
base: 3.6.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/continous-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1284,7 +1284,6 @@ jobs:
with:
args: "💥 New *Rasa Open Source * version `${{ github.ref_name }}` has been released!"


send_slack_notification_for_release_on_failure:
name: Notify Slack & Publish Release Notes
runs-on: ubuntu-22.04
Expand Down
19 changes: 19 additions & 0 deletions changelog/1424.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Replace `pickle` and `joblib` with safer alternatives, e.g. `json`, `safetensors`, and `skops`, for
serializing components.

**Note**: This is a model-breaking change. Please retrain your model.

If you have a custom component that inherits from one of the components listed below and overrides the `persist` or
`load` method, make sure to update your code. Please contact us if you encounter any problems.

Affected components:

- `CountVectorsFeaturizer`
- `LexicalSyntacticFeaturizer`
- `LogisticRegressionClassifier`
- `SklearnIntentClassifier`
- `DIETClassifier`
- `CRFEntityExtractor`
- `TrackerFeaturizer`
- `TEDPolicy`
- `UnexpectedIntentTEDPolicy`
459 changes: 347 additions & 112 deletions poetry.lock

Large diffs are not rendered by default.

9 changes: 5 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ sanic-cors = "~2.0.0"
sanic-jwt = "^1.6.0"
sanic-routing = "^0.7.2"
websockets = ">=10.0,<11.0"
cloudpickle = ">=1.2,<2.3"
aiohttp = ">=3.9.0,<3.10"
questionary = ">=1.5.1,<1.11.0"
prompt-toolkit = "^3.0,<3.0.29"
Expand All @@ -133,10 +132,9 @@ psycopg2-binary = ">=2.8.2,<2.10.0"
python-dateutil = "~2.8"
protobuf = ">=4.23.3,< 4.23.4"
tensorflow_hub = "^0.13.0"
setuptools = ">=65.5.1"
setuptools = "~75.3.0"
ujson = ">=1.35,<6.0"
regex = ">=2020.6,<2022.11"
joblib = ">=0.15.1,<1.3.0"
sentry-sdk = ">=0.17.0,<1.15.0"
aio-pika = ">=6.7.1,<8.2.4"
aiogram = "<2.26"
Expand All @@ -156,6 +154,9 @@ dnspython = "2.3.0"
wheel = ">=0.38.1"
certifi = ">=2023.7.22"
cryptography = ">=41.0.7"
skops = "0.9.0"
safetensors = "~0.4.5"

[[tool.poetry.dependencies.tensorflow-io-gcs-filesystem]]
version = "==0.31"
markers = "sys_platform == 'win32'"
Expand Down Expand Up @@ -285,7 +286,7 @@ version = "~3.2.0"
optional = true

[tool.poetry.dependencies.transformers]
version = ">=4.13.0, <=4.26.0"
version = "~4.36.2"
optional = true

[tool.poetry.dependencies.sentencepiece]
Expand Down
23 changes: 22 additions & 1 deletion rasa/core/featurizers/single_state_featurizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import logging
from typing import List, Optional, Dict, Text, Set, Any

import numpy as np
import scipy.sparse
from typing import List, Optional, Dict, Text, Set, Any

from rasa.core.featurizers.precomputation import MessageContainerForCoreFeaturization
from rasa.nlu.extractors.extractor import EntityTagSpec
Expand Down Expand Up @@ -362,6 +363,26 @@ def encode_all_labels(
for action in domain.action_names_or_texts
]

def to_dict(self) -> Dict[str, Any]:
    """Collect the featurizer's persistable state in a plain dictionary.

    Returns:
        A dictionary holding the action texts, the entity tag specs and the
        default feature states (the latter under the key `feature_states`).
    """
    serialized: Dict[str, Any] = {}
    serialized["action_texts"] = self.action_texts
    serialized["entity_tag_specs"] = self.entity_tag_specs
    serialized["feature_states"] = self._default_feature_states
    return serialized

@classmethod
def create_from_dict(
    cls, data: Dict[str, Any]
) -> Optional["SingleStateFeaturizer"]:
    """Rebuild a featurizer from the dictionary produced by `to_dict`.

    Args:
        data: Dictionary with the keys `action_texts`, `feature_states` and
            `entity_tag_specs`. May be empty or `None`.

    Returns:
        The restored featurizer, or `None` if `data` is empty.

    Raises:
        KeyError: If `data` is non-empty but lacks one of the expected keys.
    """
    if not data:
        return None

    # Instantiate via `cls` so that calling this on a subclass yields an
    # instance of that subclass instead of always the base class.
    featurizer = cls()
    featurizer.action_texts = data["action_texts"]
    featurizer._default_feature_states = data["feature_states"]
    featurizer.entity_tag_specs = data["entity_tag_specs"]
    return featurizer


class IntentTokenizerSingleStateFeaturizer(SingleStateFeaturizer):
"""A SingleStateFeaturizer for use with policies that predict intent labels."""
Expand Down
133 changes: 115 additions & 18 deletions rasa/core/featurizers/tracker_featurizers.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from __future__ import annotations
from pathlib import Path
from collections import defaultdict
from abc import abstractmethod
import jsonpickle
import logging

from tqdm import tqdm
import logging
from abc import abstractmethod
from collections import defaultdict
from pathlib import Path
from typing import (
Tuple,
List,
Expand All @@ -18,25 +16,30 @@
Set,
DefaultDict,
cast,
Type,
Callable,
ClassVar,
)

import numpy as np
from tqdm import tqdm

from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer
from rasa.core.featurizers.precomputation import MessageContainerForCoreFeaturization
from rasa.core.exceptions import InvalidTrackerFeaturizerUsageError
import rasa.shared.core.trackers
import rasa.shared.utils.io
from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES, ACTION_NAME
from rasa.shared.nlu.training_data.features import Features
from rasa.shared.core.trackers import DialogueStateTracker
from rasa.shared.core.domain import State, Domain
from rasa.shared.core.events import Event, ActionExecuted, UserUttered
from rasa.core.exceptions import InvalidTrackerFeaturizerUsageError
from rasa.core.featurizers.precomputation import MessageContainerForCoreFeaturization
from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer
from rasa.shared.core.constants import (
USER,
ACTION_UNLIKELY_INTENT_NAME,
PREVIOUS_ACTION,
)
from rasa.shared.core.domain import State, Domain
from rasa.shared.core.events import Event, ActionExecuted, UserUttered
from rasa.shared.core.trackers import DialogueStateTracker
from rasa.shared.exceptions import RasaException
from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES, ACTION_NAME
from rasa.shared.nlu.training_data.features import Features
from rasa.utils.tensorflow.constants import LABEL_PAD_ID
from rasa.utils.tensorflow.model_data import ragged_array_to_ndarray

Expand Down Expand Up @@ -64,6 +67,10 @@ def __str__(self) -> Text:
class TrackerFeaturizer:
"""Base class for actual tracker featurizers."""

# Class registry to store all subclasses
_registry: ClassVar[Dict[str, Type["TrackerFeaturizer"]]] = {}
_featurizer_type: str = "TrackerFeaturizer"

def __init__(
self, state_featurizer: Optional[SingleStateFeaturizer] = None
) -> None:
Expand All @@ -74,6 +81,36 @@ def __init__(
"""
self.state_featurizer = state_featurizer

@classmethod
def register(cls, featurizer_type: str) -> Callable:
    """Build a class decorator that adds a featurizer subclass to the registry.

    Args:
        featurizer_type: Identifier under which the decorated class is stored
            in `_registry`.

    Returns:
        A decorator that registers the class it is applied to, tags it with
        `_featurizer_type` and returns it.
    """

    def _register_subclass(
        featurizer_cls: Type["TrackerFeaturizer"],
    ) -> Type["TrackerFeaturizer"]:
        cls._registry[featurizer_type] = featurizer_cls
        # Remember the identifier on the class itself; `to_dict` reads it
        # back when the featurizer is serialized.
        featurizer_cls._featurizer_type = featurizer_type
        return featurizer_cls

    return _register_subclass

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "TrackerFeaturizer":
    """Create the featurizer described by a serialized dictionary.

    Args:
        data: Serialized featurizer; its `type` entry selects the registered
            subclass. The dictionary is left unmodified.

    Returns:
        Whatever the selected subclass' `create_from_dict` returns.

    Raises:
        KeyError: If `data` has no `type` entry.
        ValueError: If the `type` is not in the registry.
    """
    featurizer_type = data["type"]

    if featurizer_type not in cls._registry:
        raise ValueError(f"Unknown featurizer type: {featurizer_type}")

    # Hand the subclass a copy without the `type` entry instead of popping it
    # from the caller's dictionary.
    payload = {key: value for key, value in data.items() if key != "type"}
    return cls._registry[featurizer_type].create_from_dict(payload)

@classmethod
@abstractmethod
def create_from_dict(cls, data: Dict[str, Any]) -> "TrackerFeaturizer":
    """Build a featurizer instance from its dictionary representation.

    Concrete featurizers are expected to override this; the base
    implementation has no body and returns `None`.

    Args:
        data: Serialized featurizer attributes.
    """

@staticmethod
def _create_states(
tracker: DialogueStateTracker,
Expand Down Expand Up @@ -465,9 +502,7 @@ def persist(self, path: Union[Text, Path]) -> None:
self.state_featurizer.entity_tag_specs = []

# noinspection PyTypeChecker
rasa.shared.utils.io.write_text_file(
str(jsonpickle.encode(self)), featurizer_file
)
rasa.shared.utils.io.dump_obj_as_json_to_file(featurizer_file, self.to_dict())

@staticmethod
def load(path: Union[Text, Path]) -> Optional[TrackerFeaturizer]:
Expand All @@ -481,7 +516,17 @@ def load(path: Union[Text, Path]) -> Optional[TrackerFeaturizer]:
"""
featurizer_file = Path(path) / FEATURIZER_FILE
if featurizer_file.is_file():
return jsonpickle.decode(rasa.shared.utils.io.read_file(featurizer_file))
data = rasa.shared.utils.io.read_json_file(featurizer_file)

if "type" not in data:
logger.error(
f"Couldn't load featurizer for policy. "
f"File '{featurizer_file}' does not contain all "
f"necessary information. 'type' is missing."
)
return None

return TrackerFeaturizer.from_dict(data)

logger.error(
f"Couldn't load featurizer for policy. "
Expand All @@ -508,7 +553,16 @@ def _remove_action_unlikely_intent_from_events(events: List[Event]) -> List[Even
)
]

def to_dict(self) -> Dict[str, Any]:
    """Serialize the featurizer into a dictionary.

    Returns:
        A dictionary with the class' `_featurizer_type` under `type` and the
        serialized state featurizer under `state_featurizer` (`None` when
        the state featurizer is unset).
    """
    featurizer_type = self.__class__._featurizer_type

    if self.state_featurizer:
        state_featurizer_data = self.state_featurizer.to_dict()
    else:
        state_featurizer_data = None

    return {
        "type": featurizer_type,
        "state_featurizer": state_featurizer_data,
    }


@TrackerFeaturizer.register("FullDialogueTrackerFeaturizer")
class FullDialogueTrackerFeaturizer(TrackerFeaturizer):
"""Creates full dialogue training data for time distributed architectures.

Expand Down Expand Up @@ -646,7 +700,20 @@ def prediction_states(

return trackers_as_states

def to_dict(self) -> Dict[str, Any]:
    """Serialize the featurizer; nothing is added to the parent's output."""
    serialized = super().to_dict()
    return serialized

@classmethod
def create_from_dict(cls, data: Dict[str, Any]) -> "FullDialogueTrackerFeaturizer":
    """Restore a featurizer from its dictionary representation.

    Args:
        data: Serialized featurizer; only the `state_featurizer` entry is
            read.

    Returns:
        A new instance wrapping the restored state featurizer.
    """
    return cls(SingleStateFeaturizer.create_from_dict(data["state_featurizer"]))


@TrackerFeaturizer.register("MaxHistoryTrackerFeaturizer")
class MaxHistoryTrackerFeaturizer(TrackerFeaturizer):
"""Truncates the tracker history into `max_history` long sequences.

Expand Down Expand Up @@ -887,7 +954,25 @@ def prediction_states(

return trackers_as_states

def to_dict(self) -> Dict[str, Any]:
    """Serialize the featurizer, including its history settings.

    Returns:
        The parent's dictionary extended with `remove_duplicates` and
        `max_history`.
    """
    return {
        **super().to_dict(),
        "remove_duplicates": self.remove_duplicates,
        "max_history": self.max_history,
    }

@classmethod
def create_from_dict(cls, data: Dict[str, Any]) -> "MaxHistoryTrackerFeaturizer":
    """Restore a featurizer from its dictionary representation.

    Args:
        data: Serialized featurizer with the entries `state_featurizer`,
            `max_history` and `remove_duplicates`.

    Returns:
        A new instance built from the restored state featurizer and the
        stored history settings.
    """
    featurizer_data = data["state_featurizer"]
    state_featurizer = SingleStateFeaturizer.create_from_dict(featurizer_data)
    history_length = data["max_history"]
    drop_duplicates = data["remove_duplicates"]
    return cls(state_featurizer, history_length, drop_duplicates)


@TrackerFeaturizer.register("IntentMaxHistoryTrackerFeaturizer")
class IntentMaxHistoryTrackerFeaturizer(MaxHistoryTrackerFeaturizer):
"""Truncates the tracker history into `max_history` long sequences.

Expand Down Expand Up @@ -1166,6 +1251,18 @@ def prediction_states(

return trackers_as_states

def to_dict(self) -> Dict[str, Any]:
    """Serialize the featurizer; nothing is added to the parent's output."""
    serialized = super().to_dict()
    return serialized

@classmethod
def create_from_dict(
    cls, data: Dict[str, Any]
) -> "IntentMaxHistoryTrackerFeaturizer":
    """Restore a featurizer from its dictionary representation.

    Args:
        data: Serialized featurizer with the entries `state_featurizer`,
            `max_history` and `remove_duplicates`.

    Returns:
        A new instance built from the restored state featurizer and the
        stored history settings.
    """
    featurizer_data = data["state_featurizer"]
    state_featurizer = SingleStateFeaturizer.create_from_dict(featurizer_data)
    history_length = data["max_history"]
    drop_duplicates = data["remove_duplicates"]
    return cls(state_featurizer, history_length, drop_duplicates)


def _is_prev_action_unlikely_intent_in_state(state: State) -> bool:
prev_action_name = state.get(PREVIOUS_ACTION, {}).get(ACTION_NAME)
Expand Down
Loading
Loading