diff --git a/README.md b/README.md index a081977f46..25c28f287f 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@
[![PyPI](https://img.shields.io/pypi/v/stream_topic)](https://pypi.org/project/stream_topic)
@@ -24,14 +27,6 @@

We present STREAM, a Simplified Topic Retrieval, Exploration, and Analysis Module for User-Friendly and Interactive Topic Modeling and Visualization. Our paper can be found here.

-  First GIF
-  Second GIF

Table of Contents

@@ -164,6 +159,10 @@ STREAM offers a variety of neural as well as non-neural topic models and we are
   NeuralLDA
   Autoencoding Variational Inference For Topic Models
+
+  NSTM
+  Neural Topic Model via Optimal Transport
+
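A quick usage sketch of the newly added model may help reviewers. The `fetch_dataset` and `preprocess` helpers and their arguments are assumptions based on the rest of this repository rather than verified API; `BBC_News` is one of the corpora already referenced in `topic_extraction.py`.

```python
from stream_topic.models import NSTM
from stream_topic.utils.dataset import TMDataset

# Sketch only: fit the new NSTM model end to end.
dataset = TMDataset()
dataset.fetch_dataset("BBC_News")       # corpus name taken from topic_extraction.py
dataset.preprocess(model_type="NSTM")   # assumed helper applying the "NSTM" defaults from default_preprocessing_steps.json

model = NSTM()                          # embed_size=128, encoder_dim=256 by default
model.fit(dataset, n_topics=20)

# After fitting, topic_dict maps each topic index to its top (word, probability) pairs.
print(model.topic_dict[0][:10])
```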
diff --git a/assets/stream-topic-gif.gif b/assets/stream-topic-gif.gif new file mode 100644 index 0000000000..ce941c2685 Binary files /dev/null and b/assets/stream-topic-gif.gif differ diff --git a/docs/api/models/models.rst b/docs/api/models/models.rst index 530848ce37..ec39a9fbf3 100644 --- a/docs/api/models/models.rst +++ b/docs/api/models/models.rst @@ -44,3 +44,6 @@ Models .. autoclass:: stream_topic.models.CBC :members: + +.. autoclass:: stream_topic.models.NSTM + :members: diff --git a/docs/conf.py b/docs/conf.py index 0d31cfa18c..ba90d793ba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -62,7 +62,6 @@ "plotly", "matplotlib", "gensim", - "octis", "nltk", "langdetect", "loguru", diff --git a/docs/images/logos/gif1.gif b/docs/images/logos/gif1.gif new file mode 100644 index 0000000000..4de8c9d4a0 Binary files /dev/null and b/docs/images/logos/gif1.gif differ diff --git a/docs/images/logos/gif2.gif b/docs/images/logos/gif2.gif new file mode 100644 index 0000000000..618f5be6d7 Binary files /dev/null and b/docs/images/logos/gif2.gif differ diff --git a/docs/notebooks/examples.ipynb b/docs/notebooks/examples.ipynb index 4b22e8a42b..33043c9fee 100644 --- a/docs/notebooks/examples.ipynb +++ b/docs/notebooks/examples.ipynb @@ -273,9 +273,9 @@ ], "metadata": { "kernelspec": { - "display_name": "db", + "display_name": "Python (stream_topic_venv)", "language": "python", - "name": "python3" + "name": "stream_topic_venv" }, "language_info": { "codemirror_mode": { diff --git a/docs/notebooks/quickstart.ipynb b/docs/notebooks/quickstart.ipynb index 4ea1fb54eb..63a133536a 100644 --- a/docs/notebooks/quickstart.ipynb +++ b/docs/notebooks/quickstart.ipynb @@ -269,9 +269,9 @@ ], "metadata": { "kernelspec": { - "display_name": "db", + "display_name": "Python (stream_topic_venv)", "language": "python", - "name": "python3" + "name": "stream_topic_venv" }, "language_info": { "codemirror_mode": { diff --git a/requirements.txt b/requirements.txt index 1cc4a06813..62cebe311c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,6 @@ torch==2.4.0 transformers==4.40.2 setfit==1.0.3 gensim==4.2.0 -octis==1.13.1 umap-learn==0.5.6 wordcloud==1.9.3 diff --git a/stream_topic/__version__.py b/stream_topic/__version__.py index 277ccdab5b..a719e3726e 100644 --- a/stream_topic/__version__.py +++ b/stream_topic/__version__.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.1.8" +__version__ = "0.1.9" diff --git a/stream_topic/models/CEDC.py b/stream_topic/models/CEDC.py index 53f2319b1e..bb4789f1e4 100644 --- a/stream_topic/models/CEDC.py +++ b/stream_topic/models/CEDC.py @@ -191,7 +191,7 @@ def fit( only_nouns: bool = False, clean: bool = False, clean_threshold: float = 0.85, - expansion_corpus: str = "octis", + expansion_corpus: str = "brown", n_words: int = 20, ): """ @@ -210,7 +210,7 @@ def fit( clean_threshold : float, optional Threshold for cleaning topics based on similarity (default is 0.85). expansion_corpus : str, optional - Corpus for expanding topics (default is 'octis'). + Corpus for expanding topics (default is 'brown'). n_words : int, optional Number of top words to include in each topic (default is 20). 
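Since `octis` is dropped as a dependency, `CEDC.fit` now expands topics against NLTK corpora by default, with `"stream"` (the bundled STREAM datasets) and `"words"` remaining as alternatives. A hedged sketch of the new call; the constructor arguments and the leading `dataset`/`n_topics` parameters are assumed from context and are not shown in this diff.

```python
from stream_topic.models import CEDC

# Sketch only: the expansion corpus now defaults to NLTK's Brown corpus
# instead of the removed octis datasets.
model = CEDC()                     # constructor arguments omitted; not shown in this diff
model.fit(
    dataset,                       # a preprocessed TMDataset, as in the NSTM sketch above
    n_topics=20,                   # assumed parameter name
    expansion_corpus="brown",      # new default (was "octis"); "stream" or "words" also accepted
    n_words=20,
)
```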
diff --git a/stream_topic/models/__init__.py b/stream_topic/models/__init__.py index cc3f27e581..28059ebf86 100644 --- a/stream_topic/models/__init__.py +++ b/stream_topic/models/__init__.py @@ -13,6 +13,7 @@ from .ctmneg import CTMNeg from .tntm import TNTM from .nmf import NMFTM +from .nstm import NSTM __all__ = [ "BERTopicTM", @@ -30,4 +31,5 @@ "CTMNeg", "TNTM", "NMFTM", + "NSTM", ] diff --git a/stream_topic/models/neural_base_models/nstm_base.py b/stream_topic/models/neural_base_models/nstm_base.py new file mode 100644 index 0000000000..416dd6912c --- /dev/null +++ b/stream_topic/models/neural_base_models/nstm_base.py @@ -0,0 +1,210 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from ...utils.sinkhorn_loss import sinkhorn_loss + + +class NSTMBase(nn.Module): + """ + Neural Topic Model via Optimal Transport (NSTM). Based on the paper presented at ICLR 2021 by + He Zhao, Dinh Phung, Viet Huynh, Trung Le, and Wray Buntine. + + This model learns topic embeddings using an encoder and leverages optimal transport + via the Sinkhorn loss for topic and word distributions. + + Parameters + ---------- + dataset : Dataset + A dataset object containing the bag-of-words (BoW) matrix used for training. + n_topics : int, optional + The number of topics to be learned by the model, by default 50. + encoder_dim : int, optional + The dimension of the encoder's hidden layer, by default 128. + dropout : float, optional + The dropout rate for the encoder, by default 0.1. + pretrained_WE : numpy.ndarray, optional + Pretrained word embeddings as a numpy array. If None, the embeddings will be randomly initialized, by default None. + train_WE : bool, optional + Whether to fine-tune (train) the word embeddings during model training, by default True. + encoder_activation : callable, optional + The activation function for the encoder, by default nn.ReLU(). + embed_size : int, optional + The size of the word embedding vectors, by default 256. + recon_loss_weight : float, optional + The weight given to the reconstruction loss, by default 0.07. + sinkhorn_alpha : float, optional + The scaling factor for the Sinkhorn loss, by default 20. + + Attributes + ---------- + recon_loss_weight : float + The weight of the reconstruction loss in the final loss computation. + sinkhorn_alpha : float + The scaling factor applied to the Sinkhorn loss for optimal transport. + encoder : nn.Sequential + The neural network that encodes bag-of-words input into topic distribution. + word_embeddings : nn.Parameter + The word embeddings matrix, either pretrained or initialized randomly. + topic_embeddings : nn.Parameter + The matrix of learned topic embeddings. + + Methods + ------- + get_beta(): + Computes the normalized topic-word distribution matrix. + get_theta(x): + Computes the topic distribution (theta) for the input BoW vector. + forward(x): + Executes the forward pass, returning the topic distribution, topic-word distribution, and the transport cost matrix. + compute_loss(x): + Computes the overall loss, combining reconstruction and Sinkhorn losses. + """ + + def __init__( + self, + dataset, + n_topics: int = 50, + encoder_dim: int = 128, + dropout: float = 0.1, + pretrained_WE=None, + train_WE: bool = True, + encoder_activation: callable = nn.ReLU(), + embed_size: int = 256, + recon_loss_weight=0.07, + sinkhorn_alpha=20, + ): + """ + Initializes the Neural Topic Model. + + Parameters + ---------- + dataset : Dataset + A dataset object containing the BoW matrix as `dataset.bow`. 
+ n_topics : int, optional + Number of topics to be learned, by default 50. + encoder_dim : int, optional + Hidden dimension size for the encoder, by default 128. + dropout : float, optional + Dropout rate for regularization in the encoder, by default 0.1. + pretrained_WE : np.ndarray, optional + Pretrained word embeddings (optional), by default None. + train_WE : bool, optional + Whether the word embeddings are trainable, by default True. + encoder_activation : callable, optional + Activation function for the encoder layers, by default nn.ReLU(). + embed_size : int, optional + Size of the word embeddings, by default 256. + recon_loss_weight : float, optional + Weight of the reconstruction loss, by default 0.07. + sinkhorn_alpha : float, optional + Scaling factor for the Sinkhorn loss, by default 20. + """ + super().__init__() + + vocab_size = dataset.bow.shape[1] + + self.recon_loss_weight = recon_loss_weight + self.sinkhorn_alpha = sinkhorn_alpha + + self.encoder = nn.Sequential( + nn.Linear(vocab_size, encoder_dim), + encoder_activation, + nn.Dropout(dropout), + nn.Linear(encoder_dim, n_topics), + nn.BatchNorm1d(n_topics), + ) + + if pretrained_WE is not None: + self.word_embeddings = nn.Parameter(torch.from_numpy(pretrained_WE).float()) + else: + self.word_embeddings = nn.Parameter( + torch.randn((vocab_size, embed_size)) * 1e-03 + ) + + self.word_embeddings.requires_grad = train_WE + + self.topic_embeddings = nn.Parameter( + torch.randn((n_topics, self.word_embeddings.shape[1])) * 1e-03 + ) + + def get_beta(self): + """ + Computes the normalized topic-word distribution matrix (beta) by taking the dot product + of the normalized topic embeddings and word embeddings. + + Returns + ------- + torch.Tensor + The topic-word distribution matrix of shape (n_topics, vocab_size). + """ + word_embedding_norm = F.normalize(self.word_embeddings) + topic_embedding_norm = F.normalize(self.topic_embeddings) + beta = torch.matmul(topic_embedding_norm, word_embedding_norm.T) + return beta + + def get_theta(self, x): + """ + Computes the document-topic distribution (theta) for a given bag-of-words input using the encoder. + + Parameters + ---------- + x : torch.Tensor + Input tensor representing the bag-of-words (BoW) data of shape (batch_size, vocab_size). + + Returns + ------- + torch.Tensor + The document-topic distribution of shape (batch_size, n_topics). + """ + theta = self.encoder(x) + theta = F.softmax(theta, dim=-1) + return theta + + def forward(self, x): + """ + Performs the forward pass of the model, which computes the document-topic distribution (theta), + the topic-word distribution (beta), and the optimal transport distance matrix (M). + + Parameters + ---------- + x : dict + A dictionary containing the input bag-of-words tensor under the key "bow". + + Returns + ------- + tuple + A tuple containing: + - theta (torch.Tensor): Document-topic distribution of shape (batch_size, n_topics). + - beta (torch.Tensor): Topic-word distribution of shape (n_topics, vocab_size). + - M (torch.Tensor): Distance matrix of shape (n_topics, vocab_size). + """ + x = x["bow"] + theta = self.get_theta(x) + beta = self.get_beta() + M = 1 - beta + return theta, beta, M + + def compute_loss(self, x): + """ + Computes the total loss for a given input by combining the reconstruction loss and the Sinkhorn loss. + + Parameters + ---------- + x : dict + A dictionary containing the input bag-of-words tensor under the key "bow". + + Returns + ------- + torch.Tensor + The total loss, averaged over the batch. 
+ """ + theta, beta, M = self.forward(x) + sh_loss = sinkhorn_loss( + M, theta.T, F.softmax(x["bow"], dim=-1).T, lambda_sh=self.sinkhorn_alpha + ) + recon = F.softmax(torch.matmul(theta, beta), dim=-1) + recon_loss = -(x["bow"] * recon.log()).sum(axis=1) + + loss = self.recon_loss_weight * recon_loss + sh_loss + loss = loss.mean() + return loss diff --git a/stream_topic/models/nstm.py b/stream_topic/models/nstm.py new file mode 100644 index 0000000000..0b527e7592 --- /dev/null +++ b/stream_topic/models/nstm.py @@ -0,0 +1,534 @@ +from datetime import datetime + +import lightning as pl +import numpy as np +import torch +from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, ModelSummary +from loguru import logger +import torch.nn as nn +from ..commons.check_steps import check_dataset_steps +from ..utils.datamodule import TMDataModule +from ..utils.dataset import TMDataset +from .abstract_helper_models.base import BaseModel, TrainingStatus +from .abstract_helper_models.neural_basemodel import NeuralBaseModel +from .neural_base_models.nstm_base import NSTMBase +from optuna.integration import PyTorchLightningPruningCallback + +time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") +MODEL_NAME = "NSTM" +# logger.add(f"{MODEL_NAME}_{time}.log", backtrace=True, diagnose=True) + + +class NSTM(BaseModel): + """ + Neural Topic Model via Optimal Transport (NSTM). Based on the paper presented at ICLR 2021 by + He Zhao, Dinh Phung, Viet Huynh, Trung Le, and Wray Buntine. + + This class initializes and configures the NSTM model with the specified + hyperparameters and dataset. It inherits from the `BaseModel` class. + + Parameters + ---------- + embed_size : int, optional + Size of the embedding layer, by default 128. + encoder_dim : int, optional + Dimensionality of the encoder layer, by default 256. + dropout : float, optional + Dropout rate for the layers, by default 0.1. + pretrained_WE : optional + Pretrained word embeddings, by default None. + train_WE : bool, optional + Whether to train word embeddings, by default True. + encoder_activation : callable, optional + Activation function for the encoder, by default `nn.ReLU()`. + batch_size : int, optional + Batch size for training, by default 64. + val_size : float, optional + Proportion of the dataset to use for validation, by default 0.2. + shuffle : bool, optional + Whether to shuffle the dataset before splitting, by default True. + random_state : int, optional + Random seed for shuffling and splitting the dataset, by default 42. + **kwargs : dict + Additional keyword arguments to pass to the parent class constructor. + + Attributes + ---------- + optimize : bool + Flag indicating whether to optimize the model, by default False. + n_topics : int or None + Number of topics in the model, by default None. + _status : TrainingStatus + Current training status of the model, by default `TrainingStatus.NOT_STARTED`. + hparams : dict + Hyperparameters for the data module, including batch size, validation size, + shuffling, and random state. 
+ + Examples + -------- + >>> NSTM = NSTM(embed_size=100, encoder_dim=200, dropout=0.2, batch_size=32) + >>> print(NSTM.hparams) + {'datamodule_args': {'batch_size': 32, 'val_size': 0.2, 'shuffle': True, 'random_state': 42, + 'embeddings': False, 'bow': True, 'tf_idf': False, 'word_embeddings': False, 'min_df': 5}} + """ + + def __init__( + self, + embed_size: int = 128, + encoder_dim: int = 256, + dropout: float = 0.1, + pretrained_WE=None, + train_WE: bool = True, + encoder_activation: callable = nn.ReLU(), + batch_size=64, + val_size=0.2, + shuffle=True, + random_state=42, + **kwargs, + ): + """ + Initialize the NSTM model. + + Parameters + ---------- + embed_size : int, optional + Size of the embedding layer, by default 128. + encoder_dim : int, optional + Dimensionality of the encoder layer, by default 256. + dropout : float, optional + Dropout rate for the layers, by default 0.1. + pretrained_WE : optional + Pretrained word embeddings, by default None. + train_WE : bool, optional + Whether to train word embeddings, by default True. + encoder_activation : callable, optional + Activation function for the encoder, by default `nn.ReLU()`. + batch_size : int, optional + Batch size for training, by default 64. + val_size : float, optional + Proportion of the dataset to use for validation, by default 0.2. + shuffle : bool, optional + Whether to shuffle the dataset before splitting, by default True. + random_state : int, optional + Random seed for shuffling and splitting the dataset, by default 42. + **kwargs : dict + Additional keyword arguments to pass to the parent class constructor. + """ + + super().__init__( + use_pretrained_embeddings=False, + dropout=dropout, + embed_size=embed_size, + encoder_dim=encoder_dim, + pretrained_WE=pretrained_WE, + train_WE=train_WE, + encoder_activation=encoder_activation, + ) + self.save_hyperparameters( + ignore=[ + "random_state", + ] + ) + + self.hparams["datamodule_args"] = { + "batch_size": batch_size, + "val_size": val_size, + "shuffle": shuffle, + "random_state": random_state, + "embeddings": False, + "bow": True, + "tf_idf": False, + "word_embeddings": False, + "min_df": kwargs.get("min_df", 5), + } + + self.optimize = False + self.n_topics = None + self._status = TrainingStatus.NOT_STARTED + + def get_info(self): + """ + Get information about the model. + + Returns + ------- + dict + Dictionary containing model information including model name, + number of topics, embedding model name, UMAP arguments, + K-Means arguments, and training status. + """ + info = { + "model_name": MODEL_NAME, + "num_topics": self.n_topics, + "trained": self._status.name, + } + return info + + def _initialize_model(self): + """ + Initialize the neural base model. + + This method initializes the neural base model (`NeuralBaseModel`) with the given + hyperparameters and dataset. It filters out certain hyperparameters that are + not required by the model. + + Parameters + ---------- + self : object + The instance of the class that this method is a part of. This object should have + attributes `dataset` and `hparams`. + + Attributes + ---------- + model : NeuralBaseModel + The initialized neural base model. 
+ """ + + self.model = NeuralBaseModel( + model_class=NSTMBase, + dataset=self.dataset, + **{ + k: v + for k, v in self.hparams.items() + if k not in ["datamodule_args", "max_epochs"] + }, + ) + + def _initialize_trainer( + self, + max_epochs, + monitor, + patience, + mode, + checkpoint_path, + trial=None, + **trainer_kwargs, + ): + """ + Initialize the PyTorch Lightning trainer. + + Parameters + ---------- + max_epochs : int + Maximum number of epochs for training. + monitor : str + Metric to monitor for early stopping and checkpointing. + patience : int + Patience for early stopping. + mode : str + Mode for the monitored metric (min or max). + checkpoint_path : str + Path to save model checkpoints. + **trainer_kwargs : dict + Additional keyword arguments for the trainer. + """ + + logger.info(f"--- Initializing Trainer for {MODEL_NAME} ---") + early_stop_callback = EarlyStopping( + monitor=monitor, min_delta=0.00, patience=patience, verbose=False, mode=mode + ) + + checkpoint_callback = ModelCheckpoint( + monitor="val_loss", + mode="min", + save_top_k=1, + dirpath=checkpoint_path, # Specify the directory to save checkpoints + filename="best_model", + ) + + model_callbacks = [ + early_stop_callback, + checkpoint_callback, + ModelSummary(max_depth=2), + ] + + if self.optimize: + model_callbacks.append( + PyTorchLightningPruningCallback(trial, monitor="val_loss") + ) + + # Initialize the trainer + self.trainer = pl.Trainer( + max_epochs=max_epochs, + callbacks=model_callbacks, + **trainer_kwargs, + ) + + def _initialize_datamodule( + self, + dataset, + ): + """ + Initialize the data module. + + Parameters + ---------- + dataset : TMDataset + The dataset to be used for training. + """ + + logger.info(f"--- Initializing Datamodule for {MODEL_NAME} ---") + self.data_module = TMDataModule( + batch_size=self.hparams["datamodule_args"]["batch_size"], + shuffle=self.hparams["datamodule_args"]["shuffle"], + val_size=self.hparams["datamodule_args"]["val_size"], + random_state=self.hparams["datamodule_args"]["random_state"], + ) + + self.data_module.preprocess_data( + dataset=dataset, + **{ + k: v + for k, v in self.hparams["datamodule_args"].items() + if k not in ["batch_size", "shuffle", "val_size"] + }, + ) + + self.dataset = dataset + + def fit( + self, + dataset: TMDataset = None, + n_topics: int = 20, + val_size: float = 0.2, + lr: float = 1e-04, + lr_patience: int = 15, + patience: int = 15, + weight_decay: float = 1e-07, + max_epochs: int = 100, + batch_size: int = 32, + shuffle: bool = True, + random_state: int = 101, + checkpoint_path: str = "checkpoints", + monitor: str = "val_loss", + mode: str = "min", + trial=None, + optimize=False, + **kwargs, + ): + """ + Fits the NSTM (topic model in embedding spaces) topic model to the given dataset. + + Parameters + ---------- + dataset : TMDataset, optional + The dataset to train the topic model on. Defaults to None. + n_topics : int, optional + The number of topics to extract. Defaults to 20. + val_size : float, optional + The proportion of the dataset to use for validation. Defaults to 0.2. + lr : float, optional + The learning rate for the optimizer. Defaults to 1e-04. + lr_patience : int, optional + The number of epochs with no improvement after which the learning rate will be reduced. Defaults to 15. + patience : int, optional + The number of epochs with no improvement after which training will be stopped. Defaults to 15. + weight_decay : float, optional + The weight decay (L2 penalty) for the optimizer. Defaults to 1e-07. 
+ max_epochs : int, optional + The maximum number of epochs to train for. Defaults to 100. + batch_size : int, optional + The batch size for training. Defaults to 32. + shuffle : bool, optional + Whether to shuffle the training data. Defaults to True. + random_state : int, optional + The random seed for reproducibility. Defaults to 101. + checkpoint_path : str, optional + The path to save model checkpoints. Defaults to "checkpoints". + monitor : str, optional + The metric to monitor for early stopping. Defaults to "val_loss". + mode : str, optional + The mode for early stopping. Defaults to "min". + trial : optuna.Trial, optional + The Optuna trial for hyperparameter optimization. Defaults to None. + optimize : bool, optional + Whether to optimize hyperparameters. Defaults to False. + **kwargs + Additional keyword arguments to be passed to the trainer. + + Raises + ------ + ValueError + If the dataset is not an instance of TMDataset or if the number of topics is less than or equal to 0. + + Examples + -------- + >>> model = NSTM() + >>> dataset = TMDataset(...) + >>> model.fit(dataset, n_topics=20, val_size=0.2, lr=1e-04) + """ + + self.optimize = optimize + assert isinstance( + dataset, TMDataset + ), "The dataset must be an instance of TMDataset." + + check_dataset_steps(dataset, logger, MODEL_NAME) + + self.n_topics = n_topics + + self.hparams.update( + { + "n_topics": n_topics, + "lr": lr, + "lr_patience": lr_patience, + "patience": patience, + "weight_decay": weight_decay, + "max_epochs": max_epochs, + } + ) + + self.hparams["datamodule_args"].update( + { + "batch_size": batch_size, + "val_size": val_size, + "shuffle": shuffle, + "random_state": random_state, + } + ) + + try: + + self._status = TrainingStatus.INITIALIZED + self._initialize_datamodule(dataset=dataset) + + self._initialize_model() + + self._initialize_trainer( + max_epochs=self.hparams["max_epochs"], + monitor=monitor, + patience=patience, + mode=mode, + checkpoint_path=checkpoint_path, + trial=trial, + **kwargs, + ) + + logger.info(f"--- Training {MODEL_NAME} topic model ---") + self._status = TrainingStatus.RUNNING + self.trainer.fit(self.model, self.data_module) + + except Exception as e: + logger.error(f"Error in training: {e}") + self._status = TrainingStatus.FAILED + raise + except KeyboardInterrupt: + logger.error("Training interrupted.") + self._status = TrainingStatus.INTERRUPTED + raise + + if self.n_topics <= 0: + raise ValueError("Number of topics must be greater than 0.") + + logger.info("--- Training completed successfully. ---") + self._status = TrainingStatus.SUCCEEDED + + self.theta = ( + self.model.model.get_theta(torch.tensor(self.dataset.bow)) + .detach() + .cpu() + .numpy() + ) + + self.theta = self.theta / self.theta.sum(axis=1, keepdims=True) + + self.beta = self.model.model.get_beta().detach().cpu().numpy() + self.labels = np.array(np.argmax(self.theta, axis=1)) + + self.topic_dict = self.get_topic_word_dict(self.data_module.vocab) + + def get_topic_word_dict(self, vocab, num_words=100): + """ + Get the topic-word dictionary. + + Parameters + ---------- + vocab : list of str + Vocabulary list corresponding to the word indices. + num_words : int, optional + Number of top words to retrieve for each topic, by default 100. + + Returns + ------- + dict + Dictionary where keys are topic indices and values are lists of tuples (word, probability). 
+ """ + + topic_word_dict = {} + for topic_idx, topic_dist in enumerate(self.beta): + top_word_indices = topic_dist.argsort()[-num_words:][::-1] + top_words_probs = [(vocab[i], topic_dist[i]) for i in top_word_indices] + topic_word_dict[topic_idx] = top_words_probs + return topic_word_dict + + def predict(self, dataset): + pass + + def suggest_hyperparameters(self, trial, max_topics=100): + self.hparams["n_topics"] = trial.suggest_int("n_topics", 1, max_topics) + self.hparams["encoder_dim"] = trial.suggest_int("encoder_dim", 16, 512) + self.hparams["embed_size"] = trial.suggest_int("embed_size", 16, 512) + self.hparams["dropout"] = trial.suggest_float("dropout", 0.0, 0.5) + self.hparams["encoder_activation"] = trial.suggest_categorical( + "encoder_activation", ["Softplus", "ReLU", "LeakyReLU", "Tanh"] + ) + self.hparams["lr"] = trial.suggest_float("lr", 1e-5, 1e-2) + self.hparams["weight_decay"] = trial.suggest_float("weight_decay", 1e-7, 1e-3) + + # Map string to actual PyTorch activation function + activation_mapping = { + "Softplus": nn.Softplus(), + "ReLU": nn.ReLU(), + "LeakyReLU": nn.LeakyReLU(), + "Tanh": nn.Tanh(), + } + self.hparams["encoder_activation"] = activation_mapping[ + self.hparams["encoder_activation"] + ] + + self.hparams["datamodule_args"]["batch_size"] = trial.suggest_int( + "batch_size", 12, 512 + ) + + def optimize_and_fit( + self, + dataset, + min_topics=2, + max_topics=20, + criterion="val_loss", + n_trials=100, + custom_metric=None, + ): + """ + A new method in the child class that calls the parent class's optimize_hyperparameters method. + + Parameters + ---------- + dataset : TMDataset + The dataset to train the model on. + min_topics : int, optional + Minimum number of topics to evaluate, by default 2. + max_topics : int, optional + Maximum number of topics to evaluate, by default 20. + criterion : str, optional + Criterion to use for optimization ('aic', 'bic', or 'custom'), by default 'aic'. + n_trials : int, optional + Number of trials for optimization, by default 100. + custom_metric : object, optional + Custom metric object with a `score` method for evaluation, by default None. + + Returns + ------- + dict + Dictionary containing the best parameters and the optimal number of topics. 
+ """ + best_params = super().optimize_hyperparameters_neural( + dataset=dataset, + min_topics=min_topics, + max_topics=max_topics, + criterion=criterion, + n_trials=n_trials, + custom_metric=custom_metric, + ) + + return best_params diff --git a/stream_topic/preprocessor/config/default_preprocessing_steps.json b/stream_topic/preprocessor/config/default_preprocessing_steps.json index 17bf129e6f..6413feae02 100644 --- a/stream_topic/preprocessor/config/default_preprocessing_steps.json +++ b/stream_topic/preprocessor/config/default_preprocessing_steps.json @@ -199,6 +199,26 @@ "remove_words_with_numbers": true, "remove_words_with_special_chars": true }, + "NSTM": { + "remove_stopwords": true, + "lowercase": true, + "remove_punctuation": true, + "remove_numbers": true, + "lemmatize": true, + "stem": false, + "expand_contractions": true, + "remove_html_tags": true, + "remove_special_chars": true, + "remove_accents": true, + "detokenize": false, + "min_word_freq": 3, + "max_word_freq": null, + "min_word_length": 3, + "max_word_length": 20, + "dictionary": [], + "remove_words_with_numbers": true, + "remove_words_with_special_chars": true + }, "CTM": { "remove_stopwords": true, "lowercase": true, diff --git a/stream_topic/preprocessor/topic_extraction.py b/stream_topic/preprocessor/topic_extraction.py index f3c997b77d..448591a949 100644 --- a/stream_topic/preprocessor/topic_extraction.py +++ b/stream_topic/preprocessor/topic_extraction.py @@ -8,7 +8,7 @@ from nltk.corpus import brown as nltk_words from nltk.corpus import words as eng_dict from numpy.linalg import norm -from octis.dataset.dataset import Dataset as OCDataset +from ..utils.dataset import TMDataset from ._embedder import BaseEmbedder @@ -37,7 +37,7 @@ def __init__( self.embedder = BaseEmbedder(embedding_model) self.n_topics = n_topics - def _noun_extractor_haystack(self, embeddings, n, corpus="octis", only_nouns=True): + def _noun_extractor_haystack(self, embeddings, n, corpus="brown", only_nouns=True): """ Extracts the topics most probable words, which are the words nearest to the topics centroid. We extract all nouns from the corpus and the brown corpus. Afterwards we compute the cosine similarity between every word and every centroid. @@ -51,6 +51,7 @@ def _noun_extractor_haystack(self, embeddings, n, corpus="octis", only_nouns=Tru Args: embeddings (_type_): _document embeddings to compute centroid of the topic n (_type_): n_top number of words per topic + corpus (str, optional): corpus to be used for word extraction. Defaults to "brown". One of "brown", "stream", "words". 
Returns: dict: extracted topics @@ -67,21 +68,21 @@ def is_noun(pos): if corpus == "brown": word_list = nltk_words.words() word_list = [word.lower().strip() for word in word_list] - word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) - for word in word_list] + word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) for word in word_list] elif corpus == "words": word_list = eng_dict.words() word_list = [word.lower().strip() for word in word_list] - word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) - for word in word_list] - elif corpus == "octis": - data = OCDataset() - data.fetch_dataset("20NewsGroup") + word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) for word in word_list] + elif corpus == "stream": + data = TMDataset() + data.fetch_dataset("20NewsGroups") word_list = data.get_vocabulary() - data.fetch_dataset("M10") + data.fetch_dataset("Spotify") word_list += data.get_vocabulary() data.fetch_dataset("BBC_News") word_list += data.get_vocabulary() + data.fetch_dataset("Poliblogs") + word_list += data.get_vocabulary() # include reuters etc datasets # data.load_custom_dataset_from_folder(DATADIR + "/GN") @@ -90,16 +91,14 @@ def is_noun(pos): word_list += self.dataset.get_vocabulary() word_list = [word.lower().strip() for word in word_list] - word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) - for word in word_list] + word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) for word in word_list] else: raise ValueError( "There are no words to be extracted for the Topics: Please specify a corpus" ) if only_nouns: - word_list = [word for (word, pos) in pos_tag( - word_list) if is_noun(pos)] + word_list = [word for (word, pos) in pos_tag(word_list) if is_noun(pos)] else: word_list = [word for (word, pos) in pos_tag(word_list)] diff --git a/stream_topic/utils/sinkhorn_loss.py b/stream_topic/utils/sinkhorn_loss.py new file mode 100644 index 0000000000..d8eef5a633 --- /dev/null +++ b/stream_topic/utils/sinkhorn_loss.py @@ -0,0 +1,24 @@ +import torch + + +def sinkhorn_loss(M, a, b, lambda_sh, numItermax=5000, stopThr=0.5e-2): + + u = torch.ones_like(a) / a.size()[0] + + K = torch.exp(-M * lambda_sh) + err = 1 + cpt = 0 + while err > stopThr and cpt < numItermax: + u = torch.div(a, torch.matmul(K, torch.div(b, torch.matmul(u.t(), K).t()))) + cpt += 1 + if cpt % 20 == 1: + v = torch.div(b, torch.matmul(K.t(), u)) + u = torch.div(a, torch.matmul(K, v)) + bb = torch.mul(v, torch.matmul(K.t(), u)) + err = torch.norm(torch.sum(torch.abs(bb - b), dim=0), p=float("inf")) + + sinkhorn_divergences = torch.sum( + torch.mul(u, torch.matmul(torch.mul(K, M), v)), dim=0 + ) + + return sinkhorn_divergences
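The new `sinkhorn_loss` helper runs the standard Sinkhorn fixed-point iteration for entropy-regularized optimal transport: `M` is the (n_topics x vocab) cost matrix, `a` holds per-document topic distributions (n_topics x batch), `b` holds per-document word distributions (vocab x batch), and the return value is one transport cost per document. Below is a toy shape check mirroring the call in `NSTMBase.compute_loss`; the random tensors are placeholders, not model output.

```python
import torch
import torch.nn.functional as F

from stream_topic.utils.sinkhorn_loss import sinkhorn_loss

n_topics, vocab_size, batch, embed = 5, 50, 8, 16

# Cost matrix built the same way NSTMBase.forward does: 1 minus the cosine
# similarity between (stand-in, random) topic and word embeddings.
topic_emb = F.normalize(torch.randn(n_topics, embed))
word_emb = F.normalize(torch.randn(vocab_size, embed))
M = 1 - topic_emb @ word_emb.T                             # (n_topics, vocab_size), non-negative

theta = F.softmax(torch.randn(batch, n_topics), dim=-1)    # document-topic distributions
bow = torch.rand(batch, vocab_size)                        # stand-in bag-of-words counts

# Same call pattern as NSTMBase.compute_loss: distributions are transposed to
# (n_topics x batch) and (vocab x batch) so each column is one document.
cost = sinkhorn_loss(M, theta.T, F.softmax(bow, dim=-1).T, lambda_sh=20)
print(cost.shape)   # torch.Size([8]) -> one optimal-transport cost per document
```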