Skip to content

Commit

Permalink
Merge pull request #271 from RubixML/2.3
Browse files Browse the repository at this point in the history
2.3
  • Loading branch information
andrewdalpino authored Dec 31, 2022
2 parents 3f0b211 + 3c76343 commit d0de8a0
Show file tree
Hide file tree
Showing 30 changed files with 668 additions and 10 deletions.
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ Thumbs.db
.DS_Store
debug.log
/test.png
/.idea
/.vscode
/.vs
.idea
.vscode
.vs
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
- 2.3.0
    - Added BM25 Transformer
    - Added `dropFeature()` method to the dataset object API
    - Added neural network architecture visualization via GraphViz

- 2.2.2
- Fix Grid Search best model selection

Expand Down
60 changes: 60 additions & 0 deletions benchmarks/Transformers/BM25TransformerBench.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
<?php

namespace Rubix\ML\Benchmarks\Transformers;

use Tensor\Matrix;
use Rubix\ML\Datasets\Unlabeled;
use Rubix\ML\Transformers\BM25Transformer;

/**
 * @Groups({"Transformers"})
 * @BeforeMethods({"setUp"})
 */
class BM25TransformerBench
{
    /**
     * The number of samples (documents) in the benchmark dataset.
     */
    protected const NUM_SAMPLES = 10000;

    /**
     * The dataset of sparse token-frequency vectors to be transformed.
     *
     * @var \Rubix\ML\Datasets\Unlabeled
     */
    protected $dataset;

    /**
     * The transformer instance under benchmark.
     *
     * @var \Rubix\ML\Transformers\BM25Transformer
     */
    protected $transformer;

    /**
     * Generate the fixture dataset and a fresh transformer before each iteration.
     */
    public function setUp() : void
    {
        // Zero out roughly 80% of the entries so the samples resemble the
        // sparse token frequency vectors BM25 expects as input.
        $mask = Matrix::rand(self::NUM_SAMPLES, 100)
            ->greater(0.8);

        $samples = Matrix::gaussian(self::NUM_SAMPLES, 100)
            ->multiply($mask)
            ->asArray();

        $this->dataset = Unlabeled::quick($samples);

        $this->transformer = new BM25Transformer();
    }

    /**
     * @Subject
     * @Iterations(3)
     * @OutputTimeUnit("milliseconds", precision=3)
     */
    public function apply() : void
    {
        $this->dataset->apply($this->transformer);
    }
}
16 changes: 16 additions & 0 deletions docs/classifiers/multilayer-perceptron.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,22 @@ Returns the underlying neural network instance or `null` if untrained:
public network() : Network|null
```

Export a Graphviz "dot" encoding of the neural network architecture.
```php
public exportGraphviz() : Encoding
```

```php
use Rubix\ML\Helpers\Graphviz;
use Rubix\ML\Persisters\Filesystem;

$dot = $estimator->exportGraphviz();

Graphviz::dotToImage($dot)->saveTo(new Filesystem('network.png'));
```

![Neural Network Graph](https://github.com/RubixML/ML/blob/master/docs/images/neural-network-graph.png?raw=true)

## References
[^1]: G. E. Hinton. (1989). Connectionist learning procedures.
[^2]: L. Prechelt. (1997). Early Stopping - but when?
6 changes: 6 additions & 0 deletions docs/datasets/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ Select the values of a feature column at a given offset:
public feature(int $offset) : mixed[]
```

## Dropping
Drop a feature at a given column offset from the dataset:
```php
public dropFeature(int $offset) : self
```

## Head and Tail
Return the first *n* rows of data in a new dataset object:
```php
Expand Down
Binary file added docs/images/neural-network-graph.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/preprocessing.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ The library provides a number of transformers for Natural Language Processing (N

| Transformer | Supervised | [Stateful](transformers/api.md#stateful) | [Elastic](transformers/api.md#elastic) |
|---|---|---|---|
| [BM25 Transformer](transformers/bm25-transformer.md) | | ✓ | ✓ |
| [Regex Filter](transformers/regex-filter.md) | | | |
| [Text Normalizer](transformers/text-normalizer.md) | | | |
| [Multibyte Text Normalizer](transformers/multibyte-text-normalizer.md) | | | |
Expand Down
16 changes: 16 additions & 0 deletions docs/regressors/mlp-regressor.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,22 @@ Returns the underlying neural network instance or `null` if untrained:
public network() : Network|null
```

Export a Graphviz "dot" encoding of the neural network architecture.
```php
public exportGraphviz() : Encoding
```

```php
use Rubix\ML\Helpers\Graphviz;
use Rubix\ML\Persisters\Filesystem;

$dot = $estimator->exportGraphviz();

Graphviz::dotToImage($dot)->saveTo(new Filesystem('network.png'));
```

![Neural Network Graph](https://github.com/RubixML/ML/blob/master/docs/images/neural-network-graph.png?raw=true)

## References
[^1]: G. E. Hinton. (1989). Connectionist learning procedures.
[^2]: L. Prechelt. (1997). Early Stopping - but when?
38 changes: 38 additions & 0 deletions docs/transformers/bm25-transformer.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<span style="float:right;"><a href="https://github.com/RubixML/ML/blob/master/src/Transformers/BM25Transformer.php">[source]</a></span>

# BM25 Transformer
BM25 is a sublinear term weighting scheme that takes term frequency (TF), document frequency (DF), and document length into account. It is similar to [TF-IDF](tf-idf-transformer.md) but with variable sublinearity and the addition of document length normalization.

> **Note:** BM25 Transformer assumes that its inputs are token frequency vectors such as those created by [Word Count Vectorizer](word-count-vectorizer.md).

**Interfaces:** [Transformer](api.md#transformer), [Stateful](api.md#stateful), [Elastic](api.md#elastic)

**Data Type Compatibility:** Continuous only

## Parameters
| # | Param | Default | Type | Description |
|---|---|---|---|---|
| 1 | dampening | 1.2 | float | The term frequency (TF) dampening factor i.e. the `K1` parameter in the formula. Lower values will cause the TF to saturate quicker. |
| 2 | normalization | 0.75 | float | The importance of document length in normalizing the term frequency i.e. the `b` parameter in the formula. |

## Example
```php
use Rubix\ML\Transformers\BM25Transformer;

$transformer = new BM25Transformer(1.2, 0.75);
```

## Additional Methods
Return the document frequencies calculated during fitting:
```php
public dfs() : ?array
```

Return the average number of tokens per document:
```php
public averageDocumentLength() : ?float
```

### References
>- S. Robertson et al. (2009). The Probabilistic Relevance Framework: BM25 and Beyond.
>- K. Sparck Jones et al. (2000). A probabilistic model of information retrieval: development and comparative experiments.
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ nav:
- KNN Imputer: transformers/knn-imputer.md
- Missing Data Imputer: transformers/missing-data-imputer.md
- Natural Language:
- BM25 Transformer: transformers/bm25-transformer.md
- Regex Filter: transformers/regex-filter.md
- Text Normalizer: transformers/text-normalizer.md
- Multibyte Text Normalizer: transformers/multibyte-text-normalizer.md
Expand Down
16 changes: 16 additions & 0 deletions src/Classifiers/MultilayerPerceptron.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use Rubix\ML\Learner;
use Rubix\ML\Verbose;
use Rubix\ML\DataType;
use Rubix\ML\Encoding;
use Rubix\ML\Estimator;
use Rubix\ML\Persistable;
use Rubix\ML\Probabilistic;
Expand Down Expand Up @@ -544,6 +545,21 @@ public function proba(Dataset $dataset) : array
return $probabilities;
}

/**
 * Export the network architecture as a graph in dot format.
 *
 * @throws \Rubix\ML\Exceptions\RuntimeException if called before training
 * @return \Rubix\ML\Encoding the "dot" source text
 */
public function exportGraphviz() : Encoding
{
    // The network is only instantiated during train(), so an untrained
    // estimator has nothing to export.
    if ($this->network === null) {
        throw new RuntimeException('Must train network first.');
    }

    return $this->network->exportGraphviz();
}

/**
* Return an associative array containing the data used to serialize the object.
*
Expand Down
15 changes: 15 additions & 0 deletions src/Datasets/Dataset.php
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,21 @@ public function feature(int $offset) : array
return array_column($this->samples, $offset);
}

/**
 * Drop a feature column at a given offset from the dataset.
 *
 * Mutates the dataset in place and returns $this for method chaining.
 *
 * @param int $offset
 * @return self
 */
public function dropFeature(int $offset) : self
{
    foreach ($this->samples as &$sample) {
        array_splice($sample, $offset, 1);
    }

    // Break the by-reference binding left over from the loop - otherwise
    // $sample remains an alias of the last row and any subsequent write to
    // a variable of that name would silently overwrite it.
    unset($sample);

    return $this;
}

/**
* Rotate the sample matrix so that the values of each feature become rows.
*
Expand Down
30 changes: 30 additions & 0 deletions src/NeuralNet/FeedForward.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace Rubix\ML\NeuralNet;

use Tensor\Matrix;
use Rubix\ML\Encoding;
use Rubix\ML\Datasets\Dataset;
use Rubix\ML\Datasets\Labeled;
use Rubix\ML\NeuralNet\Layers\Input;
Expand Down Expand Up @@ -218,4 +219,33 @@ public function backpropagate(array $labels) : float

return $loss;
}

/**
 * Export the network architecture as a graph in dot format.
 *
 * Emits one node per layer (labelled via the layer's string representation)
 * and an edge from each layer to the next.
 *
 * @return \Rubix\ML\Encoding the "dot" source text
 */
public function exportGraphviz() : Encoding
{
    $lines = [];

    $lines[] = 'digraph Tree {';
    $lines[] = ' node [shape=box, fontname=helvetica];';

    $i = 0;

    foreach ($this->layers() as $layer) {
        ++$i;

        $lines[] = sprintf(' N%d [label="%s",style="rounded"]', $i, $layer);

        // Connect every layer after the first to its predecessor.
        if ($i > 1) {
            $lines[] = sprintf(' N%d -> N%d;', $i - 1, $i);
        }
    }

    $lines[] = '}';

    return new Encoding(implode(PHP_EOL, $lines));
}
}
12 changes: 12 additions & 0 deletions src/NeuralNet/Layers/Binary.php
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,16 @@ public function gradient(Matrix $input, Matrix $output, Matrix $expected) : Matr
return $this->sigmoid->differentiate($input, $output)
->multiply($dLoss);
}

/**
 * Return a human-readable string representation of the layer.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return sprintf('Binary (cost function: %s)', $this->costFn);
}
}
12 changes: 12 additions & 0 deletions src/NeuralNet/Layers/Continuous.php
Original file line number Diff line number Diff line change
Expand Up @@ -137,4 +137,16 @@ public function gradient(Matrix $input, Matrix $expected) : Matrix
return $this->costFn->differentiate($input, $expected)
->divide($input->n());
}

/**
 * Return a human-readable string representation of the layer.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return sprintf('Continuous (cost function: %s)', $this->costFn);
}
}
3 changes: 1 addition & 2 deletions src/NeuralNet/Layers/Hidden.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

use Rubix\ML\Deferred;
use Rubix\ML\NeuralNet\Optimizers\Optimizer;
use Stringable;

/**
* Hidden
Expand All @@ -13,7 +12,7 @@
* @package Rubix/ML
* @author Andrew DalPino
*/
interface Hidden extends Layer, Stringable
interface Hidden extends Layer
{
/**
* Calculate the gradient and update the parameters of the layer.
Expand Down
3 changes: 2 additions & 1 deletion src/NeuralNet/Layers/Layer.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
namespace Rubix\ML\NeuralNet\Layers;

use Tensor\Matrix;
use Stringable;

interface Layer
interface Layer extends Stringable
{
/**
* The width of the layer. i.e. the number of neurons or computation nodes.
Expand Down
12 changes: 12 additions & 0 deletions src/NeuralNet/Layers/Multiclass.php
Original file line number Diff line number Diff line change
Expand Up @@ -205,4 +205,16 @@ public function gradient(Matrix $input, Matrix $output, Matrix $expected) : Matr
return $this->softmax->differentiate($input, $output)
->multiply($dLoss);
}

/**
 * Return a human-readable string representation of the layer.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return sprintf('Multiclass (cost function: %s)', $this->costFn);
}
}
12 changes: 12 additions & 0 deletions src/NeuralNet/Layers/Placeholder1D.php
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,16 @@ public function infer(Matrix $input) : Matrix
{
return $this->forward($input);
}

/**
 * Return a human-readable string representation of the layer.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return sprintf('Placeholder 1D (inputs: %s)', $this->inputs);
}
}
Loading

0 comments on commit d0de8a0

Please sign in to comment.