Skip to content

Commit

Permalink
Merge pull request #271 from RubixML/2.3
Browse files Browse the repository at this point in the history
2.3
  • Loading branch information
andrewdalpino authored Dec 31, 2022
2 parents 3f0b211 + 3c76343 commit d0de8a0
Show file tree
Hide file tree
Showing 30 changed files with 668 additions and 10 deletions.
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ Thumbs.db
.DS_Store
debug.log
/test.png
/.idea
/.vscode
/.vs
.idea
.vscode
.vs
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
- 2.3.0
    - Added BM25 Transformer
    - Added `dropFeature()` method to the dataset object API
    - Added neural network architecture visualization via GraphViz

- 2.2.2
- Fix Grid Search best model selection

Expand Down
60 changes: 60 additions & 0 deletions benchmarks/Transformers/BM25TransformerBench.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
<?php

namespace Rubix\ML\Benchmarks\Transformers;

use Tensor\Matrix;
use Rubix\ML\Datasets\Unlabeled;
use Rubix\ML\Transformers\BM25Transformer;

/**
 * @Groups({"Transformers"})
 * @BeforeMethods({"setUp"})
 */
class BM25TransformerBench
{
    /**
     * The number of samples (documents) in the benchmark dataset.
     */
    protected const NUM_SAMPLES = 10000;

    /**
     * The dataset of sparse token-frequency vectors to be transformed.
     *
     * @var \Rubix\ML\Datasets\Unlabeled
     */
    protected $dataset;

    /**
     * The transformer instance under benchmark.
     *
     * @var \Rubix\ML\Transformers\BM25Transformer
     */
    protected $transformer;

    /**
     * Generate the fixture dataset and a fresh transformer before each iteration.
     */
    public function setUp() : void
    {
        // Zero out roughly 80% of the entries so the samples resemble the
        // sparse token frequency vectors BM25 expects as input.
        $mask = Matrix::rand(self::NUM_SAMPLES, 100)
            ->greater(0.8);

        $samples = Matrix::gaussian(self::NUM_SAMPLES, 100)
            ->multiply($mask)
            ->asArray();

        $this->dataset = Unlabeled::quick($samples);

        $this->transformer = new BM25Transformer();
    }

    /**
     * @Subject
     * @Iterations(3)
     * @OutputTimeUnit("milliseconds", precision=3)
     */
    public function apply() : void
    {
        $this->dataset->apply($this->transformer);
    }
}
16 changes: 16 additions & 0 deletions docs/classifiers/multilayer-perceptron.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,22 @@ Returns the underlying neural network instance or `null` if untrained:
public network() : Network|null
```

Export a Graphviz "dot" encoding of the neural network architecture.
```php
public exportGraphviz() : Encoding
```

```php
use Rubix\ML\Helpers\Graphviz;
use Rubix\ML\Persisters\Filesystem;

$dot = $estimator->exportGraphviz();

Graphviz::dotToImage($dot)->saveTo(new Filesystem('network.png'));
```

![Neural Network Graph](https://github.com/RubixML/ML/blob/master/docs/images/neural-network-graph.png?raw=true)

## References
[^1]: G. E. Hinton. (1989). Connectionist learning procedures.
[^2]: L. Prechelt. (1997). Early Stopping - but when?
6 changes: 6 additions & 0 deletions docs/datasets/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ Select the values of a feature column at a given offset:
public feature(int $offset) : mixed[]
```

## Dropping
Drop a feature at a given column offset from the dataset:
```php
public dropFeature(int $offset) : self
```

## Head and Tail
Return the first *n* rows of data in a new dataset object:
```php
Expand Down
Binary file added docs/images/neural-network-graph.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/preprocessing.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ The library provides a number of transformers for Natural Language Processing (N

| Transformer | Supervised | [Stateful](transformers/api.md#stateful) | [Elastic](transformers/api.md#elastic) |
|---|---|---|---|
| [BM25 Transformer](transformers/bm25-transformer.md) | | ✓ | ✓ |
| [Regex Filter](transformers/regex-filter.md) | | | |
| [Text Normalizer](transformers/text-normalizer.md) | | | |
| [Multibyte Text Normalizer](transformers/multibyte-text-normalizer.md) | | | |
Expand Down
16 changes: 16 additions & 0 deletions docs/regressors/mlp-regressor.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,22 @@ Returns the underlying neural network instance or `null` if untrained:
public network() : Network|null
```

Export a Graphviz "dot" encoding of the neural network architecture.
```php
public exportGraphviz() : Encoding
```

```php
use Rubix\ML\Helpers\Graphviz;
use Rubix\ML\Persisters\Filesystem;

$dot = $estimator->exportGraphviz();

Graphviz::dotToImage($dot)->saveTo(new Filesystem('network.png'));
```

![Neural Network Graph](https://github.com/RubixML/ML/blob/master/docs/images/neural-network-graph.png?raw=true)

## References
[^1]: G. E. Hinton. (1989). Connectionist learning procedures.
[^2]: L. Prechelt. (1997). Early Stopping - but when?
38 changes: 38 additions & 0 deletions docs/transformers/bm25-transformer.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<span style="float:right;"><a href="https://github.com/RubixML/ML/blob/master/src/Transformers/BM25Transformer.php">[source]</a></span>

# BM25 Transformer
BM25 is a sublinear term weighting scheme that takes term frequency (TF), document frequency (DF), and document length into account. It is similar to [TF-IDF](tf-idf-transformer.md) but with variable sublinearity and the addition of document length normalization.

> **Note:** BM25 Transformer assumes that its inputs are token frequency vectors such as those created by [Word Count Vectorizer](word-count-vectorizer.md).

**Interfaces:** [Transformer](api.md#transformer), [Stateful](api.md#stateful), [Elastic](api.md#elastic)

**Data Type Compatibility:** Continuous only

## Parameters
| # | Param | Default | Type | Description |
|---|---|---|---|---|
| 1 | dampening | 1.2 | float | The term frequency (TF) dampening factor i.e. the `K1` parameter in the formula. Lower values will cause the TF to saturate quicker. |
| 2 | normalization | 0.75 | float | The importance of document length in normalizing the term frequency i.e. the `b` parameter in the formula. |

## Example
```php
use Rubix\ML\Transformers\BM25Transformer;

$transformer = new BM25Transformer(1.2, 0.75);
```

## Additional Methods
Return the document frequencies calculated during fitting:
```php
public dfs() : ?array
```

Return the average number of tokens per document:
```php
public averageDocumentLength() : ?float
```

### References
>- S. Robertson et al. (2009). The Probabilistic Relevance Framework: BM25 and Beyond.
>- K. Sparck Jones et al. (2000). A probabilistic model of information retrieval: development and comparative experiments.
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ nav:
- KNN Imputer: transformers/knn-imputer.md
- Missing Data Imputer: transformers/missing-data-imputer.md
- Natural Language:
- BM25 Transformer: transformers/bm25-transformer.md
- Regex Filter: transformers/regex-filter.md
- Text Normalizer: transformers/text-normalizer.md
- Multibyte Text Normalizer: transformers/multibyte-text-normalizer.md
Expand Down
16 changes: 16 additions & 0 deletions src/Classifiers/MultilayerPerceptron.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use Rubix\ML\Learner;
use Rubix\ML\Verbose;
use Rubix\ML\DataType;
use Rubix\ML\Encoding;
use Rubix\ML\Estimator;
use Rubix\ML\Persistable;
use Rubix\ML\Probabilistic;
Expand Down Expand Up @@ -544,6 +545,21 @@ public function proba(Dataset $dataset) : array
return $probabilities;
}

/**
 * Export the network architecture as a graph in dot format.
 *
 * @throws \Rubix\ML\Exceptions\RuntimeException if called before training
 * @return \Rubix\ML\Encoding the "dot" source text
 */
public function exportGraphviz() : Encoding
{
    // The network is only instantiated during train(), so an untrained
    // estimator has nothing to export.
    if ($this->network === null) {
        throw new RuntimeException('Must train network first.');
    }

    return $this->network->exportGraphviz();
}

/**
* Return an associative array containing the data used to serialize the object.
*
Expand Down
15 changes: 15 additions & 0 deletions src/Datasets/Dataset.php
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,21 @@ public function feature(int $offset) : array
return array_column($this->samples, $offset);
}

/**
 * Drop a feature column at a given offset from the dataset.
 *
 * Mutates the dataset in place and returns $this for method chaining.
 *
 * @param int $offset
 * @return self
 */
public function dropFeature(int $offset) : self
{
    foreach ($this->samples as &$sample) {
        array_splice($sample, $offset, 1);
    }

    // Break the by-reference binding left over from the loop - otherwise
    // $sample remains an alias of the last row and any subsequent write to
    // a variable of that name would silently overwrite it.
    unset($sample);

    return $this;
}

/**
* Rotate the sample matrix so that the values of each feature become rows.
*
Expand Down
30 changes: 30 additions & 0 deletions src/NeuralNet/FeedForward.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace Rubix\ML\NeuralNet;

use Tensor\Matrix;
use Rubix\ML\Encoding;
use Rubix\ML\Datasets\Dataset;
use Rubix\ML\Datasets\Labeled;
use Rubix\ML\NeuralNet\Layers\Input;
Expand Down Expand Up @@ -218,4 +219,33 @@ public function backpropagate(array $labels) : float

return $loss;
}

/**
 * Export the network architecture as a graph in dot format.
 *
 * Emits one node per layer (labelled via the layer's string representation)
 * and an edge from each layer to the next.
 *
 * @return \Rubix\ML\Encoding the "dot" source text
 */
public function exportGraphviz() : Encoding
{
    $lines = [];

    $lines[] = 'digraph Tree {';
    $lines[] = ' node [shape=box, fontname=helvetica];';

    $i = 0;

    foreach ($this->layers() as $layer) {
        ++$i;

        $lines[] = sprintf(' N%d [label="%s",style="rounded"]', $i, $layer);

        // Connect every layer after the first to its predecessor.
        if ($i > 1) {
            $lines[] = sprintf(' N%d -> N%d;', $i - 1, $i);
        }
    }

    $lines[] = '}';

    return new Encoding(implode(PHP_EOL, $lines));
}
}
12 changes: 12 additions & 0 deletions src/NeuralNet/Layers/Binary.php
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,16 @@ public function gradient(Matrix $input, Matrix $output, Matrix $expected) : Matr
return $this->sigmoid->differentiate($input, $output)
->multiply($dLoss);
}

/**
 * Return a human-readable string representation of the layer.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return sprintf('Binary (cost function: %s)', $this->costFn);
}
}
12 changes: 12 additions & 0 deletions src/NeuralNet/Layers/Continuous.php
Original file line number Diff line number Diff line change
Expand Up @@ -137,4 +137,16 @@ public function gradient(Matrix $input, Matrix $expected) : Matrix
return $this->costFn->differentiate($input, $expected)
->divide($input->n());
}

/**
 * Return a human-readable string representation of the layer.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return sprintf('Continuous (cost function: %s)', $this->costFn);
}
}
3 changes: 1 addition & 2 deletions src/NeuralNet/Layers/Hidden.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

use Rubix\ML\Deferred;
use Rubix\ML\NeuralNet\Optimizers\Optimizer;
use Stringable;

/**
* Hidden
Expand All @@ -13,7 +12,7 @@
* @package Rubix/ML
* @author Andrew DalPino
*/
interface Hidden extends Layer, Stringable
interface Hidden extends Layer
{
/**
* Calculate the gradient and update the parameters of the layer.
Expand Down
3 changes: 2 additions & 1 deletion src/NeuralNet/Layers/Layer.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
namespace Rubix\ML\NeuralNet\Layers;

use Tensor\Matrix;
use Stringable;

interface Layer
interface Layer extends Stringable
{
/**
* The width of the layer. i.e. the number of neurons or computation nodes.
Expand Down
12 changes: 12 additions & 0 deletions src/NeuralNet/Layers/Multiclass.php
Original file line number Diff line number Diff line change
Expand Up @@ -205,4 +205,16 @@ public function gradient(Matrix $input, Matrix $output, Matrix $expected) : Matr
return $this->softmax->differentiate($input, $output)
->multiply($dLoss);
}

/**
 * Return a human-readable string representation of the layer.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return sprintf('Multiclass (cost function: %s)', $this->costFn);
}
}
12 changes: 12 additions & 0 deletions src/NeuralNet/Layers/Placeholder1D.php
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,16 @@ public function infer(Matrix $input) : Matrix
{
return $this->forward($input);
}

/**
 * Return a human-readable string representation of the layer.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return sprintf('Placeholder 1D (inputs: %s)', $this->inputs);
}
}
Loading

0 comments on commit d0de8a0

Please sign in to comment.