-
-
Notifications
You must be signed in to change notification settings - Fork 183
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #271 from RubixML/2.3
2.3
- Loading branch information
Showing
30 changed files
with
668 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,6 @@ Thumbs.db | |
.DS_Store | ||
debug.log | ||
/test.png | ||
/.idea | ||
/.vscode | ||
/.vs | ||
.idea | ||
.vscode | ||
.vs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
<?php | ||
|
||
namespace Rubix\ML\Benchmarks\Transformers; | ||
|
||
use Tensor\Matrix; | ||
use Rubix\ML\Datasets\Unlabeled; | ||
use Rubix\ML\Transformers\BM25Transformer; | ||
|
||
/**
 * Benchmark that measures applying a BM25 Transformer to a randomly
 * generated unlabeled dataset.
 *
 * @Groups({"Transformers"})
 * @BeforeMethods({"setUp"})
 */
class BM25TransformerBench
{
    protected const NUM_SAMPLES = 10000;

    /**
     * The dataset that the transformer is applied to.
     *
     * @var \Rubix\ML\Datasets\Unlabeled
     */
    protected $dataset;

    /**
     * The transformer under benchmark.
     *
     * @var \Rubix\ML\Transformers\BM25Transformer
     */
    protected $transformer;

    /**
     * Not assigned or read anywhere in this class.
     *
     * @var array<array<mixed>>
     */
    protected $aSamples;

    /**
     * Not assigned or read anywhere in this class.
     *
     * @var array<array<mixed>>
     */
    protected $bSamples;

    /**
     * Build the dataset and a default-constructed transformer for the subject.
     */
    public function setUp() : void
    {
        $rows = self::NUM_SAMPLES;
        $columns = 100;

        // The comparison result is multiplied into the Gaussian matrix as a mask.
        // NOTE(review): presumably this zeroes most entries to mimic sparse
        // token counts - confirm against the Tensor library's rand()/greater().
        $mask = Matrix::rand($rows, $columns)->greater(0.8);

        $values = Matrix::gaussian($rows, $columns);

        $samples = $values->multiply($mask)->asArray();

        $this->dataset = Unlabeled::quick($samples);
        $this->transformer = new BM25Transformer();
    }

    /**
     * Apply the transformer to the dataset prepared in setUp().
     *
     * @Subject
     * @Iterations(3)
     * @OutputTimeUnit("milliseconds", precision=3)
     */
    public function apply() : void
    {
        $transformer = $this->transformer;

        $this->dataset->apply($transformer);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
<span style="float:right;"><a href="https://github.com/RubixML/ML/blob/master/src/Transformers/BM25Transformer.php">[source]</a></span> | ||
|
||
# BM25 Transformer | ||
BM25 is a sublinear term weighting scheme that takes term frequency (TF), document frequency (DF), and document length into account. It is similar to [TF-IDF](tf-idf-transformer.md) but with variable sublinearity and the addition of document length normalization. | ||
|
||
> **Note:** BM25 Transformer assumes that its inputs are token frequency vectors such as those created by [Word Count Vectorizer](word-count-vectorizer.md). | ||
|
||
**Interfaces:** [Transformer](api.md#transformer), [Stateful](api.md#stateful), [Elastic](api.md#elastic) | ||
|
||
**Data Type Compatibility:** Continuous only | ||
|
||
## Parameters | ||
| # | Param | Default | Type | Description | | ||
|---|---|---|---|---| | ||
| 1 | dampening | 1.2 | float | The term frequency (TF) dampening factor i.e. the `K1` parameter in the formula. Lower values will cause the TF to saturate more quickly. | | ||
| 2 | normalization | 0.75 | float | The importance of document length in normalizing the term frequency i.e. the `b` parameter in the formula. | | ||
|
||
## Example | ||
```php | ||
use Rubix\ML\Transformers\BM25Transformer; | ||
|
||
$transformer = new BM25Transformer(1.2, 0.75); | ||
``` | ||
|
||
## Additional Methods | ||
Return the document frequencies calculated during fitting: | ||
```php | ||
public dfs() : ?array | ||
``` | ||
|
||
Return the average number of tokens per document: | ||
```php | ||
public averageDocumentLength() : ?float | ||
``` | ||
|
||
### References | ||
>- S. Robertson et al. (2009). The Probabilistic Relevance Framework: BM25 and Beyond. | ||
>- K. Sparck Jones et al. (2000). A probabilistic model of information retrieval: development and comparative experiments. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.