Skip to content

Commit

Permalink
feature #200 - data generator without php code prototype
Browse files Browse the repository at this point in the history
  • Loading branch information
pounard committed Nov 27, 2024
1 parent 4e7e7af commit 7dbc7ac
Show file tree
Hide file tree
Showing 21 changed files with 890 additions and 6 deletions.
38 changes: 38 additions & 0 deletions src/Anonymization/Datasource/Context.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<?php

declare(strict_types=1);

namespace MakinaCorpus\DbToolsBundle\Anonymization\Datasource;

use MakinaCorpus\DbToolsBundle\Error\DatasourceException;

/**
 * Registry of datasources, indexed by their name.
 */
class Context
{
    /** @var array<string, Datasource> */
    private array $datasources = [];

    /**
     * @param iterable<Datasource> $datasources
     *   Datasources to register; when two of them share the same name, the
     *   last one wins.
     *
     * @throws \InvalidArgumentException
     *   When one of the given values is not a Datasource instance.
     */
    public function __construct(iterable $datasources)
    {
        foreach ($datasources as $candidate) {
            if ($candidate instanceof Datasource) {
                $this->datasources[$candidate->getName()] = $candidate;

                continue;
            }

            throw new \InvalidArgumentException(\sprintf("Value is not a '%s' instance.", Datasource::class));
        }
    }

    /**
     * Fetch the datasource registered under the given name.
     *
     * @throws DatasourceException
     *   When no datasource is registered under this name.
     */
    public function getDatasource(string $name): Datasource
    {
        // Registered values are always objects, hence isset() is a reliable
        // existence check here.
        if (!isset($this->datasources[$name])) {
            throw new DatasourceException(\sprintf("Datasource '%s' does not exist.", $name));
        }

        return $this->datasources[$name];
    }

    /**
     * Tell whether a datasource is registered under the given name.
     */
    public function hasDatasource(string $name): bool
    {
        return isset($this->datasources[$name]);
    }
}
68 changes: 68 additions & 0 deletions src/Anonymization/Datasource/Datasource.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
<?php

Check warning on line 1 in src/Anonymization/Datasource/Datasource.php

View workflow job for this annotation

GitHub Actions / PHP CS Fixer (8.2)

Found violation(s) of type: visibility_required

Check warning on line 1 in src/Anonymization/Datasource/Datasource.php

View workflow job for this annotation

GitHub Actions / PHP CS Fixer (8.2)

Found violation(s) of type: braces_position

Check warning on line 1 in src/Anonymization/Datasource/Datasource.php

View workflow job for this annotation

GitHub Actions / PHP CS Fixer (8.2)

Found violation(s) of type: single_line_empty_body

declare(strict_types=1);

namespace MakinaCorpus\DbToolsBundle\Anonymization\Datasource;

use MakinaCorpus\DbToolsBundle\Error\DatasourceException;

/**
* There are two usages for the datasource:
*
* - You fill a sample table: in this case, it's best to assume that the
* default behavior is to create a sample table which contains all the
* datalist. In this case, we need the datasource to be an iterator
* which will not consume any memory while reading the file.
*
* - The second use case is when using it as an expression datasource,
* then we need to be able to randomly select a line in the datasource,
* which means we probably need to load it into memory.
*
* In regard of the second use case, the default implementations will always
* load all data into memory, and we'll see what happens next.
*
* If this causes trouble, we might want to implement an algorithm that
* reads a random line directly from the file; it does not really seem
* that difficult to implement.
*/
abstract class Datasource implements \Countable
{
    /**
     * @param string $name
     *   Datasource name, used as a prefix in error messages.
     */
    public function __construct(private string $name)
    {
    }

    /**
     * Name this datasource was created with.
     */
    public function getName(): string
    {
        return $this->name;
    }

    /**
     * Pick a random value from this datasource.
     *
     * @return string|array<string>
     */
    abstract public function random(Context $context): string|array;

    /**
     * Iterate over all values of this datasource.
     *
     * @return iterable<string>|iterable<array<string>>
     */
    abstract public function iterator(Context $context): iterable;

    /**
     * Raise a datasource error whose message is prefixed with the datasource
     * name.
     *
     * @param string|\Throwable $error
     *   Either the error message, or an error to wrap: in the latter case
     *   its message is reused and it is attached as the previous exception.
     *
     * @throws DatasourceException
     *   Always.
     */
    protected function throwError(string|\Throwable $error): never
    {
        $prefix = \sprintf('Datasource "%s": ', $this->name);

        if (\is_string($error)) {
            throw new DatasourceException($prefix . $error);
        }

        throw new DatasourceException($prefix . $error->getMessage(), 0, $error);
    }
}
170 changes: 170 additions & 0 deletions src/Anonymization/Datasource/EnumDatasource.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
<?php

Check warning on line 1 in src/Anonymization/Datasource/EnumDatasource.php

View workflow job for this annotation

GitHub Actions / PHP CS Fixer (8.2)

Found violation(s) of type: elseif

declare(strict_types=1);

namespace MakinaCorpus\DbToolsBundle\Anonymization\Datasource;

/**
 * Datasource backed by a plain list of values.
 *
 * Values are either given directly as an array, or lazily loaded from a text
 * file (one value per line) or a JSON file (array of strings) the first time
 * they are needed.
 *
 * When expressions are given, random() evaluates one of them, picked at
 * random, instead of returning a raw value.
 */
class EnumDatasource extends Datasource
{
    /** @var array|null Loaded values, null until the file has been read. */
    private ?array $data = null;
    private ?string $filename = null;
    /** @var Expression[] */
    private array $expressions = [];

    /**
     * @param string $name
     *   Datasource name.
     * @param string|array $source
     *   Either the list of values, or the name of the file to read them from.
     * @param null|array $expressions
     *   Expressions, either as raw strings or as Expression instances.
     *
     * @throws \MakinaCorpus\DbToolsBundle\Error\DatasourceException
     *   When an expression is neither a string nor an Expression instance.
     */
    public function __construct(string $name, string|array $source, ?array $expressions = [])
    {
        parent::__construct($name);

        if (\is_array($source)) {
            // Positional access (rawAt(), rawRandom()) requires a list, drop
            // any user-given keys.
            $this->data = \array_values($source);
        } else {
            $this->filename = $source;
        }

        // The signature accepts null, treat it as "no expression".
        foreach (\array_values($expressions ?? []) as $number => $expression) {
            if (\is_string($expression)) {
                $this->expressions[] = new Expression($expression, $name, $number);
            } elseif ($expression instanceof Expression) {
                $this->expressions[] = $expression;
            } else {
                $this->throwError(\sprintf("expression #%d is not a string nor a '%s' instance.", $number, Expression::class));
            }
        }
    }

    #[\Override]
    public function random(Context $context): string|array
    {
        if ($this->expressions) {
            $expression = $this->expressions[\rand(0, \count($this->expressions) - 1)];
            \assert($expression instanceof Expression);

            return $expression->execute($context);
        }

        return $this->rawRandom();
    }

    #[\Override]
    public function iterator(Context $context): iterable
    {
        return (function () {
            // Load values upon first iteration only, a file-based datasource
            // has no data until then.
            $this->initialize();

            yield from $this->data;
        })();
    }

    #[\Override]
    public function count(): int
    {
        $this->initialize();

        return \count($this->data);
    }

    /**
     * Get the item at the given position in the data list, without
     * expression handling.
     *
     * @internal
     *   This is being used in unit tests.
     *
     * @throws \MakinaCorpus\DbToolsBundle\Error\DatasourceException
     *   When there is no item at the given position.
     */
    public function rawAt(int $position = 0): string
    {
        $this->initialize();

        if (!\array_key_exists($position, $this->data)) {
            $this->throwError(\sprintf("there is no item at position #%d.", $position));
        }

        return $this->data[$position];
    }

    /**
     * Get a random item from the data list, without expression handling.
     *
     * @internal
     *   This is being used in the Expression class.
     * @see Expression
     *
     * @throws \MakinaCorpus\DbToolsBundle\Error\DatasourceException
     *   When the data list is empty.
     */
    public function rawRandom(): string
    {
        $this->initialize();

        if (!$this->data) {
            $this->throwError("does not contain any value.");
        }

        return $this->data[\rand(0, \count($this->data) - 1)];
    }

    /**
     * Internal values initialization: load values from file when they were
     * not given as an array.
     */
    private function initialize(): void
    {
        if (null !== $this->data) {
            return;
        }

        if (null === $this->filename) {
            $this->throwError("was initialized without data nor filename.");
        }
        if (!\file_exists($this->filename)) {
            $this->throwError(\sprintf("file '%s': does not exist.", $this->filename));
        }

        // Look for the extension in the file name only, a dot in one of the
        // parent directory names must not be mistaken for it.
        $basename = \basename($this->filename);
        $ext = ($pos = \strrpos($basename, '.')) ? \substr($basename, $pos + 1) : 'txt';

        $source = match ($ext) {
            'js', 'json' => $this->parseJsonFile($this->filename),
            'txt' => $this->parseTextFile($this->filename),
            default => $this->throwError(\sprintf("file '%s': unsupported file format '%s'.", $this->filename, $ext)),
        };

        // Collect into a local variable, so that a failure while parsing
        // does not leave this instance with a silently truncated data list.
        $data = [];
        foreach ($source as $line => $item) {
            if (!\is_string($item)) {
                $this->throwError(\sprintf("file '%s': line #%s is not a valid value.", $this->filename, $line));
            }
            // Strict comparison: "0" is a valid value, empty() would drop it.
            if ('' === $item) {
                // @todo log error?
                continue;
            }
            $data[] = $item;
        }

        $this->data = $data;
    }

    /**
     * Parse data from a JSON file, which must contain an array of strings.
     *
     * @return iterable<int, string>
     *   Values, keyed by their item number, starting with 1.
     */
    private function parseJsonFile(string $filename): iterable
    {
        $contents = \file_get_contents($filename);

        if (false === $contents) {
            $this->throwError(\sprintf("file '%s': could not open file for reading.", $this->filename));
        }

        $list = \json_decode($contents, true);

        if (!\is_array($list)) {
            $this->throwError(\sprintf("file '%s': does not contain valid JSON.", $this->filename));
        }

        return (function () use ($list) {
            $count = 1;
            foreach ($list as $value) {
                if (!\is_string($value)) {
                    $this->throwError(\sprintf("file '%s': item #%s is not a string.", $this->filename, $count));
                }
                yield $count => $value;
                $count++;
            }
        })();
    }

    /**
     * Parse data from a text file, one value per line.
     *
     * @return iterable<int, string>
     *   Trimmed lines, keyed by their line number, starting with 1.
     */
    private function parseTextFile(string $filename): iterable
    {
        $handle = \fopen($filename, 'r');

        if (false === $handle) {
            $this->throwError(\sprintf("file '%s': could not open file for reading.", $this->filename));
        }

        return (static function () use ($handle) {
            try {
                $count = 1;
                // Compare against false explicitly: a last line containing
                // only "0" without a trailing newline is falsy.
                while (false !== ($value = \fgets($handle))) {
                    yield $count => \trim($value);
                    $count++;
                }
            } finally {
                \fclose($handle);
            }
        })();
    }
}
52 changes: 52 additions & 0 deletions src/Anonymization/Datasource/Expression.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
<?php

declare(strict_types=1);

namespace MakinaCorpus\DbToolsBundle\Anonymization\Datasource;

use MakinaCorpus\DbToolsBundle\Anonymization\Datasource\Expression\Parser;
use MakinaCorpus\DbToolsBundle\Anonymization\Datasource\Expression\Token;

/**
 * A user-given expression, parsed once into tokens at construction time.
 */
class Expression
{
    /** @var Token[] */
    private array $tokens = [];

    /**
     * Only the raw text is evaluated, other parameters are passed to the
     * parser in order to build helpful error messages for end-users.
     *
     * @param string $raw
     *   User text.
     * @param string $datasource
     *   Datasource in which this expression is found.
     * @param int $number
     *   Expression number in datasource.
     */
    public function __construct(string $raw, string $datasource, int $number)
    {
        $parser = new Parser($raw, $datasource, $number);

        $this->tokens = $parser->parse();
    }

    /**
     * Execute every token in order over the given context and concatenate
     * their output.
     */
    public function execute(Context $context): ?string
    {
        $parts = \array_map(
            static function ($token) use ($context) {
                \assert($token instanceof Token);

                return $token->execute($context);
            },
            $this->tokens,
        );

        return \implode('', $parts);
    }

    /**
     * Get the parsed tokens.
     *
     * @internal
     *   For unit tests.
     */
    public function toArray(): array
    {
        return $this->tokens;
    }
}
Loading

0 comments on commit 7dbc7ac

Please sign in to comment.