diff --git a/src/Anonymization/Datasource/Context.php b/src/Anonymization/Datasource/Context.php
new file mode 100644
index 00000000..d7b3d041
--- /dev/null
+++ b/src/Anonymization/Datasource/Context.php
@@ -0,0 +1,38 @@
+datasources[$datasource->getName()] = $datasource;
+        }
+    }
+
+    /**
+     * Get a single datasource by name.
+     *
+     * @throws DatasourceException
+     *   When no datasource is registered under the given name.
+     */
+    public function getDatasource(string $name): Datasource
+    {
+        return $this->datasources[$name] ?? throw new DatasourceException(\sprintf("Datasource '%s' does not exist.", $name));
+    }
+
+    /**
+     * Whether a datasource is registered under the given name.
+     */
+    public function hasDatasource(string $name): bool
+    {
+        return \array_key_exists($name, $this->datasources);
+    }
+}
diff --git a/src/Anonymization/Datasource/Datasource.php b/src/Anonymization/Datasource/Datasource.php
new file mode 100644
index 00000000..eb871acd
--- /dev/null
+++ b/src/Anonymization/Datasource/Datasource.php
@@ -0,0 +1,68 @@
+name;
+    }
+
+    /**
+     * Get a random value.
+     *
+     * @return string|array
+     */
+    abstract public function random(Context $context): string|array;
+
+    /**
+     * Get an iterator over all values.
+     *
+     * @return iterable<string|array>
+     */
+    abstract public function iterator(Context $context): iterable;
+
+    /**
+     * Raise an error, prefixed with this datasource name.
+     *
+     * When a \Throwable is given, its message is reused and it is attached
+     * as the previous exception.
+     *
+     * @throws DatasourceException
+     *   Always.
+     */
+    protected function throwError(string|\Throwable $error): never
+    {
+        $prefix = \sprintf('Datasource "%s": ', $this->name);
+
+        if ($error instanceof \Throwable) {
+            throw new DatasourceException($prefix . $error->getMessage(), 0, $error);
+        }
+        throw new DatasourceException($prefix . $error);
+    }
+}
diff --git a/src/Anonymization/Datasource/EnumDatasource.php b/src/Anonymization/Datasource/EnumDatasource.php
new file mode 100644
index 00000000..10f5617e
--- /dev/null
+++ b/src/Anonymization/Datasource/EnumDatasource.php
@@ -0,0 +1,170 @@
+data = $source;
+        } else {
+            $this->filename = $source;
+        }
+
+        // Expressions may be given either raw (string) or already parsed.
+        foreach (\array_values($expressions) as $number => $expression) {
+            if (\is_string($expression)) {
+                $this->expressions[] = new Expression($expression, $name, $number);
+            } elseif ($expression instanceof Expression) {
+                $this->expressions[] = $expression;
+            } else {
+                $this->throwError(\sprintf("expression #%d is not a string nor a '%s' instance.", $number, Expression::class));
+            }
+        }
+    }
+
+    /**
+     * Get a random value: when expressions are registered, one of them is
+     * picked at random and executed, otherwise a raw item is returned.
+     */
+    #[\Override]
+    public function random(Context $context): string|array
+    {
+        if ($this->expressions) {
+            $expression = $this->expressions[\rand(0, \count($this->expressions) - 1)];
+            \assert($expression instanceof Expression);
+
+            return $expression->execute($context);
+        }
+
+        return $this->rawRandom();
+    }
+
+    /**
+     * Iterate over raw items, without expression handling.
+     */
+    #[\Override]
+    public function iterator(Context $context): iterable
+    {
+        return (function () {
+            // Data is loaded lazily: without this call a file-backed
+            // datasource would attempt to "yield from null".
+            $this->initialize();
+
+            yield from $this->data;
+        })();
+    }
+
+    /**
+     * Count raw items.
+     */
+    #[\Override]
+    public function count(): int
+    {
+        $this->initialize();
+
+        return \count($this->data);
+    }
+
+    /**
+     * Get the item at the given position in the data list, without
+     * expression handling.
+     *
+     * @internal
+     *   This is being used in unit tests.
+     *
+     * @throws DatasourceException
+     *   When there is no item at the given position.
+     */
+    public function rawAt(int $position = 0): string
+    {
+        $this->initialize();
+
+        return $this->data[$position] ?? $this->throwError(\sprintf("there is no value at position %d.", $position));
+    }
+
+    /**
+     * Get a random item from the data list, without expression handling.
+     *
+     * @internal
+     *   This is being used in the Expression class.
+     * @see Expression
+     *
+     * @throws DatasourceException
+     *   When the data list is empty.
+     */
+    public function rawRandom(): string
+    {
+        $this->initialize();
+
+        // Guard against rand(0, -1) and the undefined offset that follows.
+        if (!$this->data) {
+            $this->throwError("cannot pick a random value, data list is empty.");
+        }
+
+        return $this->data[\rand(0, \count($this->data) - 1)];
+    }
+
+    /**
+     * Internal values initialization.
+     *
+     * Lazily loads values from the configured file on first use. Does
+     * nothing when data is already loaded. Empty strings are skipped, any
+     * other string (including "0") is kept.
+     *
+     * @throws DatasourceException
+     *   When there is neither data nor filename, when the file does not
+     *   exist, has an unsupported extension, or contains a non-string value.
+     */
+    private function initialize(): void
+    {
+        if (null !== $this->data) {
+            return;
+        }
+
+        if (null === $this->filename) {
+            $this->throwError("was initialized without data nor filename.");
+        }
+        if (!\file_exists($this->filename)) {
+            $this->throwError(\sprintf("file '%s': does not exist.", $this->filename));
+        }
+
+        // pathinfo() only looks at the file name, hence dots in directory
+        // names are ignored. A missing extension defaults to plain text.
+        $ext = \strtolower(\pathinfo($this->filename, \PATHINFO_EXTENSION));
+        if ('' === $ext) {
+            $ext = 'txt';
+        }
+
+        $source = match ($ext) {
+            'js', 'json' => $this->parseJsonFile($this->filename),
+            'txt' => $this->parseTextFile($this->filename),
+            default => $this->throwError(\sprintf("file '%s': unsupported file format '%s'.", $this->filename, $ext)),
+        };
+
+        // Collect into a local list and publish it only once parsing fully
+        // succeeded, otherwise a failure would leave a partial list that
+        // later calls would consider as being initialized.
+        $data = [];
+        foreach ($source as $line => $item) {
+            if (!\is_string($item)) {
+                $this->throwError(\sprintf("file '%s': line #%s is not a valid value.", $this->filename, $line));
+            }
+            // Strict comparison: empty() would also drop the value "0".
+            if ('' === $item) {
+                // @todo log error?
+                continue;
+            }
+            $data[] = $item;
+        }
+
+        $this->data = $data;
+    }
+
+    /**
+     * Parse data from a JSON file.
+     *
+     * The file must contain a JSON array or object whose values are all
+     * strings. Items are yielded keyed by their 1-based position.
+     *
+     * @throws DatasourceException
+     *   When the file cannot be read or does not decode to an array, and
+     *   during iteration when an item is not a string.
+     */
+    private function parseJsonFile(string $filename): iterable
+    {
+        $contents = \file_get_contents($filename);
+
+        if (false === $contents) {
+            $this->throwError(\sprintf("file '%s': could not read file.", $filename));
+        }
+
+        $list = \json_decode($contents, true);
+
+        if (!\is_array($list)) {
+            $this->throwError(\sprintf("file '%s': does not contain valid JSON.", $filename));
+        }
+
+        return (function () use ($list, $filename) {
+            $count = 1;
+            foreach ($list as $value) {
+                if (!\is_string($value)) {
+                    $this->throwError(\sprintf("file '%s': item #%s is not a string.", $filename, $count));
+                }
+                yield $count => $value;
+                $count++;
+            }
+        })();
+    }
+
+    /**
+     * Parse data from a text file.
+     *
+     * Each line is trimmed and yielded keyed by its 1-based line number.
+     *
+     * @throws DatasourceException
+     *   When the file cannot be opened for reading.
+     */
+    private function parseTextFile(string $filename): iterable
+    {
+        if (!$handle = \fopen($filename, 'r')) {
+            $this->throwError(\sprintf("file '%s': could not open file for reading.", $this->filename));
+        }
+
+        return (function () use ($handle) {
+            try {
+                $count = 1;
+                // Compare against false explicitly: a last line containing
+                // only "0" is falsy and would otherwise end the loop early.
+                while (false !== ($value = \fgets($handle))) {
+                    yield $count => \trim($value);
+                    $count++;
+                }
+            } finally {
+                @\fclose($handle);
+            }
+        })();
+    }
+}
diff --git a/src/Anonymization/Datasource/Expression.php b/src/Anonymization/Datasource/Expression.php
new file mode 100644
index 00000000..de2e5178
--- /dev/null
+++ b/src/Anonymization/Datasource/Expression.php
@@ -0,0 +1,52 @@
+tokens = (new Parser($raw, $datasource, $number))->parse();
+    }
+
+    /**
+     * Execute given expression over the given context.
+     *
+     * Each token is executed in order and results are concatenated.
+     */
+    public function execute(Context $context): ?string
+    {
+        $ret = '';
+        foreach ($this->tokens as $token) {
+            \assert($token instanceof Token);
+            $ret .= $token->execute($context);
+        }
+        return $ret;
+    }
+
+    /**
+     * Get parsed tokens.
+     *
+     * @internal
+     *   For unit tests.
+     */
+    public function toArray(): array
+    {
+        return $this->tokens;
+    }
+}
diff --git a/src/Anonymization/Datasource/Expression/Parser.php b/src/Anonymization/Datasource/Expression/Parser.php
new file mode 100644
index 00000000..e6c91eb3
--- /dev/null
+++ b/src/Anonymization/Datasource/Expression/Parser.php
@@ -0,0 +1,145 @@
+length = \strlen($raw);
+    }
+
+    /**
+     * Parse expression string.
+     *
+     * Recognized constructs are "{{name}}" (datasource reference) and
+     * "[min,max]" or "[min;max]" (integer range); everything else is
+     * literal text. A single "{" is literal text.
+     *
+     * @return Token[]
+     *
+     * @throws DatasourceException
+     *   When a reference or a range is malformed or unterminated.
+     */
+    public function parse(): array
+    {
+        $tokens = [];
+        $text = '';
+        $startOffset = 0;
+
+        while (true) {
+            try {
+                $cur = $this->nextChar();
+            } catch (\OutOfBoundsException) { // End of text.
+                // Strict comparison: a literal "0" is valid text.
+                if ('' !== $text) {
+                    $tokens[] = new Text($this->datasource, $this->number, $startOffset, $text);
+                }
+                return $tokens;
+            }
+
+            if ('{' === $cur) {
+                // Look ahead without consuming: a lone "{" must neither
+                // swallow the next character nor read past the end.
+                if ('{' === $this->peekChar()) {
+                    $this->offset++;
+                    if ('' !== $text) {
+                        $tokens[] = new Text($this->datasource, $this->number, $startOffset, $text);
+                        $text = '';
+                    }
+                    $startOffset = $this->offset - 1; // parseRef() shifts the offset.
+                    $tokens[] = new Reference($this->datasource, $this->number, $startOffset, $this->parseRef());
+                    $startOffset = $this->offset + 1;
+                } else {
+                    $text .= $cur;
+                }
+            } elseif ('[' === $cur) {
+                if ('' !== $text) {
+                    $tokens[] = new Text($this->datasource, $this->number, $startOffset, $text);
+                    $text = '';
+                }
+                $startOffset = $this->offset; // parseRange() shifts the offset.
+                $tokens[] = new Range($this->datasource, $this->number, $startOffset, ...$this->parseRange());
+                $startOffset = $this->offset + 1;
+            } else {
+                $text .= $cur;
+            }
+        }
+    }
+
+    /**
+     * Raise an error, prefixed with datasource name, expression number and
+     * offset (current offset when none is given).
+     */
+    private function throwError(string|\Throwable $error, ?int $offset = null): never
+    {
+        $prefix = \sprintf('Datasource "%s" expression #%d at offset %d: ', $this->datasource, $this->number, $offset ?? $this->offset);
+        if ($error instanceof \Throwable) {
+            throw new DatasourceException($prefix . $error->getMessage(), 0, $error);
+        }
+        throw new DatasourceException($prefix . $error);
+    }
+
+    /**
+     * Advance by one character and return it.
+     *
+     * @throws \OutOfBoundsException
+     *   When end of text is reached, used for flow control.
+     */
+    private function nextChar(): string
+    {
+        if ($this->offset >= ($this->length - 1)) {
+            throw new \OutOfBoundsException(); // Flow control.
+        }
+        return $this->raw[++$this->offset];
+    }
+
+    /**
+     * Get the next character without advancing, null at end of text.
+     */
+    private function peekChar(): ?string
+    {
+        return $this->raw[$this->offset + 1] ?? null;
+    }
+
+    /**
+     * Parse an integer range, current offset being on the opening "[".
+     *
+     * @return int[]
+     *   Two values, lowest first.
+     */
+    private function parseRange(): array
+    {
+        $start = $this->offset;
+
+        try {
+            $min = $this->parseInt();
+            if (!\in_array($this->nextChar(), [',', ';'], true)) {
+                $this->throwError("invalid integer range.");
+            }
+            $max = $this->parseInt();
+            if (']' !== $this->nextChar()) {
+                $this->throwError("invalid integer range.");
+            }
+        } catch (\OutOfBoundsException) {
+            // End of text reached before the closing "]".
+            $this->throwError("unterminated integer range.", $start);
+        }
+
+        // @phpstan-ignore-next-line
+        return $min < $max ? [$min, $max] : [$max, $min];
+    }
+
+    /**
+     * Parse an integer with an optional leading sign, leaving the offset on
+     * its last digit.
+     */
+    private function parseInt(): int
+    {
+        $ret = '';
+        $negative = false;
+        while (true) {
+            $next = $this->nextChar();
+            if (\ctype_digit($next)) {
+                $ret .= $next;
+            } elseif ('' !== $ret) {
+                // Strict comparison: "0" is falsy yet is a valid integer.
+                $this->offset--;
+                return $negative ? (0 - \intval($ret)) : \intval($ret);
+            } elseif ('+' === $next) { // Positive integer.
+            } elseif ('-' === $next) { // Negative integer.
+                $negative = true;
+            } else {
+                $this->throwError("value is not a valid integer.");
+            }
+        }
+    }
+
+    /**
+     * Parse a reference name, current offset being on the second "{".
+     */
+    private function parseRef(): string
+    {
+        $start = $this->offset - 1;
+        $ret = '';
+
+        try {
+            while (true) {
+                $cur = $this->nextChar();
+                if ($cur === '}') {
+                    $next = $this->nextChar();
+                    if ($next === '}') {
+                        // End of reference.
+                        return $ret;
+                    } else {
+                        $ret .= $cur . $next;
+                    }
+                } else {
+                    $ret .= $cur;
+                }
+            }
+        } catch (\OutOfBoundsException) {
+            // End of text reached before the closing "}}".
+            $this->throwError("unterminated reference.", $start);
+        }
+    }
+}
diff --git a/src/Anonymization/Datasource/Expression/Range.php b/src/Anonymization/Datasource/Expression/Range.php
new file mode 100644
index 00000000..b9befba7
--- /dev/null
+++ b/src/Anonymization/Datasource/Expression/Range.php
@@ -0,0 +1,26 @@
+min, $this->max);
+    }
+}
diff --git a/src/Anonymization/Datasource/Expression/Reference.php b/src/Anonymization/Datasource/Expression/Reference.php
new file mode 100644
index 00000000..8ca23d77
--- /dev/null
+++ b/src/Anonymization/Datasource/Expression/Reference.php
@@ -0,0 +1,45 @@
+getDatasource($this->referenced);
+
+            if ($this->referenced === $this->datasource) {
+                if (!$datasource instanceof EnumDatasource) {
+                    $this->throwError("referenced datasource is not an enum");
+                }
+                if (!$datasource->count()) {
+                    $this->throwError("referenced datasource is empty");
+                }
+
+                return $datasource->rawRandom();
+            }
+
+            return $datasource->random($context);
+
+        } catch (\Throwable $e) {
+            $this->throwError($e);
+        }
+    }
+}
diff --git a/src/Anonymization/Datasource/Expression/Text.php b/src/Anonymization/Datasource/Expression/Text.php
new file mode 100644
index 00000000..1a2f57aa
--- /dev/null
+++ b/src/Anonymization/Datasource/Expression/Text.php
@@ -0,0
+1,25 @@
+text;
+    }
+}
diff --git a/src/Anonymization/Datasource/Expression/Token.php b/src/Anonymization/Datasource/Expression/Token.php
new file mode 100644
index 00000000..94956a60
--- /dev/null
+++ b/src/Anonymization/Datasource/Expression/Token.php
@@ -0,0 +1,32 @@
+datasource, $this->expression, $this->offset);
+
+        // A \Throwable is wrapped: its message is reused and it is attached
+        // as the previous exception.
+        if ($error instanceof \Throwable) {
+            throw new DatasourceException($prefix . $error->getMessage(), 0, $error);
+        }
+        throw new DatasourceException($prefix . $error);
+    }
+}
diff --git a/src/Anonymization/Datasource/MultipleColumnDatasource.php b/src/Anonymization/Datasource/MultipleColumnDatasource.php
new file mode 100644
index 00000000..f7582efe
--- /dev/null
+++ b/src/Anonymization/Datasource/MultipleColumnDatasource.php
@@ -0,0 +1,29 @@
+ */
+        array $columns,
+    ) {
+        parent::__construct($name);
+
+        // Columns are re-indexed from 0 and each one must be a string;
+        // an empty list is rejected.
+        if ($columns) {
+            foreach (\array_values($columns) as $index => $column) {
+                if (!\is_string($column)) {
+                    $this->throwError(\sprintf("column %d is not a string", $index));
+                }
+                $this->columns[$index] = $column;
+            }
+        } else {
+            $this->throwError("columns cannot be empty");
+        }
+    }
+}
diff --git a/src/Anonymization/Datasource/MultipleColumnFixedDatasource.php b/src/Anonymization/Datasource/MultipleColumnFixedDatasource.php
new file mode 100644
index 00000000..1bd56803
--- /dev/null
+++ b/src/Anonymization/Datasource/MultipleColumnFixedDatasource.php
@@ -0,0 +1,14 @@
+rawAt(0));
+        self::assertSame('Coraline', $datasource->rawAt(1));
+        self::assertSame(2, $datasource->count());
+    }
+
+    /**
+     * A missing file is only reported once data is first accessed.
+     */
+    public function testCreateWithNonExistingFileError(): void
+    {
+        $datasource = new EnumDatasource('foo', $this->getFilename('non_existing_file.txt'));
+
+        self::expectExceptionMessageMatches('/Datasource .* file .*: does not exist./');
+        $datasource->rawRandom();
+    }
+
+    /**
+     * NOTE(review): despite its name, this test builds the datasource from
+     * an inline array, not from a JSON file.
+     */
+    public function testCreateWithJson(): void
+    {
+        $datasource = new EnumDatasource('foo', ['Mathieu', 'Coraline']);
+
+        self::assertSame('Mathieu', $datasource->rawAt(0));
+        self::assertSame('Coraline', $datasource->rawAt(1));
+        self::assertSame(2, $datasource->count());
+    }
+
+    /**
+     * A JSON file holding a non-string item is rejected on first access.
+     */
+    public function testCreateWithJsonError(): void
+    {
+        $datasource = new EnumDatasource('foo', $this->getFilename('invalid.json'));
+
+        self::expectExceptionMessageMatches('/Datasource .* file .*: item #2 is not a string./');
+        $datasource->rawRandom();
+    }
+
+    /**
+     * A text file is loaded one value per line.
+     */
+    public function testCreateWithText(): void
+    {
+        $datasource = new EnumDatasource('foo', $this->getFilename('firstname.txt'));
+
+        self::assertSame('Robert', $datasource->rawAt(0));
+        self::assertSame('Arletta', $datasource->rawAt(6));
+        self::assertSame(7, $datasource->count());
+    }
+}
diff --git a/tests/Unit/Anonymization/Datasource/ExpressionTest.php b/tests/Unit/Anonymization/Datasource/ExpressionTest.php
new file mode 100644
index 00000000..aa0be300
--- /dev/null
+++ b/tests/Unit/Anonymization/Datasource/ExpressionTest.php
@@ -0,0 +1,113 @@
+getDatasource('bar')->random($context),
+        );
+    }
+
+    /**
+     * Build an expression bound to an arbitrary datasource name and number,
+     * which are then expected in every parsed token.
+     */
+    protected function createArbitraryExpression(string $raw): Expression
+    {
+        return new Expression($raw, 'arbitrary_datasource', 666);
+    }
+
+    /**
+     * Literal text followed by a single reference.
+     */
+    public function testParseDatasourceFetch(): void
+    {
+        $expression = $this->createArbitraryExpression('Fetched: {{foo}}');
+
+        self::assertEquals(
+            [
+                new Text('arbitrary_datasource', 666, 0, 'Fetched: '),
+                new Reference('arbitrary_datasource', 666, 9, 'foo'),
+            ],
+            $expression->toArray(),
+        );
+    }
+
+    /**
+     * Range with explicit signs and ";" as separator.
+     */
+    public function testParseRange(): void
+    {
+        $expression = $this->createArbitraryExpression('[-5;+734]');
+
+        self::assertEquals(
+            [
+                new Range('arbitrary_datasource', 666, 0, -5, 734),
+            ],
+            $expression->toArray(),
+        );
+    }
+
+    /**
+     * Range bounds given highest first are swapped.
+     */
+    public function testParseInversedRange(): void
+    {
+        $expression = $this->createArbitraryExpression('[5;-14]');
+
+        self::assertEquals(
+            [
+                new Range('arbitrary_datasource', 666, 0, -14, 5),
+            ],
+            $expression->toArray(),
+        );
+    }
+
+    /**
+     * Two references separated by literal text.
+     */
+    public function testParseManyDatasourceFetch(): void
+    {
+        $expression = $this->createArbitraryExpression('{{foo}} is {{bar}}');
+
+        self::assertEquals(
+            [
+                new Reference('arbitrary_datasource', 666, 0, 'foo'),
+                new Text('arbitrary_datasource', 666, 7, ' is '),
+                new Reference('arbitrary_datasource', 666, 11, 'bar'),
+            ],
+            $expression->toArray(),
+        );
+    }
+
+    /**
+     * Ranges, references and literal text mixed in a single expression;
+     * the third constructor argument is each token's start offset.
+     */
+    public function testParseAllInOneFetch(): void
+    {
+        $expression = $this->createArbitraryExpression('[12,134]{{foo}} -> {{bar}}@[1,2] {{bla}}');
+
+        self::assertEquals(
+            [
+                new Range('arbitrary_datasource', 666, 0, 12, 134),
+                new Reference('arbitrary_datasource', 666, 8, 'foo'),
+                new Text('arbitrary_datasource', 666, 15, ' -> '),
+                new Reference('arbitrary_datasource', 666, 19, 'bar'),
+                new Text('arbitrary_datasource', 666, 26, '@'),
+                new Range('arbitrary_datasource', 666, 27, 1, 2),
+                new Text('arbitrary_datasource', 666, 32, ' '),
+                new Reference('arbitrary_datasource', 666, 33, 'bla'),
+            ],
+            $expression->toArray(),
+        );
+    }
+}