More tests, filtering iterator, cleanup

This commit is contained in:
Chris 2022-10-31 02:58:34 +01:00
parent d0956f851c
commit fd2767b642
14 changed files with 340 additions and 21 deletions

View File

@ -45,3 +45,39 @@ foreach ($reader as $row) {
// row is an array // row is an array
} }
``` ```
# Documentation
## DatasetManager
The `DatasetManager` will automatically locate and load datasets on startup.
```
getDataset(string $identifier): Dataset
Return a Dataset object, or throw exception on error
openDataset(string $identifier): Iterator
Return a reader for a Dataset, same as getDataset()->open()
getAvailableDatasets(): array
Returns the Dataset objects for all datasets found
```
## Dataset
```
open(): Iterator
Return an iterator to iterate over the data
filter(array|callable $condition): Iterator
Return an iterator that only returns rows matching filter
getIdentifier(): string
Return the dataset identifier (vendor/package#dataset)
getVersion(): string
Return the package version of the dataset
getPackageName(): string
Return the package name (vendor/package)
getDatasetName(): string
Return the dataset name (dataset)
getLicense(): ?string
Return the license for the dataset
getComment(): ?string
Return the dataset comment
```

View File

@ -3,13 +3,15 @@
foreach ([ foreach ([
__DIR__."/..", __DIR__."/..",
__DIR__."/../vendor", __DIR__."/../..",
__DIR__."/../../..", __DIR__."/../../..",
__DIR__."/vendor",
] as $dir) { ] as $dir) {
if (file_exists($dir."/autoload.php")) { if (file_exists($dir."/autoload.php")) {
define("COMPOSER_VENDOR_PATH", $dir); define("COMPOSER_VENDOR_PATH", $dir);
require_once $dir."/autoload.php"; require_once $dir."/autoload.php";
} elseif (file_exists($dir."/vendor/autoload.php")) {
define("COMPOSER_VENDOR_PATH", $dir."/vendor");
require_once $dir."/vendor/autoload.php";
} }
} }
@ -17,8 +19,20 @@ $datasetManager = new NoccyLabs\Dataset\DatasetManager();
$datasets = $datasetManager->getAvailableDatasets(); $datasets = $datasetManager->getAvailableDatasets();
/**
 * printf()-style output helper that drops ANSI colour sequences when STDOUT
 * is not a terminal (for example when the output is piped or redirected).
 *
 * @param string $fmt     sprintf() format string, may contain "\e[...m" sequences
 * @param mixed  ...$args Values substituted into the format string
 */
function _printf(string $fmt, ...$args): void {
    $out = sprintf($fmt, ...$args);
    // stream_isatty() ships with the PHP core; posix_isatty() is only defined
    // when ext-posix is loaded and would be a fatal error without it.
    if (!stream_isatty(STDOUT)) {
        // Strip SGR sequences such as "\e[33m" or "\e[36;1m"
        $out = preg_replace("<\e\\[[0-9;]+m>", "", $out);
    }
    echo $out;
}
foreach ($datasets as $dataset) { foreach ($datasets as $dataset) {
echo $dataset->getIdentifier()." (".$dataset->getVersion().")\n"; _printf("Identifier: \e[36m%s\e[35m#\e[36;1m%s\e[0m\n", $dataset->getPackageName(), $dataset->getDatasetName());
// $dataset->getIdentifier()."\n";
_printf(" Package: \e[33m%s\e[0m\n", $dataset->getPackageName());
_printf(" Dataset: \e[33m%s\e[0m\n", $dataset->getDatasetName());
_printf(" Version: \e[33m%s\e[0m\n", $dataset->getVersion());
$reader = $dataset->open(); $reader = $dataset->open();
$rows = 0; $rows = 0;
$headers = null; $headers = null;
@ -26,5 +40,6 @@ foreach ($datasets as $dataset) {
if (!$headers) $headers = array_keys($row); if (!$headers) $headers = array_keys($row);
$rows++; $rows++;
} }
echo " # ".$rows." rows\n - ".join("\n - ",$headers)."\n"; _printf(" Rows: \e[33m%d\e[0m\n", $rows);
_printf(" Fields: \e[32;1m%s\e[0m\n", join("\e[0m\n \e[32;1m",$headers));
} }

View File

@ -2,9 +2,15 @@
namespace NoccyLabs\Dataset; namespace NoccyLabs\Dataset;
use Iterator;
use NoccyLabs\Dataset\Readers\CsvReader; use NoccyLabs\Dataset\Readers\CsvReader;
use NoccyLabs\Dataset\Readers\JsonReader; use NoccyLabs\Dataset\Readers\JsonReader;
/**
*
*
*
*/
class Dataset class Dataset
{ {
protected string $packageName; protected string $packageName;
@ -15,48 +21,83 @@ class Dataset
protected array $options; protected array $options;
protected ?string $version; protected string $version;
/**
*
*
* @param string $identifier The identifier for the dataset (vendor/package#dataset)
* @param array $options Configured options for the dataset
* @param string|null $version The package version
*/
public function __construct(string $identifier, array $options, ?string $version=null) public function __construct(string $identifier, array $options, ?string $version=null)
{ {
$this->identifier = $identifier; $this->identifier = $identifier;
$this->options = $options; $this->options = $options;
$this->version = $version; $this->version = $version??"0.0.0.0";
[$this->packageName, $this->datasetName] = explode("#", $identifier, 2); [$this->packageName, $this->datasetName] = explode("#", $identifier, 2);
} }
/**
*
* @return string
*/
public function getIdentifier(): string public function getIdentifier(): string
{ {
return $this->identifier; return $this->identifier;
} }
/**
*
* @return string
*/
public function getPackageName(): string public function getPackageName(): string
{ {
return $this->packageName; return $this->packageName;
} }
/**
*
* @return string
*/
public function getDatasetName(): string public function getDatasetName(): string
{ {
return $this->datasetName; return $this->datasetName;
} }
public function getVersion(): ?string /**
*
* @return string
*/
public function getVersion(): string
{ {
return $this->version; return $this->version;
} }
/**
*
* @return string|null
*/
public function getComment(): ?string public function getComment(): ?string
{ {
return array_key_exists('comment', $this->options) ? $this->options['comment'] : null; return array_key_exists('comment', $this->options) ? $this->options['comment'] : null;
} }
/**
*
* @return string|null
*/
public function getLicense(): ?string public function getLicense(): ?string
{ {
return array_key_exists('license', $this->options) ? $this->options['license'] : null; return array_key_exists('license', $this->options) ? $this->options['license'] : null;
} }
public function open(): ReaderInterface /**
*
* @return Iterator
*/
public function open(): Iterator
{ {
$filename = $this->options['filename']; $filename = $this->options['filename'];
$reader = $this->determineReaderForFile($filename); $reader = $this->determineReaderForFile($filename);
@ -64,6 +105,11 @@ class Dataset
return $inst; return $inst;
} }
/**
*
* @param string $filename
* @return string
*/
private function determineReaderForFile(string $filename): string private function determineReaderForFile(string $filename): string
{ {
if ($reader = $this->options['reader']??null) { if ($reader = $this->options['reader']??null) {

View File

@ -2,6 +2,8 @@
namespace NoccyLabs\Dataset; namespace NoccyLabs\Dataset;
use Iterator;
/** /**
* DatasetManager is the central class of noccylabs/dataset. * DatasetManager is the central class of noccylabs/dataset.
* *
@ -36,11 +38,11 @@ class DatasetManager
* Directly return a reader for a specific dataset. * Directly return a reader for a specific dataset.
* *
* @param string $identifier The dataset identifier * @param string $identifier The dataset identifier
* @return ReaderInterface A reader for the data * @return Iterator A reader for the data
* @throws InvalidDatasetException if the dataset can not be opened * @throws InvalidDatasetException if the dataset can not be opened
* @throws UnknownDatasetException if the dataset does not exist * @throws UnknownDatasetException if the dataset does not exist
*/ */
public function openDataset(string $identifier): ReaderInterface public function openDataset(string $identifier): Iterator
{ {
return $this->getDataset($identifier)->open(); return $this->getDataset($identifier)->open();
} }
@ -157,6 +159,11 @@ class DatasetManager
return null; return null;
} }
/**
* Register a dataset
*
*
*/
public function registerDataset(Dataset $dataset) public function registerDataset(Dataset $dataset)
{ {
$id = $dataset->getIdentifier(); $id = $dataset->getIdentifier();

View File

@ -0,0 +1,92 @@
<?php
namespace NoccyLabs\Dataset;
use Iterator;
/**
 * Iterator decorator that only exposes the rows of the wrapped reader that
 * match a filter.
 *
 * The filter is either a callable that receives the row and returns a truthy
 * value for rows to keep, or an array mapping field names to a condition
 * (see matchCondition() for the supported forms).
 */
class FilteringReaderIterator implements Iterator
{
    /** @var Iterator The wrapped reader supplying the unfiltered rows */
    private Iterator $reader;

    /** @var array|callable The filter; left untyped as callable is not a valid property type */
    private $condition;

    /**
     * @param Iterator $reader The reader to filter
     * @param array|callable $condition Field conditions, or a predicate called with each row
     */
    public function __construct(Iterator $reader, array|callable $condition)
    {
        $this->reader = $reader;
        $this->condition = $condition;
    }

    /**
     * @return mixed The current row of the wrapped reader
     */
    public function current(): mixed
    {
        return $this->reader->current();
    }

    /**
     * @return mixed The key of the current row in the wrapped reader (keys are not renumbered)
     */
    public function key(): mixed
    {
        return $this->reader->key();
    }

    /**
     * @return bool True while the wrapped reader points at a row
     */
    public function valid(): bool
    {
        return $this->reader->valid();
    }

    /**
     * Advance to the next row that matches the filter.
     */
    public function next(): void
    {
        $this->reader->next();
        $this->skipToMatch();
    }

    /**
     * Rewind the wrapped reader and position it on the first matching row.
     */
    public function rewind(): void
    {
        $this->reader->rewind();
        $this->skipToMatch();
    }

    /**
     * Move the wrapped reader forward until it points at a matching row or
     * is exhausted. Does nothing if the current row already matches.
     */
    private function skipToMatch(): void
    {
        while ($this->reader->valid() && !$this->matchCondition($this->reader->current())) {
            $this->reader->next();
        }
    }

    /**
     * Test the condition against a row
     *
     * With a callable condition the row matches when the callable returns a
     * truthy value. With an array condition every listed field must match;
     * fields that do not exist in the row are skipped rather than treated as
     * a mismatch. Comparisons are loose (==, <, >, non-strict in_array).
     *
     * The matched conditions are:
     *  - string/bool/int/float - match value
     *  - array - all conditions in array must match:
     *     - eq   equals
     *     - neq  not equals
     *     - gt   greater than
     *     - gte  greater than or equal
     *     - lt   less than
     *     - lte  less than or equal
     *     - in   value in array
     *     - nin  value not in array
     *
     * @param array $row The row to test
     * @return bool True if the row passes the filter
     */
    private function matchCondition(array $row): bool
    {
        if (is_callable($this->condition)) {
            return (bool)($this->condition)($row);
        }

        foreach ($this->condition as $field => $test) {
            if (!array_key_exists($field, $row)) {
                continue;
            }
            $value = $row[$field];

            if (!is_array($test)) {
                if ($value != $test) {
                    return false;
                }
                continue;
            }

            if (array_key_exists('eq', $test) && $value != $test['eq']) return false;
            if (array_key_exists('neq', $test) && $value == $test['neq']) return false;
            if (array_key_exists('gt', $test) && $value <= $test['gt']) return false;
            if (array_key_exists('gte', $test) && $value < $test['gte']) return false;
            if (array_key_exists('lt', $test) && $value >= $test['lt']) return false;
            if (array_key_exists('lte', $test) && $value > $test['lte']) return false;
            if (array_key_exists('in', $test) && !in_array($value, $test['in'])) return false;
            // Must read the 'nin' list here, not the 'in' list
            if (array_key_exists('nin', $test) && in_array($value, $test['nin'])) return false;
        }

        return true;
    }
}

View File

@ -45,12 +45,16 @@ class CsvReader implements ReaderInterface
private function loadData(array $data) private function loadData(array $data)
{ {
// FIXME parse data according to directives if present $separator = $this->options['separator']??',';
$enclosure = $this->options['enclosure']??'"';
$escape = $this->options['escape']??"\\";
$head = str_getcsv(array_shift($data)); $head = str_getcsv(array_shift($data));
$this->data = []; $this->data = [];
foreach ($data as $row) { foreach ($data as $row) {
if ($row) { if ($row) {
$row = str_getcsv($row); $row = str_getcsv($row, $separator, $enclosure, $escape);
$this->data[] = array_combine($head, $row); $this->data[] = array_combine($head, $row);
} }
} }

View File

@ -35,8 +35,10 @@ class JsonReader implements ReaderInterface
//printf("Reached end of set at slice=%d\n", $this->currentFile); //printf("Reached end of set at slice=%d\n", $this->currentFile);
return; return;
} }
$flags = ($this->options['bigintAsString']??false)?JSON_BIGINT_AS_STRING:0;
$file = $this->files[$this->currentFile]; $file = $this->files[$this->currentFile];
$json = @json_decode(@file_get_contents($file), true); $json = @json_decode(@file_get_contents($file), true, flags:$flags);
$this->loadData($json); $this->loadData($json);
$this->loadedFile = $this->currentFile; $this->loadedFile = $this->currentFile;

View File

@ -1,7 +0,0 @@
{
"datasets": {
"baz": {
"filename": "data/test.csv"
}
}
}

View File

@ -16,6 +16,6 @@ class DatasetManagerTest extends \PHPUnit\Framework\TestCase
$dm = new DatasetManager(); $dm = new DatasetManager();
$sets = $dm->getAvailableDatasets(); $sets = $dm->getAvailableDatasets();
$this->assertEquals(1, count($sets), "Expected 1 loaded set"); $this->assertEquals(2, count($sets), "Expected 2 loaded set");
} }
} }

View File

@ -0,0 +1,72 @@
<?php
namespace NoccyLabs\Dataset;
use ArrayIterator;
use Iterator;
/**
 * Tests for FilteringReaderIterator using an in-memory ArrayIterator as the
 * wrapped reader.
 */
class FilteringReaderIteratorTest extends \PHPUnit\Framework\TestCase
{
    /**
     * A scalar condition keeps only the rows whose field equals the value.
     *
     * @covers \NoccyLabs\Dataset\FilteringReaderIterator
     */
    public function testSimpleFiltering()
    {
        $arr = iterator_to_array(
            new FilteringReaderIterator(
                $this->getTestIterator(), [
                    "alpha" => 3
                ]));

        $this->assertCount(1, $arr);
        // Keys of the wrapped iterator are preserved, so the third row stays at index 2
        $this->assertEquals(3, $arr[2]['alpha']);
        $this->assertEquals('red', $arr[2]['beta']);
        $this->assertNull($arr[2]['gamma']);
    }

    /**
     * 'gt' and 'lt' together select an exclusive range (alpha 3 and 4).
     *
     * @covers \NoccyLabs\Dataset\FilteringReaderIterator
     */
    public function testFilteringOnRange()
    {
        $arr = iterator_to_array(
            new FilteringReaderIterator(
                $this->getTestIterator(), [
                    "alpha" => [ 'gt' => 2, 'lt' => 5 ]
                ]));

        $this->assertCount(2, $arr);
    }

    /**
     * 'lte' includes the upper bound (alpha 3, 4 and 5).
     *
     * @covers \NoccyLabs\Dataset\FilteringReaderIterator
     */
    public function testFilteringOnRangeInclusive()
    {
        $arr = iterator_to_array(
            new FilteringReaderIterator(
                $this->getTestIterator(), [
                    "alpha" => [ 'gt' => 2, 'lte' => 5 ]
                ]));

        $this->assertCount(3, $arr);
    }

    /**
     * @return array Five fixture rows with the fields alpha, beta and gamma
     */
    private function getTestData(): array
    {
        return [
            [ 'alpha' => 1, 'beta' => 'green', 'gamma' => true ],
            [ 'alpha' => 2, 'beta' => 'blue', 'gamma' => false ],
            [ 'alpha' => 3, 'beta' => 'red', 'gamma' => null ],
            [ 'alpha' => 4, 'beta' => 'yellow', 'gamma' => false ],
            [ 'alpha' => 5, 'beta' => 'pink', 'gamma' => true ],
        ];
    }

    /**
     * @return Iterator The fixture rows wrapped in an ArrayIterator
     */
    private function getTestIterator(): Iterator
    {
        return new ArrayIterator($this->getTestData());
    }
}

View File

@ -0,0 +1,29 @@
<?php
namespace NoccyLabs\Dataset\Readers;
/**
 * Tests for CsvReader against the CSV fixture data.
 */
class CsvReaderTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Reading one CSV file by its exact path yields the five data rows.
     *
     * @covers \NoccyLabs\Dataset\Readers\CsvReader
     */
    public function testReadingSingleFile()
    {
        $reader = new CsvReader(__DIR__."/../../test/test/data/data.csv", []);
        $arr = iterator_to_array($reader);
        $this->assertCount(5, $arr, "Expected 5 items");
    }

    /**
     * Reading through a wildcard path yields the same five rows.
     * NOTE(review): presumably the reader expands the pattern itself - confirm against CsvReader.
     *
     * @covers \NoccyLabs\Dataset\Readers\CsvReader
     */
    public function testReadingSplitFile()
    {
        $reader = new CsvReader(__DIR__."/../../test/test/data/*.csv", []);
        $arr = iterator_to_array($reader);
        $this->assertCount(5, $arr, "Expected 5 items");
    }
}

View File

@ -0,0 +1,6 @@
"a";"b"
1;a
2;b
3;c
4;d
5;e
1 a b
2 1 a
3 2 b
4 3 c
5 4 d
6 5 e

View File

@ -0,0 +1,7 @@
[
{ "a": 1, "b": "a" },
{ "a": 2, "b": "b" },
{ "a": 3, "b": "c" },
{ "a": 4, "b": "d" },
{ "a": 5, "b": "e" }
]

View File

@ -0,0 +1,10 @@
{
"datasets": {
"csv": {
"filename": "data/data.csv"
},
"json": {
"filename": "data/data.json"
}
}
}