More tests, filtering iterator, cleanup
This commit is contained in:
parent
d0956f851c
commit
fd2767b642
36
README.md
36
README.md
@ -45,3 +45,39 @@ foreach ($reader as $row) {
|
||||
// row is an array
|
||||
}
|
||||
```
|
||||
|
||||
# Documentation
|
||||
|
||||
## DatasetManager
|
||||
|
||||
The `DatasetManager` will automatically locate and load datasets on startup.
|
||||
|
||||
```
|
||||
getDataset(string $identifier): Dataset
|
||||
Return a Dataset object, or throw an exception on error
|
||||
openDataset(string $identifier): Iterator
|
||||
Return a reader for a Dataset, same as getDataset()->open()
|
||||
getAvailableDatasets(): array
|
||||
Returns the Dataset objects for all datasets found
|
||||
```
|
||||
|
||||
## Dataset
|
||||
|
||||
```
|
||||
open(): Iterator
|
||||
Return an iterator to iterate over the data
|
||||
filter(array|callable $condition): Iterator
|
||||
Return an iterator that only returns rows matching filter
|
||||
getIdentifier(): string
|
||||
Return the dataset identifier (vendor/package#dataset)
|
||||
getVersion(): string
|
||||
Return the package version of the dataset
|
||||
getPackageName(): string
|
||||
Return the package name (vendor/package)
|
||||
getDatasetName(): string
|
||||
Return the dataset name (dataset)
|
||||
getLicense(): ?string
|
||||
Return the license for the dataset
|
||||
getComment(): ?string
|
||||
Return the dataset comment
|
||||
```
|
@ -3,13 +3,15 @@
|
||||
|
||||
// Locate Composer's autoloader. The candidates cover running from a checkout
// of this package as well as running as an installed dependency. Each
// candidate is tried both as the vendor directory itself and as the parent
// of a vendor directory.
//
// The search stops at the first hit: COMPOSER_VENDOR_PATH can only be
// defined once, and several candidates may resolve to the same (or another)
// vendor directory.
foreach ([
    __DIR__."/..",
    __DIR__."/../vendor",
    __DIR__."/../..",
    __DIR__."/../../..",
    __DIR__."/vendor",
] as $dir) {
    if (file_exists($dir."/autoload.php")) {
        define("COMPOSER_VENDOR_PATH", $dir);
        require_once $dir."/autoload.php";
        break;
    }
    if (file_exists($dir."/vendor/autoload.php")) {
        define("COMPOSER_VENDOR_PATH", $dir."/vendor");
        require_once $dir."/vendor/autoload.php";
        break;
    }
}
|
||||
|
||||
@ -17,8 +19,20 @@ $datasetManager = new NoccyLabs\Dataset\DatasetManager();
|
||||
|
||||
$datasets = $datasetManager->getAvailableDatasets();
|
||||
|
||||
/**
 * printf()-style output that drops ANSI color sequences when stdout is not
 * a terminal, so piped or redirected output stays free of escape codes.
 *
 * @param string $fmt     A sprintf() format string, may contain "\e[...m" sequences
 * @param mixed  ...$args Values for the placeholders in $fmt
 */
function _printf(string $fmt, ...$args): void
{
    $out = sprintf($fmt, ...$args);

    // stream_isatty() is part of the PHP core, whereas posix_isatty() is only
    // available when ext-posix is loaded.
    if (!stream_isatty(STDOUT)) {
        // "*" rather than "+" so that the bare reset sequence "\e[m" is
        // stripped as well. Fall back to the unstripped text should the
        // regex engine fail instead of printing nothing.
        $out = preg_replace("/\e\\[[0-9;]*m/", "", $out) ?? $out;
    }

    echo $out;
}
|
||||
|
||||
foreach ($datasets as $dataset) {
|
||||
echo $dataset->getIdentifier()." (".$dataset->getVersion().")\n";
|
||||
_printf("Identifier: \e[36m%s\e[35m#\e[36;1m%s\e[0m\n", $dataset->getPackageName(), $dataset->getDatasetName());
|
||||
// $dataset->getIdentifier()."\n";
|
||||
_printf(" Package: \e[33m%s\e[0m\n", $dataset->getPackageName());
|
||||
_printf(" Dataset: \e[33m%s\e[0m\n", $dataset->getDatasetName());
|
||||
_printf(" Version: \e[33m%s\e[0m\n", $dataset->getVersion());
|
||||
$reader = $dataset->open();
|
||||
$rows = 0;
|
||||
$headers = null;
|
||||
@ -26,5 +40,6 @@ foreach ($datasets as $dataset) {
|
||||
if (!$headers) $headers = array_keys($row);
|
||||
$rows++;
|
||||
}
|
||||
echo " # ".$rows." rows\n - ".join("\n - ",$headers)."\n";
|
||||
_printf(" Rows: \e[33m%d\e[0m\n", $rows);
|
||||
_printf(" Fields: \e[32;1m%s\e[0m\n", join("\e[0m\n \e[32;1m",$headers));
|
||||
}
|
||||
|
@ -2,9 +2,15 @@
|
||||
|
||||
namespace NoccyLabs\Dataset;
|
||||
|
||||
use Iterator;
|
||||
use NoccyLabs\Dataset\Readers\CsvReader;
|
||||
use NoccyLabs\Dataset\Readers\JsonReader;
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
*
|
||||
*/
|
||||
class Dataset
|
||||
{
|
||||
protected string $packageName;
|
||||
@ -15,48 +21,83 @@ class Dataset
|
||||
|
||||
protected array $options;
|
||||
|
||||
protected ?string $version;
|
||||
protected string $version;
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
* @param string $identifier The identifier for the dataset (vendor/package#dataset)
|
||||
* @param array $options Configured options for the dataset
|
||||
* @param string|null $version The package version
|
||||
*/
|
||||
public function __construct(string $identifier, array $options, ?string $version=null)
|
||||
{
|
||||
$this->identifier = $identifier;
|
||||
$this->options = $options;
|
||||
$this->version = $version;
|
||||
$this->version = $version??"0.0.0.0";
|
||||
|
||||
[$this->packageName, $this->datasetName] = explode("#", $identifier, 2);
|
||||
}
|
||||
|
||||
/**
 * Get the full identifier the dataset was created with.
 *
 * @return string The identifier (vendor/package#dataset)
 */
public function getIdentifier(): string
{
    return $this->identifier;
}
|
||||
|
||||
/**
 * Get the package part of the identifier, i.e. everything before the
 * first "#".
 *
 * @return string The package name (vendor/package)
 */
public function getPackageName(): string
{
    return $this->packageName;
}
|
||||
|
||||
/**
 * Get the dataset part of the identifier, i.e. everything after the
 * first "#".
 *
 * @return string The dataset name
 */
public function getDatasetName(): string
{
    return $this->datasetName;
}
|
||||
|
||||
public function getVersion(): ?string
|
||||
/**
 * Get the version of the package providing the dataset.
 *
 * @return string The version stored by the constructor
 */
public function getVersion(): string
{
    return $this->version;
}
|
||||
|
||||
/**
 * Get the comment configured for the dataset.
 *
 * @return string|null The 'comment' option, or null when it is not set
 */
public function getComment(): ?string
{
    return $this->options['comment'] ?? null;
}
|
||||
|
||||
/**
 * Get the license configured for the dataset.
 *
 * @return string|null The 'license' option, or null when it is not set
 */
public function getLicense(): ?string
{
    return $this->options['license'] ?? null;
}
|
||||
|
||||
public function open(): ReaderInterface
|
||||
/**
|
||||
*
|
||||
* @return Iterator
|
||||
*/
|
||||
public function open(): Iterator
|
||||
{
|
||||
$filename = $this->options['filename'];
|
||||
$reader = $this->determineReaderForFile($filename);
|
||||
@ -64,6 +105,11 @@ class Dataset
|
||||
return $inst;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param string $filename
|
||||
* @return string
|
||||
*/
|
||||
private function determineReaderForFile(string $filename): string
|
||||
{
|
||||
if ($reader = $this->options['reader']??null) {
|
||||
|
@ -2,6 +2,8 @@
|
||||
|
||||
namespace NoccyLabs\Dataset;
|
||||
|
||||
use Iterator;
|
||||
|
||||
/**
|
||||
* DatasetManager is the central class of noccylabs/dataset.
|
||||
*
|
||||
@ -36,11 +38,11 @@ class DatasetManager
|
||||
* Directly return a reader for a specific dataset.
|
||||
*
|
||||
* @param string $identifier The dataset identifier
|
||||
* @return ReaderInterface A reader for the data
|
||||
* @return Iterator A reader for the data
|
||||
* @throws InvalidDatasetException if the dataset can not be opened
|
||||
* @throws UnknownDatasetException if the dataset does not exist
|
||||
*/
|
||||
public function openDataset(string $identifier): ReaderInterface
|
||||
public function openDataset(string $identifier): Iterator
{
    // Shortcut for getDataset($identifier)->open().
    $dataset = $this->getDataset($identifier);

    return $dataset->open();
}
|
||||
@ -157,6 +159,11 @@ class DatasetManager
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Register a dataset
|
||||
*
|
||||
*
|
||||
*/
|
||||
public function registerDataset(Dataset $dataset)
|
||||
{
|
||||
$id = $dataset->getIdentifier();
|
||||
|
92
src/FilteringReaderIterator.php
Normal file
92
src/FilteringReaderIterator.php
Normal file
@ -0,0 +1,92 @@
|
||||
<?php
|
||||
|
||||
namespace NoccyLabs\Dataset;
|
||||
|
||||
use Iterator;
|
||||
|
||||
/**
 * Iterator decorator that only yields those rows of a wrapped iterator that
 * match a filter condition.
 *
 * The condition is either a callable, which receives each row and returns a
 * truthy value for rows to keep, or an array mapping field names to tests
 * (see matchCondition()). Keys of the wrapped iterator are passed through
 * unchanged, so the keys of the filtered result may have gaps.
 */
class FilteringReaderIterator implements Iterator
{
    /** The wrapped iterator supplying the unfiltered rows. */
    private Iterator $reader;

    /**
     * The filter, either a callable or a field => test map.
     *
     * Untyped because `callable` is not permitted as a property type.
     *
     * @var array|callable
     */
    private $condition;

    /**
     * @param Iterator       $reader    The iterator to filter
     * @param array|callable $condition Callable predicate, or field => test map
     */
    public function __construct(Iterator $reader, array|callable $condition)
    {
        $this->reader = $reader;
        $this->condition = $condition;
    }

    /**
     * @return mixed The current row of the wrapped iterator
     */
    public function current(): mixed
    {
        return $this->reader->current();
    }

    /**
     * @return mixed The key of the current row in the wrapped iterator
     */
    public function key(): mixed
    {
        return $this->reader->key();
    }

    public function valid(): bool
    {
        return $this->reader->valid();
    }

    /**
     * Advance to the next row that matches the condition.
     */
    public function next(): void
    {
        $this->reader->next();
        $this->skipToMatch();
    }

    /**
     * Restart the wrapped iterator and position on its first matching row.
     */
    public function rewind(): void
    {
        $this->reader->rewind();
        $this->skipToMatch();
    }

    /**
     * Move the wrapped iterator forward until it points at a matching row
     * or is exhausted. Does nothing if the current row already matches.
     */
    private function skipToMatch(): void
    {
        while ($this->reader->valid() && !$this->matchCondition($this->reader->current())) {
            $this->reader->next();
        }
    }

    /**
     * Test the condition against a row
     *
     * A callable condition is invoked with the row and its result is cast
     * to bool. Otherwise every field => test pair must hold. Fields that
     * are not present in the row are ignored, i.e. they never cause a
     * mismatch. All comparisons are loose (==, <, in_array() without
     * strict).
     *
     * The matched conditions are:
     *  - string/bool/int/float - match value
     *  - array - all conditions in array must match:
     *      - eq    equals
     *      - neq   not equals
     *      - gt    greater than
     *      - gte   greater than or equal
     *      - lt    less than
     *      - lte   less than or equal
     *      - in    value in array
     *      - nin   value not in array
     *
     * @param array $row The row to test
     * @return bool True if the row should be kept
     */
    private function matchCondition(array $row): bool
    {
        if (is_callable($this->condition)) {
            return (bool)($this->condition)($row);
        }

        foreach ($this->condition as $field => $test) {
            if (!array_key_exists($field, $row)) {
                continue;
            }
            $value = $row[$field];

            if (!is_array($test)) {
                if ($value != $test) {
                    return false;
                }
                continue;
            }

            if (array_key_exists('eq', $test) && $value != $test['eq']) return false;
            if (array_key_exists('neq', $test) && $value == $test['neq']) return false;
            if (array_key_exists('gt', $test) && $value <= $test['gt']) return false;
            if (array_key_exists('gte', $test) && $value < $test['gte']) return false;
            if (array_key_exists('lt', $test) && $value >= $test['lt']) return false;
            if (array_key_exists('lte', $test) && $value > $test['lte']) return false;
            if (array_key_exists('in', $test) && !in_array($value, $test['in'])) return false;
            // Must read the 'nin' list here; reading 'in' made every
            // 'nin'-only condition fail on an undefined key.
            if (array_key_exists('nin', $test) && in_array($value, $test['nin'])) return false;
        }

        return true;
    }
}
|
@ -45,12 +45,16 @@ class CsvReader implements ReaderInterface
|
||||
|
||||
private function loadData(array $data)
|
||||
{
|
||||
// FIXME parse data according to directives if present
|
||||
$separator = $this->options['separator']??',';
|
||||
$enclosure = $this->options['enclosure']??'"';
|
||||
$escape = $this->options['escape']??"\\";
|
||||
|
||||
|
||||
$head = str_getcsv(array_shift($data));
|
||||
$this->data = [];
|
||||
foreach ($data as $row) {
|
||||
if ($row) {
|
||||
$row = str_getcsv($row);
|
||||
$row = str_getcsv($row, $separator, $enclosure, $escape);
|
||||
$this->data[] = array_combine($head, $row);
|
||||
}
|
||||
}
|
||||
|
@ -35,8 +35,10 @@ class JsonReader implements ReaderInterface
|
||||
//printf("Reached end of set at slice=%d\n", $this->currentFile);
|
||||
return;
|
||||
}
|
||||
|
||||
$flags = ($this->options['bigintAsString']??false)?JSON_BIGINT_AS_STRING:0;
|
||||
$file = $this->files[$this->currentFile];
|
||||
$json = @json_decode(@file_get_contents($file), true);
|
||||
$json = @json_decode(@file_get_contents($file), true, flags:$flags);
|
||||
|
||||
$this->loadData($json);
|
||||
$this->loadedFile = $this->currentFile;
|
||||
|
@ -1,7 +0,0 @@
|
||||
{
|
||||
"datasets": {
|
||||
"baz": {
|
||||
"filename": "data/test.csv"
|
||||
}
|
||||
}
|
||||
}
|
@ -16,6 +16,6 @@ class DatasetManagerTest extends \PHPUnit\Framework\TestCase
|
||||
$dm = new DatasetManager();
|
||||
|
||||
$sets = $dm->getAvailableDatasets();
|
||||
$this->assertEquals(1, count($sets), "Expected 1 loaded set");
|
||||
$this->assertEquals(2, count($sets), "Expected 2 loaded set");
|
||||
}
|
||||
}
|
||||
|
72
tests/src/FilteringReaderIteratorTest.php
Normal file
72
tests/src/FilteringReaderIteratorTest.php
Normal file
@ -0,0 +1,72 @@
|
||||
<?php
|
||||
|
||||
namespace NoccyLabs\Dataset;
|
||||
|
||||
use ArrayIterator;
|
||||
use Iterator;
|
||||
|
||||
/**
 * Tests for the array- and callable-based filtering of FilteringReaderIterator.
 */
class FilteringReaderIteratorTest extends \PHPUnit\Framework\TestCase
{
    /**
     * A scalar test keeps only the rows whose field equals the value; the
     * keys of the wrapped iterator are preserved.
     *
     * @covers \NoccyLabs\Dataset\FilteringReaderIterator
     */
    public function testSimpleFiltering(): void
    {
        $arr = $this->filter(["alpha" => 3]);

        $this->assertCount(1, $arr);
        $this->assertSame(3, $arr[2]['alpha']);
        $this->assertSame('red', $arr[2]['beta']);
        $this->assertNull($arr[2]['gamma']);
    }

    /**
     * 'gt' and 'lt' are both exclusive bounds.
     *
     * @covers \NoccyLabs\Dataset\FilteringReaderIterator
     */
    public function testFilteringOnRange(): void
    {
        $arr = $this->filter(["alpha" => ['gt' => 2, 'lt' => 5]]);

        $this->assertCount(2, $arr);
    }

    /**
     * 'lte' includes the upper bound.
     *
     * @covers \NoccyLabs\Dataset\FilteringReaderIterator
     */
    public function testFilteringOnRangeInclusive(): void
    {
        $arr = $this->filter(["alpha" => ['gt' => 2, 'lte' => 5]]);

        $this->assertCount(3, $arr);
    }

    /**
     * 'in' keeps only rows whose value is one of the listed values.
     *
     * @covers \NoccyLabs\Dataset\FilteringReaderIterator
     */
    public function testFilteringOnList(): void
    {
        $arr = $this->filter(["beta" => ['in' => ['green', 'pink']]]);

        $this->assertSame([0, 4], array_keys($arr));
    }

    /**
     * A callable condition is called once per row and decides by its
     * return value.
     *
     * @covers \NoccyLabs\Dataset\FilteringReaderIterator
     */
    public function testFilteringWithCallable(): void
    {
        $arr = $this->filter(fn (array $row): bool => $row['gamma'] === true);

        $this->assertSame([0, 4], array_keys($arr));
    }

    /**
     * Run the test data through a FilteringReaderIterator.
     *
     * @param array|callable $condition The condition to filter with
     * @return array The matching rows, keyed as in getTestData()
     */
    private function filter(array|callable $condition): array
    {
        return iterator_to_array(
            new FilteringReaderIterator($this->getTestIterator(), $condition)
        );
    }

    private function getTestData(): array
    {
        return [
            [ 'alpha' => 1, 'beta' => 'green', 'gamma' => true ],
            [ 'alpha' => 2, 'beta' => 'blue', 'gamma' => false ],
            [ 'alpha' => 3, 'beta' => 'red', 'gamma' => null ],
            [ 'alpha' => 4, 'beta' => 'yellow', 'gamma' => false ],
            [ 'alpha' => 5, 'beta' => 'pink', 'gamma' => true ],
        ];
    }

    private function getTestIterator(): Iterator
    {
        return new ArrayIterator($this->getTestData());
    }
}
|
29
tests/src/Readers/CsvReaderTest.php
Normal file
29
tests/src/Readers/CsvReaderTest.php
Normal file
@ -0,0 +1,29 @@
|
||||
<?php
|
||||
|
||||
namespace NoccyLabs\Dataset\Readers;
|
||||
|
||||
|
||||
/**
 * Tests reading the CSV fixture in tests/test/test/data through CsvReader.
 */
class CsvReaderTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Reading a single file by its exact path yields one item per data row.
     *
     * @covers \NoccyLabs\Dataset\Readers\CsvReader
     */
    public function testReadingSingleFile(): void
    {
        $reader = new CsvReader(__DIR__."/../../test/test/data/data.csv", []);

        $this->assertCount(5, iterator_to_array($reader), "Expected 5 items");
    }

    /**
     * Reading via a wildcard pattern yields the rows of the matching files.
     *
     * @covers \NoccyLabs\Dataset\Readers\CsvReader
     */
    public function testReadingSplitFile(): void
    {
        $reader = new CsvReader(__DIR__."/../../test/test/data/*.csv", []);

        $this->assertCount(5, iterator_to_array($reader), "Expected 5 items");
    }
}
|
6
tests/test/test/data/data.csv
Normal file
6
tests/test/test/data/data.csv
Normal file
@ -0,0 +1,6 @@
|
||||
"a";"b"
|
||||
1;a
|
||||
2;b
|
||||
3;c
|
||||
4;d
|
||||
5;e
|
|
7
tests/test/test/data/data.json
Normal file
7
tests/test/test/data/data.json
Normal file
@ -0,0 +1,7 @@
|
||||
[
|
||||
{ "a": 1, "b": "a" },
|
||||
{ "a": 2, "b": "b" },
|
||||
{ "a": 3, "b": "c" },
|
||||
{ "a": 4, "b": "d" },
|
||||
{ "a": 5, "b": "e" }
|
||||
]
|
10
tests/test/test/dataset.json
Normal file
10
tests/test/test/dataset.json
Normal file
@ -0,0 +1,10 @@
|
||||
{
|
||||
"datasets": {
|
||||
"csv": {
|
||||
"filename": "data/data.csv"
|
||||
},
|
||||
"json": {
|
||||
"filename": "data/data.json"
|
||||
}
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user