diff --git a/README.md b/README.md index e1c2c68..760ba56 100644 --- a/README.md +++ b/README.md @@ -45,3 +45,39 @@ foreach ($reader as $row) { // row is an array } ``` + +# Documentation + +## DatasetManager + +The `DatasetManager` will automatically locate and load datasets on startup. + +``` +getDataset(string $identifier): Dataset + Return a Dataset object, or throw exception on error +openDataset(string $identifier): Iterator + Return a reader for a Dataset, same as getDataset()->open() +getAvailableDatasets(): array + Returns the Dataset objects for all datasets found +``` + +## Dataset + +``` +open(): Iterator + Return an iterator to iterate over the data +filter(array|callable $condition): Iterator + Return an iterator that only returns rows matching filter +getIdentifier(): string + Return the dataset identifier (vendor/package#dataset) +getVersion(): string + Return the package version of the dataset +getPackageName(): string + Return the package name (vendor/package) +getDatasetName(): string + Return the dataset name (dataset) +getLicense(): ?string + Return the license for the dataset +getComment(): ?string + Return the dataset comment +``` \ No newline at end of file diff --git a/bin/dataset-info b/bin/dataset-info index 0009739..874e2e2 100755 --- a/bin/dataset-info +++ b/bin/dataset-info @@ -3,13 +3,15 @@ foreach ([ __DIR__."/..", - __DIR__."/../vendor", + __DIR__."/../..", __DIR__."/../../..", - __DIR__."/vendor", ] as $dir) { if (file_exists($dir."/autoload.php")) { define("COMPOSER_VENDOR_PATH", $dir); require_once $dir."/autoload.php"; + } elseif (file_exists($dir."/vendor/autoload.php")) { + define("COMPOSER_VENDOR_PATH", $dir."/vendor"); + require_once $dir."/vendor/autoload.php"; } } @@ -17,8 +19,20 @@ $datasetManager = new NoccyLabs\Dataset\DatasetManager(); $datasets = $datasetManager->getAvailableDatasets(); +function _printf(string $fmt, ...$args): void { + $out = sprintf($fmt, ...$args); + if (!posix_isatty(STDOUT)) { + $out = 
preg_replace("<\e\\[[0-9;]+m>", "", $out); + } + echo $out; +} + foreach ($datasets as $dataset) { - echo $dataset->getIdentifier()." (".$dataset->getVersion().")\n"; + _printf("Identifier: \e[36m%s\e[35m#\e[36;1m%s\e[0m\n", $dataset->getPackageName(), $dataset->getDatasetName()); + // $dataset->getIdentifier()."\n"; + _printf(" Package: \e[33m%s\e[0m\n", $dataset->getPackageName()); + _printf(" Dataset: \e[33m%s\e[0m\n", $dataset->getDatasetName()); + _printf(" Version: \e[33m%s\e[0m\n", $dataset->getVersion()); $reader = $dataset->open(); $rows = 0; $headers = null; @@ -26,5 +40,6 @@ foreach ($datasets as $dataset) { if (!$headers) $headers = array_keys($row); $rows++; } - echo " # ".$rows." rows\n - ".join("\n - ",$headers)."\n"; + _printf(" Rows: \e[33m%d\e[0m\n", $rows); + _printf(" Fields: \e[32;1m%s\e[0m\n", join("\e[0m\n \e[32;1m",$headers)); } diff --git a/src/Dataset.php b/src/Dataset.php index 1ae1008..d32c3ef 100644 --- a/src/Dataset.php +++ b/src/Dataset.php @@ -2,9 +2,15 @@ namespace NoccyLabs\Dataset; +use Iterator; use NoccyLabs\Dataset\Readers\CsvReader; use NoccyLabs\Dataset\Readers\JsonReader; +/** + * + * + * + */ class Dataset { protected string $packageName; @@ -15,48 +21,83 @@ class Dataset protected array $options; - protected ?string $version; + protected string $version; + /** + * + * + * @param string $identifier The identifier for the dataset (vendor/package#dataset) + * @param array $options Configured options for the dataset + * @param string|null $version The package version + */ public function __construct(string $identifier, array $options, ?string $version=null) { $this->identifier = $identifier; $this->options = $options; - $this->version = $version; + $this->version = $version??"0.0.0.0"; [$this->packageName, $this->datasetName] = explode("#", $identifier, 2); } + /** + * + * @return string + */ public function getIdentifier(): string { return $this->identifier; } + /** + * + * @return string + */ public function 
getPackageName(): string { return $this->packageName; } + /** + * + * @return string + */ public function getDatasetName(): string { return $this->datasetName; } - public function getVersion(): ?string + /** + * + * @return string + */ + public function getVersion(): string { return $this->version; } + /** + * + * @return string|null + */ public function getComment(): ?string { return array_key_exists('comment', $this->options) ? $this->options['comment'] : null; } + /** + * + * @return string|null + */ public function getLicense(): ?string { return array_key_exists('license', $this->options) ? $this->options['license'] : null; } - public function open(): ReaderInterface + /** + * + * @return Iterator + */ + public function open(): Iterator { $filename = $this->options['filename']; $reader = $this->determineReaderForFile($filename); @@ -64,6 +105,11 @@ class Dataset return $inst; } + /** + * + * @param string $filename + * @return string + */ private function determineReaderForFile(string $filename): string { if ($reader = $this->options['reader']??null) { diff --git a/src/DatasetManager.php b/src/DatasetManager.php index 3123122..c13a591 100644 --- a/src/DatasetManager.php +++ b/src/DatasetManager.php @@ -2,6 +2,8 @@ namespace NoccyLabs\Dataset; +use Iterator; + /** * DatasetManager is the central class of noccylabs/dataset. * @@ -36,11 +38,11 @@ class DatasetManager * Directly return a reader for a specific dataset. 
* * @param string $identifier The dataset identifier - * @return ReaderInterface A reader for the data + * @return Iterator A reader for the data * @throws InvalidDatasetException if the dataset can not be opened * @throws UnknownDatasetException if the dataset does not exist */ - public function openDataset(string $identifier): ReaderInterface + public function openDataset(string $identifier): Iterator { return $this->getDataset($identifier)->open(); } @@ -157,6 +159,11 @@ class DatasetManager return null; } + /** + * Register a dataset + * + * + */ public function registerDataset(Dataset $dataset) { $id = $dataset->getIdentifier(); diff --git a/src/FilteringReaderIterator.php b/src/FilteringReaderIterator.php new file mode 100644 index 0000000..0a89de9 --- /dev/null +++ b/src/FilteringReaderIterator.php @@ -0,0 +1,92 @@ +reader = $reader; + $this->condition = $condition; + } + + public function current(): mixed + { + return $this->reader->current(); + } + + public function key(): mixed + { + return $this->reader->key(); + } + + public function valid(): bool + { + return $this->reader->valid(); + } + + public function next(): void + { + $this->reader->next(); + while ($this->reader->valid()) { + $curr = $this->reader->current(); + if ($this->matchCondition($curr)) + break; + $this->reader->next(); + } + } + + public function rewind(): void + { + $this->reader->rewind(); + while ($this->reader->valid()) { + $curr = $this->reader->current(); + if ($this->matchCondition($curr)) + break; + $this->reader->next(); + } + } + + /** + * Test a condition against a row + * + * The matched conditions are: + * - string/bool/int/float - match value + * - array - all conditions in array must match: + * - eq equals + * - neq not equals + * - gt greater than + * - gte greater than or equal + * - lt less than (lte: less than or equal) + * - in value in array + * - nin value not in array + */ + private function matchCondition(array $row): bool + { + if (is_callable($this->condition)) { + return 
(bool)call_user_func($this->condition, $row); + } + foreach ($this->condition as $field=>$test) { + if (!array_key_exists($field, $row)) continue; + if (is_array($test)) { + if (array_key_exists('eq',$test) && $row[$field] != $test['eq']) return false; + if (array_key_exists('neq',$test) && $row[$field] == $test['neq']) return false; + if (array_key_exists('gt',$test) && $row[$field] <= $test['gt']) return false; + if (array_key_exists('gte',$test) && $row[$field] < $test['gte']) return false; + if (array_key_exists('lt',$test) && $row[$field] >= $test['lt']) return false; + if (array_key_exists('lte',$test) && $row[$field] > $test['lte']) return false; + if (array_key_exists('in',$test) && !in_array($row[$field], $test['in'])) return false; + if (array_key_exists('nin',$test) && in_array($row[$field], $test['nin'])) return false; + } else { + if ($row[$field] != $test) return false; + } + } + return true; + } +} diff --git a/src/Readers/CsvReader.php b/src/Readers/CsvReader.php index b528d79..3d5edf0 100644 --- a/src/Readers/CsvReader.php +++ b/src/Readers/CsvReader.php @@ -45,12 +45,16 @@ class CsvReader implements ReaderInterface private function loadData(array $data) { - // FIXME parse data according to directives if present + $separator = $this->options['separator']??','; + $enclosure = $this->options['enclosure']??'"'; + $escape = $this->options['escape']??"\\"; + + - $head = str_getcsv(array_shift($data)); + $head = str_getcsv(array_shift($data), $separator, $enclosure, $escape); // header row must use the same CSV dialect as the data rows $this->data = []; foreach ($data as $row) { if ($row) { - $row = str_getcsv($row); + $row = str_getcsv($row, $separator, $enclosure, $escape); $this->data[] = array_combine($head, $row); } } diff --git a/src/Readers/JsonReader.php b/src/Readers/JsonReader.php index 8e44440..176b830 100644 --- a/src/Readers/JsonReader.php +++ b/src/Readers/JsonReader.php @@ -35,8 +35,10 @@ class JsonReader implements ReaderInterface //printf("Reached end of set at slice=%d\n", $this->currentFile); return; } + + $flags = 
($this->options['bigintAsString']??false)?JSON_BIGINT_AS_STRING:0; $file = $this->files[$this->currentFile]; - $json = @json_decode(@file_get_contents($file), true); + $json = @json_decode(@file_get_contents($file), true, flags:$flags); $this->loadData($json); $this->loadedFile = $this->currentFile; diff --git a/tests/foo/bar/dataset.json b/tests/foo/bar/dataset.json deleted file mode 100644 index 6e05263..0000000 --- a/tests/foo/bar/dataset.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "datasets": { - "baz": { - "filename": "data/test.csv" - } - } -} diff --git a/tests/src/DatasetManagerTest.php b/tests/src/DatasetManagerTest.php index b5d7780..cc77053 100644 --- a/tests/src/DatasetManagerTest.php +++ b/tests/src/DatasetManagerTest.php @@ -16,6 +16,6 @@ class DatasetManagerTest extends \PHPUnit\Framework\TestCase $dm = new DatasetManager(); $sets = $dm->getAvailableDatasets(); - $this->assertEquals(1, count($sets), "Expected 1 loaded set"); + $this->assertEquals(2, count($sets), "Expected 2 loaded set"); } } diff --git a/tests/src/FilteringReaderIteratorTest.php b/tests/src/FilteringReaderIteratorTest.php new file mode 100644 index 0000000..2a224f2 --- /dev/null +++ b/tests/src/FilteringReaderIteratorTest.php @@ -0,0 +1,72 @@ +getTestIterator(), [ + "alpha" => 3 + ])); + + $this->assertEquals(1, count($arr)); + $this->assertEquals(3, $arr[2]['alpha']); + $this->assertEquals('red', $arr[2]['beta']); + $this->assertEquals(null, $arr[2]['gamma']); + + } + + /** + * @covers FilteringReaderIterator + */ + public function testFilteringOnRange() + { + $arr = iterator_to_array( + new FilteringReaderIterator( + $this->getTestIterator(), [ + "alpha" => [ 'gt' => 2, 'lt' => 5 ] + ])); + + $this->assertEquals(2, count($arr)); + } + + /** + * @covers FilteringReaderIterator + */ + public function testFilteringOnRangeInclusive() + { + $arr = iterator_to_array( + new FilteringReaderIterator( + $this->getTestIterator(), [ + "alpha" => [ 'gt' => 2, 'lte' => 5 ] + ])); + + 
$this->assertEquals(3, count($arr)); + } + + private function getTestData(): array + { + return [ + [ 'alpha' => 1, 'beta' => 'green', 'gamma' => true ], + [ 'alpha' => 2, 'beta' => 'blue', 'gamma' => false ], + [ 'alpha' => 3, 'beta' => 'red', 'gamma' => null ], + [ 'alpha' => 4, 'beta' => 'yellow', 'gamma' => false ], + [ 'alpha' => 5, 'beta' => 'pink', 'gamma' => true ], + ]; + } + + private function getTestIterator(): Iterator + { + return new ArrayIterator($this->getTestData()); + } +} diff --git a/tests/src/Readers/CsvReaderTest.php b/tests/src/Readers/CsvReaderTest.php new file mode 100644 index 0000000..6d56c2c --- /dev/null +++ b/tests/src/Readers/CsvReaderTest.php @@ -0,0 +1,29 @@ +assertEquals(5, count($arr), "Expected 5 items"); + } + + /** + * @covers CsvReader + */ + public function testReadingSplitFile() + { + $reader = new CsvReader(__DIR__."/../../test/test/data/*.csv", []); + $arr = iterator_to_array($reader); + + $this->assertEquals(5, count($arr), "Expected 5 items"); + } +} \ No newline at end of file diff --git a/tests/test/test/data/data.csv b/tests/test/test/data/data.csv new file mode 100644 index 0000000..e3d837b --- /dev/null +++ b/tests/test/test/data/data.csv @@ -0,0 +1,6 @@ +"a";"b" +1;a +2;b +3;c +4;d +5;e \ No newline at end of file diff --git a/tests/test/test/data/data.json b/tests/test/test/data/data.json new file mode 100644 index 0000000..a92c895 --- /dev/null +++ b/tests/test/test/data/data.json @@ -0,0 +1,7 @@ +[ + { "a": 1, "b": "a" }, + { "a": 2, "b": "b" }, + { "a": 3, "b": "c" }, + { "a": 4, "b": "d" }, + { "a": 5, "b": "e" } +] \ No newline at end of file diff --git a/tests/test/test/dataset.json b/tests/test/test/dataset.json new file mode 100644 index 0000000..c734226 --- /dev/null +++ b/tests/test/test/dataset.json @@ -0,0 +1,10 @@ +{ + "datasets": { + "csv": { + "filename": "data/data.csv" + }, + "json": { + "filename": "data/data.json" + } + } +}