Compare commits

...

7 Commits

20 changed files with 572 additions and 51 deletions

3
.gitignore vendored
View File

@ -1,3 +1,4 @@
/vendor
/composer.lock
/*.zip
/.phpunit.cache

View File

@ -1,22 +1,83 @@
# Dataset Library for PHP/Composer
This is a library for loading datasets from bundled packages.
This is a library for loading datasets from bundled packages. The idea isn't to
use the classes in this library as a generic datasource, although you could
probably pull that off. Instead, use the datasets to import the relevant data to
your database of choice, and optionally keep track of the version numbers so
new data can be imported automatically when the dependencies have been updated
and a new version has been installed.
## Installing
To install dataset, require it using composer:
```shell
$ composer require noccylabs/dataset
```
You also need some actual datasets. Some interesting ones could be:
Package | Description
---|---
[noccylabs/dataset-postal](https://dev.noccylabs.info/noccy/dataset-postal) | Patterns and info for postal (zip) code validation
[noccylabs/dataset-calendar](https://dev.noccylabs.info/noccy/dataset-calendar) | Bank holidays
[noccylabs/dataset-iso3166](https://dev.noccylabs.info/noccy/dataset-iso3166) | ISO 3166 country codes and names
## Example
Require Dataset, and some actual datasets:
$ composer require noccylabs/dataset juicebox/flavordata
You can now open the desired sets and do useful stuff with the data:
```php
use NoccyLabs\Dataset\DatasetManager;
$dm = new DatasetManager();
$ds = $dm->getDataset("juicebox/flavordata#flavors");
foreach ($ds as $row) {
// Call on getDataset() if you want access to the metadata,
// Replace with openDataset() to quickly call getDataset()->open()
$ds = $dm->getDataset("noccylabs/dataset-iso3166#countries");
// This is how you get the metadata
echo "Dataset ID: ".$ds->getIdentifier(); // noccylabs/dataset-iso3166#countries
echo "Dataset version: ".$ds->getVersion(); // 2022.10.1
// Get a reader by calling open()
$reader = $ds->open();
foreach ($reader as $row) {
// row is an array
}
```
# Documentation
## DatasetManager
The `DatasetManager` will automatically locate and load datasets on startup.
```
getDataset(string $identifier): Dataset
Return a Dataset object, or throw exception on error
openDataset(string $identifier): Iterator
Return a reader for a Dataset, same as getDataset()->open()
getAvailableDatasets(): array
Returns the Dataset objects for all datasets found
```
## Dataset
```
open(): Iterator
Return an iterator to iterate over the data
filter(array|callable $condition): Iterator
Return an iterator that only returns rows matching filter
getIdentifier(): string
Return the dataset identifier (vendor/package#dataset)
getVersion(): string
Return the package version of the dataset
getPackageName(): string
Return the package name (vendor/package)
getDatasetName(): string
Return the dataset name (dataset)
getLicense(): ?string
Return the license for the dataset
getComment(): ?string
Return the dataset comment
```

View File

@ -2,13 +2,16 @@
<?php
foreach ([
__DIR__."/../vendor",
__DIR__."/../../../vendor",
__DIR__."/vendor",
__DIR__."/..",
__DIR__."/../..",
__DIR__."/../../..",
] as $dir) {
if (file_exists($dir."/autoload.php")) {
define("COMPOSER_VENDOR_PATH", $dir);
require_once $dir."/autoload.php";
} elseif (file_exists($dir."/vendor/autoload.php")) {
define("COMPOSER_VENDOR_PATH", $dir."/vendor");
require_once $dir."/vendor/autoload.php";
}
}
@ -16,8 +19,20 @@ $datasetManager = new NoccyLabs\Dataset\DatasetManager();
$datasets = $datasetManager->getAvailableDatasets();
/**
 * printf()-style output helper that drops ANSI colour sequences when
 * standard output is not a terminal (for example when piped to a file).
 *
 * @param string $fmt     A sprintf() format string, may contain "\e[...m" sequences
 * @param mixed  ...$args Values for the placeholders in $fmt
 */
function _printf(string $fmt, ...$args): void {
    $out = sprintf($fmt, ...$args);
    // stream_isatty() is part of the PHP core (7.2+), so the check works
    // without ext-posix, which composer.json does not require and which is
    // not available on Windows.
    if (!stream_isatty(STDOUT)) {
        $out = preg_replace("<\e\\[[0-9;]+m>", "", $out);
    }
    echo $out;
}
foreach ($datasets as $dataset) {
echo $dataset->getIdentifier()." (".$dataset->getVersion().")\n";
_printf("Identifier: \e[36m%s\e[35m#\e[36;1m%s\e[0m\n", $dataset->getPackageName(), $dataset->getDatasetName());
// $dataset->getIdentifier()."\n";
_printf(" Package: \e[33m%s\e[0m\n", $dataset->getPackageName());
_printf(" Dataset: \e[33m%s\e[0m\n", $dataset->getDatasetName());
_printf(" Version: \e[33m%s\e[0m\n", $dataset->getVersion());
$reader = $dataset->open();
$rows = 0;
$headers = null;
@ -25,5 +40,6 @@ foreach ($datasets as $dataset) {
if (!$headers) $headers = array_keys($row);
$rows++;
}
echo " ".$rows." rows\n - ".join("\n - ",$headers)."\n";
_printf(" Rows: \e[33m%d\e[0m\n", $rows);
_printf(" Fields: \e[32;1m%s\e[0m\n", join("\e[0m\n \e[32;1m",$headers));
}

View File

@ -14,6 +14,14 @@
"email": "cvagnetoft@gmail.com"
}
],
"require": {},
"bin": [ "bin/dataset-info" ]
"bin": [ "bin/dataset-info" ],
"require": {
"php": "^7.4|^8.0",
"ext-simplexml": "*",
"ext-json": "*"
},
"require-dev": {
"phpunit/phpunit": "^9.5",
"phpstan/phpstan": "^1.8"
}
}

13
phpstan.neon Normal file
View File

@ -0,0 +1,13 @@
parameters:
level: 5
excludePaths:
- doc
- vendor
- var
- tests
# Paths to include in the analysis
paths:
- src

27
phpunit.xml Normal file
View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/9.5/phpunit.xsd"
bootstrap="vendor/autoload.php"
cacheResultFile=".phpunit.cache/test-results"
executionOrder="depends,defects"
forceCoversAnnotation="true"
beStrictAboutCoversAnnotation="true"
beStrictAboutOutputDuringTests="true"
beStrictAboutTodoAnnotatedTests="true"
convertDeprecationsToExceptions="true"
failOnRisky="true"
failOnWarning="true"
verbose="true">
<testsuites>
<testsuite name="default">
<directory>tests</directory>
</testsuite>
</testsuites>
<coverage cacheDirectory=".phpunit.cache/code-coverage"
processUncoveredFiles="true">
<include>
<directory suffix=".php">src</directory>
</include>
</coverage>
</phpunit>

View File

@ -2,35 +2,102 @@
namespace NoccyLabs\Dataset;
use Iterator;
use NoccyLabs\Dataset\Readers\CsvReader;
use NoccyLabs\Dataset\Readers\JsonReader;
/**
*
*
*
*/
class Dataset
{
protected string $packageName;
protected string $datasetName;
protected string $identifier;
protected array $options;
protected ?string $version;
protected string $version;
/**
*
*
* @param string $identifier The identifier for the dataset (vendor/package#dataset)
* @param array $options Configured options for the dataset
* @param string|null $version The package version
*/
public function __construct(string $identifier, array $options, ?string $version=null)
{
$this->identifier = $identifier;
$this->options = $options;
$this->version = $version;
$this->version = $version??"0.0.0.0";
[$this->packageName, $this->datasetName] = explode("#", $identifier, 2);
}
/**
*
* @return string
*/
public function getIdentifier(): string
{
return $this->identifier;
}
public function getVersion(): ?string
/**
*
* @return string
*/
public function getPackageName(): string
{
return $this->packageName;
}
/**
*
* @return string
*/
public function getDatasetName(): string
{
return $this->datasetName;
}
/**
*
* @return string
*/
public function getVersion(): string
{
return $this->version;
}
public function open(): ReaderInterface
/**
*
* @return string|null
*/
public function getComment(): ?string
{
return array_key_exists('comment', $this->options) ? $this->options['comment'] : null;
}
/**
*
* @return string|null
*/
public function getLicense(): ?string
{
return array_key_exists('license', $this->options) ? $this->options['license'] : null;
}
/**
*
* @return Iterator
*/
public function open(): Iterator
{
$filename = $this->options['filename'];
$reader = $this->determineReaderForFile($filename);
@ -38,17 +105,32 @@ class Dataset
return $inst;
}
/**
*
* @param string $filename
* @return string
*/
private function determineReaderForFile(string $filename): string
{
if ($reader = $this->options['reader']??null) {
return $reader;
}
$ext = pathinfo($filename, PATHINFO_EXTENSION);
/*
return match ($ext) {
'json' => JsonReader::class,
'csv' => CsvReader::class,
default => throw new \RuntimeException("Unable to determine reader for dataset file")
};
*/
// PHP7.4 compat: use switch instead of match
switch ($ext) {
case 'json': return JsonReader::class;
case 'csv': return CsvReader::class;
default: throw new \RuntimeException("Unable to determine reader for dataset file");
}
}
}
}

View File

@ -2,6 +2,8 @@
namespace NoccyLabs\Dataset;
use Iterator;
/**
* DatasetManager is the central class of noccylabs/dataset.
*
@ -22,16 +24,46 @@ class DatasetManager
}
}
/**
 * Return all the available datasets.
 *
 * The list is read from the static registry that registerDataset() fills,
 * where each Dataset is stored under its identifier.
 *
 * @return array<string, Dataset> The available datasets, keyed by identifier
 */
public function getAvailableDatasets(): array
{
return self::$datasets;
}
public function openDataset(string $identifier): ReaderInterface
/**
* Directly return a reader for a specific dataset.
*
* @param string $identifier The dataset identifier
* @return Iterator A reader for the data
* @throws InvalidDatasetException if the dataset can not be opened
* @throws UnknownDatasetException if the dataset does not exist
*/
public function openDataset(string $identifier): Iterator
{
return $this->getDataset($identifier)->open();
}
/**
 * Look up a registered dataset by its identifier.
 *
 * The returned Dataset object holds the metadata and offers the methods
 * needed to obtain a reader for the data in the set.
 *
 * @param string $identifier The dataset identifier
 * @return Dataset The matching dataset
 * @throws UnknownDatasetException if the dataset does not exist
 */
public function getDataset(string $identifier): Dataset
{
    $isKnown = array_key_exists($identifier, self::$datasets);
    if ($isKnown) {
        return self::$datasets[$identifier];
    }

    throw UnknownDatasetException::DatasetNotFound();
}
/**
* Find the vendor directory and try to locate all bundled datasets
*
@ -67,8 +99,8 @@ class DatasetManager
*
*
*
* @param string The package name (org/package)
* @param string The full path to the package (..../vendor/org/package)
* @param string $package The package name (org/package)
* @param string $path The full path to the package (..../vendor/org/package)
*/
private function scanPackageDatasets(string $package, string $path)
{
@ -84,10 +116,15 @@ class DatasetManager
}
$this->loadDatasets($json['datasets'], null, $package, $path);
//printf("found %d sets in %s\n", count($json['datasets']), $package);
}
/**
*
*
*
*
*/
private function loadDatasets(array $datasets, ?string $prefix, string $package, string $path)
{
foreach ($datasets as $name=>$options) {
@ -103,18 +140,30 @@ class DatasetManager
}
}
/**
*
*
*
*/
private function determineVendorPath(): ?string
{
if (defined("COMPOSER_VENDOR_PATH")) {
return COMPOSER_VENDOR_PATH;
$d = defined("NOCCYLABS_DATASET_TEST") ? (dirname(__DIR__)."/tests") : __DIR__;
while ($d != dirname($d)) {
if (file_exists($d."/autoload.php")) break;
$d = dirname($d);
}
if (file_exists(__DIR__."/../../../autoload.php")) {
// we are installed as a composer package
return dirname(__DIR__, 3);
if (file_exists($d."/autoload.php")) {
return $d;
}
return null;
}
/**
* Register a dataset
*
*
*/
public function registerDataset(Dataset $dataset)
{
$id = $dataset->getIdentifier();
@ -127,17 +176,4 @@ class DatasetManager
self::$datasets[$id] = $dataset;
}
/**
*
*
* @throws InvalidDatasetException if the dataset can not be opened
* @throws UnknownDatasetExcception if the dataset does not exist
*/
public function getDataset(string $identifier): Dataset
{
return self::$datasets[$identifier];
}
}

View File

@ -0,0 +1,96 @@
<?php
namespace NoccyLabs\Dataset;
use Iterator;
/**
 * Iterator decorator that only exposes the rows of another iterator which
 * satisfy a set of per-field conditions.
 *
 * Keys are passed through from the wrapped iterator unchanged, so the
 * filtered sequence can have gaps in its keys.
 */
class FilteringReaderIterator implements Iterator
{
    /** @var Iterator The wrapped reader that supplies the unfiltered rows */
    private Iterator $reader;

    /** @var array Conditions keyed by field name, see matchCondition() */
    private $condition;

    /**
     * @param Iterator $reader    The reader to filter; every row must be an array
     * @param array    $condition Conditions keyed by field name
     *
     * @note Removed support for callable in cond (php7.4 compat)
     */
    public function __construct(Iterator $reader, array $condition)
    {
        $this->reader = $reader;
        $this->condition = $condition;
    }

    /**
     * Return the current row of the wrapped reader.
     *
     * No native return type: "mixed" only exists from PHP 8.0 and the
     * package supports PHP 7.4. The attribute (a plain comment on 7.4)
     * silences the PHP 8.1+ deprecation notice for the missing type.
     *
     * @return mixed
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        return $this->reader->current();
    }

    /**
     * Return the key of the current row as reported by the wrapped reader.
     *
     * @return mixed
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return $this->reader->key();
    }

    /**
     * @return bool True while the wrapped reader points at a row
     */
    public function valid(): bool
    {
        return $this->reader->valid();
    }

    /**
     * Advance to the next row that matches the condition.
     */
    public function next(): void
    {
        $this->reader->next();
        $this->skipToMatch();
    }

    /**
     * Rewind the wrapped reader and position on the first matching row.
     */
    public function rewind(): void
    {
        $this->reader->rewind();
        $this->skipToMatch();
    }

    /**
     * Move the wrapped reader forward until it points at a matching row or
     * runs out of rows. Does not move if the current row already matches.
     */
    private function skipToMatch(): void
    {
        while ($this->reader->valid() && !$this->matchCondition($this->reader->current())) {
            $this->reader->next();
        }
    }

    /**
     * Test a condition against a row
     *
     * Fields named in the condition but missing from the row are ignored.
     * Comparisons are deliberately loose (==, non-strict in_array).
     *
     * The matched conditions are:
     *  - string/bool/int/float - match value
     *  - array - all conditions in array must match:
     *     - eq   equals
     *     - neq  not equals
     *     - gt   greater than
     *     - gte  greater than or equal
     *     - lt   less than
     *     - lte  less than or equal
     *     - in   value in array
     *     - nin  value not in array
     *
     * @param array $row The row to test
     * @return bool True if the row satisfies every applicable condition
     */
    private function matchCondition(array $row): bool
    {
        // The constructor only accepts arrays, so this branch is reached
        // solely for array-style callables such as [$object, 'method'].
        if (is_callable($this->condition)) {
            return (bool)call_user_func($this->condition, $row);
        }

        foreach ($this->condition as $field => $test) {
            if (!array_key_exists($field, $row)) {
                continue;
            }
            $value = $row[$field];

            if (!is_array($test)) {
                if ($value != $test) {
                    return false;
                }
                continue;
            }

            if (array_key_exists('eq', $test) && $value != $test['eq']) return false;
            if (array_key_exists('neq', $test) && $value == $test['neq']) return false;
            if (array_key_exists('gt', $test) && $value <= $test['gt']) return false;
            if (array_key_exists('gte', $test) && $value < $test['gte']) return false;
            if (array_key_exists('lt', $test) && $value >= $test['lt']) return false;
            if (array_key_exists('lte', $test) && $value > $test['lte']) return false;
            if (array_key_exists('in', $test) && !in_array($value, $test['in'])) return false;
            // Compare against the 'nin' list itself, not the 'in' list.
            if (array_key_exists('nin', $test) && in_array($value, $test['nin'])) return false;
        }

        return true;
    }
}

View File

@ -0,0 +1,8 @@
<?php
namespace NoccyLabs\Dataset;
use RuntimeException;
/**
 * Exception type for datasets that can not be opened, as named in the
 * "@throws" annotation of DatasetManager::openDataset().
 *
 * Adds nothing to RuntimeException beyond a distinct type to catch.
 */
class InvalidDatasetException extends RuntimeException
{}

View File

@ -8,6 +8,8 @@ class CsvReader implements ReaderInterface
{
private array $files = [];
private array $options = [];
private int $currentFile = 0;
private ?int $loadedFile = null;
@ -21,6 +23,7 @@ class CsvReader implements ReaderInterface
public function __construct(string $filename, array $options)
{
$this->files = glob($filename);
$this->options = $options;
}
private function checkLoadedSlice()
@ -42,12 +45,16 @@ class CsvReader implements ReaderInterface
private function loadData(array $data)
{
// FIXME parse data according to directives if present
$separator = $this->options['separator']??',';
$enclosure = $this->options['enclosure']??'"';
$escape = $this->options['escape']??"\\";
$head = str_getcsv(array_shift($data));
$this->data = [];
foreach ($data as $row) {
if ($row) {
$row = str_getcsv($row);
$row = str_getcsv($row, $separator, $enclosure, $escape);
$this->data[] = array_combine($head, $row);
}
}
@ -63,13 +70,13 @@ class CsvReader implements ReaderInterface
$this->checkLoadedSlice();
}
public function key(): mixed
public function key()
{
//$this->checkLoadedSlice();
return $this->counter;
}
public function current(): mixed
public function current()
{
//$this->checkLoadedSlice();
return $this->data[$this->currentIndex];
@ -95,4 +102,4 @@ class CsvReader implements ReaderInterface
return ($this->currentFile < count($this->files) && ($this->currentIndex < count($this->data)));
}
}
}

View File

@ -8,6 +8,8 @@ class JsonReader implements ReaderInterface
{
private array $files = [];
private array $options = [];
private int $currentFile = 0;
private ?int $loadedFile = null;
@ -21,6 +23,7 @@ class JsonReader implements ReaderInterface
public function __construct(string $filename, array $options)
{
$this->files = glob($filename);
$this->options = $options;
}
private function checkLoadedSlice()
@ -32,8 +35,10 @@ class JsonReader implements ReaderInterface
//printf("Reached end of set at slice=%d\n", $this->currentFile);
return;
}
$flags = ($this->options['bigintAsString']??false)?JSON_BIGINT_AS_STRING:0;
$file = $this->files[$this->currentFile];
$json = @json_decode(@file_get_contents($file), true);
$json = @json_decode(@file_get_contents($file), true, 512, $flags);
$this->loadData($json);
$this->loadedFile = $this->currentFile;
@ -56,13 +61,13 @@ class JsonReader implements ReaderInterface
$this->checkLoadedSlice();
}
public function key(): mixed
public function key()
{
//$this->checkLoadedSlice();
return $this->counter;
}
public function current(): mixed
public function current()
{
//$this->checkLoadedSlice();
return $this->data[$this->currentIndex];

View File

@ -0,0 +1,13 @@
<?php
namespace NoccyLabs\Dataset;
use RuntimeException;
/**
 * Raised when a requested dataset identifier does not match any dataset
 * known to the DatasetManager.
 */
class UnknownDatasetException extends RuntimeException
{
    /**
     * Named constructor for the "dataset not found" case.
     *
     * @return self A new exception carrying the standard message
     */
    public static function DatasetNotFound(): self
    {
        $message = "The requested dataset is not available";

        return new self($message);
    }
}

3
tests/autoload.php Normal file
View File

@ -0,0 +1,3 @@
<?php
// This is not an autoloader, it is used by the tests

View File

@ -0,0 +1,21 @@
<?php
namespace NoccyLabs\Dataset;
defined("NOCCYLABS_DATASET_TEST") || define("NOCCYLABS_DATASET_TEST", true);
/**
 * Tests that the DatasetManager discovers the datasets bundled with the
 * test suite when it is constructed.
 */
class DatasetManagerTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Constructing the manager should load the datasets of the test package.
     *
     * The covers targets are fully qualified: PHPUnit does not resolve them
     * relative to the namespace of the test class.
     *
     * @covers \NoccyLabs\Dataset\DatasetManager::__construct
     * @covers \NoccyLabs\Dataset\DatasetManager::determineVendorPath
     */
    public function testCreatingTheDatasetManager()
    {
        $dm = new DatasetManager();
        $sets = $dm->getAvailableDatasets();
        $this->assertCount(2, $sets, "Expected 2 loaded set");
    }
}

View File

@ -0,0 +1,72 @@
<?php
namespace NoccyLabs\Dataset;
use ArrayIterator;
use Iterator;
/**
 * Tests for FilteringReaderIterator, run against a small in-memory table.
 */
class FilteringReaderIteratorTest extends \PHPUnit\Framework\TestCase
{
    /**
     * A scalar condition keeps only the rows whose field equals the value,
     * and the original key of the row is preserved.
     *
     * @covers \NoccyLabs\Dataset\FilteringReaderIterator
     */
    public function testSimpleFiltering()
    {
        $arr = iterator_to_array(
            new FilteringReaderIterator(
                $this->getTestIterator(), [
                    "alpha" => 3
                ]));
        $this->assertCount(1, $arr);
        $this->assertSame(3, $arr[2]['alpha']);
        $this->assertSame('red', $arr[2]['beta']);
        $this->assertNull($arr[2]['gamma']);
    }

    /**
     * 'gt' and 'lt' together select an exclusive range (alpha 3 and 4).
     *
     * @covers \NoccyLabs\Dataset\FilteringReaderIterator
     */
    public function testFilteringOnRange()
    {
        $arr = iterator_to_array(
            new FilteringReaderIterator(
                $this->getTestIterator(), [
                    "alpha" => [ 'gt' => 2, 'lt' => 5 ]
                ]));
        $this->assertCount(2, $arr);
    }

    /**
     * 'lte' includes the upper bound (alpha 3, 4 and 5).
     *
     * @covers \NoccyLabs\Dataset\FilteringReaderIterator
     */
    public function testFilteringOnRangeInclusive()
    {
        $arr = iterator_to_array(
            new FilteringReaderIterator(
                $this->getTestIterator(), [
                    "alpha" => [ 'gt' => 2, 'lte' => 5 ]
                ]));
        $this->assertCount(3, $arr);
    }

    /**
     * Five rows with an increasing integer, a string and a nullable boolean.
     */
    private function getTestData(): array
    {
        return [
            [ 'alpha' => 1, 'beta' => 'green', 'gamma' => true ],
            [ 'alpha' => 2, 'beta' => 'blue', 'gamma' => false ],
            [ 'alpha' => 3, 'beta' => 'red', 'gamma' => null ],
            [ 'alpha' => 4, 'beta' => 'yellow', 'gamma' => false ],
            [ 'alpha' => 5, 'beta' => 'pink', 'gamma' => true ],
        ];
    }

    /**
     * Wrap the test rows in a plain iterator for the filter to consume.
     */
    private function getTestIterator(): Iterator
    {
        return new ArrayIterator($this->getTestData());
    }
}

View File

@ -0,0 +1,29 @@
<?php
namespace NoccyLabs\Dataset\Readers;
/**
 * Tests for CsvReader using the CSV fixture of the bundled test dataset.
 */
class CsvReaderTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Reading one named CSV file yields every data row.
     *
     * @covers \NoccyLabs\Dataset\Readers\CsvReader
     */
    public function testReadingSingleFile()
    {
        $reader = new CsvReader(__DIR__."/../../test/test/data/data.csv", []);
        $arr = iterator_to_array($reader);
        $this->assertCount(5, $arr, "Expected 5 items");
    }

    /**
     * A glob pattern is accepted as filename; the reader expands it itself.
     *
     * @covers \NoccyLabs\Dataset\Readers\CsvReader
     */
    public function testReadingSplitFile()
    {
        $reader = new CsvReader(__DIR__."/../../test/test/data/*.csv", []);
        $arr = iterator_to_array($reader);
        $this->assertCount(5, $arr, "Expected 5 items");
    }
}

View File

@ -0,0 +1,6 @@
"a";"b"
1;a
2;b
3;c
4;d
5;e
1 a b
2 1 a
3 2 b
4 3 c
5 4 d
6 5 e

View File

@ -0,0 +1,7 @@
[
{ "a": 1, "b": "a" },
{ "a": 2, "b": "b" },
{ "a": 3, "b": "c" },
{ "a": 4, "b": "d" },
{ "a": 5, "b": "e" }
]

View File

@ -0,0 +1,10 @@
{
"datasets": {
"csv": {
"filename": "data/data.csv"
},
"json": {
"filename": "data/data.json"
}
}
}