Compare commits
No commits in common. "master" and "0.1.0" have entirely different histories.
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,4 +1,3 @@
|
|||||||
/vendor
|
/vendor
|
||||||
/composer.lock
|
/composer.lock
|
||||||
/*.zip
|
|
||||||
/.phpunit.cache
|
|
||||||
|
77
README.md
77
README.md
@ -1,83 +1,22 @@
|
|||||||
# Dataset Library for PHP/Composer
|
# Dataset Library for PHP/Composer
|
||||||
|
|
||||||
This is a library for loading datasets from bundled packages. The idea isn't to
|
This is a library for loading datasets from bundled packages.
|
||||||
use the classes in this library as a generic datasource, although you could
|
|
||||||
probably pull that off. Instead, use the datasets to import the relevant data to
|
|
||||||
your database of choice, and optionally keep track of the version numbers so
|
|
||||||
new data can be imported automatically when the dependencies have been updated
|
|
||||||
and a new version has been installed.
|
|
||||||
|
|
||||||
## Installing
|
|
||||||
|
|
||||||
To install dataset, require it using composer:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ composer require noccylabs/dataset
|
|
||||||
```
|
|
||||||
|
|
||||||
You also need some actual datasets. Some interesting ones could be:
|
|
||||||
|
|
||||||
Package | Description
|
|
||||||
---|---
|
|
||||||
[noccylabs/dataset-postal](https://dev.noccylabs.info/noccy/dataset-postal) | Patterns and info for postal (zip) code validation
|
|
||||||
[noccylabs/dataset-calendar](https://dev.noccylabs.info/noccy/dataset-calendar) | Bank holidays
|
|
||||||
[noccylabs/dataset-iso3166](https://dev.noccylabs.info/noccy/dataset-iso3166) | ISO 3166 country codes and names
|
|
||||||
|
|
||||||
## Example
|
## Example
|
||||||
|
|
||||||
|
Require Dataset, and some actual datasets:
|
||||||
|
|
||||||
|
$ composer require noccylabs/dataset juicebox/flavordata
|
||||||
|
|
||||||
|
You can now open the desired sets and do useful stuff with the data:
|
||||||
|
|
||||||
```php
|
```php
|
||||||
use NoccyLabs\Dataset\DatasetManager;
|
use NoccyLabs\Dataset\DatasetManager;
|
||||||
|
|
||||||
$dm = new DatasetManager();
|
$dm = new DatasetManager();
|
||||||
|
$ds = $dm->getDataset("juicebox/flavordata#flavors");
|
||||||
|
|
||||||
// Call on getDataset() if you want access to the metadata,
|
foreach ($ds as $row) {
|
||||||
// Replace with openDataset() to quickly call getDataset()->open()
|
|
||||||
$ds = $dm->getDataset("noccylabs/dataset-iso3166#countries");
|
|
||||||
|
|
||||||
// This is how you get the metadata
|
|
||||||
echo "Dataset ID: ".$ds->getIdentifier(); // noccylabs/dataset-iso3166#countries
|
|
||||||
echo "Dataset version: ".$ds->getVersion(); // 2022.10.1
|
|
||||||
|
|
||||||
// Get a reader by calling open()
|
|
||||||
$reader = $ds->open();
|
|
||||||
foreach ($reader as $row) {
|
|
||||||
// row is an array
|
// row is an array
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
# Documentation
|
|
||||||
|
|
||||||
## DatasetManager
|
|
||||||
|
|
||||||
The `DatasetManager` will automatically locate and load datasets on startup.
|
|
||||||
|
|
||||||
```
|
|
||||||
getDataset(string $identifier): Dataset
|
|
||||||
Return a Dataset object, or throw exception on error
|
|
||||||
openDataset(string $identifier): Iterator
|
|
||||||
Return a reader for a Dataset, same as getDataset()->open()
|
|
||||||
getAvailableDatasets(): array
|
|
||||||
Returns the Dataset objects for all datasets found
|
|
||||||
```
|
|
||||||
|
|
||||||
## Dataset
|
|
||||||
|
|
||||||
```
|
|
||||||
open(): Iterator
|
|
||||||
Return an iterator to iterate over the data
|
|
||||||
filter(array|callable $condition): Iterator
|
|
||||||
Return an iterator that only returns rows matching filter
|
|
||||||
getIdentifier(): string
|
|
||||||
Return the dataset identifier (vendor/package#dataset)
|
|
||||||
getVersion(): string
|
|
||||||
Return the package version of the dataset
|
|
||||||
getPackageName(): string
|
|
||||||
Return the package name (vendor/package)
|
|
||||||
getDatasetName(): string
|
|
||||||
Return the dataset name (dataset)
|
|
||||||
getLicense(): ?string
|
|
||||||
Return the license for the dataset
|
|
||||||
getComment(): ?string
|
|
||||||
Return the dataset comment
|
|
||||||
```
|
|
@ -1,45 +0,0 @@
|
|||||||
#!/usr/bin/env php
|
|
||||||
<?php
|
|
||||||
|
|
||||||
foreach ([
|
|
||||||
__DIR__."/..",
|
|
||||||
__DIR__."/../..",
|
|
||||||
__DIR__."/../../..",
|
|
||||||
] as $dir) {
|
|
||||||
if (file_exists($dir."/autoload.php")) {
|
|
||||||
define("COMPOSER_VENDOR_PATH", $dir);
|
|
||||||
require_once $dir."/autoload.php";
|
|
||||||
} elseif (file_exists($dir."/vendor/autoload.php")) {
|
|
||||||
define("COMPOSER_VENDOR_PATH", $dir."/vendor");
|
|
||||||
require_once $dir."/vendor/autoload.php";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$datasetManager = new NoccyLabs\Dataset\DatasetManager();
|
|
||||||
|
|
||||||
$datasets = $datasetManager->getAvailableDatasets();
|
|
||||||
|
|
||||||
function _printf(string $fmt, ...$args): void {
|
|
||||||
$out = sprintf($fmt, ...$args);
|
|
||||||
if (!posix_isatty(STDOUT)) {
|
|
||||||
$out = preg_replace("<\e\\[[0-9;]+m>", "", $out);
|
|
||||||
}
|
|
||||||
echo $out;
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach ($datasets as $dataset) {
|
|
||||||
_printf("Identifier: \e[36m%s\e[35m#\e[36;1m%s\e[0m\n", $dataset->getPackageName(), $dataset->getDatasetName());
|
|
||||||
// $dataset->getIdentifier()."\n";
|
|
||||||
_printf(" Package: \e[33m%s\e[0m\n", $dataset->getPackageName());
|
|
||||||
_printf(" Dataset: \e[33m%s\e[0m\n", $dataset->getDatasetName());
|
|
||||||
_printf(" Version: \e[33m%s\e[0m\n", $dataset->getVersion());
|
|
||||||
$reader = $dataset->open();
|
|
||||||
$rows = 0;
|
|
||||||
$headers = null;
|
|
||||||
foreach ($reader as $row) {
|
|
||||||
if (!$headers) $headers = array_keys($row);
|
|
||||||
$rows++;
|
|
||||||
}
|
|
||||||
_printf(" Rows: \e[33m%d\e[0m\n", $rows);
|
|
||||||
_printf(" Fields: \e[32;1m%s\e[0m\n", join("\e[0m\n \e[32;1m",$headers));
|
|
||||||
}
|
|
@ -14,14 +14,5 @@
|
|||||||
"email": "cvagnetoft@gmail.com"
|
"email": "cvagnetoft@gmail.com"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"bin": [ "bin/dataset-info" ],
|
"require": {}
|
||||||
"require": {
|
|
||||||
"php": "^7.4|^8.0",
|
|
||||||
"ext-simplexml": "*",
|
|
||||||
"ext-json": "*"
|
|
||||||
},
|
|
||||||
"require-dev": {
|
|
||||||
"phpunit/phpunit": "^9.5",
|
|
||||||
"phpstan/phpstan": "^1.8"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
13
phpstan.neon
13
phpstan.neon
@ -1,13 +0,0 @@
|
|||||||
parameters:
|
|
||||||
level: 5
|
|
||||||
|
|
||||||
excludePaths:
|
|
||||||
- doc
|
|
||||||
- vendor
|
|
||||||
- var
|
|
||||||
- tests
|
|
||||||
|
|
||||||
# Paths to include in the analysis
|
|
||||||
paths:
|
|
||||||
- src
|
|
||||||
|
|
27
phpunit.xml
27
phpunit.xml
@ -1,27 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/9.5/phpunit.xsd"
|
|
||||||
bootstrap="vendor/autoload.php"
|
|
||||||
cacheResultFile=".phpunit.cache/test-results"
|
|
||||||
executionOrder="depends,defects"
|
|
||||||
forceCoversAnnotation="true"
|
|
||||||
beStrictAboutCoversAnnotation="true"
|
|
||||||
beStrictAboutOutputDuringTests="true"
|
|
||||||
beStrictAboutTodoAnnotatedTests="true"
|
|
||||||
convertDeprecationsToExceptions="true"
|
|
||||||
failOnRisky="true"
|
|
||||||
failOnWarning="true"
|
|
||||||
verbose="true">
|
|
||||||
<testsuites>
|
|
||||||
<testsuite name="default">
|
|
||||||
<directory>tests</directory>
|
|
||||||
</testsuite>
|
|
||||||
</testsuites>
|
|
||||||
|
|
||||||
<coverage cacheDirectory=".phpunit.cache/code-coverage"
|
|
||||||
processUncoveredFiles="true">
|
|
||||||
<include>
|
|
||||||
<directory suffix=".php">src</directory>
|
|
||||||
</include>
|
|
||||||
</coverage>
|
|
||||||
</phpunit>
|
|
@ -2,102 +2,27 @@
|
|||||||
|
|
||||||
namespace NoccyLabs\Dataset;
|
namespace NoccyLabs\Dataset;
|
||||||
|
|
||||||
use Iterator;
|
|
||||||
use NoccyLabs\Dataset\Readers\CsvReader;
|
use NoccyLabs\Dataset\Readers\CsvReader;
|
||||||
use NoccyLabs\Dataset\Readers\JsonReader;
|
use NoccyLabs\Dataset\Readers\JsonReader;
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
*
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
class Dataset
|
class Dataset
|
||||||
{
|
{
|
||||||
protected string $packageName;
|
|
||||||
|
|
||||||
protected string $datasetName;
|
|
||||||
|
|
||||||
protected string $identifier;
|
protected string $identifier;
|
||||||
|
|
||||||
protected array $options;
|
protected array $options;
|
||||||
|
|
||||||
protected string $version;
|
public function __construct(string $identifier, array $options)
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
*
|
|
||||||
* @param string $identifier The identifier for the dataset (vendor/package#dataset)
|
|
||||||
* @param array $options Configured options for the dataset
|
|
||||||
* @param string|null $version The package version
|
|
||||||
*/
|
|
||||||
public function __construct(string $identifier, array $options, ?string $version=null)
|
|
||||||
{
|
{
|
||||||
$this->identifier = $identifier;
|
$this->identifier = $identifier;
|
||||||
$this->options = $options;
|
$this->options = $options;
|
||||||
$this->version = $version??"0.0.0.0";
|
|
||||||
|
|
||||||
[$this->packageName, $this->datasetName] = explode("#", $identifier, 2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getIdentifier(): string
|
public function getIdentifier(): string
|
||||||
{
|
{
|
||||||
return $this->identifier;
|
return $this->identifier;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public function open(): ReaderInterface
|
||||||
*
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getPackageName(): string
|
|
||||||
{
|
|
||||||
return $this->packageName;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getDatasetName(): string
|
|
||||||
{
|
|
||||||
return $this->datasetName;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getVersion(): string
|
|
||||||
{
|
|
||||||
return $this->version;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @return string|null
|
|
||||||
*/
|
|
||||||
public function getComment(): ?string
|
|
||||||
{
|
|
||||||
return array_key_exists('comment', $this->options) ? $this->options['comment'] : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @return string|null
|
|
||||||
*/
|
|
||||||
public function getLicense(): ?string
|
|
||||||
{
|
|
||||||
return array_key_exists('license', $this->options) ? $this->options['license'] : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @return Iterator
|
|
||||||
*/
|
|
||||||
public function open(): Iterator
|
|
||||||
{
|
{
|
||||||
$filename = $this->options['filename'];
|
$filename = $this->options['filename'];
|
||||||
$reader = $this->determineReaderForFile($filename);
|
$reader = $this->determineReaderForFile($filename);
|
||||||
@ -105,32 +30,17 @@ class Dataset
|
|||||||
return $inst;
|
return $inst;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @param string $filename
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
private function determineReaderForFile(string $filename): string
|
private function determineReaderForFile(string $filename): string
|
||||||
{
|
{
|
||||||
if ($reader = $this->options['reader']??null) {
|
if ($reader = $this->options['reader']??null) {
|
||||||
return $reader;
|
return $reader;
|
||||||
}
|
}
|
||||||
$ext = pathinfo($filename, PATHINFO_EXTENSION);
|
$ext = pathinfo($filename, PATHINFO_EXTENSION);
|
||||||
|
|
||||||
/*
|
|
||||||
return match ($ext) {
|
return match ($ext) {
|
||||||
'json' => JsonReader::class,
|
'json' => JsonReader::class,
|
||||||
'csv' => CsvReader::class,
|
'csv' => CsvReader::class,
|
||||||
default => throw new \RuntimeException("Unable to determine reader for dataset file")
|
default => throw new \RuntimeException("Unable to determine reader for dataset file")
|
||||||
};
|
};
|
||||||
*/
|
|
||||||
|
|
||||||
// PHP7.4 compat: use switch instead of match
|
|
||||||
switch ($ext) {
|
|
||||||
case 'json': return JsonReader::class;
|
|
||||||
case 'csv': return CsvReader::class;
|
|
||||||
default: throw new \RuntimeException("Unable to determine reader for dataset file");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -2,8 +2,6 @@
|
|||||||
|
|
||||||
namespace NoccyLabs\Dataset;
|
namespace NoccyLabs\Dataset;
|
||||||
|
|
||||||
use Iterator;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* DatasetManager is the central class of noccylabs/dataset.
|
* DatasetManager is the central class of noccylabs/dataset.
|
||||||
*
|
*
|
||||||
@ -13,9 +11,9 @@ use Iterator;
|
|||||||
*/
|
*/
|
||||||
class DatasetManager
|
class DatasetManager
|
||||||
{
|
{
|
||||||
private static array $packageVersions = [];
|
|
||||||
|
|
||||||
private static array $datasets = [];
|
|
||||||
|
private static $datasets = [];
|
||||||
|
|
||||||
public function __construct()
|
public function __construct()
|
||||||
{
|
{
|
||||||
@ -24,46 +22,16 @@ class DatasetManager
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Return all the available datasets
|
|
||||||
*
|
|
||||||
* @return Array<Dataset> The available datasets
|
|
||||||
*/
|
|
||||||
public function getAvailableDatasets(): array
|
public function getAvailableDatasets(): array
|
||||||
{
|
{
|
||||||
return self::$datasets;
|
return self::$datasets;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public function openDataset(string $identifier): ReaderInterface
|
||||||
* Directly return a reader for a specific dataset.
|
|
||||||
*
|
|
||||||
* @param string $identifier The dataset identifier
|
|
||||||
* @return Iterator A reader for the data
|
|
||||||
* @throws InvalidDatasetException if the dataset can not be opened
|
|
||||||
* @throws UnknownDatasetException if the dataset does not exist
|
|
||||||
*/
|
|
||||||
public function openDataset(string $identifier): Iterator
|
|
||||||
{
|
{
|
||||||
return $this->getDataset($identifier)->open();
|
return $this->getDataset($identifier)->open();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return a Dataset object containing metadata and methods to retrieve
|
|
||||||
* a reader for the data in the set.
|
|
||||||
*
|
|
||||||
* @param string $identifier The dataset identifier
|
|
||||||
* @throws UnknownDatasetException if the dataset does not exist
|
|
||||||
*/
|
|
||||||
public function getDataset(string $identifier): Dataset
|
|
||||||
{
|
|
||||||
if (!array_key_exists($identifier, self::$datasets)) {
|
|
||||||
throw UnknownDatasetException::DatasetNotFound();
|
|
||||||
}
|
|
||||||
|
|
||||||
return self::$datasets[$identifier];
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find the vendor directory and try to locate all bundled datasets
|
* Find the vendor directory and try to locate all bundled datasets
|
||||||
*
|
*
|
||||||
@ -79,15 +47,6 @@ class DatasetManager
|
|||||||
|
|
||||||
$glob = glob($root."/*/*/dataset.json");
|
$glob = glob($root."/*/*/dataset.json");
|
||||||
|
|
||||||
self::$packageVersions = [];
|
|
||||||
$fn = realpath($root."/composer/installed.php");
|
|
||||||
if (file_exists($fn)) {
|
|
||||||
$versions = include $fn;
|
|
||||||
foreach ($versions['versions'] as $name=>$version) {
|
|
||||||
self::$packageVersions[$name] = array_key_exists('version',$version) ? $version['version'] : null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach ($glob as $match) {
|
foreach ($glob as $match) {
|
||||||
$path = dirname($match);
|
$path = dirname($match);
|
||||||
$package = basename(dirname($path))."/".basename($path);
|
$package = basename(dirname($path))."/".basename($path);
|
||||||
@ -99,8 +58,8 @@ class DatasetManager
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
* @param string $package The package name (org/package)
|
* @param string The package name (org/package)
|
||||||
* @param string $path The full path to the package (..../vendor/org/package)
|
* @param string The full path to the package (..../vendor/org/package)
|
||||||
*/
|
*/
|
||||||
private function scanPackageDatasets(string $package, string $path)
|
private function scanPackageDatasets(string $package, string $path)
|
||||||
{
|
{
|
||||||
@ -116,54 +75,33 @@ class DatasetManager
|
|||||||
}
|
}
|
||||||
|
|
||||||
$this->loadDatasets($json['datasets'], null, $package, $path);
|
$this->loadDatasets($json['datasets'], null, $package, $path);
|
||||||
|
//printf("found %d sets in %s\n", count($json['datasets']), $package);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
*
|
|
||||||
*
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
private function loadDatasets(array $datasets, ?string $prefix, string $package, string $path)
|
private function loadDatasets(array $datasets, ?string $prefix, string $package, string $path)
|
||||||
{
|
{
|
||||||
foreach ($datasets as $name=>$options) {
|
foreach ($datasets as $name=>$info) {
|
||||||
if (!array_key_exists('filename', $options)) {
|
if (!array_key_exists('filename', $info)) {
|
||||||
$this->loadDatasets($options, ltrim($prefix . "." . $name, "."), $package, $path);
|
$this->loadDatasets($info, ltrim($prefix . "." . $name, "."), $package, $path);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
$options['filename'] = $path . "/" . $options['filename'];
|
$info['filename'] = $path . "/" . $info['filename'];
|
||||||
$pn = sprintf("%s#%s", $package, ltrim($prefix.".".$name,"."));
|
$pn = sprintf("%s#%s", $package, ltrim($prefix.".".$name,"."));
|
||||||
$pv = self::$packageVersions[$package]??null;
|
$ds = new Dataset($pn, $info);
|
||||||
$ds = new Dataset($pn, $options, $pv);
|
|
||||||
$this->registerDataset($ds);
|
$this->registerDataset($ds);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
*
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
private function determineVendorPath(): ?string
|
private function determineVendorPath(): ?string
|
||||||
{
|
{
|
||||||
$d = defined("NOCCYLABS_DATASET_TEST") ? (dirname(__DIR__)."/tests") : __DIR__;
|
if (file_exists(__DIR__."/../../../autoload.php")) {
|
||||||
while ($d != dirname($d)) {
|
// we are installed as a composer package
|
||||||
if (file_exists($d."/autoload.php")) break;
|
return dirname(__DIR__, 3);
|
||||||
$d = dirname($d);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (file_exists($d."/autoload.php")) {
|
|
||||||
return $d;
|
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Register a dataset
|
|
||||||
*
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public function registerDataset(Dataset $dataset)
|
public function registerDataset(Dataset $dataset)
|
||||||
{
|
{
|
||||||
$id = $dataset->getIdentifier();
|
$id = $dataset->getIdentifier();
|
||||||
@ -176,4 +114,17 @@ class DatasetManager
|
|||||||
self::$datasets[$id] = $dataset;
|
self::$datasets[$id] = $dataset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* @throws InvalidDatasetException if the dataset can not be opened
|
||||||
|
* @throws UnknownDatasetException if the dataset does not exist
|
||||||
|
*/
|
||||||
|
public function getDataset(string $identifier): Dataset
|
||||||
|
{
|
||||||
|
|
||||||
|
return self::$datasets[$identifier];
|
||||||
|
}
|
||||||
}
|
}
|
@ -1,96 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
namespace NoccyLabs\Dataset;
|
|
||||||
|
|
||||||
use Iterator;
|
|
||||||
|
|
||||||
class FilteringReaderIterator implements Iterator
|
|
||||||
{
|
|
||||||
private Iterator $reader;
|
|
||||||
|
|
||||||
private $condition;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @note Removed support for callable in cond (php7.4 compat)
|
|
||||||
*/
|
|
||||||
public function __construct(Iterator $reader, array $condition)
|
|
||||||
{
|
|
||||||
$this->reader = $reader;
|
|
||||||
$this->condition = $condition;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function current(): mixed
|
|
||||||
{
|
|
||||||
return $this->reader->current();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function key(): mixed
|
|
||||||
{
|
|
||||||
return $this->reader->key();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function valid(): bool
|
|
||||||
{
|
|
||||||
return $this->reader->valid();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function next(): void
|
|
||||||
{
|
|
||||||
$this->reader->next();
|
|
||||||
while ($this->reader->valid()) {
|
|
||||||
$curr = $this->reader->current();
|
|
||||||
if ($this->matchCondition($curr))
|
|
||||||
break;
|
|
||||||
$this->reader->next();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public function rewind(): void
|
|
||||||
{
|
|
||||||
$this->reader->rewind();
|
|
||||||
while ($this->reader->valid()) {
|
|
||||||
$curr = $this->reader->current();
|
|
||||||
if ($this->matchCondition($curr))
|
|
||||||
break;
|
|
||||||
$this->reader->next();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test a condition against a row
|
|
||||||
*
|
|
||||||
* The matched conditions are:
|
|
||||||
* - string/bool/int/float - match value
|
|
||||||
* - array - all conditions in array must match:
|
|
||||||
* - eq equals
|
|
||||||
* - neq not equals
|
|
||||||
* - gt greater than
|
|
||||||
* - gte greater than or equal
|
|
||||||
* - lt less than
|
|
||||||
* - in value in array
|
|
||||||
* - nin value not in array
|
|
||||||
*/
|
|
||||||
private function matchCondition(array $row): bool
|
|
||||||
{
|
|
||||||
if (is_callable($this->condition)) {
|
|
||||||
return (bool)call_user_func($this->condition, $row);
|
|
||||||
}
|
|
||||||
foreach ($this->condition as $field=>$test) {
|
|
||||||
if (!array_key_exists($field, $row)) continue;
|
|
||||||
if (is_array($test)) {
|
|
||||||
if (array_key_exists('eq',$test) && $row[$field] != $test['eq']) return false;
|
|
||||||
if (array_key_exists('neq',$test) && $row[$field] == $test['neq']) return false;
|
|
||||||
if (array_key_exists('gt',$test) && $row[$field] <= $test['gt']) return false;
|
|
||||||
if (array_key_exists('gte',$test) && $row[$field] < $test['gte']) return false;
|
|
||||||
if (array_key_exists('lt',$test) && $row[$field] >= $test['lt']) return false;
|
|
||||||
if (array_key_exists('lte',$test) && $row[$field] > $test['lte']) return false;
|
|
||||||
if (array_key_exists('in',$test) && !in_array($row[$field], $test['in'])) return false;
|
|
||||||
if (array_key_exists('nin',$test) && in_array($row[$field], $test['in'])) return false;
|
|
||||||
} else {
|
|
||||||
if ($row[$field] != $test) return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,8 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
namespace NoccyLabs\Dataset;
|
|
||||||
|
|
||||||
use RuntimeException;
|
|
||||||
|
|
||||||
class InvalidDatasetException extends RuntimeException
|
|
||||||
{}
|
|
@ -8,8 +8,6 @@ class CsvReader implements ReaderInterface
|
|||||||
{
|
{
|
||||||
private array $files = [];
|
private array $files = [];
|
||||||
|
|
||||||
private array $options = [];
|
|
||||||
|
|
||||||
private int $currentFile = 0;
|
private int $currentFile = 0;
|
||||||
|
|
||||||
private ?int $loadedFile = null;
|
private ?int $loadedFile = null;
|
||||||
@ -23,7 +21,6 @@ class CsvReader implements ReaderInterface
|
|||||||
public function __construct(string $filename, array $options)
|
public function __construct(string $filename, array $options)
|
||||||
{
|
{
|
||||||
$this->files = glob($filename);
|
$this->files = glob($filename);
|
||||||
$this->options = $options;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private function checkLoadedSlice()
|
private function checkLoadedSlice()
|
||||||
@ -45,16 +42,12 @@ class CsvReader implements ReaderInterface
|
|||||||
|
|
||||||
private function loadData(array $data)
|
private function loadData(array $data)
|
||||||
{
|
{
|
||||||
$separator = $this->options['separator']??',';
|
// FIXME parse data according to directives if present
|
||||||
$enclosure = $this->options['enclosure']??'"';
|
|
||||||
$escape = $this->options['escape']??"\\";
|
|
||||||
|
|
||||||
|
|
||||||
$head = str_getcsv(array_shift($data));
|
$head = str_getcsv(array_shift($data));
|
||||||
$this->data = [];
|
$this->data = [];
|
||||||
foreach ($data as $row) {
|
foreach ($data as $row) {
|
||||||
if ($row) {
|
if ($row) {
|
||||||
$row = str_getcsv($row, $separator, $enclosure, $escape);
|
$row = str_getcsv($row);
|
||||||
$this->data[] = array_combine($head, $row);
|
$this->data[] = array_combine($head, $row);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -70,13 +63,13 @@ class CsvReader implements ReaderInterface
|
|||||||
$this->checkLoadedSlice();
|
$this->checkLoadedSlice();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function key()
|
public function key(): mixed
|
||||||
{
|
{
|
||||||
//$this->checkLoadedSlice();
|
//$this->checkLoadedSlice();
|
||||||
return $this->counter;
|
return $this->counter;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function current()
|
public function current(): mixed
|
||||||
{
|
{
|
||||||
//$this->checkLoadedSlice();
|
//$this->checkLoadedSlice();
|
||||||
return $this->data[$this->currentIndex];
|
return $this->data[$this->currentIndex];
|
||||||
|
@ -8,8 +8,6 @@ class JsonReader implements ReaderInterface
|
|||||||
{
|
{
|
||||||
private array $files = [];
|
private array $files = [];
|
||||||
|
|
||||||
private array $options = [];
|
|
||||||
|
|
||||||
private int $currentFile = 0;
|
private int $currentFile = 0;
|
||||||
|
|
||||||
private ?int $loadedFile = null;
|
private ?int $loadedFile = null;
|
||||||
@ -23,7 +21,6 @@ class JsonReader implements ReaderInterface
|
|||||||
public function __construct(string $filename, array $options)
|
public function __construct(string $filename, array $options)
|
||||||
{
|
{
|
||||||
$this->files = glob($filename);
|
$this->files = glob($filename);
|
||||||
$this->options = $options;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private function checkLoadedSlice()
|
private function checkLoadedSlice()
|
||||||
@ -35,10 +32,8 @@ class JsonReader implements ReaderInterface
|
|||||||
//printf("Reached end of set at slice=%d\n", $this->currentFile);
|
//printf("Reached end of set at slice=%d\n", $this->currentFile);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
$flags = ($this->options['bigintAsString']??false)?JSON_BIGINT_AS_STRING:0;
|
|
||||||
$file = $this->files[$this->currentFile];
|
$file = $this->files[$this->currentFile];
|
||||||
$json = @json_decode(@file_get_contents($file), true, 512, $flags);
|
$json = @json_decode(@file_get_contents($file), true);
|
||||||
|
|
||||||
$this->loadData($json);
|
$this->loadData($json);
|
||||||
$this->loadedFile = $this->currentFile;
|
$this->loadedFile = $this->currentFile;
|
||||||
@ -61,13 +56,13 @@ class JsonReader implements ReaderInterface
|
|||||||
$this->checkLoadedSlice();
|
$this->checkLoadedSlice();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function key()
|
public function key(): mixed
|
||||||
{
|
{
|
||||||
//$this->checkLoadedSlice();
|
//$this->checkLoadedSlice();
|
||||||
return $this->counter;
|
return $this->counter;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function current()
|
public function current(): mixed
|
||||||
{
|
{
|
||||||
//$this->checkLoadedSlice();
|
//$this->checkLoadedSlice();
|
||||||
return $this->data[$this->currentIndex];
|
return $this->data[$this->currentIndex];
|
||||||
|
@ -1,13 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
namespace NoccyLabs\Dataset;
|
|
||||||
|
|
||||||
use RuntimeException;
|
|
||||||
|
|
||||||
class UnknownDatasetException extends RuntimeException
|
|
||||||
{
|
|
||||||
public static function DatasetNotFound(): self
|
|
||||||
{
|
|
||||||
return new UnknownDatasetException("The requested dataset is not available");
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,3 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
// This is not an autoloader, it is used by the tests
|
|
@ -1,21 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
namespace NoccyLabs\Dataset;
|
|
||||||
|
|
||||||
defined("NOCCYLABS_DATASET_TEST") || define("NOCCYLABS_DATASET_TEST", true);
|
|
||||||
|
|
||||||
class DatasetManagerTest extends \PHPUnit\Framework\TestCase
|
|
||||||
{
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @covers DatasetManager::__construct
|
|
||||||
* @covers DatasetManager::determineVendorPath
|
|
||||||
*/
|
|
||||||
public function testCreatingTheDatasetManager()
|
|
||||||
{
|
|
||||||
$dm = new DatasetManager();
|
|
||||||
|
|
||||||
$sets = $dm->getAvailableDatasets();
|
|
||||||
$this->assertEquals(2, count($sets), "Expected 2 loaded set");
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,72 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
namespace NoccyLabs\Dataset;
|
|
||||||
|
|
||||||
use ArrayIterator;
|
|
||||||
use Iterator;
|
|
||||||
|
|
||||||
class FilteringReaderIteratorTest extends \PHPUnit\Framework\TestCase
|
|
||||||
{
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @covers FilteringReaderIterator
|
|
||||||
*/
|
|
||||||
public function testSimpleFiltering()
|
|
||||||
{
|
|
||||||
$arr = iterator_to_array(
|
|
||||||
new FilteringReaderIterator(
|
|
||||||
$this->getTestIterator(), [
|
|
||||||
"alpha" => 3
|
|
||||||
]));
|
|
||||||
|
|
||||||
$this->assertEquals(1, count($arr));
|
|
||||||
$this->assertEquals(3, $arr[2]['alpha']);
|
|
||||||
$this->assertEquals('red', $arr[2]['beta']);
|
|
||||||
$this->assertEquals(null, $arr[2]['gamma']);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @covers FilteringReaderIterator
|
|
||||||
*/
|
|
||||||
public function testFilteringOnRange()
|
|
||||||
{
|
|
||||||
$arr = iterator_to_array(
|
|
||||||
new FilteringReaderIterator(
|
|
||||||
$this->getTestIterator(), [
|
|
||||||
"alpha" => [ 'gt' => 2, 'lt' => 5 ]
|
|
||||||
]));
|
|
||||||
|
|
||||||
$this->assertEquals(2, count($arr));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @covers FilteringReaderIterator
|
|
||||||
*/
|
|
||||||
public function testFilteringOnRangeInclusive()
|
|
||||||
{
|
|
||||||
$arr = iterator_to_array(
|
|
||||||
new FilteringReaderIterator(
|
|
||||||
$this->getTestIterator(), [
|
|
||||||
"alpha" => [ 'gt' => 2, 'lte' => 5 ]
|
|
||||||
]));
|
|
||||||
|
|
||||||
$this->assertEquals(3, count($arr));
|
|
||||||
}
|
|
||||||
|
|
||||||
private function getTestData(): array
|
|
||||||
{
|
|
||||||
return [
|
|
||||||
[ 'alpha' => 1, 'beta' => 'green', 'gamma' => true ],
|
|
||||||
[ 'alpha' => 2, 'beta' => 'blue', 'gamma' => false ],
|
|
||||||
[ 'alpha' => 3, 'beta' => 'red', 'gamma' => null ],
|
|
||||||
[ 'alpha' => 4, 'beta' => 'yellow', 'gamma' => false ],
|
|
||||||
[ 'alpha' => 5, 'beta' => 'pink', 'gamma' => true ],
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
private function getTestIterator(): Iterator
|
|
||||||
{
|
|
||||||
return new ArrayIterator($this->getTestData());
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,29 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
namespace NoccyLabs\Dataset\Readers;
|
|
||||||
|
|
||||||
|
|
||||||
class CsvReaderTest extends \PHPUnit\Framework\TestCase
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* @covers CsvReader
|
|
||||||
*/
|
|
||||||
public function testReadingSingleFile()
|
|
||||||
{
|
|
||||||
$reader = new CsvReader(__DIR__."/../../test/test/data/data.csv", []);
|
|
||||||
$arr = iterator_to_array($reader);
|
|
||||||
|
|
||||||
$this->assertEquals(5, count($arr), "Expected 5 items");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @covers CsvReader
|
|
||||||
*/
|
|
||||||
public function testReadingSplitFile()
|
|
||||||
{
|
|
||||||
$reader = new CsvReader(__DIR__."/../../test/test/data/*.csv", []);
|
|
||||||
$arr = iterator_to_array($reader);
|
|
||||||
|
|
||||||
$this->assertEquals(5, count($arr), "Expected 5 items");
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,6 +0,0 @@
|
|||||||
"a";"b"
|
|
||||||
1;a
|
|
||||||
2;b
|
|
||||||
3;c
|
|
||||||
4;d
|
|
||||||
5;e
|
|
|
@ -1,7 +0,0 @@
|
|||||||
[
|
|
||||||
{ "a": 1, "b": "a" },
|
|
||||||
{ "a": 2, "b": "b" },
|
|
||||||
{ "a": 3, "b": "c" },
|
|
||||||
{ "a": 4, "b": "d" },
|
|
||||||
{ "a": 5, "b": "e" }
|
|
||||||
]
|
|
@ -1,10 +0,0 @@
|
|||||||
{
|
|
||||||
"datasets": {
|
|
||||||
"csv": {
|
|
||||||
"filename": "data/data.csv"
|
|
||||||
},
|
|
||||||
"json": {
|
|
||||||
"filename": "data/data.json"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
x
Reference in New Issue
Block a user