4 Commits
0.1.1 ... 0.1.2

Author SHA1 Message Date
d0956f851c Initial unit tests, code cleanup 2022-10-31 00:42:29 +01:00
abde48640f updated readme 2022-10-30 23:24:27 +01:00
f1ab6bd74b Updated readme, bins 2022-10-30 23:21:53 +01:00
8d6c1800b9 Cleanup, bugfixes 2022-10-30 23:02:14 +01:00
15 changed files with 255 additions and 42 deletions

3
.gitignore vendored
View File

@ -1,3 +1,4 @@
/vendor /vendor
/composer.lock /composer.lock
/*.zip
/.phpunit.cache

View File

@ -1,22 +1,47 @@
# Dataset Library for PHP/Composer # Dataset Library for PHP/Composer
This is a library for loading datasets from bundled packages. This is a library for loading datasets from bundled packages. The idea isn't to
use the classes in this library as a generic datasource, although you could
probably pull that off. Instead, use the datasets to import the relevant data to
your database of choice, and optionally keep track of the version numbers so
new data can be imported automatically when the dependencies have been updated
and a new version has been installed.
## Installing
To install dataset, require it using composer:
```shell
$ composer require noccylabs/dataset
```
You also need some actual datasets. Some interesting ones could be:
Package | Description
---|---
[noccylabs/dataset-postal](https://dev.noccylabs.info/noccy/dataset-postal) | Patterns and info for postal (zip) code validation
[noccylabs/dataset-calendar](https://dev.noccylabs.info/noccy/dataset-calendar) | Bank holidays
[noccylabs/dataset-iso3166](https://dev.noccylabs.info/noccy/dataset-iso3166) | ISO 3166 country codes and names
## Example ## Example
Require Dataset, and some actual datasets:
$ composer require noccylabs/dataset juicebox/flavordata
You can now open the desired sets and do useful stuff with the data:
```php ```php
use NoccyLabs\Dataset\DatasetManager; use NoccyLabs\Dataset\DatasetManager;
$dm = new DatasetManager(); $dm = new DatasetManager();
$ds = $dm->getDataset("juicebox/flavordata#flavors");
foreach ($ds as $row) { // Call on getDataset() if you want access to the metadata,
// Replace with openDataset() to quickly call getDataset()->open()
$ds = $dm->getDataset("noccylabs/dataset-iso3166#countries");
// This is how you get the metadata
echo "Dataset ID: ".$ds->getIdentifier(); // noccylabs/dataset-iso3166#countries
echo "Dataset version: ".$ds->getVersion(); // 2022.10.1
// Get a reader by calling open()
$reader = $ds->open();
foreach ($reader as $row) {
// row is an array // row is an array
} }
``` ```

View File

@ -2,12 +2,14 @@
<?php <?php
foreach ([ foreach ([
__DIR__."/..",
__DIR__."/../vendor", __DIR__."/../vendor",
__DIR__."/../../../vendor" __DIR__."/../../..",
__DIR__."/vendor",
] as $dir) { ] as $dir) {
if (file_exists($dir."/autoload.php")) { if (file_exists($dir."/autoload.php")) {
define("COMPOSER_VENDOR_PATH", $dir);
require_once $dir."/autoload.php"; require_once $dir."/autoload.php";
echo $dir."\n";
} }
} }
@ -16,5 +18,13 @@ $datasetManager = new NoccyLabs\Dataset\DatasetManager();
$datasets = $datasetManager->getAvailableDatasets(); $datasets = $datasetManager->getAvailableDatasets();
foreach ($datasets as $dataset) { foreach ($datasets as $dataset) {
echo $dataset->getIdentifier()."\n"; echo $dataset->getIdentifier()." (".$dataset->getVersion().")\n";
$reader = $dataset->open();
$rows = 0;
$headers = null;
foreach ($reader as $row) {
if (!$headers) $headers = array_keys($row);
$rows++;
}
echo " # ".$rows." rows\n - ".join("\n - ",$headers)."\n";
} }

View File

@ -14,6 +14,9 @@
"email": "cvagnetoft@gmail.com" "email": "cvagnetoft@gmail.com"
} }
], ],
"require": {}, "bin": [ "bin/dataset-info" ],
"bin": [ "bin/dataset-info" ] "require-dev": {
"phpunit/phpunit": "^9.5",
"phpstan/phpstan": "^1.8"
}
} }

13
phpstan.neon Normal file
View File

@ -0,0 +1,13 @@
parameters:
level: 5
excludePaths:
- doc
- vendor
- var
- tests
# Paths to include in the analysis
paths:
- src

27
phpunit.xml Normal file
View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/9.5/phpunit.xsd"
bootstrap="vendor/autoload.php"
cacheResultFile=".phpunit.cache/test-results"
executionOrder="depends,defects"
forceCoversAnnotation="true"
beStrictAboutCoversAnnotation="true"
beStrictAboutOutputDuringTests="true"
beStrictAboutTodoAnnotatedTests="true"
convertDeprecationsToExceptions="true"
failOnRisky="true"
failOnWarning="true"
verbose="true">
<testsuites>
<testsuite name="default">
<directory>tests</directory>
</testsuite>
</testsuites>
<coverage cacheDirectory=".phpunit.cache/code-coverage"
processUncoveredFiles="true">
<include>
<directory suffix=".php">src</directory>
</include>
</coverage>
</phpunit>

View File

@ -7,14 +7,23 @@ use NoccyLabs\Dataset\Readers\JsonReader;
class Dataset class Dataset
{ {
protected string $packageName;
protected string $datasetName;
protected string $identifier; protected string $identifier;
protected array $options; protected array $options;
public function __construct(string $identifier, array $options) protected ?string $version;
public function __construct(string $identifier, array $options, ?string $version=null)
{ {
$this->identifier = $identifier; $this->identifier = $identifier;
$this->options = $options; $this->options = $options;
$this->version = $version;
[$this->packageName, $this->datasetName] = explode("#", $identifier, 2);
} }
public function getIdentifier(): string public function getIdentifier(): string
@ -22,6 +31,31 @@ class Dataset
return $this->identifier; return $this->identifier;
} }
/**
 * Get the package part of the identifier, i.e. the text before the first "#".
 *
 * @return string The package name
 */
public function getPackageName(): string
{
return $this->packageName;
}
/**
 * Get the dataset part of the identifier, i.e. the text after the first "#".
 *
 * @return string The dataset name
 */
public function getDatasetName(): string
{
return $this->datasetName;
}
/**
 * Get the version that was passed to the constructor.
 *
 * @return string|null The version, or null when none was provided
 */
public function getVersion(): ?string
{
return $this->version;
}
/**
 * Get the 'comment' entry from the dataset options.
 *
 * @return string|null The comment, or null when the option is not set
 */
public function getComment(): ?string
{
    return $this->options['comment'] ?? null;
}
/**
 * Get the 'license' entry from the dataset options.
 *
 * @return string|null The license, or null when the option is not set
 */
public function getLicense(): ?string
{
    return $this->options['license'] ?? null;
}
public function open(): ReaderInterface public function open(): ReaderInterface
{ {
$filename = $this->options['filename']; $filename = $this->options['filename'];

View File

@ -11,9 +11,9 @@ namespace NoccyLabs\Dataset;
*/ */
class DatasetManager class DatasetManager
{ {
private static array $packageVersions = [];
private static array $datasets = [];
private static $datasets = [];
public function __construct() public function __construct()
{ {
@ -22,16 +22,46 @@ class DatasetManager
} }
} }
/**
* Return all the available datasets
*
* @return Array<Dataset> The available datasets
*/
public function getAvailableDatasets(): array public function getAvailableDatasets(): array
{ {
return self::$datasets; return self::$datasets;
} }
/**
* Directly return a reader for a specific dataset.
*
* @param string $identifier The dataset identifier
* @return ReaderInterface A reader for the data
* @throws InvalidDatasetException if the dataset can not be opened
* @throws UnknownDatasetException if the dataset does not exist
*/
public function openDataset(string $identifier): ReaderInterface public function openDataset(string $identifier): ReaderInterface
{ {
return $this->getDataset($identifier)->open(); return $this->getDataset($identifier)->open();
} }
/**
 * Look up a registered dataset by its identifier.
 *
 * The returned Dataset carries the metadata and can hand out a reader
 * for the actual data.
 *
 * @param string $identifier The dataset identifier
 * @return Dataset The matching dataset
 * @throws UnknownDatasetException if the dataset does not exist
 */
public function getDataset(string $identifier): Dataset
{
    if (array_key_exists($identifier, self::$datasets)) {
        return self::$datasets[$identifier];
    }

    throw UnknownDatasetException::DatasetNotFound();
}
/** /**
* Find the vendor directory and try to locate all bundled datasets * Find the vendor directory and try to locate all bundled datasets
* *
@ -47,6 +77,15 @@ class DatasetManager
$glob = glob($root."/*/*/dataset.json"); $glob = glob($root."/*/*/dataset.json");
self::$packageVersions = [];
$fn = realpath($root."/composer/installed.php");
if (file_exists($fn)) {
$versions = include $fn;
foreach ($versions['versions'] as $name=>$version) {
self::$packageVersions[$name] = $version['version'];
}
}
foreach ($glob as $match) { foreach ($glob as $match) {
$path = dirname($match); $path = dirname($match);
$package = basename(dirname($path))."/".basename($path); $package = basename(dirname($path))."/".basename($path);
@ -58,8 +97,8 @@ class DatasetManager
* *
* *
* *
* @param string The package name (org/package) * @param string $package The package name (org/package)
* @param string The full path to the package (..../vendor/org/package) * @param string $path The full path to the package (..../vendor/org/package)
*/ */
private function scanPackageDatasets(string $package, string $path) private function scanPackageDatasets(string $package, string $path)
{ {
@ -75,29 +114,45 @@ class DatasetManager
} }
$this->loadDatasets($json['datasets'], null, $package, $path); $this->loadDatasets($json['datasets'], null, $package, $path);
//printf("found %d sets in %s\n", count($json['datasets']), $package);
} }
/**
 * Register every dataset described by a (possibly nested) definition array.
 *
 * An entry that has a 'filename' key is a dataset. An entry without one is
 * treated as a group: its children are loaded recursively, with the group
 * name appended to the dot-separated prefix.
 *
 * @param array $datasets Dataset definitions keyed by name
 * @param string|null $prefix Dot-separated group prefix, or null at the top level
 * @param string $package The package name (org/package)
 * @param string $path The full path to the package
 */
private function loadDatasets(array $datasets, ?string $prefix, string $package, string $path): void
{
    foreach ($datasets as $name => $options) {
        // ltrim() drops the leading dot that appears when there is no prefix yet
        $qualifiedName = ltrim($prefix . "." . $name, ".");

        if (!array_key_exists('filename', $options)) {
            $this->loadDatasets($options, $qualifiedName, $package, $path);
            // Keep going: returning here would skip every sibling that
            // follows the first nested group
            continue;
        }

        $options['filename'] = $path . "/" . $options['filename'];
        $pn = sprintf("%s#%s", $package, $qualifiedName);
        $pv = self::$packageVersions[$package] ?? null;
        $this->registerDataset(new Dataset($pn, $options, $pv));
    }
}
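/**
 * Locate the directory that contains autoload.php.
 *
 * The search walks upwards from this file's directory, or from the package's
 * tests directory when NOCCYLABS_DATASET_TEST is defined.
 *
 * @return string|null The directory holding autoload.php, or null if none was found
 */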
private function determineVendorPath(): ?string private function determineVendorPath(): ?string
{ {
if (file_exists(__DIR__."/../../../autoload.php")) { $d = defined("NOCCYLABS_DATASET_TEST") ? (dirname(__DIR__)."/tests") : __DIR__;
// we are installed as a composer package while ($d != dirname($d)) {
return dirname(__DIR__, 3); if (file_exists($d."/autoload.php")) break;
$d = dirname($d);
}
if (file_exists($d."/autoload.php")) {
return $d;
} }
return null; return null;
} }
@ -114,17 +169,4 @@ class DatasetManager
self::$datasets[$id] = $dataset; self::$datasets[$id] = $dataset;
} }
}
/**
*
*
* @throws InvalidDatasetException if the dataset can not be opened
* @throws UnknownDatasetExcception if the dataset does not exist
*/
public function getDataset(string $identifier): Dataset
{
return self::$datasets[$identifier];
}
}

View File

@ -0,0 +1,8 @@
<?php
namespace NoccyLabs\Dataset;
use RuntimeException;
/**
 * Exception type for datasets that can not be opened.
 */
class InvalidDatasetException extends RuntimeException
{
}

View File

@ -8,6 +8,8 @@ class CsvReader implements ReaderInterface
{ {
private array $files = []; private array $files = [];
private array $options = [];
private int $currentFile = 0; private int $currentFile = 0;
private ?int $loadedFile = null; private ?int $loadedFile = null;
@ -21,6 +23,7 @@ class CsvReader implements ReaderInterface
public function __construct(string $filename, array $options)
{
    // glob() returns false on error; fall back to an empty list so the
    // array-typed property never receives a boolean
    $this->files = glob($filename) ?: [];
    $this->options = $options;
}
private function checkLoadedSlice() private function checkLoadedSlice()

View File

@ -8,6 +8,8 @@ class JsonReader implements ReaderInterface
{ {
private array $files = []; private array $files = [];
private array $options = [];
private int $currentFile = 0; private int $currentFile = 0;
private ?int $loadedFile = null; private ?int $loadedFile = null;
@ -21,6 +23,7 @@ class JsonReader implements ReaderInterface
public function __construct(string $filename, array $options)
{
    // glob() returns false on error; fall back to an empty list so the
    // array-typed property never receives a boolean
    $this->files = glob($filename) ?: [];
    $this->options = $options;
}
private function checkLoadedSlice() private function checkLoadedSlice()

View File

@ -0,0 +1,13 @@
<?php
namespace NoccyLabs\Dataset;
use RuntimeException;
/**
 * Thrown when a dataset identifier does not match any available dataset.
 */
class UnknownDatasetException extends RuntimeException
{
    /**
     * Create the exception for a lookup that found no dataset.
     *
     * @param string|null $identifier The identifier that was requested; when
     *                                given, it is included in the message
     * @return self
     */
    public static function DatasetNotFound(?string $identifier = null): self
    {
        if ($identifier === null) {
            return new self("The requested dataset is not available");
        }

        return new self(sprintf('The requested dataset "%s" is not available', $identifier));
    }
}

3
tests/autoload.php Normal file
View File

@ -0,0 +1,3 @@
<?php
// This is not an autoloader, it is used by the tests

View File

@ -0,0 +1,7 @@
{
"datasets": {
"baz": {
"filename": "data/test.csv"
}
}
}

View File

@ -0,0 +1,21 @@
<?php
namespace NoccyLabs\Dataset;
defined("NOCCYLABS_DATASET_TEST") || define("NOCCYLABS_DATASET_TEST", true);
/**
 * Tests for dataset discovery through DatasetManager.
 */
class DatasetManagerTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Constructing the manager with NOCCYLABS_DATASET_TEST defined should
     * register exactly one dataset.
     *
     * The coverage targets are written as fully-qualified class names because
     * PHPUnit does not resolve them relative to the current namespace. The
     * whole manager is covered since the constructor runs its private
     * scanning helpers, and Dataset is listed as used because the manager
     * instantiates it while loading.
     *
     * @covers \NoccyLabs\Dataset\DatasetManager
     * @uses \NoccyLabs\Dataset\Dataset
     */
    public function testCreatingTheDatasetManager(): void
    {
        $dm = new DatasetManager();
        $sets = $dm->getAvailableDatasets();
        $this->assertCount(1, $sets, "Expected 1 loaded set");
    }
}