Compare commits
5 Commits
Author | SHA1 | Date | |
---|---|---|---|
d0956f851c | |||
abde48640f | |||
f1ab6bd74b | |||
8d6c1800b9 | |||
c3e651440f |
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,3 +1,4 @@
|
||||
/vendor
|
||||
/composer.lock
|
||||
|
||||
/*.zip
|
||||
/.phpunit.cache
|
||||
|
41
README.md
41
README.md
@ -1,22 +1,47 @@
|
||||
# Dataset Library for PHP/Composer
|
||||
|
||||
This is a library for loading datasets from bundled packages.
|
||||
This is a library for loading datasets from bundled packages. The idea isn't to
|
||||
use the classes in this library as a generic datasource, although you could
|
||||
probably pull that off. Instead, use the datasets to import the relevant data to
|
||||
your database of choice, and optionally keep track of the version numbers so
|
||||
new data can be imported automatically when the dependencies have been updated
|
||||
and a new version has been installed.
|
||||
|
||||
## Installing
|
||||
|
||||
To install dataset, require it using composer:
|
||||
|
||||
```shell
|
||||
$ composer require noccylabs/dataset
|
||||
```
|
||||
|
||||
You also need some actual datasets. Some interesting ones could be:
|
||||
|
||||
Package | Description
|
||||
---|---
|
||||
[noccylabs/dataset-postal](https://dev.noccylabs.info/noccy/dataset-postal) | Patterns and info for postal (zip) code validation
|
||||
[noccylabs/dataset-calendar](https://dev.noccylabs.info/noccy/dataset-calendar) | Bank holidays
|
||||
[noccylabs/dataset-iso3166](https://dev.noccylabs.info/noccy/dataset-iso3166) | ISO 3166 country codes and names
|
||||
|
||||
## Example
|
||||
|
||||
Require Dataset, and some actual datasets:
|
||||
|
||||
$ composer require noccylabs/dataset juicebox/flavordata
|
||||
|
||||
You can now open the desired sets and do useful stuff with the data:
|
||||
|
||||
```php
|
||||
use NoccyLabs\Dataset\DatasetManager;
|
||||
|
||||
$dm = new DatasetManager();
|
||||
$ds = $dm->getDataset("juicebox/flavordata#flavors");
|
||||
|
||||
foreach ($ds as $row) {
|
||||
// Call on getDataset() if you want access to the metadata,
|
||||
// Replace with openDataset() to quickly call getDataset()->open()
|
||||
$ds = $dm->getDataset("noccylabs/dataset-iso3166#countries");
|
||||
|
||||
// This is how you get the metadata
|
||||
echo "Dataset ID: ".$ds->getIdentifier(); // noccylabs/dataset-iso3166#countries
|
||||
echo "Dataset version: ".$ds->getVersion(); // 2022.10.1
|
||||
|
||||
// Get a reader by calling open()
|
||||
$reader = $ds->open();
|
||||
foreach ($reader as $row) {
|
||||
// row is an array
|
||||
}
|
||||
```
|
||||
|
30
bin/dataset-info
Executable file
30
bin/dataset-info
Executable file
@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env php
<?php

// Command-line helper: prints every dataset reported by the DatasetManager,
// followed by its row count and the keys of its first row.

// Directories that are probed for Composer's autoload.php, in order.
$autoloadCandidates = [
    __DIR__."/..",
    __DIR__."/../vendor",
    __DIR__."/../../..",
    __DIR__."/vendor",
];

foreach ($autoloadCandidates as $dir) {
    if (file_exists($dir."/autoload.php")) {
        define("COMPOSER_VENDOR_PATH", $dir);
        require_once $dir."/autoload.php";
        // Stop at the first match: a second define() of the same constant
        // would only raise a warning and keep the first value anyway.
        break;
    }
}

if (!defined("COMPOSER_VENDOR_PATH")) {
    // Without an autoloader the DatasetManager class can not be resolved,
    // so fail with a readable message instead of a class-not-found error.
    fwrite(STDERR, "Unable to locate autoload.php; run 'composer install' first\n");
    exit(1);
}

$datasetManager = new NoccyLabs\Dataset\DatasetManager();

$datasets = $datasetManager->getAvailableDatasets();

foreach ($datasets as $dataset) {
    echo $dataset->getIdentifier()." (".$dataset->getVersion().")\n";

    $reader = $dataset->open();
    $rows = 0;
    // Start with an empty list so a dataset without rows can still be
    // reported; join() does not accept null.
    $headers = [];
    foreach ($reader as $row) {
        // The keys of the first row are used as the column headers.
        if (!$headers) $headers = array_keys($row);
        $rows++;
    }

    echo " # ".$rows." rows\n";
    if ($headers) {
        echo " - ".join("\n - ", $headers)."\n";
    }
}
|
@ -14,5 +14,9 @@
|
||||
"email": "cvagnetoft@gmail.com"
|
||||
}
|
||||
],
|
||||
"require": {}
|
||||
"bin": [ "bin/dataset-info" ],
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "^9.5",
|
||||
"phpstan/phpstan": "^1.8"
|
||||
}
|
||||
}
|
||||
|
13
phpstan.neon
Normal file
13
phpstan.neon
Normal file
@ -0,0 +1,13 @@
|
||||
parameters:
|
||||
level: 5
|
||||
|
||||
excludePaths:
|
||||
- doc
|
||||
- vendor
|
||||
- var
|
||||
- tests
|
||||
|
||||
# Paths to include in the analysis
|
||||
paths:
|
||||
- src
|
||||
|
27
phpunit.xml
Normal file
27
phpunit.xml
Normal file
@ -0,0 +1,27 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/9.5/phpunit.xsd"
|
||||
bootstrap="vendor/autoload.php"
|
||||
cacheResultFile=".phpunit.cache/test-results"
|
||||
executionOrder="depends,defects"
|
||||
forceCoversAnnotation="true"
|
||||
beStrictAboutCoversAnnotation="true"
|
||||
beStrictAboutOutputDuringTests="true"
|
||||
beStrictAboutTodoAnnotatedTests="true"
|
||||
convertDeprecationsToExceptions="true"
|
||||
failOnRisky="true"
|
||||
failOnWarning="true"
|
||||
verbose="true">
|
||||
<testsuites>
|
||||
<testsuite name="default">
|
||||
<directory>tests</directory>
|
||||
</testsuite>
|
||||
</testsuites>
|
||||
|
||||
<coverage cacheDirectory=".phpunit.cache/code-coverage"
|
||||
processUncoveredFiles="true">
|
||||
<include>
|
||||
<directory suffix=".php">src</directory>
|
||||
</include>
|
||||
</coverage>
|
||||
</phpunit>
|
@ -7,14 +7,23 @@ use NoccyLabs\Dataset\Readers\JsonReader;
|
||||
|
||||
class Dataset
|
||||
{
|
||||
protected string $packageName;
|
||||
|
||||
protected string $datasetName;
|
||||
|
||||
protected string $identifier;
|
||||
|
||||
protected array $options;
|
||||
|
||||
public function __construct(string $identifier, array $options)
|
||||
protected ?string $version;
|
||||
|
||||
public function __construct(string $identifier, array $options, ?string $version=null)
|
||||
{
|
||||
$this->identifier = $identifier;
|
||||
$this->options = $options;
|
||||
$this->version = $version;
|
||||
|
||||
[$this->packageName, $this->datasetName] = explode("#", $identifier, 2);
|
||||
}
|
||||
|
||||
public function getIdentifier(): string
|
||||
@ -22,6 +31,31 @@ class Dataset
|
||||
return $this->identifier;
|
||||
}
|
||||
|
||||
public function getPackageName(): string
|
||||
{
|
||||
return $this->packageName;
|
||||
}
|
||||
|
||||
public function getDatasetName(): string
|
||||
{
|
||||
return $this->datasetName;
|
||||
}
|
||||
|
||||
public function getVersion(): ?string
|
||||
{
|
||||
return $this->version;
|
||||
}
|
||||
|
||||
public function getComment(): ?string
|
||||
{
|
||||
return array_key_exists('comment', $this->options) ? $this->options['comment'] : null;
|
||||
}
|
||||
|
||||
public function getLicense(): ?string
|
||||
{
|
||||
return array_key_exists('license', $this->options) ? $this->options['license'] : null;
|
||||
}
|
||||
|
||||
public function open(): ReaderInterface
|
||||
{
|
||||
$filename = $this->options['filename'];
|
||||
|
@ -11,9 +11,9 @@ namespace NoccyLabs\Dataset;
|
||||
*/
|
||||
class DatasetManager
|
||||
{
|
||||
private static array $packageVersions = [];
|
||||
|
||||
|
||||
private static $datasets = [];
|
||||
private static array $datasets = [];
|
||||
|
||||
public function __construct()
|
||||
{
|
||||
@ -22,16 +22,46 @@ class DatasetManager
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return all the available datasets
|
||||
*
|
||||
* @return Array<Dataset> The available datasets
|
||||
*/
|
||||
public function getAvailableDatasets(): array
|
||||
{
|
||||
return self::$datasets;
|
||||
}
|
||||
|
||||
/**
|
||||
* Directly return a reader for a specific dataset.
|
||||
*
|
||||
* @param string $identifier The dataset identifier
|
||||
* @return ReaderInterface A reader for the data
|
||||
* @throws InvalidDatasetException if the dataset can not be opened
|
||||
* @throws UnknownDatasetException if the dataset does not exist
|
||||
*/
|
||||
public function openDataset(string $identifier): ReaderInterface
|
||||
{
|
||||
return $this->getDataset($identifier)->open();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return a Dataset object containing metadata and methods to retrieve
|
||||
* a reader for the data in the set.
|
||||
*
|
||||
* @param string $identifier The dataset identifier
|
||||
* @throws UnknownDatasetException if the dataset does not exist
|
||||
*/
|
||||
public function getDataset(string $identifier): Dataset
|
||||
{
|
||||
if (!array_key_exists($identifier, self::$datasets)) {
|
||||
throw UnknownDatasetException::DatasetNotFound();
|
||||
}
|
||||
|
||||
return self::$datasets[$identifier];
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the vendor directory and try to locate all bundled datasets
|
||||
*
|
||||
@ -47,6 +77,15 @@ class DatasetManager
|
||||
|
||||
$glob = glob($root."/*/*/dataset.json");
|
||||
|
||||
self::$packageVersions = [];
|
||||
$fn = realpath($root."/composer/installed.php");
|
||||
if (file_exists($fn)) {
|
||||
$versions = include $fn;
|
||||
foreach ($versions['versions'] as $name=>$version) {
|
||||
self::$packageVersions[$name] = $version['version'];
|
||||
}
|
||||
}
|
||||
|
||||
foreach ($glob as $match) {
|
||||
$path = dirname($match);
|
||||
$package = basename(dirname($path))."/".basename($path);
|
||||
@ -58,8 +97,8 @@ class DatasetManager
|
||||
*
|
||||
*
|
||||
*
|
||||
* @param string The package name (org/package)
|
||||
* @param string The full path to the package (..../vendor/org/package)
|
||||
* @param string $package The package name (org/package)
|
||||
* @param string $path The full path to the package (..../vendor/org/package)
|
||||
*/
|
||||
private function scanPackageDatasets(string $package, string $path)
|
||||
{
|
||||
@ -75,29 +114,45 @@ class DatasetManager
|
||||
}
|
||||
|
||||
$this->loadDatasets($json['datasets'], null, $package, $path);
|
||||
//printf("found %d sets in %s\n", count($json['datasets']), $package);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
*/
|
||||
private function loadDatasets(array $datasets, ?string $prefix, string $package, string $path)
|
||||
{
|
||||
foreach ($datasets as $name=>$info) {
|
||||
if (!array_key_exists('filename', $info)) {
|
||||
$this->loadDatasets($info, ltrim($prefix . "." . $name, "."), $package, $path);
|
||||
foreach ($datasets as $name=>$options) {
|
||||
if (!array_key_exists('filename', $options)) {
|
||||
$this->loadDatasets($options, ltrim($prefix . "." . $name, "."), $package, $path);
|
||||
return;
|
||||
}
|
||||
$info['filename'] = $path . "/" . $info['filename'];
|
||||
$options['filename'] = $path . "/" . $options['filename'];
|
||||
$pn = sprintf("%s#%s", $package, ltrim($prefix.".".$name,"."));
|
||||
$ds = new Dataset($pn, $info);
|
||||
$pv = self::$packageVersions[$package]??null;
|
||||
$ds = new Dataset($pn, $options, $pv);
|
||||
$this->registerDataset($ds);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
*
|
||||
*/
|
||||
private function determineVendorPath(): ?string
|
||||
{
|
||||
if (file_exists(__DIR__."/../../../autoload.php")) {
|
||||
// we are installed as a composer package
|
||||
return dirname(__DIR__, 3);
|
||||
$d = defined("NOCCYLABS_DATASET_TEST") ? (dirname(__DIR__)."/tests") : __DIR__;
|
||||
while ($d != dirname($d)) {
|
||||
if (file_exists($d."/autoload.php")) break;
|
||||
$d = dirname($d);
|
||||
}
|
||||
|
||||
if (file_exists($d."/autoload.php")) {
|
||||
return $d;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
@ -114,17 +169,4 @@ class DatasetManager
|
||||
self::$datasets[$id] = $dataset;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
* @throws InvalidDatasetException if the dataset can not be opened
|
||||
* @throws UnknownDatasetExcception if the dataset does not exist
|
||||
*/
|
||||
public function getDataset(string $identifier): Dataset
|
||||
{
|
||||
|
||||
return self::$datasets[$identifier];
|
||||
}
|
||||
}
|
8
src/InvalidDatasetException.php
Normal file
8
src/InvalidDatasetException.php
Normal file
@ -0,0 +1,8 @@
|
||||
<?php
|
||||
|
||||
namespace NoccyLabs\Dataset;
|
||||
|
||||
use RuntimeException;
|
||||
|
||||
/**
 * Exception type documented by DatasetManager::openDataset() as being thrown
 * when a dataset can not be opened.
 */
class InvalidDatasetException extends RuntimeException
{
}
|
@ -8,6 +8,8 @@ class CsvReader implements ReaderInterface
|
||||
{
|
||||
private array $files = [];
|
||||
|
||||
private array $options = [];
|
||||
|
||||
private int $currentFile = 0;
|
||||
|
||||
private ?int $loadedFile = null;
|
||||
@ -21,6 +23,7 @@ class CsvReader implements ReaderInterface
|
||||
public function __construct(string $filename, array $options)
|
||||
{
|
||||
$this->files = glob($filename);
|
||||
$this->options = $options;
|
||||
}
|
||||
|
||||
private function checkLoadedSlice()
|
||||
|
@ -8,6 +8,8 @@ class JsonReader implements ReaderInterface
|
||||
{
|
||||
private array $files = [];
|
||||
|
||||
private array $options = [];
|
||||
|
||||
private int $currentFile = 0;
|
||||
|
||||
private ?int $loadedFile = null;
|
||||
@ -21,6 +23,7 @@ class JsonReader implements ReaderInterface
|
||||
public function __construct(string $filename, array $options)
|
||||
{
|
||||
$this->files = glob($filename);
|
||||
$this->options = $options;
|
||||
}
|
||||
|
||||
private function checkLoadedSlice()
|
||||
|
13
src/UnknownDatasetExeption.php
Normal file
13
src/UnknownDatasetExeption.php
Normal file
@ -0,0 +1,13 @@
|
||||
<?php
|
||||
|
||||
namespace NoccyLabs\Dataset;
|
||||
|
||||
use RuntimeException;
|
||||
|
||||
/**
 * Exception raised when a requested dataset identifier is not registered.
 */
class UnknownDatasetException extends RuntimeException
{
    /**
     * Named constructor for the "dataset not found" case.
     *
     * @return self An exception carrying the standard "not available" message
     */
    public static function DatasetNotFound(): self
    {
        $message = "The requested dataset is not available";

        return new self($message);
    }
}
|
3
tests/autoload.php
Normal file
3
tests/autoload.php
Normal file
@ -0,0 +1,3 @@
|
||||
<?php
|
||||
|
||||
// This is not an autoloader; it is a placeholder file used by the tests
|
7
tests/foo/bar/dataset.json
Normal file
7
tests/foo/bar/dataset.json
Normal file
@ -0,0 +1,7 @@
|
||||
{
|
||||
"datasets": {
|
||||
"baz": {
|
||||
"filename": "data/test.csv"
|
||||
}
|
||||
}
|
||||
}
|
21
tests/src/DatasetManagerTest.php
Normal file
21
tests/src/DatasetManagerTest.php
Normal file
@ -0,0 +1,21 @@
|
||||
<?php
|
||||
|
||||
namespace NoccyLabs\Dataset;
|
||||
|
||||
defined("NOCCYLABS_DATASET_TEST") || define("NOCCYLABS_DATASET_TEST", true);
|
||||
|
||||
/**
 * Smoke test for dataset discovery through the DatasetManager.
 */
class DatasetManagerTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Creating a manager should discover exactly one dataset.
     *
     * The covers annotations use fully-qualified class names: PHPUnit does not
     * resolve them against the current namespace, and an unknown target is
     * reported as a warning.
     *
     * @covers \NoccyLabs\Dataset\DatasetManager::__construct
     * @covers \NoccyLabs\Dataset\DatasetManager::determineVendorPath
     */
    public function testCreatingTheDatasetManager(): void
    {
        $dm = new DatasetManager();

        $sets = $dm->getAvailableDatasets();
        $this->assertCount(1, $sets, "Expected 1 loaded set");
    }
}
|
Reference in New Issue
Block a user