From 3b475c209a11341de51510297f7ab7a79848fbf3 Mon Sep 17 00:00:00 2001 From: ignace nyamagana butera Date: Thu, 28 Sep 2023 22:18:40 +0200 Subject: [PATCH] Prepare 0.3.0 Release --- CHANGELOG.md | 4 +- README.md | 147 +++++++++++++++++++++++++++++++-------------- composer.json | 4 +- src/Parser.php | 117 +++++++++++++++++++++--------------- src/ParserTest.php | 48 +++++++++++++-- 5 files changed, 222 insertions(+), 98 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f6d8b68..73584e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,11 +2,12 @@ All Notable changes to `bakame/html-table` will be documented in this file. -## [0.3.0](https://github.com/bakame-php/html-table/compare/0.2.0...0.3.0) - 2023-09-27 +## [0.3.0](https://github.com/bakame-php/html-table/compare/0.2.0...0.3.0) - 2023-09-29 ### Added - `Parser::tableXpathPosition` +- `Parser::tableCaption` - `Table` class which implements the `TabularDataReader` interface. - `Parser::includeSections` and `Parser::excludeSections` to improve section parsing. @@ -14,6 +15,7 @@ All Notable changes to `bakame/html-table` will be documented in this file. - Improve identifier validation for `Parser::tablePosition` - Remove the `$tableOffset` property. +- `tableHeader` can now re-arrange the table column and remove any unwanted column. ### Deprecated diff --git a/README.md b/README.md index 2158c26..fdcc92a 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ [![Total Downloads](https://img.shields.io/packagist/dt/bakame/html-table.svg?style=flat-square)](https://packagist.org/packages/bakame/html-table) [![Sponsor development of this project](https://img.shields.io/badge/sponsor%20this%20package-%E2%9D%A4-ff69b4.svg?style=flat-square)](https://github.com/sponsors/nyamsprod) -`bakame/html-table` is a small PHP package that allows you to parse, import tabular data represented as -HTML Table. 
Once installed you will be able to do the following: +`bakame/html-table` is a small PHP package that allows you to parse, import and manipulate +tabular data represented as HTML Table. Once installed you will be able to do the following: ```php use Bakame\HtmlTable\Parser; @@ -33,10 +33,6 @@ $table // ] ``` -The Package is responsible for the parsing of the HTML, the manipulation methods used -are part of the `league\csv` package. Please refer to -[its documentation](https://csv.thephpleague.com) for more information. - ## System Requirements **league\csv >= 9.11.0** library is required. @@ -53,11 +49,28 @@ composer require bakame/html-table The `Parser` can convert a file (a PHP stream or a Path with an optional context like `fopen`) or an HTML document into a `League\Csv\TabularData` implementing object. Once converted you -can use all the methods and feature made available by this interface -(see [ResultSet](https://csv.thephpleague.com/9.0/reader/resultset/)) for more information. +can use all the methods and features made available by the interface (see [ResultSet](https://csv.thephpleague.com/9.0/reader/resultset/)) +for more information. **The `Parser` itself is immutable, whenever you change a configuration option a new instance is returned.** +**The `Parser` constructor is private; to instantiate the object you are required to use the `new` method instead.** + +```php +use Bakame\HtmlTable\Parser; + +$parser = Parser::new() + ->ignoreTableHeader() + ->ignoreXmlErrors() + ->withoutFormatter() + ->tableCaption('This is a beautiful table'); +``` + +### parseHtml and parseFile + +To extract and parse your table use either the `parseHtml` or `parseFile` methods. +If parsing is not possible a `ParserError` exception will be thrown. + ```php use Bakame\HtmlTable\Parser; @@ -67,20 +80,18 @@ $table = $parser->parseHtml('...
'); $table = $parser->parseFile('path/to/html/file.html'); ``` -### parseHtml and parseFile - -The `parseHtml` or `parseFile` methods extract and parse your table. If parsing -is not possible a `ParseError` exception will be thrown. - `parseHtml` parses an HTML page represented by: - a `string`, - a `Stringable` object, - a `DOMDocument`, - a `DOMElement`, -- and/or a `SimpleXMLElement` +- or a `SimpleXMLElement` -whereas `parseFile` works with a filepath and/or a PHP readable stream. +whereas `parseFile` works with: + +- a filepath, +- or a PHP readable stream. Both methods return a `Table` instance which implements the `League\Csv\TabularDataReader` interface and also give access to the table caption if present via the `getCaption` method. @@ -137,36 +148,33 @@ By default, when calling the `Parser::new()` named constructor the parser will: - have no formatter attached. - have no default caption to used if none is present in the table. -Each of the following settings can be changed to improve HTML to object conversion for your specific needs: +Each of the following settings can be changed to improve the conversion against your business rules: ### tablePosition and tableXpathPosition -Selecting the table to parse in the HTML page can be done usage two (2) methods +Selecting the table to parse in the HTML page can be done using two (2) methods `Parser::tablePosition` and `Parser::tableXpathPosition` If you know the table position in the page in relation with its integer offset or if you know it's `id` attribute value you should use `Parser::tablePosition` otherwise -for any other complex situations you should favor `Parser::tableXpathPosition` -which expects an `xpath` expression. If the expression is valid, the first -result of the expression will be returned. - -- a string; it will represent the value of the table "id" attribute. -- a positive integer or `0`; it will represent the table offset. 
+favor `Parser::tableXpathPosition` which expects an `xpath` expression. +If the expression is valid, and a list of tables is found, the first result will be returned. ```php use Bakame\HtmlTable\Parser; -$parser = Parser::new()->tablePosition('table-id'); // parse the +$parser = Parser::new()->tablePosition(3); // parses the 4th table of the page $parser = Parser::new()->tableXPathPosition("//main/div/table"); +// parses the first table that matches the xpath expression ``` -`Parser::tableXpathPosition` and `Parser::tablePosition` override each other. It is -recommended to use one or the other but not both at the same time. +**`Parser::tableXpathPosition` and `Parser::tablePosition` override each other. It is +recommended to use one or the other but not both at the same time.** ### tableCaption -You can optionnally define a caption for your table if none is present or found during parsing. +You can optionally define a caption for your table if none is present or found during parsing. ```php use Bakame\HtmlTable\Parser; @@ -175,18 +183,13 @@ $parser = Parser::new()->tableCaption('this is a generated caption'); $parser = Parser::new()->tableCaption(null); // remove any default caption set ``` -### ignoreTableHeader and resolveTableHeader +### tableHeader, tableHeaderPosition, ignoreTableHeader and resolveTableHeader -Tells the parser to attempt or not table header resolution. +The following settings configure the `Parser` in relation to the table header. By default, +the parser will try to parse the first `tr` tag found in the `thead` section of the table. 
+But you can override this behaviour using one of these settings: -```php -use Bakame\HtmlTable\Parser; - -$parser = Parser::new()->ignoreTableHeader(); // no table header will be resolved -$parser = Parser::new()->resolveTableHeader(); // will attempt to resolve the table header -``` - -### tableHeaderPosition +#### tableHeaderPosition Tells where to locate and resolve the table header @@ -198,7 +201,8 @@ $parser = Parser::new()->tableHeaderPosition(Section::thead, 3); // header is the 4th row in the table section ``` -use the `Bakame\HtmlTable\Section` enum to designate which table section to use to resolve the header +The method uses the `Bakame\HtmlTable\Section` enum to designate which table section to use +to resolve the header ```php use Bakame\HtmlTable\Section; @@ -213,12 +217,24 @@ enum Section ``` If `Section::tr` is used, `tr` tags will be used independently of their section. -The second argument is the table header offset; it defaults to `0` (ie: the first row). +The second argument is the table header `tr` offset; it defaults to `0` (ie: the first row). + +#### ignoreTableHeader and resolveTableHeader -### tableHeader +Instructs the parser to resolve or not the table header using `tableHeaderPosition` configuration. +If no resolution is done, no header will be included in the returned `Table` instance. 
+ +```php +use Bakame\HtmlTable\Parser; + +$parser = Parser::new()->ignoreTableHeader(); // no table header will be resolved +$parser = Parser::new()->resolveTableHeader(); // will attempt to resolve the table header +``` + +#### tableHeader You can specify directly the header of your table and override any other table header -related configuration with this one +related configuration with this configuration ```php use Bakame\HtmlTable\Parser; @@ -231,6 +247,19 @@ $parser = Parser::new()->tableHeader(['rank', 'team', 'winner']); **Because it is a tabular data each cell MUST be unique otherwise an exception will be thrown** +You can skip or re-arrange the source columns by skipping them by their offsets and/or by +re-ordering the offsets. + +```php +use Bakame\HtmlTable\Parser; +use Bakame\HtmlTable\Section; + +$parser = Parser::new()->tableHeader([3 => 'rank', 7 => 'winner', 5 => 'team']); +// only 3 columns will be extracted: the 4th, 6th and 8th columns +// and re-arranged as 'rank' first and 'team' last +// if a column is missing its value will be PHP `null` type +``` + ### includeSection and excludeSection Tells which section should be parsed based on the `Section` enum @@ -239,13 +268,24 @@ Tells which section should be parsed based on the `Section` enum use Bakame\HtmlTable\Parser; use Bakame\HtmlTable\Section; -$parser = Parser::new()->includeSection(Section::thead); // thead is included during parsing -$parser = Parser::new()->excludeSection(Section::tr); // table direct tr children are not included during parsing +$parser = Parser::new()->includeSection(Section::tbody); // tbody is included during parsing +$parser = Parser::new()->excludeSection(Section::tr, Section::tfoot); // table direct tr children and tfoot are not included during parsing ``` **By default, the `thead` section is not parse. 
If a `thead` row is selected to be the header, it will be parsed independently of this setting.** +**⚠️Tips:** to be sure of which sections will be modified, first remove all previous settings +before applying your configuration as shown below: + +```diff +- Parser::new()->includeSection(Section::tbody); ++ Parser::new()->excludeSection(...Section::cases())->includeSection(Section::tbody); +``` + +The first call will still include the `tfoot` and the `tr` sections, whereas the second call +removes any previous setting, guaranteeing that only the `tbody` if present will be parsed. + ### withFormatter and withoutFormatter Adds or remove a record formatter applied to the data extracted from the table before you @@ -267,6 +307,25 @@ function (array $record): array; If a header was defined or specified, the submitted record will have the header definition set, otherwise an array list is provided. +The following formatter will work on any table content as long as it is defined as a string. + +```php +$formatter = fn (array $record): array => array_map(strtolower(...), $record); +// the following formatter will convert all the fields from your table to lowercase. +``` + +The following formatter will only work if the table has a header attached to it with +a column named `count`. + +```php +$formatter = function (array $record): array { + $record['count'] = (int) $record['count']; + + return $record; +}; +// the following formatter will convert the data of the count column into integers. +``` + ### ignoreXmlErrors and failOnXmlErrors Tells whether the parser should ignore or throw in case of malformed HTML content. 
diff --git a/composer.json b/composer.json index 6cb058b..8385cee 100644 --- a/composer.json +++ b/composer.json @@ -27,20 +27,22 @@ ], "require": { "ext-dom": "*", - "ext-json": "*", "ext-libxml": "*", "ext-mbstring": "*", "ext-simplexml": "*", "league/csv": "^9.11.0" }, "require-dev": { + "ext-curl": "*", "ext-xdebug": "*", "friendsofphp/php-cs-fixer": "^v3.28.0", + "laravel/prompts": "^0.1.9", "phpstan/phpstan": "^1.10.35", "phpstan/phpstan-deprecation-rules": "^1.1.4", "phpstan/phpstan-phpunit": "^1.3.14", "phpstan/phpstan-strict-rules": "^1.5.1", "phpunit/phpunit": "^10.3.5", + "symfony/css-selector": "^6.3", "symfony/var-dumper": "^6.3.4" }, "autoload": { diff --git a/src/Parser.php b/src/Parser.php index 4b13323..61f8d16 100644 --- a/src/Parser.php +++ b/src/Parser.php @@ -14,20 +14,15 @@ use Iterator; use League\Csv\ResultSet; use League\Csv\SyntaxError; -use League\Csv\TabularDataReader; use SimpleXMLElement; use Stringable; -use function array_combine; use function array_fill; use function array_filter; use function array_key_exists; use function array_merge; -use function array_pad; use function array_shift; -use function array_slice; use function array_unique; -use function count; use function fclose; use function fopen; use function in_array; @@ -52,13 +47,13 @@ final class Parser */ private function __construct( private readonly string $tableExpression, + private readonly ?string $caption, private readonly array $tableHeader, private readonly bool $ignoreTableHeader, private readonly string $tableHeaderExpression, - private readonly bool $throwOnXmlErrors, private readonly array $includedSections, private readonly ?Closure $formatter, - private readonly ?string $caption, + private readonly bool $throwOnXmlErrors, ) { } @@ -66,13 +61,13 @@ public static function new(): self { return new self( '(//table)[1]', + null, [], false, '(//table/thead/tr)[1]', - false, [Section::tbody->value => 1, Section::tr->value => 1, Section::tfoot->value => 1], null, - 
null, + false, ); } @@ -84,13 +79,13 @@ public function tableXPathPosition(string $expression): self false === (new DOMXPath(new DOMDocument()))->query($expression) => throw new ParserError('The xpath expression `'.$expression.'` is invalie.'), default => new self( $expression, + $this->caption, $this->tableHeader, $this->ignoreTableHeader, $this->tableHeaderExpression, - $this->throwOnXmlErrors, $this->includedSections, $this->formatter, - $this->caption, + $this->throwOnXmlErrors, ), }; restore_error_handler(); @@ -126,13 +121,13 @@ public function tableHeader(array $headerRow): self $headerRow !== array_unique($filteredHeader) => throw ParserError::dueToDuplicateHeaderColumnNames($headerRow), default => new self( $this->tableExpression, + $this->caption, $headerRow, $this->ignoreTableHeader, $this->tableHeaderExpression, - $this->throwOnXmlErrors, $this->includedSections, $this->formatter, - $this->caption, + $this->throwOnXmlErrors, ), }; } @@ -143,13 +138,13 @@ public function ignoreTableHeader(): self true => $this, false => new self( $this->tableExpression, + $this->caption, $this->tableHeader, true, $this->tableHeaderExpression, - $this->throwOnXmlErrors, $this->includedSections, $this->formatter, - $this->caption, + $this->throwOnXmlErrors, ), }; } @@ -160,13 +155,13 @@ public function resolveTableHeader(): self false => $this, true => new self( $this->tableExpression, + $this->caption, $this->tableHeader, false, $this->tableHeaderExpression, - $this->throwOnXmlErrors, $this->includedSections, $this->formatter, - $this->caption, + $this->throwOnXmlErrors, ), }; } @@ -182,53 +177,67 @@ public function tableHeaderPosition(Section $section, int $offset = 0): self $expression => $this, default => new self( $this->tableExpression, + $this->caption, $this->tableHeader, $this->ignoreTableHeader, $expression, - $this->throwOnXmlErrors, $this->includedSections, $this->formatter, - $this->caption, + $this->throwOnXmlErrors, ), }; } - public function 
includeSection(Section $section): self + public function includeSection(Section ...$sections): self { - $includedSections = $this->includedSections; - $includedSections[$section->value] = 1; + $includedSections = array_reduce( + $sections, + function (array $carry, Section $section) { + $carry[$section->value] = 1; + + return $carry; + }, + $this->includedSections + ); return match ($this->includedSections) { $includedSections => $this, default => new self( $this->tableExpression, + $this->caption, $this->tableHeader, $this->ignoreTableHeader, $this->tableHeaderExpression, - $this->throwOnXmlErrors, $includedSections, $this->formatter, - $this->caption, + $this->throwOnXmlErrors, ), }; } - public function excludeSection(Section $section): self + public function excludeSection(Section ...$sections): self { - $includedSections = $this->includedSections; - unset($includedSections[$section->value]); + $includedSections = array_reduce( + $sections, + function (array $carry, Section $section) { + unset($carry[$section->value]); + + return $carry; + }, + $this->includedSections + ); return match ($this->includedSections) { $includedSections => $this, default => new self( $this->tableExpression, + $this->caption, $this->tableHeader, $this->ignoreTableHeader, $this->tableHeaderExpression, - $this->throwOnXmlErrors, $includedSections, $this->formatter, - $this->caption, + $this->throwOnXmlErrors, ), }; } @@ -239,13 +248,13 @@ public function failOnXmlErrors(): self true => $this, false => new self( $this->tableExpression, + $this->caption, $this->tableHeader, $this->ignoreTableHeader, $this->tableHeaderExpression, - true, $this->includedSections, $this->formatter, - $this->caption, + true, ), }; } @@ -256,13 +265,13 @@ public function ignoreXmlErrors(): self false => $this, true => new self( $this->tableExpression, + $this->caption, $this->tableHeader, $this->ignoreTableHeader, $this->tableHeaderExpression, - false, $this->includedSections, $this->formatter, - 
$this->caption, + false, ), }; } @@ -271,13 +280,13 @@ public function withFormatter(Closure $formatter): self { return new self( $this->tableExpression, + $this->caption, $this->tableHeader, $this->ignoreTableHeader, $this->tableHeaderExpression, - $this->throwOnXmlErrors, $this->includedSections, $formatter, - $this->caption, + $this->throwOnXmlErrors, ); } @@ -287,30 +296,30 @@ public function withoutFormatter(): self $this->formatter => $this, default => new self( $this->tableExpression, + $this->caption, $this->tableHeader, $this->ignoreTableHeader, $this->tableHeaderExpression, - $this->throwOnXmlErrors, $this->includedSections, null, - $this->caption, + $this->throwOnXmlErrors, ), }; } - public function tableCaption(?string $caption = null): self + public function tableCaption(?string $caption): self { return match ($this->caption) { $caption => $this, default => new self( $this->tableExpression, + $caption, $this->tableHeader, $this->ignoreTableHeader, $this->tableHeaderExpression, - $this->throwOnXmlErrors, $this->includedSections, $this->formatter, - $caption, + $this->throwOnXmlErrors, ), }; } @@ -322,7 +331,7 @@ public function tableCaption(?string $caption = null): self * @throws ParserError * @throws SyntaxError */ - public function parseFile(mixed $filenameOrStream, $filenameContext = null): TabularDataReader + public function parseFile(mixed $filenameOrStream, $filenameContext = null): Table { if (is_resource($filenameOrStream)) { return $this->parseHtml($this->streamToString($filenameOrStream)); @@ -374,7 +383,7 @@ public function parseHtml(DOMDocument|DOMElement|SimpleXMLElement|Stringable|str $result = $xpath->query('(//caption)[1]'); $caption = $result->item(0)?->nodeValue ?? 
$this->caption; - return new Table(new ResultSet($this->extractTableContents($xpath, $header), $header), $caption); + return new Table(new ResultSet($this->extractTableContents($xpath, $header), array_values($header)), $caption); } /** @@ -564,11 +573,9 @@ private function extractRecord(DOMElement $tr, array &$rowSpanIndices = []): arr */ private function formatRecord(array $record, array $header): array { - $cellCount = count($header); - $record = match ($cellCount) { - 0 => $record, - count($record) => array_combine($header, $record), - default => array_combine($header, array_slice(array_pad($record, $cellCount, ''), 0, $cellCount)), + $record = match ([]) { + $header => $record, + default => $this->combineArray($record, $header), }; return match (null) { @@ -576,4 +583,20 @@ private function formatRecord(array $record, array $header): array default => ($this->formatter)($record), }; } + + /** + * @param array $record + * @param array $header + * + * @return array + */ + private function combineArray(array $record, array $header): array + { + $row = []; + foreach ($header as $offset => $value) { + $row[$value] = $record[$offset] ?? null; + } + + return $row; + } } diff --git a/src/ParserTest.php b/src/ParserTest.php index c4e7403..cae3a63 100644 --- a/src/ParserTest.php +++ b/src/ParserTest.php @@ -47,14 +47,12 @@ public function it_will_return_the_same_options(): void $parser ->tablePosition(0) ->tableHeaderPosition(Section::thead, 0) - ->includeSection(Section::tbody) - ->includeSection(Section::tfoot) - ->includeSection(Section::tr) + ->includeSection(Section::tbody, Section::tfoot, Section::tr) ->tableHeader([]) ->resolveTableHeader() ->ignoreXmlErrors() ->withoutFormatter() - ->tableCaption() + ->tableCaption(null) ); } @@ -286,6 +284,47 @@ public function it_will_use_the_submitted_headers(): void ], $table->first()); } + + #[Test] + public function it_will_rearrange_the_content_with_table_header(): void + { + $html = <<
+ + + + + + +
Abel14M2004
Abiga6F2004
Aboubacar8M2004
Aboubakar6M2004
+TABLE; + + $header = [3 => 'Annee', 2 => 'Sexe', 0 => 'Firstname', 1 => 'Count']; + $table = Parser::new() + ->tableHeader($header) + ->parseHtml($html); + + self::assertSame($table->getHeader(), array_values($header)); + self::assertSame([ + 'Annee' => '2004', + 'Sexe' => 'M', + 'Firstname' => 'Abel', + 'Count' => '14', + ], $table->first()); + + $header = [3 => 'Annee', 0 => 'Firstname', 1 => 'Count']; + $table = Parser::new() + ->tableHeader($header) + ->parseHtml($html); + + self::assertSame($table->getHeader(), array_values($header)); + self::assertSame([ + 'Annee' => '2004', + 'Firstname' => 'Abel', + 'Count' => '14', + ], $table->first()); + } + #[Test] public function it_will_duplicate_colspan_data(): void { @@ -391,7 +430,6 @@ public function it_will_found_no_header_in_any_section(): void self::assertSame([], $table->getHeader()); } - #[Test] public function it_will_use_the_table_footer(): void {