diff --git a/CHANGELOG.md b/CHANGELOG.md
index f6d8b68..73584e9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,11 +2,12 @@
All Notable changes to `bakame/html-table` will be documented in this file.
-## [0.3.0](https://github.com/bakame-php/html-table/compare/0.2.0...0.3.0) - 2023-09-27
+## [0.3.0](https://github.com/bakame-php/html-table/compare/0.2.0...0.3.0) - 2023-09-29
### Added
- `Parser::tableXpathPosition`
+- `Parser::tableCaption`
- `Table` class which implements the `TabularDataReader` interface.
- `Parser::includeSections` and `Parser::excludeSections` to improve section parsing.
@@ -14,6 +15,7 @@ All Notable changes to `bakame/html-table` will be documented in this file.
- Improve identifier validation for `Parser::tablePosition`
- Remove the `$tableOffset` property.
+- `tableHeader` can now re-arrange the table columns and remove any unwanted column.
### Deprecated
diff --git a/README.md b/README.md
index 2158c26..fdcc92a 100644
--- a/README.md
+++ b/README.md
@@ -7,8 +7,8 @@
[![Total Downloads](https://img.shields.io/packagist/dt/bakame/html-table.svg?style=flat-square)](https://packagist.org/packages/bakame/html-table)
[![Sponsor development of this project](https://img.shields.io/badge/sponsor%20this%20package-%E2%9D%A4-ff69b4.svg?style=flat-square)](https://github.com/sponsors/nyamsprod)
-`bakame/html-table` is a small PHP package that allows you to parse, import tabular data represented as
-HTML Table. Once installed you will be able to do the following:
+`bakame/html-table` is a small PHP package that allows you to parse, import and manipulate
+tabular data represented as HTML Table. Once installed you will be able to do the following:
```php
use Bakame\HtmlTable\Parser;
@@ -33,10 +33,6 @@ $table
// ]
```
-The Package is responsible for the parsing of the HTML, the manipulation methods used
-are part of the `league\csv` package. Please refer to
-[its documentation](https://csv.thephpleague.com) for more information.
-
## System Requirements
**league\csv >= 9.11.0** library is required.
@@ -53,11 +49,28 @@ composer require bakame/html-table
The `Parser` can convert a file (a PHP stream or a Path with an optional context like `fopen`)
or an HTML document into a `League\Csv\TabularData` implementing object. Once converted you
-can use all the methods and feature made available by this interface
-(see [ResultSet](https://csv.thephpleague.com/9.0/reader/resultset/)) for more information.
+can use all the methods and feature made available by the interface (see [ResultSet](https://csv.thephpleague.com/9.0/reader/resultset/))
+for more information.
**The `Parser` itself is immutable, whenever you change a configuration option a new instance is returned.**
+**The `Parser` constructor is private; to instantiate the object you are required to use the `new` method instead.**
+
+```php
+use Bakame\HtmlTable\Parser;
+
+$parser = Parser::new()
+ ->ignoreTableHeader()
+ ->ignoreXmlErrors()
+ ->withoutFormatter()
+ ->tableCaption('This is a beautiful table');
+```
+
+### parseHtml and parseFile
+
+To extract and parse your table use either the `parseHtml` or `parseFile` methods.
+If parsing is not possible a `ParseError` exception will be thrown.
+
```php
use Bakame\HtmlTable\Parser;
@@ -67,20 +80,18 @@ $table = $parser->parseHtml('
');
$table = $parser->parseFile('path/to/html/file.html');
```
-### parseHtml and parseFile
-
-The `parseHtml` or `parseFile` methods extract and parse your table. If parsing
-is not possible a `ParseError` exception will be thrown.
-
`parseHtml` parses an HTML page represented by:
- a `string`,
- a `Stringable` object,
- a `DOMDocument`,
- a `DOMElement`,
-- and/or a `SimpleXMLElement`
+- or a `SimpleXMLElement`
-whereas `parseFile` works with a filepath and/or a PHP readable stream.
+whereas `parseFile` works with:
+
+- a filepath,
+- or a PHP readable stream.
Both methods return a `Table` instance which implements the `League\Csv\TabularDataReader`
interface and also give access to the table caption if present via the `getCaption` method.
@@ -137,36 +148,33 @@ By default, when calling the `Parser::new()` named constructor the parser will:
- have no formatter attached.
- have no default caption to used if none is present in the table.
-Each of the following settings can be changed to improve HTML to object conversion for your specific needs:
+Each of the following settings can be changed to improve the conversion against your business rules:
### tablePosition and tableXpathPosition
-Selecting the table to parse in the HTML page can be done usage two (2) methods
+Selecting the table to parse in the HTML page can be done using two (2) methods
`Parser::tablePosition` and `Parser::tableXpathPosition`
If you know the table position in the page in relation with its integer offset or if
you know it's `id` attribute value you should use `Parser::tablePosition` otherwise
-for any other complex situations you should favor `Parser::tableXpathPosition`
-which expects an `xpath` expression. If the expression is valid, the first
-result of the expression will be returned.
-
-- a string; it will represent the value of the table "id" attribute.
-- a positive integer or `0`; it will represent the table offset.
+favor `Parser::tableXpathPosition` which expects an `xpath` expression.
+If the expression is valid, and a list of tables is found, the first result will be returned.
```php
use Bakame\HtmlTable\Parser;
-$parser = Parser::new()->tablePosition('table-id'); // parse the
+$parser = Parser::new()->tablePosition(3); // parses the 4th table of the page
$parser = Parser::new()->tableXPathPosition("//main/div/table");
+//parse the first table that matches the xpath expression
```
-`Parser::tableXpathPosition` and `Parser::tablePosition` override each other. It is
-recommended to use one or the other but not both at the same time.
+**`Parser::tableXpathPosition` and `Parser::tablePosition` override each other. It is
+recommended to use one or the other but not both at the same time.**
### tableCaption
-You can optionnally define a caption for your table if none is present or found during parsing.
+You can optionally define a caption for your table if none is present or found during parsing.
```php
use Bakame\HtmlTable\Parser;
@@ -175,18 +183,13 @@ $parser = Parser::new()->tableCaption('this is a generated caption');
$parser = Parser::new()->tableCaption(null); // remove any default caption set
```
-### ignoreTableHeader and resolveTableHeader
+### tableHeader, tableHeaderPosition, ignoreTableHeader and resolveTableHeader
-Tells the parser to attempt or not table header resolution.
+The following settings configure the `Parser` in relation to the table header. By default,
+the parser will try to parse the first `tr` tag found in the `thead` section of the table.
+But you can override this behaviour using one of these settings:
-```php
-use Bakame\HtmlTable\Parser;
-
-$parser = Parser::new()->ignoreTableHeader(); // no table header will be resolved
-$parser = Parser::new()->resolveTableHeader(); // will attempt to resolve the table header
-```
-
-### tableHeaderPosition
+#### tableHeaderPosition
Tells where to locate and resolve the table header
@@ -198,7 +201,8 @@ $parser = Parser::new()->tableHeaderPosition(Section::thead, 3);
// header is the 4th row in the table section
```
-use the `Bakame\HtmlTable\Section` enum to designate which table section to use to resolve the header
+The method uses the `Bakame\HtmlTable\Section` enum to designate which table section to use
+to resolve the header
```php
use Bakame\HtmlTable\Section;
@@ -213,12 +217,24 @@ enum Section
```
If `Section::tr` is used, `tr` tags will be used independently of their section.
-The second argument is the table header offset; it defaults to `0` (ie: the first row).
+The second argument is the table header `tr` offset; it defaults to `0` (ie: the first row).
+
+#### ignoreTableHeader and resolveTableHeader
-### tableHeader
+Instructs the parser to resolve or not the table header using `tableHeaderPosition` configuration.
+If no resolution is done, no header will be included in the returned `Table` instance.
+
+```php
+use Bakame\HtmlTable\Parser;
+
+$parser = Parser::new()->ignoreTableHeader(); // no table header will be resolved
+$parser = Parser::new()->resolveTableHeader(); // will attempt to resolve the table header
+```
+
+#### tableHeader
You can specify directly the header of your table and override any other table header
-related configuration with this one
+related configuration with this configuration
```php
use Bakame\HtmlTable\Parser;
@@ -231,6 +247,19 @@ $parser = Parser::new()->tableHeader(['rank', 'team', 'winner']);
**Because it is a tabular data each cell MUST be unique otherwise an exception will be thrown**
+You can skip or re-arrange the source columns by skipping them by their offsets and/or by
+re-ordering the offsets.
+
+```php
+use Bakame\HtmlTable\Parser;
+use Bakame\HtmlTable\Section;
+
+$parser = Parser::new()->tableHeader([3 => 'rank', 7 => 'winner', 5 => 'team']);
+// only 3 columns will be extracted: the 4th, 6th and 8th columns
+// and re-arranged as 'rank' first and 'team' last
+// if a column is missing, its value will be PHP `null`
+```
+
### includeSection and excludeSection
Tells which section should be parsed based on the `Section` enum
@@ -239,13 +268,24 @@ Tells which section should be parsed based on the `Section` enum
use Bakame\HtmlTable\Parser;
use Bakame\HtmlTable\Section;
-$parser = Parser::new()->includeSection(Section::thead); // thead is included during parsing
-$parser = Parser::new()->excludeSection(Section::tr); // table direct tr children are not included during parsing
+$parser = Parser::new()->includeSection(Section::tbody); // tbody is included during parsing
+$parser = Parser::new()->excludeSection(Section::tr, Section::tfoot); // table direct tr children and tfoot are not included during parsing
```
**By default, the `thead` section is not parse. If a `thead` row is selected to be the header, it will
be parsed independently of this setting.**
+**⚠️Tips:** to be sure of which sections will be modified, first remove all previous settings
+before applying your configuration as shown below:
+
+```diff
+- Parser::new()->includeSection(Section::tbody);
++ Parser::new()->excludeSection(...Section::cases())->includeSection(Section::tbody);
+```
+
+The first call will still include the `tfoot` and the `tr` sections, whereas the second call
+removes any previous setting, guaranteeing that only the `tbody`, if present, will be parsed.
+
### withFormatter and withoutFormatter
Adds or remove a record formatter applied to the data extracted from the table before you
@@ -267,6 +307,25 @@ function (array $record): array;
If a header was defined or specified, the submitted record will have the header definition set,
otherwise an array list is provided.
+The following formatter will work on any table content as long as it is defined as a string.
+
+```php
+$formatter = fn (array $record): array => array_map(strtolower(...), $record);
+// the following formatter will convert all the fields from your table to lowercase.
+```
+
+The following formatter will only work if the table has a header attached to it with
+a column named `count`.
+
+```php
+$formatter = function (array $record): array {
+ $record['count'] = (int) $record['count'];
+
+ return $record;
+};
+// the formatter above converts the data of every `count` column into an integer.
+```
+
### ignoreXmlErrors and failOnXmlErrors
Tells whether the parser should ignore or throw in case of malformed HTML content.
diff --git a/composer.json b/composer.json
index 6cb058b..8385cee 100644
--- a/composer.json
+++ b/composer.json
@@ -27,20 +27,22 @@
],
"require": {
"ext-dom": "*",
- "ext-json": "*",
"ext-libxml": "*",
"ext-mbstring": "*",
"ext-simplexml": "*",
"league/csv": "^9.11.0"
},
"require-dev": {
+ "ext-curl": "*",
"ext-xdebug": "*",
"friendsofphp/php-cs-fixer": "^v3.28.0",
+ "laravel/prompts": "^0.1.9",
"phpstan/phpstan": "^1.10.35",
"phpstan/phpstan-deprecation-rules": "^1.1.4",
"phpstan/phpstan-phpunit": "^1.3.14",
"phpstan/phpstan-strict-rules": "^1.5.1",
"phpunit/phpunit": "^10.3.5",
+ "symfony/css-selector": "^6.3",
"symfony/var-dumper": "^6.3.4"
},
"autoload": {
diff --git a/src/Parser.php b/src/Parser.php
index 4b13323..61f8d16 100644
--- a/src/Parser.php
+++ b/src/Parser.php
@@ -14,20 +14,15 @@
use Iterator;
use League\Csv\ResultSet;
use League\Csv\SyntaxError;
-use League\Csv\TabularDataReader;
use SimpleXMLElement;
use Stringable;
-use function array_combine;
use function array_fill;
use function array_filter;
use function array_key_exists;
use function array_merge;
-use function array_pad;
use function array_shift;
-use function array_slice;
use function array_unique;
-use function count;
use function fclose;
use function fopen;
use function in_array;
@@ -52,13 +47,13 @@ final class Parser
*/
private function __construct(
private readonly string $tableExpression,
+ private readonly ?string $caption,
private readonly array $tableHeader,
private readonly bool $ignoreTableHeader,
private readonly string $tableHeaderExpression,
- private readonly bool $throwOnXmlErrors,
private readonly array $includedSections,
private readonly ?Closure $formatter,
- private readonly ?string $caption,
+ private readonly bool $throwOnXmlErrors,
) {
}
@@ -66,13 +61,13 @@ public static function new(): self
{
return new self(
'(//table)[1]',
+ null,
[],
false,
'(//table/thead/tr)[1]',
- false,
[Section::tbody->value => 1, Section::tr->value => 1, Section::tfoot->value => 1],
null,
- null,
+ false,
);
}
@@ -84,13 +79,13 @@ public function tableXPathPosition(string $expression): self
false === (new DOMXPath(new DOMDocument()))->query($expression) => throw new ParserError('The xpath expression `'.$expression.'` is invalie.'),
default => new self(
$expression,
+ $this->caption,
$this->tableHeader,
$this->ignoreTableHeader,
$this->tableHeaderExpression,
- $this->throwOnXmlErrors,
$this->includedSections,
$this->formatter,
- $this->caption,
+ $this->throwOnXmlErrors,
),
};
restore_error_handler();
@@ -126,13 +121,13 @@ public function tableHeader(array $headerRow): self
$headerRow !== array_unique($filteredHeader) => throw ParserError::dueToDuplicateHeaderColumnNames($headerRow),
default => new self(
$this->tableExpression,
+ $this->caption,
$headerRow,
$this->ignoreTableHeader,
$this->tableHeaderExpression,
- $this->throwOnXmlErrors,
$this->includedSections,
$this->formatter,
- $this->caption,
+ $this->throwOnXmlErrors,
),
};
}
@@ -143,13 +138,13 @@ public function ignoreTableHeader(): self
true => $this,
false => new self(
$this->tableExpression,
+ $this->caption,
$this->tableHeader,
true,
$this->tableHeaderExpression,
- $this->throwOnXmlErrors,
$this->includedSections,
$this->formatter,
- $this->caption,
+ $this->throwOnXmlErrors,
),
};
}
@@ -160,13 +155,13 @@ public function resolveTableHeader(): self
false => $this,
true => new self(
$this->tableExpression,
+ $this->caption,
$this->tableHeader,
false,
$this->tableHeaderExpression,
- $this->throwOnXmlErrors,
$this->includedSections,
$this->formatter,
- $this->caption,
+ $this->throwOnXmlErrors,
),
};
}
@@ -182,53 +177,67 @@ public function tableHeaderPosition(Section $section, int $offset = 0): self
$expression => $this,
default => new self(
$this->tableExpression,
+ $this->caption,
$this->tableHeader,
$this->ignoreTableHeader,
$expression,
- $this->throwOnXmlErrors,
$this->includedSections,
$this->formatter,
- $this->caption,
+ $this->throwOnXmlErrors,
),
};
}
- public function includeSection(Section $section): self
+ public function includeSection(Section ...$sections): self
{
- $includedSections = $this->includedSections;
- $includedSections[$section->value] = 1;
+ $includedSections = array_reduce(
+ $sections,
+ function (array $carry, Section $section) {
+ $carry[$section->value] = 1;
+
+ return $carry;
+ },
+ $this->includedSections
+ );
return match ($this->includedSections) {
$includedSections => $this,
default => new self(
$this->tableExpression,
+ $this->caption,
$this->tableHeader,
$this->ignoreTableHeader,
$this->tableHeaderExpression,
- $this->throwOnXmlErrors,
$includedSections,
$this->formatter,
- $this->caption,
+ $this->throwOnXmlErrors,
),
};
}
- public function excludeSection(Section $section): self
+ public function excludeSection(Section ...$sections): self
{
- $includedSections = $this->includedSections;
- unset($includedSections[$section->value]);
+ $includedSections = array_reduce(
+ $sections,
+ function (array $carry, Section $section) {
+ unset($carry[$section->value]);
+
+ return $carry;
+ },
+ $this->includedSections
+ );
return match ($this->includedSections) {
$includedSections => $this,
default => new self(
$this->tableExpression,
+ $this->caption,
$this->tableHeader,
$this->ignoreTableHeader,
$this->tableHeaderExpression,
- $this->throwOnXmlErrors,
$includedSections,
$this->formatter,
- $this->caption,
+ $this->throwOnXmlErrors,
),
};
}
@@ -239,13 +248,13 @@ public function failOnXmlErrors(): self
true => $this,
false => new self(
$this->tableExpression,
+ $this->caption,
$this->tableHeader,
$this->ignoreTableHeader,
$this->tableHeaderExpression,
- true,
$this->includedSections,
$this->formatter,
- $this->caption,
+ true,
),
};
}
@@ -256,13 +265,13 @@ public function ignoreXmlErrors(): self
false => $this,
true => new self(
$this->tableExpression,
+ $this->caption,
$this->tableHeader,
$this->ignoreTableHeader,
$this->tableHeaderExpression,
- false,
$this->includedSections,
$this->formatter,
- $this->caption,
+ false,
),
};
}
@@ -271,13 +280,13 @@ public function withFormatter(Closure $formatter): self
{
return new self(
$this->tableExpression,
+ $this->caption,
$this->tableHeader,
$this->ignoreTableHeader,
$this->tableHeaderExpression,
- $this->throwOnXmlErrors,
$this->includedSections,
$formatter,
- $this->caption,
+ $this->throwOnXmlErrors,
);
}
@@ -287,30 +296,30 @@ public function withoutFormatter(): self
$this->formatter => $this,
default => new self(
$this->tableExpression,
+ $this->caption,
$this->tableHeader,
$this->ignoreTableHeader,
$this->tableHeaderExpression,
- $this->throwOnXmlErrors,
$this->includedSections,
null,
- $this->caption,
+ $this->throwOnXmlErrors,
),
};
}
- public function tableCaption(?string $caption = null): self
+ public function tableCaption(?string $caption): self
{
return match ($this->caption) {
$caption => $this,
default => new self(
$this->tableExpression,
+ $caption,
$this->tableHeader,
$this->ignoreTableHeader,
$this->tableHeaderExpression,
- $this->throwOnXmlErrors,
$this->includedSections,
$this->formatter,
- $caption,
+ $this->throwOnXmlErrors,
),
};
}
@@ -322,7 +331,7 @@ public function tableCaption(?string $caption = null): self
* @throws ParserError
* @throws SyntaxError
*/
- public function parseFile(mixed $filenameOrStream, $filenameContext = null): TabularDataReader
+ public function parseFile(mixed $filenameOrStream, $filenameContext = null): Table
{
if (is_resource($filenameOrStream)) {
return $this->parseHtml($this->streamToString($filenameOrStream));
@@ -374,7 +383,7 @@ public function parseHtml(DOMDocument|DOMElement|SimpleXMLElement|Stringable|str
$result = $xpath->query('(//caption)[1]');
$caption = $result->item(0)?->nodeValue ?? $this->caption;
- return new Table(new ResultSet($this->extractTableContents($xpath, $header), $header), $caption);
+ return new Table(new ResultSet($this->extractTableContents($xpath, $header), array_values($header)), $caption);
}
/**
@@ -564,11 +573,9 @@ private function extractRecord(DOMElement $tr, array &$rowSpanIndices = []): arr
*/
private function formatRecord(array $record, array $header): array
{
- $cellCount = count($header);
- $record = match ($cellCount) {
- 0 => $record,
- count($record) => array_combine($header, $record),
- default => array_combine($header, array_slice(array_pad($record, $cellCount, ''), 0, $cellCount)),
+ $record = match ([]) {
+ $header => $record,
+ default => $this->combineArray($record, $header),
};
return match (null) {
@@ -576,4 +583,20 @@ private function formatRecord(array $record, array $header): array
default => ($this->formatter)($record),
};
}
+
+ /**
+ * Builds a row keyed by header name; each value is read from $record at the header's key.
+ * @param array $record source cells, looked up by the keys of $header
+ * @param array $header map of source offset => column name (the name becomes the row key)
+ * @return array row in $header iteration order; offsets missing or null in $record yield null
+ */
+ private function combineArray(array $record, array $header): array
+ {
+ $row = [];
+ foreach ($header as $offset => $value) {
+ $row[$value] = $record[$offset] ?? null;
+ }
+
+ return $row;
+ }
}
diff --git a/src/ParserTest.php b/src/ParserTest.php
index c4e7403..cae3a63 100644
--- a/src/ParserTest.php
+++ b/src/ParserTest.php
@@ -47,14 +47,12 @@ public function it_will_return_the_same_options(): void
$parser
->tablePosition(0)
->tableHeaderPosition(Section::thead, 0)
- ->includeSection(Section::tbody)
- ->includeSection(Section::tfoot)
- ->includeSection(Section::tr)
+ ->includeSection(Section::tbody, Section::tfoot, Section::tr)
->tableHeader([])
->resolveTableHeader()
->ignoreXmlErrors()
->withoutFormatter()
- ->tableCaption()
+ ->tableCaption(null)
);
}
@@ -286,6 +284,47 @@ public function it_will_use_the_submitted_headers(): void
], $table->first());
}
+
+ #[Test]
+ public function it_will_rearrange_the_content_with_table_header(): void
+ {
+ $html = <<
+
+Abel | 14 | M | 2004 |
+Abiga | 6 | F | 2004 |
+Aboubacar | 8 | M | 2004 |
+Aboubakar | 6 | M | 2004 |
+
+
+TABLE;
+
+ $header = [3 => 'Annee', 2 => 'Sexe', 0 => 'Firstname', 1 => 'Count'];
+ $table = Parser::new()
+ ->tableHeader($header)
+ ->parseHtml($html);
+
+ self::assertSame($table->getHeader(), array_values($header));
+ self::assertSame([
+ 'Annee' => '2004',
+ 'Sexe' => 'M',
+ 'Firstname' => 'Abel',
+ 'Count' => '14',
+ ], $table->first());
+
+ $header = [3 => 'Annee', 0 => 'Firstname', 1 => 'Count'];
+ $table = Parser::new()
+ ->tableHeader($header)
+ ->parseHtml($html);
+
+ self::assertSame($table->getHeader(), array_values($header));
+ self::assertSame([
+ 'Annee' => '2004',
+ 'Firstname' => 'Abel',
+ 'Count' => '14',
+ ], $table->first());
+ }
+
#[Test]
public function it_will_duplicate_colspan_data(): void
{
@@ -391,7 +430,6 @@ public function it_will_found_no_header_in_any_section(): void
self::assertSame([], $table->getHeader());
}
-
#[Test]
public function it_will_use_the_table_footer(): void
{