-
Notifications
You must be signed in to change notification settings - Fork 36
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added ParsedDOM API to safely manipulate XML
- Loading branch information
Showing
12 changed files
with
849 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
### Getting started | ||
|
||
The ParsedDOM utility allows you to load the [parsed representation of a text](/Getting_started/How_it_works/) (XML that is usually stored in a database) into a DOM document and operate on it with regular DOM methods as well as a specialized API. Unlike native string manipulation, it provides better guarantees that the resulting XML will match what the parser would normally produce. It is best suited for maintenance tasks. For lightweight, real-time operations, it is recommended to use the limited but more efficient [Utils](https://s9e.github.io/TextFormatter/api/s9e/TextFormatter/Utils.html) class if possible. | ||
|
||
```php | ||
// Start with the parsed representation of the text | ||
$xml = '<r><p>Hello <EM><s>*</s>world<e>*</e></EM> 😀</p></r>'; | ||
|
||
// Load it into a DOM document | ||
$dom = s9e\TextFormatter\Utils\ParsedDOM::loadXML($xml); | ||
|
||
// Select each EM element using XPath... | ||
foreach ($dom->query('//EM') as $em) | ||
{ | ||
// ...and unparse it | ||
$em->unparse(); | ||
} | ||
|
||
// Converting the document to a string will serialize it back to XML in a way that | ||
// matches what the parser would output. This is different from calling saveXML() | ||
echo '__toString() ', (string) $dom . "\n"; | ||
echo 'saveXML() ', $dom->saveXML(); | ||
``` | ||
``` | ||
__toString() <t><p>Hello *world* 😀</p></t> | ||
saveXML() <?xml version="1.0"?> | ||
<t><p>Hello *world* 😀</p></t> | ||
``` | ||
|
||
|
||
### Replacing a tag and its markup | ||
|
||
In the following example, we replace Markdown-style emphasis with an `I` BBCode. The Litedown plugin uses `EM` tags for emphasis whereas the BBCodes plugin uses `I` tags for `I` BBCodes, so we have to replace the element with a new tag, and replace its markup without touching its content. | ||
|
||
```php | ||
$xml = '<r><p>Hello <EM><s>*</s>world<e>*</e></EM></p></r>'; | ||
$dom = s9e\TextFormatter\Utils\ParsedDOM::loadXML($xml); | ||
|
||
// Select each EM element | ||
foreach ($dom->query('//EM') as $em) | ||
{ | ||
// Replace it with what an I tag would generate (an I element) | ||
$b = $em->replaceTag('I'); | ||
|
||
// Set the markup for this new element/tag, it will be placed in the appropriate location | ||
$b->setMarkupStart('[i]'); | ||
$b->setMarkupEnd('[/i]'); | ||
} | ||
|
||
echo $dom; | ||
``` | ||
``` | ||
<r><p>Hello <I><s>[i]</s>world<e>[/i]</e></I></p></r> | ||
``` | ||
|
||
|
||
### Replacing a tag and its content | ||
|
||
In the following example, we replace an embedded YouTube video with a normal text link using BBCode markup. Here we set its text content to be the YouTube URL, but it could be replaced by something more meaningful such as the video's title. | ||
|
||
|
||
```php | ||
$xml = '<r><YOUTUBE id="QH2-TGUlwu4">https://www.youtube.com/watch?v=QH2-TGUlwu4</YOUTUBE></r>'; | ||
$dom = s9e\TextFormatter\Utils\ParsedDOM::loadXML($xml); | ||
|
||
// Select each YOUTUBE element with an id attribute | ||
foreach ($dom->query('//YOUTUBE[@id]') as $youtubeElement) | ||
{ | ||
// Generate a URL for the original video | ||
$url = str_starts_with($youtubeElement->textContent, 'https://') | ||
? $youtubeElement->textContent | ||
: 'https://youtu.be/' . $youtubeElement->getAttribute('id'); | ||
|
||
// Replace the YOUTUBE element with what a [url] BBCode would produce. The default [url] | ||
// BBCode uses a URL tag with a url attribute | ||
$urlElement = $youtubeElement->replaceTag('URL', ['url' => $url]); | ||
|
||
// Reset its text content and add the appropriate markup. The order is important here as | ||
// overwriting the text content of an element will remove its markup | ||
$urlElement->textContent = $url; | ||
$urlElement->setMarkupStart('[url]'); | ||
$urlElement->setMarkupEnd('[/url]'); | ||
} | ||
|
||
echo $dom; | ||
``` | ||
``` | ||
<r><URL url="https://www.youtube.com/watch?v=QH2-TGUlwu4"><s>[url]</s>https://www.youtube.com/watch?v=QH2-TGUlwu4<e>[/url]</e></URL></r> | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
<?php declare(strict_types=1); | ||
|
||
/** | ||
* @package s9e\TextFormatter | ||
* @copyright Copyright (c) The s9e authors | ||
* @license http://www.opensource.org/licenses/mit-license.php The MIT License | ||
*/ | ||
namespace s9e\TextFormatter\Utils; | ||
|
||
use const LIBXML_NONET; | ||
use s9e\TextFormatter\Utils\ParsedDOM\Document; | ||
|
||
/**
 * Static entry point that loads the parsed (XML) representation of a text into a Document
 */
abstract class ParsedDOM
{
	/**
	 * Create a new Document and load the given XML into it
	 *
	 * The XML is loaded with the LIBXML_NONET option. The return value of the underlying
	 * loadXML() call is not inspected, the document is returned regardless of the outcome.
	 *
	 * @param  string   $xml XML to load into the document
	 * @return Document
	 */
	public static function loadXML(string $xml): Document
	{
		$document = new Document;
		$document->loadXML($xml, LIBXML_NONET);

		return $document;
	}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
<?php declare(strict_types=1); | ||
|
||
/** | ||
* @package s9e\TextFormatter | ||
* @copyright Copyright (c) The s9e authors | ||
* @license http://www.opensource.org/licenses/mit-license.php The MIT License | ||
*/ | ||
namespace s9e\TextFormatter\Utils\ParsedDOM; | ||
|
||
use const LIBXML_NSCLEAN, SORT_STRING, false; | ||
use function ksort, substr, strpos; | ||
use s9e\SweetDOM\Document as SweetDocument; | ||
use s9e\TextFormatter\Configurator\Validators\TagName; | ||
use s9e\TextFormatter\Configurator\Validators\AttributeName; | ||
use s9e\TextFormatter\Utils; | ||
|
||
/**
 * DOM document whose elements are instances of Element and whose string form is the
 * serialization of its normalized root element
 */
class Document extends SweetDocument
{
	/**
	 * @link https://www.php.net/manual/domdocument.construct.php
	 *
	 * @param string $version  Version number of the document
	 * @param string $encoding Encoding of the document
	 */
	public function __construct(string $version = '1.0', string $encoding = 'utf-8')
	{
		parent::__construct($version, $encoding);

		// Have the DOM extension instantiate Element wherever it would create a DOMElement
		$this->registerNodeClass('DOMElement', Element::class);
	}

	/**
	 * Normalize this document and serialize its root element to XML
	 *
	 * Output formatting is turned off, and the serialized XML is passed through
	 * Utils::encodeUnicodeSupplementaryCharacters() before being returned.
	 *
	 * @return string
	 */
	public function __toString(): string
	{
		$this->formatOutput = false;
		$this->normalizeDocument();

		$serialized = Utils::encodeUnicodeSupplementaryCharacters(
			$this->saveXML($this->documentElement, LIBXML_NSCLEAN)
		);
		if ($serialized === '<t/>')
		{
			// An empty "t" root is written with explicit start and end tags
			return '<t></t>';
		}

		return $serialized;
	}

	/**
	 * Normalize this document and rebuild its root element under the appropriate name
	 *
	 * The root is named "r" if it has at least one descendant element that is neither "br" nor
	 * "p", and "t" otherwise.
	 *
	 * @link https://www.php.net/manual/en/domdocument.normalizedocument.php
	 *
	 * @return void
	 */
	public function normalizeDocument(): void
	{
		parent::normalizeDocument();

		$oldRoot = $this->documentElement;
		$oldRoot->normalize();

		$hasOtherElements = (bool) $oldRoot->firstOf('.//*[name() != "br"][name() != "p"]');
		$newRoot          = $this->createElement($hasOtherElements ? 'r' : 't');

		// Appending a node that is already in the tree moves it, so the loop ends once the old
		// root has no child left
		while (isset($oldRoot->firstChild))
		{
			$newRoot->appendChild($oldRoot->firstChild);
		}
		$oldRoot->replaceWith($newRoot);
	}

	/**
	 * Create an element that represents a tag
	 *
	 * The tag name is normalized with TagName::normalize(). If the normalized name contains a
	 * colon, the part before it is used as a prefix: the namespace "urn:s9e:TextFormatter:"
	 * followed by the prefix is declared on the document element and the new element is created
	 * in that namespace. Attributes are set in the order returned by normalizeAttributeMap().
	 *
	 * @param  string                $tagName    Name of the tag
	 * @param  array<string, string> $attributes Attribute values, using attribute names as keys
	 * @return Element
	 */
	public function createTagElement(string $tagName, array $attributes = []): Element
	{
		$tagName  = TagName::normalize($tagName);
		$colonPos = strpos($tagName, ':');

		if ($colonPos !== false)
		{
			$prefix = substr($tagName, 0, $colonPos);
			$uri    = 'urn:s9e:TextFormatter:' . $prefix;

			// Declare the prefix on the root element before creating the namespaced element
			$this->documentElement->setAttributeNS(
				'http://www.w3.org/2000/xmlns/',
				'xmlns:' . $prefix,
				$uri
			);
			$tagElement = $this->createElementNS($uri, $tagName);
		}
		else
		{
			$tagElement = $this->createElement($tagName);
		}

		foreach ($this->normalizeAttributeMap($attributes) as $name => $value)
		{
			$tagElement->setAttribute($name, $value);
		}

		return $tagElement;
	}

	/**
	 * Normalize the names of given attributes, cast their values to strings and sort them by name
	 *
	 * Names are normalized with AttributeName::normalize(). If several names normalize to the
	 * same value, the last one in iteration order is kept.
	 *
	 * @param  array<string, string> $attributes Attribute values, using attribute names as keys
	 * @return array<string, string>             Normalized map, sorted by key as strings
	 */
	protected function normalizeAttributeMap(array $attributes): array
	{
		$normalized = [];
		foreach ($attributes as $name => $value)
		{
			$name              = AttributeName::normalize($name);
			$normalized[$name] = (string) $value;
		}
		ksort($normalized, SORT_STRING);

		return $normalized;
	}
}
Oops, something went wrong.