Skip to content

Commit

Permalink
Escaped characters (#394)
Browse files Browse the repository at this point in the history
* bug: certain elements do not encode properly on render
closes #356

* test: test bug #356 provided by @kartofelek007

* test: extra assertions - check escaped string are not in the render

* test: Remove unnecessary echo

* scratch: Marcin's test isolation

* fix: more work towards #356

* tweak: remove scratch file

* fix: better solution to #356 using processing entity
  • Loading branch information
g105b authored Sep 25, 2022
1 parent 2616823 commit 19ad483
Show file tree
Hide file tree
Showing 7 changed files with 256 additions and 39 deletions.
20 changes: 0 additions & 20 deletions scratch.php

This file was deleted.

54 changes: 48 additions & 6 deletions src/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -53,29 +53,28 @@ abstract class Document extends DOMDocument implements Stringable, StreamInterfa
DOMText::class => Text::class,
DOMProcessingInstruction::class => ProcessingInstruction::class,
];
/** Doctype line that __toString() places at the start of a rendered HTML document. */
const DOCTYPE = "<!doctype html>";

/**
 * Initialise the underlying DOMDocument as version "1.0" with the
 * supplied character set, then register the node class substitutions.
 *
 * @param string $characterSet Encoding passed to the parent constructor
 * and also assigned to the document's encoding property.
 * @param string $contentType Stored as a public readonly property; it
 * is not read anywhere else inside this constructor.
 */
public function __construct(
public readonly string $characterSet,
public readonly string $contentType,
) {
parent::__construct("1.0", $this->characterSet);
// The encoding is set explicitly as well as being passed to the parent.
$this->encoding = $this->characterSet;
$this->substituteEntities = true;
$this->registerNodeClasses();
// NOTE: libxml_use_internal_errors() changes libxml error handling for
// the whole process, not only for this document.
libxml_use_internal_errors(true);
}

/**
 * Render the document as a string terminated by exactly one newline.
 *
 * An HTMLDocument is rendered as the DOCTYPE constant on its own line,
 * followed by the serialised document element. Any other document type
 * is serialised with saveXML().
 *
 * No entity decoding is applied to the serialised output: decoding
 * would turn escaped text such as "&lt;script&gt;" back into live
 * markup, defeating the escaping performed when text is assigned to a
 * node.
 *
 * @return string The rendered document, trimmed, with a trailing "\n".
 */
public function __toString():string {
	if(get_class($this) === HTMLDocument::class) {
		$string = self::DOCTYPE . "\n";
		// Serialising from the document element (rather than the whole
		// document) keeps our own doctype line as the only one emitted.
		$string .= $this->saveHTML($this->documentElement);
	}
	else {
		$string = $this->saveXML();
	}

	return trim($string) . "\n";
}

Expand Down Expand Up @@ -363,4 +362,47 @@ private function registerNodeClasses():void {
$this->registerNodeClass($nativeClass, $gtClass);
}
}

/**
 * Swap the contents of every non-empty script element for a unique
 * placeholder, returning the original contents keyed by placeholder.
 *
 * Due to the way HTML is rendered, non-ASCII characters can be converted
 * into their HTML-encoded counterparts, which breaks script tags that
 * contain inline JavaScript. Extracting the raw script contents before
 * render allows the caller to substitute them back into the rendered
 * string afterwards, retaining the original characters.
 *
 * This method mutates the document: the innerHTML of each matched
 * script element is replaced with its placeholder key.
 *
 * @return array<string, string> Key = the placeholder string that now
 * sits inside the script element. Value = the entity-decoded innerHTML
 * that the script element originally contained.
 */
private function extractScriptHTML():array {
	$scriptHtmlList = [];
	// The padding never changes, so build it once instead of per script.
	$padding = str_repeat("@", 16);

	foreach($this->querySelectorAll("script") as $script) {
		if(strlen($script->textContent) === 0) {
			continue;
		}

		$html = html_entity_decode($script->innerHTML ?? "");
		// uniqid() on its own is not guaranteed to return a unique value,
		// and a repeated key would silently overwrite an earlier script.
		// Mixing in the number of scripts collected so far guarantees
		// that keys generated within this call never collide.
		$key = $padding
			. uniqid("---script-" . count($scriptHtmlList) . "-") . "---"
			. $padding;
		$scriptHtmlList[$key] = $html;
		$script->innerHTML = $key;
	}

	return $scriptHtmlList;
}

// public function saveHTML(DOMNode $node = null):string {
// $scriptHtmlList = $this->extractScriptHTML();
// if(!$node) {
// $node = $this->documentElement;
// }
// $html = parent::saveHTML((new \DOMXPath($this))->query('/')->item(0));
// foreach($scriptHtmlList as $key => $js) {
// $html = str_replace($key, $js, $html);
// }
// var_Dump($this->encoding, $this->substituteEntities);die();
// return $html;
// }
}
17 changes: 12 additions & 5 deletions src/HTMLDocument.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,20 @@ public function __construct(
"text/html",
);

$html = mb_convert_encoding(
$html,
"HTML-ENTITIES",
$this->characterSet,
);
// Workaround for handling UTF-8 encoding correctly.
// @link https://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly
$html = '<?xml encoding="'
. strtolower($this->encoding)
. '" ?>'
. $html;
$this->loadHTML($html, LIBXML_SCHEMA_CREATE | LIBXML_COMPACT);
foreach($this->childNodes as $child) {
if($child instanceof ProcessingInstruction) {
$this->removeChild($child);
}
}

/** @var array<Node> $nonElementChildNodes */
$nonElementChildNodes = [];
foreach($this->childNodes as $child) {
if($child instanceof DocumentType
Expand Down
2 changes: 1 addition & 1 deletion src/ParentNode.php
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ public function getElementsByTagName(string $qualifiedName):HTMLCollection {
*
* @param Node|Element|Text|Comment $child
*/
public function removeChild(Node|Element|Text|Comment|DOMNode $child):Node|Element|Text|Comment {
public function removeChild(Node|Element|Text|Comment|DOMNode|ProcessingInstruction $child):Node|Element|Text|Comment|CdataSection|ProcessingInstruction {
try {
/** @var Node|Element|Text|Comment $removed */
$removed = parent::removeChild($child);
Expand Down
4 changes: 2 additions & 2 deletions test/phpunit/DocumentStreamTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ public function testEof():void {
while(!$sut->eof()) {
$bytes .= $sut->read(10);
}
self::assertEquals("<!DOCTYPE html>\n<html><head></head><body><example></example></body></html>\n", $bytes);
self::assertEquals("<!doctype html>\n<html><head></head><body><example></example></body></html>\n", $bytes);
}

public function testIsSeekableBeforeOpen():void {
Expand Down Expand Up @@ -132,7 +132,7 @@ public function testGetContents():void {
$sut->body->appendChild($sut->createElement("example"));
$sut->open();
$contents = $sut->getContents();
self::assertEquals("<!DOCTYPE html>\n<html><head></head><body><example></example></body></html>\n", $contents);
self::assertEquals("<!doctype html>\n<html><head></head><body><example></example></body></html>\n", $contents);
}

public function testGetMetaData():void {
Expand Down
28 changes: 28 additions & 0 deletions test/phpunit/ElementTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,34 @@ public function testInnerHTMLReset():void {
self::assertEquals("And another", $sut->children[1]->innerHTML);
}

/**
 * Plain text assigned through innerText reads back identically
 * through innerHTML, because there is nothing to escape.
 */
public function testInnerText():void {
	$doc = new HTMLDocument();
	$span = $doc->createElement("span");

	$span->innerText = "Hello, World!";

	self::assertSame($span->innerText, $span->innerHTML);
}

/**
 * Markup assigned through innerText is kept verbatim as text, and is
 * seen entity-escaped when read back through innerHTML.
 */
public function testInnerText_containsHTML():void {
	$doc = new HTMLDocument();
	$span = $doc->createElement("span");
	$raw = "Hello, <b>World</b>!";
	$escaped = "Hello, &lt;b&gt;World&lt;/b&gt;!";

	$span->innerText = $raw;

	self::assertSame($raw, $span->innerText);
	self::assertSame($escaped, $span->innerHTML);
}

/**
 * Markup assigned through textContent on an attached element is
 * stored as text: innerText returns it verbatim, innerHTML returns
 * the entity-escaped form.
 */
public function testTextContent():void {
	$doc = new HTMLDocument();
	$span = $doc->createElement("span");
	$doc->body->appendChild($span);

	$raw = "Hello, <b>World</b>!";
	$escaped = "Hello, &lt;b&gt;World&lt;/b&gt;!";
	$span->textContent = $raw;

	self::assertNotSame($raw, $span->innerHTML);
	self::assertSame($raw, $span->innerText);
	self::assertSame($escaped, $span->innerHTML);
}

public function testOuterHTML():void {
$document = new HTMLDocument();
$sut = $document->createElement("example");
Expand Down
170 changes: 165 additions & 5 deletions test/phpunit/HTMLDocumentTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public function testAppendChild_createdElementsAreNotNamespaced():void {
/**
 * Multi-byte characters (emoji) in the source HTML must appear
 * unchanged in the rendered document.
 */
public function testToString_emojiEncoding():void {
	$html = "<h1>I ❤️ my 🐈</h1>";
	$sut = new HTMLDocument($html);
	// One assertion is enough: the fixture and the expected needle are
	// the same string.
	self::assertStringContainsString($html, (string)$sut);
}

public function testPropBody_readOnly():void {
Expand All @@ -83,13 +83,13 @@ public function testPropBody_instanceOfHTMLBodyElementDefaultHTML():void {
/**
 * A document constructed without any HTML renders the lower-case
 * doctype followed by an empty html/head/body skeleton.
 */
public function testToString_emptyHTML():void {
	$sut = new HTMLDocument();
	/** @noinspection HtmlRequiredLangAttribute */
	self::assertEquals("<!doctype html>\n<html><head></head><body></body></html>\n", (string)$sut);
}

/**
 * The default test HTML renders with the lower-case doctype and the
 * heading placed inside the body.
 */
public function testToStringDefaultHTML():void {
	$sut = new HTMLDocument(DocumentTestFactory::HTML_DEFAULT);
	/** @noinspection HtmlRequiredLangAttribute */
	self::assertEquals("<!doctype html>\n<html><head></head><body><h1>Hello, PHP.Gt!</h1></body></html>\n", (string)$sut);
}

public function testPropCharacter_default():void {
Expand Down Expand Up @@ -262,7 +262,7 @@ public function testWriteHTMLDocument():void {
$contents = stream_get_contents($stream);
/** @noinspection HtmlRequiredLangAttribute */
$expected = <<<HTML
<!DOCTYPE html>
<!doctype html>
<html><head></head><body><h1>Hello, PHP.Gt!</h1>$message</body></html>
HTML;
Expand All @@ -281,7 +281,7 @@ public function testWritelnHTMLDocument():void {
$contents = stream_get_contents($stream);
/** @noinspection HtmlRequiredLangAttribute */
$expected = <<<HTML
<!DOCTYPE html>
<!doctype html>
<html><head></head><body><h1>Hello, PHP.Gt!</h1>$message1
$message2
</body></html>
Expand Down Expand Up @@ -621,4 +621,164 @@ public function testGetElementById_afterIdChangedViaNode():void {
self::assertSame("changed", $child->getAttribute("id"));
self::assertSame($child, $sut->getElementById("changed"));
}

/**
 * Text assigned through textContent must never become live markup:
 * a script tag supplied as text must not be selectable as an element
 * and must not appear unescaped in the rendered document.
 */
public function testSaveHTML_XSS():void {
$html = <<<HTML
<!doctype html>
<h1>Hello, <span>you</span>!</h1>
HTML;

// Create a new document with the above HTML.
$document = new HTMLDocument($html);
// NOTE(review): the constructor has already been given $html, so this
// second load looks redundant — confirm whether it is intentional.
$document->loadHTML($html);

// Get reference to span tag.
$span = $document->getElementsByTagName("span")->item(0);

// Set the span's text to user-supplied $name (malicious user can enter JavaScript!)
$name = "<script>alert('XSS');</script>";
$span->textContent = $name;

// The injected string must not have been parsed into a script element.
$script = $document->querySelector("script");
self::assertNull($script);

// The rendered output must not contain the raw opening script tag.
$documentString = (string)$document;
self::assertStringNotContainsString("<script>", $documentString);
}

/**
 * Non-ASCII characters inside an inline script must be rendered
 * as-is, even after the document has been modified elsewhere.
 */
public function testEscapedCharacters():void {
	$content = <<<HTML
	<!doctype html>
	<script>
	p.append(" są ");
	</script>
	<h1 id="pageTitle">This is the page title</h1>
	HTML;

	$sut = new HTMLDocument($content);
	// Modify the document so the render is not a simple round trip.
	$h1 = $sut->querySelector("#pageTitle");
	$div = $sut->createElement("div");
	$div->innerHTML = "lorem";
	$h1->after($div);

	$htmlString = (string)$sut;
	// The expected script body mirrors the fixture above: the argument
	// passed to p.append() has to come through the render untouched.
	self::assertStringContainsString("<script>\np.append(\" są \");\n</script>", $htmlString);
}

/**
 * Inline JavaScript containing non-ASCII characters must be rendered
 * with the original characters, never with numeric HTML entities.
 */
public function testEscapedCharacters_insideScriptTag():void {
	$fixture = <<<HTML
	<div>
	<p id="testNodeText">
	Hello, Marcin!
	</p>
	<button class="btn-textNodeTest">Button</button>
	</div>
	<script>
	{
	const p = document.querySelector("#testNodeText");
	const btn = document.querySelector(".btn-textNodeTest");
	const word1 = document.createTextNode("Psy");
	p.append(word1);
	p.append(" są ");
	const word2 = document.createTextNode("fajne");
	p.append(word2);
	btn.addEventListener("click", () => {
	console.dir(word1);
	word1.textContent = "Koty też";
	word2.textContent = "super!";
	});
	}
	</script>
	HTML;

	$document = new HTMLDocument($fixture);
	$output = (string)$document;

	// "są" must be present raw, and absent in its entity-encoded form.
	self::assertStringContainsString('p.append(" są ");', $output);
	self::assertStringNotContainsString('p.append(" s&#261; ");', $output);

	// Pure ASCII script content is unaffected.
	self::assertStringContainsString('document.createTextNode("fajne");', $output);

	// "też" must be present raw, and absent in its entity-encoded form.
	self::assertStringContainsString('word1.textContent = "Koty też";', $output);
	self::assertStringNotContainsString('word1.textContent = "Koty te&#380;";', $output);
}

/**
 * Rendering a document that contains 1,000 inline scripts with
 * non-ASCII content must complete in under a second, and every
 * script must keep its original characters.
 */
public function testEscapedCharacters_multipleScriptTagsShouldNotBeSlow():void {
	$content = <<<HTML
	<!doctype html>
	<html>
	<head>
	<meta charset="utf-8" />
	<title>Speed test using lots of script tags</title>
	</head>
	<body>
	<h1>Speed test using lots of script tags</h1>
	</body>
	</html>
	HTML;

	$sut = new HTMLDocument($content);

	// Alternate between head and body so both are populated.
	for($i = 0; $i < 1000; $i++) {
		$script = $sut->createElement("script");
		$script->innerHTML = "console.log('Polski jest pięknym językiem');";
		$parent = $i % 2 === 0 ? $sut->head : $sut->body;
		$parent->appendChild($script);
	}

	// hrtime() is monotonic, so the measurement cannot be skewed by a
	// wall-clock adjustment happening while the test runs.
	$startNanoseconds = hrtime(true);
	$renderedHTML = (string)$sut;
	$elapsedSeconds = (hrtime(true) - $startNanoseconds) / 1e9;

	self::assertLessThan(
		1,
		$elapsedSeconds,
		"It should never take a second to render the HTML, even with 1,000 script nodes"
	);

	self::assertStringContainsString("Polski jest pięknym językiem", $renderedHTML);
	self::assertEquals(1000, substr_count($renderedHTML, "Polski jest pięknym językiem"));
}

/**
 * Non-ASCII characters must be rendered raw everywhere in the
 * document (headings, preformatted code and inline scripts), with no
 * HTML entities taking their place.
 */
public function testEscapedCharacters_entireDom():void {
	$content = <<<HTML
	<!doctype html>
	<body>
	<h1>Tworzenie i usuwanie elementów</h1>
	<pre class="line-numbers"><code class="language-js">
	Koty też
	</code></pre>
	<script>
	console.log("zobaczyć co możemy użyć");
	</script>
	</body>
	HTML;

	$document = new HTMLDocument($content);
	$rendered = (string)$document;

	$expectedList = [
		"Tworzenie i usuwanie elementów", // within the h1
		"Koty też", // within the pre
		"zobaczyć co możemy użyć", // within the script tag
	];
	foreach($expectedList as $expected) {
		self::assertStringContainsString($expected, $rendered);
	}

	$unexpectedList = [
		"&oacute;",
		"&#380;",
	];
	foreach($unexpectedList as $unexpected) {
		self::assertStringNotContainsString($unexpected, $rendered);
	}
}
}

0 comments on commit 19ad483

Please sign in to comment.