diff --git a/README.md b/README.md index d02fbd1..21992db 100644 --- a/README.md +++ b/README.md @@ -197,6 +197,10 @@ Emitted when a closing tag is parsed. An object containing the `name` of the tag Emitted when a processing instruction (such as ``) is parsed. An object with the `contents` of the processing instruction is passed. +#### `documenttypedefinition` + +Emitted when a document type definition (such as ``) is parsed. An object with the `contents` of the document type definition is passed. + #### `text` Emitted when a text node between two tags is parsed. An object with the `contents` of the text node is passed. You might need to expand XML entities inside the contents of the text node, using `Saxophone.parseEntities`. @@ -213,7 +217,6 @@ Emitted when a comment (such as ``) is parsed. An object with t Emitted when a parsing error is encountered while reading the XML stream such that the rest of the XML cannot be correctly interpreted: -* when a DOCTYPE node is found (not supported yet); * when a comment node contains the `--` sequence; * when opening and closing tags are mismatched or missing; * when a tag name starts with white space; diff --git a/lib/Saxophone.js b/lib/Saxophone.js index 9617b72..6adaf53 100644 --- a/lib/Saxophone.js +++ b/lib/Saxophone.js @@ -65,6 +65,22 @@ const {findIndexOutside} = require('./util'); * @type {ProcessingInstructionNode} */ +/** + * Information about the document type definition node + * (). + * + * @typedef DocumentTypeDefinitionNode + * @type {object} + * @prop {string} contents The definition contents + */ + +/** + * Emitted whenever a document type definition node is encountered. + * + * @event Saxophone#documenttypedefinition + * @type {DocumentTypeDefinitionNode} + */ + /** * Information about an opened tag * (). @@ -111,6 +127,7 @@ const Node = { comment: 'comment', markupDeclaration: 'markupDeclaration', processingInstruction: 'processinginstruction', + documentTypeDefinition: 'documenttypedefinition', tagOpen: 'tagopen', tagClose: 'tagclose', }; @@ -320,7 +337,63 @@ class Saxophone extends Writable { continue; } - // TODO: recognize DOCTYPEs here + if ( + 'DOCTYPE '.indexOf(input.slice( + chunkPos, + chunkPos + 8 + )) > -1 + ) { + chunkPos += 8; + var dtdPos = chunkPos; + + // According to spec. the DTD is followed by the + // name, then by a terminating > or a preceding + // external id (SYSTEM / PUBLIC), with one or two + // strings encapsulated by quotes ('") and a [] + // section, whichever comes first we deal with + for (;;) { + for (const dtdChar of '\'"[>') { + const nextDtdPos = input.indexOf(dtdChar, dtdPos); + if (nextDtdPos !== -1) { + dtdPos = nextDtdPos; + break; + } + } + + // We are done or need to wait for more data + if (dtdPos === -1 || input[dtdPos] === '>') { + break; + } + + // Search for the matching string end '" or ] + dtdPos = input.indexOf(input[dtdPos] === '[' + ? ']' : input[dtdPos], dtdPos + 1); + + if (dtdPos === -1) { + break; + } else { + dtdPos++; + } + } + + // Incomplete DTD, we need to wait for upcoming data + if (dtdPos === -1) { + this._wait( + Node.documentTypeDefinition, + input.slice(chunkPos - 10) + ); + break; + } + + this.emit( + Node.documentTypeDefinition, + {contents: input.slice(chunkPos, dtdPos)} + ); + + chunkPos = dtdPos + 1; + continue; + } + callback(new Error('Unrecognized sequence: { ); }); +test('should parse minimal document type definition', assert => { + expectEvents(assert, + '', + [['documenttypedefinition', {contents: 'DocType'}]] + ); +}); + +test('should parse document type definition variation 1', assert => { + expectEvents(assert, + '', + [['documenttypedefinition', {contents: 'DocType SYSTEM "file.dtd"'}]] + ); +}); + +test('should parse document type definition variation 2', assert => { + expectEvents(assert, + '', + [['documenttypedefinition', {contents: 'DocType SYSTEM "file.dtd"'}]] + ); +}); + +test('should parse document type definition variation 3', assert => { + expectEvents(assert, + '', + [['documenttypedefinition', {contents: 'DocType SYSTEM \'file.dtd\' [ any content ] '}]] + ); +}); + +test('should parse document type definition variation 4', assert => { + expectEvents(assert, + '', + [['documenttypedefinition', {contents: 'DocType PUBLIC "Public Identifier" \'file.dtd\''}]] + ); +}); + +test('should parse document type definition variation 5', assert => { + expectEvents(assert, + '', + [['documenttypedefinition', {contents: 'DocType PUBLIC \'Public Identifier\' "file.dtd" [ any content ]'}]] + ); +}); + +test('should parse document type definition variation 6', assert => { + expectEvents(assert, + '', + [['documenttypedefinition', {contents: 'DocType [ any content ] '}]] + ); +}); + +test('should parse complex document type definition', assert => { + expectEvents(assert, + ` + + + + + +] + +>`, + [['documenttypedefinition', {contents: `DocType PUBLIC "Public Identifier" 'file.dtd' [ + + + + + + + +] + +`}]] + ); +}); + test('should parse simple tags', assert => { expectEvents(assert, '', @@ -180,13 +268,6 @@ test('should not parse unclosed tags 3', assert => { ); }); -test('should not parse DOCTYPEs', assert => { - expectEvents(assert, - '', - [['error', new Error('Unrecognized sequence: { expectEvents(assert, '< invalid>', @@ -266,6 +347,10 @@ test('should parse a complete document', assert => { expectEvents(assert, tags.stripIndent` + + + ]> @@ -275,6 +360,11 @@ test('should parse a complete document', assert => { `, [ ['processinginstruction', {contents: 'xml version="1.0" encoding="UTF-8" '}], + ['text', {contents: '\n'}], + ['documenttypedefinition', {contents: `PersonType [ + + +]`}], ['text', {contents: '\n'}], ['tagopen', {name: 'persons', attrs: '', isSelfClosing: false}], ['text', {contents: '\n '}],