Added SDK
This commit is contained in:
@@ -0,0 +1,485 @@
|
||||
/*
|
||||
* Copyright (C) 2019 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
|
||||
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||
* THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
HTMLParser = class HTMLParser {
|
||||
|
||||
// Public
|
||||
|
||||
parseDocument(sourceText, treeBuilder, {isXML} = {})
|
||||
{
|
||||
console.assert(typeof sourceText === "string");
|
||||
console.assert(treeBuilder);
|
||||
console.assert(treeBuilder.pushParserNode);
|
||||
|
||||
this._treeBuilder = treeBuilder;
|
||||
|
||||
this._pos = 0;
|
||||
this._mode = HTMLParser.Mode.Data;
|
||||
this._data = sourceText;
|
||||
this._bogusCommentOpener = null;
|
||||
this._isXML = !!isXML;
|
||||
|
||||
if (this._treeBuilder.begin)
|
||||
this._treeBuilder.begin();
|
||||
|
||||
while (this._pos < this._data.length)
|
||||
this._parse();
|
||||
|
||||
if (this._treeBuilder.end)
|
||||
this._treeBuilder.end();
|
||||
}
|
||||
|
||||
// Private
|
||||
|
||||
_isEOF()
|
||||
{
|
||||
return this._pos === this._data.length;
|
||||
}
|
||||
|
||||
_peek(n = 1)
|
||||
{
|
||||
return this._data.substring(this._pos, this._pos + n);
|
||||
}
|
||||
|
||||
_peekCharacterRegex(regex)
|
||||
{
|
||||
return regex.test(this._data.charAt(this._pos));
|
||||
}
|
||||
|
||||
_peekString(str)
|
||||
{
|
||||
for (let i = 0; i < str.length; ++i) {
|
||||
let c = str[i];
|
||||
if (this._data.charAt(this._pos + i) !== c)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
_peekCaseInsensitiveString(str)
|
||||
{
|
||||
console.assert(str.toLowerCase() === str, "String should be passed in as lowercase.");
|
||||
|
||||
for (let i = 0; i < str.length; ++i) {
|
||||
let d = this._data.charAt(this._pos + i);
|
||||
if (!d)
|
||||
return false;
|
||||
let c = str[i];
|
||||
if (d.toLowerCase() !== c)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
_consumeRegex(regex)
|
||||
{
|
||||
let startIndex = this._pos;
|
||||
while (regex.test(this._data.charAt(this._pos)))
|
||||
this._pos++;
|
||||
|
||||
return this._data.substring(startIndex, this._pos);
|
||||
}
|
||||
|
||||
_consumeWhitespace()
|
||||
{
|
||||
return this._consumeRegex(/\s/);
|
||||
}
|
||||
|
||||
_consumeUntilString(str, newMode)
|
||||
{
|
||||
let index = this._data.indexOf(str, this._pos);
|
||||
if (index === -1) {
|
||||
let startIndex = this._pos;
|
||||
this._pos = this._data.length;
|
||||
if (newMode)
|
||||
this._mode = newMode;
|
||||
return this._data.substring(startIndex, this._data.length);
|
||||
}
|
||||
|
||||
let startIndex = this._pos;
|
||||
this._pos = index + str.length;
|
||||
if (newMode)
|
||||
this._mode = newMode;
|
||||
return this._data.substring(startIndex, index);
|
||||
}
|
||||
|
||||
_consumeDoubleQuotedString()
|
||||
{
|
||||
console.assert(this._peekString(`"`));
|
||||
this._pos++;
|
||||
let string = this._consumeUntilString(`"`);
|
||||
return string;
|
||||
}
|
||||
|
||||
_consumeSingleQuotedString()
|
||||
{
|
||||
console.assert(this._peekString(`'`));
|
||||
this._pos++;
|
||||
let string = this._consumeUntilString(`'`);
|
||||
return string;
|
||||
}
|
||||
|
||||
// Parser
|
||||
// This is a crude implementation of HTML tokenization:
|
||||
// https://html.spec.whatwg.org/multipage/parsing.html
|
||||
|
||||
_parse()
|
||||
{
|
||||
switch (this._mode) {
|
||||
case HTMLParser.Mode.Data:
|
||||
return this._parseData();
|
||||
case HTMLParser.Mode.ScriptData:
|
||||
return this._parseScriptData();
|
||||
case HTMLParser.Mode.TagOpen:
|
||||
return this._parseTagOpen();
|
||||
case HTMLParser.Mode.Attr:
|
||||
return this._parseAttr();
|
||||
case HTMLParser.Mode.CData:
|
||||
return this._parseCData();
|
||||
case HTMLParser.Mode.Doctype:
|
||||
return this._parseDoctype();
|
||||
case HTMLParser.Mode.Comment:
|
||||
return this._parseComment();
|
||||
case HTMLParser.Mode.BogusComment:
|
||||
return this._parseBogusComment();
|
||||
}
|
||||
|
||||
console.assert();
|
||||
throw "Missing parser mode";
|
||||
}
|
||||
|
||||
_parseData()
|
||||
{
|
||||
let startPos = this._pos;
|
||||
let text = this._consumeUntilString("<", HTMLParser.Mode.TagOpen);
|
||||
if (text)
|
||||
this._push({type: HTMLParser.NodeType.Text, data: text, pos: startPos});
|
||||
|
||||
if (this._isEOF() && this._data.endsWith("<"))
|
||||
this._handleEOF(this._pos - 1);
|
||||
}
|
||||
|
||||
_parseScriptData()
|
||||
{
|
||||
let startPos = this._pos;
|
||||
let scriptText = "";
|
||||
|
||||
// Parse as text until </script>.
|
||||
while (true) {
|
||||
scriptText += this._consumeUntilString("<");
|
||||
if (this._peekCaseInsensitiveString("/script>")) {
|
||||
this._pos += "/script>".length;
|
||||
this._mode = HTMLParser.Mode.Data;
|
||||
break;
|
||||
}
|
||||
if (this._handleEOF(startPos))
|
||||
return;
|
||||
scriptText += "<";
|
||||
}
|
||||
|
||||
if (scriptText)
|
||||
this._push({type: HTMLParser.NodeType.Text, data: scriptText, pos: startPos});
|
||||
this._push({type: HTMLParser.NodeType.CloseTag, name: "script", pos: startPos + scriptText.length});
|
||||
}
|
||||
|
||||
_parseTagOpen()
|
||||
{
|
||||
// |<tag
|
||||
this._currentTagStartPos = this._pos - 1;
|
||||
|
||||
if (this._peekString("!")) {
|
||||
// Comment.
|
||||
if (this._peekString("!--")) {
|
||||
this._pos += "!--".length;
|
||||
this._mode = HTMLParser.Mode.Comment;
|
||||
this._handleEOF(this._currentTagStartPos);
|
||||
return;
|
||||
}
|
||||
|
||||
// DOCTYPE.
|
||||
if (this._peekCaseInsensitiveString("!doctype")) {
|
||||
let startPos = this._pos;
|
||||
this._pos += "!DOCTYPE".length;
|
||||
this._doctypeRaw = this._data.substring(startPos, this._pos);
|
||||
this._mode = HTMLParser.Mode.Doctype;
|
||||
this._handleEOF(this._currentTagStartPos);
|
||||
return;
|
||||
}
|
||||
|
||||
// CDATA.
|
||||
if (this._peekString("![CDATA[")) {
|
||||
this._pos += "![CDATA[".length;
|
||||
this._mode = HTMLParser.Mode.CData;
|
||||
this._handleEOF(this._currentTagStartPos);
|
||||
return;
|
||||
}
|
||||
|
||||
// Bogus Comment.
|
||||
this._pos++;
|
||||
this._mode = HTMLParser.Mode.BogusComment;
|
||||
this._handleEOF(this._currentTagStartPos);
|
||||
return;
|
||||
}
|
||||
|
||||
if (this._peekString("?")) {
|
||||
// Bogus Comment.
|
||||
this._pos++;
|
||||
this._mode = HTMLParser.Mode.BogusComment;
|
||||
this._bogusCommentOpener = "<?";
|
||||
this._handleEOF(this._currentTagStartPos);
|
||||
return;
|
||||
}
|
||||
|
||||
if (this._peekString("/")) {
|
||||
// End Tag.
|
||||
this._pos++;
|
||||
let text = this._consumeUntilString(">", HTMLParser.Mode.Data);
|
||||
this._push({type: HTMLParser.NodeType.CloseTag, name: text, pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
// ASCII - Open Tag
|
||||
if (this._peekCharacterRegex(/[a-z]/i)) {
|
||||
let text = this._consumeRegex(/[^\s/>]+/);
|
||||
if (text) {
|
||||
if (this._peekCharacterRegex(/\s/)) {
|
||||
this._currentTagName = text;
|
||||
this._currentTagAttributes = [];
|
||||
this._mode = HTMLParser.Mode.Attr;
|
||||
return;
|
||||
}
|
||||
|
||||
if (this._peekString("/>")) {
|
||||
this._pos += "/>".length;
|
||||
this._mode = HTMLParser.Mode.Data;
|
||||
this._push({type: HTMLParser.NodeType.OpenTag, name: text, closed: true, pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
if (this._peekString(">")) {
|
||||
this._pos++;
|
||||
this._mode = HTMLParser.Mode.Data;
|
||||
this._push({type: HTMLParser.NodeType.OpenTag, name: text, closed: false, pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
// End of document. Output any remaining data as error text.
|
||||
console.assert(this._isEOF());
|
||||
this._push({type: HTMLParser.NodeType.ErrorText, data: "<" + text, pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Anything else, treat as text.
|
||||
this._push({type: HTMLParser.NodeType.Text, data: "<", pos: this._currentTagStartPos});
|
||||
this._mode = HTMLParser.Mode.Data;
|
||||
}
|
||||
|
||||
_parseAttr()
|
||||
{
|
||||
this._consumeWhitespace();
|
||||
|
||||
if (this._peekString("/>")) {
|
||||
this._pos += "/>".length;
|
||||
this._mode = HTMLParser.Mode.Data;
|
||||
this._push({type: HTMLParser.NodeType.OpenTag, name: this._currentTagName, closed: true, attributes: this._currentTagAttributes, pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
if (this._peekString(">")) {
|
||||
this._pos++;
|
||||
this._mode = HTMLParser.Mode.Data;
|
||||
this._push({type: HTMLParser.NodeType.OpenTag, name: this._currentTagName, closed: false, attributes: this._currentTagAttributes, pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
// <tag |attr
|
||||
let attributeNameStartPos = this._pos;
|
||||
|
||||
let attributeName = this._consumeRegex(/[^\s=/>]+/);
|
||||
// console.assert(attributeName.length > 0, "Unexpected empty attribute name");
|
||||
if (this._peekString("/") || this._peekString(">")) {
|
||||
if (attributeName)
|
||||
this._pushAttribute({name: attributeName, value: undefined, namePos: attributeNameStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
this._consumeWhitespace();
|
||||
|
||||
if (this._peekString("=")) {
|
||||
this._pos++;
|
||||
|
||||
// <tag attr=|value
|
||||
let attributeValueStartPos = this._pos;
|
||||
|
||||
this._consumeWhitespace();
|
||||
|
||||
if (this._peekString(`"`)) {
|
||||
let attributeValue = this._consumeDoubleQuotedString();
|
||||
this._pushAttribute({name: attributeName, value: attributeValue, quote: HTMLParser.AttrQuoteType.Double, namePos: attributeNameStartPos, valuePos: attributeValueStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
if (this._peekString(`'`)) {
|
||||
let attributeValue = this._consumeSingleQuotedString();
|
||||
this._pushAttribute({name: attributeName, value: attributeValue, quote: HTMLParser.AttrQuoteType.Single, namePos: attributeNameStartPos, valuePos: attributeValueStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
if (this._peekString(">")) {
|
||||
this._pos++;
|
||||
this._mode = HTMLParser.Mode.Data;
|
||||
this._push({type: HTMLParser.NodeType.OpenTag, name: this._currentTagName, closed: false, attributes: this._currentTagAttributes, pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
let whitespace = this._consumeWhitespace();
|
||||
if (whitespace) {
|
||||
this._pushAttribute({name: attributeName, value: undefined, quote: HTMLParser.AttrQuoteType.None, namePos: attributeNameStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
let attributeValue = this._consumeRegex(/[^\s=>]+/);
|
||||
this._pushAttribute({name: attributeName, value: attributeValue, quote: HTMLParser.AttrQuoteType.None, namePos: attributeNameStartPos, valuePos: attributeValueStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
if (!this._isEOF()) {
|
||||
this._pushAttribute({name: attributeName, value: undefined, quote: HTMLParser.AttrQuoteType.None, namePos: attributeNameStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
// End of document. Treat everything up to now as error text.
|
||||
console.assert(this._isEOF());
|
||||
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(this._currentTagStartPos), pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
_parseComment()
|
||||
{
|
||||
let text = this._consumeUntilString("-->", HTMLParser.Mode.Data);
|
||||
if (this._isEOF() && !this._data.endsWith("-->")) {
|
||||
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(this._currentTagStartPos), pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
let closePos = this._pos - "-->".length;
|
||||
this._push({type: HTMLParser.NodeType.Comment, data: text, pos: this._currentTagStartPos, closePos});
|
||||
}
|
||||
|
||||
_parseBogusComment()
|
||||
{
|
||||
let text = this._consumeUntilString(">", HTMLParser.Mode.Data);
|
||||
if (this._isEOF() && !this._data.endsWith(">")) {
|
||||
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(this._currentTagStartPos), pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
let closePos = this._pos - ">".length;
|
||||
this._push({type: HTMLParser.NodeType.Comment, data: text, opener: this._bogusCommentOpener || "", pos: this._currentTagStartPos, closePos});
|
||||
this._bogusCommentOpener = null;
|
||||
}
|
||||
|
||||
_parseDoctype()
|
||||
{
|
||||
let text = this._consumeUntilString(">", HTMLParser.Mode.Data);
|
||||
if (this._isEOF() && !this._data.endsWith(">")) {
|
||||
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(this._currentTagStartPos), pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
let closePos = this._pos - ">".length;
|
||||
this._push({type: HTMLParser.NodeType.Doctype, data: text, raw: this._doctypeRaw, pos: this._currentTagStartPos, closePos});
|
||||
this._doctypeRaw = null;
|
||||
}
|
||||
|
||||
_parseCData()
|
||||
{
|
||||
let text = this._consumeUntilString("]]>", HTMLParser.Mode.Data);
|
||||
if (this._isEOF() && !this._data.endsWith("]]>")) {
|
||||
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(this._currentTagStartPos), pos: this._currentTagStartPos});
|
||||
return;
|
||||
}
|
||||
|
||||
let closePos = this._pos - "]]>".length;
|
||||
this._push({type: HTMLParser.NodeType.CData, data: text, pos: this._currentTagStartPos, closePos});
|
||||
}
|
||||
|
||||
_pushAttribute(attr)
|
||||
{
|
||||
this._currentTagAttributes.push(attr);
|
||||
this._handleEOF(this._currentTagStartPos);
|
||||
}
|
||||
|
||||
_handleEOF(lastPosition)
|
||||
{
|
||||
if (!this._isEOF())
|
||||
return false;
|
||||
|
||||
// End of document. Treat everything from the last position as error text.
|
||||
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(lastPosition), pos: lastPosition});
|
||||
return true;
|
||||
}
|
||||
|
||||
_push(node)
|
||||
{
|
||||
// Custom mode for some elements.
|
||||
if (node.type === HTMLParser.NodeType.OpenTag) {
|
||||
if (!this._isXML && node.name.toLowerCase() === "script")
|
||||
this._mode = HTMLParser.Mode.ScriptData;
|
||||
}
|
||||
|
||||
this._treeBuilder.pushParserNode(node);
|
||||
}
|
||||
};
|
||||
|
||||
HTMLParser.Mode = {
|
||||
Data: "data",
|
||||
TagOpen: "tag-open",
|
||||
ScriptData: "script-data",
|
||||
Attr: "attr",
|
||||
CData: "cdata",
|
||||
Doctype: "doctype",
|
||||
Comment: "comment",
|
||||
BogusComment: "bogus-comment",
|
||||
};
|
||||
|
||||
HTMLParser.NodeType = {
|
||||
Text: "text",
|
||||
ErrorText: "error-text",
|
||||
OpenTag: "open-tag",
|
||||
CloseTag: "close-tag",
|
||||
Comment: "comment",
|
||||
Doctype: "doctype",
|
||||
CData: "cdata",
|
||||
};
|
||||
|
||||
HTMLParser.AttrQuoteType = {
|
||||
None: "none",
|
||||
Double: "double",
|
||||
Single: "single",
|
||||
};
|
||||
Reference in New Issue
Block a user