You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
159 lines
4.4 KiB
159 lines
4.4 KiB
'use strict';

const unicode = require('../common/unicode');
const ERR = require('../common/error-codes');

//Aliases
const $ = unicode.CODE_POINTS;

//Const
//NOTE: initial value of Preprocessor#bufferWaterline (65536). dropParsedChunk() only trims
//the buffered input once the read position has moved past this index.
const DEFAULT_BUFFER_WATERLINE = 1 << 16;

//Preprocessor
//NOTE: HTML input preprocessing
//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream)
/**
 * Buffers HTML text that arrives in chunks and hands it out one code point at
 * a time through advance(). While doing so it:
 *   - turns CR and CR LF into a single LF,
 *   - merges a surrogate followed by a trailing surrogate into one code point,
 *   - reports surrogate, control and noncharacter code points through _err().
 *
 * Positions of input code units that were consumed silently (the LF of a
 * CR LF, the second half of a surrogate pair) are remembered as "gaps" so
 * that retreat() can step back over them in one go.
 */
class Preprocessor {
    constructor() {
        // Buffered input; stays null until the first write().
        this.html = null;

        // Index in `html` of the last consumed code unit (-1: nothing consumed yet).
        this.pos = -1;
        // Index of the most recent gap, or -1 when there is none.
        this.lastGapPos = -1;
        // Index of the last code unit currently available in `html`.
        this.lastCharPos = -1;

        // Earlier values of `lastGapPos`, restored one by one in retreat().
        this.gapStack = [];

        // True right after a CR was returned as LF: a directly following LF is dropped.
        this.skipNextNewLine = false;

        // True once write() was called with a truthy `isLastChunk`.
        this.lastChunkWritten = false;
        // True when reading ran out of buffered input while more chunks may still come.
        this.endOfChunkHit = false;
        // dropParsedChunk() trims the buffer only once `pos` exceeds this value.
        this.bufferWaterline = DEFAULT_BUFFER_WATERLINE;
    }

    /**
     * Error hook. Called with one of the ERR codes.
     */
    _err() {
        // NOTE: err reporting is noop by default. Enabled by mixin.
    }

    /**
     * Records the current position as a gap, stacking the previous gap position.
     */
    _addGap() {
        this.gapStack.push(this.lastGapPos);
        this.lastGapPos = this.pos;
    }

    /**
     * Resolves the surrogate code unit found at the current position.
     *
     * @param {number} cp - surrogate code unit read at `pos`.
     * @returns {number} the combined code point when the next unit completes a pair;
     *     $.EOF when the buffer ends here and more chunks may follow;
     *     otherwise `cp` itself, after reporting ERR.surrogateInInputStream.
     */
    _processSurrogate(cp) {
        //NOTE: try to peek a surrogate pair
        if (this.pos !== this.lastCharPos) {
            const nextCp = this.html.charCodeAt(this.pos + 1);

            if (unicode.isSurrogatePair(nextCp)) {
                //NOTE: we have a surrogate pair. Consume the second unit and recalculate the code point.
                this.pos++;

                //NOTE: add gap that should be avoided during retreat
                this._addGap();

                return unicode.getSurrogatePairCodePoint(cp, nextCp);
            }
        }

        //NOTE: we are at the end of a chunk, therefore we can't infer surrogate pair yet.
        else if (!this.lastChunkWritten) {
            this.endOfChunkHit = true;
            return $.EOF;
        }

        //NOTE: isolated surrogate
        this._err(ERR.surrogateInInputStream);

        return cp;
    }

    /**
     * Discards already consumed input once `pos` has passed `bufferWaterline`.
     * The code unit at `pos` is kept and becomes index 0; gap bookkeeping is reset.
     */
    dropParsedChunk() {
        if (this.pos > this.bufferWaterline) {
            this.lastCharPos -= this.pos;
            this.html = this.html.substring(this.pos);
            this.pos = 0;
            this.lastGapPos = -1;
            this.gapStack = [];
        }
    }

    /**
     * Appends a chunk of input to the buffer and clears `endOfChunkHit`.
     *
     * @param {string} chunk - next piece of HTML text.
     * @param {boolean} isLastChunk - stored as `lastChunkWritten`; when truthy,
     *     running out of input no longer sets `endOfChunkHit`.
     */
    write(chunk, isLastChunk) {
        if (this.html) {
            this.html += chunk;
        } else {
            this.html = chunk;
        }

        this.lastCharPos = this.html.length - 1;
        this.endOfChunkHit = false;
        this.lastChunkWritten = isLastChunk;
    }

    /**
     * Splices `chunk` into the buffer right after the current position, so it
     * is what the next advance() reads. Clears `endOfChunkHit`.
     *
     * @param {string} chunk - HTML text to insert.
     */
    insertHtmlAtCurrentPos(chunk) {
        this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1, this.html.length);

        this.lastCharPos = this.html.length - 1;
        this.endOfChunkHit = false;
    }

    /**
     * Consumes and returns the next preprocessed code point.
     *
     * @returns {number} the code point, or $.EOF when the buffered input is
     *     exhausted (`endOfChunkHit` is then set unless the last chunk was written).
     */
    advance() {
        this.pos++;

        if (this.pos > this.lastCharPos) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        let cp = this.html.charCodeAt(this.pos);

        //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
        //must be ignored.
        if (this.skipNextNewLine && cp === $.LINE_FEED) {
            this.skipNextNewLine = false;
            this._addGap();
            return this.advance();
        }

        //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
        if (cp === $.CARRIAGE_RETURN) {
            this.skipNextNewLine = true;
            return $.LINE_FEED;
        }

        this.skipNextNewLine = false;

        if (unicode.isSurrogate(cp)) {
            cp = this._processSurrogate(cp);
        }

        //OPTIMIZATION: first check if code point is in the common allowed
        //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
        //before going into detailed performance cost validation.
        const isCommonValidRange =
            (cp > 0x1f && cp < 0x7f) || cp === $.LINE_FEED || cp === $.CARRIAGE_RETURN || (cp > 0x9f && cp < 0xfdd0);

        if (!isCommonValidRange) {
            this._checkForProblematicCharacters(cp);
        }

        return cp;
    }

    /**
     * Reports `cp` through _err() when it is a control or an undefined
     * (noncharacter) code point; does nothing otherwise.
     *
     * @param {number} cp - code point to validate.
     */
    _checkForProblematicCharacters(cp) {
        if (unicode.isControlCodePoint(cp)) {
            this._err(ERR.controlCharacterInInputStream);
        } else if (unicode.isUndefinedCodePoint(cp)) {
            this._err(ERR.noncharacterInInputStream);
        }
    }

    /**
     * Un-consumes the last code point. When the current position is a recorded
     * gap, the gap is popped and stepped over as well, so the position lands
     * before the code unit that opened it.
     */
    retreat() {
        if (this.pos === this.lastGapPos) {
            this.lastGapPos = this.gapStack.pop();
            this.pos--;
        }

        this.pos--;
    }
}

//NOTE: the class itself is the module's only export.
module.exports = Preprocessor;