From 729ad758604ce0f2ae2b047ca9f9fdaceac01068 Mon Sep 17 00:00:00 2001 From: Brian White Date: Wed, 27 Jan 2016 12:29:38 -0500 Subject: [PATCH] url: improve url.parse() performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit improves url.parse() performance by 50-210% with the existing url/url-parse benchmarks. Also, the optimizations made in url.format() result in a 40% increase in performance for url.resolve(). Some optimization strategies used in this commit include: * Combining multiple searches on the same string into a single loop * Avoiding unnecessary string.split() and array.join() * Minimizing creation of temporary strings * Using a faster alternative to encodeURIComponent, borrowed from the querystring module PR-URL: https://github.com/nodejs/node/pull/4892 Reviewed-By: James M Snell Reviewed-By: Ryan Graham Reviewed-By: Johan Bergström --- lib/url.js | 567 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 414 insertions(+), 153 deletions(-) diff --git a/lib/url.js b/lib/url.js index d9be77a1ec64f1..f2103b726056b8 100644 --- a/lib/url.js +++ b/lib/url.js @@ -34,24 +34,7 @@ const portPattern = /:[0-9]*$/; // Special case for a simple path URL const simplePathPattern = /^(\/\/?(?!\/)[^\?\s]*)(\?[^\s]*)?$/; -// RFC 2396: characters reserved for delimiting URLs. -// We actually just auto-escape these. -const delims = ['<', '>', '"', '`', ' ', '\r', '\n', '\t']; - -// RFC 2396: characters not allowed for various reasons. -const unwise = ['{', '}', '|', '\\', '^', '`'].concat(delims); - -// Allowed by RFCs, but cause of XSS attacks. Always escape these. -const autoEscape = ['\''].concat(unwise); - -// Characters that are never ever allowed in a hostname. -// Note that any invalid chars are also handled, but these -// are the ones that are *expected* to be seen, so we fast-path them. 
-const nonHostChars = ['%', '/', '?', ';', '#'].concat(autoEscape); -const hostEndingChars = ['/', '?', '#']; const hostnameMaxLen = 255; -const hostnamePartPattern = /^[+a-z0-9A-Z_-]{0,63}$/; -const hostnamePartStart = /^([+a-z0-9A-Z_-]{0,63})(.*)$/; // protocols that can allow "unsafe" and "unwise" chars. const unsafeProtocol = { 'javascript': true, @@ -93,21 +76,82 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) { // Copy chrome, IE, opera backslash-handling behavior. // Back slashes before the query string get converted to forward slashes // See: https://code.google.com/p/chromium/issues/detail?id=25916 - const queryIndex = url.indexOf('?'); - const splitter = - (queryIndex !== -1 && queryIndex < url.indexOf('#')) ? '?' : '#'; - const uSplit = url.split(splitter); - const slashRegex = /\\/g; - uSplit[0] = uSplit[0].replace(slashRegex, '/'); - url = uSplit.join(splitter); + var hasHash = false; + var start = -1; + var end = -1; + var rest = ''; + var lastPos = 0; + for (var i = 0, inWs = false, split = false; i < url.length; ++i) { + var code = url.charCodeAt(i); + + // Find first and last non-whitespace characters for trimming + var isWs = code === 32/* */ || + code === 9/*\t*/ || + code === 13/*\r*/ || + code === 10/*\n*/ || + code === 12/*\f*/ || + code === 160/*\u00A0*/ || + code === 65279/*\uFEFF*/; + if (start === -1) { + if (isWs) + continue; + lastPos = start = i; + } else { + if (inWs) { + if (!isWs) { + end = -1; + inWs = false; + } + } else if (isWs) { + end = i; + inWs = true; + } + } - var rest = url; + // Only convert backslashes while we haven't seen a split character + if (!split) { + switch (code) { + case 35: // '#' + hasHash = true; + // Fall through + case 63: // '?' + split = true; + break; + case 92: // '\\' + if (i - lastPos > 0) + rest += url.slice(lastPos, i); + rest += '/'; + lastPos = i + 1; + break; + } + } else if (!hasHash && code === 35/*#*/) { + hasHash = true; + } + } - // trim before proceeding. 
- // This is to support parse stuff like " http://foo.com \n" - rest = rest.trim(); + // Check if string was non-empty (including strings with only whitespace) + if (start !== -1) { + if (lastPos === start) { + // We didn't convert any backslashes + + if (end === -1) { + if (start === 0) + rest = url; + else + rest = url.slice(start); + } else { + rest = url.slice(start, end); + } + } else if (end === -1 && lastPos < url.length) { + // We converted some backslashes and have only part of the entire string + rest += url.slice(lastPos); + } else if (end !== -1 && lastPos < end) { + // We converted some backslashes and have only part of the entire string + rest += url.slice(lastPos, end); + } + } - if (!slashesDenoteHost && url.split('#').length === 1) { + if (!slashesDenoteHost && !hasHash) { // Try fast path regexp var simplePath = simplePathPattern.exec(rest); if (simplePath) { @@ -117,9 +161,9 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) { if (simplePath[2]) { this.search = simplePath[2]; if (parseQueryString) { - this.query = querystring.parse(this.search.substr(1)); + this.query = querystring.parse(this.search.slice(1)); } else { - this.query = this.search.substr(1); + this.query = this.search.slice(1); } } else if (parseQueryString) { this.search = ''; @@ -134,17 +178,18 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) { proto = proto[0]; var lowerProto = proto.toLowerCase(); this.protocol = lowerProto; - rest = rest.substr(proto.length); + rest = rest.slice(proto.length); } // figure out if it's got a host // user@server is *always* interpreted as a hostname, and url // resolution will treat //foo/bar as host=foo,path=bar because that's // how the browser resolves relative URLs. 
- if (slashesDenoteHost || proto || rest.match(/^\/\/[^@\/]+@[^@\/]+/)) { - var slashes = rest.substr(0, 2) === '//'; + if (slashesDenoteHost || proto || /^\/\/[^@\/]+@[^@\/]+/.test(rest)) { + var slashes = rest.charCodeAt(0) === 47/*/*/ && + rest.charCodeAt(1) === 47/*/*/; if (slashes && !(proto && hostlessProtocol[proto])) { - rest = rest.substr(2); + rest = rest.slice(2); this.slashes = true; } } @@ -167,95 +212,83 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) { // v0.12 TODO(isaacs): This is not quite how Chrome does things. // Review our test case against browsers more comprehensively. - // find the first instance of any hostEndingChars var hostEnd = -1; - for (let i = 0; i < hostEndingChars.length; i++) { - const hec = rest.indexOf(hostEndingChars[i]); - if (hec !== -1 && (hostEnd === -1 || hec < hostEnd)) - hostEnd = hec; - } - - // at this point, either we have an explicit point where the - // auth portion cannot go past, or the last @ char is the decider. - var auth, atSign; - if (hostEnd === -1) { - // atSign can be anywhere. - atSign = rest.lastIndexOf('@'); - } else { - // atSign must be in auth portion. - // http://a@b/c@d => host:b auth:a path:/c@d - atSign = rest.lastIndexOf('@', hostEnd); + var atSign = -1; + var nonHost = -1; + for (var i = 0; i < rest.length; ++i) { + switch (rest.charCodeAt(i)) { + case 9: // '\t' + case 10: // '\n' + case 13: // '\r' + case 32: // ' ' + case 34: // '"' + case 37: // '%' + case 39: // '\'' + case 59: // ';' + case 60: // '<' + case 62: // '>' + case 92: // '\\' + case 94: // '^' + case 96: // '`' + case 123: // '{' + case 124: // '|' + case 125: // '}' + // Characters that are never ever allowed in a hostname from RFC 2396 + if (nonHost === -1) + nonHost = i; + break; + case 35: // '#' + case 47: // '/' + case 63: // '?' 
+ // Find the first instance of any host-ending characters + if (nonHost === -1) + nonHost = i; + hostEnd = i; + break; + case 64: // '@' + // At this point, either we have an explicit point where the + // auth portion cannot go past, or the last @ char is the decider. + atSign = i; + nonHost = -1; + break; + } + if (hostEnd !== -1) + break; } - - // Now we have a portion which is definitely the auth. - // Pull that off. + var start = 0; if (atSign !== -1) { - auth = rest.slice(0, atSign); - rest = rest.slice(atSign + 1); - this.auth = decodeURIComponent(auth); + this.auth = decodeURIComponent(rest.slice(0, atSign)); + start = atSign + 1; } - - // the host is the remaining to the left of the first non-host char - hostEnd = -1; - for (let i = 0; i < nonHostChars.length; i++) { - const hec = rest.indexOf(nonHostChars[i]); - if (hec !== -1 && (hostEnd === -1 || hec < hostEnd)) - hostEnd = hec; + if (nonHost === -1) { + this.host = rest.slice(start); + rest = ''; + } else { + this.host = rest.slice(start, nonHost); + rest = rest.slice(nonHost); } - // if we still have not hit it, then the entire thing is a host. - if (hostEnd === -1) - hostEnd = rest.length; - - this.host = rest.slice(0, hostEnd); - rest = rest.slice(hostEnd); // pull out port. this.parseHost(); // we've indicated that there is a hostname, // so even if it's empty, it has to be present. - this.hostname = this.hostname || ''; + if (typeof this.hostname !== 'string') + this.hostname = ''; + + var hostname = this.hostname; // if hostname begins with [ and ends with ] // assume that it's an IPv6 address. - var ipv6Hostname = this.hostname[0] === '[' && - this.hostname[this.hostname.length - 1] === ']'; + var ipv6Hostname = hostname.charCodeAt(0) === 91/*[*/ && + hostname.charCodeAt(hostname.length - 1) === 93/*]*/; // validate a little. 
+ var result; if (!ipv6Hostname) { - var hostparts = this.hostname.split(/\./); - for (let i = 0, l = hostparts.length; i < l; i++) { - var part = hostparts[i]; - if (!part) continue; - if (!part.match(hostnamePartPattern)) { - var newpart = ''; - for (let j = 0, k = part.length; j < k; j++) { - if (part.charCodeAt(j) > 127) { - // we replace non-ASCII char with a temporary placeholder - // we need this to make sure size of hostname is not - // broken by replacing non-ASCII by nothing - newpart += 'x'; - } else { - newpart += part[j]; - } - } - // we test again with ASCII char only - if (!newpart.match(hostnamePartPattern)) { - var validParts = hostparts.slice(0, i); - var notHost = hostparts.slice(i + 1); - var bit = part.match(hostnamePartStart); - if (bit) { - validParts.push(bit[1]); - notHost.unshift(bit[2]); - } - if (notHost.length) { - rest = '/' + notHost.join('.') + rest; - } - this.hostname = validParts.join('.'); - break; - } - } - } + result = validateHostname(this, rest, hostname); + if (result !== undefined) + rest = result; } if (this.hostname.length > hostnameMaxLen) { @@ -280,7 +313,7 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) { // strip [ and ] from the hostname // the host field still retains them, though if (ipv6Hostname) { - this.hostname = this.hostname.substr(1, this.hostname.length - 2); + this.hostname = this.hostname.slice(1, -1); if (rest[0] !== '/') { rest = '/' + rest; } @@ -290,53 +323,63 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) { // now rest is set to the post-host stuff. // chop off any delim chars. if (!unsafeProtocol[lowerProto]) { - // First, make 100% sure that any "autoEscape" chars get // escaped, even if encodeURIComponent doesn't think they // need to be. 
- for (let i = 0, l = autoEscape.length; i < l; i++) { - var ae = autoEscape[i]; - if (rest.indexOf(ae) === -1) - continue; - var esc = encodeURIComponent(ae); - if (esc === ae) { - esc = escape(ae); - } - rest = rest.split(ae).join(esc); - } + result = autoEscapeStr(rest); + if (result !== undefined) + rest = result; } - - // chop off from the tail first. - var hash = rest.indexOf('#'); - if (hash !== -1) { - // got a fragment string. - this.hash = rest.substr(hash); - rest = rest.slice(0, hash); + var questionIdx = -1; + var hashIdx = -1; + for (var i = 0; i < rest.length; ++i) { + var code = rest.charCodeAt(i); + if (code === 35/*#*/) { + this.hash = rest.slice(i); + hashIdx = i; + break; + } else if (code === 63/*?*/ && questionIdx === -1) { + questionIdx = i; + } } - var qm = rest.indexOf('?'); - if (qm !== -1) { - this.search = rest.substr(qm); - this.query = rest.substr(qm + 1); + + if (questionIdx !== -1) { + if (hashIdx === -1) { + this.search = rest.slice(questionIdx); + this.query = rest.slice(questionIdx + 1); + } else { + this.search = rest.slice(questionIdx, hashIdx); + this.query = rest.slice(questionIdx + 1, hashIdx); + } if (parseQueryString) { this.query = querystring.parse(this.query); } - rest = rest.slice(0, qm); } else if (parseQueryString) { // no query string, but parseQueryString still requested this.search = ''; this.query = {}; } - if (rest) this.pathname = rest; + + var firstIdx = (questionIdx !== -1 && + (hashIdx === -1 || questionIdx < hashIdx) + ? 
// Longest dot-separated label accepted inside a hostname.
const hostnameLabelMaxLen = 63;

// Percent-escapes, indexed by character code, for the RFC 2396 delimiter and
// "unwise" characters plus the single quote (escaped to guard against XSS).
const autoEscapeTable = new Array(126);
for (const [code, escaped] of [
  [9, '%09'], // '\t'
  [10, '%0A'], // '\n'
  [13, '%0D'], // '\r'
  [32, '%20'], // ' '
  [34, '%22'], // '"'
  [39, '%27'], // '\''
  [60, '%3C'], // '<'
  [62, '%3E'], // '>'
  [92, '%5C'], // '\\'
  [94, '%5E'], // '^'
  [96, '%60'], // '`'
  [123, '%7B'], // '{'
  [124, '%7C'], // '|'
  [125, '%7D'], // '}'
]) {
  autoEscapeTable[code] = escaped;
}

// Checks `hostname` label by label.
//
// A label may contain ASCII letters, digits, '-', '+', '_' and any character
// above U+007F, and may be at most `hostnameLabelMaxLen` characters long.
//
// self     - object whose `hostname` property is overwritten with the valid
//            prefix when the hostname has to be cut.
// rest     - the part of the URL that follows the host.
// hostname - the hostname to validate.
//
// Returns `undefined` when the whole hostname is acceptable (`self` is left
// untouched). Otherwise sets `self.hostname` to the valid prefix and returns
// the new `rest`: '/' + the rejected remainder of the hostname + `rest`.
function validateHostname(self, rest, hostname) {
  // Must start at 0 so that the first label is length-checked as well.
  let labelStart = 0;
  for (let i = 0; i <= hostname.length; ++i) {
    const atEnd = i === hostname.length;
    const code = atEnd ? -1 : hostname.charCodeAt(i);

    if ((code >= 97/*a*/ && code <= 122/*z*/) ||
        (code >= 48/*0*/ && code <= 57/*9*/) ||
        (code >= 65/*A*/ && code <= 90/*Z*/) ||
        code === 45/*-*/ ||
        code === 43/*+*/ ||
        code === 95/*_*/ ||
        code > 127) {
      continue;
    }

    // The current label stops here: at a '.', at the end of the string, or at
    // an invalid character. In every case it may not exceed the maximum
    // length; the excess is moved to the path.
    if (i - labelStart > hostnameLabelMaxLen) {
      const cut = labelStart + hostnameLabelMaxLen;
      self.hostname = hostname.slice(0, cut);
      return '/' + hostname.slice(cut) + rest;
    }

    if (atEnd || code === 46/*.*/) {
      labelStart = i + 1;
      continue;
    }

    // Invalid host character: everything from it onwards (including the
    // character itself, even when it is the last one) belongs to the path.
    self.hostname = hostname.slice(0, i);
    return '/' + hostname.slice(i) + rest;
  }
  return undefined;
}

// Percent-escapes every character of `rest` listed in `autoEscapeTable`.
//
// Returns the escaped string, or `undefined` when `rest` contains nothing
// that needs escaping (so callers can keep the original string).
function autoEscapeStr(rest) {
  let escapedRest = '';
  let lastPos = 0;
  for (let i = 0; i < rest.length; ++i) {
    const replacement = autoEscapeTable[rest.charCodeAt(i)];
    if (replacement === undefined)
      continue;
    // Copy the untouched run preceding this character, then its escape.
    if (i > lastPos)
      escapedRest += rest.slice(lastPos, i);
    escapedRest += replacement;
    lastPos = i + 1;
  }
  // lastPos only moves when something was escaped.
  if (lastPos === 0)
    return undefined;
  if (lastPos < rest.length)
    return escapedRest + rest.slice(lastPos);
  return escapedRest;
}
@@ -365,8 +540,7 @@ function urlFormat(obj) { Url.prototype.format = function() { var auth = this.auth || ''; if (auth) { - auth = encodeURIComponent(auth); - auth = auth.replace(/%3A/i, ':'); + auth = encodeAuth(auth); auth += '@'; } @@ -387,34 +561,55 @@ Url.prototype.format = function() { } } - if (this.query !== null && - typeof this.query === 'object' && - Object.keys(this.query).length) { + if (this.query !== null && typeof this.query === 'object') query = querystring.stringify(this.query); - } var search = this.search || (query && ('?' + query)) || ''; - if (protocol && protocol.substr(-1) !== ':') protocol += ':'; + if (protocol && protocol.charCodeAt(protocol.length - 1) !== 58/*:*/) + protocol += ':'; + + var newPathname = ''; + var lastPos = 0; + for (var i = 0; i < pathname.length; ++i) { + switch (pathname.charCodeAt(i)) { + case 35: // '#' + if (i - lastPos > 0) + newPathname += pathname.slice(lastPos, i); + newPathname += '%23'; + lastPos = i + 1; + break; + case 63: // '?' + if (i - lastPos > 0) + newPathname += pathname.slice(lastPos, i); + newPathname += '%3F'; + lastPos = i + 1; + break; + } + } + if (lastPos > 0) { + if (lastPos !== pathname.length) + pathname = newPathname + pathname.slice(lastPos); + else + pathname = newPathname; + } // only the slashedProtocols get the //. Not mailto:, xmpp:, etc. // unless they had them to begin with. if (this.slashes || (!protocol || slashedProtocol[protocol]) && host !== false) { host = '//' + (host || ''); - if (pathname && pathname.charAt(0) !== '/') pathname = '/' + pathname; + if (pathname && pathname.charCodeAt(0) !== 47/*/*/) + pathname = '/' + pathname; } else if (!host) { host = ''; } - if (hash && hash.charAt(0) !== '#') hash = '#' + hash; - if (search && search.charAt(0) !== '?') search = '?' 
// hexTable[b] is the percent-escape ('%XX', upper-case hex) of the byte b.
const hexTable = new Array(256);
for (let i = 0; i < 256; ++i)
  hexTable[i] = '%' + ((i < 16 ? '0' : '') + i.toString(16)).toUpperCase();

// Faster encodeURIComponent alternative for encoding the auth component of a
// URL. Every character is percent-encoded as UTF-8 except:
//   ! - . _ ~ ' ( ) * :
//   digits
//   alpha (uppercase)
//   alpha (lowercase)
// Unlike encodeURIComponent, ':' is left as is and this function never
// throws: an unpaired surrogate is encoded as U+FFFD (%EF%BF%BD).
//
// str - the string to encode.
//
// Returns the encoded string; `str` itself when nothing needed escaping.
function encodeAuth(str) {
  let out = '';
  let lastPos = 0;
  for (let i = 0; i < str.length; ++i) {
    let c = str.charCodeAt(i);

    // Characters that do not need escaping
    if (c === 0x21 || c === 0x2D || c === 0x2E || c === 0x5F || c === 0x7E ||
        (c >= 0x27 && c <= 0x2A) ||
        (c >= 0x30 && c <= 0x3A) ||
        (c >= 0x41 && c <= 0x5A) ||
        (c >= 0x61 && c <= 0x7A)) {
      continue;
    }

    // Flush the run of unescaped characters that precedes this one.
    if (i > lastPos)
      out += str.slice(lastPos, i);
    lastPos = i + 1;

    // Other ASCII characters
    if (c < 0x80) {
      out += hexTable[c];
      continue;
    }

    // Two-byte sequences
    if (c < 0x800) {
      out += hexTable[0xC0 | (c >> 6)] + hexTable[0x80 | (c & 0x3F)];
      continue;
    }

    if (c >= 0xD800 && c <= 0xDFFF) {
      // Only a high surrogate immediately followed by a low surrogate forms
      // a pair.
      const next = (c <= 0xDBFF && i + 1 < str.length) ?
        str.charCodeAt(i + 1) : 0;
      if (next >= 0xDC00 && next <= 0xDFFF) {
        ++i;
        // The low surrogate has been consumed too; it must not be copied
        // verbatim by the next flush.
        lastPos = i + 1;
        c = 0x10000 + (((c & 0x3FF) << 10) | (next & 0x3FF));
        out += hexTable[0xF0 | (c >> 18)] +
               hexTable[0x80 | ((c >> 12) & 0x3F)] +
               hexTable[0x80 | ((c >> 6) & 0x3F)] +
               hexTable[0x80 | (c & 0x3F)];
        continue;
      }
      // Unpaired surrogate: substitute the replacement character and leave
      // the following code unit to be processed on its own.
      c = 0xFFFD;
    }

    // Three-byte sequences
    out += hexTable[0xE0 | (c >> 12)] +
           hexTable[0x80 | ((c >> 6) & 0x3F)] +
           hexTable[0x80 | (c & 0x3F)];
  }
  if (lastPos === 0)
    return str;
  if (lastPos < str.length)
    return out + str.slice(lastPos);
  return out;
}