Skip to content

Commit

Permalink
fixing URI.withinString to not use URI.find_uri_expression anymore an…
Browse files Browse the repository at this point in the history
…d allow filtering URLs by optional RegExp as well as ignore URLs in HTML - closing #117, #131
  • Loading branch information
rodneyrehm committed Jan 22, 2014
1 parent bccca64 commit 9f04605
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 25 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,8 @@ URI.js is published under the [MIT license](http://www.opensource.org/licenses/m
* fixing [`.absoluteTo()`](http://medialize.github.com/URI.js/docs.html#absoluteto) to comply with [RFC3986 Reference Resolution Examples](http://tools.ietf.org/html/rfc3986#section-5.4) - ([Issue #113](https://github.com/medialize/URI.js/issues/113))
* fixing [`.normalizePath()`](http://medialize.github.com/URI.js/docs.html#normalize-path) to maintain leading parent references (`../`) for relative paths, while removing them for absolute paths - ([Issue #133](https://github.com/medialize/URI.js/issues/133))
* fixing `URI.protocol_expression` to properly accept `.` in compliance with [RFC 3986 - Scheme](http://tools.ietf.org/html/rfc3986#section-3.1) - ([Issue #132](https://github.com/medialize/URI.js/issues/132))
* fixing [`URI.withinString()`](http://medialize.github.com/URI.js/docs.html#static-withinString) to not use backtracking-prone regular expression `URI.find_uri_expression` anymore - ([Issue #131](https://github.com/medialize/URI.js/issues/131))
* fixing [`URI.withinString()`](http://medialize.github.com/URI.js/docs.html#static-withinString) to accept options `ignore` and `ignoreHtml` to allow better control over which detected URLs get handled - ([Issue #117](https://github.com/medialize/URI.js/issues/117))

### 1.11.2 (August 14th 2013) ###

Expand Down
52 changes: 46 additions & 6 deletions docs.html
Original file line number Diff line number Diff line change
Expand Up @@ -258,8 +258,8 @@ <h3 id="constructor">URI Constructor</h3>

<h3 id="clone">cloning URIs</h3>
<p>Get a copy of the current URI instance</p>
<pre class="prettyprint lang-js">var uri = new URI("http://example.org"),
uri2 = uri.clone();
<pre class="prettyprint lang-js">var uri = new URI("http://example.org");
var uri2 = uri.clone();

uri2.tld("com");
uri == "http://example.org/";
Expand Down Expand Up @@ -990,8 +990,8 @@ <h3 id="static-parseHost">URI.parseHost(<em>string</em> url, <em>object</em> par
<p>parses a string's beginning into its URI components hostname, port.
Found components are appended to the <code>parts</code> parameter.
Remaining string is returned</p>
<pre class="prettyprint lang-js">var parts = {},
result = URI.parseAuthority("example.org:8080/foo.html", parts);
<pre class="prettyprint lang-js">var parts = {};
var result = URI.parseHost("example.org:8080/foo.html", parts);
result === "/foo.html";
parts === {
hostname: "example.org",
Expand Down Expand Up @@ -1066,8 +1066,8 @@ <h3 id="static-buildQuery">URI.buildQuery(<em>object</em> data, [<em>boolean</em
// Note: duplicate hello=mars is preserved
URI.buildQuery(data, true) === "foo=bar&amp;hello=world&amp;hello=mars&amp;hello=mars&amp;bam=&amp;yup";</pre>
<p>To preserve duplicate values, use URI.buildQuery() directly:</p>
<pre class="prettyprint lang-js">var uri = new URI("http://example.org/foo.html?bar=baz"),
data = uri.query(true);
<pre class="prettyprint lang-js">var uri = new URI("http://example.org/foo.html?bar=baz");
var data = uri.query(true);

data.some = "new data";
uri.query(URI.buildQuery(data, true));
Expand Down Expand Up @@ -1250,6 +1250,46 @@ <h3 id="static-withinString">URI.withinString()</h3>
+ escapeHtml(url.<a href="#readable">readable</a>()) + "&lt;/a&gt;";
});
</pre>
<p>As of URI.js 1.12.0 withinString accepts the following parameters:</p>
<pre class="prettyprint lang-js">var source = "Hello www.example.com.";
var decorate = function(url) {
return "&lt;code&gt;" + url + "&lt;/code&gt;";
};
var result = null;

// access to the original input text from the callback
URI.withinString(source, function(url, start, end, source) {
source.slice(start, end) === url;
return url;
});

// ignore certain URLs
source = "Hello www.example.com,\n"
+ "ohgodno://example.org/ is a protocol we want ignored";
result = URI.withinString(source, decorate, {
ignore: /^ohgodno:/i
});

/* result is:
Hello <strong>&lt;code&gt;www.example.com&lt;/code&gt;</strong>,
ohgodno://example.org/ is a protocol we want ignored
*/

// ignore URLs in HTML
source = "Hello www.example.com,\n"
+ '&lt;img src=&quot;http://example.org/image.png&quot; alt=&quot;&quot;&gt; is HTML,\n'
+ "&lt;a href='http://example.org/target.html'&gt; is HTML&lt;/a&gt;,\n"
+ "&lt;a href=http://example.org/target.html&gt; is HTML, too&lt;/a&gt;.";
result = URI.withinString(source, decorate, {
ignoreHtml: true
});

/* result is:
Hello <strong>&lt;code&gt;www.example.com&lt;/code&gt;</strong>,
&lt;img src=&quot;http://example.org/image.png&quot; alt=&quot;&quot;&gt; is HTML,
&lt;a href='http://example.org/target.html'&gt; is HTML&lt;/a&gt;,
&lt;a href=http://example.org/target.html&gt; is HTML, too&lt;/a&gt;.
*/</pre>

<h3 id="static-iso8859">URI.iso8859()</h3>
<p>URI.iso8859() tells URI.js to use the older escape/unescape methods, for backwards compatibility with non-unicode platforms.</p>
Expand Down
52 changes: 45 additions & 7 deletions src/URI.js
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,16 @@ URI.ip4_expression = /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/;
// source: http://forums.intermapper.com/viewtopic.php?p=1096#1096
// specification: http://www.ietf.org/rfc/rfc4291.txt
URI.ip6_expression = /^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$/;
// gruber revised expression - http://rodneyrehm.de/t/url-regex.html
// expression used is "gruber revised" (@gruber v2) determined to be the
// best solution in a regex-golf we did a couple of ages ago at
// * http://mathiasbynens.be/demo/url-regex
// * http://rodneyrehm.de/t/url-regex.html
URI.find_uri_expression = /\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))/ig;
// Default expressions URI.withinString() uses to locate URIs in free text.
// Each one can be replaced per call via the options argument
// (options.start, options.end, options.trim).
URI.findUri = {
// beginning of a URI: "<scheme>://" or "www." at a word boundary.
// Carries the global flag because the scanner resumes via lastIndex.
start: /\b(?:([a-z][a-z0-9.+-]*:\/\/)|www\.)/gi,
// end of a URI candidate: first whitespace character, or end of input
end: /[\s\r\n]|$/,
// trailing punctuation / quote characters removed from a candidate
trim: /[`!()\[\]{};:'".,<>?«»“”„‘’]+$/
};
// http://www.iana.org/assignments/uri-schemes.html
// http://en.wikipedia.org/wiki/List_of_TCP_and_UDP_port_numbers#Well-known_ports
URI.defaultPorts = {
Expand Down Expand Up @@ -758,13 +766,43 @@ URI.commonPath = function(one, two) {
return one.substring(0, pos + 1);
};

/**
 * Find URIs inside a plain-text string and replace each one with whatever
 * `callback` returns for it.
 *
 * A URI candidate begins where `start` matches, runs up to the first match
 * of `end` and then has trailing characters matching `trim` removed.
 *
 * @param {string} string - text to scan
 * @param {function(string, number, number, string): *} callback - invoked as
 *   callback(url, start, end, source) where source.slice(start, end) === url.
 *   The return value (converted to a string) replaces the URL; returning
 *   undefined or null leaves the URL untouched.
 * @param {Object} [options]
 * @param {RegExp} [options.start] - overrides URI.findUri.start
 * @param {RegExp} [options.end] - overrides URI.findUri.end
 * @param {RegExp} [options.trim] - overrides URI.findUri.trim
 * @param {RegExp} [options.ignore] - URLs matching this are left untouched
 * @param {boolean} [options.ignoreHtml] - leave URLs that directly follow
 *   an attribute opening (`name=`, `name="`, `name='`) untouched
 * @returns {string} the text with every handled URL replaced
 */
URI.withinString = function(string, callback, options) {
  options = options || {};
  var _start = options.start || URI.findUri.start;
  var _end = options.end || URI.findUri.end;
  var _trim = options.trim || URI.findUri.trim;
  var _ignore = options.ignore;
  var _attributeOpen = /[a-z0-9-]=["']?$/i;

  if (!_start.global) {
    // exec() on a non-global expression always restarts at index 0, which
    // would make the loop below spin forever - scan with a global copy
    _start = new RegExp(
      _start.source,
      (_start.ignoreCase ? 'i' : '') + (_start.multiline ? 'm' : '') + 'g'
    );
  }

  _start.lastIndex = 0;
  while (true) {
    var match = _start.exec(string);
    if (!match) {
      break;
    }

    var start = match.index;
    // untrimmed end of the candidate; a custom `end` expression that never
    // matches means the candidate runs to the end of the text
    var offset = string.slice(start).search(_end);
    var rawEnd = offset === -1 ? string.length : start + offset;
    // never end before the start marker itself, so skipping always advances
    rawEnd = Math.max(rawEnd, start + match[0].length);

    if (options.ignoreHtml) {
      // attribut(e=["']?$) - look at the three characters before the URL
      var attributeOpen = string.slice(Math.max(start - 3, 0), start);
      if (attributeOpen && _attributeOpen.test(attributeOpen)) {
        // skip the whole candidate, otherwise a "www." inside the very
        // same attribute value would be picked up by the next iteration
        _start.lastIndex = rawEnd;
        continue;
      }
    }

    var slice = string.slice(start, rawEnd).replace(_trim, '');
    if (slice.length <= match[0].length) {
      // nothing but the start marker (e.g. "www." or "http://") - not a URL
      continue;
    }

    if (_ignore) {
      // a global/sticky expression keeps state between test() calls
      _ignore.lastIndex = 0;
      if (_ignore.test(slice)) {
        // skip the whole URL so that no part of it is matched again
        _start.lastIndex = rawEnd;
        continue;
      }
    }

    var end = start + slice.length;
    var result = callback(slice, start, end, string);
    if (result === undefined || result === null) {
      // callback only inspected the URL - keep the text as it is
      _start.lastIndex = end;
      continue;
    }

    result = String(result);
    string = string.slice(0, start) + result + string.slice(end);
    // resume behind the replacement so its content is never re-scanned
    _start.lastIndex = start + result.length;
  }

  // do not leak scanner state through the shared expression
  _start.lastIndex = 0;
  return string;
};

URI.ensureValidHostname = function(v) {
Expand Down
52 changes: 40 additions & 12 deletions test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -1340,21 +1340,49 @@ test("relativeTo", function() {
module("static helpers");
test("withinString", function() {
  // plain-text sample: www-only host, plain http URLs, an IDN host, IPv4 and
  // IPv6 hosts, and URLs wrapped in parentheses or quotes
  var source = "Hello www.example.com,\n"
    + "http://google.com is a search engine, like http://www.bing.com\n"
    + "http://exämple.org/foo.html?baz=la#bumm is an IDN URL,\n"
    + "http://123.123.123.123/foo.html is IPv4 and http://fe80:0000:0000:0000:0204:61ff:fe9d:f156/foobar.html is IPv6.\n"
    + "links can also be in parens (http://example.org) or quotes »http://example.org«.";
  // every URL wrapped in <a>…</a>; surrounding punctuation stays outside
  var expected = "Hello <a>www.example.com</a>,\n"
    + "<a>http://google.com</a> is a search engine, like <a>http://www.bing.com</a>\n"
    + "<a>http://exämple.org/foo.html?baz=la#bumm</a> is an IDN URL,\n"
    + "<a>http://123.123.123.123/foo.html</a> is IPv4 and <a>http://fe80:0000:0000:0000:0204:61ff:fe9d:f156/foobar.html</a> is IPv6.\n"
    + "links can also be in parens (<a>http://example.org</a>) or quotes »<a>http://example.org</a>«.";
  var result = URI.withinString(source, function(url) {
    return '<a>' + url + '</a>';
  });

  equal(result, expected, "in string URI identification");
});
test("withinString - ignore", function() {
  // marks every URL handed to the callback
  function wrap(url) {
    return '<a>' + url + '</a>';
  }

  // this line uses the scheme excluded through the `ignore` option,
  // so it has to come back unchanged
  var untouched = "proto://example.org/foo.html?baz=la#bumm is an URL.\n";
  var input = "Hello www.example.com,\n" + untouched;
  var wanted = "Hello <a>www.example.com</a>,\n" + untouched;
  var actual = URI.withinString(input, wrap, {ignore: /^proto:/i});

  equal(actual, wanted, "filtered in string URI identification");
});
test("withinString - ignoreHtml", function() {
  // marks every URL handed to the callback
  function wrap(url) {
    return '<a>' + url + '</a>';
  }

  // URLs directly following an attribute opening (unquoted, single-quoted,
  // double-quoted) - with `ignoreHtml` these lines have to come back unchanged
  var markup = "<a href=http://example.org/foo.html?baz=la#bumm is an URL</a>.\n"
    + "<a href='http://example.org/foo.html?baz=la#bumm> is an URL</a>.\n"
    + "<a href=\"http://example.org/foo.html?baz=la#bumm\"> is an URL</a>.\n";
  var input = "Hello www.example.com,\n" + markup;
  var wanted = "Hello <a>www.example.com</a>,\n" + markup;
  var actual = URI.withinString(input, wrap, {ignoreHtml: true});

  equal(actual, wanted, "filtered in string URI identification");
});
test("noConflict", function() {
var actual_lib = URI; // actual library; after loading, before noConflict()
var unconflicted = URI.noConflict();
Expand Down

0 comments on commit 9f04605

Please sign in to comment.