Skip to content

Commit

Permalink
Merge pull request #56 from serega/rfc-3987-iri-optional
Browse files Browse the repository at this point in the history
Added a flag `url_can_be_iri` that allows disabling IRI parsing
  • Loading branch information
robinst committed Jun 24, 2023
2 parents c492f01 + e2e9ae9 commit 7331d75
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 15 deletions.
5 changes: 5 additions & 0 deletions src/domains.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pub(crate) fn find_authority_end(
mut userinfo_allowed: bool,
require_host: bool,
port_allowed: bool,
iri_parsing_enabled: bool,
) -> (Option<usize>, Option<usize>) {
let mut end = Some(0);

Expand All @@ -47,6 +48,9 @@ pub(crate) fn find_authority_end(
let can_be_last = match c {
// ALPHA
'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => {
if !iri_parsing_enabled && c > '\u{80}' {
break;
}
// Can start or end a domain label, but not numeric
dot_allowed = true;
hyphen_allowed = true;
Expand All @@ -59,6 +63,7 @@ pub(crate) fn find_authority_end(

!require_host || !host_ended
}

// DIGIT
'0'..='9' => {
// Same as above, except numeric
Expand Down
2 changes: 1 addition & 1 deletion src/email.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ impl EmailScanner {

// See "Domain" in RFC 5321, plus extension of "sub-domain" in RFC 6531
fn find_end(&self, s: &str) -> Option<usize> {
if let (Some(end), last_dot) = find_authority_end(s, false, true, false) {
if let (Some(end), last_dot) = find_authority_end(s, false, true, false, true) {
if !self.domain_must_have_dot || last_dot.is_some() {
Some(end)
} else {
Expand Down
20 changes: 18 additions & 2 deletions src/finder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ pub struct LinkFinder {
email_domain_must_have_dot: bool,
url: bool,
url_must_have_scheme: bool,
url_can_be_iri: bool,
}

/// Iterator for finding links.
Expand Down Expand Up @@ -133,6 +134,7 @@ impl LinkFinder {
email_domain_must_have_dot: true,
url: true,
url_must_have_scheme: true,
url_can_be_iri: true,
}
}

Expand All @@ -154,6 +156,14 @@ impl LinkFinder {
self
}

/// Sets whether URLs are allowed to be IRIs (Internationalized Resource
/// Identifiers, RFC 3987), i.e. whether they may contain non-ASCII
/// characters. The default is `true`.
///
/// Setting it to `false` means domains can contain ASCII characters only.
pub fn url_can_be_iri(&mut self, url_can_be_iri: bool) -> &mut LinkFinder {
    self.url_can_be_iri = url_can_be_iri;
    self
}

/// Restrict the kinds of links that should be found to the specified ones.
pub fn kinds(&mut self, kinds: &[LinkKind]) -> &mut LinkFinder {
self.email = false;
Expand All @@ -178,6 +188,7 @@ impl LinkFinder {
self.url_must_have_scheme,
self.email,
self.email_domain_must_have_dot,
self.url_can_be_iri,
)
}

Expand Down Expand Up @@ -212,9 +223,14 @@ impl<'t> Links<'t> {
url_must_have_scheme: bool,
email: bool,
email_domain_must_have_dot: bool,
iri_parsing_enabled: bool,
) -> Links<'t> {
let url_scanner = UrlScanner;
let domain_scanner = DomainScanner;
let url_scanner = UrlScanner {
iri_parsing_enabled,
};
let domain_scanner = DomainScanner {
iri_parsing_enabled,
};
let email_scanner = EmailScanner {
domain_must_have_dot: email_domain_must_have_dot,
};
Expand Down
36 changes: 25 additions & 11 deletions src/url.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,14 @@ const QUOTES: &[char] = &['\'', '\"'];
/// Scan for URLs starting from the trigger character ":" (requires "://").
///
/// Based on RFC 3986.
pub struct UrlScanner;
pub struct UrlScanner {
    // When `false`, non-ASCII characters (U+0080 and above) terminate the
    // URL instead of being included in it, i.e. RFC 3987 IRI parsing is
    // disabled and only ASCII URLs are matched.
    pub iri_parsing_enabled: bool,
}

/// Scan for plain domains (without scheme) such as `test.com` or `test.com/hi-there`.
pub struct DomainScanner;
pub struct DomainScanner {
    // When `false`, plain (scheme-less) domains may contain ASCII
    // characters only; a non-ASCII character stops the scan (no RFC 3987
    // IRI support).
    pub iri_parsing_enabled: bool,
}

impl Scanner for UrlScanner {
/// Scan for an URL at the given separator index in the string.
Expand Down Expand Up @@ -51,8 +55,12 @@ impl Scanner for UrlScanner {

let require_host = scheme_requires_host(scheme);

if let (Some(after_authority), _) = find_authority_end(s, true, require_host, true) {
if let Some(end) = find_url_end(&s[after_authority..], quote) {
if let (Some(after_authority), _) =
find_authority_end(s, true, require_host, true, self.iri_parsing_enabled)
{
if let Some(end) =
find_url_end(&s[after_authority..], quote, self.iri_parsing_enabled)
{
if after_authority == 0 && end == 0 {
return None;
}
Expand All @@ -77,11 +85,14 @@ impl Scanner for DomainScanner {
return None;
}

if let (Some(start), quote) = find_domain_start(&s[0..separator]) {
if let (Some(start), quote) = find_domain_start(&s[0..separator], self.iri_parsing_enabled)
{
let s = &s[start..];

if let (Some(domain_end), Some(_)) = find_authority_end(s, false, true, true) {
if let Some(end) = find_url_end(&s[domain_end..], quote) {
if let (Some(domain_end), Some(_)) =
find_authority_end(s, false, true, true, self.iri_parsing_enabled)
{
if let Some(end) = find_url_end(&s[domain_end..], quote, self.iri_parsing_enabled) {
let range = Range {
start,
end: start + domain_end + end,
Expand Down Expand Up @@ -146,14 +157,15 @@ fn scheme_requires_host(scheme: &str) -> bool {
/// - Domain is labels separated by `.`. Because we're starting at the first `.`, we only need to
/// handle one label.
/// - Label can not start or end with `-`
/// - Label can contain letters, digits, `-` or Unicode
fn find_domain_start(s: &str) -> (Option<usize>, Option<char>) {
/// - Label can contain letters, digits, `-` or Unicode if iri_allowed flag is true
fn find_domain_start(s: &str, iri_parsing_enabled: bool) -> (Option<usize>, Option<char>) {
let mut first = None;
let mut quote = None;

for (i, c) in s.char_indices().rev() {
match c {
'a'..='z' | 'A'..='Z' | '0'..='9' | '\u{80}'..=char::MAX => first = Some(i),
'a'..='z' | 'A'..='Z' | '0'..='9' => first = Some(i),
'\u{80}'..=char::MAX if iri_parsing_enabled => first = Some(i),
// If we had something valid like `https://www.` we'd have found it with the ":"
// scanner already. We don't want to allow `.../www.example.com` just by itself.
// We *could* allow `//www.example.com` (scheme-relative URLs) in the future.
Expand Down Expand Up @@ -192,7 +204,7 @@ fn find_domain_start(s: &str) -> (Option<usize>, Option<char>) {

/// Find the end of a URL. At this point we already scanned past a valid authority. So e.g. in
/// `https://example.com/foo` we're starting at `/` and want to end at `o`.
fn find_url_end(s: &str, quote: Option<char>) -> Option<usize> {
fn find_url_end(s: &str, quote: Option<char>, iri_parsing_enabled: bool) -> Option<usize> {
let mut round = 0;
let mut square = 0;
let mut curly = 0;
Expand Down Expand Up @@ -272,6 +284,8 @@ fn find_url_end(s: &str, quote: Option<char>) -> Option<usize> {
// A single quote can only be the end of an URL if there's an even number
!single_quote
}
'\u{80}'..=char::MAX if !iri_parsing_enabled => false,

_ => true,
};
if can_be_last {
Expand Down
22 changes: 21 additions & 1 deletion tests/domains.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
mod common;

use crate::common::assert_linked_with;
use linkify::LinkFinder;
use linkify::{LinkFinder, LinkKind};

#[test]
fn domain_valid() {
Expand Down Expand Up @@ -155,6 +155,26 @@ fn without_scheme_should_stop() {
assert_not_linked("1abc://example.com");
}

/// With IRI parsing disabled, a non-ASCII prefix must not be part of the
/// link; only the ASCII domain is recognized.
#[test]
fn test_international_not_allowed() {
    let mut finder = LinkFinder::new();
    finder.url_must_have_scheme(false);
    finder.url_can_be_iri(false);
    let mut links = finder.links("\u{A1}\u{A2}example.com");
    let link = links.next().unwrap();
    assert_eq!(link.kind(), &LinkKind::Url);
    assert_eq!(link.as_str(), "example.com");
    // The ASCII domain is the only link in the input.
    assert!(links.next().is_none());
}

/// With IRI parsing enabled (the default), non-ASCII characters are
/// included in the linked domain.
#[test]
fn test_international_allowed() {
    let mut finder = LinkFinder::new();
    finder.url_must_have_scheme(false);
    finder.url_can_be_iri(true);
    let mut links = finder.links("\u{A1}\u{A2}example.com");
    let link = links.next().unwrap();
    assert_eq!(link.kind(), &LinkKind::Url);
    assert_eq!(link.as_str(), "\u{A1}\u{A2}example.com");
    // The full IRI domain is the only link in the input.
    assert!(links.next().is_none());
}

fn assert_linked(input: &str, expected: &str) {
let mut finder = LinkFinder::new();
finder.url_must_have_scheme(false);
Expand Down
50 changes: 50 additions & 0 deletions tests/url.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,56 @@ fn international() {
);
}

/// Scheme-ful URLs with IRI parsing disabled: non-ASCII characters end the
/// URL (or prevent a match entirely), while Punycode hosts keep working.
#[test]
fn international_not_allowed() {
    let mut finder = LinkFinder::new();
    finder.url_can_be_iri(false);
    finder.url_must_have_scheme(true);
    finder.kinds(&[LinkKind::Url]);
    // (input, expected) pairs; `|...|` delimits the range expected to link.
    let cases = [
        ("http://üñîçøðé.com", "http://üñîçøðé.com"),
        ("http://üñîçøðé.com/ä", "http://üñîçøðé.com/ä"),
        ("http://example.org/\u{A1}", "|http://example.org/|\u{A1}"),
        ("http://example.org/\u{A2}", "|http://example.org/|\u{A2}"),
        ("http://example.org/\u{1F600}", "|http://example.org/|\u{1F600}"),
        ("http://example.org/\u{A2}/", "|http://example.org/|\u{A2}/"),
        ("http://xn--c1h.example.com/", "|http://xn--c1h.example.com/|"),
    ];
    for &(input, expected) in cases.iter() {
        assert_linked_with(&finder, input, expected);
    }
}

/// Scheme-less domains with IRI parsing disabled behave like the scheme-ful
/// variant above: only ASCII (incl. Punycode) domains are linked.
#[test]
fn international_not_allowed_without_protocol() {
    let mut finder = LinkFinder::new();
    finder.url_can_be_iri(false);
    finder.url_must_have_scheme(false);
    finder.kinds(&[LinkKind::Url]);
    // (input, expected) pairs; `|...|` delimits the range expected to link.
    let cases = [
        ("üñîçøðé.com", "üñîçøðé.com"),
        ("üñîçøðé.com/ä", "üñîçøðé.com/ä"),
        ("example.org/\u{A1}", "|example.org/|\u{A1}"),
        ("example.org/\u{A2}", "|example.org/|\u{A2}"),
        ("example.org/\u{1F600}", "|example.org/|\u{1F600}"),
        ("example.org/\u{A2}/", "|example.org/|\u{A2}/"),
        ("xn--c1h.example.com/", "|xn--c1h.example.com/|"),
    ];
    for &(input, expected) in cases.iter() {
        assert_linked_with(&finder, input, expected);
    }
}

#[test]
fn international_without_protocol() {
assert_urls_without_protocol("üñîçøðé.com", "|üñîçøðé.com|");
Expand Down

0 comments on commit 7331d75

Please sign in to comment.