From 776b0d83534310a9e91fa9641c54f57c73f69b51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Mart=C3=AD?=
Date: Mon, 15 Jul 2019 16:54:20 +0900
Subject: [PATCH] allow Relaxed to match punycode TLDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For example, it should match "test.xn--8y0a063a" just like it matches
"test.联通".

Instead of doubling the size of the regexp by adding the punycode
version of every known TLD, simply match any valid punycode string which
follows "xn--". It's highly unlikely that this would cause false
positives.

Fixes #27.
---
 xurls.go      | 4 +++-
 xurls_test.go | 5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/xurls.go b/xurls.go
index d6279ae..6585a05 100644
--- a/xurls.go
+++ b/xurls.go
@@ -72,7 +72,9 @@ func strictExp() string {
 }
 
 func relaxedExp() string {
-	site := domain + `(?i)` + anyOf(append(TLDs, PseudoTLDs...)...) + `(?-i)`
+	punycode := `xn--[a-z0-9-]+`
+	knownTLDs := anyOf(append(TLDs, PseudoTLDs...)...)
+	site := domain + `(?i)(` + punycode + `|` + knownTLDs + `)(?-i)`
 	hostName := `(` + site + `|` + ipAddr + `)`
 	webURL := hostName + port + `(/|/` + pathCont + `?|\b|$)`
 	return strictExp() + `|` + webURL
diff --git a/xurls_test.go b/xurls_test.go
index dd22692..c99c578 100644
--- a/xurls_test.go
+++ b/xurls_test.go
@@ -188,6 +188,11 @@ func TestRegexes(t *testing.T) {
 		{`foo.onion`, true},
 		{`中国.中国`, true},
 		{`中国.中国/foo中国`, true},
+		{`test.联通`, true},
+		{`test.xn--8y0a063a`, true},
+		{`test.xn--8y0a063a/foobar`, true},
+		{`test.xn-foo`, nil},
+		{`test.xn--`, nil},
 		{`foo.com/`, true},
 		{`1.1.1.1`, true},
 		{`10.50.23.250`, true},