-
Notifications
You must be signed in to change notification settings - Fork 0
/
regexp_dotno
executable file
·63 lines (56 loc) · 2.34 KB
/
regexp_dotno
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#! /usr/bin/env bash
# Absolute path of the directory that contains this script; it is used to
# find helper scripts that live next to it (e.g. cat_domains_to_regexp).
# CDPATH is cleared for the cd: when the script is invoked through a relative
# path and CDPATH is exported, cd would otherwise echo the resolved directory
# and SOURCE_DIR would end up holding the path twice. The "--" guards protect
# against a directory name that begins with a dash.
SOURCE_DIR="$( CDPATH= cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" && pwd )"
#
# For usage, see ./test_regexp
#
# Source https://www.norid.no/en/regelverk/navnepolitikk/#link3
# Verified 2018-12-11
# The lines below prefixed with #* are read by the code in
# mcn-source-simple/permutate
# Name of letter Unicode Name of letter Unicode
#* a with acute accent á 00E1* o with acute accent ó 00F3*
#* a with grave accent à 00E0* o with grave accent ò 00F2*
#* a with umlaut ä 00E4* o with circumflex ô 00F4*
#* c with caron č 010D* o with umlaut ö 00F6*
#* c with cedilla ç 00E7* s with caron š 0161*
#* d with crossbar đ 0111* t with crossbar ŧ 0167*
#* e with acute accent é 00E9* u with umlaut ü 00FC*
#* e with grave accent è 00E8* z with caron ž 017E*
#* e with circumflex ê 00EA* æ æ 00E6*
#* i with umlaut ï 00EF* ø ø 00F8*
#* ENG ŋ 014B* å å 00E5*
#* n with acute accent ń 0144* n with tilde ñ 00F1*
# Perl character class for every character allowed in a .no label except the
# hyphen: ASCII letters and digits plus one \x{...} escape per code point
# listed in the table above.
vchar_code_points=(
  E0 E1 E4 E5 E6 E7 E8 E9 EA EF
  F1 F2 F3 F4 F6 F8 FC
  10D 111 144 14B 161 167 17E
)
VCHAR_EX_DASH='[a-z0-9'
VCHAR_EX_DASH+="$(printf '\\x{%s}' "${vchar_code_points[@]}")"
VCHAR_EX_DASH+=']'
unset vchar_code_points
# All valid category domains, as printed by the cat_domains_to_regexp helper
# located in SOURCE_DIR. The helper path is quoted so that a script directory
# containing whitespace or glob characters still resolves to the helper.
CAT_DOMAINS=$("${SOURCE_DIR}/cat_domains_to_regexp")
if [ -z "${CAT_DOMAINS}" ]; then
  # With an empty list the category group of the pattern would match the
  # empty string, so make the problem visible instead of staying silent.
  echo "Warning: ${SOURCE_DIR}/cat_domains_to_regexp produced no category domains" >&2
fi
# Annotated, multi-line form of the domain pattern. It is held in a
# single-quoted string so that $x and $y stay literal for Perl to interpolate;
# the '#' comments, spaces and newlines inside it are removed before use.
EXP_REGEXP='(
(?:
# $x are all valid chars except - (hyphen)
# A normal domain, min 2 chars (neither -) with 0-61 chars (incl. -) in between
$x(?:$x|-){0,61}$x
|
# $y are all category domain surrounded by (?:...)
# valid: x.<cat>.no and xx.<cat>.no, but not -.<cat>.no, -x.<cat>.no or x-.<cat>.no
(?:
$x
|
$x(?:$x|-){0,61}$x
)\.$y
)
\.no
)
(?!$x)'
# Collapse the annotated pattern into the single-line regexp handed to Perl:
# cut everything from a '#' to the end of its line, delete every space, then
# join the remaining lines by dropping the newlines.
REGEXP="$(sed -e 's/#.*$//' -e 's/ //g' <<<"$EXP_REGEXP" | tr -d '\n')"
# Print every match of the domain pattern found in the input, one per line.
# Input selection: when stdin is a terminal, exactly one file argument is
# required and that file is read; otherwise (pipe or redirect) stdin is read
# and no file argument is consulted.
if [ -t 0 ]; then
  if [ "$#" -ne 1 ]; then
    # Diagnostics go to stderr so they never mix with the extracted domains.
    echo "No STDIN input detected (non-terminal), and no file argument supplied" >&2
    exit 1
  fi
  # The file name is handed to Perl through @ARGV and opened with
  # three-argument open, so quotes, '$', '@' or a leading '>' / '|' in the
  # name are treated as part of the name instead of as Perl code or an open
  # mode. A file that cannot be opened is reported and yields a non-zero exit.
  # $x and $y are compiled once, before the read loop, not once per line.
  perl -CSD -e "
    \$x = qr/${VCHAR_EX_DASH}/i;
    \$y = qr/(?:${CAT_DOMAINS})/i;
    open(my \$in, '<', \$ARGV[0]) or die \"Cannot open '\$ARGV[0]': \$!\\n\";
    while (<\$in>) {
      while (/${REGEXP}/gi) { print \$1, \"\\n\"; }
    }
    close(\$in);
  " -- "$1"
else
  # -n wraps the code in a read loop over stdin; BEGIN compiles the two
  # sub-patterns once before the first line is processed.
  perl -CSD -ne "BEGIN { \$x=qr/${VCHAR_EX_DASH}/i; \$y=qr/(?:${CAT_DOMAINS})/i; } while(/${REGEXP}/goi) { print \$1,\"\\n\"; }"
fi