-
Notifications
You must be signed in to change notification settings - Fork 0
/
infotype
164 lines (139 loc) · 3.83 KB
/
infotype
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/awk -f
# Analyze a Markdown (MDITA) file
# and tag it with a topic type
# based on its content. Patterns
# and weights can be adjusted to suit.
#
# Usage: awk -f [me] [-v var=value] file.md
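# Variables (set with -v):
# DEBUG - print debug messages if true (default: 0)
# generic - topic type to apply when no test scores (default: blank,
#           which leaves the heading untagged, i.e. a plain "topic")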
BEGIN {
    # Collapse unset or false -v settings to their defaults.
    if( !DEBUG )
        DEBUG = 0;
    # A blank generic means "no fallback type": files without a clear
    # winner are left untagged (blank means type "topic").
    if( !generic )
        generic = "";
    # Build the weight table (see end of script) and remember how many
    # tests the per-line loop has to run.
    numtests = weight_setup();
}
FILENAME != oldfile {
    # First line of a new input file: settle the score for the file that
    # just ended, then start over with a clean slate.
    type = get_weight();
    type = type ? type : generic;
    if( oldfile != "" && type ) {
        sysstr = parse_hdg( hdg1, oldfile, type );
        if( DEBUG )
            print "###parse_hdg returns:", sysstr;
        if( sysstr )
            system( sysstr );
    }
    # reset all per-file state
    oldfile = FILENAME;
    hdg1 = "";
    delete tests;
    delete results;
}
$0 ~ /^# / {
    # Remember the level-1 heading for parse_hdg(). There is no "next"
    # here, so the heading line is still scored by the tests as well.
    hdg1 = $0;
}
{
    # Score the current line against every test that has not fired yet
    # for this file; each test adds its weight at most once per file.
    for( i = 1; i <= numtests; i++ ) {
        if( tests[i] )
            continue;               # already passed earlier in this file
        if( $0 !~ weight[i,"pattern"] )
            continue;               # no match on this line
        tests[i] = 1;
        results[weight[i,"topictype"]] += weight[i,"weight"];
    }
    next;
}
END {
    # The last input file never triggers a FILENAME change, so it is
    # scored and tagged here instead.
    type = get_weight();
    if( !type ) type = generic;
    # Compare oldfile against "" (the same test the new-file rule uses)
    # rather than testing its truth value: FILENAME is a numeric string,
    # so a file literally named "0" would otherwise be skipped.
    if( type && oldfile != "" ) {
        sysstr = parse_hdg( hdg1, oldfile, type );
        if( DEBUG ) print "###parse_hdg returns:", sysstr;
        if( sysstr ) system( sysstr );
    }
}
# # # # functions follow # # # #
# Pick the winning topic type from the global results array.
# Returns the type with the strictly highest accumulated score, or ""
# when nothing scored above zero (the caller then falls back to the
# generic global). On a tie, the type visited first by the scan wins.
# best, top and kind are local scratch variables; callers pass nothing.
function get_weight(    best, top, kind ) {
    best = "";
    top = 0;
    for( kind in results ) {
        if( results[kind] <= top )
            continue;
        top = results[kind];
        best = kind;
    }
    if( DEBUG )
        print "###get_weight: hiscore=" top "; type=" best;
    return best;
}
# Build the shell command that adds a topic type to a file's "# " heading.
#   hd      - the saved level-1 heading line ("" if none was seen)
#   fn      - name of the file to edit in place
#   newtype - topic type to add, e.g. "task"
#   str     - local scratch variable; callers do not pass it
# Returns a "sed -i" command line, or "" when nothing should be done
# (no heading to tag, or the heading already carries a class).
function parse_hdg( hd, fn, newtype, str ) {
    # no heading: the sed commands below only touch "# " lines, so there
    # is nothing to edit and no reason to spawn a process
    if( hd == "" )
        return "";
    # skip if the heading already has a topic type or outputclass
    if( hd ~ /{.*\.[a-z]/ )
        return "";
    # Quote the file name for the shell: wrap it in single quotes and
    # rewrite every embedded ' as '\'' so spaces and metacharacters in
    # the name are passed to sed literally. The leading "--" stops a
    # name that starts with "-" from being read as a sed option.
    gsub( /'/, "'\\''", fn );
    fn = "-- '" fn "'";
    # check for ID construct: {#idname}
    # -> insert the type right after the opening brace
    if( hd ~ /{.*#[a-zA-Z]/ ) {
        str = "sed -i -e '/^# /s/{/{." newtype " /' " fn;
        return str;
    }
    # default, no ID/outputclass: append a new attribute block
    str = "sed -i -e '/^# /s/$/ {." newtype "}/' " fn;
    return str;
}
# You can add your own rules here

# Store one rule in the global weight table at slot n.
#   n         - slot number (1-based)
#   pattern   - regular expression tested against each input line
#   topictype - topic type credited when the pattern matches
#   score     - amount added to that type's total
# Returns the next free slot number.
function add_weight_rule( n, pattern, topictype, score ) {
    weight[n,"pattern"] = pattern;
    weight[n,"topictype"] = topictype;
    weight[n,"weight"] = score;
    return n + 1;
}

# Fill the global weight table.
# format: pattern, topic type, weight
# returns: number of entries
# w is a local scratch variable; callers do not pass it.
function weight_setup( w ) {
    w = 1;      # next free slot

    # bullet list = concept (maybe)
    w = add_weight_rule( w, "^[ \t]*\\* +", "concept", 0.5 );

    # numlist = task
    w = add_weight_rule( w, "^[ \t]*[0-9]+\\. +", "task", 1.5 );

    # first word of title ending in "ing" = task
    # doesn't always seem to help
    # w = add_weight_rule( w, "^# +[^ ]*ing ", "task", 0.5 );

    # deflist = reference
    w = add_weight_rule( w, "^ *: *", "reference", 1.0 );

    # table = lean toward reference
    w = add_weight_rule( w, "^\\| ", "reference", 0.5 );

    # tables might be HTML
    w = add_weight_rule( w, "<table", "reference", 0.5 );

    # "About" in the heading is usually a concept
    w = add_weight_rule( w, "^# .*[Aa]bout", "concept", 1.5 );

    # "Commands" in the heading is usually a reference
    w = add_weight_rule( w, "^# .*[Cc]ommands", "reference", 1.5 );

    # "Syntax" at the beginning of a paragraph is prob. a reference
    w = add_weight_rule( w, "^Syntax:", "reference", 1.0 );

    # w is the next free slot, so the number of rules stored is w - 1.
    # Returning w itself would make the per-line loop also test a
    # non-existent rule, whose empty pattern matches every line.
    if( DEBUG ) print "###weight_setup:", w - 1, "tests read.";
    return w - 1;
}