-
Notifications
You must be signed in to change notification settings - Fork 0
/
Lex.cpp
486 lines (401 loc) · 14.9 KB
/
Lex.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
#include "StdFuncs.h"
#include <string.h>
#include "Lex.h"
/**
 * Constructs a TLex that extracts tokens destructively from a writeable string.
 * An instance created with this constructor is parsed using the destructive
 * TLex::NextToken(), which writes NULL terminators directly into the string
 * passed in here.  The string can therefore only be parsed once and its contents
 * are altered by the parsing.  This is less flexible than non destructive parsing
 * but is handy for pulling out a simple series of white space separated tokens
 * without having to allocate memory into which to copy each token.
 *
 * The string's length is determined with strlen(), so it must be NULL terminated.
 *
 * @date	Saturday 24-Nov-2012 9:34 am
 * @param	a_pcString	Ptr to the writeable, NULL terminated string to be parsed for tokens
 */

TLex::TLex(char *a_pcString)
{
	/* Every member set here must also be set in the non destructive constructor */

	/* Only the writeable ptr is valid in destructive mode; the const ptrs stay unset */

	m_pcString = a_pcString;
	m_pccOriginalString = NULL;
	m_pccString = NULL;

	/* The whole of the string is available for parsing to begin with */

	m_iOriginalLength = static_cast<TInt>(strlen(a_pcString));
	m_iLength = m_iOriginalLength;

	/* Default separators: double and single quotes, space and tab */

	m_pccQuotes = "\"'";
	m_iQuotesLength = 2;
	m_pccWhitespace = " \t";
	m_iWhitespaceLength = 2;

	/* By default, quotes and white space are discarded and non alpha numeric */
	/* characters get no special treatment */

	m_bKeepNonAlphaNum = EFalse;
	m_bKeepWhitespace = EFalse;
	m_bKeepQuotes = EFalse;
}
/**
 * Constructs a TLex that extracts tokens non destructively from a read only string.
 * An instance created with this constructor is parsed using the non destructive
 * TLex::NextToken(TInt *), which never writes to the string.  Parsing can be
 * performed as many times as desired and the contents of the string are left
 * unaltered.  This is more flexible than destructive parsing but the returned
 * tokens are not NULL terminated, so the caller must use the returned length
 * and store the extracted tokens somewhere itself if it needs to.
 *
 * @date	Saturday 24-Nov-2012 9:30 am
 * @param	a_pccString	Ptr to string to be parsed for tokens
 * @param	a_iLength	The length of the string passed in, excluding NULL terminator
 */

TLex::TLex(const char *a_pccString, TInt a_iLength)
{
	/* Every member set here must also be set in the destructive constructor */

	/* Only the const ptrs are valid in non destructive mode.  The original ptr */
	/* is remembered so that scanning can later be restarted from the beginning */

	m_pcString = NULL;
	m_pccOriginalString = a_pccString;
	m_pccString = a_pccString;

	/* Likewise remember the original length alongside the remaining length */

	m_iOriginalLength = a_iLength;
	m_iLength = a_iLength;

	/* Default separators: double and single quotes, space and tab */

	m_pccQuotes = "\"'";
	m_iQuotesLength = 2;
	m_pccWhitespace = " \t";
	m_iWhitespaceLength = 2;

	/* By default, quotes and white space are discarded and non alpha numeric */
	/* characters get no special treatment */

	m_bKeepNonAlphaNum = EFalse;
	m_bKeepWhitespace = EFalse;
	m_bKeepQuotes = EFalse;
}
/**
 * Determines whether a character is one of the configured quote characters.
 * The character is compared against each entry of the internal quote list,
 * which holds the " and ' characters by default and can be replaced by the
 * user with TLex::SetQuotes().
 *
 * @date	Tuesday 26-Jul-2022 7:11 am, Code HQ Tokyo Tsukuda
 * @param	a_cCharacter	Character to be checked
 * @return	ETrue if a_cCharacter is a quote, else EFalse
 */

TBool TLex::IsQuote(char a_cCharacter)
{
	TBool Matched = EFalse;

	/* Walk the quote list, stopping as soon as a matching entry is found */

	for (TInt Position = 0; (Position < m_iQuotesLength) && !Matched; ++Position)
	{
		if (m_pccQuotes[Position] == a_cCharacter)
		{
			Matched = ETrue;
		}
	}

	return(Matched);
}
/**
 * Checks to see if a character is white space.
 * Checks a character against the contents of the white space list to see
 * if it is white space.  This function uses an internal (and user definable)
 * list to define what white space is, but also treats CR and LF implicitly
 * as white space.  CR and LF are recognised regardless of the contents of the
 * user definable list, including when that list is empty.
 *
 * @date	Wednesday 05-Dec-2012 5:37 am
 * @param	a_cCharacter	Character to be checked
 * @return	ETrue if a_cCharacter is white space, else EFalse
 */

TBool TLex::IsWhitespace(char a_cCharacter)
{
	/* CR and LF are always white space.  This is tested outside of the loop below */
	/* so that it still applies when the user definable list contains no characters */

	if ((a_cCharacter == '\r') || (a_cCharacter == '\n'))
	{
		return(ETrue);
	}

	/* Scan through the white space list and see if the character passed */
	/* in is valid white space */

	for (TInt Index = 0; Index < m_iWhitespaceLength; ++Index)
	{
		if (a_cCharacter == m_pccWhitespace[Index])
		{
			return(ETrue);
		}
	}

	return(EFalse);
}
/**
 * Counts the number of tokens contained in the TLex instance.
 * Rewinds the instance to the start of its string and repeatedly calls the
 * non destructive TLex::NextToken(TInt *) until no more tokens are returned,
 * counting the tokens without extracting them.  Because the normal token
 * extraction code is used, the count respects whatever quote, white space and
 * configuration settings have been applied to the instance.
 *
 * This is an expensive routine, as the entire string is parsed on every call,
 * and it alters the state of the instance: when it returns, the scan position
 * has been reset to the beginning of the string.  It should therefore not be
 * used as the terminating condition of a for loop; call it once and cache the
 * value instead.
 *
 * Note that this function can only operate on TLex instances that have been
 * initialised in non destructive mode.
 *
 * @date	Tuesday 23-04-2013 7:19 am
 * @return	Number of tokens contained in the TLex instance
 */

TInt TLex::Count()
{
	TInt TokenLength, NumTokens;

	ASSERTM((m_pccString != NULL), "TLex::Count() => Function operates only in non destructive mode");

	/* Rewind to the beginning of the string so that every token gets counted */

	m_iLength = m_iOriginalLength;
	m_pccString = m_pccOriginalString;

	/* Pull out tokens until there are none left.  Only the number of tokens is of */
	/* interest, so the tokens themselves and their lengths are discarded */

	NumTokens = 0;

	while (NextToken(&TokenLength) != NULL)
	{
		++NumTokens;
	}

	/* And rewind again, so that future scanning starts at the beginning */

	m_iLength = m_iOriginalLength;
	m_pccString = m_pccOriginalString;

	return(NumTokens);
}
/**
* Extracts the next available token from the string.
* Parses the string with which the TLex was initialised and returns a ptr to the
* next available token in the string. This token will be NULL terminated. The
* first call to this routine will return the first token in the string and successive
* calls will return successive tokens. This is a destructive routine in that it
* "destroys" the string passed in by writing NULL terminators into it, moving
* through the source string as tokens are extracted.
*
* @date Monday 21-Jun-2010 6:51 am
* @return The next token available in the TLex instance
*/
char *TLex::NextToken()
{
char *RetVal;
TInt Offset;
ASSERTM((m_pcString != NULL), "TLex::NextToken() => String to parse not initialised");
/* In order to use the class for destructive and non destructive parsing, while */
/* reusing parsing code ensuring that constness is respected by the non destructive */
/* parsing, we do some trickery here. It is ok to case these results as we know */
/* the strings we are working with really are writeable */
m_pccString = m_pcString;
RetVal = (char *) NextToken(&Offset);
m_pcString = (char *) m_pccString;
/* If a token was found then NULL terminate it */
if (RetVal)
{
RetVal[Offset] = '\0';
}
return(RetVal);
}
/**
 * Extracts the next available token from the string.
 * Parses the string with which the TLex was initialised and returns a ptr to the
 * next available token in the string.  This token will NOT be NULL terminated but
 * its length will instead be returned in the variable pointed to by a_piLength.
 * In the case where no token is returned, the contents of a_piLength will be set
 * to 0.  This is a non destructive routine.
 *
 * Depending on the configuration of the instance, the token returned is one of:
 * a run of leading white space (if white space is being kept), a quoted string
 * (with or without its quotes), a run of characters up to the next white space,
 * or a run of either alpha numeric or non alpha numeric characters.
 *
 * This routine never examines a character at or beyond the remaining length of
 * the string, so the string does not have to be NULL terminated and may be a
 * part of a larger buffer.
 *
 * @pre		a_piLength must not be NULL
 *
 * @date	Thursday 22-Nov-2012 6:12 am
 * @param	a_piLength	A ptr to a variable into which to place the length of the token
 * @return	The next token available in the TLex instance, or NULL if there are none
 */

const char *TLex::NextToken(TInt *a_piLength)
{
	char QuoteChar;
	const char *Scan, *RetVal;
	TBool FoundQuotes;
	TInt Index;

	ASSERTM((m_pccString != NULL), "TLex::NextToken() => String to parse not initialised");
	ASSERTM((a_piLength != NULL), "TLex::NextToken() => Address of length variable must be given");

	/* Assume the token will be extracted from the start of the string.  Scan is the */
	/* current read position and Index is the number of characters consumed so far, */
	/* which must stay below m_iLength for Scan to be safe to dereference */

	Scan = RetVal = m_pccString;
	Index = 0;

	/* NOTE(review): FoundQuotes starts out set whenever the quote list does not contain */
	/* exactly two characters, which routes every token through the "continue extracting */
	/* a quoted string" branch below.  The reason is not visible here - confirm intent */

	FoundQuotes = (m_iQuotesLength != 2);
	QuoteChar = '"';

	/* Skip past any white space at the start of the string */

	while ((Index < m_iLength) && (IsWhitespace(*Scan)))
	{
		++Scan;
		++Index;
	}

	/* If there was white space and the class is configured to keep white space then */
	/* we already have a token to return */

	if ((Index == 0) || (!(m_bKeepWhitespace)))
	{
		RetVal = Scan;

		/* If the new token starts with a quote then extract up until the next quote and include any */
		/* white space found between the start and end quote characters.  The bounds check ensures */
		/* that we do not look at the character after the end of the string */

		if ((Index < m_iLength) && IsQuote(*Scan) && !FoundQuotes)
		{
			QuoteChar = *Scan;
			++Scan;
			++Index;
			FoundQuotes = ETrue;

			/* Only skip the beginning " or ' if we are not configured to keep it */

			if (!(m_bKeepQuotes))
			{
				++RetVal;
			}

			/* Extract the string itself as the token */

			while ((Index < m_iLength) && (*Scan != QuoteChar))
			{
				++Scan;
				++Index;
			}

			/* Only extract the end " or ' if we are configured to keep it and if it actually exists */
			/* within the string */

			if ((Index < m_iLength) && (m_bKeepQuotes) && (*Scan == QuoteChar))
			{
				++Scan;
				++Index;
			}
		}

		/* If we are in the middle of extracting a quoted string then just continue extracting that */

		else if (FoundQuotes)
		{
			while ((Index < m_iLength) && !IsQuote(*Scan))
			{
				++Scan;
				++Index;
			}

			/* And extract the end " or ' if we are configured to keep it and if it actually exists */

			if ((Index < m_iLength) && m_bKeepQuotes && IsQuote(*Scan))
			{
				++Scan;
				++Index;
			}
		}

		/* If we are not configured to treat runs of non alpha numeric characters as a token, then */
		/* just extract until the next white space character is found */

		else if (!m_bKeepNonAlphaNum)
		{
			while ((Index < m_iLength) && (!(IsWhitespace(*Scan))))
			{
				++Scan;
				++Index;
			}
		}

		/* Otherwise it's a bit more complicated.  We are giving special treatment to non alpha numeric characters */
		/* and can return tokens such as "abcde" or "!(#=)", so we need to extract either a run of alpha numeric */
		/* characters or a run of non alpha numeric characters, depending on what the next character is */

		else
		{
			/* If the string is exhausted then neither loop will execute, so it does not matter which */
			/* branch is taken, but the character must not be examined */

			if ((Index < m_iLength) && isalnum((unsigned char) *Scan))
			{
				while ((Index < m_iLength) && isalnum((unsigned char) *Scan) && !IsWhitespace(*Scan) &&
					(!IsQuote(*Scan)))
				{
					++Scan;
					++Index;
				}
			}
			else
			{
				while ((Index < m_iLength) && !isalnum((unsigned char) *Scan) && !IsWhitespace(*Scan) &&
					!IsQuote(*Scan))
				{
					++Scan;
					++Index;
				}
			}
		}
	}

	/* If the token found contains any characters then determine its length */

	if (Scan > RetVal)
	{
		*a_piLength = (TInt) (Scan - RetVal);

		/* The character following the token may only be examined and skipped if it */
		/* lies within the string.  Skipping a character beyond the end would also */
		/* cause the remaining length to become negative */

		if (Index < m_iLength)
		{
			/* If we have found some white space then skip past it.  This is only */
			/* required for use by the destructive token extractor, which will put */
			/* a NULL terminator into what is currently pointed to by Scan, */
			/* thus causing the next call to this function to fail */

			if (IsWhitespace(*Scan))
			{
				/* Only skip if we are not configured to treat white space as a */
				/* token.  Note that this is incompatible with the destructive */
				/* token extractor! */

				if (!(m_bKeepWhitespace))
				{
					++Scan;
					++Index;
				}
			}

			/* If this is an end quote then skip it, but only if we are not keeping */
			/* quotes.  Otherwise if two quote separated tokens are next to one */
			/* another without whitespace, the start of the second token will be */
			/* skipped instead */

			else if ((*Scan == QuoteChar) && (FoundQuotes) && (!(m_bKeepQuotes)))
			{
				++Scan;
				++Index;
			}
		}
	}

	/* Otherwise signal that no more tokens were found */

	else
	{
		*a_piLength = 0;
		RetVal = NULL;
	}

	/* Save the current position in the string for use in the next call */

	m_pccString = Scan;
	m_iLength -= Index;

	return(RetVal);
}
/**
 * Advances the internal text ptr by a given number of characters.
 * The characters skipped are removed from the remaining length of the string,
 * so they will not be seen by subsequent token extraction.  This function
 * breaks C++ abstraction rules and is strictly use-at-your-own-risk!  It only
 * exists so that the Lex class can be used as a building block by more advanced
 * tokenising code, and should only be used if you really know what you are doing.
 *
 * @date	Tuesday 08-Jan-2013 6:34 am, Vis à Vis Hotel, Lindau
 * @param	a_iLength	Number of characters to skip
 */

void TLex::MoveForwards(TInt a_iLength)
{
	/* Consume the characters from the remaining length and step the ptr over them */

	m_iLength -= a_iLength;
	m_pccString += a_iLength;

	/* Stepping beyond the end of the string leaves a negative remaining length */

	ASSERTM((m_iLength >= 0), "TLex::MoveForwards() => Moved forwards too far");
}
/**
 * Moves the internal text ptr backwards by a certain number of characters.
 * The characters moved over are added back onto the remaining length of the
 * string, so that they will be seen again by subsequent token extraction.
 * This function should only be used if you really know what you are doing
 * as it is breaking C++ abstraction rules and is use-at-your-own-risk!
 * It is supplied only so that the Lex class can be used by more advanced
 * tokenising code.
 *
 * @pre		a_iLength must not move the ptr to before the start of the string
 *
 * @date	Tuesday 08-Jan-2013 6:24 am, Vis à Vis Hotel, Lindau
 * @param	a_iLength	Number of characters to skip backwards
 */

void TLex::MoveBackwards(TInt a_iLength)
{
	m_pccString -= a_iLength;
	m_iLength += a_iLength;

	/* The remaining length can never legitimately exceed the length of the entire */
	/* string.  If it does then the ptr has been moved to before the start of the string */

	ASSERTM((m_iLength <= m_iOriginalLength), "TLex::MoveBackwards() => Moved backwards too far");
}
/**
 * Configures which normally discarded characters the TLex class retains.
 * The three flags independently control whether quotation marks are kept as
 * a part of quoted tokens, whether white space is returned as a token and
 * whether runs of non alpha numeric characters are returned as tokens.
 *
 * Note that keeping white space is incompatible with the destructive version
 * of TLex::NextToken(), as that depends on being able to write its NULL
 * terminator into the white space that follows a token.  If you need to
 * extract white space then you will need to use the non destructive version
 * of TLex::NextToken().
 *
 * @date	Tuesday 27-Nov-2012 5:52 am
 * @param	a_bKeepQuotes		ETrue to retain the " or ' quotation marks in extracted strings
 * @param	a_bKeepWhitespace	ETrue to return white space as a token
 * @param	a_bKeepNonAlphaNum	ETrue to return non alpha numeric characters as a token
 */

void TLex::SetConfig(TBool a_bKeepQuotes, TBool a_bKeepWhitespace, TBool a_bKeepNonAlphaNum)
{
	/* The flags are simply recorded here; they are acted upon during token extraction */

	m_bKeepNonAlphaNum = a_bKeepNonAlphaNum;
	m_bKeepWhitespace = a_bKeepWhitespace;
	m_bKeepQuotes = a_bKeepQuotes;
}
/**
* Sets the user definable quoted character list.
* Sets the list of characters that are treated as delimeters for quoted strings. By
* default, these are the " and ' characters.
*
* @date Friday 21-Mar-2014 7:14 am
* @param a_pccQuotes Ptr to string containing the new quote characters.
* Contents must be valid for the duration of the class's use
*/
void TLex::SetQuotes(const char *a_pccQuotes)
{
ASSERTM((a_pccQuotes != NULL), "TLex::SetQuotes() => Ptr to quote string can not be NULL");
m_pccQuotes = a_pccQuotes;
m_iQuotesLength = (TInt) strlen(a_pccQuotes);
}
/**
* Sets the user definable white space character list.
* Sets the white space character list so that other characters can be treated as
* white space. For example, to parse the string ".cpp;.c;.h" into its separate
* tokens you would use a white space string of ";". Note that white space is
* treated specially by the destructive extraction routines and thus setting the
* white space separator is only allowed for the non destructive routines.
*
* @date Wednesday 05-Dec-2012 5:30 am
* @param a_pccWhitespace Ptr to string containing the new white space characters
* Contents must be valid for the duration of the class's use
*/
void TLex::SetWhitespace(const char *a_pccWhitespace)
{
ASSERTM((a_pccWhitespace != NULL), "TLex::SetWhitespace() => Ptr to white space can not be NULL");
ASSERTM((m_pcString == NULL), "TLex::SetWhitespace() => Alternate white space can only be used for non destructive extraction");
m_pccWhitespace = a_pccWhitespace;
m_iWhitespaceLength = (TInt) strlen(a_pccWhitespace);
}