vul_reg_exp.cxx
Go to the documentation of this file.
1 // This is core/vul/vul_reg_exp.cxx
2 #include <iostream>
3 #include <cstring>
4 #include <cstddef>
5 #include "vul_reg_exp.h"
6 //:
7 // \file
8 //
9 // Copyright (C) 1991 Texas Instruments Incorporated.
10 //
11 // Permission is granted to any individual or institution to use, copy, modify,
12 // and distribute this software, provided that this complete copyright and
13 // permission notice is maintained, intact, in all copies and supporting
14 // documentation.
15 //
16 // Texas Instruments Incorporated provides this software "as is" without
17 // express or implied warranty.
18 //
19 //
20 // Created: MNF Jun 13, 1989 Initial Design and Implementation
21 // Updated: LGO Aug 09, 1989 Inherit from Generic
22 // Updated: MBN Sep 07, 1989 Added conditional exception handling
23 // Updated: MBN Dec 15, 1989 Sprinkled "const" qualifiers all over the place!
24 // Updated: DLS Mar 22, 1991 New lite version
25 //
26 // This is the header file for the regular expression class. An object of
27 // this class contains a regular expression, in a special "compiled" format.
28 // This compiled format consists of several slots all kept as the objects
29 // private data. The vul_reg_exp class provides a convenient way to represent
30 // regular expressions. It makes it easy to search for the same regular
31 // expression in many different strings without having to compile a string to
32 // regular expression format more than necessary.
33 //
34 // A regular expression allows a programmer to specify complex patterns that
35 // can be searched for and matched against the character string of a String
36 // object. In its simplest case, a regular expression is a sequence of
37 // characters with which you can search for exact character matches. However,
38 // many times you may not know the exact sequence you want to find, or you may
39 // only want to find a match at the beginning or end of a String. The vul_reg_exp
40 // object allows specification of such patterns by utilizing the following
41 // regular expression meta-characters (note that more one of these
42 // meta-characters can be used in a single regular expression in order to
43 // create complex search patterns):
44 //
45 // - ^ Match at beginning of line
46 // - $ Match at end of line
47 // - . Match any single character
48 // - [ ] Match any one character inside the brackets
49 // - [^ ] Match any character NOT inside the brackets
50 // - - Match any character in range on either side of dash
51 // - * Match preceding pattern zero or more times
52 // - + Match preceding pattern one or more times
53 // - ? Match preceding pattern zero or once only
54 // - () Save a matched expression and use it in a further match.
55 //
56 // There are three constructors for vul_reg_exp. One just creates an empty vul_reg_exp
57 // object. Another creates a vul_reg_exp object and initializes it with a regular
58 // expression that is given in the form of a char*. The third takes a
59 // reference to a vul_reg_exp object as an argument and creates an object
60 // initialized with the information from the given vul_reg_exp object.
61 //
62 // The find member function finds the first occurrence of the regular
63 // expression of that object in the string given to find as an argument. Find
64 // returns a boolean, and if true, mutates the private data appropriately.
65 // Find sets pointers to the beginning and end of the thing last found, they
66 // are pointers into the actual string that was searched. The start and end
67 // member functions return indices into the searched string that correspond
68 // to the beginning and end pointers respectively. The compile member
69 // function takes a char* and puts the compiled version of the char* argument
70 // into the object's private data fields. The == and != operators only check
71 // the to see if the compiled regular expression is the same, and the
72 // deep_equal functions also checks to see if the start and end pointers are
73 // the same. The is_valid function returns false if program is set to NULL,
74 // (i.e. there is no valid compiled expression). The set_invalid function sets
75 // the program to NULL (Warning: this deletes the compiled expression). The
76 // following examples may help clarify regular expression usage:
77 //
78 // * The regular expression "^hello" matches a "hello" only at the
79 // beginning of a line. It would match "hello there" but not "hi,
80 // hello there".
81 //
82 // * The regular expression "long$" matches a "long" only at the end
83 // of a line. It would match "so long\0", but not "long ago".
84 //
85 // * The regular expression "t..t..g" will match anything that has a
86 // "t" then any two characters, another "t", any two characters and
87 // then a "g". It will match "testing", or "test again" but would
88 // not match "toasting"
89 //
90 // * The regular expression "[1-9ab]" matches any number one through
91 // nine, and the characters "a" and "b". It would match "hello 1"
92 // or "begin", but would not match "no-match".
93 //
94 // * The regular expression "[^1-9ab]" matches any character that is
95 // not a number one through nine, or an "a" or "b". It would NOT
96 // match "hello 1" or "begin", but would match "no-match".
97 //
98 // * The regular expression "br* " matches something that begins with
99 // a "b", is followed by zero or more "r"s, and ends in a space. It
100 // would match "brrrrr ", and "b ", but would not match "brrh ".
101 //
102 // * The regular expression "br+ " matches something that begins with
103 // a "b", is followed by one or more "r"s, and ends in a space. It
104 // would match "brrrrr ", and "br ", but would not match "b " or
105 // "brrh ".
106 //
107 // * The regular expression "br? " matches something that begins with
108 // a "b", is followed by zero or one "r"s, and ends in a space. It
109 // would match "br ", and "b ", but would not match "brrrr " or
110 // "brrh ".
111 //
112 // * The regular expression "(..p)b" matches something ending with pb
113 // and beginning with whatever the two characters before the first p
114 // encountered in the line were. It would find "repb" in "rep drepa
115 // qrepb". The regular expression "(..p)a" would find "repa qrepb"
116 // in "rep drepa qrepb"
117 //
118 // * The regular expression "d(..p)" matches something ending with p,
119 // beginning with d, and having two characters in between that are
120 // the same as the two characters before the first p encountered in
121 // the line. It would match "drepa qrepb" in "rep drepa qrepb".
122 
123 #ifdef _MSC_VER
124 # include <vcl_msvc_warnings.h>
125 #endif
126 
127 //: Copies the given regular expression.
128 
130 {
131  int ind;
132  this->progsize = rxp.progsize; // Copy regular expression size
133  this->program = new char[this->progsize]; // Allocate storage
134  for (ind=this->progsize; ind-- != 0;) // Copy regular expression
135  this->program[ind] = rxp.program[ind];
136  this->startp[0] = rxp.startp[0]; // Copy pointers into last
137  this->endp[0] = rxp.endp[0]; // Successful "find" operation
138  this->regmust = rxp.regmust; // Copy field
139  if (rxp.regmust != nullptr) {
140  char* dum = rxp.program;
141  ind = 0;
142  while (dum != rxp.regmust) {
143  ++dum;
144  ++ind;
145  }
146  this->regmust = this->program + ind;
147  }
148  this->regstart = rxp.regstart; // Copy starting index
149  this->reganch = rxp.reganch; // Copy remaining private data
150  this->regmlen = rxp.regmlen; // Copy remaining private data
151 }
152 
153 
154 //: Returns true if two regular expressions have the same compiled program for pattern matching.
155 
156 bool vul_reg_exp::operator== (vul_reg_exp const& rxp) const
157 {
158  if (this != &rxp) { // Same address?
159  int ind = this->progsize; // Get regular expression size
160  if (ind != rxp.progsize) // If different size regexp
161  return false; // Return failure
162  while (ind-- != 0) // Else while still characters
163  if (this->program[ind] != rxp.program[ind])// If regexp are different
164  return false; // Return failure
165  }
166  return true; // Else same, return success
167 }
168 
169 
170 //: Returns true if have the same compiled regular expressions and the same start and end pointers.
171 
172 bool vul_reg_exp::deep_equal (vul_reg_exp const& rxp) const
173 {
174  int ind = this->progsize; // Get regular expression size
175  if (ind != rxp.progsize) // If different size regexp
176  return false; // Return failure
177  while (ind-- != 0) // Else while still characters
178  if (this->program[ind] != rxp.program[ind]) // If regexp are different
179  return false; // Return failure
180  return this->startp[0] == rxp.startp[0] && // Else if same start/end ptrs,
181  this->endp[0] == rxp.endp[0]; // Return true
182 }
183 
184 
185 // The remaining code in this file is derived from the regular expression code
186 // whose copyright statement appears below. It has been changed to work
187 // with the class concepts of C++ and COOL.
188 
189 //
190 // compile and find
191 //
192 // Copyright (c) 1986 by University of Toronto.
193 // Written by Henry Spencer. Not derived from licensed software.
194 //
195 // Permission is granted to anyone to use this software for any
196 // purpose on any computer system, and to redistribute it freely,
197 // subject to the following restrictions:
198 //
199 // 1. The author is not responsible for the consequences of use of
200 // this software, no matter how awful, even if they arise
201 // from defects in it.
202 //
203 // 2. The origin of this software must not be misrepresented, either
204 // by explicit claim or by omission.
205 //
206 // 3. Altered versions must be plainly marked as such, and must not
207 // be misrepresented as being the original software.
208 //
209 // Beware that some of this code is subtly aware of the way operator
210 // precedence is structured in regular expressions. Serious changes in
211 // regular-expression syntax might require a total rethink.
212 //
213 
214 //
215 // The "internal use only" fields in regexp.h are present to pass info from
216 // compile to execute that permits the execute phase to run lots faster on
217 // simple cases. They are:
218 //
219 // regstart char that must begin a match; '\0' if none obvious
220 // reganch is the match anchored (at beginning-of-line only)?
221 // regmust string (pointer into program) that match must include, or NULL
222 // regmlen length of regmust string
223 //
224 // Regstart and reganch permit very fast decisions on suitable starting points
225 // for a match, cutting down the work a lot. Regmust permits fast rejection
226 // of lines that cannot possibly match. The regmust tests are costly enough
227 // that compile() supplies a regmust only if the r.e. contains something
228 // potentially expensive (at present, the only such thing detected is * or +
229 // at the start of the r.e., which can involve a lot of backup). Regmlen is
230 // supplied because the test in find() needs it and compile() is computing
231 // it anyway.
232 //
233 
234 //
235 // Structure for regexp "program". This is essentially a linear encoding
236 // of a nondeterministic finite-state machine (aka syntax charts or
237 // "railroad normal form" in parsing technology). Each node is an opcode
238 // plus a "next" pointer, possibly plus an operand. "Next" pointers of
239 // all nodes except BRANCH implement concatenation; a "next" pointer with
240 // a BRANCH on both ends of it is connecting two alternatives. (Here we
241 // have one of the subtle syntax dependencies: an individual BRANCH (as
242 // opposed to a collection of them) is never concatenated with anything
243 // because of operator precedence.) The operand of some types of node is
244 // a literal string; for others, it is a node leading into a sub-FSM. In
245 // particular, the operand of a BRANCH node is the first node of the branch.
246 // (NB this is \e not a tree structure: the tail of the branch connects
247 // to the thing following the set of BRANCHes.) The opcodes are:
248 //
249 
250 // definition number opnd? meaning
251 #define END 0 // no End of program.
252 #define BOL 1 // no Match "" at beginning of line.
253 #define EOL 2 // no Match "" at end of line.
254 #define ANY 3 // no Match any one character.
255 #define ANYOF 4 // str Match any character in this string.
256 #define ANYBUT 5 // str Match any character not in this string.
257 #define BRANCH 6 // node Match this alternative, or the next...
258 #define BACK 7 // no Match "", "next" ptr points backward.
259 #define EXACTLY 8 // str Match this string.
260 #define NOTHING 9 // no Match empty string.
261 #define STAR 10 // node Match this (simple) thing 0 or more times.
262 #define PLUS 11 // node Match this (simple) thing 1 or more times.
263 #define OPEN 20 // no Mark this point in input as start of #n.
264 // OPEN+1 is number 1, etc.
265 #define CLOSE 30 // no Analogous to OPEN.
266 
267 //
268 // Opcode notes:
269 //
270 // BRANCH The set of branches constituting a single choice are hooked
271 // together with their "next" pointers, since precedence prevents
272 // anything being concatenated to any individual branch. The
273 // "next" pointer of the last BRANCH in a choice points to the
274 // thing following the whole choice. This is also where the
275 // final "next" pointer of each individual branch points; each
276 // branch starts with the operand node of a BRANCH node.
277 //
278 // BACK Normal "next" pointers all implicitly point forward; BACK
279 // exists to make loop structures possible.
280 //
281 // STAR,PLUS '?', and complex '*' and '+', are implemented as circular
282 // BRANCH structures using BACK. Simple cases (one character
283 // per match) are implemented with STAR and PLUS for speed
284 // and to minimize recursive plunges.
285 //
286 // OPEN,CLOSE ...are numbered at compile time.
287 //
288 
289 //
290 // A node is one char of opcode followed by two chars of "next" pointer.
291 // "Next" pointers are stored as two 8-bit pieces, high order first. The
292 // value is a positive offset from the opcode of the node containing it.
293 // An operand, if any, simply follows the node. (Note that much of the
294 // code generation knows about this implicit relationship.)
295 //
296 // Using two bytes for the "next" pointer is vast overkill for most things,
297 // but allows patterns to get big without disasters.
298 //
299 
300 #define OP(p) (*(p))
301 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
302 #define OPERAND(p) ((p) + 3)
303 
304 constexpr unsigned char MAGIC = 0234;
305 
306 //
307 // Utility definitions.
308 //
309 #define UCHARAT(p) ((const unsigned char*)(p))[0]
310 
311 #define FAIL(m) { regerror(m); return NULL; }
312 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
313 #define META "^$.[()|?+*\\"
314 
315 //
316 // Flags to be passed up and down.
317 //
318 #define HASWIDTH 01 // Known never to match null string.
319 #define SIMPLE 02 // Simple enough to be STAR/PLUS operand.
320 #define SPSTART 04 // Starts with * or +.
321 #define WORST 0 // Worst case.
322 
323 
324 //: Return an expression that will match precisely c
325 // The returned string is owned by the function, and
326 // will be overwritten in subsequent calls.
327 const char * vul_reg_exp::protect(char c)
328 {
329  //: This should be in thread local storage.
330  static char pattern[3];
331 
332  if (std::strchr(META, c) != nullptr)
333  {
334  pattern[0] = '\\';
335  pattern[1] = c;
336  pattern[2] = 0;
337  }
338  else
339  {
340  pattern[0] = c;
341  pattern[1] = 0;
342  }
343  return pattern;
344 }
345 
346 
347 /////////////////////////////////////////////////////////////////////////
348 //
349 // COMPILE AND ASSOCIATED FUNCTIONS
350 //
351 /////////////////////////////////////////////////////////////////////////
352 
353 
354 //
355 // Global work variables for compile().
356 //
357 static const char* regparse; // Input-scan pointer.
358 static int regnpar; // () count.
359 static char regdummy;
360 static char* regcode; // Code-emit pointer; &regdummy = don't.
361 static long regsize; // Code size.
362 
363 //
364 // Forward declarations for compile()'s friends.
365 //
366 static char* reg (int, int*);
367 static char* regbranch (int*);
368 static char* regpiece (int*);
369 static char* regatom (int*);
370 static char* regnode (char);
371 static const char* regnext ( const char*);
372 static char* regnext ( char*);
373 static void regc (unsigned char);
374 static void reginsert (char, char*);
375 static void regtail (char*, const char*);
376 static void regoptail (char*, const char*);
377 
378 
379 //
380 // We can't allocate space until we know how big the compiled form will be,
381 // but we can't compile it (and thus know how big it is) until we've got a
382 // place to put the code. So we cheat: we compile it twice, once with code
383 // generation turned off and size counting turned on, and once "for real".
384 // This also means that we don't allocate space until we are sure that the
385 // thing really will compile successfully, and we never have to move the
386 // code and thus invalidate pointers into it. (Note that it has to be in
387 // one piece because free() must be able to free it all.)
388 //
389 // Beware that the optimization-preparation code in here knows about some
390 // of the structure of the compiled regexp.
391 //
392 
393 
394 //: Compile a regular expression into internal code for later pattern matching.
395 
396 void vul_reg_exp::compile (char const* exp)
397 {
398  const char* scan;
399  const char* longest;
400  unsigned long len;
401  int flags;
402 
403  if (exp == nullptr) {
404  //RAISE Error, SYM(vul_reg_exp), SYM(No_Expr),
405  std::cout << "vul_reg_exp::compile(): No expression supplied.\n";
406  return;
407  }
408 
409  // First pass: determine size, legality.
410  regparse = exp;
411  regnpar = 1;
412  regsize = 0L;
413  regcode = &regdummy;
414  regc(MAGIC);
415  if (!reg(0, &flags))
416  {
417  std::cout << "vul_reg_exp::compile(): Error in compile.\n";
418  return;
419  }
420  this->startp[0] = this->endp[0] = this->searchstring = nullptr;
421 
422  // Small enough for pointer-storage convention?
423  if (regsize >= 32767L) // Probably could be 65535L.
424  {
425  //RAISE Error, SYM(vul_reg_exp), SYM(Expr_Too_Big),
426  std::cout << "vul_reg_exp::compile(): Expression too big.\n";
427  return;
428  }
429 
430  // Allocate space.
431 //#ifndef _WIN32
432  if (this->program != nullptr) delete [] this->program;
433 //#endif
434  this->program = new char[regsize];
435  this->progsize = (int) regsize;
436 
437  if (this->program == nullptr) {
438  //RAISE Error, SYM(vul_reg_exp), SYM(Out_Of_Memory),
439  std::cout << "vul_reg_exp::compile(): Out of memory.\n";
440  return;
441  }
442 
443  // Second pass: emit code.
444  regparse = exp;
445  regnpar = 1;
446  regcode = this->program;
447  regc(MAGIC);
448  reg(0, &flags);
449 
450  // Dig out information for optimizations.
451  this->regstart = '\0'; // Worst-case defaults.
452  this->reganch = 0;
453  this->regmust = nullptr;
454  this->regmlen = 0;
455  scan = this->program + 1; // First BRANCH.
456  if (OP(regnext(scan)) == END) // Only one top-level choice.
457  {
458  scan = OPERAND(scan);
459 
460  // Starting-point info.
461  if (OP(scan) == EXACTLY)
462  this->regstart = *OPERAND(scan);
463  else if (OP(scan) == BOL)
464  this->reganch++;
465 
466  //
467  // If there's something expensive in the r.e., find the longest
468  // literal string that must appear and make it the regmust. Resolve
469  // ties in favor of later strings, since the regstart check works
470  // with the beginning of the r.e. and avoiding duplication
471  // strengthens checking. Not a strong reason, but sufficient in the
472  // absence of others.
473  //
474  if (flags & SPSTART) {
475  longest = nullptr;
476  len = 0L;
477  for (; scan != nullptr; scan = regnext(scan))
478  if (OP(scan) == EXACTLY && std::strlen(OPERAND(scan)) >= len) {
479  longest = OPERAND(scan);
480  len = (unsigned long)std::strlen(OPERAND(scan));
481  }
482  this->regmust = longest;
483  this->regmlen = (int)len;
484  }
485  }
486 }
487 
488 
489 // regular expression, i.e. main body or parenthesized thing
490 //
491 // Caller must absorb opening parenthesis.
492 //
493 // Combining parenthesis handling with the base level of regular expression
494 // is a trifle forced, but the need to tie the tails of the branches to what
495 // follows makes it hard to avoid.
496 //
497 static char* reg (int paren, int *flagp)
498 {
499  char* ret;
500  char* br;
501  char* ender;
502  int parno =0;
503  int flags;
504 
505  *flagp = HASWIDTH; // Tentatively.
506 
507  // Make an OPEN node, if parenthesized.
508  if (paren) {
509  if (regnpar >= vul_reg_exp_nsubexp) {
510  //RAISE Error, SYM(vul_reg_exp), SYM(Too_Many_Parens),
511  std::cout << "vul_reg_exp::compile(): Too many parentheses.\n";
512  return nullptr;
513  }
514  parno = regnpar;
515  regnpar++;
516  ret = regnode(char(OPEN + parno));
517  }
518  else
519  ret = nullptr;
520 
521  // Pick up the branches, linking them together.
522  br = regbranch(&flags);
523  if (br == nullptr)
524  return nullptr;
525  if (ret != nullptr)
526  regtail(ret, br); // OPEN -> first.
527  else
528  ret = br;
529  if (!(flags & HASWIDTH))
530  *flagp &= ~HASWIDTH;
531  *flagp |= flags & SPSTART;
532  while (*regparse == '|')
533  {
534  regparse++;
535  br = regbranch(&flags);
536  if (br == nullptr)
537  return nullptr;
538  regtail(ret, br); // BRANCH -> BRANCH.
539  if (!(flags & HASWIDTH))
540  *flagp &= ~HASWIDTH;
541  *flagp |= flags & SPSTART;
542  }
543 
544  // Make a closing node, and hook it on the end.
545  ender = regnode((paren) ? char(CLOSE + parno) : char(END));
546  regtail(ret, ender);
547 
548  // Hook the tails of the branches to the closing node.
549  for (br = ret; br != nullptr; br = regnext(br))
550  regoptail(br, ender);
551 
552  // Check for proper termination.
553  if (paren && *regparse++ != ')') {
554  //RAISE Error, SYM(vul_reg_exp), SYM(Unmatched_Parens),
555  std::cout << "vul_reg_exp::compile(): Unmatched parentheses.\n";
556  return nullptr;
557  }
558  else if (!paren && *regparse != '\0') {
559  if (*regparse == ')') {
560  //RAISE Error, SYM(vul_reg_exp), SYM(Unmatched_Parens),
561  std::cout << "vul_reg_exp::compile(): Unmatched parentheses.\n";
562  return nullptr;
563  }
564  else {
565  //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
566  std::cout << "vul_reg_exp::compile(): Internal error.\n";
567  return nullptr;
568  }
569  // NOTREACHED
570  }
571  return ret;
572 }
573 
574 
575 // one alternative of an | operator
576 //
577 // Implements the concatenation operator.
578 //
579 static char* regbranch (int *flagp)
580 {
581  char* ret;
582  char* chain;
583  char* latest;
584  int flags;
585 
586  *flagp = WORST; // Tentatively.
587 
588  ret = regnode(BRANCH);
589  chain = nullptr;
590  while (*regparse != '\0' && *regparse != '|' && *regparse != ')')
591  {
592  latest = regpiece(&flags);
593  if (latest == nullptr)
594  return nullptr;
595  *flagp |= flags & HASWIDTH;
596  if (chain == nullptr) // First piece.
597  *flagp |= flags & SPSTART;
598  else
599  regtail(chain, latest);
600  chain = latest;
601  }
602  if (chain == nullptr) // Loop ran zero times.
603  regnode(NOTHING);
604 
605  return ret;
606 }
607 
608 
609 //
610 // something followed by possible [*+?]
611 //
612 // Note that the branching code sequences used for ? and the general cases
613 // of * and + are somewhat optimized: they use the same NOTHING node as
614 // both the endmarker for their branch list and the body of the last branch.
615 // It might seem that this node could be dispensed with entirely, but the
616 // endmarker role is not redundant.
617 //
618 static char* regpiece (int *flagp)
619 {
620  char* ret;
621  char op;
622  char* next;
623  int flags;
624 
625  ret = regatom(&flags);
626  if (ret == nullptr)
627  return nullptr;
628 
629  op = *regparse;
630  if (!ISMULT(op)) {
631  *flagp = flags;
632  return ret;
633  }
634 
635  if (!(flags & HASWIDTH) && op != '?') {
636  //RAISE Error, SYM(vul_reg_exp), SYM(Empty_Operand),
637  std::cout << "vul_reg_exp::compile() : *+ operand could be empty.\n";
638  return nullptr;
639  }
640  *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
641 
642  if (op == '*' && (flags & SIMPLE))
643  reginsert(STAR, ret);
644  else if (op == '*') {
645  // Emit x* as (x&|), where & means "self".
646  reginsert(BRANCH, ret); // Either x
647  regoptail(ret, regnode(BACK)); // and loop
648  regoptail(ret, ret); // back
649  regtail(ret, regnode(BRANCH)); // or
650  regtail(ret, regnode(NOTHING)); // null.
651  }
652  else if (op == '+' && (flags & SIMPLE))
653  reginsert(PLUS, ret);
654  else if (op == '+') {
655  // Emit x+ as x(&|), where & means "self".
656  next = regnode(BRANCH); // Either
657  regtail(ret, next);
658  regtail(regnode(BACK), ret); // loop back
659  regtail(next, regnode(BRANCH)); // or
660  regtail(ret, regnode(NOTHING)); // null.
661  }
662  else if (op == '?') {
663  // Emit x? as (x|)
664  reginsert(BRANCH, ret); // Either x
665  regtail(ret, regnode(BRANCH)); // or
666  next = regnode(NOTHING);// null.
667  regtail(ret, next);
668  regoptail(ret, next);
669  }
670  regparse++;
671  if (ISMULT(*regparse)) {
672  //RAISE Error, SYM(vul_reg_exp), SYM(Nested_Operand),
673  std::cout << "vul_reg_exp::compile(): Nested *?+.\n";
674  return nullptr;
675  }
676  return ret;
677 }
678 
679 
680 // the lowest level
681 //
682 // Optimization: gobbles an entire sequence of ordinary characters so that
683 // it can turn them into a single node, which is smaller to store and
684 // faster to run. Backslashed characters are exceptions, each becoming a
685 // separate node; the code is simpler that way and it's not worth fixing.
686 //
687 static char* regatom (int *flagp)
688 {
689  char* ret;
690  int flags;
691 
692  *flagp = WORST; // Tentatively.
693 
694  switch (*regparse++)
695  {
696  case '^':
697  ret = regnode(BOL);
698  break;
699  case '$':
700  ret = regnode(EOL);
701  break;
702  case '.':
703  ret = regnode(ANY);
704  *flagp |= HASWIDTH | SIMPLE;
705  break;
706  case '[':
707  {
708  int rxpclass;
709  int rxpclassend;
710 
711  if (*regparse == '^') { // Complement of range.
712  ret = regnode(ANYBUT);
713  regparse++;
714  }
715  else
716  ret = regnode(ANYOF);
717  if (*regparse == ']' || *regparse == '-')
718  regc(*regparse++);
719  while (*regparse != '\0' && *regparse != ']')
720  {
721  if (*regparse == '-')
722  {
723  regparse++;
724  if (*regparse == ']' || *regparse == '\0')
725  regc('-');
726  else {
727  rxpclass = UCHARAT(regparse - 2) + 1;
728  rxpclassend = UCHARAT(regparse);
729  if (rxpclass > rxpclassend + 1) {
730  //RAISE Error, SYM(vul_reg_exp), SYM(Invalid_Range),
731  std::cout << "vul_reg_exp::compile(): Invalid range in [].\n";
732  return nullptr;
733  }
734  for (; rxpclass <= rxpclassend; rxpclass++)
735  regc(static_cast<unsigned char>(rxpclass));
736  regparse++;
737  }
738  }
739  else
740  regc(*regparse++);
741  }
742  regc('\0');
743  if (*regparse != ']') {
744  //RAISE Error, SYM(vul_reg_exp), SYM(Unmatched_Bracket),
745  std::cout << "vul_reg_exp::compile(): Unmatched [].\n";
746  return nullptr;
747  }
748  regparse++;
749  *flagp |= HASWIDTH | SIMPLE;
750  break;
751  }
752  case '(':
753  ret = reg(1, &flags);
754  if (ret == nullptr)
755  return nullptr;
756  *flagp |= flags & (HASWIDTH | SPSTART);
757  break;
758  case '\0':
759  case '|':
760  case ')':
761  //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
762  std::cout << "vul_reg_exp::compile(): Internal error.\n"; // Never here
763  return nullptr;
764  case '?':
765  case '+':
766  case '*':
767  //RAISE Error, SYM(vul_reg_exp), SYM(No_Operand),
768  std::cout << "vul_reg_exp::compile(): ?+* follows nothing.\n";
769  return nullptr;
770  case '\\':
771  if (*regparse == '\0') {
772  //RAISE Error, SYM(vul_reg_exp), SYM(Trailing_Backslash),
773  std::cout << "vul_reg_exp::compile(): Trailing backslash.\n";
774  return nullptr;
775  }
776  ret = regnode(EXACTLY);
777  regc(*regparse++);
778  regc('\0');
779  *flagp |= HASWIDTH | SIMPLE;
780  break;
781  default:
782  {
783  int len;
784  char ender;
785 
786  regparse--;
787  len = (int)std::strcspn(regparse, META);
788  if (len <= 0) {
789  //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
790  std::cout << "vul_reg_exp::compile(): Internal error.\n";
791  return nullptr;
792  }
793  ender = *(regparse + len);
794  if (len > 1 && ISMULT(ender))
795  len--; // Back off clear of ?+* operand.
796  *flagp |= HASWIDTH;
797  if (len == 1)
798  *flagp |= SIMPLE;
799  ret = regnode(EXACTLY);
800  while (len > 0) {
801  regc(*regparse++);
802  len--;
803  }
804  regc('\0');
805  break;
806  }
807  }
808  return ret;
809 }
810 
811 
812 // emit a node
813 // Location.
814 //
815 static char* regnode (char op)
816 {
817  char* ret;
818  char* ptr;
819 
820  ret = regcode;
821  if (ret == &regdummy) {
822  regsize += 3;
823  return ret;
824  }
825 
826  ptr = ret;
827  *ptr++ = op;
828  *ptr++ = '\0'; // Null "next" pointer.
829  *ptr++ = '\0';
830  regcode = ptr;
831 
832  return ret;
833 }
834 
835 
836 // emit (if appropriate) a byte of code
837 //
838 static void regc (unsigned char b)
839 {
840  if (regcode != &regdummy)
841  *regcode++ = b;
842  else
843  regsize++;
844 }
845 
846 
847 // insert an operator in front of already-emitted operand
848 //
849 // Means relocating the operand.
850 //
851 static void reginsert (char op, char* opnd)
852 {
853  char* src;
854  char* dst;
855  char* place;
856 
857  if (regcode == &regdummy) {
858  regsize += 3;
859  return;
860  }
861 
862  src = regcode;
863  regcode += 3;
864  dst = regcode;
865  while (src > opnd)
866  *--dst = *--src;
867 
868  place = opnd; // Op node, where operand used to be.
869  *place++ = op;
870  *place++ = '\0';
871  *place = '\0';
872 }
873 
874 
875 // set the next-pointer at the end of a node chain
876 //
877 static void regtail (char* p, const char* val)
878 {
879  char* scan;
880  char* temp;
881  std::ptrdiff_t offset;
882 
883  if (p == &regdummy)
884  return;
885 
886  // Find last node.
887  scan = p;
888  for (;;) {
889  temp = regnext(scan);
890  if (temp == nullptr)
891  break;
892  scan = temp;
893  }
894 
895  if (OP(scan) == BACK)
896  offset = (const char*)scan - val;
897  else
898  offset = val - scan;
899  *(scan + 1) = (char)((offset >> 8) & 0377);
900  *(scan + 2) = (char)(offset & 0377);
901 }
902 
903 
904 // regtail on operand of first argument; nop if operandless
905 //
906 static void regoptail (char* p, const char* val)
907 {
908  // "Operandless" and "op != BRANCH" are synonymous in practice.
909  if (p == nullptr || p == &regdummy || OP(p) != BRANCH)
910  return;
911  regtail(OPERAND(p), val);
912 }
913 
914 
915 ////////////////////////////////////////////////////////////////////////
916 //
917 // find and friends
918 //
919 ////////////////////////////////////////////////////////////////////////
920 
921 
922 //
923 // Global work variables for find().
924 //
925 static const char* reginput; // String-input pointer.
926 static const char* regbol; // Beginning of input, for ^ check.
927 static const char* *regstartp; // Pointer to startp array.
928 static const char* *regendp; // Ditto for endp.
929 
930 //
931 // Forwards.
932 //
933 static int regtry (const char*, const char* *, const char* *, const char*);
934 static int regmatch (const char*);
935 static int regrepeat (const char*);
936 
937 #ifdef DEBUG
938 int regnarrate = 0;
939 void regdump ();
940 static char* regprop ();
941 #endif
942 
943 bool vul_reg_exp::find (std::string const& s)
944 {
945  return find(s.c_str());
946 }
947 
948 
949 //: Matches the regular expression to the given string.
950 // Returns true if found, and sets start and end indexes accordingly.
951 
952 bool vul_reg_exp::find (char const* string)
953 {
954  const char* s;
955 
956  this->searchstring = string;
957 
958  // Check validity of program.
959  if (!this->program || UCHARAT(this->program) != MAGIC) {
960  //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
961  std::cout << "vul_reg_exp::find(): Compiled regular expression corrupted.\n";
962  return false;
963  }
964 
965  // If there is a "must appear" string, look for it.
966  if (this->regmust != nullptr)
967  {
968  s = string;
969  while ((s = std::strchr(s, this->regmust[0])) != nullptr) {
970  if (std::strncmp(s, this->regmust, this->regmlen) == 0)
971  break; // Found it.
972  s++;
973  }
974  if (s == nullptr) // Not present.
975  return false;
976  }
977 
978  // Mark beginning of line for ^ .
979  regbol = string;
980 
981  // Simplest case: anchored match need be tried only once.
982  if (this->reganch)
983  return regtry(string, this->startp, this->endp, this->program) != 0;
984 
985  // Messy cases: unanchored match.
986  s = string;
987  if (this->regstart != '\0')
988  // We know what char it must start with.
989  while ((s = std::strchr(s, this->regstart)) != nullptr) {
990  if (regtry(s, this->startp, this->endp, this->program))
991  return true;
992  s++;
993  }
994  else
995  // We don't - general case.
996  do {
997  if (regtry(s, this->startp, this->endp, this->program))
998  return true;
999  } while (*s++ != '\0');
1000 
1001  // Failure.
1002  return false;
1003 }
1004 
1005 
1006 // try match at specific point
1007 // 0 failure, 1 success
1008 //
1009 static int regtry(const char* string, const char* *start,
1010  const char* *end, const char* prog)
1011 {
1012  int i;
1013  const char* *sp1;
1014  const char* *ep;
1015 
1016  reginput = string;
1017  regstartp = start;
1018  regendp = end;
1019 
1020  sp1 = start;
1021  ep = end;
1022  for (i = vul_reg_exp_nsubexp; i > 0; i--) {
1023  *sp1++ = nullptr;
1024  *ep++ = nullptr;
1025  }
1026  if (regmatch(prog + 1)) {
1027  start[0] = string;
1028  end[0] = reginput;
1029  return 1;
1030  }
1031  else
1032  return 0;
1033 }
1034 
1035 
1036 // main matching routine
1037 //
1038 // Conceptually the strategy is simple: check to see whether the current
1039 // node matches, call self recursively to see whether the rest matches,
1040 // and then act accordingly. In practice we make some effort to avoid
1041 // recursion, in particular by going through "ordinary" nodes (that don't
1042 // need to know whether the rest of the match failed) by a loop instead of
1043 // by recursion.
1044 // 0 failure, 1 success
1045 //
1046 static int regmatch(const char* prog)
1047 {
1048  const char* scan; // Current node.
1049  const char* next; // Next node.
1050 
1051  scan = prog;
1052 
1053  while (scan != nullptr)
1054  {
1055  next = regnext(scan);
1056 
1057  switch (OP(scan))
1058  {
1059  case BOL:
1060  if (reginput != regbol)
1061  return 0;
1062  break;
1063  case EOL:
1064  if (*reginput != '\0')
1065  return 0;
1066  break;
1067  case ANY:
1068  if (*reginput == '\0')
1069  return 0;
1070  reginput++;
1071  break;
1072  case EXACTLY:
1073  {
1074  int len;
1075  const char* opnd;
1076 
1077  opnd = OPERAND(scan);
1078  // Inline the first character, for speed.
1079  if (*opnd != *reginput)
1080  return 0;
1081  len = (int)std::strlen(opnd);
1082  if (len > 1 && std::strncmp(opnd, reginput, len) != 0)
1083  return 0;
1084  reginput += len;
1085  break;
1086  }
1087  case ANYOF:
1088  if (*reginput == '\0' || std::strchr(OPERAND(scan), *reginput) == nullptr)
1089  return 0;
1090  reginput++;
1091  break;
1092  case ANYBUT:
1093  if (*reginput == '\0' || std::strchr(OPERAND(scan), *reginput) != nullptr)
1094  return 0;
1095  reginput++;
1096  break;
1097  case NOTHING:
1098  break;
1099  case BACK:
1100  break;
1101  case OPEN+1: case OPEN+2: case OPEN+3: case OPEN+4: case OPEN+5: case OPEN+6: case OPEN+7: case OPEN+8: case OPEN+9:
1102  {
1103  int no;
1104  const char* save;
1105 
1106  no = OP(scan) - OPEN;
1107  save = reginput;
1108 
1109  if (regmatch(next))
1110  {
1111  //
1112  // Don't set startp if some later invocation of the
1113  // same parentheses already has.
1114  //
1115  if (regstartp[no] == nullptr)
1116  regstartp[no] = save;
1117  return 1;
1118  }
1119  else
1120  return 0;
1121  }
1122  case CLOSE+1: case CLOSE+2: case CLOSE+3: case CLOSE+4: case CLOSE+5: case CLOSE+6: case CLOSE+7: case CLOSE+8: case CLOSE+9:
1123  {
1124  int no;
1125  const char* save;
1126 
1127  no = OP(scan) - CLOSE;
1128  save = reginput;
1129 
1130  if (regmatch(next))
1131  {
1132  //
1133  // Don't set endp if some later invocation of the
1134  // same parentheses already has.
1135  //
1136  if (regendp[no] == nullptr)
1137  regendp[no] = save;
1138  return 1;
1139  }
1140  else
1141  return 0;
1142  }
1143  case BRANCH:
1144  {
1145  const char* save;
1146 
1147  if (OP(next) != BRANCH) // No choice.
1148  next = OPERAND(scan); // Avoid recursion.
1149  else {
1150  do {
1151  save = reginput;
1152  if (regmatch(OPERAND(scan)))
1153  return 1;
1154  reginput = save;
1155  scan = regnext(scan);
1156  } while (scan != nullptr && OP(scan) == BRANCH);
1157  return 0;
1158  // NOTREACHED
1159  }
1160  break;
1161  }
1162  case STAR:
1163  case PLUS:
1164  {
1165  char nextch;
1166  int no;
1167  const char* save;
1168  int min_no;
1169 
1170  //
1171  // Lookahead to avoid useless match attempts when we know
1172  // what character comes next.
1173  //
1174  nextch = '\0';
1175  if (OP(next) == EXACTLY)
1176  nextch = *OPERAND(next);
1177  min_no = (OP(scan) == STAR) ? 0 : 1;
1178  save = reginput;
1179  no = regrepeat(OPERAND(scan));
1180  while (no >= min_no) {
1181  // If it could work, try it.
1182  if (nextch == '\0' || *reginput == nextch)
1183  if (regmatch(next))
1184  return 1;
1185  // Couldn't or didn't - back up.
1186  no--;
1187  reginput = save + no;
1188  }
1189  return 0;
1190  }
1191  case END:
1192  return 1; // Success!
1193 
1194  default:
1195  //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
1196  std::cout << "vul_reg_exp::find(): Internal error -- memory corrupted.\n";
1197  return 0;
1198  }
1199  scan = next;
1200  }
1201 
1202  //
1203  // We get here only if there's trouble - normally "case END" is the
1204  // terminating point.
1205  //
1206  //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
1207  std::cout << "vul_reg_exp::find(): Internal error -- corrupted pointers.\n";
1208  return 0;
1209 }
1210 
1211 
1212 // repeatedly match something simple, report how many
1213 //
1214 static int regrepeat(const char* p)
1215 {
1216  int count = 0;
1217  const char* scan;
1218  const char* opnd;
1219 
1220  scan = reginput;
1221  opnd = OPERAND(p);
1222  switch (OP(p))
1223  {
1224  case ANY:
1225  count = (int)std::strlen(scan);
1226  scan += count;
1227  break;
1228  case EXACTLY:
1229  while (*opnd == *scan) {
1230  count++;
1231  scan++;
1232  }
1233  break;
1234  case ANYOF:
1235  while (*scan != '\0' && std::strchr(opnd, *scan) != nullptr) {
1236  count++;
1237  scan++;
1238  }
1239  break;
1240  case ANYBUT:
1241  while (*scan != '\0' && std::strchr(opnd, *scan) == nullptr) {
1242  count++;
1243  scan++;
1244  }
1245  break;
1246  default: // Oh dear. Called inappropriately.
1247  //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
1248  std::cout << "vul_reg_exp::find(): Internal error.\n";
1249  return 0;
1250  }
1251  reginput = scan;
1252  return count;
1253 }
1254 
1255 
1256 // dig the "next" pointer out of a node
1257 //
1258 static const char* regnext(const char* p)
1259 {
1260  int offset;
1261 
1262  if (p == &regdummy)
1263  return nullptr;
1264 
1265  offset = NEXT(p);
1266  if (offset == 0)
1267  return nullptr;
1268 
1269  if (OP(p) == BACK)
1270  return p - offset;
1271  else
1272  return p + offset;
1273 }
1274 
1275 
1276 static char* regnext(char* p)
1277 {
1278  int offset;
1279 
1280  if (p == &regdummy)
1281  return nullptr;
1282 
1283  offset = NEXT(p);
1284  if (offset == 0)
1285  return nullptr;
1286 
1287  if (OP(p) == BACK)
1288  return p - offset;
1289  else
1290  return p + offset;
1291 }
#define UCHARAT(p)
#define STAR
#define SIMPLE
#define ISMULT(c)
const char * startp[vul_reg_exp_nsubexp]
anchor point of start position for n-th matching regular expression.
Definition: vul_reg_exp.h:85
#define EXACTLY
#define BRANCH
Pattern matching with regular expressions.
Definition: vul_reg_exp.h:82
char regstart
Internal use only.
Definition: vul_reg_exp.h:89
#define BOL
#define EOL
bool find(char const *)
true if regexp in char* arg.
#define META
#define NOTHING
#define CLOSE
#define OP(p)
static const char * protect(char c)
Return an expression that will match precisely c.
#define HASWIDTH
const char * regmust
Internal use only.
Definition: vul_reg_exp.h:93
#define ANY
#define BACK
bool operator==(vul_reg_exp const &) const
Equality operator.
const char * endp[vul_reg_exp_nsubexp]
anchor point of end position for n-th matching regular expression.
Definition: vul_reg_exp.h:87
#define SPSTART
char * program
Definition: vul_reg_exp.h:96
#define PLUS
const char * searchstring
Definition: vul_reg_exp.h:98
constexpr int vul_reg_exp_nsubexp
Definition: vul_reg_exp.h:35
contains class for pattern matching with regular expressions
#define OPERAND(p)
constexpr unsigned char MAGIC
bool deep_equal(vul_reg_exp const &) const
Same regexp and state?.
#define NEXT(p)
int regmlen
Internal use only.
Definition: vul_reg_exp.h:95
#define OPEN
#define ANYBUT
#define END
vul_reg_exp()
Creates an empty regular expression.
Definition: vul_reg_exp.h:101
char reganch
Internal use only.
Definition: vul_reg_exp.h:91
#define ANYOF
#define WORST
void compile(char const *)
Compiles char* --> regexp.