View Javadoc

1   /**
2    * Copyright 2005-2011 The Kuali Foundation
3    *
4    * Licensed under the Educational Community License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    * http://www.opensource.org/licenses/ecl2.php
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.kuali.rice.krad.uif.util;
17  
18  import org.springframework.expression.spel.InternalParseException;
19  import org.springframework.expression.spel.SpelMessage;
20  import org.springframework.expression.spel.SpelParseException;
21  import org.springframework.util.Assert;
22  
23  import java.util.ArrayList;
24  import java.util.Arrays;
25  import java.util.List;
26  
27  /**
28   * @author Kuali Rice Team (rice.collab@kuali.org)
29   */
30  public class Tokenizer {
31      String expressionString;
32      char[] toProcess;
33      int pos;
34      int max;
35      List<Token> tokens = new ArrayList<Token>();
36  
37      protected Tokenizer(String inputdata) {
38          for (int ch = '0'; ch <= '9'; ch++) {
39              flags[ch] |= IS_DIGIT | IS_HEXDIGIT;
40          }
41          for (int ch = 'A'; ch <= 'F'; ch++) {
42              flags[ch] |= IS_HEXDIGIT;
43          }
44          for (int ch = 'a'; ch <= 'f'; ch++) {
45              flags[ch] |= IS_HEXDIGIT;
46          }
47          for (int ch = 'A'; ch <= 'Z'; ch++) {
48              flags[ch] |= IS_ALPHA;
49          }
50          for (int ch = 'a'; ch <= 'z'; ch++) {
51              flags[ch] |= IS_ALPHA;
52          }
53  
54          this.expressionString = inputdata;
55          this.toProcess = (inputdata + "\0").toCharArray();
56          this.max = toProcess.length;
57          this.pos = 0;
58          process();
59      }
60  
61      public void process() {
62          while (pos < max) {
63              char ch = toProcess[pos];
64              if (isAlphabetic(ch)) {
65                  lexIdentifier();
66              } else {
67                  switch (ch) {
68                      case '+':
69                          pushCharToken(TokenKind.PLUS);
70                          break;
71                      case '_': // the other way to start an identifier
72                          lexIdentifier();
73                          break;
74                      case '-':
75                          pushCharToken(TokenKind.MINUS);
76                          break;
77                      case ':':
78                          pushCharToken(TokenKind.COLON);
79                          break;
80                      case '.':
81                          pushCharToken(TokenKind.DOT);
82                          break;
83                      case ',':
84                          pushCharToken(TokenKind.COMMA);
85                          break;
86                      case '*':
87                          pushCharToken(TokenKind.STAR);
88                          break;
89                      case '/':
90                          pushCharToken(TokenKind.DIV);
91                          break;
92                      case '%':
93                          pushCharToken(TokenKind.MOD);
94                          break;
95                      case '(':
96                          pushCharToken(TokenKind.LPAREN);
97                          break;
98                      case ')':
99                          pushCharToken(TokenKind.RPAREN);
100                         break;
101                     case '[':
102                         pushCharToken(TokenKind.LSQUARE);
103                         break;
104                     case '#':
105                         pushCharToken(TokenKind.HASH);
106                         break;
107                     case ']':
108                         pushCharToken(TokenKind.RSQUARE);
109                         break;
110                     case '{':
111                         pushCharToken(TokenKind.LCURLY);
112                         break;
113                     case '}':
114                         pushCharToken(TokenKind.RCURLY);
115                         break;
116                     case '@':
117                         pushCharToken(TokenKind.BEAN_REF);
118                         break;
119                     case '^':
120                         if (isTwoCharToken(TokenKind.SELECT_FIRST)) {
121                             pushPairToken(TokenKind.SELECT_FIRST);
122                         } else {
123                             pushCharToken(TokenKind.POWER);
124                         }
125                         break;
126                     case '!':
127                         if (isTwoCharToken(TokenKind.NE)) {
128                             pushPairToken(TokenKind.NE);
129                         } else if (isTwoCharToken(TokenKind.PROJECT)) {
130                             pushPairToken(TokenKind.PROJECT);
131                         } else {
132                             pushCharToken(TokenKind.NOT);
133                         }
134                         break;
135                     case '=':
136                         if (isTwoCharToken(TokenKind.EQ)) {
137                             pushPairToken(TokenKind.EQ);
138                         } else {
139                             pushCharToken(TokenKind.ASSIGN);
140                         }
141                         break;
142                     case '?':
143                         if (isTwoCharToken(TokenKind.SELECT)) {
144                             pushPairToken(TokenKind.SELECT);
145                         } else if (isTwoCharToken(TokenKind.ELVIS)) {
146                             pushPairToken(TokenKind.ELVIS);
147                         } else if (isTwoCharToken(TokenKind.SAFE_NAVI)) {
148                             pushPairToken(TokenKind.SAFE_NAVI);
149                         } else {
150                             pushCharToken(TokenKind.QMARK);
151                         }
152                         break;
153                     case '$':
154                         if (isTwoCharToken(TokenKind.SELECT_LAST)) {
155                             pushPairToken(TokenKind.SELECT_LAST);
156                         } else {
157                             lexIdentifier();
158                         }
159                         break;
160                     case '>':
161                         if (isTwoCharToken(TokenKind.GE)) {
162                             pushPairToken(TokenKind.GE);
163                         } else {
164                             pushCharToken(TokenKind.GT);
165                         }
166                         break;
167                     case '<':
168                         if (isTwoCharToken(TokenKind.LE)) {
169                             pushPairToken(TokenKind.LE);
170                         } else {
171                             pushCharToken(TokenKind.LT);
172                         }
173                         break;
174                     case '0':
175                     case '1':
176                     case '2':
177                     case '3':
178                     case '4':
179                     case '5':
180                     case '6':
181                     case '7':
182                     case '8':
183                     case '9':
184                         lexNumericLiteral(ch == '0');
185                         break;
186                     case ' ':
187                     case '\t':
188                     case '\r':
189                     case '\n':
190                         // drift over white space
191                         pos++;
192                         break;
193                     case '\'':
194                         lexQuotedStringLiteral();
195                         break;
196                     case '"':
197                         lexDoubleQuotedStringLiteral();
198                         break;
199                     case 0:
200                         // hit sentinel at end of value
201                         pos++; // will take us to the end
202                         break;
203                     default:
204                         throw new IllegalStateException("Cannot handle ("
205                                 + Integer.valueOf(ch)
206                                 + ") '"
207                                 + ch
208                                 + "', in expression: "
209                                 + expressionString);
210                 }
211             }
212         }
213     }
214 
215     public List<Token> getTokens() {
216         return tokens;
217     }
218 
219     // STRING_LITERAL: '\''! (APOS|~'\'')* '\''!;
220     private void lexQuotedStringLiteral() {
221         int start = pos;
222         boolean terminated = false;
223         while (!terminated) {
224             pos++;
225             char ch = toProcess[pos];
226             if (ch == '\'') {
227                 // may not be the end if the char after is also a '
228                 if (toProcess[pos + 1] == '\'') {
229                     pos++; // skip over that too, and continue
230                 } else {
231                     terminated = true;
232                 }
233             }
234             if (ch == 0) {
235                 throw new InternalParseException(new SpelParseException(expressionString, start,
236                         SpelMessage.NON_TERMINATING_QUOTED_STRING));
237             }
238         }
239         pos++;
240         tokens.add(new Token(TokenKind.LITERAL_STRING, subarray(start, pos), start, pos));
241     }
242 
243     // DQ_STRING_LITERAL:	'"'! (~'"')* '"'!;
244     private void lexDoubleQuotedStringLiteral() {
245         int start = pos;
246         boolean terminated = false;
247         while (!terminated) {
248             pos++;
249             char ch = toProcess[pos];
250             if (ch == '"') {
251                 terminated = true;
252             }
253             if (ch == 0) {
254                 throw new InternalParseException(new SpelParseException(expressionString, start,
255                         SpelMessage.NON_TERMINATING_DOUBLE_QUOTED_STRING));
256             }
257         }
258         pos++;
259         tokens.add(new Token(TokenKind.LITERAL_STRING, subarray(start, pos), start, pos));
260     }
261 
262     //	REAL_LITERAL :
263     //	  ('.' (DECIMAL_DIGIT)+ (EXPONENT_PART)? (REAL_TYPE_SUFFIX)?) |
264     //		((DECIMAL_DIGIT)+ '.' (DECIMAL_DIGIT)+ (EXPONENT_PART)? (REAL_TYPE_SUFFIX)?) |
265     //		((DECIMAL_DIGIT)+ (EXPONENT_PART) (REAL_TYPE_SUFFIX)?) |
266     //		((DECIMAL_DIGIT)+ (REAL_TYPE_SUFFIX));
267     //	fragment INTEGER_TYPE_SUFFIX : ( 'L' | 'l' );
268     //	fragment HEX_DIGIT : '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7'|'8'|'9'|'A'|'B'|'C'|'D'|'E'|'F'|'a'|'b'|'c'|'d'|'e'|'f';
269     //
270     //	fragment EXPONENT_PART : 'e'  (SIGN)*  (DECIMAL_DIGIT)+ | 'E'  (SIGN)*  (DECIMAL_DIGIT)+ ;
271     //	fragment SIGN :	'+' | '-' ;
272     //	fragment REAL_TYPE_SUFFIX : 'F' | 'f' | 'D' | 'd';
273     //	INTEGER_LITERAL
274     //	: (DECIMAL_DIGIT)+ (INTEGER_TYPE_SUFFIX)?;
275 
276     private void lexNumericLiteral(boolean firstCharIsZero) {
277         boolean isReal = false;
278         int start = pos;
279         char ch = toProcess[pos + 1];
280         boolean isHex = ch == 'x' || ch == 'X';
281 
282         // deal with hexadecimal
283         if (firstCharIsZero && isHex) {
284             pos = pos + 1;
285             do {
286                 pos++;
287             } while (isHexadecimalDigit(toProcess[pos]));
288             if (isChar('L', 'l')) {
289                 pushHexIntToken(subarray(start + 2, pos), true, start, pos);
290                 pos++;
291             } else {
292                 pushHexIntToken(subarray(start + 2, pos), false, start, pos);
293             }
294             return;
295         }
296 
297         // real numbers must have leading digits
298 
299         // Consume first part of number
300         do {
301             pos++;
302         } while (isDigit(toProcess[pos]));
303 
304         // a '.' indicates this number is a real
305         ch = toProcess[pos];
306         if (ch == '.') {
307             isReal = true;
308             // carry on consuming digits
309             do {
310                 pos++;
311             } while (isDigit(toProcess[pos]));
312         }
313 
314         int endOfNumber = pos;
315 
316         // Now there may or may not be an exponent
317 
318         // is it a long ?
319         if (isChar('L', 'l')) {
320             if (isReal) { // 3.4L - not allowed
321                 throw new InternalParseException(new SpelParseException(expressionString, start,
322                         SpelMessage.REAL_CANNOT_BE_LONG));
323             }
324             pushIntToken(subarray(start, endOfNumber), true, start, endOfNumber);
325             pos++;
326         } else if (isExponentChar(toProcess[pos])) {
327             isReal = true; // if it wasnt before, it is now
328             pos++;
329             char possibleSign = toProcess[pos];
330             if (isSign(possibleSign)) {
331                 pos++;
332             }
333 
334             // exponent digits
335             do {
336                 pos++;
337             } while (isDigit(toProcess[pos]));
338             boolean isFloat = false;
339             if (isFloatSuffix(toProcess[pos])) {
340                 isFloat = true;
341                 endOfNumber = ++pos;
342             } else if (isDoubleSuffix(toProcess[pos])) {
343                 endOfNumber = ++pos;
344             }
345             pushRealToken(subarray(start, pos), isFloat, start, pos);
346         } else {
347             ch = toProcess[pos];
348             boolean isFloat = false;
349             if (isFloatSuffix(ch)) {
350                 isReal = true;
351                 isFloat = true;
352                 endOfNumber = ++pos;
353             } else if (isDoubleSuffix(ch)) {
354                 isReal = true;
355                 endOfNumber = ++pos;
356             }
357             if (isReal) {
358                 pushRealToken(subarray(start, endOfNumber), isFloat, start, endOfNumber);
359             } else {
360                 pushIntToken(subarray(start, endOfNumber), false, start, endOfNumber);
361             }
362         }
363     }
364 
365     // if this is changed, it must remain sorted
366     private String[] alternativeOperatorNames = {"DIV", "EQ", "GE", "GT", "LE", "LT", "MOD", "NE", "NOT"};
367 
368     private void lexIdentifier() {
369         int start = pos;
370         do {
371             pos++;
372         } while (isIdentifier(toProcess[pos]));
373         char[] subarray = subarray(start, pos);
374 
375         // Check if this is the alternative (textual) representation of an operator (see alternativeOperatorNames)
376         if ((pos - start) == 2 || (pos - start) == 3) {
377             String asString = new String(subarray).toUpperCase();
378             int idx = Arrays.binarySearch(alternativeOperatorNames, asString);
379             if (idx >= 0) {
380                 pushOneCharOrTwoCharToken(TokenKind.valueOf(asString), start);
381                 return;
382             }
383         }
384         tokens.add(new Token(TokenKind.IDENTIFIER, subarray, start, pos));
385     }
386 
387     private void pushIntToken(char[] data, boolean isLong, int start, int end) {
388         if (isLong) {
389             tokens.add(new Token(TokenKind.LITERAL_LONG, data, start, end));
390         } else {
391             tokens.add(new Token(TokenKind.LITERAL_INT, data, start, end));
392         }
393     }
394 
395     private void pushHexIntToken(char[] data, boolean isLong, int start, int end) {
396         if (data.length == 0) {
397             if (isLong) {
398                 throw new InternalParseException(new SpelParseException(expressionString, start, SpelMessage.NOT_A_LONG,
399                         expressionString.substring(start, end + 1)));
400             } else {
401                 throw new InternalParseException(new SpelParseException(expressionString, start,
402                         SpelMessage.NOT_AN_INTEGER, expressionString.substring(start, end)));
403             }
404         }
405         if (isLong) {
406             tokens.add(new Token(TokenKind.LITERAL_HEXLONG, data, start, end));
407         } else {
408             tokens.add(new Token(TokenKind.LITERAL_HEXINT, data, start, end));
409         }
410     }
411 
412     private void pushRealToken(char[] data, boolean isFloat, int start, int end) {
413         if (isFloat) {
414             tokens.add(new Token(TokenKind.LITERAL_REAL_FLOAT, data, start, end));
415         } else {
416             tokens.add(new Token(TokenKind.LITERAL_REAL, data, start, end));
417         }
418     }
419 
420     private char[] subarray(int start, int end) {
421         char[] result = new char[end - start];
422         System.arraycopy(toProcess, start, result, 0, end - start);
423         return result;
424     }
425 
426     /**
427      * Check if this might be a two character token.
428      */
429     private boolean isTwoCharToken(TokenKind kind) {
430         Assert.isTrue(kind.tokenChars.length == 2);
431         Assert.isTrue(toProcess[pos] == kind.tokenChars[0]);
432         return toProcess[pos + 1] == kind.tokenChars[1];
433     }
434 
435     /**
436      * Push a token of just one character in length.
437      */
438     private void pushCharToken(TokenKind kind) {
439         tokens.add(new Token(kind, pos, pos + 1));
440         pos++;
441     }
442 
443     /**
444      * Push a token of two characters in length.
445      */
446     private void pushPairToken(TokenKind kind) {
447         tokens.add(new Token(kind, pos, pos + 2));
448         pos += 2;
449     }
450 
451     private void pushOneCharOrTwoCharToken(TokenKind kind, int pos) {
452         tokens.add(new Token(kind, pos, pos + kind.getLength()));
453     }
454 
455     //	ID:	('a'..'z'|'A'..'Z'|'_'|'$') ('a'..'z'|'A'..'Z'|'_'|'$'|'0'..'9'|DOT_ESCAPED)*;
456     private boolean isIdentifier(char ch) {
457         return isAlphabetic(ch) || isDigit(ch) || ch == '_' || ch == '$';
458     }
459 
460     private boolean isChar(char a, char b) {
461         char ch = toProcess[pos];
462         return ch == a || ch == b;
463     }
464 
465     private boolean isExponentChar(char ch) {
466         return ch == 'e' || ch == 'E';
467     }
468 
469     private boolean isFloatSuffix(char ch) {
470         return ch == 'f' || ch == 'F';
471     }
472 
473     private boolean isDoubleSuffix(char ch) {
474         return ch == 'd' || ch == 'D';
475     }
476 
477     private boolean isSign(char ch) {
478         return ch == '+' || ch == '-';
479     }
480 
481     private boolean isDigit(char ch) {
482         if (ch > 255) {
483             return false;
484         }
485         return (flags[ch] & IS_DIGIT) != 0;
486     }
487 
488     private boolean isAlphabetic(char ch) {
489         if (ch > 255) {
490             return false;
491         }
492         return (flags[ch] & IS_ALPHA) != 0;
493     }
494 
495     private boolean isHexadecimalDigit(char ch) {
496         if (ch > 255) {
497             return false;
498         }
499         return (flags[ch] & IS_HEXDIGIT) != 0;
500     }
501 
502     private final byte flags[] = new byte[256];
503     private static final byte IS_DIGIT = 0x01;
504     private static final byte IS_HEXDIGIT = 0x02;
505     private static final byte IS_ALPHA = 0x04;
506 
507     public class Token {
508         TokenKind kind;
509         String data;
510         int startpos; // index of first character
511         int endpos;   // index of char after the last character
512 
513         /**
514          * Constructor for use when there is no particular data for the token (eg. TRUE or '+')
515          *
516          * @param startpos the exact start
517          * @param endpos the index to the last character
518          */
519         public Token(TokenKind tokenKind, int startpos, int endpos) {
520             this.kind = tokenKind;
521             this.startpos = startpos;
522             this.endpos = endpos;
523         }
524 
525         Token(TokenKind tokenKind, char[] tokenData, int pos, int endpos) {
526             this(tokenKind, pos, endpos);
527             this.data = new String(tokenData);
528         }
529 
530         public TokenKind getKind() {
531             return kind;
532         }
533 
534         public String toString() {
535             StringBuilder s = new StringBuilder();
536             s.append("[").append(kind.toString());
537             if (kind.hasPayload()) {
538                 s.append(":").append(data);
539             }
540             s.append("]");
541             s.append("(").append(startpos).append(",").append(endpos).append(")");
542             return s.toString();
543         }
544 
545         public boolean isIdentifier() {
546             return kind == TokenKind.IDENTIFIER;
547         }
548 
549         public boolean isNumericRelationalOperator() {
550             return kind == TokenKind.GT
551                     || kind == TokenKind.GE
552                     || kind == TokenKind.LT
553                     || kind == TokenKind.LE
554                     || kind == TokenKind.EQ
555                     || kind == TokenKind.NE;
556         }
557 
558         public String stringValue() {
559             return data;
560         }
561 
562         public Token asInstanceOfToken() {
563             return new Token(TokenKind.INSTANCEOF, startpos, endpos);
564         }
565 
566         public Token asMatchesToken() {
567             return new Token(TokenKind.MATCHES, startpos, endpos);
568         }
569 
570         public Token asBetweenToken() {
571             return new Token(TokenKind.BETWEEN, startpos, endpos);
572         }
573     }
574 
575     public enum TokenKind {
576         // ordered by priority - operands first
577         LITERAL_INT, LITERAL_LONG, LITERAL_HEXINT, LITERAL_HEXLONG, LITERAL_STRING, LITERAL_REAL, LITERAL_REAL_FLOAT,
578         LPAREN("("), RPAREN(")"), COMMA(","), IDENTIFIER,
579         COLON(":"), HASH("#"), RSQUARE("]"), LSQUARE("["),
580         LCURLY("{"), RCURLY("}"),
581         DOT("."), PLUS("+"), STAR("*"), DIV("/"), NOT("!"), MINUS("-"), SELECT_FIRST("^["), SELECT_LAST("$["), QMARK(
582                 "?"), PROJECT("!["),
583         GE(">="), GT(">"), LE("<="), LT("<"), EQ("=="), NE("!="), ASSIGN("="), INSTANCEOF("instanceof"), MATCHES(
584                 "matches"), BETWEEN("between"),
585         SELECT("?["), MOD("%"), POWER("^"),
586         ELVIS("?:"), SAFE_NAVI("?."), BEAN_REF("@");
587 
588         char[] tokenChars;
589         private boolean hasPayload; // is there more to this token than simply the kind
590 
591         private TokenKind(String tokenString) {
592             tokenChars = tokenString.toCharArray();
593             hasPayload = tokenChars.length == 0;
594         }
595 
596         private TokenKind() {
597             this("");
598         }
599 
600         public String toString() {
601             return this.name() + (tokenChars.length != 0 ? "(" + new String(tokenChars) + ")" : "");
602         }
603 
604         public boolean hasPayload() {
605             return hasPayload;
606         }
607 
608         public int getLength() {
609             return tokenChars.length;
610         }
611     }
612 }