AbstractTokenizer.java


001 /**

002  * BSD-style license; for more info see http://pmd.sourceforge.net/license.html

003  * @author Zev Blut zb@ubit.com

004  * @author Romain PELISSE belaran@gmail.com

005  */

006 package net.sourceforge.pmd.cpd;

007 

008 import java.util.List;

009 

010 public abstract class AbstractTokenizer implements Tokenizer

011 {

012 

013   protected List<String> stringToken;      // List<String>, should be setted by children classes

014   protected List<String> ignorableCharacter;   // List<String>, should be setted by children classes

015                         // FIXME:Maybe an array of 'char' would be better for perfomance ?

016   protected List<String> ignorableStmt;     // List<String>, should be setted by children classes

017   protected char oneLineCommentChar = '#'; // Most script language ( shell, ruby, python,...) use this symbol for comment line

018 

019   private List<String> code;

020   private int lineNumber = 0;

021   private String currentLine;

022 

023   protected boolean spanMultipleLinesString = true;  // Most language does, so default is true

024 

025   private boolean downcaseString = true;

026 

027     public void tokenize(SourceCode tokens, Tokens tokenEntries) {

028         this.code = tokens.getCode();

029 

030         for ( this.lineNumber = 0; lineNumber < this.code.size(); lineNumber++ ) {

031           this.currentLine = this.code.get(this.lineNumber);

032             int loc = 0;

033             while ( loc < currentLine.length() ) {

034                 StringBuffer token = new StringBuffer();

035                 loc = getTokenFromLine(token,loc);

036                 if (token.length() > 0 && !isIgnorableString(token.toString())) {

037                     if (downcaseString) {

038                         token = new StringBuffer(token.toString().toLowerCase());

039                     }

040                     if ( CPD.debugEnable ) {

041                       System.out.println("Token added:" + token.toString());

042                     }

043                     tokenEntries.add(new TokenEntry(token.toString(),

044                             tokens.getFileName(),

045                             lineNumber));

046 

047                 }

048             }

049         }

050         tokenEntries.add(TokenEntry.getEOF());

051     }

052 

053     private int getTokenFromLine(StringBuffer token, int loc) {

054         for (int j = loc; j < this.currentLine.length(); j++) {

055             char tok = this.currentLine.charAt(j);

056             if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {

057                 if (isComment(tok)) {

058                     if (token.length() > 0) {

059                         return j;

060                     } else {

061                         return getCommentToken(token, loc);

062                     }

063                 } else if (isString(tok)) {

064                     if (token.length() > 0) {

065                         return j; // we need to now parse the string as a seperate token.

066                     } else {

067                         // we are at the start of a string

068                         return parseString(token, j, tok);

069                     }

070                 } else {

071                     token.append(tok);

072                 }

073             } else {

074                 if (token.length() > 0) {

075                     return j;

076                 }

077             }

078             loc = j;

079         }

080         return loc + 1;

081     }

082 

083     private int parseString(StringBuffer token, int loc, char stringDelimiter) {

084         boolean escaped = false;

085         boolean done = false;

086         char tok = ' '; // this will be replaced.

087         while ((loc < currentLine.length()) && ! done) {

088             tok = currentLine.charAt(loc);

089             if (escaped && tok == stringDelimiter) { // Found an escaped string

090                 escaped = false;

091             } else if (tok == stringDelimiter && (token.length() > 0)) { // We are done, we found the end of the string...

092                 done = true;

093             } else if (tok == '\\') { // Found an escaped char

094                 escaped = true;

095             } else {  // Adding char...

096                 escaped = false;

097             }

098             //Adding char to String:" + token.toString());

099             token.append(tok);

100             loc++;

101         }

102         // Handling multiple lines string

103         if (   ! done &&  // ... we didn't find the end of the string

104             loc >= currentLine.length() && // ... we have reach the end of the line ( the String is incomplete, for the moment at least)

105             this.spanMultipleLinesString && // ... the language allow multiple line span Strings

106             this.lineNumber < this.code.size() - 1 // ... there is still more lines to parse

107           ) {

108           // parsing new line

109           this.currentLine = this.code.get(++this.lineNumber);

110           // Warning : recursive call !

111           loc = this.parseString(token, loc, stringDelimiter);

112         }

113         return loc + 1;

114     }

115 

116     private boolean ignoreCharacter(char tok)

117     {

118       return this.ignorableCharacter.contains(String.valueOf(tok));

119     }

120 

121     private boolean isString(char tok)

122     {

123       return this.stringToken.contains(String.valueOf(tok));

124     }

125 

126     private boolean isComment(char tok)

127     {

128         return tok == oneLineCommentChar;

129     }

130 

131     private int getCommentToken(StringBuffer token, int loc)

132     {

133         while (loc < this.currentLine.length())

134         {

135             token.append(this.currentLine.charAt(loc++));

136         }

137         return loc;

138     }

139 

140     private boolean isIgnorableString(String token)

141     {

142       return this.ignorableStmt.contains(token);

143     }

144 }