001 /**
002 * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
003 * @author Zev Blut zb@ubit.com
004 * @author Romain PELISSE belaran@gmail.com
005 */
006 package net.sourceforge.pmd.cpd;
007
008 import java.util.List;
009
010 public abstract class AbstractTokenizer implements Tokenizer
011 {
012
013 protected List<String> stringToken; // List<String>, should be setted by children classes
014 protected List<String> ignorableCharacter; // List<String>, should be setted by children classes
015 // FIXME:Maybe an array of 'char' would be better for perfomance ?
016 protected List<String> ignorableStmt; // List<String>, should be setted by children classes
017 protected char oneLineCommentChar = '#'; // Most script language ( shell, ruby, python,...) use this symbol for comment line
018
019 private List<String> code;
020 private int lineNumber = 0;
021 private String currentLine;
022
023 protected boolean spanMultipleLinesString = true; // Most language does, so default is true
024
025 private boolean downcaseString = true;
026
027 public void tokenize(SourceCode tokens, Tokens tokenEntries) {
028 this.code = tokens.getCode();
029
030 for ( this.lineNumber = 0; lineNumber < this.code.size(); lineNumber++ ) {
031 this.currentLine = this.code.get(this.lineNumber);
032 int loc = 0;
033 while ( loc < currentLine.length() ) {
034 StringBuffer token = new StringBuffer();
035 loc = getTokenFromLine(token,loc);
036 if (token.length() > 0 && !isIgnorableString(token.toString())) {
037 if (downcaseString) {
038 token = new StringBuffer(token.toString().toLowerCase());
039 }
040 if ( CPD.debugEnable ) {
041 System.out.println("Token added:" + token.toString());
042 }
043 tokenEntries.add(new TokenEntry(token.toString(),
044 tokens.getFileName(),
045 lineNumber));
046
047 }
048 }
049 }
050 tokenEntries.add(TokenEntry.getEOF());
051 }
052
053 private int getTokenFromLine(StringBuffer token, int loc) {
054 for (int j = loc; j < this.currentLine.length(); j++) {
055 char tok = this.currentLine.charAt(j);
056 if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {
057 if (isComment(tok)) {
058 if (token.length() > 0) {
059 return j;
060 } else {
061 return getCommentToken(token, loc);
062 }
063 } else if (isString(tok)) {
064 if (token.length() > 0) {
065 return j; // we need to now parse the string as a seperate token.
066 } else {
067 // we are at the start of a string
068 return parseString(token, j, tok);
069 }
070 } else {
071 token.append(tok);
072 }
073 } else {
074 if (token.length() > 0) {
075 return j;
076 }
077 }
078 loc = j;
079 }
080 return loc + 1;
081 }
082
083 private int parseString(StringBuffer token, int loc, char stringDelimiter) {
084 boolean escaped = false;
085 boolean done = false;
086 char tok = ' '; // this will be replaced.
087 while ((loc < currentLine.length()) && ! done) {
088 tok = currentLine.charAt(loc);
089 if (escaped && tok == stringDelimiter) { // Found an escaped string
090 escaped = false;
091 } else if (tok == stringDelimiter && (token.length() > 0)) { // We are done, we found the end of the string...
092 done = true;
093 } else if (tok == '\\') { // Found an escaped char
094 escaped = true;
095 } else { // Adding char...
096 escaped = false;
097 }
098 //Adding char to String:" + token.toString());
099 token.append(tok);
100 loc++;
101 }
102 // Handling multiple lines string
103 if ( ! done && // ... we didn't find the end of the string
104 loc >= currentLine.length() && // ... we have reach the end of the line ( the String is incomplete, for the moment at least)
105 this.spanMultipleLinesString && // ... the language allow multiple line span Strings
106 this.lineNumber < this.code.size() - 1 // ... there is still more lines to parse
107 ) {
108 // parsing new line
109 this.currentLine = this.code.get(++this.lineNumber);
110 // Warning : recursive call !
111 loc = this.parseString(token, loc, stringDelimiter);
112 }
113 return loc + 1;
114 }
115
116 private boolean ignoreCharacter(char tok)
117 {
118 return this.ignorableCharacter.contains(String.valueOf(tok));
119 }
120
121 private boolean isString(char tok)
122 {
123 return this.stringToken.contains(String.valueOf(tok));
124 }
125
126 private boolean isComment(char tok)
127 {
128 return tok == oneLineCommentChar;
129 }
130
131 private int getCommentToken(StringBuffer token, int loc)
132 {
133 while (loc < this.currentLine.length())
134 {
135 token.append(this.currentLine.charAt(loc++));
136 }
137 return loc;
138 }
139
140 private boolean isIgnorableString(String token)
141 {
142 return this.ignorableStmt.contains(token);
143 }
144 }
|