001 /**
002 * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
003 */
004 package net.sourceforge.pmd.cpd;
005
006 import java.util.ArrayList;
007 import java.util.Collections;
008 import java.util.HashMap;
009 import java.util.HashSet;
010 import java.util.Iterator;
011 import java.util.List;
012 import java.util.Map;
013 import java.util.Set;
014
015 public class MatchCollector {
016
017 private MatchAlgorithm ma;
018 private Map<Match.MatchCode, Match> startMap = new HashMap<Match.MatchCode, Match>();
019 private Map<String, List<Match>> fileMap = new HashMap<String, List<Match>>();
020
021 public MatchCollector(MatchAlgorithm ma) {
022 this.ma = ma;
023 }
024
025 public void collect(List<TokenEntry> marks) {
026 //first get a pairwise collection of all maximal matches
027 for (int i = 0; i < marks.size() - 1; i++) {
028 TokenEntry mark1 = marks.get(i);
029 for (int j = i + 1; j < marks.size(); j++) {
030 TokenEntry mark2 = marks.get(j);
031 int diff = mark1.getIndex() - mark2.getIndex();
032 if (-diff < ma.getMinimumTileSize()) {
033 continue;
034 }
035 if (hasPreviousDupe(mark1, mark2)) {
036 continue;
037 }
038
039 // "match too small" check
040 int dupes = countDuplicateTokens(mark1, mark2);
041 if (dupes < ma.getMinimumTileSize()) {
042 continue;
043 }
044 // is it still too close together
045 if (diff + dupes >= 1) {
046 continue;
047 }
048 determineMatch(mark1, mark2, dupes);
049 }
050 }
051 }
052
053 @SuppressWarnings("PMD.CompareObjectsWithEquals")
054 public List<Match> getMatches() {
055 List<Match> matchList = new ArrayList<Match>(startMap.values());
056 Collections.sort(matchList);
057 Set<Match.MatchCode> matchSet = new HashSet<Match.MatchCode>();
058 Match.MatchCode matchCode = new Match.MatchCode();
059 for (int i = matchList.size(); i > 1; i--) {
060 Match match1 = matchList.get(i - 1);
061 TokenEntry mark1 = match1.getMarkSet().iterator().next();
062 matchSet.clear();
063 matchSet.add(match1.getMatchCode());
064 for (int j = i - 1; j > 0; j--) {
065 Match match2 = matchList.get(j - 1);
066 if (match1.getTokenCount() != match2.getTokenCount()) {
067 break;
068 }
069 TokenEntry mark2 = null;
070 for (Iterator<TokenEntry> iter = match2.getMarkSet().iterator(); iter.hasNext();) {
071 mark2 = iter.next();
072 if (mark2 != mark1) {
073 break;
074 }
075 }
076 int dupes = countDuplicateTokens(mark1, mark2);
077 if (dupes < match1.getTokenCount()) {
078 break;
079 }
080 matchSet.add(match2.getMatchCode());
081 match1.getMarkSet().addAll(match2.getMarkSet());
082 matchList.remove(i - 2);
083 i--;
084 }
085 if (matchSet.size() == 1) {
086 continue;
087 }
088 //prune the mark set
089 Set<TokenEntry> pruned = match1.getMarkSet();
090 boolean done = false;
091 ArrayList<TokenEntry> a1 = new ArrayList<TokenEntry>(match1.getMarkSet());
092 Collections.sort(a1);
093 for (int outer = 0; outer < a1.size() - 1 && !done; outer++) {
094 TokenEntry cmark1 = a1.get(outer);
095 for (int inner = outer + 1; inner < a1.size() && !done; inner++) {
096 TokenEntry cmark2 = a1.get(inner);
097 matchCode.setFirst(cmark1.getIndex());
098 matchCode.setSecond(cmark2.getIndex());
099 if (!matchSet.contains(matchCode)) {
100 if (pruned.size() > 2) {
101 pruned.remove(cmark2);
102 }
103 if (pruned.size() == 2) {
104 done = true;
105 }
106 }
107 }
108 }
109 }
110 return matchList;
111 }
112
113 /**
114 * A greedy algorithm for determining non-overlapping matches
115 */
116 private void determineMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
117 Match match = new Match(dupes, mark1, mark2);
118 String fileKey = mark1.getTokenSrcID() + mark2.getTokenSrcID();
119 List<Match> pairMatches = fileMap.get(fileKey);
120 if (pairMatches == null) {
121 pairMatches = new ArrayList<Match>();
122 fileMap.put(fileKey, pairMatches);
123 }
124 boolean add = true;
125 for (int i = 0; i < pairMatches.size(); i++) {
126 Match other = pairMatches.get(i);
127 if (other.getFirstMark().getIndex() + other.getTokenCount() - mark1.getIndex()
128 > 0) {
129 boolean ordered = other.getSecondMark().getIndex() - mark2.getIndex() < 0;
130 if ((ordered && (other.getEndIndex() - mark2.getIndex() > 0))
131 || (!ordered && (match.getEndIndex() - other.getSecondMark().getIndex()) > 0)) {
132 if (other.getTokenCount() >= match.getTokenCount()) {
133 add = false;
134 break;
135 } else {
136 pairMatches.remove(i);
137 startMap.remove(other.getMatchCode());
138 }
139 }
140 }
141 }
142 if (add) {
143 pairMatches.add(match);
144 startMap.put(match.getMatchCode(), match);
145 }
146 }
147
148 private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {
149 if (mark1.getIndex() == 0) {
150 return false;
151 }
152 return !matchEnded(ma.tokenAt(-1, mark1), ma.tokenAt(-1, mark2));
153 }
154
155 private int countDuplicateTokens(TokenEntry mark1, TokenEntry mark2) {
156 int index = 0;
157 while (!matchEnded(ma.tokenAt(index, mark1), ma.tokenAt(index, mark2))) {
158 index++;
159 }
160 return index;
161 }
162
163 private boolean matchEnded(TokenEntry token1, TokenEntry token2) {
164 return token1.getIdentifier() != token2.getIdentifier() || token1 == TokenEntry.EOF || token2 == TokenEntry.EOF;
165 }
166 }
|