import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;

/*
 * SD2x Homework #11
 * Improve the efficiency of the code below according to the guidelines in the assignment description.
 * Please be sure not to change the signature of the detectPlagiarism method!
 * However, you may modify the signatures of any of the other methods as needed.
 */

/**
 * Detects likely plagiarism between the files of a directory by counting the
 * word phrases (fixed-size windows of consecutive words) that two files share.
 */
public class PlagiarismDetector {

    /** Matches every character that is not an ASCII letter; compiled once and reused for each token. */
    private static final Pattern NON_LETTERS = Pattern.compile("[^a-zA-Z]");

    /**
     * Compares every pair of files in a directory and reports the pairs that
     * share more than {@code threshold} phrases.
     *
     * @param dirName    path of the directory containing the corpus
     * @param windowSize number of consecutive words that make up one phrase
     * @param threshold  a pair is reported only when its match count is strictly greater than this
     * @return a map from {@code "file1-file2"} to the number of shared phrases,
     *         iterating in non-ascending order of that number; an empty map when
     *         {@code dirName} cannot be listed as a directory; {@code null} when the
     *         phrases of a compared file could not be built (unreadable entry or
     *         {@code windowSize < 1})
     */
    public static Map<String, Integer> detectPlagiarism(String dirName, int windowSize, int threshold) {
        File dirFile = new File(dirName);
        String[] files = dirFile.list();
        if (files == null) {
            // File.list() returns null when the path is not a directory or cannot be read.
            return new LinkedHashMap<String, Integer>();
        }

        // Parse every file exactly once; the pair loop below only reuses these sets.
        List<Set<String>> phrasesByFile = new ArrayList<Set<String>>(files.length);
        for (String file : files) {
            phrasesByFile.add(createPhrases(dirName + "/" + file, windowSize));
        }

        Map<String, Integer> numberOfMatches = new HashMap<String, Integer>();
        for (int i = 0; i < files.length; i++) {
            Set<String> firstPhrases = phrasesByFile.get(i);
            // Starting at i + 1 visits each unordered pair once and never pairs a file with itself.
            for (int j = i + 1; j < files.length; j++) {
                Set<String> secondPhrases = phrasesByFile.get(j);
                if (firstPhrases == null || secondPhrases == null) {
                    return null;
                }
                int matches = findMatches(firstPhrases, secondPhrases);
                if (matches > threshold) {
                    numberOfMatches.put(files[i] + "-" + files[j], matches);
                }
            }
        }
        return sortResults(numberOfMatches);
    }

    /**
     * Reads the given file and converts it into a list of words, in file order.
     * Every whitespace-separated token has all characters other than ASCII
     * letters removed and is converted to upper case. A token made up only of
     * such characters is kept as an empty string.
     *
     * @param filename path of the file to read
     * @return the normalized words, or {@code null} when {@code filename} is
     *         {@code null} or the file cannot be opened
     */
    protected static List<String> readFile(String filename) {
        if (filename == null) {
            return null;
        }
        // ArrayList gives createPhrases constant-time positional access.
        List<String> words = new ArrayList<String>();
        // try-with-resources closes the Scanner (and the underlying file) on every path.
        try (Scanner in = new Scanner(new File(filename))) {
            while (in.hasNext()) {
                String lettersOnly = NON_LETTERS.matcher(in.next()).replaceAll("");
                // Locale.ROOT keeps the result A-Z regardless of the default locale.
                words.add(lettersOnly.toUpperCase(Locale.ROOT));
            }
        } catch (FileNotFoundException e) {
            System.err.println("Unable to read " + filename + ": " + e.getMessage());
            return null;
        }
        return words;
    }

    /**
     * Reads a file and converts it into the set of distinct phrases it contains,
     * each made of {@code window} consecutive words. Every word in a phrase is
     * followed by a single space, so each phrase ends with a trailing space.
     *
     * @param filename path of the file to read
     * @param window   number of words per phrase
     * @return the distinct phrases (empty when the file has fewer than
     *         {@code window} words), or {@code null} when {@code filename} is
     *         {@code null}, {@code window < 1}, or the file cannot be read
     */
    protected static Set<String> createPhrases(String filename, int window) {
        if (filename == null || window < 1) {
            return null;
        }
        List<String> words = readFile(filename);
        if (words == null) {
            return null;
        }
        Set<String> phrases = new HashSet<String>();
        for (int start = 0; start + window <= words.size(); start++) {
            StringBuilder phrase = new StringBuilder();
            for (String word : words.subList(start, start + window)) {
                phrase.append(word).append(' ');
            }
            phrases.add(phrase.toString());
        }
        return phrases;
    }

    /**
     * Counts the phrases that occur in both sets, comparing case-insensitively
     * (same equivalence as {@link String#equalsIgnoreCase(String)}). If a set
     * holds several phrases that differ only by case, every matching pair is
     * counted.
     *
     * @param myPhrases   phrases of the first text
     * @param yourPhrases phrases of the second text
     * @return the number of matching phrase pairs; 0 when either set is {@code null}
     */
    protected static int findMatches(Set<String> myPhrases, Set<String> yourPhrases) {
        if (myPhrases == null || yourPhrases == null) {
            return 0;
        }
        Set<String> smaller;
        Set<String> larger;
        if (myPhrases.size() <= yourPhrases.size()) {
            smaller = myPhrases;
            larger = yourPhrases;
        } else {
            smaller = yourPhrases;
            larger = myPhrases;
        }

        // Index the smaller set by case-folded phrase so each lookup is a hash probe
        // instead of a scan over the other set.
        Map<String, Integer> foldedCounts = new HashMap<String, Integer>();
        for (String phrase : smaller) {
            String folded = foldCase(phrase);
            Integer seen = foldedCounts.get(folded);
            foldedCounts.put(folded, seen == null ? 1 : seen + 1);
        }

        int matches = 0;
        for (String phrase : larger) {
            Integer count = foldedCounts.get(foldCase(phrase));
            if (count != null) {
                matches += count;
            }
        }
        return matches;
    }

    /** Folds each char the way String.equalsIgnoreCase compares them: upper case first, then lower case. */
    private static String foldCase(String text) {
        char[] chars = text.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = Character.toLowerCase(Character.toUpperCase(chars[i]));
        }
        return new String(chars);
    }

    /**
     * Returns a LinkedHashMap holding the entries of the given map ordered by
     * value, in non-ascending order. Entries with equal values keep the relative
     * order in which the given map iterates them. The given map is not modified.
     *
     * @param possibleMatches map from a file-pair key to its match count
     * @return a new map with the same entries, highest count first; empty when
     *         {@code possibleMatches} is {@code null}
     */
    protected static LinkedHashMap<String, Integer> sortResults(Map<String, Integer> possibleMatches) {
        LinkedHashMap<String, Integer> sorted = new LinkedHashMap<String, Integer>();
        if (possibleMatches == null) {
            return sorted;
        }
        List<Map.Entry<String, Integer>> entries =
                new ArrayList<Map.Entry<String, Integer>>(possibleMatches.entrySet());
        // Collections.sort is stable, so ties stay in the source map's iteration order.
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
                return Integer.compare(right.getValue(), left.getValue());
            }
        });
        for (Map.Entry<String, Integer> entry : entries) {
            sorted.put(entry.getKey(), entry.getValue());
        }
        return sorted;
    }

    /*
     * This method is here to help you measure the execution time and get the output of the program.
     * You do not need to consider it for improving the efficiency of the detectPlagiarism method.
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Please specify the name of the directory containing the corpus.");
            System.exit(0);
        }
        String directory = args[0];
        long start = System.currentTimeMillis();
        Map<String, Integer> map = PlagiarismDetector.detectPlagiarism(directory, 4, 5);
        long end = System.currentTimeMillis();
        double timeInSeconds = (end - start) / 1000.0;
        System.out.println("Execution time (wall clock): " + timeInSeconds + " seconds");
        if (map == null) {
            // detectPlagiarism returns null when a file in the corpus could not be processed.
            System.err.println("Unable to process the corpus in " + directory);
            return;
        }
        Set<Map.Entry<String, Integer>> entries = map.entrySet();
        for (Map.Entry<String, Integer> entry : entries) {
            System.out.println(entry.getKey() + ": " + entry.getValue());
        }
    }
}