import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;
/*
 * SD2x Homework #11
 * Improve the efficiency of the code below according to the guidelines in the assignment description.
 * Please be sure not to change the signature of the detectPlagiarism method!
 * However, you may modify the signatures of any of the other methods as needed.
 */
public class PlagiarismDetector {

    /**
     * Matches every character that is not an ASCII letter. Compiled once because
     * {@link #readFile(String)} applies it to every token of every file.
     */
    private static final Pattern NON_LETTERS = Pattern.compile("[^a-zA-Z]");

    /**
     * Compares every pair of files in a directory and reports the pairs that share
     * more than {@code threshold} phrases of {@code windowSize} consecutive words.
     *
     * @param dirName    path of the directory containing the corpus
     * @param windowSize number of consecutive words that make up one phrase
     * @param threshold  a pair is reported only when its number of shared phrases is
     *                   strictly greater than this value
     * @return a map from {@code "file1-file2"} to the number of shared phrases, ordered
     *         from most to fewest matches; {@code null} when {@code dirName} is
     *         {@code null} or cannot be listed as a directory, or when a pair is
     *         compared and the phrases of either file could not be built
     */
    public static Map<String, Integer> detectPlagiarism(String dirName, int windowSize, int threshold) {
        if (dirName == null) {
            return null;
        }
        File dirFile = new File(dirName);
        String[] files = dirFile.list();
        if (files == null) {
            // File.list() yields null when the path is not a directory or an I/O error occurs.
            return null;
        }

        // Read and tokenize every file exactly once, index-aligned with files[].
        List<Set<String>> phrasesByFile = new ArrayList<Set<String>>(files.length);
        for (int i = 0; i < files.length; i++) {
            phrasesByFile.add(createPhrases(new File(dirFile, files[i]).getPath(), windowSize));
        }

        Map<String, Integer> numberOfMatches = new HashMap<String, Integer>();
        for (int i = 0; i < files.length; i++) {
            Set<String> file1Phrases = phrasesByFile.get(i);
            // Starting at i + 1 visits each unordered pair once and never pairs a file with itself.
            for (int j = i + 1; j < files.length; j++) {
                Set<String> file2Phrases = phrasesByFile.get(j);
                if (file1Phrases == null || file2Phrases == null) {
                    return null;
                }
                // Phrases produced by createPhrases are already upper-case, so an exact
                // comparison is enough here and avoids re-normalizing the sets per pair.
                int matches = countShared(file1Phrases, file2Phrases);
                if (matches > threshold) {
                    numberOfMatches.put(files[i] + "-" + files[j], matches);
                }
            }
        }
        return sortResults(numberOfMatches);
    }

    /**
     * Reads the given file and converts it into a list of words, in file order.
     * Every whitespace-separated token has all characters other than ASCII letters
     * removed and is converted to upper case; a token without any letters therefore
     * becomes an empty string and is still part of the list.
     *
     * @param filename path of the file to read
     * @return the words of the file, or {@code null} if {@code filename} is
     *         {@code null} or the file could not be read
     */
    protected static List<String> readFile(String filename) {
        if (filename == null) {
            return null;
        }
        // ArrayList gives createPhrases constant-time access by index.
        List<String> words = new ArrayList<String>();
        Scanner in = null;
        try {
            in = new Scanner(new File(filename));
            while (in.hasNext()) {
                String lettersOnly = NON_LETTERS.matcher(in.next()).replaceAll("");
                // Locale.ROOT keeps the result independent of the machine's default locale.
                words.add(lettersOnly.toUpperCase(Locale.ROOT));
            }
        } catch (Exception e) {
            // Any failure to open or read the file is reported to the caller as null.
            return null;
        } finally {
            if (in != null) {
                in.close();
            }
        }
        return words;
    }

    /**
     * Reads a file and converts it into the set of distinct phrases it contains.
     * A phrase is {@code window} consecutive words, each followed by a single space
     * (so every phrase ends with a space).
     *
     * @param filename path of the file to read
     * @param window   number of words per phrase; must be at least 1
     * @return the distinct phrases (empty when the file has fewer than {@code window}
     *         words), or {@code null} if {@code filename} is {@code null},
     *         {@code window} is less than 1, or the file could not be read
     */
    protected static Set<String> createPhrases(String filename, int window) {
        if (filename == null || window < 1) {
            return null;
        }
        List<String> words = readFile(filename);
        if (words == null) {
            return null;
        }
        Set<String> phrases = new HashSet<String>();
        int lastStart = words.size() - window;
        StringBuilder phrase = new StringBuilder();
        for (int i = 0; i <= lastStart; i++) {
            phrase.setLength(0);
            for (int j = 0; j < window; j++) {
                phrase.append(words.get(i + j)).append(' ');
            }
            phrases.add(phrase.toString());
        }
        return phrases;
    }

    /**
     * Returns the number of distinct phrases that occur in both sets, ignoring case.
     * Case is ignored by upper-casing every phrase with {@link Locale#ROOT} before
     * comparing.
     *
     * @param myPhrases   first set of phrases
     * @param yourPhrases second set of phrases
     * @return the number of shared phrases; 0 if either argument is {@code null}
     */
    protected static int findMatches(Set<String> myPhrases, Set<String> yourPhrases) {
        if (myPhrases == null || yourPhrases == null) {
            return 0;
        }
        return countShared(toUpperCaseSet(myPhrases), toUpperCaseSet(yourPhrases));
    }

    /** Counts the elements of the smaller set that are also contained in the larger one (exact match). */
    private static int countShared(Set<String> first, Set<String> second) {
        Set<String> smaller = first.size() <= second.size() ? first : second;
        Set<String> larger = smaller == first ? second : first;
        int shared = 0;
        for (String phrase : smaller) {
            if (larger.contains(phrase)) {
                shared++;
            }
        }
        return shared;
    }

    /** Returns a new set holding the upper-case form of every non-null phrase. */
    private static Set<String> toUpperCaseSet(Set<String> phrases) {
        Set<String> upper = new HashSet<String>(phrases.size() * 2);
        for (String phrase : phrases) {
            if (phrase != null) {
                upper.add(phrase.toUpperCase(Locale.ROOT));
            }
        }
        return upper;
    }

    /**
     * Returns a LinkedHashMap in which the elements of the Map parameter are ordered
     * by their Integer value, in non-ascending order. Entries with equal values keep
     * the relative order in which the parameter iterates over them. The parameter
     * itself is not modified.
     *
     * @param possibleMatches the entries to sort
     * @return a new map with the same entries, largest value first; empty if
     *         {@code possibleMatches} is {@code null}
     */
    protected static LinkedHashMap<String, Integer> sortResults(Map<String, Integer> possibleMatches) {
        LinkedHashMap<String, Integer> sorted = new LinkedHashMap<String, Integer>();
        if (possibleMatches == null) {
            return sorted;
        }
        List<Map.Entry<String, Integer>> entries =
                new ArrayList<Map.Entry<String, Integer>>(possibleMatches.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
                // Reversed operands: larger values sort first.
                return right.getValue().compareTo(left.getValue());
            }
        });
        for (Map.Entry<String, Integer> entry : entries) {
            sorted.put(entry.getKey(), entry.getValue());
        }
        return sorted;
    }

    /**
     * Runs the detector on the directory named by the first command-line argument
     * with a window of 4 words and a threshold of 5, then prints the wall-clock
     * execution time followed by one line per reported pair.
     *
     * @param args command-line arguments; {@code args[0]} is the corpus directory
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Please specify the name of the directory containing the corpus.");
            return;
        }
        String directory = args[0];
        long start = System.currentTimeMillis();
        Map<String, Integer> map = PlagiarismDetector.detectPlagiarism(directory, 4, 5);
        long end = System.currentTimeMillis();
        double timeInSeconds = (end - start) / (double) 1000;
        System.out.println("Execution time (wall clock): " + timeInSeconds + " seconds");
        if (map == null) {
            System.out.println("The corpus could not be read: " + directory);
            return;
        }
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + ": " + entry.getValue());
        }
    }
}