import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;

/*
 * SD2x Homework #11
 * Improve the efficiency of the code below according to the guidelines in the assignment description.
 * Please be sure not to change the signature of the detectPlagiarism method!
 * However, you may modify the signatures of any of the other methods as needed.
 */

public class PlagiarismDetector {

    /** Matches every character that is not an ASCII letter; compiled once and reused for each token. */
    private static final Pattern NON_LETTERS = Pattern.compile("[^a-zA-Z]");

    /**
     * Compares every pair of entries in a directory and reports the pairs that share
     * more than {@code threshold} phrases of {@code windowSize} consecutive words.
     *
     * @param dirName    path of the directory holding the corpus
     * @param windowSize number of consecutive words that make up one phrase
     * @param threshold  a pair is reported only when its match count is strictly greater than this
     * @return a map from {@code "file1-file2"} to the number of shared phrases, ordered from the
     *         highest count to the lowest; {@code null} when the directory cannot be listed or
     *         when a compared entry yields no phrase set (unreadable entry or {@code windowSize < 1})
     */
    public static Map<String, Integer> detectPlagiarism(String dirName, int windowSize, int threshold) {
        if (dirName == null) {
            return null;
        }

        String[] files = new File(dirName).list();
        if (files == null) {
            // File.list() returns null when the path is not a directory or cannot be read.
            return null;
        }

        // Build each file's phrase set exactly once, indexed in parallel with the files array.
        List<Set<String>> phraseSets = new ArrayList<Set<String>>(files.length);
        for (int i = 0; i < files.length; i++) {
            phraseSets.add(createPhrases(dirName + "/" + files[i], windowSize));
        }

        Map<String, Integer> numberOfMatches = new HashMap<String, Integer>();

        for (int i = 0; i < files.length; i++) {
            Set<String> file1Phrases = phraseSets.get(i);

            // Starting at i + 1 visits each unordered pair once and never pairs a file with itself.
            for (int j = i + 1; j < files.length; j++) {
                Set<String> file2Phrases = phraseSets.get(j);

                if (file1Phrases == null || file2Phrases == null) {
                    return null;
                }

                int matches = findMatches(file1Phrases, file2Phrases);
                if (matches > threshold) {
                    numberOfMatches.put(files[i] + "-" + files[j], matches);
                }
            }
        }

        return sortResults(numberOfMatches);
    }

    /**
     * Reads the given file and converts it into a list of words. Each whitespace-separated
     * token has every non-letter character removed and is converted to upper case; a token
     * without any letters is kept as an empty string.
     *
     * @param filename path of the file to read
     * @return the words in file order, or {@code null} when {@code filename} is {@code null}
     *         or the file cannot be opened
     */
    protected static List<String> readFile(String filename) {
        if (filename == null) {
            return null;
        }

        // ArrayList gives constant-time indexed access for the sliding window in createPhrases.
        List<String> words = new ArrayList<String>();

        Scanner in = null;
        try {
            in = new Scanner(new File(filename));
            while (in.hasNext()) {
                String lettersOnly = NON_LETTERS.matcher(in.next()).replaceAll("");
                // Locale.ROOT keeps the result independent of the platform's default locale.
                words.add(lettersOnly.toUpperCase(Locale.ROOT));
            }
        } catch (FileNotFoundException e) {
            System.err.println("Could not read file " + filename + ": " + e.getMessage());
            return null;
        } finally {
            if (in != null) {
                in.close();
            }
        }

        return words;
    }

    /**
     * Reads a file and converts it into the set of distinct phrases it contains, each made of
     * {@code window} consecutive words. Every word in a phrase is followed by a single space,
     * so a phrase ends with a trailing space.
     *
     * @param filename path of the file to read
     * @param window   number of words per phrase; must be at least 1
     * @return the distinct phrases (empty when the file has fewer than {@code window} words),
     *         or {@code null} when the arguments are invalid or the file cannot be read
     */
    protected static Set<String> createPhrases(String filename, int window) {
        if (filename == null || window < 1) {
            return null;
        }

        List<String> words = readFile(filename);
        if (words == null) {
            return null;
        }

        Set<String> phrases = new HashSet<String>();

        int lastStart = words.size() - window;
        for (int start = 0; start <= lastStart; start++) {
            StringBuilder phrase = new StringBuilder();
            for (int offset = 0; offset < window; offset++) {
                phrase.append(words.get(start + offset)).append(' ');
            }
            phrases.add(phrase.toString());
        }

        return phrases;
    }

    /**
     * Counts the pairs (one phrase from each set) that are equal when case is ignored.
     *
     * @param myPhrases   first set of phrases
     * @param yourPhrases second set of phrases
     * @return the number of case-insensitive matches; 0 when either set is {@code null}
     */
    protected static int findMatches(Set<String> myPhrases, Set<String> yourPhrases) {
        if (myPhrases == null || yourPhrases == null) {
            return 0;
        }

        // Index the smaller set to keep the lookup table small; the count is symmetric.
        Set<String> indexed = myPhrases.size() <= yourPhrases.size() ? myPhrases : yourPhrases;
        Set<String> probed = (indexed == myPhrases) ? yourPhrases : myPhrases;

        // Several phrases of one set may fold to the same key, so keep a count per key.
        Map<String, Integer> foldedCounts = new HashMap<String, Integer>();
        for (String phrase : indexed) {
            if (phrase == null) {
                continue;
            }
            String key = foldCase(phrase);
            Integer seen = foldedCounts.get(key);
            foldedCounts.put(key, seen == null ? 1 : seen + 1);
        }

        int matches = 0;
        for (String phrase : probed) {
            if (phrase == null) {
                continue;
            }
            Integer hits = foldedCounts.get(foldCase(phrase));
            if (hits != null) {
                matches += hits;
            }
        }

        return matches;
    }

    /**
     * Folds a string to a canonical case, one char at a time (upper case, then lower case),
     * so that two strings intended to be equal ignoring case produce the same key.
     */
    private static String foldCase(String text) {
        char[] chars = text.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = Character.toLowerCase(Character.toUpperCase(chars[i]));
        }
        return new String(chars);
    }

    /**
     * Returns a LinkedHashMap holding the entries of the given map ordered by value,
     * from highest to lowest. The given map is not modified.
     *
     * @param possibleMatches the entries to order
     * @return a new map in non-ascending value order; empty when the argument is {@code null}
     */
    protected static LinkedHashMap<String, Integer> sortResults(Map<String, Integer> possibleMatches) {
        LinkedHashMap<String, Integer> sorted = new LinkedHashMap<String, Integer>();
        if (possibleMatches == null) {
            return sorted;
        }

        List<Map.Entry<String, Integer>> entries =
                new ArrayList<Map.Entry<String, Integer>>(possibleMatches.entrySet());

        // Collections.sort is stable, so entries with equal values keep the map's iteration order.
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
                return right.getValue().compareTo(left.getValue());
            }
        });

        for (Map.Entry<String, Integer> entry : entries) {
            sorted.put(entry.getKey(), entry.getValue());
        }

        return sorted;
    }

    /*
     * This method is here to help you measure the execution time and get the output of the program.
     * You do not need to consider it for improving the efficiency of the detectPlagiarism method.
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Please specify the name of the directory containing the corpus.");
            System.exit(0);
        }
        String directory = args[0];
        long start = System.currentTimeMillis();
        Map<String, Integer> map = PlagiarismDetector.detectPlagiarism(directory, 4, 5);
        long end = System.currentTimeMillis();
        double timeInSeconds = (end - start) / (double)1000;
        System.out.println("Execution time (wall clock): " + timeInSeconds + " seconds");
        if (map == null) {
            // detectPlagiarism signals an unusable directory or unreadable entry with null.
            System.err.println("Could not analyze directory: " + directory);
            return;
        }
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + ": " + entry.getValue());
        }
    }

}