// Source repository: yigitcolakoglu/PennX_SDx2



								import java.io.File;
								import java.util.ArrayList;
								import java.util.Collections;
								import java.util.Comparator;
								import java.util.HashMap;
								import java.util.HashSet;
								import java.util.LinkedHashMap;
								import java.util.LinkedList;
								import java.util.List;
								import java.util.Map;
								import java.util.Scanner;
								import java.util.Set;
								import java.util.regex.Pattern;


								/*

								 * SD2x Homework #11

								 * Improve the efficiency of the code below according to the guidelines in the assignment description.

								 * Please be sure not to change the signature of the detectPlagiarism method!

								 * However, you may modify the signatures of any of the other methods as needed.

								 */


								/**
								 * Detects likely plagiarism inside a directory of text files by counting the
								 * word phrases (sliding windows of consecutive words) that pairs of files share.
								 */
								public class PlagiarismDetector {

									/** Matches every character that is not an ASCII letter; compiled once instead of once per word. */
									private static final Pattern NON_LETTERS = Pattern.compile("[^a-zA-Z]");

									/**
									 * Compares every pair of files in a directory and reports the pairs that share
									 * more phrases than the given threshold.
									 *
									 * @param dirName    path of the directory containing the corpus
									 * @param windowSize number of consecutive words that make up one phrase
									 * @param threshold  a pair is reported only when its match count is strictly greater than this
									 * @return a map from "file1-file2" to the number of shared phrases, ordered by
									 *         non-ascending count; {@code null} when the directory cannot be listed or
									 *         when a file taking part in a comparison could not be turned into phrases
									 */
									public static Map<String, Integer> detectPlagiarism(String dirName, int windowSize, int threshold) {
										File dirFile = new File(dirName);
										String[] files = dirFile.list();
										if (files == null) {
											// File.list() yields null when the path is not a directory or cannot be read.
											return null;
										}

										// Read and tokenize every file exactly once; the pair loop below only reuses the cached sets.
										Map<String, Set<String>> filePhrases = new HashMap<String, Set<String>>();
										for (int i = 0; i < files.length; i++) {
											filePhrases.put(files[i], createPhrases(dirName + "/" + files[i], windowSize));
										}

										Map<String, Integer> numberOfMatches = new HashMap<String, Integer>();
										for (int i = 0; i < files.length; i++) {
											String file1 = files[i];
											Set<String> file1Phrases = filePhrases.get(file1);

											// Starting at i + 1 visits each unordered pair once and never pairs a file with itself,
											// so no "reversed key" or "same file" check is needed.
											for (int j = i + 1; j < files.length; j++) {
												String file2 = files[j];
												Set<String> file2Phrases = filePhrases.get(file2);

												if (file1Phrases == null || file2Phrases == null) {
													return null;
												}

												int matches = findMatches(file1Phrases, file2Phrases);
												if (matches > threshold) {
													numberOfMatches.put(file1 + "-" + file2, matches);
												}
											}
										}

										return sortResults(numberOfMatches);
									}

									/**
									 * Reads the given file and converts it into a list of words.
									 * Every whitespace-separated token has all characters other than ASCII letters
									 * removed and is then converted to uppercase. A token that contains no letters
									 * is kept as an empty string.
									 *
									 * @param filename path of the file to read
									 * @return the words in file order, or {@code null} when {@code filename} is
									 *         {@code null} or the file cannot be read (the stack trace is printed)
									 */
									protected static List<String> readFile(String filename) {
										if (filename == null) {
											return null;
										}

										// ArrayList rather than LinkedList: createPhrases reads the words by index.
										List<String> words = new ArrayList<String>();
										Scanner in = null;

										try {
											in = new Scanner(new File(filename));
											while (in.hasNext()) {
												words.add(NON_LETTERS.matcher(in.next()).replaceAll("").toUpperCase());
											}
										}
										catch (Exception e) {
											e.printStackTrace();
											return null;
										}
										finally {
											// Release the file handle on both the success and the failure path.
											if (in != null) {
												in.close();
											}
										}

										return words;
									}

									/**
									 * Reads a file and converts it into the set of distinct phrases it contains.
									 * A phrase is {@code window} consecutive words, each followed by a single space.
									 *
									 * @param filename path of the file to read
									 * @param window   number of words per phrase; must be at least 1
									 * @return the distinct phrases (empty when the file has fewer than {@code window}
									 *         words), or {@code null} when an argument is invalid or the file cannot be read
									 */
									protected static Set<String> createPhrases(String filename, int window) {
										if (filename == null || window < 1) {
											return null;
										}

										List<String> words = readFile(filename);
										if (words == null) {
											// Unreadable file: report it the same way as invalid arguments instead of failing on words.size().
											return null;
										}

										Set<String> phrases = new HashSet<String>();

										for (int i = 0; i + window <= words.size(); i++) {
											StringBuilder phrase = new StringBuilder();
											for (int j = 0; j < window; j++) {
												phrase.append(words.get(i + j)).append(' ');
											}
											phrases.add(phrase.toString());
										}

										return phrases;
									}

									/**
									 * Counts the phrase pairs, one from each set, that are equal when letter case
									 * is ignored.
									 *
									 * @param myPhrases   phrases of the first text
									 * @param yourPhrases phrases of the second text
									 * @return the number of case-insensitively matching pairs; 0 when either set is {@code null}
									 */
									protected static int findMatches(Set<String> myPhrases, Set<String> yourPhrases) {
										if (myPhrases == null || yourPhrases == null) {
											return 0;
										}

										// Pick the two roles from one comparison so that equally sized sets
										// can never both resolve to the same set.
										Set<String> smallText = myPhrases;
										Set<String> bigText = yourPhrases;
										if (myPhrases.size() > yourPhrases.size()) {
											smallText = yourPhrases;
											bigText = myPhrases;
										}

										// Index the smaller set by case-folded phrase. A count (not just presence) is kept
										// because one set may hold several spellings that differ only in case.
										Map<String, Integer> foldedCounts = new HashMap<String, Integer>();
										for (String phrase : smallText) {
											if (phrase == null) {
												continue;
											}
											String key = foldCase(phrase);
											Integer seen = foldedCounts.get(key);
											foldedCounts.put(key, seen == null ? 1 : seen + 1);
										}

										int matches = 0;
										for (String phrase : bigText) {
											if (phrase == null) {
												continue;
											}
											Integer count = foldedCounts.get(foldCase(phrase));
											if (count != null) {
												matches += count;
											}
										}

										return matches;
									}

									/**
									 * Maps every code point to lowercase(uppercase(codePoint)), so that two strings
									 * that differ only in letter case produce the same key. This mirrors the
									 * upper-then-lower comparison used for case-insensitive string equality.
									 */
									private static String foldCase(String text) {
										StringBuilder folded = new StringBuilder(text.length());
										for (int i = 0; i < text.length(); ) {
											int codePoint = text.codePointAt(i);
											folded.appendCodePoint(Character.toLowerCase(Character.toUpperCase(codePoint)));
											i += Character.charCount(codePoint);
										}
										return folded.toString();
									}

									/**
									 * Returns a LinkedHashMap holding the entries of the given map ordered by value,
									 * in non-ascending order. The given map is not modified. Entries with equal
									 * values keep the relative order in which the given map iterates them, because
									 * the sort is stable.
									 *
									 * @param possibleMatches map from file pair to match count
									 * @return a new map with the same entries, highest count first
									 */
									protected static LinkedHashMap<String, Integer> sortResults(Map<String, Integer> possibleMatches) {
										List<Map.Entry<String, Integer>> entries =
												new ArrayList<Map.Entry<String, Integer>>(possibleMatches.entrySet());

										Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
											@Override
											public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
												// Reversed operands give descending order without subtracting ints.
												return right.getValue().compareTo(left.getValue());
											}
										});

										LinkedHashMap<String, Integer> sorted = new LinkedHashMap<String, Integer>();
										for (Map.Entry<String, Integer> entry : entries) {
											sorted.put(entry.getKey(), entry.getValue());
										}

										return sorted;
									}

									/**
									 * Runs the detector on the directory named by the first argument with a window
									 * of 4 words and a threshold of 5, then prints the wall-clock time and the
									 * reported pairs.
									 */
									public static void main(String[] args) {
										if (args.length == 0) {
											System.out.println("Please specify the name of the directory containing the corpus.");
											System.exit(0);
										}
										String directory = args[0];
										long start = System.currentTimeMillis();
										Map<String, Integer> map = PlagiarismDetector.detectPlagiarism(directory, 4, 5);
										long end = System.currentTimeMillis();
										double timeInSeconds = (end - start) / (double)1000;
										System.out.println("Execution time (wall clock): " + timeInSeconds + " seconds");
										if (map == null) {
											// detectPlagiarism signals an unreadable directory or file with null.
											System.err.println("Could not process the corpus in directory: " + directory);
											return;
										}
										for (Map.Entry<String, Integer> entry : map.entrySet()) {
											System.out.println(entry.getKey() + ": " + entry.getValue());
										}
									}

								}