This repository is a personal archive of my solutions to the edX course *Data Structures and Software Design* from PennX.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

189 lines
5.5 KiB

  import java.io.File;
  import java.io.FileNotFoundException;
  import java.util.ArrayList;
  import java.util.Collections;
  import java.util.Comparator;
  import java.util.HashMap;
  import java.util.HashSet;
  import java.util.LinkedHashMap;
  import java.util.LinkedList;
  import java.util.List;
  import java.util.Locale;
  import java.util.Map;
  import java.util.Scanner;
  import java.util.Set;
  import java.util.regex.Pattern;
  11. /*
  12. * SD2x Homework #11
  13. * Improve the efficiency of the code below according to the guidelines in the assignment description.
  14. * Please be sure not to change the signature of the detectPlagiarism method!
  15. * However, you may modify the signatures of any of the other methods as needed.
  16. */
  /**
   * Compares every pair of files in a directory and reports the pairs that share
   * more than a given number of word phrases.
   *
   * <p>A "phrase" is a run of {@code windowSize} consecutive words of a file, with
   * every non-letter character removed and all letters upper-cased.
   */
  public class PlagiarismDetector {

      /** Matches every character that is not an ASCII letter; compiled once because it is applied to every token. */
      private static final Pattern NON_LETTERS = Pattern.compile("[^a-zA-Z]");

      /** Orders result entries by their match count, largest first. */
      private static final Comparator<Map.Entry<String, Integer>> BY_MATCHES_DESCENDING =
              new Comparator<Map.Entry<String, Integer>>() {
                  @Override
                  public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
                      return right.getValue().compareTo(left.getValue());
                  }
              };

      /**
       * Counts the phrases shared by every pair of regular files in a directory.
       *
       * @param dirName    path of the directory that holds the corpus
       * @param windowSize number of consecutive words that make up one phrase
       * @param threshold  a pair is reported only when its match count is strictly greater than this
       * @return a map from {@code "file1-file2"} to the number of matching phrases, ordered by
       *         non-ascending match count; {@code null} when {@code dirName} is null or cannot be
       *         listed as a directory, or when the phrases of a compared file could not be built
       *         (unreadable file, or {@code windowSize < 1})
       */
      public static Map<String, Integer> detectPlagiarism(String dirName, int windowSize, int threshold) {
          if (dirName == null) {
              return null;
          }
          File dirFile = new File(dirName);
          String[] entries = dirFile.list();
          if (entries == null) {
              // File.list() returns null when the path is not a directory or an I/O error occurs.
              return null;
          }

          // Only regular files are scanned; a sub-directory cannot be opened as a text file.
          List<String> files = new ArrayList<String>();
          Map<String, Set<String>> filePhrases = new HashMap<String, Set<String>>();
          for (String name : entries) {
              File candidate = new File(dirFile, name);
              if (candidate.isFile()) {
                  files.add(name);
                  filePhrases.put(name, createPhrases(candidate.getPath(), windowSize));
              }
          }

          Map<String, Integer> numberOfMatches = new HashMap<String, Integer>();
          for (int i = 0; i < files.size(); i++) {
              String file1 = files.get(i);
              Set<String> file1Phrases = filePhrases.get(file1);
              // Starting at i + 1 visits each unordered pair exactly once and never pairs a file with itself.
              for (int j = i + 1; j < files.size(); j++) {
                  String file2 = files.get(j);
                  Set<String> file2Phrases = filePhrases.get(file2);
                  if (file1Phrases == null || file2Phrases == null) {
                      return null;
                  }
                  int matches = findMatches(file1Phrases, file2Phrases);
                  if (matches > threshold) {
                      numberOfMatches.put(file1 + "-" + file2, matches);
                  }
              }
          }
          return sortResults(numberOfMatches);
      }

      /**
       * Reads a file into a list of whitespace-separated tokens, each with every
       * non-letter character removed and the remaining letters upper-cased.
       *
       * <p>A token that contains no letters at all is kept as an empty string, so the
       * list has exactly one element per token of the file.
       *
       * @param filename path of the file to read
       * @return the normalized tokens in file order, or {@code null} when {@code filename}
       *         is null or the file cannot be opened
       */
      protected static List<String> readFile(String filename) {
          if (filename == null) {
              return null;
          }
          Scanner in;
          try {
              in = new Scanner(new File(filename));
          } catch (FileNotFoundException e) {
              System.err.println("Unable to read " + filename + ": " + e.getMessage());
              return null;
          }
          // ArrayList gives constant-time indexed access to the callers that build phrases.
          List<String> words = new ArrayList<String>();
          try {
              while (in.hasNext()) {
                  String letters = NON_LETTERS.matcher(in.next()).replaceAll("");
                  // Locale.ROOT keeps the casing identical regardless of the platform's default locale.
                  words.add(letters.toUpperCase(Locale.ROOT));
              }
          } finally {
              in.close();
          }
          return words;
      }

      /**
       * Reads a file and converts it into the set of its distinct phrases.
       *
       * <p>Each phrase consists of {@code window} consecutive words, and every word is
       * followed by a single space (so a phrase always ends with a space).
       *
       * @param filename path of the file to read
       * @param window   number of words per phrase
       * @return the distinct phrases (empty when the file has fewer than {@code window} words),
       *         or {@code null} when {@code filename} is null, {@code window < 1}, or the file
       *         cannot be read
       */
      protected static Set<String> createPhrases(String filename, int window) {
          if (filename == null || window < 1) {
              return null;
          }
          List<String> words = readFile(filename);
          if (words == null) {
              return null;
          }
          Set<String> phrases = new HashSet<String>();
          int lastStart = words.size() - window;
          for (int start = 0; start <= lastStart; start++) {
              StringBuilder phrase = new StringBuilder();
              for (String word : words.subList(start, start + window)) {
                  phrase.append(word).append(' ');
              }
              phrases.add(phrase.toString());
          }
          return phrases;
      }

      /**
       * Counts the pairs of phrases, one from each set, that are equal ignoring case.
       *
       * <p>The smaller set is indexed by its case-folded phrases, so the count is obtained
       * in time linear in the total number of phrases instead of comparing every pair.
       *
       * @param myPhrases   phrases of the first text
       * @param yourPhrases phrases of the second text
       * @return the number of case-insensitive matches; 0 when either set is {@code null}
       */
      protected static int findMatches(Set<String> myPhrases, Set<String> yourPhrases) {
          if (myPhrases == null || yourPhrases == null) {
              return 0;
          }
          boolean mineIsSmaller = myPhrases.size() <= yourPhrases.size();
          Set<String> smallText = mineIsSmaller ? myPhrases : yourPhrases;
          Set<String> bigText = mineIsSmaller ? yourPhrases : myPhrases;

          // Count how many phrases of the smaller set share each case-folded form.
          Map<String, Integer> foldedCounts = new HashMap<String, Integer>();
          for (String phrase : smallText) {
              if (phrase == null) {
                  continue; // a null phrase can never match anything
              }
              String key = foldCase(phrase);
              Integer seen = foldedCounts.get(key);
              foldedCounts.put(key, seen == null ? 1 : seen + 1);
          }

          int matches = 0;
          for (String phrase : bigText) {
              if (phrase == null) {
                  continue;
              }
              Integer count = foldedCounts.get(foldCase(phrase));
              if (count != null) {
                  matches += count;
              }
          }
          return matches;
      }

      /**
       * Folds each character with {@code toLowerCase(toUpperCase(c))}, the per-character
       * rule documented for {@link String#equalsIgnoreCase(String)}, so that two strings
       * fold to the same key when that method would consider them equal.
       */
      private static String foldCase(String text) {
          char[] chars = text.toCharArray();
          for (int i = 0; i < chars.length; i++) {
              chars[i] = Character.toLowerCase(Character.toUpperCase(chars[i]));
          }
          return new String(chars);
      }

      /**
       * Returns a LinkedHashMap holding the entries of the given map ordered by value,
       * in non-ascending order. The given map is not modified.
       *
       * <p>Entries with equal values keep the relative order in which the given map
       * iterates over them. Every entry is retained, including those whose value is
       * zero or negative.
       *
       * @param possibleMatches the match counts to order
       * @return a new map with the same entries, largest value first
       */
      protected static LinkedHashMap<String, Integer> sortResults(Map<String, Integer> possibleMatches) {
          List<Map.Entry<String, Integer>> entries =
                  new ArrayList<Map.Entry<String, Integer>>(possibleMatches.entrySet());
          Collections.sort(entries, BY_MATCHES_DESCENDING);

          LinkedHashMap<String, Integer> sorted = new LinkedHashMap<String, Integer>();
          for (Map.Entry<String, Integer> entry : entries) {
              sorted.put(entry.getKey(), entry.getValue());
          }
          return sorted;
      }

      /**
       * Runs the detector on the directory named by the first argument with a window of 4
       * and a threshold of 5, then prints the wall-clock execution time and the results.
       */
      public static void main(String[] args) {
          if (args.length == 0) {
              System.out.println("Please specify the name of the directory containing the corpus.");
              System.exit(0);
          }
          String directory = args[0];
          long start = System.currentTimeMillis();
          Map<String, Integer> map = PlagiarismDetector.detectPlagiarism(directory, 4, 5);
          long end = System.currentTimeMillis();
          double timeInSeconds = (end - start) / (double) 1000;
          System.out.println("Execution time (wall clock): " + timeInSeconds + " seconds");
          if (map == null) {
              // detectPlagiarism signals an unusable directory or unreadable file with null.
              System.out.println("Could not analyze the corpus in directory: " + directory);
              return;
          }
          for (Map.Entry<String, Integer> entry : map.entrySet()) {
              System.out.println(entry.getKey() + ": " + entry.getValue());
          }
      }
  }