Kulyutmaz is a project developed for the programming field of the regional TUBITAK competition that aims to create a more advanced phishing e-mail detection algorithm using website content checking and a neural network that we have trained.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

114 lines
4.8 KiB

4 years ago
  1. import re
  2. import requests
  3. import math
  4. import jsbeautifier
  5. testUrl = "https://apple.com/metrics/target/scripts/1.0/at.js"
  6. js_keywords = ["break", "case", "catch", "continue", "debugger", "default", "delete", "do", "else", "finally", "for", "function", "if", "in", "instanceof", "new", "return", "switch", "this", "throw", "try", "typeof", "var", "void", "while", "with"]
  7. def parseJavascript(context, isUrl):
  8. if isUrl:
  9. try:
  10. r = requests.get(context, allow_redirects=False)
  11. except:
  12. print("Unexpected Error")
  13. return (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
  14. temp_code = r.content.decode("ISO-8859-1")
  15. else:
  16. temp_code = context
  17. js_code = jsbeautifier.beautify(temp_code)
  18. noOfEvalFunc = len(re.findall("eval\(", js_code))
  19. noOfSetTimeOutFunc = len(re.findall("setTimeout\(", js_code))
  20. noOfiframe = len(re.findall("iframe", js_code))
  21. noOfUnescapeFunc = len(re.findall("unescape\(", js_code))
  22. noOfEscapeFunc = len(re.findall("escape\(", js_code))
  23. noOfClassid = len(re.findall("classid", js_code))
  24. noOfParseIntFunc = len(re.findall("parseInt\(", js_code))
  25. noOfFromCharCodeFunc = len(re.findall("fromCharCode\(", js_code))
  26. noOfActiveXObjectFunc = len(re.findall("ActiveXObject\(", js_code))
  27. noOfStringAssigments = len(re.findall(r'"((?<=\\)"|([^"]))"', js_code))
  28. noOfConcatFunc = len(re.findall("concat\(", js_code))
  29. noOfIndexOfFunc = len(re.findall("indexOf\(", js_code))
  30. noOfSubstringFunc = len(re.findall("substring\(", js_code))
  31. noOfReplaceFunc = len(re.findall("replace\(", js_code))
  32. noOfEventListenerFunc = len(re.findall("document.addEventListener\(", js_code))
  33. noOfAttachEventFunc = len(re.findall("attachEvent\(", js_code))
  34. noOfCreateElementFunc = len(re.findall("createElement\(", js_code))
  35. noOfGetElementByIdFunc = len(re.findall("getElementById\(", js_code))
  36. noOfDocumentWriteFunc = len(re.findall("document.write\(", js_code))
  37. noOfWords = len(re.findall(r'\w+', js_code))
  38. noOfKeyWords = findKeyWords(js_code)
  39. noOfCharacters = len(js_code)
  40. try:
  41. ratioOfKeywordsAndWords = noOfKeyWords / noOfWords
  42. except ZeroDivisionError:
  43. ratioOfKeywordsAndWords = 0
  44. entropyOfJS = Entropy(js_code)
  45. longestWord = len(findTheLongestWord(js_code))
  46. noOfLongStirngs = len(re.findall(r'"((?<=\\)"|([^"])){200,}"', js_code))
  47. shortestWord = len(findTheShortestWord(js_code))
  48. entropyOfLongestWord = Entropy(findTheLongestWord(js_code))
  49. noOfBlankSpaces = len(re.findall(" ", js_code))
  50. try:
  51. avgLenOfWords = noOfCharacters / noOfWords
  52. except ZeroDivisionError:
  53. avgLenOfWords = 0
  54. noOfHexValues = findHexNumbers(js_code)
  55. try:
  56. shareOfSpaceChar = len(re.findall(" ", js_code)) / noOfCharacters
  57. except ZeroDivisionError:
  58. shareOfSpaceChar = 0
  59. #print(noOfHexValues)
  60. return (noOfEvalFunc, noOfSetTimeOutFunc, noOfiframe, noOfUnescapeFunc, noOfEscapeFunc, noOfClassid, noOfParseIntFunc, noOfFromCharCodeFunc, noOfActiveXObjectFunc,
  61. noOfStringAssigments, noOfConcatFunc, noOfIndexOfFunc, noOfSubstringFunc, noOfReplaceFunc, noOfEventListenerFunc, noOfAttachEventFunc, noOfCreateElementFunc, noOfGetElementByIdFunc,
  62. noOfDocumentWriteFunc, noOfWords, noOfKeyWords, noOfCharacters, ratioOfKeywordsAndWords, entropyOfJS, longestWord, noOfLongStirngs, shortestWord, entropyOfLongestWord, noOfBlankSpaces,
  63. avgLenOfWords, noOfHexValues, shareOfSpaceChar)
  64. def Entropy(string,base = 2.0): # I copied this from net
  65. dct = dict.fromkeys(list(string))
  66. pkvec = [float(string.count(c)) / len(string) for c in dct]
  67. H = -sum([pk * math.log(pk) / math.log(base) for pk in pkvec ])
  68. return H
  69. def findTheLongestWord(text):
  70. tempList = text.split(' ')
  71. return max(tempList, key=len)
  72. def findTheShortestWord(text):
  73. tempList = text.split(' ')
  74. return min(tempList, key=len)
  75. """def findLongStings(text):
  76. longs = []
  77. temp = re.findall(r'var = .*\".*\"', text)
  78. for string in temp:
  79. if len(string) > 200:
  80. longs.append(string)"""
  81. def findKeyWords(text):
  82. numberOfKeywords = 0
  83. temp = re.findall(r'\w+', text)
  84. for word in temp:
  85. if word in js_keywords:
  86. numberOfKeywords += 1
  87. temp = re.findall(r'"((?<=\\)"|([^"]))"', text)
  88. for word in temp:
  89. if word in js_keywords:
  90. numberOfKeywords += -1
  91. return numberOfKeywords
  92. def findHexNumbers(text):
  93. number = len(re.findall(r'0x', text)) + len(re.findall(r'parseInt\([^,]*,? *16\)', text))
  94. return number
  95. #print(parseJavascript("https://ajax.googleapis.com/ajax/libs/jquery/3.2.1/jquery.min.js"))