Kulyutmaz is a project developed for the programming category of the regional TUBITAK competition. It aims to build a more accurate phishing e-mail detection algorithm by combining website content checks with a neural network that we trained ourselves.

import json
import os
import urllib.request
from datetime import datetime
from urllib.parse import urlparse, quote

import pandas as pd
import requests
import urllib3
import yandex_search
from bs4 import BeautifulSoup
from progressbar import ProgressBar

# Many phishing pages have broken TLS certificates, so requests are made with
# verify=False and the resulting warnings are silenced here.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

TRUSTED_TLDS = ["com", "org", "net"]

# State shared by the urlretrieve progress callback below.
pbar = None
downloaded = 0

def show_progress(count, block_size, total_size):
    """Reporthook for urllib.request.urlretrieve that draws a download progress bar."""
    global pbar
    global downloaded
    if pbar is None:
        pbar = ProgressBar(maxval=total_size).start()
    downloaded += block_size
    pbar.update(min(downloaded, total_size))
    if downloaded >= total_size:
        pbar.finish()
        pbar = None
        downloaded = 0

def get_data(phishtank_key, force_update=False):
    """Download the phishing URL feed and build a benign URL list.

    Returns a (phishing, benign) tuple of DataFrames, each with a "url" column.
    """
    # Phishing URLs: PhishTank's verified-online feed.
    if not os.path.isfile("phishtank.csv") or force_update:
        urllib.request.urlretrieve(
            "http://data.phishtank.com/data/{}/online-valid.csv".format(phishtank_key),
            "phishtank.csv", show_progress)
    # Benign URLs: expand a keyword list with Google autocomplete suggestions,
    # then take the top 10 Yandex results for every suggestion.
    if not os.path.isfile("common.csv") or force_update:
        data = {"url": []}
        with open("keywordList") as wordlist:
            keywords = wordlist.read().split("\n")
        suggestions = []
        for word in keywords:
            URL = "http://suggestqueries.google.com/complete/search?client=firefox&q=" + quote(word)
            headers = {'User-agent': 'Mozilla/5.0'}
            response = requests.get(URL, headers=headers)
            result = json.loads(response.content.decode('utf-8'))
            for r in result[1]:
                suggestions.append(r)
        yandex = yandex_search.Yandex(api_user='yksiber', api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')
        for word in suggestions:
            top10 = yandex.search(word).items[0:10]
            for site in top10:
                data["url"].append(site["url"])  # each result item carries the page URL under "url"
        common = pd.DataFrame(data)
        common.to_csv("common.csv")
    urls = (pd.read_csv("phishtank.csv"), pd.read_csv("common.csv"))
    return urls

def find_list_resources(tag, attribute, soup):
    """Collect the given attribute from every matching tag in a parsed page."""
    resources = []
    for x in soup.findAll(tag):
        try:
            resources.append(x[attribute])
        except KeyError:
            pass
    return resources

def get_url_data(url, yandex, timeout=30):
    # Basic data extraction: features computed from the URL string alone.
    data = {}
    data["length"] = len(url.split("://")[1].split("?")[0])
    data["dir_num"] = url.count("/") - 2  # path separators, excluding the two in "://"
    parsed = urlparse(url)
    hostname_split = parsed.hostname.split(".")
    data["tld_trust"] = int(hostname_split[-1].lower() in TRUSTED_TLDS)
    data["subdomain_num"] = len(hostname_split) - 2
    data["subdomain_len"] = len("".join(hostname_split[:-2]))
    # Hostname characters that change under IDNA encoding count as "special".
    special_char_count = 0
    for char in parsed.hostname:
        if char == ".":
            continue
        if not char.encode("utf-8") == char.encode("idna"):
            special_char_count += 1
    data["special_char_num"] = special_char_count

    # Advanced data extraction: features that need network requests.
    # How many pages of this host Yandex has indexed.
    try:
        data["index_num"] = yandex.search("site:{}".format(parsed.hostname)).found["all"]
    except yandex_search.NoResultsException:
        data["index_num"] = 0

    # Number of non-comment entries in the site's robots.txt, if reachable.
    robot_entry_counter = 0
    try:
        response = requests.get("{}://{}/robots.txt".format(parsed.scheme, parsed.netloc),
                                allow_redirects=True, verify=False, timeout=timeout)
        if response.status_code == 200:
            lines = response.text.split("\n")
            lines = [x for x in lines if x != ""]
            robot_entry_counter += len([x for x in lines if x[0] != "#"])
    except Exception as e:
        print(e)
    data["robots_entries"] = robot_entry_counter

    # Number of resources (images, scripts, stylesheets) loaded from other hosts.
    try:
        req = requests.get(url, verify=False, timeout=timeout)
        if req.status_code == 200:
            soup = BeautifulSoup(req.text, 'html.parser')
            image_src = find_list_resources('img', "src", soup)
            script_src = find_list_resources('script', "src", soup)
            css_link = find_list_resources("link", "href", soup)
            all_links = image_src + css_link + script_src
            out_links = []
            for link in all_links:
                parsed_link = urlparse(link)
                # Relative links have no hostname and belong to the page's own host.
                if parsed_link.hostname is not None and parsed_link.hostname != parsed.hostname:
                    out_links.append(link)
            data["out_resources"] = len(out_links)
        else:
            data["out_resources"] = -1
    except Exception as e:
        print(e)
        data["out_resources"] = -1

    data["url"] = url
    return data

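# Feature summary, one row per URL:
#   length           - characters in the URL after the scheme, before any query string
#   dir_num          - number of "/" characters beyond the scheme separator
#   tld_trust        - 1 if the top-level domain is in TRUSTED_TLDS, else 0
#   subdomain_num    - hostname labels in front of the domain and TLD
#   subdomain_len    - combined length of those leading labels
#   special_char_num - hostname characters that change under IDNA encoding
#   index_num        - pages of the host indexed by Yandex ("site:" query)
#   robots_entries   - non-comment lines in the site's robots.txt (0 if unreachable)
#   out_resources    - img/script/link resources served from another host (-1 on failure)
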
def extract_data(raw_data, force_update=False):
    """Build the feature dataset, checkpointing partial results every 300 URLs."""
    reps = 0
    phishing, benign = raw_data[0], raw_data[1]
    data = {
        "phishing": [],
        "length": [],
        "out_resources": [],
        "dir_num": [],
        "special_char_num": [],
        "robots_entries": [],
        "tld_trust": [],
        "index_num": [],
        "subdomain_len": [],
        "subdomain_num": [],
        "url": []
    }
    if not os.path.isfile("dataset.csv") or force_update:
        # Resume from the newest checkpoint file (dataset300.csv, dataset600.csv, ...).
        largest_dataset = 0
        while os.path.isfile("dataset{}.csv".format(largest_dataset + 300)):
            largest_dataset += 300
        try:
            # Filter out phishing reports submitted before 2020.
            old = []
            for index, row in phishing.iterrows():
                date = datetime.strptime(row["submission_time"], "%Y-%m-%dT%H:%M:%S+00:00")
                if date.year < 2020:
                    old.append(index)
            phishing = phishing.drop(old)
            yandex = yandex_search.Yandex(api_user='yksiber', api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')
            # Phishing URLs are labelled 1.
            for index, row in phishing.iterrows():
                reps += 1
                if reps < largest_dataset:
                    continue
                if reps % 300 == 0:
                    pd.DataFrame(data).to_csv("dataset{}.csv".format(reps))
                url = row['url']
                print("[INFO]: {} : {}".format(reps, url))
                url_data = get_url_data(url, yandex)
                data["phishing"].append(1)
                data["length"].append(url_data["length"])
                data["dir_num"].append(url_data["dir_num"])
                data["special_char_num"].append(url_data["special_char_num"])
                data["tld_trust"].append(url_data["tld_trust"])
                data["index_num"].append(url_data["index_num"])
                data["subdomain_len"].append(url_data["subdomain_len"])
                data["subdomain_num"].append(url_data["subdomain_num"])
                data["out_resources"].append(url_data["out_resources"])
                data["robots_entries"].append(url_data["robots_entries"])
                data["url"].append(url_data["url"])
            # Benign URLs are labelled 0.
            for index, row in benign.iterrows():
                reps += 1
                if reps < largest_dataset:
                    continue
                if reps % 300 == 0:
                    pd.DataFrame(data).to_csv("dataset{}.csv".format(reps))
                url = row['url']
                print("[INFO]: {} : {}".format(reps, url))
                url_data = get_url_data(url, yandex)
                data["phishing"].append(0)
                data["length"].append(url_data["length"])
                data["dir_num"].append(url_data["dir_num"])
                data["special_char_num"].append(url_data["special_char_num"])
                data["tld_trust"].append(url_data["tld_trust"])
                data["index_num"].append(url_data["index_num"])
                data["subdomain_len"].append(url_data["subdomain_len"])
                data["subdomain_num"].append(url_data["subdomain_num"])
                data["out_resources"].append(url_data["out_resources"])
                data["robots_entries"].append(url_data["robots_entries"])
                data["url"].append(url_data["url"])
            pd.DataFrame(data).to_csv("dataset.csv")
        except Exception as e:
            print("[ERROR]: {}".format(e))
    return pd.read_csv("dataset.csv")

if __name__ == "__main__":
    raw_data = get_data("01115eebdbf465734c08fedb2e4d93f414d1a31fa10bfcb248d0f75071e156ff")
    print("DOWNLOAD COMPLETED!")
    dataset = extract_data(raw_data)
    print("EXTRACT COMPLETED!")
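The script above only produces dataset.csv; the neural network mentioned in the project description is trained in a separate step that is not shown on this page. As a rough illustration of how the extracted features could feed a classifier, the following minimal sketch assumes scikit-learn is available and uses its MLPClassifier as a stand-in for the project's actual network; only the file name and column names come from the script above, and the split, scaling, and layer sizes are assumptions.

# Minimal sketch: train a small neural network on the features extracted above.
# scikit-learn's MLPClassifier is a stand-in here, not the project's actual model.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("dataset.csv", index_col=0)

# "phishing" is the label column; the raw URL string is dropped, the rest are numeric features.
X = df.drop(columns=["phishing", "url"])
y = df["phishing"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features, then fit a small fully connected network.
model = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(32, 16), max_iter=500))
model.fit(X_train, y_train)
print("held-out accuracy:", model.score(X_test, y_test))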