Kulyutmaz is a project developed for the programming category of the regional TUBITAK competition. It aims to build a more advanced phishing e-mail detection algorithm by combining website content checks with a neural network that we trained.
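As a quick orientation, here is a minimal sketch of how the MailBox class in the file below might be driven. The constructor signature and the Logger class come from the code itself, but the credentials, MySQL settings, sensitivity/threshold values, and the polling loop are placeholders and assumptions, since no entry-point script is shown here:

import time

mysql_creds = {
    "mysql_username": "user",
    "mysql_password": "password",
    "mysql_database": "kulyutmaz",
    "mysql_host": "localhost",
}

log = Logger("kulyutmaz.log")
# sensitivity is added to spam_points for each failed check;
# threshold is the score above which a mail is marked as spam.
box = MailBox("user@example.com", "app-password", "imap.example.com", 993,
              mysql_creds, sensitivity=10, threshold=50, logger=log)

while True:
    box.refresh_new_mails()  # fetch and check anything that arrived since the last poll
    time.sleep(60)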

import mysql.connector
import re
import requests
import urllib.parse
from datetime import datetime
import imaplib
from email.parser import BytesParser
from email.utils import getaddresses
import AI          # project-local module used by check_js_code()
import jsParser    # project-local module that parses JavaScript for the classifier
from pynotifier import Notification
import platform
from plyer import notification
from urllib.parse import urlparse
import pickle
import numpy
import yandex_search
from bs4 import BeautifulSoup

MAILS_TO_CACHE = 5                 # how many of the newest mails are pulled and checked on start-up
SKIP = ["facebook.com", "w3.org"]  # domains that discover_url() never follows

class Logger:
    def __init__(self, filename):
        self.file = open(filename, "a+")

    def error(self, msg: str):
        self.file.write("[ERROR] " + self.generate_log(msg))
        self.file.flush()

    def debug(self, msg):
        self.file.write("[DEBUG] " + self.generate_log(msg))
        self.file.flush()

    def info(self, msg: str):
        self.file.write("[INFO] " + self.generate_log(msg))
        self.file.flush()

    def generate_log(self, msg: str):
        now = datetime.now()
        timestamp = now.strftime("%b %d %Y %H:%M:%S")
        return timestamp + " : " + msg + "\n"

    def stop(self):
        self.file.close()

class MailBox:
    def __init__(self, email: str, passwd: str, server: str, port: int, mysql_creds: dict, sensitivity: int,
                 threshold: int, logger: Logger):
        try:
            self.log = logger
            self.log.info("Started MailBox listener for " + email)
            self.FROM_EMAIL = email
            self.FROM_PWD = passwd
            self.SMTP_SERVER = server
            self.SMTP_PORT = port
            self.imap = imaplib.IMAP4_SSL(self.SMTP_SERVER, self.SMTP_PORT)
            self.imap.login(self.FROM_EMAIL, self.FROM_PWD)
            self.imap.select('INBOX')
            self.mysql_creds = mysql_creds
            self.threshold = threshold
            self.sensitivity = sensitivity
            self.get_ids()
            # The mails dictionary format is <mail_id>: (<Mail object>, <modified time>)
            self.mails = {}
            self.mysql = mysql_creds
            self.spam_folder = self.detect_spam_folder()
            self.get_new_mails()
        except Exception as e:
            self.log.error(str(e))

    def get_ids(self):
        try:
            self.imap.select("INBOX", False)
            typ, ids = self.imap.uid('search', None, 'ALL')
            self.ids = ids[0].decode().split()
            self.log.debug(str(self.ids))
        except Exception as e:
            self.log.error(str(e))
            # Reconnect and retry after a failure.
            self.imap = imaplib.IMAP4_SSL(self.SMTP_SERVER, self.SMTP_PORT)
            self.imap.login(self.FROM_EMAIL, self.FROM_PWD)
            self.imap.select('INBOX', False)
            self.get_ids()

    def get_new_mails(self):
        mysql_db = mysql.connector.connect(
            user=self.mysql_creds["mysql_username"],
            password=self.mysql_creds["mysql_password"],
            database=self.mysql_creds["mysql_database"],
            host=self.mysql_creds["mysql_host"]
        )
        cursor = mysql_db.cursor()
        while len(self.mails) <= MAILS_TO_CACHE:
            id = self.ids[-1 - len(self.mails)]
            cursor.execute(
                "SELECT mail_id FROM logs WHERE mail_id = '{}' AND account = '{}'".format(id, self.FROM_EMAIL))
            cursor.fetchall()
            if cursor.rowcount > 0:
                # Already logged on a previous run; keep a placeholder so it is not fetched again.
                self.mails[id] = "[BUFFERED MAIL]"
                continue
            typ, messageRaw = self.imap.uid('fetch', id, '(RFC822)')
            self.log.info("Found mail with id: " + str(id))
            email = messageRaw[0][1]
            self.mails[id] = Mail(email, self.mysql_creds, self.threshold, self.sensitivity, self.FROM_EMAIL, self.log,
                                  id, self.spam_folder)
            self.mails[id].check_spam()
            self.log.info("Checked new mail with id: " + str(id))

    def refresh_new_mails(self):
        self.log.info("Refreshing mails")
        old_ids = self.ids
        self.get_ids()
        # self.log.debug(str(self.ids))
        diff_ids = set(self.ids) - set(old_ids)
        for id in diff_ids:
            self.log.info("Found new mail with id: " + str(id))
            typ, messageRaw = self.imap.uid('fetch', id, '(RFC822)')
            self.log.info("Fetched the new mail")
            email = messageRaw[0][1]
            self.mails[id] = Mail(email, self.mysql_creds, self.threshold, self.sensitivity, self.FROM_EMAIL, self.log,
                                  id, self.spam_folder)
            self.mails[id].check_spam()

    def check_spam(self):
        for mail_name in self.mails:
            mail_obj = self.mails[mail_name]
            if not isinstance(mail_obj, Mail):
                # Skip the "[BUFFERED MAIL]" placeholders left by get_new_mails().
                continue
            mail_obj.check_spam()
            self.log.info("Checked new mail with id: " + str(mail_name))
            if mail_obj.get_spam() == 1:
                self.log.info("Found spam mail sent by: " + mail_obj.header_data["From"].split("<")[1][:-1])
                result = self.imap.uid('COPY', mail_obj.mail_id, mail_obj.spam_folder)
                if result[0] == 'OK':
                    mov, data = self.imap.uid('STORE', mail_obj.mail_id, '+FLAGS', '(\\Deleted)')
                    self.imap.expunge()

    def detect_spam_folder(self):
        folder = ""
        for i in self.imap.list()[1]:
            l = i.decode().split(' "/" ')
            if "Junk" in l[0]:
                folder = l[1]
                break
            if "Trash" in l[0]:
                folder = l[1]
        # Strip the surrounding quotes that some servers put around folder names.
        if folder[0] == "\"":
            folder = folder[1:]
        if folder[-1] == "\"":
            folder = folder[:-1]
        return folder

class Mail:
    def __init__(self, mail_data, mysql_creds, threshold, sensitivity, account, logger, mail_id, spam_folder):
        # Python regexes; the inline (?s) flag stands in for the JavaScript-style /.../s DOTALL modifier.
        self.JS_IMPORT_REGEX = r'(?s)<script[^>]*src="(.*?)"[^>]*>'
        self.JS_EXTRACT_REGEX = r'(?s)<script[^>]*>(.*?)</script>'
        self.URL_REGEX = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|[^\x00-\x7F]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
        self.parser = BytesParser()
        self.sensitivity = sensitivity
        self.threshold = threshold
        self.log = logger
        self.spam_folder = spam_folder
        self.mysql_db = mysql.connector.connect(
            user=mysql_creds["mysql_username"],
            password=mysql_creds["mysql_password"],
            database=mysql_creds["mysql_database"],
            host=mysql_creds["mysql_host"]
        )
        self.account = account
        self.spam_points = 0
        self.js_code = {}
        self.urls_in_document = []
        self.documents = {}
        self.mail_id = mail_id
        # The headers are defined as <key>:<to_remove_from key>
        # -1 is used to define the last header, after that comes the mail contents
        self.whitelisted = False
        self.blacklisted = False
        self.parsed_mail = self.parser.parsebytes(mail_data)
        self.header_data = dict(self.parsed_mail)
        self.message = ""
        self.extract_message()
        self._spam = -1
        self.check_whitelist()
        self.check_blacklisted()
        self.urls = re.findall(self.URL_REGEX, self.message)
        for i in range(len(self.urls)):
            self.urls[i] = self.urls[i].strip()

    def add_domain_to_blacklist(self, url):
        domain = urlparse(url).hostname
        cursor = self.mysql_db.cursor()
        cursor.execute("INSERT INTO new_blacklists(domain) VALUES('{}')".format(domain.encode("idna").decode("utf-8")))
        cursor.execute("INSERT INTO domain_blacklist(domain) VALUES('{}')".format(domain.encode("idna").decode("utf-8")))
        self.mysql_db.commit()  # persist the inserts; without a commit they are lost when the connection closes

    def extract_message(self):
        if self.parsed_mail.is_multipart():
            for i in self.parsed_mail.get_payload():
                payload = i.get_payload(decode=True)
                try:
                    self.message += payload.decode("utf-8")
                except AttributeError as e:
                    self.log.error("AttributeError while trying to get message from mail with id " + str(self.mail_id))
                    print(e)
                except UnicodeDecodeError as e:
                    self.log.error(
                        "UnicodeDecodeError while trying to get message from mail with id " + str(self.mail_id))
                    print(e)
        else:
            payload = self.parsed_mail.get_payload(decode=True)
            try:
                self.message += payload.decode("utf-8")
            except AttributeError as e:
                self.log.error("AttributeError while trying to get message from mail with id " + str(self.mail_id))
                print(e)
            except UnicodeDecodeError as e:
                self.log.error("UnicodeDecodeError while trying to get message from mail with id " + str(self.mail_id))
                print(e)

    def check_blacklisted(self, url=None):
        if url is not None:
            url = url.encode("idna").decode("utf-8")
        cursor = self.mysql_db.cursor()
        if url is not None:
            cursor.execute("SELECT * FROM domain_blacklist WHERE domain LIKE '{}';".format(url))
            cursor.fetchall()
            if cursor.rowcount > 0:
                cursor.close()
                return True
            return False
        mail_header = self.header_data["From"].split("<")[1][:-1]
        mail = mail_header
        cursor.execute("SELECT * FROM mail_blacklist WHERE mail='{}';".format(mail))
        cursor.fetchall()
        if cursor.rowcount >= 1:
            print("Blacklisted")
            self.blacklisted = True

    def check_whitelist(self, url=None):
        if url is not None:
            url = url.encode("idna").decode("utf-8")
        cursor = self.mysql_db.cursor()
        if url is not None:
            cursor.execute("SELECT * FROM domain_whitelist WHERE domain LIKE '%{}%';".format(url))
            cursor.fetchall()
            if cursor.rowcount > 0:
                cursor.close()
                return True
            return False
        mail_header = self.header_data["From"].split("<")[1][:-1]
        mail = mail_header
        cursor.execute("SELECT * FROM mail_whitelist WHERE mail='{}';".format(mail))
        cursor.fetchall()
        if cursor.rowcount >= 1:
            self.whitelisted = True

    def check_special_chars(self):
        for url in self.urls:
            parsed = urllib.parse.urlparse(url.encode("idna").decode("utf-8").encode("utf-8").decode("idna"))
            special_char_count = 0
            for char in parsed.netloc:
                if not char == ".":
                    # Characters that do not survive IDNA encoding unchanged (typically non-ASCII
                    # homograph characters) are treated as suspicious.
                    if not char.encode("utf-8") == char.encode("idna"):
                        print("Special char detected")
                        self._spam = 1

    def aiPredict(self, data):
        with open("aiModel", "rb") as m:
            aiModel = pickle.load(m)
        ai_in = (data["dir_num"], data["index_num"], data["length"], data["out_resources"], data["robots_entries"],
                 data["special_char_num"], data["subdomain_len"], data["subdomain_num"], data["tld_trust"])
        return aiModel.predict(numpy.reshape(ai_in, (1, 9)))
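
    # Note on aiPredict(): "aiModel" is assumed to be a pickled estimator exposing a
    # scikit-learn style predict() method. The nine URL features produced by
    # get_url_data() are passed in the fixed order shown above, so the model must have
    # been trained with that same column order.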

    def find_list_resources(self, tag, attribute, soup):
        resources = []
        for x in soup.findAll(tag):
            try:
                resources.append(x[attribute])
            except KeyError:
                pass
        return resources

    def get_url_data(self, url, yandex, timeout=30):
        data = {}
        data["length"] = len(url.split("://")[1].split("?")[0])
        data["dir_num"] = url.find("/") - 2
        parsed = urlparse(url.encode("idna").decode("utf-8").encode("utf-8").decode("idna"))
        hostname_split = parsed.hostname.split(".")
        data["tld_trust"] = int(hostname_split[-1].lower() in ["com", "org", "net"])
        data["subdomain_num"] = len(hostname_split) - 2
        data["subdomain_len"] = len("".join(hostname_split[:-2]))
        special_char_count = 0
        for char in parsed.hostname:
            if char == ".":
                continue
            if not char.encode("utf-8") == char.encode("idna"):
                special_char_count += 1
        data["special_char_num"] = special_char_count
        # Advanced data extraction
        try:
            data["index_num"] = int(yandex.search("site:{}".format(parsed.hostname)).found["all"])
        except yandex_search.NoResultsException:
            data["index_num"] = 0
        robot_entry_counter = 0
        try:
            response = requests.get("{}://{}/robots.txt".format(parsed.scheme, parsed.netloc), allow_redirects=True,
                                    verify=False, timeout=timeout)
            if response.status_code == 200:
                lines = response.text.split("\n")
                lines = [x for x in lines if x != ""]
                robot_entry_counter += len([x for x in lines if x[0] != "#"])
            else:
                pass
        except Exception as e:
            print(e)
        data["robots_entries"] = robot_entry_counter
        try:
            req = requests.get(url, verify=False, timeout=timeout)
            if req.status_code == 200:
                soup = BeautifulSoup(req.text, 'html.parser')
                image_scr = self.find_list_resources('img', "src", soup)
                script_src = self.find_list_resources('script', "src", soup)
                css_link = self.find_list_resources("link", "href", soup)
                all_links = image_scr + css_link + script_src
                out_links = []
                for link in all_links:
                    parsed_link = urlparse(link)
                    if parsed_link.hostname != parsed.hostname:
                        out_links.append(link)
                data["out_resources"] = len(out_links)
            else:
                data["out_resources"] = -1
        except Exception as e:
            print(e)
            data["out_resources"] = -1
        return data

    def check_url(self, url):
        yandex = yandex_search.Yandex(api_user='raporcubaba@gmail.com', api_key='03.1042294429:b8e679f9acadef49ebab0d9726ccef58')
        data = self.get_url_data(url, yandex, timeout=10)
        if self.aiPredict(data):
            self.add_domain_to_blacklist(url)
            self.spam_points += self.sensitivity

    def check_js(self):
        for url in self.js_code:
            for js in self.js_code[url]:
                if self.check_blacklisted(url=js):
                    self._spam = 1
                if self.check_js_code(js):
                    self.add_domain_to_blacklist(url)
                    self.spam_points += self.sensitivity

    def check_disallowed_chars(self, url_start, chars=["<", ">", "'", "\""]):
        url = url_start
        for char in chars:
            if char in url:
                return True
            if urllib.parse.quote_plus(char) in url:
                return True
            if urllib.parse.quote_plus(urllib.parse.quote_plus(char)) in url:
                return True
        return False
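
    # Example: for "<" the loop above also matches the single and double URL-encoded
    # forms, since urllib.parse.quote_plus("<") == "%3C" and quote_plus("%3C") == "%253C";
    # a URL containing any of the three variants is reported as disallowed.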

    def check_tld(self, url):
        if urlparse(url).hostname.split(".")[-1] in ["info", "tk", "gq"]:
            self._spam = 1

    def check_domain_name(self, url):
        pass  # TODO fill this up

    def discover_url(self, urls):
        for url in urls:
            foo = False
            for i in self.documents:
                if urlparse(url).hostname == urlparse(i).hostname:
                    foo = True
            if foo:
                continue
            while 1:
                if self.check_whitelist(url=url):
                    break
                if self.check_blacklisted(url=url):
                    self._spam = 1
                    return
                self.log.info(url)
                self.check_disallowed_chars(url)
                self.keyword_search(url)
                self.check_xss(url)
                self.check_url(url)
                self.check_tld(url)
                self.check_domain_name(url)
                try:
                    r = requests.get(url, allow_redirects=False)
                except requests.exceptions.ConnectionError:
                    break
                # detect all status codes in the format 3xx
                try:
                    self.documents[url] = r.content.decode()
                    for found in self.extract_urls(self.documents[url]):
                        # Skip hosts that have already been crawled.
                        seen = False
                        for doc_url in self.documents:
                            if urlparse(found).hostname == urlparse(doc_url).hostname:
                                seen = True
                        if seen:
                            continue
                        skip = False
                        for j in SKIP:
                            if j in found:
                                skip = True
                        if skip:
                            continue
                        self.discover_url([found.strip()])
                except UnicodeDecodeError:
                    pass
                if r.status_code == 302 or r.status_code == 303:
                    # Follow the redirect manually and re-check the new location.
                    location = r.headers["location"]
                    if location.startswith("http"):
                        url = location
                    else:
                        url = "/".join(url.split("/")[:-1]) + location
                    continue
                else:
                    break

    def keyword_search(self, url):
        keywords = self.mysql_db.cursor()
        keywords.execute("SELECT * FROM keywordlist;")
        result = keywords.fetchall()
        for row in result:
            # If a known keyword (row[0]) appears in the URL but the URL's registrable
            # domain is not that keyword's legitimate domain (row[1]), flag the mail.
            if row[0] in url:
                if ".".join(urlparse(url).hostname.split(".")[-2:]) != row[1]:
                    self._spam = 1

    def check_xss(self, url):
        malicious = self.check_disallowed_chars(url)
        if malicious:
            print("ADDED SPAM POINTS")
            self.spam_points += self.sensitivity
        # TODO add deobfuscation

    def extract_javascript(self):  # TODO not working
        import_pattern = re.compile(self.JS_IMPORT_REGEX)
        extract_pattern = re.compile(self.JS_EXTRACT_REGEX)
        for doc in self.documents:
            self.js_code[doc] = []
            # External scripts referenced through src="..."
            for url in import_pattern.findall(self.documents[doc]):
                r = requests.get(url, allow_redirects=False)
                if r.status_code == 200:
                    self.js_code[doc].append(r.content)
            # Inline <script> bodies
            for js in extract_pattern.findall(self.documents[doc]):
                if js != "":
                    self.js_code[doc].append(js)

    def extract_urls(self, doc):
        # Extract all URLs from the given document text and return them as a list.
        p = re.compile(self.URL_REGEX)
        res = p.findall(doc)
        return res

    def check_stored_xss(self):
        self.extract_javascript()
        for url in self.js_code:
            url_spam_count = 0
            for js in self.js_code[url]:
                if url_spam_count > 6:
                    self.add_domain_to_blacklist(url)
                if self.check_blacklisted(url=url):
                    self._spam = 1
                    return
                if self.check_js_code(js):
                    url_spam_count += 3
                    self.spam_points += self.sensitivity

    def check_js_code(self, code):
        parsedJs = jsParser.parseJavascript(code, False)
        return AI.aiPredict(parsedJs)

    def log_mail(self):
        cursor = self.mysql_db.cursor()
        domain = getaddresses(self.parsed_mail.get_all('from', []))[0][1]
        cursor.execute(
            "INSERT INTO logs (sender_domain, result, account, mail_id) VALUES ('{}','{}','{}','{}')".format(
                domain, self.get_spam(), self.account, self.mail_id))
        self.mysql_db.commit()
        cursor.close()

    def check_spam(self):
        if self.whitelisted:
            self._spam = 0
            self.log_mail()
            return
        elif self.blacklisted:
            self._spam = 1
        else:
            self.discover_url(self.urls)
            self.check_stored_xss()
            if self.threshold < self.spam_points:
                self._spam = 1
        if self.get_spam() == 1:
            self.log.info("Mail moved to spam with id: " + str(self.mail_id))
            if platform.system() == "Windows":
                notification.notify(
                    title='Found spam mail by: ' + getaddresses(self.parsed_mail.get_all('from', []))[0][1],
                    message=self.account,
                    app_icon=None,
                    timeout=10)
            else:
                Notification(
                    title='Found spam mail by: ' + getaddresses(self.parsed_mail.get_all('from', []))[0][1],
                    description=self.account,
                    duration=10,
                    urgency=Notification.URGENCY_CRITICAL
                ).send()
        self.log_mail()

    def get_spam(self) -> int:
        if self.whitelisted:
            return 0
        return self._spam
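
    # Return-value convention used throughout the file: -1 means the mail has not been
    # checked yet, 0 means clean (whitelisted senders always return 0), 1 means spam.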