Kulyutmaz is a project developed for the programming category of the regional TUBITAK competition. It aims to build a more advanced phishing e-mail detection algorithm by combining website content checks with a neural network that we trained.
import mysql.connector
import re
import requests
import urllib.parse
from datetime import datetime
import imaplib
from email.parser import BytesParser
from email.utils import getaddresses
import AI
import jsParser
from pynotifier import Notification
import platform
from plyer import notification
from urllib.parse import urlparse
import pickle
import numpy
import yandex_search
from bs4 import BeautifulSoup
MAILS_TO_CACHE = 5
SKIP = ["facebook.com","w3.org"]
class Logger:
    def __init__(self, filename):
        self.file = open(filename, "a+")
    def error(self, msg: str):
        self.file.write("[ERROR] " + self.generate_log(msg))
        self.file.flush()
    def debug(self, msg):
        self.file.write("[DEBUG] " + self.generate_log(msg))
        self.file.flush()
    def info(self, msg: str):
        self.file.write("[INFO] " + self.generate_log(msg))
        self.file.flush()
    def generate_log(self, msg: str):
        now = datetime.now()
        timestamp = now.strftime("%b %d %Y %H:%M:%S")
        return timestamp + " : " + msg + "\n"
    def stop(self):
        self.file.close()
class MailBox:
    def __init__(self, email: str, passwd: str, server: str, port: int, mysql_creds: dict, sensitivity: int,
                 threshold: int, logger: Logger):
        try:
            self.log = logger
            self.log.info("Started MailBox listener for " + email)
            self.FROM_EMAIL = email
            self.FROM_PWD = passwd
            self.SMTP_SERVER = server
            self.SMTP_PORT = port
            self.imap = imaplib.IMAP4_SSL(self.SMTP_SERVER, self.SMTP_PORT)
            self.imap.login(self.FROM_EMAIL, self.FROM_PWD)
            self.imap.select('INBOX')
            self.mysql_creds = mysql_creds
            self.threshold = threshold
            self.sensitivity = sensitivity
            self.get_ids()
            # The mails dictionary format is <mail_id>: (<Mail object>, <modified time>)
            self.mails = {}
            self.mysql = mysql_creds
            self.spam_folder = self.detect_spam_folder()
            self.get_new_mails()
        except Exception as e:
            self.log.error(str(e))
    def get_ids(self):
        try:
            self.imap.select("INBOX", False)
            typ, ids = self.imap.uid('search', None, 'ALL')
            self.ids = ids[0].decode().split()
            self.log.debug(str(self.ids))
        except Exception as e:
            self.log.error(str(e))
            # Reconnect and retry once the IMAP session has been re-established.
            self.imap = imaplib.IMAP4_SSL(self.SMTP_SERVER, self.SMTP_PORT)
            self.imap.login(self.FROM_EMAIL, self.FROM_PWD)
            self.imap.select('INBOX', False)
            self.get_ids()
    def get_new_mails(self):
        mysql_db = mysql.connector.connect(
            user=self.mysql_creds["mysql_username"],
            password=self.mysql_creds["mysql_password"],
            database=self.mysql_creds["mysql_database"],
            host=self.mysql_creds["mysql_host"]
        )
        cursor = mysql_db.cursor()
        while len(self.mails) <= MAILS_TO_CACHE:
            id = self.ids[-1 - len(self.mails)]
            cursor.execute("SELECT mail_id FROM logs WHERE mail_id = %s AND account = %s", (id, self.FROM_EMAIL))
            cursor.fetchall()
            if cursor.rowcount > 0:
                self.mails[id] = "[BUFFERED MAIL]"
                continue
            typ, messageRaw = self.imap.uid('fetch', id, '(RFC822)')
            self.log.info("Found mail with id: " + str(id))
            email = messageRaw[0][1]
            self.mails[id] = Mail(email, self.mysql_creds, self.threshold, self.sensitivity, self.FROM_EMAIL, self.log,
                                  id, self.spam_folder)
            self.mails[id].check_spam()
            self.log.info("Checked new mail with id: " + str(id))
    def refresh_new_mails(self):
        self.log.info("Refreshing mails")
        old_ids = self.ids
        self.get_ids()
        #self.log.debug(str(self.ids))
        diff_ids = set(self.ids) - set(old_ids)
        for id in diff_ids:
            self.log.info("Found new mail with id: " + str(id))
            typ, messageRaw = self.imap.uid('fetch', id, '(RFC822)')
            self.log.info("Fetched the new mail")
            email = messageRaw[0][1]
            self.mails[id] = Mail(email, self.mysql_creds, self.threshold, self.sensitivity, self.FROM_EMAIL, self.log,
                                  id, self.spam_folder)
            self.mails[id].check_spam()
    def check_spam(self):
        for mail_name in self.mails:
            mail_obj = self.mails[mail_name]
            if not isinstance(mail_obj, Mail):
                continue  # skip "[BUFFERED MAIL]" placeholders
            mail_obj.check_spam()
            self.log.info("Checked new mail with id: " + str(mail_name))
            if mail_obj.get_spam() == 1:
                self.log.info("Found spam mail sent by: " + mail_obj.header_data["From"].split("<")[1][:-1])
                result = self.imap.uid('COPY', mail_obj.mail_id, mail_obj.spam_folder)
                if result[0] == 'OK':
                    mov, data = self.imap.uid('STORE', mail_obj.mail_id, '+FLAGS', r'(\Deleted)')
                    self.imap.expunge()
    def detect_spam_folder(self):
        folder = ""
        for i in self.imap.list()[1]:
            l = i.decode().split(' "/" ')
            if "Junk" in l[0]:
                folder = l[1]
                break
            if "Trash" in l[0]:
                folder = l[1]
        # Strip surrounding quotes; also safe when no folder was found.
        folder = folder.strip("\"")
        return folder
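# A Mail wraps a single raw RFC 822 message and scores it: URLs in the body are
# crawled recursively (discover_url), each reachable page is reduced to a
# nine-feature vector (get_url_data) and classified by the pickled model
# (aiPredict), and linked or inline JavaScript is analysed separately
# (check_stored_xss). Accumulated spam_points above `threshold` flag the mail.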
class Mail:
    def __init__(self, mail_data, mysql_creds, threshold, sensitivity, account, logger, mail_id, spam_folder):
        # Patterns for <script src=...> imports, inline <script> bodies and URLs;
        # the script patterns are compiled with re.DOTALL where they are used.
        self.JS_IMPORT_REGEX = r'<script[^>]*src="([^"]*)"[^>]*>'
        self.JS_EXTRACT_REGEX = r'<script[^>]*>(.*?)</script>'
        self.URL_REGEX = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|[^\x00-\x7F]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
        self.parser = BytesParser()
        self.sensitivity = sensitivity
        self.threshold = threshold
        self.log = logger
        self.spam_folder = spam_folder
        self.mysql_db = mysql.connector.connect(
            user=mysql_creds["mysql_username"],
            password=mysql_creds["mysql_password"],
            database=mysql_creds["mysql_database"],
            host=mysql_creds["mysql_host"]
        )
        self.account = account
        self.spam_points = 0
        self.js_code = {}
        self.urls_in_document = []
        self.documents = {}
        self.mail_id = mail_id
        self.whitelisted = False
        self.blacklisted = False
        self.parsed_mail = self.parser.parsebytes(mail_data)
        self.header_data = dict(self.parsed_mail)
        self.message = ""
        self.extract_message()
        self._spam = -1
        self.check_whitelist()
        self.check_blacklisted()
        self.urls = re.findall(self.URL_REGEX, self.message)
        for i in range(len(self.urls)):
            self.urls[i] = self.urls[i].strip()
    def add_domain_to_blacklist(self, url):
        domain = urlparse(url).hostname
        cursor = self.mysql_db.cursor()
        cursor.execute("INSERT INTO new_blacklists(domain) VALUES(%s)", (domain.encode("idna").decode("utf-8"),))
        cursor.execute("INSERT INTO domain_blacklist(domain) VALUES(%s)", (domain.encode("idna").decode("utf-8"),))
        self.mysql_db.commit()
    def extract_message(self):
        if self.parsed_mail.is_multipart():
            for i in self.parsed_mail.get_payload():
                payload = i.get_payload(decode=True)
                try:
                    self.message += payload.decode("utf-8")
                except AttributeError as e:
                    self.log.error("AttributeError while trying to get message from mail with id " + str(self.mail_id))
                    print(e)
                except UnicodeDecodeError as e:
                    self.log.error("UnicodeDecodeError while trying to get message from mail with id " + str(self.mail_id))
                    print(e)
        else:
            payload = self.parsed_mail.get_payload(decode=True)
            try:
                self.message += payload.decode("utf-8")
            except AttributeError as e:
                self.log.error("AttributeError while trying to get message from mail with id " + str(self.mail_id))
                print(e)
            except UnicodeDecodeError as e:
                self.log.error("UnicodeDecodeError while trying to get message from mail with id " + str(self.mail_id))
                print(e)
    def check_blacklisted(self, url=None):
        cursor = self.mysql_db.cursor()
        if url is not None:
            url = url.encode("idna").decode("utf-8")
            cursor.execute("SELECT * FROM domain_blacklist WHERE domain LIKE %s;", (url,))
            cursor.fetchall()
            if cursor.rowcount > 0:
                cursor.close()
                return True
            return False
        mail = self.header_data["From"].split("<")[1][:-1]
        cursor.execute("SELECT * FROM mail_blacklist WHERE mail=%s;", (mail,))
        cursor.fetchall()
        if cursor.rowcount >= 1:
            print("Blacklisted")
            self.blacklisted = True
    def check_whitelist(self, url=None):
        cursor = self.mysql_db.cursor()
        if url is not None:
            url = url.encode("idna").decode("utf-8")
            cursor.execute("SELECT * FROM domain_whitelist WHERE domain LIKE %s;", ("%" + url + "%",))
            cursor.fetchall()
            if cursor.rowcount > 0:
                cursor.close()
                return True
            return False
        mail = self.header_data["From"].split("<")[1][:-1]
        cursor.execute("SELECT * FROM mail_whitelist WHERE mail=%s;", (mail,))
        cursor.fetchall()
        if cursor.rowcount >= 1:
            self.whitelisted = True
    def check_special_chars(self):
        for url in self.urls:
            # Round-trip through punycode so internationalised domains are normalised.
            parsed = urllib.parse.urlparse(url.encode("idna").decode("utf-8").encode("utf-8").decode("idna"))
            for char in parsed.netloc:
                if not char == ".":
                    # A character that changes under IDNA encoding is non-ASCII,
                    # a common trait of homograph phishing domains.
                    if not char.encode("utf-8") == char.encode("idna"):
                        print("Special char detected")
                        self._spam = 1
    def aiPredict(self, data):
        with open("aiModel", "rb") as m:
            aiModel = pickle.load(m)
        # The feature order must match the order the model was trained with.
        ai_in = (data["dir_num"], data["index_num"], data["length"], data["out_resources"], data["robots_entries"],
                 data["special_char_num"], data["subdomain_len"], data["subdomain_num"], data["tld_trust"])
        return aiModel.predict(numpy.reshape(ai_in, (1, 9)))
    def find_list_resources(self, tag, attribute, soup):
        resources = []
        for x in soup.findAll(tag):
            try:
                resources.append(x[attribute])
            except KeyError:
                pass
        return resources
    def get_url_data(self, url, yandex, timeout=30):
        data = {}
        data["length"] = len(url.split("://")[1].split("?")[0])
        # Number of path segments; the two slashes of the scheme are subtracted.
        data["dir_num"] = url.count("/") - 2
        parsed = urlparse(url.encode("idna").decode("utf-8").encode("utf-8").decode("idna"))
        hostname_split = parsed.hostname.split(".")
        data["tld_trust"] = int(hostname_split[-1].lower() in ["com", "org", "net"])
        data["subdomain_num"] = len(hostname_split) - 2
        data["subdomain_len"] = len("".join(hostname_split[:-2]))
        special_char_count = 0
        for char in parsed.hostname:
            if char == ".":
                continue
            if not char.encode("utf-8") == char.encode("idna"):
                special_char_count += 1
        data["special_char_num"] = special_char_count
        # Advanced data extraction
        try:
            data["index_num"] = int(yandex.search("site:{}".format(parsed.hostname)).found["all"])
        except yandex_search.NoResultsException:
            data["index_num"] = 0
        robot_entry_counter = 0
        try:
            response = requests.get("{}://{}/robots.txt".format(parsed.scheme, parsed.netloc), allow_redirects=True,
                                    verify=False, timeout=timeout)
            if response.status_code == 200:
                lines = response.text.split("\n")
                lines = [x for x in lines if x != ""]
                robot_entry_counter += len([x for x in lines if x[0] != "#"])
        except Exception as e:
            print(e)
        data["robots_entries"] = robot_entry_counter
        try:
            req = requests.get(url, verify=False, timeout=timeout)
            if req.status_code == 200:
                soup = BeautifulSoup(req.text, 'html.parser')
                image_scr = self.find_list_resources('img', "src", soup)
                script_src = self.find_list_resources('script', "src", soup)
                css_link = self.find_list_resources("link", "href", soup)
                all_links = image_scr + css_link + script_src
                out_links = []
                for link in all_links:
                    parsed_link = urlparse(link)
                    if parsed_link.hostname != parsed.hostname:
                        out_links.append(link)
                data["out_resources"] = len(out_links)
            else:
                data["out_resources"] = -1
        except Exception as e:
            print(e)
            data["out_resources"] = -1
        return data
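    # The keys collected above form the model input; aiPredict reads them in the
    # fixed order dir_num, index_num, length, out_resources, robots_entries,
    # special_char_num, subdomain_len, subdomain_num, tld_trust.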
    def check_url(self, url):
        # API credentials should be loaded from configuration rather than hardcoded.
        yandex = yandex_search.Yandex(api_user='<yandex-api-user>', api_key='<yandex-api-key>')
        data = self.get_url_data(url, yandex, timeout=10)
        if self.aiPredict(data):
            self.add_domain_to_blacklist(url)
            self.spam_points += self.sensitivity
    def check_js(self):
        for url in self.js_code:
            if self.check_blacklisted(url=url):
                self._spam = 1
            for js in self.js_code[url]:
                if self.check_js_code(js):
                    self.add_domain_to_blacklist(url)
                    self.spam_points += self.sensitivity
    def check_disallowed_chars(self, url_start, chars=["<", ">", "'", "\""]):
        url = url_start
        for char in chars:
            if char in url:
                return True
            # Also catch single- and double-URL-encoded forms of the character.
            if urllib.parse.quote_plus(char) in url:
                return True
            if urllib.parse.quote_plus(urllib.parse.quote_plus(char)) in url:
                return True
        return False
    def check_tld(self, url):
        # TLDs frequently abused for phishing.
        if urlparse(url).hostname.split(".")[-1] in ["info", "tk", "gq"]:
            self._spam = 1
    def check_domain_name(self, url):
        pass  # TODO fill this up
    def discover_url(self, urls):
        for url in urls:
            seen = False
            for doc_url in self.documents:
                if urlparse(url).hostname == urlparse(doc_url).hostname:
                    seen = True
            if seen:
                continue
            while True:
                if self.check_whitelist(url=url):
                    break
                if self.check_blacklisted(url=url):
                    self._spam = 1
                    return
                self.log.info(url)
                self.check_disallowed_chars(url)
                self.keyword_search(url)
                self.check_xss(url)
                self.check_url(url)
                self.check_tld(url)
                self.check_domain_name(url)
                try:
                    r = requests.get(url, allow_redirects=False)
                except requests.exceptions.ConnectionError:
                    break
                try:
                    self.documents[url] = r.content.decode()
                    # Recurse into URLs found on the page, skipping hosts that
                    # were already crawled and the SKIP list.
                    for found in self.extract_urls(self.documents[url]):
                        seen = False
                        for doc_url in self.documents:
                            if urlparse(found).hostname == urlparse(doc_url).hostname:
                                seen = True
                        if seen:
                            continue
                        skip = False
                        for j in SKIP:
                            if j in found:
                                skip = True
                        if skip:
                            continue
                        self.discover_url([found.strip()])
                except UnicodeDecodeError:
                    pass
                # Follow redirects manually so every 3xx hop is checked as well.
                if r.status_code in (301, 302, 303, 307, 308):
                    location = r.headers["location"]
                    if location.startswith("http"):
                        url = location
                    else:
                        url = "/".join(url.split("/")[:-1]) + location
                    continue
                else:
                    break
    def keyword_search(self, url):
        keywords = self.mysql_db.cursor()
        keywords.execute("SELECT * FROM keywordlist;")
        result = keywords.fetchall()
        for row in result:
            # row[0] is a keyword, row[1] the domain it is expected to belong to.
            if row[0] in url:
                if ".".join(urlparse(url).hostname.split(".")[-2:]) != row[1]:
                    self._spam = 1
    def check_xss(self, url):
        malicious = self.check_disallowed_chars(url)
        if malicious:
            print("ADDED SPAM POINTS")
            self.spam_points += self.sensitivity
        # TODO add deobfuscation
    def extract_javascript(self):
        import_re = re.compile(self.JS_IMPORT_REGEX, re.DOTALL)
        extract_re = re.compile(self.JS_EXTRACT_REGEX, re.DOTALL)
        for doc in self.documents:
            self.js_code[doc] = []
            # Fetch externally imported scripts.
            for url in import_re.findall(self.documents[doc]):
                r = requests.get(url, allow_redirects=False)
                if r.status_code == 200:
                    self.js_code[doc].append(r.content)
            # Collect inline <script> bodies from the document itself.
            for js in extract_re.findall(self.documents[doc]):
                if js != "":
                    self.js_code[doc].append(js)
    def extract_urls(self, doc):  # Extract and return the URLs found in a document
        p = re.compile(self.URL_REGEX)
        res = p.findall(doc)
        return res
    def check_stored_xss(self):
        self.extract_javascript()
        for url in self.js_code:
            url_spam_count = 0
            for js in self.js_code[url]:
                if url_spam_count > 6:
                    self.add_domain_to_blacklist(url)
                if self.check_blacklisted(url=url):
                    self._spam = 1
                    return
                if self.check_js_code(js):
                    url_spam_count += 3
                    self.spam_points += self.sensitivity
    def check_js_code(self, code):
        parsedJs = jsParser.parseJavascript(code, False)
        return AI.aiPredict(parsedJs)
    def log_mail(self):
        cursor = self.mysql_db.cursor()
        domain = getaddresses(self.parsed_mail.get_all('from', []))[0][1]
        cursor.execute("INSERT INTO logs (sender_domain, result, account, mail_id) VALUES (%s, %s, %s, %s)",
                       (domain, self.get_spam(), self.account, self.mail_id))
        self.mysql_db.commit()
        cursor.close()
    def check_spam(self):
        if self.whitelisted:
            self._spam = 0
            self.log_mail()
            return
        elif self.blacklisted:
            self._spam = 1
        else:
            self.discover_url(self.urls)
            self.check_stored_xss()
            if self.threshold < self.spam_points:
                self._spam = 1
        if self.get_spam() == 1:
            self.log.info("Mail moved to spam with id: " + str(self.mail_id))
            if platform.system() == "Windows":
                notification.notify(
                    title='Found spam mail by: ' + getaddresses(self.parsed_mail.get_all('from', []))[0][1],
                    message=self.account,
                    app_icon=None,
                    timeout=10, )
            else:
                Notification(
                    title='Found spam mail by: ' + getaddresses(self.parsed_mail.get_all('from', []))[0][1],
                    description=self.account,
                    duration=10,
                    urgency=Notification.URGENCY_CRITICAL
                ).send()
        self.log_mail()
    def get_spam(self) -> int:
        if self.whitelisted:
            return 0
        return self._spam
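# A minimal usage sketch of how these classes might be driven; the credentials,
# host names and polling interval below are illustrative assumptions, not
# values from the original project.
if __name__ == "__main__":
    import time
    log = Logger("kulyutmaz.log")
    creds = {
        "mysql_username": "user",
        "mysql_password": "password",
        "mysql_database": "kulyutmaz",
        "mysql_host": "localhost",
    }
    box = MailBox("user@example.com", "app-password", "imap.example.com", 993,
                  creds, sensitivity=5, threshold=10, logger=log)
    while True:  # poll the inbox and check any newly arrived mail
        box.refresh_new_mails()
        time.sleep(60)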