Kulyutmaz is a project developed for the programming category of the regional TUBITAK competition. It aims to build a more advanced phishing e-mail detection algorithm by combining website content checks with a neural network that we trained.
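As a quick orientation, here is a minimal sketch of how the MailBox class in the file below might be driven. The constructor signature and the Logger class come from the code itself, but the credentials, MySQL settings, sensitivity/threshold values, and the polling loop are placeholders and assumptions, since no entry-point script is shown here:

import time

mysql_creds = {
    "mysql_username": "user",
    "mysql_password": "password",
    "mysql_database": "kulyutmaz",
    "mysql_host": "localhost",
}

log = Logger("kulyutmaz.log")
# sensitivity is added to spam_points for each failed check;
# threshold is the score above which a mail is marked as spam.
box = MailBox("user@example.com", "app-password", "imap.example.com", 993,
              mysql_creds, sensitivity=10, threshold=50, logger=log)

while True:
    box.refresh_new_mails()  # fetch and check anything that arrived since the last poll
    time.sleep(60)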

import mysql.connector
import re
import requests
import urllib.parse
from datetime import datetime
import imaplib
from email.parser import BytesParser
from email.utils import getaddresses
import AI          # project-local module used by check_js_code()
import jsParser    # project-local module that parses JavaScript for the classifier
from pynotifier import Notification
import platform
from plyer import notification
from urllib.parse import urlparse
import pickle
import numpy
import yandex_search
from bs4 import BeautifulSoup

MAILS_TO_CACHE = 5                 # how many of the newest mails are pulled and checked on start-up
SKIP = ["facebook.com", "w3.org"]  # domains that discover_url() never follows

class Logger:
    def __init__(self, filename):
        self.file = open(filename, "a+")

    def error(self, msg: str):
        self.file.write("[ERROR] " + self.generate_log(msg))
        self.file.flush()

    def debug(self, msg):
        self.file.write("[DEBUG] " + self.generate_log(msg))
        self.file.flush()

    def info(self, msg: str):
        self.file.write("[INFO] " + self.generate_log(msg))
        self.file.flush()

    def generate_log(self, msg: str):
        now = datetime.now()
        timestamp = now.strftime("%b %d %Y %H:%M:%S")
        return timestamp + " : " + msg + "\n"

    def stop(self):
        self.file.close()

class MailBox:
    def __init__(self, email: str, passwd: str, server: str, port: int, mysql_creds: dict, sensitivity: int,
                 threshold: int, logger: Logger):
        try:
            self.log = logger
            self.log.info("Started MailBox listener for " + email)
            self.FROM_EMAIL = email
            self.FROM_PWD = passwd
            self.SMTP_SERVER = server
            self.SMTP_PORT = port
            self.imap = imaplib.IMAP4_SSL(self.SMTP_SERVER, self.SMTP_PORT)
            self.imap.login(self.FROM_EMAIL, self.FROM_PWD)
            self.imap.select('INBOX')
            self.mysql_creds = mysql_creds
            self.threshold = threshold
            self.sensitivity = sensitivity
            self.get_ids()
            # The mails dictionary format is <mail_id>: (<Mail object>, <modified time>)
            self.mails = {}
            self.mysql = mysql_creds
            self.spam_folder = self.detect_spam_folder()
            self.get_new_mails()
        except Exception as e:
            self.log.error(str(e))

    def get_ids(self):
        try:
            self.imap.select("INBOX", False)
            typ, ids = self.imap.uid('search', None, 'ALL')
            self.ids = ids[0].decode().split()
            self.log.debug(str(self.ids))
        except Exception as e:
            self.log.error(str(e))
            # Reconnect and retry after a failure.
            self.imap = imaplib.IMAP4_SSL(self.SMTP_SERVER, self.SMTP_PORT)
            self.imap.login(self.FROM_EMAIL, self.FROM_PWD)
            self.imap.select('INBOX', False)
            self.get_ids()

    def get_new_mails(self):
        mysql_db = mysql.connector.connect(
            user=self.mysql_creds["mysql_username"],
            password=self.mysql_creds["mysql_password"],
            database=self.mysql_creds["mysql_database"],
            host=self.mysql_creds["mysql_host"]
        )
        cursor = mysql_db.cursor()
        while len(self.mails) <= MAILS_TO_CACHE:
            id = self.ids[-1 - len(self.mails)]
            cursor.execute(
                "SELECT mail_id FROM logs WHERE mail_id = '{}' AND account = '{}'".format(id, self.FROM_EMAIL))
            cursor.fetchall()
            if cursor.rowcount > 0:
                # Already logged on a previous run; keep a placeholder so it is not fetched again.
                self.mails[id] = "[BUFFERED MAIL]"
                continue
            typ, messageRaw = self.imap.uid('fetch', id, '(RFC822)')
            self.log.info("Found mail with id: " + str(id))
            email = messageRaw[0][1]
            self.mails[id] = Mail(email, self.mysql_creds, self.threshold, self.sensitivity, self.FROM_EMAIL, self.log,
                                  id, self.spam_folder)
            self.mails[id].check_spam()
            self.log.info("Checked new mail with id: " + str(id))

    def refresh_new_mails(self):
        self.log.info("Refreshing mails")
        old_ids = self.ids
        self.get_ids()
        # self.log.debug(str(self.ids))
        diff_ids = set(self.ids) - set(old_ids)
        for id in diff_ids:
            self.log.info("Found new mail with id: " + str(id))
            typ, messageRaw = self.imap.uid('fetch', id, '(RFC822)')
            self.log.info("Fetched the new mail")
            email = messageRaw[0][1]
            self.mails[id] = Mail(email, self.mysql_creds, self.threshold, self.sensitivity, self.FROM_EMAIL, self.log,
                                  id, self.spam_folder)
            self.mails[id].check_spam()

    def check_spam(self):
        for mail_name in self.mails:
            mail_obj = self.mails[mail_name]
            if not isinstance(mail_obj, Mail):
                # Skip the "[BUFFERED MAIL]" placeholders left by get_new_mails().
                continue
            mail_obj.check_spam()
            self.log.info("Checked new mail with id: " + str(mail_name))
            if mail_obj.get_spam() == 1:
                self.log.info("Found spam mail sent by: " + mail_obj.header_data["From"].split("<")[1][:-1])
                result = self.imap.uid('COPY', mail_obj.mail_id, mail_obj.spam_folder)
                if result[0] == 'OK':
                    mov, data = self.imap.uid('STORE', mail_obj.mail_id, '+FLAGS', '(\\Deleted)')
                    self.imap.expunge()

    def detect_spam_folder(self):
        folder = ""
        for i in self.imap.list()[1]:
            l = i.decode().split(' "/" ')
            if "Junk" in l[0]:
                folder = l[1]
                break
            if "Trash" in l[0]:
                folder = l[1]
        # Strip the surrounding quotes that some servers put around folder names.
        if folder[0] == "\"":
            folder = folder[1:]
        if folder[-1] == "\"":
            folder = folder[:-1]
        return folder

class Mail:
    def __init__(self, mail_data, mysql_creds, threshold, sensitivity, account, logger, mail_id, spam_folder):
        # Python regexes; the inline (?s) flag stands in for the JavaScript-style /.../s DOTALL modifier.
        self.JS_IMPORT_REGEX = r'(?s)<script[^>]*src="(.*?)"[^>]*>'
        self.JS_EXTRACT_REGEX = r'(?s)<script[^>]*>(.*?)</script>'
        self.URL_REGEX = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|[^\x00-\x7F]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
        self.parser = BytesParser()
        self.sensitivity = sensitivity
        self.threshold = threshold
        self.log = logger
        self.spam_folder = spam_folder
        self.mysql_db = mysql.connector.connect(
            user=mysql_creds["mysql_username"],
            password=mysql_creds["mysql_password"],
            database=mysql_creds["mysql_database"],
            host=mysql_creds["mysql_host"]
        )
        self.account = account
        self.spam_points = 0
        self.js_code = {}
        self.urls_in_document = []
        self.documents = {}
        self.mail_id = mail_id
        # The headers are defined as <key>:<to_remove_from key>
        # -1 is used to define the last header, after that comes the mail contents
        self.whitelisted = False
        self.blacklisted = False
        self.parsed_mail = self.parser.parsebytes(mail_data)
        self.header_data = dict(self.parsed_mail)
        self.message = ""
        self.extract_message()
        self._spam = -1
        self.check_whitelist()
        self.check_blacklisted()
        self.urls = re.findall(self.URL_REGEX, self.message)
        for i in range(len(self.urls)):
            self.urls[i] = self.urls[i].strip()

    def add_domain_to_blacklist(self, url):
        domain = urlparse(url).hostname
        cursor = self.mysql_db.cursor()
        cursor.execute("INSERT INTO new_blacklists(domain) VALUES('{}')".format(domain.encode("idna").decode("utf-8")))
        cursor.execute("INSERT INTO domain_blacklist(domain) VALUES('{}')".format(domain.encode("idna").decode("utf-8")))
        self.mysql_db.commit()  # persist the inserts; without a commit they are lost when the connection closes

    def extract_message(self):
        if self.parsed_mail.is_multipart():
            for i in self.parsed_mail.get_payload():
                payload = i.get_payload(decode=True)
                try:
                    self.message += payload.decode("utf-8")
                except AttributeError as e:
                    self.log.error("AttributeError while trying to get message from mail with id " + str(self.mail_id))
                    print(e)
                except UnicodeDecodeError as e:
                    self.log.error(
                        "UnicodeDecodeError while trying to get message from mail with id " + str(self.mail_id))
                    print(e)
        else:
            payload = self.parsed_mail.get_payload(decode=True)
            try:
                self.message += payload.decode("utf-8")
            except AttributeError as e:
                self.log.error("AttributeError while trying to get message from mail with id " + str(self.mail_id))
                print(e)
            except UnicodeDecodeError as e:
                self.log.error("UnicodeDecodeError while trying to get message from mail with id " + str(self.mail_id))
                print(e)

    def check_blacklisted(self, url=None):
        if url is not None:
            url = url.encode("idna").decode("utf-8")
        cursor = self.mysql_db.cursor()
        if url is not None:
            cursor.execute("SELECT * FROM domain_blacklist WHERE domain LIKE '{}';".format(url))
            cursor.fetchall()
            if cursor.rowcount > 0:
                cursor.close()
                return True
            return False
        mail_header = self.header_data["From"].split("<")[1][:-1]
        mail = mail_header
        cursor.execute("SELECT * FROM mail_blacklist WHERE mail='{}';".format(mail))
        cursor.fetchall()
        if cursor.rowcount >= 1:
            print("Blacklisted")
            self.blacklisted = True

    def check_whitelist(self, url=None):
        if url is not None:
            url = url.encode("idna").decode("utf-8")
        cursor = self.mysql_db.cursor()
        if url is not None:
            cursor.execute("SELECT * FROM domain_whitelist WHERE domain LIKE '%{}%';".format(url))
            cursor.fetchall()
            if cursor.rowcount > 0:
                cursor.close()
                return True
            return False
        mail_header = self.header_data["From"].split("<")[1][:-1]
        mail = mail_header
        cursor.execute("SELECT * FROM mail_whitelist WHERE mail='{}';".format(mail))
        cursor.fetchall()
        if cursor.rowcount >= 1:
            self.whitelisted = True

    def check_special_chars(self):
        for url in self.urls:
            parsed = urllib.parse.urlparse(url.encode("idna").decode("utf-8").encode("utf-8").decode("idna"))
            special_char_count = 0
            for char in parsed.netloc:
                if not char == ".":
                    # Characters that do not survive IDNA encoding unchanged (typically non-ASCII
                    # homograph characters) are treated as suspicious.
                    if not char.encode("utf-8") == char.encode("idna"):
                        print("Special char detected")
                        self._spam = 1

    def aiPredict(self, data):
        with open("aiModel", "rb") as m:
            aiModel = pickle.load(m)
        ai_in = (data["dir_num"], data["index_num"], data["length"], data["out_resources"], data["robots_entries"],
                 data["special_char_num"], data["subdomain_len"], data["subdomain_num"], data["tld_trust"])
        return aiModel.predict(numpy.reshape(ai_in, (1, 9)))
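
    # Note on aiPredict(): "aiModel" is assumed to be a pickled estimator exposing a
    # scikit-learn style predict() method. The nine URL features produced by
    # get_url_data() are passed in the fixed order shown above, so the model must have
    # been trained with that same column order.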

    def find_list_resources(self, tag, attribute, soup):
        resources = []
        for x in soup.findAll(tag):
            try:
                resources.append(x[attribute])
            except KeyError:
                pass
        return resources

    def get_url_data(self, url, yandex, timeout=30):
        data = {}
        data["length"] = len(url.split("://")[1].split("?")[0])
        data["dir_num"] = url.find("/") - 2
        parsed = urlparse(url.encode("idna").decode("utf-8").encode("utf-8").decode("idna"))
        hostname_split = parsed.hostname.split(".")
        data["tld_trust"] = int(hostname_split[-1].lower() in ["com", "org", "net"])
        data["subdomain_num"] = len(hostname_split) - 2
        data["subdomain_len"] = len("".join(hostname_split[:-2]))
        special_char_count = 0
        for char in parsed.hostname:
            if char == ".":
                continue
            if not char.encode("utf-8") == char.encode("idna"):
                special_char_count += 1
        data["special_char_num"] = special_char_count
        # Advanced data extraction
        try:
            data["index_num"] = int(yandex.search("site:{}".format(parsed.hostname)).found["all"])
        except yandex_search.NoResultsException:
            data["index_num"] = 0
        robot_entry_counter = 0
        try:
            response = requests.get("{}://{}/robots.txt".format(parsed.scheme, parsed.netloc), allow_redirects=True,
                                    verify=False, timeout=timeout)
            if response.status_code == 200:
                lines = response.text.split("\n")
                lines = [x for x in lines if x != ""]
                robot_entry_counter += len([x for x in lines if x[0] != "#"])
            else:
                pass
        except Exception as e:
            print(e)
        data["robots_entries"] = robot_entry_counter
        try:
            req = requests.get(url, verify=False, timeout=timeout)
            if req.status_code == 200:
                soup = BeautifulSoup(req.text, 'html.parser')
                image_scr = self.find_list_resources('img', "src", soup)
                script_src = self.find_list_resources('script', "src", soup)
                css_link = self.find_list_resources("link", "href", soup)
                all_links = image_scr + css_link + script_src
                out_links = []
                for link in all_links:
                    parsed_link = urlparse(link)
                    if parsed_link.hostname != parsed.hostname:
                        out_links.append(link)
                data["out_resources"] = len(out_links)
            else:
                data["out_resources"] = -1
        except Exception as e:
            print(e)
            data["out_resources"] = -1
        return data

    def check_url(self, url):
        yandex = yandex_search.Yandex(api_user='raporcubaba@gmail.com', api_key='03.1042294429:b8e679f9acadef49ebab0d9726ccef58')
        data = self.get_url_data(url, yandex, timeout=10)
        if self.aiPredict(data):
            self.add_domain_to_blacklist(url)
            self.spam_points += self.sensitivity

    def check_js(self):
        for url in self.js_code:
            for js in self.js_code[url]:
                if self.check_blacklisted(url=js):
                    self._spam = 1
                if self.check_js_code(js):
                    self.add_domain_to_blacklist(url)
                    self.spam_points += self.sensitivity

    def check_disallowed_chars(self, url_start, chars=["<", ">", "'", "\""]):
        url = url_start
        for char in chars:
            if char in url:
                return True
            if urllib.parse.quote_plus(char) in url:
                return True
            if urllib.parse.quote_plus(urllib.parse.quote_plus(char)) in url:
                return True
        return False
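
    # Example: for "<" the loop above also matches the single and double URL-encoded
    # forms, since urllib.parse.quote_plus("<") == "%3C" and quote_plus("%3C") == "%253C";
    # a URL containing any of the three variants is reported as disallowed.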

    def check_tld(self, url):
        if urlparse(url).hostname.split(".")[-1] in ["info", "tk", "gq"]:
            self._spam = 1

    def check_domain_name(self, url):
        pass  # TODO fill this up

    def discover_url(self, urls):
        for url in urls:
            foo = False
            for i in self.documents:
                if urlparse(url).hostname == urlparse(i).hostname:
                    foo = True
            if foo:
                continue
            while 1:
                if self.check_whitelist(url=url):
                    break
                if self.check_blacklisted(url=url):
                    self._spam = 1
                    return
                self.log.info(url)
                self.check_disallowed_chars(url)
                self.keyword_search(url)
                self.check_xss(url)
                self.check_url(url)
                self.check_tld(url)
                self.check_domain_name(url)
                try:
                    r = requests.get(url, allow_redirects=False)
                except requests.exceptions.ConnectionError:
                    break
                # detect all status codes in the format 3xx
                try:
                    self.documents[url] = r.content.decode()
                    for found in self.extract_urls(self.documents[url]):
                        # Skip hosts that have already been crawled.
                        seen = False
                        for doc_url in self.documents:
                            if urlparse(found).hostname == urlparse(doc_url).hostname:
                                seen = True
                        if seen:
                            continue
                        skip = False
                        for j in SKIP:
                            if j in found:
                                skip = True
                        if skip:
                            continue
                        self.discover_url([found.strip()])
                except UnicodeDecodeError:
                    pass
                if r.status_code == 302 or r.status_code == 303:
                    # Follow the redirect manually and re-check the new location.
                    location = r.headers["location"]
                    if location.startswith("http"):
                        url = location
                    else:
                        url = "/".join(url.split("/")[:-1]) + location
                    continue
                else:
                    break

    def keyword_search(self, url):
        keywords = self.mysql_db.cursor()
        keywords.execute("SELECT * FROM keywordlist;")
        result = keywords.fetchall()
        for row in result:
            # If a known keyword (row[0]) appears in the URL but the URL's registrable
            # domain is not that keyword's legitimate domain (row[1]), flag the mail.
            if row[0] in url:
                if ".".join(urlparse(url).hostname.split(".")[-2:]) != row[1]:
                    self._spam = 1

    def check_xss(self, url):
        malicious = self.check_disallowed_chars(url)
        if malicious:
            print("ADDED SPAM POINTS")
            self.spam_points += self.sensitivity
        # TODO add deobfuscation

    def extract_javascript(self):  # TODO not working
        import_pattern = re.compile(self.JS_IMPORT_REGEX)
        extract_pattern = re.compile(self.JS_EXTRACT_REGEX)
        for doc in self.documents:
            self.js_code[doc] = []
            # External scripts referenced through src="..."
            for url in import_pattern.findall(self.documents[doc]):
                r = requests.get(url, allow_redirects=False)
                if r.status_code == 200:
                    self.js_code[doc].append(r.content)
            # Inline <script> bodies
            for js in extract_pattern.findall(self.documents[doc]):
                if js != "":
                    self.js_code[doc].append(js)

    def extract_urls(self, doc):
        # Extract all URLs from the given document text and return them as a list.
        p = re.compile(self.URL_REGEX)
        res = p.findall(doc)
        return res

    def check_stored_xss(self):
        self.extract_javascript()
        for url in self.js_code:
            url_spam_count = 0
            for js in self.js_code[url]:
                if url_spam_count > 6:
                    self.add_domain_to_blacklist(url)
                if self.check_blacklisted(url=url):
                    self._spam = 1
                    return
                if self.check_js_code(js):
                    url_spam_count += 3
                    self.spam_points += self.sensitivity

    def check_js_code(self, code):
        parsedJs = jsParser.parseJavascript(code, False)
        return AI.aiPredict(parsedJs)

    def log_mail(self):
        cursor = self.mysql_db.cursor()
        domain = getaddresses(self.parsed_mail.get_all('from', []))[0][1]
        cursor.execute(
            "INSERT INTO logs (sender_domain, result, account, mail_id) VALUES ('{}','{}','{}','{}')".format(
                domain, self.get_spam(), self.account, self.mail_id))
        self.mysql_db.commit()
        cursor.close()

    def check_spam(self):
        if self.whitelisted:
            self._spam = 0
            self.log_mail()
            return
        elif self.blacklisted:
            self._spam = 1
        else:
            self.discover_url(self.urls)
            self.check_stored_xss()
            if self.threshold < self.spam_points:
                self._spam = 1
        if self.get_spam() == 1:
            self.log.info("Mail moved to spam with id: " + str(self.mail_id))
            if platform.system() == "Windows":
                notification.notify(
                    title='Found spam mail by: ' + getaddresses(self.parsed_mail.get_all('from', []))[0][1],
                    message=self.account,
                    app_icon=None,
                    timeout=10)
            else:
                Notification(
                    title='Found spam mail by: ' + getaddresses(self.parsed_mail.get_all('from', []))[0][1],
                    description=self.account,
                    duration=10,
                    urgency=Notification.URGENCY_CRITICAL
                ).send()
        self.log_mail()

    def get_spam(self) -> int:
        if self.whitelisted:
            return 0
        return self._spam
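
    # Return-value convention used throughout the file: -1 means the mail has not been
    # checked yet, 0 means clean (whitelisted senders always return 0), 1 means spam.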