Kulyutmaz is a project developed for the programming field of the regional TUBITAK competition. It aims to create a more advanced phishing e-mail detection algorithm using website content checking and a neural network that we have trained.
 
 
 


import pandas as pd
import os
import urllib.request
import json
from datetime import datetime
from progressbar import ProgressBar
import requests
import yandex_search
from urllib.parse import urlparse, quote
import urllib3
from bs4 import BeautifulSoup

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

TRUSTED_TLDS = ["com", "org", "net"]
pbar = None
downloaded = 0
def show_progress(count, block_size, total_size):
    # urllib reporthook: draw a progress bar while the PhishTank feed downloads
    global pbar
    global downloaded
    if pbar is None:
        pbar = ProgressBar(maxval=total_size).start()
    downloaded += block_size
    pbar.update(min(downloaded, total_size))  # update with the cumulative byte count
    if downloaded >= total_size:
        pbar.finish()
        pbar = None
        downloaded = 0

def get_data(phishtank_key, force_update=False):
    # Download the PhishTank feed of verified phishing URLs
    if not os.path.isfile("phishtank.csv") or force_update:
        urllib.request.urlretrieve("http://data.phishtank.com/data/{}/online-valid.csv".format(phishtank_key),
                                   "phishtank.csv", show_progress)
    # Build a list of benign URLs: expand the keyword list with Google suggestions,
    # then take the top Yandex results for each suggestion
    if not os.path.isfile("common.csv") or force_update:
        data = {"url": []}
        with open("keywordList") as wordlist:
            keywords = [w for w in wordlist.read().splitlines() if w]  # drop empty lines
        suggestions = []
        for word in keywords:
            # URL-encode the keyword for the suggestion query
            URL = "http://suggestqueries.google.com/complete/search?client=firefox&q=" + quote(word)
            headers = {'User-agent': 'Mozilla/5.0'}
            response = requests.get(URL, headers=headers)
            result = json.loads(response.content.decode('utf-8'))
            for r in result[1]:
                suggestions.append(r)
        yandex = yandex_search.Yandex(api_user='yksiber', api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')
        for word in suggestions:
            top10 = yandex.search(word).items[0:10]
            for site in top10:
                data["url"].append(site["url"])  # keep only the result URL from each item
        common = pd.DataFrame(data)
        common.to_csv("common.csv")
    urls = (pd.read_csv("phishtank.csv"), pd.read_csv("common.csv"))
    return urls

def find_list_resources(tag, attribute, soup):
    # Collect the given attribute from every matching tag (e.g. every <img src=...>)
    resources = []
    for x in soup.findAll(tag):
        try:
            resources.append(x[attribute])
        except KeyError:
            pass
    return resources

def get_url_data(url, yandex, timeout=30):
    # Basic data extraction
    data = {}
    data["length"] = len(url.split("://")[1].split("?")[0])
    data["dir_num"] = url.count("/") - 2  # path separators beyond the two in "://"
    parsed = urlparse(url)
    hostname_split = parsed.hostname.split(".")
    data["tld_trust"] = int(hostname_split[-1].lower() in TRUSTED_TLDS)
    data["subdomain_num"] = len(hostname_split) - 2
    data["subdomain_len"] = len("".join(hostname_split[:-2]))
    special_char_count = 0
    for char in parsed.hostname:
        if char == ".":
            continue
        if not char.encode("utf-8") == char.encode("idna"):
            special_char_count += 1
    data["special_char_num"] = special_char_count
    # Advanced data extraction
    try:
        data["index_num"] = yandex.search("site:{}".format(parsed.hostname)).found["all"]
    except yandex_search.NoResultsException:
        data["index_num"] = 0
    robot_entry_counter = 0
    try:
        response = requests.get("{}://{}/robots.txt".format(parsed.scheme, parsed.netloc),
                                allow_redirects=True, verify=False, timeout=timeout)
        if response.status_code == 200:
            lines = [x for x in response.text.split("\n") if x != ""]
            robot_entry_counter += len([x for x in lines if x[0] != "#"])
    except Exception as e:
        print(e)
    data["robots_entries"] = robot_entry_counter
    try:
        req = requests.get(url, verify=False, timeout=timeout)
        if req.status_code == 200:
            soup = BeautifulSoup(req.text, 'html.parser')
            image_src = find_list_resources('img', "src", soup)
            script_src = find_list_resources('script', "src", soup)
            css_link = find_list_resources("link", "href", soup)
            all_links = image_src + css_link + script_src
            out_links = []
            for link in all_links:
                parsed_link = urlparse(link)
                # relative links have no hostname and stay on the same site
                if parsed_link.hostname and parsed_link.hostname != parsed.hostname:
                    out_links.append(link)
            data["out_resources"] = len(out_links)
        else:
            data["out_resources"] = -1
    except Exception as e:
        print(e)
        data["out_resources"] = -1
    data["url"] = url
    return data

def extract_data(raw_data, force_update=False):
    reps = 0
    phishing, benign = raw_data[0], raw_data[1]
    data = {
        "phishing": [],
        "length": [],
        "out_resources": [],
        "dir_num": [],
        "special_char_num": [],
        "robots_entries": [],
        "tld_trust": [],
        "index_num": [],
        "subdomain_len": [],
        "subdomain_num": [],
        "url": []
    }
    if not os.path.isfile("dataset.csv") or force_update:
        # resume after the largest partial dataset saved by a previous run
        largest_dataset = 0
        while os.path.isfile("dataset{}.csv".format(largest_dataset + 300)):
            largest_dataset += 300
        try:
            # filter old sites
            old = []
            for index, row in phishing.iterrows():
                date = datetime.strptime(row["submission_time"], "%Y-%m-%dT%H:%M:%S+00:00")
                if date.year < 2020:
                    old.append(index)
            phishing = phishing.drop(old)
            yandex = yandex_search.Yandex(api_user='yksiber', api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')
            for index, row in phishing.iterrows():
                reps += 1
                if reps < largest_dataset:
                    continue
                if reps % 300 == 0:
                    pd.DataFrame(data).to_csv("dataset{}.csv".format(reps))
                url = row['url']
                print("[INFO]: {} : {}".format(reps, url))
                url_data = get_url_data(url, yandex)
                data["phishing"].append(1)
                data["length"].append(url_data["length"])
                data["dir_num"].append(url_data["dir_num"])
                data["special_char_num"].append(url_data["special_char_num"])
                data["tld_trust"].append(url_data["tld_trust"])
                data["index_num"].append(url_data["index_num"])
                data["subdomain_len"].append(url_data["subdomain_len"])
                data["subdomain_num"].append(url_data["subdomain_num"])
                data["out_resources"].append(url_data["out_resources"])
                data["robots_entries"].append(url_data["robots_entries"])
                data["url"].append(url_data["url"])
            for index, row in benign.iterrows():
                reps += 1
                if reps < largest_dataset:
                    continue
                if reps % 300 == 0:
                    pd.DataFrame(data).to_csv("dataset{}.csv".format(reps))
                url = row['url']
                print("[INFO]: {} : {}".format(reps, url))
                url_data = get_url_data(url, yandex)
                data["phishing"].append(0)  # benign URLs are labelled 0
                data["length"].append(url_data["length"])
                data["dir_num"].append(url_data["dir_num"])
                data["special_char_num"].append(url_data["special_char_num"])
                data["tld_trust"].append(url_data["tld_trust"])
                data["index_num"].append(url_data["index_num"])
                data["subdomain_len"].append(url_data["subdomain_len"])
                data["subdomain_num"].append(url_data["subdomain_num"])
                data["out_resources"].append(url_data["out_resources"])
                data["robots_entries"].append(url_data["robots_entries"])
                data["url"].append(url_data["url"])
            pd.DataFrame(data).to_csv("dataset.csv")
        except Exception as e:
            print("[ERROR]: {}".format(e))
    return pd.read_csv("dataset.csv")

raw_data = get_data("01115eebdbf465734c08fedb2e4d93f414d1a31fa10bfcb248d0f75071e156ff")
print("DOWNLOAD COMPLETED!")
dataset = extract_data(raw_data)
print("EXTRACT COMPLETED!")