import os
import json
import urllib.request
from datetime import datetime
from urllib.parse import urlparse, quote

import pandas as pd
import requests
import urllib3
import yandex_search
from bs4 import BeautifulSoup
from progressbar import ProgressBar

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

TRUSTED_TLDS = ["com", "org", "net"]

# Globals shared with the urlretrieve report hook below.
pbar = None
downloaded = 0


def show_progress(count, block_size, total_size):
    """Report hook for urllib.request.urlretrieve: draws a download progress bar."""
    global pbar
    global downloaded
    if pbar is None:
        pbar = ProgressBar(maxval=total_size)
        pbar.start()

    downloaded += block_size
    # Update with the cumulative byte count, capped at the total size.
    pbar.update(min(downloaded, total_size))
    if downloaded >= total_size:
        pbar.finish()
        pbar = None
        downloaded = 0


def get_data(phishtank_key, force_update=False):
    # Download the PhishTank "online-valid" feed unless a local copy already exists.
    if not os.path.isfile("phishtank.csv") or force_update:
        urllib.request.urlretrieve("http://data.phishtank.com/data/{}/online-valid.csv".format(phishtank_key),
                                   "phishtank.csv", show_progress)

    # Build the benign URL list: expand seed keywords through Google Suggest,
    # then collect the top Yandex results for each suggestion.
    if not os.path.isfile("common.csv") or force_update:
        data = {"url": []}
        with open("keywordList") as wordlist:
            keywords = [word for word in wordlist.read().split("\n") if word]

        suggestions = []
        for word in keywords:
            URL = "http://suggestqueries.google.com/complete/search?client=firefox&q=" + quote(word)
            headers = {'User-agent': 'Mozilla/5.0'}
            response = requests.get(URL, headers=headers)
            result = json.loads(response.content.decode('utf-8'))
            for r in result[1]:
                suggestions.append(r)

        yandex = yandex_search.Yandex(api_user='yksiber', api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')
        for word in suggestions:
            top10 = yandex.search(word).items[0:10]
            for site in top10:
                # Each Yandex result item is a dict; keep only its URL.
                data["url"].append(site["url"])

        common = pd.DataFrame(data)
        common.to_csv("common.csv")

    urls = (pd.read_csv("phishtank.csv"), pd.read_csv("common.csv"))
    return urls


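# Usage sketch for get_data (assumes a valid PhishTank application key and a
# "keywordList" file with one seed keyword per line); it returns a tuple of two
# DataFrames:
#   phishing_df, benign_df = get_data("<phishtank_key>", force_update=True)
# phishing_df is the PhishTank "online-valid" feed, benign_df holds the Yandex
# top-10 URLs gathered from the Google Suggest expansions of the keywords.
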
def find_list_resources(tag, attribute, soup):
    """Collect the given attribute from every matching tag, skipping tags that lack it."""
    resources = []
    for x in soup.findAll(tag):
        try:
            resources.append(x[attribute])
        except KeyError:
            pass
    return resources

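# Small illustration of find_list_resources (hypothetical markup): for soup built
# from '<img src="a.png"><img alt="no-src">', find_list_resources("img", "src", soup)
# returns ["a.png"]; the second tag has no "src", so the KeyError branch skips it.
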
def get_url_data(url, yandex, timeout=30):
    # Basic data extraction
    data = {}
    data["length"] = len(url.split("://")[1].split("?")[0])
    # Number of path separators beyond the two in "://", as a rough directory depth.
    data["dir_num"] = url.count("/") - 2
    parsed = urlparse(url)
    hostname_split = parsed.hostname.split(".")
    data["tld_trust"] = int(hostname_split[-1].lower() in TRUSTED_TLDS)
    data["subdomain_num"] = len(hostname_split) - 2
    data["subdomain_len"] = len("".join(hostname_split[:-2]))

    # Count hostname characters that change under IDNA encoding (non-ASCII lookalikes).
    special_char_count = 0
    for char in parsed.hostname:
        if char == ".":
            continue
        if not char.encode("utf-8") == char.encode("idna"):
            special_char_count += 1
    data["special_char_num"] = special_char_count

    # Advanced data extraction
    # How many pages of this host Yandex has indexed.
    try:
        data["index_num"] = yandex.search("site:{}".format(parsed.hostname)).found["all"]
    except yandex_search.NoResultsException:
        data["index_num"] = 0

    # Number of non-empty, non-comment lines in the site's robots.txt.
    robot_entry_counter = 0
    try:
        response = requests.get("{}://{}/robots.txt".format(parsed.scheme, parsed.netloc),
                                allow_redirects=True, verify=False, timeout=timeout)
        if response.status_code == 200:
            lines = [x for x in response.text.split("\n") if x != ""]
            robot_entry_counter += len([x for x in lines if x[0] != "#"])
    except Exception as e:
        print(e)
    data["robots_entries"] = robot_entry_counter

    # Count resources (images, scripts, stylesheets) loaded from other hosts.
    try:
        req = requests.get(url, verify=False, timeout=timeout)
        if req.status_code == 200:
            soup = BeautifulSoup(req.text, 'html.parser')
            image_src = find_list_resources('img', "src", soup)
            script_src = find_list_resources('script', "src", soup)
            css_link = find_list_resources("link", "href", soup)
            all_links = image_src + css_link + script_src
            out_links = []
            for link in all_links:
                parsed_link = urlparse(link)
                if parsed_link.hostname != parsed.hostname:
                    out_links.append(link)
            data["out_resources"] = len(out_links)
        else:
            data["out_resources"] = -1
    except Exception as e:
        print(e)
        data["out_resources"] = -1

    data["url"] = url
    return data

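# get_url_data returns one feature row per URL with the keys consumed below in
# extract_data: length, dir_num, special_char_num, tld_trust, index_num,
# subdomain_len, subdomain_num, out_resources, robots_entries and url.
# out_resources is -1 whenever the page could not be fetched with status 200.
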
def extract_data(raw_data, force_update=False):
    reps = 0
    phishing, benign = raw_data[0], raw_data[1]
    data = {
        "phishing": [],
        "length": [],
        "out_resources": [],
        "dir_num": [],
        "special_char_num": [],
        "robots_entries": [],
        "tld_trust": [],
        "index_num": [],
        "subdomain_len": [],
        "subdomain_num": [],
        "url": []
    }

    def add_entry(url_data, label):
        # Append one labelled feature row to the in-memory dataset.
        data["phishing"].append(label)
        for key in ("length", "dir_num", "special_char_num", "tld_trust", "index_num",
                    "subdomain_len", "subdomain_num", "out_resources", "robots_entries", "url"):
            data[key].append(url_data[key])

    if not os.path.isfile("dataset.csv") or force_update:
        # Resume after the largest existing checkpoint (dataset300.csv, dataset600.csv, ...).
        largest_dataset = 0
        while os.path.isfile("dataset{}.csv".format(largest_dataset + 300)):
            largest_dataset += 300
        try:
            # Filter out phishing reports submitted before 2020.
            old = []
            for index, row in phishing.iterrows():
                date = datetime.strptime(row["submission_time"], "%Y-%m-%dT%H:%M:%S+00:00")
                if date.year < 2020:
                    old.append(index)
            phishing = phishing.drop(old)

            yandex = yandex_search.Yandex(api_user='yksiber', api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')

            # Phishing URLs, labelled 1.
            for index, row in phishing.iterrows():
                reps += 1
                if reps < largest_dataset:
                    continue
                if reps % 300 == 0:
                    pd.DataFrame(data).to_csv("dataset{}.csv".format(reps))
                url = row['url']
                print("[INFO]: {} : {}".format(reps, url))
                add_entry(get_url_data(url, yandex), 1)

            # Benign URLs, labelled 0.
            for index, row in benign.iterrows():
                reps += 1
                if reps < largest_dataset:
                    continue
                if reps % 300 == 0:
                    pd.DataFrame(data).to_csv("dataset{}.csv".format(reps))
                url = row['url']
                print("[INFO]: {} : {}".format(reps, url))
                add_entry(get_url_data(url, yandex), 0)

            pd.DataFrame(data).to_csv("dataset.csv")
        except Exception as e:
            print("[ERROR]: {}".format(e))
    return pd.read_csv("dataset.csv")

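# Note on checkpointing: a partial "dataset{reps}.csv" is written every 300 rows,
# and the resume logic skips rows below the largest existing checkpoint, but those
# already-processed rows are not reloaded into memory, so a resumed run rebuilds
# "dataset.csv" from the checkpoint onward only.
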
raw_data = get_data("01115eebdbf465734c08fedb2e4d93f414d1a31fa10bfcb248d0f75071e156ff")
print("DOWNLOAD COMPLETED!")
dataset = extract_data(raw_data)
print("EXTRACT COMPLETED!")