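"""Build a phishing-detection dataset.

Downloads verified phishing URLs from PhishTank, collects benign URLs via
Google Suggest keywords ranked through the Yandex Search API, and extracts
lexical and content-based features for each URL into dataset.csv.
"""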
import json
import os
import urllib.request
from datetime import datetime
from urllib.parse import urlparse

import pandas as pd
import requests
import urllib3
import yandex_search
from bs4 import BeautifulSoup
from progressbar import ProgressBar

# Many phishing pages serve broken TLS certificates, so verification warnings are silenced.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

TRUSTED_TLDS = ["com", "org", "net"]

# Shared state for the urlretrieve() progress callback.
pbar = None
downloaded = 0

def show_progress(count, block_size, total_size):
    """Progress callback for urllib.request.urlretrieve()."""
    global pbar
    global downloaded
    if pbar is None:
        pbar = ProgressBar(maxval=total_size).start()

    downloaded += block_size
    pbar.update(min(downloaded, total_size))
    if downloaded >= total_size:
        pbar.finish()
        pbar = None
        downloaded = 0

def get_data(phishtank_key, force_update=False):
    """Download the raw URL lists: verified phishing URLs from PhishTank and
    benign URLs gathered from Yandex searches seeded by Google Suggest."""
    if not os.path.isfile("phishtank.csv") or force_update:
        urllib.request.urlretrieve(
            "http://data.phishtank.com/data/{}/online-valid.csv".format(phishtank_key),
            "phishtank.csv", show_progress)
    if not os.path.isfile("common.csv") or force_update:
        data = {"url": []}
        with open("keywordList") as wordlist:
            keywords = [line.strip() for line in wordlist if line.strip()]
        # Expand every seed keyword with Google Suggest completions.
        suggestions = []
        for word in keywords:
            response = requests.get(
                "http://suggestqueries.google.com/complete/search",
                params={"client": "firefox", "q": word},
                headers={"User-agent": "Mozilla/5.0"})
            result = json.loads(response.content.decode("utf-8"))
            for r in result[1]:
                suggestions.append(r)
        # Take the top 10 Yandex results for every suggestion as benign URLs.
        yandex = yandex_search.Yandex(api_user='yksiber', api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')
        for word in suggestions:
            top10 = yandex.search(word).items[0:10]
            for site in top10:
                data["url"].append(site["url"])  # each result item is a dict; keep only the URL
        common = pd.DataFrame(data)
        common.to_csv("common.csv")
    urls = (pd.read_csv("phishtank.csv"), pd.read_csv("common.csv"))
    return urls

def find_list_resources(tag, attribute, soup):
    # Collect the given attribute from every matching tag on the page.
    resources = []
    for x in soup.find_all(tag):
        try:
            resources.append(x[attribute])
        except KeyError:
            pass
    return resources

def get_url_data(url, yandex, timeout=30):
    # Basic (lexical) features
    data = {}
    data["length"] = len(url.split("://")[1].split("?")[0])  # URL length without scheme and query string
    data["dir_num"] = url.count("/") - 2                     # path depth: "/" count minus the two in "://"
    parsed = urlparse(url)
    hostname_split = parsed.hostname.split(".")
    data["tld_trust"] = int(hostname_split[-1].lower() in TRUSTED_TLDS)
    data["subdomain_num"] = len(hostname_split) - 2
    data["subdomain_len"] = len("".join(hostname_split[:-2]))
    # Count hostname characters that change under IDNA encoding (internationalised/spoofed characters).
    special_char_count = 0
    for char in parsed.hostname:
        if char == ".":
            continue
        if not char.encode("utf-8") == char.encode("idna"):
            special_char_count += 1
    data["special_char_num"] = special_char_count
    # Advanced (search-engine and content) features
    try:
        data["index_num"] = yandex.search("site:{}".format(parsed.hostname)).found["all"]
    except yandex_search.NoResultsException:
        data["index_num"] = 0
    robot_entry_counter = 0
    try:
        response = requests.get("{}://{}/robots.txt".format(parsed.scheme, parsed.netloc),
                                allow_redirects=True, verify=False, timeout=timeout)
        if response.status_code == 200:
            lines = [x.strip() for x in response.text.split("\n")]
            lines = [x for x in lines if x != ""]
            robot_entry_counter += len([x for x in lines if x[0] != "#"])
    except Exception as e:
        print(e)
    data["robots_entries"] = robot_entry_counter
    try:
        req = requests.get(url, verify=False, timeout=timeout)
        if req.status_code == 200:
            soup = BeautifulSoup(req.text, 'html.parser')
            image_src = find_list_resources('img', "src", soup)
            script_src = find_list_resources('script', "src", soup)
            css_link = find_list_resources("link", "href", soup)
            all_links = image_src + css_link + script_src
            # Count resources loaded from a different host; relative links count as local.
            out_links = []
            for link in all_links:
                parsed_link = urlparse(link)
                if parsed_link.hostname and parsed_link.hostname != parsed.hostname:
                    out_links.append(link)
            data["out_resources"] = len(out_links)
        else:
            data["out_resources"] = -1
    except Exception as e:
        print(e)
        data["out_resources"] = -1
    data["url"] = url
    return data

def extract_data(raw_data, force_update=False):
    """Extract features for every URL into dataset.csv, checkpointing every
    300 rows so an interrupted run can resume from the last checkpoint."""
    reps = 0
    phishing, benign = raw_data[0], raw_data[1]
    data = {
        "phishing": [],
        "length": [],
        "out_resources": [],
        "dir_num": [],
        "special_char_num": [],
        "robots_entries": [],
        "tld_trust": [],
        "index_num": [],
        "subdomain_len": [],
        "subdomain_num": [],
        "url": []
    }
    if not os.path.isfile("dataset.csv") or force_update:
        # Find the newest checkpoint (dataset300.csv, dataset600.csv, ...) and reload it.
        largest_dataset = 0
        while os.path.isfile("dataset{}.csv".format(largest_dataset + 300)):
            largest_dataset += 300
        if largest_dataset:
            data = pd.read_csv("dataset{}.csv".format(largest_dataset), index_col=0).to_dict("list")
        try:
            # Filter out phishing reports submitted before 2020.
            old = []
            for index, row in phishing.iterrows():
                date = datetime.strptime(row["submission_time"], "%Y-%m-%dT%H:%M:%S+00:00")
                if date.year < 2020:
                    old.append(index)
            phishing = phishing.drop(old)
            yandex = yandex_search.Yandex(api_user='yksiber', api_key='03.1041007756:28d93f7d79ff3c91b861da63e38a8e5c')
            # Label 1 for PhishTank URLs, 0 for the benign list.
            for label, frame in ((1, phishing), (0, benign)):
                for index, row in frame.iterrows():
                    reps += 1
                    if reps < largest_dataset:
                        continue
                    if reps % 300 == 0:
                        pd.DataFrame(data).to_csv("dataset{}.csv".format(reps))
                    url = row['url']
                    print("[INFO]: {} : {}".format(reps, url))
                    url_data = get_url_data(url, yandex)
                    data["phishing"].append(label)
                    for key, value in url_data.items():
                        data[key].append(value)
            pd.DataFrame(data).to_csv("dataset.csv")
        except Exception as e:
            print("[ERROR]: {}".format(e))
    return pd.read_csv("dataset.csv")

raw_data = get_data("01115eebdbf465734c08fedb2e4d93f414d1a31fa10bfcb248d0f75071e156ff")
print("DOWNLOAD COMPLETED!")
dataset = extract_data(raw_data)
print("EXTRACT COMPLETED!")