Browse Source

Red-Black self balancing tree implemented.

main
Yigit Colakoglu 4 years ago
parent
commit
a905826e1e
9 changed files with 239 additions and 82 deletions
  1. +10
    -1
      README.md
  2. +3
    -3
      linkedlist.c
  3. +1
    -1
      linkedlist.h
  4. BIN
      massurl
  5. +103
    -47
      massurl.c
  6. +108
    -26
      tree.c
  7. +9
    -3
      tree.h
  8. +4
    -1
      urlparse.c
  9. +1
    -0
      urlparse.h

+ 10
- 1
README.md View File

@ -2,7 +2,16 @@
massurl is a simple tool that aims to parse the outputs of tools like gau, extract the parameters for each URL, remove duplicates, and do it all very quickly. Because web scraping tools' outputs can get very large very quickly, it is nice to have a tool that parses them and outputs something clean and easy to read.
## How to use?
Simply clone the git repository and run `make` which outputs the binary *massurl*. You can then simply pipe the output of any command that outputs urls into it or use pass the filename where you want it to read the urls from. It expects each line to have only one url.
Simply clone the git repository and run `make` which outputs the binary *massurl*. You can then simply pipe the output of any command that outputs urls into it or pass the filename where you want it to read the urls from. It expects each line to have only one url. It has several parameters:
```
usage: massurl [-v] [-o outfile] [-p payloads] [-n minparamnum] input_file
```
You can specify an output file, which massurl will write to instead of stdout. You can also give it a list of payloads, which massurl will automatically enter as the value of each parameter. Finally, you can specify the minimum number of parameters a URL must have in order to be output; this value is zero by default, but I recommend you use 1.
## How fast is it?
The tool uses a binary tree to store the urls and keeps it balanced using the red-black self balancing tree algorithm, which allows it to run at incredible speeds.
## Contributing
This is a very simple project so you shouldn't have trouble reading the code and fixing the bugs you encounter. If you do so, feel free to send a PR. Or, if you can't seem to fix it yourself, don't be shy and open an issue!

+ 3
- 3
linkedlist.c View File

@ -36,10 +36,10 @@ LinkedList *linkedlistadd(LinkedList *p, char *data){
return p;
}
void linkedlistprint(LinkedList *p, FILE *out){
void linkedlistprint(LinkedList *p, FILE *out, char* payload){
if(p != NULL){
(p->data == NULL) ? fprintf(out, "NULL") : fprintf(out, "%s", p->data);
(p->data == NULL) ? fprintf(out, "NULL=NULL") : fprintf(out, "%s=%s", p->data, payload);
(p->next == NULL) ? : fprintf(out, "%c",'&');
linkedlistprint(p->next, out);
linkedlistprint(p->next, out, payload);
}
}

+ 1
- 1
linkedlist.h View File

@ -18,5 +18,5 @@ typedef struct {
LinkedList *linkedlistalloc(void);
int linkedlistfind(LinkedList *p, char *str);
LinkedList*linkedlistadd(LinkedList *p, char *data);
void linkedlistprint(LinkedList *p, FILE *out);
void linkedlistprint(LinkedList *p, FILE *out, char *payload);
#endif /* Symbol’s value as variable is void: \. */

BIN
massurl View File


+ 103
- 47
massurl.c View File

@ -5,64 +5,120 @@
* Copyright yigit@yigitcolakoglu.com. 2021. All rights reserved.
*/
#include <stdio.h>
#include "urlparse.h"
#include "tree.h"
#include "strings.h"
#include "tree.h"
#include "urlparse.h"
#include <time.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#define MAXURL 100000
#define MAXPAYLOAD 10000
static void usage(void){
fputs("\n", stderr);
static void usage(void) {
fputs("usage: massurl [-v] [-o outfile] [-p payloads] [-n minparamnum] input_file\n", stderr);
exit(1);
}
enum outformat{ PLAIN = 01 };
enum outformat { PLAIN = 01 };
TreeNode *root = NULL;
int main(int argc, char *argv[]) {
FILE *fin = stdin, *fout = stdout;
char *param, urlstr[MAXURL];
FILE *fin = stdin, *fout = stdout, *payloads = NULL;
char *param, urlstr[MAXURL], payload[MAXPAYLOAD];
int minparamn, verbose = 0, npayloads = 1;
time_t begin = time(NULL);
unsigned long lines, errors = 0;
while(--argc > 0){
param= *++argv;
if(param[0] == '-'){
param++;
argc--;
switch(*param){
case 'o':
if ((fout = fopen(*++argv, "w")) == NULL) {
fprintf(stderr, "Can't open output file for writing.\n");
return 1;
}
if(ferror(fout)){
fprintf(stderr, "Can't open output file for writing.\n");
return 1;
}
break;
default:
fprintf(stderr, "Parameter -%c does not exist!", *param);
}
}else{
if((fin = fopen(param, "r")) == NULL){
fprintf(stderr, "Can't open file %s\n", param);
return 1;
}
while (--argc > 0) {
param = *++argv;
if (param[0] == '-') {
param++;
argc--;
switch (*param) {
case 'o':
if ((fout = fopen(*++argv, "w")) == NULL) {
fprintf(stderr, "Can't open output file for writing.\n");
return 1;
}
}
TreeNode *urltree = treealloc();
URL *url;
while(fgets(urlstr, MAXURL, fin) != NULL){
if((url = parseurl(urlstr)) == NULL){
fprintf(stderr, "Malformed URL %s", urlstr);
continue;
if (ferror(fout)) {
fprintf(stderr, "Can't open output file for writing.\n");
return 1;
}
break;
case 'n':
minparamn = atoi(*++argv);
break;
case 'v':
verbose = 1;
break;
case 'h':
usage();
break;
case 'p':
if ((payloads = fopen(*++argv, "r")) == NULL) {
fprintf(stderr, "Can't open payload file for reading.\n");
return 1;
}
if(urltree->path == NULL){
urltree->path = url->base;
urltree->params = url->params;
}else{
urltree = addtree(urltree, url);
if (ferror(fout)) {
fprintf(stderr, "Can't open payload file for reading.\n");
return 1;
}
break;
default:
fprintf(stderr, "Parameter -%c does not exist!\n", *param);
usage();
}
} else {
if ((fin = fopen(param, "r")) == NULL) {
fprintf(stderr, "Can't open file %s\n", param);
return 1;
}
}
}
URL *url;
while (fgets(urlstr, MAXURL, fin) != NULL) {
lines++;
if ((url = parseurl(urlstr)) == NULL) {
errors++;
if (verbose)
fprintf(stderr, "Malformed URL %s", urlstr);
continue;
}
if (url->nparams >= minparamn) {
TreeNode *newnode = treealloc();
newnode->path = url->base;
newnode->params = url->params;
newnode->parent = NULL;
newnode->left = newnode->right = NULL;
newnode->nparams = url->nparams;
newnode->red = 1; /* Always color new nodes red */
root = addtree(root, newnode);
balancetree(root, newnode);
}
}
int printzeros = 0;
if (payloads == NULL)
printtree(root, fout, "%s", 0);
else {
while (fgets(payload, MAXPAYLOAD, payloads) != NULL) {
npayloads++;
for(int i=0; i<strlen(payload); i++){
if(*(payload+i) == '\n'){
*(payload+i)='\0';
break;
}
}
printtree(root, fout, payload, printzeros);
printzeros = 1;
}
printtree(urltree, fout);
return 0;
}
time_t end = time(NULL);
fprintf(stderr, "%lu urls processeed in %d seconds\nGenerated %lu urls\nSkipped %lu malformed urls\n", lines, (end-begin), npayloads*lines, errors);
return 0;
}

+ 108
- 26
tree.c View File

@ -8,44 +8,126 @@
#include "tree.h"
#include "linkedlist.h"
#include "urlparse.h"
#include <stdlib.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
TreeNode *addtree(TreeNode *p, URL *url) {
if (p == NULL) {
TreeNode *newnode = treealloc();
newnode->path = url->base;
newnode->params = url->params;
newnode->left = newnode->right = NULL;
return newnode;
}
int strdiff = strcmp(url->base, p->path);
extern TreeNode *root;
TreeNode *addtree(TreeNode *parent, TreeNode *p) {
if (parent == NULL)
return p;
int strdiff = strcmp(parent->path, p->path);
if (!strdiff) {
while(url->params != NULL){
if(p->params == NULL || linkedlistfind(p->params, url->params->data) == -1){
p->params = linkedlistadd(p->params, url->params->data);
while (p->params != NULL) {
if (p->params == NULL ||
linkedlistfind(parent->params, p->params->data) == -1) {
p->params = linkedlistadd(parent->params, p->params->data);
}
url->params = url->params->next;
p->params = p->params->next;
}
} else if (strdiff < 0) {
p->left = addtree(p->left, url);
parent->left = addtree(parent->left, p);
parent->left->parent = parent;
} else {
p->right = addtree(p->right, url);
parent->right = addtree(parent->right, p);
parent->right->parent = parent;
}
return p;
return parent;
}
/*
 * Left-rotate around p: p's right child is lifted into p's position and p
 * becomes that child's left child. The global `root` is reassigned when p
 * has no parent. p->right is dereferenced unconditionally, so the caller
 * must only rotate nodes that have a right child.
 */
void rotatetreeleft(TreeNode *p) {
  TreeNode *pivot = p->right;
  TreeNode *above = p->parent;

  /* The pivot's left subtree is re-homed as p's right subtree. */
  p->right = pivot->left;
  if (pivot->left != NULL)
    pivot->left->parent = p;

  /* Hook the pivot into the slot p used to occupy. */
  pivot->parent = above;
  if (above == NULL) {
    root = pivot;
  } else if (above->left == p) {
    above->left = pivot;
  } else {
    above->right = pivot;
  }

  /* Finally hang p underneath the pivot. */
  pivot->left = p;
  p->parent = pivot;
}
/*
 * Right-rotate around p: p's left child is lifted into p's position and p
 * becomes that child's right child. The global `root` is reassigned when p
 * has no parent. p->left is dereferenced unconditionally, so the caller
 * must only rotate nodes that have a left child.
 */
void rotatetreeright(TreeNode *p) {
  TreeNode *pivot = p->left;
  TreeNode *above = p->parent;

  /* The pivot's right subtree is re-homed as p's left subtree. */
  p->left = pivot->right;
  if (pivot->right != NULL)
    pivot->right->parent = p;

  /* Hook the pivot into the slot p used to occupy. */
  pivot->parent = above;
  if (above == NULL) {
    root = pivot;
  } else if (above->left == p) {
    above->left = pivot;
  } else {
    above->right = pivot;
  }

  /* Finally hang p underneath the pivot. */
  pivot->right = p;
  p->parent = pivot;
}
TreeNode *treealloc(void){
return (TreeNode *) malloc(sizeof(TreeNode));
/*
 * Restore the red-black colouring after `node` has been linked into the tree.
 *
 * root: the tree root as it was when the call was made. Rotations performed
 *       here may promote a different node to the top (the rotate helpers
 *       update the global root), so this pointer can become stale.
 * node: the freshly inserted node. It is dereferenced unconditionally.
 *
 * The loop runs while node and its parent are both red and a grandparent
 * exists. A red uncle is handled by recolouring and continuing from the
 * grandparent; otherwise one or two rotations straighten and lift the parent.
 */
void balancetree(TreeNode *root, TreeNode *node) {
  TreeNode *p = NULL;
  TreeNode *gP = NULL;

  while (node->parent != NULL && node->parent->parent != NULL && node->red &&
         node->parent->red) {
    p = node->parent;
    gP = p->parent;

    if (gP->left == p) {
      if (gP->right != NULL && gP->right->red) {
        /* Red uncle: push the blackness down from the grandparent. */
        gP->red = 1;
        gP->left->red = 0;
        gP->right->red = 0;
        node = gP;
      } else if (p->right == node) {
        /* Inner child: rotate into the outer shape; the next iteration
         * then performs the grandparent rotation. */
        rotatetreeleft(p);
        node = p;
      } else {
        /* Outer child: lift the parent and swap colours with gP. */
        rotatetreeright(gP);
        int c = p->red;
        p->red = gP->red;
        gP->red = c;
        node = p;
      }
    } else {
      if (gP->left != NULL && gP->left->red) {
        /* Red uncle: push the blackness down from the grandparent. */
        gP->red = 1;
        gP->left->red = 0;
        gP->right->red = 0;
        node = gP;
      } else if (p->left == node) {
        /* Inner child: rotate into the outer shape first. */
        rotatetreeright(p);
        node = p;
      } else {
        /* Outer child: lift the parent and swap colours with gP. */
        rotatetreeleft(gP);
        int c = p->red;
        p->red = gP->red;
        gP->red = c;
        node = p;
      }
    }
  }

  /*
   * The `root` argument may no longer be the top of the tree if a rotation
   * happened at the old root. Blackening the stale pointer would recolour a
   * node that must stay red, so climb the parent links to the real root.
   */
  while (root->parent != NULL)
    root = root->parent;
  root->red = 0;
}
void printtree(TreeNode *root, FILE *out){
if(root != NULL){
printtree(root->left, out);
fprintf(out, "%s ", root->path);
linkedlistprint(root->params, out);
fprintf(out, "%c", '\n');
printtree(root->right, out);
/*
 * Allocate storage for a single TreeNode with malloc. The memory is not
 * initialised, and the result is whatever malloc returns (NULL on failure).
 */
TreeNode *treealloc(void) {
  TreeNode *fresh = malloc(sizeof *fresh);
  return fresh;
}
/*
 * In-order walk of the tree that writes one line per qualifying node to out.
 *
 * A node is printed only when its nparams is at least minparams. The line is
 * the node's path, a '?', then whatever linkedlistprint emits for the node's
 * parameter list with `payload`, then a newline.
 */
void printtree(TreeNode *root, FILE *out, char *payload, int minparams) {
  if (root == NULL)
    return;

  printtree(root->left, out, payload, minparams);

  if (root->nparams >= minparams) {
    fprintf(out, "%s?", root->path);
    linkedlistprint(root->params, out, payload);
    fprintf(out, "%c", '\n');
  }

  printtree(root->right, out, payload, minparams);
}

+ 9
- 3
tree.h View File

@ -11,15 +11,21 @@
#ifndef tree_h
#define tree_h
typedef struct {
typedef struct tnode {
char *path;
unsigned int red : 1;
LinkedList *params;
int nparams;
struct tnode *parent;
struct tnode *left;
struct tnode *right;
} TreeNode;
TreeNode *addtree(TreeNode *root, URL *url);
TreeNode *addtree(TreeNode *parent, TreeNode *node);
void rotatetreeright(TreeNode *node);
void rotatetreeleft(TreeNode *node);
void balancetree(TreeNode *root, TreeNode *node);
TreeNode *treealloc(void);
void printtree(TreeNode *root, FILE *out);
void printtree(TreeNode *root, FILE *out, char *payload, int minparams);
#endif /* tree_h */

+ 4
- 1
urlparse.c View File

@ -14,19 +14,21 @@
URL *parseurl(char *url) {
URL *urlp = urlalloc();
urlp->params = NULL;
urlp->nparams = 0;
short stage = 0; /* var to keep track of where we are in url */
int counter = 0;
while (*url != '\0' && *url != '\n') {
switch (*url++) {
case ':':
counter++;
if (stage == 0) {
urlp->https = *(url - 2) == 's';
if (*(url + 1) == '\0' || *url == '\0' || *url == '\n') /* weird stuff would happen with strings like "http:" */
return NULL;
url += 2; /* Skip the // after the :*/
stage = 1;
counter+=4;
counter+=3;
}
break;
@ -53,6 +55,7 @@ URL *parseurl(char *url) {
urlp->params->data = foo;
}else
urlp->params = linkedlistadd(urlp->params, foo);
urlp->nparams++;
while(*url != '&' && *url != '\0' && *url != '\n')
url++;
url++;


+ 1
- 0
urlparse.h View File

@ -14,6 +14,7 @@ typedef struct{
unsigned int https : 1;
char *base;
LinkedList *params;
int nparams;
} URL;
URL *parseurl(char *urlstr);


Loading…
Cancel
Save