crawlbox.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# ----------------------------------------------------------------------
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>
# ----------------------------------------------------------------------
# @author Abay | github.com/abaykan


import os
import sys
import locale
import subprocess
import re
import platform
import inspect
import argparse
import requests
import time
from fake_useragent import UserAgent
from requests_ntlm import HttpNtlmAuth
from requests.auth import HTTPBasicAuth
from requests.auth import HTTPDigestAuth
from requests.packages.urllib3.exceptions import InsecureRequestWarning


from libs.colorama import Fore, Back, Style
from libs import FileUtils
from libs.tldextract import *

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

if platform.system() == 'Windows':
    from libs.colorama.win32 import *

__version__ = '1.0'
__description__ = '''\
  ___________________________________________

  CrawlBox | v.''' + __version__ + '''
  Author: Abay
  Github: https://github.com/abaykan
  ___________________________________________
'''

# Returns abbreviated commit hash number as retrieved with "git rev-parse --short HEAD"
def getRevisionNumber():

    retVal = None
    filePath = None
    _ = os.path.dirname(__file__)

    while True:
        filePath = os.path.join(_, ".git", "HEAD")
        if os.path.exists(filePath):
            break
        else:
            filePath = None
            if _ == os.path.dirname(_):
                break
            else:
                _ = os.path.dirname(_)

    while True:
        if filePath and os.path.isfile(filePath):
            with open(filePath, "r") as f:
                content = f.read()
                filePath = None
                if content.startswith("ref: "):
                    filePath = os.path.join(_, ".git", content.replace("ref: ", "")).strip()
                else:
                    match = re.match(r"(?i)[0-9a-f]{32}", content)
                    retVal = match.group(0) if match else None
                    break
        else:
            break

    if not retVal:
        process = subprocess.Popen("git rev-parse --verify HEAD", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, _ = process.communicate()
        match = re.search(r"(?i)[0-9a-f]{32}", stdout or "")
        retVal = match.group(0) if match else None

    return retVal[:7] if retVal else None

# unicode representation of the supplied value
def getUnicode(value, encoding=None, noneToNull=False):
    
    if noneToNull and value is None:
        return NULL

    if isinstance(value, unicode):
        return value
    elif isinstance(value, basestring):
        while True:
            try:
                return unicode(value, encoding or "utf8")
            except UnicodeDecodeError as ex:
                try:
                    return unicode(value, "utf8")
                except:
                    value = value[:ex.start] + "".join(r"\x%02x" % ord(_) for _ in value[ex.start:ex.end]) + value[ex.end:]
    else:
        try:
            return unicode(value)
        except UnicodeDecodeError:
            return unicode(str(value), errors="ignore")

# get directory path CrawlBox
def modulePath():
    weAreFrozen = hasattr(sys, "frozen")
    
    try:
        _ = sys.executable if weAreFrozen else __file__
    except NameError:
        _ = inspect.getsourcefile(modulePath)

    return getUnicode(os.path.dirname(os.path.realpath(_)), encoding=sys.getfilesystemencoding() or "utf8")

# print banner
def header():
    MAYOR_VERSION = 1
    MINOR_VERSION = 0
    REVISION = 0
    VERSION = {
        "MAYOR_VERSION": MAYOR_VERSION,
        "MINOR_VERSION": MINOR_VERSION,
        "REVISION": REVISION
    }

    PROGRAM_BANNER = open(FileUtils.buildPath("banner.txt")).read().format(**VERSION)
    message = Style.BRIGHT + Fore.CYAN + PROGRAM_BANNER + Style.RESET_ALL
    write(message)
    __version__ = '1.0'
__description__ = '''\
  ___________________________________________

  CrawlBox | v.''' + __version__ + '''
  Author: Abay
  Github: https://github.com/abaykan
  ___________________________________________
'''

#ask_change_url
def yes_no(answer):
    yes = set(['yes', 'y', 'ye', ''])
    no = set(['no', 'n'])

    while True:
        choice = answer.lower()
        if choice in yes:
            return True
        elif choice in no:
            return False

def write(string):
    if platform.system() == 'Windows':
        sys.stdout.write(string)
        sys.stdout.flush()
        sys.stdout.write('\n')
        sys.stdout.flush()
    else:
        sys.stdout.write(string + '\n')
    sys.stdout.flush()
    sys.stdout.flush()

# check url work
def checkUrl(url):

    # check
    try:
        ress1 = requests.head(url , allow_redirects=True)

        if url != ress1.url:
            return "Maybe you should use ;"+ress1.url
        else:
            ress = requests.get(url)
            code = ress.status_code
            if (code == 200):
                return True
            else:
               return False
    except requests.exceptions.ConnectionError:
        return "Try a different url please"
    except requests.exceptions.MissingSchema:
        return "Try a different url please"
    except:
        return False

## make input work both in python3 and 2
try:
    input = raw_input
except NameError:
    input = input

# read url
def read(url):

    ret = checkUrl(url)
    url_ok = False
    if "Maybe" in str(ret):
        w = "Would you like to change url to "+ ret.rsplit(';', 1)[1] + " (Y/n) : "
        choice = input(w)
        res = yes_no(choice)
        if res:
            url_ok = True
            url = ret.rsplit(';', 1)[1]
    if ret != True and url_ok != True:
        message = "Check url (ex: https://google.com) " + (ret if "Try" in str(ret) else "" )
        message = "\n\n" + Fore.YELLOW + "[-]" + Style.RESET_ALL + Style.BRIGHT + Back.RED + message
        message += Style.RESET_ALL
        exit(write(message))

    # print Target
    message = Style.BRIGHT + Fore.YELLOW
    message += '\nTarget: {0}\n'.format(Fore.CYAN + url + Fore.YELLOW)
    message += Style.RESET_ALL
    write(message)
    
    return url


# crawl directory
def crowl(dirs, url, args):

    # args strings
    domain = args.url
    wlist = args.wordlist
    delay = args.delay
    random_agent = args.randomAgent
    auth_type = args.authType.lower() if args.authType is not None else ""
    auth_cred = "".join(args.authCred).rsplit(':') if args.authCred is not None else ""
    proxy = "".join(args.proxy) if args.proxy is not None else None

    # init count valid url
    count = 0

    # get domain
    extracted = tldextract.extract(url)
    domain = "{}.{}".format(extracted.domain, extracted.suffix)

    if not os.path.exists("reports"):
        os.makedirs("reports")
    logfile = open("reports/" + domain + "_logs.txt", "w+")

    # init user agent
    if random_agent == True:
        ua = UserAgent()
     
    # init default user agent    
    headers = { 'User-Agent':  'CrawlBox' }
    
    # init default proxy 
    proxies = {"http": proxy,"https": proxy}
    
    for dir in dirs:

        dir = dir.replace("\n", "")
        dir = "%s" % (dir)

        res = ""
        save = 0
        f_url  = url + "/" + dir
        
        # add cookie header
        
        if random_agent == True:
            headers = { 'User-Agent':  ua.random }
                        
        
        # make request with different type of authentication
        if auth_type == "basic":
            try:
                ress = requests.get(f_url, headers=headers ,auth=HTTPBasicAuth(auth_cred[0], auth_cred[1]),allow_redirects=False, proxies=proxies, verify=False)
            except requests.exceptions.ProxyError:
                exit(write("Check your proxy please! "))
                
        elif auth_type == "digest":
            try:
                ress = requests.get(f_url, headers=headers ,auth=HTTPDigestAuth(auth_cred[0], auth_cred[1]),allow_redirects=False, proxies=proxies, verify=False)
            except requests.exceptions.ProxyError:
                exit(write("Check your proxy please! "))
                
        elif auth_type == "ntlm":
            try:
                ress = requests.get(f_url, headers=headers ,auth=HttpNtlmAuth(auth_cred[0], auth_cred[1]),allow_redirects=False, proxies=proxies, verify=False)
            except requests.exceptions.ProxyError:
                exit(write("Check your proxy please! "))

        else:
            try:
                ress = requests.get(f_url, headers=headers ,allow_redirects=False, proxies=proxies, verify=False)
            except requests.exceptions.ProxyError:
                exit(write("Check your proxy please! "))
                
        response = ress.status_code

        # size
        try:
            if (ress.headers['content-length'] is not None):
                size = int(ress.headers['content-length'])
            else:
                size = 0 
                
        except (KeyError, ValueError, TypeError):
            size = len(ress.content)
        finally:
            f_size = FileUtils.sizeHuman(size)

        # check reponse
        if (response == 200 or response == 302 or response == 304):
            res = "[+] %s - %s : HTTP %s Found" % (f_url, f_size, response)
            res = Fore.GREEN + res + Style.RESET_ALL
            save = 1
            count += 1
        elif (response == 401):
            res = "[-] %s - %s : HTTP %s : Unauthorized" % (f_url, f_size, response)
            res = message = Fore.YELLOW + res + Style.RESET_ALL
        elif (response == 403):
            res = "[-] %s - %s : HTTP %s : Needs authorization" % (f_url, f_size, response)
            res = Fore.BLUE + res + Style.RESET_ALL
        elif (response == 404):
            res = "[-] %s - %s : HTTP %s : Not Found" % (f_url, f_size, response)
        elif (response == 405):
            res = "[-] %s - %s : HTTP %s : Method Not Allowed" % (f_url, f_size, response)
        elif (response == 406):
            res = "[-] %s - %s : HTTP %s : Not Acceptable" % (f_url, f_size, response)
        else :
            res = "[-] %s - %s : HTTP %s : Unknown response" % (f_url, f_size, response)


        # print result
        if response != "":
            write(res)

        # save founded url log
        if save == 1:
            found = url + dir
            logfile.writelines(found + "\n")

        if delay > 0:
            time.sleep(float(delay))
            print("Sleeping for %s seconds" % str(delay))

    write("\n\n[+]Found : %s directory" % (count))
    logfile.close()


def main():

    try:
        global list
        parser = argparse.ArgumentParser(
            # version=__version__,
            formatter_class=argparse.RawTextHelpFormatter,
            prog='CrawlBox',
            description=__description__,
            epilog='''\
        EXAMPLE:
        web site scan with internal wordlist
          CrawlBox www.domain.com
        web site scan with external wordlist
          CrawlBox www.domain.com -w wordlist.txt
                    ''')

        parser.add_argument('-u', dest='url', help='specific target url, like domain.com', type=str)

        parser.add_argument('-w', help='specific path to wordlist file',
                            nargs=1, dest='wordlist', type=str, required=False)

        parser.add_argument('-d', help='add delay between requests',
                            nargs=1, dest='delay', type=float, default=0)
                            
        parser.add_argument('--random-agent', dest="randomAgent", 
                             help='Use randomly selected HTTP User-Agent header value',
                             action='store_true')
                             
        parser.add_argument("--auth-type", dest="authType", 
                            nargs='?', type=str, help="HTTP authentication type ""(Basic, Digest or NTLM)", required=False)
                            
        parser.add_argument("--auth-cred", dest="authCred",
                            nargs=1, type=str, help="HTTP authentication credentials ""(name:password)", required=False)
                            
        parser.add_argument("--proxy", dest="proxy",
                            nargs=1, type=str, help="Use a proxy to connect to the target URL", required=False)

        parser.add_argument('--version', action='version', version=__version__)

        args = parser.parse_args()

        required_together = ('authType','authCred')

        # args.authType will be None if authType is not provided
        if any([getattr(args,x) for x in required_together]):
            if not all([getattr(args,x) for x in required_together]):
                exit(write("Cannot supply --auth-type without --auth-cred"))
                

        # args strings
        domain = args.url
        wlist = args.wordlist
        
        if wlist:
            wlist = wlist[0]

        # print banner
        header()

        # check args
        if domain:
            if wlist:
                list = open(wlist, "r")
            else:
                list = open("list.txt", "r")
        else:
            exit(write('Error arguments: use crawlbox.py -h to help'))

        # read
        url = read(domain)
        
        # After check ,start scan
        crowl(list, url, args)
    
        # close
        list.close()

    except KeyboardInterrupt:

        print('[!] Ctrl + C detected\n[!] Exiting')
        sys.exit(0)

    except EOFError:

        print('[!] Ctrl + D detected\n[!] Exiting')
        sys.exit(0)


if __name__ == '__main__':
    main()